mirror of https://git.tartarus.org/simon/putty.git synced 2025-01-09 09:27:59 +00:00
putty-source/crypto/aesgcm-neon.c

Implement AES-GCM using the @openssh.com protocol IDs.

I only recently found out that OpenSSH defined their own protocol IDs for AES-GCM, defined to work the same as the standard ones except that they fixed the semantics for how you select the linked cipher+MAC pair during key exchange. (RFC 5647 defines protocol ids for AES-GCM in both the cipher and MAC namespaces, and requires that you MUST select both or neither - but this contradicts the selection policy set out in the base SSH RFCs, and there's no discussion of how you resolve a conflict between them! OpenSSH's answer is to do it the same way ChaCha20-Poly1305 works, because that will ensure the two suites don't fight.)

People do occasionally ask us for this linked cipher/MAC pair, and now I know it's actually feasible, I've implemented it, including a pair of vector implementations for x86 and Arm using their respective architecture extensions for multiplying polynomials over GF(2).

Unlike ChaCha20-Poly1305, I've kept the cipher and MAC implementations in separate objects, with an arm's-length link between them that the MAC uses when it needs to encrypt single cipher blocks to use as the inputs to the MAC algorithm. That enables the cipher and the MAC to be independently selected from their hardware-accelerated versions, just in case someone runs on a system that has polynomial multiplication instructions but not AES acceleration, or vice versa.

There's a fourth implementation of the GCM MAC, which is a pure software implementation of the same algorithm used in the vectorised versions. It's too slow to use live, but I've kept it in the code for future testing needs, and because it's a convenient place to dump my design comments.

The vectorised implementations are fairly crude as far as optimisation goes. I'm sure serious x86 _or_ Arm optimisation engineers would look at them and laugh. But GCM is a fast MAC compared to HMAC-SHA-256 (indeed compared to HMAC-anything-at-all), so it should at least be good enough to use. And we've got a working version with some tests now, so if someone else wants to improve them, they can.
2022-08-16 17:36:58 +00:00
/*
* Implementation of the GCM polynomial hash using Arm NEON vector
* intrinsics, in particular the multiplication operation for
* polynomials over GF(2).
*
* Follows the reference implementation in aesgcm-ref-poly.c; see
* there for comments on the underlying technique. Here the comments
* just discuss the NEON-specific details.
*/
#include "ssh.h"
#include "aesgcm.h"
#if USE_ARM64_NEON_H
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif

typedef struct aesgcm_neon {
    AESGCM_COMMON_FIELDS;
    poly128_t var, acc, mask;
} aesgcm_neon;

static bool aesgcm_neon_available(void)
{
    return platform_pmull_neon_available();
}

/*
* The NEON types involved are:
*
* 'poly128_t' is a type that lives in a 128-bit vector register and
* represents a 128-bit polynomial over GF(2)
*
* 'poly64x2_t' is a type that lives in a 128-bit vector register and
* represents a vector of two 64-bit polynomials. These appear as
* intermediate results in some of the helper functions below, but we
* never need to actually have a variable of that type.
*
* 'poly64x1_t' is a type that lives in a 128-bit vector register and
* represents a vector of one 64-bit polynomial.
*
* That is distinct from 'poly64_t', which is a type that lives in
* ordinary scalar registers and is a typedef for an integer type.
*
* Generally here we try to work in terms of poly128_t and 64-bit
* integer types, and let everything else be handled as internal
* details of these helper functions.
*/
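
/*
 * For example, to build a poly128_t out of two 64-bit halves, a scalar
 * poly64_t is first lifted into a one-element vector with vcreate_p64(),
 * two of those are glued together with vcombine_p64() into a poly64x2_t,
 * and the result is reinterpreted as a poly128_t - which is exactly the
 * sequence create_p128() below performs.
 */
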
/* Make a poly128_t from two halves */
static inline poly128_t create_p128(poly64_t hi, poly64_t lo)
{
    return vreinterpretq_p128_p64(
        vcombine_p64(vcreate_p64(lo), vcreate_p64(hi)));
}

/* Retrieve the high and low halves of a poly128_t */
static inline poly64_t hi_half(poly128_t v)
{
    return vgetq_lane_p64(vreinterpretq_p64_p128(v), 1);
}

static inline poly64_t lo_half(poly128_t v)
{
    return vgetq_lane_p64(vreinterpretq_p64_p128(v), 0);
}

/* 64x64 -> 128 bit polynomial multiplication, the largest we can do
* in one CPU operation */
static inline poly128_t pmul(poly64_t v, poly64_t w)
{
    return vmull_p64(v, w);
}
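
/* For example, pmul(3, 3) is 5, not 9: 3 represents the polynomial x+1,
 * and (x+1)^2 = x^2 + 1 over GF(2), because the cross terms cancel and
 * no carries propagate between bit positions. */
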
/* Load and store a poly128_t in the form of big-endian bytes. This
* involves separately swapping the halves of the register and
* reversing the bytes within each half. */
static inline poly128_t load_p128_be(const void *p)
{
poly128_t swapped = vreinterpretq_p128_u8(vrev64q_u8(vld1q_u8(p)));
return create_p128(lo_half(swapped), hi_half(swapped));
}
static inline void store_p128_be(void *p, poly128_t v)
{
poly128_t swapped = create_p128(lo_half(v), hi_half(v));
vst1q_u8(p, vrev64q_u8(vreinterpretq_u8_p128(swapped)));
}
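
/* Not every toolchain's <arm_neon.h> provides vaddq_p128 (hence the
 * configure-time check below). Addition of polynomials over GF(2) is
 * just bitwise XOR, so the fallback reinterprets the operands as
 * integer vectors and XORs them. */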
#if !HAVE_NEON_VADDQ_P128
static inline poly128_t vaddq_p128(poly128_t a, poly128_t b)
{
    return vreinterpretq_p128_u32(veorq_u32(
        vreinterpretq_u32_p128(a), vreinterpretq_u32_p128(b)));
}
#endif
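
/*
 * Purely illustrative sketch, not part of the original file (the
 * function name is invented for this example): a tiny demonstration of
 * the helpers above, checking that addition over GF(2) cancels a value
 * with itself and that the big-endian load/store round-trips.
 */
static inline bool aesgcm_neon_helpers_demo(void)
{
    poly128_t p = create_p128((poly64_t)0x0123456789abcdef,
                              (poly64_t)0xfedcba9876543210);

    /* p + p == 0, because addition over GF(2) is XOR. */
    poly128_t z = vaddq_p128(p, p);

    /* Serialise to big-endian bytes and read them back. */
    unsigned char buf[16];
    store_p128_be(buf, p);
    poly128_t q = load_p128_be(buf);

    return hi_half(z) == 0 && lo_half(z) == 0 &&
        hi_half(q) == hi_half(p) && lo_half(q) == lo_half(p);
}
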
/*
* Key setup is just like in aesgcm-ref-poly.c. There's no point using
* vector registers to accelerate this, because it happens rarely.
*/
static void aesgcm_neon_setkey_impl(aesgcm_neon *ctx, const unsigned char *var)
{
    /* Load the 16-byte key block as two big-endian 64-bit halves. */
    uint64_t hi = GET_64BIT_MSB_FIRST(var);
    uint64_t lo = GET_64BIT_MSB_FIRST(var + 8);

    /* Shift the 128-bit value left by one bit, wrapping the top bit
     * round to the bottom, and if that top bit was set, also XOR the
     * reduction constant into the high half. See aesgcm-ref-poly.c for
     * why the key is stored in this pre-shifted form. */
    uint64_t bit = 1 & (hi >> 63);
    hi = (hi << 1) ^ (lo >> 63);
    lo = (lo << 1) ^ bit;
    hi ^= 0xC200000000000000 & -bit;

    ctx->var = create_p128(hi, lo);
}

static inline void aesgcm_neon_setup(aesgcm_neon *ctx,
                                     const unsigned char *mask)
{
    ctx->mask = load_p128_be(mask);
    ctx->acc = create_p128(0, 0);
}

/*
* Folding a coefficient into the accumulator is done by exactly the
* algorithm in aesgcm-ref-poly.c, translated line by line.
*
* It's possible that this could be improved by some clever manoeuvres
* that avoid having to break vectors in half and put them together
* again. Patches welcome if anyone has better ideas.
*/
static inline void aesgcm_neon_coeff(aesgcm_neon *ctx,
                                     const unsigned char *coeff)
{
    /* Add the new input block into the accumulator, and split both the
     * accumulator and the key into 64-bit halves. */
    ctx->acc = vaddq_p128(ctx->acc, load_p128_be(coeff));
    poly64_t ah = hi_half(ctx->acc), al = lo_half(ctx->acc);
    poly64_t bh = hi_half(ctx->var), bl = lo_half(ctx->var);

    /* Karatsuba: three 64x64 -> 128-bit multiplications, then recover
     * the true middle term and fold it into the 256-bit product hi:lo. */
    poly128_t md = pmul(ah ^ al, bh ^ bl);
    poly128_t lo = pmul(al, bl);
    poly128_t hi = pmul(ah, bh);
    md = vaddq_p128(md, vaddq_p128(hi, lo));
    hi = create_p128(hi_half(hi), lo_half(hi) ^ hi_half(md));
    lo = create_p128(hi_half(lo) ^ lo_half(md), lo_half(lo));

    /* Reduce the 256-bit product back to 128 bits in two steps, folding
     * the low half into the high half via the same constant the
     * reference implementation uses. */
    poly128_t r1 = pmul((poly64_t)0xC200000000000000, lo_half(lo));
    hi = create_p128(hi_half(hi), lo_half(hi) ^ lo_half(lo) ^ hi_half(r1));
    lo = create_p128(hi_half(lo) ^ lo_half(r1), lo_half(lo));
    poly128_t r2 = pmul((poly64_t)0xC200000000000000, hi_half(lo));
    hi = vaddq_p128(hi, r2);
    hi = create_p128(hi_half(hi) ^ hi_half(lo), lo_half(hi));
    ctx->acc = hi;
}

static inline void aesgcm_neon_output(aesgcm_neon *ctx, unsigned char *output)
{
    /* XOR in the final mask, write out the result, and reset our state. */
    store_p128_be(output, vaddq_p128(ctx->acc, ctx->mask));
    ctx->acc = create_p128(0, 0);
    ctx->mask = create_p128(0, 0);
}
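
/*
 * The macros below give this flavour its name and identifier;
 * aesgcm-footer.h (shared with the other GCM implementations) is then
 * expected to expand them into the boilerplate that wraps the
 * aesgcm_neon_* functions above into a complete MAC implementation.
 */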
#define AESGCM_FLAVOUR neon
#define AESGCM_NAME "NEON accelerated"
#include "aesgcm-footer.h"