mirror of https://git.tartarus.org/simon/putty.git

Implement AES-GCM using the @openssh.com protocol IDs.

I only recently found out that OpenSSH defined their own protocol IDs
for AES-GCM, defined to work the same as the standard ones except that
they fixed the semantics for how you select the linked cipher+MAC pair
during key exchange.

(RFC 5647 defines protocol IDs for AES-GCM in both the cipher and MAC
namespaces, and requires that you MUST select both or neither - but
this contradicts the selection policy set out in the base SSH RFCs,
and there's no discussion of how you resolve a conflict between them!
OpenSSH's answer is to do it the same way ChaCha20-Poly1305 works,
because that will ensure the two suites don't fight.)
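As a sketch of how that resolution plays out in code (hedged: this
paraphrases the effect of the required_mac vtable field added in this
commit, not PuTTY's actual key-exchange source):

    /* after the cipher list has been negotiated */
    if (cipher->required_mac) {
        /* The cipher dictates its MAC, just as for
         * chacha20-poly1305@openssh.com; the separately negotiated
         * MAC is ignored in this direction. */
        mac = cipher->required_mac;
    }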

People do occasionally ask us for this linked cipher/MAC pair, and now
I know it's actually feasible, I've implemented it, including a pair
of vector implementations for x86 and Arm using their respective
architecture extensions for multiplying polynomials over GF(2).

Unlike ChaCha20-Poly1305, I've kept the cipher and MAC implementations
in separate objects, with an arm's-length link between them that the
MAC uses when it needs to encrypt single cipher blocks to use as the
inputs to the MAC algorithm. That enables the cipher and the MAC to be
independently selected from their hardware-accelerated versions, just
in case someone runs on a system that has polynomial multiplication
instructions but not AES acceleration, or vice versa.
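Concretely, the arm's-length link amounts to this (condensed from the
mac_setkey function in crypto/aesgcm-footer.h below):

    /* MAC key setup: ask the linked cipher to encrypt the all-zeroes
     * block in ECB mode, via its vtable, without touching its counter
     * state. */
    unsigned char var[16] = {0};
    aes_encrypt_ecb_block(ctx->cipher, var);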

There's a fourth implementation of the GCM MAC, which is a pure
software implementation of the same algorithm used in the vectorised
versions. It's too slow to use live, but I've kept it in the code for
future testing needs, and because it's a convenient place to dump my
design comments.

The vectorised implementations are fairly crude as far as optimisation
goes. I'm sure serious x86 _or_ Arm optimisation engineers would look
at them and laugh. But GCM is a fast MAC compared to HMAC-SHA-256
(indeed compared to HMAC-anything-at-all), so it should at least be
good enough to use. And we've got a working version with some tests
now, so if someone else wants to improve them, they can.
Simon Tatham 2022-08-16 18:36:58 +01:00
parent fd840f0dfe
commit c1a2114b28
30 changed files with 2167 additions and 11 deletions

cmake/cmake.h.in

@@ -49,7 +49,9 @@
#cmakedefine01 HAVE_AES_NI
#cmakedefine01 HAVE_SHA_NI
#cmakedefine01 HAVE_SHAINTRIN_H
#cmakedefine01 HAVE_CLMUL
#cmakedefine01 HAVE_NEON_CRYPTO
#cmakedefine01 HAVE_NEON_PMULL
#cmakedefine01 HAVE_NEON_SHA512
#cmakedefine01 HAVE_NEON_SHA512_INTRINSICS
#cmakedefine01 USE_ARM64_NEON_H

config.c

@@ -491,6 +491,7 @@ static void cipherlist_handler(dlgcontrol *ctrl, dlgparam *dlg,
static const struct { const char *s; int c; } ciphers[] = {
{ "ChaCha20 (SSH-2 only)", CIPHER_CHACHA20 },
{ "AES-GCM (SSH-2 only)", CIPHER_AESGCM },
{ "3DES", CIPHER_3DES },
{ "Blowfish", CIPHER_BLOWFISH },
{ "DES", CIPHER_DES },

crypto/CMakeLists.txt

@@ -2,6 +2,10 @@ add_sources_from_current_dir(crypto
aes-common.c
aes-select.c
aes-sw.c
aesgcm-common.c
aesgcm-select.c
aesgcm-sw.c
aesgcm-ref-poly.c
arcfour.c
argon2.c
bcrypt.c
@@ -123,6 +127,16 @@ if(HAVE_WMMINTRIN_H)
volatile __m128i r, a, b, c;
int main(void) { r = _mm_sha256rnds2_epu32(a, b, c); }"
ADD_SOURCES_IF_SUCCESSFUL sha256-ni.c sha1-ni.c)
test_compile_with_flags(HAVE_CLMUL
GNU_FLAGS -msse4.1 -mpclmul
TEST_SOURCE "
#include <wmmintrin.h>
#include <tmmintrin.h>
volatile __m128i r, a, b;
int main(void) { r = _mm_clmulepi64_si128(a, b, 5);
r = _mm_shuffle_epi8(r, a); }"
ADD_SOURCES_IF_SUCCESSFUL aesgcm-clmul.c)
endif()
# ----------------------------------------------------------------------
@@ -170,6 +184,17 @@ if(neon)
int main(void) { r = vaeseq_u8(a, b); s = vsha256hq_u32(x, y, z); }"
ADD_SOURCES_IF_SUCCESSFUL aes-neon.c sha256-neon.c sha1-neon.c)
test_compile_with_flags(HAVE_NEON_PMULL
GNU_FLAGS -march=armv8-a+crypto
MSVC_FLAGS -D_ARM_USE_NEW_NEON_INTRINSICS
TEST_SOURCE "
#include <${neon_header}>
volatile poly128_t r;
volatile poly64_t a, b;
volatile poly64x2_t u, v;
int main(void) { r = vmull_p64(a, b); r = vmull_high_p64(u, v); }"
ADD_SOURCES_IF_SUCCESSFUL aesgcm-neon.c)
# The 'sha3' architecture extension, despite the name, includes
# support for SHA-512 (from the SHA-2 standard) as well as SHA-3
# proper.

crypto/aes-common.c

@@ -12,3 +12,9 @@ const uint8_t aes_key_setup_round_constants[10] = {
* regardless of the key. */
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36,
};
void aesgcm_cipher_crypt_length(
ssh_cipher *cipher, void *blk, int len, unsigned long seq)
{
/* Do nothing: lengths are sent in clear for this cipher. */
}
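/* (Hedged editorial note: with SSH_CIPHER_SEPARATE_LENGTH set, an
 * AES-GCM binary packet on the wire looks like
 *
 *     uint32    packet_length    unencrypted, but authenticated as AAD
 *     byte[n]   encrypted payload
 *     byte[16]  GCM tag
 *
 * so there is genuinely nothing for this encrypt_length hook to do.) */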

crypto/aes-neon.c

@@ -176,6 +176,18 @@ static inline uint8x16_t aes_neon_sdctr_increment(uint8x16_t in)
vsubq_u64(vreinterpretq_u64_u8(in), subtrahend));
}
/*
* Much simpler auxiliary routine to increment the counter for GCM
* mode. This only has to increment the low word.
*/
static inline uint8x16_t aes_neon_gcm_increment(uint8x16_t in)
{
uint32x4_t inw = vreinterpretq_u32_u8(in);
uint32x4_t ONE = vcombine_u32(vcreate_u32(0), vcreate_u32(1));
inw = vaddq_u32(inw, ONE);
return vreinterpretq_u8_u32(inw);
}
/*
* The SSH interface and the cipher modes.
*/
@@ -227,6 +239,28 @@ static void aes_neon_setiv_sdctr(ssh_cipher *ciph, const void *iv)
ctx->iv = aes_neon_sdctr_reverse(counter);
}
static void aes_neon_setiv_gcm(ssh_cipher *ciph, const void *iv)
{
aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);
uint8x16_t counter = vld1q_u8(iv);
ctx->iv = aes_neon_sdctr_reverse(counter);
ctx->iv = vreinterpretq_u8_u32(vsetq_lane_u32(
1, vreinterpretq_u32_u8(ctx->iv), 2));
}
static void aes_neon_next_message_gcm(ssh_cipher *ciph)
{
aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);
uint32x4_t iv = vreinterpretq_u32_u8(ctx->iv);
uint64_t msg_counter = vgetq_lane_u32(iv, 0);
msg_counter = (msg_counter << 32) | vgetq_lane_u32(iv, 3);
msg_counter++;
iv = vsetq_lane_u32(msg_counter >> 32, iv, 0);
iv = vsetq_lane_u32(msg_counter, iv, 3);
iv = vsetq_lane_u32(1, iv, 2);
ctx->iv = vreinterpretq_u8_u32(iv);
}
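/* (Hedged editorial note: these two functions implement the GCM nonce
 * policy for SSH -- a fixed 32-bit field, a 64-bit per-message counter
 * that next_message advances once per SSH packet, and a 32-bit
 * per-block counter reset to 1 -- with the lane numbers looking
 * scrambled only because ctx->iv is kept in the cipher's byte order.
 * The aes_sw_context comment below spells out the same three-part
 * layout in plain C.) */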
typedef uint8x16_t (*aes_neon_fn)(uint8x16_t v, const uint8x16_t *keysched);
static inline void aes_cbc_neon_encrypt(
@@ -275,6 +309,31 @@ static inline void aes_sdctr_neon(
}
}
static inline void aes_encrypt_ecb_block_neon(
ssh_cipher *ciph, void *blk, aes_neon_fn encrypt)
{
aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);
uint8x16_t plaintext = vld1q_u8(blk);
uint8x16_t ciphertext = encrypt(plaintext, ctx->keysched_e);
vst1q_u8(blk, ciphertext);
}
static inline void aes_gcm_neon(
ssh_cipher *ciph, void *vblk, int blklen, aes_neon_fn encrypt)
{
aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);
for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen;
blk < finish; blk += 16) {
uint8x16_t counter = aes_neon_sdctr_reverse(ctx->iv);
uint8x16_t keystream = encrypt(counter, ctx->keysched_e);
uint8x16_t input = vld1q_u8(blk);
uint8x16_t output = veorq_u8(input, keystream);
vst1q_u8(blk, output);
ctx->iv = aes_neon_gcm_increment(ctx->iv);
}
}
#define NEON_ENC_DEC(len) \
static void aes##len##_neon_cbc_encrypt( \
ssh_cipher *ciph, void *vblk, int blklen) \
@@ -285,6 +344,12 @@ static inline void aes_sdctr_neon(
static void aes##len##_neon_sdctr( \
ssh_cipher *ciph, void *vblk, int blklen) \
{ aes_sdctr_neon(ciph, vblk, blklen, aes_neon_##len##_e); } \
static void aes##len##_neon_gcm( \
ssh_cipher *ciph, void *vblk, int blklen) \
{ aes_gcm_neon(ciph, vblk, blklen, aes_neon_##len##_e); } \
static void aes##len##_neon_encrypt_ecb_block( \
ssh_cipher *ciph, void *vblk) \
{ aes_encrypt_ecb_block_neon(ciph, vblk, aes_neon_##len##_e); }
NEON_ENC_DEC(128)
NEON_ENC_DEC(192)

crypto/aes-ni.c

@@ -137,6 +137,16 @@ static inline __m128i aes_ni_sdctr_increment(__m128i v)
return v;
}
/*
* Much simpler auxiliary routine to increment the counter for GCM
* mode. This only has to increment the low word.
*/
static inline __m128i aes_ni_gcm_increment(__m128i v)
{
const __m128i ONE = _mm_setr_epi32(1,0,0,0);
return _mm_add_epi32(v, ONE);
}
/*
* Auxiliary routine to reverse the byte order of a vector, so that
* the SDCTR IV can be made big-endian for feeding to the cipher.
@@ -214,6 +224,25 @@ static void aes_ni_setiv_sdctr(ssh_cipher *ciph, const void *iv)
ctx->iv = aes_ni_sdctr_reverse(counter);
}
static void aes_ni_setiv_gcm(ssh_cipher *ciph, const void *iv)
{
aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph);
__m128i counter = _mm_loadu_si128(iv);
ctx->iv = aes_ni_sdctr_reverse(counter);
ctx->iv = _mm_insert_epi32(ctx->iv, 1, 0);
}
static void aes_ni_next_message_gcm(ssh_cipher *ciph)
{
aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph);
uint32_t fixed = _mm_extract_epi32(ctx->iv, 3);
uint64_t msg_counter = _mm_extract_epi32(ctx->iv, 2);
msg_counter <<= 32;
msg_counter |= (uint32_t)_mm_extract_epi32(ctx->iv, 1);
msg_counter++;
ctx->iv = _mm_set_epi32(fixed, msg_counter >> 32, msg_counter, 1);
}
typedef __m128i (*aes_ni_fn)(__m128i v, const __m128i *keysched);
static inline void aes_cbc_ni_encrypt(
@@ -262,6 +291,31 @@ static inline void aes_sdctr_ni(
}
}
static inline void aes_encrypt_ecb_block_ni(
ssh_cipher *ciph, void *blk, aes_ni_fn encrypt)
{
aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph);
__m128i plaintext = _mm_loadu_si128(blk);
__m128i ciphertext = encrypt(plaintext, ctx->keysched_e);
_mm_storeu_si128(blk, ciphertext);
}
static inline void aes_gcm_ni(
ssh_cipher *ciph, void *vblk, int blklen, aes_ni_fn encrypt)
{
aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph);
for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen;
blk < finish; blk += 16) {
__m128i counter = aes_ni_sdctr_reverse(ctx->iv);
__m128i keystream = encrypt(counter, ctx->keysched_e);
__m128i input = _mm_loadu_si128((const __m128i *)blk);
__m128i output = _mm_xor_si128(input, keystream);
_mm_storeu_si128((__m128i *)blk, output);
ctx->iv = aes_ni_gcm_increment(ctx->iv);
}
}
#define NI_ENC_DEC(len) \
static void aes##len##_ni_cbc_encrypt( \
ssh_cipher *ciph, void *vblk, int blklen) \
@@ -272,6 +326,12 @@ static inline void aes_sdctr_ni(
static void aes##len##_ni_sdctr( \
ssh_cipher *ciph, void *vblk, int blklen) \
{ aes_sdctr_ni(ciph, vblk, blklen, aes_ni_##len##_e); } \
static void aes##len##_ni_gcm( \
ssh_cipher *ciph, void *vblk, int blklen) \
{ aes_gcm_ni(ciph, vblk, blklen, aes_ni_##len##_e); } \
static void aes##len##_ni_encrypt_ecb_block( \
ssh_cipher *ciph, void *vblk) \
{ aes_encrypt_ecb_block_ni(ciph, vblk, aes_ni_##len##_e); }
NI_ENC_DEC(128)
NI_ENC_DEC(192)

crypto/aes-select.c

@@ -39,7 +39,7 @@ static ssh_cipher *aes_select(const ssh_cipheralg *alg)
#define IF_NEON(...)
#endif
#define AES_SELECTOR_VTABLE(mode_c, mode_protocol, mode_display, bits) \
#define AES_SELECTOR_VTABLE(mode_c, mode_protocol, mode_display, bits, ...) \
static const ssh_cipheralg * \
ssh_aes ## bits ## _ ## mode_c ## _impls[] = { \
IF_NI(&ssh_aes ## bits ## _ ## mode_c ## _ni,) \
@@ -56,6 +56,7 @@ static ssh_cipher *aes_select(const ssh_cipheralg *alg)
.text_name = "AES-" #bits " " mode_display \
" (dummy selector vtable)", \
.extra = ssh_aes ## bits ## _ ## mode_c ## _impls, \
__VA_ARGS__ \
}
AES_SELECTOR_VTABLE(cbc, "cbc", "CBC", 128);
@@ -64,6 +65,17 @@ AES_SELECTOR_VTABLE(cbc, "cbc", "CBC", 256);
AES_SELECTOR_VTABLE(sdctr, "ctr", "SDCTR", 128);
AES_SELECTOR_VTABLE(sdctr, "ctr", "SDCTR", 192);
AES_SELECTOR_VTABLE(sdctr, "ctr", "SDCTR", 256);
AES_SELECTOR_VTABLE(gcm, "gcm@openssh.com", "GCM", 128,
.required_mac = &ssh2_aesgcm_mac);
AES_SELECTOR_VTABLE(gcm, "gcm@openssh.com", "GCM", 256,
.required_mac = &ssh2_aesgcm_mac);
/* 192-bit AES-GCM is included only so that testcrypt can run standard
* test vectors against it. OpenSSH doesn't define a protocol id for
* it. Hence the silly macro trick here to set its ssh2_id to 0, and
* more importantly, leaving it out of aesgcm_list[] below. */
AES_SELECTOR_VTABLE(gcm, ?NULL:NULL, "GCM", 192,
.required_mac = &ssh2_aesgcm_mac);
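/* (Hedged editorial note on the '?NULL:NULL' trick above: assuming the
 * selector macro forms the protocol ID as
 *     .ssh2_id = "aes" #bits "-" mode_protocol,
 * the 192-bit instantiation expands to
 *     .ssh2_id = "aes" "192" "-" ?NULL:NULL,
 * i.e. a ternary whose condition is a (never-null) string literal and
 * whose arms are both NULL, so ssh2_id compiles to NULL and the suite
 * is never offered on the wire.) */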
static const ssh_cipheralg ssh_rijndael_lysator = {
/* Same as aes256_cbc, but with a different protocol ID */
@@ -87,3 +99,12 @@ static const ssh_cipheralg *const aes_list[] = {
};
const ssh2_ciphers ssh2_aes = { lenof(aes_list), aes_list };
static const ssh_cipheralg *const aesgcm_list[] = {
/* OpenSSH only defines protocol ids for 128- and 256-bit AES-GCM,
* not 192-bit. */
&ssh_aes128_gcm,
&ssh_aes256_gcm,
};
const ssh2_ciphers ssh2_aesgcm = { lenof(aesgcm_list), aesgcm_list };

crypto/aes-sw.c

@@ -827,6 +827,18 @@ struct aes_sw_context {
uint8_t keystream[SLICE_PARALLELISM * 16];
uint8_t *keystream_pos;
} sdctr;
struct {
/* In GCM mode, the cipher preimage consists of three
* sections: one fixed, one that increments per message
* sent and MACed, and one that increments per cipher
* block. */
uint64_t msg_counter;
uint32_t fixed_iv, block_counter;
/* But we keep the precomputed keystream chunks just like
* SDCTR mode. */
uint8_t keystream[SLICE_PARALLELISM * 16];
uint8_t *keystream_pos;
} gcm;
} iv;
ssh_cipher ciph;
};
@@ -874,6 +886,31 @@ static void aes_sw_setiv_sdctr(ssh_cipher *ciph, const void *viv)
ctx->iv.sdctr.keystream + sizeof(ctx->iv.sdctr.keystream);
}
static void aes_sw_setiv_gcm(ssh_cipher *ciph, const void *viv)
{
aes_sw_context *ctx = container_of(ciph, aes_sw_context, ciph);
const uint8_t *iv = (const uint8_t *)viv;
ctx->iv.gcm.fixed_iv = GET_32BIT_MSB_FIRST(iv);
ctx->iv.gcm.msg_counter = GET_64BIT_MSB_FIRST(iv + 4);
ctx->iv.gcm.block_counter = 1;
/* Set keystream_pos to indicate that the keystream cache is
* currently empty */
ctx->iv.gcm.keystream_pos =
ctx->iv.gcm.keystream + sizeof(ctx->iv.gcm.keystream);
}
static void aes_sw_next_message_gcm(ssh_cipher *ciph)
{
aes_sw_context *ctx = container_of(ciph, aes_sw_context, ciph);
ctx->iv.gcm.msg_counter++;
ctx->iv.gcm.block_counter = 1;
ctx->iv.gcm.keystream_pos =
ctx->iv.gcm.keystream + sizeof(ctx->iv.gcm.keystream);
}
typedef void (*aes_sw_fn)(uint32_t v[4], const uint32_t *keysched);
static inline void memxor16(void *vout, const void *vlhs, const void *vrhs)
@@ -1021,6 +1058,56 @@ static inline void aes_sdctr_sw(
}
}
static inline void aes_encrypt_ecb_block_sw(ssh_cipher *ciph, void *blk)
{
aes_sw_context *ctx = container_of(ciph, aes_sw_context, ciph);
aes_sliced_e_serial(blk, blk, &ctx->sk);
}
static inline void aes_gcm_sw(
ssh_cipher *ciph, void *vblk, int blklen)
{
aes_sw_context *ctx = container_of(ciph, aes_sw_context, ciph);
/*
* GCM encrypt/decrypt looks just like SDCTR, except that the
* method of generating more keystream varies slightly.
*/
uint8_t *keystream_end =
ctx->iv.gcm.keystream + sizeof(ctx->iv.gcm.keystream);
for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen;
blk < finish; blk += 16) {
if (ctx->iv.gcm.keystream_pos == keystream_end) {
/*
* Generate some keystream.
*/
for (uint8_t *block = ctx->iv.gcm.keystream;
block < keystream_end; block += 16) {
/* Format the counter value into the buffer. */
PUT_32BIT_MSB_FIRST(block, ctx->iv.gcm.fixed_iv);
PUT_64BIT_MSB_FIRST(block + 4, ctx->iv.gcm.msg_counter);
PUT_32BIT_MSB_FIRST(block + 12, ctx->iv.gcm.block_counter);
/* Increment the counter. */
ctx->iv.gcm.block_counter++;
}
/* Encrypt all those counter blocks. */
aes_sliced_e_parallel(ctx->iv.gcm.keystream,
ctx->iv.gcm.keystream, &ctx->sk);
/* Reset keystream_pos to the start of the buffer. */
ctx->iv.gcm.keystream_pos = ctx->iv.gcm.keystream;
}
memxor16(blk, blk, ctx->iv.gcm.keystream_pos);
ctx->iv.gcm.keystream_pos += 16;
}
}
#define SW_ENC_DEC(len) \
static void aes##len##_sw_cbc_encrypt( \
ssh_cipher *ciph, void *vblk, int blklen) \
@@ -1030,7 +1117,13 @@ static inline void aes_sdctr_sw(
{ aes_cbc_sw_decrypt(ciph, vblk, blklen); } \
static void aes##len##_sw_sdctr( \
ssh_cipher *ciph, void *vblk, int blklen) \
{ aes_sdctr_sw(ciph, vblk, blklen); }
{ aes_sdctr_sw(ciph, vblk, blklen); } \
static void aes##len##_sw_gcm( \
ssh_cipher *ciph, void *vblk, int blklen) \
{ aes_gcm_sw(ciph, vblk, blklen); } \
static void aes##len##_sw_encrypt_ecb_block( \
ssh_cipher *ciph, void *vblk) \
{ aes_encrypt_ecb_block_sw(ciph, vblk); }
SW_ENC_DEC(128)
SW_ENC_DEC(192)

crypto/aes.h

@@ -15,6 +15,11 @@ struct aes_extra {
/* Point to a writable substructure. */
struct aes_extra_mutable *mut;
/* Extra API function specific to AES, to encrypt a single block
* in ECB mode without touching the IV. Used by AES-GCM MAC
* setup. */
void (*encrypt_ecb_block)(ssh_cipher *, void *);
};
struct aes_extra_mutable {
bool checked_availability;
@@ -30,6 +35,17 @@ static inline bool check_availability(const struct aes_extra *extra)
return extra->mut->is_available;
}
/* Shared stub function for all the AES-GCM vtables. */
void aesgcm_cipher_crypt_length(
ssh_cipher *cipher, void *blk, int len, unsigned long seq);
/* External entry point for the encrypt_ecb_block function. */
static inline void aes_encrypt_ecb_block(ssh_cipher *ciph, void *blk)
{
const struct aes_extra *extra = ciph->vt->extra;
extra->encrypt_ecb_block(ciph, blk);
}
/*
* Macros to define vtables for AES variants. There are a lot of
* these, because of the cross product between cipher modes, key
@@ -37,13 +53,19 @@ static inline bool check_availability(const struct aes_extra *extra)
* some effort here to reduce the boilerplate in the sub-files.
*/
#define AES_EXTRA(impl_c) \
#define AES_EXTRA_BITS(impl_c, bits) \
static struct aes_extra_mutable aes ## impl_c ## _extra_mut; \
static const struct aes_extra aes ## impl_c ## _extra = { \
static const struct aes_extra aes ## bits ## impl_c ## _extra = { \
.check_available = aes ## impl_c ## _available, \
.mut = &aes ## impl_c ## _extra_mut, \
.encrypt_ecb_block = &aes ## bits ## impl_c ## _encrypt_ecb_block, \
}
#define AES_EXTRA(impl_c) \
AES_EXTRA_BITS(impl_c, 128); \
AES_EXTRA_BITS(impl_c, 192); \
AES_EXTRA_BITS(impl_c, 256)
#define AES_CBC_VTABLE(impl_c, impl_display, bits) \
const ssh_cipheralg ssh_aes ## bits ## _cbc ## impl_c = { \
.new = aes ## impl_c ## _new, \
@@ -59,7 +81,7 @@ static inline bool check_availability(const struct aes_extra *extra)
.padded_keybytes = bits/8, \
.flags = SSH_CIPHER_IS_CBC, \
.text_name = "AES-" #bits " CBC (" impl_display ")", \
.extra = &aes ## impl_c ## _extra, \
.extra = &aes ## bits ## impl_c ## _extra, \
}
#define AES_SDCTR_VTABLE(impl_c, impl_display, bits) \
@@ -77,7 +99,31 @@ static inline bool check_availability(const struct aes_extra *extra)
.padded_keybytes = bits/8, \
.flags = 0, \
.text_name = "AES-" #bits " SDCTR (" impl_display ")", \
.extra = &aes ## impl_c ## _extra, \
.extra = &aes ## bits ## impl_c ## _extra, \
}
#define AES_GCM_VTABLE(impl_c, impl_display, bits) \
const ssh_cipheralg ssh_aes ## bits ## _gcm ## impl_c = { \
.new = aes ## impl_c ## _new, \
.free = aes ## impl_c ## _free, \
.setiv = aes ## impl_c ## _setiv_gcm, \
.setkey = aes ## impl_c ## _setkey, \
.encrypt = aes ## bits ## impl_c ## _gcm, \
.decrypt = aes ## bits ## impl_c ## _gcm, \
.encrypt_length = aesgcm_cipher_crypt_length, \
.decrypt_length = aesgcm_cipher_crypt_length, \
.next_message = aes ## impl_c ## _next_message_gcm, \
/* 192-bit AES-GCM is included only so that testcrypt can run \
* standard test vectors against it. OpenSSH doesn't define a \
* protocol id for it. So we set its ssh2_id to NULL. */ \
.ssh2_id = bits==192 ? NULL : "aes" #bits "-gcm@openssh.com", \
.blksize = 16, \
.real_keybits = bits, \
.padded_keybytes = bits/8, \
.flags = SSH_CIPHER_SEPARATE_LENGTH, \
.text_name = "AES-" #bits " GCM (" impl_display ")", \
.required_mac = &ssh2_aesgcm_mac, \
.extra = &aes ## bits ## impl_c ## _extra, \
}
#define AES_ALL_VTABLES(impl_c, impl_display) \
@@ -86,7 +132,10 @@ static inline bool check_availability(const struct aes_extra *extra)
AES_CBC_VTABLE(impl_c, impl_display, 256); \
AES_SDCTR_VTABLE(impl_c, impl_display, 128); \
AES_SDCTR_VTABLE(impl_c, impl_display, 192); \
AES_SDCTR_VTABLE(impl_c, impl_display, 256)
AES_SDCTR_VTABLE(impl_c, impl_display, 256); \
AES_GCM_VTABLE(impl_c, impl_display, 128); \
AES_GCM_VTABLE(impl_c, impl_display, 192); \
AES_GCM_VTABLE(impl_c, impl_display, 256)
/*
* Macros to repeat a piece of code particular numbers of times that

crypto/aesgcm-clmul.c (new file, 180 lines)

@@ -0,0 +1,180 @@
/*
* Implementation of the GCM polynomial hash using the x86 CLMUL
* extension, which provides 64x64->128 polynomial multiplication (or
* 'carry-less', which is what the CL stands for).
*
* Follows the reference implementation in aesgcm-ref-poly.c; see
* there for comments on the underlying technique. Here the comments
* just discuss the x86-specific details.
*/
#include <wmmintrin.h>
#include <tmmintrin.h>
#if defined(__clang__) || defined(__GNUC__)
#include <cpuid.h>
#define GET_CPU_ID(out) __cpuid(1, (out)[0], (out)[1], (out)[2], (out)[3])
#else
#define GET_CPU_ID(out) __cpuid(out, 1)
#endif
#include "ssh.h"
#include "aesgcm.h"
typedef struct aesgcm_clmul {
AESGCM_COMMON_FIELDS;
__m128i var, acc, mask;
void *ptr_to_free;
} aesgcm_clmul;
static bool aesgcm_clmul_available(void)
{
/*
* Determine if CLMUL is available on this CPU.
*/
unsigned int CPUInfo[4];
GET_CPU_ID(CPUInfo);
return (CPUInfo[2] & (1 << 1));
}
/*
* __m128i has to be aligned to 16 bytes, and x86 mallocs may not
* guarantee that, so we must over-allocate to make sure a large
* enough 16-byte region can be found, and ensure the aesgcm_clmul
* struct pointer is at least that well aligned.
*/
#define SPECIAL_ALLOC
static aesgcm_clmul *aesgcm_clmul_alloc(void)
{
char *p = smalloc(sizeof(aesgcm_clmul) + 15);
uintptr_t ip = (uintptr_t)p;
ip = (ip + 15) & ~15;
aesgcm_clmul *ctx = (aesgcm_clmul *)ip;
memset(ctx, 0, sizeof(aesgcm_clmul));
ctx->ptr_to_free = p;
return ctx;
}
#define SPECIAL_FREE
static void aesgcm_clmul_free(aesgcm_clmul *ctx)
{
void *ptf = ctx->ptr_to_free;
smemclr(ctx, sizeof(*ctx));
sfree(ptf);
}
/* Helper function to reverse the 16 bytes in a 128-bit vector */
static inline __m128i mm_byteswap(__m128i vec)
{
const __m128i reverse = _mm_set_epi64x(
0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL);
return _mm_shuffle_epi8(vec, reverse);
}
/* Helper function to swap the two 64-bit words in a 128-bit vector */
static inline __m128i mm_wordswap(__m128i vec)
{
return _mm_shuffle_epi32(vec, 0x4E);
}
/* Load and store a 128-bit vector in big-endian fashion */
static inline __m128i mm_load_be(const void *p)
{
return mm_byteswap(_mm_loadu_si128(p));
}
static inline void mm_store_be(void *p, __m128i vec)
{
_mm_storeu_si128(p, mm_byteswap(vec));
}
/*
* Key setup is just like in aesgcm-ref-poly.c. There's no point using
* vector registers to accelerate this, because it happens rarely.
*/
static void aesgcm_clmul_setkey_impl(aesgcm_clmul *ctx,
const unsigned char *var)
{
uint64_t hi = GET_64BIT_MSB_FIRST(var);
uint64_t lo = GET_64BIT_MSB_FIRST(var + 8);
uint64_t bit = 1 & (hi >> 63);
hi = (hi << 1) ^ (lo >> 63);
lo = (lo << 1) ^ bit;
hi ^= 0xC200000000000000 & -bit;
ctx->var = _mm_set_epi64x(hi, lo);
}
static inline void aesgcm_clmul_setup(aesgcm_clmul *ctx,
const unsigned char *mask)
{
ctx->mask = mm_load_be(mask);
ctx->acc = _mm_set_epi64x(0, 0);
}
/*
* Folding a coefficient into the accumulator is done by essentially
* the algorithm in aesgcm-ref-poly.c. I don't speak these intrinsics
* all that well, so in the parts where I needed to XOR half of one
* vector into half of another, I did a lot of faffing about with
* masks like 0xFFFFFFFFFFFFFFFF0000000000000000. Very likely this can
* be streamlined by a better x86-speaker than me. Patches welcome.
*/
static inline void aesgcm_clmul_coeff(aesgcm_clmul *ctx,
const unsigned char *coeff)
{
ctx->acc = _mm_xor_si128(ctx->acc, mm_load_be(coeff));
/* Compute ah^al and bh^bl by word-swapping each of a and b and
* XORing with the original. That does more work than necessary -
* you end up with each of the desired values repeated twice -
* but I don't know of a neater way. */
__m128i aswap = mm_wordswap(ctx->acc);
__m128i vswap = mm_wordswap(ctx->var);
aswap = _mm_xor_si128(ctx->acc, aswap);
vswap = _mm_xor_si128(ctx->var, vswap);
/* Do the three multiplications required by Karatsuba */
__m128i md = _mm_clmulepi64_si128(aswap, vswap, 0x00);
__m128i lo = _mm_clmulepi64_si128(ctx->acc, ctx->var, 0x00);
__m128i hi = _mm_clmulepi64_si128(ctx->acc, ctx->var, 0x11);
/* Combine lo and hi into md */
md = _mm_xor_si128(md, lo);
md = _mm_xor_si128(md, hi);
/* Now we must XOR the high half of md into the low half of hi,
* and the low half of md into the high half of hi. Simplest thing
* is to swap the words of md (so that each one lines up with the
* register it's going to end up in), and then mask one off in
* each case. */
md = mm_wordswap(md);
lo = _mm_xor_si128(lo, _mm_and_si128(md, _mm_set_epi64x(~0ULL, 0ULL)));
hi = _mm_xor_si128(hi, _mm_and_si128(md, _mm_set_epi64x(0ULL, ~0ULL)));
/* The reduction stage is transformed similarly from the version
* in aesgcm-ref-poly.c. */
__m128i r1 = _mm_clmulepi64_si128(_mm_set_epi64x(0, 0xC200000000000000),
lo, 0x00);
r1 = mm_wordswap(r1);
r1 = _mm_xor_si128(r1, lo);
hi = _mm_xor_si128(hi, _mm_and_si128(r1, _mm_set_epi64x(~0ULL, 0ULL)));
__m128i r2 = _mm_clmulepi64_si128(_mm_set_epi64x(0, 0xC200000000000000),
r1, 0x10);
hi = _mm_xor_si128(hi, r2);
hi = _mm_xor_si128(hi, _mm_and_si128(r1, _mm_set_epi64x(0ULL, ~0ULL)));
ctx->acc = hi;
}
static inline void aesgcm_clmul_output(aesgcm_clmul *ctx,
unsigned char *output)
{
mm_store_be(output, _mm_xor_si128(ctx->acc, ctx->mask));
smemclr(&ctx->acc, 16);
smemclr(&ctx->mask, 16);
}
#define AESGCM_FLAVOUR clmul
#define AESGCM_NAME "CLMUL accelerated"
#include "aesgcm-footer.h"

crypto/aesgcm-common.c (new file, 8 lines)

@@ -0,0 +1,8 @@
#include "ssh.h"
#include "aesgcm.h"
void aesgcm_set_prefix_lengths(ssh2_mac *mac, size_t skip, size_t aad)
{
const struct aesgcm_extra *extra = mac->vt->extra;
extra->set_prefix_lengths(mac, skip, aad);
}

crypto/aesgcm-footer.h (new file, 368 lines)

@@ -0,0 +1,368 @@
/*
* Common footer included by every implementation of the AES-GCM MAC.
*
* The difficult part of AES-GCM, which is done differently depending
* on what hardware acceleration is available, is the actual
* evaluation of a polynomial over GF(2^128) whose coefficients are
* 128-bit chunks of data. But preparing those chunks in the first
* place (out of the ciphertext, associated data, and an
* administrative block containing the lengths of both) is done in the
* same way no matter what technique is used for the evaluation, so
* that's centralised into this file, along with as much of the other
functionality as possible.
*
* This footer file is #included by each implementation, but each one
* will define its own struct type for the state, so that each alloc
* function will test sizeof() a different structure, and similarly
* for free when it zeroes out the state on cleanup.
*
* The functions in the source file may be defined as 'inline' so that
* the functions in here can inline them. The 'coeff' function in
* particular probably should be, because that's called once per
* 16-byte block, so eliminating function call overheads is especially
* useful there.
*
* This footer has the following expectations from the source file
* that #includes it:
*
* - define AESGCM_FLAVOUR to be a fragment of a C identifier that
* will be included in all the function names (both the ones
* defined in the implementation source file and those in here).
* For example purposes below I'll suppose that this is 'foo'.
*
* - define AESGCM_NAME to be a string literal that will be included
* in the display name of the implementation.
*
* - define a typedef 'aesgcm_foo' to be the state structure for the
* implementation, and inside that structure, expand the macro
* AESGCM_COMMON_FIELDS defined in aesgcm.h
*
* - define the following functions:
*
* // Determine whether this implementation is available at run time
* static bool aesgcm_foo_available(void);
*
* // Set up the 'key' of the polynomial part of the MAC, that is,
* // the value at which the polynomial will be evaluated. 'var' is
* // a 16-byte data block in the byte order it comes out of AES.
* static void aesgcm_foo_setkey_impl(aesgcm_foo *ctx,
* const unsigned char *var);
*
* // Set up at the start of evaluating an individual polynomial.
* // 'mask' is the 16-byte data block that will be XORed into the
* // output value of the polynomial, also in AES byte order. This
* // function should store 'mask' in whatever form is most
* // convenient, and initialise an accumulator to zero.
* static void aesgcm_foo_setup(aesgcm_foo *ctx,
* const unsigned char *mask);
*
* // Fold in a coefficient of the polynomial, by means of XORing
* // it into the accumulator and then multiplying the accumulator
* // by the variable passed to setkey_impl() above.
* //
* // 'coeff' points to the 16-byte block of data that the
* // polynomial coefficient will be made out of.
* //
* // You probably want to mark this function 'inline'.
* static void aesgcm_foo_coeff(aesgcm_foo *ctx,
* const unsigned char *coeff);
*
* // Generate the output MAC, by XORing the accumulator's final
* // value with the mask passed to setup() above.
* //
* // 'output' points to a 16-byte region of memory to write the
* // result to.
* static void aesgcm_foo_output(aesgcm_foo *ctx,
* unsigned char *output);
*
* - if allocation of the state structure must be done in a
* non-standard way (e.g. x86 needs this to force greater alignment
* than standard malloc provides), then #define SPECIAL_ALLOC and
* define this additional function:
*
* // Allocate a state structure, zero out its contents, and return it.
* static aesgcm_foo *aesgcm_foo_alloc(void);
*
* - if freeing must also be done in an unusual way, #define
* SPECIAL_FREE and define this function:
*
* // Zero out the state structure to avoid information leaks if the
* // memory is reused, and then free it.
* static void aesgcm_foo_free(aesgcm_foo *ctx);
*/
#ifndef AESGCM_FLAVOUR
#error AESGCM_FLAVOUR must be defined by any module including this footer
#endif
#ifndef AESGCM_NAME
#error AESGCM_NAME must be defined by any module including this footer
#endif
#define CONTEXT CAT(aesgcm_, AESGCM_FLAVOUR)
#define PREFIX(name) CAT(CAT(aesgcm_, AESGCM_FLAVOUR), CAT(_, name))
#include "aes.h" // for aes_encrypt_ecb_block
static const char *PREFIX(mac_text_name)(ssh2_mac *mac)
{
return "AES-GCM (" AESGCM_NAME ")";
}
static void PREFIX(mac_next_message)(ssh2_mac *mac)
{
CONTEXT *ctx = container_of(mac, CONTEXT, mac);
/*
* Make the mask value for a single MAC instance, by encrypting
* the all-zeroes word using the associated AES instance in its
* ordinary GCM fashion. This consumes the first block of
* keystream (with per-block counter equal to 1), leaving the
* second block of keystream ready to be used on the first block
* of plaintext.
*/
unsigned char buf[16];
memset(buf, 0, 16);
ssh_cipher_encrypt(ctx->cipher, buf, 16);
PREFIX(setup)(ctx, buf); /* give it to the implementation to store */
smemclr(buf, sizeof(buf));
}
static void PREFIX(mac_setkey)(ssh2_mac *mac, ptrlen key)
{
CONTEXT *ctx = container_of(mac, CONTEXT, mac);
/*
* Make the value of the polynomial variable, by encrypting the
* all-zeroes word using the associated AES instance in the
* special ECB mode. This is done via the special AES-specific API
* function encrypt_ecb_block, which doesn't touch the counter
* state at all.
*/
unsigned char var[16];
memset(var, 0, 16);
aes_encrypt_ecb_block(ctx->cipher, var);
PREFIX(setkey_impl)(ctx, var);
smemclr(var, sizeof(var));
PREFIX(mac_next_message)(mac); /* set up mask */
}
static void PREFIX(mac_start)(ssh2_mac *mac)
{
CONTEXT *ctx = container_of(mac, CONTEXT, mac);
ctx->skipgot = ctx->aadgot = ctx->ciphertextlen = ctx->partlen = 0;
}
/*
* Handle receiving data via the BinarySink API and turning it into a
* collection of 16-byte blocks to use as polynomial coefficients.
*
* This code is written in a fully general way, which is able to
* handle an arbitrary number of bytes at the start of the data to
* ignore completely (necessary for PuTTY integration), and an
* arbitrary number to treat as associated data, and the rest will be
* regarded as ciphertext. The stream can be interrupted at any byte
* position and resumed later; a partial block will be stored as
* necessary.
*
* At the time of writing this comment, in live use most of that
* generality isn't required: the full data is passed to this function
* in just one call. But there's no guarantee of that staying true in
* future, so we do the full deal here just in case, and the test
* vectors in cryptsuite.py will test it. (And they'll use
* set_prefix_lengths to set up different configurations from the SSH
* usage.)
*/
static void PREFIX(mac_BinarySink_write)(
BinarySink *bs, const void *blkv, size_t len)
{
CONTEXT *ctx = BinarySink_DOWNCAST(bs, CONTEXT);
const unsigned char *blk = (const unsigned char *)blkv;
/*
* Skip the prefix sequence number used as implicit extra data in
* SSH MACs. This is not included in the associated data field for
* GCM, because the IV incrementation policy provides its own
* sequence numbering.
*/
if (ctx->skipgot < ctx->skiplen) {
size_t n = ctx->skiplen - ctx->skipgot;
if (n > len)
n = len;
blk += n;
len -= n;
ctx->skipgot += n;
if (len == 0)
return;
}
/*
* Read additional authenticated data and fold it in to the MAC.
*/
while (ctx->aadgot < ctx->aadlen) {
size_t n = ctx->aadlen - ctx->aadgot;
if (n > len)
n = len;
if (ctx->partlen || n < 16) {
/*
* Fold data into the partial block.
*/
if (n > 16 - ctx->partlen)
n = 16 - ctx->partlen;
memcpy(ctx->partblk + ctx->partlen, blk, n);
ctx->partlen += n;
} else if (n >= 16) {
/*
* Consume a whole block of AAD.
*/
PREFIX(coeff)(ctx, blk);
n = 16;
}
blk += n;
len -= n;
ctx->aadgot += n;
if (ctx->partlen == 16) {
PREFIX(coeff)(ctx, ctx->partblk);
ctx->partlen = 0;
}
if (ctx->aadgot == ctx->aadlen && ctx->partlen) {
memset(ctx->partblk + ctx->partlen, 0, 16 - ctx->partlen);
PREFIX(coeff)(ctx, ctx->partblk);
ctx->partlen = 0;
}
if (len == 0)
return;
}
/*
* Read the main ciphertext and fold it in to the MAC.
*/
while (len > 0) {
size_t n = len;
if (ctx->partlen || n < 16) {
/*
* Fold data into the partial block.
*/
if (n > 16 - ctx->partlen)
n = 16 - ctx->partlen;
memcpy(ctx->partblk + ctx->partlen, blk, n);
ctx->partlen += n;
} else if (n >= 16) {
/*
* Consume a whole block of ciphertext.
*/
PREFIX(coeff)(ctx, blk);
n = 16;
}
blk += n;
len -= n;
ctx->ciphertextlen += n;
if (ctx->partlen == 16) {
PREFIX(coeff)(ctx, ctx->partblk);
ctx->partlen = 0;
}
}
}
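/* (Hedged usage sketch, not part of the commit: one SSH-2 packet fed
 * through this MAC, with the default skiplen = 4 and aadlen = 4 set in
 * mac_new below. put_uint32 and put_data are PuTTY's BinarySink
 * macros.) */
#if 0
    ssh2_mac_start(mac);
    put_uint32(mac, seq);           /* first 4 bytes skipped, not MACed */
    put_data(mac, packet, pktlen);  /* 4-byte length as AAD, rest as
                                     * ciphertext */
    ssh2_mac_genresult(mac, tag);   /* emits the 16-byte GCM tag */
#endif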
static void PREFIX(mac_genresult)(ssh2_mac *mac, unsigned char *output)
{
CONTEXT *ctx = container_of(mac, CONTEXT, mac);
/*
* Consume any partial block of ciphertext remaining.
*/
if (ctx->partlen) {
memset(ctx->partblk + ctx->partlen, 0, 16 - ctx->partlen);
PREFIX(coeff)(ctx, ctx->partblk);
}
/*
* Consume the final block giving the lengths of the AAD and ciphertext.
*/
unsigned char blk[16];
memset(blk, 0, 16);
PUT_64BIT_MSB_FIRST(blk, ctx->aadlen * 8);
PUT_64BIT_MSB_FIRST(blk + 8, ctx->ciphertextlen * 8);
PREFIX(coeff)(ctx, blk);
/*
* And call the implementation's output function.
*/
PREFIX(output)(ctx, output);
smemclr(blk, sizeof(blk));
smemclr(ctx->partblk, 16);
}
static ssh2_mac *PREFIX(mac_new)(const ssh2_macalg *alg, ssh_cipher *cipher)
{
const struct aesgcm_extra *extra = alg->extra;
if (!check_aesgcm_availability(extra))
return NULL;
#ifdef SPECIAL_ALLOC
CONTEXT *ctx = PREFIX(alloc)();
#else
CONTEXT *ctx = snew(CONTEXT);
memset(ctx, 0, sizeof(CONTEXT));
#endif
ctx->mac.vt = alg;
ctx->cipher = cipher;
/* Default values for SSH-2, overridable by set_prefix_lengths for
* testcrypt purposes */
ctx->skiplen = 4;
ctx->aadlen = 4;
BinarySink_INIT(ctx, PREFIX(mac_BinarySink_write));
BinarySink_DELEGATE_INIT(&ctx->mac, ctx);
return &ctx->mac;
}
static void PREFIX(set_prefix_lengths)(ssh2_mac *mac, size_t skip, size_t aad)
{
CONTEXT *ctx = container_of(mac, CONTEXT, mac);
ctx->skiplen = skip;
ctx->aadlen = aad;
}
static void PREFIX(mac_free)(ssh2_mac *mac)
{
CONTEXT *ctx = container_of(mac, CONTEXT, mac);
#ifdef SPECIAL_FREE
PREFIX(free)(ctx);
#else
smemclr(ctx, sizeof(*ctx));
sfree(ctx);
#endif
}
static struct aesgcm_extra_mutable PREFIX(extra_mut);
static const struct aesgcm_extra PREFIX(extra) = {
.check_available = PREFIX(available),
.mut = &PREFIX(extra_mut),
.set_prefix_lengths = PREFIX(set_prefix_lengths),
};
const ssh2_macalg CAT(ssh2_aesgcm_mac_, AESGCM_FLAVOUR) = {
.new = PREFIX(mac_new),
.free = PREFIX(mac_free),
.setkey = PREFIX(mac_setkey),
.start = PREFIX(mac_start),
.genresult = PREFIX(mac_genresult),
.next_message = PREFIX(mac_next_message),
.text_name = PREFIX(mac_text_name),
.name = "",
.etm_name = "", /* Not selectable independently */
.len = 16,
.keylen = 0,
.extra = &PREFIX(extra),
};

crypto/aesgcm-neon.c (new file, 156 lines)

@@ -0,0 +1,156 @@
/*
* Implementation of the GCM polynomial hash using Arm NEON vector
* intrinsics, in particular the multiplication operation for
* polynomials over GF(2).
*
* Follows the reference implementation in aesgcm-ref-poly.c; see
* there for comments on the underlying technique. Here the comments
* just discuss the NEON-specific details.
*/
#include "ssh.h"
#include "aesgcm.h"
#if USE_ARM64_NEON_H
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
typedef struct aesgcm_neon {
AESGCM_COMMON_FIELDS;
poly128_t var, acc, mask;
} aesgcm_neon;
static bool aesgcm_neon_available(void)
{
return platform_pmull_neon_available();
}
/*
* The NEON types involved are:
*
* 'poly128_t' is a type that lives in a 128-bit vector register and
* represents a 128-bit polynomial over GF(2)
*
* 'poly64x2_t' is a type that lives in a 128-bit vector register and
* represents a vector of two 64-bit polynomials. These appear as
* intermediate results in some of the helper functions below, but we
* never need to actually have a variable of that type.
*
* 'poly64x1_t' is a type that lives in a 128-bit vector register and
* represents a vector of one 64-bit polynomial.
*
* That is distinct from 'poly64_t', which is a type that lives in
* ordinary scalar registers and is a typedef for an integer type.
*
* Generally here we try to work in terms of poly128_t and 64-bit
* integer types, and let everything else be handled as internal
* details of these helper functions.
*/
/* Make a poly128_t from two halves */
static inline poly128_t create_p128(poly64_t hi, poly64_t lo)
{
return vreinterpretq_p128_p64(
vcombine_p64(vcreate_p64(lo), vcreate_p64(hi)));
}
/* Retrieve the high and low halves of a poly128_t */
static inline poly64_t hi_half(poly128_t v)
{
return vgetq_lane_p64(vreinterpretq_p64_p128(v), 1);
}
static inline poly64_t lo_half(poly128_t v)
{
return vgetq_lane_p64(vreinterpretq_p64_p128(v), 0);
}
/* 64x64 -> 128 bit polynomial multiplication, the largest we can do
* in one CPU operation */
static inline poly128_t pmul(poly64_t v, poly64_t w)
{
return vmull_p64(v, w);
}
/* Load and store a poly128_t in the form of big-endian bytes. This
* involves separately swapping the halves of the register and
* reversing the bytes within each half. */
static inline poly128_t load_p128_be(const void *p)
{
poly128_t swapped = vreinterpretq_p128_u8(vrev64q_u8(vld1q_u8(p)));
return create_p128(lo_half(swapped), hi_half(swapped));
}
static inline void store_p128_be(void *p, poly128_t v)
{
poly128_t swapped = create_p128(lo_half(v), hi_half(v));
vst1q_u8(p, vrev64q_u8(vreinterpretq_u8_p128(swapped)));
}
/*
* Key setup is just like in aesgcm-ref-poly.c. There's no point using
* vector registers to accelerate this, because it happens rarely.
*/
static void aesgcm_neon_setkey_impl(aesgcm_neon *ctx, const unsigned char *var)
{
uint64_t hi = GET_64BIT_MSB_FIRST(var);
uint64_t lo = GET_64BIT_MSB_FIRST(var + 8);
uint64_t bit = 1 & (hi >> 63);
hi = (hi << 1) ^ (lo >> 63);
lo = (lo << 1) ^ bit;
hi ^= 0xC200000000000000 & -bit;
ctx->var = create_p128(hi, lo);
}
static inline void aesgcm_neon_setup(aesgcm_neon *ctx,
const unsigned char *mask)
{
ctx->mask = load_p128_be(mask);
ctx->acc = create_p128(0, 0);
}
/*
* Folding a coefficient into the accumulator is done by exactly the
* algorithm in aesgcm-ref-poly.c, translated line by line.
*
* It's possible that this could be improved by some clever manoeuvres
* that avoid having to break vectors in half and put them together
* again. Patches welcome if anyone has better ideas.
*/
static inline void aesgcm_neon_coeff(aesgcm_neon *ctx,
const unsigned char *coeff)
{
ctx->acc = vaddq_p128(ctx->acc, load_p128_be(coeff));
poly64_t ah = hi_half(ctx->acc), al = lo_half(ctx->acc);
poly64_t bh = hi_half(ctx->var), bl = lo_half(ctx->var);
poly128_t md = pmul(ah ^ al, bh ^ bl);
poly128_t lo = pmul(al, bl);
poly128_t hi = pmul(ah, bh);
md = vaddq_p128(md, vaddq_p128(hi, lo));
hi = create_p128(hi_half(hi), lo_half(hi) ^ hi_half(md));
lo = create_p128(hi_half(lo) ^ lo_half(md), lo_half(lo));
poly128_t r1 = pmul((poly64_t)0xC200000000000000, lo_half(lo));
hi = create_p128(hi_half(hi), lo_half(hi) ^ lo_half(lo) ^ hi_half(r1));
lo = create_p128(hi_half(lo) ^ lo_half(r1), lo_half(lo));
poly128_t r2 = pmul((poly64_t)0xC200000000000000, hi_half(lo));
hi = vaddq_p128(hi, r2);
hi = create_p128(hi_half(hi) ^ hi_half(lo), lo_half(hi));
ctx->acc = hi;
}
static inline void aesgcm_neon_output(aesgcm_neon *ctx, unsigned char *output)
{
store_p128_be(output, vaddq_p128(ctx->acc, ctx->mask));
ctx->acc = create_p128(0, 0);
ctx->mask = create_p128(0, 0);
}
#define AESGCM_FLAVOUR neon
#define AESGCM_NAME "NEON accelerated"
#include "aesgcm-footer.h"

crypto/aesgcm-ref-poly.c (new file, 364 lines)

@@ -0,0 +1,364 @@
/*
* Implementation of the GCM polynomial hash in pure software, but
* based on a primitive that performs 64x64->128 bit polynomial
* multiplication over GF(2).
*
* This implementation is technically correct (should even be
* side-channel safe as far as I can see), but it's hopelessly slow,
* so no live SSH connection should ever use it. Therefore, it's
* deliberately not included in the lists in aesgcm-select.c. For pure
* software GCM in live use, you want aesgcm-sw.c, and that's what the
* selection system will choose.
*
* However, this implementation _is_ made available to testcrypt, so
* all the GCM tests in cryptsuite.py are run over this as well as the
* other implementations.
*
* The reason why this code exists at all is to act as a reference for
* GCM implementations that use a CPU-specific polynomial multiply
* intrinsic or asm statement. This version will run on whatever
* platform you're trying to port to, and will generate all the same
* intermediate results you expect the CPU-specific one to go through.
* So you can insert parallel diagnostics in this version and in your
* new version, to see where the two diverge.
*
* Also, this version is a good place to put long comments explaining
* the entire strategy of this implementation and its rationale. That
* avoids those comments having to be duplicated in the multiple
* platform-specific implementations, which can focus on commenting
* the way the local platform's idioms match up to this version, and
* refer to this file for the explanation of the underlying technique.
*/
#include "ssh.h"
#include "aesgcm.h"
/*
* Store a 128-bit value in the most convenient form standard C will
* let us, namely two uint64_t giving its most and least significant
* halves.
*/
typedef struct {
uint64_t hi, lo;
} value128_t;
typedef struct aesgcm_ref_poly {
AESGCM_COMMON_FIELDS;
/*
* The state of our GCM implementation is represented entirely by
* three 128-bit values:
*/
/*
* The value at which we're evaluating the polynomial. The GCM
* spec calls this 'H'. It's defined once at GCM key setup time,
* by encrypting the all-zeroes value with our block cipher.
*/
value128_t var;
/*
* Accumulator containing the result of evaluating the polynomial
* so far.
*/
value128_t acc;
/*
* The mask value that is XORed into the final value of 'acc' to
* produce the output MAC. This is different for every MAC
* generated, because its purpose is to ensure that no information
* gathered from a legal MAC can be used to help the forgery of
* another one, and that comparing two legal MACs gives you no
* useful information about the text they cover, because in each
* case, the masks are different and pseudorandom.
*/
value128_t mask;
} aesgcm_ref_poly;
static bool aesgcm_ref_poly_available(void)
{
return true; /* pure software implementation, always available */
}
/*
* Primitive function that takes two uint64_t values representing
* polynomials, multiplies them, and returns a value128_t struct
* containing the full product.
*
* Because the input polynomials have maximum degree 63, the output
* has max degree 63+63 = 127, not 128. As a result, the topmost bit
* of the output is always zero.
*
* The inside of this function is implemented in the simplest way,
* with no attention paid to performance. The important feature of
* this implementation is not what's _inside_ this function, but
* what's _outside_ it: aesgcm_ref_poly_coeff() tries to minimise the
* number of these operations.
*/
static value128_t pmul(uint64_t x, uint64_t y)
{
value128_t r;
r.hi = r.lo = 0;
uint64_t bit = 1 & y;
r.lo ^= x & -bit;
for (unsigned i = 1; i < 64; i++) {
bit = 1 & (y >> i);
uint64_t z = x & -bit;
r.lo ^= z << i;
r.hi ^= z >> (64-i);
}
return r;
}
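/* (Hedged worked example of the primitive: pmul(0xB, 0x5) computes
 *     (x^3 + x + 1) * (x^2 + 1)
 *   = x^5 + x^3 + x^3 + x^2 + x + 1
 *   = x^5 + x^2 + x + 1
 * since the two x^3 terms cancel mod 2, i.e. it returns
 * { .hi = 0, .lo = 0x27 } -- note no carries between bit positions,
 * unlike integer multiplication, where 0xB * 0x5 would be 0x37.) */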
/*
* OK, I promised a long comment explaining what's going on in this
* implementation, and now it's time.
*
* The way AES-GCM _itself_ is defined by its own spec, its finite
* field consists of polynomials over GF(2), constrained to be 128
* bits long by reducing them modulo P = x^128 + x^7 + x^2 + x + 1.
* Using the usual binary representation in which bit i is the
* coefficient of x^i, that's 0x100000000000000000000000000000087.
*
* That is, whenever you multiply two polynomials and find a term
* x^128, you can replace it with x^7+x^2+x+1. More generally,
* x^(128+n) can be replaced with x^(7+n)+x^(2+n)+x^(1+n)+x^n. In
* binary terms, a 1 bit at the 128th position or above is replaced by
* 0x87 exactly 128 bits further down.
*
* So you'd think that multiplying two 128-bit polynomials mod P would
* be a matter of generating their full 256-bit product in the form of
* four words HI:HU:LU:LO, and then reducing it mod P by a two-stage
* process of computing HI * 0x87 and XORing it into HU:LU, then
* computing HU * 0x87 and XORing it into LU:LO.
*
* But it's not!
*
* The reason why not is because when AES-GCM is applied to SSH,
* somehow the _bit_ order got reversed. A 16-byte block of data in
* memory is converted into a polynomial by regarding bit 7 of the
* first byte as the constant term, bit 0 of the first byte as the x^7
* coefficient, ..., bit 0 of the last byte as the x^127 coefficient.
* So if we load that 16-byte block as a big-endian 128-bit integer,
* we end up with it representing a polynomial back to front, with the
* constant term at the top and the x^127 bit at the bottom.
*
* Well, that _shouldn't_ be a problem, right? The nice thing about
* polynomial multiplication is that it's essentially reversible. If
* you reverse the order of the coefficients of two polynomials, then
* the product of the reversed polys is exactly the reversal of the
* product of the original ones. So we bit-reverse our modulo
* polynomial to get 0x1c2000000000000000000000000000001, and we just
* pretend we're working mod that instead.
*
* And that is basically what we're about to do. But there's one
* complication, that arises from the semantics of the polynomial
* multiplication function we're using as our primitive operation.
*
* That function multiplies two polynomials of degree at most 63, to
* give one with degree at most 127. So it returns a 128-bit integer
* whose low bit is the constant term, and its very highest bit is 0,
* and its _next_ highest bit is the product of the high bits of the
* two inputs.
*
* That operation is _not_ symmetric in bit-reversal. If you give it
* the 64-bit-wise reversals of two polynomials P,Q, then its output
* is not the 128-bit-wise reversal of their product PQ, because that
* would require the constant term of PQ to appear in bit 127 of the
* output, and in fact it appears in bit 126. So in fact, what we get
* is offset by one bit from where we'd like it: it's the bit-reversal
* of PQx, not of PQ.
*
* There's more than one way we could fix this. One approach would be
* to work around this off-by-one error by taking the 128-bit output
* of pmul() and shifting it left by a bit. Then it _is_ the bitwise
* reversal of the 128-bit value we'd have wanted to get, and we could
* implement the exact algorithm described above, in fully
* bit-reversed form.
*
* But a 128-bit left shift is not a trivial operation in the vector
* architectures that this code is acting as a reference for. So we'd
* prefer to find a fix that doesn't need compensation during the
* actual per-block multiplication step.
*
* If we did the obvious thing anyway - compute the unshifted 128-bit
* product representing the bit-reversal of PQx, and reduce it mod
* 0x1c2000000000000000000000000000001 - then we'd get a result which
* is exactly what we want, except that it's got a factor of x in it
* that we need to get rid of. The obvious answer is to divide by x
* (which is legal and safe, since mod P, x is invertible).
*
* Dividing a 128-bit polynomial by x is easy in principle. Shift left
* (because we're still bit-reversed), and if that shifted a 1 bit off
* the top, XOR 0xc2000000000000000000000000000001 into the remaining
* 128 bits.
*
* But we're back to having that expensive left shift. What can we do
* about that?
*
* Happily, one of the two input values to our per-block multiply
* operation is fixed! It's given to us at key setup stage, and never
* changed until the next rekey. So if at key setup time we do this
* left shift business _once_, replacing the logical value Q with Q/x,
* then that exactly cancels out the unwanted factor of x that shows
* up in our multiply operation. And now it doesn't matter that it's
* expensive (in the sense of 'a few more instructions than you'd
* like'), because it only happens once per SSH key exchange, not once
* per 16 bytes of data transferred.
*/
static void aesgcm_ref_poly_setkey_impl(aesgcm_ref_poly *ctx,
const unsigned char *var)
{
/*
* Key setup function. We copy the provided 16-byte 'var'
* value into our polynomial. But, as discussed above, we also
* need to divide it by x.
*/
ctx->var.hi = GET_64BIT_MSB_FIRST(var);
ctx->var.lo = GET_64BIT_MSB_FIRST(var + 8);
uint64_t bit = 1 & (ctx->var.hi >> 63);
ctx->var.hi = (ctx->var.hi << 1) ^ (ctx->var.lo >> 63);
ctx->var.lo = (ctx->var.lo << 1) ^ bit;
ctx->var.hi ^= 0xC200000000000000 & -bit;
}
static inline void aesgcm_ref_poly_setup(aesgcm_ref_poly *ctx,
const unsigned char *mask)
{
/*
* Set up to start evaluating a particular MAC. Copy in the mask
* value for this packet, and initialise acc to zero.
*/
ctx->mask.hi = GET_64BIT_MSB_FIRST(mask);
ctx->mask.lo = GET_64BIT_MSB_FIRST(mask + 8);
ctx->acc.hi = ctx->acc.lo = 0;
}
static inline void aesgcm_ref_poly_coeff(aesgcm_ref_poly *ctx,
const unsigned char *coeff)
{
/*
* One step of Horner's-rule polynomial evaluation (with each
* coefficient of the polynomial being an element of GF(2^128),
* itself composed of polynomials over GF(2) mod P).
*
* We take our accumulator value, add the incoming coefficient
* (which means XOR, by GF(2) rules), and multiply by x (that is,
* 'var').
*/
/*
* The addition first, which is easy.
*/
ctx->acc.hi ^= GET_64BIT_MSB_FIRST(coeff);
ctx->acc.lo ^= GET_64BIT_MSB_FIRST(coeff + 8);
/*
* First, create the 256-bit product of the two 128-bit
* polynomials over GF(2) stored in ctx->acc and ctx->var.
*
* The obvious way to do this is by four smaller multiplications
* of 64x64 -> 128 bits. But we can do better using a single
* iteration of the Karatsuba technique, which is actually more
* convenient in polynomials over GF(2) than it is in integers,
* because there aren't all those awkward carries complicating
* things.
*
* Letting B denote x^64, and imagining our two inputs are split
* up into 64-bit chunks ah,al,bh,bl, the product we want is
*
* (ah B + al) (bh B + bl)
* = (ah bh) B^2 + (al bh + ah bl) B + (al bl)
*
* which looks like four smaller multiplications of each of ah,al
* with each of bh,bl. But Karatsuba's trick is to first compute
*
* (ah + al) (bh + bl)
* = ah bh + al bh + ah bl + al bl
*
* and then subtract the terms (ah bh) and (al bl), which we had
* to compute anyway, to get the middle two terms (al bh + ah bl)
* which are our coefficient of B.
*
* This involves more bookkeeping instructions like XORs, but with
* any luck those are faster than the main multiplication.
*/
uint64_t ah = ctx->acc.hi, al = ctx->acc.lo;
uint64_t bh = ctx->var.hi, bl = ctx->var.lo;
/* Compute the outer two terms */
value128_t lo = pmul(al, bl);
value128_t hi = pmul(ah, bh);
/* Compute the trick product (ah+al)(bh+bl) */
value128_t md = pmul(ah ^ al, bh ^ bl);
/* Subtract off the outer two terms to get md = al bh + ah bl */
md.hi ^= lo.hi ^ hi.hi;
md.lo ^= lo.lo ^ hi.lo;
/* And add that into the 256-bit value given by hi * x^128 + lo */
lo.hi ^= md.lo;
hi.lo ^= md.hi;
/*
* OK. Now hi and lo together make up the 256-bit full product.
* Now reduce it mod the reversal of the GCM modulus polynomial.
* As discussed above, that's 0x1c2000000000000000000000000000001.
*
* We want the _topmost_ 128 bits of this, because we're working
* in a bit-reversed world. So what we fundamentally want to do is
* to take our 256-bit product, and add to it the product of its
* low 128 bits with 0x1c2000000000000000000000000000001. Then the
* top 128 bits will be the output we want.
*
* Since there's no carrying in this arithmetic, it's enough to
* discard the 1 bit at the bottom of that, because it won't
* affect anything in the half we're keeping. So it's enough to
* add 0x1c2000000000000000000000000000000 * lo to (hi:lo).
*
* We can only work with 64 bits at a time, so the first thing we
* do is to break that up:
*
* - add 0x1c200000000000000 * lo.lo to (hi.lo : lo.hi)
* - add 0x1c200000000000000 * lo.hi to (hi.hi : hi.lo)
*
* But there's still a problem: 0x1c200000000000000 is just too
* _big_ to fit in 64 bits. So we have to break it up into the low
* 64 bits 0xc200000000000000, and its leading 1. So each of those
* steps of the form 'add 0x1c200000000000000 * x to y:z' becomes
* 'add 0xc200000000000000 * x to y:z' followed by 'add x to y',
* the latter step dealing with the leading 1.
*/
/* First step, adding to the middle two words of our number. After
* this the lowest word (in lo.lo) is ignored. */
value128_t r1 = pmul(0xC200000000000000, lo.lo);
hi.lo ^= r1.hi ^ lo.lo;
lo.hi ^= r1.lo;
/* Second of those steps, adding to the top two words, and
* discarding lo.hi. */
value128_t r2 = pmul(0xC200000000000000, lo.hi);
hi.hi ^= r2.hi ^ lo.hi;
hi.lo ^= r2.lo;
/* Now 'hi' is precisely what we have left. */
ctx->acc = hi;
}
static inline void aesgcm_ref_poly_output(aesgcm_ref_poly *ctx,
unsigned char *output)
{
PUT_64BIT_MSB_FIRST(output, ctx->acc.hi ^ ctx->mask.hi);
PUT_64BIT_MSB_FIRST(output + 8, ctx->acc.lo ^ ctx->mask.lo);
smemclr(&ctx->acc, 16);
smemclr(&ctx->mask, 16);
}
#define AESGCM_FLAVOUR ref_poly
#define AESGCM_NAME "reference polynomial-based implementation"
#include "aesgcm-footer.h"

crypto/aesgcm-select.c (new file, 38 lines)

@@ -0,0 +1,38 @@
#include "ssh.h"
#include "aesgcm.h"
static ssh2_mac *aesgcm_mac_selector_new(const ssh2_macalg *alg,
ssh_cipher *cipher)
{
static const ssh2_macalg *const real_algs[] = {
#if HAVE_CLMUL
&ssh2_aesgcm_mac_clmul,
#endif
#if HAVE_NEON_PMULL
&ssh2_aesgcm_mac_neon,
#endif
&ssh2_aesgcm_mac_sw,
NULL,
};
for (size_t i = 0; real_algs[i]; i++) {
const ssh2_macalg *alg = real_algs[i];
const struct aesgcm_extra *alg_extra =
(const struct aesgcm_extra *)alg->extra;
if (check_aesgcm_availability(alg_extra))
return ssh2_mac_new(alg, cipher);
}
/* We should never reach the NULL at the end of the list, because
* the last non-NULL entry should be software-only GCM, which is
* always available. */
unreachable("aesgcm_select ran off the end of its list");
}
const ssh2_macalg ssh2_aesgcm_mac = {
.new = aesgcm_mac_selector_new,
.name = "",
.etm_name = "", /* Not selectable independently */
.len = 16,
.keylen = 0,
};

crypto/aesgcm-sw.c (new file, 145 lines)

@@ -0,0 +1,145 @@
/*
* Implementation of the GCM polynomial hash in pure software.
*
* I don't know of a faster way to do this in a side-channel safe
* manner than by precomputing a giant table and iterating over the
* whole thing.
*
* The original GCM reference suggests that you precompute the effects
* of multiplying a 128-bit value by the fixed key, in the form of a
* table indexed by some number of bits of the input value, so that
* you end up computing something of the form
*
* table1[x & 0xFF] ^ table2[(x>>8) & 0xFF] ^ ... ^ table15[(x>>120) & 0xFF]
*
* But that was obviously written before cache and timing leaks were
* known about. What's a time-safe approach?
*
* Well, the above technique isn't fixed to 8 bits of input per table.
* You could trade off the number of tables against the size of each
* table. At one extreme of this tradeoff, you have 128 tables each
* indexed by a single input bit - which is to say, you have 128
* values, each 128 bits wide, and you XOR together the subset of
* those values corresponding to the input bits, which you can do by
* making a bitmask out of each input bit using standard constant-
* time-coding bit twiddling techniques.
*
* That's pretty unpleasant when GCM is supposed to be a fast
* algorithm, but I don't know of a better approach that meets current
* security standards! Suggestions welcome, if they can get through
* testsc.
*/
#include "ssh.h"
#include "aesgcm.h"
/*
* Store a 128-bit value in the most convenient form standard C will
* let us, namely two uint64_t giving its most and least significant
* halves.
*/
typedef struct {
uint64_t hi, lo;
} value128_t;
typedef struct aesgcm_sw {
AESGCM_COMMON_FIELDS;
/* Accumulator for the current evaluation, and mask that will be
 * XORed in at the end. */
value128_t acc, mask;
/*
* Table of values to XOR in for each bit, representing the effect
* of multiplying by the fixed key. The key itself doesn't need to
* be stored separately, because it's never used. (However, it is
* also the first entry in the table, so if you _do_ need it,
* there it is.)
*
* Table is indexed from the low bit of the input upwards.
*/
value128_t table[128];
} aesgcm_sw;
static bool aesgcm_sw_available(void)
{
return true; /* pure software implementation, always available */
}
static void aesgcm_sw_setkey_impl(aesgcm_sw *gcm, const unsigned char *var)
{
value128_t v;
v.hi = GET_64BIT_MSB_FIRST(var);
v.lo = GET_64BIT_MSB_FIRST(var + 8);
/*
* Prepare the table. This has to be done in reverse order, so
* that the original value of the variable corresponds to
* table[127], because AES-GCM works in the bit-reversal of its
* logical specification so that's where the logical constant term
* lives. (See more detailed comment in aesgcm-ref-poly.c.)
*/
for (size_t i = 0; i < 128; i++) {
gcm->table[127 - i] = v;
/* Multiply v by x, which means shifting right (bit reversal
* again) and then adding 0xE1 at the top if we shifted a 1 out. */
uint64_t lobit = v.lo & 1;
v.lo = (v.lo >> 1) ^ (v.hi << 63);
v.hi = (v.hi >> 1) ^ (0xE100000000000000ULL & -lobit);
}
}
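/* (Hedged worked example, not from the source: when the shift pushes
 * a 1 out at the bottom, the coefficient of x^127 has become x^128,
 * which reduces to 1 + x + x^2 + x^7 modulo the GCM polynomial
 * x^128 + x^7 + x^2 + x + 1. In this bit-reversed representation
 * those four low-degree terms land in bits 127, 126, 125 and 120,
 * i.e. the top byte 0xE1 - hence the constant 0xE100000000000000
 * XORed into v.hi above.) */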
static inline void aesgcm_sw_setup(aesgcm_sw *gcm, const unsigned char *mask)
{
gcm->mask.hi = GET_64BIT_MSB_FIRST(mask);
gcm->mask.lo = GET_64BIT_MSB_FIRST(mask + 8);
gcm->acc.hi = gcm->acc.lo = 0;
}
static inline void aesgcm_sw_coeff(aesgcm_sw *gcm, const unsigned char *coeff)
{
/* XOR in the new coefficient */
gcm->acc.hi ^= GET_64BIT_MSB_FIRST(coeff);
gcm->acc.lo ^= GET_64BIT_MSB_FIRST(coeff + 8);
/* And now just loop over the bits of acc, making up a new value
* by XORing together the entries of 'table' corresponding to set
* bits. */
value128_t out;
out.lo = out.hi = 0;
const value128_t *tableptr = gcm->table;
for (size_t i = 0; i < 64; i++) {
uint64_t bit = 1 & gcm->acc.lo;
gcm->acc.lo >>= 1;
uint64_t mask = -bit;
out.hi ^= mask & tableptr->hi;
out.lo ^= mask & tableptr->lo;
tableptr++;
}
for (size_t i = 0; i < 64; i++) {
uint64_t bit = 1 & gcm->acc.hi;
gcm->acc.hi >>= 1;
uint64_t mask = -bit;
out.hi ^= mask & tableptr->hi;
out.lo ^= mask & tableptr->lo;
tableptr++;
}
gcm->acc = out;
}
static inline void aesgcm_sw_output(aesgcm_sw *gcm, unsigned char *output)
{
PUT_64BIT_MSB_FIRST(output, gcm->acc.hi ^ gcm->mask.hi);
PUT_64BIT_MSB_FIRST(output + 8, gcm->acc.lo ^ gcm->mask.lo);
smemclr(&gcm->acc, 16);
smemclr(&gcm->mask, 16);
}
#define AESGCM_FLAVOUR sw
#define AESGCM_NAME "unaccelerated"
#include "aesgcm-footer.h"

44
crypto/aesgcm.h Normal file
View File

@ -0,0 +1,44 @@
/*
* Common parts of the state structure for AESGCM MAC implementations.
*/
#define AESGCM_COMMON_FIELDS \
ssh_cipher *cipher; \
unsigned char partblk[16]; \
size_t skiplen, aadlen, ciphertextlen; \
size_t skipgot, aadgot, partlen; \
BinarySink_IMPLEMENTATION; \
ssh2_mac mac
/*
* The 'extra' structure is used to include information about how to
* check if a given implementation is available at run time, and
* whether we've already checked.
*/
struct aesgcm_extra_mutable;
struct aesgcm_extra {
/* Function to check availability. Might be expensive, so we don't
* want to call it more than once. */
bool (*check_available)(void);
/* Point to a writable substructure. */
struct aesgcm_extra_mutable *mut;
/*
* Extra API function specific to this MAC type that allows
* testcrypt to set more general lengths for skiplen and aadlen.
*/
void (*set_prefix_lengths)(ssh2_mac *mac, size_t skip, size_t aad);
};
struct aesgcm_extra_mutable {
bool checked_availability;
bool is_available;
};
static inline bool check_aesgcm_availability(const struct aesgcm_extra *extra)
{
if (!extra->mut->checked_availability) {
extra->mut->is_available = extra->check_available();
extra->mut->checked_availability = true;
}
return extra->mut->is_available;
}
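
As a hedged sketch of how one flavour might populate this pair (all
example_* identifiers are invented here; the real per-implementation
glue lives in aesgcm-footer.h, which this excerpt doesn't show):

/* Illustrative only: an availability probe cached via the
 * extra/mutable pair, so the possibly-expensive check runs at most
 * once per process. */
static bool example_check_available(void)
{
    /* A NEON flavour would defer to the platform query declared in
     * ssh.h, e.g. return platform_pmull_neon_available(); */
    return true;
}

static struct aesgcm_extra_mutable example_mut;
static const struct aesgcm_extra example_extra = {
    .check_available = example_check_available,
    .mut = &example_mut,
    /* .set_prefix_lengths supplied by the real implementation */
};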

View File

@ -453,6 +453,7 @@ enum {
CIPHER_DES,
CIPHER_ARCFOUR,
CIPHER_CHACHA20,
CIPHER_AESGCM,
CIPHER_MAX /* no. ciphers (inc warn) */
};

View File

@ -17,6 +17,7 @@
static const struct keyvalwhere ciphernames[] = {
{ "aes", CIPHER_AES, -1, -1 },
{ "chacha20", CIPHER_CHACHA20, CIPHER_AES, +1 },
{ "aesgcm", CIPHER_AESGCM, CIPHER_CHACHA20, +1 },
{ "3des", CIPHER_3DES, -1, -1 },
{ "WARN", CIPHER_WARN, -1, -1 },
{ "des", CIPHER_DES, -1, -1 },

22
ssh.h
View File

@ -1072,6 +1072,10 @@ extern const ssh_cipheralg ssh_aes256_sdctr;
extern const ssh_cipheralg ssh_aes256_sdctr_ni;
extern const ssh_cipheralg ssh_aes256_sdctr_neon;
extern const ssh_cipheralg ssh_aes256_sdctr_sw;
extern const ssh_cipheralg ssh_aes256_gcm;
extern const ssh_cipheralg ssh_aes256_gcm_ni;
extern const ssh_cipheralg ssh_aes256_gcm_neon;
extern const ssh_cipheralg ssh_aes256_gcm_sw;
extern const ssh_cipheralg ssh_aes256_cbc;
extern const ssh_cipheralg ssh_aes256_cbc_ni;
extern const ssh_cipheralg ssh_aes256_cbc_neon;
@ -1080,6 +1084,10 @@ extern const ssh_cipheralg ssh_aes192_sdctr;
extern const ssh_cipheralg ssh_aes192_sdctr_ni;
extern const ssh_cipheralg ssh_aes192_sdctr_neon;
extern const ssh_cipheralg ssh_aes192_sdctr_sw;
extern const ssh_cipheralg ssh_aes192_gcm;
extern const ssh_cipheralg ssh_aes192_gcm_ni;
extern const ssh_cipheralg ssh_aes192_gcm_neon;
extern const ssh_cipheralg ssh_aes192_gcm_sw;
extern const ssh_cipheralg ssh_aes192_cbc;
extern const ssh_cipheralg ssh_aes192_cbc_ni;
extern const ssh_cipheralg ssh_aes192_cbc_neon;
@ -1088,6 +1096,10 @@ extern const ssh_cipheralg ssh_aes128_sdctr;
extern const ssh_cipheralg ssh_aes128_sdctr_ni;
extern const ssh_cipheralg ssh_aes128_sdctr_neon;
extern const ssh_cipheralg ssh_aes128_sdctr_sw;
extern const ssh_cipheralg ssh_aes128_gcm;
extern const ssh_cipheralg ssh_aes128_gcm_ni;
extern const ssh_cipheralg ssh_aes128_gcm_neon;
extern const ssh_cipheralg ssh_aes128_gcm_sw;
extern const ssh_cipheralg ssh_aes128_cbc;
extern const ssh_cipheralg ssh_aes128_cbc_ni;
extern const ssh_cipheralg ssh_aes128_cbc_neon;
@ -1103,6 +1115,7 @@ extern const ssh2_ciphers ssh2_aes;
extern const ssh2_ciphers ssh2_blowfish;
extern const ssh2_ciphers ssh2_arcfour;
extern const ssh2_ciphers ssh2_ccp;
extern const ssh2_ciphers ssh2_aesgcm;
extern const ssh_hashalg ssh_md5;
extern const ssh_hashalg ssh_sha1;
extern const ssh_hashalg ssh_sha1_ni;
@ -1163,12 +1176,20 @@ extern const ssh2_macalg ssh_hmac_sha1_96;
extern const ssh2_macalg ssh_hmac_sha1_96_buggy;
extern const ssh2_macalg ssh_hmac_sha256;
extern const ssh2_macalg ssh2_poly1305;
extern const ssh2_macalg ssh2_aesgcm_mac;
extern const ssh2_macalg ssh2_aesgcm_mac_sw;
extern const ssh2_macalg ssh2_aesgcm_mac_ref_poly;
extern const ssh2_macalg ssh2_aesgcm_mac_clmul;
extern const ssh2_macalg ssh2_aesgcm_mac_neon;
extern const ssh_compression_alg ssh_zlib;
/* Special constructor: BLAKE2b can be instantiated with any hash
* length up to 128 bytes */
ssh_hash *blake2b_new_general(unsigned hashlen);
/* Special test function for AES-GCM */
void aesgcm_set_prefix_lengths(ssh2_mac *mac, size_t skip, size_t aad);
/*
* On some systems, you have to detect hardware crypto acceleration by
* asking the local OS API rather than OS-agnostically asking the CPU
@ -1176,6 +1197,7 @@ ssh_hash *blake2b_new_general(unsigned hashlen);
* platform subdirectory.
*/
bool platform_aes_neon_available(void);
bool platform_pmull_neon_available(void);
bool platform_sha256_neon_available(void);
bool platform_sha1_neon_available(void);
bool platform_sha512_neon_available(void);

View File

@ -132,6 +132,12 @@ void ssh2_bpp_new_outgoing_crypto(
s->out.etm_mode = etm_mode;
if (mac) {
s->out.mac = ssh2_mac_new(mac, s->out.cipher);
/*
* Important that mac_setkey comes after cipher_setkey,
* because in the case where the MAC makes use of the cipher
* (e.g. AES-GCM), it will need the cipher to be keyed
* already.
*/
ssh2_mac_setkey(s->out.mac, make_ptrlen(mac_key, mac->keylen));
bpp_logevent("Initialised %s outbound MAC algorithm%s%s",
@ -189,6 +195,7 @@ void ssh2_bpp_new_incoming_crypto(
s->in.etm_mode = etm_mode;
if (mac) {
s->in.mac = ssh2_mac_new(mac, s->in.cipher);
/* MAC setkey has to follow cipher, just as in outgoing_crypto above */
ssh2_mac_setkey(s->in.mac, make_ptrlen(mac_key, mac->keylen));
bpp_logevent("Initialised %s inbound MAC algorithm%s%s",
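
A hedged sketch of that ordering in isolation: ssh2_mac_new and
ssh2_mac_setkey appear in the hunks above, while the cipher calls and
the helper name are assumed for the sketch, and error handling is
omitted.

/* Illustrative helper, not from the source: key the cipher before
 * creating and keying the GCM MAC that borrows it. */
static ssh2_mac *new_keyed_aesgcm_mac(const void *cipher_key)
{
    ssh_cipher *cipher = ssh_cipher_new(&ssh_aes256_gcm);
    ssh_cipher_setkey(cipher, cipher_key);       /* 1: cipher first */
    ssh2_mac *mac = ssh2_mac_new(&ssh2_aesgcm_mac, cipher);
    ssh2_mac_setkey(mac, make_ptrlen(NULL, 0));  /* 2: then the MAC
                                                  * (its keylen is 0) */
    return mac;
}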

View File

@ -600,6 +600,9 @@ static void ssh2_write_kexinit_lists(
case CIPHER_CHACHA20:
preferred_ciphers[n_preferred_ciphers++] = &ssh2_ccp;
break;
case CIPHER_AESGCM:
preferred_ciphers[n_preferred_ciphers++] = &ssh2_aesgcm;
break;
case CIPHER_WARN:
/* Flag for later. Don't bother if it's the last in
* the list. */

View File

@ -145,6 +145,11 @@ def get_aes_impls():
for impl in get_implementations("aes128_cbc")
if impl.startswith("aes128_cbc_")]
def get_aesgcm_impls():
return [impl.split("_", 1)[1]
for impl in get_implementations("aesgcm")
if impl.startswith("aesgcm_")]
class MyTestBase(unittest.TestCase):
"Intermediate class that adds useful helper methods."
def assertEqualBin(self, x, y):
@ -2933,6 +2938,184 @@ Private-MAC: 5b1f6f4cc43eb0060d2c3e181bc0129343adba2b
per_base_keytype_tests('rsa', run_ca_rsa_tests=True, ca_signflags=2)
per_base_keytype_tests('rsa', run_ca_rsa_tests=True, ca_signflags=4)
def testAESGCMBlockBoundaries(self):
# For standard AES-GCM test vectors, see the separate tests in
# standard_test_vectors.testAESGCM. This function will test
# the local interface, including the skip length and the
# machinery for incremental MAC update.
def aesgcm(key, iv, aes_impl, gcm_impl):
c = ssh_cipher_new('aes{:d}_gcm_{}'.format(8*len(key), aes_impl))
if c is None: return None, None # skip test if HW AES not available
m = ssh2_mac_new('aesgcm_{}'.format(gcm_impl), c)
if m is None: return None, None # skip test if HW GCM not available
c.setkey(key)
c.setiv(iv + b'\0'*4)
m.setkey(b'')
return c, m
def test_one(aes_impl, gcm_impl):
# An actual capture from a session with OpenSSH, which
# demonstrates that the implementation interoperates in
# practice, not just against the published test vectors. This
# is the session's SSH2_MSG_EXT_INFO packet.
key = unhex('dbf98b2f56c83fb2f9476aa876511225')
iv = unhex('9af15ecccf2bacaaa9625a6a')
plain = unhex('1007000000020000000f736572766572'
'2d7369672d616c6773000000db737368'
'2d656432353531392c736b2d7373682d'
'65643235353139406f70656e7373682e'
'636f6d2c7373682d7273612c7273612d'
'736861322d3235362c7273612d736861'
'322d3531322c7373682d6473732c6563'
'6473612d736861322d6e697374703235'
'362c65636473612d736861322d6e6973'
'74703338342c65636473612d73686132'
'2d6e697374703532312c736b2d656364'
'73612d736861322d6e69737470323536'
'406f70656e7373682e636f6d2c776562'
'617574686e2d736b2d65636473612d73'
'6861322d6e69737470323536406f7065'
'6e7373682e636f6d0000001f7075626c'
'69636b65792d686f7374626f756e6440'
'6f70656e7373682e636f6d0000000130'
'5935130804ad4b19ed2789210290c438')
aad = unhex('00000130')
cipher = unhex('c4b88f35c1ef8aa6225033c3f185d648'
'3c485d84930d5846f7851daacbff49d5'
'8cf72169fca7ab3c170376df65dd69de'
'c40a94c6b8e3da6d61161ab19be27466'
'02e0dfa3330faae291ef4173a20e87a4'
'd40728c645baa72916c1958531ef7b54'
'27228513e53005e6d17b9bb384b8d8c1'
'92b8a10b731459eed5a0fb120c283412'
'e34445981df1257f1c35a06196731fed'
'1b3115f419e754de0b634bf68768cb02'
'29e70bb2259cedb5101ff6a4ac19aaad'
'46f1c30697361b45d6c152c3069cee6b'
'd46e9785d65ea6bf7fca41f0ac3c8e93'
'ce940b0059c39d51e49c17f60d48d633'
'5bae4402faab61d8d65221b24b400e65'
'89f941ff48310231a42641851ea00832'
'2c2d188f4cc6a4ec6002161c407d0a92'
'f1697bb319fbec1ca63fa8e7ac171c85'
'5b60142bfcf4e5b0a9ada3451799866e')
c, m = aesgcm(key, iv, aes_impl, gcm_impl)
if c is None: return # skip if this implementation is unavailable
len_dec = c.decrypt_length(aad, 123)
self.assertEqual(len_dec, aad) # length not actually encrypted
m.start()
# We expect 4 bytes skipped (the sequence number that
# ChaCha20-Poly1305 wants at the start of its MAC), and 4
# bytes AAD. These were initialised by the call to
# decrypt_length.
m.update(b'fake' + aad + cipher)
self.assertEqualBin(m.genresult(),
unhex('4a5a6d57d54888b4e58c57a96e00b73a'))
self.assertEqualBin(c.decrypt(cipher), plain)
c, m = aesgcm(key, iv, aes_impl, gcm_impl)
if c is None: return # skip if this implementation is unavailable
len_enc = c.encrypt_length(aad, 123)
self.assertEqual(len_enc, aad) # length not actually encrypted
self.assertEqualBin(c.encrypt(plain), cipher)
# Test incremental update.
def testIncremental(skiplen, aad, plain):
key, iv = b'SomeRandomKeyVal', b'SomeRandomIV'
mac_input = b'x' * skiplen + aad + plain
c, m = aesgcm(key, iv, aes_impl, gcm_impl)
if c is None: return # skip if this implementation is unavailable
aesgcm_set_prefix_lengths(m, skiplen, len(aad))
m.start()
m.update(mac_input)
reference_mac = m.genresult()
# Break the input just once, at each possible byte
# position.
for i in range(1, len(mac_input)):
c.setiv(iv + b'\0'*4)
m.setkey(b'')
aesgcm_set_prefix_lengths(m, skiplen, len(aad))
m.start()
m.update(mac_input[:i])
m.update(mac_input[i:])
self.assertEqualBin(m.genresult(), reference_mac)
# Feed the entire input in a byte at a time.
c.setiv(iv + b'\0'*4)
m.setkey(b'')
aesgcm_set_prefix_lengths(m, skiplen, len(aad))
m.start()
for i in range(len(mac_input)):
m.update(mac_input[i:i+1])
self.assertEqualBin(m.genresult(), reference_mac)
# Incremental test with more than a full block of each thing
testIncremental(23, b'abcdefghijklmnopqrst',
b'Lorem ipsum dolor sit amet')
# Incremental test with exactly a full block of each thing
testIncremental(16, b'abcdefghijklmnop',
b'Lorem ipsum dolo')
# Incremental test with less than a full block of each thing
testIncremental(7, b'abcdefghij',
b'Lorem ipsum')
for aes_impl in get_aes_impls():
for gcm_impl in get_aesgcm_impls():
with self.subTest(aes_impl=aes_impl, gcm_impl=gcm_impl):
test_one(aes_impl, gcm_impl)
def testAESGCMIV(self):
key = b'SomeRandomKeyVal'
def test(gcm, cbc, iv_fixed, iv_msg):
gcm.setiv(ssh_uint32(iv_fixed) + ssh_uint64(iv_msg) + b'fake')
cbc.setiv(b'\0' * 16)
preimage = cbc.decrypt(gcm.encrypt(b'\0' * 16))
self.assertEqualBin(preimage, ssh_uint32(iv_fixed) +
ssh_uint64(iv_msg) + ssh_uint32(1))
cbc.setiv(b'\0' * 16)
preimage = cbc.decrypt(gcm.encrypt(b'\0' * 16))
self.assertEqualBin(preimage, ssh_uint32(iv_fixed) +
ssh_uint64(iv_msg) + ssh_uint32(2))
gcm.next_message()
iv_msg = (iv_msg + 1) & ((1<<64)-1)
cbc.setiv(b'\0' * 16)
preimage = cbc.decrypt(gcm.encrypt(b'\0' * 16))
self.assertEqualBin(preimage, ssh_uint32(iv_fixed) +
ssh_uint64(iv_msg) + ssh_uint32(1))
cbc.setiv(b'\0' * 16)
preimage = cbc.decrypt(gcm.encrypt(b'\0' * 16))
self.assertEqualBin(preimage, ssh_uint32(iv_fixed) +
ssh_uint64(iv_msg) + ssh_uint32(2))
for impl in get_aes_impls():
with self.subTest(aes_impl=impl):
gcm = ssh_cipher_new('aes{:d}_gcm_{}'.format(8*len(key), impl))
gcm.setkey(key)
cbc = ssh_cipher_new('aes{:d}_cbc_{}'.format(8*len(key), impl))
cbc.setkey(key)
# A simple test to ensure the low word gets
# incremented and that the whole IV looks basically
# the way we expect it to
test(gcm, cbc, 0x27182818, 0x3141592653589793)
# Test that carries are propagated into the high word
test(gcm, cbc, 0x27182818, 0x00000000FFFFFFFF)
# Test that carries _aren't_ propagated out of the
# high word of the message counter into the fixed word
# at the top
test(gcm, cbc, 0x27182818, 0xFFFFFFFFFFFFFFFF)
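The counter-block layout these assertions probe, read off from the
tests rather than from any spec text: 4 bytes of fixed salt, an 8-byte
per-message counter that next_message bumps, and a 4-byte per-block
counter that starts at 1 for each message and never carries into the
word above it. A small C sketch of serialising such a block
(make_counter_block is an invented name):

#include <stdint.h>

/* Illustrative: build the 16-byte GCM counter block implied by the
 * tests above; all three fields are big-endian. */
static void make_counter_block(unsigned char out[16], uint32_t fixed,
                               uint64_t message, uint32_t block)
{
    for (int i = 0; i < 4; i++)
        out[i] = (unsigned char)(fixed >> (24 - 8*i));
    for (int i = 0; i < 8; i++)
        out[4 + i] = (unsigned char)(message >> (56 - 8*i));
    for (int i = 0; i < 4; i++)
        out[12 + i] = (unsigned char)(block >> (24 - 8*i));
}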
class standard_test_vectors(MyTestBase):
def testAES(self):
def vector(cipher, key, plaintext, ciphertext):
@ -3726,6 +3909,178 @@ class standard_test_vectors(MyTestBase):
b'opaque="HRPCssKJSGjCrkzDg8OhwpzCiGPChXYjwrI2QmXDnsOS", '
b'userhash=true')
def testAESGCM(self):
def test(key, iv, plaintext, aad, ciphertext, mac):
c = ssh_cipher_new('aes{:d}_gcm'.format(8*len(key)))
m = ssh2_mac_new('aesgcm_{}'.format(impl), c)
if m is None: return # skip test if HW GCM not available
c.setkey(key)
c.setiv(iv + b'\0'*4)
m.setkey(b'')
aesgcm_set_prefix_lengths(m, 0, len(aad))
# Some test cases have plaintext/ciphertext that is not a
# multiple of the cipher block size. Our MAC
# implementation supports this, but the cipher
# implementation expects block-granular input.
padlen = 15 & -len(plaintext)
ciphertext_got = c.encrypt(plaintext + b'0' * padlen)[
:len(plaintext)]
m.start()
m.update(aad + ciphertext)
mac_got = m.genresult()
self.assertEqualBin(ciphertext_got, ciphertext)
self.assertEqualBin(mac_got, mac)
# Also check the decrypt direction, after resetting the IV.
c.setiv(iv + b'\0'*4)
plaintext_got = c.decrypt(ciphertext + b'0' * padlen)[
    :len(plaintext)]
self.assertEqualBin(plaintext_got, plaintext)
for impl in get_aesgcm_impls():
# 'The Galois/Counter Mode of Operation', McGrew and
# Viega, Appendix B. All the tests except the ones whose
# IV is the wrong length, because handling that requires
# an extra evaluation of the polynomial hash, which is
# never used in an SSH context, so I didn't implement it
# just for the sake of test vectors.
# Test Case 1
test(unhex('00000000000000000000000000000000'),
unhex('000000000000000000000000'),
unhex(''), unhex(''), unhex(''),
unhex('58e2fccefa7e3061367f1d57a4e7455a'))
# Test Case 2
test(unhex('00000000000000000000000000000000'),
unhex('000000000000000000000000'),
unhex('00000000000000000000000000000000'),
unhex(''),
unhex('0388dace60b6a392f328c2b971b2fe78'),
unhex('ab6e47d42cec13bdf53a67b21257bddf'))
# Test Case 3
test(unhex('feffe9928665731c6d6a8f9467308308'),
unhex('cafebabefacedbaddecaf888'),
unhex('d9313225f88406e5a55909c5aff5269a'
'86a7a9531534f7da2e4c303d8a318a72'
'1c3c0c95956809532fcf0e2449a6b525'
'b16aedf5aa0de657ba637b391aafd255'),
unhex(''),
unhex('42831ec2217774244b7221b784d0d49c'
'e3aa212f2c02a4e035c17e2329aca12e'
'21d514b25466931c7d8f6a5aac84aa05'
'1ba30b396a0aac973d58e091473f5985'),
unhex('4d5c2af327cd64a62cf35abd2ba6fab4'))
# Test Case 4
test(unhex('feffe9928665731c6d6a8f9467308308'),
unhex('cafebabefacedbaddecaf888'),
unhex('d9313225f88406e5a55909c5aff5269a'
'86a7a9531534f7da2e4c303d8a318a72'
'1c3c0c95956809532fcf0e2449a6b525'
'b16aedf5aa0de657ba637b39'),
unhex('feedfacedeadbeeffeedfacedeadbeef'
'abaddad2'),
unhex('42831ec2217774244b7221b784d0d49c'
'e3aa212f2c02a4e035c17e2329aca12e'
'21d514b25466931c7d8f6a5aac84aa05'
'1ba30b396a0aac973d58e091'),
unhex('5bc94fbc3221a5db94fae95ae7121a47'))
# Test Case 7
test(unhex('00000000000000000000000000000000'
'0000000000000000'),
unhex('000000000000000000000000'),
unhex(''), unhex(''), unhex(''),
unhex('cd33b28ac773f74ba00ed1f312572435'))
# Test Case 8
test(unhex('00000000000000000000000000000000'
'0000000000000000'),
unhex('000000000000000000000000'),
unhex('00000000000000000000000000000000'),
unhex(''),
unhex('98e7247c07f0fe411c267e4384b0f600'),
unhex('2ff58d80033927ab8ef4d4587514f0fb'))
# Test Case 9
test(unhex('feffe9928665731c6d6a8f9467308308'
'feffe9928665731c'),
unhex('cafebabefacedbaddecaf888'),
unhex('d9313225f88406e5a55909c5aff5269a'
'86a7a9531534f7da2e4c303d8a318a72'
'1c3c0c95956809532fcf0e2449a6b525'
'b16aedf5aa0de657ba637b391aafd255'),
unhex(''),
unhex('3980ca0b3c00e841eb06fac4872a2757'
'859e1ceaa6efd984628593b40ca1e19c'
'7d773d00c144c525ac619d18c84a3f47'
'18e2448b2fe324d9ccda2710acade256'),
unhex('9924a7c8587336bfb118024db8674a14'))
# Test Case 10
test(unhex('feffe9928665731c6d6a8f9467308308'
'feffe9928665731c'),
unhex('cafebabefacedbaddecaf888'),
unhex('d9313225f88406e5a55909c5aff5269a'
'86a7a9531534f7da2e4c303d8a318a72'
'1c3c0c95956809532fcf0e2449a6b525'
'b16aedf5aa0de657ba637b39'),
unhex('feedfacedeadbeeffeedfacedeadbeef'
'abaddad2'),
unhex('3980ca0b3c00e841eb06fac4872a2757'
'859e1ceaa6efd984628593b40ca1e19c'
'7d773d00c144c525ac619d18c84a3f47'
'18e2448b2fe324d9ccda2710'),
unhex('2519498e80f1478f37ba55bd6d27618c'))
# Test Case 13
test(unhex('00000000000000000000000000000000'
'00000000000000000000000000000000'),
unhex('000000000000000000000000'),
unhex(''), unhex(''), unhex(''),
unhex('530f8afbc74536b9a963b4f1c4cb738b'))
# Test Case 14
test(unhex('00000000000000000000000000000000'
'00000000000000000000000000000000'),
unhex('000000000000000000000000'),
unhex('00000000000000000000000000000000'),
unhex(''),
unhex('cea7403d4d606b6e074ec5d3baf39d18'),
unhex('d0d1c8a799996bf0265b98b5d48ab919'))
# Test Case 15
test(unhex('feffe9928665731c6d6a8f9467308308'
'feffe9928665731c6d6a8f9467308308'),
unhex('cafebabefacedbaddecaf888'),
unhex('d9313225f88406e5a55909c5aff5269a'
'86a7a9531534f7da2e4c303d8a318a72'
'1c3c0c95956809532fcf0e2449a6b525'
'b16aedf5aa0de657ba637b391aafd255'),
unhex(''),
unhex('522dc1f099567d07f47f37a32a84427d'
'643a8cdcbfe5c0c97598a2bd2555d1aa'
'8cb08e48590dbb3da7b08b1056828838'
'c5f61e6393ba7a0abcc9f662898015ad'),
unhex('b094dac5d93471bdec1a502270e3cc6c'))
# Test Case 16
test(unhex('feffe9928665731c6d6a8f9467308308'
'feffe9928665731c6d6a8f9467308308'),
unhex('cafebabefacedbaddecaf888'),
unhex('d9313225f88406e5a55909c5aff5269a'
'86a7a9531534f7da2e4c303d8a318a72'
'1c3c0c95956809532fcf0e2449a6b525'
'b16aedf5aa0de657ba637b39'),
unhex('feedfacedeadbeeffeedfacedeadbeef'
'abaddad2'),
unhex('522dc1f099567d07f47f37a32a84427d'
'643a8cdcbfe5c0c97598a2bd2555d1aa'
'8cb08e48590dbb3da7b08b1056828838'
'c5f61e6393ba7a0abcc9f662'),
unhex('76fc6ece0f4e1768cddf8853bb2d551b'))
if __name__ == "__main__":
# Run the tests, suppressing automatic sys.exit and collecting the
# unittest.TestProgram instance returned by unittest.main instead.

View File

@ -25,10 +25,14 @@ def list_implementations(alg, checkfn):
def list_cipher_implementations(alg):
list_implementations(alg, lambda impl: ssh_cipher_new(impl) is not None)
def list_mac_implementations(alg):
list_implementations(alg, lambda impl: ssh2_mac_new(impl, None) is not None)
def list_hash_implementations(alg):
list_implementations(alg, lambda impl: ssh_hash_new(impl) is not None)
list_cipher_implementations("aes256_cbc")
list_mac_implementations("aesgcm")
list_hash_implementations("sha1")
list_hash_implementations("sha256")
list_hash_implementations("sha512")

View File

@ -36,6 +36,16 @@ BEGIN_ENUM_TYPE(macalg)
ENUM_VALUE("hmac_sha1_96_buggy", &ssh_hmac_sha1_96_buggy)
ENUM_VALUE("hmac_sha256", &ssh_hmac_sha256)
ENUM_VALUE("poly1305", &ssh2_poly1305)
ENUM_VALUE("aesgcm", &ssh2_aesgcm_mac)
ENUM_VALUE("aesgcm", &ssh2_aesgcm_mac)
ENUM_VALUE("aesgcm_sw", &ssh2_aesgcm_mac_sw)
ENUM_VALUE("aesgcm_ref_poly", &ssh2_aesgcm_mac_ref_poly)
#if HAVE_CLMUL
ENUM_VALUE("aesgcm_clmul", &ssh2_aesgcm_mac_clmul)
#endif
#if HAVE_NEON_PMULL
ENUM_VALUE("aesgcm_neon", &ssh2_aesgcm_mac_neon)
#endif
END_ENUM_TYPE(macalg)
BEGIN_ENUM_TYPE(keyalg)
@ -60,31 +70,43 @@ BEGIN_ENUM_TYPE(cipheralg)
ENUM_VALUE("3des_ssh1", &ssh_3des_ssh1)
ENUM_VALUE("des_cbc", &ssh_des)
ENUM_VALUE("aes256_ctr", &ssh_aes256_sdctr)
ENUM_VALUE("aes256_gcm", &ssh_aes256_gcm)
ENUM_VALUE("aes256_cbc", &ssh_aes256_cbc)
ENUM_VALUE("aes192_ctr", &ssh_aes192_sdctr)
ENUM_VALUE("aes192_gcm", &ssh_aes192_gcm)
ENUM_VALUE("aes192_cbc", &ssh_aes192_cbc)
ENUM_VALUE("aes128_ctr", &ssh_aes128_sdctr)
ENUM_VALUE("aes128_gcm", &ssh_aes128_gcm)
ENUM_VALUE("aes128_cbc", &ssh_aes128_cbc)
ENUM_VALUE("aes256_ctr_sw", &ssh_aes256_sdctr_sw)
ENUM_VALUE("aes256_gcm_sw", &ssh_aes256_gcm_sw)
ENUM_VALUE("aes256_cbc_sw", &ssh_aes256_cbc_sw)
ENUM_VALUE("aes192_ctr_sw", &ssh_aes192_sdctr_sw)
ENUM_VALUE("aes192_gcm_sw", &ssh_aes192_gcm_sw)
ENUM_VALUE("aes192_cbc_sw", &ssh_aes192_cbc_sw)
ENUM_VALUE("aes128_ctr_sw", &ssh_aes128_sdctr_sw)
ENUM_VALUE("aes128_gcm_sw", &ssh_aes128_gcm_sw)
ENUM_VALUE("aes128_cbc_sw", &ssh_aes128_cbc_sw)
#if HAVE_AES_NI
ENUM_VALUE("aes256_ctr_ni", &ssh_aes256_sdctr_ni)
ENUM_VALUE("aes256_gcm_ni", &ssh_aes256_gcm_ni)
ENUM_VALUE("aes256_cbc_ni", &ssh_aes256_cbc_ni)
ENUM_VALUE("aes192_ctr_ni", &ssh_aes192_sdctr_ni)
ENUM_VALUE("aes192_gcm_ni", &ssh_aes192_gcm_ni)
ENUM_VALUE("aes192_cbc_ni", &ssh_aes192_cbc_ni)
ENUM_VALUE("aes128_ctr_ni", &ssh_aes128_sdctr_ni)
ENUM_VALUE("aes128_gcm_ni", &ssh_aes128_gcm_ni)
ENUM_VALUE("aes128_cbc_ni", &ssh_aes128_cbc_ni)
#endif
#if HAVE_NEON_CRYPTO
ENUM_VALUE("aes256_ctr_neon", &ssh_aes256_sdctr_neon)
ENUM_VALUE("aes256_gcm_neon", &ssh_aes256_gcm_neon)
ENUM_VALUE("aes256_cbc_neon", &ssh_aes256_cbc_neon)
ENUM_VALUE("aes192_ctr_neon", &ssh_aes192_sdctr_neon)
ENUM_VALUE("aes192_gcm_neon", &ssh_aes192_gcm_neon)
ENUM_VALUE("aes192_cbc_neon", &ssh_aes192_cbc_neon)
ENUM_VALUE("aes128_ctr_neon", &ssh_aes128_sdctr_neon)
ENUM_VALUE("aes128_gcm_neon", &ssh_aes128_gcm_neon)
ENUM_VALUE("aes128_cbc_neon", &ssh_aes128_cbc_neon)
#endif
ENUM_VALUE("blowfish_ctr", &ssh_blowfish_ssh2_ctr)

View File

@ -277,6 +277,9 @@ FUNC(void, ssh2_mac_next_message, ARG(val_mac, m))
FUNC_WRAPPED(val_string, ssh2_mac_genresult, ARG(val_mac, m))
FUNC(val_string_asciz_const, ssh2_mac_text_name, ARG(val_mac, m))
FUNC(void, aesgcm_set_prefix_lengths,
ARG(val_mac, m), ARG(uint, skip), ARG(uint, aad))
/*
* The ssh_key abstraction. All the uses of BinarySink and
* BinarySource in parameters are replaced with ordinary strings for

View File

@ -1329,7 +1329,16 @@ strbuf *get_implementations_commasep(ptrlen alg)
strbuf *out = strbuf_new();
put_datapl(out, alg);
if (ptrlen_startswith(alg, PTRLEN_LITERAL("aesgcm"), NULL)) {
put_fmt(out, ",%.*s_sw", PTRLEN_PRINTF(alg));
put_fmt(out, ",%.*s_ref_poly", PTRLEN_PRINTF(alg));
#if HAVE_CLMUL
put_fmt(out, ",%.*s_clmul", PTRLEN_PRINTF(alg));
#endif
#if HAVE_NEON_PMULL
put_fmt(out, ",%.*s_neon", PTRLEN_PRINTF(alg));
#endif
} else if (ptrlen_startswith(alg, PTRLEN_LITERAL("aes"), NULL)) {
put_fmt(out, ",%.*s_sw", PTRLEN_PRINTF(alg));
#if HAVE_AES_NI
put_fmt(out, ",%.*s_ni", PTRLEN_PRINTF(alg));

View File

@ -259,6 +259,12 @@ VOLATILE_WRAPPED_DEFN(static, size_t, looplimit, (size_t x))
#define IF_SHA_NI(x)
#endif
#if HAVE_CLMUL
#define IF_CLMUL(x) x
#else
#define IF_CLMUL(x)
#endif
#if HAVE_NEON_CRYPTO
#define IF_NEON_CRYPTO(x) x
#else
@ -271,6 +277,12 @@ VOLATILE_WRAPPED_DEFN(static, size_t, looplimit, (size_t x))
#define IF_NEON_SHA512(x)
#endif
#if HAVE_NEON_PMULL
#define IF_NEON_PMULL(x) x
#else
#define IF_NEON_PMULL(x)
#endif
/* Ciphers that we expect to pass this test. Blowfish and Arcfour are
* intentionally omitted, because we already know they don't. */
#define CIPHERS(X, Y) \
@ -280,28 +292,40 @@ VOLATILE_WRAPPED_DEFN(static, size_t, looplimit, (size_t x))
X(Y, ssh_des) \
X(Y, ssh_des_sshcom_ssh2) \
X(Y, ssh_aes256_sdctr) \
X(Y, ssh_aes256_gcm) \
X(Y, ssh_aes256_cbc) \
X(Y, ssh_aes192_sdctr) \
X(Y, ssh_aes192_gcm) \
X(Y, ssh_aes192_cbc) \
X(Y, ssh_aes128_sdctr) \
X(Y, ssh_aes128_gcm) \
X(Y, ssh_aes128_cbc) \
X(Y, ssh_aes256_sdctr_sw) \
X(Y, ssh_aes256_gcm_sw) \
X(Y, ssh_aes256_cbc_sw) \
X(Y, ssh_aes192_sdctr_sw) \
X(Y, ssh_aes192_gcm_sw) \
X(Y, ssh_aes192_cbc_sw) \
X(Y, ssh_aes128_sdctr_sw) \
X(Y, ssh_aes128_gcm_sw) \
X(Y, ssh_aes128_cbc_sw) \
IF_AES_NI(X(Y, ssh_aes256_sdctr_ni)) \
IF_AES_NI(X(Y, ssh_aes256_gcm_ni)) \
IF_AES_NI(X(Y, ssh_aes256_cbc_ni)) \
IF_AES_NI(X(Y, ssh_aes192_sdctr_ni)) \
IF_AES_NI(X(Y, ssh_aes192_gcm_ni)) \
IF_AES_NI(X(Y, ssh_aes192_cbc_ni)) \
IF_AES_NI(X(Y, ssh_aes128_sdctr_ni)) \
IF_AES_NI(X(Y, ssh_aes128_gcm_ni)) \
IF_AES_NI(X(Y, ssh_aes128_cbc_ni)) \
IF_NEON_CRYPTO(X(Y, ssh_aes256_sdctr_neon)) \
IF_NEON_CRYPTO(X(Y, ssh_aes256_gcm_neon)) \
IF_NEON_CRYPTO(X(Y, ssh_aes256_cbc_neon)) \
IF_NEON_CRYPTO(X(Y, ssh_aes192_sdctr_neon)) \
IF_NEON_CRYPTO(X(Y, ssh_aes192_gcm_neon)) \
IF_NEON_CRYPTO(X(Y, ssh_aes192_cbc_neon)) \
IF_NEON_CRYPTO(X(Y, ssh_aes128_sdctr_neon)) \
IF_NEON_CRYPTO(X(Y, ssh_aes128_gcm_neon)) \
IF_NEON_CRYPTO(X(Y, ssh_aes128_cbc_neon)) \
X(Y, ssh2_chacha20_poly1305) \
/* end of list */
@ -317,9 +341,17 @@ VOLATILE_WRAPPED_DEFN(static, size_t, looplimit, (size_t x))
X(Y, ssh_hmac_sha256) \
/* end of list */
#define ALL_MACS(X, Y) \
SIMPLE_MACS(X, Y) \
X(Y, poly1305) \
X(Y, aesgcm_sw_sw) \
X(Y, aesgcm_sw_refpoly) \
IF_AES_NI(X(Y, aesgcm_ni_sw)) \
IF_NEON_CRYPTO(X(Y, aesgcm_neon_sw)) \
IF_CLMUL(X(Y, aesgcm_sw_clmul)) \
IF_NEON_PMULL(X(Y, aesgcm_sw_neon)) \
IF_AES_NI(IF_CLMUL(X(Y, aesgcm_ni_clmul))) \
IF_NEON_CRYPTO(IF_NEON_PMULL(X(Y, aesgcm_neon_neon))) \
/* end of list */
#define MAC_TESTLIST(X, name) X(mac_ ## name)
@ -1473,6 +1505,58 @@ static void test_mac_poly1305(void)
test_mac(&ssh2_poly1305, &ssh2_chacha20_poly1305);
}
static void test_mac_aesgcm_sw_sw(void)
{
test_mac(&ssh2_aesgcm_mac_sw, &ssh_aes128_gcm_sw);
}
static void test_mac_aesgcm_sw_refpoly(void)
{
test_mac(&ssh2_aesgcm_mac_ref_poly, &ssh_aes128_gcm_sw);
}
#if HAVE_AES_NI
static void test_mac_aesgcm_ni_sw(void)
{
test_mac(&ssh2_aesgcm_mac_sw, &ssh_aes128_gcm_ni);
}
#endif
#if HAVE_NEON_CRYPTO
static void test_mac_aesgcm_neon_sw(void)
{
test_mac(&ssh2_aesgcm_mac_sw, &ssh_aes128_gcm_neon);
}
#endif
#if HAVE_CLMUL
static void test_mac_aesgcm_sw_clmul(void)
{
test_mac(&ssh2_aesgcm_mac_clmul, &ssh_aes128_gcm_sw);
}
#endif
#if HAVE_NEON_PMULL
static void test_mac_aesgcm_sw_neon(void)
{
test_mac(&ssh2_aesgcm_mac_neon, &ssh_aes128_gcm_sw);
}
#endif
#if HAVE_AES_NI && HAVE_CLMUL
static void test_mac_aesgcm_ni_clmul(void)
{
test_mac(&ssh2_aesgcm_mac_clmul, &ssh_aes128_gcm_ni);
}
#endif
#if HAVE_NEON_CRYPTO && HAVE_NEON_PMULL
static void test_mac_aesgcm_neon_neon(void)
{
test_mac(&ssh2_aesgcm_mac_neon, &ssh_aes128_gcm_neon);
}
#endif
static void test_hash(const ssh_hashalg *halg)
{
ssh_hash *h = ssh_hash_new(halg);

View File

@ -27,6 +27,21 @@ bool platform_aes_neon_available(void)
#endif
}
bool platform_pmull_neon_available(void)
{
#if defined HWCAP_PMULL
return getauxval(AT_HWCAP) & HWCAP_PMULL;
#elif defined HWCAP2_PMULL
return getauxval(AT_HWCAP2) & HWCAP2_PMULL;
#elif defined __APPLE__
SysctlResult res = test_sysctl_flag("hw.optional.arm.FEAT_PMULL");
/* As above, treat 'missing' as enabled */
return res != SYSCTL_OFF;
#else
return false;
#endif
}
bool platform_sha256_neon_available(void)
{
#if defined HWCAP_SHA2

View File

@ -20,6 +20,11 @@ bool platform_aes_neon_available(void)
return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
}
bool platform_pmull_neon_available(void)
{
return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
}
bool platform_sha256_neon_available(void)
{
return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);