diff --git a/cmake/cmake.h.in b/cmake/cmake.h.in index 6ed24b51..4ce869f4 100644 --- a/cmake/cmake.h.in +++ b/cmake/cmake.h.in @@ -49,7 +49,9 @@ #cmakedefine01 HAVE_AES_NI #cmakedefine01 HAVE_SHA_NI #cmakedefine01 HAVE_SHAINTRIN_H +#cmakedefine01 HAVE_CLMUL #cmakedefine01 HAVE_NEON_CRYPTO +#cmakedefine01 HAVE_NEON_PMULL #cmakedefine01 HAVE_NEON_SHA512 #cmakedefine01 HAVE_NEON_SHA512_INTRINSICS #cmakedefine01 USE_ARM64_NEON_H diff --git a/config.c b/config.c index 59b6976a..747af814 100644 --- a/config.c +++ b/config.c @@ -491,6 +491,7 @@ static void cipherlist_handler(dlgcontrol *ctrl, dlgparam *dlg, static const struct { const char *s; int c; } ciphers[] = { { "ChaCha20 (SSH-2 only)", CIPHER_CHACHA20 }, + { "AES-GCM (SSH-2 only)", CIPHER_AESGCM }, { "3DES", CIPHER_3DES }, { "Blowfish", CIPHER_BLOWFISH }, { "DES", CIPHER_DES }, diff --git a/crypto/CMakeLists.txt b/crypto/CMakeLists.txt index 08de141e..ff04efb5 100644 --- a/crypto/CMakeLists.txt +++ b/crypto/CMakeLists.txt @@ -2,6 +2,10 @@ add_sources_from_current_dir(crypto aes-common.c aes-select.c aes-sw.c + aesgcm-common.c + aesgcm-select.c + aesgcm-sw.c + aesgcm-ref-poly.c arcfour.c argon2.c bcrypt.c @@ -123,6 +127,16 @@ if(HAVE_WMMINTRIN_H) volatile __m128i r, a, b, c; int main(void) { r = _mm_sha256rnds2_epu32(a, b, c); }" ADD_SOURCES_IF_SUCCESSFUL sha256-ni.c sha1-ni.c) + + test_compile_with_flags(HAVE_CLMUL + GNU_FLAGS -msse4.1 -mpclmul + TEST_SOURCE " + #include <wmmintrin.h> + #include <smmintrin.h> + volatile __m128i r, a, b; + int main(void) { r = _mm_clmulepi64_si128(a, b, 5); + r = _mm_shuffle_epi8(r, a); }" + ADD_SOURCES_IF_SUCCESSFUL aesgcm-clmul.c) endif() # ---------------------------------------------------------------------- @@ -170,6 +184,17 @@ if(neon) int main(void) { r = vaeseq_u8(a, b); s = vsha256hq_u32(x, y, z); }" ADD_SOURCES_IF_SUCCESSFUL aes-neon.c sha256-neon.c sha1-neon.c) + test_compile_with_flags(HAVE_NEON_PMULL + GNU_FLAGS -march=armv8-a+crypto + MSVC_FLAGS -D_ARM_USE_NEW_NEON_INTRINSICS + TEST_SOURCE " + #include <${neon_header}> + volatile poly128_t r; + volatile poly64_t a, b; + volatile poly64x2_t u, v; + int main(void) { r = vmull_p64(a, b); r = vmull_high_p64(u, v); }" + ADD_SOURCES_IF_SUCCESSFUL aesgcm-neon.c) + # The 'sha3' architecture extension, despite the name, includes # support for SHA-512 (from the SHA-2 standard) as well as SHA-3 # proper. diff --git a/crypto/aes-common.c b/crypto/aes-common.c index e1c41ddf..3bed2af1 100644 --- a/crypto/aes-common.c +++ b/crypto/aes-common.c @@ -12,3 +12,9 @@ const uint8_t aes_key_setup_round_constants[10] = { * regardless of the key. */ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, }; + +void aesgcm_cipher_crypt_length( + ssh_cipher *cipher, void *blk, int len, unsigned long seq) +{ + /* Do nothing: lengths are sent in clear for this cipher. */ +} diff --git a/crypto/aes-neon.c b/crypto/aes-neon.c index f3b92832..5cd9f2d1 100644 --- a/crypto/aes-neon.c +++ b/crypto/aes-neon.c @@ -176,6 +176,18 @@ static inline uint8x16_t aes_neon_sdctr_increment(uint8x16_t in) vsubq_u64(vreinterpretq_u64_u8(in), subtrahend)); } +/* + * Much simpler auxiliary routine to increment the counter for GCM + * mode. This only has to increment the low word. + */ +static inline uint8x16_t aes_neon_gcm_increment(uint8x16_t in) +{ + uint32x4_t inw = vreinterpretq_u32_u8(in); + uint32x4_t ONE = vcombine_u32(vcreate_u32(0), vcreate_u32(1)); + inw = vaddq_u32(inw, ONE); + return vreinterpretq_u8_u32(inw); +} + /* * The SSH interface and the cipher modes.
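* (The GCM additions that follow parallel the SDCTR ones: an IV setter, a per-message counter bump, and a keystream loop built on aes_neon_gcm_increment above.)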
*/ @@ -227,6 +239,28 @@ static void aes_neon_setiv_sdctr(ssh_cipher *ciph, const void *iv) ctx->iv = aes_neon_sdctr_reverse(counter); } +static void aes_neon_setiv_gcm(ssh_cipher *ciph, const void *iv) +{ + aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); + uint8x16_t counter = vld1q_u8(iv); + ctx->iv = aes_neon_sdctr_reverse(counter); + ctx->iv = vreinterpretq_u8_u32(vsetq_lane_u32( + 1, vreinterpretq_u32_u8(ctx->iv), 2)); +} + +static void aes_neon_next_message_gcm(ssh_cipher *ciph) +{ + aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); + uint32x4_t iv = vreinterpretq_u32_u8(ctx->iv); + uint64_t msg_counter = vgetq_lane_u32(iv, 0); + msg_counter = (msg_counter << 32) | vgetq_lane_u32(iv, 3); + msg_counter++; + iv = vsetq_lane_u32(msg_counter >> 32, iv, 0); + iv = vsetq_lane_u32(msg_counter, iv, 3); + iv = vsetq_lane_u32(1, iv, 2); + ctx->iv = vreinterpretq_u8_u32(iv); +} + typedef uint8x16_t (*aes_neon_fn)(uint8x16_t v, const uint8x16_t *keysched); static inline void aes_cbc_neon_encrypt( @@ -275,6 +309,31 @@ static inline void aes_sdctr_neon( } } +static inline void aes_encrypt_ecb_block_neon( + ssh_cipher *ciph, void *blk, aes_neon_fn encrypt) +{ + aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); + uint8x16_t plaintext = vld1q_u8(blk); + uint8x16_t ciphertext = encrypt(plaintext, ctx->keysched_e); + vst1q_u8(blk, ciphertext); +} + +static inline void aes_gcm_neon( + ssh_cipher *ciph, void *vblk, int blklen, aes_neon_fn encrypt) +{ + aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); + + for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen; + blk < finish; blk += 16) { + uint8x16_t counter = aes_neon_sdctr_reverse(ctx->iv); + uint8x16_t keystream = encrypt(counter, ctx->keysched_e); + uint8x16_t input = vld1q_u8(blk); + uint8x16_t output = veorq_u8(input, keystream); + vst1q_u8(blk, output); + ctx->iv = aes_neon_gcm_increment(ctx->iv); + } +} + #define NEON_ENC_DEC(len) \ static void aes##len##_neon_cbc_encrypt( \ ssh_cipher *ciph, void *vblk, int blklen) \ @@ -285,6 +344,12 @@ static inline void aes_sdctr_neon( static void aes##len##_neon_sdctr( \ ssh_cipher *ciph, void *vblk, int blklen) \ { aes_sdctr_neon(ciph, vblk, blklen, aes_neon_##len##_e); } \ + static void aes##len##_neon_gcm( \ + ssh_cipher *ciph, void *vblk, int blklen) \ + { aes_gcm_neon(ciph, vblk, blklen, aes_neon_##len##_e); } \ + static void aes##len##_neon_encrypt_ecb_block( \ + ssh_cipher *ciph, void *vblk) \ + { aes_encrypt_ecb_block_neon(ciph, vblk, aes_neon_##len##_e); } NEON_ENC_DEC(128) NEON_ENC_DEC(192) diff --git a/crypto/aes-ni.c b/crypto/aes-ni.c index 22348de4..67d82b86 100644 --- a/crypto/aes-ni.c +++ b/crypto/aes-ni.c @@ -137,6 +137,16 @@ static inline __m128i aes_ni_sdctr_increment(__m128i v) return v; } +/* + * Much simpler auxiliary routine to increment the counter for GCM + * mode. This only has to increment the low word. + */ +static inline __m128i aes_ni_gcm_increment(__m128i v) +{ + const __m128i ONE = _mm_setr_epi32(1,0,0,0); + return _mm_add_epi32(v, ONE); +} + /* * Auxiliary routine to reverse the byte order of a vector, so that * the SDCTR IV can be made big-endian for feeding to the cipher. 
@@ -214,6 +224,25 @@ static void aes_ni_setiv_sdctr(ssh_cipher *ciph, const void *iv) ctx->iv = aes_ni_sdctr_reverse(counter); } +static void aes_ni_setiv_gcm(ssh_cipher *ciph, const void *iv) +{ + aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); + __m128i counter = _mm_loadu_si128(iv); + ctx->iv = aes_ni_sdctr_reverse(counter); + ctx->iv = _mm_insert_epi32(ctx->iv, 1, 0); +} + +static void aes_ni_next_message_gcm(ssh_cipher *ciph) +{ + aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); + uint32_t fixed = _mm_extract_epi32(ctx->iv, 3); + uint64_t msg_counter = _mm_extract_epi32(ctx->iv, 2); + msg_counter <<= 32; + msg_counter |= (uint32_t)_mm_extract_epi32(ctx->iv, 1); + msg_counter++; + ctx->iv = _mm_set_epi32(fixed, msg_counter >> 32, msg_counter, 1); +} + typedef __m128i (*aes_ni_fn)(__m128i v, const __m128i *keysched); static inline void aes_cbc_ni_encrypt( @@ -262,6 +291,31 @@ static inline void aes_sdctr_ni( } } +static inline void aes_encrypt_ecb_block_ni( + ssh_cipher *ciph, void *blk, aes_ni_fn encrypt) +{ + aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); + __m128i plaintext = _mm_loadu_si128(blk); + __m128i ciphertext = encrypt(plaintext, ctx->keysched_e); + _mm_storeu_si128(blk, ciphertext); +} + +static inline void aes_gcm_ni( + ssh_cipher *ciph, void *vblk, int blklen, aes_ni_fn encrypt) +{ + aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); + + for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen; + blk < finish; blk += 16) { + __m128i counter = aes_ni_sdctr_reverse(ctx->iv); + __m128i keystream = encrypt(counter, ctx->keysched_e); + __m128i input = _mm_loadu_si128((const __m128i *)blk); + __m128i output = _mm_xor_si128(input, keystream); + _mm_storeu_si128((__m128i *)blk, output); + ctx->iv = aes_ni_gcm_increment(ctx->iv); + } +} + #define NI_ENC_DEC(len) \ static void aes##len##_ni_cbc_encrypt( \ ssh_cipher *ciph, void *vblk, int blklen) \ @@ -272,6 +326,12 @@ static inline void aes_sdctr_ni( static void aes##len##_ni_sdctr( \ ssh_cipher *ciph, void *vblk, int blklen) \ { aes_sdctr_ni(ciph, vblk, blklen, aes_ni_##len##_e); } \ + static void aes##len##_ni_gcm( \ + ssh_cipher *ciph, void *vblk, int blklen) \ + { aes_gcm_ni(ciph, vblk, blklen, aes_ni_##len##_e); } \ + static void aes##len##_ni_encrypt_ecb_block( \ + ssh_cipher *ciph, void *vblk) \ + { aes_encrypt_ecb_block_ni(ciph, vblk, aes_ni_##len##_e); } NI_ENC_DEC(128) NI_ENC_DEC(192) diff --git a/crypto/aes-select.c b/crypto/aes-select.c index f0c5031f..892e7b58 100644 --- a/crypto/aes-select.c +++ b/crypto/aes-select.c @@ -39,7 +39,7 @@ static ssh_cipher *aes_select(const ssh_cipheralg *alg) #define IF_NEON(...) #endif -#define AES_SELECTOR_VTABLE(mode_c, mode_protocol, mode_display, bits) \ +#define AES_SELECTOR_VTABLE(mode_c, mode_protocol, mode_display, bits, ...) 
\ static const ssh_cipheralg * \ ssh_aes ## bits ## _ ## mode_c ## _impls[] = { \ IF_NI(&ssh_aes ## bits ## _ ## mode_c ## _ni,) \ IF_NEON(&ssh_aes ## bits ## _ ## mode_c ## _neon,) \ &ssh_aes ## bits ## _ ## mode_c ## _sw, \ NULL, \ }; \ @@ -56,6 +56,7 @@ static ssh_cipher *aes_select(const ssh_cipheralg *alg) .text_name = "AES-" #bits " " mode_display \ " (dummy selector vtable)", \ .extra = ssh_aes ## bits ## _ ## mode_c ## _impls, \ + __VA_ARGS__ \ } AES_SELECTOR_VTABLE(cbc, "cbc", "CBC", 128); @@ -64,6 +65,17 @@ AES_SELECTOR_VTABLE(cbc, "cbc", "CBC", 256); AES_SELECTOR_VTABLE(sdctr, "ctr", "SDCTR", 128); AES_SELECTOR_VTABLE(sdctr, "ctr", "SDCTR", 192); AES_SELECTOR_VTABLE(sdctr, "ctr", "SDCTR", 256); +AES_SELECTOR_VTABLE(gcm, "gcm@openssh.com", "GCM", 128, + .required_mac = &ssh2_aesgcm_mac); +AES_SELECTOR_VTABLE(gcm, "gcm@openssh.com", "GCM", 256, + .required_mac = &ssh2_aesgcm_mac); + +/* 192-bit AES-GCM is included only so that testcrypt can run standard + * test vectors against it. OpenSSH doesn't define a protocol id for + * it. Hence the silly macro trick here to set its ssh2_id to 0, and + * more importantly, leaving it out of aesgcm_list[] below. */ +AES_SELECTOR_VTABLE(gcm, "gcm@openssh.com" ? NULL : NULL, "GCM", 192, + .required_mac = &ssh2_aesgcm_mac); static const ssh_cipheralg ssh_rijndael_lysator = { /* Same as aes256_cbc, but with a different protocol ID */ @@ -87,3 +99,12 @@ static const ssh_cipheralg *const aes_list[] = { }; const ssh2_ciphers ssh2_aes = { lenof(aes_list), aes_list }; + +static const ssh_cipheralg *const aesgcm_list[] = { + /* OpenSSH only defines protocol ids for 128- and 256-bit AES-GCM, + * not 192-bit. */ + &ssh_aes128_gcm, + &ssh_aes256_gcm, +}; + +const ssh2_ciphers ssh2_aesgcm = { lenof(aesgcm_list), aesgcm_list }; diff --git a/crypto/aes-sw.c b/crypto/aes-sw.c index f8512388..aaa3c475 100644 --- a/crypto/aes-sw.c +++ b/crypto/aes-sw.c @@ -827,6 +827,18 @@ struct aes_sw_context { uint8_t keystream[SLICE_PARALLELISM * 16]; uint8_t *keystream_pos; } sdctr; + struct { + /* In GCM mode, the cipher preimage consists of three + * sections: one fixed, one that increments per message + * sent and MACed, and one that increments per cipher + * block. */ + uint64_t msg_counter; + uint32_t fixed_iv, block_counter; + /* But we keep the precomputed keystream chunks just like + * SDCTR mode.
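+ * (Preimage layout, as read by aes_sw_setiv_gcm below: fixed_iv + * (4 bytes), then msg_counter (8 bytes), then block_counter + * (4 bytes), with the block counter starting at 1 for each message.)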
*/ + uint8_t keystream[SLICE_PARALLELISM * 16]; + uint8_t *keystream_pos; + } gcm; } iv; ssh_cipher ciph; }; @@ -874,6 +886,31 @@ static void aes_sw_setiv_sdctr(ssh_cipher *ciph, const void *viv) ctx->iv.sdctr.keystream + sizeof(ctx->iv.sdctr.keystream); } +static void aes_sw_setiv_gcm(ssh_cipher *ciph, const void *viv) +{ + aes_sw_context *ctx = container_of(ciph, aes_sw_context, ciph); + const uint8_t *iv = (const uint8_t *)viv; + + ctx->iv.gcm.fixed_iv = GET_32BIT_MSB_FIRST(iv); + ctx->iv.gcm.msg_counter = GET_64BIT_MSB_FIRST(iv + 4); + ctx->iv.gcm.block_counter = 1; + + /* Set keystream_pos to indicate that the keystream cache is + * currently empty */ + ctx->iv.gcm.keystream_pos = + ctx->iv.gcm.keystream + sizeof(ctx->iv.gcm.keystream); +} + +static void aes_sw_next_message_gcm(ssh_cipher *ciph) +{ + aes_sw_context *ctx = container_of(ciph, aes_sw_context, ciph); + + ctx->iv.gcm.msg_counter++; + ctx->iv.gcm.block_counter = 1; + ctx->iv.gcm.keystream_pos = + ctx->iv.gcm.keystream + sizeof(ctx->iv.gcm.keystream); +} + typedef void (*aes_sw_fn)(uint32_t v[4], const uint32_t *keysched); static inline void memxor16(void *vout, const void *vlhs, const void *vrhs) @@ -1021,6 +1058,56 @@ static inline void aes_sdctr_sw( } } +static inline void aes_encrypt_ecb_block_sw(ssh_cipher *ciph, void *blk) +{ + aes_sw_context *ctx = container_of(ciph, aes_sw_context, ciph); + aes_sliced_e_serial(blk, blk, &ctx->sk); +} + +static inline void aes_gcm_sw( + ssh_cipher *ciph, void *vblk, int blklen) +{ + aes_sw_context *ctx = container_of(ciph, aes_sw_context, ciph); + + /* + * GCM encrypt/decrypt looks just like SDCTR, except that the + * method of generating more keystream varies slightly. + */ + + uint8_t *keystream_end = + ctx->iv.gcm.keystream + sizeof(ctx->iv.gcm.keystream); + + for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen; + blk < finish; blk += 16) { + + if (ctx->iv.gcm.keystream_pos == keystream_end) { + /* + * Generate some keystream. + */ + for (uint8_t *block = ctx->iv.gcm.keystream; + block < keystream_end; block += 16) { + /* Format the counter value into the buffer. */ + PUT_32BIT_MSB_FIRST(block, ctx->iv.gcm.fixed_iv); + PUT_64BIT_MSB_FIRST(block + 4, ctx->iv.gcm.msg_counter); + PUT_32BIT_MSB_FIRST(block + 12, ctx->iv.gcm.block_counter); + + /* Increment the counter. */ + ctx->iv.gcm.block_counter++; + } + + /* Encrypt all those counter blocks. */ + aes_sliced_e_parallel(ctx->iv.gcm.keystream, + ctx->iv.gcm.keystream, &ctx->sk); + + /* Reset keystream_pos to the start of the buffer. */ + ctx->iv.gcm.keystream_pos = ctx->iv.gcm.keystream; + } + + memxor16(blk, blk, ctx->iv.gcm.keystream_pos); + ctx->iv.gcm.keystream_pos += 16; + } +} + #define SW_ENC_DEC(len) \ static void aes##len##_sw_cbc_encrypt( \ ssh_cipher *ciph, void *vblk, int blklen) \ @@ -1030,7 +1117,13 @@ static inline void aes_sdctr_sw( { aes_cbc_sw_decrypt(ciph, vblk, blklen); } \ static void aes##len##_sw_sdctr( \ ssh_cipher *ciph, void *vblk, int blklen) \ - { aes_sdctr_sw(ciph, vblk, blklen); } + { aes_sdctr_sw(ciph, vblk, blklen); } \ + static void aes##len##_sw_gcm( \ + ssh_cipher *ciph, void *vblk, int blklen) \ + { aes_gcm_sw(ciph, vblk, blklen); } \ + static void aes##len##_sw_encrypt_ecb_block( \ + ssh_cipher *ciph, void *vblk) \ + { aes_encrypt_ecb_block_sw(ciph, vblk); } SW_ENC_DEC(128) SW_ENC_DEC(192) diff --git a/crypto/aes.h b/crypto/aes.h index 433306ab..cab5b989 100644 --- a/crypto/aes.h +++ b/crypto/aes.h @@ -15,6 +15,11 @@ struct aes_extra { /* Point to a writable substructure. 
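(It has to be a separate substructure because this one is declared const.)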
*/ struct aes_extra_mutable *mut; + + /* Extra API function specific to AES, to encrypt a single block + * in ECB mode without touching the IV. Used by AES-GCM MAC + * setup. */ + void (*encrypt_ecb_block)(ssh_cipher *, void *); }; struct aes_extra_mutable { bool checked_availability; @@ -30,6 +35,17 @@ static inline bool check_availability(const struct aes_extra *extra) return extra->mut->is_available; } +/* Shared stub function for all the AES-GCM vtables. */ +void aesgcm_cipher_crypt_length( + ssh_cipher *cipher, void *blk, int len, unsigned long seq); + +/* External entry point for the encrypt_ecb_block function. */ +static inline void aes_encrypt_ecb_block(ssh_cipher *ciph, void *blk) +{ + const struct aes_extra *extra = ciph->vt->extra; + extra->encrypt_ecb_block(ciph, blk); +} + /* * Macros to define vtables for AES variants. There are a lot of * these, because of the cross product between cipher modes, key @@ -37,13 +53,19 @@ static inline bool check_availability(const struct aes_extra *extra) * some effort here to reduce the boilerplate in the sub-files. */ -#define AES_EXTRA(impl_c) \ +#define AES_EXTRA_BITS(impl_c, bits) \ static struct aes_extra_mutable aes ## impl_c ## _extra_mut; \ - static const struct aes_extra aes ## impl_c ## _extra = { \ + static const struct aes_extra aes ## bits ## impl_c ## _extra = { \ .check_available = aes ## impl_c ## _available, \ .mut = &aes ## impl_c ## _extra_mut, \ + .encrypt_ecb_block = &aes ## bits ## impl_c ## _encrypt_ecb_block, \ } +#define AES_EXTRA(impl_c) \ + AES_EXTRA_BITS(impl_c, 128); \ + AES_EXTRA_BITS(impl_c, 192); \ + AES_EXTRA_BITS(impl_c, 256) + #define AES_CBC_VTABLE(impl_c, impl_display, bits) \ const ssh_cipheralg ssh_aes ## bits ## _cbc ## impl_c = { \ .new = aes ## impl_c ## _new, \ @@ -59,7 +81,7 @@ static inline bool check_availability(const struct aes_extra *extra) .padded_keybytes = bits/8, \ .flags = SSH_CIPHER_IS_CBC, \ .text_name = "AES-" #bits " CBC (" impl_display ")", \ - .extra = &aes ## impl_c ## _extra, \ + .extra = &aes ## bits ## impl_c ## _extra, \ } #define AES_SDCTR_VTABLE(impl_c, impl_display, bits) \ @@ -77,7 +99,31 @@ static inline bool check_availability(const struct aes_extra *extra) .padded_keybytes = bits/8, \ .flags = 0, \ .text_name = "AES-" #bits " SDCTR (" impl_display ")", \ - .extra = &aes ## impl_c ## _extra, \ + .extra = &aes ## bits ## impl_c ## _extra, \ + } + +#define AES_GCM_VTABLE(impl_c, impl_display, bits) \ + const ssh_cipheralg ssh_aes ## bits ## _gcm ## impl_c = { \ + .new = aes ## impl_c ## _new, \ + .free = aes ## impl_c ## _free, \ + .setiv = aes ## impl_c ## _setiv_gcm, \ + .setkey = aes ## impl_c ## _setkey, \ + .encrypt = aes ## bits ## impl_c ## _gcm, \ + .decrypt = aes ## bits ## impl_c ## _gcm, \ + .encrypt_length = aesgcm_cipher_crypt_length, \ + .decrypt_length = aesgcm_cipher_crypt_length, \ + .next_message = aes ## impl_c ## _next_message_gcm, \ + /* 192-bit AES-GCM is included only so that testcrypt can run \ + * standard test vectors against it. OpenSSH doesn't define a \ + * protocol id for it. So we set its ssh2_id to NULL. */ \ + .ssh2_id = bits==192 ? 
NULL : "aes" #bits "-gcm@openssh.com", \ + .blksize = 16, \ + .real_keybits = bits, \ + .padded_keybytes = bits/8, \ + .flags = SSH_CIPHER_SEPARATE_LENGTH, \ + .text_name = "AES-" #bits " GCM (" impl_display ")", \ + .required_mac = &ssh2_aesgcm_mac, \ + .extra = &aes ## bits ## impl_c ## _extra, \ } #define AES_ALL_VTABLES(impl_c, impl_display) \ @@ -86,7 +132,10 @@ static inline bool check_availability(const struct aes_extra *extra) AES_CBC_VTABLE(impl_c, impl_display, 256); \ AES_SDCTR_VTABLE(impl_c, impl_display, 128); \ AES_SDCTR_VTABLE(impl_c, impl_display, 192); \ - AES_SDCTR_VTABLE(impl_c, impl_display, 256) + AES_SDCTR_VTABLE(impl_c, impl_display, 256); \ + AES_GCM_VTABLE(impl_c, impl_display, 128); \ + AES_GCM_VTABLE(impl_c, impl_display, 192); \ + AES_GCM_VTABLE(impl_c, impl_display, 256) /* * Macros to repeat a piece of code particular numbers of times that diff --git a/crypto/aesgcm-clmul.c b/crypto/aesgcm-clmul.c new file mode 100644 index 00000000..cfb72e26 --- /dev/null +++ b/crypto/aesgcm-clmul.c @@ -0,0 +1,180 @@ +/* + * Implementation of the GCM polynomial hash using the x86 CLMUL + * extension, which provides 64x64->128 polynomial multiplication (or + * 'carry-less', which is what the CL stands for). + * + * Follows the reference implementation in aesgcm-ref-poly.c; see + * there for comments on the underlying technique. Here the comments + * just discuss the x86-specific details. + */ + +#include +#include + +#if defined(__clang__) || defined(__GNUC__) +#include +#define GET_CPU_ID(out) __cpuid(1, (out)[0], (out)[1], (out)[2], (out)[3]) +#else +#define GET_CPU_ID(out) __cpuid(out, 1) +#endif + +#include "ssh.h" +#include "aesgcm.h" + +typedef struct aesgcm_clmul { + AESGCM_COMMON_FIELDS; + __m128i var, acc, mask; + void *ptr_to_free; +} aesgcm_clmul; + +static bool aesgcm_clmul_available(void) +{ + /* + * Determine if CLMUL is available on this CPU. + */ + unsigned int CPUInfo[4]; + GET_CPU_ID(CPUInfo); + return (CPUInfo[2] & (1 << 1)); +} + +/* + * __m128i has to be aligned to 16 bytes, and x86 mallocs may not + * guarantee that, so we must over-allocate to make sure a large + * enough 16-byte region can be found, and ensure the aesgcm_clmul + * struct pointer is at least that well aligned. + */ +#define SPECIAL_ALLOC +static aesgcm_clmul *aesgcm_clmul_alloc(void) +{ + char *p = smalloc(sizeof(aesgcm_clmul) + 15); + uintptr_t ip = (uintptr_t)p; + ip = (ip + 15) & ~15; + aesgcm_clmul *ctx = (aesgcm_clmul *)ip; + memset(ctx, 0, sizeof(aesgcm_clmul)); + ctx->ptr_to_free = p; + return ctx; +} + +#define SPECIAL_FREE +static void aesgcm_clmul_free(aesgcm_clmul *ctx) +{ + void *ptf = ctx->ptr_to_free; + smemclr(ctx, sizeof(*ctx)); + sfree(ptf); +} + +/* Helper function to reverse the 16 bytes in a 128-bit vector */ +static inline __m128i mm_byteswap(__m128i vec) +{ + const __m128i reverse = _mm_set_epi64x( + 0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL); + return _mm_shuffle_epi8(vec, reverse); +} + +/* Helper function to swap the two 64-bit words in a 128-bit vector */ +static inline __m128i mm_wordswap(__m128i vec) +{ + return _mm_shuffle_epi32(vec, 0x4E); +} + +/* Load and store a 128-bit vector in big-endian fashion */ +static inline __m128i mm_load_be(const void *p) +{ + return mm_byteswap(_mm_loadu_si128(p)); +} +static inline void mm_store_be(void *p, __m128i vec) +{ + _mm_storeu_si128(p, mm_byteswap(vec)); +} + +/* + * Key setup is just like in aesgcm-ref-poly.c. There's no point using + * vector registers to accelerate this, because it happens rarely. 
+ */ +static void aesgcm_clmul_setkey_impl(aesgcm_clmul *ctx, + const unsigned char *var) +{ + uint64_t hi = GET_64BIT_MSB_FIRST(var); + uint64_t lo = GET_64BIT_MSB_FIRST(var + 8); + + uint64_t bit = 1 & (hi >> 63); + hi = (hi << 1) ^ (lo >> 63); + lo = (lo << 1) ^ bit; + hi ^= 0xC200000000000000 & -bit; + + ctx->var = _mm_set_epi64x(hi, lo); +} + +static inline void aesgcm_clmul_setup(aesgcm_clmul *ctx, + const unsigned char *mask) +{ + ctx->mask = mm_load_be(mask); + ctx->acc = _mm_set_epi64x(0, 0); +} + +/* + * Folding a coefficient into the accumulator is done by essentially + * the algorithm in aesgcm-ref-poly.c. I don't speak these intrinsics + * all that well, so in the parts where I needed to XOR half of one + * vector into half of another, I did a lot of faffing about with + * masks like 0xFFFFFFFFFFFFFFFF0000000000000000. Very likely this can + * be streamlined by a better x86-speaker than me. Patches welcome. + */ +static inline void aesgcm_clmul_coeff(aesgcm_clmul *ctx, + const unsigned char *coeff) +{ + ctx->acc = _mm_xor_si128(ctx->acc, mm_load_be(coeff)); + + /* Compute ah^al and bh^bl by word-swapping each of a and b and + * XORing with the original. That does more work than necessary - + * you end up with each of the desired values repeated twice - + * but I don't know of a neater way. */ + __m128i aswap = mm_wordswap(ctx->acc); + __m128i vswap = mm_wordswap(ctx->var); + aswap = _mm_xor_si128(ctx->acc, aswap); + vswap = _mm_xor_si128(ctx->var, vswap); + + /* Do the three multiplications required by Karatsuba */ + __m128i md = _mm_clmulepi64_si128(aswap, vswap, 0x00); + __m128i lo = _mm_clmulepi64_si128(ctx->acc, ctx->var, 0x00); + __m128i hi = _mm_clmulepi64_si128(ctx->acc, ctx->var, 0x11); + /* Combine lo and hi into md */ + md = _mm_xor_si128(md, lo); + md = _mm_xor_si128(md, hi); + + /* Now we must XOR the high half of md into the low half of hi, + * and the low half of md into the high half of lo. Simplest thing + * is to swap the words of md (so that each one lines up with the + * register it's going to end up in), and then mask one off in + * each case. */ + md = mm_wordswap(md); + lo = _mm_xor_si128(lo, _mm_and_si128(md, _mm_set_epi64x(~0ULL, 0ULL))); + hi = _mm_xor_si128(hi, _mm_and_si128(md, _mm_set_epi64x(0ULL, ~0ULL))); + + /* The reduction stage is transformed similarly from the version + * in aesgcm-ref-poly.c.
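+ * (A reminder of the _mm_clmulepi64_si128 immediate: bit 0 selects + * which 64-bit half of the first operand to use, and bit 4 which + * half of the second. So 0x00 multiplies the two low halves, and + * the 0x10 below multiplies the constant's low half by the high + * half of r1.)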
*/ __m128i r1 = _mm_clmulepi64_si128(_mm_set_epi64x(0, 0xC200000000000000), + lo, 0x00); + r1 = mm_wordswap(r1); + r1 = _mm_xor_si128(r1, lo); + hi = _mm_xor_si128(hi, _mm_and_si128(r1, _mm_set_epi64x(~0ULL, 0ULL))); + + __m128i r2 = _mm_clmulepi64_si128(_mm_set_epi64x(0, 0xC200000000000000), + r1, 0x10); + hi = _mm_xor_si128(hi, r2); + hi = _mm_xor_si128(hi, _mm_and_si128(r1, _mm_set_epi64x(0ULL, ~0ULL))); + + ctx->acc = hi; +} + +static inline void aesgcm_clmul_output(aesgcm_clmul *ctx, + unsigned char *output) +{ + mm_store_be(output, _mm_xor_si128(ctx->acc, ctx->mask)); + smemclr(&ctx->acc, 16); + smemclr(&ctx->mask, 16); +} + +#define AESGCM_FLAVOUR clmul +#define AESGCM_NAME "CLMUL accelerated" +#include "aesgcm-footer.h" diff --git a/crypto/aesgcm-common.c b/crypto/aesgcm-common.c new file mode 100644 index 00000000..1e20c87b --- /dev/null +++ b/crypto/aesgcm-common.c @@ -0,0 +1,8 @@ +#include "ssh.h" +#include "aesgcm.h" + +void aesgcm_set_prefix_lengths(ssh2_mac *mac, size_t skip, size_t aad) +{ + const struct aesgcm_extra *extra = mac->vt->extra; + extra->set_prefix_lengths(mac, skip, aad); +} diff --git a/crypto/aesgcm-footer.h b/crypto/aesgcm-footer.h new file mode 100644 index 00000000..981905da --- /dev/null +++ b/crypto/aesgcm-footer.h @@ -0,0 +1,368 @@ +/* + * Common footer included by every implementation of the AES-GCM MAC. + * + * The difficult part of AES-GCM, which is done differently depending + * on what hardware acceleration is available, is the actual + * evaluation of a polynomial over GF(2^128) whose coefficients are + * 128-bit chunks of data. But preparing those chunks in the first + * place (out of the ciphertext, associated data, and an + * administrative block containing the lengths of both) is done in the + * same way no matter what technique is used for the evaluation, so + * that's centralised into this file, along with as much of the other + * functionality as possible. + * + * This footer file is #included by each implementation, but each one + * will define its own struct type for the state, so that each alloc + * function will test sizeof() a different structure, and similarly + * for free when it zeroes out the state on cleanup. + * + * The functions in the source file may be defined as 'inline' so that + * the functions in here can inline them. The 'coeff' function in + * particular probably should be, because that's called once per + * 16-byte block, so eliminating function call overheads is especially + * useful there. + * + * This footer has the following expectations from the source file + * that #includes it: + * + * - define AESGCM_FLAVOUR to be a fragment of a C identifier that + * will be included in all the function names (both the ones + * defined in the implementation source file and those in here). + * For example purposes below I'll suppose that this is 'foo'. + * + * - define AESGCM_NAME to be a string literal that will be included + * in the display name of the implementation. + * + * - define a typedef 'aesgcm_foo' to be the state structure for the + * implementation, and inside that structure, expand the macro + * AESGCM_COMMON_FIELDS defined in aesgcm.h + * + * - define the following functions: + * + * // Determine whether this implementation is available at run time + * static bool aesgcm_foo_available(void); + * + * // Set up the 'key' of the polynomial part of the MAC, that is, + * // the value at which the polynomial will be evaluated. 'var' is + * // a 16-byte data block in the byte order it comes out of AES.
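+ * // (In the GCM spec's notation this is H, the hash key, i.e. the + * // encryption of the all-zeroes block; mac_setkey below arranges + * // exactly that.)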
+ * static void aesgcm_foo_setkey_impl(aesgcm_foo *ctx, + * const unsigned char *var); + * + * // Set up at the start of evaluating an individual polynomial. + * // 'mask' is the 16-byte data block that will be XORed into the + * // output value of the polynomial, also in AES byte order. This + * // function should store 'mask' in whatever form is most + * // convenient, and initialise an accumulator to zero. + * static void aesgcm_foo_setup(aesgcm_foo *ctx, + * const unsigned char *mask); + * + * // Fold in a coefficient of the polynomial, by means of XORing + * // it into the accumulator and then multiplying the accumulator + * // by the variable passed to setkey_impl() above. + * // + * // 'coeff' points to the 16-byte block of data that the + * // polynomial coefficient will be made out of. + * // + * // You probably want to mark this function 'inline'. + * static void aesgcm_foo_coeff(aesgcm_foo *ctx, + * const unsigned char *coeff); + * + * // Generate the output MAC, by XORing the accumulator's final + * // value with the mask passed to setup() above. + * // + * // 'output' points to a 16-byte region of memory to write the + * // result to. + * static void aesgcm_foo_output(aesgcm_foo *ctx, + * unsigned char *output); + * + * - if allocation of the state structure must be done in a + * non-standard way (e.g. x86 needs this to force greater alignment + * than standard malloc provides), then #define SPECIAL_ALLOC and + * define this additional function: + * + * // Allocate a state structure, zero out its contents, and return it. + * static aesgcm_foo *aesgcm_foo_alloc(void); + * + * - if freeing must also be done in an unusual way, #define + * SPECIAL_FREE and define this function: + * + * // Zero out the state structure to avoid information leaks if the + * // memory is reused, and then free it. + * static void aesgcm_foo_free(aesgcm_foo *ctx); + */ + +#ifndef AESGCM_FLAVOUR +#error AESGCM_FLAVOUR must be defined by any module including this footer +#endif +#ifndef AESGCM_NAME +#error AESGCM_NAME must be defined by any module including this footer +#endif + +#define CONTEXT CAT(aesgcm_, AESGCM_FLAVOUR) +#define PREFIX(name) CAT(CAT(aesgcm_, AESGCM_FLAVOUR), CAT(_, name)) + +#include "aes.h" // for aes_encrypt_ecb_block + +static const char *PREFIX(mac_text_name)(ssh2_mac *mac) +{ + return "AES-GCM (" AESGCM_NAME ")"; +} + +static void PREFIX(mac_next_message)(ssh2_mac *mac) +{ + CONTEXT *ctx = container_of(mac, CONTEXT, mac); + + /* + * Make the mask value for a single MAC instance, by encrypting + * the all-zeroes word using the associated AES instance in its + * ordinary GCM fashion. This consumes the first block of + * keystream (with per-block counter equal to 1), leaving the + * second block of keystream ready to be used on the first block + * of plaintext. + */ + unsigned char buf[16]; + memset(buf, 0, 16); + ssh_cipher_encrypt(ctx->cipher, buf, 16); + PREFIX(setup)(ctx, buf); /* give it to the implementation to store */ + smemclr(buf, sizeof(buf)); +} + +static void PREFIX(mac_setkey)(ssh2_mac *mac, ptrlen key) +{ + CONTEXT *ctx = container_of(mac, CONTEXT, mac); + + /* + * Make the value of the polynomial variable, by encrypting the + * all-zeroes word using the associated AES instance in the + * special ECB mode. This is done via the special AES-specific API + * function encrypt_ecb_block, which doesn't touch the counter + * state at all. 
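+ * (So in the GCM spec's terms, 'var' is H, i.e. E_K applied to the + * all-zero block, while the mask set up by mac_next_message above is + * the first keystream block, E_K of the nonce with per-block + * counter 1.)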
+ */ + unsigned char var[16]; + memset(var, 0, 16); + aes_encrypt_ecb_block(ctx->cipher, var); + PREFIX(setkey_impl)(ctx, var); + smemclr(var, sizeof(var)); + + PREFIX(mac_next_message)(mac); /* set up mask */ +} + +static void PREFIX(mac_start)(ssh2_mac *mac) +{ + CONTEXT *ctx = container_of(mac, CONTEXT, mac); + + ctx->skipgot = ctx->aadgot = ctx->ciphertextlen = ctx->partlen = 0; +} + +/* + * Handle receiving data via the BinarySink API and turning it into a + * collection of 16-byte blocks to use as polynomial coefficients. + * + * This code is written in a fully general way, which is able to + * handle an arbitrary number of bytes at the start of the data to + * ignore completely (necessary for PuTTY integration), and an + * arbitrary number to treat as associated data, and the rest will be + * regarded as ciphertext. The stream can be interrupted at any byte + * position and resumed later; a partial block will be stored as + * necessary. + * + * At the time of writing this comment, in live use most of that + * generality isn't required: the full data is passed to this function + * in just one call. But there's no guarantee of that staying true in + * future, so we do the full deal here just in case, and the test + * vectors in cryptsuite.py will test it. (And they'll use + * set_prefix_lengths to set up different configurations from the SSH + * usage.) + */ +static void PREFIX(mac_BinarySink_write)( + BinarySink *bs, const void *blkv, size_t len) +{ + CONTEXT *ctx = BinarySink_DOWNCAST(bs, CONTEXT); + const unsigned char *blk = (const unsigned char *)blkv; + + /* + * Skip the prefix sequence number used as implicit extra data in + * SSH MACs. This is not included in the associated data field for + * GCM, because the IV incrementation policy provides its own + * sequence numbering. + */ + if (ctx->skipgot < ctx->skiplen) { + size_t n = ctx->skiplen - ctx->skipgot; + if (n > len) + n = len; + blk += n; + len -= n; + ctx->skipgot += n; + + if (len == 0) + return; + } + + /* + * Read additional authenticated data and fold it in to the MAC. + */ + while (ctx->aadgot < ctx->aadlen) { + size_t n = ctx->aadlen - ctx->aadgot; + if (n > len) + n = len; + + if (ctx->partlen || n < 16) { + /* + * Fold data into the partial block. + */ + if (n > 16 - ctx->partlen) + n = 16 - ctx->partlen; + memcpy(ctx->partblk + ctx->partlen, blk, n); + ctx->partlen += n; + } else if (n >= 16) { + /* + * Consume a whole block of AAD. + */ + PREFIX(coeff)(ctx, blk); + n = 16; + } + blk += n; + len -= n; + ctx->aadgot += n; + + if (ctx->partlen == 16) { + PREFIX(coeff)(ctx, ctx->partblk); + ctx->partlen = 0; + } + + if (ctx->aadgot == ctx->aadlen && ctx->partlen) { + memset(ctx->partblk + ctx->partlen, 0, 16 - ctx->partlen); + PREFIX(coeff)(ctx, ctx->partblk); + ctx->partlen = 0; + } + + if (len == 0) + return; + } + + /* + * Read the main ciphertext and fold it in to the MAC. + */ + while (len > 0) { + size_t n = len; + + if (ctx->partlen || n < 16) { + /* + * Fold data into the partial block. + */ + if (n > 16 - ctx->partlen) + n = 16 - ctx->partlen; + memcpy(ctx->partblk + ctx->partlen, blk, n); + ctx->partlen += n; + } else if (n >= 16) { + /* + * Consume a whole block of ciphertext. 
+ */ + PREFIX(coeff)(ctx, blk); + n = 16; + } + blk += n; + len -= n; + ctx->ciphertextlen += n; + + if (ctx->partlen == 16) { + PREFIX(coeff)(ctx, ctx->partblk); + ctx->partlen = 0; + } + } +} + +static void PREFIX(mac_genresult)(ssh2_mac *mac, unsigned char *output) +{ + CONTEXT *ctx = container_of(mac, CONTEXT, mac); + + /* + * Consume any partial block of ciphertext remaining. + */ + if (ctx->partlen) { + memset(ctx->partblk + ctx->partlen, 0, 16 - ctx->partlen); + PREFIX(coeff)(ctx, ctx->partblk); + } + + /* + * Consume the final block giving the lengths of the AAD and ciphertext. + */ + unsigned char blk[16]; + memset(blk, 0, 16); + PUT_64BIT_MSB_FIRST(blk, ctx->aadlen * 8); + PUT_64BIT_MSB_FIRST(blk + 8, ctx->ciphertextlen * 8); + PREFIX(coeff)(ctx, blk); + + /* + * And call the implementation's output function. + */ + PREFIX(output)(ctx, output); + + smemclr(blk, sizeof(blk)); + smemclr(ctx->partblk, 16); +} + +static ssh2_mac *PREFIX(mac_new)(const ssh2_macalg *alg, ssh_cipher *cipher) +{ + const struct aesgcm_extra *extra = alg->extra; + if (!check_aesgcm_availability(extra)) + return NULL; + +#ifdef SPECIAL_ALLOC + CONTEXT *ctx = PREFIX(alloc)(); +#else + CONTEXT *ctx = snew(CONTEXT); + memset(ctx, 0, sizeof(CONTEXT)); +#endif + + ctx->mac.vt = alg; + ctx->cipher = cipher; + /* Default values for SSH-2, overridable by set_prefix_lengths for + * testcrypt purposes */ + ctx->skiplen = 4; + ctx->aadlen = 4; + BinarySink_INIT(ctx, PREFIX(mac_BinarySink_write)); + BinarySink_DELEGATE_INIT(&ctx->mac, ctx); + return &ctx->mac; +} + +static void PREFIX(set_prefix_lengths)(ssh2_mac *mac, size_t skip, size_t aad) +{ + CONTEXT *ctx = container_of(mac, CONTEXT, mac); + ctx->skiplen = skip; + ctx->aadlen = aad; +} + +static void PREFIX(mac_free)(ssh2_mac *mac) +{ + CONTEXT *ctx = container_of(mac, CONTEXT, mac); +#ifdef SPECIAL_FREE + PREFIX(free)(ctx); +#else + smemclr(ctx, sizeof(*ctx)); + sfree(ctx); +#endif +} + +static struct aesgcm_extra_mutable PREFIX(extra_mut); + +static const struct aesgcm_extra PREFIX(extra) = { + .check_available = PREFIX(available), + .mut = &PREFIX(extra_mut), + .set_prefix_lengths = PREFIX(set_prefix_lengths), +}; + +const ssh2_macalg CAT(ssh2_aesgcm_mac_, AESGCM_FLAVOUR) = { + .new = PREFIX(mac_new), + .free = PREFIX(mac_free), + .setkey = PREFIX(mac_setkey), + .start = PREFIX(mac_start), + .genresult = PREFIX(mac_genresult), + .next_message = PREFIX(mac_next_message), + .text_name = PREFIX(mac_text_name), + .name = "", + .etm_name = "", /* Not selectable independently */ + .len = 16, + .keylen = 0, + .extra = &PREFIX(extra), +}; diff --git a/crypto/aesgcm-neon.c b/crypto/aesgcm-neon.c new file mode 100644 index 00000000..dd7b83cc --- /dev/null +++ b/crypto/aesgcm-neon.c @@ -0,0 +1,156 @@ +/* + * Implementation of the GCM polynomial hash using Arm NEON vector + * intrinsics, in particular the multiplication operation for + * polynomials over GF(2). + * + * Follows the reference implementation in aesgcm-ref-poly.c; see + * there for comments on the underlying technique. Here the comments + * just discuss the NEON-specific details. 
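+ * (The vmull_p64/vmull_high_p64 instructions used here are the + * 'PMULL' part of the Armv8 crypto extension; aesgcm_neon_available + * below tests for them at run time via platform_pmull_neon_available.)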
+ */ + +#include "ssh.h" +#include "aesgcm.h" + +#if USE_ARM64_NEON_H +#include +#else +#include +#endif + +typedef struct aesgcm_neon { + AESGCM_COMMON_FIELDS; + poly128_t var, acc, mask; +} aesgcm_neon; + +static bool aesgcm_neon_available(void) +{ + return platform_pmull_neon_available(); +} + +/* + * The NEON types involved are: + * + * 'poly128_t' is a type that lives in a 128-bit vector register and + * represents a 128-bit polynomial over GF(2) + * + * 'poly64x2_t' is a type that lives in a 128-bit vector register and + * represents a vector of two 64-bit polynomials. These appear as + * intermediate results in some of the helper functions below, but we + * never need to actually have a variable of that type. + * + * 'poly64x1_t' is a type that lives in a 128-bit vector register and + * represents a vector of one 64-bit polynomial. + * + * That is distinct from 'poly64_t', which is a type that lives in + * ordinary scalar registers and is a typedef for an integer type. + * + * Generally here we try to work in terms of poly128_t and 64-bit + * integer types, and let everything else be handled as internal + * details of these helper functions. + */ + +/* Make a poly128_t from two halves */ +static inline poly128_t create_p128(poly64_t hi, poly64_t lo) +{ + return vreinterpretq_p128_p64( + vcombine_p64(vcreate_p64(lo), vcreate_p64(hi))); +} + +/* Retrieve the high and low halves of a poly128_t */ +static inline poly64_t hi_half(poly128_t v) +{ + return vgetq_lane_p64(vreinterpretq_p64_p128(v), 1); +} +static inline poly64_t lo_half(poly128_t v) +{ + return vgetq_lane_p64(vreinterpretq_p64_p128(v), 0); +} + +/* 64x64 -> 128 bit polynomial multiplication, the largest we can do + * in one CPU operation */ +static inline poly128_t pmul(poly64_t v, poly64_t w) +{ + return vmull_p64(v, w); +} + +/* Load and store a poly128_t in the form of big-endian bytes. This + * involves separately swapping the halves of the register and + * reversing the bytes within each half. */ +static inline poly128_t load_p128_be(const void *p) +{ + poly128_t swapped = vreinterpretq_p128_u8(vrev64q_u8(vld1q_u8(p))); + return create_p128(lo_half(swapped), hi_half(swapped)); +} +static inline void store_p128_be(void *p, poly128_t v) +{ + poly128_t swapped = create_p128(lo_half(v), hi_half(v)); + vst1q_u8(p, vrev64q_u8(vreinterpretq_u8_p128(swapped))); +} + +/* + * Key setup is just like in aesgcm-ref-poly.c. There's no point using + * vector registers to accelerate this, because it happens rarely. + */ +static void aesgcm_neon_setkey_impl(aesgcm_neon *ctx, const unsigned char *var) +{ + uint64_t hi = GET_64BIT_MSB_FIRST(var); + uint64_t lo = GET_64BIT_MSB_FIRST(var + 8); + + uint64_t bit = 1 & (hi >> 63); + hi = (hi << 1) ^ (lo >> 63); + lo = (lo << 1) ^ bit; + hi ^= 0xC200000000000000 & -bit; + + ctx->var = create_p128(hi, lo); +} + +static inline void aesgcm_neon_setup(aesgcm_neon *ctx, + const unsigned char *mask) +{ + ctx->mask = load_p128_be(mask); + ctx->acc = create_p128(0, 0); +} + +/* + * Folding a coefficient into the accumulator is done by exactly the + * algorithm in aesgcm-ref-poly.c, translated line by line. + * + * It's possible that this could be improved by some clever manoeuvres + * that avoid having to break vectors in half and put them together + * again. Patches welcome if anyone has better ideas. 
+ */ +static inline void aesgcm_neon_coeff(aesgcm_neon *ctx, + const unsigned char *coeff) +{ + ctx->acc = vaddq_p128(ctx->acc, load_p128_be(coeff)); + + poly64_t ah = hi_half(ctx->acc), al = lo_half(ctx->acc); + poly64_t bh = hi_half(ctx->var), bl = lo_half(ctx->var); + poly128_t md = pmul(ah ^ al, bh ^ bl); + poly128_t lo = pmul(al, bl); + poly128_t hi = pmul(ah, bh); + md = vaddq_p128(md, vaddq_p128(hi, lo)); + hi = create_p128(hi_half(hi), lo_half(hi) ^ hi_half(md)); + lo = create_p128(hi_half(lo) ^ lo_half(md), lo_half(lo)); + + poly128_t r1 = pmul((poly64_t)0xC200000000000000, lo_half(lo)); + hi = create_p128(hi_half(hi), lo_half(hi) ^ lo_half(lo) ^ hi_half(r1)); + lo = create_p128(hi_half(lo) ^ lo_half(r1), lo_half(lo)); + + poly128_t r2 = pmul((poly64_t)0xC200000000000000, hi_half(lo)); + hi = vaddq_p128(hi, r2); + hi = create_p128(hi_half(hi) ^ hi_half(lo), lo_half(hi)); + + ctx->acc = hi; +} + +static inline void aesgcm_neon_output(aesgcm_neon *ctx, unsigned char *output) +{ + store_p128_be(output, vaddq_p128(ctx->acc, ctx->mask)); + ctx->acc = create_p128(0, 0); + ctx->mask = create_p128(0, 0); +} + +#define AESGCM_FLAVOUR neon +#define AESGCM_NAME "NEON accelerated" +#include "aesgcm-footer.h" diff --git a/crypto/aesgcm-ref-poly.c b/crypto/aesgcm-ref-poly.c new file mode 100644 index 00000000..f6ca0fa5 --- /dev/null +++ b/crypto/aesgcm-ref-poly.c @@ -0,0 +1,364 @@ +/* + * Implementation of the GCM polynomial hash in pure software, but + * based on a primitive that performs 64x64->128 bit polynomial + * multiplication over GF(2). + * + * This implementation is technically correct (should even be + * side-channel safe as far as I can see), but it's hopelessly slow, + * so no live SSH connection should ever use it. Therefore, it's + * deliberately not included in the lists in aesgcm-select.c. For pure + * software GCM in live use, you want aesgcm-sw.c, and that's what the + * selection system will choose. + * + * However, this implementation _is_ made available to testcrypt, so + * all the GCM tests in cryptsuite.py are run over this as well as the + * other implementations. + * + * The reason why this code exists at all is to act as a reference for + * GCM implementations that use a CPU-specific polynomial multiply + * intrinsic or asm statement. This version will run on whatever + * platform you're trying to port to, and will generate all the same + * intermediate results you expect the CPU-specific one to go through. + * So you can insert parallel diagnostics in this version and in your + * new version, to see where the two diverge. + * + * Also, this version is a good place to put long comments explaining + * the entire strategy of this implementation and its rationale. That + * avoids those comments having to be duplicated in the multiple + * platform-specific implementations, which can focus on commenting + * the way the local platform's idioms match up to this version, and + * refer to this file for the explanation of the underlying technique. + */ + +#include "ssh.h" +#include "aesgcm.h" + +/* + * Store a 128-bit value in the most convenient form standard C will + * let us, namely two uint64_t giving its most and least significant + * halves. + */ +typedef struct { + uint64_t hi, lo; +} value128_t; + +typedef struct aesgcm_ref_poly { + AESGCM_COMMON_FIELDS; + + /* + * The state of our GCM implementation is represented entirely by + * three 128-bit values: + */ + + /* + * The value at which we're evaluating the polynomial. The GCM + * spec calls this 'H'. 
It's defined once at GCM key setup time, + * by encrypting the all-zeroes value with our block cipher. + */ + value128_t var; + + /* + * Accumulator containing the result of evaluating the polynomial + * so far. + */ + value128_t acc; + + /* + * The mask value that is XORed into the final value of 'acc' to + * produce the output MAC. This is different for every MAC + * generated, because its purpose is to ensure that no information + * gathered from a legal MAC can be used to help the forgery of + * another one, and that comparing two legal MACs gives you no + * useful information about the text they cover, because in each + * case, the masks are different and pseudorandom. + */ + value128_t mask; +} aesgcm_ref_poly; + +static bool aesgcm_ref_poly_available(void) +{ + return true; /* pure software implementation, always available */ +} + +/* + * Primitive function that takes two uint64_t values representing + * polynomials, multiplies them, and returns a value128_t struct + * containing the full product. + * + * Because the input polynomials have maximum degree 63, the output + * has max degree 63+63 = 127, not 128. As a result, the topmost bit + * of the output is always zero. + * + * The inside of this function is implemented in the simplest way, + * with no attention paid to performance. The important feature of + * this implementation is not what's _inside_ this function, but + * what's _outside_ it: aesgcm_ref_poly_coeff() tries to minimise the + * number of these operations. + */ +static value128_t pmul(uint64_t x, uint64_t y) +{ + value128_t r; + r.hi = r.lo = 0; + + uint64_t bit = 1 & y; + r.lo ^= x & -bit; + + for (unsigned i = 1; i < 64; i++) { + bit = 1 & (y >> i); + uint64_t z = x & -bit; + r.lo ^= z << i; + r.hi ^= z >> (64-i); + } + + return r; +} + +/* + * OK, I promised a long comment explaining what's going on in this + * implementation, and now it's time. + * + * The way AES-GCM _itself_ is defined by its own spec, its finite + * field consists of polynomials over GF(2), constrained to be 128 + * bits long by reducing them modulo P = x^128 + x^7 + x^2 + x + 1. + * Using the usual binary representation in which bit i is the + * coefficient of x^i, that's 0x100000000000000000000000000000087. + * + * That is, whenever you multiply two polynomials and find a term + * x^128, you can replace it with x^7+x^2+x+1. More generally, + * x^(128+n) can be replaced with x^(7+n)+x^(2+n)+x^(1+n)+x^n. In + * binary terms, a 1 bit at the 128th position or above is replaced by + * 0x87 exactly 128 bits further down. + * + * So you'd think that multiplying two 128-bit polynomials mod P would + * be a matter of generating their full 256-bit product in the form of + * four words HI:HU:LU:LO, and then reducing it mod P by a two-stage + * process of computing HI * 0x87 and XORing it into HU:LU, then + * computing HU * 0x87 and XORing it into LU:LO. + * + * But it's not! + * + * The reason why not is because when AES-GCM is applied to SSH, + * somehow the _bit_ order got reversed. A 16-byte block of data in + * memory is converted into a polynomial by regarding bit 7 of the + * first byte as the constant term, bit 0 of the first byte as the x^7 + * coefficient, ..., bit 0 of the last byte as the x^127 coefficient. + * So if we load that 16-byte block as a big-endian 128-bit integer, + * we end up with it representing a polynomial back to front, with the + * constant term at the top and the x^127 bit at the bottom. + * + * Well, that _shouldn't_ be a problem, right? 
The nice thing about + * polynomial multiplication is that it's essentially reversible. If + * you reverse the order of the coefficients of two polynomials, then + * the product of the reversed polys is exactly the reversal of the + * product of the original ones. So we bit-reverse our modulo + * polynomial to get 0x1c2000000000000000000000000000001, and we just + * pretend we're working mod that instead. + * + * And that is basically what we're about to do. But there's one + * complication, that arises from the semantics of the polynomial + * multiplication function we're using as our primitive operation. + * + * That function multiplies two polynomials of degree at most 63, to + * give one with degree at most 127. So it returns a 128-bit integer + * whose low bit is the constant term, and its very highest bit is 0, + * and its _next_ highest bit is the product of the high bits of the + * two inputs. + * + * That operation is _not_ symmetric in bit-reversal. If you give it + * the 64-bit-wise reversals of two polynomials P,Q, then its output + * is not the 128-bit-wise reversal of their product PQ, because that + * would require the constant term of PQ to appear in bit 127 of the + * output, and in fact it appears in bit 126. So in fact, what we get + * is offset by one bit from where we'd like it: it's the bit-reversal + * of PQx, not of PQ. + * + * There's more than one way we could fix this. One approach would be + * to work around this off-by-one error by taking the 128-bit output + * of pmul() and shifting it left by a bit. Then it _is_ the bitwise + * reversal of the 128-bit value we'd have wanted to get, and we could + * implement the exact algorithm described above, in fully + * bit-reversed form. + * + * But a 128-bit left shift is not a trivial operation in the vector + * architectures that this code is acting as a reference for. So we'd + * prefer to find a fix that doesn't need compensation during the + * actual per-block multiplication step. + * + * If we did the obvious thing anyway - compute the unshifted 128-bit + * product representing the bit-reversal of PQx, and reduce it mod + * 0x1c2000000000000000000000000000001 - then we'd get a result which + * is exactly what we want, except that it's got a factor of x in it + * that we need to get rid of. The obvious answer is to divide by x + * (which is legal and safe, since mod P, x is invertible). + * + * Dividing a 128-bit polynomial by x is easy in principle. Shift left + * (because we're still bit-reversed), and if that shifted a 1 bit off + * the top, XOR 0xc2000000000000000000000000000001 into the remaining + * 128 bits. + * + * But we're back to having that expensive left shift. What can we do + * about that? + * + * Happily, one of the two input values to our per-block multiply + * operation is fixed! It's given to us at key setup stage, and never + * changed until the next rekey. So if at key setup time we do this + * left shift business _once_, replacing the logical value Q with Q/x, + * then that exactly cancels out the unwanted factor of x that shows + * up in our multiply operation. And now it doesn't matter that it's + * expensive (in the sense of 'a few more instructions than you'd + * like'), because it only happens once per SSH key exchange, not once + * per 16 bytes of data transferred. + */ + +static void aesgcm_ref_poly_setkey_impl(aesgcm_ref_poly *ctx, + const unsigned char *var) +{ + /* + * Key setup function. We copy the provided 16-byte 'var' + * value into our polynomial. 
But, as discussed above, we also + * need to divide it by x. + */ + + ctx->var.hi = GET_64BIT_MSB_FIRST(var); + ctx->var.lo = GET_64BIT_MSB_FIRST(var + 8); + + uint64_t bit = 1 & (ctx->var.hi >> 63); + ctx->var.hi = (ctx->var.hi << 1) ^ (ctx->var.lo >> 63); + ctx->var.lo = (ctx->var.lo << 1) ^ bit; + ctx->var.hi ^= 0xC200000000000000 & -bit; +} + +static inline void aesgcm_ref_poly_setup(aesgcm_ref_poly *ctx, + const unsigned char *mask) +{ + /* + * Set up to start evaluating a particular MAC. Copy in the mask + * value for this packet, and initialise acc to zero. + */ + + ctx->mask.hi = GET_64BIT_MSB_FIRST(mask); + ctx->mask.lo = GET_64BIT_MSB_FIRST(mask + 8); + ctx->acc.hi = ctx->acc.lo = 0; +} + +static inline void aesgcm_ref_poly_coeff(aesgcm_ref_poly *ctx, + const unsigned char *coeff) +{ + /* + * One step of Horner's-rule polynomial evaluation (with each + * coefficient of the polynomial being an element of GF(2^128), + * itself composed of polynomials over GF(2) mod P). + * + * We take our accumulator value, add the incoming coefficient + * (which means XOR, by GF(2) rules), and multiply by x (that is, + * 'var'). + */ + + /* + * The addition first, which is easy. + */ + ctx->acc.hi ^= GET_64BIT_MSB_FIRST(coeff); + ctx->acc.lo ^= GET_64BIT_MSB_FIRST(coeff + 8); + + /* + * First, create the 256-bit product of the two 128-bit + * polynomials over GF(2) stored in ctx->acc and ctx->var. + * + * The obvious way to do this is by four smaller multiplications + * of 64x64 -> 128 bits. But we can do better using a single + * iteration of the Karatsuba technique, which is actually more + * convenient in polynomials over GF(2) than it is in integers, + * because there aren't all those awkward carries complicating + * things. + * + * Letting B denote x^64, and imagining our two inputs are split + * up into 64-bit chunks ah,al,bh,bl, the product we want is + * + * (ah B + al) (bh B + bl) + * = (ah bh) B^2 + (al bh + ah bl) B + (al bl) + * + * which looks like four smaller multiplications of each of ah,al + * with each of bh,bl. But Karatsuba's trick is to first compute + * + * (ah + al) (bh + bl) + * = ah bh + al bh + ah bl + al bl + * + * and then subtract the terms (ah bh) and (al bl), which we had + * to compute anyway, to get the middle two terms (al bh + ah bl) + * which are our coefficient of B. + * + * This involves more bookkeeping instructions like XORs, but with + * any luck those are faster than the main multiplication. + */ + uint64_t ah = ctx->acc.hi, al = ctx->acc.lo; + uint64_t bh = ctx->var.hi, bl = ctx->var.lo; + /* Compute the outer two terms */ + value128_t lo = pmul(al, bl); + value128_t hi = pmul(ah, bh); + /* Compute the trick product (ah+al)(bh+bl) */ + value128_t md = pmul(ah ^ al, bh ^ bl); + /* Subtract off the outer two terms to get md = al bh + ah bl */ + md.hi ^= lo.hi ^ hi.hi; + md.lo ^= lo.lo ^ hi.lo; + /* And add that into the 256-bit value given by hi * x^128 + lo */ + lo.hi ^= md.lo; + hi.lo ^= md.hi; + + /* + * OK. Now hi and lo together make up the 256-bit full product. + * Now reduce it mod the reversal of the GCM modulus polynomial. + * As discussed above, that's 0x1c2000000000000000000000000000001. + * + * We want the _topmost_ 128 bits of this, because we're working + * in a bit-reversed world. So what we fundamentally want to do is + * to take our 256-bit product, and add to it the product of its + * low 128 bits with 0x1c2000000000000000000000000000001. Then the + * top 128 bits will be the output we want. 
+ * + * Since there's no carrying in this arithmetic, it's enough to + * discard the 1 bit at the bottom of that, because it won't + * affect anything in the half we're keeping. So it's enough to + * add 0x1c2000000000000000000000000000000 * lo to (hi:lo). + * + * We can only work with 64 bits at a time, so the first thing we + * do is to break that up: + * + * - add 0x1c200000000000000 * lo.lo to (hi.lo : lo.hi) + * - add 0x1c200000000000000 * lo.hi to (hi.hi : hi.lo) + * + * But there's still a problem: 0x1c200000000000000 is just too + * _big_ to fit in 64 bits. So we have to break it up into the low + * 64 bits 0xc200000000000000, and its leading 1. So each of those + * steps of the form 'add 0x1c200000000000000 * x to y:z' becomes + * 'add 0xc200000000000000 * x to y:z' followed by 'add x to y', + * the latter step dealing with the leading 1. + */ + + /* First step, adding to the middle two words of our number. After + * this the lowest word (in lo.lo) is ignored. */ + value128_t r1 = pmul(0xC200000000000000, lo.lo); + hi.lo ^= r1.hi ^ lo.lo; + lo.hi ^= r1.lo; + + /* Second of those steps, adding to the top two words, and + * discarding lo.hi. */ + value128_t r2 = pmul(0xC200000000000000, lo.hi); + hi.hi ^= r2.hi ^ lo.hi; + hi.lo ^= r2.lo; + + /* Now 'hi' is precisely what we have left. */ + ctx->acc = hi; +} + +static inline void aesgcm_ref_poly_output(aesgcm_ref_poly *ctx, + unsigned char *output) +{ + PUT_64BIT_MSB_FIRST(output, ctx->acc.hi ^ ctx->mask.hi); + PUT_64BIT_MSB_FIRST(output + 8, ctx->acc.lo ^ ctx->mask.lo); + smemclr(&ctx->acc, 16); + smemclr(&ctx->mask, 16); +} + +#define AESGCM_FLAVOUR ref_poly +#define AESGCM_NAME "reference polynomial-based implementation" +#include "aesgcm-footer.h" diff --git a/crypto/aesgcm-select.c b/crypto/aesgcm-select.c new file mode 100644 index 00000000..eefe7148 --- /dev/null +++ b/crypto/aesgcm-select.c @@ -0,0 +1,38 @@ +#include "ssh.h" +#include "aesgcm.h" + +static ssh2_mac *aesgcm_mac_selector_new(const ssh2_macalg *alg, + ssh_cipher *cipher) +{ + static const ssh2_macalg *const real_algs[] = { +#if HAVE_CLMUL + &ssh2_aesgcm_mac_clmul, +#endif +#if HAVE_NEON_PMULL + &ssh2_aesgcm_mac_neon, +#endif + &ssh2_aesgcm_mac_sw, + NULL, + }; + + for (size_t i = 0; real_algs[i]; i++) { + const ssh2_macalg *alg = real_algs[i]; + const struct aesgcm_extra *alg_extra = + (const struct aesgcm_extra *)alg->extra; + if (check_aesgcm_availability(alg_extra)) + return ssh2_mac_new(alg, cipher); + } + + /* We should never reach the NULL at the end of the list, because + * the last non-NULL entry should be software-only GCM, which is + * always available. */ + unreachable("aesgcm_select ran off the end of its list"); +} + +const ssh2_macalg ssh2_aesgcm_mac = { + .new = aesgcm_mac_selector_new, + .name = "", + .etm_name = "", /* Not selectable independently */ + .len = 16, + .keylen = 0, +}; diff --git a/crypto/aesgcm-sw.c b/crypto/aesgcm-sw.c new file mode 100644 index 00000000..f322ae30 --- /dev/null +++ b/crypto/aesgcm-sw.c @@ -0,0 +1,145 @@ +/* + * Implementation of the GCM polynomial hash in pure software. + * + * I don't know of a faster way to do this in a side-channel safe + * manner than by precomputing a giant table and iterating over the + * whole thing. 
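+ * (Concretely, the cost is the value128_t table[128] below: 2kB of + * precomputed data per key, all of which is read on every block.)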
+ *
+ * The original GCM reference suggests that you precompute the effects
+ * of multiplying a 128-bit value by the fixed key, in the form of a
+ * table indexed by some number of bits of the input value, so that
+ * you end up computing something of the form
+ *
+ *   table1[x & 0xFF] ^ table2[(x>>8) & 0xFF] ^ ... ^ table16[(x>>120) & 0xFF]
+ *
+ * But that was obviously written before cache and timing leaks were
+ * known about. What's a time-safe approach?
+ *
+ * Well, the above technique isn't fixed to 8 bits of input per table.
+ * You could trade off the number of tables against the size of each
+ * table. At one extreme of this tradeoff, you have 128 tables each
+ * indexed by a single input bit - which is to say, you have 128
+ * values, each 128 bits wide, and you XOR together the subset of
+ * those values corresponding to the input bits, which you can do by
+ * making a bitmask out of each input bit using standard constant-
+ * time-coding bit twiddling techniques.
+ *
+ * That's pretty unpleasant when GCM is supposed to be a fast
+ * algorithm, but I don't know of a better approach that meets current
+ * security standards! Suggestions welcome, if they can get through
+ * testsc.
+ */
+
+#include "ssh.h"
+#include "aesgcm.h"
+
+/*
+ * Store a 128-bit value in the most convenient form standard C will
+ * let us, namely two uint64_t giving its most and least significant
+ * halves.
+ */
+typedef struct {
+    uint64_t hi, lo;
+} value128_t;
+
+typedef struct aesgcm_sw {
+    AESGCM_COMMON_FIELDS;
+
+    /* Accumulator for the current evaluation, and mask that will be
+     * XORed in at the end. */
+    value128_t acc, mask;
+
+    /*
+     * Table of values to XOR in for each bit, representing the effect
+     * of multiplying by the fixed key. The key itself doesn't need to
+     * be stored separately, because it's never used. (However, it is
+     * also the last entry in the table, so if you _do_ need it, there
+     * it is: table[127] is the key itself, table[126] is the key
+     * times x, and so on down to table[0].)
+     *
+     * Table is indexed from the low bit of the input upwards.
+     */
+    value128_t table[128];
+} aesgcm_sw;
+
+static bool aesgcm_sw_available(void)
+{
+    return true; /* pure software implementation, always available */
+}
+
+static void aesgcm_sw_setkey_impl(aesgcm_sw *gcm, const unsigned char *var)
+{
+    value128_t v;
+    v.hi = GET_64BIT_MSB_FIRST(var);
+    v.lo = GET_64BIT_MSB_FIRST(var + 8);
+
+    /*
+     * Prepare the table. This has to be done in reverse order, so
+     * that the original value of the variable corresponds to
+     * table[127], because AES-GCM works in the bit-reversal of its
+     * logical specification so that's where the logical constant term
+     * lives. (See more detailed comment in aesgcm-ref-poly.c.)
+     */
+    for (size_t i = 0; i < 128; i++) {
+        gcm->table[127 - i] = v;
+
+        /* Multiply v by x, which means shifting right (bit reversal
+         * again) and then adding 0xE1 at the top if we shifted a 1 out.
*/ + uint64_t lobit = v.lo & 1; + v.lo = (v.lo >> 1) ^ (v.hi << 63); + v.hi = (v.hi >> 1) ^ (0xE100000000000000ULL & -lobit); + } +} + +static inline void aesgcm_sw_setup(aesgcm_sw *gcm, const unsigned char *mask) +{ + gcm->mask.hi = GET_64BIT_MSB_FIRST(mask); + gcm->mask.lo = GET_64BIT_MSB_FIRST(mask + 8); + gcm->acc.hi = gcm->acc.lo = 0; +} + +static inline void aesgcm_sw_coeff(aesgcm_sw *gcm, const unsigned char *coeff) +{ + /* XOR in the new coefficient */ + gcm->acc.hi ^= GET_64BIT_MSB_FIRST(coeff); + gcm->acc.lo ^= GET_64BIT_MSB_FIRST(coeff + 8); + + /* And now just loop over the bits of acc, making up a new value + * by XORing together the entries of 'table' corresponding to set + * bits. */ + + value128_t out; + out.lo = out.hi = 0; + + const value128_t *tableptr = gcm->table; + + for (size_t i = 0; i < 64; i++) { + uint64_t bit = 1 & gcm->acc.lo; + gcm->acc.lo >>= 1; + uint64_t mask = -bit; + out.hi ^= mask & tableptr->hi; + out.lo ^= mask & tableptr->lo; + tableptr++; + } + for (size_t i = 0; i < 64; i++) { + uint64_t bit = 1 & gcm->acc.hi; + gcm->acc.hi >>= 1; + uint64_t mask = -bit; + out.hi ^= mask & tableptr->hi; + out.lo ^= mask & tableptr->lo; + tableptr++; + } + + gcm->acc = out; +} + +static inline void aesgcm_sw_output(aesgcm_sw *gcm, unsigned char *output) +{ + PUT_64BIT_MSB_FIRST(output, gcm->acc.hi ^ gcm->mask.hi); + PUT_64BIT_MSB_FIRST(output + 8, gcm->acc.lo ^ gcm->mask.lo); + smemclr(&gcm->acc, 16); + smemclr(&gcm->mask, 16); +} + +#define AESGCM_FLAVOUR sw +#define AESGCM_NAME "unaccelerated" +#include "aesgcm-footer.h" diff --git a/crypto/aesgcm.h b/crypto/aesgcm.h new file mode 100644 index 00000000..48077004 --- /dev/null +++ b/crypto/aesgcm.h @@ -0,0 +1,44 @@ +/* + * Common parts of the state structure for AESGCM MAC implementations. + */ +#define AESGCM_COMMON_FIELDS \ + ssh_cipher *cipher; \ + unsigned char partblk[16]; \ + size_t skiplen, aadlen, ciphertextlen; \ + size_t skipgot, aadgot, partlen; \ + BinarySink_IMPLEMENTATION; \ + ssh2_mac mac + +/* + * The 'extra' structure is used to include information about how to + * check if a given implementation is available at run time, and + * whether we've already checked. + */ +struct aesgcm_extra_mutable; +struct aesgcm_extra { + /* Function to check availability. Might be expensive, so we don't + * want to call it more than once. */ + bool (*check_available)(void); + + /* Point to a writable substructure. */ + struct aesgcm_extra_mutable *mut; + + /* + * Extra API function specific to this MAC type that allows + * testcrypt to set more general lengths for skiplen and aadlen. + */ + void (*set_prefix_lengths)(ssh2_mac *mac, size_t skip, size_t aad); +}; +struct aesgcm_extra_mutable { + bool checked_availability; + bool is_available; +}; +static inline bool check_aesgcm_availability(const struct aesgcm_extra *extra) +{ + if (!extra->mut->checked_availability) { + extra->mut->is_available = extra->check_available(); + extra->mut->checked_availability = true; + } + + return extra->mut->is_available; +} diff --git a/putty.h b/putty.h index 727a07f7..175f5969 100644 --- a/putty.h +++ b/putty.h @@ -453,6 +453,7 @@ enum { CIPHER_DES, CIPHER_ARCFOUR, CIPHER_CHACHA20, + CIPHER_AESGCM, CIPHER_MAX /* no. 
ciphers (inc warn) */ }; diff --git a/settings.c b/settings.c index 40e26b8d..cc2176ce 100644 --- a/settings.c +++ b/settings.c @@ -17,6 +17,7 @@ static const struct keyvalwhere ciphernames[] = { { "aes", CIPHER_AES, -1, -1 }, { "chacha20", CIPHER_CHACHA20, CIPHER_AES, +1 }, + { "aesgcm", CIPHER_AESGCM, CIPHER_CHACHA20, +1 }, { "3des", CIPHER_3DES, -1, -1 }, { "WARN", CIPHER_WARN, -1, -1 }, { "des", CIPHER_DES, -1, -1 }, diff --git a/ssh.h b/ssh.h index e14b44c6..d3ee5065 100644 --- a/ssh.h +++ b/ssh.h @@ -1072,6 +1072,10 @@ extern const ssh_cipheralg ssh_aes256_sdctr; extern const ssh_cipheralg ssh_aes256_sdctr_ni; extern const ssh_cipheralg ssh_aes256_sdctr_neon; extern const ssh_cipheralg ssh_aes256_sdctr_sw; +extern const ssh_cipheralg ssh_aes256_gcm; +extern const ssh_cipheralg ssh_aes256_gcm_ni; +extern const ssh_cipheralg ssh_aes256_gcm_neon; +extern const ssh_cipheralg ssh_aes256_gcm_sw; extern const ssh_cipheralg ssh_aes256_cbc; extern const ssh_cipheralg ssh_aes256_cbc_ni; extern const ssh_cipheralg ssh_aes256_cbc_neon; @@ -1080,6 +1084,10 @@ extern const ssh_cipheralg ssh_aes192_sdctr; extern const ssh_cipheralg ssh_aes192_sdctr_ni; extern const ssh_cipheralg ssh_aes192_sdctr_neon; extern const ssh_cipheralg ssh_aes192_sdctr_sw; +extern const ssh_cipheralg ssh_aes192_gcm; +extern const ssh_cipheralg ssh_aes192_gcm_ni; +extern const ssh_cipheralg ssh_aes192_gcm_neon; +extern const ssh_cipheralg ssh_aes192_gcm_sw; extern const ssh_cipheralg ssh_aes192_cbc; extern const ssh_cipheralg ssh_aes192_cbc_ni; extern const ssh_cipheralg ssh_aes192_cbc_neon; @@ -1088,6 +1096,10 @@ extern const ssh_cipheralg ssh_aes128_sdctr; extern const ssh_cipheralg ssh_aes128_sdctr_ni; extern const ssh_cipheralg ssh_aes128_sdctr_neon; extern const ssh_cipheralg ssh_aes128_sdctr_sw; +extern const ssh_cipheralg ssh_aes128_gcm; +extern const ssh_cipheralg ssh_aes128_gcm_ni; +extern const ssh_cipheralg ssh_aes128_gcm_neon; +extern const ssh_cipheralg ssh_aes128_gcm_sw; extern const ssh_cipheralg ssh_aes128_cbc; extern const ssh_cipheralg ssh_aes128_cbc_ni; extern const ssh_cipheralg ssh_aes128_cbc_neon; @@ -1103,6 +1115,7 @@ extern const ssh2_ciphers ssh2_aes; extern const ssh2_ciphers ssh2_blowfish; extern const ssh2_ciphers ssh2_arcfour; extern const ssh2_ciphers ssh2_ccp; +extern const ssh2_ciphers ssh2_aesgcm; extern const ssh_hashalg ssh_md5; extern const ssh_hashalg ssh_sha1; extern const ssh_hashalg ssh_sha1_ni; @@ -1163,12 +1176,20 @@ extern const ssh2_macalg ssh_hmac_sha1_96; extern const ssh2_macalg ssh_hmac_sha1_96_buggy; extern const ssh2_macalg ssh_hmac_sha256; extern const ssh2_macalg ssh2_poly1305; +extern const ssh2_macalg ssh2_aesgcm_mac; +extern const ssh2_macalg ssh2_aesgcm_mac_sw; +extern const ssh2_macalg ssh2_aesgcm_mac_ref_poly; +extern const ssh2_macalg ssh2_aesgcm_mac_clmul; +extern const ssh2_macalg ssh2_aesgcm_mac_neon; extern const ssh_compression_alg ssh_zlib; /* Special constructor: BLAKE2b can be instantiated with any hash * length up to 128 bytes */ ssh_hash *blake2b_new_general(unsigned hashlen); +/* Special test function for AES-GCM */ +void aesgcm_set_prefix_lengths(ssh2_mac *mac, size_t skip, size_t aad); + /* * On some systems, you have to detect hardware crypto acceleration by * asking the local OS API rather than OS-agnostically asking the CPU @@ -1176,6 +1197,7 @@ ssh_hash *blake2b_new_general(unsigned hashlen); * platform subdirectory. 
 */
 bool platform_aes_neon_available(void);
+bool platform_pmull_neon_available(void);
 bool platform_sha256_neon_available(void);
 bool platform_sha1_neon_available(void);
 bool platform_sha512_neon_available(void);
diff --git a/ssh/bpp2.c b/ssh/bpp2.c
index a3ab99f9..e019dd2e 100644
--- a/ssh/bpp2.c
+++ b/ssh/bpp2.c
@@ -132,6 +132,12 @@ void ssh2_bpp_new_outgoing_crypto(
         s->out.etm_mode = etm_mode;
         if (mac) {
             s->out.mac = ssh2_mac_new(mac, s->out.cipher);
+            /*
+             * Important that mac_setkey comes after cipher_setkey,
+             * because in the case where the MAC makes use of the cipher
+             * (e.g. AES-GCM), it will need the cipher to be keyed
+             * already.
+             */
             ssh2_mac_setkey(s->out.mac, make_ptrlen(mac_key, mac->keylen));

             bpp_logevent("Initialised %s outbound MAC algorithm%s%s",
@@ -189,6 +195,7 @@ void ssh2_bpp_new_incoming_crypto(
         s->in.etm_mode = etm_mode;
         if (mac) {
             s->in.mac = ssh2_mac_new(mac, s->in.cipher);
+            /* MAC setkey has to follow cipher, just as in outgoing_crypto above */
             ssh2_mac_setkey(s->in.mac, make_ptrlen(mac_key, mac->keylen));

             bpp_logevent("Initialised %s inbound MAC algorithm%s%s",
diff --git a/ssh/transport2.c b/ssh/transport2.c
index 705df466..aba8cd0b 100644
--- a/ssh/transport2.c
+++ b/ssh/transport2.c
@@ -600,6 +600,9 @@ static void ssh2_write_kexinit_lists(
               case CIPHER_CHACHA20:
                 preferred_ciphers[n_preferred_ciphers++] = &ssh2_ccp;
                 break;
+              case CIPHER_AESGCM:
+                preferred_ciphers[n_preferred_ciphers++] = &ssh2_aesgcm;
+                break;
               case CIPHER_WARN:
                 /* Flag for later. Don't bother if it's the last in
                  * the list. */
diff --git a/test/cryptsuite.py b/test/cryptsuite.py
index 35114a16..69b492e8 100755
--- a/test/cryptsuite.py
+++ b/test/cryptsuite.py
@@ -145,6 +145,11 @@ def get_aes_impls():
             for impl in get_implementations("aes128_cbc")
             if impl.startswith("aes128_cbc_")]

+def get_aesgcm_impls():
+    return [impl.split("_", 1)[1]
+            for impl in get_implementations("aesgcm")
+            if impl.startswith("aesgcm_")]
+
 class MyTestBase(unittest.TestCase):
     "Intermediate class that adds useful helper methods."
     def assertEqualBin(self, x, y):
@@ -2933,6 +2938,187 @@ Private-MAC: 5b1f6f4cc43eb0060d2c3e181bc0129343adba2b
         per_base_keytype_tests('rsa', run_ca_rsa_tests=True, ca_signflags=2)
         per_base_keytype_tests('rsa', run_ca_rsa_tests=True, ca_signflags=4)

+    def testAESGCMBlockBoundaries(self):
+        # For standard AES-GCM test vectors, see the separate tests in
+        # standard_test_vectors.testAESGCM. This function will test
+        # the local interface, including the skip length and the
+        # machinery for incremental MAC update.
+
+        def aesgcm(key, iv, aes_impl, gcm_impl):
+            c = ssh_cipher_new('aes{:d}_gcm_{}'.format(8*len(key), aes_impl))
+            if c is None: return None, None # skip if HW AES not available
+            m = ssh2_mac_new('aesgcm_{}'.format(gcm_impl), c)
+            if m is None: return None, None # skip if HW GCM not available
+            c.setkey(key)
+            c.setiv(iv + b'\0'*4)
+            m.setkey(b'')
+            return c, m
+
+        def test_one(aes_impl, gcm_impl):
+            # An actual test from a session with OpenSSH, which
+            # demonstrates that the implementation in practice matches up
+            # to what the test vectors say. This is its SSH2_MSG_EXT_INFO
+            # packet.
+            key = unhex('dbf98b2f56c83fb2f9476aa876511225')
+            iv = unhex('9af15ecccf2bacaaa9625a6a')
+            plain = unhex('1007000000020000000f736572766572'
+                          '2d7369672d616c6773000000db737368'
+                          '2d656432353531392c736b2d7373682d'
+                          '65643235353139406f70656e7373682e'
+                          '636f6d2c7373682d7273612c7273612d'
+                          '736861322d3235362c7273612d736861'
+                          '322d3531322c7373682d6473732c6563'
+                          '6473612d736861322d6e697374703235'
+                          '362c65636473612d736861322d6e6973'
+                          '74703338342c65636473612d73686132'
+                          '2d6e697374703532312c736b2d656364'
+                          '73612d736861322d6e69737470323536'
+                          '406f70656e7373682e636f6d2c776562'
+                          '617574686e2d736b2d65636473612d73'
+                          '6861322d6e69737470323536406f7065'
+                          '6e7373682e636f6d0000001f7075626c'
+                          '69636b65792d686f7374626f756e6440'
+                          '6f70656e7373682e636f6d0000000130'
+                          '5935130804ad4b19ed2789210290c438')
+            aad = unhex('00000130')
+            cipher = unhex('c4b88f35c1ef8aa6225033c3f185d648'
+                           '3c485d84930d5846f7851daacbff49d5'
+                           '8cf72169fca7ab3c170376df65dd69de'
+                           'c40a94c6b8e3da6d61161ab19be27466'
+                           '02e0dfa3330faae291ef4173a20e87a4'
+                           'd40728c645baa72916c1958531ef7b54'
+                           '27228513e53005e6d17b9bb384b8d8c1'
+                           '92b8a10b731459eed5a0fb120c283412'
+                           'e34445981df1257f1c35a06196731fed'
+                           '1b3115f419e754de0b634bf68768cb02'
+                           '29e70bb2259cedb5101ff6a4ac19aaad'
+                           '46f1c30697361b45d6c152c3069cee6b'
+                           'd46e9785d65ea6bf7fca41f0ac3c8e93'
+                           'ce940b0059c39d51e49c17f60d48d633'
+                           '5bae4402faab61d8d65221b24b400e65'
+                           '89f941ff48310231a42641851ea00832'
+                           '2c2d188f4cc6a4ec6002161c407d0a92'
+                           'f1697bb319fbec1ca63fa8e7ac171c85'
+                           '5b60142bfcf4e5b0a9ada3451799866e')
+
+            c, m = aesgcm(key, iv, aes_impl, gcm_impl)
+            if m is None: return # skip if this impl combination unavailable
+            len_dec = c.decrypt_length(aad, 123)
+            self.assertEqual(len_dec, aad) # length not actually encrypted
+            m.start()
+            # We expect 4 bytes skipped (the sequence number that
+            # ChaCha20-Poly1305 wants at the start of its MAC), and 4
+            # bytes AAD. These were initialised by the call to
+            # decrypt_length.
+            m.update(b'fake' + aad + cipher)
+            self.assertEqualBin(m.genresult(),
+                                unhex('4a5a6d57d54888b4e58c57a96e00b73a'))
+            self.assertEqualBin(c.decrypt(cipher), plain)
+
+            c, m = aesgcm(key, iv, aes_impl, gcm_impl)
+            len_enc = c.encrypt_length(aad, 123)
+            self.assertEqual(len_enc, aad) # length not actually encrypted
+            self.assertEqualBin(c.encrypt(plain), cipher)
+
+            # Test incremental update.
+            def testIncremental(skiplen, aad, plain):
+                key, iv = b'SomeRandomKeyVal', b'SomeRandomIV'
+                mac_input = b'x' * skiplen + aad + plain
+
+                c, m = aesgcm(key, iv, aes_impl, gcm_impl)
+                aesgcm_set_prefix_lengths(m, skiplen, len(aad))
+
+                m.start()
+                m.update(mac_input)
+                reference_mac = m.genresult()
+
+                # Break the input just once, at each possible byte
+                # position.
+                for i in range(1, len(mac_input)):
+                    c.setiv(iv + b'\0'*4)
+                    m.setkey(b'')
+                    aesgcm_set_prefix_lengths(m, skiplen, len(aad))
+                    m.start()
+                    m.update(mac_input[:i])
+                    m.update(mac_input[i:])
+                    self.assertEqualBin(m.genresult(), reference_mac)
+
+                # Feed the entire input in a byte at a time.
+                c.setiv(iv + b'\0'*4)
+                m.setkey(b'')
+                aesgcm_set_prefix_lengths(m, skiplen, len(aad))
+                m.start()
+                for i in range(len(mac_input)):
+                    m.update(mac_input[i:i+1])
+                self.assertEqualBin(m.genresult(), reference_mac)
+
+            # Incremental test with more than a full block of each thing
+            testIncremental(23, b'abcdefghijklmnopqrst',
+                            b'Lorem ipsum dolor sit amet')
+
+            # Incremental test with exactly a full block of each thing
+            testIncremental(16, b'abcdefghijklmnop',
+                            b'Lorem ipsum dolo')
+
+            # Incremental test with less than a full block of each thing
+            testIncremental(7, b'abcdefghij',
+                            b'Lorem ipsum')
+
+        for aes_impl in get_aes_impls():
+            for gcm_impl in get_aesgcm_impls():
+                with self.subTest(aes_impl=aes_impl, gcm_impl=gcm_impl):
+                    test_one(aes_impl, gcm_impl)
+
+    def testAESGCMIV(self):
+        key = b'SomeRandomKeyVal'
+
+        def test(gcm, cbc, iv_fixed, iv_msg):
+            gcm.setiv(ssh_uint32(iv_fixed) + ssh_uint64(iv_msg) + b'fake')
+
+            cbc.setiv(b'\0' * 16)
+            preimage = cbc.decrypt(gcm.encrypt(b'\0' * 16))
+            self.assertEqualBin(preimage, ssh_uint32(iv_fixed) +
+                                ssh_uint64(iv_msg) + ssh_uint32(1))
+            cbc.setiv(b'\0' * 16)
+            preimage = cbc.decrypt(gcm.encrypt(b'\0' * 16))
+            self.assertEqualBin(preimage, ssh_uint32(iv_fixed) +
+                                ssh_uint64(iv_msg) + ssh_uint32(2))
+
+            gcm.next_message()
+            iv_msg = (iv_msg + 1) & ((1<<64)-1)
+
+            cbc.setiv(b'\0' * 16)
+            preimage = cbc.decrypt(gcm.encrypt(b'\0' * 16))
+            self.assertEqualBin(preimage, ssh_uint32(iv_fixed) +
+                                ssh_uint64(iv_msg) + ssh_uint32(1))
+            cbc.setiv(b'\0' * 16)
+            preimage = cbc.decrypt(gcm.encrypt(b'\0' * 16))
+            self.assertEqualBin(preimage, ssh_uint32(iv_fixed) +
+                                ssh_uint64(iv_msg) + ssh_uint32(2))
+
+
+        for impl in get_aes_impls():
+            with self.subTest(aes_impl=impl):
+                gcm = ssh_cipher_new('aes{:d}_gcm_{}'.format(8*len(key), impl))
+                if gcm is None: continue # skip if HW AES not available
+                gcm.setkey(key)
+
+                cbc = ssh_cipher_new('aes{:d}_cbc_{}'.format(8*len(key), impl))
+                cbc.setkey(key)
+
+                # A simple test to ensure the low word gets
+                # incremented and that the whole IV looks basically
+                # the way we expect it to
+                test(gcm, cbc, 0x27182818, 0x3141592653589793)
+
+                # Test that carries are propagated into the high word
+                test(gcm, cbc, 0x27182818, 0x00000000FFFFFFFF)
+
+                # Test that carries _aren't_ propagated out of the
+                # high word of the message counter into the fixed word
+                # at the top
+                test(gcm, cbc, 0x27182818, 0xFFFFFFFFFFFFFFFF)
+
 class standard_test_vectors(MyTestBase):
     def testAES(self):
         def vector(cipher, key, plaintext, ciphertext):
@@ -3726,6 +3912,181 @@ class standard_test_vectors(MyTestBase):
                 b'opaque="HRPCssKJSGjCrkzDg8OhwpzCiGPChXYjwrI2QmXDnsOS", '
                 b'userhash=true')

+    def testAESGCM(self):
+        def test(key, iv, plaintext, aad, ciphertext, mac):
+            c = ssh_cipher_new('aes{:d}_gcm'.format(8*len(key)))
+            m = ssh2_mac_new('aesgcm_{}'.format(impl), c)
+            if m is None: return # skip test if HW GCM not available
+            c.setkey(key)
+            c.setiv(iv + b'\0'*4)
+            m.setkey(b'')
+            aesgcm_set_prefix_lengths(m, 0, len(aad))
+
+            # Some test cases have plaintext/ciphertext that is not a
+            # multiple of the cipher block size. Our MAC
+            # implementation supports this, but the cipher
+            # implementation expects block-granular input.
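+            # (For example, a 60-byte plaintext gives
+            # padlen = -60 & 15 = 4, rounding up to the 64-byte
+            # boundary; block-aligned lengths give padlen = 0.)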
+            padlen = 15 & -len(plaintext)
+            ciphertext_got = c.encrypt(plaintext + b'0' * padlen)[
+                :len(plaintext)]
+
+            m.start()
+            m.update(aad + ciphertext)
+            mac_got = m.genresult()
+
+            self.assertEqualBin(ciphertext_got, ciphertext)
+            self.assertEqualBin(mac_got, mac)
+
+            c.setiv(iv + b'\0'*4)
+            plaintext_got = c.decrypt(ciphertext + b'0' * padlen)[
+                :len(plaintext)]
+            self.assertEqualBin(plaintext_got, plaintext)
+
+        for impl in get_aesgcm_impls():
+            # 'The Galois/Counter Mode of Operation', McGrew and
+            # Viega, Appendix B. All the tests except the ones whose
+            # IV is the wrong length, because handling that requires
+            # an extra evaluation of the polynomial hash, which is
+            # never used in an SSH context, so I didn't implement it
+            # just for the sake of test vectors.
+
+            # Test Case 1
+            test(unhex('00000000000000000000000000000000'),
+                 unhex('000000000000000000000000'),
+                 unhex(''), unhex(''), unhex(''),
+                 unhex('58e2fccefa7e3061367f1d57a4e7455a'))
+
+            # Test Case 2
+            test(unhex('00000000000000000000000000000000'),
+                 unhex('000000000000000000000000'),
+                 unhex('00000000000000000000000000000000'),
+                 unhex(''),
+                 unhex('0388dace60b6a392f328c2b971b2fe78'),
+                 unhex('ab6e47d42cec13bdf53a67b21257bddf'))
+
+            # Test Case 3
+            test(unhex('feffe9928665731c6d6a8f9467308308'),
+                 unhex('cafebabefacedbaddecaf888'),
+                 unhex('d9313225f88406e5a55909c5aff5269a'
+                       '86a7a9531534f7da2e4c303d8a318a72'
+                       '1c3c0c95956809532fcf0e2449a6b525'
+                       'b16aedf5aa0de657ba637b391aafd255'),
+                 unhex(''),
+                 unhex('42831ec2217774244b7221b784d0d49c'
+                       'e3aa212f2c02a4e035c17e2329aca12e'
+                       '21d514b25466931c7d8f6a5aac84aa05'
+                       '1ba30b396a0aac973d58e091473f5985'),
+                 unhex('4d5c2af327cd64a62cf35abd2ba6fab4'))
+
+            # Test Case 4
+            test(unhex('feffe9928665731c6d6a8f9467308308'),
+                 unhex('cafebabefacedbaddecaf888'),
+                 unhex('d9313225f88406e5a55909c5aff5269a'
+                       '86a7a9531534f7da2e4c303d8a318a72'
+                       '1c3c0c95956809532fcf0e2449a6b525'
+                       'b16aedf5aa0de657ba637b39'),
+                 unhex('feedfacedeadbeeffeedfacedeadbeef'
+                       'abaddad2'),
+                 unhex('42831ec2217774244b7221b784d0d49c'
+                       'e3aa212f2c02a4e035c17e2329aca12e'
+                       '21d514b25466931c7d8f6a5aac84aa05'
+                       '1ba30b396a0aac973d58e091'),
+                 unhex('5bc94fbc3221a5db94fae95ae7121a47'))
+
+            # Test Case 7
+            test(unhex('00000000000000000000000000000000'
+                       '0000000000000000'),
+                 unhex('000000000000000000000000'),
+                 unhex(''), unhex(''), unhex(''),
+                 unhex('cd33b28ac773f74ba00ed1f312572435'))
+
+            # Test Case 8
+            test(unhex('00000000000000000000000000000000'
+                       '0000000000000000'),
+                 unhex('000000000000000000000000'),
+                 unhex('00000000000000000000000000000000'),
+                 unhex(''),
+                 unhex('98e7247c07f0fe411c267e4384b0f600'),
+                 unhex('2ff58d80033927ab8ef4d4587514f0fb'))
+
+            # Test Case 9
+            test(unhex('feffe9928665731c6d6a8f9467308308'
+                       'feffe9928665731c'),
+                 unhex('cafebabefacedbaddecaf888'),
+                 unhex('d9313225f88406e5a55909c5aff5269a'
+                       '86a7a9531534f7da2e4c303d8a318a72'
+                       '1c3c0c95956809532fcf0e2449a6b525'
+                       'b16aedf5aa0de657ba637b391aafd255'),
+                 unhex(''),
+                 unhex('3980ca0b3c00e841eb06fac4872a2757'
+                       '859e1ceaa6efd984628593b40ca1e19c'
+                       '7d773d00c144c525ac619d18c84a3f47'
+                       '18e2448b2fe324d9ccda2710acade256'),
+                 unhex('9924a7c8587336bfb118024db8674a14'))
+
+            # Test Case 10
+            test(unhex('feffe9928665731c6d6a8f9467308308'
+                       'feffe9928665731c'),
+                 unhex('cafebabefacedbaddecaf888'),
+                 unhex('d9313225f88406e5a55909c5aff5269a'
+                       '86a7a9531534f7da2e4c303d8a318a72'
+                       '1c3c0c95956809532fcf0e2449a6b525'
+                       'b16aedf5aa0de657ba637b39'),
+                 unhex('feedfacedeadbeeffeedfacedeadbeef'
+                       'abaddad2'),
+                 unhex('3980ca0b3c00e841eb06fac4872a2757'
+                       '859e1ceaa6efd984628593b40ca1e19c'
+                       '7d773d00c144c525ac619d18c84a3f47'
+                       '18e2448b2fe324d9ccda2710'),
+                 unhex('2519498e80f1478f37ba55bd6d27618c'))
+
+            # Test Case 13
+            test(unhex('00000000000000000000000000000000'
+                       '00000000000000000000000000000000'),
+                 unhex('000000000000000000000000'),
+                 unhex(''), unhex(''), unhex(''),
+                 unhex('530f8afbc74536b9a963b4f1c4cb738b'))
+
+            # Test Case 14
+            test(unhex('00000000000000000000000000000000'
+                       '00000000000000000000000000000000'),
+                 unhex('000000000000000000000000'),
+                 unhex('00000000000000000000000000000000'),
+                 unhex(''),
+                 unhex('cea7403d4d606b6e074ec5d3baf39d18'),
+                 unhex('d0d1c8a799996bf0265b98b5d48ab919'))
+
+            # Test Case 15
+            test(unhex('feffe9928665731c6d6a8f9467308308'
+                       'feffe9928665731c6d6a8f9467308308'),
+                 unhex('cafebabefacedbaddecaf888'),
+                 unhex('d9313225f88406e5a55909c5aff5269a'
+                       '86a7a9531534f7da2e4c303d8a318a72'
+                       '1c3c0c95956809532fcf0e2449a6b525'
+                       'b16aedf5aa0de657ba637b391aafd255'),
+                 unhex(''),
+                 unhex('522dc1f099567d07f47f37a32a84427d'
+                       '643a8cdcbfe5c0c97598a2bd2555d1aa'
+                       '8cb08e48590dbb3da7b08b1056828838'
+                       'c5f61e6393ba7a0abcc9f662898015ad'),
+                 unhex('b094dac5d93471bdec1a502270e3cc6c'))
+
+            # Test Case 16
+            test(unhex('feffe9928665731c6d6a8f9467308308'
+                       'feffe9928665731c6d6a8f9467308308'),
+                 unhex('cafebabefacedbaddecaf888'),
+                 unhex('d9313225f88406e5a55909c5aff5269a'
+                       '86a7a9531534f7da2e4c303d8a318a72'
+                       '1c3c0c95956809532fcf0e2449a6b525'
+                       'b16aedf5aa0de657ba637b39'),
+                 unhex('feedfacedeadbeeffeedfacedeadbeef'
+                       'abaddad2'),
+                 unhex('522dc1f099567d07f47f37a32a84427d'
+                       '643a8cdcbfe5c0c97598a2bd2555d1aa'
+                       '8cb08e48590dbb3da7b08b1056828838'
+                       'c5f61e6393ba7a0abcc9f662'),
+                 unhex('76fc6ece0f4e1768cddf8853bb2d551b'))
+
 if __name__ == "__main__":
     # Run the tests, suppressing automatic sys.exit and collecting the
     # unittest.TestProgram instance returned by unittest.main instead.
diff --git a/test/list-accel.py b/test/list-accel.py
index af93d420..ac92d376 100755
--- a/test/list-accel.py
+++ b/test/list-accel.py
@@ -25,10 +25,14 @@ def list_implementations(alg, checkfn):
 def list_cipher_implementations(alg):
     list_implementations(alg, lambda impl: ssh_cipher_new(impl) is not None)

+def list_mac_implementations(alg):
+    list_implementations(alg, lambda impl: ssh2_mac_new(impl, None) is not None)
+
 def list_hash_implementations(alg):
     list_implementations(alg, lambda impl: ssh_hash_new(impl) is not None)

 list_cipher_implementations("aes256_cbc")
+list_mac_implementations("aesgcm")
 list_hash_implementations("sha1")
 list_hash_implementations("sha256")
 list_hash_implementations("sha512")
diff --git a/test/testcrypt-enum.h b/test/testcrypt-enum.h
index ac73a766..90458fc0 100644
--- a/test/testcrypt-enum.h
+++ b/test/testcrypt-enum.h
@@ -36,6 +36,15 @@ BEGIN_ENUM_TYPE(macalg)
     ENUM_VALUE("hmac_sha1_96_buggy", &ssh_hmac_sha1_96_buggy)
     ENUM_VALUE("hmac_sha256", &ssh_hmac_sha256)
     ENUM_VALUE("poly1305", &ssh2_poly1305)
+    ENUM_VALUE("aesgcm", &ssh2_aesgcm_mac)
+    ENUM_VALUE("aesgcm_sw", &ssh2_aesgcm_mac_sw)
+    ENUM_VALUE("aesgcm_ref_poly", &ssh2_aesgcm_mac_ref_poly)
+#if HAVE_CLMUL
+    ENUM_VALUE("aesgcm_clmul", &ssh2_aesgcm_mac_clmul)
+#endif
+#if HAVE_NEON_PMULL
+    ENUM_VALUE("aesgcm_neon", &ssh2_aesgcm_mac_neon)
+#endif
 END_ENUM_TYPE(macalg)

 BEGIN_ENUM_TYPE(keyalg)
@@ -60,31 +70,43 @@ BEGIN_ENUM_TYPE(cipheralg)
     ENUM_VALUE("3des_ssh1", &ssh_3des_ssh1)
     ENUM_VALUE("des_cbc", &ssh_des)
     ENUM_VALUE("aes256_ctr", &ssh_aes256_sdctr)
+    ENUM_VALUE("aes256_gcm", &ssh_aes256_gcm)
     ENUM_VALUE("aes256_cbc", &ssh_aes256_cbc)
     ENUM_VALUE("aes192_ctr", &ssh_aes192_sdctr)
+    ENUM_VALUE("aes192_gcm", &ssh_aes192_gcm)
ENUM_VALUE("aes192_cbc", &ssh_aes192_cbc) ENUM_VALUE("aes128_ctr", &ssh_aes128_sdctr) + ENUM_VALUE("aes128_gcm", &ssh_aes128_gcm) ENUM_VALUE("aes128_cbc", &ssh_aes128_cbc) ENUM_VALUE("aes256_ctr_sw", &ssh_aes256_sdctr_sw) + ENUM_VALUE("aes256_gcm_sw", &ssh_aes256_gcm_sw) ENUM_VALUE("aes256_cbc_sw", &ssh_aes256_cbc_sw) ENUM_VALUE("aes192_ctr_sw", &ssh_aes192_sdctr_sw) + ENUM_VALUE("aes192_gcm_sw", &ssh_aes192_gcm_sw) ENUM_VALUE("aes192_cbc_sw", &ssh_aes192_cbc_sw) ENUM_VALUE("aes128_ctr_sw", &ssh_aes128_sdctr_sw) + ENUM_VALUE("aes128_gcm_sw", &ssh_aes128_gcm_sw) ENUM_VALUE("aes128_cbc_sw", &ssh_aes128_cbc_sw) #if HAVE_AES_NI ENUM_VALUE("aes256_ctr_ni", &ssh_aes256_sdctr_ni) + ENUM_VALUE("aes256_gcm_ni", &ssh_aes256_gcm_ni) ENUM_VALUE("aes256_cbc_ni", &ssh_aes256_cbc_ni) ENUM_VALUE("aes192_ctr_ni", &ssh_aes192_sdctr_ni) + ENUM_VALUE("aes192_gcm_ni", &ssh_aes192_gcm_ni) ENUM_VALUE("aes192_cbc_ni", &ssh_aes192_cbc_ni) ENUM_VALUE("aes128_ctr_ni", &ssh_aes128_sdctr_ni) + ENUM_VALUE("aes128_gcm_ni", &ssh_aes128_gcm_ni) ENUM_VALUE("aes128_cbc_ni", &ssh_aes128_cbc_ni) #endif #if HAVE_NEON_CRYPTO ENUM_VALUE("aes256_ctr_neon", &ssh_aes256_sdctr_neon) + ENUM_VALUE("aes256_gcm_neon", &ssh_aes256_gcm_neon) ENUM_VALUE("aes256_cbc_neon", &ssh_aes256_cbc_neon) ENUM_VALUE("aes192_ctr_neon", &ssh_aes192_sdctr_neon) + ENUM_VALUE("aes192_gcm_neon", &ssh_aes192_gcm_neon) ENUM_VALUE("aes192_cbc_neon", &ssh_aes192_cbc_neon) ENUM_VALUE("aes128_ctr_neon", &ssh_aes128_sdctr_neon) + ENUM_VALUE("aes128_gcm_neon", &ssh_aes128_gcm_neon) ENUM_VALUE("aes128_cbc_neon", &ssh_aes128_cbc_neon) #endif ENUM_VALUE("blowfish_ctr", &ssh_blowfish_ssh2_ctr) diff --git a/test/testcrypt-func.h b/test/testcrypt-func.h index f79c966e..bd007293 100644 --- a/test/testcrypt-func.h +++ b/test/testcrypt-func.h @@ -277,6 +277,9 @@ FUNC(void, ssh2_mac_next_message, ARG(val_mac, m)) FUNC_WRAPPED(val_string, ssh2_mac_genresult, ARG(val_mac, m)) FUNC(val_string_asciz_const, ssh2_mac_text_name, ARG(val_mac, m)) +FUNC(void, aesgcm_set_prefix_lengths, + ARG(val_mac, m), ARG(uint, skip), ARG(uint, aad)) + /* * The ssh_key abstraction. 
All the uses of BinarySink and * BinarySource in parameters are replaced with ordinary strings for diff --git a/test/testcrypt.c b/test/testcrypt.c index de09af33..3755ae72 100644 --- a/test/testcrypt.c +++ b/test/testcrypt.c @@ -1329,7 +1329,16 @@ strbuf *get_implementations_commasep(ptrlen alg) strbuf *out = strbuf_new(); put_datapl(out, alg); - if (ptrlen_startswith(alg, PTRLEN_LITERAL("aes"), NULL)) { + if (ptrlen_startswith(alg, PTRLEN_LITERAL("aesgcm"), NULL)) { + put_fmt(out, ",%.*s_sw", PTRLEN_PRINTF(alg)); + put_fmt(out, ",%.*s_ref_poly", PTRLEN_PRINTF(alg)); +#if HAVE_CLMUL + put_fmt(out, ",%.*s_clmul", PTRLEN_PRINTF(alg)); +#endif +#if HAVE_NEON_PMULL + put_fmt(out, ",%.*s_neon", PTRLEN_PRINTF(alg)); +#endif + } else if (ptrlen_startswith(alg, PTRLEN_LITERAL("aes"), NULL)) { put_fmt(out, ",%.*s_sw", PTRLEN_PRINTF(alg)); #if HAVE_AES_NI put_fmt(out, ",%.*s_ni", PTRLEN_PRINTF(alg)); diff --git a/test/testsc.c b/test/testsc.c index 6068dd86..0a643e97 100644 --- a/test/testsc.c +++ b/test/testsc.c @@ -259,6 +259,12 @@ VOLATILE_WRAPPED_DEFN(static, size_t, looplimit, (size_t x)) #define IF_SHA_NI(x) #endif +#if HAVE_CLMUL +#define IF_CLMUL(x) x +#else +#define IF_CLMUL(x) +#endif + #if HAVE_NEON_CRYPTO #define IF_NEON_CRYPTO(x) x #else @@ -271,6 +277,12 @@ VOLATILE_WRAPPED_DEFN(static, size_t, looplimit, (size_t x)) #define IF_NEON_SHA512(x) #endif +#if HAVE_NEON_PMULL +#define IF_NEON_PMULL(x) x +#else +#define IF_NEON_PMULL(x) +#endif + /* Ciphers that we expect to pass this test. Blowfish and Arcfour are * intentionally omitted, because we already know they don't. */ #define CIPHERS(X, Y) \ @@ -280,28 +292,40 @@ VOLATILE_WRAPPED_DEFN(static, size_t, looplimit, (size_t x)) X(Y, ssh_des) \ X(Y, ssh_des_sshcom_ssh2) \ X(Y, ssh_aes256_sdctr) \ + X(Y, ssh_aes256_gcm) \ X(Y, ssh_aes256_cbc) \ X(Y, ssh_aes192_sdctr) \ + X(Y, ssh_aes192_gcm) \ X(Y, ssh_aes192_cbc) \ X(Y, ssh_aes128_sdctr) \ + X(Y, ssh_aes128_gcm) \ X(Y, ssh_aes128_cbc) \ X(Y, ssh_aes256_sdctr_sw) \ + X(Y, ssh_aes256_gcm_sw) \ X(Y, ssh_aes256_cbc_sw) \ X(Y, ssh_aes192_sdctr_sw) \ + X(Y, ssh_aes192_gcm_sw) \ X(Y, ssh_aes192_cbc_sw) \ X(Y, ssh_aes128_sdctr_sw) \ + X(Y, ssh_aes128_gcm_sw) \ X(Y, ssh_aes128_cbc_sw) \ IF_AES_NI(X(Y, ssh_aes256_sdctr_ni)) \ + IF_AES_NI(X(Y, ssh_aes256_gcm_ni)) \ IF_AES_NI(X(Y, ssh_aes256_cbc_ni)) \ IF_AES_NI(X(Y, ssh_aes192_sdctr_ni)) \ + IF_AES_NI(X(Y, ssh_aes192_gcm_ni)) \ IF_AES_NI(X(Y, ssh_aes192_cbc_ni)) \ IF_AES_NI(X(Y, ssh_aes128_sdctr_ni)) \ + IF_AES_NI(X(Y, ssh_aes128_gcm_ni)) \ IF_AES_NI(X(Y, ssh_aes128_cbc_ni)) \ IF_NEON_CRYPTO(X(Y, ssh_aes256_sdctr_neon)) \ + IF_NEON_CRYPTO(X(Y, ssh_aes256_gcm_neon)) \ IF_NEON_CRYPTO(X(Y, ssh_aes256_cbc_neon)) \ IF_NEON_CRYPTO(X(Y, ssh_aes192_sdctr_neon)) \ + IF_NEON_CRYPTO(X(Y, ssh_aes192_gcm_neon)) \ IF_NEON_CRYPTO(X(Y, ssh_aes192_cbc_neon)) \ IF_NEON_CRYPTO(X(Y, ssh_aes128_sdctr_neon)) \ + IF_NEON_CRYPTO(X(Y, ssh_aes128_gcm_neon)) \ IF_NEON_CRYPTO(X(Y, ssh_aes128_cbc_neon)) \ X(Y, ssh2_chacha20_poly1305) \ /* end of list */ @@ -317,9 +341,17 @@ VOLATILE_WRAPPED_DEFN(static, size_t, looplimit, (size_t x)) X(Y, ssh_hmac_sha256) \ /* end of list */ -#define ALL_MACS(X, Y) \ - SIMPLE_MACS(X, Y) \ - X(Y, poly1305) \ +#define ALL_MACS(X, Y) \ + SIMPLE_MACS(X, Y) \ + X(Y, poly1305) \ + X(Y, aesgcm_sw_sw) \ + X(Y, aesgcm_sw_refpoly) \ + IF_AES_NI(X(Y, aesgcm_ni_sw)) \ + IF_NEON_CRYPTO(X(Y, aesgcm_neon_sw)) \ + IF_CLMUL(X(Y, aesgcm_sw_clmul)) \ + IF_NEON_PMULL(X(Y, aesgcm_sw_neon)) \ + IF_AES_NI(IF_CLMUL(X(Y, aesgcm_ni_clmul))) \ + 
IF_NEON_CRYPTO(IF_NEON_PMULL(X(Y, aesgcm_neon_neon))) \ /* end of list */ #define MAC_TESTLIST(X, name) X(mac_ ## name) @@ -1473,6 +1505,58 @@ static void test_mac_poly1305(void) test_mac(&ssh2_poly1305, &ssh2_chacha20_poly1305); } +static void test_mac_aesgcm_sw_sw(void) +{ + test_mac(&ssh2_aesgcm_mac_sw, &ssh_aes128_gcm_sw); +} + +static void test_mac_aesgcm_sw_refpoly(void) +{ + test_mac(&ssh2_aesgcm_mac_ref_poly, &ssh_aes128_gcm_sw); +} + +#if HAVE_AES_NI +static void test_mac_aesgcm_ni_sw(void) +{ + test_mac(&ssh2_aesgcm_mac_sw, &ssh_aes128_gcm_ni); +} +#endif + +#if HAVE_NEON_CRYPTO +static void test_mac_aesgcm_neon_sw(void) +{ + test_mac(&ssh2_aesgcm_mac_sw, &ssh_aes128_gcm_neon); +} +#endif + +#if HAVE_CLMUL +static void test_mac_aesgcm_sw_clmul(void) +{ + test_mac(&ssh2_aesgcm_mac_clmul, &ssh_aes128_gcm_sw); +} +#endif + +#if HAVE_NEON_PMULL +static void test_mac_aesgcm_sw_neon(void) +{ + test_mac(&ssh2_aesgcm_mac_neon, &ssh_aes128_gcm_sw); +} +#endif + +#if HAVE_AES_NI && HAVE_CLMUL +static void test_mac_aesgcm_ni_clmul(void) +{ + test_mac(&ssh2_aesgcm_mac_clmul, &ssh_aes128_gcm_ni); +} +#endif + +#if HAVE_NEON_CRYPTO && HAVE_NEON_PMULL +static void test_mac_aesgcm_neon_neon(void) +{ + test_mac(&ssh2_aesgcm_mac_neon, &ssh_aes128_gcm_neon); +} +#endif + static void test_hash(const ssh_hashalg *halg) { ssh_hash *h = ssh_hash_new(halg); diff --git a/unix/utils/arm_arch_queries.c b/unix/utils/arm_arch_queries.c index d6dc97bc..c3dc286b 100644 --- a/unix/utils/arm_arch_queries.c +++ b/unix/utils/arm_arch_queries.c @@ -27,6 +27,21 @@ bool platform_aes_neon_available(void) #endif } +bool platform_pmull_neon_available(void) +{ +#if defined HWCAP_PMULL + return getauxval(AT_HWCAP) & HWCAP_PMULL; +#elif defined HWCAP2_PMULL + return getauxval(AT_HWCAP2) & HWCAP2_PMULL; +#elif defined __APPLE__ + SysctlResult res = test_sysctl_flag("hw.optional.arm.FEAT_PMULL"); + /* As above, treat 'missing' as enabled */ + return res != SYSCTL_OFF; +#else + return false; +#endif +} + bool platform_sha256_neon_available(void) { #if defined HWCAP_SHA2 diff --git a/windows/utils/arm_arch_queries.c b/windows/utils/arm_arch_queries.c index 439a59fb..b683ac15 100644 --- a/windows/utils/arm_arch_queries.c +++ b/windows/utils/arm_arch_queries.c @@ -20,6 +20,11 @@ bool platform_aes_neon_available(void) return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE); } +bool platform_pmull_neon_available(void) +{ + return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE); +} + bool platform_sha256_neon_available(void) { return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
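A closing note on the IV discipline that testAESGCMIV pins down: the 16-byte GCM IV is treated as a 4-byte fixed field, an 8-byte per-message counter, and a 4-byte block counter that restarts at 1 for each message. The sketch below is a standalone model of that behaviour, with hypothetical struct and function names chosen for illustration; it is not PuTTY's internal representation:

    #include <stdint.h>

    /* Model of the SSH AES-GCM IV layout exercised by testAESGCMIV:
     * fixed(32 bits) || message counter(64) || block counter(32). */
    typedef struct {
        uint32_t fixed; /* constant for the whole session */
        uint64_t msg;   /* incremented once per SSH message */
        uint32_t blk;   /* incremented per 16-byte block, starts at 1 */
    } gcm_iv_model;

    static void next_message(gcm_iv_model *iv)
    {
        iv->msg++;   /* wraps mod 2^64: carries stay inside the message
                      * counter and never reach the fixed word */
        iv->blk = 1; /* block counter restarts for the new message */
    }

That matches the three assertions in the test: the low word of the message counter increments, carries propagate within the 64-bit counter, and nothing overflows into the fixed word at the top.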