diff --git a/cmake/cmake.h.in b/cmake/cmake.h.in
index f051c759..b6c07d2c 100644
--- a/cmake/cmake.h.in
+++ b/cmake/cmake.h.in
@@ -40,3 +40,11 @@
 #cmakedefine01 HAVE_SO_PEERCRED
 #cmakedefine01 HAVE_PANGO_FONT_FAMILY_IS_MONOSPACE
 #cmakedefine01 HAVE_PANGO_FONT_MAP_LIST_FAMILIES
+
+#cmakedefine01 HAVE_AES_NI
+#cmakedefine01 HAVE_SHA_NI
+#cmakedefine01 HAVE_SHAINTRIN_H
+#cmakedefine01 HAVE_NEON_CRYPTO
+#cmakedefine01 HAVE_NEON_SHA512
+#cmakedefine01 HAVE_NEON_SHA512_INTRINSICS
+#cmakedefine01 USE_ARM64_NEON_H
diff --git a/crypto/CMakeLists.txt b/crypto/CMakeLists.txt
index 74f86cd4..917614be 100644
--- a/crypto/CMakeLists.txt
+++ b/crypto/CMakeLists.txt
@@ -1,5 +1,7 @@
 add_sources_from_current_dir(crypto
-  aes.c
+  aes-common.c
+  aes-select.c
+  aes-sw.c
   arcfour.c
   argon2.c
   bcrypt.c
@@ -23,8 +25,181 @@ add_sources_from_current_dir(crypto
   pubkey-ppk.c
   pubkey-ssh1.c
   rsa.c
-  sha256.c
-  sha512.c
+  sha256-common.c
+  sha256-select.c
+  sha256-sw.c
+  sha512-common.c
+  sha512-select.c
+  sha512-sw.c
   sha3.c
-  sha1.c
+  sha1-common.c
+  sha1-select.c
+  sha1-sw.c
   xdmauth.c)
+
+include(CheckCSourceCompiles)
+
+function(test_compile_with_flags outvar)
+  cmake_parse_arguments(OPT "" ""
+    "GNU_FLAGS;MSVC_FLAGS;ADD_SOURCES_IF_SUCCESSFUL;TEST_SOURCE" "${ARGN}")
+
+  # Figure out what flags are applicable to this compiler.
+  set(flags)
+  if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR
+     CMAKE_C_COMPILER_ID MATCHES "Clang")
+    set(flags ${OPT_GNU_FLAGS})
+  endif()
+  if(CMAKE_C_COMPILER_ID MATCHES "MSVC")
+    set(flags ${OPT_MSVC_FLAGS})
+  endif()
+
+  # See if we can compile the provided test program.
+  string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} ${flags})
+  check_c_source_compiles("${OPT_TEST_SOURCE}" "${outvar}")
+
+  if(${outvar} AND OPT_ADD_SOURCES_IF_SUCCESSFUL)
+    # Make an object library that compiles the implementation with the
+    # necessary flags, and add the resulting objects to the crypto
+    # library.
+    set(libname object_lib_${outvar})
+    add_library(${libname} OBJECT ${OPT_ADD_SOURCES_IF_SUCCESSFUL})
+    target_compile_options(${libname} PRIVATE ${flags})
+    target_sources(crypto PRIVATE $<TARGET_OBJECTS:${libname}>)
+  endif()
+
+  # Export the output to the caller's scope, so that further tests can
+  # be based on it.
+  set(${outvar} ${${outvar}} PARENT_SCOPE)
+endfunction()
+
+# ----------------------------------------------------------------------
+# Try to enable x86 intrinsics-based crypto implementations.
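As a concrete illustration of the helper defined above: when one of the checks below succeeds with its GNU-style flags (take HAVE_AES_NI as the example), the effect is roughly what this hand-written CMake would do, with the object-library name derived from the output variable and the sources and flags taken from the call site. This is a sketch for orientation only, not additional build code from the patch:

    add_library(object_lib_HAVE_AES_NI OBJECT aes-ni.c)
    target_compile_options(object_lib_HAVE_AES_NI PRIVATE -msse4.1 -maes)
    target_sources(crypto PRIVATE $<TARGET_OBJECTS:object_lib_HAVE_AES_NI>)
    set(HAVE_AES_NI ${HAVE_AES_NI} PARENT_SCOPE)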
+
+test_compile_with_flags(HAVE_WMMINTRIN_H
+  GNU_FLAGS -msse4.1
+  TEST_SOURCE "
+    #include <wmmintrin.h>
+    #include <smmintrin.h>
+    volatile __m128i r, a, b;
+    int main(void) { r = _mm_xor_si128(a, b); }")
+if(HAVE_WMMINTRIN_H)
+  test_compile_with_flags(HAVE_AES_NI
+    GNU_FLAGS -msse4.1 -maes
+    TEST_SOURCE "
+      #include <wmmintrin.h>
+      #include <smmintrin.h>
+      volatile __m128i r, a, b;
+      int main(void) { r = _mm_aesenc_si128(a, b); }"
+    ADD_SOURCES_IF_SUCCESSFUL aes-ni.c)
+
+  # shaintrin.h doesn't exist on all compilers; sometimes it's folded
+  # into the other headers
+  test_compile_with_flags(HAVE_SHAINTRIN_H
+    GNU_FLAGS -msse4.1 -msha
+    TEST_SOURCE "
+      #include <wmmintrin.h>
+      #include <smmintrin.h>
+      #include <immintrin.h>
+      #include <shaintrin.h>
+      volatile __m128i r, a, b;
+      int main(void) { r = _mm_xor_si128(a, b); }")
+  if(HAVE_SHAINTRIN_H)
+    set(include_shaintrin "#include <shaintrin.h>")
+  else()
+    set(include_shaintrin "")
+  endif()
+
+  test_compile_with_flags(HAVE_SHA_NI
+    GNU_FLAGS -msse4.1 -msha
+    TEST_SOURCE "
+      #include <wmmintrin.h>
+      #include <smmintrin.h>
+      #include <immintrin.h>
+      ${include_shaintrin}
+      volatile __m128i r, a, b, c;
+      int main(void) { r = _mm_sha256rnds2_epu32(a, b, c); }"
+    ADD_SOURCES_IF_SUCCESSFUL sha256-ni.c sha1-ni.c)
+endif()
+
+# ----------------------------------------------------------------------
+# Try to enable Arm Neon intrinsics-based crypto implementations.
+
+# Start by checking which header file we need. ACLE specifies that it
+# ought to be <arm_neon.h>, on both 32- and 64-bit Arm, but Visual
+# Studio for some reason renamed the header to <arm64_neon.h> in
+# 64-bit, and gives an error if you use the standard name. (However,
+# clang-cl does let you use the standard name.)
+test_compile_with_flags(HAVE_ARM_NEON_H
+  MSVC_FLAGS -D_ARM_USE_NEW_NEON_INTRINSICS
+  TEST_SOURCE "
+    #include <arm_neon.h>
+    volatile uint8x16_t r, a, b;
+    int main(void) { r = veorq_u8(a, b); }")
+if(HAVE_ARM_NEON_H)
+  set(neon ON)
+  set(neon_header "arm_neon.h")
+else()
+  test_compile_with_flags(HAVE_ARM64_NEON_H TEST_SOURCE "
+    #include <arm64_neon.h>
+    volatile uint8x16_t r, a, b;
+    int main(void) { r = veorq_u8(a, b); }")
+  if(HAVE_ARM64_NEON_H)
+    set(neon ON)
+    set(neon_header "arm64_neon.h")
+    set(USE_ARM64_NEON_H ON)
+  endif()
+endif()
+
+if(neon)
+  # If we have _some_ NEON header, look for the individual things we
+  # can enable with it.
+
+  # The 'crypto' architecture extension includes support for AES,
+  # SHA-1, and SHA-256.
+  test_compile_with_flags(HAVE_NEON_CRYPTO
+    GNU_FLAGS -march=armv8-a+crypto
+    MSVC_FLAGS -D_ARM_USE_NEW_NEON_INTRINSICS
+    TEST_SOURCE "
+      #include <${neon_header}>
+      volatile uint8x16_t r, a, b;
+      volatile uint32x4_t s, x, y, z;
+      int main(void) { r = vaeseq_u8(a, b); s = vsha256hq_u32(x, y, z); }"
+    ADD_SOURCES_IF_SUCCESSFUL aes-neon.c sha256-neon.c sha1-neon.c)
+
+  # The 'sha3' architecture extension, despite the name, includes
+  # support for SHA-512 (from the SHA-2 standard) as well as SHA-3
+  # proper.
+  #
+  # Versions of clang up to and including clang 12 support this
+  # extension in assembly language, but not the ACLE intrinsics for
+  # it. So we check both.
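Whichever of the two checks just below succeeds, the result is folded into the single HAVE_NEON_SHA512 flag and exported to the parent scope; the #cmakedefine01 lines added to cmake/cmake.h.in above then turn each exported flag into a 0/1 macro in the generated header, which C code tests with plain #if in the same style as the IF_NI/IF_NEON selector macros in aes-select.c later in this patch. A minimal illustrative sketch of that flow, not part of the patch (the extern identifier shown is hypothetical):

    /* Fragment of the generated header, assuming the ACLE-intrinsics
     * check failed but the inline-assembler fallback check passed: */
    #define HAVE_NEON_SHA512 1
    #define HAVE_NEON_SHA512_INTRINSICS 0

    /* A consumer can then compile in the accelerated implementation
     * whenever some form of SHA-512 NEON support exists at all: */
    #if HAVE_NEON_SHA512
    extern const ssh_hashalg ssh_sha512_neon;   /* hypothetical identifier */
    #endif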
+ test_compile_with_flags(HAVE_NEON_SHA512_INTRINSICS + GNU_FLAGS -march=armv8.2-a+crypto+sha3 + TEST_SOURCE " + #include <${neon_header}> + volatile uint64x2_t r, a, b; + int main(void) { r = vsha512su0q_u64(a, b); }" + ADD_SOURCES_IF_SUCCESSFUL sha512-neon.c) + if(HAVE_NEON_SHA512_INTRINSICS) + set(HAVE_NEON_SHA512 ON) + else() + test_compile_with_flags(HAVE_NEON_SHA512_ASM + GNU_FLAGS -march=armv8.2-a+crypto+sha3 + TEST_SOURCE " + #include <${neon_header}> + volatile uint64x2_t r, a; + int main(void) { __asm__(\"sha512su0 %0.2D,%1.2D\" : \"+w\" (r) : \"w\" (a)); }" + ADD_SOURCES_IF_SUCCESSFUL sha512-neon.c) + if(HAVE_NEON_SHA512_ASM) + set(HAVE_NEON_SHA512 ON) + endif() + endif() +endif() + +set(HAVE_AES_NI ${HAVE_AES_NI} PARENT_SCOPE) +set(HAVE_SHA_NI ${HAVE_SHA_NI} PARENT_SCOPE) +set(HAVE_SHAINTRIN_H ${HAVE_SHAINTRIN_H} PARENT_SCOPE) +set(HAVE_NEON_CRYPTO ${HAVE_NEON_CRYPTO} PARENT_SCOPE) +set(HAVE_NEON_SHA512 ${HAVE_NEON_SHA512} PARENT_SCOPE) +set(HAVE_NEON_SHA512_INTRINSICS ${HAVE_NEON_SHA512_INTRINSICS} PARENT_SCOPE) +set(USE_ARM64_NEON_H ${USE_ARM64_NEON_H} PARENT_SCOPE) diff --git a/crypto/aes-common.c b/crypto/aes-common.c new file mode 100644 index 00000000..e1c41ddf --- /dev/null +++ b/crypto/aes-common.c @@ -0,0 +1,14 @@ +/* + * Common variable definitions across all the AES implementations. + */ + +#include "ssh.h" +#include "aes.h" + +const uint8_t aes_key_setup_round_constants[10] = { + /* The first few powers of X in GF(2^8), used during key setup. + * This can safely be a lookup table without side channel risks, + * because key setup iterates through it once in a standard way + * regardless of the key. */ + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, +}; diff --git a/crypto/aes-neon.c b/crypto/aes-neon.c new file mode 100644 index 00000000..d47d2fd3 --- /dev/null +++ b/crypto/aes-neon.c @@ -0,0 +1,294 @@ +/* ---------------------------------------------------------------------- + * Hardware-accelerated implementation of AES using Arm NEON. + */ + +#include "ssh.h" +#include "aes.h" + +#if USE_ARM64_NEON_H +#include +#else +#include +#endif + +static bool aes_neon_available(void) +{ + /* + * For Arm, we delegate to a per-platform AES detection function, + * because it has to be implemented by asking the operating system + * rather than directly querying the CPU. + * + * That's because Arm systems commonly have multiple cores that + * are not all alike, so any method of querying whether NEON + * crypto instructions work on the _current_ CPU - even one as + * crude as just trying one and catching the SIGILL - wouldn't + * give an answer that you could still rely on the first time the + * OS migrated your process to another CPU. + */ + return platform_aes_neon_available(); +} + +/* + * Core NEON encrypt/decrypt functions, one per length and direction. + */ + +#define NEON_CIPHER(len, repmacro) \ + static inline uint8x16_t aes_neon_##len##_e( \ + uint8x16_t v, const uint8x16_t *keysched) \ + { \ + repmacro(v = vaesmcq_u8(vaeseq_u8(v, *keysched++));); \ + v = vaeseq_u8(v, *keysched++); \ + return veorq_u8(v, *keysched); \ + } \ + static inline uint8x16_t aes_neon_##len##_d( \ + uint8x16_t v, const uint8x16_t *keysched) \ + { \ + repmacro(v = vaesimcq_u8(vaesdq_u8(v, *keysched++));); \ + v = vaesdq_u8(v, *keysched++); \ + return veorq_u8(v, *keysched); \ + } + +NEON_CIPHER(128, REP9) +NEON_CIPHER(192, REP11) +NEON_CIPHER(256, REP13) + +/* + * The main key expansion. 
+ */ +static void aes_neon_key_expand( + const unsigned char *key, size_t key_words, + uint8x16_t *keysched_e, uint8x16_t *keysched_d) +{ + size_t rounds = key_words + 6; + size_t sched_words = (rounds + 1) * 4; + + /* + * Store the key schedule as 32-bit integers during expansion, so + * that it's easy to refer back to individual previous words. We + * collect them into the final uint8x16_t form at the end. + */ + uint32_t sched[MAXROUNDKEYS * 4]; + + unsigned rconpos = 0; + + for (size_t i = 0; i < sched_words; i++) { + if (i < key_words) { + sched[i] = GET_32BIT_LSB_FIRST(key + 4 * i); + } else { + uint32_t temp = sched[i - 1]; + + bool rotate_and_round_constant = (i % key_words == 0); + bool sub = rotate_and_round_constant || + (key_words == 8 && i % 8 == 4); + + if (rotate_and_round_constant) + temp = (temp << 24) | (temp >> 8); + + if (sub) { + uint32x4_t v32 = vdupq_n_u32(temp); + uint8x16_t v8 = vreinterpretq_u8_u32(v32); + v8 = vaeseq_u8(v8, vdupq_n_u8(0)); + v32 = vreinterpretq_u32_u8(v8); + temp = vget_lane_u32(vget_low_u32(v32), 0); + } + + if (rotate_and_round_constant) { + assert(rconpos < lenof(aes_key_setup_round_constants)); + temp ^= aes_key_setup_round_constants[rconpos++]; + } + + sched[i] = sched[i - key_words] ^ temp; + } + } + + /* + * Combine the key schedule words into uint8x16_t vectors and + * store them in the output context. + */ + for (size_t round = 0; round <= rounds; round++) + keysched_e[round] = vreinterpretq_u8_u32(vld1q_u32(sched + 4*round)); + + smemclr(sched, sizeof(sched)); + + /* + * Now prepare the modified keys for the inverse cipher. + */ + for (size_t eround = 0; eround <= rounds; eround++) { + size_t dround = rounds - eround; + uint8x16_t rkey = keysched_e[eround]; + if (eround && dround) /* neither first nor last */ + rkey = vaesimcq_u8(rkey); + keysched_d[dround] = rkey; + } +} + +/* + * Auxiliary routine to reverse the byte order of a vector, so that + * the SDCTR IV can be made big-endian for feeding to the cipher. + * + * In fact we don't need to reverse the vector _all_ the way; we leave + * the two lanes in MSW,LSW order, because that makes no difference to + * the efficiency of the increment. That way we only have to reverse + * bytes within each lane in this function. + */ +static inline uint8x16_t aes_neon_sdctr_reverse(uint8x16_t v) +{ + return vrev64q_u8(v); +} + +/* + * Auxiliary routine to increment the 128-bit counter used in SDCTR + * mode. There's no instruction to treat a 128-bit vector as a single + * long integer, so instead we have to increment the bottom half + * unconditionally, and the top half if the bottom half started off as + * all 1s (in which case there was about to be a carry). + */ +static inline uint8x16_t aes_neon_sdctr_increment(uint8x16_t in) +{ +#ifdef __aarch64__ + /* There will be a carry if the low 64 bits are all 1s. */ + uint64x1_t all1 = vcreate_u64(0xFFFFFFFFFFFFFFFF); + uint64x1_t carry = vceq_u64(vget_high_u64(vreinterpretq_u64_u8(in)), all1); + + /* Make a word whose bottom half is unconditionally all 1s, and + * the top half is 'carry', i.e. all 0s most of the time but all + * 1s if we need to increment the top half. Then that word is what + * we need to _subtract_ from the input counter. */ + uint64x2_t subtrahend = vcombine_u64(carry, all1); +#else + /* AArch32 doesn't have comparisons that operate on a 64-bit lane, + * so we start by comparing each 32-bit half of the low 64 bits + * _separately_ to all-1s. 
*/ + uint32x2_t all1 = vdup_n_u32(0xFFFFFFFF); + uint32x2_t carry = vceq_u32( + vget_high_u32(vreinterpretq_u32_u8(in)), all1); + + /* Swap the 32-bit words of the compare output, and AND with the + * unswapped version. Now carry is all 1s iff the bottom half of + * the input counter was all 1s, and all 0s otherwise. */ + carry = vand_u32(carry, vrev64_u32(carry)); + + /* Now make the vector to subtract in the same way as above. */ + uint64x2_t subtrahend = vreinterpretq_u64_u32(vcombine_u32(carry, all1)); +#endif + + return vreinterpretq_u8_u64( + vsubq_u64(vreinterpretq_u64_u8(in), subtrahend)); +} + +/* + * The SSH interface and the cipher modes. + */ + +typedef struct aes_neon_context aes_neon_context; +struct aes_neon_context { + uint8x16_t keysched_e[MAXROUNDKEYS], keysched_d[MAXROUNDKEYS], iv; + + ssh_cipher ciph; +}; + +static ssh_cipher *aes_neon_new(const ssh_cipheralg *alg) +{ + const struct aes_extra *extra = (const struct aes_extra *)alg->extra; + if (!check_availability(extra)) + return NULL; + + aes_neon_context *ctx = snew(aes_neon_context); + ctx->ciph.vt = alg; + return &ctx->ciph; +} + +static void aes_neon_free(ssh_cipher *ciph) +{ + aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); + smemclr(ctx, sizeof(*ctx)); + sfree(ctx); +} + +static void aes_neon_setkey(ssh_cipher *ciph, const void *vkey) +{ + aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); + const unsigned char *key = (const unsigned char *)vkey; + + aes_neon_key_expand(key, ctx->ciph.vt->real_keybits / 32, + ctx->keysched_e, ctx->keysched_d); +} + +static void aes_neon_setiv_cbc(ssh_cipher *ciph, const void *iv) +{ + aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); + ctx->iv = vld1q_u8(iv); +} + +static void aes_neon_setiv_sdctr(ssh_cipher *ciph, const void *iv) +{ + aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); + uint8x16_t counter = vld1q_u8(iv); + ctx->iv = aes_neon_sdctr_reverse(counter); +} + +typedef uint8x16_t (*aes_neon_fn)(uint8x16_t v, const uint8x16_t *keysched); + +static inline void aes_cbc_neon_encrypt( + ssh_cipher *ciph, void *vblk, int blklen, aes_neon_fn encrypt) +{ + aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); + + for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen; + blk < finish; blk += 16) { + uint8x16_t plaintext = vld1q_u8(blk); + uint8x16_t cipher_input = veorq_u8(plaintext, ctx->iv); + uint8x16_t ciphertext = encrypt(cipher_input, ctx->keysched_e); + vst1q_u8(blk, ciphertext); + ctx->iv = ciphertext; + } +} + +static inline void aes_cbc_neon_decrypt( + ssh_cipher *ciph, void *vblk, int blklen, aes_neon_fn decrypt) +{ + aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); + + for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen; + blk < finish; blk += 16) { + uint8x16_t ciphertext = vld1q_u8(blk); + uint8x16_t decrypted = decrypt(ciphertext, ctx->keysched_d); + uint8x16_t plaintext = veorq_u8(decrypted, ctx->iv); + vst1q_u8(blk, plaintext); + ctx->iv = ciphertext; + } +} + +static inline void aes_sdctr_neon( + ssh_cipher *ciph, void *vblk, int blklen, aes_neon_fn encrypt) +{ + aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); + + for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen; + blk < finish; blk += 16) { + uint8x16_t counter = aes_neon_sdctr_reverse(ctx->iv); + uint8x16_t keystream = encrypt(counter, ctx->keysched_e); + uint8x16_t input = vld1q_u8(blk); + uint8x16_t output = veorq_u8(input, keystream); + vst1q_u8(blk, 
output); + ctx->iv = aes_neon_sdctr_increment(ctx->iv); + } +} + +#define NEON_ENC_DEC(len) \ + static void aes##len##_neon_cbc_encrypt( \ + ssh_cipher *ciph, void *vblk, int blklen) \ + { aes_cbc_neon_encrypt(ciph, vblk, blklen, aes_neon_##len##_e); } \ + static void aes##len##_neon_cbc_decrypt( \ + ssh_cipher *ciph, void *vblk, int blklen) \ + { aes_cbc_neon_decrypt(ciph, vblk, blklen, aes_neon_##len##_d); } \ + static void aes##len##_neon_sdctr( \ + ssh_cipher *ciph, void *vblk, int blklen) \ + { aes_sdctr_neon(ciph, vblk, blklen, aes_neon_##len##_e); } \ + +NEON_ENC_DEC(128) +NEON_ENC_DEC(192) +NEON_ENC_DEC(256) + +AES_EXTRA(_neon); +AES_ALL_VTABLES(_neon, "NEON accelerated"); diff --git a/crypto/aes-ni.c b/crypto/aes-ni.c new file mode 100644 index 00000000..22348de4 --- /dev/null +++ b/crypto/aes-ni.c @@ -0,0 +1,281 @@ +/* + * Hardware-accelerated implementation of AES using x86 AES-NI. + */ + +#include "ssh.h" +#include "aes.h" + +#include +#include + +#if defined(__clang__) || defined(__GNUC__) +#include +#define GET_CPU_ID(out) __cpuid(1, (out)[0], (out)[1], (out)[2], (out)[3]) +#else +#define GET_CPU_ID(out) __cpuid(out, 1) +#endif + +static bool aes_ni_available(void) +{ + /* + * Determine if AES is available on this CPU, by checking that + * both AES itself and SSE4.1 are supported. + */ + unsigned int CPUInfo[4]; + GET_CPU_ID(CPUInfo); + return (CPUInfo[2] & (1 << 25)) && (CPUInfo[2] & (1 << 19)); +} + +/* + * Core AES-NI encrypt/decrypt functions, one per length and direction. + */ + +#define NI_CIPHER(len, dir, dirlong, repmacro) \ + static inline __m128i aes_ni_##len##_##dir( \ + __m128i v, const __m128i *keysched) \ + { \ + v = _mm_xor_si128(v, *keysched++); \ + repmacro(v = _mm_aes##dirlong##_si128(v, *keysched++);); \ + return _mm_aes##dirlong##last_si128(v, *keysched); \ + } + +NI_CIPHER(128, e, enc, REP9) +NI_CIPHER(128, d, dec, REP9) +NI_CIPHER(192, e, enc, REP11) +NI_CIPHER(192, d, dec, REP11) +NI_CIPHER(256, e, enc, REP13) +NI_CIPHER(256, d, dec, REP13) + +/* + * The main key expansion. + */ +static void aes_ni_key_expand( + const unsigned char *key, size_t key_words, + __m128i *keysched_e, __m128i *keysched_d) +{ + size_t rounds = key_words + 6; + size_t sched_words = (rounds + 1) * 4; + + /* + * Store the key schedule as 32-bit integers during expansion, so + * that it's easy to refer back to individual previous words. We + * collect them into the final __m128i form at the end. + */ + uint32_t sched[MAXROUNDKEYS * 4]; + + unsigned rconpos = 0; + + for (size_t i = 0; i < sched_words; i++) { + if (i < key_words) { + sched[i] = GET_32BIT_LSB_FIRST(key + 4 * i); + } else { + uint32_t temp = sched[i - 1]; + + bool rotate_and_round_constant = (i % key_words == 0); + bool only_sub = (key_words == 8 && i % 8 == 4); + + if (rotate_and_round_constant) { + __m128i v = _mm_setr_epi32(0,temp,0,0); + v = _mm_aeskeygenassist_si128(v, 0); + temp = _mm_extract_epi32(v, 1); + + assert(rconpos < lenof(aes_key_setup_round_constants)); + temp ^= aes_key_setup_round_constants[rconpos++]; + } else if (only_sub) { + __m128i v = _mm_setr_epi32(0,temp,0,0); + v = _mm_aeskeygenassist_si128(v, 0); + temp = _mm_extract_epi32(v, 0); + } + + sched[i] = sched[i - key_words] ^ temp; + } + } + + /* + * Combine the key schedule words into __m128i vectors and store + * them in the output context. 
+ */ + for (size_t round = 0; round <= rounds; round++) + keysched_e[round] = _mm_setr_epi32( + sched[4*round ], sched[4*round+1], + sched[4*round+2], sched[4*round+3]); + + smemclr(sched, sizeof(sched)); + + /* + * Now prepare the modified keys for the inverse cipher. + */ + for (size_t eround = 0; eround <= rounds; eround++) { + size_t dround = rounds - eround; + __m128i rkey = keysched_e[eround]; + if (eround && dround) /* neither first nor last */ + rkey = _mm_aesimc_si128(rkey); + keysched_d[dround] = rkey; + } +} + +/* + * Auxiliary routine to increment the 128-bit counter used in SDCTR + * mode. + */ +static inline __m128i aes_ni_sdctr_increment(__m128i v) +{ + const __m128i ONE = _mm_setr_epi32(1,0,0,0); + const __m128i ZERO = _mm_setzero_si128(); + + /* Increment the low-order 64 bits of v */ + v = _mm_add_epi64(v, ONE); + /* Check if they've become zero */ + __m128i cmp = _mm_cmpeq_epi64(v, ZERO); + /* If so, the low half of cmp is all 1s. Pack that into the high + * half of addend with zero in the low half. */ + __m128i addend = _mm_unpacklo_epi64(ZERO, cmp); + /* And subtract that from v, which increments the high 64 bits iff + * the low 64 wrapped round. */ + v = _mm_sub_epi64(v, addend); + + return v; +} + +/* + * Auxiliary routine to reverse the byte order of a vector, so that + * the SDCTR IV can be made big-endian for feeding to the cipher. + */ +static inline __m128i aes_ni_sdctr_reverse(__m128i v) +{ + v = _mm_shuffle_epi8( + v, _mm_setr_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)); + return v; +} + +/* + * The SSH interface and the cipher modes. + */ + +typedef struct aes_ni_context aes_ni_context; +struct aes_ni_context { + __m128i keysched_e[MAXROUNDKEYS], keysched_d[MAXROUNDKEYS], iv; + + void *pointer_to_free; + ssh_cipher ciph; +}; + +static ssh_cipher *aes_ni_new(const ssh_cipheralg *alg) +{ + const struct aes_extra *extra = (const struct aes_extra *)alg->extra; + if (!check_availability(extra)) + return NULL; + + /* + * The __m128i variables in the context structure need to be + * 16-byte aligned, but not all malloc implementations that this + * code has to work with will guarantee to return a 16-byte + * aligned pointer. So we over-allocate, manually realign the + * pointer ourselves, and store the original one inside the + * context so we know how to free it later. 
+ */ + void *allocation = smalloc(sizeof(aes_ni_context) + 15); + uintptr_t alloc_address = (uintptr_t)allocation; + uintptr_t aligned_address = (alloc_address + 15) & ~15; + aes_ni_context *ctx = (aes_ni_context *)aligned_address; + + ctx->ciph.vt = alg; + ctx->pointer_to_free = allocation; + return &ctx->ciph; +} + +static void aes_ni_free(ssh_cipher *ciph) +{ + aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); + void *allocation = ctx->pointer_to_free; + smemclr(ctx, sizeof(*ctx)); + sfree(allocation); +} + +static void aes_ni_setkey(ssh_cipher *ciph, const void *vkey) +{ + aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); + const unsigned char *key = (const unsigned char *)vkey; + + aes_ni_key_expand(key, ctx->ciph.vt->real_keybits / 32, + ctx->keysched_e, ctx->keysched_d); +} + +static void aes_ni_setiv_cbc(ssh_cipher *ciph, const void *iv) +{ + aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); + ctx->iv = _mm_loadu_si128(iv); +} + +static void aes_ni_setiv_sdctr(ssh_cipher *ciph, const void *iv) +{ + aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); + __m128i counter = _mm_loadu_si128(iv); + ctx->iv = aes_ni_sdctr_reverse(counter); +} + +typedef __m128i (*aes_ni_fn)(__m128i v, const __m128i *keysched); + +static inline void aes_cbc_ni_encrypt( + ssh_cipher *ciph, void *vblk, int blklen, aes_ni_fn encrypt) +{ + aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); + + for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen; + blk < finish; blk += 16) { + __m128i plaintext = _mm_loadu_si128((const __m128i *)blk); + __m128i cipher_input = _mm_xor_si128(plaintext, ctx->iv); + __m128i ciphertext = encrypt(cipher_input, ctx->keysched_e); + _mm_storeu_si128((__m128i *)blk, ciphertext); + ctx->iv = ciphertext; + } +} + +static inline void aes_cbc_ni_decrypt( + ssh_cipher *ciph, void *vblk, int blklen, aes_ni_fn decrypt) +{ + aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); + + for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen; + blk < finish; blk += 16) { + __m128i ciphertext = _mm_loadu_si128((const __m128i *)blk); + __m128i decrypted = decrypt(ciphertext, ctx->keysched_d); + __m128i plaintext = _mm_xor_si128(decrypted, ctx->iv); + _mm_storeu_si128((__m128i *)blk, plaintext); + ctx->iv = ciphertext; + } +} + +static inline void aes_sdctr_ni( + ssh_cipher *ciph, void *vblk, int blklen, aes_ni_fn encrypt) +{ + aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); + + for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen; + blk < finish; blk += 16) { + __m128i counter = aes_ni_sdctr_reverse(ctx->iv); + __m128i keystream = encrypt(counter, ctx->keysched_e); + __m128i input = _mm_loadu_si128((const __m128i *)blk); + __m128i output = _mm_xor_si128(input, keystream); + _mm_storeu_si128((__m128i *)blk, output); + ctx->iv = aes_ni_sdctr_increment(ctx->iv); + } +} + +#define NI_ENC_DEC(len) \ + static void aes##len##_ni_cbc_encrypt( \ + ssh_cipher *ciph, void *vblk, int blklen) \ + { aes_cbc_ni_encrypt(ciph, vblk, blklen, aes_ni_##len##_e); } \ + static void aes##len##_ni_cbc_decrypt( \ + ssh_cipher *ciph, void *vblk, int blklen) \ + { aes_cbc_ni_decrypt(ciph, vblk, blklen, aes_ni_##len##_d); } \ + static void aes##len##_ni_sdctr( \ + ssh_cipher *ciph, void *vblk, int blklen) \ + { aes_sdctr_ni(ciph, vblk, blklen, aes_ni_##len##_e); } \ + +NI_ENC_DEC(128) +NI_ENC_DEC(192) +NI_ENC_DEC(256) + +AES_EXTRA(_ni); +AES_ALL_VTABLES(_ni, "AES-NI accelerated"); diff --git a/crypto/aes-select.c 
b/crypto/aes-select.c new file mode 100644 index 00000000..f0c5031f --- /dev/null +++ b/crypto/aes-select.c @@ -0,0 +1,89 @@ +/* + * Top-level vtables to select an AES implementation. + */ + +#include +#include + +#include "putty.h" +#include "ssh.h" +#include "aes.h" + +static ssh_cipher *aes_select(const ssh_cipheralg *alg) +{ + const ssh_cipheralg *const *real_algs = (const ssh_cipheralg **)alg->extra; + + for (size_t i = 0; real_algs[i]; i++) { + const ssh_cipheralg *alg = real_algs[i]; + const struct aes_extra *alg_extra = + (const struct aes_extra *)alg->extra; + if (check_availability(alg_extra)) + return ssh_cipher_new(alg); + } + + /* We should never reach the NULL at the end of the list, because + * the last non-NULL entry should be software-only AES, which is + * always available. */ + unreachable("aes_select ran off the end of its list"); +} + +#if HAVE_AES_NI +#define IF_NI(...) __VA_ARGS__ +#else +#define IF_NI(...) +#endif + +#if HAVE_NEON_CRYPTO +#define IF_NEON(...) __VA_ARGS__ +#else +#define IF_NEON(...) +#endif + +#define AES_SELECTOR_VTABLE(mode_c, mode_protocol, mode_display, bits) \ + static const ssh_cipheralg * \ + ssh_aes ## bits ## _ ## mode_c ## _impls[] = { \ + IF_NI(&ssh_aes ## bits ## _ ## mode_c ## _ni,) \ + IF_NEON(&ssh_aes ## bits ## _ ## mode_c ## _neon,) \ + &ssh_aes ## bits ## _ ## mode_c ## _sw, \ + NULL, \ + }; \ + const ssh_cipheralg ssh_aes ## bits ## _ ## mode_c = { \ + .new = aes_select, \ + .ssh2_id = "aes" #bits "-" mode_protocol, \ + .blksize = 16, \ + .real_keybits = bits, \ + .padded_keybytes = bits/8, \ + .text_name = "AES-" #bits " " mode_display \ + " (dummy selector vtable)", \ + .extra = ssh_aes ## bits ## _ ## mode_c ## _impls, \ + } + +AES_SELECTOR_VTABLE(cbc, "cbc", "CBC", 128); +AES_SELECTOR_VTABLE(cbc, "cbc", "CBC", 192); +AES_SELECTOR_VTABLE(cbc, "cbc", "CBC", 256); +AES_SELECTOR_VTABLE(sdctr, "ctr", "SDCTR", 128); +AES_SELECTOR_VTABLE(sdctr, "ctr", "SDCTR", 192); +AES_SELECTOR_VTABLE(sdctr, "ctr", "SDCTR", 256); + +static const ssh_cipheralg ssh_rijndael_lysator = { + /* Same as aes256_cbc, but with a different protocol ID */ + .new = aes_select, + .ssh2_id = "rijndael-cbc@lysator.liu.se", + .blksize = 16, + .real_keybits = 256, + .padded_keybytes = 256/8, + .text_name = "AES-256 CBC (dummy selector vtable)", + .extra = ssh_aes256_cbc_impls, +}; + +static const ssh_cipheralg *const aes_list[] = { + &ssh_aes256_sdctr, + &ssh_aes256_cbc, + &ssh_rijndael_lysator, + &ssh_aes192_sdctr, + &ssh_aes192_cbc, + &ssh_aes128_sdctr, + &ssh_aes128_cbc, +}; + +const ssh2_ciphers ssh2_aes = { lenof(aes_list), aes_list }; diff --git a/crypto/aes.c b/crypto/aes-sw.c similarity index 59% rename from crypto/aes.c rename to crypto/aes-sw.c index a7ca1117..f8512388 100644 --- a/crypto/aes.c +++ b/crypto/aes-sw.c @@ -1,247 +1,4 @@ /* - * Implementation of AES. - */ - -#include -#include - -#include "ssh.h" -#include "mpint_i.h" /* we reuse the BignumInt system */ - -/* - * Start by deciding whether we can support hardware AES at all. 
- */ -#define HW_AES_NONE 0 -#define HW_AES_NI 1 -#define HW_AES_NEON 2 - -#ifdef _FORCE_AES_NI -# define HW_AES HW_AES_NI -#elif defined(__clang__) -# if __has_attribute(target) && __has_include() && \ - (defined(__x86_64__) || defined(__i386)) -# define HW_AES HW_AES_NI -# endif -#elif defined(__GNUC__) -# if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)) && \ - (defined(__x86_64__) || defined(__i386)) -# define HW_AES HW_AES_NI -# endif -#elif defined (_MSC_VER) -# if (defined(_M_X64) || defined(_M_IX86)) && _MSC_FULL_VER >= 150030729 -# define HW_AES HW_AES_NI -# endif -#endif - -#ifdef _FORCE_AES_NEON -# define HW_AES HW_AES_NEON -#elif defined __BYTE_ORDER__ && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - /* Arm can potentially support both endiannesses, but this code - * hasn't been tested on anything but little. If anyone wants to - * run big-endian, they'll need to fix it first. */ -#elif defined __ARM_FEATURE_CRYPTO - /* If the Arm crypto extension is available already, we can - * support NEON AES without having to enable anything by hand */ -# define HW_AES HW_AES_NEON -#elif defined(__clang__) -# if __has_attribute(target) && __has_include() && \ - (defined(__aarch64__)) - /* clang can enable the crypto extension in AArch64 using - * __attribute__((target)) */ -# define HW_AES HW_AES_NEON -# define USE_CLANG_ATTR_TARGET_AARCH64 -# endif -#elif defined _MSC_VER -# if defined _M_ARM64 -# define HW_AES HW_AES_NEON - /* 64-bit Visual Studio uses the header in place - * of the standard */ -# define USE_ARM64_NEON_H -# elif defined _M_ARM -# define HW_AES HW_AES_NEON - /* 32-bit Visual Studio uses the right header name, but requires - * this #define to enable a set of intrinsic definitions that - * do not omit one of the parameters for vaes[ed]q_u8 */ -# define _ARM_USE_NEW_NEON_INTRINSICS -# endif -#endif - -#if defined _FORCE_SOFTWARE_AES || !defined HW_AES -# undef HW_AES -# define HW_AES HW_AES_NONE -#endif - -#if HW_AES == HW_AES_NI -#define HW_NAME_SUFFIX " (AES-NI accelerated)" -#elif HW_AES == HW_AES_NEON -#define HW_NAME_SUFFIX " (NEON accelerated)" -#else -#define HW_NAME_SUFFIX " (!NONEXISTENT ACCELERATED VERSION!)" -#endif - -/* - * Vtable collection for AES. For each SSH-level cipher id (i.e. - * combination of key length and cipher mode), we provide three - * vtables: one for the pure software implementation, one using - * hardware acceleration (if available), and a top-level one which is - * never actually instantiated, and only contains a new() method whose - * job is to decide which of the other two to return an actual - * instance of. 
- */ - -static ssh_cipher *aes_select(const ssh_cipheralg *alg); -static ssh_cipher *aes_sw_new(const ssh_cipheralg *alg); -static void aes_sw_free(ssh_cipher *); -static void aes_sw_setiv_cbc(ssh_cipher *, const void *iv); -static void aes_sw_setiv_sdctr(ssh_cipher *, const void *iv); -static void aes_sw_setkey(ssh_cipher *, const void *key); -static ssh_cipher *aes_hw_new(const ssh_cipheralg *alg); -static void aes_hw_free(ssh_cipher *); -static void aes_hw_setiv_cbc(ssh_cipher *, const void *iv); -static void aes_hw_setiv_sdctr(ssh_cipher *, const void *iv); -static void aes_hw_setkey(ssh_cipher *, const void *key); - -struct aes_extra { - const ssh_cipheralg *sw, *hw; -}; - -#define VTABLES_INNER(cid, pid, bits, name, encsuffix, \ - decsuffix, setivsuffix, flagsval) \ - static void cid##_sw##encsuffix(ssh_cipher *, void *blk, int len); \ - static void cid##_sw##decsuffix(ssh_cipher *, void *blk, int len); \ - const ssh_cipheralg ssh_##cid##_sw = { \ - .new = aes_sw_new, \ - .free = aes_sw_free, \ - .setiv = aes_sw_##setivsuffix, \ - .setkey = aes_sw_setkey, \ - .encrypt = cid##_sw##encsuffix, \ - .decrypt = cid##_sw##decsuffix, \ - .ssh2_id = pid, \ - .blksize = 16, \ - .real_keybits = bits, \ - .padded_keybytes = bits/8, \ - .flags = flagsval, \ - .text_name = name " (unaccelerated)", \ - }; \ - \ - static void cid##_hw##encsuffix(ssh_cipher *, void *blk, int len); \ - static void cid##_hw##decsuffix(ssh_cipher *, void *blk, int len); \ - const ssh_cipheralg ssh_##cid##_hw = { \ - .new = aes_hw_new, \ - .free = aes_hw_free, \ - .setiv = aes_hw_##setivsuffix, \ - .setkey = aes_hw_setkey, \ - .encrypt = cid##_hw##encsuffix, \ - .decrypt = cid##_hw##decsuffix, \ - .ssh2_id = pid, \ - .blksize = 16, \ - .real_keybits = bits, \ - .padded_keybytes = bits/8, \ - .flags = flagsval, \ - .text_name = name HW_NAME_SUFFIX, \ - }; \ - \ - static const struct aes_extra extra_##cid = { \ - &ssh_##cid##_sw, &ssh_##cid##_hw }; \ - \ - const ssh_cipheralg ssh_##cid = { \ - .new = aes_select, \ - .ssh2_id = pid, \ - .blksize = 16, \ - .real_keybits = bits, \ - .padded_keybytes = bits/8, \ - .flags = flagsval, \ - .text_name = name " (dummy selector vtable)", \ - .extra = &extra_##cid \ - }; \ - -#define VTABLES(keylen) \ - VTABLES_INNER(aes ## keylen ## _cbc, "aes" #keylen "-cbc", \ - keylen, "AES-" #keylen " CBC", _encrypt, _decrypt, \ - setiv_cbc, SSH_CIPHER_IS_CBC) \ - VTABLES_INNER(aes ## keylen ## _sdctr, "aes" #keylen "-ctr", \ - keylen, "AES-" #keylen " SDCTR",,, setiv_sdctr, 0) - -VTABLES(128) -VTABLES(192) -VTABLES(256) - -static const ssh_cipheralg ssh_rijndael_lysator = { - /* Same as aes256_cbc, but with a different protocol ID */ - .new = aes_select, - .ssh2_id = "rijndael-cbc@lysator.liu.se", - .blksize = 16, - .real_keybits = 256, - .padded_keybytes = 256/8, - .flags = 0, - .text_name = "AES-256 CBC (dummy selector vtable)", - .extra = &extra_aes256_cbc, -}; - -static const ssh_cipheralg *const aes_list[] = { - &ssh_aes256_sdctr, - &ssh_aes256_cbc, - &ssh_rijndael_lysator, - &ssh_aes192_sdctr, - &ssh_aes192_cbc, - &ssh_aes128_sdctr, - &ssh_aes128_cbc, -}; - -const ssh2_ciphers ssh2_aes = { lenof(aes_list), aes_list }; - -/* - * The actual query function that asks if hardware acceleration is - * available. - */ -static bool aes_hw_available(void); - -/* - * The top-level selection function, caching the results of - * aes_hw_available() so it only has to run once. 
- */ -static bool aes_hw_available_cached(void) -{ - static bool initialised = false; - static bool hw_available; - if (!initialised) { - hw_available = aes_hw_available(); - initialised = true; - } - return hw_available; -} - -static ssh_cipher *aes_select(const ssh_cipheralg *alg) -{ - const struct aes_extra *extra = (const struct aes_extra *)alg->extra; - const ssh_cipheralg *real_alg = - aes_hw_available_cached() ? extra->hw : extra->sw; - - return ssh_cipher_new(real_alg); -} - -/* ---------------------------------------------------------------------- - * Definitions likely to be helpful to multiple implementations. - */ - -#define REP2(x) x x -#define REP4(x) REP2(REP2(x)) -#define REP8(x) REP2(REP4(x)) -#define REP9(x) REP8(x) x -#define REP11(x) REP8(x) REP2(x) x -#define REP13(x) REP8(x) REP4(x) x - -static const uint8_t key_setup_round_constants[] = { - /* The first few powers of X in GF(2^8), used during key setup. - * This can safely be a lookup table without side channel risks, - * because key setup iterates through it once in a standard way - * regardless of the key. */ - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, -}; - -#define MAXROUNDKEYS 15 - -/* ---------------------------------------------------------------------- * Software implementation of AES. * * This implementation uses a bit-sliced representation. Instead of @@ -257,6 +14,16 @@ static const uint8_t key_setup_round_constants[] = { * ops you get 64 S-box lookups, not just one. */ +#include "ssh.h" +#include "aes.h" +#include "mpint_i.h" /* we reuse the BignumInt system */ + +static bool aes_sw_available(void) +{ + /* Software AES is always available */ + return true; +} + #define SLICE_PARALLELISM (BIGNUM_INT_BYTES / 2) #ifdef BITSLICED_DEBUG @@ -922,8 +689,8 @@ static void aes_sliced_key_setup( } if (rotate_and_round_constant) { - assert(rconpos < lenof(key_setup_round_constants)); - uint8_t rcon = key_setup_round_constants[rconpos++]; + assert(rconpos < lenof(aes_key_setup_round_constants)); + uint8_t rcon = aes_key_setup_round_constants[rconpos++]; for (size_t i = 0; i < 8; i++) slices[i] ^= 1 & (rcon >> i); } @@ -1255,13 +1022,13 @@ static inline void aes_sdctr_sw( } #define SW_ENC_DEC(len) \ - static void aes##len##_cbc_sw_encrypt( \ + static void aes##len##_sw_cbc_encrypt( \ ssh_cipher *ciph, void *vblk, int blklen) \ { aes_cbc_sw_encrypt(ciph, vblk, blklen); } \ - static void aes##len##_cbc_sw_decrypt( \ + static void aes##len##_sw_cbc_decrypt( \ ssh_cipher *ciph, void *vblk, int blklen) \ { aes_cbc_sw_decrypt(ciph, vblk, blklen); } \ - static void aes##len##_sdctr_sw( \ + static void aes##len##_sw_sdctr( \ ssh_cipher *ciph, void *vblk, int blklen) \ { aes_sdctr_sw(ciph, vblk, blklen); } @@ -1269,644 +1036,5 @@ SW_ENC_DEC(128) SW_ENC_DEC(192) SW_ENC_DEC(256) -/* ---------------------------------------------------------------------- - * Hardware-accelerated implementation of AES using x86 AES-NI. 
- */ - -#if HW_AES == HW_AES_NI - -/* - * Set target architecture for Clang and GCC - */ -#if !defined(__clang__) && defined(__GNUC__) -# pragma GCC target("aes") -# pragma GCC target("sse4.1") -#endif - -#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))) -# define FUNC_ISA __attribute__ ((target("sse4.1,aes"))) -#else -# define FUNC_ISA -#endif - -#include -#include - -#if defined(__clang__) || defined(__GNUC__) -#include -#define GET_CPU_ID(out) __cpuid(1, (out)[0], (out)[1], (out)[2], (out)[3]) -#else -#define GET_CPU_ID(out) __cpuid(out, 1) -#endif - -bool aes_hw_available(void) -{ - /* - * Determine if AES is available on this CPU, by checking that - * both AES itself and SSE4.1 are supported. - */ - unsigned int CPUInfo[4]; - GET_CPU_ID(CPUInfo); - return (CPUInfo[2] & (1 << 25)) && (CPUInfo[2] & (1 << 19)); -} - -/* - * Core AES-NI encrypt/decrypt functions, one per length and direction. - */ - -#define NI_CIPHER(len, dir, dirlong, repmacro) \ - static FUNC_ISA inline __m128i aes_ni_##len##_##dir( \ - __m128i v, const __m128i *keysched) \ - { \ - v = _mm_xor_si128(v, *keysched++); \ - repmacro(v = _mm_aes##dirlong##_si128(v, *keysched++);); \ - return _mm_aes##dirlong##last_si128(v, *keysched); \ - } - -NI_CIPHER(128, e, enc, REP9) -NI_CIPHER(128, d, dec, REP9) -NI_CIPHER(192, e, enc, REP11) -NI_CIPHER(192, d, dec, REP11) -NI_CIPHER(256, e, enc, REP13) -NI_CIPHER(256, d, dec, REP13) - -/* - * The main key expansion. - */ -static FUNC_ISA void aes_ni_key_expand( - const unsigned char *key, size_t key_words, - __m128i *keysched_e, __m128i *keysched_d) -{ - size_t rounds = key_words + 6; - size_t sched_words = (rounds + 1) * 4; - - /* - * Store the key schedule as 32-bit integers during expansion, so - * that it's easy to refer back to individual previous words. We - * collect them into the final __m128i form at the end. - */ - uint32_t sched[MAXROUNDKEYS * 4]; - - unsigned rconpos = 0; - - for (size_t i = 0; i < sched_words; i++) { - if (i < key_words) { - sched[i] = GET_32BIT_LSB_FIRST(key + 4 * i); - } else { - uint32_t temp = sched[i - 1]; - - bool rotate_and_round_constant = (i % key_words == 0); - bool only_sub = (key_words == 8 && i % 8 == 4); - - if (rotate_and_round_constant) { - __m128i v = _mm_setr_epi32(0,temp,0,0); - v = _mm_aeskeygenassist_si128(v, 0); - temp = _mm_extract_epi32(v, 1); - - assert(rconpos < lenof(key_setup_round_constants)); - temp ^= key_setup_round_constants[rconpos++]; - } else if (only_sub) { - __m128i v = _mm_setr_epi32(0,temp,0,0); - v = _mm_aeskeygenassist_si128(v, 0); - temp = _mm_extract_epi32(v, 0); - } - - sched[i] = sched[i - key_words] ^ temp; - } - } - - /* - * Combine the key schedule words into __m128i vectors and store - * them in the output context. - */ - for (size_t round = 0; round <= rounds; round++) - keysched_e[round] = _mm_setr_epi32( - sched[4*round ], sched[4*round+1], - sched[4*round+2], sched[4*round+3]); - - smemclr(sched, sizeof(sched)); - - /* - * Now prepare the modified keys for the inverse cipher. - */ - for (size_t eround = 0; eround <= rounds; eround++) { - size_t dround = rounds - eround; - __m128i rkey = keysched_e[eround]; - if (eround && dround) /* neither first nor last */ - rkey = _mm_aesimc_si128(rkey); - keysched_d[dround] = rkey; - } -} - -/* - * Auxiliary routine to increment the 128-bit counter used in SDCTR - * mode. 
- */ -static FUNC_ISA inline __m128i aes_ni_sdctr_increment(__m128i v) -{ - const __m128i ONE = _mm_setr_epi32(1,0,0,0); - const __m128i ZERO = _mm_setzero_si128(); - - /* Increment the low-order 64 bits of v */ - v = _mm_add_epi64(v, ONE); - /* Check if they've become zero */ - __m128i cmp = _mm_cmpeq_epi64(v, ZERO); - /* If so, the low half of cmp is all 1s. Pack that into the high - * half of addend with zero in the low half. */ - __m128i addend = _mm_unpacklo_epi64(ZERO, cmp); - /* And subtract that from v, which increments the high 64 bits iff - * the low 64 wrapped round. */ - v = _mm_sub_epi64(v, addend); - - return v; -} - -/* - * Auxiliary routine to reverse the byte order of a vector, so that - * the SDCTR IV can be made big-endian for feeding to the cipher. - */ -static FUNC_ISA inline __m128i aes_ni_sdctr_reverse(__m128i v) -{ - v = _mm_shuffle_epi8( - v, _mm_setr_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)); - return v; -} - -/* - * The SSH interface and the cipher modes. - */ - -typedef struct aes_ni_context aes_ni_context; -struct aes_ni_context { - __m128i keysched_e[MAXROUNDKEYS], keysched_d[MAXROUNDKEYS], iv; - - void *pointer_to_free; - ssh_cipher ciph; -}; - -static ssh_cipher *aes_hw_new(const ssh_cipheralg *alg) -{ - if (!aes_hw_available_cached()) - return NULL; - - /* - * The __m128i variables in the context structure need to be - * 16-byte aligned, but not all malloc implementations that this - * code has to work with will guarantee to return a 16-byte - * aligned pointer. So we over-allocate, manually realign the - * pointer ourselves, and store the original one inside the - * context so we know how to free it later. - */ - void *allocation = smalloc(sizeof(aes_ni_context) + 15); - uintptr_t alloc_address = (uintptr_t)allocation; - uintptr_t aligned_address = (alloc_address + 15) & ~15; - aes_ni_context *ctx = (aes_ni_context *)aligned_address; - - ctx->ciph.vt = alg; - ctx->pointer_to_free = allocation; - return &ctx->ciph; -} - -static void aes_hw_free(ssh_cipher *ciph) -{ - aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); - void *allocation = ctx->pointer_to_free; - smemclr(ctx, sizeof(*ctx)); - sfree(allocation); -} - -static void aes_hw_setkey(ssh_cipher *ciph, const void *vkey) -{ - aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); - const unsigned char *key = (const unsigned char *)vkey; - - aes_ni_key_expand(key, ctx->ciph.vt->real_keybits / 32, - ctx->keysched_e, ctx->keysched_d); -} - -static FUNC_ISA void aes_hw_setiv_cbc(ssh_cipher *ciph, const void *iv) -{ - aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); - ctx->iv = _mm_loadu_si128(iv); -} - -static FUNC_ISA void aes_hw_setiv_sdctr(ssh_cipher *ciph, const void *iv) -{ - aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); - __m128i counter = _mm_loadu_si128(iv); - ctx->iv = aes_ni_sdctr_reverse(counter); -} - -typedef __m128i (*aes_ni_fn)(__m128i v, const __m128i *keysched); - -static FUNC_ISA inline void aes_cbc_ni_encrypt( - ssh_cipher *ciph, void *vblk, int blklen, aes_ni_fn encrypt) -{ - aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); - - for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen; - blk < finish; blk += 16) { - __m128i plaintext = _mm_loadu_si128((const __m128i *)blk); - __m128i cipher_input = _mm_xor_si128(plaintext, ctx->iv); - __m128i ciphertext = encrypt(cipher_input, ctx->keysched_e); - _mm_storeu_si128((__m128i *)blk, ciphertext); - ctx->iv = ciphertext; - } -} - -static FUNC_ISA inline void 
aes_cbc_ni_decrypt( - ssh_cipher *ciph, void *vblk, int blklen, aes_ni_fn decrypt) -{ - aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); - - for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen; - blk < finish; blk += 16) { - __m128i ciphertext = _mm_loadu_si128((const __m128i *)blk); - __m128i decrypted = decrypt(ciphertext, ctx->keysched_d); - __m128i plaintext = _mm_xor_si128(decrypted, ctx->iv); - _mm_storeu_si128((__m128i *)blk, plaintext); - ctx->iv = ciphertext; - } -} - -static FUNC_ISA inline void aes_sdctr_ni( - ssh_cipher *ciph, void *vblk, int blklen, aes_ni_fn encrypt) -{ - aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); - - for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen; - blk < finish; blk += 16) { - __m128i counter = aes_ni_sdctr_reverse(ctx->iv); - __m128i keystream = encrypt(counter, ctx->keysched_e); - __m128i input = _mm_loadu_si128((const __m128i *)blk); - __m128i output = _mm_xor_si128(input, keystream); - _mm_storeu_si128((__m128i *)blk, output); - ctx->iv = aes_ni_sdctr_increment(ctx->iv); - } -} - -#define NI_ENC_DEC(len) \ - static FUNC_ISA void aes##len##_cbc_hw_encrypt( \ - ssh_cipher *ciph, void *vblk, int blklen) \ - { aes_cbc_ni_encrypt(ciph, vblk, blklen, aes_ni_##len##_e); } \ - static FUNC_ISA void aes##len##_cbc_hw_decrypt( \ - ssh_cipher *ciph, void *vblk, int blklen) \ - { aes_cbc_ni_decrypt(ciph, vblk, blklen, aes_ni_##len##_d); } \ - static FUNC_ISA void aes##len##_sdctr_hw( \ - ssh_cipher *ciph, void *vblk, int blklen) \ - { aes_sdctr_ni(ciph, vblk, blklen, aes_ni_##len##_e); } \ - -NI_ENC_DEC(128) -NI_ENC_DEC(192) -NI_ENC_DEC(256) - -/* ---------------------------------------------------------------------- - * Hardware-accelerated implementation of AES using Arm NEON. - */ - -#elif HW_AES == HW_AES_NEON - -/* - * Manually set the target architecture, if we decided above that we - * need to. - */ -#ifdef USE_CLANG_ATTR_TARGET_AARCH64 -/* - * A spot of cheating: redefine some ACLE feature macros before - * including arm_neon.h. Otherwise we won't get the AES intrinsics - * defined by that header, because it will be looking at the settings - * for the whole translation unit rather than the ones we're going to - * put on some particular functions using __attribute__((target)). - */ -#define __ARM_NEON 1 -#define __ARM_FEATURE_CRYPTO 1 -#define FUNC_ISA __attribute__ ((target("neon,crypto"))) -#endif /* USE_CLANG_ATTR_TARGET_AARCH64 */ - -#ifndef FUNC_ISA -#define FUNC_ISA -#endif - -#ifdef USE_ARM64_NEON_H -#include -#else -#include -#endif - -static bool aes_hw_available(void) -{ - /* - * For Arm, we delegate to a per-platform AES detection function, - * because it has to be implemented by asking the operating system - * rather than directly querying the CPU. - * - * That's because Arm systems commonly have multiple cores that - * are not all alike, so any method of querying whether NEON - * crypto instructions work on the _current_ CPU - even one as - * crude as just trying one and catching the SIGILL - wouldn't - * give an answer that you could still rely on the first time the - * OS migrated your process to another CPU. - */ - return platform_aes_hw_available(); -} - -/* - * Core NEON encrypt/decrypt functions, one per length and direction. 
- */ - -#define NEON_CIPHER(len, repmacro) \ - static FUNC_ISA inline uint8x16_t aes_neon_##len##_e( \ - uint8x16_t v, const uint8x16_t *keysched) \ - { \ - repmacro(v = vaesmcq_u8(vaeseq_u8(v, *keysched++));); \ - v = vaeseq_u8(v, *keysched++); \ - return veorq_u8(v, *keysched); \ - } \ - static FUNC_ISA inline uint8x16_t aes_neon_##len##_d( \ - uint8x16_t v, const uint8x16_t *keysched) \ - { \ - repmacro(v = vaesimcq_u8(vaesdq_u8(v, *keysched++));); \ - v = vaesdq_u8(v, *keysched++); \ - return veorq_u8(v, *keysched); \ - } - -NEON_CIPHER(128, REP9) -NEON_CIPHER(192, REP11) -NEON_CIPHER(256, REP13) - -/* - * The main key expansion. - */ -static FUNC_ISA void aes_neon_key_expand( - const unsigned char *key, size_t key_words, - uint8x16_t *keysched_e, uint8x16_t *keysched_d) -{ - size_t rounds = key_words + 6; - size_t sched_words = (rounds + 1) * 4; - - /* - * Store the key schedule as 32-bit integers during expansion, so - * that it's easy to refer back to individual previous words. We - * collect them into the final uint8x16_t form at the end. - */ - uint32_t sched[MAXROUNDKEYS * 4]; - - unsigned rconpos = 0; - - for (size_t i = 0; i < sched_words; i++) { - if (i < key_words) { - sched[i] = GET_32BIT_LSB_FIRST(key + 4 * i); - } else { - uint32_t temp = sched[i - 1]; - - bool rotate_and_round_constant = (i % key_words == 0); - bool sub = rotate_and_round_constant || - (key_words == 8 && i % 8 == 4); - - if (rotate_and_round_constant) - temp = (temp << 24) | (temp >> 8); - - if (sub) { - uint32x4_t v32 = vdupq_n_u32(temp); - uint8x16_t v8 = vreinterpretq_u8_u32(v32); - v8 = vaeseq_u8(v8, vdupq_n_u8(0)); - v32 = vreinterpretq_u32_u8(v8); - temp = vget_lane_u32(vget_low_u32(v32), 0); - } - - if (rotate_and_round_constant) { - assert(rconpos < lenof(key_setup_round_constants)); - temp ^= key_setup_round_constants[rconpos++]; - } - - sched[i] = sched[i - key_words] ^ temp; - } - } - - /* - * Combine the key schedule words into uint8x16_t vectors and - * store them in the output context. - */ - for (size_t round = 0; round <= rounds; round++) - keysched_e[round] = vreinterpretq_u8_u32(vld1q_u32(sched + 4*round)); - - smemclr(sched, sizeof(sched)); - - /* - * Now prepare the modified keys for the inverse cipher. - */ - for (size_t eround = 0; eround <= rounds; eround++) { - size_t dround = rounds - eround; - uint8x16_t rkey = keysched_e[eround]; - if (eround && dround) /* neither first nor last */ - rkey = vaesimcq_u8(rkey); - keysched_d[dround] = rkey; - } -} - -/* - * Auxiliary routine to reverse the byte order of a vector, so that - * the SDCTR IV can be made big-endian for feeding to the cipher. - * - * In fact we don't need to reverse the vector _all_ the way; we leave - * the two lanes in MSW,LSW order, because that makes no difference to - * the efficiency of the increment. That way we only have to reverse - * bytes within each lane in this function. - */ -static FUNC_ISA inline uint8x16_t aes_neon_sdctr_reverse(uint8x16_t v) -{ - return vrev64q_u8(v); -} - -/* - * Auxiliary routine to increment the 128-bit counter used in SDCTR - * mode. There's no instruction to treat a 128-bit vector as a single - * long integer, so instead we have to increment the bottom half - * unconditionally, and the top half if the bottom half started off as - * all 1s (in which case there was about to be a carry). - */ -static FUNC_ISA inline uint8x16_t aes_neon_sdctr_increment(uint8x16_t in) -{ -#ifdef __aarch64__ - /* There will be a carry if the low 64 bits are all 1s. 
*/ - uint64x1_t all1 = vcreate_u64(0xFFFFFFFFFFFFFFFF); - uint64x1_t carry = vceq_u64(vget_high_u64(vreinterpretq_u64_u8(in)), all1); - - /* Make a word whose bottom half is unconditionally all 1s, and - * the top half is 'carry', i.e. all 0s most of the time but all - * 1s if we need to increment the top half. Then that word is what - * we need to _subtract_ from the input counter. */ - uint64x2_t subtrahend = vcombine_u64(carry, all1); -#else - /* AArch32 doesn't have comparisons that operate on a 64-bit lane, - * so we start by comparing each 32-bit half of the low 64 bits - * _separately_ to all-1s. */ - uint32x2_t all1 = vdup_n_u32(0xFFFFFFFF); - uint32x2_t carry = vceq_u32( - vget_high_u32(vreinterpretq_u32_u8(in)), all1); - - /* Swap the 32-bit words of the compare output, and AND with the - * unswapped version. Now carry is all 1s iff the bottom half of - * the input counter was all 1s, and all 0s otherwise. */ - carry = vand_u32(carry, vrev64_u32(carry)); - - /* Now make the vector to subtract in the same way as above. */ - uint64x2_t subtrahend = vreinterpretq_u64_u32(vcombine_u32(carry, all1)); -#endif - - return vreinterpretq_u8_u64( - vsubq_u64(vreinterpretq_u64_u8(in), subtrahend)); -} - -/* - * The SSH interface and the cipher modes. - */ - -typedef struct aes_neon_context aes_neon_context; -struct aes_neon_context { - uint8x16_t keysched_e[MAXROUNDKEYS], keysched_d[MAXROUNDKEYS], iv; - - ssh_cipher ciph; -}; - -static ssh_cipher *aes_hw_new(const ssh_cipheralg *alg) -{ - if (!aes_hw_available_cached()) - return NULL; - - aes_neon_context *ctx = snew(aes_neon_context); - ctx->ciph.vt = alg; - return &ctx->ciph; -} - -static void aes_hw_free(ssh_cipher *ciph) -{ - aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); - smemclr(ctx, sizeof(*ctx)); - sfree(ctx); -} - -static void aes_hw_setkey(ssh_cipher *ciph, const void *vkey) -{ - aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); - const unsigned char *key = (const unsigned char *)vkey; - - aes_neon_key_expand(key, ctx->ciph.vt->real_keybits / 32, - ctx->keysched_e, ctx->keysched_d); -} - -static FUNC_ISA void aes_hw_setiv_cbc(ssh_cipher *ciph, const void *iv) -{ - aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); - ctx->iv = vld1q_u8(iv); -} - -static FUNC_ISA void aes_hw_setiv_sdctr(ssh_cipher *ciph, const void *iv) -{ - aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); - uint8x16_t counter = vld1q_u8(iv); - ctx->iv = aes_neon_sdctr_reverse(counter); -} - -typedef uint8x16_t (*aes_neon_fn)(uint8x16_t v, const uint8x16_t *keysched); - -static FUNC_ISA inline void aes_cbc_neon_encrypt( - ssh_cipher *ciph, void *vblk, int blklen, aes_neon_fn encrypt) -{ - aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); - - for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen; - blk < finish; blk += 16) { - uint8x16_t plaintext = vld1q_u8(blk); - uint8x16_t cipher_input = veorq_u8(plaintext, ctx->iv); - uint8x16_t ciphertext = encrypt(cipher_input, ctx->keysched_e); - vst1q_u8(blk, ciphertext); - ctx->iv = ciphertext; - } -} - -static FUNC_ISA inline void aes_cbc_neon_decrypt( - ssh_cipher *ciph, void *vblk, int blklen, aes_neon_fn decrypt) -{ - aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); - - for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen; - blk < finish; blk += 16) { - uint8x16_t ciphertext = vld1q_u8(blk); - uint8x16_t decrypted = decrypt(ciphertext, ctx->keysched_d); - uint8x16_t plaintext = veorq_u8(decrypted, 
ctx->iv); - vst1q_u8(blk, plaintext); - ctx->iv = ciphertext; - } -} - -static FUNC_ISA inline void aes_sdctr_neon( - ssh_cipher *ciph, void *vblk, int blklen, aes_neon_fn encrypt) -{ - aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); - - for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen; - blk < finish; blk += 16) { - uint8x16_t counter = aes_neon_sdctr_reverse(ctx->iv); - uint8x16_t keystream = encrypt(counter, ctx->keysched_e); - uint8x16_t input = vld1q_u8(blk); - uint8x16_t output = veorq_u8(input, keystream); - vst1q_u8(blk, output); - ctx->iv = aes_neon_sdctr_increment(ctx->iv); - } -} - -#define NEON_ENC_DEC(len) \ - static FUNC_ISA void aes##len##_cbc_hw_encrypt( \ - ssh_cipher *ciph, void *vblk, int blklen) \ - { aes_cbc_neon_encrypt(ciph, vblk, blklen, aes_neon_##len##_e); } \ - static FUNC_ISA void aes##len##_cbc_hw_decrypt( \ - ssh_cipher *ciph, void *vblk, int blklen) \ - { aes_cbc_neon_decrypt(ciph, vblk, blklen, aes_neon_##len##_d); } \ - static FUNC_ISA void aes##len##_sdctr_hw( \ - ssh_cipher *ciph, void *vblk, int blklen) \ - { aes_sdctr_neon(ciph, vblk, blklen, aes_neon_##len##_e); } \ - -NEON_ENC_DEC(128) -NEON_ENC_DEC(192) -NEON_ENC_DEC(256) - -/* ---------------------------------------------------------------------- - * Stub functions if we have no hardware-accelerated AES. In this - * case, aes_hw_new returns NULL (though it should also never be - * selected by aes_select, so the only thing that should even be - * _able_ to call it is testcrypt). As a result, the remaining vtable - * functions should never be called at all. - */ - -#elif HW_AES == HW_AES_NONE - -bool aes_hw_available(void) -{ - return false; -} - -static ssh_cipher *aes_hw_new(const ssh_cipheralg *alg) -{ - return NULL; -} - -#define STUB_BODY { unreachable("Should never be called"); } - -static void aes_hw_free(ssh_cipher *ciph) STUB_BODY -static void aes_hw_setkey(ssh_cipher *ciph, const void *key) STUB_BODY -static void aes_hw_setiv_cbc(ssh_cipher *ciph, const void *iv) STUB_BODY -static void aes_hw_setiv_sdctr(ssh_cipher *ciph, const void *iv) STUB_BODY -#define STUB_ENC_DEC(len) \ - static void aes##len##_cbc_hw_encrypt( \ - ssh_cipher *ciph, void *vblk, int blklen) STUB_BODY \ - static void aes##len##_cbc_hw_decrypt( \ - ssh_cipher *ciph, void *vblk, int blklen) STUB_BODY \ - static void aes##len##_sdctr_hw( \ - ssh_cipher *ciph, void *vblk, int blklen) STUB_BODY - -STUB_ENC_DEC(128) -STUB_ENC_DEC(192) -STUB_ENC_DEC(256) - -#endif /* HW_AES */ +AES_EXTRA(_sw); +AES_ALL_VTABLES(_sw, "unaccelerated"); diff --git a/crypto/aes.h b/crypto/aes.h new file mode 100644 index 00000000..1960713a --- /dev/null +++ b/crypto/aes.h @@ -0,0 +1,109 @@ +/* + * Definitions likely to be helpful to multiple AES implementations. + */ + +/* + * The 'extra' structure used by AES implementations is used to + * include information about how to check if a given implementation is + * available at run time, and whether we've already checked. + */ +struct aes_extra_mutable; +struct aes_extra { + /* Function to check availability. Might be expensive, so we don't + * want to call it more than once. */ + bool (*check_available)(void); + + /* Point to a writable substructure. 
*/ + struct aes_extra_mutable *mut; +}; +struct aes_extra_mutable { + bool checked_availability; + bool is_available; +}; +static inline bool check_availability(const struct aes_extra *extra) +{ + if (!extra->mut->checked_availability) { + extra->mut->is_available = extra->check_available(); + extra->mut->checked_availability = true; + } + + return extra->mut->is_available; +} + +/* + * Macros to define vtables for AES variants. There are a lot of + * these, because of the cross product between cipher modes, key + * sizes, and assorted HW/SW implementations, so it's worth spending + * some effort here to reduce the boilerplate in the sub-files. + */ + +#define AES_EXTRA(impl_c) \ + static struct aes_extra_mutable aes ## impl_c ## _extra_mut; \ + static const struct aes_extra aes ## impl_c ## _extra = { \ + .check_available = aes ## impl_c ## _available, \ + .mut = &aes ## impl_c ## _extra_mut, \ + } + +#define AES_CBC_VTABLE(impl_c, impl_display, bits) \ + const ssh_cipheralg ssh_aes ## bits ## _cbc ## impl_c = { \ + .new = aes ## impl_c ## _new, \ + .free = aes ## impl_c ## _free, \ + .setiv = aes ## impl_c ## _setiv_cbc, \ + .setkey = aes ## impl_c ## _setkey, \ + .encrypt = aes ## bits ## impl_c ## _cbc_encrypt, \ + .decrypt = aes ## bits ## impl_c ## _cbc_decrypt, \ + .ssh2_id = "aes" #bits "-cbc", \ + .blksize = 16, \ + .real_keybits = bits, \ + .padded_keybytes = bits/8, \ + .flags = SSH_CIPHER_IS_CBC, \ + .text_name = "AES-" #bits " CBC (" impl_display ")", \ + .extra = &aes ## impl_c ## _extra, \ + } + +#define AES_SDCTR_VTABLE(impl_c, impl_display, bits) \ + const ssh_cipheralg ssh_aes ## bits ## _sdctr ## impl_c = { \ + .new = aes ## impl_c ## _new, \ + .free = aes ## impl_c ## _free, \ + .setiv = aes ## impl_c ## _setiv_sdctr, \ + .setkey = aes ## impl_c ## _setkey, \ + .encrypt = aes ## bits ## impl_c ## _sdctr, \ + .decrypt = aes ## bits ## impl_c ## _sdctr, \ + .ssh2_id = "aes" #bits "-ctr", \ + .blksize = 16, \ + .real_keybits = bits, \ + .padded_keybytes = bits/8, \ + .flags = 0, \ + .text_name = "AES-" #bits " SDCTR (" impl_display ")", \ + .extra = &aes ## impl_c ## _extra, \ + } + +#define AES_ALL_VTABLES(impl_c, impl_display) \ + AES_CBC_VTABLE(impl_c, impl_display, 128); \ + AES_CBC_VTABLE(impl_c, impl_display, 192); \ + AES_CBC_VTABLE(impl_c, impl_display, 256); \ + AES_SDCTR_VTABLE(impl_c, impl_display, 128); \ + AES_SDCTR_VTABLE(impl_c, impl_display, 192); \ + AES_SDCTR_VTABLE(impl_c, impl_display, 256) + +/* + * Macros to repeat a piece of code particular numbers of times that + * correspond to 1 fewer than the number of AES rounds. (Because the + * last round is different.) + */ +#define REP2(x) x x +#define REP4(x) REP2(REP2(x)) +#define REP8(x) REP2(REP4(x)) +#define REP9(x) REP8(x) x +#define REP11(x) REP8(x) REP2(x) x +#define REP13(x) REP8(x) REP4(x) x + +/* + * The round constants used in key schedule expansion. + */ +extern const uint8_t aes_key_setup_round_constants[10]; + +/* + * The largest number of round keys ever needed. + */ +#define MAXROUNDKEYS 15 diff --git a/crypto/sha1-common.c b/crypto/sha1-common.c new file mode 100644 index 00000000..bf1db67a --- /dev/null +++ b/crypto/sha1-common.c @@ -0,0 +1,10 @@ +/* + * Common variable definitions across all the SHA-1 implementations. 
+ */ + +#include "ssh.h" +#include "sha1.h" + +const uint32_t sha1_initial_state[5] = { + 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0, +}; diff --git a/crypto/sha1-neon.c b/crypto/sha1-neon.c new file mode 100644 index 00000000..99045714 --- /dev/null +++ b/crypto/sha1-neon.c @@ -0,0 +1,190 @@ +/* + * Hardware-accelerated implementation of SHA-1 using Arm NEON. + */ + +#include "ssh.h" +#include "sha1.h" + +#if USE_ARM64_NEON_H +#include +#else +#include +#endif + +static bool sha1_neon_available(void) +{ + /* + * For Arm, we delegate to a per-platform detection function (see + * explanation in aes-neon.c). + */ + return platform_sha1_neon_available(); +} + +typedef struct sha1_neon_core sha1_neon_core; +struct sha1_neon_core { + uint32x4_t abcd; + uint32_t e; +}; + +static inline uint32x4_t sha1_neon_load_input(const uint8_t *p) +{ + return vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(p))); +} + +static inline uint32x4_t sha1_neon_schedule_update( + uint32x4_t m4, uint32x4_t m3, uint32x4_t m2, uint32x4_t m1) +{ + return vsha1su1q_u32(vsha1su0q_u32(m4, m3, m2), m1); +} + +/* + * SHA-1 has three different kinds of round, differing in whether they + * use the Ch, Maj or Par functions defined above. Each one uses a + * separate NEON instruction, so we define three inline functions for + * the different round types using this macro. + * + * The two batches of Par-type rounds also use a different constant, + * but that's passed in as an operand, so we don't need a fourth + * inline function just for that. + */ +#define SHA1_NEON_ROUND_FN(type) \ + static inline sha1_neon_core sha1_neon_round4_##type( \ + sha1_neon_core old, uint32x4_t sched, uint32x4_t constant) \ + { \ + sha1_neon_core new; \ + uint32x4_t round_input = vaddq_u32(sched, constant); \ + new.abcd = vsha1##type##q_u32(old.abcd, old.e, round_input); \ + new.e = vsha1h_u32(vget_lane_u32(vget_low_u32(old.abcd), 0)); \ + return new; \ + } +SHA1_NEON_ROUND_FN(c) +SHA1_NEON_ROUND_FN(p) +SHA1_NEON_ROUND_FN(m) + +static inline void sha1_neon_block(sha1_neon_core *core, const uint8_t *p) +{ + uint32x4_t constant, s0, s1, s2, s3; + sha1_neon_core cr = *core; + + constant = vdupq_n_u32(SHA1_STAGE0_CONSTANT); + s0 = sha1_neon_load_input(p); + cr = sha1_neon_round4_c(cr, s0, constant); + s1 = sha1_neon_load_input(p + 16); + cr = sha1_neon_round4_c(cr, s1, constant); + s2 = sha1_neon_load_input(p + 32); + cr = sha1_neon_round4_c(cr, s2, constant); + s3 = sha1_neon_load_input(p + 48); + cr = sha1_neon_round4_c(cr, s3, constant); + s0 = sha1_neon_schedule_update(s0, s1, s2, s3); + cr = sha1_neon_round4_c(cr, s0, constant); + + constant = vdupq_n_u32(SHA1_STAGE1_CONSTANT); + s1 = sha1_neon_schedule_update(s1, s2, s3, s0); + cr = sha1_neon_round4_p(cr, s1, constant); + s2 = sha1_neon_schedule_update(s2, s3, s0, s1); + cr = sha1_neon_round4_p(cr, s2, constant); + s3 = sha1_neon_schedule_update(s3, s0, s1, s2); + cr = sha1_neon_round4_p(cr, s3, constant); + s0 = sha1_neon_schedule_update(s0, s1, s2, s3); + cr = sha1_neon_round4_p(cr, s0, constant); + s1 = sha1_neon_schedule_update(s1, s2, s3, s0); + cr = sha1_neon_round4_p(cr, s1, constant); + + constant = vdupq_n_u32(SHA1_STAGE2_CONSTANT); + s2 = sha1_neon_schedule_update(s2, s3, s0, s1); + cr = sha1_neon_round4_m(cr, s2, constant); + s3 = sha1_neon_schedule_update(s3, s0, s1, s2); + cr = sha1_neon_round4_m(cr, s3, constant); + s0 = sha1_neon_schedule_update(s0, s1, s2, s3); + cr = sha1_neon_round4_m(cr, s0, constant); + s1 = sha1_neon_schedule_update(s1, s2, s3, s0); + cr = 
sha1_neon_round4_m(cr, s1, constant); + s2 = sha1_neon_schedule_update(s2, s3, s0, s1); + cr = sha1_neon_round4_m(cr, s2, constant); + + constant = vdupq_n_u32(SHA1_STAGE3_CONSTANT); + s3 = sha1_neon_schedule_update(s3, s0, s1, s2); + cr = sha1_neon_round4_p(cr, s3, constant); + s0 = sha1_neon_schedule_update(s0, s1, s2, s3); + cr = sha1_neon_round4_p(cr, s0, constant); + s1 = sha1_neon_schedule_update(s1, s2, s3, s0); + cr = sha1_neon_round4_p(cr, s1, constant); + s2 = sha1_neon_schedule_update(s2, s3, s0, s1); + cr = sha1_neon_round4_p(cr, s2, constant); + s3 = sha1_neon_schedule_update(s3, s0, s1, s2); + cr = sha1_neon_round4_p(cr, s3, constant); + + core->abcd = vaddq_u32(core->abcd, cr.abcd); + core->e += cr.e; +} + +typedef struct sha1_neon { + sha1_neon_core core; + sha1_block blk; + BinarySink_IMPLEMENTATION; + ssh_hash hash; +} sha1_neon; + +static void sha1_neon_write(BinarySink *bs, const void *vp, size_t len); + +static ssh_hash *sha1_neon_new(const ssh_hashalg *alg) +{ + const struct sha1_extra *extra = (const struct sha1_extra *)alg->extra; + if (!check_availability(extra)) + return NULL; + + sha1_neon *s = snew(sha1_neon); + + s->hash.vt = alg; + BinarySink_INIT(s, sha1_neon_write); + BinarySink_DELEGATE_INIT(&s->hash, s); + return &s->hash; +} + +static void sha1_neon_reset(ssh_hash *hash) +{ + sha1_neon *s = container_of(hash, sha1_neon, hash); + + s->core.abcd = vld1q_u32(sha1_initial_state); + s->core.e = sha1_initial_state[4]; + + sha1_block_setup(&s->blk); +} + +static void sha1_neon_copyfrom(ssh_hash *hcopy, ssh_hash *horig) +{ + sha1_neon *copy = container_of(hcopy, sha1_neon, hash); + sha1_neon *orig = container_of(horig, sha1_neon, hash); + + *copy = *orig; /* structure copy */ + + BinarySink_COPIED(copy); + BinarySink_DELEGATE_INIT(©->hash, copy); +} + +static void sha1_neon_free(ssh_hash *hash) +{ + sha1_neon *s = container_of(hash, sha1_neon, hash); + smemclr(s, sizeof(*s)); + sfree(s); +} + +static void sha1_neon_write(BinarySink *bs, const void *vp, size_t len) +{ + sha1_neon *s = BinarySink_DOWNCAST(bs, sha1_neon); + + while (len > 0) + if (sha1_block_write(&s->blk, &vp, &len)) + sha1_neon_block(&s->core, s->blk.block); +} + +static void sha1_neon_digest(ssh_hash *hash, uint8_t *digest) +{ + sha1_neon *s = container_of(hash, sha1_neon, hash); + + sha1_block_pad(&s->blk, BinarySink_UPCAST(s)); + vst1q_u8(digest, vrev32q_u8(vreinterpretq_u8_u32(s->core.abcd))); + PUT_32BIT_MSB_FIRST(digest + 16, s->core.e); +} + +SHA1_VTABLE(neon, "NEON accelerated"); diff --git a/crypto/sha1-ni.c b/crypto/sha1-ni.c new file mode 100644 index 00000000..04e6386b --- /dev/null +++ b/crypto/sha1-ni.c @@ -0,0 +1,325 @@ +/* + * Hardware-accelerated implementation of SHA-1 using x86 SHA-NI. 
+ */ + +#include "ssh.h" +#include "sha1.h" + +#include +#include +#include +#if HAVE_SHAINTRIN_H +#include +#endif + +#if defined(__clang__) || defined(__GNUC__) +#include +#define GET_CPU_ID_0(out) \ + __cpuid(0, (out)[0], (out)[1], (out)[2], (out)[3]) +#define GET_CPU_ID_7(out) \ + __cpuid_count(7, 0, (out)[0], (out)[1], (out)[2], (out)[3]) +#else +#define GET_CPU_ID_0(out) __cpuid(out, 0) +#define GET_CPU_ID_7(out) __cpuidex(out, 7, 0) +#endif + +static bool sha1_ni_available(void) +{ + unsigned int CPUInfo[4]; + GET_CPU_ID_0(CPUInfo); + if (CPUInfo[0] < 7) + return false; + + GET_CPU_ID_7(CPUInfo); + return CPUInfo[1] & (1 << 29); /* Check SHA */ +} + +/* SHA1 implementation using new instructions + The code is based on Jeffrey Walton's SHA1 implementation: + https://github.com/noloader/SHA-Intrinsics +*/ +static inline void sha1_ni_block(__m128i *core, const uint8_t *p) +{ + __m128i ABCD, E0, E1, MSG0, MSG1, MSG2, MSG3; + const __m128i MASK = _mm_set_epi64x( + 0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL); + + const __m128i *block = (const __m128i *)p; + + /* Load initial values */ + ABCD = core[0]; + E0 = core[1]; + + /* Rounds 0-3 */ + MSG0 = _mm_loadu_si128(block); + MSG0 = _mm_shuffle_epi8(MSG0, MASK); + E0 = _mm_add_epi32(E0, MSG0); + E1 = ABCD; + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); + + /* Rounds 4-7 */ + MSG1 = _mm_loadu_si128(block + 1); + MSG1 = _mm_shuffle_epi8(MSG1, MASK); + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); + MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); + + /* Rounds 8-11 */ + MSG2 = _mm_loadu_si128(block + 2); + MSG2 = _mm_shuffle_epi8(MSG2, MASK); + E0 = _mm_sha1nexte_epu32(E0, MSG2); + E1 = ABCD; + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); + MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); + MSG0 = _mm_xor_si128(MSG0, MSG2); + + /* Rounds 12-15 */ + MSG3 = _mm_loadu_si128(block + 3); + MSG3 = _mm_shuffle_epi8(MSG3, MASK); + E1 = _mm_sha1nexte_epu32(E1, MSG3); + E0 = ABCD; + MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); + MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); + MSG1 = _mm_xor_si128(MSG1, MSG3); + + /* Rounds 16-19 */ + E0 = _mm_sha1nexte_epu32(E0, MSG0); + E1 = ABCD; + MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); + MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); + MSG2 = _mm_xor_si128(MSG2, MSG0); + + /* Rounds 20-23 */ + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; + MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); + MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); + MSG3 = _mm_xor_si128(MSG3, MSG1); + + /* Rounds 24-27 */ + E0 = _mm_sha1nexte_epu32(E0, MSG2); + E1 = ABCD; + MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1); + MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); + MSG0 = _mm_xor_si128(MSG0, MSG2); + + /* Rounds 28-31 */ + E1 = _mm_sha1nexte_epu32(E1, MSG3); + E0 = ABCD; + MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); + MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); + MSG1 = _mm_xor_si128(MSG1, MSG3); + + /* Rounds 32-35 */ + E0 = _mm_sha1nexte_epu32(E0, MSG0); + E1 = ABCD; + MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1); + MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); + MSG2 = _mm_xor_si128(MSG2, MSG0); + + /* Rounds 36-39 */ + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; + MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); + MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); + MSG3 = _mm_xor_si128(MSG3, MSG1); + + /* 
Rounds 40-43 */ + E0 = _mm_sha1nexte_epu32(E0, MSG2); + E1 = ABCD; + MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); + MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); + MSG0 = _mm_xor_si128(MSG0, MSG2); + + /* Rounds 44-47 */ + E1 = _mm_sha1nexte_epu32(E1, MSG3); + E0 = ABCD; + MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2); + MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); + MSG1 = _mm_xor_si128(MSG1, MSG3); + + /* Rounds 48-51 */ + E0 = _mm_sha1nexte_epu32(E0, MSG0); + E1 = ABCD; + MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); + MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); + MSG2 = _mm_xor_si128(MSG2, MSG0); + + /* Rounds 52-55 */ + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; + MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2); + MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); + MSG3 = _mm_xor_si128(MSG3, MSG1); + + /* Rounds 56-59 */ + E0 = _mm_sha1nexte_epu32(E0, MSG2); + E1 = ABCD; + MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); + MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); + MSG0 = _mm_xor_si128(MSG0, MSG2); + + /* Rounds 60-63 */ + E1 = _mm_sha1nexte_epu32(E1, MSG3); + E0 = ABCD; + MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); + MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); + MSG1 = _mm_xor_si128(MSG1, MSG3); + + /* Rounds 64-67 */ + E0 = _mm_sha1nexte_epu32(E0, MSG0); + E1 = ABCD; + MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3); + MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); + MSG2 = _mm_xor_si128(MSG2, MSG0); + + /* Rounds 68-71 */ + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; + MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); + MSG3 = _mm_xor_si128(MSG3, MSG1); + + /* Rounds 72-75 */ + E0 = _mm_sha1nexte_epu32(E0, MSG2); + E1 = ABCD; + MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3); + + /* Rounds 76-79 */ + E1 = _mm_sha1nexte_epu32(E1, MSG3); + E0 = ABCD; + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); + + /* Combine state */ + core[0] = _mm_add_epi32(ABCD, core[0]); + core[1] = _mm_sha1nexte_epu32(E0, core[1]); +} + +typedef struct sha1_ni { + /* + * core[0] stores the first four words of the SHA-1 state. core[1] + * stores just the fifth word, in the vector lane at the highest + * address. + */ + __m128i core[2]; + sha1_block blk; + void *pointer_to_free; + BinarySink_IMPLEMENTATION; + ssh_hash hash; +} sha1_ni; + +static void sha1_ni_write(BinarySink *bs, const void *vp, size_t len); + +static sha1_ni *sha1_ni_alloc(void) +{ + /* + * The __m128i variables in the context structure need to be + * 16-byte aligned, but not all malloc implementations that this + * code has to work with will guarantee to return a 16-byte + * aligned pointer. So we over-allocate, manually realign the + * pointer ourselves, and store the original one inside the + * context so we know how to free it later. 
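The rounding arithmetic used immediately below is the usual add-then-mask idiom: adding 15 and clearing the low four bits advances any address to the next multiple of 16, and leaves it unchanged if it is already aligned. A standalone illustration of just that arithmetic, not part of the patch:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* 0x1000 stays put; 0x1001 and 0x100f both round up to 0x1010. */
        uintptr_t addrs[] = { 0x1000, 0x1001, 0x100f, 0x1010 };
        for (unsigned i = 0; i < 4; i++) {
            uintptr_t aligned = (addrs[i] + 15) & ~(uintptr_t)15;
            printf("%lx -> %lx\n", (unsigned long)addrs[i],
                   (unsigned long)aligned);
        }
        return 0;
    }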
+ */ + void *allocation = smalloc(sizeof(sha1_ni) + 15); + uintptr_t alloc_address = (uintptr_t)allocation; + uintptr_t aligned_address = (alloc_address + 15) & ~15; + sha1_ni *s = (sha1_ni *)aligned_address; + s->pointer_to_free = allocation; + return s; +} + +static ssh_hash *sha1_ni_new(const ssh_hashalg *alg) +{ + const struct sha1_extra *extra = (const struct sha1_extra *)alg->extra; + if (!check_availability(extra)) + return NULL; + + sha1_ni *s = sha1_ni_alloc(); + + s->hash.vt = alg; + BinarySink_INIT(s, sha1_ni_write); + BinarySink_DELEGATE_INIT(&s->hash, s); + return &s->hash; +} + +static void sha1_ni_reset(ssh_hash *hash) +{ + sha1_ni *s = container_of(hash, sha1_ni, hash); + + /* Initialise the core vectors in their storage order */ + s->core[0] = _mm_set_epi64x( + 0x67452301efcdab89ULL, 0x98badcfe10325476ULL); + s->core[1] = _mm_set_epi32(0xc3d2e1f0, 0, 0, 0); + + sha1_block_setup(&s->blk); +} + +static void sha1_ni_copyfrom(ssh_hash *hcopy, ssh_hash *horig) +{ + sha1_ni *copy = container_of(hcopy, sha1_ni, hash); + sha1_ni *orig = container_of(horig, sha1_ni, hash); + + void *ptf_save = copy->pointer_to_free; + *copy = *orig; /* structure copy */ + copy->pointer_to_free = ptf_save; + + BinarySink_COPIED(copy); + BinarySink_DELEGATE_INIT(©->hash, copy); +} + +static void sha1_ni_free(ssh_hash *hash) +{ + sha1_ni *s = container_of(hash, sha1_ni, hash); + + void *ptf = s->pointer_to_free; + smemclr(s, sizeof(*s)); + sfree(ptf); +} + +static void sha1_ni_write(BinarySink *bs, const void *vp, size_t len) +{ + sha1_ni *s = BinarySink_DOWNCAST(bs, sha1_ni); + + while (len > 0) + if (sha1_block_write(&s->blk, &vp, &len)) + sha1_ni_block(s->core, s->blk.block); +} + +static void sha1_ni_digest(ssh_hash *hash, uint8_t *digest) +{ + sha1_ni *s = container_of(hash, sha1_ni, hash); + + sha1_block_pad(&s->blk, BinarySink_UPCAST(s)); + + /* Rearrange the first vector into its output order */ + __m128i abcd = _mm_shuffle_epi32(s->core[0], 0x1B); + + /* Byte-swap it into the output endianness */ + const __m128i mask = _mm_setr_epi8(3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12); + abcd = _mm_shuffle_epi8(abcd, mask); + + /* And store it */ + _mm_storeu_si128((__m128i *)digest, abcd); + + /* Finally, store the leftover word */ + uint32_t e = _mm_extract_epi32(s->core[1], 3); + PUT_32BIT_MSB_FIRST(digest + 16, e); +} + +SHA1_VTABLE(ni, "SHA-NI accelerated"); diff --git a/crypto/sha1-select.c b/crypto/sha1-select.c new file mode 100644 index 00000000..1e8a6ce9 --- /dev/null +++ b/crypto/sha1-select.c @@ -0,0 +1,44 @@ +/* + * Top-level vtables to select a SHA-1 implementation. + */ + +#include +#include + +#include "putty.h" +#include "ssh.h" +#include "sha1.h" + +static ssh_hash *sha1_select(const ssh_hashalg *alg) +{ + static const ssh_hashalg *const real_algs[] = { +#if HAVE_SHA_NI + &ssh_sha1_ni, +#endif +#if HAVE_NEON_CRYPTO + &ssh_sha1_neon, +#endif + &ssh_sha1_sw, + NULL, + }; + + for (size_t i = 0; real_algs[i]; i++) { + const ssh_hashalg *alg = real_algs[i]; + const struct sha1_extra *alg_extra = + (const struct sha1_extra *)alg->extra; + if (check_availability(alg_extra)) + return ssh_hash_new(alg); + } + + /* We should never reach the NULL at the end of the list, because + * the last non-NULL entry should be software-only SHA-1, which + * is always available. 
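Callers never name the accelerated vtables directly; they go through the top-level ssh_sha1 object, whose new() method is this selector. An illustrative usage sketch, not part of the patch, assuming PuTTY's usual ssh_hash_new / put_data / ssh_hash_final helpers:

    static void sha1_digest_example(const void *data, size_t len,
                                    unsigned char digest[20])
    {
        ssh_hash *h = ssh_hash_new(&ssh_sha1); /* sha1_select picks NI/NEON/sw */
        put_data(h, data, len);
        ssh_hash_final(h, digest);             /* emits 20 bytes and frees h */
    }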
*/ + unreachable("sha1_select ran off the end of its list"); +} + +const ssh_hashalg ssh_sha1 = { + .new = sha1_select, + .hlen = 20, + .blocklen = 64, + HASHALG_NAMES_ANNOTATED("SHA-1", "dummy selector vtable"), +}; diff --git a/crypto/sha1-sw.c b/crypto/sha1-sw.c new file mode 100644 index 00000000..905d97f3 --- /dev/null +++ b/crypto/sha1-sw.c @@ -0,0 +1,155 @@ +/* + * Software implementation of SHA-1. + */ + +#include "ssh.h" +#include "sha1.h" + +static bool sha1_sw_available(void) +{ + /* Software SHA-1 is always available */ + return true; +} + +static inline uint32_t rol(uint32_t x, unsigned y) +{ + return (x << (31 & y)) | (x >> (31 & -y)); +} + +static inline uint32_t Ch(uint32_t ctrl, uint32_t if1, uint32_t if0) +{ + return if0 ^ (ctrl & (if1 ^ if0)); +} + +static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z) +{ + return (x & y) | (z & (x | y)); +} + +static inline uint32_t Par(uint32_t x, uint32_t y, uint32_t z) +{ + return (x ^ y ^ z); +} + +static inline void sha1_sw_round( + unsigned round_index, const uint32_t *schedule, + uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d, uint32_t *e, + uint32_t f, uint32_t constant) +{ + *e = rol(*a, 5) + f + *e + schedule[round_index] + constant; + *b = rol(*b, 30); +} + +static void sha1_sw_block(uint32_t *core, const uint8_t *block) +{ + uint32_t w[SHA1_ROUNDS]; + uint32_t a,b,c,d,e; + + for (size_t t = 0; t < 16; t++) + w[t] = GET_32BIT_MSB_FIRST(block + 4*t); + + for (size_t t = 16; t < SHA1_ROUNDS; t++) + w[t] = rol(w[t - 3] ^ w[t - 8] ^ w[t - 14] ^ w[t - 16], 1); + + a = core[0]; b = core[1]; c = core[2]; d = core[3]; + e = core[4]; + + size_t t = 0; + for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) { + sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Ch(b,c,d), SHA1_STAGE0_CONSTANT); + sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Ch(a,b,c), SHA1_STAGE0_CONSTANT); + sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Ch(e,a,b), SHA1_STAGE0_CONSTANT); + sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Ch(d,e,a), SHA1_STAGE0_CONSTANT); + sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Ch(c,d,e), SHA1_STAGE0_CONSTANT); + } + for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) { + sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Par(b,c,d), SHA1_STAGE1_CONSTANT); + sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Par(a,b,c), SHA1_STAGE1_CONSTANT); + sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Par(e,a,b), SHA1_STAGE1_CONSTANT); + sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Par(d,e,a), SHA1_STAGE1_CONSTANT); + sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Par(c,d,e), SHA1_STAGE1_CONSTANT); + } + for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) { + sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Maj(b,c,d), SHA1_STAGE2_CONSTANT); + sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Maj(a,b,c), SHA1_STAGE2_CONSTANT); + sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Maj(e,a,b), SHA1_STAGE2_CONSTANT); + sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Maj(d,e,a), SHA1_STAGE2_CONSTANT); + sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Maj(c,d,e), SHA1_STAGE2_CONSTANT); + } + for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) { + sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Par(b,c,d), SHA1_STAGE3_CONSTANT); + sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Par(a,b,c), SHA1_STAGE3_CONSTANT); + sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Par(e,a,b), SHA1_STAGE3_CONSTANT); + sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Par(d,e,a), SHA1_STAGE3_CONSTANT); + sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Par(c,d,e), SHA1_STAGE3_CONSTANT); + } + + core[0] += a; core[1] += b; core[2] += c; core[3] += d; core[4] += e; + + smemclr(w, sizeof(w)); +} + +typedef struct sha1_sw { + uint32_t core[5]; + sha1_block blk; + 
BinarySink_IMPLEMENTATION; + ssh_hash hash; +} sha1_sw; + +static void sha1_sw_write(BinarySink *bs, const void *vp, size_t len); + +static ssh_hash *sha1_sw_new(const ssh_hashalg *alg) +{ + sha1_sw *s = snew(sha1_sw); + + s->hash.vt = alg; + BinarySink_INIT(s, sha1_sw_write); + BinarySink_DELEGATE_INIT(&s->hash, s); + return &s->hash; +} + +static void sha1_sw_reset(ssh_hash *hash) +{ + sha1_sw *s = container_of(hash, sha1_sw, hash); + + memcpy(s->core, sha1_initial_state, sizeof(s->core)); + sha1_block_setup(&s->blk); +} + +static void sha1_sw_copyfrom(ssh_hash *hcopy, ssh_hash *horig) +{ + sha1_sw *copy = container_of(hcopy, sha1_sw, hash); + sha1_sw *orig = container_of(horig, sha1_sw, hash); + + memcpy(copy, orig, sizeof(*copy)); + BinarySink_COPIED(copy); + BinarySink_DELEGATE_INIT(©->hash, copy); +} + +static void sha1_sw_free(ssh_hash *hash) +{ + sha1_sw *s = container_of(hash, sha1_sw, hash); + + smemclr(s, sizeof(*s)); + sfree(s); +} + +static void sha1_sw_write(BinarySink *bs, const void *vp, size_t len) +{ + sha1_sw *s = BinarySink_DOWNCAST(bs, sha1_sw); + + while (len > 0) + if (sha1_block_write(&s->blk, &vp, &len)) + sha1_sw_block(s->core, s->blk.block); +} + +static void sha1_sw_digest(ssh_hash *hash, uint8_t *digest) +{ + sha1_sw *s = container_of(hash, sha1_sw, hash); + + sha1_block_pad(&s->blk, BinarySink_UPCAST(s)); + for (size_t i = 0; i < 5; i++) + PUT_32BIT_MSB_FIRST(digest + 4*i, s->core[i]); +} + +SHA1_VTABLE(sw, "unaccelerated"); diff --git a/crypto/sha1.c b/crypto/sha1.c deleted file mode 100644 index 536d474f..00000000 --- a/crypto/sha1.c +++ /dev/null @@ -1,933 +0,0 @@ -/* - * SHA-1 algorithm as described at - * - * http://csrc.nist.gov/cryptval/shs.html - */ - -#include "ssh.h" -#include - -/* - * Start by deciding whether we can support hardware SHA at all. - */ -#define HW_SHA1_NONE 0 -#define HW_SHA1_NI 1 -#define HW_SHA1_NEON 2 - -#ifdef _FORCE_SHA_NI -# define HW_SHA1 HW_SHA1_NI -#elif defined(__clang__) -# if __has_attribute(target) && __has_include() && \ - (defined(__x86_64__) || defined(__i386)) -# define HW_SHA1 HW_SHA1_NI -# endif -#elif defined(__GNUC__) -# if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)) && \ - (defined(__x86_64__) || defined(__i386)) -# define HW_SHA1 HW_SHA1_NI -# endif -#elif defined (_MSC_VER) -# if (defined(_M_X64) || defined(_M_IX86)) && _MSC_FULL_VER >= 150030729 -# define HW_SHA1 HW_SHA1_NI -# endif -#endif - -#ifdef _FORCE_SHA_NEON -# define HW_SHA1 HW_SHA1_NEON -#elif defined __BYTE_ORDER__ && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - /* Arm can potentially support both endiannesses, but this code - * hasn't been tested on anything but little. If anyone wants to - * run big-endian, they'll need to fix it first. */ -#elif defined __ARM_FEATURE_CRYPTO - /* If the Arm crypto extension is available already, we can - * support NEON SHA without having to enable anything by hand */ -# define HW_SHA1 HW_SHA1_NEON -#elif defined(__clang__) -# if __has_attribute(target) && __has_include() && \ - (defined(__aarch64__)) - /* clang can enable the crypto extension in AArch64 using - * __attribute__((target)) */ -# define HW_SHA1 HW_SHA1_NEON -# define USE_CLANG_ATTR_TARGET_AARCH64 -# endif -#elif defined _MSC_VER - /* Visual Studio supports the crypto extension when targeting - * AArch64, but as of VS2017, the AArch32 header doesn't quite - * manage it (declaring the shae/shad intrinsics without a round - * key operand). 
*/ -# if defined _M_ARM64 -# define HW_SHA1 HW_SHA1_NEON -# if defined _M_ARM64 -# define USE_ARM64_NEON_H /* unusual header name in this case */ -# endif -# endif -#endif - -#if defined _FORCE_SOFTWARE_SHA || !defined HW_SHA1 -# undef HW_SHA1 -# define HW_SHA1 HW_SHA1_NONE -#endif - -/* - * The actual query function that asks if hardware acceleration is - * available. - */ -static bool sha1_hw_available(void); - -/* - * The top-level selection function, caching the results of - * sha1_hw_available() so it only has to run once. - */ -static bool sha1_hw_available_cached(void) -{ - static bool initialised = false; - static bool hw_available; - if (!initialised) { - hw_available = sha1_hw_available(); - initialised = true; - } - return hw_available; -} - -static ssh_hash *sha1_select(const ssh_hashalg *alg) -{ - const ssh_hashalg *real_alg = - sha1_hw_available_cached() ? &ssh_sha1_hw : &ssh_sha1_sw; - - return ssh_hash_new(real_alg); -} - -const ssh_hashalg ssh_sha1 = { - .new = sha1_select, - .hlen = 20, - .blocklen = 64, - HASHALG_NAMES_ANNOTATED("SHA-1", "dummy selector vtable"), -}; - -/* ---------------------------------------------------------------------- - * Definitions likely to be helpful to multiple implementations. - */ - -static const uint32_t sha1_initial_state[] = { - 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0, -}; - -#define SHA1_ROUNDS_PER_STAGE 20 -#define SHA1_STAGE0_CONSTANT 0x5a827999 -#define SHA1_STAGE1_CONSTANT 0x6ed9eba1 -#define SHA1_STAGE2_CONSTANT 0x8f1bbcdc -#define SHA1_STAGE3_CONSTANT 0xca62c1d6 -#define SHA1_ROUNDS (4 * SHA1_ROUNDS_PER_STAGE) - -typedef struct sha1_block sha1_block; -struct sha1_block { - uint8_t block[64]; - size_t used; - uint64_t len; -}; - -static inline void sha1_block_setup(sha1_block *blk) -{ - blk->used = 0; - blk->len = 0; -} - -static inline bool sha1_block_write( - sha1_block *blk, const void **vdata, size_t *len) -{ - size_t blkleft = sizeof(blk->block) - blk->used; - size_t chunk = *len < blkleft ? *len : blkleft; - - const uint8_t *p = *vdata; - memcpy(blk->block + blk->used, p, chunk); - *vdata = p + chunk; - *len -= chunk; - blk->used += chunk; - blk->len += chunk; - - if (blk->used == sizeof(blk->block)) { - blk->used = 0; - return true; - } - - return false; -} - -static inline void sha1_block_pad(sha1_block *blk, BinarySink *bs) -{ - uint64_t final_len = blk->len << 3; - size_t pad = 1 + (63 & (55 - blk->used)); - - put_byte(bs, 0x80); - for (size_t i = 1; i < pad; i++) - put_byte(bs, 0); - put_uint64(bs, final_len); - - assert(blk->used == 0 && "Should have exactly hit a block boundary"); -} - -/* ---------------------------------------------------------------------- - * Software implementation of SHA-1. 
- */ - -static inline uint32_t rol(uint32_t x, unsigned y) -{ - return (x << (31 & y)) | (x >> (31 & -y)); -} - -static inline uint32_t Ch(uint32_t ctrl, uint32_t if1, uint32_t if0) -{ - return if0 ^ (ctrl & (if1 ^ if0)); -} - -static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z) -{ - return (x & y) | (z & (x | y)); -} - -static inline uint32_t Par(uint32_t x, uint32_t y, uint32_t z) -{ - return (x ^ y ^ z); -} - -static inline void sha1_sw_round( - unsigned round_index, const uint32_t *schedule, - uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d, uint32_t *e, - uint32_t f, uint32_t constant) -{ - *e = rol(*a, 5) + f + *e + schedule[round_index] + constant; - *b = rol(*b, 30); -} - -static void sha1_sw_block(uint32_t *core, const uint8_t *block) -{ - uint32_t w[SHA1_ROUNDS]; - uint32_t a,b,c,d,e; - - for (size_t t = 0; t < 16; t++) - w[t] = GET_32BIT_MSB_FIRST(block + 4*t); - - for (size_t t = 16; t < SHA1_ROUNDS; t++) - w[t] = rol(w[t - 3] ^ w[t - 8] ^ w[t - 14] ^ w[t - 16], 1); - - a = core[0]; b = core[1]; c = core[2]; d = core[3]; - e = core[4]; - - size_t t = 0; - for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) { - sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Ch(b,c,d), SHA1_STAGE0_CONSTANT); - sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Ch(a,b,c), SHA1_STAGE0_CONSTANT); - sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Ch(e,a,b), SHA1_STAGE0_CONSTANT); - sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Ch(d,e,a), SHA1_STAGE0_CONSTANT); - sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Ch(c,d,e), SHA1_STAGE0_CONSTANT); - } - for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) { - sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Par(b,c,d), SHA1_STAGE1_CONSTANT); - sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Par(a,b,c), SHA1_STAGE1_CONSTANT); - sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Par(e,a,b), SHA1_STAGE1_CONSTANT); - sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Par(d,e,a), SHA1_STAGE1_CONSTANT); - sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Par(c,d,e), SHA1_STAGE1_CONSTANT); - } - for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) { - sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Maj(b,c,d), SHA1_STAGE2_CONSTANT); - sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Maj(a,b,c), SHA1_STAGE2_CONSTANT); - sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Maj(e,a,b), SHA1_STAGE2_CONSTANT); - sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Maj(d,e,a), SHA1_STAGE2_CONSTANT); - sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Maj(c,d,e), SHA1_STAGE2_CONSTANT); - } - for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) { - sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Par(b,c,d), SHA1_STAGE3_CONSTANT); - sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Par(a,b,c), SHA1_STAGE3_CONSTANT); - sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Par(e,a,b), SHA1_STAGE3_CONSTANT); - sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Par(d,e,a), SHA1_STAGE3_CONSTANT); - sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Par(c,d,e), SHA1_STAGE3_CONSTANT); - } - - core[0] += a; core[1] += b; core[2] += c; core[3] += d; core[4] += e; - - smemclr(w, sizeof(w)); -} - -typedef struct sha1_sw { - uint32_t core[5]; - sha1_block blk; - BinarySink_IMPLEMENTATION; - ssh_hash hash; -} sha1_sw; - -static void sha1_sw_write(BinarySink *bs, const void *vp, size_t len); - -static ssh_hash *sha1_sw_new(const ssh_hashalg *alg) -{ - sha1_sw *s = snew(sha1_sw); - - s->hash.vt = alg; - BinarySink_INIT(s, sha1_sw_write); - BinarySink_DELEGATE_INIT(&s->hash, s); - return &s->hash; -} - -static void sha1_sw_reset(ssh_hash *hash) -{ - sha1_sw *s = container_of(hash, sha1_sw, hash); - - memcpy(s->core, sha1_initial_state, sizeof(s->core)); - sha1_block_setup(&s->blk); -} - -static void 
sha1_sw_copyfrom(ssh_hash *hcopy, ssh_hash *horig) -{ - sha1_sw *copy = container_of(hcopy, sha1_sw, hash); - sha1_sw *orig = container_of(horig, sha1_sw, hash); - - memcpy(copy, orig, sizeof(*copy)); - BinarySink_COPIED(copy); - BinarySink_DELEGATE_INIT(©->hash, copy); -} - -static void sha1_sw_free(ssh_hash *hash) -{ - sha1_sw *s = container_of(hash, sha1_sw, hash); - - smemclr(s, sizeof(*s)); - sfree(s); -} - -static void sha1_sw_write(BinarySink *bs, const void *vp, size_t len) -{ - sha1_sw *s = BinarySink_DOWNCAST(bs, sha1_sw); - - while (len > 0) - if (sha1_block_write(&s->blk, &vp, &len)) - sha1_sw_block(s->core, s->blk.block); -} - -static void sha1_sw_digest(ssh_hash *hash, uint8_t *digest) -{ - sha1_sw *s = container_of(hash, sha1_sw, hash); - - sha1_block_pad(&s->blk, BinarySink_UPCAST(s)); - for (size_t i = 0; i < 5; i++) - PUT_32BIT_MSB_FIRST(digest + 4*i, s->core[i]); -} - -const ssh_hashalg ssh_sha1_sw = { - .new = sha1_sw_new, - .reset = sha1_sw_reset, - .copyfrom = sha1_sw_copyfrom, - .digest = sha1_sw_digest, - .free = sha1_sw_free, - .hlen = 20, - .blocklen = 64, - HASHALG_NAMES_ANNOTATED("SHA-1", "unaccelerated"), -}; - -/* ---------------------------------------------------------------------- - * Hardware-accelerated implementation of SHA-1 using x86 SHA-NI. - */ - -#if HW_SHA1 == HW_SHA1_NI - -/* - * Set target architecture for Clang and GCC - */ - -#if defined(__clang__) || defined(__GNUC__) -# define FUNC_ISA __attribute__ ((target("sse4.1,sha"))) -#if !defined(__clang__) -# pragma GCC target("sha") -# pragma GCC target("sse4.1") -#endif -#else -# define FUNC_ISA -#endif - -#include -#include -#include -#if defined(__clang__) || defined(__GNUC__) -#include -#endif - -#if defined(__clang__) || defined(__GNUC__) -#include -#define GET_CPU_ID_0(out) \ - __cpuid(0, (out)[0], (out)[1], (out)[2], (out)[3]) -#define GET_CPU_ID_7(out) \ - __cpuid_count(7, 0, (out)[0], (out)[1], (out)[2], (out)[3]) -#else -#define GET_CPU_ID_0(out) __cpuid(out, 0) -#define GET_CPU_ID_7(out) __cpuidex(out, 7, 0) -#endif - -static bool sha1_hw_available(void) -{ - unsigned int CPUInfo[4]; - GET_CPU_ID_0(CPUInfo); - if (CPUInfo[0] < 7) - return false; - - GET_CPU_ID_7(CPUInfo); - return CPUInfo[1] & (1 << 29); /* Check SHA */ -} - -/* SHA1 implementation using new instructions - The code is based on Jeffrey Walton's SHA1 implementation: - https://github.com/noloader/SHA-Intrinsics -*/ -FUNC_ISA -static inline void sha1_ni_block(__m128i *core, const uint8_t *p) -{ - __m128i ABCD, E0, E1, MSG0, MSG1, MSG2, MSG3; - const __m128i MASK = _mm_set_epi64x( - 0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL); - - const __m128i *block = (const __m128i *)p; - - /* Load initial values */ - ABCD = core[0]; - E0 = core[1]; - - /* Rounds 0-3 */ - MSG0 = _mm_loadu_si128(block); - MSG0 = _mm_shuffle_epi8(MSG0, MASK); - E0 = _mm_add_epi32(E0, MSG0); - E1 = ABCD; - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); - - /* Rounds 4-7 */ - MSG1 = _mm_loadu_si128(block + 1); - MSG1 = _mm_shuffle_epi8(MSG1, MASK); - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); - MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); - - /* Rounds 8-11 */ - MSG2 = _mm_loadu_si128(block + 2); - MSG2 = _mm_shuffle_epi8(MSG2, MASK); - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); - MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); - MSG0 = _mm_xor_si128(MSG0, MSG2); - - /* Rounds 12-15 */ - MSG3 = _mm_loadu_si128(block + 3); - MSG3 = _mm_shuffle_epi8(MSG3, MASK); - E1 = 
_mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; - MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); - MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); - MSG1 = _mm_xor_si128(MSG1, MSG3); - - /* Rounds 16-19 */ - E0 = _mm_sha1nexte_epu32(E0, MSG0); - E1 = ABCD; - MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); - MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); - MSG2 = _mm_xor_si128(MSG2, MSG0); - - /* Rounds 20-23 */ - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; - MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); - MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); - MSG3 = _mm_xor_si128(MSG3, MSG1); - - /* Rounds 24-27 */ - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; - MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1); - MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); - MSG0 = _mm_xor_si128(MSG0, MSG2); - - /* Rounds 28-31 */ - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; - MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); - MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); - MSG1 = _mm_xor_si128(MSG1, MSG3); - - /* Rounds 32-35 */ - E0 = _mm_sha1nexte_epu32(E0, MSG0); - E1 = ABCD; - MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1); - MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); - MSG2 = _mm_xor_si128(MSG2, MSG0); - - /* Rounds 36-39 */ - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; - MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); - MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); - MSG3 = _mm_xor_si128(MSG3, MSG1); - - /* Rounds 40-43 */ - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; - MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); - MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); - MSG0 = _mm_xor_si128(MSG0, MSG2); - - /* Rounds 44-47 */ - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; - MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2); - MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); - MSG1 = _mm_xor_si128(MSG1, MSG3); - - /* Rounds 48-51 */ - E0 = _mm_sha1nexte_epu32(E0, MSG0); - E1 = ABCD; - MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); - MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); - MSG2 = _mm_xor_si128(MSG2, MSG0); - - /* Rounds 52-55 */ - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; - MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2); - MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); - MSG3 = _mm_xor_si128(MSG3, MSG1); - - /* Rounds 56-59 */ - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; - MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); - MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); - MSG0 = _mm_xor_si128(MSG0, MSG2); - - /* Rounds 60-63 */ - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; - MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); - MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); - MSG1 = _mm_xor_si128(MSG1, MSG3); - - /* Rounds 64-67 */ - E0 = _mm_sha1nexte_epu32(E0, MSG0); - E1 = ABCD; - MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3); - MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); - MSG2 = _mm_xor_si128(MSG2, MSG0); - - /* Rounds 68-71 */ - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; - MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); - MSG3 = _mm_xor_si128(MSG3, MSG1); - - /* Rounds 72-75 */ - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; - MSG3 = 
_mm_sha1msg2_epu32(MSG3, MSG2); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3); - - /* Rounds 76-79 */ - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); - - /* Combine state */ - core[0] = _mm_add_epi32(ABCD, core[0]); - core[1] = _mm_sha1nexte_epu32(E0, core[1]); -} - -typedef struct sha1_ni { - /* - * core[0] stores the first four words of the SHA-1 state. core[1] - * stores just the fifth word, in the vector lane at the highest - * address. - */ - __m128i core[2]; - sha1_block blk; - void *pointer_to_free; - BinarySink_IMPLEMENTATION; - ssh_hash hash; -} sha1_ni; - -static void sha1_ni_write(BinarySink *bs, const void *vp, size_t len); - -static sha1_ni *sha1_ni_alloc(void) -{ - /* - * The __m128i variables in the context structure need to be - * 16-byte aligned, but not all malloc implementations that this - * code has to work with will guarantee to return a 16-byte - * aligned pointer. So we over-allocate, manually realign the - * pointer ourselves, and store the original one inside the - * context so we know how to free it later. - */ - void *allocation = smalloc(sizeof(sha1_ni) + 15); - uintptr_t alloc_address = (uintptr_t)allocation; - uintptr_t aligned_address = (alloc_address + 15) & ~15; - sha1_ni *s = (sha1_ni *)aligned_address; - s->pointer_to_free = allocation; - return s; -} - -static ssh_hash *sha1_ni_new(const ssh_hashalg *alg) -{ - if (!sha1_hw_available_cached()) - return NULL; - - sha1_ni *s = sha1_ni_alloc(); - - s->hash.vt = alg; - BinarySink_INIT(s, sha1_ni_write); - BinarySink_DELEGATE_INIT(&s->hash, s); - return &s->hash; -} - -FUNC_ISA static void sha1_ni_reset(ssh_hash *hash) -{ - sha1_ni *s = container_of(hash, sha1_ni, hash); - - /* Initialise the core vectors in their storage order */ - s->core[0] = _mm_set_epi64x( - 0x67452301efcdab89ULL, 0x98badcfe10325476ULL); - s->core[1] = _mm_set_epi32(0xc3d2e1f0, 0, 0, 0); - - sha1_block_setup(&s->blk); -} - -static void sha1_ni_copyfrom(ssh_hash *hcopy, ssh_hash *horig) -{ - sha1_ni *copy = container_of(hcopy, sha1_ni, hash); - sha1_ni *orig = container_of(horig, sha1_ni, hash); - - void *ptf_save = copy->pointer_to_free; - *copy = *orig; /* structure copy */ - copy->pointer_to_free = ptf_save; - - BinarySink_COPIED(copy); - BinarySink_DELEGATE_INIT(©->hash, copy); -} - -static void sha1_ni_free(ssh_hash *hash) -{ - sha1_ni *s = container_of(hash, sha1_ni, hash); - - void *ptf = s->pointer_to_free; - smemclr(s, sizeof(*s)); - sfree(ptf); -} - -static void sha1_ni_write(BinarySink *bs, const void *vp, size_t len) -{ - sha1_ni *s = BinarySink_DOWNCAST(bs, sha1_ni); - - while (len > 0) - if (sha1_block_write(&s->blk, &vp, &len)) - sha1_ni_block(s->core, s->blk.block); -} - -FUNC_ISA static void sha1_ni_digest(ssh_hash *hash, uint8_t *digest) -{ - sha1_ni *s = container_of(hash, sha1_ni, hash); - - sha1_block_pad(&s->blk, BinarySink_UPCAST(s)); - - /* Rearrange the first vector into its output order */ - __m128i abcd = _mm_shuffle_epi32(s->core[0], 0x1B); - - /* Byte-swap it into the output endianness */ - const __m128i mask = _mm_setr_epi8(3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12); - abcd = _mm_shuffle_epi8(abcd, mask); - - /* And store it */ - _mm_storeu_si128((__m128i *)digest, abcd); - - /* Finally, store the leftover word */ - uint32_t e = _mm_extract_epi32(s->core[1], 3); - PUT_32BIT_MSB_FIRST(digest + 16, e); -} - -const ssh_hashalg ssh_sha1_hw = { - .new = sha1_ni_new, - .reset = sha1_ni_reset, - .copyfrom = sha1_ni_copyfrom, - .digest = sha1_ni_digest, - .free = sha1_ni_free, - 
.hlen = 20, - .blocklen = 64, - HASHALG_NAMES_ANNOTATED("SHA-1", "SHA-NI accelerated"), -}; - -/* ---------------------------------------------------------------------- - * Hardware-accelerated implementation of SHA-1 using Arm NEON. - */ - -#elif HW_SHA1 == HW_SHA1_NEON - -/* - * Manually set the target architecture, if we decided above that we - * need to. - */ -#ifdef USE_CLANG_ATTR_TARGET_AARCH64 -/* - * A spot of cheating: redefine some ACLE feature macros before - * including arm_neon.h. Otherwise we won't get the SHA intrinsics - * defined by that header, because it will be looking at the settings - * for the whole translation unit rather than the ones we're going to - * put on some particular functions using __attribute__((target)). - */ -#define __ARM_NEON 1 -#define __ARM_FEATURE_CRYPTO 1 -#define FUNC_ISA __attribute__ ((target("neon,crypto"))) -#endif /* USE_CLANG_ATTR_TARGET_AARCH64 */ - -#ifndef FUNC_ISA -#define FUNC_ISA -#endif - -#ifdef USE_ARM64_NEON_H -#include -#else -#include -#endif - -static bool sha1_hw_available(void) -{ - /* - * For Arm, we delegate to a per-platform detection function (see - * explanation in sshaes.c). - */ - return platform_sha1_hw_available(); -} - -typedef struct sha1_neon_core sha1_neon_core; -struct sha1_neon_core { - uint32x4_t abcd; - uint32_t e; -}; - -FUNC_ISA -static inline uint32x4_t sha1_neon_load_input(const uint8_t *p) -{ - return vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(p))); -} - -FUNC_ISA -static inline uint32x4_t sha1_neon_schedule_update( - uint32x4_t m4, uint32x4_t m3, uint32x4_t m2, uint32x4_t m1) -{ - return vsha1su1q_u32(vsha1su0q_u32(m4, m3, m2), m1); -} - -/* - * SHA-1 has three different kinds of round, differing in whether they - * use the Ch, Maj or Par functions defined above. Each one uses a - * separate NEON instruction, so we define three inline functions for - * the different round types using this macro. - * - * The two batches of Par-type rounds also use a different constant, - * but that's passed in as an operand, so we don't need a fourth - * inline function just for that. 
- */ -#define SHA1_NEON_ROUND_FN(type) \ - FUNC_ISA static inline sha1_neon_core sha1_neon_round4_##type( \ - sha1_neon_core old, uint32x4_t sched, uint32x4_t constant) \ - { \ - sha1_neon_core new; \ - uint32x4_t round_input = vaddq_u32(sched, constant); \ - new.abcd = vsha1##type##q_u32(old.abcd, old.e, round_input); \ - new.e = vsha1h_u32(vget_lane_u32(vget_low_u32(old.abcd), 0)); \ - return new; \ - } -SHA1_NEON_ROUND_FN(c) -SHA1_NEON_ROUND_FN(p) -SHA1_NEON_ROUND_FN(m) - -FUNC_ISA -static inline void sha1_neon_block(sha1_neon_core *core, const uint8_t *p) -{ - uint32x4_t constant, s0, s1, s2, s3; - sha1_neon_core cr = *core; - - constant = vdupq_n_u32(SHA1_STAGE0_CONSTANT); - s0 = sha1_neon_load_input(p); - cr = sha1_neon_round4_c(cr, s0, constant); - s1 = sha1_neon_load_input(p + 16); - cr = sha1_neon_round4_c(cr, s1, constant); - s2 = sha1_neon_load_input(p + 32); - cr = sha1_neon_round4_c(cr, s2, constant); - s3 = sha1_neon_load_input(p + 48); - cr = sha1_neon_round4_c(cr, s3, constant); - s0 = sha1_neon_schedule_update(s0, s1, s2, s3); - cr = sha1_neon_round4_c(cr, s0, constant); - - constant = vdupq_n_u32(SHA1_STAGE1_CONSTANT); - s1 = sha1_neon_schedule_update(s1, s2, s3, s0); - cr = sha1_neon_round4_p(cr, s1, constant); - s2 = sha1_neon_schedule_update(s2, s3, s0, s1); - cr = sha1_neon_round4_p(cr, s2, constant); - s3 = sha1_neon_schedule_update(s3, s0, s1, s2); - cr = sha1_neon_round4_p(cr, s3, constant); - s0 = sha1_neon_schedule_update(s0, s1, s2, s3); - cr = sha1_neon_round4_p(cr, s0, constant); - s1 = sha1_neon_schedule_update(s1, s2, s3, s0); - cr = sha1_neon_round4_p(cr, s1, constant); - - constant = vdupq_n_u32(SHA1_STAGE2_CONSTANT); - s2 = sha1_neon_schedule_update(s2, s3, s0, s1); - cr = sha1_neon_round4_m(cr, s2, constant); - s3 = sha1_neon_schedule_update(s3, s0, s1, s2); - cr = sha1_neon_round4_m(cr, s3, constant); - s0 = sha1_neon_schedule_update(s0, s1, s2, s3); - cr = sha1_neon_round4_m(cr, s0, constant); - s1 = sha1_neon_schedule_update(s1, s2, s3, s0); - cr = sha1_neon_round4_m(cr, s1, constant); - s2 = sha1_neon_schedule_update(s2, s3, s0, s1); - cr = sha1_neon_round4_m(cr, s2, constant); - - constant = vdupq_n_u32(SHA1_STAGE3_CONSTANT); - s3 = sha1_neon_schedule_update(s3, s0, s1, s2); - cr = sha1_neon_round4_p(cr, s3, constant); - s0 = sha1_neon_schedule_update(s0, s1, s2, s3); - cr = sha1_neon_round4_p(cr, s0, constant); - s1 = sha1_neon_schedule_update(s1, s2, s3, s0); - cr = sha1_neon_round4_p(cr, s1, constant); - s2 = sha1_neon_schedule_update(s2, s3, s0, s1); - cr = sha1_neon_round4_p(cr, s2, constant); - s3 = sha1_neon_schedule_update(s3, s0, s1, s2); - cr = sha1_neon_round4_p(cr, s3, constant); - - core->abcd = vaddq_u32(core->abcd, cr.abcd); - core->e += cr.e; -} - -typedef struct sha1_neon { - sha1_neon_core core; - sha1_block blk; - BinarySink_IMPLEMENTATION; - ssh_hash hash; -} sha1_neon; - -static void sha1_neon_write(BinarySink *bs, const void *vp, size_t len); - -static ssh_hash *sha1_neon_new(const ssh_hashalg *alg) -{ - if (!sha1_hw_available_cached()) - return NULL; - - sha1_neon *s = snew(sha1_neon); - - s->hash.vt = alg; - BinarySink_INIT(s, sha1_neon_write); - BinarySink_DELEGATE_INIT(&s->hash, s); - return &s->hash; -} - -static void sha1_neon_reset(ssh_hash *hash) -{ - sha1_neon *s = container_of(hash, sha1_neon, hash); - - s->core.abcd = vld1q_u32(sha1_initial_state); - s->core.e = sha1_initial_state[4]; - - sha1_block_setup(&s->blk); -} - -static void sha1_neon_copyfrom(ssh_hash *hcopy, ssh_hash *horig) -{ - sha1_neon *copy = 
container_of(hcopy, sha1_neon, hash); - sha1_neon *orig = container_of(horig, sha1_neon, hash); - - *copy = *orig; /* structure copy */ - - BinarySink_COPIED(copy); - BinarySink_DELEGATE_INIT(©->hash, copy); -} - -static void sha1_neon_free(ssh_hash *hash) -{ - sha1_neon *s = container_of(hash, sha1_neon, hash); - smemclr(s, sizeof(*s)); - sfree(s); -} - -static void sha1_neon_write(BinarySink *bs, const void *vp, size_t len) -{ - sha1_neon *s = BinarySink_DOWNCAST(bs, sha1_neon); - - while (len > 0) - if (sha1_block_write(&s->blk, &vp, &len)) - sha1_neon_block(&s->core, s->blk.block); -} - -static void sha1_neon_digest(ssh_hash *hash, uint8_t *digest) -{ - sha1_neon *s = container_of(hash, sha1_neon, hash); - - sha1_block_pad(&s->blk, BinarySink_UPCAST(s)); - vst1q_u8(digest, vrev32q_u8(vreinterpretq_u8_u32(s->core.abcd))); - PUT_32BIT_MSB_FIRST(digest + 16, s->core.e); -} - -const ssh_hashalg ssh_sha1_hw = { - .new = sha1_neon_new, - .reset = sha1_neon_reset, - .copyfrom = sha1_neon_copyfrom, - .digest = sha1_neon_digest, - .free = sha1_neon_free, - .hlen = 20, - .blocklen = 64, - HASHALG_NAMES_ANNOTATED("SHA-1", "NEON accelerated"), -}; - -/* ---------------------------------------------------------------------- - * Stub functions if we have no hardware-accelerated SHA-1. In this - * case, sha1_hw_new returns NULL (though it should also never be - * selected by sha1_select, so the only thing that should even be - * _able_ to call it is testcrypt). As a result, the remaining vtable - * functions should never be called at all. - */ - -#elif HW_SHA1 == HW_SHA1_NONE - -static bool sha1_hw_available(void) -{ - return false; -} - -static ssh_hash *sha1_stub_new(const ssh_hashalg *alg) -{ - return NULL; -} - -#define STUB_BODY { unreachable("Should never be called"); } - -static void sha1_stub_reset(ssh_hash *hash) STUB_BODY -static void sha1_stub_copyfrom(ssh_hash *hash, ssh_hash *orig) STUB_BODY -static void sha1_stub_free(ssh_hash *hash) STUB_BODY -static void sha1_stub_digest(ssh_hash *hash, uint8_t *digest) STUB_BODY - -const ssh_hashalg ssh_sha1_hw = { - .new = sha1_stub_new, - .reset = sha1_stub_reset, - .copyfrom = sha1_stub_copyfrom, - .digest = sha1_stub_digest, - .free = sha1_stub_free, - .hlen = 20, - .blocklen = 64, - HASHALG_NAMES_ANNOTATED("SHA-1", "!NONEXISTENT ACCELERATED VERSION!"), -}; - -#endif /* HW_SHA1 */ diff --git a/crypto/sha1.h b/crypto/sha1.h new file mode 100644 index 00000000..2cdba0d4 --- /dev/null +++ b/crypto/sha1.h @@ -0,0 +1,109 @@ +/* + * Definitions likely to be helpful to multiple SHA-1 implementations. + */ + +/* + * The 'extra' structure used by SHA-1 implementations is used to + * include information about how to check if a given implementation is + * available at run time, and whether we've already checked. + */ +struct sha1_extra_mutable; +struct sha1_extra { + /* Function to check availability. Might be expensive, so we don't + * want to call it more than once. */ + bool (*check_available)(void); + + /* Point to a writable substructure. */ + struct sha1_extra_mutable *mut; +}; +struct sha1_extra_mutable { + bool checked_availability; + bool is_available; +}; +static inline bool check_availability(const struct sha1_extra *extra) +{ + if (!extra->mut->checked_availability) { + extra->mut->is_available = extra->check_available(); + extra->mut->checked_availability = true; + } + + return extra->mut->is_available; +} + +/* + * Macro to define a SHA-1 vtable together with its 'extra' + * structure. 
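As a reading aid: with the definition below, an invocation such as SHA1_VTABLE(sw, "unaccelerated") (used at the end of sha1-sw.c) expands to roughly the following, paraphrased rather than literal preprocessor output:

    static struct sha1_extra_mutable sha1_sw_extra_mut;
    static const struct sha1_extra sha1_sw_extra = {
        .check_available = sha1_sw_available,
        .mut = &sha1_sw_extra_mut,
    };
    const ssh_hashalg ssh_sha1_sw = {
        .new = sha1_sw_new,
        .reset = sha1_sw_reset,
        .copyfrom = sha1_sw_copyfrom,
        .digest = sha1_sw_digest,
        .free = sha1_sw_free,
        .hlen = 20,
        .blocklen = 64,
        HASHALG_NAMES_ANNOTATED("SHA-1", "unaccelerated"),
        .extra = &sha1_sw_extra,
    };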
+ */ +#define SHA1_VTABLE(impl_c, impl_display) \ + static struct sha1_extra_mutable sha1_ ## impl_c ## _extra_mut; \ + static const struct sha1_extra sha1_ ## impl_c ## _extra = { \ + .check_available = sha1_ ## impl_c ## _available, \ + .mut = &sha1_ ## impl_c ## _extra_mut, \ + }; \ + const ssh_hashalg ssh_sha1_ ## impl_c = { \ + .new = sha1_ ## impl_c ## _new, \ + .reset = sha1_ ## impl_c ## _reset, \ + .copyfrom = sha1_ ## impl_c ## _copyfrom, \ + .digest = sha1_ ## impl_c ## _digest, \ + .free = sha1_ ## impl_c ## _free, \ + .hlen = 20, \ + .blocklen = 64, \ + HASHALG_NAMES_ANNOTATED("SHA-1", impl_display), \ + .extra = &sha1_ ## impl_c ## _extra, \ + } + +extern const uint32_t sha1_initial_state[5]; + +#define SHA1_ROUNDS_PER_STAGE 20 +#define SHA1_STAGE0_CONSTANT 0x5a827999 +#define SHA1_STAGE1_CONSTANT 0x6ed9eba1 +#define SHA1_STAGE2_CONSTANT 0x8f1bbcdc +#define SHA1_STAGE3_CONSTANT 0xca62c1d6 +#define SHA1_ROUNDS (4 * SHA1_ROUNDS_PER_STAGE) + +typedef struct sha1_block sha1_block; +struct sha1_block { + uint8_t block[64]; + size_t used; + uint64_t len; +}; + +static inline void sha1_block_setup(sha1_block *blk) +{ + blk->used = 0; + blk->len = 0; +} + +static inline bool sha1_block_write( + sha1_block *blk, const void **vdata, size_t *len) +{ + size_t blkleft = sizeof(blk->block) - blk->used; + size_t chunk = *len < blkleft ? *len : blkleft; + + const uint8_t *p = *vdata; + memcpy(blk->block + blk->used, p, chunk); + *vdata = p + chunk; + *len -= chunk; + blk->used += chunk; + blk->len += chunk; + + if (blk->used == sizeof(blk->block)) { + blk->used = 0; + return true; + } + + return false; +} + +static inline void sha1_block_pad(sha1_block *blk, BinarySink *bs) +{ + uint64_t final_len = blk->len << 3; + size_t pad = 1 + (63 & (55 - blk->used)); + + put_byte(bs, 0x80); + for (size_t i = 1; i < pad; i++) + put_byte(bs, 0); + put_uint64(bs, final_len); + + assert(blk->used == 0 && "Should have exactly hit a block boundary"); +} diff --git a/crypto/sha256-common.c b/crypto/sha256-common.c new file mode 100644 index 00000000..52904c08 --- /dev/null +++ b/crypto/sha256-common.c @@ -0,0 +1,30 @@ +/* + * Common variable definitions across all the SHA-256 implementations. + */ + +#include "ssh.h" +#include "sha256.h" + +const uint32_t sha256_initial_state[8] = { + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, + 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19, +}; + +const uint32_t sha256_round_constants[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, +}; diff --git a/crypto/sha256-neon.c b/crypto/sha256-neon.c new file mode 100644 index 00000000..87d24d0c --- /dev/null +++ b/crypto/sha256-neon.c @@ -0,0 +1,162 @@ +/* + * Hardware-accelerated implementation of SHA-256 using Arm NEON. 
+ */ + +#include "ssh.h" +#include "sha256.h" + +#if USE_ARM64_NEON_H +#include +#else +#include +#endif + +static bool sha256_neon_available(void) +{ + /* + * For Arm, we delegate to a per-platform detection function (see + * explanation in aes-neon.c). + */ + return platform_sha256_neon_available(); +} + +typedef struct sha256_neon_core sha256_neon_core; +struct sha256_neon_core { + uint32x4_t abcd, efgh; +}; + +static inline uint32x4_t sha256_neon_load_input(const uint8_t *p) +{ + return vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(p))); +} + +static inline uint32x4_t sha256_neon_schedule_update( + uint32x4_t m4, uint32x4_t m3, uint32x4_t m2, uint32x4_t m1) +{ + return vsha256su1q_u32(vsha256su0q_u32(m4, m3), m2, m1); +} + +static inline sha256_neon_core sha256_neon_round4( + sha256_neon_core old, uint32x4_t sched, unsigned round) +{ + sha256_neon_core new; + + uint32x4_t round_input = vaddq_u32( + sched, vld1q_u32(sha256_round_constants + round)); + new.abcd = vsha256hq_u32 (old.abcd, old.efgh, round_input); + new.efgh = vsha256h2q_u32(old.efgh, old.abcd, round_input); + return new; +} + +static inline void sha256_neon_block(sha256_neon_core *core, const uint8_t *p) +{ + uint32x4_t s0, s1, s2, s3; + sha256_neon_core cr = *core; + + s0 = sha256_neon_load_input(p); + cr = sha256_neon_round4(cr, s0, 0); + s1 = sha256_neon_load_input(p+16); + cr = sha256_neon_round4(cr, s1, 4); + s2 = sha256_neon_load_input(p+32); + cr = sha256_neon_round4(cr, s2, 8); + s3 = sha256_neon_load_input(p+48); + cr = sha256_neon_round4(cr, s3, 12); + s0 = sha256_neon_schedule_update(s0, s1, s2, s3); + cr = sha256_neon_round4(cr, s0, 16); + s1 = sha256_neon_schedule_update(s1, s2, s3, s0); + cr = sha256_neon_round4(cr, s1, 20); + s2 = sha256_neon_schedule_update(s2, s3, s0, s1); + cr = sha256_neon_round4(cr, s2, 24); + s3 = sha256_neon_schedule_update(s3, s0, s1, s2); + cr = sha256_neon_round4(cr, s3, 28); + s0 = sha256_neon_schedule_update(s0, s1, s2, s3); + cr = sha256_neon_round4(cr, s0, 32); + s1 = sha256_neon_schedule_update(s1, s2, s3, s0); + cr = sha256_neon_round4(cr, s1, 36); + s2 = sha256_neon_schedule_update(s2, s3, s0, s1); + cr = sha256_neon_round4(cr, s2, 40); + s3 = sha256_neon_schedule_update(s3, s0, s1, s2); + cr = sha256_neon_round4(cr, s3, 44); + s0 = sha256_neon_schedule_update(s0, s1, s2, s3); + cr = sha256_neon_round4(cr, s0, 48); + s1 = sha256_neon_schedule_update(s1, s2, s3, s0); + cr = sha256_neon_round4(cr, s1, 52); + s2 = sha256_neon_schedule_update(s2, s3, s0, s1); + cr = sha256_neon_round4(cr, s2, 56); + s3 = sha256_neon_schedule_update(s3, s0, s1, s2); + cr = sha256_neon_round4(cr, s3, 60); + + core->abcd = vaddq_u32(core->abcd, cr.abcd); + core->efgh = vaddq_u32(core->efgh, cr.efgh); +} + +typedef struct sha256_neon { + sha256_neon_core core; + sha256_block blk; + BinarySink_IMPLEMENTATION; + ssh_hash hash; +} sha256_neon; + +static void sha256_neon_write(BinarySink *bs, const void *vp, size_t len); + +static ssh_hash *sha256_neon_new(const ssh_hashalg *alg) +{ + const struct sha256_extra *extra = (const struct sha256_extra *)alg->extra; + if (!check_availability(extra)) + return NULL; + + sha256_neon *s = snew(sha256_neon); + + s->hash.vt = alg; + BinarySink_INIT(s, sha256_neon_write); + BinarySink_DELEGATE_INIT(&s->hash, s); + return &s->hash; +} + +static void sha256_neon_reset(ssh_hash *hash) +{ + sha256_neon *s = container_of(hash, sha256_neon, hash); + + s->core.abcd = vld1q_u32(sha256_initial_state); + s->core.efgh = vld1q_u32(sha256_initial_state + 4); + + 
+    sha256_block_setup(&s->blk);
+}
+
+static void sha256_neon_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
+{
+    sha256_neon *copy = container_of(hcopy, sha256_neon, hash);
+    sha256_neon *orig = container_of(horig, sha256_neon, hash);
+
+    *copy = *orig;                     /* structure copy */
+
+    BinarySink_COPIED(copy);
+    BinarySink_DELEGATE_INIT(&copy->hash, copy);
+}
+
+static void sha256_neon_free(ssh_hash *hash)
+{
+    sha256_neon *s = container_of(hash, sha256_neon, hash);
+    smemclr(s, sizeof(*s));
+    sfree(s);
+}
+
+static void sha256_neon_write(BinarySink *bs, const void *vp, size_t len)
+{
+    sha256_neon *s = BinarySink_DOWNCAST(bs, sha256_neon);
+
+    while (len > 0)
+        if (sha256_block_write(&s->blk, &vp, &len))
+            sha256_neon_block(&s->core, s->blk.block);
+}
+
+static void sha256_neon_digest(ssh_hash *hash, uint8_t *digest)
+{
+    sha256_neon *s = container_of(hash, sha256_neon, hash);
+
+    sha256_block_pad(&s->blk, BinarySink_UPCAST(s));
+    vst1q_u8(digest,      vrev32q_u8(vreinterpretq_u8_u32(s->core.abcd)));
+    vst1q_u8(digest + 16, vrev32q_u8(vreinterpretq_u8_u32(s->core.efgh)));
+}
+
+SHA256_VTABLE(neon, "NEON accelerated");
diff --git a/crypto/sha256-ni.c b/crypto/sha256-ni.c
new file mode 100644
index 00000000..530fa433
--- /dev/null
+++ b/crypto/sha256-ni.c
@@ -0,0 +1,342 @@
+/*
+ * Hardware-accelerated implementation of SHA-256 using x86 SHA-NI.
+ */
+
+#include "ssh.h"
+#include "sha256.h"
+
+#include <wmmintrin.h>
+#include <smmintrin.h>
+#include <immintrin.h>
+#if HAVE_SHAINTRIN_H
+#include <shaintrin.h>
+#endif
+
+#if defined(__clang__) || defined(__GNUC__)
+#include <cpuid.h>
+#define GET_CPU_ID_0(out) \
+    __cpuid(0, (out)[0], (out)[1], (out)[2], (out)[3])
+#define GET_CPU_ID_7(out) \
+    __cpuid_count(7, 0, (out)[0], (out)[1], (out)[2], (out)[3])
+#else
+#define GET_CPU_ID_0(out) __cpuid(out, 0)
+#define GET_CPU_ID_7(out) __cpuidex(out, 7, 0)
+#endif
+
+static bool sha256_ni_available(void)
+{
+    unsigned int CPUInfo[4];
+    GET_CPU_ID_0(CPUInfo);
+    if (CPUInfo[0] < 7)
+        return false;
+
+    GET_CPU_ID_7(CPUInfo);
+    return CPUInfo[1] & (1 << 29); /* Check SHA */
+}
+
+/* SHA256 implementation using new instructions
+   The code is based on Jeffrey Walton's SHA256 implementation:
+   https://github.com/noloader/SHA-Intrinsics
+*/
+static inline void sha256_ni_block(__m128i *core, const uint8_t *p)
+{
+    __m128i STATE0, STATE1;
+    __m128i MSG, TMP;
+    __m128i MSG0, MSG1, MSG2, MSG3;
+    const __m128i *block = (const __m128i *)p;
+    const __m128i MASK = _mm_set_epi64x(
+        0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
+
+    /* Load initial values */
+    STATE0 = core[0];
+    STATE1 = core[1];
+
+    /* Rounds 0-3 */
+    MSG = _mm_loadu_si128(block);
+    MSG0 = _mm_shuffle_epi8(MSG, MASK);
+    MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(
+                            0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
+    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+    MSG = _mm_shuffle_epi32(MSG, 0x0E);
+    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+
+    /* Rounds 4-7 */
+    MSG1 = _mm_loadu_si128(block + 1);
+    MSG1 = _mm_shuffle_epi8(MSG1, MASK);
+    MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(
+                            0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
+    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+    MSG = _mm_shuffle_epi32(MSG, 0x0E);
+    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+    MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);
+
+    /* Rounds 8-11 */
+    MSG2 = _mm_loadu_si128(block + 2);
+    MSG2 = _mm_shuffle_epi8(MSG2, MASK);
+    MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(
+                            0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
+    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+    MSG = _mm_shuffle_epi32(MSG, 0x0E);
+    STATE0 =
_mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); + + /* Rounds 12-15 */ + MSG3 = _mm_loadu_si128(block + 3); + MSG3 = _mm_shuffle_epi8(MSG3, MASK); + MSG = _mm_add_epi32(MSG3, _mm_set_epi64x( + 0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG3, MSG2, 4); + MSG0 = _mm_add_epi32(MSG0, TMP); + MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); + + /* Rounds 16-19 */ + MSG = _mm_add_epi32(MSG0, _mm_set_epi64x( + 0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG0, MSG3, 4); + MSG1 = _mm_add_epi32(MSG1, TMP); + MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); + + /* Rounds 20-23 */ + MSG = _mm_add_epi32(MSG1, _mm_set_epi64x( + 0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG1, MSG0, 4); + MSG2 = _mm_add_epi32(MSG2, TMP); + MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); + + /* Rounds 24-27 */ + MSG = _mm_add_epi32(MSG2, _mm_set_epi64x( + 0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG2, MSG1, 4); + MSG3 = _mm_add_epi32(MSG3, TMP); + MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); + + /* Rounds 28-31 */ + MSG = _mm_add_epi32(MSG3, _mm_set_epi64x( + 0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG3, MSG2, 4); + MSG0 = _mm_add_epi32(MSG0, TMP); + MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); + + /* Rounds 32-35 */ + MSG = _mm_add_epi32(MSG0, _mm_set_epi64x( + 0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG0, MSG3, 4); + MSG1 = _mm_add_epi32(MSG1, TMP); + MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); + + /* Rounds 36-39 */ + MSG = _mm_add_epi32(MSG1, _mm_set_epi64x( + 0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG1, MSG0, 4); + MSG2 = _mm_add_epi32(MSG2, TMP); + MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); + + /* Rounds 40-43 */ + MSG = _mm_add_epi32(MSG2, _mm_set_epi64x( + 0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG2, MSG1, 4); + MSG3 = _mm_add_epi32(MSG3, TMP); + MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG1 = 
_mm_sha256msg1_epu32(MSG1, MSG2); + + /* Rounds 44-47 */ + MSG = _mm_add_epi32(MSG3, _mm_set_epi64x( + 0x106AA070F40E3585ULL, 0xD6990624D192E819ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG3, MSG2, 4); + MSG0 = _mm_add_epi32(MSG0, TMP); + MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); + + /* Rounds 48-51 */ + MSG = _mm_add_epi32(MSG0, _mm_set_epi64x( + 0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG0, MSG3, 4); + MSG1 = _mm_add_epi32(MSG1, TMP); + MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); + + /* Rounds 52-55 */ + MSG = _mm_add_epi32(MSG1, _mm_set_epi64x( + 0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG1, MSG0, 4); + MSG2 = _mm_add_epi32(MSG2, TMP); + MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + /* Rounds 56-59 */ + MSG = _mm_add_epi32(MSG2, _mm_set_epi64x( + 0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG2, MSG1, 4); + MSG3 = _mm_add_epi32(MSG3, TMP); + MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + /* Rounds 60-63 */ + MSG = _mm_add_epi32(MSG3, _mm_set_epi64x( + 0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + /* Combine state */ + core[0] = _mm_add_epi32(STATE0, core[0]); + core[1] = _mm_add_epi32(STATE1, core[1]); +} + +typedef struct sha256_ni { + /* + * These two vectors store the 8 words of the SHA-256 state, but + * not in the same order they appear in the spec: the first word + * holds A,B,E,F and the second word C,D,G,H. + */ + __m128i core[2]; + sha256_block blk; + void *pointer_to_free; + BinarySink_IMPLEMENTATION; + ssh_hash hash; +} sha256_ni; + +static void sha256_ni_write(BinarySink *bs, const void *vp, size_t len); + +static sha256_ni *sha256_ni_alloc(void) +{ + /* + * The __m128i variables in the context structure need to be + * 16-byte aligned, but not all malloc implementations that this + * code has to work with will guarantee to return a 16-byte + * aligned pointer. So we over-allocate, manually realign the + * pointer ourselves, and store the original one inside the + * context so we know how to free it later. 
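+     *
+     * (On toolchains where they are known to be available, C11's
+     * aligned_alloc or the _mm_malloc family could provide the alignment
+     * directly; this code does not assume either exists.)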
+     */
+    void *allocation = smalloc(sizeof(sha256_ni) + 15);
+    uintptr_t alloc_address = (uintptr_t)allocation;
+    uintptr_t aligned_address = (alloc_address + 15) & ~15;
+    sha256_ni *s = (sha256_ni *)aligned_address;
+    s->pointer_to_free = allocation;
+    return s;
+}
+
+static ssh_hash *sha256_ni_new(const ssh_hashalg *alg)
+{
+    const struct sha256_extra *extra = (const struct sha256_extra *)alg->extra;
+    if (!check_availability(extra))
+        return NULL;
+
+    sha256_ni *s = sha256_ni_alloc();
+
+    s->hash.vt = alg;
+    BinarySink_INIT(s, sha256_ni_write);
+    BinarySink_DELEGATE_INIT(&s->hash, s);
+
+    return &s->hash;
+}
+
+static void sha256_ni_reset(ssh_hash *hash)
+{
+    sha256_ni *s = container_of(hash, sha256_ni, hash);
+
+    /* Initialise the core vectors in their storage order */
+    s->core[0] = _mm_set_epi64x(
+        0x6a09e667bb67ae85ULL, 0x510e527f9b05688cULL);
+    s->core[1] = _mm_set_epi64x(
+        0x3c6ef372a54ff53aULL, 0x1f83d9ab5be0cd19ULL);
+
+    sha256_block_setup(&s->blk);
+}
+
+static void sha256_ni_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
+{
+    sha256_ni *copy = container_of(hcopy, sha256_ni, hash);
+    sha256_ni *orig = container_of(horig, sha256_ni, hash);
+
+    void *ptf_save = copy->pointer_to_free;
+    *copy = *orig;                     /* structure copy */
+    copy->pointer_to_free = ptf_save;
+
+    BinarySink_COPIED(copy);
+    BinarySink_DELEGATE_INIT(&copy->hash, copy);
+}
+
+static void sha256_ni_free(ssh_hash *hash)
+{
+    sha256_ni *s = container_of(hash, sha256_ni, hash);
+
+    void *ptf = s->pointer_to_free;
+    smemclr(s, sizeof(*s));
+    sfree(ptf);
+}
+
+static void sha256_ni_write(BinarySink *bs, const void *vp, size_t len)
+{
+    sha256_ni *s = BinarySink_DOWNCAST(bs, sha256_ni);
+
+    while (len > 0)
+        if (sha256_block_write(&s->blk, &vp, &len))
+            sha256_ni_block(s->core, s->blk.block);
+}
+
+static void sha256_ni_digest(ssh_hash *hash, uint8_t *digest)
+{
+    sha256_ni *s = container_of(hash, sha256_ni, hash);
+
+    sha256_block_pad(&s->blk, BinarySink_UPCAST(s));
+
+    /* Rearrange the words into the output order */
+    __m128i feba = _mm_shuffle_epi32(s->core[0], 0x1B);
+    __m128i dchg = _mm_shuffle_epi32(s->core[1], 0xB1);
+    __m128i dcba = _mm_blend_epi16(feba, dchg, 0xF0);
+    __m128i hgfe = _mm_alignr_epi8(dchg, feba, 8);
+
+    /* Byte-swap them into the output endianness */
+    const __m128i mask = _mm_setr_epi8(3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12);
+    dcba = _mm_shuffle_epi8(dcba, mask);
+    hgfe = _mm_shuffle_epi8(hgfe, mask);
+
+    /* And store them */
+    __m128i *output = (__m128i *)digest;
+    _mm_storeu_si128(output, dcba);
+    _mm_storeu_si128(output+1, hgfe);
+}
+
+SHA256_VTABLE(ni, "SHA-NI accelerated");
diff --git a/crypto/sha256-select.c b/crypto/sha256-select.c
new file mode 100644
index 00000000..78e5b7e4
--- /dev/null
+++ b/crypto/sha256-select.c
@@ -0,0 +1,44 @@
+/*
+ * Top-level vtables to select a SHA-256 implementation.
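+ *
+ * Callers are not expected to name the individual backends; they just
+ * construct a hash from the top-level vtable, roughly
+ *
+ *     ssh_hash *h = ssh_hash_new(&ssh_sha256);
+ *
+ * and sha256_select() below forwards to the first backend in its list
+ * whose availability check passes.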
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "putty.h"
+#include "ssh.h"
+#include "sha256.h"
+
+static ssh_hash *sha256_select(const ssh_hashalg *alg)
+{
+    static const ssh_hashalg *const real_algs[] = {
+#if HAVE_SHA_NI
+        &ssh_sha256_ni,
+#endif
+#if HAVE_NEON_CRYPTO
+        &ssh_sha256_neon,
+#endif
+        &ssh_sha256_sw,
+        NULL,
+    };
+
+    for (size_t i = 0; real_algs[i]; i++) {
+        const ssh_hashalg *alg = real_algs[i];
+        const struct sha256_extra *alg_extra =
+            (const struct sha256_extra *)alg->extra;
+        if (check_availability(alg_extra))
+            return ssh_hash_new(alg);
+    }
+
+    /* We should never reach the NULL at the end of the list, because
+     * the last non-NULL entry should be software-only SHA-256, which
+     * is always available. */
+    unreachable("sha256_select ran off the end of its list");
+}
+
+const ssh_hashalg ssh_sha256 = {
+    .new = sha256_select,
+    .hlen = 32,
+    .blocklen = 64,
+    HASHALG_NAMES_ANNOTATED("SHA-256", "dummy selector vtable"),
+};
diff --git a/crypto/sha256-sw.c b/crypto/sha256-sw.c
new file mode 100644
index 00000000..82a116c6
--- /dev/null
+++ b/crypto/sha256-sw.c
@@ -0,0 +1,157 @@
+/*
+ * Software implementation of SHA-256.
+ */
+
+#include "ssh.h"
+#include "sha256.h"
+
+static bool sha256_sw_available(void)
+{
+    /* Software SHA-256 is always available */
+    return true;
+}
+
+static inline uint32_t ror(uint32_t x, unsigned y)
+{
+    return (x << (31 & -y)) | (x >> (31 & y));
+}
+
+static inline uint32_t Ch(uint32_t ctrl, uint32_t if1, uint32_t if0)
+{
+    return if0 ^ (ctrl & (if1 ^ if0));
+}
+
+static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z)
+{
+    return (x & y) | (z & (x | y));
+}
+
+static inline uint32_t Sigma_0(uint32_t x)
+{
+    return ror(x,2) ^ ror(x,13) ^ ror(x,22);
+}
+
+static inline uint32_t Sigma_1(uint32_t x)
+{
+    return ror(x,6) ^ ror(x,11) ^ ror(x,25);
+}
+
+static inline uint32_t sigma_0(uint32_t x)
+{
+    return ror(x,7) ^ ror(x,18) ^ (x >> 3);
+}
+
+static inline uint32_t sigma_1(uint32_t x)
+{
+    return ror(x,17) ^ ror(x,19) ^ (x >> 10);
+}
+
+static inline void sha256_sw_round(
+    unsigned round_index, const uint32_t *schedule,
+    uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
+    uint32_t *e, uint32_t *f, uint32_t *g, uint32_t *h)
+{
+    uint32_t t1 = *h + Sigma_1(*e) + Ch(*e,*f,*g) +
+        sha256_round_constants[round_index] + schedule[round_index];
+
+    uint32_t t2 = Sigma_0(*a) + Maj(*a,*b,*c);
+
+    *d += t1;
+    *h = t1 + t2;
+}
+
+static void sha256_sw_block(uint32_t *core, const uint8_t *block)
+{
+    uint32_t w[SHA256_ROUNDS];
+    uint32_t a,b,c,d,e,f,g,h;
+
+    for (size_t t = 0; t < 16; t++)
+        w[t] = GET_32BIT_MSB_FIRST(block + 4*t);
+
+    for (size_t t = 16; t < SHA256_ROUNDS; t++)
+        w[t] = sigma_1(w[t-2]) + w[t-7] + sigma_0(w[t-15]) + w[t-16];
+
+    a = core[0]; b = core[1]; c = core[2]; d = core[3];
+    e = core[4]; f = core[5]; g = core[6]; h = core[7];
+
+    for (size_t t = 0; t < SHA256_ROUNDS; t += 8) {
+        sha256_sw_round(t+0, w, &a,&b,&c,&d,&e,&f,&g,&h);
+        sha256_sw_round(t+1, w, &h,&a,&b,&c,&d,&e,&f,&g);
+        sha256_sw_round(t+2, w, &g,&h,&a,&b,&c,&d,&e,&f);
+        sha256_sw_round(t+3, w, &f,&g,&h,&a,&b,&c,&d,&e);
+        sha256_sw_round(t+4, w, &e,&f,&g,&h,&a,&b,&c,&d);
+        sha256_sw_round(t+5, w, &d,&e,&f,&g,&h,&a,&b,&c);
+        sha256_sw_round(t+6, w, &c,&d,&e,&f,&g,&h,&a,&b);
+        sha256_sw_round(t+7, w, &b,&c,&d,&e,&f,&g,&h,&a);
+    }
+
+    core[0] += a; core[1] += b; core[2] += c; core[3] += d;
+    core[4] += e; core[5] += f; core[6] += g; core[7] += h;
+
+    smemclr(w, sizeof(w));
+}
+
+typedef struct sha256_sw {
+    uint32_t core[8];
+    sha256_block blk;
+    BinarySink_IMPLEMENTATION;
ssh_hash hash; +} sha256_sw; + +static void sha256_sw_write(BinarySink *bs, const void *vp, size_t len); + +static ssh_hash *sha256_sw_new(const ssh_hashalg *alg) +{ + sha256_sw *s = snew(sha256_sw); + + s->hash.vt = alg; + BinarySink_INIT(s, sha256_sw_write); + BinarySink_DELEGATE_INIT(&s->hash, s); + return &s->hash; +} + +static void sha256_sw_reset(ssh_hash *hash) +{ + sha256_sw *s = container_of(hash, sha256_sw, hash); + + memcpy(s->core, sha256_initial_state, sizeof(s->core)); + sha256_block_setup(&s->blk); +} + +static void sha256_sw_copyfrom(ssh_hash *hcopy, ssh_hash *horig) +{ + sha256_sw *copy = container_of(hcopy, sha256_sw, hash); + sha256_sw *orig = container_of(horig, sha256_sw, hash); + + memcpy(copy, orig, sizeof(*copy)); + BinarySink_COPIED(copy); + BinarySink_DELEGATE_INIT(©->hash, copy); +} + +static void sha256_sw_free(ssh_hash *hash) +{ + sha256_sw *s = container_of(hash, sha256_sw, hash); + + smemclr(s, sizeof(*s)); + sfree(s); +} + +static void sha256_sw_write(BinarySink *bs, const void *vp, size_t len) +{ + sha256_sw *s = BinarySink_DOWNCAST(bs, sha256_sw); + + while (len > 0) + if (sha256_block_write(&s->blk, &vp, &len)) + sha256_sw_block(s->core, s->blk.block); +} + +static void sha256_sw_digest(ssh_hash *hash, uint8_t *digest) +{ + sha256_sw *s = container_of(hash, sha256_sw, hash); + + sha256_block_pad(&s->blk, BinarySink_UPCAST(s)); + for (size_t i = 0; i < 8; i++) + PUT_32BIT_MSB_FIRST(digest + 4*i, s->core[i]); +} + +SHA256_VTABLE(sw, "unaccelerated"); diff --git a/crypto/sha256.c b/crypto/sha256.c deleted file mode 100644 index 206a976c..00000000 --- a/crypto/sha256.c +++ /dev/null @@ -1,939 +0,0 @@ -/* - * SHA-256 algorithm as described at - * - * http://csrc.nist.gov/cryptval/shs.html - */ - -#include "ssh.h" -#include - -/* - * Start by deciding whether we can support hardware SHA at all. - */ -#define HW_SHA256_NONE 0 -#define HW_SHA256_NI 1 -#define HW_SHA256_NEON 2 - -#ifdef _FORCE_SHA_NI -# define HW_SHA256 HW_SHA256_NI -#elif defined(__clang__) -# if __has_attribute(target) && __has_include() && \ - (defined(__x86_64__) || defined(__i386)) -# define HW_SHA256 HW_SHA256_NI -# endif -#elif defined(__GNUC__) -# if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)) && \ - (defined(__x86_64__) || defined(__i386)) -# define HW_SHA256 HW_SHA256_NI -# endif -#elif defined (_MSC_VER) -# if (defined(_M_X64) || defined(_M_IX86)) && _MSC_FULL_VER >= 150030729 -# define HW_SHA256 HW_SHA256_NI -# endif -#endif - -#ifdef _FORCE_SHA_NEON -# define HW_SHA256 HW_SHA256_NEON -#elif defined __BYTE_ORDER__ && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - /* Arm can potentially support both endiannesses, but this code - * hasn't been tested on anything but little. If anyone wants to - * run big-endian, they'll need to fix it first. */ -#elif defined __ARM_FEATURE_CRYPTO - /* If the Arm crypto extension is available already, we can - * support NEON SHA without having to enable anything by hand */ -# define HW_SHA256 HW_SHA256_NEON -#elif defined(__clang__) -# if __has_attribute(target) && __has_include() && \ - (defined(__aarch64__)) - /* clang can enable the crypto extension in AArch64 using - * __attribute__((target)) */ -# define HW_SHA256 HW_SHA256_NEON -# define USE_CLANG_ATTR_TARGET_AARCH64 -# endif -#elif defined _MSC_VER - /* Visual Studio supports the crypto extension when targeting - * AArch64, but as of VS2017, the AArch32 header doesn't quite - * manage it (declaring the shae/shad intrinsics without a round - * key operand). 
*/ -# if defined _M_ARM64 -# define HW_SHA256 HW_SHA256_NEON -# if defined _M_ARM64 -# define USE_ARM64_NEON_H /* unusual header name in this case */ -# endif -# endif -#endif - -#if defined _FORCE_SOFTWARE_SHA || !defined HW_SHA256 -# undef HW_SHA256 -# define HW_SHA256 HW_SHA256_NONE -#endif - -/* - * The actual query function that asks if hardware acceleration is - * available. - */ -static bool sha256_hw_available(void); - -/* - * The top-level selection function, caching the results of - * sha256_hw_available() so it only has to run once. - */ -static bool sha256_hw_available_cached(void) -{ - static bool initialised = false; - static bool hw_available; - if (!initialised) { - hw_available = sha256_hw_available(); - initialised = true; - } - return hw_available; -} - -static ssh_hash *sha256_select(const ssh_hashalg *alg) -{ - const ssh_hashalg *real_alg = - sha256_hw_available_cached() ? &ssh_sha256_hw : &ssh_sha256_sw; - - return ssh_hash_new(real_alg); -} - -const ssh_hashalg ssh_sha256 = { - .new = sha256_select, - .hlen = 32, - .blocklen = 64, - HASHALG_NAMES_ANNOTATED("SHA-256", "dummy selector vtable"), -}; - -/* ---------------------------------------------------------------------- - * Definitions likely to be helpful to multiple implementations. - */ - -static const uint32_t sha256_initial_state[] = { - 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, - 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19, -}; - -static const uint32_t sha256_round_constants[] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, - 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, - 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, - 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, - 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, - 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, - 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, - 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, - 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, -}; - -#define SHA256_ROUNDS 64 - -typedef struct sha256_block sha256_block; -struct sha256_block { - uint8_t block[64]; - size_t used; - uint64_t len; -}; - -static inline void sha256_block_setup(sha256_block *blk) -{ - blk->used = 0; - blk->len = 0; -} - -static inline bool sha256_block_write( - sha256_block *blk, const void **vdata, size_t *len) -{ - size_t blkleft = sizeof(blk->block) - blk->used; - size_t chunk = *len < blkleft ? *len : blkleft; - - const uint8_t *p = *vdata; - memcpy(blk->block + blk->used, p, chunk); - *vdata = p + chunk; - *len -= chunk; - blk->used += chunk; - blk->len += chunk; - - if (blk->used == sizeof(blk->block)) { - blk->used = 0; - return true; - } - - return false; -} - -static inline void sha256_block_pad(sha256_block *blk, BinarySink *bs) -{ - uint64_t final_len = blk->len << 3; - size_t pad = 1 + (63 & (55 - blk->used)); - - put_byte(bs, 0x80); - for (size_t i = 1; i < pad; i++) - put_byte(bs, 0); - put_uint64(bs, final_len); - - assert(blk->used == 0 && "Should have exactly hit a block boundary"); -} - -/* ---------------------------------------------------------------------- - * Software implementation of SHA-256. 
- */ - -static inline uint32_t ror(uint32_t x, unsigned y) -{ - return (x << (31 & -y)) | (x >> (31 & y)); -} - -static inline uint32_t Ch(uint32_t ctrl, uint32_t if1, uint32_t if0) -{ - return if0 ^ (ctrl & (if1 ^ if0)); -} - -static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z) -{ - return (x & y) | (z & (x | y)); -} - -static inline uint32_t Sigma_0(uint32_t x) -{ - return ror(x,2) ^ ror(x,13) ^ ror(x,22); -} - -static inline uint32_t Sigma_1(uint32_t x) -{ - return ror(x,6) ^ ror(x,11) ^ ror(x,25); -} - -static inline uint32_t sigma_0(uint32_t x) -{ - return ror(x,7) ^ ror(x,18) ^ (x >> 3); -} - -static inline uint32_t sigma_1(uint32_t x) -{ - return ror(x,17) ^ ror(x,19) ^ (x >> 10); -} - -static inline void sha256_sw_round( - unsigned round_index, const uint32_t *schedule, - uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d, - uint32_t *e, uint32_t *f, uint32_t *g, uint32_t *h) -{ - uint32_t t1 = *h + Sigma_1(*e) + Ch(*e,*f,*g) + - sha256_round_constants[round_index] + schedule[round_index]; - - uint32_t t2 = Sigma_0(*a) + Maj(*a,*b,*c); - - *d += t1; - *h = t1 + t2; -} - -static void sha256_sw_block(uint32_t *core, const uint8_t *block) -{ - uint32_t w[SHA256_ROUNDS]; - uint32_t a,b,c,d,e,f,g,h; - - for (size_t t = 0; t < 16; t++) - w[t] = GET_32BIT_MSB_FIRST(block + 4*t); - - for (size_t t = 16; t < SHA256_ROUNDS; t++) - w[t] = sigma_1(w[t-2]) + w[t-7] + sigma_0(w[t-15]) + w[t-16]; - - a = core[0]; b = core[1]; c = core[2]; d = core[3]; - e = core[4]; f = core[5]; g = core[6]; h = core[7]; - - for (size_t t = 0; t < SHA256_ROUNDS; t += 8) { - sha256_sw_round(t+0, w, &a,&b,&c,&d,&e,&f,&g,&h); - sha256_sw_round(t+1, w, &h,&a,&b,&c,&d,&e,&f,&g); - sha256_sw_round(t+2, w, &g,&h,&a,&b,&c,&d,&e,&f); - sha256_sw_round(t+3, w, &f,&g,&h,&a,&b,&c,&d,&e); - sha256_sw_round(t+4, w, &e,&f,&g,&h,&a,&b,&c,&d); - sha256_sw_round(t+5, w, &d,&e,&f,&g,&h,&a,&b,&c); - sha256_sw_round(t+6, w, &c,&d,&e,&f,&g,&h,&a,&b); - sha256_sw_round(t+7, w, &b,&c,&d,&e,&f,&g,&h,&a); - } - - core[0] += a; core[1] += b; core[2] += c; core[3] += d; - core[4] += e; core[5] += f; core[6] += g; core[7] += h; - - smemclr(w, sizeof(w)); -} - -typedef struct sha256_sw { - uint32_t core[8]; - sha256_block blk; - BinarySink_IMPLEMENTATION; - ssh_hash hash; -} sha256_sw; - -static void sha256_sw_write(BinarySink *bs, const void *vp, size_t len); - -static ssh_hash *sha256_sw_new(const ssh_hashalg *alg) -{ - sha256_sw *s = snew(sha256_sw); - - s->hash.vt = alg; - BinarySink_INIT(s, sha256_sw_write); - BinarySink_DELEGATE_INIT(&s->hash, s); - return &s->hash; -} - -static void sha256_sw_reset(ssh_hash *hash) -{ - sha256_sw *s = container_of(hash, sha256_sw, hash); - - memcpy(s->core, sha256_initial_state, sizeof(s->core)); - sha256_block_setup(&s->blk); -} - -static void sha256_sw_copyfrom(ssh_hash *hcopy, ssh_hash *horig) -{ - sha256_sw *copy = container_of(hcopy, sha256_sw, hash); - sha256_sw *orig = container_of(horig, sha256_sw, hash); - - memcpy(copy, orig, sizeof(*copy)); - BinarySink_COPIED(copy); - BinarySink_DELEGATE_INIT(©->hash, copy); -} - -static void sha256_sw_free(ssh_hash *hash) -{ - sha256_sw *s = container_of(hash, sha256_sw, hash); - - smemclr(s, sizeof(*s)); - sfree(s); -} - -static void sha256_sw_write(BinarySink *bs, const void *vp, size_t len) -{ - sha256_sw *s = BinarySink_DOWNCAST(bs, sha256_sw); - - while (len > 0) - if (sha256_block_write(&s->blk, &vp, &len)) - sha256_sw_block(s->core, s->blk.block); -} - -static void sha256_sw_digest(ssh_hash *hash, uint8_t *digest) -{ - sha256_sw *s = 
container_of(hash, sha256_sw, hash); - - sha256_block_pad(&s->blk, BinarySink_UPCAST(s)); - for (size_t i = 0; i < 8; i++) - PUT_32BIT_MSB_FIRST(digest + 4*i, s->core[i]); -} - -const ssh_hashalg ssh_sha256_sw = { - .new = sha256_sw_new, - .reset = sha256_sw_reset, - .copyfrom = sha256_sw_copyfrom, - .digest = sha256_sw_digest, - .free = sha256_sw_free, - .hlen = 32, - .blocklen = 64, - HASHALG_NAMES_ANNOTATED("SHA-256", "unaccelerated"), -}; - -/* ---------------------------------------------------------------------- - * Hardware-accelerated implementation of SHA-256 using x86 SHA-NI. - */ - -#if HW_SHA256 == HW_SHA256_NI - -/* - * Set target architecture for Clang and GCC - */ -#if defined(__clang__) || defined(__GNUC__) -# define FUNC_ISA __attribute__ ((target("sse4.1,sha"))) -#if !defined(__clang__) -# pragma GCC target("sha") -# pragma GCC target("sse4.1") -#endif -#else -# define FUNC_ISA -#endif - -#include -#include -#include -#if defined(__clang__) || defined(__GNUC__) -#include -#endif - -#if defined(__clang__) || defined(__GNUC__) -#include -#define GET_CPU_ID_0(out) \ - __cpuid(0, (out)[0], (out)[1], (out)[2], (out)[3]) -#define GET_CPU_ID_7(out) \ - __cpuid_count(7, 0, (out)[0], (out)[1], (out)[2], (out)[3]) -#else -#define GET_CPU_ID_0(out) __cpuid(out, 0) -#define GET_CPU_ID_7(out) __cpuidex(out, 7, 0) -#endif - -static bool sha256_hw_available(void) -{ - unsigned int CPUInfo[4]; - GET_CPU_ID_0(CPUInfo); - if (CPUInfo[0] < 7) - return false; - - GET_CPU_ID_7(CPUInfo); - return CPUInfo[1] & (1 << 29); /* Check SHA */ -} - -/* SHA256 implementation using new instructions - The code is based on Jeffrey Walton's SHA256 implementation: - https://github.com/noloader/SHA-Intrinsics -*/ -FUNC_ISA -static inline void sha256_ni_block(__m128i *core, const uint8_t *p) -{ - __m128i STATE0, STATE1; - __m128i MSG, TMP; - __m128i MSG0, MSG1, MSG2, MSG3; - const __m128i *block = (const __m128i *)p; - const __m128i MASK = _mm_set_epi64x( - 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); - - /* Load initial values */ - STATE0 = core[0]; - STATE1 = core[1]; - - /* Rounds 0-3 */ - MSG = _mm_loadu_si128(block); - MSG0 = _mm_shuffle_epi8(MSG, MASK); - MSG = _mm_add_epi32(MSG0, _mm_set_epi64x( - 0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - /* Rounds 4-7 */ - MSG1 = _mm_loadu_si128(block + 1); - MSG1 = _mm_shuffle_epi8(MSG1, MASK); - MSG = _mm_add_epi32(MSG1, _mm_set_epi64x( - 0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); - - /* Rounds 8-11 */ - MSG2 = _mm_loadu_si128(block + 2); - MSG2 = _mm_shuffle_epi8(MSG2, MASK); - MSG = _mm_add_epi32(MSG2, _mm_set_epi64x( - 0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); - - /* Rounds 12-15 */ - MSG3 = _mm_loadu_si128(block + 3); - MSG3 = _mm_shuffle_epi8(MSG3, MASK); - MSG = _mm_add_epi32(MSG3, _mm_set_epi64x( - 0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG3, MSG2, 4); - MSG0 = _mm_add_epi32(MSG0, TMP); - MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); - 
MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); - - /* Rounds 16-19 */ - MSG = _mm_add_epi32(MSG0, _mm_set_epi64x( - 0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG0, MSG3, 4); - MSG1 = _mm_add_epi32(MSG1, TMP); - MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); - - /* Rounds 20-23 */ - MSG = _mm_add_epi32(MSG1, _mm_set_epi64x( - 0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG1, MSG0, 4); - MSG2 = _mm_add_epi32(MSG2, TMP); - MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); - - /* Rounds 24-27 */ - MSG = _mm_add_epi32(MSG2, _mm_set_epi64x( - 0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG2, MSG1, 4); - MSG3 = _mm_add_epi32(MSG3, TMP); - MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); - - /* Rounds 28-31 */ - MSG = _mm_add_epi32(MSG3, _mm_set_epi64x( - 0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG3, MSG2, 4); - MSG0 = _mm_add_epi32(MSG0, TMP); - MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); - - /* Rounds 32-35 */ - MSG = _mm_add_epi32(MSG0, _mm_set_epi64x( - 0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG0, MSG3, 4); - MSG1 = _mm_add_epi32(MSG1, TMP); - MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); - - /* Rounds 36-39 */ - MSG = _mm_add_epi32(MSG1, _mm_set_epi64x( - 0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG1, MSG0, 4); - MSG2 = _mm_add_epi32(MSG2, TMP); - MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); - - /* Rounds 40-43 */ - MSG = _mm_add_epi32(MSG2, _mm_set_epi64x( - 0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG2, MSG1, 4); - MSG3 = _mm_add_epi32(MSG3, TMP); - MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); - - /* Rounds 44-47 */ - MSG = _mm_add_epi32(MSG3, _mm_set_epi64x( - 0x106AA070F40E3585ULL, 0xD6990624D192E819ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG3, MSG2, 4); - MSG0 = _mm_add_epi32(MSG0, TMP); - MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); - - /* 
Rounds 48-51 */ - MSG = _mm_add_epi32(MSG0, _mm_set_epi64x( - 0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG0, MSG3, 4); - MSG1 = _mm_add_epi32(MSG1, TMP); - MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); - - /* Rounds 52-55 */ - MSG = _mm_add_epi32(MSG1, _mm_set_epi64x( - 0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG1, MSG0, 4); - MSG2 = _mm_add_epi32(MSG2, TMP); - MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - /* Rounds 56-59 */ - MSG = _mm_add_epi32(MSG2, _mm_set_epi64x( - 0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG2, MSG1, 4); - MSG3 = _mm_add_epi32(MSG3, TMP); - MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - /* Rounds 60-63 */ - MSG = _mm_add_epi32(MSG3, _mm_set_epi64x( - 0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - /* Combine state */ - core[0] = _mm_add_epi32(STATE0, core[0]); - core[1] = _mm_add_epi32(STATE1, core[1]); -} - -typedef struct sha256_ni { - /* - * These two vectors store the 8 words of the SHA-256 state, but - * not in the same order they appear in the spec: the first word - * holds A,B,E,F and the second word C,D,G,H. - */ - __m128i core[2]; - sha256_block blk; - void *pointer_to_free; - BinarySink_IMPLEMENTATION; - ssh_hash hash; -} sha256_ni; - -static void sha256_ni_write(BinarySink *bs, const void *vp, size_t len); - -static sha256_ni *sha256_ni_alloc(void) -{ - /* - * The __m128i variables in the context structure need to be - * 16-byte aligned, but not all malloc implementations that this - * code has to work with will guarantee to return a 16-byte - * aligned pointer. So we over-allocate, manually realign the - * pointer ourselves, and store the original one inside the - * context so we know how to free it later. 
- */ - void *allocation = smalloc(sizeof(sha256_ni) + 15); - uintptr_t alloc_address = (uintptr_t)allocation; - uintptr_t aligned_address = (alloc_address + 15) & ~15; - sha256_ni *s = (sha256_ni *)aligned_address; - s->pointer_to_free = allocation; - return s; -} - -static ssh_hash *sha256_ni_new(const ssh_hashalg *alg) -{ - if (!sha256_hw_available_cached()) - return NULL; - - sha256_ni *s = sha256_ni_alloc(); - - s->hash.vt = alg; - BinarySink_INIT(s, sha256_ni_write); - BinarySink_DELEGATE_INIT(&s->hash, s); - - return &s->hash; -} - -FUNC_ISA static void sha256_ni_reset(ssh_hash *hash) -{ - sha256_ni *s = container_of(hash, sha256_ni, hash); - - /* Initialise the core vectors in their storage order */ - s->core[0] = _mm_set_epi64x( - 0x6a09e667bb67ae85ULL, 0x510e527f9b05688cULL); - s->core[1] = _mm_set_epi64x( - 0x3c6ef372a54ff53aULL, 0x1f83d9ab5be0cd19ULL); - - sha256_block_setup(&s->blk); -} - -static void sha256_ni_copyfrom(ssh_hash *hcopy, ssh_hash *horig) -{ - sha256_ni *copy = container_of(hcopy, sha256_ni, hash); - sha256_ni *orig = container_of(horig, sha256_ni, hash); - - void *ptf_save = copy->pointer_to_free; - *copy = *orig; /* structure copy */ - copy->pointer_to_free = ptf_save; - - BinarySink_COPIED(copy); - BinarySink_DELEGATE_INIT(©->hash, copy); -} - -static void sha256_ni_free(ssh_hash *hash) -{ - sha256_ni *s = container_of(hash, sha256_ni, hash); - - void *ptf = s->pointer_to_free; - smemclr(s, sizeof(*s)); - sfree(ptf); -} - -static void sha256_ni_write(BinarySink *bs, const void *vp, size_t len) -{ - sha256_ni *s = BinarySink_DOWNCAST(bs, sha256_ni); - - while (len > 0) - if (sha256_block_write(&s->blk, &vp, &len)) - sha256_ni_block(s->core, s->blk.block); -} - -FUNC_ISA static void sha256_ni_digest(ssh_hash *hash, uint8_t *digest) -{ - sha256_ni *s = container_of(hash, sha256_ni, hash); - - sha256_block_pad(&s->blk, BinarySink_UPCAST(s)); - - /* Rearrange the words into the output order */ - __m128i feba = _mm_shuffle_epi32(s->core[0], 0x1B); - __m128i dchg = _mm_shuffle_epi32(s->core[1], 0xB1); - __m128i dcba = _mm_blend_epi16(feba, dchg, 0xF0); - __m128i hgfe = _mm_alignr_epi8(dchg, feba, 8); - - /* Byte-swap them into the output endianness */ - const __m128i mask = _mm_setr_epi8(3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12); - dcba = _mm_shuffle_epi8(dcba, mask); - hgfe = _mm_shuffle_epi8(hgfe, mask); - - /* And store them */ - __m128i *output = (__m128i *)digest; - _mm_storeu_si128(output, dcba); - _mm_storeu_si128(output+1, hgfe); -} - -const ssh_hashalg ssh_sha256_hw = { - .new = sha256_ni_new, - .reset = sha256_ni_reset, - .copyfrom = sha256_ni_copyfrom, - .digest = sha256_ni_digest, - .free = sha256_ni_free, - .hlen = 32, - .blocklen = 64, - HASHALG_NAMES_ANNOTATED("SHA-256", "SHA-NI accelerated"), -}; - -/* ---------------------------------------------------------------------- - * Hardware-accelerated implementation of SHA-256 using Arm NEON. - */ - -#elif HW_SHA256 == HW_SHA256_NEON - -/* - * Manually set the target architecture, if we decided above that we - * need to. - */ -#ifdef USE_CLANG_ATTR_TARGET_AARCH64 -/* - * A spot of cheating: redefine some ACLE feature macros before - * including arm_neon.h. Otherwise we won't get the SHA intrinsics - * defined by that header, because it will be looking at the settings - * for the whole translation unit rather than the ones we're going to - * put on some particular functions using __attribute__((target)). 
- */ -#define __ARM_NEON 1 -#define __ARM_FEATURE_CRYPTO 1 -#define FUNC_ISA __attribute__ ((target("neon,crypto"))) -#endif /* USE_CLANG_ATTR_TARGET_AARCH64 */ - -#ifndef FUNC_ISA -#define FUNC_ISA -#endif - -#ifdef USE_ARM64_NEON_H -#include -#else -#include -#endif - -static bool sha256_hw_available(void) -{ - /* - * For Arm, we delegate to a per-platform detection function (see - * explanation in sshaes.c). - */ - return platform_sha256_hw_available(); -} - -typedef struct sha256_neon_core sha256_neon_core; -struct sha256_neon_core { - uint32x4_t abcd, efgh; -}; - -FUNC_ISA -static inline uint32x4_t sha256_neon_load_input(const uint8_t *p) -{ - return vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(p))); -} - -FUNC_ISA -static inline uint32x4_t sha256_neon_schedule_update( - uint32x4_t m4, uint32x4_t m3, uint32x4_t m2, uint32x4_t m1) -{ - return vsha256su1q_u32(vsha256su0q_u32(m4, m3), m2, m1); -} - -FUNC_ISA -static inline sha256_neon_core sha256_neon_round4( - sha256_neon_core old, uint32x4_t sched, unsigned round) -{ - sha256_neon_core new; - - uint32x4_t round_input = vaddq_u32( - sched, vld1q_u32(sha256_round_constants + round)); - new.abcd = vsha256hq_u32 (old.abcd, old.efgh, round_input); - new.efgh = vsha256h2q_u32(old.efgh, old.abcd, round_input); - return new; -} - -FUNC_ISA -static inline void sha256_neon_block(sha256_neon_core *core, const uint8_t *p) -{ - uint32x4_t s0, s1, s2, s3; - sha256_neon_core cr = *core; - - s0 = sha256_neon_load_input(p); - cr = sha256_neon_round4(cr, s0, 0); - s1 = sha256_neon_load_input(p+16); - cr = sha256_neon_round4(cr, s1, 4); - s2 = sha256_neon_load_input(p+32); - cr = sha256_neon_round4(cr, s2, 8); - s3 = sha256_neon_load_input(p+48); - cr = sha256_neon_round4(cr, s3, 12); - s0 = sha256_neon_schedule_update(s0, s1, s2, s3); - cr = sha256_neon_round4(cr, s0, 16); - s1 = sha256_neon_schedule_update(s1, s2, s3, s0); - cr = sha256_neon_round4(cr, s1, 20); - s2 = sha256_neon_schedule_update(s2, s3, s0, s1); - cr = sha256_neon_round4(cr, s2, 24); - s3 = sha256_neon_schedule_update(s3, s0, s1, s2); - cr = sha256_neon_round4(cr, s3, 28); - s0 = sha256_neon_schedule_update(s0, s1, s2, s3); - cr = sha256_neon_round4(cr, s0, 32); - s1 = sha256_neon_schedule_update(s1, s2, s3, s0); - cr = sha256_neon_round4(cr, s1, 36); - s2 = sha256_neon_schedule_update(s2, s3, s0, s1); - cr = sha256_neon_round4(cr, s2, 40); - s3 = sha256_neon_schedule_update(s3, s0, s1, s2); - cr = sha256_neon_round4(cr, s3, 44); - s0 = sha256_neon_schedule_update(s0, s1, s2, s3); - cr = sha256_neon_round4(cr, s0, 48); - s1 = sha256_neon_schedule_update(s1, s2, s3, s0); - cr = sha256_neon_round4(cr, s1, 52); - s2 = sha256_neon_schedule_update(s2, s3, s0, s1); - cr = sha256_neon_round4(cr, s2, 56); - s3 = sha256_neon_schedule_update(s3, s0, s1, s2); - cr = sha256_neon_round4(cr, s3, 60); - - core->abcd = vaddq_u32(core->abcd, cr.abcd); - core->efgh = vaddq_u32(core->efgh, cr.efgh); -} - -typedef struct sha256_neon { - sha256_neon_core core; - sha256_block blk; - BinarySink_IMPLEMENTATION; - ssh_hash hash; -} sha256_neon; - -static void sha256_neon_write(BinarySink *bs, const void *vp, size_t len); - -static ssh_hash *sha256_neon_new(const ssh_hashalg *alg) -{ - if (!sha256_hw_available_cached()) - return NULL; - - sha256_neon *s = snew(sha256_neon); - - s->hash.vt = alg; - BinarySink_INIT(s, sha256_neon_write); - BinarySink_DELEGATE_INIT(&s->hash, s); - return &s->hash; -} - -static void sha256_neon_reset(ssh_hash *hash) -{ - sha256_neon *s = container_of(hash, sha256_neon, hash); - - 
s->core.abcd = vld1q_u32(sha256_initial_state); - s->core.efgh = vld1q_u32(sha256_initial_state + 4); - - sha256_block_setup(&s->blk); -} - -static void sha256_neon_copyfrom(ssh_hash *hcopy, ssh_hash *horig) -{ - sha256_neon *copy = container_of(hcopy, sha256_neon, hash); - sha256_neon *orig = container_of(horig, sha256_neon, hash); - - *copy = *orig; /* structure copy */ - - BinarySink_COPIED(copy); - BinarySink_DELEGATE_INIT(©->hash, copy); -} - -static void sha256_neon_free(ssh_hash *hash) -{ - sha256_neon *s = container_of(hash, sha256_neon, hash); - smemclr(s, sizeof(*s)); - sfree(s); -} - -static void sha256_neon_write(BinarySink *bs, const void *vp, size_t len) -{ - sha256_neon *s = BinarySink_DOWNCAST(bs, sha256_neon); - - while (len > 0) - if (sha256_block_write(&s->blk, &vp, &len)) - sha256_neon_block(&s->core, s->blk.block); -} - -static void sha256_neon_digest(ssh_hash *hash, uint8_t *digest) -{ - sha256_neon *s = container_of(hash, sha256_neon, hash); - - sha256_block_pad(&s->blk, BinarySink_UPCAST(s)); - vst1q_u8(digest, vrev32q_u8(vreinterpretq_u8_u32(s->core.abcd))); - vst1q_u8(digest + 16, vrev32q_u8(vreinterpretq_u8_u32(s->core.efgh))); -} - -const ssh_hashalg ssh_sha256_hw = { - .new = sha256_neon_new, - .reset = sha256_neon_reset, - .copyfrom = sha256_neon_copyfrom, - .digest = sha256_neon_digest, - .free = sha256_neon_free, - .hlen = 32, - .blocklen = 64, - HASHALG_NAMES_ANNOTATED("SHA-256", "NEON accelerated"), -}; - -/* ---------------------------------------------------------------------- - * Stub functions if we have no hardware-accelerated SHA-256. In this - * case, sha256_hw_new returns NULL (though it should also never be - * selected by sha256_select, so the only thing that should even be - * _able_ to call it is testcrypt). As a result, the remaining vtable - * functions should never be called at all. - */ - -#elif HW_SHA256 == HW_SHA256_NONE - -static bool sha256_hw_available(void) -{ - return false; -} - -static ssh_hash *sha256_stub_new(const ssh_hashalg *alg) -{ - return NULL; -} - -#define STUB_BODY { unreachable("Should never be called"); } - -static void sha256_stub_reset(ssh_hash *hash) STUB_BODY -static void sha256_stub_copyfrom(ssh_hash *hash, ssh_hash *orig) STUB_BODY -static void sha256_stub_free(ssh_hash *hash) STUB_BODY -static void sha256_stub_digest(ssh_hash *hash, uint8_t *digest) STUB_BODY - -const ssh_hashalg ssh_sha256_hw = { - .new = sha256_stub_new, - .reset = sha256_stub_reset, - .copyfrom = sha256_stub_copyfrom, - .digest = sha256_stub_digest, - .free = sha256_stub_free, - .hlen = 32, - .blocklen = 64, - HASHALG_NAMES_ANNOTATED("SHA-256", "!NONEXISTENT ACCELERATED VERSION!"), -}; - -#endif /* HW_SHA256 */ diff --git a/crypto/sha256.h b/crypto/sha256.h new file mode 100644 index 00000000..e6ca7564 --- /dev/null +++ b/crypto/sha256.h @@ -0,0 +1,105 @@ +/* + * Definitions likely to be helpful to multiple SHA-256 implementations. + */ + +/* + * The 'extra' structure used by SHA-256 implementations is used to + * include information about how to check if a given implementation is + * available at run time, and whether we've already checked. + */ +struct sha256_extra_mutable; +struct sha256_extra { + /* Function to check availability. Might be expensive, so we don't + * want to call it more than once. */ + bool (*check_available)(void); + + /* Point to a writable substructure. 
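+     * Keeping the mutable flags out of line means the 'extra' structure
+     * itself can be declared const (as SHA256_VTABLE below does), while
+     * check_availability() still has somewhere to cache its answer, e.g.
+     *
+     *     if (!check_availability(extra)) return NULL;
+     *
+     * in each implementation's new() method.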
*/ + struct sha256_extra_mutable *mut; +}; +struct sha256_extra_mutable { + bool checked_availability; + bool is_available; +}; +static inline bool check_availability(const struct sha256_extra *extra) +{ + if (!extra->mut->checked_availability) { + extra->mut->is_available = extra->check_available(); + extra->mut->checked_availability = true; + } + + return extra->mut->is_available; +} + +/* + * Macro to define a SHA-256 vtable together with its 'extra' + * structure. + */ +#define SHA256_VTABLE(impl_c, impl_display) \ + static struct sha256_extra_mutable sha256_ ## impl_c ## _extra_mut; \ + static const struct sha256_extra sha256_ ## impl_c ## _extra = { \ + .check_available = sha256_ ## impl_c ## _available, \ + .mut = &sha256_ ## impl_c ## _extra_mut, \ + }; \ + const ssh_hashalg ssh_sha256_ ## impl_c = { \ + .new = sha256_ ## impl_c ## _new, \ + .reset = sha256_ ## impl_c ## _reset, \ + .copyfrom = sha256_ ## impl_c ## _copyfrom, \ + .digest = sha256_ ## impl_c ## _digest, \ + .free = sha256_ ## impl_c ## _free, \ + .hlen = 32, \ + .blocklen = 64, \ + HASHALG_NAMES_ANNOTATED("SHA-256", impl_display), \ + .extra = &sha256_ ## impl_c ## _extra, \ + } + +extern const uint32_t sha256_initial_state[8]; +extern const uint32_t sha256_round_constants[64]; + +#define SHA256_ROUNDS 64 + +typedef struct sha256_block sha256_block; +struct sha256_block { + uint8_t block[64]; + size_t used; + uint64_t len; +}; + +static inline void sha256_block_setup(sha256_block *blk) +{ + blk->used = 0; + blk->len = 0; +} + +static inline bool sha256_block_write( + sha256_block *blk, const void **vdata, size_t *len) +{ + size_t blkleft = sizeof(blk->block) - blk->used; + size_t chunk = *len < blkleft ? *len : blkleft; + + const uint8_t *p = *vdata; + memcpy(blk->block + blk->used, p, chunk); + *vdata = p + chunk; + *len -= chunk; + blk->used += chunk; + blk->len += chunk; + + if (blk->used == sizeof(blk->block)) { + blk->used = 0; + return true; + } + + return false; +} + +static inline void sha256_block_pad(sha256_block *blk, BinarySink *bs) +{ + uint64_t final_len = blk->len << 3; + size_t pad = 1 + (63 & (55 - blk->used)); + + put_byte(bs, 0x80); + for (size_t i = 1; i < pad; i++) + put_byte(bs, 0); + put_uint64(bs, final_len); + + assert(blk->used == 0 && "Should have exactly hit a block boundary"); +} diff --git a/crypto/sha512-common.c b/crypto/sha512-common.c new file mode 100644 index 00000000..89ac136c --- /dev/null +++ b/crypto/sha512-common.c @@ -0,0 +1,71 @@ +/* + * Common variable definitions across all the SHA-512 implementations. 
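+ *
+ * SHA-384 shares SHA-512's compression function and round constants,
+ * differing only in its initial state and in truncating the output, so
+ * sha384_initial_state lives here alongside the SHA-512 tables.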
+ */ + +#include "ssh.h" +#include "sha512.h" + +const uint64_t sha512_initial_state[8] = { + 0x6a09e667f3bcc908ULL, + 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, + 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, + 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, + 0x5be0cd19137e2179ULL, +}; + +const uint64_t sha384_initial_state[8] = { + 0xcbbb9d5dc1059ed8ULL, + 0x629a292a367cd507ULL, + 0x9159015a3070dd17ULL, + 0x152fecd8f70e5939ULL, + 0x67332667ffc00b31ULL, + 0x8eb44a8768581511ULL, + 0xdb0c2e0d64f98fa7ULL, + 0x47b5481dbefa4fa4ULL, +}; + +const uint64_t sha512_round_constants[80] = { + 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, + 0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL, + 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL, + 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, + 0xd807aa98a3030242ULL, 0x12835b0145706fbeULL, + 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL, + 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, + 0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL, + 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL, + 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, + 0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL, + 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL, + 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, + 0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL, + 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL, + 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, + 0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL, + 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL, + 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, + 0x81c2c92e47edaee6ULL, 0x92722c851482353bULL, + 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL, + 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, + 0xd192e819d6ef5218ULL, 0xd69906245565a910ULL, + 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL, + 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, + 0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL, + 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL, + 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, + 0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL, + 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL, + 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, + 0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL, + 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL, + 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, + 0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL, + 0x113f9804bef90daeULL, 0x1b710b35131c471bULL, + 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, + 0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL, + 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, + 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL, +}; diff --git a/crypto/sha512-neon.c b/crypto/sha512-neon.c new file mode 100644 index 00000000..849a79d7 --- /dev/null +++ b/crypto/sha512-neon.c @@ -0,0 +1,329 @@ +/* + * Hardware-accelerated implementation of SHA-512 using Arm NEON. + */ + +#include "ssh.h" +#include "sha512.h" + +#if USE_ARM64_NEON_H +#include +#else +#include +#endif + +static bool sha512_neon_available(void) +{ + /* + * For Arm, we delegate to a per-platform detection function (see + * explanation in aes-neon.c). + */ + return platform_sha512_neon_available(); +} + +#if !HAVE_NEON_SHA512_INTRINSICS +/* + * clang 12 and before do not provide the SHA-512 NEON intrinsics, but + * do provide assembler support for the underlying instructions. So I + * define the intrinsic functions myself, using inline assembler. 
+ */ +static inline uint64x2_t vsha512su0q_u64(uint64x2_t x, uint64x2_t y) +{ + __asm__("sha512su0 %0.2D,%1.2D" : "+w" (x) : "w" (y)); + return x; +} +static inline uint64x2_t vsha512su1q_u64(uint64x2_t x, uint64x2_t y, + uint64x2_t z) +{ + __asm__("sha512su1 %0.2D,%1.2D,%2.2D" : "+w" (x) : "w" (y), "w" (z)); + return x; +} +static inline uint64x2_t vsha512hq_u64(uint64x2_t x, uint64x2_t y, + uint64x2_t z) +{ + __asm__("sha512h %0,%1,%2.2D" : "+w" (x) : "w" (y), "w" (z)); + return x; +} +static inline uint64x2_t vsha512h2q_u64(uint64x2_t x, uint64x2_t y, + uint64x2_t z) +{ + __asm__("sha512h2 %0,%1,%2.2D" : "+w" (x) : "w" (y), "w" (z)); + return x; +} +#endif /* HAVE_NEON_SHA512_INTRINSICS */ + +typedef struct sha512_neon_core sha512_neon_core; +struct sha512_neon_core { + uint64x2_t ab, cd, ef, gh; +}; + +static inline uint64x2_t sha512_neon_load_input(const uint8_t *p) +{ + return vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(p))); +} + +static inline uint64x2_t sha512_neon_schedule_update( + uint64x2_t m8, uint64x2_t m7, uint64x2_t m4, uint64x2_t m3, uint64x2_t m1) +{ + /* + * vsha512su0q_u64() takes words from a long way back in the + * schedule and performs the sigma_0 half of the computation of + * the next two 64-bit message-schedule words. + * + * vsha512su1q_u64() combines the result of that with the sigma_1 + * steps, to output the finished version of those two words. The + * total amount of input data it requires fits nicely into three + * 128-bit vector registers, but one of those registers is + * misaligned compared to the 128-bit chunks that the message + * schedule is stored in. So we use vextq_u64 to make one of its + * input words out of the second half of m4 and the first half of + * m3. + */ + return vsha512su1q_u64(vsha512su0q_u64(m8, m7), m1, vextq_u64(m4, m3, 1)); +} + +static inline void sha512_neon_round2( + unsigned round_index, uint64x2_t schedule_words, + uint64x2_t *ab, uint64x2_t *cd, uint64x2_t *ef, uint64x2_t *gh) +{ + /* + * vsha512hq_u64 performs the Sigma_1 and Ch half of the + * computation of two rounds of SHA-512 (including feeding back + * one of the outputs from the first of those half-rounds into the + * second one). + * + * vsha512h2q_u64 combines the result of that with the Sigma_0 and + * Maj steps, and outputs one 128-bit vector that replaces the gh + * piece of the input hash state, and a second that updates cd by + * addition. + * + * Similarly to vsha512su1q_u64 above, some of the input registers + * expected by these instructions are misaligned by 64 bits + * relative to the chunks we've divided the hash state into, so we + * have to start by making 'de' and 'fg' words out of our input + * cd,ef,gh, using vextq_u64. + * + * Also, one of the inputs to vsha512hq_u64 is expected to contain + * the results of summing gh + two round constants + two words of + * message schedule, but the two words of the message schedule + * have to be the opposite way round in the vector register from + * the way that vsha512su1q_u64 output them. Hence, there's + * another vextq_u64 in here that swaps the two halves of the + * initial_sum vector register. + * + * (This also means that I don't have to prepare a specially + * reordered version of the sha512_round_constants[] array: as + * long as I'm unavoidably doing a swap at run time _anyway_, I + * can load from the normally ordered version of that array, and + * just take care to fold in that data _before_ the swap rather + * than after.) 
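+     *
+     * Note also that sha512_neon_block() below never copies the hash
+     * state between round pairs: successive calls to this function
+     * simply rotate which of ab, cd, ef and gh is passed in each
+     * parameter position.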
+ */ + + /* Load two round constants, with the first one in the low half */ + uint64x2_t round_constants = vld1q_u64( + sha512_round_constants + round_index); + + /* Add schedule words to round constants */ + uint64x2_t initial_sum = vaddq_u64(schedule_words, round_constants); + + /* Swap that sum around so the word used in the first of the two + * rounds is in the _high_ half of the vector, matching where h + * lives in the gh vector */ + uint64x2_t swapped_initial_sum = vextq_u64(initial_sum, initial_sum, 1); + + /* Add gh to that, now that they're matching ways round */ + uint64x2_t sum = vaddq_u64(swapped_initial_sum, *gh); + + /* Make the misaligned de and fg words */ + uint64x2_t de = vextq_u64(*cd, *ef, 1); + uint64x2_t fg = vextq_u64(*ef, *gh, 1); + + /* Now we're ready to put all the pieces together. The output from + * vsha512h2q_u64 can be used directly as the new gh, and the + * output from vsha512hq_u64 is simultaneously the intermediate + * value passed to h2 and the thing you have to add on to cd. */ + uint64x2_t intermed = vsha512hq_u64(sum, fg, de); + *gh = vsha512h2q_u64(intermed, *cd, *ab); + *cd = vaddq_u64(*cd, intermed); +} + +static inline void sha512_neon_block(sha512_neon_core *core, const uint8_t *p) +{ + uint64x2_t s0, s1, s2, s3, s4, s5, s6, s7; + + uint64x2_t ab = core->ab, cd = core->cd, ef = core->ef, gh = core->gh; + + s0 = sha512_neon_load_input(p + 16*0); + sha512_neon_round2(0, s0, &ab, &cd, &ef, &gh); + s1 = sha512_neon_load_input(p + 16*1); + sha512_neon_round2(2, s1, &gh, &ab, &cd, &ef); + s2 = sha512_neon_load_input(p + 16*2); + sha512_neon_round2(4, s2, &ef, &gh, &ab, &cd); + s3 = sha512_neon_load_input(p + 16*3); + sha512_neon_round2(6, s3, &cd, &ef, &gh, &ab); + s4 = sha512_neon_load_input(p + 16*4); + sha512_neon_round2(8, s4, &ab, &cd, &ef, &gh); + s5 = sha512_neon_load_input(p + 16*5); + sha512_neon_round2(10, s5, &gh, &ab, &cd, &ef); + s6 = sha512_neon_load_input(p + 16*6); + sha512_neon_round2(12, s6, &ef, &gh, &ab, &cd); + s7 = sha512_neon_load_input(p + 16*7); + sha512_neon_round2(14, s7, &cd, &ef, &gh, &ab); + s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7); + sha512_neon_round2(16, s0, &ab, &cd, &ef, &gh); + s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0); + sha512_neon_round2(18, s1, &gh, &ab, &cd, &ef); + s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1); + sha512_neon_round2(20, s2, &ef, &gh, &ab, &cd); + s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2); + sha512_neon_round2(22, s3, &cd, &ef, &gh, &ab); + s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3); + sha512_neon_round2(24, s4, &ab, &cd, &ef, &gh); + s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4); + sha512_neon_round2(26, s5, &gh, &ab, &cd, &ef); + s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5); + sha512_neon_round2(28, s6, &ef, &gh, &ab, &cd); + s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6); + sha512_neon_round2(30, s7, &cd, &ef, &gh, &ab); + s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7); + sha512_neon_round2(32, s0, &ab, &cd, &ef, &gh); + s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0); + sha512_neon_round2(34, s1, &gh, &ab, &cd, &ef); + s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1); + sha512_neon_round2(36, s2, &ef, &gh, &ab, &cd); + s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2); + sha512_neon_round2(38, s3, &cd, &ef, &gh, &ab); + s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3); + sha512_neon_round2(40, s4, &ab, &cd, &ef, &gh); + s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4); + sha512_neon_round2(42, 
s5, &gh, &ab, &cd, &ef); + s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5); + sha512_neon_round2(44, s6, &ef, &gh, &ab, &cd); + s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6); + sha512_neon_round2(46, s7, &cd, &ef, &gh, &ab); + s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7); + sha512_neon_round2(48, s0, &ab, &cd, &ef, &gh); + s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0); + sha512_neon_round2(50, s1, &gh, &ab, &cd, &ef); + s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1); + sha512_neon_round2(52, s2, &ef, &gh, &ab, &cd); + s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2); + sha512_neon_round2(54, s3, &cd, &ef, &gh, &ab); + s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3); + sha512_neon_round2(56, s4, &ab, &cd, &ef, &gh); + s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4); + sha512_neon_round2(58, s5, &gh, &ab, &cd, &ef); + s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5); + sha512_neon_round2(60, s6, &ef, &gh, &ab, &cd); + s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6); + sha512_neon_round2(62, s7, &cd, &ef, &gh, &ab); + s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7); + sha512_neon_round2(64, s0, &ab, &cd, &ef, &gh); + s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0); + sha512_neon_round2(66, s1, &gh, &ab, &cd, &ef); + s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1); + sha512_neon_round2(68, s2, &ef, &gh, &ab, &cd); + s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2); + sha512_neon_round2(70, s3, &cd, &ef, &gh, &ab); + s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3); + sha512_neon_round2(72, s4, &ab, &cd, &ef, &gh); + s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4); + sha512_neon_round2(74, s5, &gh, &ab, &cd, &ef); + s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5); + sha512_neon_round2(76, s6, &ef, &gh, &ab, &cd); + s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6); + sha512_neon_round2(78, s7, &cd, &ef, &gh, &ab); + + core->ab = vaddq_u64(core->ab, ab); + core->cd = vaddq_u64(core->cd, cd); + core->ef = vaddq_u64(core->ef, ef); + core->gh = vaddq_u64(core->gh, gh); +} + +typedef struct sha512_neon { + sha512_neon_core core; + sha512_block blk; + BinarySink_IMPLEMENTATION; + ssh_hash hash; +} sha512_neon; + +static void sha512_neon_write(BinarySink *bs, const void *vp, size_t len); + +static ssh_hash *sha512_neon_new(const ssh_hashalg *alg) +{ + const struct sha512_extra *extra = (const struct sha512_extra *)alg->extra; + if (!check_availability(extra)) + return NULL; + + sha512_neon *s = snew(sha512_neon); + + s->hash.vt = alg; + BinarySink_INIT(s, sha512_neon_write); + BinarySink_DELEGATE_INIT(&s->hash, s); + return &s->hash; +} + +static void sha512_neon_reset(ssh_hash *hash) +{ + sha512_neon *s = container_of(hash, sha512_neon, hash); + const struct sha512_extra *extra = + (const struct sha512_extra *)hash->vt->extra; + + s->core.ab = vld1q_u64(extra->initial_state); + s->core.cd = vld1q_u64(extra->initial_state+2); + s->core.ef = vld1q_u64(extra->initial_state+4); + s->core.gh = vld1q_u64(extra->initial_state+6); + + sha512_block_setup(&s->blk); +} + +static void sha512_neon_copyfrom(ssh_hash *hcopy, ssh_hash *horig) +{ + sha512_neon *copy = container_of(hcopy, sha512_neon, hash); + sha512_neon *orig = container_of(horig, sha512_neon, hash); + + *copy = *orig; /* structure copy */ + + BinarySink_COPIED(copy); + BinarySink_DELEGATE_INIT(©->hash, copy); +} + +static void sha512_neon_free(ssh_hash *hash) +{ + sha512_neon *s = container_of(hash, sha512_neon, hash); + smemclr(s, sizeof(*s)); + sfree(s); 
+} + +static void sha512_neon_write(BinarySink *bs, const void *vp, size_t len) +{ + sha512_neon *s = BinarySink_DOWNCAST(bs, sha512_neon); + + while (len > 0) + if (sha512_block_write(&s->blk, &vp, &len)) + sha512_neon_block(&s->core, s->blk.block); +} + +static void sha512_neon_digest(ssh_hash *hash, uint8_t *digest) +{ + sha512_neon *s = container_of(hash, sha512_neon, hash); + + sha512_block_pad(&s->blk, BinarySink_UPCAST(s)); + + vst1q_u8(digest, vrev64q_u8(vreinterpretq_u8_u64(s->core.ab))); + vst1q_u8(digest+16, vrev64q_u8(vreinterpretq_u8_u64(s->core.cd))); + vst1q_u8(digest+32, vrev64q_u8(vreinterpretq_u8_u64(s->core.ef))); + vst1q_u8(digest+48, vrev64q_u8(vreinterpretq_u8_u64(s->core.gh))); +} + +static void sha384_neon_digest(ssh_hash *hash, uint8_t *digest) +{ + sha512_neon *s = container_of(hash, sha512_neon, hash); + + sha512_block_pad(&s->blk, BinarySink_UPCAST(s)); + + vst1q_u8(digest, vrev64q_u8(vreinterpretq_u8_u64(s->core.ab))); + vst1q_u8(digest+16, vrev64q_u8(vreinterpretq_u8_u64(s->core.cd))); + vst1q_u8(digest+32, vrev64q_u8(vreinterpretq_u8_u64(s->core.ef))); +} + +SHA512_VTABLES(neon, "NEON accelerated"); diff --git a/crypto/sha512-select.c b/crypto/sha512-select.c new file mode 100644 index 00000000..ecd567bd --- /dev/null +++ b/crypto/sha512-select.c @@ -0,0 +1,61 @@ +/* + * Top-level vtables to select a SHA-512 implementation. + */ + +#include +#include + +#include "putty.h" +#include "ssh.h" +#include "sha512.h" + +static const ssh_hashalg *const real_sha512_algs[] = { +#if HAVE_NEON_SHA512 + &ssh_sha512_neon, +#endif + &ssh_sha512_sw, + NULL, +}; + +static const ssh_hashalg *const real_sha384_algs[] = { +#if HAVE_NEON_SHA512 + &ssh_sha384_neon, +#endif + &ssh_sha384_sw, + NULL, +}; + +static ssh_hash *sha512_select(const ssh_hashalg *alg) +{ + const ssh_hashalg *const *real_algs = + (const ssh_hashalg *const *)alg->extra; + + for (size_t i = 0; real_algs[i]; i++) { + const ssh_hashalg *alg = real_algs[i]; + const struct sha512_extra *alg_extra = + (const struct sha512_extra *)alg->extra; + if (check_availability(alg_extra)) + return ssh_hash_new(alg); + } + + /* We should never reach the NULL at the end of the list, because + * the last non-NULL entry should be software-only SHA-512, which + * is always available. */ + unreachable("sha512_select ran off the end of its list"); +} + +const ssh_hashalg ssh_sha512 = { + .new = sha512_select, + .hlen = 64, + .blocklen = 128, + HASHALG_NAMES_ANNOTATED("SHA-512", "dummy selector vtable"), + .extra = real_sha512_algs, +}; + +const ssh_hashalg ssh_sha384 = { + .new = sha512_select, + .hlen = 48, + .blocklen = 128, + HASHALG_NAMES_ANNOTATED("SHA-384", "dummy selector vtable"), + .extra = real_sha384_algs, +}; diff --git a/crypto/sha512-sw.c b/crypto/sha512-sw.c new file mode 100644 index 00000000..9e47bbb9 --- /dev/null +++ b/crypto/sha512-sw.c @@ -0,0 +1,168 @@ +/* + * Software implementation of SHA-512. 
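Before the portable implementation below, note how the selector just above gets used. Nothing outside the test harnesses (testcrypt, testsc) needs to name ssh_sha512_sw or ssh_sha512_neon directly: ordinary callers keep instantiating the top-level ssh_sha512 / ssh_sha384 vtables, and sha512_select() walks the priority-ordered list, calling each entry's check_availability() (whose answer is cached in the shared mutable substructure) until one succeeds, with the software implementation as the guaranteed last resort. A minimal caller sketch, assuming the usual ssh_hash / BinarySink helpers from ssh.h and marshal.h, with data/len standing for whatever the caller wants hashed:

    unsigned char digest[64];
    ssh_hash *h = ssh_hash_new(&ssh_sha512);  /* selects NEON or software */
    put_data(h, data, len);                   /* ssh_hash is a BinarySink */
    ssh_hash_final(h, digest);                /* writes 64 bytes and frees h */
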
+ */ + +#include "ssh.h" +#include "sha512.h" + +static bool sha512_sw_available(void) +{ + /* Software SHA-512 is always available */ + return true; +} + +static inline uint64_t ror(uint64_t x, unsigned y) +{ + return (x << (63 & -y)) | (x >> (63 & y)); +} + +static inline uint64_t Ch(uint64_t ctrl, uint64_t if1, uint64_t if0) +{ + return if0 ^ (ctrl & (if1 ^ if0)); +} + +static inline uint64_t Maj(uint64_t x, uint64_t y, uint64_t z) +{ + return (x & y) | (z & (x | y)); +} + +static inline uint64_t Sigma_0(uint64_t x) +{ + return ror(x,28) ^ ror(x,34) ^ ror(x,39); +} + +static inline uint64_t Sigma_1(uint64_t x) +{ + return ror(x,14) ^ ror(x,18) ^ ror(x,41); +} + +static inline uint64_t sigma_0(uint64_t x) +{ + return ror(x,1) ^ ror(x,8) ^ (x >> 7); +} + +static inline uint64_t sigma_1(uint64_t x) +{ + return ror(x,19) ^ ror(x,61) ^ (x >> 6); +} + +static inline void sha512_sw_round( + unsigned round_index, const uint64_t *schedule, + uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d, + uint64_t *e, uint64_t *f, uint64_t *g, uint64_t *h) +{ + uint64_t t1 = *h + Sigma_1(*e) + Ch(*e,*f,*g) + + sha512_round_constants[round_index] + schedule[round_index]; + + uint64_t t2 = Sigma_0(*a) + Maj(*a,*b,*c); + + *d += t1; + *h = t1 + t2; +} + +static void sha512_sw_block(uint64_t *core, const uint8_t *block) +{ + uint64_t w[SHA512_ROUNDS]; + uint64_t a,b,c,d,e,f,g,h; + + int t; + + for (t = 0; t < 16; t++) + w[t] = GET_64BIT_MSB_FIRST(block + 8*t); + + for (t = 16; t < SHA512_ROUNDS; t++) + w[t] = w[t-16] + w[t-7] + sigma_0(w[t-15]) + sigma_1(w[t-2]); + + a = core[0]; b = core[1]; c = core[2]; d = core[3]; + e = core[4]; f = core[5]; g = core[6]; h = core[7]; + + for (t = 0; t < SHA512_ROUNDS; t+=8) { + sha512_sw_round(t+0, w, &a,&b,&c,&d,&e,&f,&g,&h); + sha512_sw_round(t+1, w, &h,&a,&b,&c,&d,&e,&f,&g); + sha512_sw_round(t+2, w, &g,&h,&a,&b,&c,&d,&e,&f); + sha512_sw_round(t+3, w, &f,&g,&h,&a,&b,&c,&d,&e); + sha512_sw_round(t+4, w, &e,&f,&g,&h,&a,&b,&c,&d); + sha512_sw_round(t+5, w, &d,&e,&f,&g,&h,&a,&b,&c); + sha512_sw_round(t+6, w, &c,&d,&e,&f,&g,&h,&a,&b); + sha512_sw_round(t+7, w, &b,&c,&d,&e,&f,&g,&h,&a); + } + + core[0] += a; core[1] += b; core[2] += c; core[3] += d; + core[4] += e; core[5] += f; core[6] += g; core[7] += h; + + smemclr(w, sizeof(w)); +} + +typedef struct sha512_sw { + uint64_t core[8]; + sha512_block blk; + BinarySink_IMPLEMENTATION; + ssh_hash hash; +} sha512_sw; + +static void sha512_sw_write(BinarySink *bs, const void *vp, size_t len); + +static ssh_hash *sha512_sw_new(const ssh_hashalg *alg) +{ + sha512_sw *s = snew(sha512_sw); + + s->hash.vt = alg; + BinarySink_INIT(s, sha512_sw_write); + BinarySink_DELEGATE_INIT(&s->hash, s); + return &s->hash; +} + +static void sha512_sw_reset(ssh_hash *hash) +{ + sha512_sw *s = container_of(hash, sha512_sw, hash); + const struct sha512_extra *extra = + (const struct sha512_extra *)hash->vt->extra; + + memcpy(s->core, extra->initial_state, sizeof(s->core)); + sha512_block_setup(&s->blk); +} + +static void sha512_sw_copyfrom(ssh_hash *hcopy, ssh_hash *horig) +{ + sha512_sw *copy = container_of(hcopy, sha512_sw, hash); + sha512_sw *orig = container_of(horig, sha512_sw, hash); + + memcpy(copy, orig, sizeof(*copy)); + BinarySink_COPIED(copy); + BinarySink_DELEGATE_INIT(©->hash, copy); +} + +static void sha512_sw_free(ssh_hash *hash) +{ + sha512_sw *s = container_of(hash, sha512_sw, hash); + + smemclr(s, sizeof(*s)); + sfree(s); +} + +static void sha512_sw_write(BinarySink *bs, const void *vp, size_t len) +{ + sha512_sw *s = 
BinarySink_DOWNCAST(bs, sha512_sw); + + while (len > 0) + if (sha512_block_write(&s->blk, &vp, &len)) + sha512_sw_block(s->core, s->blk.block); +} + +static void sha512_sw_digest(ssh_hash *hash, uint8_t *digest) +{ + sha512_sw *s = container_of(hash, sha512_sw, hash); + + sha512_block_pad(&s->blk, BinarySink_UPCAST(s)); + for (size_t i = 0; i < hash->vt->hlen / 8; i++) + PUT_64BIT_MSB_FIRST(digest + 8*i, s->core[i]); +} + +/* + * This implementation doesn't need separate digest methods for + * SHA-384 and SHA-512, because the above implementation reads the + * hash length out of the vtable. + */ +#define sha384_sw_digest sha512_sw_digest + +SHA512_VTABLES(sw, "unaccelerated"); diff --git a/crypto/sha512.c b/crypto/sha512.c deleted file mode 100644 index cba7f38d..00000000 --- a/crypto/sha512.c +++ /dev/null @@ -1,836 +0,0 @@ -/* - * SHA-512 algorithm as described at - * - * http://csrc.nist.gov/cryptval/shs.html - * - * Modifications made for SHA-384 also - */ - -#include -#include "ssh.h" - -/* - * Start by deciding whether we can support hardware SHA at all. - */ -#define HW_SHA512_NONE 0 -#define HW_SHA512_NEON 1 - -#ifdef _FORCE_SHA512_NEON -# define HW_SHA512 HW_SHA512_NEON -#elif defined __BYTE_ORDER__ && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - /* Arm can potentially support both endiannesses, but this code - * hasn't been tested on anything but little. If anyone wants to - * run big-endian, they'll need to fix it first. */ -#elif defined __ARM_FEATURE_SHA512 - /* If the Arm SHA-512 extension is available already, we can - * support NEON SHA without having to enable anything by hand */ -# define HW_SHA512 HW_SHA512_NEON -#elif defined(__clang__) -# if __has_attribute(target) && __has_include() && \ - (defined(__aarch64__)) - /* clang can enable the crypto extension in AArch64 using - * __attribute__((target)) */ -# define HW_SHA512 HW_SHA512_NEON -# define USE_CLANG_ATTR_TARGET_AARCH64 -# endif -#endif - -#if defined _FORCE_SOFTWARE_SHA || !defined HW_SHA512 -# undef HW_SHA512 -# define HW_SHA512 HW_SHA512_NONE -#endif - -/* - * The actual query function that asks if hardware acceleration is - * available. - */ -static bool sha512_hw_available(void); - -/* - * The top-level selection function, caching the results of - * sha512_hw_available() so it only has to run once. - */ -static bool sha512_hw_available_cached(void) -{ - static bool initialised = false; - static bool hw_available; - if (!initialised) { - hw_available = sha512_hw_available(); - initialised = true; - } - return hw_available; -} - -struct sha512_select_options { - const ssh_hashalg *hw, *sw; -}; - -static ssh_hash *sha512_select(const ssh_hashalg *alg) -{ - const struct sha512_select_options *options = - (const struct sha512_select_options *)alg->extra; - - const ssh_hashalg *real_alg = - sha512_hw_available_cached() ? 
options->hw : options->sw; - - return ssh_hash_new(real_alg); -} - -const struct sha512_select_options ssh_sha512_select_options = { - &ssh_sha512_hw, &ssh_sha512_sw, -}; -const struct sha512_select_options ssh_sha384_select_options = { - &ssh_sha384_hw, &ssh_sha384_sw, -}; - -const ssh_hashalg ssh_sha512 = { - .new = sha512_select, - .hlen = 64, - .blocklen = 128, - HASHALG_NAMES_ANNOTATED("SHA-512", "dummy selector vtable"), - .extra = &ssh_sha512_select_options, -}; - -const ssh_hashalg ssh_sha384 = { - .new = sha512_select, - .hlen = 48, - .blocklen = 128, - HASHALG_NAMES_ANNOTATED("SHA-384", "dummy selector vtable"), - .extra = &ssh_sha384_select_options, -}; - -/* ---------------------------------------------------------------------- - * Definitions likely to be helpful to multiple implementations. - */ - -static const uint64_t sha512_initial_state[] = { - 0x6a09e667f3bcc908ULL, - 0xbb67ae8584caa73bULL, - 0x3c6ef372fe94f82bULL, - 0xa54ff53a5f1d36f1ULL, - 0x510e527fade682d1ULL, - 0x9b05688c2b3e6c1fULL, - 0x1f83d9abfb41bd6bULL, - 0x5be0cd19137e2179ULL, -}; - -static const uint64_t sha384_initial_state[] = { - 0xcbbb9d5dc1059ed8ULL, - 0x629a292a367cd507ULL, - 0x9159015a3070dd17ULL, - 0x152fecd8f70e5939ULL, - 0x67332667ffc00b31ULL, - 0x8eb44a8768581511ULL, - 0xdb0c2e0d64f98fa7ULL, - 0x47b5481dbefa4fa4ULL, -}; - -static const uint64_t sha512_round_constants[] = { - 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, - 0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL, - 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL, - 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, - 0xd807aa98a3030242ULL, 0x12835b0145706fbeULL, - 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL, - 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, - 0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL, - 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL, - 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, - 0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL, - 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL, - 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, - 0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL, - 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL, - 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, - 0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL, - 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL, - 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, - 0x81c2c92e47edaee6ULL, 0x92722c851482353bULL, - 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL, - 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, - 0xd192e819d6ef5218ULL, 0xd69906245565a910ULL, - 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL, - 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, - 0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL, - 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL, - 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, - 0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL, - 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL, - 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, - 0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL, - 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL, - 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, - 0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL, - 0x113f9804bef90daeULL, 0x1b710b35131c471bULL, - 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, - 0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL, - 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, - 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL, -}; - -#define SHA512_ROUNDS 80 - -typedef struct sha512_block sha512_block; -struct sha512_block { - uint8_t block[128]; - size_t used; - uint64_t lenhi, lenlo; -}; - -static inline void sha512_block_setup(sha512_block *blk) -{ - blk->used = 0; - 
blk->lenhi = blk->lenlo = 0; -} - -static inline bool sha512_block_write( - sha512_block *blk, const void **vdata, size_t *len) -{ - size_t blkleft = sizeof(blk->block) - blk->used; - size_t chunk = *len < blkleft ? *len : blkleft; - - const uint8_t *p = *vdata; - memcpy(blk->block + blk->used, p, chunk); - *vdata = p + chunk; - *len -= chunk; - blk->used += chunk; - - size_t chunkbits = chunk << 3; - - blk->lenlo += chunkbits; - blk->lenhi += (blk->lenlo < chunkbits); - - if (blk->used == sizeof(blk->block)) { - blk->used = 0; - return true; - } - - return false; -} - -static inline void sha512_block_pad(sha512_block *blk, BinarySink *bs) -{ - uint64_t final_lenhi = blk->lenhi; - uint64_t final_lenlo = blk->lenlo; - size_t pad = 127 & (111 - blk->used); - - put_byte(bs, 0x80); - put_padding(bs, pad, 0); - put_uint64(bs, final_lenhi); - put_uint64(bs, final_lenlo); - - assert(blk->used == 0 && "Should have exactly hit a block boundary"); -} - -/* ---------------------------------------------------------------------- - * Software implementation of SHA-512. - */ - -static inline uint64_t ror(uint64_t x, unsigned y) -{ - return (x << (63 & -y)) | (x >> (63 & y)); -} - -static inline uint64_t Ch(uint64_t ctrl, uint64_t if1, uint64_t if0) -{ - return if0 ^ (ctrl & (if1 ^ if0)); -} - -static inline uint64_t Maj(uint64_t x, uint64_t y, uint64_t z) -{ - return (x & y) | (z & (x | y)); -} - -static inline uint64_t Sigma_0(uint64_t x) -{ - return ror(x,28) ^ ror(x,34) ^ ror(x,39); -} - -static inline uint64_t Sigma_1(uint64_t x) -{ - return ror(x,14) ^ ror(x,18) ^ ror(x,41); -} - -static inline uint64_t sigma_0(uint64_t x) -{ - return ror(x,1) ^ ror(x,8) ^ (x >> 7); -} - -static inline uint64_t sigma_1(uint64_t x) -{ - return ror(x,19) ^ ror(x,61) ^ (x >> 6); -} - -static inline void sha512_sw_round( - unsigned round_index, const uint64_t *schedule, - uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d, - uint64_t *e, uint64_t *f, uint64_t *g, uint64_t *h) -{ - uint64_t t1 = *h + Sigma_1(*e) + Ch(*e,*f,*g) + - sha512_round_constants[round_index] + schedule[round_index]; - - uint64_t t2 = Sigma_0(*a) + Maj(*a,*b,*c); - - *d += t1; - *h = t1 + t2; -} - -static void sha512_sw_block(uint64_t *core, const uint8_t *block) -{ - uint64_t w[SHA512_ROUNDS]; - uint64_t a,b,c,d,e,f,g,h; - - int t; - - for (t = 0; t < 16; t++) - w[t] = GET_64BIT_MSB_FIRST(block + 8*t); - - for (t = 16; t < SHA512_ROUNDS; t++) - w[t] = w[t-16] + w[t-7] + sigma_0(w[t-15]) + sigma_1(w[t-2]); - - a = core[0]; b = core[1]; c = core[2]; d = core[3]; - e = core[4]; f = core[5]; g = core[6]; h = core[7]; - - for (t = 0; t < SHA512_ROUNDS; t+=8) { - sha512_sw_round(t+0, w, &a,&b,&c,&d,&e,&f,&g,&h); - sha512_sw_round(t+1, w, &h,&a,&b,&c,&d,&e,&f,&g); - sha512_sw_round(t+2, w, &g,&h,&a,&b,&c,&d,&e,&f); - sha512_sw_round(t+3, w, &f,&g,&h,&a,&b,&c,&d,&e); - sha512_sw_round(t+4, w, &e,&f,&g,&h,&a,&b,&c,&d); - sha512_sw_round(t+5, w, &d,&e,&f,&g,&h,&a,&b,&c); - sha512_sw_round(t+6, w, &c,&d,&e,&f,&g,&h,&a,&b); - sha512_sw_round(t+7, w, &b,&c,&d,&e,&f,&g,&h,&a); - } - - core[0] += a; core[1] += b; core[2] += c; core[3] += d; - core[4] += e; core[5] += f; core[6] += g; core[7] += h; - - smemclr(w, sizeof(w)); -} - -typedef struct sha512_sw { - uint64_t core[8]; - sha512_block blk; - BinarySink_IMPLEMENTATION; - ssh_hash hash; -} sha512_sw; - -static void sha512_sw_write(BinarySink *bs, const void *vp, size_t len); - -static ssh_hash *sha512_sw_new(const ssh_hashalg *alg) -{ - sha512_sw *s = snew(sha512_sw); - - s->hash.vt = alg; - 
BinarySink_INIT(s, sha512_sw_write); - BinarySink_DELEGATE_INIT(&s->hash, s); - return &s->hash; -} - -static void sha512_sw_reset(ssh_hash *hash) -{ - sha512_sw *s = container_of(hash, sha512_sw, hash); - - /* The 'extra' field in the ssh_hashalg indicates which - * initialisation vector we're using */ - memcpy(s->core, hash->vt->extra, sizeof(s->core)); - sha512_block_setup(&s->blk); -} - -static void sha512_sw_copyfrom(ssh_hash *hcopy, ssh_hash *horig) -{ - sha512_sw *copy = container_of(hcopy, sha512_sw, hash); - sha512_sw *orig = container_of(horig, sha512_sw, hash); - - memcpy(copy, orig, sizeof(*copy)); - BinarySink_COPIED(copy); - BinarySink_DELEGATE_INIT(©->hash, copy); -} - -static void sha512_sw_free(ssh_hash *hash) -{ - sha512_sw *s = container_of(hash, sha512_sw, hash); - - smemclr(s, sizeof(*s)); - sfree(s); -} - -static void sha512_sw_write(BinarySink *bs, const void *vp, size_t len) -{ - sha512_sw *s = BinarySink_DOWNCAST(bs, sha512_sw); - - while (len > 0) - if (sha512_block_write(&s->blk, &vp, &len)) - sha512_sw_block(s->core, s->blk.block); -} - -static void sha512_sw_digest(ssh_hash *hash, uint8_t *digest) -{ - sha512_sw *s = container_of(hash, sha512_sw, hash); - - sha512_block_pad(&s->blk, BinarySink_UPCAST(s)); - for (size_t i = 0; i < hash->vt->hlen / 8; i++) - PUT_64BIT_MSB_FIRST(digest + 8*i, s->core[i]); -} - -const ssh_hashalg ssh_sha512_sw = { - .new = sha512_sw_new, - .reset = sha512_sw_reset, - .copyfrom = sha512_sw_copyfrom, - .digest = sha512_sw_digest, - .free = sha512_sw_free, - .hlen = 64, - .blocklen = 128, - HASHALG_NAMES_ANNOTATED("SHA-512", "unaccelerated"), - .extra = sha512_initial_state, -}; - -const ssh_hashalg ssh_sha384_sw = { - .new = sha512_sw_new, - .reset = sha512_sw_reset, - .copyfrom = sha512_sw_copyfrom, - .digest = sha512_sw_digest, - .free = sha512_sw_free, - .hlen = 48, - .blocklen = 128, - HASHALG_NAMES_ANNOTATED("SHA-384", "unaccelerated"), - .extra = sha384_initial_state, -}; - -/* ---------------------------------------------------------------------- - * Hardware-accelerated implementation of SHA-512 using Arm NEON. - */ - -#if HW_SHA512 == HW_SHA512_NEON - -/* - * Manually set the target architecture, if we decided above that we - * need to. - */ -#ifdef USE_CLANG_ATTR_TARGET_AARCH64 -/* - * A spot of cheating: redefine some ACLE feature macros before - * including arm_neon.h. Otherwise we won't get the SHA intrinsics - * defined by that header, because it will be looking at the settings - * for the whole translation unit rather than the ones we're going to - * put on some particular functions using __attribute__((target)). - */ -#define __ARM_NEON 1 -#define __ARM_FEATURE_CRYPTO 1 -#define FUNC_ISA __attribute__ ((target("neon,sha3"))) -#endif /* USE_CLANG_ATTR_TARGET_AARCH64 */ - -#ifndef FUNC_ISA -#define FUNC_ISA -#endif - -#ifdef USE_ARM64_NEON_H -#include -#else -#include -#endif - -static bool sha512_hw_available(void) -{ - /* - * For Arm, we delegate to a per-platform detection function (see - * explanation in sshaes.c). - */ - return platform_sha512_hw_available(); -} - -#if defined __clang__ -/* - * As of 2020-12-24, I've found that clang doesn't provide the SHA-512 - * NEON intrinsics. So I define my own set using inline assembler, and - * use #define to effectively rename them over the top of the standard - * names. - * - * The aim of that #define technique is that it should avoid a build - * failure if these intrinsics _are_ defined in . 
- * Obviously it would be better in that situation to switch back to - * using the real intrinsics, but until I see a version of clang that - * supports them, I won't know what version number to test in the - * ifdef. - */ -static inline FUNC_ISA -uint64x2_t vsha512su0q_u64_asm(uint64x2_t x, uint64x2_t y) { - __asm__("sha512su0 %0.2D,%1.2D" : "+w" (x) : "w" (y)); - return x; -} -static inline FUNC_ISA -uint64x2_t vsha512su1q_u64_asm(uint64x2_t x, uint64x2_t y, uint64x2_t z) { - __asm__("sha512su1 %0.2D,%1.2D,%2.2D" : "+w" (x) : "w" (y), "w" (z)); - return x; -} -static inline FUNC_ISA -uint64x2_t vsha512hq_u64_asm(uint64x2_t x, uint64x2_t y, uint64x2_t z) { - __asm__("sha512h %0,%1,%2.2D" : "+w" (x) : "w" (y), "w" (z)); - return x; -} -static inline FUNC_ISA -uint64x2_t vsha512h2q_u64_asm(uint64x2_t x, uint64x2_t y, uint64x2_t z) { - __asm__("sha512h2 %0,%1,%2.2D" : "+w" (x) : "w" (y), "w" (z)); - return x; -} -#undef vsha512su0q_u64 -#define vsha512su0q_u64 vsha512su0q_u64_asm -#undef vsha512su1q_u64 -#define vsha512su1q_u64 vsha512su1q_u64_asm -#undef vsha512hq_u64 -#define vsha512hq_u64 vsha512hq_u64_asm -#undef vsha512h2q_u64 -#define vsha512h2q_u64 vsha512h2q_u64_asm -#endif /* defined __clang__ */ - -typedef struct sha512_neon_core sha512_neon_core; -struct sha512_neon_core { - uint64x2_t ab, cd, ef, gh; -}; - -FUNC_ISA -static inline uint64x2_t sha512_neon_load_input(const uint8_t *p) -{ - return vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(p))); -} - -FUNC_ISA -static inline uint64x2_t sha512_neon_schedule_update( - uint64x2_t m8, uint64x2_t m7, uint64x2_t m4, uint64x2_t m3, uint64x2_t m1) -{ - /* - * vsha512su0q_u64() takes words from a long way back in the - * schedule and performs the sigma_0 half of the computation of - * the next two 64-bit message-schedule words. - * - * vsha512su1q_u64() combines the result of that with the sigma_1 - * steps, to output the finished version of those two words. The - * total amount of input data it requires fits nicely into three - * 128-bit vector registers, but one of those registers is - * misaligned compared to the 128-bit chunks that the message - * schedule is stored in. So we use vextq_u64 to make one of its - * input words out of the second half of m4 and the first half of - * m3. - */ - return vsha512su1q_u64(vsha512su0q_u64(m8, m7), m1, vextq_u64(m4, m3, 1)); -} - -FUNC_ISA -static inline void sha512_neon_round2( - unsigned round_index, uint64x2_t schedule_words, - uint64x2_t *ab, uint64x2_t *cd, uint64x2_t *ef, uint64x2_t *gh) -{ - /* - * vsha512hq_u64 performs the Sigma_1 and Ch half of the - * computation of two rounds of SHA-512 (including feeding back - * one of the outputs from the first of those half-rounds into the - * second one). - * - * vsha512h2q_u64 combines the result of that with the Sigma_0 and - * Maj steps, and outputs one 128-bit vector that replaces the gh - * piece of the input hash state, and a second that updates cd by - * addition. - * - * Similarly to vsha512su1q_u64 above, some of the input registers - * expected by these instructions are misaligned by 64 bits - * relative to the chunks we've divided the hash state into, so we - * have to start by making 'de' and 'fg' words out of our input - * cd,ef,gh, using vextq_u64. 
- * - * Also, one of the inputs to vsha512hq_u64 is expected to contain - * the results of summing gh + two round constants + two words of - * message schedule, but the two words of the message schedule - * have to be the opposite way round in the vector register from - * the way that vsha512su1q_u64 output them. Hence, there's - * another vextq_u64 in here that swaps the two halves of the - * initial_sum vector register. - * - * (This also means that I don't have to prepare a specially - * reordered version of the sha512_round_constants[] array: as - * long as I'm unavoidably doing a swap at run time _anyway_, I - * can load from the normally ordered version of that array, and - * just take care to fold in that data _before_ the swap rather - * than after.) - */ - - /* Load two round constants, with the first one in the low half */ - uint64x2_t round_constants = vld1q_u64( - sha512_round_constants + round_index); - - /* Add schedule words to round constants */ - uint64x2_t initial_sum = vaddq_u64(schedule_words, round_constants); - - /* Swap that sum around so the word used in the first of the two - * rounds is in the _high_ half of the vector, matching where h - * lives in the gh vector */ - uint64x2_t swapped_initial_sum = vextq_u64(initial_sum, initial_sum, 1); - - /* Add gh to that, now that they're matching ways round */ - uint64x2_t sum = vaddq_u64(swapped_initial_sum, *gh); - - /* Make the misaligned de and fg words */ - uint64x2_t de = vextq_u64(*cd, *ef, 1); - uint64x2_t fg = vextq_u64(*ef, *gh, 1); - - /* Now we're ready to put all the pieces together. The output from - * vsha512h2q_u64 can be used directly as the new gh, and the - * output from vsha512hq_u64 is simultaneously the intermediate - * value passed to h2 and the thing you have to add on to cd. 
*/ - uint64x2_t intermed = vsha512hq_u64(sum, fg, de); - *gh = vsha512h2q_u64(intermed, *cd, *ab); - *cd = vaddq_u64(*cd, intermed); -} - -FUNC_ISA -static inline void sha512_neon_block(sha512_neon_core *core, const uint8_t *p) -{ - uint64x2_t s0, s1, s2, s3, s4, s5, s6, s7; - - uint64x2_t ab = core->ab, cd = core->cd, ef = core->ef, gh = core->gh; - - s0 = sha512_neon_load_input(p + 16*0); - sha512_neon_round2(0, s0, &ab, &cd, &ef, &gh); - s1 = sha512_neon_load_input(p + 16*1); - sha512_neon_round2(2, s1, &gh, &ab, &cd, &ef); - s2 = sha512_neon_load_input(p + 16*2); - sha512_neon_round2(4, s2, &ef, &gh, &ab, &cd); - s3 = sha512_neon_load_input(p + 16*3); - sha512_neon_round2(6, s3, &cd, &ef, &gh, &ab); - s4 = sha512_neon_load_input(p + 16*4); - sha512_neon_round2(8, s4, &ab, &cd, &ef, &gh); - s5 = sha512_neon_load_input(p + 16*5); - sha512_neon_round2(10, s5, &gh, &ab, &cd, &ef); - s6 = sha512_neon_load_input(p + 16*6); - sha512_neon_round2(12, s6, &ef, &gh, &ab, &cd); - s7 = sha512_neon_load_input(p + 16*7); - sha512_neon_round2(14, s7, &cd, &ef, &gh, &ab); - s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7); - sha512_neon_round2(16, s0, &ab, &cd, &ef, &gh); - s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0); - sha512_neon_round2(18, s1, &gh, &ab, &cd, &ef); - s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1); - sha512_neon_round2(20, s2, &ef, &gh, &ab, &cd); - s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2); - sha512_neon_round2(22, s3, &cd, &ef, &gh, &ab); - s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3); - sha512_neon_round2(24, s4, &ab, &cd, &ef, &gh); - s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4); - sha512_neon_round2(26, s5, &gh, &ab, &cd, &ef); - s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5); - sha512_neon_round2(28, s6, &ef, &gh, &ab, &cd); - s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6); - sha512_neon_round2(30, s7, &cd, &ef, &gh, &ab); - s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7); - sha512_neon_round2(32, s0, &ab, &cd, &ef, &gh); - s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0); - sha512_neon_round2(34, s1, &gh, &ab, &cd, &ef); - s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1); - sha512_neon_round2(36, s2, &ef, &gh, &ab, &cd); - s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2); - sha512_neon_round2(38, s3, &cd, &ef, &gh, &ab); - s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3); - sha512_neon_round2(40, s4, &ab, &cd, &ef, &gh); - s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4); - sha512_neon_round2(42, s5, &gh, &ab, &cd, &ef); - s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5); - sha512_neon_round2(44, s6, &ef, &gh, &ab, &cd); - s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6); - sha512_neon_round2(46, s7, &cd, &ef, &gh, &ab); - s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7); - sha512_neon_round2(48, s0, &ab, &cd, &ef, &gh); - s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0); - sha512_neon_round2(50, s1, &gh, &ab, &cd, &ef); - s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1); - sha512_neon_round2(52, s2, &ef, &gh, &ab, &cd); - s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2); - sha512_neon_round2(54, s3, &cd, &ef, &gh, &ab); - s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3); - sha512_neon_round2(56, s4, &ab, &cd, &ef, &gh); - s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4); - sha512_neon_round2(58, s5, &gh, &ab, &cd, &ef); - s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5); - sha512_neon_round2(60, s6, &ef, &gh, &ab, &cd); - s7 = sha512_neon_schedule_update(s7, 
s0, s3, s4, s6); - sha512_neon_round2(62, s7, &cd, &ef, &gh, &ab); - s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7); - sha512_neon_round2(64, s0, &ab, &cd, &ef, &gh); - s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0); - sha512_neon_round2(66, s1, &gh, &ab, &cd, &ef); - s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1); - sha512_neon_round2(68, s2, &ef, &gh, &ab, &cd); - s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2); - sha512_neon_round2(70, s3, &cd, &ef, &gh, &ab); - s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3); - sha512_neon_round2(72, s4, &ab, &cd, &ef, &gh); - s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4); - sha512_neon_round2(74, s5, &gh, &ab, &cd, &ef); - s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5); - sha512_neon_round2(76, s6, &ef, &gh, &ab, &cd); - s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6); - sha512_neon_round2(78, s7, &cd, &ef, &gh, &ab); - - core->ab = vaddq_u64(core->ab, ab); - core->cd = vaddq_u64(core->cd, cd); - core->ef = vaddq_u64(core->ef, ef); - core->gh = vaddq_u64(core->gh, gh); -} - -typedef struct sha512_neon { - sha512_neon_core core; - sha512_block blk; - BinarySink_IMPLEMENTATION; - ssh_hash hash; -} sha512_neon; - -static void sha512_neon_write(BinarySink *bs, const void *vp, size_t len); - -static ssh_hash *sha512_neon_new(const ssh_hashalg *alg) -{ - if (!sha512_hw_available_cached()) - return NULL; - - sha512_neon *s = snew(sha512_neon); - - s->hash.vt = alg; - BinarySink_INIT(s, sha512_neon_write); - BinarySink_DELEGATE_INIT(&s->hash, s); - return &s->hash; -} - -static void sha512_neon_reset(ssh_hash *hash) -{ - sha512_neon *s = container_of(hash, sha512_neon, hash); - const uint64_t *iv = (const uint64_t *)hash->vt->extra; - - s->core.ab = vld1q_u64(iv); - s->core.cd = vld1q_u64(iv+2); - s->core.ef = vld1q_u64(iv+4); - s->core.gh = vld1q_u64(iv+6); - - sha512_block_setup(&s->blk); -} - -static void sha512_neon_copyfrom(ssh_hash *hcopy, ssh_hash *horig) -{ - sha512_neon *copy = container_of(hcopy, sha512_neon, hash); - sha512_neon *orig = container_of(horig, sha512_neon, hash); - - *copy = *orig; /* structure copy */ - - BinarySink_COPIED(copy); - BinarySink_DELEGATE_INIT(©->hash, copy); -} - -static void sha512_neon_free(ssh_hash *hash) -{ - sha512_neon *s = container_of(hash, sha512_neon, hash); - smemclr(s, sizeof(*s)); - sfree(s); -} - -static void sha512_neon_write(BinarySink *bs, const void *vp, size_t len) -{ - sha512_neon *s = BinarySink_DOWNCAST(bs, sha512_neon); - - while (len > 0) - if (sha512_block_write(&s->blk, &vp, &len)) - sha512_neon_block(&s->core, s->blk.block); -} - -static void sha512_neon_digest(ssh_hash *hash, uint8_t *digest) -{ - sha512_neon *s = container_of(hash, sha512_neon, hash); - - sha512_block_pad(&s->blk, BinarySink_UPCAST(s)); - - vst1q_u8(digest, vrev64q_u8(vreinterpretq_u8_u64(s->core.ab))); - vst1q_u8(digest+16, vrev64q_u8(vreinterpretq_u8_u64(s->core.cd))); - vst1q_u8(digest+32, vrev64q_u8(vreinterpretq_u8_u64(s->core.ef))); - vst1q_u8(digest+48, vrev64q_u8(vreinterpretq_u8_u64(s->core.gh))); -} - -static void sha384_neon_digest(ssh_hash *hash, uint8_t *digest) -{ - sha512_neon *s = container_of(hash, sha512_neon, hash); - - sha512_block_pad(&s->blk, BinarySink_UPCAST(s)); - - vst1q_u8(digest, vrev64q_u8(vreinterpretq_u8_u64(s->core.ab))); - vst1q_u8(digest+16, vrev64q_u8(vreinterpretq_u8_u64(s->core.cd))); - vst1q_u8(digest+32, vrev64q_u8(vreinterpretq_u8_u64(s->core.ef))); -} - -const ssh_hashalg ssh_sha512_hw = { - .new = sha512_neon_new, - .reset = 
sha512_neon_reset, - .copyfrom = sha512_neon_copyfrom, - .digest = sha512_neon_digest, - .free = sha512_neon_free, - .hlen = 64, - .blocklen = 128, - HASHALG_NAMES_ANNOTATED("SHA-512", "NEON accelerated"), - .extra = sha512_initial_state, -}; - -const ssh_hashalg ssh_sha384_hw = { - .new = sha512_neon_new, - .reset = sha512_neon_reset, - .copyfrom = sha512_neon_copyfrom, - .digest = sha384_neon_digest, - .free = sha512_neon_free, - .hlen = 48, - .blocklen = 128, - HASHALG_NAMES_ANNOTATED("SHA-384", "NEON accelerated"), - .extra = sha384_initial_state, -}; - -/* ---------------------------------------------------------------------- - * Stub functions if we have no hardware-accelerated SHA-512. In this - * case, sha512_hw_new returns NULL (though it should also never be - * selected by sha512_select, so the only thing that should even be - * _able_ to call it is testcrypt). As a result, the remaining vtable - * functions should never be called at all. - */ - -#elif HW_SHA512 == HW_SHA512_NONE - -static bool sha512_hw_available(void) -{ - return false; -} - -static ssh_hash *sha512_stub_new(const ssh_hashalg *alg) -{ - return NULL; -} - -#define STUB_BODY { unreachable("Should never be called"); } - -static void sha512_stub_reset(ssh_hash *hash) STUB_BODY -static void sha512_stub_copyfrom(ssh_hash *hash, ssh_hash *orig) STUB_BODY -static void sha512_stub_free(ssh_hash *hash) STUB_BODY -static void sha512_stub_digest(ssh_hash *hash, uint8_t *digest) STUB_BODY - -const ssh_hashalg ssh_sha512_hw = { - .new = sha512_stub_new, - .reset = sha512_stub_reset, - .copyfrom = sha512_stub_copyfrom, - .digest = sha512_stub_digest, - .free = sha512_stub_free, - .hlen = 64, - .blocklen = 128, - HASHALG_NAMES_ANNOTATED("SHA-512", "!NONEXISTENT ACCELERATED VERSION!"), -}; - -const ssh_hashalg ssh_sha384_hw = { - .new = sha512_stub_new, - .reset = sha512_stub_reset, - .copyfrom = sha512_stub_copyfrom, - .digest = sha512_stub_digest, - .free = sha512_stub_free, - .hlen = 48, - .blocklen = 128, - HASHALG_NAMES_ANNOTATED("SHA-384", "!NONEXISTENT ACCELERATED VERSION!"), -}; - -#endif /* HW_SHA512 */ diff --git a/crypto/sha512.h b/crypto/sha512.h new file mode 100644 index 00000000..98145558 --- /dev/null +++ b/crypto/sha512.h @@ -0,0 +1,131 @@ +/* + * Definitions likely to be helpful to multiple SHA-512 implementations. + */ + +/* + * The 'extra' structure used by SHA-512 implementations is used to + * include information about how to check if a given implementation is + * available at run time, and whether we've already checked. + */ +struct sha512_extra_mutable; +struct sha512_extra { + /* Pointer to the initial state (distinguishes SHA-384 from -512) */ + const uint64_t *initial_state; + + /* Function to check availability. Might be expensive, so we don't + * want to call it more than once. */ + bool (*check_available)(void); + + /* Point to a writable substructure. */ + struct sha512_extra_mutable *mut; +}; +struct sha512_extra_mutable { + bool checked_availability; + bool is_available; +}; +static inline bool check_availability(const struct sha512_extra *extra) +{ + if (!extra->mut->checked_availability) { + extra->mut->is_available = extra->check_available(); + extra->mut->checked_availability = true; + } + + return extra->mut->is_available; +} + +/* + * Macro to define a pair of SHA-{384,512} vtables together with their + * 'extra' structure. 
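Each implementation file defines a family of functions named after itself and then invokes this macro once at the bottom. The portable implementation earlier in this patch, for instance, supplies roughly the following before its SHA512_VTABLES(sw, "unaccelerated") line (signatures copied from sha512-sw.c, shown here only to illustrate what the macro expects):

    static bool sha512_sw_available(void);
    static ssh_hash *sha512_sw_new(const ssh_hashalg *alg);
    static void sha512_sw_reset(ssh_hash *hash);
    static void sha512_sw_copyfrom(ssh_hash *hcopy, ssh_hash *horig);
    static void sha512_sw_digest(ssh_hash *hash, uint8_t *digest);
    #define sha384_sw_digest sha512_sw_digest /* hlen comes from the vtable */
    static void sha512_sw_free(ssh_hash *hash);

The macro then emits the two extern vtables ssh_sha512_sw and ssh_sha384_sw, both pointing at a single sha512_extra_mutable so that the availability check runs at most once per implementation; those vtables are what sha512-select.c and the test harnesses refer to.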
+ */ +#define SHA512_VTABLES(impl_c, impl_display) \ + static struct sha512_extra_mutable sha512_ ## impl_c ## _extra_mut; \ + static const struct sha512_extra sha384_ ## impl_c ## _extra = { \ + .initial_state = sha384_initial_state, \ + .check_available = sha512_ ## impl_c ## _available, \ + .mut = &sha512_ ## impl_c ## _extra_mut, \ + }; \ + static const struct sha512_extra sha512_ ## impl_c ## _extra = { \ + .initial_state = sha512_initial_state, \ + .check_available = sha512_ ## impl_c ## _available, \ + .mut = &sha512_ ## impl_c ## _extra_mut, \ + }; \ + const ssh_hashalg ssh_sha384_ ## impl_c = { \ + .new = sha512_ ## impl_c ## _new, \ + .reset = sha512_ ## impl_c ## _reset, \ + .copyfrom = sha512_ ## impl_c ## _copyfrom, \ + .digest = sha384_ ## impl_c ## _digest, \ + .free = sha512_ ## impl_c ## _free, \ + .hlen = 48, \ + .blocklen = 128, \ + HASHALG_NAMES_ANNOTATED("SHA-384", impl_display), \ + .extra = &sha384_ ## impl_c ## _extra, \ + }; \ + const ssh_hashalg ssh_sha512_ ## impl_c = { \ + .new = sha512_ ## impl_c ## _new, \ + .reset = sha512_ ## impl_c ## _reset, \ + .copyfrom = sha512_ ## impl_c ## _copyfrom, \ + .digest = sha512_ ## impl_c ## _digest, \ + .free = sha512_ ## impl_c ## _free, \ + .hlen = 64, \ + .blocklen = 128, \ + HASHALG_NAMES_ANNOTATED("SHA-512", impl_display), \ + .extra = &sha512_ ## impl_c ## _extra, \ + } + +extern const uint64_t sha512_initial_state[8]; +extern const uint64_t sha384_initial_state[8]; +extern const uint64_t sha512_round_constants[80]; + +#define SHA512_ROUNDS 80 + +typedef struct sha512_block sha512_block; +struct sha512_block { + uint8_t block[128]; + size_t used; + uint64_t lenhi, lenlo; +}; + +static inline void sha512_block_setup(sha512_block *blk) +{ + blk->used = 0; + blk->lenhi = blk->lenlo = 0; +} + +static inline bool sha512_block_write( + sha512_block *blk, const void **vdata, size_t *len) +{ + size_t blkleft = sizeof(blk->block) - blk->used; + size_t chunk = *len < blkleft ? 
*len : blkleft; + + const uint8_t *p = *vdata; + memcpy(blk->block + blk->used, p, chunk); + *vdata = p + chunk; + *len -= chunk; + blk->used += chunk; + + size_t chunkbits = chunk << 3; + + blk->lenlo += chunkbits; + blk->lenhi += (blk->lenlo < chunkbits); + + if (blk->used == sizeof(blk->block)) { + blk->used = 0; + return true; + } + + return false; +} + +static inline void sha512_block_pad(sha512_block *blk, BinarySink *bs) +{ + uint64_t final_lenhi = blk->lenhi; + uint64_t final_lenlo = blk->lenlo; + size_t pad = 127 & (111 - blk->used); + + put_byte(bs, 0x80); + put_padding(bs, pad, 0); + put_uint64(bs, final_lenhi); + put_uint64(bs, final_lenlo); + + assert(blk->used == 0 && "Should have exactly hit a block boundary"); +} diff --git a/ssh.h b/ssh.h index 4162fc1e..24d7f1a5 100644 --- a/ssh.h +++ b/ssh.h @@ -953,22 +953,28 @@ extern const ssh_cipheralg ssh_3des_ssh2; extern const ssh_cipheralg ssh_des; extern const ssh_cipheralg ssh_des_sshcom_ssh2; extern const ssh_cipheralg ssh_aes256_sdctr; -extern const ssh_cipheralg ssh_aes256_sdctr_hw; +extern const ssh_cipheralg ssh_aes256_sdctr_ni; +extern const ssh_cipheralg ssh_aes256_sdctr_neon; extern const ssh_cipheralg ssh_aes256_sdctr_sw; extern const ssh_cipheralg ssh_aes256_cbc; -extern const ssh_cipheralg ssh_aes256_cbc_hw; +extern const ssh_cipheralg ssh_aes256_cbc_ni; +extern const ssh_cipheralg ssh_aes256_cbc_neon; extern const ssh_cipheralg ssh_aes256_cbc_sw; extern const ssh_cipheralg ssh_aes192_sdctr; -extern const ssh_cipheralg ssh_aes192_sdctr_hw; +extern const ssh_cipheralg ssh_aes192_sdctr_ni; +extern const ssh_cipheralg ssh_aes192_sdctr_neon; extern const ssh_cipheralg ssh_aes192_sdctr_sw; extern const ssh_cipheralg ssh_aes192_cbc; -extern const ssh_cipheralg ssh_aes192_cbc_hw; +extern const ssh_cipheralg ssh_aes192_cbc_ni; +extern const ssh_cipheralg ssh_aes192_cbc_neon; extern const ssh_cipheralg ssh_aes192_cbc_sw; extern const ssh_cipheralg ssh_aes128_sdctr; -extern const ssh_cipheralg ssh_aes128_sdctr_hw; +extern const ssh_cipheralg ssh_aes128_sdctr_ni; +extern const ssh_cipheralg ssh_aes128_sdctr_neon; extern const ssh_cipheralg ssh_aes128_sdctr_sw; extern const ssh_cipheralg ssh_aes128_cbc; -extern const ssh_cipheralg ssh_aes128_cbc_hw; +extern const ssh_cipheralg ssh_aes128_cbc_ni; +extern const ssh_cipheralg ssh_aes128_cbc_neon; extern const ssh_cipheralg ssh_aes128_cbc_sw; extern const ssh_cipheralg ssh_blowfish_ssh2_ctr; extern const ssh_cipheralg ssh_blowfish_ssh2; @@ -983,16 +989,18 @@ extern const ssh2_ciphers ssh2_arcfour; extern const ssh2_ciphers ssh2_ccp; extern const ssh_hashalg ssh_md5; extern const ssh_hashalg ssh_sha1; -extern const ssh_hashalg ssh_sha1_hw; +extern const ssh_hashalg ssh_sha1_ni; +extern const ssh_hashalg ssh_sha1_neon; extern const ssh_hashalg ssh_sha1_sw; extern const ssh_hashalg ssh_sha256; -extern const ssh_hashalg ssh_sha256_hw; +extern const ssh_hashalg ssh_sha256_ni; +extern const ssh_hashalg ssh_sha256_neon; extern const ssh_hashalg ssh_sha256_sw; extern const ssh_hashalg ssh_sha384; -extern const ssh_hashalg ssh_sha384_hw; +extern const ssh_hashalg ssh_sha384_neon; extern const ssh_hashalg ssh_sha384_sw; extern const ssh_hashalg ssh_sha512; -extern const ssh_hashalg ssh_sha512_hw; +extern const ssh_hashalg ssh_sha512_neon; extern const ssh_hashalg ssh_sha512_sw; extern const ssh_hashalg ssh_sha3_224; extern const ssh_hashalg ssh_sha3_256; @@ -1039,10 +1047,10 @@ ssh_hash *blake2b_new_general(unsigned hashlen); * itself. 
If so, then this function should be implemented in each * platform subdirectory. */ -bool platform_aes_hw_available(void); -bool platform_sha256_hw_available(void); -bool platform_sha1_hw_available(void); -bool platform_sha512_hw_available(void); +bool platform_aes_neon_available(void); +bool platform_sha256_neon_available(void); +bool platform_sha1_neon_available(void); +bool platform_sha512_neon_available(void); /* * PuTTY version number formatted as an SSH version string. diff --git a/test/cryptsuite.py b/test/cryptsuite.py index 757de673..9ed0c3f5 100755 --- a/test/cryptsuite.py +++ b/test/cryptsuite.py @@ -141,6 +141,14 @@ def mac_str(alg, key, message, cipher=None): def lcm(a, b): return a * b // gcd(a, b) +def get_implementations(alg): + return get_implementations_commasep(alg).decode("ASCII").split(",") + +def get_aes_impls(): + return [impl.rsplit("_", 1)[-1] + for impl in get_implementations("aes128_cbc") + if impl.startswith("aes128_cbc_")] + class MyTestBase(unittest.TestCase): "Intermediate class that adds useful helper methods." def assertEqualBin(self, x, y): @@ -1181,9 +1189,9 @@ class crypt(MyTestBase): # reference implementation of AES in Python. ('Mostly' # independent in that it was written by me.) - def vector(cipher, key, iv, plaintext, ciphertext): - for suffix in "hw", "sw": - c = ssh_cipher_new("{}_{}".format(cipher, suffix)) + def vector(cipherbase, key, iv, plaintext, ciphertext): + for cipher in get_implementations(cipherbase): + c = ssh_cipher_new(cipher) if c is None: return # skip test if HW AES not available ssh_cipher_setkey(c, key) ssh_cipher_setiv(c, iv) @@ -1302,7 +1310,7 @@ class crypt(MyTestBase): # We also test this at all three AES key lengths, in case the # core cipher routines are written separately for each one. 
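(A word on the two new helpers driving these loops: get_implementations() just splits the comma-separated list returned by testcrypt's get_implementations_commasep(), added further down in this patch, which always puts the generic selector name first, then the _sw variant, then whichever accelerated variants the build actually compiled in. So on an x86 build with AES-NI, get_implementations("aes128_cbc") comes back as ["aes128_cbc", "aes128_cbc_sw", "aes128_cbc_ni"], and get_aes_impls() strips that down to the suffixes ["sw", "ni"]; on a build with no acceleration, the lists contain only the generic and _sw entries.)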
- for suffix in "hw", "sw": + for suffix in get_aes_impls(): for keylen in [128, 192, 256]: hexTestValues = ["00000000", "00000001", "ffffffff"] for ivHexBytes in itertools.product(*([hexTestValues] * 4)): @@ -1325,7 +1333,7 @@ class crypt(MyTestBase): for keylen in [128, 192, 256]: decryptions = [] - for suffix in "hw", "sw": + for suffix in get_aes_impls(): c = ssh_cipher_new("aes{:d}_cbc_{}".format(keylen, suffix)) if c is None: continue ssh_cipher_setkey(c, test_key[:keylen//8]) @@ -1493,23 +1501,11 @@ class crypt(MyTestBase): ("3des_ssh1", 24, 8, False, unhex('d5f1cc25b8fbc62de63590b9b92344adf6dd72753273ff0fb32d4dbc6af858529129f34242f3d557eed3a5c84204eb4f868474294964cf70df5d8f45dfccfc45')), ("des_cbc", 8, 8, True, unhex('051524e77fb40e109d9fffeceacf0f28c940e2f8415ddccc117020bdd2612af5036490b12085d0e46129919b8e499f51cb82a4b341d7a1a1ea3e65201ef248f6')), ("aes256_ctr", 32, 16, False, unhex('b87b35e819f60f0f398a37b05d7bcf0b04ad4ebe570bd08e8bfa8606bafb0db2cfcd82baf2ccceae5de1a3c1ae08a8b8fdd884fdc5092031ea8ce53333e62976')), - ("aes256_ctr_hw", 32, 16, False, unhex('b87b35e819f60f0f398a37b05d7bcf0b04ad4ebe570bd08e8bfa8606bafb0db2cfcd82baf2ccceae5de1a3c1ae08a8b8fdd884fdc5092031ea8ce53333e62976')), - ("aes256_ctr_sw", 32, 16, False, unhex('b87b35e819f60f0f398a37b05d7bcf0b04ad4ebe570bd08e8bfa8606bafb0db2cfcd82baf2ccceae5de1a3c1ae08a8b8fdd884fdc5092031ea8ce53333e62976')), ("aes256_cbc", 32, 16, True, unhex('381cbb2fbcc48118d0094540242bd990dd6af5b9a9890edd013d5cad2d904f34b9261c623a452f32ea60e5402919a77165df12862742f1059f8c4a862f0827c5')), - ("aes256_cbc_hw", 32, 16, True, unhex('381cbb2fbcc48118d0094540242bd990dd6af5b9a9890edd013d5cad2d904f34b9261c623a452f32ea60e5402919a77165df12862742f1059f8c4a862f0827c5')), - ("aes256_cbc_sw", 32, 16, True, unhex('381cbb2fbcc48118d0094540242bd990dd6af5b9a9890edd013d5cad2d904f34b9261c623a452f32ea60e5402919a77165df12862742f1059f8c4a862f0827c5')), ("aes192_ctr", 24, 16, False, unhex('06bcfa7ccf075d723e12b724695a571a0fad67c56287ea609c410ac12749c51bb96e27fa7e1c7ea3b14792bbbb8856efb0617ebec24a8e4a87340d820cf347b8')), - ("aes192_ctr_hw", 24, 16, False, unhex('06bcfa7ccf075d723e12b724695a571a0fad67c56287ea609c410ac12749c51bb96e27fa7e1c7ea3b14792bbbb8856efb0617ebec24a8e4a87340d820cf347b8')), - ("aes192_ctr_sw", 24, 16, False, unhex('06bcfa7ccf075d723e12b724695a571a0fad67c56287ea609c410ac12749c51bb96e27fa7e1c7ea3b14792bbbb8856efb0617ebec24a8e4a87340d820cf347b8')), ("aes192_cbc", 24, 16, True, unhex('ac97f8698170f9c05341214bd7624d5d2efef8311596163dc597d9fe6c868971bd7557389974612cbf49ea4e7cc6cc302d4cc90519478dd88a4f09b530c141f3')), - ("aes192_cbc_hw", 24, 16, True, unhex('ac97f8698170f9c05341214bd7624d5d2efef8311596163dc597d9fe6c868971bd7557389974612cbf49ea4e7cc6cc302d4cc90519478dd88a4f09b530c141f3')), - ("aes192_cbc_sw", 24, 16, True, unhex('ac97f8698170f9c05341214bd7624d5d2efef8311596163dc597d9fe6c868971bd7557389974612cbf49ea4e7cc6cc302d4cc90519478dd88a4f09b530c141f3')), ("aes128_ctr", 16, 16, False, unhex('0ad4ddfd2360ec59d77dcb9a981f92109437c68c5e7f02f92017d9f424f89ab7850473ac0e19274125e740f252c84ad1f6ad138b6020a03bdaba2f3a7378ce1e')), - ("aes128_ctr_hw", 16, 16, False, unhex('0ad4ddfd2360ec59d77dcb9a981f92109437c68c5e7f02f92017d9f424f89ab7850473ac0e19274125e740f252c84ad1f6ad138b6020a03bdaba2f3a7378ce1e')), - ("aes128_ctr_sw", 16, 16, False, unhex('0ad4ddfd2360ec59d77dcb9a981f92109437c68c5e7f02f92017d9f424f89ab7850473ac0e19274125e740f252c84ad1f6ad138b6020a03bdaba2f3a7378ce1e')), ("aes128_cbc", 16, 16, True, 
unhex('36de36917fb7955a711c8b0bf149b29120a77524f393ae3490f4ce5b1d5ca2a0d7064ce3c38e267807438d12c0e40cd0d84134647f9f4a5b11804a0cc5070e62')), - ("aes128_cbc_hw", 16, 16, True, unhex('36de36917fb7955a711c8b0bf149b29120a77524f393ae3490f4ce5b1d5ca2a0d7064ce3c38e267807438d12c0e40cd0d84134647f9f4a5b11804a0cc5070e62')), - ("aes128_cbc_sw", 16, 16, True, unhex('36de36917fb7955a711c8b0bf149b29120a77524f393ae3490f4ce5b1d5ca2a0d7064ce3c38e267807438d12c0e40cd0d84134647f9f4a5b11804a0cc5070e62')), ("blowfish_ctr", 32, 8, False, unhex('079daf0f859363ccf72e975764d709232ec48adc74f88ccd1f342683f0bfa89ca0e8dbfccc8d4d99005d6b61e9cc4e6eaa2fd2a8163271b94bf08ef212129f01')), ("blowfish_ssh2", 16, 8, True, unhex('e986b7b01f17dfe80ee34cac81fa029b771ec0f859ae21ae3ec3df1674bc4ceb54a184c6c56c17dd2863c3e9c068e76fd9aef5673465995f0d648b0bb848017f')), ("blowfish_ssh1", 32, 8, True, unhex('d44092a9035d895acf564ba0365d19570fbb4f125d5a4fd2a1812ee6c8a1911a51bb181fbf7d1a261253cab71ee19346eb477b3e7ecf1d95dd941e635c1a4fbf')), @@ -1517,36 +1513,37 @@ class crypt(MyTestBase): ("arcfour128", 16, None, False, unhex('fd4af54c5642cb29629e50a15d22e4944e21ffba77d0543b27590eafffe3886686d1aefae0484afc9e67edc0e67eb176bbb5340af1919ea39adfe866d066dd05')), ] - for alg, keylen, ivlen, simple_cbc, c in ciphers: - cipher = ssh_cipher_new(alg) - if cipher is None: - continue # hardware-accelerated cipher not available + for algbase, keylen, ivlen, simple_cbc, c in ciphers: + for alg in get_implementations(algbase): + cipher = ssh_cipher_new(alg) + if cipher is None: + continue # hardware-accelerated cipher not available - ssh_cipher_setkey(cipher, k[:keylen]) - if ivlen is not None: - ssh_cipher_setiv(cipher, iv[:ivlen]) - self.assertEqualBin(ssh_cipher_encrypt(cipher, p), c) - - ssh_cipher_setkey(cipher, k[:keylen]) - if ivlen is not None: - ssh_cipher_setiv(cipher, iv[:ivlen]) - self.assertEqualBin(ssh_cipher_decrypt(cipher, c), p) - - if simple_cbc: - # CBC ciphers (other than the three-layered CBC used - # by SSH-1 3DES) have more specific semantics for - # their IV than 'some kind of starting state for the - # cipher mode': the IV is specifically supposed to - # represent the previous block of ciphertext. So we - # can check that, by supplying the IV _as_ a - # ciphertext block via a call to decrypt(), and seeing - # if that causes our test ciphertext to decrypt the - # same way as when we provided the same IV via - # setiv(). ssh_cipher_setkey(cipher, k[:keylen]) - ssh_cipher_decrypt(cipher, iv[:ivlen]) + if ivlen is not None: + ssh_cipher_setiv(cipher, iv[:ivlen]) + self.assertEqualBin(ssh_cipher_encrypt(cipher, p), c) + + ssh_cipher_setkey(cipher, k[:keylen]) + if ivlen is not None: + ssh_cipher_setiv(cipher, iv[:ivlen]) self.assertEqualBin(ssh_cipher_decrypt(cipher, c), p) + if simple_cbc: + # CBC ciphers (other than the three-layered CBC used + # by SSH-1 3DES) have more specific semantics for + # their IV than 'some kind of starting state for the + # cipher mode': the IV is specifically supposed to + # represent the previous block of ciphertext. So we + # can check that, by supplying the IV _as_ a + # ciphertext block via a call to decrypt(), and seeing + # if that causes our test ciphertext to decrypt the + # same way as when we provided the same IV via + # setiv(). 
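(Why that trick works: in CBC decryption each plaintext block is P_i = Decrypt(C_i) xor C_{i-1}, with C_0 taken to be the IV, so after decrypting any block the cipher's chaining state is simply that block's ciphertext. Feeding the IV through ssh_cipher_decrypt() as if it were a ciphertext block therefore leaves the chaining state equal to the IV, exactly as setiv() would have done, which is what the assertion following this comment verifies; the output of that throwaway decryption is ignored.)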
+ ssh_cipher_setkey(cipher, k[:keylen]) + ssh_cipher_decrypt(cipher, iv[:ivlen]) + self.assertEqualBin(ssh_cipher_decrypt(cipher, c), p) + def testRSAKex(self): # Round-trip test of the RSA key exchange functions, plus a # hardcoded plain/ciphertext pair to guard against the @@ -2324,7 +2321,7 @@ Private-MAC: 5b1f6f4cc43eb0060d2c3e181bc0129343adba2b class standard_test_vectors(MyTestBase): def testAES(self): def vector(cipher, key, plaintext, ciphertext): - for suffix in "hw", "sw": + for suffix in get_aes_impls(): c = ssh_cipher_new("{}_{}".format(cipher, suffix)) if c is None: return # skip test if HW AES not available ssh_cipher_setkey(c, key) @@ -2540,7 +2537,7 @@ class standard_test_vectors(MyTestBase): unhex('56be34521d144c88dbb8c733f0e8b3f6')) def testSHA1(self): - for hashname in ['sha1_sw', 'sha1_hw']: + for hashname in get_implementations("sha1"): if ssh_hash_new(hashname) is None: continue # skip testing of unavailable HW implementation @@ -2577,7 +2574,7 @@ class standard_test_vectors(MyTestBase): "cb0082c8f197d260991ba6a460e76e202bad27b3")) def testSHA256(self): - for hashname in ['sha256_sw', 'sha256_hw']: + for hashname in get_implementations("sha256"): if ssh_hash_new(hashname) is None: continue # skip testing of unavailable HW implementation @@ -2621,7 +2618,7 @@ class standard_test_vectors(MyTestBase): "8ad3361763f7e9b2d95f4f0da6e1ccbc")) def testSHA384(self): - for hashname in ['sha384_sw', 'sha384_hw']: + for hashname in get_implementations("sha384"): if ssh_hash_new(hashname) is None: continue # skip testing of unavailable HW implementation @@ -2663,7 +2660,7 @@ class standard_test_vectors(MyTestBase): '38e42b5c4de660f5de8fb2a5b2fbd2a3cbffd20cff1288c0')) def testSHA512(self): - for hashname in ['sha512_sw', 'sha512_hw']: + for hashname in get_implementations("sha512"): if ssh_hash_new(hashname) is None: continue # skip testing of unavailable HW implementation diff --git a/testcrypt.c b/testcrypt.c index 752947cf..1948da08 100644 --- a/testcrypt.c +++ b/testcrypt.c @@ -207,16 +207,24 @@ static const ssh_hashalg *get_hashalg(BinarySource *in) {"md5", &ssh_md5}, {"sha1", &ssh_sha1}, {"sha1_sw", &ssh_sha1_sw}, - {"sha1_hw", &ssh_sha1_hw}, {"sha256", &ssh_sha256}, - {"sha256_sw", &ssh_sha256_sw}, - {"sha256_hw", &ssh_sha256_hw}, {"sha384", &ssh_sha384}, - {"sha384_sw", &ssh_sha384_sw}, - {"sha384_hw", &ssh_sha384_hw}, {"sha512", &ssh_sha512}, + {"sha256_sw", &ssh_sha256_sw}, + {"sha384_sw", &ssh_sha384_sw}, {"sha512_sw", &ssh_sha512_sw}, - {"sha512_hw", &ssh_sha512_hw}, +#if HAVE_SHA_NI + {"sha1_ni", &ssh_sha1_ni}, + {"sha256_ni", &ssh_sha256_ni}, +#endif +#if HAVE_NEON_CRYPTO + {"sha1_neon", &ssh_sha1_neon}, + {"sha256_neon", &ssh_sha256_neon}, +#endif +#if HAVE_NEON_SHA512 + {"sha384_neon", &ssh_sha384_neon}, + {"sha512_neon", &ssh_sha512_neon}, +#endif {"sha3_224", &ssh_sha3_224}, {"sha3_256", &ssh_sha3_256}, {"sha3_384", &ssh_sha3_384}, @@ -290,23 +298,33 @@ static const ssh_cipheralg *get_cipheralg(BinarySource *in) {"3des_ssh1", &ssh_3des_ssh1}, {"des_cbc", &ssh_des}, {"aes256_ctr", &ssh_aes256_sdctr}, - {"aes256_ctr_hw", &ssh_aes256_sdctr_hw}, - {"aes256_ctr_sw", &ssh_aes256_sdctr_sw}, {"aes256_cbc", &ssh_aes256_cbc}, - {"aes256_cbc_hw", &ssh_aes256_cbc_hw}, - {"aes256_cbc_sw", &ssh_aes256_cbc_sw}, {"aes192_ctr", &ssh_aes192_sdctr}, - {"aes192_ctr_hw", &ssh_aes192_sdctr_hw}, - {"aes192_ctr_sw", &ssh_aes192_sdctr_sw}, {"aes192_cbc", &ssh_aes192_cbc}, - {"aes192_cbc_hw", &ssh_aes192_cbc_hw}, - {"aes192_cbc_sw", &ssh_aes192_cbc_sw}, {"aes128_ctr", &ssh_aes128_sdctr}, - 
{"aes128_ctr_hw", &ssh_aes128_sdctr_hw},
-    {"aes128_ctr_sw", &ssh_aes128_sdctr_sw},
     {"aes128_cbc", &ssh_aes128_cbc},
-    {"aes128_cbc_hw", &ssh_aes128_cbc_hw},
+    {"aes256_ctr_sw", &ssh_aes256_sdctr_sw},
+    {"aes256_cbc_sw", &ssh_aes256_cbc_sw},
+    {"aes192_ctr_sw", &ssh_aes192_sdctr_sw},
+    {"aes192_cbc_sw", &ssh_aes192_cbc_sw},
+    {"aes128_ctr_sw", &ssh_aes128_sdctr_sw},
     {"aes128_cbc_sw", &ssh_aes128_cbc_sw},
+#if HAVE_AES_NI
+    {"aes256_ctr_ni", &ssh_aes256_sdctr_ni},
+    {"aes256_cbc_ni", &ssh_aes256_cbc_ni},
+    {"aes192_ctr_ni", &ssh_aes192_sdctr_ni},
+    {"aes192_cbc_ni", &ssh_aes192_cbc_ni},
+    {"aes128_ctr_ni", &ssh_aes128_sdctr_ni},
+    {"aes128_cbc_ni", &ssh_aes128_cbc_ni},
+#endif
+#if HAVE_NEON_CRYPTO
+    {"aes256_ctr_neon", &ssh_aes256_sdctr_neon},
+    {"aes256_cbc_neon", &ssh_aes256_cbc_neon},
+    {"aes192_ctr_neon", &ssh_aes192_sdctr_neon},
+    {"aes192_cbc_neon", &ssh_aes192_cbc_neon},
+    {"aes128_ctr_neon", &ssh_aes128_sdctr_neon},
+    {"aes128_cbc_neon", &ssh_aes128_cbc_neon},
+#endif
     {"blowfish_ctr", &ssh_blowfish_ssh2_ctr},
     {"blowfish_ssh2", &ssh_blowfish_ssh2},
     {"blowfish_ssh1", &ssh_blowfish_ssh1},
@@ -1285,6 +1303,38 @@ strbuf *argon2_wrapper(Argon2Flavour flavour, uint32_t mem, uint32_t passes,
 }
 #define argon2 argon2_wrapper
 
+strbuf *get_implementations_commasep(ptrlen alg)
+{
+    strbuf *out = strbuf_new();
+    put_datapl(out, alg);
+
+    if (ptrlen_startswith(alg, PTRLEN_LITERAL("aes"), NULL)) {
+        strbuf_catf(out, ",%.*s_sw", PTRLEN_PRINTF(alg));
+#if HAVE_AES_NI
+        strbuf_catf(out, ",%.*s_ni", PTRLEN_PRINTF(alg));
+#endif
+#if HAVE_NEON_CRYPTO
+        strbuf_catf(out, ",%.*s_neon", PTRLEN_PRINTF(alg));
+#endif
+    } else if (ptrlen_startswith(alg, PTRLEN_LITERAL("sha256"), NULL) ||
+               ptrlen_startswith(alg, PTRLEN_LITERAL("sha1"), NULL)) {
+        strbuf_catf(out, ",%.*s_sw", PTRLEN_PRINTF(alg));
+#if HAVE_SHA_NI
+        strbuf_catf(out, ",%.*s_ni", PTRLEN_PRINTF(alg));
+#endif
+#if HAVE_NEON_CRYPTO
+        strbuf_catf(out, ",%.*s_neon", PTRLEN_PRINTF(alg));
+#endif
+    } else if (ptrlen_startswith(alg, PTRLEN_LITERAL("sha512"), NULL)) {
+        strbuf_catf(out, ",%.*s_sw", PTRLEN_PRINTF(alg));
+#if HAVE_NEON_SHA512
+        strbuf_catf(out, ",%.*s_neon", PTRLEN_PRINTF(alg));
+#endif
+    }
+
+    return out;
+}
+
 #define OPTIONAL_PTR_FUNC(type) \
     typedef TD_val_##type TD_opt_val_##type; \
     static TD_opt_val_##type get_opt_val_##type(BinarySource *in) { \
diff --git a/testcrypt.h b/testcrypt.h
index 298abc0f..2e6e993b 100644
--- a/testcrypt.h
+++ b/testcrypt.h
@@ -315,6 +315,7 @@ FUNC1(uint, crc32_rfc1662, val_string_ptrlen)
 FUNC1(uint, crc32_ssh1, val_string_ptrlen)
 FUNC2(uint, crc32_update, uint, val_string_ptrlen)
 FUNC2(boolean, crcda_detect, val_string_ptrlen, val_string_ptrlen)
+FUNC1(val_string, get_implementations_commasep, val_string_ptrlen)
 
 /*
  * These functions aren't part of PuTTY's own API, but are additions
diff --git a/testsc.c b/testsc.c
index 93bf263a..b182d382 100644
--- a/testsc.c
+++ b/testsc.c
@@ -216,6 +216,31 @@ VOLATILE_WRAPPED_DEFN(static, size_t, looplimit, (size_t x))
     return x;
 }
 
+#if HAVE_AES_NI
+#define CIPHERS_AES_NI(X, Y) \
+    X(Y, ssh_aes256_sdctr_ni) \
+    X(Y, ssh_aes256_cbc_ni) \
+    X(Y, ssh_aes192_sdctr_ni) \
+    X(Y, ssh_aes192_cbc_ni) \
+    X(Y, ssh_aes128_sdctr_ni) \
+    X(Y, ssh_aes128_cbc_ni) \
+    /* end of list */
+#else
+#define CIPHERS_AES_NI(X, Y)
+#endif
+#if HAVE_NEON_CRYPTO
+#define CIPHERS_AES_NEON(X, Y) \
+    X(Y, ssh_aes256_sdctr_neon) \
+    X(Y, ssh_aes256_cbc_neon) \
+    X(Y, ssh_aes192_sdctr_neon) \
+    X(Y, ssh_aes192_cbc_neon) \
+    X(Y, ssh_aes128_sdctr_neon) \
+    X(Y, ssh_aes128_cbc_neon) \
+    /* end of list */
+#else
+#define CIPHERS_AES_NEON(X, Y)
+#endif
+
 /* Ciphers that we expect to pass this test. Blowfish and Arcfour are
  * intentionally omitted, because we already know they don't. */
 #define CIPHERS(X, Y) \
@@ -225,23 +250,19 @@ VOLATILE_WRAPPED_DEFN(static, size_t, looplimit, (size_t x))
     X(Y, ssh_des) \
     X(Y, ssh_des_sshcom_ssh2) \
     X(Y, ssh_aes256_sdctr) \
-    X(Y, ssh_aes256_sdctr_hw) \
-    X(Y, ssh_aes256_sdctr_sw) \
     X(Y, ssh_aes256_cbc) \
-    X(Y, ssh_aes256_cbc_hw) \
-    X(Y, ssh_aes256_cbc_sw) \
     X(Y, ssh_aes192_sdctr) \
-    X(Y, ssh_aes192_sdctr_hw) \
-    X(Y, ssh_aes192_sdctr_sw) \
     X(Y, ssh_aes192_cbc) \
-    X(Y, ssh_aes192_cbc_hw) \
-    X(Y, ssh_aes192_cbc_sw) \
     X(Y, ssh_aes128_sdctr) \
-    X(Y, ssh_aes128_sdctr_hw) \
-    X(Y, ssh_aes128_sdctr_sw) \
     X(Y, ssh_aes128_cbc) \
-    X(Y, ssh_aes128_cbc_hw) \
+    X(Y, ssh_aes256_sdctr_sw) \
+    X(Y, ssh_aes256_cbc_sw) \
+    X(Y, ssh_aes192_sdctr_sw) \
+    X(Y, ssh_aes192_cbc_sw) \
+    X(Y, ssh_aes128_sdctr_sw) \
     X(Y, ssh_aes128_cbc_sw) \
+    CIPHERS_AES_NI(X, Y) \
+    CIPHERS_AES_NEON(X, Y) \
     X(Y, ssh2_chacha20_poly1305) \
     /* end of list */
 
@@ -258,16 +279,35 @@ VOLATILE_WRAPPED_DEFN(static, size_t, looplimit, (size_t x))
 
 #define MAC_TESTLIST(X, name) X(mac_ ## name)
 
+#if HAVE_SHA_NI
+#define HASH_SHA_NI(X, Y) X(Y, ssh_sha256_ni) X(Y, ssh_sha1_ni)
+#else
+#define HASH_SHA_NI(X, Y)
+#endif
+#if HAVE_NEON_CRYPTO
+#define HASH_SHA_NEON(X, Y) X(Y, ssh_sha256_neon) X(Y, ssh_sha1_neon)
+#else
+#define HASH_SHA_NEON(X, Y)
+#endif
+#if HAVE_NEON_SHA512
+#define HASH_SHA512_NEON(X, Y) X(Y, ssh_sha384_neon) X(Y, ssh_sha512_neon)
+#else
+#define HASH_SHA512_NEON(X, Y)
+#endif
+
 #define HASHES(X, Y) \
     X(Y, ssh_md5) \
     X(Y, ssh_sha1) \
-    X(Y, ssh_sha1_hw) \
     X(Y, ssh_sha1_sw) \
     X(Y, ssh_sha256) \
-    X(Y, ssh_sha256_hw) \
     X(Y, ssh_sha256_sw) \
     X(Y, ssh_sha384) \
     X(Y, ssh_sha512) \
+    X(Y, ssh_sha384_sw) \
+    X(Y, ssh_sha512_sw) \
+    HASH_SHA_NI(X, Y) \
+    HASH_SHA_NEON(X, Y) \
+    HASH_SHA512_NEON(X, Y) \
     X(Y, ssh_sha3_224) \
     X(Y, ssh_sha3_256) \
     X(Y, ssh_sha3_384) \
diff --git a/unix/utils/arm_arch_queries.c b/unix/utils/arm_arch_queries.c
index 7c0957fa..cc3e4125 100644
--- a/unix/utils/arm_arch_queries.c
+++ b/unix/utils/arm_arch_queries.c
@@ -10,7 +10,7 @@
 
 #if defined __arm__ || defined __aarch64__
 
-bool platform_aes_hw_available(void)
+bool platform_aes_neon_available(void)
 {
 #if defined HWCAP_AES
     return getauxval(AT_HWCAP) & HWCAP_AES;
@@ -26,7 +26,7 @@ bool platform_aes_hw_available(void)
 #endif
 }
 
-bool platform_sha256_hw_available(void)
+bool platform_sha256_neon_available(void)
 {
 #if defined HWCAP_SHA2
     return getauxval(AT_HWCAP) & HWCAP_SHA2;
@@ -40,7 +40,7 @@ bool platform_sha256_hw_available(void)
 #endif
 }
 
-bool platform_sha1_hw_available(void)
+bool platform_sha1_neon_available(void)
 {
 #if defined HWCAP_SHA1
     return getauxval(AT_HWCAP) & HWCAP_SHA1;
@@ -54,7 +54,7 @@ bool platform_sha1_hw_available(void)
 #endif
 }
 
-bool platform_sha512_hw_available(void)
+bool platform_sha512_neon_available(void)
 {
 #if defined HWCAP_SHA512
     return getauxval(AT_HWCAP) & HWCAP_SHA512;
diff --git a/windows/utils/arm_arch_queries.c b/windows/utils/arm_arch_queries.c
index 05132b14..439a59fb 100644
--- a/windows/utils/arm_arch_queries.c
+++ b/windows/utils/arm_arch_queries.c
@@ -15,22 +15,22 @@
 #define IsProcessorFeaturePresent(...) false
 #endif
 
-bool platform_aes_hw_available(void)
+bool platform_aes_neon_available(void)
 {
     return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
 }
 
-bool platform_sha256_hw_available(void)
+bool platform_sha256_neon_available(void)
 {
     return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
 }
 
-bool platform_sha1_hw_available(void)
+bool platform_sha1_neon_available(void)
 {
     return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
 }
 
-bool platform_sha512_hw_available(void)
+bool platform_sha512_neon_available(void)
 {
     /* As of 2020-12-24, as far as I can tell from docs.microsoft.com,
     * Windows on Arm does not yet provide a PF_ARM_V8_* flag for the