
Break up crypto modules containing HW acceleration.

This applies to all of AES, SHA-1, SHA-256 and SHA-512. All those
source files previously contained multiple implementations of the
algorithm, enabled or disabled by ifdefs detecting whether they would
work on a given compiler. And in order to get advanced machine
instructions like AES-NI or NEON crypto into the output file when the
compile flags hadn't enabled them, we had to do nasty stuff with
compiler-specific pragmas or attributes.

Now we can do the detection at cmake time, and enable advanced
instructions in the more sensible way, by compile-time flags. So I've
broken up each of these modules into lots of sub-pieces: a file called
(e.g.) 'foo-common.c' containing common definitions across all
implementations (such as round constants), one called 'foo-select.c'
containing the top-level vtable(s), and a separate file for each
implementation exporting just the vtable(s) for that implementation.

One advantage of this is that it depends a lot less on compiler-
specific bodgery. My particular least favourite part of the previous
setup was the part where I had to _manually_ define some Arm ACLE
feature macros before including <arm_neon.h>, so that it would define
the intrinsics I wanted. Now that I'm enabling interesting architecture
features in the normal way, on the compiler command line, there's no
need for that kind of trick: the right feature macros are already
defined and <arm_neon.h> does the right thing.

Another change in this reorganisation is that I've stopped assuming
there's just one hardware implementation per platform. Previously, the
accelerated vtables were called things like sha256_hw, and varied
between FOO-NI and NEON depending on platform; and the selection code
would simply ask 'is hw available? if so, use hw, else sw'. Now, each
HW acceleration strategy names its vtable its own way, and the
selection vtable has a whole list of possibilities to iterate over
looking for a supported one. So if someone feels like writing a second
accelerated implementation of something for a given platform - for
example, I've heard you can use plain NEON to speed up AES somewhat
even without the crypto extension - then it will now have somewhere to
drop in alongside the existing ones.
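
As a rough sketch of the shape of that selection logic (simplified and
hypothetical - the real code lives in the new foo-select.c files below
and works in terms of the ssh_cipheralg vtables):

    /* Simplified sketch only; the names here are illustrative, not PuTTY's. */
    #include <stddef.h>

    typedef struct impl {
        const char *name;
        int (*available)(void);     /* run-time availability check */
    } impl;

    /* Walk a NULL-terminated, priority-ordered list and return the first
     * implementation whose availability check passes. The last entry is
     * expected to be the pure software one, which is always available. */
    const impl *select_impl(const impl *const *list)
    {
        for (size_t i = 0; list[i]; i++)
            if (list[i]->available())
                return list[i];
        return NULL;                 /* not reached if the list ends with sw */
    }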
Simon Tatham 2021-04-19 06:42:12 +01:00
parent 5b30e6f7a6
commit fca13a17b1
35 changed files with 3620 additions and 3701 deletions


@@ -40,3 +40,11 @@
#cmakedefine01 HAVE_SO_PEERCRED
#cmakedefine01 HAVE_PANGO_FONT_FAMILY_IS_MONOSPACE
#cmakedefine01 HAVE_PANGO_FONT_MAP_LIST_FAMILIES
#cmakedefine01 HAVE_AES_NI
#cmakedefine01 HAVE_SHA_NI
#cmakedefine01 HAVE_SHAINTRIN_H
#cmakedefine01 HAVE_NEON_CRYPTO
#cmakedefine01 HAVE_NEON_SHA512
#cmakedefine01 HAVE_NEON_SHA512_INTRINSICS
#cmakedefine01 USE_ARM64_NEON_H
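
(Aside: each #cmakedefine01 line is replaced at configure time by a
#define of the same name to exactly 0 or 1, so the C sources can test
the result with a plain #if. For example, a generated header on a
machine with AES-NI but without SHA-NI would contain something like
the following; the values shown are of course hypothetical.)

    #define HAVE_AES_NI 1
    #define HAVE_SHA_NI 0

    #if HAVE_AES_NI
    /* compile in / select the AES-NI vtables */
    #endif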


@@ -1,5 +1,7 @@
add_sources_from_current_dir(crypto
aes.c
aes-common.c
aes-select.c
aes-sw.c
arcfour.c
argon2.c
bcrypt.c
@@ -23,8 +25,181 @@ add_sources_from_current_dir(crypto
pubkey-ppk.c
pubkey-ssh1.c
rsa.c
sha256.c
sha512.c
sha256-common.c
sha256-select.c
sha256-sw.c
sha512-common.c
sha512-select.c
sha512-sw.c
sha3.c
sha1.c
sha1-common.c
sha1-select.c
sha1-sw.c
xdmauth.c)
include(CheckCSourceCompiles)
function(test_compile_with_flags outvar)
cmake_parse_arguments(OPT "" ""
"GNU_FLAGS;MSVC_FLAGS;ADD_SOURCES_IF_SUCCESSFUL;TEST_SOURCE" "${ARGN}")
# Figure out what flags are applicable to this compiler.
set(flags)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR
CMAKE_C_COMPILER_ID MATCHES "Clang")
set(flags ${OPT_GNU_FLAGS})
endif()
if(CMAKE_C_COMPILER_ID MATCHES "MSVC")
set(flags ${OPT_MSVC_FLAGS})
endif()
# See if we can compile the provided test program.
string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} ${flags})
check_c_source_compiles("${OPT_TEST_SOURCE}" "${outvar}")
if(${outvar} AND OPT_ADD_SOURCES_IF_SUCCESSFUL)
# Make an object library that compiles the implementation with the
# necessary flags, and add the resulting objects to the crypto
# library.
set(libname object_lib_${outvar})
add_library(${libname} OBJECT ${OPT_ADD_SOURCES_IF_SUCCESSFUL})
target_compile_options(${libname} PRIVATE ${flags})
target_sources(crypto PRIVATE $<TARGET_OBJECTS:${libname}>)
endif()
# Export the output to the caller's scope, so that further tests can
# be based on it.
set(${outvar} ${${outvar}} PARENT_SCOPE)
endfunction()
# ----------------------------------------------------------------------
# Try to enable x86 intrinsics-based crypto implementations.
test_compile_with_flags(HAVE_WMMINTRIN_H
GNU_FLAGS -msse4.1
TEST_SOURCE "
#include <wmmintrin.h>
#include <smmintrin.h>
volatile __m128i r, a, b;
int main(void) { r = _mm_xor_si128(a, b); }")
if(HAVE_WMMINTRIN_H)
test_compile_with_flags(HAVE_AES_NI
GNU_FLAGS -msse4.1 -maes
TEST_SOURCE "
#include <wmmintrin.h>
#include <smmintrin.h>
volatile __m128i r, a, b;
int main(void) { r = _mm_aesenc_si128(a, b); }"
ADD_SOURCES_IF_SUCCESSFUL aes-ni.c)
# shaintrin.h doesn't exist on all compilers; sometimes it's folded
# into the other headers
test_compile_with_flags(HAVE_SHAINTRIN_H
GNU_FLAGS -msse4.1 -msha
TEST_SOURCE "
#include <wmmintrin.h>
#include <smmintrin.h>
#include <immintrin.h>
#include <shaintrin.h>
volatile __m128i r, a, b;
int main(void) { r = _mm_xor_si128(a, b); }")
if(HAVE_SHAINTRIN_H)
set(include_shaintrin "#include <shaintrin.h>")
else()
set(include_shaintrin "")
endif()
test_compile_with_flags(HAVE_SHA_NI
GNU_FLAGS -msse4.1 -msha
TEST_SOURCE "
#include <wmmintrin.h>
#include <smmintrin.h>
#include <immintrin.h>
${include_shaintrin}
volatile __m128i r, a, b, c;
int main(void) { r = _mm_sha256rnds2_epu32(a, b, c); }"
ADD_SOURCES_IF_SUCCESSFUL sha256-ni.c sha1-ni.c)
endif()
# ----------------------------------------------------------------------
# Try to enable Arm Neon intrinsics-based crypto implementations.
# Start by checking which header file we need. ACLE specifies that it
# ought to be <arm_neon.h>, on both 32- and 64-bit Arm, but Visual
# Studio for some reason renamed the header to <arm64_neon.h> in
# 64-bit, and gives an error if you use the standard name. (However,
# clang-cl does let you use the standard name.)
test_compile_with_flags(HAVE_ARM_NEON_H
MSVC_FLAGS -D_ARM_USE_NEW_NEON_INTRINSICS
TEST_SOURCE "
#include <arm_neon.h>
volatile uint8x16_t r, a, b;
int main(void) { r = veorq_u8(a, b); }")
if(HAVE_ARM_NEON_H)
set(neon ON)
set(neon_header "arm_neon.h")
else()
test_compile_with_flags(HAVE_ARM64_NEON_H TEST_SOURCE "
#include <arm64_neon.h>
volatile uint8x16_t r, a, b;
int main(void) { r = veorq_u8(a, b); }")
if(HAVE_ARM64_NEON_H)
set(neon ON)
set(neon_header "arm64_neon.h")
set(USE_ARM64_NEON_H ON)
endif()
endif()
if(neon)
# If we have _some_ NEON header, look for the individual things we
# can enable with it.
# The 'crypto' architecture extension includes support for AES,
# SHA-1, and SHA-256.
test_compile_with_flags(HAVE_NEON_CRYPTO
GNU_FLAGS -march=armv8-a+crypto
MSVC_FLAGS -D_ARM_USE_NEW_NEON_INTRINSICS
TEST_SOURCE "
#include <${neon_header}>
volatile uint8x16_t r, a, b;
volatile uint32x4_t s, x, y, z;
int main(void) { r = vaeseq_u8(a, b); s = vsha256hq_u32(x, y, z); }"
ADD_SOURCES_IF_SUCCESSFUL aes-neon.c sha256-neon.c sha1-neon.c)
# The 'sha3' architecture extension, despite the name, includes
# support for SHA-512 (from the SHA-2 standard) as well as SHA-3
# proper.
#
# Versions of clang up to and including clang 12 support this
# extension in assembly language, but not the ACLE intrinsics for
# it. So we check both.
test_compile_with_flags(HAVE_NEON_SHA512_INTRINSICS
GNU_FLAGS -march=armv8.2-a+crypto+sha3
TEST_SOURCE "
#include <${neon_header}>
volatile uint64x2_t r, a, b;
int main(void) { r = vsha512su0q_u64(a, b); }"
ADD_SOURCES_IF_SUCCESSFUL sha512-neon.c)
if(HAVE_NEON_SHA512_INTRINSICS)
set(HAVE_NEON_SHA512 ON)
else()
test_compile_with_flags(HAVE_NEON_SHA512_ASM
GNU_FLAGS -march=armv8.2-a+crypto+sha3
TEST_SOURCE "
#include <${neon_header}>
volatile uint64x2_t r, a;
int main(void) { __asm__(\"sha512su0 %0.2D,%1.2D\" : \"+w\" (r) : \"w\" (a)); }"
ADD_SOURCES_IF_SUCCESSFUL sha512-neon.c)
if(HAVE_NEON_SHA512_ASM)
set(HAVE_NEON_SHA512 ON)
endif()
endif()
endif()
set(HAVE_AES_NI ${HAVE_AES_NI} PARENT_SCOPE)
set(HAVE_SHA_NI ${HAVE_SHA_NI} PARENT_SCOPE)
set(HAVE_SHAINTRIN_H ${HAVE_SHAINTRIN_H} PARENT_SCOPE)
set(HAVE_NEON_CRYPTO ${HAVE_NEON_CRYPTO} PARENT_SCOPE)
set(HAVE_NEON_SHA512 ${HAVE_NEON_SHA512} PARENT_SCOPE)
set(HAVE_NEON_SHA512_INTRINSICS ${HAVE_NEON_SHA512_INTRINSICS} PARENT_SCOPE)
set(USE_ARM64_NEON_H ${USE_ARM64_NEON_H} PARENT_SCOPE)

crypto/aes-common.c Normal file

@@ -0,0 +1,14 @@
/*
* Common variable definitions across all the AES implementations.
*/
#include "ssh.h"
#include "aes.h"
const uint8_t aes_key_setup_round_constants[10] = {
/* The first few powers of X in GF(2^8), used during key setup.
* This can safely be a lookup table without side channel risks,
* because key setup iterates through it once in a standard way
* regardless of the key. */
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36,
};
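
A standalone sketch (not part of the PuTTY source) showing where those
ten values come from: each entry is the previous one doubled in
GF(2^8), reducing by the AES polynomial 0x11b whenever the top bit is
shifted out, which is why 0x80 is followed by 0x1b.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* Generate the AES round constants: successive powers of x in
         * GF(2^8), reduced modulo x^8 + x^4 + x^3 + x + 1 (0x11b). */
        uint8_t rcon = 0x01;
        for (int i = 0; i < 10; i++) {
            printf("0x%02x%s", rcon, i == 9 ? "\n" : ", ");
            rcon = (uint8_t)((rcon << 1) ^ ((rcon & 0x80) ? 0x1b : 0x00));
        }
        return 0;   /* prints 0x01, 0x02, ..., 0x80, 0x1b, 0x36 */
    }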

crypto/aes-neon.c Normal file

@@ -0,0 +1,294 @@
/* ----------------------------------------------------------------------
* Hardware-accelerated implementation of AES using Arm NEON.
*/
#include "ssh.h"
#include "aes.h"
#if USE_ARM64_NEON_H
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
static bool aes_neon_available(void)
{
/*
* For Arm, we delegate to a per-platform AES detection function,
* because it has to be implemented by asking the operating system
* rather than directly querying the CPU.
*
* That's because Arm systems commonly have multiple cores that
* are not all alike, so any method of querying whether NEON
* crypto instructions work on the _current_ CPU - even one as
* crude as just trying one and catching the SIGILL - wouldn't
* give an answer that you could still rely on the first time the
* OS migrated your process to another CPU.
*/
return platform_aes_neon_available();
}
/*
* Core NEON encrypt/decrypt functions, one per length and direction.
*/
#define NEON_CIPHER(len, repmacro) \
static inline uint8x16_t aes_neon_##len##_e( \
uint8x16_t v, const uint8x16_t *keysched) \
{ \
repmacro(v = vaesmcq_u8(vaeseq_u8(v, *keysched++));); \
v = vaeseq_u8(v, *keysched++); \
return veorq_u8(v, *keysched); \
} \
static inline uint8x16_t aes_neon_##len##_d( \
uint8x16_t v, const uint8x16_t *keysched) \
{ \
repmacro(v = vaesimcq_u8(vaesdq_u8(v, *keysched++));); \
v = vaesdq_u8(v, *keysched++); \
return veorq_u8(v, *keysched); \
}
NEON_CIPHER(128, REP9)
NEON_CIPHER(192, REP11)
NEON_CIPHER(256, REP13)
/*
* The main key expansion.
*/
static void aes_neon_key_expand(
const unsigned char *key, size_t key_words,
uint8x16_t *keysched_e, uint8x16_t *keysched_d)
{
size_t rounds = key_words + 6;
size_t sched_words = (rounds + 1) * 4;
/*
* Store the key schedule as 32-bit integers during expansion, so
* that it's easy to refer back to individual previous words. We
* collect them into the final uint8x16_t form at the end.
*/
uint32_t sched[MAXROUNDKEYS * 4];
unsigned rconpos = 0;
for (size_t i = 0; i < sched_words; i++) {
if (i < key_words) {
sched[i] = GET_32BIT_LSB_FIRST(key + 4 * i);
} else {
uint32_t temp = sched[i - 1];
bool rotate_and_round_constant = (i % key_words == 0);
bool sub = rotate_and_round_constant ||
(key_words == 8 && i % 8 == 4);
if (rotate_and_round_constant)
temp = (temp << 24) | (temp >> 8);
if (sub) {
uint32x4_t v32 = vdupq_n_u32(temp);
uint8x16_t v8 = vreinterpretq_u8_u32(v32);
v8 = vaeseq_u8(v8, vdupq_n_u8(0));
v32 = vreinterpretq_u32_u8(v8);
temp = vget_lane_u32(vget_low_u32(v32), 0);
}
if (rotate_and_round_constant) {
assert(rconpos < lenof(aes_key_setup_round_constants));
temp ^= aes_key_setup_round_constants[rconpos++];
}
sched[i] = sched[i - key_words] ^ temp;
}
}
/*
* Combine the key schedule words into uint8x16_t vectors and
* store them in the output context.
*/
for (size_t round = 0; round <= rounds; round++)
keysched_e[round] = vreinterpretq_u8_u32(vld1q_u32(sched + 4*round));
smemclr(sched, sizeof(sched));
/*
* Now prepare the modified keys for the inverse cipher.
*/
for (size_t eround = 0; eround <= rounds; eround++) {
size_t dround = rounds - eround;
uint8x16_t rkey = keysched_e[eround];
if (eround && dround) /* neither first nor last */
rkey = vaesimcq_u8(rkey);
keysched_d[dround] = rkey;
}
}
/*
* Auxiliary routine to reverse the byte order of a vector, so that
* the SDCTR IV can be made big-endian for feeding to the cipher.
*
* In fact we don't need to reverse the vector _all_ the way; we leave
* the two lanes in MSW,LSW order, because that makes no difference to
* the efficiency of the increment. That way we only have to reverse
* bytes within each lane in this function.
*/
static inline uint8x16_t aes_neon_sdctr_reverse(uint8x16_t v)
{
return vrev64q_u8(v);
}
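/*
 * Concretely, on the little-endian targets this code supports: after
 * vrev64q_u8, lane 0 of the vector holds the most significant 64 bits
 * of the counter and lane 1 the least significant 64 bits, each as a
 * native integer. That is why the increment function below examines
 * the *high* lane to detect an imminent carry.
 */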
/*
* Auxiliary routine to increment the 128-bit counter used in SDCTR
* mode. There's no instruction to treat a 128-bit vector as a single
* long integer, so instead we have to increment the bottom half
* unconditionally, and the top half if the bottom half started off as
* all 1s (in which case there was about to be a carry).
*/
static inline uint8x16_t aes_neon_sdctr_increment(uint8x16_t in)
{
#ifdef __aarch64__
/* There will be a carry if the low 64 bits are all 1s. */
uint64x1_t all1 = vcreate_u64(0xFFFFFFFFFFFFFFFF);
uint64x1_t carry = vceq_u64(vget_high_u64(vreinterpretq_u64_u8(in)), all1);
/* Make a word whose bottom half is unconditionally all 1s, and
* the top half is 'carry', i.e. all 0s most of the time but all
* 1s if we need to increment the top half. Then that word is what
* we need to _subtract_ from the input counter. */
uint64x2_t subtrahend = vcombine_u64(carry, all1);
#else
/* AArch32 doesn't have comparisons that operate on a 64-bit lane,
* so we start by comparing each 32-bit half of the low 64 bits
* _separately_ to all-1s. */
uint32x2_t all1 = vdup_n_u32(0xFFFFFFFF);
uint32x2_t carry = vceq_u32(
vget_high_u32(vreinterpretq_u32_u8(in)), all1);
/* Swap the 32-bit words of the compare output, and AND with the
* unswapped version. Now carry is all 1s iff the bottom half of
* the input counter was all 1s, and all 0s otherwise. */
carry = vand_u32(carry, vrev64_u32(carry));
/* Now make the vector to subtract in the same way as above. */
uint64x2_t subtrahend = vreinterpretq_u64_u32(vcombine_u32(carry, all1));
#endif
return vreinterpretq_u8_u64(
vsubq_u64(vreinterpretq_u64_u8(in), subtrahend));
}
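
For comparison, a plain scalar equivalent of that increment (an
illustrative sketch, not code from this commit): subtracting all-1s,
i.e. -1, from the low half adds 1 to it, and the high half has -1
subtracted from it only when the low half was about to wrap.

    #include <stdint.h>

    /* hi/lo are the two 64-bit halves of the 128-bit counter. */
    void sdctr_increment_scalar(uint64_t *hi, uint64_t *lo)
    {
        uint64_t carry = (*lo == UINT64_MAX) ? UINT64_MAX : 0;
        *lo -= UINT64_MAX;   /* same as *lo + 1 (mod 2^64) */
        *hi -= carry;        /* same as *hi + 1 when the low half wrapped */
    }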
/*
* The SSH interface and the cipher modes.
*/
typedef struct aes_neon_context aes_neon_context;
struct aes_neon_context {
uint8x16_t keysched_e[MAXROUNDKEYS], keysched_d[MAXROUNDKEYS], iv;
ssh_cipher ciph;
};
static ssh_cipher *aes_neon_new(const ssh_cipheralg *alg)
{
const struct aes_extra *extra = (const struct aes_extra *)alg->extra;
if (!check_availability(extra))
return NULL;
aes_neon_context *ctx = snew(aes_neon_context);
ctx->ciph.vt = alg;
return &ctx->ciph;
}
static void aes_neon_free(ssh_cipher *ciph)
{
aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);
smemclr(ctx, sizeof(*ctx));
sfree(ctx);
}
static void aes_neon_setkey(ssh_cipher *ciph, const void *vkey)
{
aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);
const unsigned char *key = (const unsigned char *)vkey;
aes_neon_key_expand(key, ctx->ciph.vt->real_keybits / 32,
ctx->keysched_e, ctx->keysched_d);
}
static void aes_neon_setiv_cbc(ssh_cipher *ciph, const void *iv)
{
aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);
ctx->iv = vld1q_u8(iv);
}
static void aes_neon_setiv_sdctr(ssh_cipher *ciph, const void *iv)
{
aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);
uint8x16_t counter = vld1q_u8(iv);
ctx->iv = aes_neon_sdctr_reverse(counter);
}
typedef uint8x16_t (*aes_neon_fn)(uint8x16_t v, const uint8x16_t *keysched);
static inline void aes_cbc_neon_encrypt(
ssh_cipher *ciph, void *vblk, int blklen, aes_neon_fn encrypt)
{
aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);
for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen;
blk < finish; blk += 16) {
uint8x16_t plaintext = vld1q_u8(blk);
uint8x16_t cipher_input = veorq_u8(plaintext, ctx->iv);
uint8x16_t ciphertext = encrypt(cipher_input, ctx->keysched_e);
vst1q_u8(blk, ciphertext);
ctx->iv = ciphertext;
}
}
static inline void aes_cbc_neon_decrypt(
ssh_cipher *ciph, void *vblk, int blklen, aes_neon_fn decrypt)
{
aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);
for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen;
blk < finish; blk += 16) {
uint8x16_t ciphertext = vld1q_u8(blk);
uint8x16_t decrypted = decrypt(ciphertext, ctx->keysched_d);
uint8x16_t plaintext = veorq_u8(decrypted, ctx->iv);
vst1q_u8(blk, plaintext);
ctx->iv = ciphertext;
}
}
static inline void aes_sdctr_neon(
ssh_cipher *ciph, void *vblk, int blklen, aes_neon_fn encrypt)
{
aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);
for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen;
blk < finish; blk += 16) {
uint8x16_t counter = aes_neon_sdctr_reverse(ctx->iv);
uint8x16_t keystream = encrypt(counter, ctx->keysched_e);
uint8x16_t input = vld1q_u8(blk);
uint8x16_t output = veorq_u8(input, keystream);
vst1q_u8(blk, output);
ctx->iv = aes_neon_sdctr_increment(ctx->iv);
}
}
#define NEON_ENC_DEC(len) \
static void aes##len##_neon_cbc_encrypt( \
ssh_cipher *ciph, void *vblk, int blklen) \
{ aes_cbc_neon_encrypt(ciph, vblk, blklen, aes_neon_##len##_e); } \
static void aes##len##_neon_cbc_decrypt( \
ssh_cipher *ciph, void *vblk, int blklen) \
{ aes_cbc_neon_decrypt(ciph, vblk, blklen, aes_neon_##len##_d); } \
static void aes##len##_neon_sdctr( \
ssh_cipher *ciph, void *vblk, int blklen) \
{ aes_sdctr_neon(ciph, vblk, blklen, aes_neon_##len##_e); } \
NEON_ENC_DEC(128)
NEON_ENC_DEC(192)
NEON_ENC_DEC(256)
AES_EXTRA(_neon);
AES_ALL_VTABLES(_neon, "NEON accelerated");

crypto/aes-ni.c Normal file

@@ -0,0 +1,281 @@
/*
* Hardware-accelerated implementation of AES using x86 AES-NI.
*/
#include "ssh.h"
#include "aes.h"
#include <wmmintrin.h>
#include <smmintrin.h>
#if defined(__clang__) || defined(__GNUC__)
#include <cpuid.h>
#define GET_CPU_ID(out) __cpuid(1, (out)[0], (out)[1], (out)[2], (out)[3])
#else
#define GET_CPU_ID(out) __cpuid(out, 1)
#endif
static bool aes_ni_available(void)
{
/*
* Determine if AES is available on this CPU, by checking that
* both AES itself and SSE4.1 are supported.
*/
unsigned int CPUInfo[4];
GET_CPU_ID(CPUInfo);
    /* CPUID leaf 1: ECX bit 25 = AES-NI, ECX bit 19 = SSE4.1. */
    return (CPUInfo[2] & (1 << 25)) && (CPUInfo[2] & (1 << 19));
}
/*
* Core AES-NI encrypt/decrypt functions, one per length and direction.
*/
#define NI_CIPHER(len, dir, dirlong, repmacro) \
static inline __m128i aes_ni_##len##_##dir( \
__m128i v, const __m128i *keysched) \
{ \
v = _mm_xor_si128(v, *keysched++); \
repmacro(v = _mm_aes##dirlong##_si128(v, *keysched++);); \
return _mm_aes##dirlong##last_si128(v, *keysched); \
}
NI_CIPHER(128, e, enc, REP9)
NI_CIPHER(128, d, dec, REP9)
NI_CIPHER(192, e, enc, REP11)
NI_CIPHER(192, d, dec, REP11)
NI_CIPHER(256, e, enc, REP13)
NI_CIPHER(256, d, dec, REP13)
/*
* The main key expansion.
*/
static void aes_ni_key_expand(
const unsigned char *key, size_t key_words,
__m128i *keysched_e, __m128i *keysched_d)
{
size_t rounds = key_words + 6;
size_t sched_words = (rounds + 1) * 4;
/*
* Store the key schedule as 32-bit integers during expansion, so
* that it's easy to refer back to individual previous words. We
* collect them into the final __m128i form at the end.
*/
uint32_t sched[MAXROUNDKEYS * 4];
unsigned rconpos = 0;
for (size_t i = 0; i < sched_words; i++) {
if (i < key_words) {
sched[i] = GET_32BIT_LSB_FIRST(key + 4 * i);
} else {
uint32_t temp = sched[i - 1];
bool rotate_and_round_constant = (i % key_words == 0);
bool only_sub = (key_words == 8 && i % 8 == 4);
if (rotate_and_round_constant) {
__m128i v = _mm_setr_epi32(0,temp,0,0);
v = _mm_aeskeygenassist_si128(v, 0);
temp = _mm_extract_epi32(v, 1);
assert(rconpos < lenof(aes_key_setup_round_constants));
temp ^= aes_key_setup_round_constants[rconpos++];
} else if (only_sub) {
__m128i v = _mm_setr_epi32(0,temp,0,0);
v = _mm_aeskeygenassist_si128(v, 0);
temp = _mm_extract_epi32(v, 0);
}
sched[i] = sched[i - key_words] ^ temp;
}
}
/*
* Combine the key schedule words into __m128i vectors and store
* them in the output context.
*/
for (size_t round = 0; round <= rounds; round++)
keysched_e[round] = _mm_setr_epi32(
sched[4*round ], sched[4*round+1],
sched[4*round+2], sched[4*round+3]);
smemclr(sched, sizeof(sched));
/*
* Now prepare the modified keys for the inverse cipher.
*/
for (size_t eround = 0; eround <= rounds; eround++) {
size_t dround = rounds - eround;
__m128i rkey = keysched_e[eround];
if (eround && dround) /* neither first nor last */
rkey = _mm_aesimc_si128(rkey);
keysched_d[dround] = rkey;
}
}
/*
* Auxiliary routine to increment the 128-bit counter used in SDCTR
* mode.
*/
static inline __m128i aes_ni_sdctr_increment(__m128i v)
{
const __m128i ONE = _mm_setr_epi32(1,0,0,0);
const __m128i ZERO = _mm_setzero_si128();
/* Increment the low-order 64 bits of v */
v = _mm_add_epi64(v, ONE);
/* Check if they've become zero */
__m128i cmp = _mm_cmpeq_epi64(v, ZERO);
/* If so, the low half of cmp is all 1s. Pack that into the high
* half of addend with zero in the low half. */
__m128i addend = _mm_unpacklo_epi64(ZERO, cmp);
/* And subtract that from v, which increments the high 64 bits iff
* the low 64 wrapped round. */
v = _mm_sub_epi64(v, addend);
return v;
}
/*
* Auxiliary routine to reverse the byte order of a vector, so that
* the SDCTR IV can be made big-endian for feeding to the cipher.
*/
static inline __m128i aes_ni_sdctr_reverse(__m128i v)
{
v = _mm_shuffle_epi8(
v, _mm_setr_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0));
return v;
}
/*
* The SSH interface and the cipher modes.
*/
typedef struct aes_ni_context aes_ni_context;
struct aes_ni_context {
__m128i keysched_e[MAXROUNDKEYS], keysched_d[MAXROUNDKEYS], iv;
void *pointer_to_free;
ssh_cipher ciph;
};
static ssh_cipher *aes_ni_new(const ssh_cipheralg *alg)
{
const struct aes_extra *extra = (const struct aes_extra *)alg->extra;
if (!check_availability(extra))
return NULL;
/*
* The __m128i variables in the context structure need to be
* 16-byte aligned, but not all malloc implementations that this
* code has to work with will guarantee to return a 16-byte
* aligned pointer. So we over-allocate, manually realign the
* pointer ourselves, and store the original one inside the
* context so we know how to free it later.
*/
void *allocation = smalloc(sizeof(aes_ni_context) + 15);
uintptr_t alloc_address = (uintptr_t)allocation;
uintptr_t aligned_address = (alloc_address + 15) & ~15;
aes_ni_context *ctx = (aes_ni_context *)aligned_address;
ctx->ciph.vt = alg;
ctx->pointer_to_free = allocation;
return &ctx->ciph;
}
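/*
 * Worked example of the align-up arithmetic above: if smalloc()
 * returned 0x1003, then 0x1003 + 15 = 0x1012 and 0x1012 & ~15 =
 * 0x1010, the next 16-byte boundary at or above the allocation; an
 * already-aligned address such as 0x1010 maps to itself. The 15 extra
 * bytes requested guarantee that the aligned pointer still has a whole
 * aes_ni_context's worth of space after it.
 */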
static void aes_ni_free(ssh_cipher *ciph)
{
aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph);
void *allocation = ctx->pointer_to_free;
smemclr(ctx, sizeof(*ctx));
sfree(allocation);
}
static void aes_ni_setkey(ssh_cipher *ciph, const void *vkey)
{
aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph);
const unsigned char *key = (const unsigned char *)vkey;
aes_ni_key_expand(key, ctx->ciph.vt->real_keybits / 32,
ctx->keysched_e, ctx->keysched_d);
}
static void aes_ni_setiv_cbc(ssh_cipher *ciph, const void *iv)
{
aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph);
ctx->iv = _mm_loadu_si128(iv);
}
static void aes_ni_setiv_sdctr(ssh_cipher *ciph, const void *iv)
{
aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph);
__m128i counter = _mm_loadu_si128(iv);
ctx->iv = aes_ni_sdctr_reverse(counter);
}
typedef __m128i (*aes_ni_fn)(__m128i v, const __m128i *keysched);
static inline void aes_cbc_ni_encrypt(
ssh_cipher *ciph, void *vblk, int blklen, aes_ni_fn encrypt)
{
aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph);
for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen;
blk < finish; blk += 16) {
__m128i plaintext = _mm_loadu_si128((const __m128i *)blk);
__m128i cipher_input = _mm_xor_si128(plaintext, ctx->iv);
__m128i ciphertext = encrypt(cipher_input, ctx->keysched_e);
_mm_storeu_si128((__m128i *)blk, ciphertext);
ctx->iv = ciphertext;
}
}
static inline void aes_cbc_ni_decrypt(
ssh_cipher *ciph, void *vblk, int blklen, aes_ni_fn decrypt)
{
aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph);
for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen;
blk < finish; blk += 16) {
__m128i ciphertext = _mm_loadu_si128((const __m128i *)blk);
__m128i decrypted = decrypt(ciphertext, ctx->keysched_d);
__m128i plaintext = _mm_xor_si128(decrypted, ctx->iv);
_mm_storeu_si128((__m128i *)blk, plaintext);
ctx->iv = ciphertext;
}
}
static inline void aes_sdctr_ni(
ssh_cipher *ciph, void *vblk, int blklen, aes_ni_fn encrypt)
{
aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph);
for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen;
blk < finish; blk += 16) {
__m128i counter = aes_ni_sdctr_reverse(ctx->iv);
__m128i keystream = encrypt(counter, ctx->keysched_e);
__m128i input = _mm_loadu_si128((const __m128i *)blk);
__m128i output = _mm_xor_si128(input, keystream);
_mm_storeu_si128((__m128i *)blk, output);
ctx->iv = aes_ni_sdctr_increment(ctx->iv);
}
}
#define NI_ENC_DEC(len) \
static void aes##len##_ni_cbc_encrypt( \
ssh_cipher *ciph, void *vblk, int blklen) \
{ aes_cbc_ni_encrypt(ciph, vblk, blklen, aes_ni_##len##_e); } \
static void aes##len##_ni_cbc_decrypt( \
ssh_cipher *ciph, void *vblk, int blklen) \
{ aes_cbc_ni_decrypt(ciph, vblk, blklen, aes_ni_##len##_d); } \
static void aes##len##_ni_sdctr( \
ssh_cipher *ciph, void *vblk, int blklen) \
{ aes_sdctr_ni(ciph, vblk, blklen, aes_ni_##len##_e); } \
NI_ENC_DEC(128)
NI_ENC_DEC(192)
NI_ENC_DEC(256)
AES_EXTRA(_ni);
AES_ALL_VTABLES(_ni, "AES-NI accelerated");

crypto/aes-select.c Normal file

@@ -0,0 +1,89 @@
/*
* Top-level vtables to select an AES implementation.
*/
#include <assert.h>
#include <stdlib.h>
#include "putty.h"
#include "ssh.h"
#include "aes.h"
static ssh_cipher *aes_select(const ssh_cipheralg *alg)
{
const ssh_cipheralg *const *real_algs = (const ssh_cipheralg **)alg->extra;
for (size_t i = 0; real_algs[i]; i++) {
const ssh_cipheralg *alg = real_algs[i];
const struct aes_extra *alg_extra =
(const struct aes_extra *)alg->extra;
if (check_availability(alg_extra))
return ssh_cipher_new(alg);
}
/* We should never reach the NULL at the end of the list, because
* the last non-NULL entry should be software-only AES, which is
* always available. */
unreachable("aes_select ran off the end of its list");
}
#if HAVE_AES_NI
#define IF_NI(...) __VA_ARGS__
#else
#define IF_NI(...)
#endif
#if HAVE_NEON_CRYPTO
#define IF_NEON(...) __VA_ARGS__
#else
#define IF_NEON(...)
#endif
#define AES_SELECTOR_VTABLE(mode_c, mode_protocol, mode_display, bits) \
static const ssh_cipheralg * \
ssh_aes ## bits ## _ ## mode_c ## _impls[] = { \
IF_NI(&ssh_aes ## bits ## _ ## mode_c ## _ni,) \
IF_NEON(&ssh_aes ## bits ## _ ## mode_c ## _neon,) \
&ssh_aes ## bits ## _ ## mode_c ## _sw, \
NULL, \
}; \
const ssh_cipheralg ssh_aes ## bits ## _ ## mode_c = { \
.new = aes_select, \
.ssh2_id = "aes" #bits "-" mode_protocol, \
.blksize = 16, \
.real_keybits = bits, \
.padded_keybytes = bits/8, \
.text_name = "AES-" #bits " " mode_display \
" (dummy selector vtable)", \
.extra = ssh_aes ## bits ## _ ## mode_c ## _impls, \
}
AES_SELECTOR_VTABLE(cbc, "cbc", "CBC", 128);
AES_SELECTOR_VTABLE(cbc, "cbc", "CBC", 192);
AES_SELECTOR_VTABLE(cbc, "cbc", "CBC", 256);
AES_SELECTOR_VTABLE(sdctr, "ctr", "SDCTR", 128);
AES_SELECTOR_VTABLE(sdctr, "ctr", "SDCTR", 192);
AES_SELECTOR_VTABLE(sdctr, "ctr", "SDCTR", 256);
static const ssh_cipheralg ssh_rijndael_lysator = {
/* Same as aes256_cbc, but with a different protocol ID */
.new = aes_select,
.ssh2_id = "rijndael-cbc@lysator.liu.se",
.blksize = 16,
.real_keybits = 256,
.padded_keybytes = 256/8,
.text_name = "AES-256 CBC (dummy selector vtable)",
.extra = ssh_aes256_cbc_impls,
};
static const ssh_cipheralg *const aes_list[] = {
&ssh_aes256_sdctr,
&ssh_aes256_cbc,
&ssh_rijndael_lysator,
&ssh_aes192_sdctr,
&ssh_aes192_cbc,
&ssh_aes128_sdctr,
&ssh_aes128_cbc,
};
const ssh2_ciphers ssh2_aes = { lenof(aes_list), aes_list };


@@ -1,247 +1,4 @@
/*
* Implementation of AES.
*/
#include <assert.h>
#include <stdlib.h>
#include "ssh.h"
#include "mpint_i.h" /* we reuse the BignumInt system */
/*
* Start by deciding whether we can support hardware AES at all.
*/
#define HW_AES_NONE 0
#define HW_AES_NI 1
#define HW_AES_NEON 2
#ifdef _FORCE_AES_NI
# define HW_AES HW_AES_NI
#elif defined(__clang__)
# if __has_attribute(target) && __has_include(<wmmintrin.h>) && \
(defined(__x86_64__) || defined(__i386))
# define HW_AES HW_AES_NI
# endif
#elif defined(__GNUC__)
# if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)) && \
(defined(__x86_64__) || defined(__i386))
# define HW_AES HW_AES_NI
# endif
#elif defined (_MSC_VER)
# if (defined(_M_X64) || defined(_M_IX86)) && _MSC_FULL_VER >= 150030729
# define HW_AES HW_AES_NI
# endif
#endif
#ifdef _FORCE_AES_NEON
# define HW_AES HW_AES_NEON
#elif defined __BYTE_ORDER__ && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
/* Arm can potentially support both endiannesses, but this code
* hasn't been tested on anything but little. If anyone wants to
* run big-endian, they'll need to fix it first. */
#elif defined __ARM_FEATURE_CRYPTO
/* If the Arm crypto extension is available already, we can
* support NEON AES without having to enable anything by hand */
# define HW_AES HW_AES_NEON
#elif defined(__clang__)
# if __has_attribute(target) && __has_include(<arm_neon.h>) && \
(defined(__aarch64__))
/* clang can enable the crypto extension in AArch64 using
* __attribute__((target)) */
# define HW_AES HW_AES_NEON
# define USE_CLANG_ATTR_TARGET_AARCH64
# endif
#elif defined _MSC_VER
# if defined _M_ARM64
# define HW_AES HW_AES_NEON
/* 64-bit Visual Studio uses the header <arm64_neon.h> in place
* of the standard <arm_neon.h> */
# define USE_ARM64_NEON_H
# elif defined _M_ARM
# define HW_AES HW_AES_NEON
/* 32-bit Visual Studio uses the right header name, but requires
* this #define to enable a set of intrinsic definitions that
* do not omit one of the parameters for vaes[ed]q_u8 */
# define _ARM_USE_NEW_NEON_INTRINSICS
# endif
#endif
#if defined _FORCE_SOFTWARE_AES || !defined HW_AES
# undef HW_AES
# define HW_AES HW_AES_NONE
#endif
#if HW_AES == HW_AES_NI
#define HW_NAME_SUFFIX " (AES-NI accelerated)"
#elif HW_AES == HW_AES_NEON
#define HW_NAME_SUFFIX " (NEON accelerated)"
#else
#define HW_NAME_SUFFIX " (!NONEXISTENT ACCELERATED VERSION!)"
#endif
/*
* Vtable collection for AES. For each SSH-level cipher id (i.e.
* combination of key length and cipher mode), we provide three
* vtables: one for the pure software implementation, one using
* hardware acceleration (if available), and a top-level one which is
* never actually instantiated, and only contains a new() method whose
* job is to decide which of the other two to return an actual
* instance of.
*/
static ssh_cipher *aes_select(const ssh_cipheralg *alg);
static ssh_cipher *aes_sw_new(const ssh_cipheralg *alg);
static void aes_sw_free(ssh_cipher *);
static void aes_sw_setiv_cbc(ssh_cipher *, const void *iv);
static void aes_sw_setiv_sdctr(ssh_cipher *, const void *iv);
static void aes_sw_setkey(ssh_cipher *, const void *key);
static ssh_cipher *aes_hw_new(const ssh_cipheralg *alg);
static void aes_hw_free(ssh_cipher *);
static void aes_hw_setiv_cbc(ssh_cipher *, const void *iv);
static void aes_hw_setiv_sdctr(ssh_cipher *, const void *iv);
static void aes_hw_setkey(ssh_cipher *, const void *key);
struct aes_extra {
const ssh_cipheralg *sw, *hw;
};
#define VTABLES_INNER(cid, pid, bits, name, encsuffix, \
decsuffix, setivsuffix, flagsval) \
static void cid##_sw##encsuffix(ssh_cipher *, void *blk, int len); \
static void cid##_sw##decsuffix(ssh_cipher *, void *blk, int len); \
const ssh_cipheralg ssh_##cid##_sw = { \
.new = aes_sw_new, \
.free = aes_sw_free, \
.setiv = aes_sw_##setivsuffix, \
.setkey = aes_sw_setkey, \
.encrypt = cid##_sw##encsuffix, \
.decrypt = cid##_sw##decsuffix, \
.ssh2_id = pid, \
.blksize = 16, \
.real_keybits = bits, \
.padded_keybytes = bits/8, \
.flags = flagsval, \
.text_name = name " (unaccelerated)", \
}; \
\
static void cid##_hw##encsuffix(ssh_cipher *, void *blk, int len); \
static void cid##_hw##decsuffix(ssh_cipher *, void *blk, int len); \
const ssh_cipheralg ssh_##cid##_hw = { \
.new = aes_hw_new, \
.free = aes_hw_free, \
.setiv = aes_hw_##setivsuffix, \
.setkey = aes_hw_setkey, \
.encrypt = cid##_hw##encsuffix, \
.decrypt = cid##_hw##decsuffix, \
.ssh2_id = pid, \
.blksize = 16, \
.real_keybits = bits, \
.padded_keybytes = bits/8, \
.flags = flagsval, \
.text_name = name HW_NAME_SUFFIX, \
}; \
\
static const struct aes_extra extra_##cid = { \
&ssh_##cid##_sw, &ssh_##cid##_hw }; \
\
const ssh_cipheralg ssh_##cid = { \
.new = aes_select, \
.ssh2_id = pid, \
.blksize = 16, \
.real_keybits = bits, \
.padded_keybytes = bits/8, \
.flags = flagsval, \
.text_name = name " (dummy selector vtable)", \
.extra = &extra_##cid \
}; \
#define VTABLES(keylen) \
VTABLES_INNER(aes ## keylen ## _cbc, "aes" #keylen "-cbc", \
keylen, "AES-" #keylen " CBC", _encrypt, _decrypt, \
setiv_cbc, SSH_CIPHER_IS_CBC) \
VTABLES_INNER(aes ## keylen ## _sdctr, "aes" #keylen "-ctr", \
keylen, "AES-" #keylen " SDCTR",,, setiv_sdctr, 0)
VTABLES(128)
VTABLES(192)
VTABLES(256)
static const ssh_cipheralg ssh_rijndael_lysator = {
/* Same as aes256_cbc, but with a different protocol ID */
.new = aes_select,
.ssh2_id = "rijndael-cbc@lysator.liu.se",
.blksize = 16,
.real_keybits = 256,
.padded_keybytes = 256/8,
.flags = 0,
.text_name = "AES-256 CBC (dummy selector vtable)",
.extra = &extra_aes256_cbc,
};
static const ssh_cipheralg *const aes_list[] = {
&ssh_aes256_sdctr,
&ssh_aes256_cbc,
&ssh_rijndael_lysator,
&ssh_aes192_sdctr,
&ssh_aes192_cbc,
&ssh_aes128_sdctr,
&ssh_aes128_cbc,
};
const ssh2_ciphers ssh2_aes = { lenof(aes_list), aes_list };
/*
* The actual query function that asks if hardware acceleration is
* available.
*/
static bool aes_hw_available(void);
/*
* The top-level selection function, caching the results of
* aes_hw_available() so it only has to run once.
*/
static bool aes_hw_available_cached(void)
{
static bool initialised = false;
static bool hw_available;
if (!initialised) {
hw_available = aes_hw_available();
initialised = true;
}
return hw_available;
}
static ssh_cipher *aes_select(const ssh_cipheralg *alg)
{
const struct aes_extra *extra = (const struct aes_extra *)alg->extra;
const ssh_cipheralg *real_alg =
aes_hw_available_cached() ? extra->hw : extra->sw;
return ssh_cipher_new(real_alg);
}
/* ----------------------------------------------------------------------
* Definitions likely to be helpful to multiple implementations.
*/
#define REP2(x) x x
#define REP4(x) REP2(REP2(x))
#define REP8(x) REP2(REP4(x))
#define REP9(x) REP8(x) x
#define REP11(x) REP8(x) REP2(x) x
#define REP13(x) REP8(x) REP4(x) x
static const uint8_t key_setup_round_constants[] = {
/* The first few powers of X in GF(2^8), used during key setup.
* This can safely be a lookup table without side channel risks,
* because key setup iterates through it once in a standard way
* regardless of the key. */
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36,
};
#define MAXROUNDKEYS 15
/* ----------------------------------------------------------------------
* Software implementation of AES.
*
* This implementation uses a bit-sliced representation. Instead of
@@ -257,6 +14,16 @@ static const uint8_t key_setup_round_constants[] = {
* ops you get 64 S-box lookups, not just one.
*/
#include "ssh.h"
#include "aes.h"
#include "mpint_i.h" /* we reuse the BignumInt system */
static bool aes_sw_available(void)
{
/* Software AES is always available */
return true;
}
#define SLICE_PARALLELISM (BIGNUM_INT_BYTES / 2)
#ifdef BITSLICED_DEBUG
@@ -922,8 +689,8 @@ static void aes_sliced_key_setup(
}
if (rotate_and_round_constant) {
assert(rconpos < lenof(key_setup_round_constants));
uint8_t rcon = key_setup_round_constants[rconpos++];
assert(rconpos < lenof(aes_key_setup_round_constants));
uint8_t rcon = aes_key_setup_round_constants[rconpos++];
for (size_t i = 0; i < 8; i++)
slices[i] ^= 1 & (rcon >> i);
}
@@ -1255,13 +1022,13 @@ static inline void aes_sdctr_sw(
}
#define SW_ENC_DEC(len) \
static void aes##len##_cbc_sw_encrypt( \
static void aes##len##_sw_cbc_encrypt( \
ssh_cipher *ciph, void *vblk, int blklen) \
{ aes_cbc_sw_encrypt(ciph, vblk, blklen); } \
static void aes##len##_cbc_sw_decrypt( \
static void aes##len##_sw_cbc_decrypt( \
ssh_cipher *ciph, void *vblk, int blklen) \
{ aes_cbc_sw_decrypt(ciph, vblk, blklen); } \
static void aes##len##_sdctr_sw( \
static void aes##len##_sw_sdctr( \
ssh_cipher *ciph, void *vblk, int blklen) \
{ aes_sdctr_sw(ciph, vblk, blklen); }
@@ -1269,644 +1036,5 @@ SW_ENC_DEC(128)
SW_ENC_DEC(192)
SW_ENC_DEC(256)
/* ----------------------------------------------------------------------
* Hardware-accelerated implementation of AES using x86 AES-NI.
*/
#if HW_AES == HW_AES_NI
/*
* Set target architecture for Clang and GCC
*/
#if !defined(__clang__) && defined(__GNUC__)
# pragma GCC target("aes")
# pragma GCC target("sse4.1")
#endif
#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))
# define FUNC_ISA __attribute__ ((target("sse4.1,aes")))
#else
# define FUNC_ISA
#endif
#include <wmmintrin.h>
#include <smmintrin.h>
#if defined(__clang__) || defined(__GNUC__)
#include <cpuid.h>
#define GET_CPU_ID(out) __cpuid(1, (out)[0], (out)[1], (out)[2], (out)[3])
#else
#define GET_CPU_ID(out) __cpuid(out, 1)
#endif
bool aes_hw_available(void)
{
/*
* Determine if AES is available on this CPU, by checking that
* both AES itself and SSE4.1 are supported.
*/
unsigned int CPUInfo[4];
GET_CPU_ID(CPUInfo);
return (CPUInfo[2] & (1 << 25)) && (CPUInfo[2] & (1 << 19));
}
/*
* Core AES-NI encrypt/decrypt functions, one per length and direction.
*/
#define NI_CIPHER(len, dir, dirlong, repmacro) \
static FUNC_ISA inline __m128i aes_ni_##len##_##dir( \
__m128i v, const __m128i *keysched) \
{ \
v = _mm_xor_si128(v, *keysched++); \
repmacro(v = _mm_aes##dirlong##_si128(v, *keysched++);); \
return _mm_aes##dirlong##last_si128(v, *keysched); \
}
NI_CIPHER(128, e, enc, REP9)
NI_CIPHER(128, d, dec, REP9)
NI_CIPHER(192, e, enc, REP11)
NI_CIPHER(192, d, dec, REP11)
NI_CIPHER(256, e, enc, REP13)
NI_CIPHER(256, d, dec, REP13)
/*
* The main key expansion.
*/
static FUNC_ISA void aes_ni_key_expand(
const unsigned char *key, size_t key_words,
__m128i *keysched_e, __m128i *keysched_d)
{
size_t rounds = key_words + 6;
size_t sched_words = (rounds + 1) * 4;
/*
* Store the key schedule as 32-bit integers during expansion, so
* that it's easy to refer back to individual previous words. We
* collect them into the final __m128i form at the end.
*/
uint32_t sched[MAXROUNDKEYS * 4];
unsigned rconpos = 0;
for (size_t i = 0; i < sched_words; i++) {
if (i < key_words) {
sched[i] = GET_32BIT_LSB_FIRST(key + 4 * i);
} else {
uint32_t temp = sched[i - 1];
bool rotate_and_round_constant = (i % key_words == 0);
bool only_sub = (key_words == 8 && i % 8 == 4);
if (rotate_and_round_constant) {
__m128i v = _mm_setr_epi32(0,temp,0,0);
v = _mm_aeskeygenassist_si128(v, 0);
temp = _mm_extract_epi32(v, 1);
assert(rconpos < lenof(key_setup_round_constants));
temp ^= key_setup_round_constants[rconpos++];
} else if (only_sub) {
__m128i v = _mm_setr_epi32(0,temp,0,0);
v = _mm_aeskeygenassist_si128(v, 0);
temp = _mm_extract_epi32(v, 0);
}
sched[i] = sched[i - key_words] ^ temp;
}
}
/*
* Combine the key schedule words into __m128i vectors and store
* them in the output context.
*/
for (size_t round = 0; round <= rounds; round++)
keysched_e[round] = _mm_setr_epi32(
sched[4*round ], sched[4*round+1],
sched[4*round+2], sched[4*round+3]);
smemclr(sched, sizeof(sched));
/*
* Now prepare the modified keys for the inverse cipher.
*/
for (size_t eround = 0; eround <= rounds; eround++) {
size_t dround = rounds - eround;
__m128i rkey = keysched_e[eround];
if (eround && dround) /* neither first nor last */
rkey = _mm_aesimc_si128(rkey);
keysched_d[dround] = rkey;
}
}
/*
* Auxiliary routine to increment the 128-bit counter used in SDCTR
* mode.
*/
static FUNC_ISA inline __m128i aes_ni_sdctr_increment(__m128i v)
{
const __m128i ONE = _mm_setr_epi32(1,0,0,0);
const __m128i ZERO = _mm_setzero_si128();
/* Increment the low-order 64 bits of v */
v = _mm_add_epi64(v, ONE);
/* Check if they've become zero */
__m128i cmp = _mm_cmpeq_epi64(v, ZERO);
/* If so, the low half of cmp is all 1s. Pack that into the high
* half of addend with zero in the low half. */
__m128i addend = _mm_unpacklo_epi64(ZERO, cmp);
/* And subtract that from v, which increments the high 64 bits iff
* the low 64 wrapped round. */
v = _mm_sub_epi64(v, addend);
return v;
}
/*
* Auxiliary routine to reverse the byte order of a vector, so that
* the SDCTR IV can be made big-endian for feeding to the cipher.
*/
static FUNC_ISA inline __m128i aes_ni_sdctr_reverse(__m128i v)
{
v = _mm_shuffle_epi8(
v, _mm_setr_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0));
return v;
}
/*
* The SSH interface and the cipher modes.
*/
typedef struct aes_ni_context aes_ni_context;
struct aes_ni_context {
__m128i keysched_e[MAXROUNDKEYS], keysched_d[MAXROUNDKEYS], iv;
void *pointer_to_free;
ssh_cipher ciph;
};
static ssh_cipher *aes_hw_new(const ssh_cipheralg *alg)
{
if (!aes_hw_available_cached())
return NULL;
/*
* The __m128i variables in the context structure need to be
* 16-byte aligned, but not all malloc implementations that this
* code has to work with will guarantee to return a 16-byte
* aligned pointer. So we over-allocate, manually realign the
* pointer ourselves, and store the original one inside the
* context so we know how to free it later.
*/
void *allocation = smalloc(sizeof(aes_ni_context) + 15);
uintptr_t alloc_address = (uintptr_t)allocation;
uintptr_t aligned_address = (alloc_address + 15) & ~15;
aes_ni_context *ctx = (aes_ni_context *)aligned_address;
ctx->ciph.vt = alg;
ctx->pointer_to_free = allocation;
return &ctx->ciph;
}
static void aes_hw_free(ssh_cipher *ciph)
{
aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph);
void *allocation = ctx->pointer_to_free;
smemclr(ctx, sizeof(*ctx));
sfree(allocation);
}
static void aes_hw_setkey(ssh_cipher *ciph, const void *vkey)
{
aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph);
const unsigned char *key = (const unsigned char *)vkey;
aes_ni_key_expand(key, ctx->ciph.vt->real_keybits / 32,
ctx->keysched_e, ctx->keysched_d);
}
static FUNC_ISA void aes_hw_setiv_cbc(ssh_cipher *ciph, const void *iv)
{
aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph);
ctx->iv = _mm_loadu_si128(iv);
}
static FUNC_ISA void aes_hw_setiv_sdctr(ssh_cipher *ciph, const void *iv)
{
aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph);
__m128i counter = _mm_loadu_si128(iv);
ctx->iv = aes_ni_sdctr_reverse(counter);
}
typedef __m128i (*aes_ni_fn)(__m128i v, const __m128i *keysched);
static FUNC_ISA inline void aes_cbc_ni_encrypt(
ssh_cipher *ciph, void *vblk, int blklen, aes_ni_fn encrypt)
{
aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph);
for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen;
blk < finish; blk += 16) {
__m128i plaintext = _mm_loadu_si128((const __m128i *)blk);
__m128i cipher_input = _mm_xor_si128(plaintext, ctx->iv);
__m128i ciphertext = encrypt(cipher_input, ctx->keysched_e);
_mm_storeu_si128((__m128i *)blk, ciphertext);
ctx->iv = ciphertext;
}
}
static FUNC_ISA inline void aes_cbc_ni_decrypt(
ssh_cipher *ciph, void *vblk, int blklen, aes_ni_fn decrypt)
{
aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph);
for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen;
blk < finish; blk += 16) {
__m128i ciphertext = _mm_loadu_si128((const __m128i *)blk);
__m128i decrypted = decrypt(ciphertext, ctx->keysched_d);
__m128i plaintext = _mm_xor_si128(decrypted, ctx->iv);
_mm_storeu_si128((__m128i *)blk, plaintext);
ctx->iv = ciphertext;
}
}
static FUNC_ISA inline void aes_sdctr_ni(
ssh_cipher *ciph, void *vblk, int blklen, aes_ni_fn encrypt)
{
aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph);
for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen;
blk < finish; blk += 16) {
__m128i counter = aes_ni_sdctr_reverse(ctx->iv);
__m128i keystream = encrypt(counter, ctx->keysched_e);
__m128i input = _mm_loadu_si128((const __m128i *)blk);
__m128i output = _mm_xor_si128(input, keystream);
_mm_storeu_si128((__m128i *)blk, output);
ctx->iv = aes_ni_sdctr_increment(ctx->iv);
}
}
#define NI_ENC_DEC(len) \
static FUNC_ISA void aes##len##_cbc_hw_encrypt( \
ssh_cipher *ciph, void *vblk, int blklen) \
{ aes_cbc_ni_encrypt(ciph, vblk, blklen, aes_ni_##len##_e); } \
static FUNC_ISA void aes##len##_cbc_hw_decrypt( \
ssh_cipher *ciph, void *vblk, int blklen) \
{ aes_cbc_ni_decrypt(ciph, vblk, blklen, aes_ni_##len##_d); } \
static FUNC_ISA void aes##len##_sdctr_hw( \
ssh_cipher *ciph, void *vblk, int blklen) \
{ aes_sdctr_ni(ciph, vblk, blklen, aes_ni_##len##_e); } \
NI_ENC_DEC(128)
NI_ENC_DEC(192)
NI_ENC_DEC(256)
/* ----------------------------------------------------------------------
* Hardware-accelerated implementation of AES using Arm NEON.
*/
#elif HW_AES == HW_AES_NEON
/*
* Manually set the target architecture, if we decided above that we
* need to.
*/
#ifdef USE_CLANG_ATTR_TARGET_AARCH64
/*
* A spot of cheating: redefine some ACLE feature macros before
* including arm_neon.h. Otherwise we won't get the AES intrinsics
* defined by that header, because it will be looking at the settings
* for the whole translation unit rather than the ones we're going to
* put on some particular functions using __attribute__((target)).
*/
#define __ARM_NEON 1
#define __ARM_FEATURE_CRYPTO 1
#define FUNC_ISA __attribute__ ((target("neon,crypto")))
#endif /* USE_CLANG_ATTR_TARGET_AARCH64 */
#ifndef FUNC_ISA
#define FUNC_ISA
#endif
#ifdef USE_ARM64_NEON_H
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
static bool aes_hw_available(void)
{
/*
* For Arm, we delegate to a per-platform AES detection function,
* because it has to be implemented by asking the operating system
* rather than directly querying the CPU.
*
* That's because Arm systems commonly have multiple cores that
* are not all alike, so any method of querying whether NEON
* crypto instructions work on the _current_ CPU - even one as
* crude as just trying one and catching the SIGILL - wouldn't
* give an answer that you could still rely on the first time the
* OS migrated your process to another CPU.
*/
return platform_aes_hw_available();
}
/*
* Core NEON encrypt/decrypt functions, one per length and direction.
*/
#define NEON_CIPHER(len, repmacro) \
static FUNC_ISA inline uint8x16_t aes_neon_##len##_e( \
uint8x16_t v, const uint8x16_t *keysched) \
{ \
repmacro(v = vaesmcq_u8(vaeseq_u8(v, *keysched++));); \
v = vaeseq_u8(v, *keysched++); \
return veorq_u8(v, *keysched); \
} \
static FUNC_ISA inline uint8x16_t aes_neon_##len##_d( \
uint8x16_t v, const uint8x16_t *keysched) \
{ \
repmacro(v = vaesimcq_u8(vaesdq_u8(v, *keysched++));); \
v = vaesdq_u8(v, *keysched++); \
return veorq_u8(v, *keysched); \
}
NEON_CIPHER(128, REP9)
NEON_CIPHER(192, REP11)
NEON_CIPHER(256, REP13)
/*
* The main key expansion.
*/
static FUNC_ISA void aes_neon_key_expand(
const unsigned char *key, size_t key_words,
uint8x16_t *keysched_e, uint8x16_t *keysched_d)
{
size_t rounds = key_words + 6;
size_t sched_words = (rounds + 1) * 4;
/*
* Store the key schedule as 32-bit integers during expansion, so
* that it's easy to refer back to individual previous words. We
* collect them into the final uint8x16_t form at the end.
*/
uint32_t sched[MAXROUNDKEYS * 4];
unsigned rconpos = 0;
for (size_t i = 0; i < sched_words; i++) {
if (i < key_words) {
sched[i] = GET_32BIT_LSB_FIRST(key + 4 * i);
} else {
uint32_t temp = sched[i - 1];
bool rotate_and_round_constant = (i % key_words == 0);
bool sub = rotate_and_round_constant ||
(key_words == 8 && i % 8 == 4);
if (rotate_and_round_constant)
temp = (temp << 24) | (temp >> 8);
if (sub) {
uint32x4_t v32 = vdupq_n_u32(temp);
uint8x16_t v8 = vreinterpretq_u8_u32(v32);
v8 = vaeseq_u8(v8, vdupq_n_u8(0));
v32 = vreinterpretq_u32_u8(v8);
temp = vget_lane_u32(vget_low_u32(v32), 0);
}
if (rotate_and_round_constant) {
assert(rconpos < lenof(key_setup_round_constants));
temp ^= key_setup_round_constants[rconpos++];
}
sched[i] = sched[i - key_words] ^ temp;
}
}
/*
* Combine the key schedule words into uint8x16_t vectors and
* store them in the output context.
*/
for (size_t round = 0; round <= rounds; round++)
keysched_e[round] = vreinterpretq_u8_u32(vld1q_u32(sched + 4*round));
smemclr(sched, sizeof(sched));
/*
* Now prepare the modified keys for the inverse cipher.
*/
for (size_t eround = 0; eround <= rounds; eround++) {
size_t dround = rounds - eround;
uint8x16_t rkey = keysched_e[eround];
if (eround && dround) /* neither first nor last */
rkey = vaesimcq_u8(rkey);
keysched_d[dround] = rkey;
}
}
/*
* Auxiliary routine to reverse the byte order of a vector, so that
* the SDCTR IV can be made big-endian for feeding to the cipher.
*
* In fact we don't need to reverse the vector _all_ the way; we leave
* the two lanes in MSW,LSW order, because that makes no difference to
* the efficiency of the increment. That way we only have to reverse
* bytes within each lane in this function.
*/
static FUNC_ISA inline uint8x16_t aes_neon_sdctr_reverse(uint8x16_t v)
{
return vrev64q_u8(v);
}
/*
* Auxiliary routine to increment the 128-bit counter used in SDCTR
* mode. There's no instruction to treat a 128-bit vector as a single
* long integer, so instead we have to increment the bottom half
* unconditionally, and the top half if the bottom half started off as
* all 1s (in which case there was about to be a carry).
*/
static FUNC_ISA inline uint8x16_t aes_neon_sdctr_increment(uint8x16_t in)
{
#ifdef __aarch64__
/* There will be a carry if the low 64 bits are all 1s. */
uint64x1_t all1 = vcreate_u64(0xFFFFFFFFFFFFFFFF);
uint64x1_t carry = vceq_u64(vget_high_u64(vreinterpretq_u64_u8(in)), all1);
/* Make a word whose bottom half is unconditionally all 1s, and
* the top half is 'carry', i.e. all 0s most of the time but all
* 1s if we need to increment the top half. Then that word is what
* we need to _subtract_ from the input counter. */
uint64x2_t subtrahend = vcombine_u64(carry, all1);
#else
/* AArch32 doesn't have comparisons that operate on a 64-bit lane,
* so we start by comparing each 32-bit half of the low 64 bits
* _separately_ to all-1s. */
uint32x2_t all1 = vdup_n_u32(0xFFFFFFFF);
uint32x2_t carry = vceq_u32(
vget_high_u32(vreinterpretq_u32_u8(in)), all1);
/* Swap the 32-bit words of the compare output, and AND with the
* unswapped version. Now carry is all 1s iff the bottom half of
* the input counter was all 1s, and all 0s otherwise. */
carry = vand_u32(carry, vrev64_u32(carry));
/* Now make the vector to subtract in the same way as above. */
uint64x2_t subtrahend = vreinterpretq_u64_u32(vcombine_u32(carry, all1));
#endif
return vreinterpretq_u8_u64(
vsubq_u64(vreinterpretq_u64_u8(in), subtrahend));
}
/*
* The SSH interface and the cipher modes.
*/
typedef struct aes_neon_context aes_neon_context;
struct aes_neon_context {
uint8x16_t keysched_e[MAXROUNDKEYS], keysched_d[MAXROUNDKEYS], iv;
ssh_cipher ciph;
};
static ssh_cipher *aes_hw_new(const ssh_cipheralg *alg)
{
if (!aes_hw_available_cached())
return NULL;
aes_neon_context *ctx = snew(aes_neon_context);
ctx->ciph.vt = alg;
return &ctx->ciph;
}
static void aes_hw_free(ssh_cipher *ciph)
{
aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);
smemclr(ctx, sizeof(*ctx));
sfree(ctx);
}
static void aes_hw_setkey(ssh_cipher *ciph, const void *vkey)
{
aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);
const unsigned char *key = (const unsigned char *)vkey;
aes_neon_key_expand(key, ctx->ciph.vt->real_keybits / 32,
ctx->keysched_e, ctx->keysched_d);
}
static FUNC_ISA void aes_hw_setiv_cbc(ssh_cipher *ciph, const void *iv)
{
aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);
ctx->iv = vld1q_u8(iv);
}
static FUNC_ISA void aes_hw_setiv_sdctr(ssh_cipher *ciph, const void *iv)
{
aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);
uint8x16_t counter = vld1q_u8(iv);
ctx->iv = aes_neon_sdctr_reverse(counter);
}
typedef uint8x16_t (*aes_neon_fn)(uint8x16_t v, const uint8x16_t *keysched);
static FUNC_ISA inline void aes_cbc_neon_encrypt(
ssh_cipher *ciph, void *vblk, int blklen, aes_neon_fn encrypt)
{
aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);
for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen;
blk < finish; blk += 16) {
uint8x16_t plaintext = vld1q_u8(blk);
uint8x16_t cipher_input = veorq_u8(plaintext, ctx->iv);
uint8x16_t ciphertext = encrypt(cipher_input, ctx->keysched_e);
vst1q_u8(blk, ciphertext);
ctx->iv = ciphertext;
}
}
static FUNC_ISA inline void aes_cbc_neon_decrypt(
ssh_cipher *ciph, void *vblk, int blklen, aes_neon_fn decrypt)
{
aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);
for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen;
blk < finish; blk += 16) {
uint8x16_t ciphertext = vld1q_u8(blk);
uint8x16_t decrypted = decrypt(ciphertext, ctx->keysched_d);
uint8x16_t plaintext = veorq_u8(decrypted, ctx->iv);
vst1q_u8(blk, plaintext);
ctx->iv = ciphertext;
}
}
static FUNC_ISA inline void aes_sdctr_neon(
ssh_cipher *ciph, void *vblk, int blklen, aes_neon_fn encrypt)
{
aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);
for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen;
blk < finish; blk += 16) {
uint8x16_t counter = aes_neon_sdctr_reverse(ctx->iv);
uint8x16_t keystream = encrypt(counter, ctx->keysched_e);
uint8x16_t input = vld1q_u8(blk);
uint8x16_t output = veorq_u8(input, keystream);
vst1q_u8(blk, output);
ctx->iv = aes_neon_sdctr_increment(ctx->iv);
}
}
#define NEON_ENC_DEC(len) \
static FUNC_ISA void aes##len##_cbc_hw_encrypt( \
ssh_cipher *ciph, void *vblk, int blklen) \
{ aes_cbc_neon_encrypt(ciph, vblk, blklen, aes_neon_##len##_e); } \
static FUNC_ISA void aes##len##_cbc_hw_decrypt( \
ssh_cipher *ciph, void *vblk, int blklen) \
{ aes_cbc_neon_decrypt(ciph, vblk, blklen, aes_neon_##len##_d); } \
static FUNC_ISA void aes##len##_sdctr_hw( \
ssh_cipher *ciph, void *vblk, int blklen) \
{ aes_sdctr_neon(ciph, vblk, blklen, aes_neon_##len##_e); } \
NEON_ENC_DEC(128)
NEON_ENC_DEC(192)
NEON_ENC_DEC(256)
/* ----------------------------------------------------------------------
* Stub functions if we have no hardware-accelerated AES. In this
* case, aes_hw_new returns NULL (though it should also never be
* selected by aes_select, so the only thing that should even be
* _able_ to call it is testcrypt). As a result, the remaining vtable
* functions should never be called at all.
*/
#elif HW_AES == HW_AES_NONE
bool aes_hw_available(void)
{
return false;
}
static ssh_cipher *aes_hw_new(const ssh_cipheralg *alg)
{
return NULL;
}
#define STUB_BODY { unreachable("Should never be called"); }
static void aes_hw_free(ssh_cipher *ciph) STUB_BODY
static void aes_hw_setkey(ssh_cipher *ciph, const void *key) STUB_BODY
static void aes_hw_setiv_cbc(ssh_cipher *ciph, const void *iv) STUB_BODY
static void aes_hw_setiv_sdctr(ssh_cipher *ciph, const void *iv) STUB_BODY
#define STUB_ENC_DEC(len) \
static void aes##len##_cbc_hw_encrypt( \
ssh_cipher *ciph, void *vblk, int blklen) STUB_BODY \
static void aes##len##_cbc_hw_decrypt( \
ssh_cipher *ciph, void *vblk, int blklen) STUB_BODY \
static void aes##len##_sdctr_hw( \
ssh_cipher *ciph, void *vblk, int blklen) STUB_BODY
STUB_ENC_DEC(128)
STUB_ENC_DEC(192)
STUB_ENC_DEC(256)
#endif /* HW_AES */
AES_EXTRA(_sw);
AES_ALL_VTABLES(_sw, "unaccelerated");

crypto/aes.h Normal file

@@ -0,0 +1,109 @@
/*
* Definitions likely to be helpful to multiple AES implementations.
*/
/*
* The 'extra' structure used by AES implementations carries
* information about how to check whether a given implementation is
* available at run time, and whether we've already checked.
*/
struct aes_extra_mutable;
struct aes_extra {
/* Function to check availability. Might be expensive, so we don't
* want to call it more than once. */
bool (*check_available)(void);
/* Point to a writable substructure. */
struct aes_extra_mutable *mut;
};
struct aes_extra_mutable {
bool checked_availability;
bool is_available;
};
static inline bool check_availability(const struct aes_extra *extra)
{
if (!extra->mut->checked_availability) {
extra->mut->is_available = extra->check_available();
extra->mut->checked_availability = true;
}
return extra->mut->is_available;
}
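/*
 * Minimal sketch (hypothetical helper, not used anywhere) of the
 * caching behaviour this gives us: the potentially expensive probe in
 * check_available runs at most once per 'extra', and every later call
 * just re-reads the answer stored in the mutable substructure.
 */
static inline bool aes_availability_demo(const struct aes_extra *extra)
{
    bool first = check_availability(extra);  /* may run check_available() */
    bool again = check_availability(extra);  /* cached: no second probe */
    return first == again;                   /* always true */
}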
/*
* Macros to define vtables for AES variants. There are a lot of
* these, because of the cross product between cipher modes, key
* sizes, and assorted HW/SW implementations, so it's worth spending
* some effort here to reduce the boilerplate in the sub-files.
*/
#define AES_EXTRA(impl_c) \
static struct aes_extra_mutable aes ## impl_c ## _extra_mut; \
static const struct aes_extra aes ## impl_c ## _extra = { \
.check_available = aes ## impl_c ## _available, \
.mut = &aes ## impl_c ## _extra_mut, \
}
#define AES_CBC_VTABLE(impl_c, impl_display, bits) \
const ssh_cipheralg ssh_aes ## bits ## _cbc ## impl_c = { \
.new = aes ## impl_c ## _new, \
.free = aes ## impl_c ## _free, \
.setiv = aes ## impl_c ## _setiv_cbc, \
.setkey = aes ## impl_c ## _setkey, \
.encrypt = aes ## bits ## impl_c ## _cbc_encrypt, \
.decrypt = aes ## bits ## impl_c ## _cbc_decrypt, \
.ssh2_id = "aes" #bits "-cbc", \
.blksize = 16, \
.real_keybits = bits, \
.padded_keybytes = bits/8, \
.flags = SSH_CIPHER_IS_CBC, \
.text_name = "AES-" #bits " CBC (" impl_display ")", \
.extra = &aes ## impl_c ## _extra, \
}
#define AES_SDCTR_VTABLE(impl_c, impl_display, bits) \
const ssh_cipheralg ssh_aes ## bits ## _sdctr ## impl_c = { \
.new = aes ## impl_c ## _new, \
.free = aes ## impl_c ## _free, \
.setiv = aes ## impl_c ## _setiv_sdctr, \
.setkey = aes ## impl_c ## _setkey, \
.encrypt = aes ## bits ## impl_c ## _sdctr, \
.decrypt = aes ## bits ## impl_c ## _sdctr, \
.ssh2_id = "aes" #bits "-ctr", \
.blksize = 16, \
.real_keybits = bits, \
.padded_keybytes = bits/8, \
.flags = 0, \
.text_name = "AES-" #bits " SDCTR (" impl_display ")", \
.extra = &aes ## impl_c ## _extra, \
}
#define AES_ALL_VTABLES(impl_c, impl_display) \
AES_CBC_VTABLE(impl_c, impl_display, 128); \
AES_CBC_VTABLE(impl_c, impl_display, 192); \
AES_CBC_VTABLE(impl_c, impl_display, 256); \
AES_SDCTR_VTABLE(impl_c, impl_display, 128); \
AES_SDCTR_VTABLE(impl_c, impl_display, 192); \
AES_SDCTR_VTABLE(impl_c, impl_display, 256)
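/*
 * A minimal sketch of how an implementation source file is expected to
 * use these macros, mirroring the aes-sw.c instantiation earlier in
 * this commit ('_sw' and the display string are that file's choices):
 *
 *     AES_EXTRA(_sw);
 *     AES_ALL_VTABLES(_sw, "unaccelerated");
 *
 * This expands to the aes_sw_extra availability record plus the six
 * vtables ssh_aes{128,192,256}_cbc_sw and ssh_aes{128,192,256}_sdctr_sw,
 * provided the file has already defined aes_sw_available, aes_sw_new,
 * aes_sw_free, aes_sw_setkey, aes_sw_setiv_cbc, aes_sw_setiv_sdctr and
 * the per-key-length cbc_encrypt/cbc_decrypt/sdctr routines the vtables
 * name.
 */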
/*
* Macros to repeat a piece of code particular numbers of times that
* correspond to 1 fewer than the number of AES rounds. (Because the
* last round is different.)
*/
#define REP2(x) x x
#define REP4(x) REP2(REP2(x))
#define REP8(x) REP2(REP4(x))
#define REP9(x) REP8(x) x
#define REP11(x) REP8(x) REP2(x) x
#define REP13(x) REP8(x) REP4(x) x
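/*
 * Minimal sketch (hypothetical helper, not used anywhere) showing what
 * these expand to: REP9 repeats its argument nine times, matching the
 * nine uniform middle rounds of AES-128 (ten rounds total, with a
 * structurally different final round); REP11 and REP13 play the same
 * role for AES-192 and AES-256.
 */
static inline int aes_rep9_demo(void)
{
    int rounds = 0;
    REP9(rounds++;)          /* expands to nine copies of "rounds++;" */
    return rounds;           /* always 9 */
}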
/*
* The round constants used in key schedule expansion.
*/
extern const uint8_t aes_key_setup_round_constants[10];
/*
* The largest number of round keys ever needed.
*/
#define MAXROUNDKEYS 15

10
crypto/sha1-common.c Normal file
View File

@ -0,0 +1,10 @@
/*
* Common variable definitions across all the SHA-1 implementations.
*/
#include "ssh.h"
#include "sha1.h"
const uint32_t sha1_initial_state[5] = {
0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0,
};

190
crypto/sha1-neon.c Normal file
View File

@ -0,0 +1,190 @@
/*
* Hardware-accelerated implementation of SHA-1 using Arm NEON.
*/
#include "ssh.h"
#include "sha1.h"
#if USE_ARM64_NEON_H
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
static bool sha1_neon_available(void)
{
/*
* For Arm, we delegate to a per-platform detection function (see
* explanation in aes-neon.c).
*/
return platform_sha1_neon_available();
}
typedef struct sha1_neon_core sha1_neon_core;
struct sha1_neon_core {
uint32x4_t abcd;
uint32_t e;
};
static inline uint32x4_t sha1_neon_load_input(const uint8_t *p)
{
return vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(p)));
}
static inline uint32x4_t sha1_neon_schedule_update(
uint32x4_t m4, uint32x4_t m3, uint32x4_t m2, uint32x4_t m1)
{
return vsha1su1q_u32(vsha1su0q_u32(m4, m3, m2), m1);
}
/*
* SHA-1 has three different kinds of round, differing in whether they
* use the Ch, Maj or Par functions defined in sha1-sw.c. Each one uses a
* separate NEON instruction, so we define three inline functions for
* the different round types using this macro.
*
* The two batches of Par-type rounds also use a different constant,
* but that's passed in as an operand, so we don't need a fourth
* inline function just for that.
*/
#define SHA1_NEON_ROUND_FN(type) \
static inline sha1_neon_core sha1_neon_round4_##type( \
sha1_neon_core old, uint32x4_t sched, uint32x4_t constant) \
{ \
sha1_neon_core new; \
uint32x4_t round_input = vaddq_u32(sched, constant); \
new.abcd = vsha1##type##q_u32(old.abcd, old.e, round_input); \
new.e = vsha1h_u32(vget_lane_u32(vget_low_u32(old.abcd), 0)); \
return new; \
}
SHA1_NEON_ROUND_FN(c)
SHA1_NEON_ROUND_FN(p)
SHA1_NEON_ROUND_FN(m)
static inline void sha1_neon_block(sha1_neon_core *core, const uint8_t *p)
{
uint32x4_t constant, s0, s1, s2, s3;
sha1_neon_core cr = *core;
constant = vdupq_n_u32(SHA1_STAGE0_CONSTANT);
s0 = sha1_neon_load_input(p);
cr = sha1_neon_round4_c(cr, s0, constant);
s1 = sha1_neon_load_input(p + 16);
cr = sha1_neon_round4_c(cr, s1, constant);
s2 = sha1_neon_load_input(p + 32);
cr = sha1_neon_round4_c(cr, s2, constant);
s3 = sha1_neon_load_input(p + 48);
cr = sha1_neon_round4_c(cr, s3, constant);
s0 = sha1_neon_schedule_update(s0, s1, s2, s3);
cr = sha1_neon_round4_c(cr, s0, constant);
constant = vdupq_n_u32(SHA1_STAGE1_CONSTANT);
s1 = sha1_neon_schedule_update(s1, s2, s3, s0);
cr = sha1_neon_round4_p(cr, s1, constant);
s2 = sha1_neon_schedule_update(s2, s3, s0, s1);
cr = sha1_neon_round4_p(cr, s2, constant);
s3 = sha1_neon_schedule_update(s3, s0, s1, s2);
cr = sha1_neon_round4_p(cr, s3, constant);
s0 = sha1_neon_schedule_update(s0, s1, s2, s3);
cr = sha1_neon_round4_p(cr, s0, constant);
s1 = sha1_neon_schedule_update(s1, s2, s3, s0);
cr = sha1_neon_round4_p(cr, s1, constant);
constant = vdupq_n_u32(SHA1_STAGE2_CONSTANT);
s2 = sha1_neon_schedule_update(s2, s3, s0, s1);
cr = sha1_neon_round4_m(cr, s2, constant);
s3 = sha1_neon_schedule_update(s3, s0, s1, s2);
cr = sha1_neon_round4_m(cr, s3, constant);
s0 = sha1_neon_schedule_update(s0, s1, s2, s3);
cr = sha1_neon_round4_m(cr, s0, constant);
s1 = sha1_neon_schedule_update(s1, s2, s3, s0);
cr = sha1_neon_round4_m(cr, s1, constant);
s2 = sha1_neon_schedule_update(s2, s3, s0, s1);
cr = sha1_neon_round4_m(cr, s2, constant);
constant = vdupq_n_u32(SHA1_STAGE3_CONSTANT);
s3 = sha1_neon_schedule_update(s3, s0, s1, s2);
cr = sha1_neon_round4_p(cr, s3, constant);
s0 = sha1_neon_schedule_update(s0, s1, s2, s3);
cr = sha1_neon_round4_p(cr, s0, constant);
s1 = sha1_neon_schedule_update(s1, s2, s3, s0);
cr = sha1_neon_round4_p(cr, s1, constant);
s2 = sha1_neon_schedule_update(s2, s3, s0, s1);
cr = sha1_neon_round4_p(cr, s2, constant);
s3 = sha1_neon_schedule_update(s3, s0, s1, s2);
cr = sha1_neon_round4_p(cr, s3, constant);
core->abcd = vaddq_u32(core->abcd, cr.abcd);
core->e += cr.e;
}
typedef struct sha1_neon {
sha1_neon_core core;
sha1_block blk;
BinarySink_IMPLEMENTATION;
ssh_hash hash;
} sha1_neon;
static void sha1_neon_write(BinarySink *bs, const void *vp, size_t len);
static ssh_hash *sha1_neon_new(const ssh_hashalg *alg)
{
const struct sha1_extra *extra = (const struct sha1_extra *)alg->extra;
if (!check_availability(extra))
return NULL;
sha1_neon *s = snew(sha1_neon);
s->hash.vt = alg;
BinarySink_INIT(s, sha1_neon_write);
BinarySink_DELEGATE_INIT(&s->hash, s);
return &s->hash;
}
static void sha1_neon_reset(ssh_hash *hash)
{
sha1_neon *s = container_of(hash, sha1_neon, hash);
s->core.abcd = vld1q_u32(sha1_initial_state);
s->core.e = sha1_initial_state[4];
sha1_block_setup(&s->blk);
}
static void sha1_neon_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
sha1_neon *copy = container_of(hcopy, sha1_neon, hash);
sha1_neon *orig = container_of(horig, sha1_neon, hash);
*copy = *orig; /* structure copy */
BinarySink_COPIED(copy);
BinarySink_DELEGATE_INIT(&copy->hash, copy);
}
static void sha1_neon_free(ssh_hash *hash)
{
sha1_neon *s = container_of(hash, sha1_neon, hash);
smemclr(s, sizeof(*s));
sfree(s);
}
static void sha1_neon_write(BinarySink *bs, const void *vp, size_t len)
{
sha1_neon *s = BinarySink_DOWNCAST(bs, sha1_neon);
while (len > 0)
if (sha1_block_write(&s->blk, &vp, &len))
sha1_neon_block(&s->core, s->blk.block);
}
static void sha1_neon_digest(ssh_hash *hash, uint8_t *digest)
{
sha1_neon *s = container_of(hash, sha1_neon, hash);
sha1_block_pad(&s->blk, BinarySink_UPCAST(s));
vst1q_u8(digest, vrev32q_u8(vreinterpretq_u8_u32(s->core.abcd)));
PUT_32BIT_MSB_FIRST(digest + 16, s->core.e);
}
SHA1_VTABLE(neon, "NEON accelerated");

325
crypto/sha1-ni.c Normal file
View File

@ -0,0 +1,325 @@
/*
* Hardware-accelerated implementation of SHA-1 using x86 SHA-NI.
*/
#include "ssh.h"
#include "sha1.h"
#include <wmmintrin.h>
#include <smmintrin.h>
#include <immintrin.h>
#if HAVE_SHAINTRIN_H
#include <shaintrin.h>
#endif
#if defined(__clang__) || defined(__GNUC__)
#include <cpuid.h>
#define GET_CPU_ID_0(out) \
__cpuid(0, (out)[0], (out)[1], (out)[2], (out)[3])
#define GET_CPU_ID_7(out) \
__cpuid_count(7, 0, (out)[0], (out)[1], (out)[2], (out)[3])
#else
#define GET_CPU_ID_0(out) __cpuid(out, 0)
#define GET_CPU_ID_7(out) __cpuidex(out, 7, 0)
#endif
static bool sha1_ni_available(void)
{
unsigned int CPUInfo[4];
GET_CPU_ID_0(CPUInfo);
if (CPUInfo[0] < 7)
return false;
GET_CPU_ID_7(CPUInfo);
return CPUInfo[1] & (1 << 29); /* Check SHA */
}
/* SHA1 implementation using new instructions
The code is based on Jeffrey Walton's SHA1 implementation:
https://github.com/noloader/SHA-Intrinsics
*/
static inline void sha1_ni_block(__m128i *core, const uint8_t *p)
{
__m128i ABCD, E0, E1, MSG0, MSG1, MSG2, MSG3;
const __m128i MASK = _mm_set_epi64x(
0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL);
const __m128i *block = (const __m128i *)p;
/* Load initial values */
ABCD = core[0];
E0 = core[1];
/* Rounds 0-3 */
MSG0 = _mm_loadu_si128(block);
MSG0 = _mm_shuffle_epi8(MSG0, MASK);
E0 = _mm_add_epi32(E0, MSG0);
E1 = ABCD;
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
/* Rounds 4-7 */
MSG1 = _mm_loadu_si128(block + 1);
MSG1 = _mm_shuffle_epi8(MSG1, MASK);
E1 = _mm_sha1nexte_epu32(E1, MSG1);
E0 = ABCD;
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
/* Rounds 8-11 */
MSG2 = _mm_loadu_si128(block + 2);
MSG2 = _mm_shuffle_epi8(MSG2, MASK);
E0 = _mm_sha1nexte_epu32(E0, MSG2);
E1 = ABCD;
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
MSG0 = _mm_xor_si128(MSG0, MSG2);
/* Rounds 12-15 */
MSG3 = _mm_loadu_si128(block + 3);
MSG3 = _mm_shuffle_epi8(MSG3, MASK);
E1 = _mm_sha1nexte_epu32(E1, MSG3);
E0 = ABCD;
MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
MSG1 = _mm_xor_si128(MSG1, MSG3);
/* Rounds 16-19 */
E0 = _mm_sha1nexte_epu32(E0, MSG0);
E1 = ABCD;
MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
MSG2 = _mm_xor_si128(MSG2, MSG0);
/* Rounds 20-23 */
E1 = _mm_sha1nexte_epu32(E1, MSG1);
E0 = ABCD;
MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
MSG3 = _mm_xor_si128(MSG3, MSG1);
/* Rounds 24-27 */
E0 = _mm_sha1nexte_epu32(E0, MSG2);
E1 = ABCD;
MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
MSG0 = _mm_xor_si128(MSG0, MSG2);
/* Rounds 28-31 */
E1 = _mm_sha1nexte_epu32(E1, MSG3);
E0 = ABCD;
MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
MSG1 = _mm_xor_si128(MSG1, MSG3);
/* Rounds 32-35 */
E0 = _mm_sha1nexte_epu32(E0, MSG0);
E1 = ABCD;
MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
MSG2 = _mm_xor_si128(MSG2, MSG0);
/* Rounds 36-39 */
E1 = _mm_sha1nexte_epu32(E1, MSG1);
E0 = ABCD;
MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
MSG3 = _mm_xor_si128(MSG3, MSG1);
/* Rounds 40-43 */
E0 = _mm_sha1nexte_epu32(E0, MSG2);
E1 = ABCD;
MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
MSG0 = _mm_xor_si128(MSG0, MSG2);
/* Rounds 44-47 */
E1 = _mm_sha1nexte_epu32(E1, MSG3);
E0 = ABCD;
MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
MSG1 = _mm_xor_si128(MSG1, MSG3);
/* Rounds 48-51 */
E0 = _mm_sha1nexte_epu32(E0, MSG0);
E1 = ABCD;
MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
MSG2 = _mm_xor_si128(MSG2, MSG0);
/* Rounds 52-55 */
E1 = _mm_sha1nexte_epu32(E1, MSG1);
E0 = ABCD;
MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
MSG3 = _mm_xor_si128(MSG3, MSG1);
/* Rounds 56-59 */
E0 = _mm_sha1nexte_epu32(E0, MSG2);
E1 = ABCD;
MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
MSG0 = _mm_xor_si128(MSG0, MSG2);
/* Rounds 60-63 */
E1 = _mm_sha1nexte_epu32(E1, MSG3);
E0 = ABCD;
MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
MSG1 = _mm_xor_si128(MSG1, MSG3);
/* Rounds 64-67 */
E0 = _mm_sha1nexte_epu32(E0, MSG0);
E1 = ABCD;
MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
MSG2 = _mm_xor_si128(MSG2, MSG0);
/* Rounds 68-71 */
E1 = _mm_sha1nexte_epu32(E1, MSG1);
E0 = ABCD;
MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
MSG3 = _mm_xor_si128(MSG3, MSG1);
/* Rounds 72-75 */
E0 = _mm_sha1nexte_epu32(E0, MSG2);
E1 = ABCD;
MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
/* Rounds 76-79 */
E1 = _mm_sha1nexte_epu32(E1, MSG3);
E0 = ABCD;
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
/* Combine state */
core[0] = _mm_add_epi32(ABCD, core[0]);
core[1] = _mm_sha1nexte_epu32(E0, core[1]);
}
typedef struct sha1_ni {
/*
* core[0] stores the first four words of the SHA-1 state. core[1]
* stores just the fifth word, in the vector lane at the highest
* address.
*/
__m128i core[2];
sha1_block blk;
void *pointer_to_free;
BinarySink_IMPLEMENTATION;
ssh_hash hash;
} sha1_ni;
static void sha1_ni_write(BinarySink *bs, const void *vp, size_t len);
static sha1_ni *sha1_ni_alloc(void)
{
/*
* The __m128i variables in the context structure need to be
* 16-byte aligned, but not all malloc implementations that this
* code has to work with will guarantee to return a 16-byte
* aligned pointer. So we over-allocate, manually realign the
* pointer ourselves, and store the original one inside the
* context so we know how to free it later.
*/
void *allocation = smalloc(sizeof(sha1_ni) + 15);
uintptr_t alloc_address = (uintptr_t)allocation;
uintptr_t aligned_address = (alloc_address + 15) & ~15;
sha1_ni *s = (sha1_ni *)aligned_address;
s->pointer_to_free = allocation;
return s;
}
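/*
 * Worked example of the realignment arithmetic above, with a made-up
 * address purely for illustration: if smalloc returned 0x1007, then
 *
 *     (0x1007 + 15) & ~15  ==  0x1016 & ~15  ==  0x1010
 *
 * which is the next 16-byte boundary, at most 15 bytes into the
 * over-allocation, so the whole sha1_ni structure still fits.
 */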
static ssh_hash *sha1_ni_new(const ssh_hashalg *alg)
{
const struct sha1_extra *extra = (const struct sha1_extra *)alg->extra;
if (!check_availability(extra))
return NULL;
sha1_ni *s = sha1_ni_alloc();
s->hash.vt = alg;
BinarySink_INIT(s, sha1_ni_write);
BinarySink_DELEGATE_INIT(&s->hash, s);
return &s->hash;
}
static void sha1_ni_reset(ssh_hash *hash)
{
sha1_ni *s = container_of(hash, sha1_ni, hash);
/* Initialise the core vectors in their storage order */
s->core[0] = _mm_set_epi64x(
0x67452301efcdab89ULL, 0x98badcfe10325476ULL);
s->core[1] = _mm_set_epi32(0xc3d2e1f0, 0, 0, 0);
sha1_block_setup(&s->blk);
}
static void sha1_ni_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
sha1_ni *copy = container_of(hcopy, sha1_ni, hash);
sha1_ni *orig = container_of(horig, sha1_ni, hash);
void *ptf_save = copy->pointer_to_free;
*copy = *orig; /* structure copy */
copy->pointer_to_free = ptf_save;
BinarySink_COPIED(copy);
BinarySink_DELEGATE_INIT(&copy->hash, copy);
}
static void sha1_ni_free(ssh_hash *hash)
{
sha1_ni *s = container_of(hash, sha1_ni, hash);
void *ptf = s->pointer_to_free;
smemclr(s, sizeof(*s));
sfree(ptf);
}
static void sha1_ni_write(BinarySink *bs, const void *vp, size_t len)
{
sha1_ni *s = BinarySink_DOWNCAST(bs, sha1_ni);
while (len > 0)
if (sha1_block_write(&s->blk, &vp, &len))
sha1_ni_block(s->core, s->blk.block);
}
static void sha1_ni_digest(ssh_hash *hash, uint8_t *digest)
{
sha1_ni *s = container_of(hash, sha1_ni, hash);
sha1_block_pad(&s->blk, BinarySink_UPCAST(s));
/* Rearrange the first vector into its output order */
__m128i abcd = _mm_shuffle_epi32(s->core[0], 0x1B);
/* Byte-swap it into the output endianness */
const __m128i mask = _mm_setr_epi8(3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12);
abcd = _mm_shuffle_epi8(abcd, mask);
/* And store it */
_mm_storeu_si128((__m128i *)digest, abcd);
/* Finally, store the leftover word */
uint32_t e = _mm_extract_epi32(s->core[1], 3);
PUT_32BIT_MSB_FIRST(digest + 16, e);
}
SHA1_VTABLE(ni, "SHA-NI accelerated");

44
crypto/sha1-select.c Normal file
View File

@ -0,0 +1,44 @@
/*
* Top-level vtables to select a SHA-1 implementation.
*/
#include <assert.h>
#include <stdlib.h>
#include "putty.h"
#include "ssh.h"
#include "sha1.h"
static ssh_hash *sha1_select(const ssh_hashalg *alg)
{
static const ssh_hashalg *const real_algs[] = {
#if HAVE_SHA_NI
&ssh_sha1_ni,
#endif
#if HAVE_NEON_CRYPTO
&ssh_sha1_neon,
#endif
&ssh_sha1_sw,
NULL,
};
for (size_t i = 0; real_algs[i]; i++) {
const ssh_hashalg *alg = real_algs[i];
const struct sha1_extra *alg_extra =
(const struct sha1_extra *)alg->extra;
if (check_availability(alg_extra))
return ssh_hash_new(alg);
}
/* We should never reach the NULL at the end of the list, because
* the last non-NULL entry should be software-only SHA-1, which
* is always available. */
unreachable("sha1_select ran off the end of its list");
}
const ssh_hashalg ssh_sha1 = {
.new = sha1_select,
.hlen = 20,
.blocklen = 64,
HASHALG_NAMES_ANNOTATED("SHA-1", "dummy selector vtable"),
};

155
crypto/sha1-sw.c Normal file
View File

@ -0,0 +1,155 @@
/*
* Software implementation of SHA-1.
*/
#include "ssh.h"
#include "sha1.h"
static bool sha1_sw_available(void)
{
/* Software SHA-1 is always available */
return true;
}
static inline uint32_t rol(uint32_t x, unsigned y)
{
return (x << (31 & y)) | (x >> (31 & -y));
}
static inline uint32_t Ch(uint32_t ctrl, uint32_t if1, uint32_t if0)
{
return if0 ^ (ctrl & (if1 ^ if0));
}
static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z)
{
return (x & y) | (z & (x | y));
}
static inline uint32_t Par(uint32_t x, uint32_t y, uint32_t z)
{
return (x ^ y ^ z);
}
static inline void sha1_sw_round(
unsigned round_index, const uint32_t *schedule,
uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d, uint32_t *e,
uint32_t f, uint32_t constant)
{
*e = rol(*a, 5) + f + *e + schedule[round_index] + constant;
*b = rol(*b, 30);
}
static void sha1_sw_block(uint32_t *core, const uint8_t *block)
{
uint32_t w[SHA1_ROUNDS];
uint32_t a,b,c,d,e;
for (size_t t = 0; t < 16; t++)
w[t] = GET_32BIT_MSB_FIRST(block + 4*t);
for (size_t t = 16; t < SHA1_ROUNDS; t++)
w[t] = rol(w[t - 3] ^ w[t - 8] ^ w[t - 14] ^ w[t - 16], 1);
a = core[0]; b = core[1]; c = core[2]; d = core[3];
e = core[4];
size_t t = 0;
for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) {
sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Ch(b,c,d), SHA1_STAGE0_CONSTANT);
sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Ch(a,b,c), SHA1_STAGE0_CONSTANT);
sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Ch(e,a,b), SHA1_STAGE0_CONSTANT);
sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Ch(d,e,a), SHA1_STAGE0_CONSTANT);
sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Ch(c,d,e), SHA1_STAGE0_CONSTANT);
}
for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) {
sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Par(b,c,d), SHA1_STAGE1_CONSTANT);
sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Par(a,b,c), SHA1_STAGE1_CONSTANT);
sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Par(e,a,b), SHA1_STAGE1_CONSTANT);
sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Par(d,e,a), SHA1_STAGE1_CONSTANT);
sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Par(c,d,e), SHA1_STAGE1_CONSTANT);
}
for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) {
sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Maj(b,c,d), SHA1_STAGE2_CONSTANT);
sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Maj(a,b,c), SHA1_STAGE2_CONSTANT);
sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Maj(e,a,b), SHA1_STAGE2_CONSTANT);
sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Maj(d,e,a), SHA1_STAGE2_CONSTANT);
sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Maj(c,d,e), SHA1_STAGE2_CONSTANT);
}
for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) {
sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Par(b,c,d), SHA1_STAGE3_CONSTANT);
sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Par(a,b,c), SHA1_STAGE3_CONSTANT);
sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Par(e,a,b), SHA1_STAGE3_CONSTANT);
sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Par(d,e,a), SHA1_STAGE3_CONSTANT);
sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Par(c,d,e), SHA1_STAGE3_CONSTANT);
}
core[0] += a; core[1] += b; core[2] += c; core[3] += d; core[4] += e;
smemclr(w, sizeof(w));
}
typedef struct sha1_sw {
uint32_t core[5];
sha1_block blk;
BinarySink_IMPLEMENTATION;
ssh_hash hash;
} sha1_sw;
static void sha1_sw_write(BinarySink *bs, const void *vp, size_t len);
static ssh_hash *sha1_sw_new(const ssh_hashalg *alg)
{
sha1_sw *s = snew(sha1_sw);
s->hash.vt = alg;
BinarySink_INIT(s, sha1_sw_write);
BinarySink_DELEGATE_INIT(&s->hash, s);
return &s->hash;
}
static void sha1_sw_reset(ssh_hash *hash)
{
sha1_sw *s = container_of(hash, sha1_sw, hash);
memcpy(s->core, sha1_initial_state, sizeof(s->core));
sha1_block_setup(&s->blk);
}
static void sha1_sw_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
sha1_sw *copy = container_of(hcopy, sha1_sw, hash);
sha1_sw *orig = container_of(horig, sha1_sw, hash);
memcpy(copy, orig, sizeof(*copy));
BinarySink_COPIED(copy);
BinarySink_DELEGATE_INIT(&copy->hash, copy);
}
static void sha1_sw_free(ssh_hash *hash)
{
sha1_sw *s = container_of(hash, sha1_sw, hash);
smemclr(s, sizeof(*s));
sfree(s);
}
static void sha1_sw_write(BinarySink *bs, const void *vp, size_t len)
{
sha1_sw *s = BinarySink_DOWNCAST(bs, sha1_sw);
while (len > 0)
if (sha1_block_write(&s->blk, &vp, &len))
sha1_sw_block(s->core, s->blk.block);
}
static void sha1_sw_digest(ssh_hash *hash, uint8_t *digest)
{
sha1_sw *s = container_of(hash, sha1_sw, hash);
sha1_block_pad(&s->blk, BinarySink_UPCAST(s));
for (size_t i = 0; i < 5; i++)
PUT_32BIT_MSB_FIRST(digest + 4*i, s->core[i]);
}
SHA1_VTABLE(sw, "unaccelerated");

933
crypto/sha1.c
View File

@ -1,933 +0,0 @@
/*
* SHA-1 algorithm as described at
*
* http://csrc.nist.gov/cryptval/shs.html
*/
#include "ssh.h"
#include <assert.h>
/*
* Start by deciding whether we can support hardware SHA at all.
*/
#define HW_SHA1_NONE 0
#define HW_SHA1_NI 1
#define HW_SHA1_NEON 2
#ifdef _FORCE_SHA_NI
# define HW_SHA1 HW_SHA1_NI
#elif defined(__clang__)
# if __has_attribute(target) && __has_include(<wmmintrin.h>) && \
(defined(__x86_64__) || defined(__i386))
# define HW_SHA1 HW_SHA1_NI
# endif
#elif defined(__GNUC__)
# if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)) && \
(defined(__x86_64__) || defined(__i386))
# define HW_SHA1 HW_SHA1_NI
# endif
#elif defined (_MSC_VER)
# if (defined(_M_X64) || defined(_M_IX86)) && _MSC_FULL_VER >= 150030729
# define HW_SHA1 HW_SHA1_NI
# endif
#endif
#ifdef _FORCE_SHA_NEON
# define HW_SHA1 HW_SHA1_NEON
#elif defined __BYTE_ORDER__ && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
/* Arm can potentially support both endiannesses, but this code
* hasn't been tested on anything but little. If anyone wants to
* run big-endian, they'll need to fix it first. */
#elif defined __ARM_FEATURE_CRYPTO
/* If the Arm crypto extension is available already, we can
* support NEON SHA without having to enable anything by hand */
# define HW_SHA1 HW_SHA1_NEON
#elif defined(__clang__)
# if __has_attribute(target) && __has_include(<arm_neon.h>) && \
(defined(__aarch64__))
/* clang can enable the crypto extension in AArch64 using
* __attribute__((target)) */
# define HW_SHA1 HW_SHA1_NEON
# define USE_CLANG_ATTR_TARGET_AARCH64
# endif
#elif defined _MSC_VER
/* Visual Studio supports the crypto extension when targeting
* AArch64, but as of VS2017, the AArch32 header doesn't quite
* manage it (declaring the shae/shad intrinsics without a round
* key operand). */
# if defined _M_ARM64
# define HW_SHA1 HW_SHA1_NEON
# if defined _M_ARM64
# define USE_ARM64_NEON_H /* unusual header name in this case */
# endif
# endif
#endif
#if defined _FORCE_SOFTWARE_SHA || !defined HW_SHA1
# undef HW_SHA1
# define HW_SHA1 HW_SHA1_NONE
#endif
/*
* The actual query function that asks if hardware acceleration is
* available.
*/
static bool sha1_hw_available(void);
/*
* The top-level selection function, caching the results of
* sha1_hw_available() so it only has to run once.
*/
static bool sha1_hw_available_cached(void)
{
static bool initialised = false;
static bool hw_available;
if (!initialised) {
hw_available = sha1_hw_available();
initialised = true;
}
return hw_available;
}
static ssh_hash *sha1_select(const ssh_hashalg *alg)
{
const ssh_hashalg *real_alg =
sha1_hw_available_cached() ? &ssh_sha1_hw : &ssh_sha1_sw;
return ssh_hash_new(real_alg);
}
const ssh_hashalg ssh_sha1 = {
.new = sha1_select,
.hlen = 20,
.blocklen = 64,
HASHALG_NAMES_ANNOTATED("SHA-1", "dummy selector vtable"),
};
/* ----------------------------------------------------------------------
* Definitions likely to be helpful to multiple implementations.
*/
static const uint32_t sha1_initial_state[] = {
0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0,
};
#define SHA1_ROUNDS_PER_STAGE 20
#define SHA1_STAGE0_CONSTANT 0x5a827999
#define SHA1_STAGE1_CONSTANT 0x6ed9eba1
#define SHA1_STAGE2_CONSTANT 0x8f1bbcdc
#define SHA1_STAGE3_CONSTANT 0xca62c1d6
#define SHA1_ROUNDS (4 * SHA1_ROUNDS_PER_STAGE)
typedef struct sha1_block sha1_block;
struct sha1_block {
uint8_t block[64];
size_t used;
uint64_t len;
};
static inline void sha1_block_setup(sha1_block *blk)
{
blk->used = 0;
blk->len = 0;
}
static inline bool sha1_block_write(
sha1_block *blk, const void **vdata, size_t *len)
{
size_t blkleft = sizeof(blk->block) - blk->used;
size_t chunk = *len < blkleft ? *len : blkleft;
const uint8_t *p = *vdata;
memcpy(blk->block + blk->used, p, chunk);
*vdata = p + chunk;
*len -= chunk;
blk->used += chunk;
blk->len += chunk;
if (blk->used == sizeof(blk->block)) {
blk->used = 0;
return true;
}
return false;
}
static inline void sha1_block_pad(sha1_block *blk, BinarySink *bs)
{
uint64_t final_len = blk->len << 3;
size_t pad = 1 + (63 & (55 - blk->used));
put_byte(bs, 0x80);
for (size_t i = 1; i < pad; i++)
put_byte(bs, 0);
put_uint64(bs, final_len);
assert(blk->used == 0 && "Should have exactly hit a block boundary");
}
/* ----------------------------------------------------------------------
* Software implementation of SHA-1.
*/
static inline uint32_t rol(uint32_t x, unsigned y)
{
return (x << (31 & y)) | (x >> (31 & -y));
}
static inline uint32_t Ch(uint32_t ctrl, uint32_t if1, uint32_t if0)
{
return if0 ^ (ctrl & (if1 ^ if0));
}
static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z)
{
return (x & y) | (z & (x | y));
}
static inline uint32_t Par(uint32_t x, uint32_t y, uint32_t z)
{
return (x ^ y ^ z);
}
static inline void sha1_sw_round(
unsigned round_index, const uint32_t *schedule,
uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d, uint32_t *e,
uint32_t f, uint32_t constant)
{
*e = rol(*a, 5) + f + *e + schedule[round_index] + constant;
*b = rol(*b, 30);
}
static void sha1_sw_block(uint32_t *core, const uint8_t *block)
{
uint32_t w[SHA1_ROUNDS];
uint32_t a,b,c,d,e;
for (size_t t = 0; t < 16; t++)
w[t] = GET_32BIT_MSB_FIRST(block + 4*t);
for (size_t t = 16; t < SHA1_ROUNDS; t++)
w[t] = rol(w[t - 3] ^ w[t - 8] ^ w[t - 14] ^ w[t - 16], 1);
a = core[0]; b = core[1]; c = core[2]; d = core[3];
e = core[4];
size_t t = 0;
for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) {
sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Ch(b,c,d), SHA1_STAGE0_CONSTANT);
sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Ch(a,b,c), SHA1_STAGE0_CONSTANT);
sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Ch(e,a,b), SHA1_STAGE0_CONSTANT);
sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Ch(d,e,a), SHA1_STAGE0_CONSTANT);
sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Ch(c,d,e), SHA1_STAGE0_CONSTANT);
}
for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) {
sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Par(b,c,d), SHA1_STAGE1_CONSTANT);
sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Par(a,b,c), SHA1_STAGE1_CONSTANT);
sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Par(e,a,b), SHA1_STAGE1_CONSTANT);
sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Par(d,e,a), SHA1_STAGE1_CONSTANT);
sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Par(c,d,e), SHA1_STAGE1_CONSTANT);
}
for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) {
sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Maj(b,c,d), SHA1_STAGE2_CONSTANT);
sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Maj(a,b,c), SHA1_STAGE2_CONSTANT);
sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Maj(e,a,b), SHA1_STAGE2_CONSTANT);
sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Maj(d,e,a), SHA1_STAGE2_CONSTANT);
sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Maj(c,d,e), SHA1_STAGE2_CONSTANT);
}
for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) {
sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Par(b,c,d), SHA1_STAGE3_CONSTANT);
sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Par(a,b,c), SHA1_STAGE3_CONSTANT);
sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Par(e,a,b), SHA1_STAGE3_CONSTANT);
sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Par(d,e,a), SHA1_STAGE3_CONSTANT);
sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Par(c,d,e), SHA1_STAGE3_CONSTANT);
}
core[0] += a; core[1] += b; core[2] += c; core[3] += d; core[4] += e;
smemclr(w, sizeof(w));
}
typedef struct sha1_sw {
uint32_t core[5];
sha1_block blk;
BinarySink_IMPLEMENTATION;
ssh_hash hash;
} sha1_sw;
static void sha1_sw_write(BinarySink *bs, const void *vp, size_t len);
static ssh_hash *sha1_sw_new(const ssh_hashalg *alg)
{
sha1_sw *s = snew(sha1_sw);
s->hash.vt = alg;
BinarySink_INIT(s, sha1_sw_write);
BinarySink_DELEGATE_INIT(&s->hash, s);
return &s->hash;
}
static void sha1_sw_reset(ssh_hash *hash)
{
sha1_sw *s = container_of(hash, sha1_sw, hash);
memcpy(s->core, sha1_initial_state, sizeof(s->core));
sha1_block_setup(&s->blk);
}
static void sha1_sw_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
sha1_sw *copy = container_of(hcopy, sha1_sw, hash);
sha1_sw *orig = container_of(horig, sha1_sw, hash);
memcpy(copy, orig, sizeof(*copy));
BinarySink_COPIED(copy);
BinarySink_DELEGATE_INIT(&copy->hash, copy);
}
static void sha1_sw_free(ssh_hash *hash)
{
sha1_sw *s = container_of(hash, sha1_sw, hash);
smemclr(s, sizeof(*s));
sfree(s);
}
static void sha1_sw_write(BinarySink *bs, const void *vp, size_t len)
{
sha1_sw *s = BinarySink_DOWNCAST(bs, sha1_sw);
while (len > 0)
if (sha1_block_write(&s->blk, &vp, &len))
sha1_sw_block(s->core, s->blk.block);
}
static void sha1_sw_digest(ssh_hash *hash, uint8_t *digest)
{
sha1_sw *s = container_of(hash, sha1_sw, hash);
sha1_block_pad(&s->blk, BinarySink_UPCAST(s));
for (size_t i = 0; i < 5; i++)
PUT_32BIT_MSB_FIRST(digest + 4*i, s->core[i]);
}
const ssh_hashalg ssh_sha1_sw = {
.new = sha1_sw_new,
.reset = sha1_sw_reset,
.copyfrom = sha1_sw_copyfrom,
.digest = sha1_sw_digest,
.free = sha1_sw_free,
.hlen = 20,
.blocklen = 64,
HASHALG_NAMES_ANNOTATED("SHA-1", "unaccelerated"),
};
/* ----------------------------------------------------------------------
* Hardware-accelerated implementation of SHA-1 using x86 SHA-NI.
*/
#if HW_SHA1 == HW_SHA1_NI
/*
* Set target architecture for Clang and GCC
*/
#if defined(__clang__) || defined(__GNUC__)
# define FUNC_ISA __attribute__ ((target("sse4.1,sha")))
#if !defined(__clang__)
# pragma GCC target("sha")
# pragma GCC target("sse4.1")
#endif
#else
# define FUNC_ISA
#endif
#include <wmmintrin.h>
#include <smmintrin.h>
#include <immintrin.h>
#if defined(__clang__) || defined(__GNUC__)
#include <shaintrin.h>
#endif
#if defined(__clang__) || defined(__GNUC__)
#include <cpuid.h>
#define GET_CPU_ID_0(out) \
__cpuid(0, (out)[0], (out)[1], (out)[2], (out)[3])
#define GET_CPU_ID_7(out) \
__cpuid_count(7, 0, (out)[0], (out)[1], (out)[2], (out)[3])
#else
#define GET_CPU_ID_0(out) __cpuid(out, 0)
#define GET_CPU_ID_7(out) __cpuidex(out, 7, 0)
#endif
static bool sha1_hw_available(void)
{
unsigned int CPUInfo[4];
GET_CPU_ID_0(CPUInfo);
if (CPUInfo[0] < 7)
return false;
GET_CPU_ID_7(CPUInfo);
return CPUInfo[1] & (1 << 29); /* Check SHA */
}
/* SHA1 implementation using new instructions
The code is based on Jeffrey Walton's SHA1 implementation:
https://github.com/noloader/SHA-Intrinsics
*/
FUNC_ISA
static inline void sha1_ni_block(__m128i *core, const uint8_t *p)
{
__m128i ABCD, E0, E1, MSG0, MSG1, MSG2, MSG3;
const __m128i MASK = _mm_set_epi64x(
0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL);
const __m128i *block = (const __m128i *)p;
/* Load initial values */
ABCD = core[0];
E0 = core[1];
/* Rounds 0-3 */
MSG0 = _mm_loadu_si128(block);
MSG0 = _mm_shuffle_epi8(MSG0, MASK);
E0 = _mm_add_epi32(E0, MSG0);
E1 = ABCD;
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
/* Rounds 4-7 */
MSG1 = _mm_loadu_si128(block + 1);
MSG1 = _mm_shuffle_epi8(MSG1, MASK);
E1 = _mm_sha1nexte_epu32(E1, MSG1);
E0 = ABCD;
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
/* Rounds 8-11 */
MSG2 = _mm_loadu_si128(block + 2);
MSG2 = _mm_shuffle_epi8(MSG2, MASK);
E0 = _mm_sha1nexte_epu32(E0, MSG2);
E1 = ABCD;
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
MSG0 = _mm_xor_si128(MSG0, MSG2);
/* Rounds 12-15 */
MSG3 = _mm_loadu_si128(block + 3);
MSG3 = _mm_shuffle_epi8(MSG3, MASK);
E1 = _mm_sha1nexte_epu32(E1, MSG3);
E0 = ABCD;
MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
MSG1 = _mm_xor_si128(MSG1, MSG3);
/* Rounds 16-19 */
E0 = _mm_sha1nexte_epu32(E0, MSG0);
E1 = ABCD;
MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
MSG2 = _mm_xor_si128(MSG2, MSG0);
/* Rounds 20-23 */
E1 = _mm_sha1nexte_epu32(E1, MSG1);
E0 = ABCD;
MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
MSG3 = _mm_xor_si128(MSG3, MSG1);
/* Rounds 24-27 */
E0 = _mm_sha1nexte_epu32(E0, MSG2);
E1 = ABCD;
MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
MSG0 = _mm_xor_si128(MSG0, MSG2);
/* Rounds 28-31 */
E1 = _mm_sha1nexte_epu32(E1, MSG3);
E0 = ABCD;
MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
MSG1 = _mm_xor_si128(MSG1, MSG3);
/* Rounds 32-35 */
E0 = _mm_sha1nexte_epu32(E0, MSG0);
E1 = ABCD;
MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
MSG2 = _mm_xor_si128(MSG2, MSG0);
/* Rounds 36-39 */
E1 = _mm_sha1nexte_epu32(E1, MSG1);
E0 = ABCD;
MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
MSG3 = _mm_xor_si128(MSG3, MSG1);
/* Rounds 40-43 */
E0 = _mm_sha1nexte_epu32(E0, MSG2);
E1 = ABCD;
MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
MSG0 = _mm_xor_si128(MSG0, MSG2);
/* Rounds 44-47 */
E1 = _mm_sha1nexte_epu32(E1, MSG3);
E0 = ABCD;
MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
MSG1 = _mm_xor_si128(MSG1, MSG3);
/* Rounds 48-51 */
E0 = _mm_sha1nexte_epu32(E0, MSG0);
E1 = ABCD;
MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
MSG2 = _mm_xor_si128(MSG2, MSG0);
/* Rounds 52-55 */
E1 = _mm_sha1nexte_epu32(E1, MSG1);
E0 = ABCD;
MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
MSG3 = _mm_xor_si128(MSG3, MSG1);
/* Rounds 56-59 */
E0 = _mm_sha1nexte_epu32(E0, MSG2);
E1 = ABCD;
MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
MSG0 = _mm_xor_si128(MSG0, MSG2);
/* Rounds 60-63 */
E1 = _mm_sha1nexte_epu32(E1, MSG3);
E0 = ABCD;
MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
MSG1 = _mm_xor_si128(MSG1, MSG3);
/* Rounds 64-67 */
E0 = _mm_sha1nexte_epu32(E0, MSG0);
E1 = ABCD;
MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
MSG2 = _mm_xor_si128(MSG2, MSG0);
/* Rounds 68-71 */
E1 = _mm_sha1nexte_epu32(E1, MSG1);
E0 = ABCD;
MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
MSG3 = _mm_xor_si128(MSG3, MSG1);
/* Rounds 72-75 */
E0 = _mm_sha1nexte_epu32(E0, MSG2);
E1 = ABCD;
MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
/* Rounds 76-79 */
E1 = _mm_sha1nexte_epu32(E1, MSG3);
E0 = ABCD;
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
/* Combine state */
core[0] = _mm_add_epi32(ABCD, core[0]);
core[1] = _mm_sha1nexte_epu32(E0, core[1]);
}
typedef struct sha1_ni {
/*
* core[0] stores the first four words of the SHA-1 state. core[1]
* stores just the fifth word, in the vector lane at the highest
* address.
*/
__m128i core[2];
sha1_block blk;
void *pointer_to_free;
BinarySink_IMPLEMENTATION;
ssh_hash hash;
} sha1_ni;
static void sha1_ni_write(BinarySink *bs, const void *vp, size_t len);
static sha1_ni *sha1_ni_alloc(void)
{
/*
* The __m128i variables in the context structure need to be
* 16-byte aligned, but not all malloc implementations that this
* code has to work with will guarantee to return a 16-byte
* aligned pointer. So we over-allocate, manually realign the
* pointer ourselves, and store the original one inside the
* context so we know how to free it later.
*/
void *allocation = smalloc(sizeof(sha1_ni) + 15);
uintptr_t alloc_address = (uintptr_t)allocation;
uintptr_t aligned_address = (alloc_address + 15) & ~15;
sha1_ni *s = (sha1_ni *)aligned_address;
s->pointer_to_free = allocation;
return s;
}
static ssh_hash *sha1_ni_new(const ssh_hashalg *alg)
{
if (!sha1_hw_available_cached())
return NULL;
sha1_ni *s = sha1_ni_alloc();
s->hash.vt = alg;
BinarySink_INIT(s, sha1_ni_write);
BinarySink_DELEGATE_INIT(&s->hash, s);
return &s->hash;
}
FUNC_ISA static void sha1_ni_reset(ssh_hash *hash)
{
sha1_ni *s = container_of(hash, sha1_ni, hash);
/* Initialise the core vectors in their storage order */
s->core[0] = _mm_set_epi64x(
0x67452301efcdab89ULL, 0x98badcfe10325476ULL);
s->core[1] = _mm_set_epi32(0xc3d2e1f0, 0, 0, 0);
sha1_block_setup(&s->blk);
}
static void sha1_ni_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
sha1_ni *copy = container_of(hcopy, sha1_ni, hash);
sha1_ni *orig = container_of(horig, sha1_ni, hash);
void *ptf_save = copy->pointer_to_free;
*copy = *orig; /* structure copy */
copy->pointer_to_free = ptf_save;
BinarySink_COPIED(copy);
BinarySink_DELEGATE_INIT(&copy->hash, copy);
}
static void sha1_ni_free(ssh_hash *hash)
{
sha1_ni *s = container_of(hash, sha1_ni, hash);
void *ptf = s->pointer_to_free;
smemclr(s, sizeof(*s));
sfree(ptf);
}
static void sha1_ni_write(BinarySink *bs, const void *vp, size_t len)
{
sha1_ni *s = BinarySink_DOWNCAST(bs, sha1_ni);
while (len > 0)
if (sha1_block_write(&s->blk, &vp, &len))
sha1_ni_block(s->core, s->blk.block);
}
FUNC_ISA static void sha1_ni_digest(ssh_hash *hash, uint8_t *digest)
{
sha1_ni *s = container_of(hash, sha1_ni, hash);
sha1_block_pad(&s->blk, BinarySink_UPCAST(s));
/* Rearrange the first vector into its output order */
__m128i abcd = _mm_shuffle_epi32(s->core[0], 0x1B);
/* Byte-swap it into the output endianness */
const __m128i mask = _mm_setr_epi8(3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12);
abcd = _mm_shuffle_epi8(abcd, mask);
/* And store it */
_mm_storeu_si128((__m128i *)digest, abcd);
/* Finally, store the leftover word */
uint32_t e = _mm_extract_epi32(s->core[1], 3);
PUT_32BIT_MSB_FIRST(digest + 16, e);
}
const ssh_hashalg ssh_sha1_hw = {
.new = sha1_ni_new,
.reset = sha1_ni_reset,
.copyfrom = sha1_ni_copyfrom,
.digest = sha1_ni_digest,
.free = sha1_ni_free,
.hlen = 20,
.blocklen = 64,
HASHALG_NAMES_ANNOTATED("SHA-1", "SHA-NI accelerated"),
};
/* ----------------------------------------------------------------------
* Hardware-accelerated implementation of SHA-1 using Arm NEON.
*/
#elif HW_SHA1 == HW_SHA1_NEON
/*
* Manually set the target architecture, if we decided above that we
* need to.
*/
#ifdef USE_CLANG_ATTR_TARGET_AARCH64
/*
* A spot of cheating: redefine some ACLE feature macros before
* including arm_neon.h. Otherwise we won't get the SHA intrinsics
* defined by that header, because it will be looking at the settings
* for the whole translation unit rather than the ones we're going to
* put on some particular functions using __attribute__((target)).
*/
#define __ARM_NEON 1
#define __ARM_FEATURE_CRYPTO 1
#define FUNC_ISA __attribute__ ((target("neon,crypto")))
#endif /* USE_CLANG_ATTR_TARGET_AARCH64 */
#ifndef FUNC_ISA
#define FUNC_ISA
#endif
#ifdef USE_ARM64_NEON_H
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
static bool sha1_hw_available(void)
{
/*
* For Arm, we delegate to a per-platform detection function (see
* explanation in sshaes.c).
*/
return platform_sha1_hw_available();
}
typedef struct sha1_neon_core sha1_neon_core;
struct sha1_neon_core {
uint32x4_t abcd;
uint32_t e;
};
FUNC_ISA
static inline uint32x4_t sha1_neon_load_input(const uint8_t *p)
{
return vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(p)));
}
FUNC_ISA
static inline uint32x4_t sha1_neon_schedule_update(
uint32x4_t m4, uint32x4_t m3, uint32x4_t m2, uint32x4_t m1)
{
return vsha1su1q_u32(vsha1su0q_u32(m4, m3, m2), m1);
}
/*
* SHA-1 has three different kinds of round, differing in whether they
* use the Ch, Maj or Par functions defined above. Each one uses a
* separate NEON instruction, so we define three inline functions for
* the different round types using this macro.
*
* The two batches of Par-type rounds also use a different constant,
* but that's passed in as an operand, so we don't need a fourth
* inline function just for that.
*/
#define SHA1_NEON_ROUND_FN(type) \
FUNC_ISA static inline sha1_neon_core sha1_neon_round4_##type( \
sha1_neon_core old, uint32x4_t sched, uint32x4_t constant) \
{ \
sha1_neon_core new; \
uint32x4_t round_input = vaddq_u32(sched, constant); \
new.abcd = vsha1##type##q_u32(old.abcd, old.e, round_input); \
new.e = vsha1h_u32(vget_lane_u32(vget_low_u32(old.abcd), 0)); \
return new; \
}
SHA1_NEON_ROUND_FN(c)
SHA1_NEON_ROUND_FN(p)
SHA1_NEON_ROUND_FN(m)
FUNC_ISA
static inline void sha1_neon_block(sha1_neon_core *core, const uint8_t *p)
{
uint32x4_t constant, s0, s1, s2, s3;
sha1_neon_core cr = *core;
constant = vdupq_n_u32(SHA1_STAGE0_CONSTANT);
s0 = sha1_neon_load_input(p);
cr = sha1_neon_round4_c(cr, s0, constant);
s1 = sha1_neon_load_input(p + 16);
cr = sha1_neon_round4_c(cr, s1, constant);
s2 = sha1_neon_load_input(p + 32);
cr = sha1_neon_round4_c(cr, s2, constant);
s3 = sha1_neon_load_input(p + 48);
cr = sha1_neon_round4_c(cr, s3, constant);
s0 = sha1_neon_schedule_update(s0, s1, s2, s3);
cr = sha1_neon_round4_c(cr, s0, constant);
constant = vdupq_n_u32(SHA1_STAGE1_CONSTANT);
s1 = sha1_neon_schedule_update(s1, s2, s3, s0);
cr = sha1_neon_round4_p(cr, s1, constant);
s2 = sha1_neon_schedule_update(s2, s3, s0, s1);
cr = sha1_neon_round4_p(cr, s2, constant);
s3 = sha1_neon_schedule_update(s3, s0, s1, s2);
cr = sha1_neon_round4_p(cr, s3, constant);
s0 = sha1_neon_schedule_update(s0, s1, s2, s3);
cr = sha1_neon_round4_p(cr, s0, constant);
s1 = sha1_neon_schedule_update(s1, s2, s3, s0);
cr = sha1_neon_round4_p(cr, s1, constant);
constant = vdupq_n_u32(SHA1_STAGE2_CONSTANT);
s2 = sha1_neon_schedule_update(s2, s3, s0, s1);
cr = sha1_neon_round4_m(cr, s2, constant);
s3 = sha1_neon_schedule_update(s3, s0, s1, s2);
cr = sha1_neon_round4_m(cr, s3, constant);
s0 = sha1_neon_schedule_update(s0, s1, s2, s3);
cr = sha1_neon_round4_m(cr, s0, constant);
s1 = sha1_neon_schedule_update(s1, s2, s3, s0);
cr = sha1_neon_round4_m(cr, s1, constant);
s2 = sha1_neon_schedule_update(s2, s3, s0, s1);
cr = sha1_neon_round4_m(cr, s2, constant);
constant = vdupq_n_u32(SHA1_STAGE3_CONSTANT);
s3 = sha1_neon_schedule_update(s3, s0, s1, s2);
cr = sha1_neon_round4_p(cr, s3, constant);
s0 = sha1_neon_schedule_update(s0, s1, s2, s3);
cr = sha1_neon_round4_p(cr, s0, constant);
s1 = sha1_neon_schedule_update(s1, s2, s3, s0);
cr = sha1_neon_round4_p(cr, s1, constant);
s2 = sha1_neon_schedule_update(s2, s3, s0, s1);
cr = sha1_neon_round4_p(cr, s2, constant);
s3 = sha1_neon_schedule_update(s3, s0, s1, s2);
cr = sha1_neon_round4_p(cr, s3, constant);
core->abcd = vaddq_u32(core->abcd, cr.abcd);
core->e += cr.e;
}
typedef struct sha1_neon {
sha1_neon_core core;
sha1_block blk;
BinarySink_IMPLEMENTATION;
ssh_hash hash;
} sha1_neon;
static void sha1_neon_write(BinarySink *bs, const void *vp, size_t len);
static ssh_hash *sha1_neon_new(const ssh_hashalg *alg)
{
if (!sha1_hw_available_cached())
return NULL;
sha1_neon *s = snew(sha1_neon);
s->hash.vt = alg;
BinarySink_INIT(s, sha1_neon_write);
BinarySink_DELEGATE_INIT(&s->hash, s);
return &s->hash;
}
static void sha1_neon_reset(ssh_hash *hash)
{
sha1_neon *s = container_of(hash, sha1_neon, hash);
s->core.abcd = vld1q_u32(sha1_initial_state);
s->core.e = sha1_initial_state[4];
sha1_block_setup(&s->blk);
}
static void sha1_neon_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
sha1_neon *copy = container_of(hcopy, sha1_neon, hash);
sha1_neon *orig = container_of(horig, sha1_neon, hash);
*copy = *orig; /* structure copy */
BinarySink_COPIED(copy);
BinarySink_DELEGATE_INIT(&copy->hash, copy);
}
static void sha1_neon_free(ssh_hash *hash)
{
sha1_neon *s = container_of(hash, sha1_neon, hash);
smemclr(s, sizeof(*s));
sfree(s);
}
static void sha1_neon_write(BinarySink *bs, const void *vp, size_t len)
{
sha1_neon *s = BinarySink_DOWNCAST(bs, sha1_neon);
while (len > 0)
if (sha1_block_write(&s->blk, &vp, &len))
sha1_neon_block(&s->core, s->blk.block);
}
static void sha1_neon_digest(ssh_hash *hash, uint8_t *digest)
{
sha1_neon *s = container_of(hash, sha1_neon, hash);
sha1_block_pad(&s->blk, BinarySink_UPCAST(s));
vst1q_u8(digest, vrev32q_u8(vreinterpretq_u8_u32(s->core.abcd)));
PUT_32BIT_MSB_FIRST(digest + 16, s->core.e);
}
const ssh_hashalg ssh_sha1_hw = {
.new = sha1_neon_new,
.reset = sha1_neon_reset,
.copyfrom = sha1_neon_copyfrom,
.digest = sha1_neon_digest,
.free = sha1_neon_free,
.hlen = 20,
.blocklen = 64,
HASHALG_NAMES_ANNOTATED("SHA-1", "NEON accelerated"),
};
/* ----------------------------------------------------------------------
* Stub functions if we have no hardware-accelerated SHA-1. In this
* case, sha1_hw_new returns NULL (though it should also never be
* selected by sha1_select, so the only thing that should even be
* _able_ to call it is testcrypt). As a result, the remaining vtable
* functions should never be called at all.
*/
#elif HW_SHA1 == HW_SHA1_NONE
static bool sha1_hw_available(void)
{
return false;
}
static ssh_hash *sha1_stub_new(const ssh_hashalg *alg)
{
return NULL;
}
#define STUB_BODY { unreachable("Should never be called"); }
static void sha1_stub_reset(ssh_hash *hash) STUB_BODY
static void sha1_stub_copyfrom(ssh_hash *hash, ssh_hash *orig) STUB_BODY
static void sha1_stub_free(ssh_hash *hash) STUB_BODY
static void sha1_stub_digest(ssh_hash *hash, uint8_t *digest) STUB_BODY
const ssh_hashalg ssh_sha1_hw = {
.new = sha1_stub_new,
.reset = sha1_stub_reset,
.copyfrom = sha1_stub_copyfrom,
.digest = sha1_stub_digest,
.free = sha1_stub_free,
.hlen = 20,
.blocklen = 64,
HASHALG_NAMES_ANNOTATED("SHA-1", "!NONEXISTENT ACCELERATED VERSION!"),
};
#endif /* HW_SHA1 */

109
crypto/sha1.h Normal file
View File

@ -0,0 +1,109 @@
/*
* Definitions likely to be helpful to multiple SHA-1 implementations.
*/
/*
* The 'extra' structure used by SHA-1 implementations is used to
* include information about how to check if a given implementation is
* available at run time, and whether we've already checked.
*/
struct sha1_extra_mutable;
struct sha1_extra {
/* Function to check availability. Might be expensive, so we don't
* want to call it more than once. */
bool (*check_available)(void);
/* Point to a writable substructure. */
struct sha1_extra_mutable *mut;
};
struct sha1_extra_mutable {
bool checked_availability;
bool is_available;
};
static inline bool check_availability(const struct sha1_extra *extra)
{
if (!extra->mut->checked_availability) {
extra->mut->is_available = extra->check_available();
extra->mut->checked_availability = true;
}
return extra->mut->is_available;
}
/*
* Macro to define a SHA-1 vtable together with its 'extra'
* structure.
*/
#define SHA1_VTABLE(impl_c, impl_display) \
static struct sha1_extra_mutable sha1_ ## impl_c ## _extra_mut; \
static const struct sha1_extra sha1_ ## impl_c ## _extra = { \
.check_available = sha1_ ## impl_c ## _available, \
.mut = &sha1_ ## impl_c ## _extra_mut, \
}; \
const ssh_hashalg ssh_sha1_ ## impl_c = { \
.new = sha1_ ## impl_c ## _new, \
.reset = sha1_ ## impl_c ## _reset, \
.copyfrom = sha1_ ## impl_c ## _copyfrom, \
.digest = sha1_ ## impl_c ## _digest, \
.free = sha1_ ## impl_c ## _free, \
.hlen = 20, \
.blocklen = 64, \
HASHALG_NAMES_ANNOTATED("SHA-1", impl_display), \
.extra = &sha1_ ## impl_c ## _extra, \
}
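/*
 * A minimal sketch of how an implementation file is expected to use
 * this macro, mirroring sha1-sw.c and sha1-neon.c in this commit: the
 * file defines sha1_foo_available, sha1_foo_new, sha1_foo_reset,
 * sha1_foo_copyfrom, sha1_foo_digest and sha1_foo_free ('foo' being a
 * hypothetical implementation name), then ends with a single
 *
 *     SHA1_VTABLE(foo, "foo accelerated");
 *
 * which defines both ssh_sha1_foo and the sha1_foo_extra record that
 * sha1-select.c consults via check_availability().
 */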
extern const uint32_t sha1_initial_state[5];
#define SHA1_ROUNDS_PER_STAGE 20
#define SHA1_STAGE0_CONSTANT 0x5a827999
#define SHA1_STAGE1_CONSTANT 0x6ed9eba1
#define SHA1_STAGE2_CONSTANT 0x8f1bbcdc
#define SHA1_STAGE3_CONSTANT 0xca62c1d6
#define SHA1_ROUNDS (4 * SHA1_ROUNDS_PER_STAGE)
typedef struct sha1_block sha1_block;
struct sha1_block {
uint8_t block[64];
size_t used;
uint64_t len;
};
static inline void sha1_block_setup(sha1_block *blk)
{
blk->used = 0;
blk->len = 0;
}
static inline bool sha1_block_write(
sha1_block *blk, const void **vdata, size_t *len)
{
size_t blkleft = sizeof(blk->block) - blk->used;
size_t chunk = *len < blkleft ? *len : blkleft;
const uint8_t *p = *vdata;
memcpy(blk->block + blk->used, p, chunk);
*vdata = p + chunk;
*len -= chunk;
blk->used += chunk;
blk->len += chunk;
if (blk->used == sizeof(blk->block)) {
blk->used = 0;
return true;
}
return false;
}
static inline void sha1_block_pad(sha1_block *blk, BinarySink *bs)
{
uint64_t final_len = blk->len << 3;
size_t pad = 1 + (63 & (55 - blk->used));
put_byte(bs, 0x80);
for (size_t i = 1; i < pad; i++)
put_byte(bs, 0);
put_uint64(bs, final_len);
assert(blk->used == 0 && "Should have exactly hit a block boundary");
}
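/*
 * Worked example of the padding arithmetic above, assuming 3 bytes
 * have been written so far (blk->used == 3, blk->len == 3):
 *
 *     pad = 1 + (63 & (55 - 3)) = 1 + 52 = 53
 *
 * so sha1_block_pad emits the 0x80 byte, 52 zero bytes and the 8-byte
 * bit length (24), i.e. 3 + 53 + 8 = 64 bytes in total: exactly one
 * block, which is why the assertion that blk->used has returned to
 * zero holds.
 */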

30
crypto/sha256-common.c Normal file
View File

@ -0,0 +1,30 @@
/*
* Common variable definitions across all the SHA-256 implementations.
*/
#include "ssh.h"
#include "sha256.h"
const uint32_t sha256_initial_state[8] = {
0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
};
const uint32_t sha256_round_constants[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
};

162
crypto/sha256-neon.c Normal file
View File

@ -0,0 +1,162 @@
/*
* Hardware-accelerated implementation of SHA-256 using Arm NEON.
*/
#include "ssh.h"
#include "sha256.h"
#if USE_ARM64_NEON_H
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
static bool sha256_neon_available(void)
{
/*
* For Arm, we delegate to a per-platform detection function (see
* explanation in aes-neon.c).
*/
return platform_sha256_neon_available();
}
typedef struct sha256_neon_core sha256_neon_core;
struct sha256_neon_core {
uint32x4_t abcd, efgh;
};
static inline uint32x4_t sha256_neon_load_input(const uint8_t *p)
{
return vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(p)));
}
static inline uint32x4_t sha256_neon_schedule_update(
uint32x4_t m4, uint32x4_t m3, uint32x4_t m2, uint32x4_t m1)
{
return vsha256su1q_u32(vsha256su0q_u32(m4, m3), m2, m1);
}
static inline sha256_neon_core sha256_neon_round4(
sha256_neon_core old, uint32x4_t sched, unsigned round)
{
sha256_neon_core new;
uint32x4_t round_input = vaddq_u32(
sched, vld1q_u32(sha256_round_constants + round));
new.abcd = vsha256hq_u32 (old.abcd, old.efgh, round_input);
new.efgh = vsha256h2q_u32(old.efgh, old.abcd, round_input);
return new;
}
static inline void sha256_neon_block(sha256_neon_core *core, const uint8_t *p)
{
uint32x4_t s0, s1, s2, s3;
sha256_neon_core cr = *core;
s0 = sha256_neon_load_input(p);
cr = sha256_neon_round4(cr, s0, 0);
s1 = sha256_neon_load_input(p+16);
cr = sha256_neon_round4(cr, s1, 4);
s2 = sha256_neon_load_input(p+32);
cr = sha256_neon_round4(cr, s2, 8);
s3 = sha256_neon_load_input(p+48);
cr = sha256_neon_round4(cr, s3, 12);
s0 = sha256_neon_schedule_update(s0, s1, s2, s3);
cr = sha256_neon_round4(cr, s0, 16);
s1 = sha256_neon_schedule_update(s1, s2, s3, s0);
cr = sha256_neon_round4(cr, s1, 20);
s2 = sha256_neon_schedule_update(s2, s3, s0, s1);
cr = sha256_neon_round4(cr, s2, 24);
s3 = sha256_neon_schedule_update(s3, s0, s1, s2);
cr = sha256_neon_round4(cr, s3, 28);
s0 = sha256_neon_schedule_update(s0, s1, s2, s3);
cr = sha256_neon_round4(cr, s0, 32);
s1 = sha256_neon_schedule_update(s1, s2, s3, s0);
cr = sha256_neon_round4(cr, s1, 36);
s2 = sha256_neon_schedule_update(s2, s3, s0, s1);
cr = sha256_neon_round4(cr, s2, 40);
s3 = sha256_neon_schedule_update(s3, s0, s1, s2);
cr = sha256_neon_round4(cr, s3, 44);
s0 = sha256_neon_schedule_update(s0, s1, s2, s3);
cr = sha256_neon_round4(cr, s0, 48);
s1 = sha256_neon_schedule_update(s1, s2, s3, s0);
cr = sha256_neon_round4(cr, s1, 52);
s2 = sha256_neon_schedule_update(s2, s3, s0, s1);
cr = sha256_neon_round4(cr, s2, 56);
s3 = sha256_neon_schedule_update(s3, s0, s1, s2);
cr = sha256_neon_round4(cr, s3, 60);
core->abcd = vaddq_u32(core->abcd, cr.abcd);
core->efgh = vaddq_u32(core->efgh, cr.efgh);
}
typedef struct sha256_neon {
sha256_neon_core core;
sha256_block blk;
BinarySink_IMPLEMENTATION;
ssh_hash hash;
} sha256_neon;
static void sha256_neon_write(BinarySink *bs, const void *vp, size_t len);
static ssh_hash *sha256_neon_new(const ssh_hashalg *alg)
{
const struct sha256_extra *extra = (const struct sha256_extra *)alg->extra;
if (!check_availability(extra))
return NULL;
sha256_neon *s = snew(sha256_neon);
s->hash.vt = alg;
BinarySink_INIT(s, sha256_neon_write);
BinarySink_DELEGATE_INIT(&s->hash, s);
return &s->hash;
}
static void sha256_neon_reset(ssh_hash *hash)
{
sha256_neon *s = container_of(hash, sha256_neon, hash);
s->core.abcd = vld1q_u32(sha256_initial_state);
s->core.efgh = vld1q_u32(sha256_initial_state + 4);
sha256_block_setup(&s->blk);
}
static void sha256_neon_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
sha256_neon *copy = container_of(hcopy, sha256_neon, hash);
sha256_neon *orig = container_of(horig, sha256_neon, hash);
*copy = *orig; /* structure copy */
BinarySink_COPIED(copy);
BinarySink_DELEGATE_INIT(&copy->hash, copy);
}
static void sha256_neon_free(ssh_hash *hash)
{
sha256_neon *s = container_of(hash, sha256_neon, hash);
smemclr(s, sizeof(*s));
sfree(s);
}
static void sha256_neon_write(BinarySink *bs, const void *vp, size_t len)
{
sha256_neon *s = BinarySink_DOWNCAST(bs, sha256_neon);
while (len > 0)
if (sha256_block_write(&s->blk, &vp, &len))
sha256_neon_block(&s->core, s->blk.block);
}
static void sha256_neon_digest(ssh_hash *hash, uint8_t *digest)
{
sha256_neon *s = container_of(hash, sha256_neon, hash);
sha256_block_pad(&s->blk, BinarySink_UPCAST(s));
vst1q_u8(digest, vrev32q_u8(vreinterpretq_u8_u32(s->core.abcd)));
vst1q_u8(digest + 16, vrev32q_u8(vreinterpretq_u8_u32(s->core.efgh)));
}
SHA256_VTABLE(neon, "NEON accelerated");

342
crypto/sha256-ni.c Normal file
View File

@ -0,0 +1,342 @@
/*
* Hardware-accelerated implementation of SHA-256 using x86 SHA-NI.
*/
#include "ssh.h"
#include "sha256.h"
#include <wmmintrin.h>
#include <smmintrin.h>
#include <immintrin.h>
#if HAVE_SHAINTRIN_H
#include <shaintrin.h>
#endif
#if defined(__clang__) || defined(__GNUC__)
#include <cpuid.h>
#define GET_CPU_ID_0(out) \
__cpuid(0, (out)[0], (out)[1], (out)[2], (out)[3])
#define GET_CPU_ID_7(out) \
__cpuid_count(7, 0, (out)[0], (out)[1], (out)[2], (out)[3])
#else
#define GET_CPU_ID_0(out) __cpuid(out, 0)
#define GET_CPU_ID_7(out) __cpuidex(out, 7, 0)
#endif
static bool sha256_ni_available(void)
{
unsigned int CPUInfo[4];
GET_CPU_ID_0(CPUInfo);
if (CPUInfo[0] < 7)
return false;
GET_CPU_ID_7(CPUInfo);
return CPUInfo[1] & (1 << 29); /* Check SHA */
}
/* SHA256 implementation using new instructions
The code is based on Jeffrey Walton's SHA256 implementation:
https://github.com/noloader/SHA-Intrinsics
*/
static inline void sha256_ni_block(__m128i *core, const uint8_t *p)
{
__m128i STATE0, STATE1;
__m128i MSG, TMP;
__m128i MSG0, MSG1, MSG2, MSG3;
const __m128i *block = (const __m128i *)p;
const __m128i MASK = _mm_set_epi64x(
0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
/* Load initial values */
STATE0 = core[0];
STATE1 = core[1];
/* Rounds 0-3 */
MSG = _mm_loadu_si128(block);
MSG0 = _mm_shuffle_epi8(MSG, MASK);
MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(
0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
/* Rounds 4-7 */
MSG1 = _mm_loadu_si128(block + 1);
MSG1 = _mm_shuffle_epi8(MSG1, MASK);
MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(
0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);
/* Rounds 8-11 */
MSG2 = _mm_loadu_si128(block + 2);
MSG2 = _mm_shuffle_epi8(MSG2, MASK);
MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(
0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);
/* Rounds 12-15 */
MSG3 = _mm_loadu_si128(block + 3);
MSG3 = _mm_shuffle_epi8(MSG3, MASK);
MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(
0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
MSG0 = _mm_add_epi32(MSG0, TMP);
MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);
/* Rounds 16-19 */
MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(
0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
MSG1 = _mm_add_epi32(MSG1, TMP);
MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);
/* Rounds 20-23 */
MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(
0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
MSG2 = _mm_add_epi32(MSG2, TMP);
MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);
/* Rounds 24-27 */
MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(
0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
MSG3 = _mm_add_epi32(MSG3, TMP);
MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);
/* Rounds 28-31 */
MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(
0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
MSG0 = _mm_add_epi32(MSG0, TMP);
MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);
/* Rounds 32-35 */
MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(
0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
MSG1 = _mm_add_epi32(MSG1, TMP);
MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);
/* Rounds 36-39 */
MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(
0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
MSG2 = _mm_add_epi32(MSG2, TMP);
MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);
/* Rounds 40-43 */
MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(
0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
MSG3 = _mm_add_epi32(MSG3, TMP);
MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);
/* Rounds 44-47 */
MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(
0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
MSG0 = _mm_add_epi32(MSG0, TMP);
MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);
/* Rounds 48-51 */
MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(
0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
MSG1 = _mm_add_epi32(MSG1, TMP);
MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);
/* Rounds 52-55 */
MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(
0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
MSG2 = _mm_add_epi32(MSG2, TMP);
MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
/* Rounds 56-59 */
MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(
0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
MSG3 = _mm_add_epi32(MSG3, TMP);
MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
/* Rounds 60-63 */
MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(
0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
/* Combine state */
core[0] = _mm_add_epi32(STATE0, core[0]);
core[1] = _mm_add_epi32(STATE1, core[1]);
}
typedef struct sha256_ni {
/*
* These two vectors store the 8 words of the SHA-256 state, but
* not in the same order they appear in the spec: the first word
* holds A,B,E,F and the second word C,D,G,H.
*/
__m128i core[2];
sha256_block blk;
void *pointer_to_free;
BinarySink_IMPLEMENTATION;
ssh_hash hash;
} sha256_ni;
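/*
 * Concretely, after sha256_ni_reset below loads the standard initial
 * hash values, the lanes (listed high to low) are
 *
 *     core[0] = { A=6a09e667, B=bb67ae85, E=510e527f, F=9b05688c }
 *     core[1] = { C=3c6ef372, D=a54ff53a, G=1f83d9ab, H=5be0cd19 }
 *
 * which is the ABEF/CDGH operand layout the SHA-NI round instructions
 * work on.
 */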
static void sha256_ni_write(BinarySink *bs, const void *vp, size_t len);
static sha256_ni *sha256_ni_alloc(void)
{
/*
* The __m128i variables in the context structure need to be
* 16-byte aligned, but not all malloc implementations that this
* code has to work with will guarantee to return a 16-byte
* aligned pointer. So we over-allocate, manually realign the
* pointer ourselves, and store the original one inside the
* context so we know how to free it later.
*/
void *allocation = smalloc(sizeof(sha256_ni) + 15);
uintptr_t alloc_address = (uintptr_t)allocation;
uintptr_t aligned_address = (alloc_address + 15) & ~15;
sha256_ni *s = (sha256_ni *)aligned_address;
s->pointer_to_free = allocation;
return s;
}
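/*
 * Worked example: if smalloc() happened to return 0x100009, then
 * (0x100009 + 15) & ~15 == 0x100010, so the context starts 7 bytes
 * into the over-allocated region, and the saved pointer 0x100009 is
 * what sha256_ni_free eventually hands back to sfree().
 */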
static ssh_hash *sha256_ni_new(const ssh_hashalg *alg)
{
const struct sha256_extra *extra = (const struct sha256_extra *)alg->extra;
if (!check_availability(extra))
return NULL;
sha256_ni *s = sha256_ni_alloc();
s->hash.vt = alg;
BinarySink_INIT(s, sha256_ni_write);
BinarySink_DELEGATE_INIT(&s->hash, s);
return &s->hash;
}
static void sha256_ni_reset(ssh_hash *hash)
{
sha256_ni *s = container_of(hash, sha256_ni, hash);
/* Initialise the core vectors in their storage order */
s->core[0] = _mm_set_epi64x(
0x6a09e667bb67ae85ULL, 0x510e527f9b05688cULL);
s->core[1] = _mm_set_epi64x(
0x3c6ef372a54ff53aULL, 0x1f83d9ab5be0cd19ULL);
sha256_block_setup(&s->blk);
}
static void sha256_ni_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
sha256_ni *copy = container_of(hcopy, sha256_ni, hash);
sha256_ni *orig = container_of(horig, sha256_ni, hash);
void *ptf_save = copy->pointer_to_free;
*copy = *orig; /* structure copy */
copy->pointer_to_free = ptf_save;
BinarySink_COPIED(copy);
BinarySink_DELEGATE_INIT(&copy->hash, copy);
}
static void sha256_ni_free(ssh_hash *hash)
{
sha256_ni *s = container_of(hash, sha256_ni, hash);
void *ptf = s->pointer_to_free;
smemclr(s, sizeof(*s));
sfree(ptf);
}
static void sha256_ni_write(BinarySink *bs, const void *vp, size_t len)
{
sha256_ni *s = BinarySink_DOWNCAST(bs, sha256_ni);
while (len > 0)
if (sha256_block_write(&s->blk, &vp, &len))
sha256_ni_block(s->core, s->blk.block);
}
static void sha256_ni_digest(ssh_hash *hash, uint8_t *digest)
{
sha256_ni *s = container_of(hash, sha256_ni, hash);
sha256_block_pad(&s->blk, BinarySink_UPCAST(s));
/* Rearrange the words into the output order */
__m128i feba = _mm_shuffle_epi32(s->core[0], 0x1B);
__m128i dchg = _mm_shuffle_epi32(s->core[1], 0xB1);
__m128i dcba = _mm_blend_epi16(feba, dchg, 0xF0);
__m128i hgfe = _mm_alignr_epi8(dchg, feba, 8);
/* Byte-swap them into the output endianness */
const __m128i mask = _mm_setr_epi8(3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12);
dcba = _mm_shuffle_epi8(dcba, mask);
hgfe = _mm_shuffle_epi8(hgfe, mask);
/* And store them */
__m128i *output = (__m128i *)digest;
_mm_storeu_si128(output, dcba);
_mm_storeu_si128(output+1, hgfe);
}
SHA256_VTABLE(ni, "SHA-NI accelerated");

44
crypto/sha256-select.c Normal file
View File

@ -0,0 +1,44 @@
/*
* Top-level vtables to select a SHA-256 implementation.
*/
#include <assert.h>
#include <stdlib.h>
#include "putty.h"
#include "ssh.h"
#include "sha256.h"
static ssh_hash *sha256_select(const ssh_hashalg *alg)
{
static const ssh_hashalg *const real_algs[] = {
#if HAVE_SHA_NI
&ssh_sha256_ni,
#endif
#if HAVE_NEON_CRYPTO
&ssh_sha256_neon,
#endif
&ssh_sha256_sw,
NULL,
};
for (size_t i = 0; real_algs[i]; i++) {
const ssh_hashalg *alg = real_algs[i];
const struct sha256_extra *alg_extra =
(const struct sha256_extra *)alg->extra;
if (check_availability(alg_extra))
return ssh_hash_new(alg);
}
/* We should never reach the NULL at the end of the list, because
* the last non-NULL entry should be software-only SHA-256, which
* is always available. */
unreachable("sha256_select ran off the end of its list");
}
const ssh_hashalg ssh_sha256 = {
.new = sha256_select,
.hlen = 32,
.blocklen = 64,
HASHALG_NAMES_ANNOTATED("SHA-256", "dummy selector vtable"),
};
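/*
 * Callers elsewhere in the code base are expected to reach SHA-256
 * through this selector rather than naming an implementation
 * directly. A minimal sketch, assuming the usual ssh_hash and
 * BinarySink helpers declared in ssh.h (ssh_hash_new, put_data,
 * ssh_hash_final):
 *
 *     ssh_hash *h = ssh_hash_new(&ssh_sha256);
 *     put_data(h, data, len);
 *     ssh_hash_final(h, digest);
 *
 * The first line runs sha256_select, which picks whichever of the
 * SHA-NI, NEON or software vtables is usable on the current machine,
 * and 'digest' receives the 32-byte hash. Only testcrypt and this
 * selector should normally need to name ssh_sha256_ni,
 * ssh_sha256_neon or ssh_sha256_sw directly.
 */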

157
crypto/sha256-sw.c Normal file
View File

@ -0,0 +1,157 @@
/*
* Software implementation of SHA-256.
*/
#include "ssh.h"
#include "sha256.h"
static bool sha256_sw_available(void)
{
/* Software SHA-256 is always available */
return true;
}
static inline uint32_t ror(uint32_t x, unsigned y)
{
return (x << (31 & -y)) | (x >> (31 & y));
}
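/*
 * The masking keeps both shift counts in the range 0..31 whatever y
 * is, so there is no undefined behaviour: ror(x, 8) computes
 * (x << 24) | (x >> 8), and ror(x, 0) degenerates harmlessly to
 * (x << 0) | (x >> 0) == x.
 */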
static inline uint32_t Ch(uint32_t ctrl, uint32_t if1, uint32_t if0)
{
return if0 ^ (ctrl & (if1 ^ if0));
}
static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z)
{
return (x & y) | (z & (x | y));
}
static inline uint32_t Sigma_0(uint32_t x)
{
return ror(x,2) ^ ror(x,13) ^ ror(x,22);
}
static inline uint32_t Sigma_1(uint32_t x)
{
return ror(x,6) ^ ror(x,11) ^ ror(x,25);
}
static inline uint32_t sigma_0(uint32_t x)
{
return ror(x,7) ^ ror(x,18) ^ (x >> 3);
}
static inline uint32_t sigma_1(uint32_t x)
{
return ror(x,17) ^ ror(x,19) ^ (x >> 10);
}
static inline void sha256_sw_round(
unsigned round_index, const uint32_t *schedule,
uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
uint32_t *e, uint32_t *f, uint32_t *g, uint32_t *h)
{
uint32_t t1 = *h + Sigma_1(*e) + Ch(*e,*f,*g) +
sha256_round_constants[round_index] + schedule[round_index];
uint32_t t2 = Sigma_0(*a) + Maj(*a,*b,*c);
*d += t1;
*h = t1 + t2;
}
static void sha256_sw_block(uint32_t *core, const uint8_t *block)
{
uint32_t w[SHA256_ROUNDS];
uint32_t a,b,c,d,e,f,g,h;
for (size_t t = 0; t < 16; t++)
w[t] = GET_32BIT_MSB_FIRST(block + 4*t);
for (size_t t = 16; t < SHA256_ROUNDS; t++)
w[t] = sigma_1(w[t-2]) + w[t-7] + sigma_0(w[t-15]) + w[t-16];
a = core[0]; b = core[1]; c = core[2]; d = core[3];
e = core[4]; f = core[5]; g = core[6]; h = core[7];
for (size_t t = 0; t < SHA256_ROUNDS; t += 8) {
sha256_sw_round(t+0, w, &a,&b,&c,&d,&e,&f,&g,&h);
sha256_sw_round(t+1, w, &h,&a,&b,&c,&d,&e,&f,&g);
sha256_sw_round(t+2, w, &g,&h,&a,&b,&c,&d,&e,&f);
sha256_sw_round(t+3, w, &f,&g,&h,&a,&b,&c,&d,&e);
sha256_sw_round(t+4, w, &e,&f,&g,&h,&a,&b,&c,&d);
sha256_sw_round(t+5, w, &d,&e,&f,&g,&h,&a,&b,&c);
sha256_sw_round(t+6, w, &c,&d,&e,&f,&g,&h,&a,&b);
sha256_sw_round(t+7, w, &b,&c,&d,&e,&f,&g,&h,&a);
}
core[0] += a; core[1] += b; core[2] += c; core[3] += d;
core[4] += e; core[5] += f; core[6] += g; core[7] += h;
smemclr(w, sizeof(w));
}
typedef struct sha256_sw {
uint32_t core[8];
sha256_block blk;
BinarySink_IMPLEMENTATION;
ssh_hash hash;
} sha256_sw;
static void sha256_sw_write(BinarySink *bs, const void *vp, size_t len);
static ssh_hash *sha256_sw_new(const ssh_hashalg *alg)
{
sha256_sw *s = snew(sha256_sw);
s->hash.vt = alg;
BinarySink_INIT(s, sha256_sw_write);
BinarySink_DELEGATE_INIT(&s->hash, s);
return &s->hash;
}
static void sha256_sw_reset(ssh_hash *hash)
{
sha256_sw *s = container_of(hash, sha256_sw, hash);
memcpy(s->core, sha256_initial_state, sizeof(s->core));
sha256_block_setup(&s->blk);
}
static void sha256_sw_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
sha256_sw *copy = container_of(hcopy, sha256_sw, hash);
sha256_sw *orig = container_of(horig, sha256_sw, hash);
memcpy(copy, orig, sizeof(*copy));
BinarySink_COPIED(copy);
BinarySink_DELEGATE_INIT(&copy->hash, copy);
}
static void sha256_sw_free(ssh_hash *hash)
{
sha256_sw *s = container_of(hash, sha256_sw, hash);
smemclr(s, sizeof(*s));
sfree(s);
}
static void sha256_sw_write(BinarySink *bs, const void *vp, size_t len)
{
sha256_sw *s = BinarySink_DOWNCAST(bs, sha256_sw);
while (len > 0)
if (sha256_block_write(&s->blk, &vp, &len))
sha256_sw_block(s->core, s->blk.block);
}
static void sha256_sw_digest(ssh_hash *hash, uint8_t *digest)
{
sha256_sw *s = container_of(hash, sha256_sw, hash);
sha256_block_pad(&s->blk, BinarySink_UPCAST(s));
for (size_t i = 0; i < 8; i++)
PUT_32BIT_MSB_FIRST(digest + 4*i, s->core[i]);
}
SHA256_VTABLE(sw, "unaccelerated");
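/*
 * A quick spot-check for this (or either accelerated) implementation
 * is the FIPS 180-4 single-block test vector, SHA-256("abc") =
 * ba7816bf 8f01cfea 414140de 5dae2223 b00361a3 96177a9c b410ff61
 * f20015ad. A minimal sketch, assuming the ssh_hash helpers from
 * ssh.h:
 *
 *     unsigned char digest[32];
 *     ssh_hash *h = ssh_hash_new(&ssh_sha256_sw);
 *     put_data(h, "abc", 3);
 *     ssh_hash_final(h, digest);
 */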

crypto/sha256.c
View File

@ -1,939 +0,0 @@
/*
* SHA-256 algorithm as described at
*
* http://csrc.nist.gov/cryptval/shs.html
*/
#include "ssh.h"
#include <assert.h>
/*
* Start by deciding whether we can support hardware SHA at all.
*/
#define HW_SHA256_NONE 0
#define HW_SHA256_NI 1
#define HW_SHA256_NEON 2
#ifdef _FORCE_SHA_NI
# define HW_SHA256 HW_SHA256_NI
#elif defined(__clang__)
# if __has_attribute(target) && __has_include(<wmmintrin.h>) && \
(defined(__x86_64__) || defined(__i386))
# define HW_SHA256 HW_SHA256_NI
# endif
#elif defined(__GNUC__)
# if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)) && \
(defined(__x86_64__) || defined(__i386))
# define HW_SHA256 HW_SHA256_NI
# endif
#elif defined (_MSC_VER)
# if (defined(_M_X64) || defined(_M_IX86)) && _MSC_FULL_VER >= 150030729
# define HW_SHA256 HW_SHA256_NI
# endif
#endif
#ifdef _FORCE_SHA_NEON
# define HW_SHA256 HW_SHA256_NEON
#elif defined __BYTE_ORDER__ && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
/* Arm can potentially support both endiannesses, but this code
* hasn't been tested on anything but little. If anyone wants to
* run big-endian, they'll need to fix it first. */
#elif defined __ARM_FEATURE_CRYPTO
/* If the Arm crypto extension is available already, we can
* support NEON SHA without having to enable anything by hand */
# define HW_SHA256 HW_SHA256_NEON
#elif defined(__clang__)
# if __has_attribute(target) && __has_include(<arm_neon.h>) && \
(defined(__aarch64__))
/* clang can enable the crypto extension in AArch64 using
* __attribute__((target)) */
# define HW_SHA256 HW_SHA256_NEON
# define USE_CLANG_ATTR_TARGET_AARCH64
# endif
#elif defined _MSC_VER
/* Visual Studio supports the crypto extension when targeting
* AArch64, but as of VS2017, the AArch32 header doesn't quite
* manage it (declaring the shae/shad intrinsics without a round
* key operand). */
# if defined _M_ARM64
# define HW_SHA256 HW_SHA256_NEON
# if defined _M_ARM64
# define USE_ARM64_NEON_H /* unusual header name in this case */
# endif
# endif
#endif
#if defined _FORCE_SOFTWARE_SHA || !defined HW_SHA256
# undef HW_SHA256
# define HW_SHA256 HW_SHA256_NONE
#endif
/*
* The actual query function that asks if hardware acceleration is
* available.
*/
static bool sha256_hw_available(void);
/*
* The top-level selection function, caching the results of
* sha256_hw_available() so it only has to run once.
*/
static bool sha256_hw_available_cached(void)
{
static bool initialised = false;
static bool hw_available;
if (!initialised) {
hw_available = sha256_hw_available();
initialised = true;
}
return hw_available;
}
static ssh_hash *sha256_select(const ssh_hashalg *alg)
{
const ssh_hashalg *real_alg =
sha256_hw_available_cached() ? &ssh_sha256_hw : &ssh_sha256_sw;
return ssh_hash_new(real_alg);
}
const ssh_hashalg ssh_sha256 = {
.new = sha256_select,
.hlen = 32,
.blocklen = 64,
HASHALG_NAMES_ANNOTATED("SHA-256", "dummy selector vtable"),
};
/* ----------------------------------------------------------------------
* Definitions likely to be helpful to multiple implementations.
*/
static const uint32_t sha256_initial_state[] = {
0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
};
static const uint32_t sha256_round_constants[] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
};
#define SHA256_ROUNDS 64
typedef struct sha256_block sha256_block;
struct sha256_block {
uint8_t block[64];
size_t used;
uint64_t len;
};
static inline void sha256_block_setup(sha256_block *blk)
{
blk->used = 0;
blk->len = 0;
}
static inline bool sha256_block_write(
sha256_block *blk, const void **vdata, size_t *len)
{
size_t blkleft = sizeof(blk->block) - blk->used;
size_t chunk = *len < blkleft ? *len : blkleft;
const uint8_t *p = *vdata;
memcpy(blk->block + blk->used, p, chunk);
*vdata = p + chunk;
*len -= chunk;
blk->used += chunk;
blk->len += chunk;
if (blk->used == sizeof(blk->block)) {
blk->used = 0;
return true;
}
return false;
}
static inline void sha256_block_pad(sha256_block *blk, BinarySink *bs)
{
uint64_t final_len = blk->len << 3;
size_t pad = 1 + (63 & (55 - blk->used));
put_byte(bs, 0x80);
for (size_t i = 1; i < pad; i++)
put_byte(bs, 0);
put_uint64(bs, final_len);
assert(blk->used == 0 && "Should have exactly hit a block boundary");
}
/* ----------------------------------------------------------------------
* Software implementation of SHA-256.
*/
static inline uint32_t ror(uint32_t x, unsigned y)
{
return (x << (31 & -y)) | (x >> (31 & y));
}
static inline uint32_t Ch(uint32_t ctrl, uint32_t if1, uint32_t if0)
{
return if0 ^ (ctrl & (if1 ^ if0));
}
static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z)
{
return (x & y) | (z & (x | y));
}
static inline uint32_t Sigma_0(uint32_t x)
{
return ror(x,2) ^ ror(x,13) ^ ror(x,22);
}
static inline uint32_t Sigma_1(uint32_t x)
{
return ror(x,6) ^ ror(x,11) ^ ror(x,25);
}
static inline uint32_t sigma_0(uint32_t x)
{
return ror(x,7) ^ ror(x,18) ^ (x >> 3);
}
static inline uint32_t sigma_1(uint32_t x)
{
return ror(x,17) ^ ror(x,19) ^ (x >> 10);
}
static inline void sha256_sw_round(
unsigned round_index, const uint32_t *schedule,
uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
uint32_t *e, uint32_t *f, uint32_t *g, uint32_t *h)
{
uint32_t t1 = *h + Sigma_1(*e) + Ch(*e,*f,*g) +
sha256_round_constants[round_index] + schedule[round_index];
uint32_t t2 = Sigma_0(*a) + Maj(*a,*b,*c);
*d += t1;
*h = t1 + t2;
}
static void sha256_sw_block(uint32_t *core, const uint8_t *block)
{
uint32_t w[SHA256_ROUNDS];
uint32_t a,b,c,d,e,f,g,h;
for (size_t t = 0; t < 16; t++)
w[t] = GET_32BIT_MSB_FIRST(block + 4*t);
for (size_t t = 16; t < SHA256_ROUNDS; t++)
w[t] = sigma_1(w[t-2]) + w[t-7] + sigma_0(w[t-15]) + w[t-16];
a = core[0]; b = core[1]; c = core[2]; d = core[3];
e = core[4]; f = core[5]; g = core[6]; h = core[7];
for (size_t t = 0; t < SHA256_ROUNDS; t += 8) {
sha256_sw_round(t+0, w, &a,&b,&c,&d,&e,&f,&g,&h);
sha256_sw_round(t+1, w, &h,&a,&b,&c,&d,&e,&f,&g);
sha256_sw_round(t+2, w, &g,&h,&a,&b,&c,&d,&e,&f);
sha256_sw_round(t+3, w, &f,&g,&h,&a,&b,&c,&d,&e);
sha256_sw_round(t+4, w, &e,&f,&g,&h,&a,&b,&c,&d);
sha256_sw_round(t+5, w, &d,&e,&f,&g,&h,&a,&b,&c);
sha256_sw_round(t+6, w, &c,&d,&e,&f,&g,&h,&a,&b);
sha256_sw_round(t+7, w, &b,&c,&d,&e,&f,&g,&h,&a);
}
core[0] += a; core[1] += b; core[2] += c; core[3] += d;
core[4] += e; core[5] += f; core[6] += g; core[7] += h;
smemclr(w, sizeof(w));
}
typedef struct sha256_sw {
uint32_t core[8];
sha256_block blk;
BinarySink_IMPLEMENTATION;
ssh_hash hash;
} sha256_sw;
static void sha256_sw_write(BinarySink *bs, const void *vp, size_t len);
static ssh_hash *sha256_sw_new(const ssh_hashalg *alg)
{
sha256_sw *s = snew(sha256_sw);
s->hash.vt = alg;
BinarySink_INIT(s, sha256_sw_write);
BinarySink_DELEGATE_INIT(&s->hash, s);
return &s->hash;
}
static void sha256_sw_reset(ssh_hash *hash)
{
sha256_sw *s = container_of(hash, sha256_sw, hash);
memcpy(s->core, sha256_initial_state, sizeof(s->core));
sha256_block_setup(&s->blk);
}
static void sha256_sw_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
sha256_sw *copy = container_of(hcopy, sha256_sw, hash);
sha256_sw *orig = container_of(horig, sha256_sw, hash);
memcpy(copy, orig, sizeof(*copy));
BinarySink_COPIED(copy);
BinarySink_DELEGATE_INIT(&copy->hash, copy);
}
static void sha256_sw_free(ssh_hash *hash)
{
sha256_sw *s = container_of(hash, sha256_sw, hash);
smemclr(s, sizeof(*s));
sfree(s);
}
static void sha256_sw_write(BinarySink *bs, const void *vp, size_t len)
{
sha256_sw *s = BinarySink_DOWNCAST(bs, sha256_sw);
while (len > 0)
if (sha256_block_write(&s->blk, &vp, &len))
sha256_sw_block(s->core, s->blk.block);
}
static void sha256_sw_digest(ssh_hash *hash, uint8_t *digest)
{
sha256_sw *s = container_of(hash, sha256_sw, hash);
sha256_block_pad(&s->blk, BinarySink_UPCAST(s));
for (size_t i = 0; i < 8; i++)
PUT_32BIT_MSB_FIRST(digest + 4*i, s->core[i]);
}
const ssh_hashalg ssh_sha256_sw = {
.new = sha256_sw_new,
.reset = sha256_sw_reset,
.copyfrom = sha256_sw_copyfrom,
.digest = sha256_sw_digest,
.free = sha256_sw_free,
.hlen = 32,
.blocklen = 64,
HASHALG_NAMES_ANNOTATED("SHA-256", "unaccelerated"),
};
/* ----------------------------------------------------------------------
* Hardware-accelerated implementation of SHA-256 using x86 SHA-NI.
*/
#if HW_SHA256 == HW_SHA256_NI
/*
* Set target architecture for Clang and GCC
*/
#if defined(__clang__) || defined(__GNUC__)
# define FUNC_ISA __attribute__ ((target("sse4.1,sha")))
#if !defined(__clang__)
# pragma GCC target("sha")
# pragma GCC target("sse4.1")
#endif
#else
# define FUNC_ISA
#endif
#include <wmmintrin.h>
#include <smmintrin.h>
#include <immintrin.h>
#if defined(__clang__) || defined(__GNUC__)
#include <shaintrin.h>
#endif
#if defined(__clang__) || defined(__GNUC__)
#include <cpuid.h>
#define GET_CPU_ID_0(out) \
__cpuid(0, (out)[0], (out)[1], (out)[2], (out)[3])
#define GET_CPU_ID_7(out) \
__cpuid_count(7, 0, (out)[0], (out)[1], (out)[2], (out)[3])
#else
#define GET_CPU_ID_0(out) __cpuid(out, 0)
#define GET_CPU_ID_7(out) __cpuidex(out, 7, 0)
#endif
static bool sha256_hw_available(void)
{
unsigned int CPUInfo[4];
GET_CPU_ID_0(CPUInfo);
if (CPUInfo[0] < 7)
return false;
GET_CPU_ID_7(CPUInfo);
return CPUInfo[1] & (1 << 29); /* Check SHA */
}
/* SHA256 implementation using new instructions
The code is based on Jeffrey Walton's SHA256 implementation:
https://github.com/noloader/SHA-Intrinsics
*/
FUNC_ISA
static inline void sha256_ni_block(__m128i *core, const uint8_t *p)
{
__m128i STATE0, STATE1;
__m128i MSG, TMP;
__m128i MSG0, MSG1, MSG2, MSG3;
const __m128i *block = (const __m128i *)p;
const __m128i MASK = _mm_set_epi64x(
0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
/* Load initial values */
STATE0 = core[0];
STATE1 = core[1];
/* Rounds 0-3 */
MSG = _mm_loadu_si128(block);
MSG0 = _mm_shuffle_epi8(MSG, MASK);
MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(
0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
/* Rounds 4-7 */
MSG1 = _mm_loadu_si128(block + 1);
MSG1 = _mm_shuffle_epi8(MSG1, MASK);
MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(
0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);
/* Rounds 8-11 */
MSG2 = _mm_loadu_si128(block + 2);
MSG2 = _mm_shuffle_epi8(MSG2, MASK);
MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(
0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);
/* Rounds 12-15 */
MSG3 = _mm_loadu_si128(block + 3);
MSG3 = _mm_shuffle_epi8(MSG3, MASK);
MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(
0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
MSG0 = _mm_add_epi32(MSG0, TMP);
MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);
/* Rounds 16-19 */
MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(
0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
MSG1 = _mm_add_epi32(MSG1, TMP);
MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);
/* Rounds 20-23 */
MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(
0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
MSG2 = _mm_add_epi32(MSG2, TMP);
MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);
/* Rounds 24-27 */
MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(
0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
MSG3 = _mm_add_epi32(MSG3, TMP);
MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);
/* Rounds 28-31 */
MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(
0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
MSG0 = _mm_add_epi32(MSG0, TMP);
MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);
/* Rounds 32-35 */
MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(
0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
MSG1 = _mm_add_epi32(MSG1, TMP);
MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);
/* Rounds 36-39 */
MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(
0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
MSG2 = _mm_add_epi32(MSG2, TMP);
MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);
/* Rounds 40-43 */
MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(
0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
MSG3 = _mm_add_epi32(MSG3, TMP);
MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);
/* Rounds 44-47 */
MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(
0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
MSG0 = _mm_add_epi32(MSG0, TMP);
MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);
/* Rounds 48-51 */
MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(
0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
MSG1 = _mm_add_epi32(MSG1, TMP);
MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);
/* Rounds 52-55 */
MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(
0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
MSG2 = _mm_add_epi32(MSG2, TMP);
MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
/* Rounds 56-59 */
MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(
0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
MSG3 = _mm_add_epi32(MSG3, TMP);
MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
/* Rounds 60-63 */
MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(
0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
/* Combine state */
core[0] = _mm_add_epi32(STATE0, core[0]);
core[1] = _mm_add_epi32(STATE1, core[1]);
}
typedef struct sha256_ni {
/*
* These two vectors store the 8 words of the SHA-256 state, but
* not in the same order they appear in the spec: the first word
* holds A,B,E,F and the second word C,D,G,H.
*/
__m128i core[2];
sha256_block blk;
void *pointer_to_free;
BinarySink_IMPLEMENTATION;
ssh_hash hash;
} sha256_ni;
static void sha256_ni_write(BinarySink *bs, const void *vp, size_t len);
static sha256_ni *sha256_ni_alloc(void)
{
/*
* The __m128i variables in the context structure need to be
* 16-byte aligned, but not all malloc implementations that this
* code has to work with will guarantee to return a 16-byte
* aligned pointer. So we over-allocate, manually realign the
* pointer ourselves, and store the original one inside the
* context so we know how to free it later.
*/
void *allocation = smalloc(sizeof(sha256_ni) + 15);
uintptr_t alloc_address = (uintptr_t)allocation;
uintptr_t aligned_address = (alloc_address + 15) & ~15;
sha256_ni *s = (sha256_ni *)aligned_address;
s->pointer_to_free = allocation;
return s;
}
static ssh_hash *sha256_ni_new(const ssh_hashalg *alg)
{
if (!sha256_hw_available_cached())
return NULL;
sha256_ni *s = sha256_ni_alloc();
s->hash.vt = alg;
BinarySink_INIT(s, sha256_ni_write);
BinarySink_DELEGATE_INIT(&s->hash, s);
return &s->hash;
}
FUNC_ISA static void sha256_ni_reset(ssh_hash *hash)
{
sha256_ni *s = container_of(hash, sha256_ni, hash);
/* Initialise the core vectors in their storage order */
s->core[0] = _mm_set_epi64x(
0x6a09e667bb67ae85ULL, 0x510e527f9b05688cULL);
s->core[1] = _mm_set_epi64x(
0x3c6ef372a54ff53aULL, 0x1f83d9ab5be0cd19ULL);
sha256_block_setup(&s->blk);
}
static void sha256_ni_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
sha256_ni *copy = container_of(hcopy, sha256_ni, hash);
sha256_ni *orig = container_of(horig, sha256_ni, hash);
void *ptf_save = copy->pointer_to_free;
*copy = *orig; /* structure copy */
copy->pointer_to_free = ptf_save;
BinarySink_COPIED(copy);
BinarySink_DELEGATE_INIT(&copy->hash, copy);
}
static void sha256_ni_free(ssh_hash *hash)
{
sha256_ni *s = container_of(hash, sha256_ni, hash);
void *ptf = s->pointer_to_free;
smemclr(s, sizeof(*s));
sfree(ptf);
}
static void sha256_ni_write(BinarySink *bs, const void *vp, size_t len)
{
sha256_ni *s = BinarySink_DOWNCAST(bs, sha256_ni);
while (len > 0)
if (sha256_block_write(&s->blk, &vp, &len))
sha256_ni_block(s->core, s->blk.block);
}
FUNC_ISA static void sha256_ni_digest(ssh_hash *hash, uint8_t *digest)
{
sha256_ni *s = container_of(hash, sha256_ni, hash);
sha256_block_pad(&s->blk, BinarySink_UPCAST(s));
/* Rearrange the words into the output order */
__m128i feba = _mm_shuffle_epi32(s->core[0], 0x1B);
__m128i dchg = _mm_shuffle_epi32(s->core[1], 0xB1);
__m128i dcba = _mm_blend_epi16(feba, dchg, 0xF0);
__m128i hgfe = _mm_alignr_epi8(dchg, feba, 8);
/* Byte-swap them into the output endianness */
const __m128i mask = _mm_setr_epi8(3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12);
dcba = _mm_shuffle_epi8(dcba, mask);
hgfe = _mm_shuffle_epi8(hgfe, mask);
/* And store them */
__m128i *output = (__m128i *)digest;
_mm_storeu_si128(output, dcba);
_mm_storeu_si128(output+1, hgfe);
}
const ssh_hashalg ssh_sha256_hw = {
.new = sha256_ni_new,
.reset = sha256_ni_reset,
.copyfrom = sha256_ni_copyfrom,
.digest = sha256_ni_digest,
.free = sha256_ni_free,
.hlen = 32,
.blocklen = 64,
HASHALG_NAMES_ANNOTATED("SHA-256", "SHA-NI accelerated"),
};
/* ----------------------------------------------------------------------
* Hardware-accelerated implementation of SHA-256 using Arm NEON.
*/
#elif HW_SHA256 == HW_SHA256_NEON
/*
* Manually set the target architecture, if we decided above that we
* need to.
*/
#ifdef USE_CLANG_ATTR_TARGET_AARCH64
/*
* A spot of cheating: redefine some ACLE feature macros before
* including arm_neon.h. Otherwise we won't get the SHA intrinsics
* defined by that header, because it will be looking at the settings
* for the whole translation unit rather than the ones we're going to
* put on some particular functions using __attribute__((target)).
*/
#define __ARM_NEON 1
#define __ARM_FEATURE_CRYPTO 1
#define FUNC_ISA __attribute__ ((target("neon,crypto")))
#endif /* USE_CLANG_ATTR_TARGET_AARCH64 */
#ifndef FUNC_ISA
#define FUNC_ISA
#endif
#ifdef USE_ARM64_NEON_H
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
static bool sha256_hw_available(void)
{
/*
* For Arm, we delegate to a per-platform detection function (see
* explanation in sshaes.c).
*/
return platform_sha256_hw_available();
}
typedef struct sha256_neon_core sha256_neon_core;
struct sha256_neon_core {
uint32x4_t abcd, efgh;
};
FUNC_ISA
static inline uint32x4_t sha256_neon_load_input(const uint8_t *p)
{
return vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(p)));
}
FUNC_ISA
static inline uint32x4_t sha256_neon_schedule_update(
uint32x4_t m4, uint32x4_t m3, uint32x4_t m2, uint32x4_t m1)
{
return vsha256su1q_u32(vsha256su0q_u32(m4, m3), m2, m1);
}
FUNC_ISA
static inline sha256_neon_core sha256_neon_round4(
sha256_neon_core old, uint32x4_t sched, unsigned round)
{
sha256_neon_core new;
uint32x4_t round_input = vaddq_u32(
sched, vld1q_u32(sha256_round_constants + round));
new.abcd = vsha256hq_u32 (old.abcd, old.efgh, round_input);
new.efgh = vsha256h2q_u32(old.efgh, old.abcd, round_input);
return new;
}
FUNC_ISA
static inline void sha256_neon_block(sha256_neon_core *core, const uint8_t *p)
{
uint32x4_t s0, s1, s2, s3;
sha256_neon_core cr = *core;
s0 = sha256_neon_load_input(p);
cr = sha256_neon_round4(cr, s0, 0);
s1 = sha256_neon_load_input(p+16);
cr = sha256_neon_round4(cr, s1, 4);
s2 = sha256_neon_load_input(p+32);
cr = sha256_neon_round4(cr, s2, 8);
s3 = sha256_neon_load_input(p+48);
cr = sha256_neon_round4(cr, s3, 12);
s0 = sha256_neon_schedule_update(s0, s1, s2, s3);
cr = sha256_neon_round4(cr, s0, 16);
s1 = sha256_neon_schedule_update(s1, s2, s3, s0);
cr = sha256_neon_round4(cr, s1, 20);
s2 = sha256_neon_schedule_update(s2, s3, s0, s1);
cr = sha256_neon_round4(cr, s2, 24);
s3 = sha256_neon_schedule_update(s3, s0, s1, s2);
cr = sha256_neon_round4(cr, s3, 28);
s0 = sha256_neon_schedule_update(s0, s1, s2, s3);
cr = sha256_neon_round4(cr, s0, 32);
s1 = sha256_neon_schedule_update(s1, s2, s3, s0);
cr = sha256_neon_round4(cr, s1, 36);
s2 = sha256_neon_schedule_update(s2, s3, s0, s1);
cr = sha256_neon_round4(cr, s2, 40);
s3 = sha256_neon_schedule_update(s3, s0, s1, s2);
cr = sha256_neon_round4(cr, s3, 44);
s0 = sha256_neon_schedule_update(s0, s1, s2, s3);
cr = sha256_neon_round4(cr, s0, 48);
s1 = sha256_neon_schedule_update(s1, s2, s3, s0);
cr = sha256_neon_round4(cr, s1, 52);
s2 = sha256_neon_schedule_update(s2, s3, s0, s1);
cr = sha256_neon_round4(cr, s2, 56);
s3 = sha256_neon_schedule_update(s3, s0, s1, s2);
cr = sha256_neon_round4(cr, s3, 60);
core->abcd = vaddq_u32(core->abcd, cr.abcd);
core->efgh = vaddq_u32(core->efgh, cr.efgh);
}
typedef struct sha256_neon {
sha256_neon_core core;
sha256_block blk;
BinarySink_IMPLEMENTATION;
ssh_hash hash;
} sha256_neon;
static void sha256_neon_write(BinarySink *bs, const void *vp, size_t len);
static ssh_hash *sha256_neon_new(const ssh_hashalg *alg)
{
if (!sha256_hw_available_cached())
return NULL;
sha256_neon *s = snew(sha256_neon);
s->hash.vt = alg;
BinarySink_INIT(s, sha256_neon_write);
BinarySink_DELEGATE_INIT(&s->hash, s);
return &s->hash;
}
static void sha256_neon_reset(ssh_hash *hash)
{
sha256_neon *s = container_of(hash, sha256_neon, hash);
s->core.abcd = vld1q_u32(sha256_initial_state);
s->core.efgh = vld1q_u32(sha256_initial_state + 4);
sha256_block_setup(&s->blk);
}
static void sha256_neon_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
sha256_neon *copy = container_of(hcopy, sha256_neon, hash);
sha256_neon *orig = container_of(horig, sha256_neon, hash);
*copy = *orig; /* structure copy */
BinarySink_COPIED(copy);
BinarySink_DELEGATE_INIT(&copy->hash, copy);
}
static void sha256_neon_free(ssh_hash *hash)
{
sha256_neon *s = container_of(hash, sha256_neon, hash);
smemclr(s, sizeof(*s));
sfree(s);
}
static void sha256_neon_write(BinarySink *bs, const void *vp, size_t len)
{
sha256_neon *s = BinarySink_DOWNCAST(bs, sha256_neon);
while (len > 0)
if (sha256_block_write(&s->blk, &vp, &len))
sha256_neon_block(&s->core, s->blk.block);
}
static void sha256_neon_digest(ssh_hash *hash, uint8_t *digest)
{
sha256_neon *s = container_of(hash, sha256_neon, hash);
sha256_block_pad(&s->blk, BinarySink_UPCAST(s));
vst1q_u8(digest, vrev32q_u8(vreinterpretq_u8_u32(s->core.abcd)));
vst1q_u8(digest + 16, vrev32q_u8(vreinterpretq_u8_u32(s->core.efgh)));
}
const ssh_hashalg ssh_sha256_hw = {
.new = sha256_neon_new,
.reset = sha256_neon_reset,
.copyfrom = sha256_neon_copyfrom,
.digest = sha256_neon_digest,
.free = sha256_neon_free,
.hlen = 32,
.blocklen = 64,
HASHALG_NAMES_ANNOTATED("SHA-256", "NEON accelerated"),
};
/* ----------------------------------------------------------------------
* Stub functions if we have no hardware-accelerated SHA-256. In this
* case, sha256_hw_new returns NULL (though it should also never be
* selected by sha256_select, so the only thing that should even be
* _able_ to call it is testcrypt). As a result, the remaining vtable
* functions should never be called at all.
*/
#elif HW_SHA256 == HW_SHA256_NONE
static bool sha256_hw_available(void)
{
return false;
}
static ssh_hash *sha256_stub_new(const ssh_hashalg *alg)
{
return NULL;
}
#define STUB_BODY { unreachable("Should never be called"); }
static void sha256_stub_reset(ssh_hash *hash) STUB_BODY
static void sha256_stub_copyfrom(ssh_hash *hash, ssh_hash *orig) STUB_BODY
static void sha256_stub_free(ssh_hash *hash) STUB_BODY
static void sha256_stub_digest(ssh_hash *hash, uint8_t *digest) STUB_BODY
const ssh_hashalg ssh_sha256_hw = {
.new = sha256_stub_new,
.reset = sha256_stub_reset,
.copyfrom = sha256_stub_copyfrom,
.digest = sha256_stub_digest,
.free = sha256_stub_free,
.hlen = 32,
.blocklen = 64,
HASHALG_NAMES_ANNOTATED("SHA-256", "!NONEXISTENT ACCELERATED VERSION!"),
};
#endif /* HW_SHA256 */

105
crypto/sha256.h Normal file
View File

@ -0,0 +1,105 @@
/*
* Definitions likely to be helpful to multiple SHA-256 implementations.
*/
/*
 * The 'extra' structure used by SHA-256 implementations carries
 * information about how to check whether a given implementation is
 * available at run time, and whether we've already checked.
*/
struct sha256_extra_mutable;
struct sha256_extra {
/* Function to check availability. Might be expensive, so we don't
* want to call it more than once. */
bool (*check_available)(void);
/* Point to a writable substructure. */
struct sha256_extra_mutable *mut;
};
struct sha256_extra_mutable {
bool checked_availability;
bool is_available;
};
static inline bool check_availability(const struct sha256_extra *extra)
{
if (!extra->mut->checked_availability) {
extra->mut->is_available = extra->check_available();
extra->mut->checked_availability = true;
}
return extra->mut->is_available;
}
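/*
 * The effect is that the potentially expensive probe (CPUID on x86, a
 * platform query on Arm) runs at most once per implementation, the
 * first time anything asks; subsequent calls just return the cached
 * flag. Splitting the two booleans out into sha256_extra_mutable,
 * reached through a pointer, is what lets the 'extra' structure and
 * the vtable containing it stay const.
 */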
/*
* Macro to define a SHA-256 vtable together with its 'extra'
* structure.
*/
#define SHA256_VTABLE(impl_c, impl_display) \
static struct sha256_extra_mutable sha256_ ## impl_c ## _extra_mut; \
static const struct sha256_extra sha256_ ## impl_c ## _extra = { \
.check_available = sha256_ ## impl_c ## _available, \
.mut = &sha256_ ## impl_c ## _extra_mut, \
}; \
const ssh_hashalg ssh_sha256_ ## impl_c = { \
.new = sha256_ ## impl_c ## _new, \
.reset = sha256_ ## impl_c ## _reset, \
.copyfrom = sha256_ ## impl_c ## _copyfrom, \
.digest = sha256_ ## impl_c ## _digest, \
.free = sha256_ ## impl_c ## _free, \
.hlen = 32, \
.blocklen = 64, \
HASHALG_NAMES_ANNOTATED("SHA-256", impl_display), \
.extra = &sha256_ ## impl_c ## _extra, \
}
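/*
 * For instance, SHA256_VTABLE(sw, "unaccelerated") in sha256-sw.c
 * expands to roughly
 *
 *     static struct sha256_extra_mutable sha256_sw_extra_mut;
 *     static const struct sha256_extra sha256_sw_extra = {
 *         .check_available = sha256_sw_available,
 *         .mut = &sha256_sw_extra_mut,
 *     };
 *     const ssh_hashalg ssh_sha256_sw = {
 *         .new = sha256_sw_new,
 *         .reset = sha256_sw_reset,
 *         .copyfrom = sha256_sw_copyfrom,
 *         .digest = sha256_sw_digest,
 *         .free = sha256_sw_free,
 *         .hlen = 32,
 *         .blocklen = 64,
 *         HASHALG_NAMES_ANNOTATED("SHA-256", "unaccelerated"),
 *         .extra = &sha256_sw_extra,
 *     };
 *
 * so each implementation file just defines its own set of functions
 * and invokes the macro once.
 */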
extern const uint32_t sha256_initial_state[8];
extern const uint32_t sha256_round_constants[64];
#define SHA256_ROUNDS 64
typedef struct sha256_block sha256_block;
struct sha256_block {
uint8_t block[64];
size_t used;
uint64_t len;
};
static inline void sha256_block_setup(sha256_block *blk)
{
blk->used = 0;
blk->len = 0;
}
static inline bool sha256_block_write(
sha256_block *blk, const void **vdata, size_t *len)
{
size_t blkleft = sizeof(blk->block) - blk->used;
size_t chunk = *len < blkleft ? *len : blkleft;
const uint8_t *p = *vdata;
memcpy(blk->block + blk->used, p, chunk);
*vdata = p + chunk;
*len -= chunk;
blk->used += chunk;
blk->len += chunk;
if (blk->used == sizeof(blk->block)) {
blk->used = 0;
return true;
}
return false;
}
static inline void sha256_block_pad(sha256_block *blk, BinarySink *bs)
{
uint64_t final_len = blk->len << 3;
size_t pad = 1 + (63 & (55 - blk->used));
put_byte(bs, 0x80);
for (size_t i = 1; i < pad; i++)
put_byte(bs, 0);
put_uint64(bs, final_len);
assert(blk->used == 0 && "Should have exactly hit a block boundary");
}
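/*
 * The expression for 'pad' chooses the smallest number of padding
 * bytes (the 0x80 marker plus zeroes) that leaves exactly 8 bytes for
 * the length field and finishes on a 64-byte block boundary:
 *
 *     blk->used == 0   ->  pad == 56,   0 + 56 + 8 ==  64  (one block)
 *     blk->used == 55  ->  pad ==  1,  55 +  1 + 8 ==  64  (one block)
 *     blk->used == 56  ->  pad == 64,  56 + 64 + 8 == 128  (two blocks)
 *
 * The put_byte/put_uint64 calls are routed back through
 * sha256_block_write via the hash's BinarySink, which is why the
 * closing assert on blk->used == 0 holds.
 */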

71
crypto/sha512-common.c Normal file
View File

@ -0,0 +1,71 @@
/*
* Common variable definitions across all the SHA-512 implementations.
*/
#include "ssh.h"
#include "sha512.h"
const uint64_t sha512_initial_state[8] = {
0x6a09e667f3bcc908ULL,
0xbb67ae8584caa73bULL,
0x3c6ef372fe94f82bULL,
0xa54ff53a5f1d36f1ULL,
0x510e527fade682d1ULL,
0x9b05688c2b3e6c1fULL,
0x1f83d9abfb41bd6bULL,
0x5be0cd19137e2179ULL,
};
const uint64_t sha384_initial_state[8] = {
0xcbbb9d5dc1059ed8ULL,
0x629a292a367cd507ULL,
0x9159015a3070dd17ULL,
0x152fecd8f70e5939ULL,
0x67332667ffc00b31ULL,
0x8eb44a8768581511ULL,
0xdb0c2e0d64f98fa7ULL,
0x47b5481dbefa4fa4ULL,
};
const uint64_t sha512_round_constants[80] = {
0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL,
0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL,
0xd807aa98a3030242ULL, 0x12835b0145706fbeULL,
0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL,
0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL,
0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL,
0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL,
0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL,
0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL,
0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
0x06ca6351e003826fULL, 0x142929670a0e6e70ULL,
0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL,
0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL,
0x81c2c92e47edaee6ULL, 0x92722c851482353bULL,
0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL,
0xd192e819d6ef5218ULL, 0xd69906245565a910ULL,
0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL,
0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL,
0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL,
0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL,
0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL,
0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL,
0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL,
0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL,
0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
0x28db77f523047d84ULL, 0x32caab7b40c72493ULL,
0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL,
0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL,
};

329
crypto/sha512-neon.c Normal file
View File

@ -0,0 +1,329 @@
/*
* Hardware-accelerated implementation of SHA-512 using Arm NEON.
*/
#include "ssh.h"
#include "sha512.h"
#if USE_ARM64_NEON_H
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
static bool sha512_neon_available(void)
{
/*
* For Arm, we delegate to a per-platform detection function (see
* explanation in aes-neon.c).
*/
return platform_sha512_neon_available();
}
#if !HAVE_NEON_SHA512_INTRINSICS
/*
* clang 12 and before do not provide the SHA-512 NEON intrinsics, but
* do provide assembler support for the underlying instructions. So I
* define the intrinsic functions myself, using inline assembler.
*/
static inline uint64x2_t vsha512su0q_u64(uint64x2_t x, uint64x2_t y)
{
__asm__("sha512su0 %0.2D,%1.2D" : "+w" (x) : "w" (y));
return x;
}
static inline uint64x2_t vsha512su1q_u64(uint64x2_t x, uint64x2_t y,
uint64x2_t z)
{
__asm__("sha512su1 %0.2D,%1.2D,%2.2D" : "+w" (x) : "w" (y), "w" (z));
return x;
}
static inline uint64x2_t vsha512hq_u64(uint64x2_t x, uint64x2_t y,
uint64x2_t z)
{
__asm__("sha512h %0,%1,%2.2D" : "+w" (x) : "w" (y), "w" (z));
return x;
}
static inline uint64x2_t vsha512h2q_u64(uint64x2_t x, uint64x2_t y,
uint64x2_t z)
{
__asm__("sha512h2 %0,%1,%2.2D" : "+w" (x) : "w" (y), "w" (z));
return x;
}
#endif /* HAVE_NEON_SHA512_INTRINSICS */
typedef struct sha512_neon_core sha512_neon_core;
struct sha512_neon_core {
uint64x2_t ab, cd, ef, gh;
};
static inline uint64x2_t sha512_neon_load_input(const uint8_t *p)
{
return vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(p)));
}
static inline uint64x2_t sha512_neon_schedule_update(
uint64x2_t m8, uint64x2_t m7, uint64x2_t m4, uint64x2_t m3, uint64x2_t m1)
{
/*
* vsha512su0q_u64() takes words from a long way back in the
* schedule and performs the sigma_0 half of the computation of
* the next two 64-bit message-schedule words.
*
* vsha512su1q_u64() combines the result of that with the sigma_1
* steps, to output the finished version of those two words. The
* total amount of input data it requires fits nicely into three
* 128-bit vector registers, but one of those registers is
* misaligned compared to the 128-bit chunks that the message
* schedule is stored in. So we use vextq_u64 to make one of its
* input words out of the second half of m4 and the first half of
* m3.
*/
return vsha512su1q_u64(vsha512su0q_u64(m8, m7), m1, vextq_u64(m4, m3, 1));
}
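/*
 * In scalar terms, this computes the standard SHA-512 schedule
 * recurrence for two consecutive indices t:
 *
 *     W[t] = sigma_1(W[t-2]) + W[t-7] + sigma_0(W[t-15]) + W[t-16]
 *
 * where m8 holds W[t-16],W[t-15], m7 holds W[t-14],W[t-13], m1 holds
 * W[t-2],W[t-1], and the vextq_u64(m4, m3, 1) above assembles
 * W[t-7],W[t-6] out of m4 = W[t-8],W[t-7] and m3 = W[t-6],W[t-5].
 */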
static inline void sha512_neon_round2(
unsigned round_index, uint64x2_t schedule_words,
uint64x2_t *ab, uint64x2_t *cd, uint64x2_t *ef, uint64x2_t *gh)
{
/*
* vsha512hq_u64 performs the Sigma_1 and Ch half of the
* computation of two rounds of SHA-512 (including feeding back
* one of the outputs from the first of those half-rounds into the
* second one).
*
* vsha512h2q_u64 combines the result of that with the Sigma_0 and
* Maj steps, and outputs one 128-bit vector that replaces the gh
* piece of the input hash state, and a second that updates cd by
* addition.
*
* Similarly to vsha512su1q_u64 above, some of the input registers
* expected by these instructions are misaligned by 64 bits
* relative to the chunks we've divided the hash state into, so we
* have to start by making 'de' and 'fg' words out of our input
* cd,ef,gh, using vextq_u64.
*
* Also, one of the inputs to vsha512hq_u64 is expected to contain
* the results of summing gh + two round constants + two words of
* message schedule, but the two words of the message schedule
* have to be the opposite way round in the vector register from
* the way that vsha512su1q_u64 output them. Hence, there's
* another vextq_u64 in here that swaps the two halves of the
* initial_sum vector register.
*
* (This also means that I don't have to prepare a specially
* reordered version of the sha512_round_constants[] array: as
* long as I'm unavoidably doing a swap at run time _anyway_, I
* can load from the normally ordered version of that array, and
* just take care to fold in that data _before_ the swap rather
* than after.)
*/
/* Load two round constants, with the first one in the low half */
uint64x2_t round_constants = vld1q_u64(
sha512_round_constants + round_index);
/* Add schedule words to round constants */
uint64x2_t initial_sum = vaddq_u64(schedule_words, round_constants);
/* Swap that sum around so the word used in the first of the two
* rounds is in the _high_ half of the vector, matching where h
* lives in the gh vector */
uint64x2_t swapped_initial_sum = vextq_u64(initial_sum, initial_sum, 1);
/* Add gh to that, now that they're matching ways round */
uint64x2_t sum = vaddq_u64(swapped_initial_sum, *gh);
/* Make the misaligned de and fg words */
uint64x2_t de = vextq_u64(*cd, *ef, 1);
uint64x2_t fg = vextq_u64(*ef, *gh, 1);
/* Now we're ready to put all the pieces together. The output from
* vsha512h2q_u64 can be used directly as the new gh, and the
* output from vsha512hq_u64 is simultaneously the intermediate
* value passed to h2 and the thing you have to add on to cd. */
uint64x2_t intermed = vsha512hq_u64(sum, fg, de);
*gh = vsha512h2q_u64(intermed, *cd, *ab);
*cd = vaddq_u64(*cd, intermed);
}
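/*
 * Written out as scalar SHA-512, each call to this function performs
 * two iterations of
 *
 *     T1 = h + Sigma_1(e) + Ch(e,f,g) + K[t] + W[t]
 *     T2 = Sigma_0(a) + Maj(a,b,c)
 *     h=g; g=f; f=e; e=d+T1; d=c; c=b; b=a; a=T1+T2
 *
 * except that, rather than physically rotating the eight state words,
 * the caller below rotates which of its four vector registers it
 * passes as ab, cd, ef and gh on successive calls.
 */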
static inline void sha512_neon_block(sha512_neon_core *core, const uint8_t *p)
{
uint64x2_t s0, s1, s2, s3, s4, s5, s6, s7;
uint64x2_t ab = core->ab, cd = core->cd, ef = core->ef, gh = core->gh;
s0 = sha512_neon_load_input(p + 16*0);
sha512_neon_round2(0, s0, &ab, &cd, &ef, &gh);
s1 = sha512_neon_load_input(p + 16*1);
sha512_neon_round2(2, s1, &gh, &ab, &cd, &ef);
s2 = sha512_neon_load_input(p + 16*2);
sha512_neon_round2(4, s2, &ef, &gh, &ab, &cd);
s3 = sha512_neon_load_input(p + 16*3);
sha512_neon_round2(6, s3, &cd, &ef, &gh, &ab);
s4 = sha512_neon_load_input(p + 16*4);
sha512_neon_round2(8, s4, &ab, &cd, &ef, &gh);
s5 = sha512_neon_load_input(p + 16*5);
sha512_neon_round2(10, s5, &gh, &ab, &cd, &ef);
s6 = sha512_neon_load_input(p + 16*6);
sha512_neon_round2(12, s6, &ef, &gh, &ab, &cd);
s7 = sha512_neon_load_input(p + 16*7);
sha512_neon_round2(14, s7, &cd, &ef, &gh, &ab);
s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
sha512_neon_round2(16, s0, &ab, &cd, &ef, &gh);
s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
sha512_neon_round2(18, s1, &gh, &ab, &cd, &ef);
s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
sha512_neon_round2(20, s2, &ef, &gh, &ab, &cd);
s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
sha512_neon_round2(22, s3, &cd, &ef, &gh, &ab);
s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
sha512_neon_round2(24, s4, &ab, &cd, &ef, &gh);
s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
sha512_neon_round2(26, s5, &gh, &ab, &cd, &ef);
s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
sha512_neon_round2(28, s6, &ef, &gh, &ab, &cd);
s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
sha512_neon_round2(30, s7, &cd, &ef, &gh, &ab);
s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
sha512_neon_round2(32, s0, &ab, &cd, &ef, &gh);
s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
sha512_neon_round2(34, s1, &gh, &ab, &cd, &ef);
s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
sha512_neon_round2(36, s2, &ef, &gh, &ab, &cd);
s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
sha512_neon_round2(38, s3, &cd, &ef, &gh, &ab);
s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
sha512_neon_round2(40, s4, &ab, &cd, &ef, &gh);
s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
sha512_neon_round2(42, s5, &gh, &ab, &cd, &ef);
s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
sha512_neon_round2(44, s6, &ef, &gh, &ab, &cd);
s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
sha512_neon_round2(46, s7, &cd, &ef, &gh, &ab);
s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
sha512_neon_round2(48, s0, &ab, &cd, &ef, &gh);
s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
sha512_neon_round2(50, s1, &gh, &ab, &cd, &ef);
s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
sha512_neon_round2(52, s2, &ef, &gh, &ab, &cd);
s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
sha512_neon_round2(54, s3, &cd, &ef, &gh, &ab);
s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
sha512_neon_round2(56, s4, &ab, &cd, &ef, &gh);
s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
sha512_neon_round2(58, s5, &gh, &ab, &cd, &ef);
s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
sha512_neon_round2(60, s6, &ef, &gh, &ab, &cd);
s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
sha512_neon_round2(62, s7, &cd, &ef, &gh, &ab);
s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
sha512_neon_round2(64, s0, &ab, &cd, &ef, &gh);
s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
sha512_neon_round2(66, s1, &gh, &ab, &cd, &ef);
s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
sha512_neon_round2(68, s2, &ef, &gh, &ab, &cd);
s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
sha512_neon_round2(70, s3, &cd, &ef, &gh, &ab);
s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
sha512_neon_round2(72, s4, &ab, &cd, &ef, &gh);
s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
sha512_neon_round2(74, s5, &gh, &ab, &cd, &ef);
s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
sha512_neon_round2(76, s6, &ef, &gh, &ab, &cd);
s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
sha512_neon_round2(78, s7, &cd, &ef, &gh, &ab);
core->ab = vaddq_u64(core->ab, ab);
core->cd = vaddq_u64(core->cd, cd);
core->ef = vaddq_u64(core->ef, ef);
core->gh = vaddq_u64(core->gh, gh);
}
typedef struct sha512_neon {
sha512_neon_core core;
sha512_block blk;
BinarySink_IMPLEMENTATION;
ssh_hash hash;
} sha512_neon;
static void sha512_neon_write(BinarySink *bs, const void *vp, size_t len);
static ssh_hash *sha512_neon_new(const ssh_hashalg *alg)
{
const struct sha512_extra *extra = (const struct sha512_extra *)alg->extra;
if (!check_availability(extra))
return NULL;
sha512_neon *s = snew(sha512_neon);
s->hash.vt = alg;
BinarySink_INIT(s, sha512_neon_write);
BinarySink_DELEGATE_INIT(&s->hash, s);
return &s->hash;
}
static void sha512_neon_reset(ssh_hash *hash)
{
sha512_neon *s = container_of(hash, sha512_neon, hash);
const struct sha512_extra *extra =
(const struct sha512_extra *)hash->vt->extra;
s->core.ab = vld1q_u64(extra->initial_state);
s->core.cd = vld1q_u64(extra->initial_state+2);
s->core.ef = vld1q_u64(extra->initial_state+4);
s->core.gh = vld1q_u64(extra->initial_state+6);
sha512_block_setup(&s->blk);
}
static void sha512_neon_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
sha512_neon *copy = container_of(hcopy, sha512_neon, hash);
sha512_neon *orig = container_of(horig, sha512_neon, hash);
*copy = *orig; /* structure copy */
BinarySink_COPIED(copy);
BinarySink_DELEGATE_INIT(&copy->hash, copy);
}
static void sha512_neon_free(ssh_hash *hash)
{
sha512_neon *s = container_of(hash, sha512_neon, hash);
smemclr(s, sizeof(*s));
sfree(s);
}
static void sha512_neon_write(BinarySink *bs, const void *vp, size_t len)
{
sha512_neon *s = BinarySink_DOWNCAST(bs, sha512_neon);
while (len > 0)
if (sha512_block_write(&s->blk, &vp, &len))
sha512_neon_block(&s->core, s->blk.block);
}
static void sha512_neon_digest(ssh_hash *hash, uint8_t *digest)
{
sha512_neon *s = container_of(hash, sha512_neon, hash);
sha512_block_pad(&s->blk, BinarySink_UPCAST(s));
vst1q_u8(digest, vrev64q_u8(vreinterpretq_u8_u64(s->core.ab)));
vst1q_u8(digest+16, vrev64q_u8(vreinterpretq_u8_u64(s->core.cd)));
vst1q_u8(digest+32, vrev64q_u8(vreinterpretq_u8_u64(s->core.ef)));
vst1q_u8(digest+48, vrev64q_u8(vreinterpretq_u8_u64(s->core.gh)));
}
static void sha384_neon_digest(ssh_hash *hash, uint8_t *digest)
{
sha512_neon *s = container_of(hash, sha512_neon, hash);
sha512_block_pad(&s->blk, BinarySink_UPCAST(s));
vst1q_u8(digest, vrev64q_u8(vreinterpretq_u8_u64(s->core.ab)));
vst1q_u8(digest+16, vrev64q_u8(vreinterpretq_u8_u64(s->core.cd)));
vst1q_u8(digest+32, vrev64q_u8(vreinterpretq_u8_u64(s->core.ef)));
}
SHA512_VTABLES(neon, "NEON accelerated");
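For orientation while reading this file: the SHA512_VTABLES macro invoked above is defined in crypto/sha512.h (later in this diff), and expands roughly as sketched below. That expansion is where the exported names ssh_sha512_neon and ssh_sha384_neon come from.
/* Abridged sketch of what SHA512_VTABLES(neon, "NEON accelerated") produces:
 *
 *   static struct sha512_extra_mutable sha512_neon_extra_mut;
 *   static const struct sha512_extra sha384_neon_extra = { ...shares that mut... };
 *   static const struct sha512_extra sha512_neon_extra = { ...shares that mut... };
 *   const ssh_hashalg ssh_sha384_neon = { ..., .digest = sha384_neon_digest, ... };
 *   const ssh_hashalg ssh_sha512_neon = { ..., .digest = sha512_neon_digest, ... };
 */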

crypto/sha512-select.c Normal file

@ -0,0 +1,61 @@
/*
* Top-level vtables to select a SHA-512 implementation.
*/
#include <assert.h>
#include <stdlib.h>
#include "putty.h"
#include "ssh.h"
#include "sha512.h"
static const ssh_hashalg *const real_sha512_algs[] = {
#if HAVE_NEON_SHA512
&ssh_sha512_neon,
#endif
&ssh_sha512_sw,
NULL,
};
static const ssh_hashalg *const real_sha384_algs[] = {
#if HAVE_NEON_SHA512
&ssh_sha384_neon,
#endif
&ssh_sha384_sw,
NULL,
};
static ssh_hash *sha512_select(const ssh_hashalg *alg)
{
const ssh_hashalg *const *real_algs =
(const ssh_hashalg *const *)alg->extra;
for (size_t i = 0; real_algs[i]; i++) {
const ssh_hashalg *alg = real_algs[i];
const struct sha512_extra *alg_extra =
(const struct sha512_extra *)alg->extra;
if (check_availability(alg_extra))
return ssh_hash_new(alg);
}
/* We should never reach the NULL at the end of the list, because
* the last non-NULL entry should be software-only SHA-512, which
* is always available. */
unreachable("sha512_select ran off the end of its list");
}
const ssh_hashalg ssh_sha512 = {
.new = sha512_select,
.hlen = 64,
.blocklen = 128,
HASHALG_NAMES_ANNOTATED("SHA-512", "dummy selector vtable"),
.extra = real_sha512_algs,
};
const ssh_hashalg ssh_sha384 = {
.new = sha512_select,
.hlen = 48,
.blocklen = 128,
HASHALG_NAMES_ANNOTATED("SHA-384", "dummy selector vtable"),
.extra = real_sha384_algs,
};
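The comment inside the selection loop states the key invariant: software SHA-512 terminates each list, so the loop always finds something. If a second accelerated implementation were ever added for some platform, it would simply be listed ahead of the software entry. A hypothetical sketch (the _foo names and the HAVE_SHA512_FOO macro are invented purely for illustration):
static const ssh_hashalg *const real_sha512_algs[] = {
#if HAVE_NEON_SHA512
    &ssh_sha512_neon,
#endif
#if HAVE_SHA512_FOO               /* hypothetical second accelerated version */
    &ssh_sha512_foo,
#endif
    &ssh_sha512_sw,               /* always available: the loop never reaches NULL */
    NULL,
};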

crypto/sha512-sw.c Normal file

@ -0,0 +1,168 @@
/*
* Software implementation of SHA-512.
*/
#include "ssh.h"
#include "sha512.h"
static bool sha512_sw_available(void)
{
/* Software SHA-512 is always available */
return true;
}
static inline uint64_t ror(uint64_t x, unsigned y)
{
return (x << (63 & -y)) | (x >> (63 & y));
}
static inline uint64_t Ch(uint64_t ctrl, uint64_t if1, uint64_t if0)
{
return if0 ^ (ctrl & (if1 ^ if0));
}
static inline uint64_t Maj(uint64_t x, uint64_t y, uint64_t z)
{
return (x & y) | (z & (x | y));
}
static inline uint64_t Sigma_0(uint64_t x)
{
return ror(x,28) ^ ror(x,34) ^ ror(x,39);
}
static inline uint64_t Sigma_1(uint64_t x)
{
return ror(x,14) ^ ror(x,18) ^ ror(x,41);
}
static inline uint64_t sigma_0(uint64_t x)
{
return ror(x,1) ^ ror(x,8) ^ (x >> 7);
}
static inline uint64_t sigma_1(uint64_t x)
{
return ror(x,19) ^ ror(x,61) ^ (x >> 6);
}
static inline void sha512_sw_round(
unsigned round_index, const uint64_t *schedule,
uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d,
uint64_t *e, uint64_t *f, uint64_t *g, uint64_t *h)
{
uint64_t t1 = *h + Sigma_1(*e) + Ch(*e,*f,*g) +
sha512_round_constants[round_index] + schedule[round_index];
uint64_t t2 = Sigma_0(*a) + Maj(*a,*b,*c);
*d += t1;
*h = t1 + t2;
}
static void sha512_sw_block(uint64_t *core, const uint8_t *block)
{
uint64_t w[SHA512_ROUNDS];
uint64_t a,b,c,d,e,f,g,h;
int t;
for (t = 0; t < 16; t++)
w[t] = GET_64BIT_MSB_FIRST(block + 8*t);
for (t = 16; t < SHA512_ROUNDS; t++)
w[t] = w[t-16] + w[t-7] + sigma_0(w[t-15]) + sigma_1(w[t-2]);
a = core[0]; b = core[1]; c = core[2]; d = core[3];
e = core[4]; f = core[5]; g = core[6]; h = core[7];
for (t = 0; t < SHA512_ROUNDS; t+=8) {
sha512_sw_round(t+0, w, &a,&b,&c,&d,&e,&f,&g,&h);
sha512_sw_round(t+1, w, &h,&a,&b,&c,&d,&e,&f,&g);
sha512_sw_round(t+2, w, &g,&h,&a,&b,&c,&d,&e,&f);
sha512_sw_round(t+3, w, &f,&g,&h,&a,&b,&c,&d,&e);
sha512_sw_round(t+4, w, &e,&f,&g,&h,&a,&b,&c,&d);
sha512_sw_round(t+5, w, &d,&e,&f,&g,&h,&a,&b,&c);
sha512_sw_round(t+6, w, &c,&d,&e,&f,&g,&h,&a,&b);
sha512_sw_round(t+7, w, &b,&c,&d,&e,&f,&g,&h,&a);
}
core[0] += a; core[1] += b; core[2] += c; core[3] += d;
core[4] += e; core[5] += f; core[6] += g; core[7] += h;
smemclr(w, sizeof(w));
}
typedef struct sha512_sw {
uint64_t core[8];
sha512_block blk;
BinarySink_IMPLEMENTATION;
ssh_hash hash;
} sha512_sw;
static void sha512_sw_write(BinarySink *bs, const void *vp, size_t len);
static ssh_hash *sha512_sw_new(const ssh_hashalg *alg)
{
sha512_sw *s = snew(sha512_sw);
s->hash.vt = alg;
BinarySink_INIT(s, sha512_sw_write);
BinarySink_DELEGATE_INIT(&s->hash, s);
return &s->hash;
}
static void sha512_sw_reset(ssh_hash *hash)
{
sha512_sw *s = container_of(hash, sha512_sw, hash);
const struct sha512_extra *extra =
(const struct sha512_extra *)hash->vt->extra;
memcpy(s->core, extra->initial_state, sizeof(s->core));
sha512_block_setup(&s->blk);
}
static void sha512_sw_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
sha512_sw *copy = container_of(hcopy, sha512_sw, hash);
sha512_sw *orig = container_of(horig, sha512_sw, hash);
memcpy(copy, orig, sizeof(*copy));
BinarySink_COPIED(copy);
BinarySink_DELEGATE_INIT(&copy->hash, copy);
}
static void sha512_sw_free(ssh_hash *hash)
{
sha512_sw *s = container_of(hash, sha512_sw, hash);
smemclr(s, sizeof(*s));
sfree(s);
}
static void sha512_sw_write(BinarySink *bs, const void *vp, size_t len)
{
sha512_sw *s = BinarySink_DOWNCAST(bs, sha512_sw);
while (len > 0)
if (sha512_block_write(&s->blk, &vp, &len))
sha512_sw_block(s->core, s->blk.block);
}
static void sha512_sw_digest(ssh_hash *hash, uint8_t *digest)
{
sha512_sw *s = container_of(hash, sha512_sw, hash);
sha512_block_pad(&s->blk, BinarySink_UPCAST(s));
for (size_t i = 0; i < hash->vt->hlen / 8; i++)
PUT_64BIT_MSB_FIRST(digest + 8*i, s->core[i]);
}
/*
* This implementation doesn't need separate digest methods for
* SHA-384 and SHA-512, because the above implementation reads the
* hash length out of the vtable.
*/
#define sha384_sw_digest sha512_sw_digest
SHA512_VTABLES(sw, "unaccelerated");

crypto/sha512.c (deleted)

@ -1,836 +0,0 @@
/*
* SHA-512 algorithm as described at
*
* http://csrc.nist.gov/cryptval/shs.html
*
* Modifications made for SHA-384 also
*/
#include <assert.h>
#include "ssh.h"
/*
* Start by deciding whether we can support hardware SHA at all.
*/
#define HW_SHA512_NONE 0
#define HW_SHA512_NEON 1
#ifdef _FORCE_SHA512_NEON
# define HW_SHA512 HW_SHA512_NEON
#elif defined __BYTE_ORDER__ && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
/* Arm can potentially support both endiannesses, but this code
* hasn't been tested on anything but little. If anyone wants to
* run big-endian, they'll need to fix it first. */
#elif defined __ARM_FEATURE_SHA512
/* If the Arm SHA-512 extension is available already, we can
* support NEON SHA without having to enable anything by hand */
# define HW_SHA512 HW_SHA512_NEON
#elif defined(__clang__)
# if __has_attribute(target) && __has_include(<arm_neon.h>) && \
(defined(__aarch64__))
/* clang can enable the crypto extension in AArch64 using
* __attribute__((target)) */
# define HW_SHA512 HW_SHA512_NEON
# define USE_CLANG_ATTR_TARGET_AARCH64
# endif
#endif
#if defined _FORCE_SOFTWARE_SHA || !defined HW_SHA512
# undef HW_SHA512
# define HW_SHA512 HW_SHA512_NONE
#endif
/*
* The actual query function that asks if hardware acceleration is
* available.
*/
static bool sha512_hw_available(void);
/*
* The top-level selection function, caching the results of
* sha512_hw_available() so it only has to run once.
*/
static bool sha512_hw_available_cached(void)
{
static bool initialised = false;
static bool hw_available;
if (!initialised) {
hw_available = sha512_hw_available();
initialised = true;
}
return hw_available;
}
struct sha512_select_options {
const ssh_hashalg *hw, *sw;
};
static ssh_hash *sha512_select(const ssh_hashalg *alg)
{
const struct sha512_select_options *options =
(const struct sha512_select_options *)alg->extra;
const ssh_hashalg *real_alg =
sha512_hw_available_cached() ? options->hw : options->sw;
return ssh_hash_new(real_alg);
}
const struct sha512_select_options ssh_sha512_select_options = {
&ssh_sha512_hw, &ssh_sha512_sw,
};
const struct sha512_select_options ssh_sha384_select_options = {
&ssh_sha384_hw, &ssh_sha384_sw,
};
const ssh_hashalg ssh_sha512 = {
.new = sha512_select,
.hlen = 64,
.blocklen = 128,
HASHALG_NAMES_ANNOTATED("SHA-512", "dummy selector vtable"),
.extra = &ssh_sha512_select_options,
};
const ssh_hashalg ssh_sha384 = {
.new = sha512_select,
.hlen = 48,
.blocklen = 128,
HASHALG_NAMES_ANNOTATED("SHA-384", "dummy selector vtable"),
.extra = &ssh_sha384_select_options,
};
/* ----------------------------------------------------------------------
* Definitions likely to be helpful to multiple implementations.
*/
static const uint64_t sha512_initial_state[] = {
0x6a09e667f3bcc908ULL,
0xbb67ae8584caa73bULL,
0x3c6ef372fe94f82bULL,
0xa54ff53a5f1d36f1ULL,
0x510e527fade682d1ULL,
0x9b05688c2b3e6c1fULL,
0x1f83d9abfb41bd6bULL,
0x5be0cd19137e2179ULL,
};
static const uint64_t sha384_initial_state[] = {
0xcbbb9d5dc1059ed8ULL,
0x629a292a367cd507ULL,
0x9159015a3070dd17ULL,
0x152fecd8f70e5939ULL,
0x67332667ffc00b31ULL,
0x8eb44a8768581511ULL,
0xdb0c2e0d64f98fa7ULL,
0x47b5481dbefa4fa4ULL,
};
static const uint64_t sha512_round_constants[] = {
0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL,
0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL,
0xd807aa98a3030242ULL, 0x12835b0145706fbeULL,
0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL,
0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL,
0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL,
0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL,
0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL,
0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL,
0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
0x06ca6351e003826fULL, 0x142929670a0e6e70ULL,
0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL,
0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL,
0x81c2c92e47edaee6ULL, 0x92722c851482353bULL,
0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL,
0xd192e819d6ef5218ULL, 0xd69906245565a910ULL,
0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL,
0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL,
0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL,
0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL,
0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL,
0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL,
0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL,
0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL,
0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
0x28db77f523047d84ULL, 0x32caab7b40c72493ULL,
0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL,
0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL,
};
#define SHA512_ROUNDS 80
typedef struct sha512_block sha512_block;
struct sha512_block {
uint8_t block[128];
size_t used;
uint64_t lenhi, lenlo;
};
static inline void sha512_block_setup(sha512_block *blk)
{
blk->used = 0;
blk->lenhi = blk->lenlo = 0;
}
static inline bool sha512_block_write(
sha512_block *blk, const void **vdata, size_t *len)
{
size_t blkleft = sizeof(blk->block) - blk->used;
size_t chunk = *len < blkleft ? *len : blkleft;
const uint8_t *p = *vdata;
memcpy(blk->block + blk->used, p, chunk);
*vdata = p + chunk;
*len -= chunk;
blk->used += chunk;
size_t chunkbits = chunk << 3;
blk->lenlo += chunkbits;
blk->lenhi += (blk->lenlo < chunkbits);
if (blk->used == sizeof(blk->block)) {
blk->used = 0;
return true;
}
return false;
}
static inline void sha512_block_pad(sha512_block *blk, BinarySink *bs)
{
uint64_t final_lenhi = blk->lenhi;
uint64_t final_lenlo = blk->lenlo;
size_t pad = 127 & (111 - blk->used);
put_byte(bs, 0x80);
put_padding(bs, pad, 0);
put_uint64(bs, final_lenhi);
put_uint64(bs, final_lenlo);
assert(blk->used == 0 && "Should have exactly hit a block boundary");
}
/* ----------------------------------------------------------------------
* Software implementation of SHA-512.
*/
static inline uint64_t ror(uint64_t x, unsigned y)
{
return (x << (63 & -y)) | (x >> (63 & y));
}
static inline uint64_t Ch(uint64_t ctrl, uint64_t if1, uint64_t if0)
{
return if0 ^ (ctrl & (if1 ^ if0));
}
static inline uint64_t Maj(uint64_t x, uint64_t y, uint64_t z)
{
return (x & y) | (z & (x | y));
}
static inline uint64_t Sigma_0(uint64_t x)
{
return ror(x,28) ^ ror(x,34) ^ ror(x,39);
}
static inline uint64_t Sigma_1(uint64_t x)
{
return ror(x,14) ^ ror(x,18) ^ ror(x,41);
}
static inline uint64_t sigma_0(uint64_t x)
{
return ror(x,1) ^ ror(x,8) ^ (x >> 7);
}
static inline uint64_t sigma_1(uint64_t x)
{
return ror(x,19) ^ ror(x,61) ^ (x >> 6);
}
static inline void sha512_sw_round(
unsigned round_index, const uint64_t *schedule,
uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d,
uint64_t *e, uint64_t *f, uint64_t *g, uint64_t *h)
{
uint64_t t1 = *h + Sigma_1(*e) + Ch(*e,*f,*g) +
sha512_round_constants[round_index] + schedule[round_index];
uint64_t t2 = Sigma_0(*a) + Maj(*a,*b,*c);
*d += t1;
*h = t1 + t2;
}
static void sha512_sw_block(uint64_t *core, const uint8_t *block)
{
uint64_t w[SHA512_ROUNDS];
uint64_t a,b,c,d,e,f,g,h;
int t;
for (t = 0; t < 16; t++)
w[t] = GET_64BIT_MSB_FIRST(block + 8*t);
for (t = 16; t < SHA512_ROUNDS; t++)
w[t] = w[t-16] + w[t-7] + sigma_0(w[t-15]) + sigma_1(w[t-2]);
a = core[0]; b = core[1]; c = core[2]; d = core[3];
e = core[4]; f = core[5]; g = core[6]; h = core[7];
for (t = 0; t < SHA512_ROUNDS; t+=8) {
sha512_sw_round(t+0, w, &a,&b,&c,&d,&e,&f,&g,&h);
sha512_sw_round(t+1, w, &h,&a,&b,&c,&d,&e,&f,&g);
sha512_sw_round(t+2, w, &g,&h,&a,&b,&c,&d,&e,&f);
sha512_sw_round(t+3, w, &f,&g,&h,&a,&b,&c,&d,&e);
sha512_sw_round(t+4, w, &e,&f,&g,&h,&a,&b,&c,&d);
sha512_sw_round(t+5, w, &d,&e,&f,&g,&h,&a,&b,&c);
sha512_sw_round(t+6, w, &c,&d,&e,&f,&g,&h,&a,&b);
sha512_sw_round(t+7, w, &b,&c,&d,&e,&f,&g,&h,&a);
}
core[0] += a; core[1] += b; core[2] += c; core[3] += d;
core[4] += e; core[5] += f; core[6] += g; core[7] += h;
smemclr(w, sizeof(w));
}
typedef struct sha512_sw {
uint64_t core[8];
sha512_block blk;
BinarySink_IMPLEMENTATION;
ssh_hash hash;
} sha512_sw;
static void sha512_sw_write(BinarySink *bs, const void *vp, size_t len);
static ssh_hash *sha512_sw_new(const ssh_hashalg *alg)
{
sha512_sw *s = snew(sha512_sw);
s->hash.vt = alg;
BinarySink_INIT(s, sha512_sw_write);
BinarySink_DELEGATE_INIT(&s->hash, s);
return &s->hash;
}
static void sha512_sw_reset(ssh_hash *hash)
{
sha512_sw *s = container_of(hash, sha512_sw, hash);
/* The 'extra' field in the ssh_hashalg indicates which
* initialisation vector we're using */
memcpy(s->core, hash->vt->extra, sizeof(s->core));
sha512_block_setup(&s->blk);
}
static void sha512_sw_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
sha512_sw *copy = container_of(hcopy, sha512_sw, hash);
sha512_sw *orig = container_of(horig, sha512_sw, hash);
memcpy(copy, orig, sizeof(*copy));
BinarySink_COPIED(copy);
BinarySink_DELEGATE_INIT(&copy->hash, copy);
}
static void sha512_sw_free(ssh_hash *hash)
{
sha512_sw *s = container_of(hash, sha512_sw, hash);
smemclr(s, sizeof(*s));
sfree(s);
}
static void sha512_sw_write(BinarySink *bs, const void *vp, size_t len)
{
sha512_sw *s = BinarySink_DOWNCAST(bs, sha512_sw);
while (len > 0)
if (sha512_block_write(&s->blk, &vp, &len))
sha512_sw_block(s->core, s->blk.block);
}
static void sha512_sw_digest(ssh_hash *hash, uint8_t *digest)
{
sha512_sw *s = container_of(hash, sha512_sw, hash);
sha512_block_pad(&s->blk, BinarySink_UPCAST(s));
for (size_t i = 0; i < hash->vt->hlen / 8; i++)
PUT_64BIT_MSB_FIRST(digest + 8*i, s->core[i]);
}
const ssh_hashalg ssh_sha512_sw = {
.new = sha512_sw_new,
.reset = sha512_sw_reset,
.copyfrom = sha512_sw_copyfrom,
.digest = sha512_sw_digest,
.free = sha512_sw_free,
.hlen = 64,
.blocklen = 128,
HASHALG_NAMES_ANNOTATED("SHA-512", "unaccelerated"),
.extra = sha512_initial_state,
};
const ssh_hashalg ssh_sha384_sw = {
.new = sha512_sw_new,
.reset = sha512_sw_reset,
.copyfrom = sha512_sw_copyfrom,
.digest = sha512_sw_digest,
.free = sha512_sw_free,
.hlen = 48,
.blocklen = 128,
HASHALG_NAMES_ANNOTATED("SHA-384", "unaccelerated"),
.extra = sha384_initial_state,
};
/* ----------------------------------------------------------------------
* Hardware-accelerated implementation of SHA-512 using Arm NEON.
*/
#if HW_SHA512 == HW_SHA512_NEON
/*
* Manually set the target architecture, if we decided above that we
* need to.
*/
#ifdef USE_CLANG_ATTR_TARGET_AARCH64
/*
* A spot of cheating: redefine some ACLE feature macros before
* including arm_neon.h. Otherwise we won't get the SHA intrinsics
* defined by that header, because it will be looking at the settings
* for the whole translation unit rather than the ones we're going to
* put on some particular functions using __attribute__((target)).
*/
#define __ARM_NEON 1
#define __ARM_FEATURE_CRYPTO 1
#define FUNC_ISA __attribute__ ((target("neon,sha3")))
#endif /* USE_CLANG_ATTR_TARGET_AARCH64 */
#ifndef FUNC_ISA
#define FUNC_ISA
#endif
#ifdef USE_ARM64_NEON_H
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
static bool sha512_hw_available(void)
{
/*
* For Arm, we delegate to a per-platform detection function (see
* explanation in sshaes.c).
*/
return platform_sha512_hw_available();
}
#if defined __clang__
/*
* As of 2020-12-24, I've found that clang doesn't provide the SHA-512
* NEON intrinsics. So I define my own set using inline assembler, and
* use #define to effectively rename them over the top of the standard
* names.
*
* The aim of that #define technique is that it should avoid a build
* failure if these intrinsics _are_ defined in <arm_neon.h>.
* Obviously it would be better in that situation to switch back to
* using the real intrinsics, but until I see a version of clang that
* supports them, I won't know what version number to test in the
* ifdef.
*/
static inline FUNC_ISA
uint64x2_t vsha512su0q_u64_asm(uint64x2_t x, uint64x2_t y) {
__asm__("sha512su0 %0.2D,%1.2D" : "+w" (x) : "w" (y));
return x;
}
static inline FUNC_ISA
uint64x2_t vsha512su1q_u64_asm(uint64x2_t x, uint64x2_t y, uint64x2_t z) {
__asm__("sha512su1 %0.2D,%1.2D,%2.2D" : "+w" (x) : "w" (y), "w" (z));
return x;
}
static inline FUNC_ISA
uint64x2_t vsha512hq_u64_asm(uint64x2_t x, uint64x2_t y, uint64x2_t z) {
__asm__("sha512h %0,%1,%2.2D" : "+w" (x) : "w" (y), "w" (z));
return x;
}
static inline FUNC_ISA
uint64x2_t vsha512h2q_u64_asm(uint64x2_t x, uint64x2_t y, uint64x2_t z) {
__asm__("sha512h2 %0,%1,%2.2D" : "+w" (x) : "w" (y), "w" (z));
return x;
}
#undef vsha512su0q_u64
#define vsha512su0q_u64 vsha512su0q_u64_asm
#undef vsha512su1q_u64
#define vsha512su1q_u64 vsha512su1q_u64_asm
#undef vsha512hq_u64
#define vsha512hq_u64 vsha512hq_u64_asm
#undef vsha512h2q_u64
#define vsha512h2q_u64 vsha512h2q_u64_asm
#endif /* defined __clang__ */
typedef struct sha512_neon_core sha512_neon_core;
struct sha512_neon_core {
uint64x2_t ab, cd, ef, gh;
};
FUNC_ISA
static inline uint64x2_t sha512_neon_load_input(const uint8_t *p)
{
return vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(p)));
}
FUNC_ISA
static inline uint64x2_t sha512_neon_schedule_update(
uint64x2_t m8, uint64x2_t m7, uint64x2_t m4, uint64x2_t m3, uint64x2_t m1)
{
/*
* vsha512su0q_u64() takes words from a long way back in the
* schedule and performs the sigma_0 half of the computation of
* the next two 64-bit message-schedule words.
*
* vsha512su1q_u64() combines the result of that with the sigma_1
* steps, to output the finished version of those two words. The
* total amount of input data it requires fits nicely into three
* 128-bit vector registers, but one of those registers is
* misaligned compared to the 128-bit chunks that the message
* schedule is stored in. So we use vextq_u64 to make one of its
* input words out of the second half of m4 and the first half of
* m3.
*/
return vsha512su1q_u64(vsha512su0q_u64(m8, m7), m1, vextq_u64(m4, m3, 1));
}
FUNC_ISA
static inline void sha512_neon_round2(
unsigned round_index, uint64x2_t schedule_words,
uint64x2_t *ab, uint64x2_t *cd, uint64x2_t *ef, uint64x2_t *gh)
{
/*
* vsha512hq_u64 performs the Sigma_1 and Ch half of the
* computation of two rounds of SHA-512 (including feeding back
* one of the outputs from the first of those half-rounds into the
* second one).
*
* vsha512h2q_u64 combines the result of that with the Sigma_0 and
* Maj steps, and outputs one 128-bit vector that replaces the gh
* piece of the input hash state, and a second that updates cd by
* addition.
*
* Similarly to vsha512su1q_u64 above, some of the input registers
* expected by these instructions are misaligned by 64 bits
* relative to the chunks we've divided the hash state into, so we
* have to start by making 'de' and 'fg' words out of our input
* cd,ef,gh, using vextq_u64.
*
* Also, one of the inputs to vsha512hq_u64 is expected to contain
* the results of summing gh + two round constants + two words of
* message schedule, but the two words of the message schedule
* have to be the opposite way round in the vector register from
* the way that vsha512su1q_u64 output them. Hence, there's
* another vextq_u64 in here that swaps the two halves of the
* initial_sum vector register.
*
* (This also means that I don't have to prepare a specially
* reordered version of the sha512_round_constants[] array: as
* long as I'm unavoidably doing a swap at run time _anyway_, I
* can load from the normally ordered version of that array, and
* just take care to fold in that data _before_ the swap rather
* than after.)
*/
/* Load two round constants, with the first one in the low half */
uint64x2_t round_constants = vld1q_u64(
sha512_round_constants + round_index);
/* Add schedule words to round constants */
uint64x2_t initial_sum = vaddq_u64(schedule_words, round_constants);
/* Swap that sum around so the word used in the first of the two
* rounds is in the _high_ half of the vector, matching where h
* lives in the gh vector */
uint64x2_t swapped_initial_sum = vextq_u64(initial_sum, initial_sum, 1);
/* Add gh to that, now that they're matching ways round */
uint64x2_t sum = vaddq_u64(swapped_initial_sum, *gh);
/* Make the misaligned de and fg words */
uint64x2_t de = vextq_u64(*cd, *ef, 1);
uint64x2_t fg = vextq_u64(*ef, *gh, 1);
/* Now we're ready to put all the pieces together. The output from
* vsha512h2q_u64 can be used directly as the new gh, and the
* output from vsha512hq_u64 is simultaneously the intermediate
* value passed to h2 and the thing you have to add on to cd. */
uint64x2_t intermed = vsha512hq_u64(sum, fg, de);
*gh = vsha512h2q_u64(intermed, *cd, *ab);
*cd = vaddq_u64(*cd, intermed);
}
FUNC_ISA
static inline void sha512_neon_block(sha512_neon_core *core, const uint8_t *p)
{
uint64x2_t s0, s1, s2, s3, s4, s5, s6, s7;
uint64x2_t ab = core->ab, cd = core->cd, ef = core->ef, gh = core->gh;
s0 = sha512_neon_load_input(p + 16*0);
sha512_neon_round2(0, s0, &ab, &cd, &ef, &gh);
s1 = sha512_neon_load_input(p + 16*1);
sha512_neon_round2(2, s1, &gh, &ab, &cd, &ef);
s2 = sha512_neon_load_input(p + 16*2);
sha512_neon_round2(4, s2, &ef, &gh, &ab, &cd);
s3 = sha512_neon_load_input(p + 16*3);
sha512_neon_round2(6, s3, &cd, &ef, &gh, &ab);
s4 = sha512_neon_load_input(p + 16*4);
sha512_neon_round2(8, s4, &ab, &cd, &ef, &gh);
s5 = sha512_neon_load_input(p + 16*5);
sha512_neon_round2(10, s5, &gh, &ab, &cd, &ef);
s6 = sha512_neon_load_input(p + 16*6);
sha512_neon_round2(12, s6, &ef, &gh, &ab, &cd);
s7 = sha512_neon_load_input(p + 16*7);
sha512_neon_round2(14, s7, &cd, &ef, &gh, &ab);
s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
sha512_neon_round2(16, s0, &ab, &cd, &ef, &gh);
s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
sha512_neon_round2(18, s1, &gh, &ab, &cd, &ef);
s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
sha512_neon_round2(20, s2, &ef, &gh, &ab, &cd);
s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
sha512_neon_round2(22, s3, &cd, &ef, &gh, &ab);
s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
sha512_neon_round2(24, s4, &ab, &cd, &ef, &gh);
s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
sha512_neon_round2(26, s5, &gh, &ab, &cd, &ef);
s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
sha512_neon_round2(28, s6, &ef, &gh, &ab, &cd);
s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
sha512_neon_round2(30, s7, &cd, &ef, &gh, &ab);
s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
sha512_neon_round2(32, s0, &ab, &cd, &ef, &gh);
s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
sha512_neon_round2(34, s1, &gh, &ab, &cd, &ef);
s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
sha512_neon_round2(36, s2, &ef, &gh, &ab, &cd);
s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
sha512_neon_round2(38, s3, &cd, &ef, &gh, &ab);
s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
sha512_neon_round2(40, s4, &ab, &cd, &ef, &gh);
s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
sha512_neon_round2(42, s5, &gh, &ab, &cd, &ef);
s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
sha512_neon_round2(44, s6, &ef, &gh, &ab, &cd);
s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
sha512_neon_round2(46, s7, &cd, &ef, &gh, &ab);
s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
sha512_neon_round2(48, s0, &ab, &cd, &ef, &gh);
s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
sha512_neon_round2(50, s1, &gh, &ab, &cd, &ef);
s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
sha512_neon_round2(52, s2, &ef, &gh, &ab, &cd);
s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
sha512_neon_round2(54, s3, &cd, &ef, &gh, &ab);
s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
sha512_neon_round2(56, s4, &ab, &cd, &ef, &gh);
s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
sha512_neon_round2(58, s5, &gh, &ab, &cd, &ef);
s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
sha512_neon_round2(60, s6, &ef, &gh, &ab, &cd);
s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
sha512_neon_round2(62, s7, &cd, &ef, &gh, &ab);
s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
sha512_neon_round2(64, s0, &ab, &cd, &ef, &gh);
s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
sha512_neon_round2(66, s1, &gh, &ab, &cd, &ef);
s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
sha512_neon_round2(68, s2, &ef, &gh, &ab, &cd);
s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
sha512_neon_round2(70, s3, &cd, &ef, &gh, &ab);
s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
sha512_neon_round2(72, s4, &ab, &cd, &ef, &gh);
s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
sha512_neon_round2(74, s5, &gh, &ab, &cd, &ef);
s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
sha512_neon_round2(76, s6, &ef, &gh, &ab, &cd);
s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
sha512_neon_round2(78, s7, &cd, &ef, &gh, &ab);
core->ab = vaddq_u64(core->ab, ab);
core->cd = vaddq_u64(core->cd, cd);
core->ef = vaddq_u64(core->ef, ef);
core->gh = vaddq_u64(core->gh, gh);
}
typedef struct sha512_neon {
sha512_neon_core core;
sha512_block blk;
BinarySink_IMPLEMENTATION;
ssh_hash hash;
} sha512_neon;
static void sha512_neon_write(BinarySink *bs, const void *vp, size_t len);
static ssh_hash *sha512_neon_new(const ssh_hashalg *alg)
{
if (!sha512_hw_available_cached())
return NULL;
sha512_neon *s = snew(sha512_neon);
s->hash.vt = alg;
BinarySink_INIT(s, sha512_neon_write);
BinarySink_DELEGATE_INIT(&s->hash, s);
return &s->hash;
}
static void sha512_neon_reset(ssh_hash *hash)
{
sha512_neon *s = container_of(hash, sha512_neon, hash);
const uint64_t *iv = (const uint64_t *)hash->vt->extra;
s->core.ab = vld1q_u64(iv);
s->core.cd = vld1q_u64(iv+2);
s->core.ef = vld1q_u64(iv+4);
s->core.gh = vld1q_u64(iv+6);
sha512_block_setup(&s->blk);
}
static void sha512_neon_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
sha512_neon *copy = container_of(hcopy, sha512_neon, hash);
sha512_neon *orig = container_of(horig, sha512_neon, hash);
*copy = *orig; /* structure copy */
BinarySink_COPIED(copy);
BinarySink_DELEGATE_INIT(&copy->hash, copy);
}
static void sha512_neon_free(ssh_hash *hash)
{
sha512_neon *s = container_of(hash, sha512_neon, hash);
smemclr(s, sizeof(*s));
sfree(s);
}
static void sha512_neon_write(BinarySink *bs, const void *vp, size_t len)
{
sha512_neon *s = BinarySink_DOWNCAST(bs, sha512_neon);
while (len > 0)
if (sha512_block_write(&s->blk, &vp, &len))
sha512_neon_block(&s->core, s->blk.block);
}
static void sha512_neon_digest(ssh_hash *hash, uint8_t *digest)
{
sha512_neon *s = container_of(hash, sha512_neon, hash);
sha512_block_pad(&s->blk, BinarySink_UPCAST(s));
vst1q_u8(digest, vrev64q_u8(vreinterpretq_u8_u64(s->core.ab)));
vst1q_u8(digest+16, vrev64q_u8(vreinterpretq_u8_u64(s->core.cd)));
vst1q_u8(digest+32, vrev64q_u8(vreinterpretq_u8_u64(s->core.ef)));
vst1q_u8(digest+48, vrev64q_u8(vreinterpretq_u8_u64(s->core.gh)));
}
static void sha384_neon_digest(ssh_hash *hash, uint8_t *digest)
{
sha512_neon *s = container_of(hash, sha512_neon, hash);
sha512_block_pad(&s->blk, BinarySink_UPCAST(s));
vst1q_u8(digest, vrev64q_u8(vreinterpretq_u8_u64(s->core.ab)));
vst1q_u8(digest+16, vrev64q_u8(vreinterpretq_u8_u64(s->core.cd)));
vst1q_u8(digest+32, vrev64q_u8(vreinterpretq_u8_u64(s->core.ef)));
}
const ssh_hashalg ssh_sha512_hw = {
.new = sha512_neon_new,
.reset = sha512_neon_reset,
.copyfrom = sha512_neon_copyfrom,
.digest = sha512_neon_digest,
.free = sha512_neon_free,
.hlen = 64,
.blocklen = 128,
HASHALG_NAMES_ANNOTATED("SHA-512", "NEON accelerated"),
.extra = sha512_initial_state,
};
const ssh_hashalg ssh_sha384_hw = {
.new = sha512_neon_new,
.reset = sha512_neon_reset,
.copyfrom = sha512_neon_copyfrom,
.digest = sha384_neon_digest,
.free = sha512_neon_free,
.hlen = 48,
.blocklen = 128,
HASHALG_NAMES_ANNOTATED("SHA-384", "NEON accelerated"),
.extra = sha384_initial_state,
};
/* ----------------------------------------------------------------------
* Stub functions if we have no hardware-accelerated SHA-512. In this
* case, sha512_hw_new returns NULL (though it should also never be
* selected by sha512_select, so the only thing that should even be
* _able_ to call it is testcrypt). As a result, the remaining vtable
* functions should never be called at all.
*/
#elif HW_SHA512 == HW_SHA512_NONE
static bool sha512_hw_available(void)
{
return false;
}
static ssh_hash *sha512_stub_new(const ssh_hashalg *alg)
{
return NULL;
}
#define STUB_BODY { unreachable("Should never be called"); }
static void sha512_stub_reset(ssh_hash *hash) STUB_BODY
static void sha512_stub_copyfrom(ssh_hash *hash, ssh_hash *orig) STUB_BODY
static void sha512_stub_free(ssh_hash *hash) STUB_BODY
static void sha512_stub_digest(ssh_hash *hash, uint8_t *digest) STUB_BODY
const ssh_hashalg ssh_sha512_hw = {
.new = sha512_stub_new,
.reset = sha512_stub_reset,
.copyfrom = sha512_stub_copyfrom,
.digest = sha512_stub_digest,
.free = sha512_stub_free,
.hlen = 64,
.blocklen = 128,
HASHALG_NAMES_ANNOTATED("SHA-512", "!NONEXISTENT ACCELERATED VERSION!"),
};
const ssh_hashalg ssh_sha384_hw = {
.new = sha512_stub_new,
.reset = sha512_stub_reset,
.copyfrom = sha512_stub_copyfrom,
.digest = sha512_stub_digest,
.free = sha512_stub_free,
.hlen = 48,
.blocklen = 128,
HASHALG_NAMES_ANNOTATED("SHA-384", "!NONEXISTENT ACCELERATED VERSION!"),
};
#endif /* HW_SHA512 */

crypto/sha512.h Normal file

@ -0,0 +1,131 @@
/*
* Definitions likely to be helpful to multiple SHA-512 implementations.
*/
/*
* The 'extra' structure used by SHA-512 implementations is used to
* include information about how to check if a given implementation is
* available at run time, and whether we've already checked.
*/
struct sha512_extra_mutable;
struct sha512_extra {
/* Pointer to the initial state (distinguishes SHA-384 from -512) */
const uint64_t *initial_state;
/* Function to check availability. Might be expensive, so we don't
* want to call it more than once. */
bool (*check_available)(void);
/* Pointer to a writable substructure. */
struct sha512_extra_mutable *mut;
};
struct sha512_extra_mutable {
bool checked_availability;
bool is_available;
};
static inline bool check_availability(const struct sha512_extra *extra)
{
if (!extra->mut->checked_availability) {
extra->mut->is_available = extra->check_available();
extra->mut->checked_availability = true;
}
return extra->mut->is_available;
}
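A minimal standalone sketch, not part of this header, of why the writable sub-structure is split out: the 'extra' structure itself can stay const, while the result of the (possibly expensive) probe is cached so it runs at most once. The fake_probe function and counter below are invented purely for illustration.
static int fake_probe_calls = 0;
static bool fake_probe(void)
{
    fake_probe_calls++;            /* stands in for a slow CPU-feature query */
    return true;
}
static struct sha512_extra_mutable fake_mut;
static const struct sha512_extra fake_extra = {
    .initial_state = NULL,         /* not needed for this illustration */
    .check_available = fake_probe,
    .mut = &fake_mut,
};
/* However many times check_availability(&fake_extra) is called,
 * fake_probe_calls never exceeds 1. */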
/*
* Macro to define a pair of SHA-{384,512} vtables together with their
* 'extra' structure.
*/
#define SHA512_VTABLES(impl_c, impl_display) \
static struct sha512_extra_mutable sha512_ ## impl_c ## _extra_mut; \
static const struct sha512_extra sha384_ ## impl_c ## _extra = { \
.initial_state = sha384_initial_state, \
.check_available = sha512_ ## impl_c ## _available, \
.mut = &sha512_ ## impl_c ## _extra_mut, \
}; \
static const struct sha512_extra sha512_ ## impl_c ## _extra = { \
.initial_state = sha512_initial_state, \
.check_available = sha512_ ## impl_c ## _available, \
.mut = &sha512_ ## impl_c ## _extra_mut, \
}; \
const ssh_hashalg ssh_sha384_ ## impl_c = { \
.new = sha512_ ## impl_c ## _new, \
.reset = sha512_ ## impl_c ## _reset, \
.copyfrom = sha512_ ## impl_c ## _copyfrom, \
.digest = sha384_ ## impl_c ## _digest, \
.free = sha512_ ## impl_c ## _free, \
.hlen = 48, \
.blocklen = 128, \
HASHALG_NAMES_ANNOTATED("SHA-384", impl_display), \
.extra = &sha384_ ## impl_c ## _extra, \
}; \
const ssh_hashalg ssh_sha512_ ## impl_c = { \
.new = sha512_ ## impl_c ## _new, \
.reset = sha512_ ## impl_c ## _reset, \
.copyfrom = sha512_ ## impl_c ## _copyfrom, \
.digest = sha512_ ## impl_c ## _digest, \
.free = sha512_ ## impl_c ## _free, \
.hlen = 64, \
.blocklen = 128, \
HASHALG_NAMES_ANNOTATED("SHA-512", impl_display), \
.extra = &sha512_ ## impl_c ## _extra, \
}
extern const uint64_t sha512_initial_state[8];
extern const uint64_t sha384_initial_state[8];
extern const uint64_t sha512_round_constants[80];
#define SHA512_ROUNDS 80
typedef struct sha512_block sha512_block;
struct sha512_block {
uint8_t block[128];
size_t used;
uint64_t lenhi, lenlo;
};
static inline void sha512_block_setup(sha512_block *blk)
{
blk->used = 0;
blk->lenhi = blk->lenlo = 0;
}
static inline bool sha512_block_write(
sha512_block *blk, const void **vdata, size_t *len)
{
size_t blkleft = sizeof(blk->block) - blk->used;
size_t chunk = *len < blkleft ? *len : blkleft;
const uint8_t *p = *vdata;
memcpy(blk->block + blk->used, p, chunk);
*vdata = p + chunk;
*len -= chunk;
blk->used += chunk;
size_t chunkbits = chunk << 3;
blk->lenlo += chunkbits;
blk->lenhi += (blk->lenlo < chunkbits);
if (blk->used == sizeof(blk->block)) {
blk->used = 0;
return true;
}
return false;
}
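The lenhi/lenlo pair holds SHA-512's 128-bit message bit count, maintained with the usual wraparound test for carry: if the low word ends up smaller than the quantity just added, the addition overflowed, so 1 is carried into the high word. A standalone sanity check of that idiom (an illustration, not part of the header):
#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint64_t lenhi = 0, lenlo = UINT64_MAX - 5;
    uint64_t chunkbits = 64;            /* 8 bytes written = 64 bits */
    lenlo += chunkbits;                 /* wraps around to 58 */
    lenhi += (lenlo < chunkbits);       /* wraparound detected: carry 1 */
    assert(lenhi == 1 && lenlo == 58);
    return 0;
}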
static inline void sha512_block_pad(sha512_block *blk, BinarySink *bs)
{
uint64_t final_lenhi = blk->lenhi;
uint64_t final_lenlo = blk->lenlo;
size_t pad = 127 & (111 - blk->used);
put_byte(bs, 0x80);
put_padding(bs, pad, 0);
put_uint64(bs, final_lenhi);
put_uint64(bs, final_lenlo);
assert(blk->used == 0 && "Should have exactly hit a block boundary");
}
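The padding length 127 & (111 - blk->used) is chosen so that the 0x80 byte, the zero padding and the 16-byte length trailer land exactly on a 128-byte block boundary, which is what the assertion checks. A quick standalone verification of that arithmetic, again just an illustration rather than part of the build:
#include <assert.h>
#include <stddef.h>

int main(void)
{
    for (size_t used = 0; used < 128; used++) {
        size_t pad = 127 & (111 - used);
        /* 1 byte of 0x80 + pad zero bytes + 16 bytes of length field */
        assert((used + 1 + pad + 16) % 128 == 0);
    }
    return 0;
}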

ssh.h

@ -953,22 +953,28 @@ extern const ssh_cipheralg ssh_3des_ssh2;
extern const ssh_cipheralg ssh_des;
extern const ssh_cipheralg ssh_des_sshcom_ssh2;
extern const ssh_cipheralg ssh_aes256_sdctr;
extern const ssh_cipheralg ssh_aes256_sdctr_hw;
extern const ssh_cipheralg ssh_aes256_sdctr_ni;
extern const ssh_cipheralg ssh_aes256_sdctr_neon;
extern const ssh_cipheralg ssh_aes256_sdctr_sw;
extern const ssh_cipheralg ssh_aes256_cbc;
extern const ssh_cipheralg ssh_aes256_cbc_hw;
extern const ssh_cipheralg ssh_aes256_cbc_ni;
extern const ssh_cipheralg ssh_aes256_cbc_neon;
extern const ssh_cipheralg ssh_aes256_cbc_sw;
extern const ssh_cipheralg ssh_aes192_sdctr;
extern const ssh_cipheralg ssh_aes192_sdctr_hw;
extern const ssh_cipheralg ssh_aes192_sdctr_ni;
extern const ssh_cipheralg ssh_aes192_sdctr_neon;
extern const ssh_cipheralg ssh_aes192_sdctr_sw;
extern const ssh_cipheralg ssh_aes192_cbc;
extern const ssh_cipheralg ssh_aes192_cbc_hw;
extern const ssh_cipheralg ssh_aes192_cbc_ni;
extern const ssh_cipheralg ssh_aes192_cbc_neon;
extern const ssh_cipheralg ssh_aes192_cbc_sw;
extern const ssh_cipheralg ssh_aes128_sdctr;
extern const ssh_cipheralg ssh_aes128_sdctr_hw;
extern const ssh_cipheralg ssh_aes128_sdctr_ni;
extern const ssh_cipheralg ssh_aes128_sdctr_neon;
extern const ssh_cipheralg ssh_aes128_sdctr_sw;
extern const ssh_cipheralg ssh_aes128_cbc;
extern const ssh_cipheralg ssh_aes128_cbc_hw;
extern const ssh_cipheralg ssh_aes128_cbc_ni;
extern const ssh_cipheralg ssh_aes128_cbc_neon;
extern const ssh_cipheralg ssh_aes128_cbc_sw;
extern const ssh_cipheralg ssh_blowfish_ssh2_ctr;
extern const ssh_cipheralg ssh_blowfish_ssh2;
@ -983,16 +989,18 @@ extern const ssh2_ciphers ssh2_arcfour;
extern const ssh2_ciphers ssh2_ccp;
extern const ssh_hashalg ssh_md5;
extern const ssh_hashalg ssh_sha1;
extern const ssh_hashalg ssh_sha1_hw;
extern const ssh_hashalg ssh_sha1_ni;
extern const ssh_hashalg ssh_sha1_neon;
extern const ssh_hashalg ssh_sha1_sw;
extern const ssh_hashalg ssh_sha256;
extern const ssh_hashalg ssh_sha256_hw;
extern const ssh_hashalg ssh_sha256_ni;
extern const ssh_hashalg ssh_sha256_neon;
extern const ssh_hashalg ssh_sha256_sw;
extern const ssh_hashalg ssh_sha384;
extern const ssh_hashalg ssh_sha384_hw;
extern const ssh_hashalg ssh_sha384_neon;
extern const ssh_hashalg ssh_sha384_sw;
extern const ssh_hashalg ssh_sha512;
extern const ssh_hashalg ssh_sha512_hw;
extern const ssh_hashalg ssh_sha512_neon;
extern const ssh_hashalg ssh_sha512_sw;
extern const ssh_hashalg ssh_sha3_224;
extern const ssh_hashalg ssh_sha3_256;
@ -1039,10 +1047,10 @@ ssh_hash *blake2b_new_general(unsigned hashlen);
* itself. If so, then this function should be implemented in each
* platform subdirectory.
*/
bool platform_aes_hw_available(void);
bool platform_sha256_hw_available(void);
bool platform_sha1_hw_available(void);
bool platform_sha512_hw_available(void);
bool platform_aes_neon_available(void);
bool platform_sha256_neon_available(void);
bool platform_sha1_neon_available(void);
bool platform_sha512_neon_available(void);
/*
* PuTTY version number formatted as an SSH version string.


@ -141,6 +141,14 @@ def mac_str(alg, key, message, cipher=None):
def lcm(a, b):
return a * b // gcd(a, b)
def get_implementations(alg):
return get_implementations_commasep(alg).decode("ASCII").split(",")
def get_aes_impls():
return [impl.rsplit("_", 1)[-1]
for impl in get_implementations("aes128_cbc")
if impl.startswith("aes128_cbc_")]
class MyTestBase(unittest.TestCase):
"Intermediate class that adds useful helper methods."
def assertEqualBin(self, x, y):
@ -1181,9 +1189,9 @@ class crypt(MyTestBase):
# reference implementation of AES in Python. ('Mostly'
# independent in that it was written by me.)
def vector(cipher, key, iv, plaintext, ciphertext):
for suffix in "hw", "sw":
c = ssh_cipher_new("{}_{}".format(cipher, suffix))
def vector(cipherbase, key, iv, plaintext, ciphertext):
for cipher in get_implementations(cipherbase):
c = ssh_cipher_new(cipher)
if c is None: return # skip test if HW AES not available
ssh_cipher_setkey(c, key)
ssh_cipher_setiv(c, iv)
@ -1302,7 +1310,7 @@ class crypt(MyTestBase):
# We also test this at all three AES key lengths, in case the
# core cipher routines are written separately for each one.
for suffix in "hw", "sw":
for suffix in get_aes_impls():
for keylen in [128, 192, 256]:
hexTestValues = ["00000000", "00000001", "ffffffff"]
for ivHexBytes in itertools.product(*([hexTestValues] * 4)):
@ -1325,7 +1333,7 @@ class crypt(MyTestBase):
for keylen in [128, 192, 256]:
decryptions = []
for suffix in "hw", "sw":
for suffix in get_aes_impls():
c = ssh_cipher_new("aes{:d}_cbc_{}".format(keylen, suffix))
if c is None: continue
ssh_cipher_setkey(c, test_key[:keylen//8])
@ -1493,23 +1501,11 @@ class crypt(MyTestBase):
("3des_ssh1", 24, 8, False, unhex('d5f1cc25b8fbc62de63590b9b92344adf6dd72753273ff0fb32d4dbc6af858529129f34242f3d557eed3a5c84204eb4f868474294964cf70df5d8f45dfccfc45')),
("des_cbc", 8, 8, True, unhex('051524e77fb40e109d9fffeceacf0f28c940e2f8415ddccc117020bdd2612af5036490b12085d0e46129919b8e499f51cb82a4b341d7a1a1ea3e65201ef248f6')),
("aes256_ctr", 32, 16, False, unhex('b87b35e819f60f0f398a37b05d7bcf0b04ad4ebe570bd08e8bfa8606bafb0db2cfcd82baf2ccceae5de1a3c1ae08a8b8fdd884fdc5092031ea8ce53333e62976')),
("aes256_ctr_hw", 32, 16, False, unhex('b87b35e819f60f0f398a37b05d7bcf0b04ad4ebe570bd08e8bfa8606bafb0db2cfcd82baf2ccceae5de1a3c1ae08a8b8fdd884fdc5092031ea8ce53333e62976')),
("aes256_ctr_sw", 32, 16, False, unhex('b87b35e819f60f0f398a37b05d7bcf0b04ad4ebe570bd08e8bfa8606bafb0db2cfcd82baf2ccceae5de1a3c1ae08a8b8fdd884fdc5092031ea8ce53333e62976')),
("aes256_cbc", 32, 16, True, unhex('381cbb2fbcc48118d0094540242bd990dd6af5b9a9890edd013d5cad2d904f34b9261c623a452f32ea60e5402919a77165df12862742f1059f8c4a862f0827c5')),
("aes256_cbc_hw", 32, 16, True, unhex('381cbb2fbcc48118d0094540242bd990dd6af5b9a9890edd013d5cad2d904f34b9261c623a452f32ea60e5402919a77165df12862742f1059f8c4a862f0827c5')),
("aes256_cbc_sw", 32, 16, True, unhex('381cbb2fbcc48118d0094540242bd990dd6af5b9a9890edd013d5cad2d904f34b9261c623a452f32ea60e5402919a77165df12862742f1059f8c4a862f0827c5')),
("aes192_ctr", 24, 16, False, unhex('06bcfa7ccf075d723e12b724695a571a0fad67c56287ea609c410ac12749c51bb96e27fa7e1c7ea3b14792bbbb8856efb0617ebec24a8e4a87340d820cf347b8')),
("aes192_ctr_hw", 24, 16, False, unhex('06bcfa7ccf075d723e12b724695a571a0fad67c56287ea609c410ac12749c51bb96e27fa7e1c7ea3b14792bbbb8856efb0617ebec24a8e4a87340d820cf347b8')),
("aes192_ctr_sw", 24, 16, False, unhex('06bcfa7ccf075d723e12b724695a571a0fad67c56287ea609c410ac12749c51bb96e27fa7e1c7ea3b14792bbbb8856efb0617ebec24a8e4a87340d820cf347b8')),
("aes192_cbc", 24, 16, True, unhex('ac97f8698170f9c05341214bd7624d5d2efef8311596163dc597d9fe6c868971bd7557389974612cbf49ea4e7cc6cc302d4cc90519478dd88a4f09b530c141f3')),
("aes192_cbc_hw", 24, 16, True, unhex('ac97f8698170f9c05341214bd7624d5d2efef8311596163dc597d9fe6c868971bd7557389974612cbf49ea4e7cc6cc302d4cc90519478dd88a4f09b530c141f3')),
("aes192_cbc_sw", 24, 16, True, unhex('ac97f8698170f9c05341214bd7624d5d2efef8311596163dc597d9fe6c868971bd7557389974612cbf49ea4e7cc6cc302d4cc90519478dd88a4f09b530c141f3')),
("aes128_ctr", 16, 16, False, unhex('0ad4ddfd2360ec59d77dcb9a981f92109437c68c5e7f02f92017d9f424f89ab7850473ac0e19274125e740f252c84ad1f6ad138b6020a03bdaba2f3a7378ce1e')),
("aes128_ctr_hw", 16, 16, False, unhex('0ad4ddfd2360ec59d77dcb9a981f92109437c68c5e7f02f92017d9f424f89ab7850473ac0e19274125e740f252c84ad1f6ad138b6020a03bdaba2f3a7378ce1e')),
("aes128_ctr_sw", 16, 16, False, unhex('0ad4ddfd2360ec59d77dcb9a981f92109437c68c5e7f02f92017d9f424f89ab7850473ac0e19274125e740f252c84ad1f6ad138b6020a03bdaba2f3a7378ce1e')),
("aes128_cbc", 16, 16, True, unhex('36de36917fb7955a711c8b0bf149b29120a77524f393ae3490f4ce5b1d5ca2a0d7064ce3c38e267807438d12c0e40cd0d84134647f9f4a5b11804a0cc5070e62')),
("aes128_cbc_hw", 16, 16, True, unhex('36de36917fb7955a711c8b0bf149b29120a77524f393ae3490f4ce5b1d5ca2a0d7064ce3c38e267807438d12c0e40cd0d84134647f9f4a5b11804a0cc5070e62')),
("aes128_cbc_sw", 16, 16, True, unhex('36de36917fb7955a711c8b0bf149b29120a77524f393ae3490f4ce5b1d5ca2a0d7064ce3c38e267807438d12c0e40cd0d84134647f9f4a5b11804a0cc5070e62')),
("blowfish_ctr", 32, 8, False, unhex('079daf0f859363ccf72e975764d709232ec48adc74f88ccd1f342683f0bfa89ca0e8dbfccc8d4d99005d6b61e9cc4e6eaa2fd2a8163271b94bf08ef212129f01')),
("blowfish_ssh2", 16, 8, True, unhex('e986b7b01f17dfe80ee34cac81fa029b771ec0f859ae21ae3ec3df1674bc4ceb54a184c6c56c17dd2863c3e9c068e76fd9aef5673465995f0d648b0bb848017f')),
("blowfish_ssh1", 32, 8, True, unhex('d44092a9035d895acf564ba0365d19570fbb4f125d5a4fd2a1812ee6c8a1911a51bb181fbf7d1a261253cab71ee19346eb477b3e7ecf1d95dd941e635c1a4fbf')),
@ -1517,36 +1513,37 @@ class crypt(MyTestBase):
("arcfour128", 16, None, False, unhex('fd4af54c5642cb29629e50a15d22e4944e21ffba77d0543b27590eafffe3886686d1aefae0484afc9e67edc0e67eb176bbb5340af1919ea39adfe866d066dd05')),
]
for alg, keylen, ivlen, simple_cbc, c in ciphers:
cipher = ssh_cipher_new(alg)
if cipher is None:
continue # hardware-accelerated cipher not available
for algbase, keylen, ivlen, simple_cbc, c in ciphers:
for alg in get_implementations(algbase):
cipher = ssh_cipher_new(alg)
if cipher is None:
continue # hardware-accelerated cipher not available
ssh_cipher_setkey(cipher, k[:keylen])
if ivlen is not None:
ssh_cipher_setiv(cipher, iv[:ivlen])
self.assertEqualBin(ssh_cipher_encrypt(cipher, p), c)
ssh_cipher_setkey(cipher, k[:keylen])
if ivlen is not None:
ssh_cipher_setiv(cipher, iv[:ivlen])
self.assertEqualBin(ssh_cipher_decrypt(cipher, c), p)
if simple_cbc:
# CBC ciphers (other than the three-layered CBC used
# by SSH-1 3DES) have more specific semantics for
# their IV than 'some kind of starting state for the
# cipher mode': the IV is specifically supposed to
# represent the previous block of ciphertext. So we
# can check that, by supplying the IV _as_ a
# ciphertext block via a call to decrypt(), and seeing
# if that causes our test ciphertext to decrypt the
# same way as when we provided the same IV via
# setiv().
ssh_cipher_setkey(cipher, k[:keylen])
ssh_cipher_decrypt(cipher, iv[:ivlen])
if ivlen is not None:
ssh_cipher_setiv(cipher, iv[:ivlen])
self.assertEqualBin(ssh_cipher_encrypt(cipher, p), c)
ssh_cipher_setkey(cipher, k[:keylen])
if ivlen is not None:
ssh_cipher_setiv(cipher, iv[:ivlen])
self.assertEqualBin(ssh_cipher_decrypt(cipher, c), p)
if simple_cbc:
# CBC ciphers (other than the three-layered CBC used
# by SSH-1 3DES) have more specific semantics for
# their IV than 'some kind of starting state for the
# cipher mode': the IV is specifically supposed to
# represent the previous block of ciphertext. So we
# can check that, by supplying the IV _as_ a
# ciphertext block via a call to decrypt(), and seeing
# if that causes our test ciphertext to decrypt the
# same way as when we provided the same IV via
# setiv().
ssh_cipher_setkey(cipher, k[:keylen])
ssh_cipher_decrypt(cipher, iv[:ivlen])
self.assertEqualBin(ssh_cipher_decrypt(cipher, c), p)
def testRSAKex(self):
# Round-trip test of the RSA key exchange functions, plus a
# hardcoded plain/ciphertext pair to guard against the
@ -2324,7 +2321,7 @@ Private-MAC: 5b1f6f4cc43eb0060d2c3e181bc0129343adba2b
class standard_test_vectors(MyTestBase):
def testAES(self):
def vector(cipher, key, plaintext, ciphertext):
for suffix in "hw", "sw":
for suffix in get_aes_impls():
c = ssh_cipher_new("{}_{}".format(cipher, suffix))
if c is None: return # skip test if HW AES not available
ssh_cipher_setkey(c, key)
@ -2540,7 +2537,7 @@ class standard_test_vectors(MyTestBase):
unhex('56be34521d144c88dbb8c733f0e8b3f6'))
def testSHA1(self):
for hashname in ['sha1_sw', 'sha1_hw']:
for hashname in get_implementations("sha1"):
if ssh_hash_new(hashname) is None:
continue # skip testing of unavailable HW implementation
@ -2577,7 +2574,7 @@ class standard_test_vectors(MyTestBase):
"cb0082c8f197d260991ba6a460e76e202bad27b3"))
def testSHA256(self):
for hashname in ['sha256_sw', 'sha256_hw']:
for hashname in get_implementations("sha256"):
if ssh_hash_new(hashname) is None:
continue # skip testing of unavailable HW implementation
@ -2621,7 +2618,7 @@ class standard_test_vectors(MyTestBase):
"8ad3361763f7e9b2d95f4f0da6e1ccbc"))
def testSHA384(self):
for hashname in ['sha384_sw', 'sha384_hw']:
for hashname in get_implementations("sha384"):
if ssh_hash_new(hashname) is None:
continue # skip testing of unavailable HW implementation
@ -2663,7 +2660,7 @@ class standard_test_vectors(MyTestBase):
'38e42b5c4de660f5de8fb2a5b2fbd2a3cbffd20cff1288c0'))
def testSHA512(self):
for hashname in ['sha512_sw', 'sha512_hw']:
for hashname in get_implementations("sha512"):
if ssh_hash_new(hashname) is None:
continue # skip testing of unavailable HW implementation


@ -207,16 +207,24 @@ static const ssh_hashalg *get_hashalg(BinarySource *in)
{"md5", &ssh_md5},
{"sha1", &ssh_sha1},
{"sha1_sw", &ssh_sha1_sw},
{"sha1_hw", &ssh_sha1_hw},
{"sha256", &ssh_sha256},
{"sha256_sw", &ssh_sha256_sw},
{"sha256_hw", &ssh_sha256_hw},
{"sha384", &ssh_sha384},
{"sha384_sw", &ssh_sha384_sw},
{"sha384_hw", &ssh_sha384_hw},
{"sha512", &ssh_sha512},
{"sha256_sw", &ssh_sha256_sw},
{"sha384_sw", &ssh_sha384_sw},
{"sha512_sw", &ssh_sha512_sw},
{"sha512_hw", &ssh_sha512_hw},
#if HAVE_SHA_NI
{"sha1_ni", &ssh_sha1_ni},
{"sha256_ni", &ssh_sha256_ni},
#endif
#if HAVE_NEON_CRYPTO
{"sha1_neon", &ssh_sha1_neon},
{"sha256_neon", &ssh_sha256_neon},
#endif
#if HAVE_NEON_SHA512
{"sha384_neon", &ssh_sha384_neon},
{"sha512_neon", &ssh_sha512_neon},
#endif
{"sha3_224", &ssh_sha3_224},
{"sha3_256", &ssh_sha3_256},
{"sha3_384", &ssh_sha3_384},
@ -290,23 +298,33 @@ static const ssh_cipheralg *get_cipheralg(BinarySource *in)
{"3des_ssh1", &ssh_3des_ssh1},
{"des_cbc", &ssh_des},
{"aes256_ctr", &ssh_aes256_sdctr},
{"aes256_ctr_hw", &ssh_aes256_sdctr_hw},
{"aes256_ctr_sw", &ssh_aes256_sdctr_sw},
{"aes256_cbc", &ssh_aes256_cbc},
{"aes256_cbc_hw", &ssh_aes256_cbc_hw},
{"aes256_cbc_sw", &ssh_aes256_cbc_sw},
{"aes192_ctr", &ssh_aes192_sdctr},
{"aes192_ctr_hw", &ssh_aes192_sdctr_hw},
{"aes192_ctr_sw", &ssh_aes192_sdctr_sw},
{"aes192_cbc", &ssh_aes192_cbc},
{"aes192_cbc_hw", &ssh_aes192_cbc_hw},
{"aes192_cbc_sw", &ssh_aes192_cbc_sw},
{"aes128_ctr", &ssh_aes128_sdctr},
{"aes128_ctr_hw", &ssh_aes128_sdctr_hw},
{"aes128_ctr_sw", &ssh_aes128_sdctr_sw},
{"aes128_cbc", &ssh_aes128_cbc},
{"aes128_cbc_hw", &ssh_aes128_cbc_hw},
{"aes256_ctr_sw", &ssh_aes256_sdctr_sw},
{"aes256_cbc_sw", &ssh_aes256_cbc_sw},
{"aes192_ctr_sw", &ssh_aes192_sdctr_sw},
{"aes192_cbc_sw", &ssh_aes192_cbc_sw},
{"aes128_ctr_sw", &ssh_aes128_sdctr_sw},
{"aes128_cbc_sw", &ssh_aes128_cbc_sw},
#if HAVE_AES_NI
{"aes256_ctr_ni", &ssh_aes256_sdctr_ni},
{"aes256_cbc_ni", &ssh_aes256_cbc_ni},
{"aes192_ctr_ni", &ssh_aes192_sdctr_ni},
{"aes192_cbc_ni", &ssh_aes192_cbc_ni},
{"aes128_ctr_ni", &ssh_aes128_sdctr_ni},
{"aes128_cbc_ni", &ssh_aes128_cbc_ni},
#endif
#if HAVE_NEON_CRYPTO
{"aes256_ctr_neon", &ssh_aes256_sdctr_neon},
{"aes256_cbc_neon", &ssh_aes256_cbc_neon},
{"aes192_ctr_neon", &ssh_aes192_sdctr_neon},
{"aes192_cbc_neon", &ssh_aes192_cbc_neon},
{"aes128_ctr_neon", &ssh_aes128_sdctr_neon},
{"aes128_cbc_neon", &ssh_aes128_cbc_neon},
#endif
{"blowfish_ctr", &ssh_blowfish_ssh2_ctr},
{"blowfish_ssh2", &ssh_blowfish_ssh2},
{"blowfish_ssh1", &ssh_blowfish_ssh1},
@ -1285,6 +1303,38 @@ strbuf *argon2_wrapper(Argon2Flavour flavour, uint32_t mem, uint32_t passes,
}
#define argon2 argon2_wrapper
strbuf *get_implementations_commasep(ptrlen alg)
{
strbuf *out = strbuf_new();
put_datapl(out, alg);
if (ptrlen_startswith(alg, PTRLEN_LITERAL("aes"), NULL)) {
strbuf_catf(out, ",%.*s_sw", PTRLEN_PRINTF(alg));
#if HAVE_AES_NI
strbuf_catf(out, ",%.*s_ni", PTRLEN_PRINTF(alg));
#endif
#if HAVE_NEON_CRYPTO
strbuf_catf(out, ",%.*s_neon", PTRLEN_PRINTF(alg));
#endif
} else if (ptrlen_startswith(alg, PTRLEN_LITERAL("sha256"), NULL) ||
ptrlen_startswith(alg, PTRLEN_LITERAL("sha1"), NULL)) {
strbuf_catf(out, ",%.*s_sw", PTRLEN_PRINTF(alg));
#if HAVE_SHA_NI
strbuf_catf(out, ",%.*s_ni", PTRLEN_PRINTF(alg));
#endif
#if HAVE_NEON_CRYPTO
strbuf_catf(out, ",%.*s_neon", PTRLEN_PRINTF(alg));
#endif
} else if (ptrlen_startswith(alg, PTRLEN_LITERAL("sha512"), NULL)) {
strbuf_catf(out, ",%.*s_sw", PTRLEN_PRINTF(alg));
#if HAVE_NEON_SHA512
strbuf_catf(out, ",%.*s_neon", PTRLEN_PRINTF(alg));
#endif
}
return out;
}
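As a concrete example of what this hands back to the Python test suite (using informal string notation for the ptrlen argument): on a hypothetical AArch64 build where HAVE_NEON_CRYPTO and HAVE_NEON_SHA512 are set and the x86-only HAVE_AES_NI / HAVE_SHA_NI are not, the returned strings would be:
/* get_implementations_commasep("aes128_cbc") -> "aes128_cbc,aes128_cbc_sw,aes128_cbc_neon"
 * get_implementations_commasep("sha1")       -> "sha1,sha1_sw,sha1_neon"
 * get_implementations_commasep("sha512")     -> "sha512,sha512_sw,sha512_neon" */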
#define OPTIONAL_PTR_FUNC(type) \
typedef TD_val_##type TD_opt_val_##type; \
static TD_opt_val_##type get_opt_val_##type(BinarySource *in) { \


@ -315,6 +315,7 @@ FUNC1(uint, crc32_rfc1662, val_string_ptrlen)
FUNC1(uint, crc32_ssh1, val_string_ptrlen)
FUNC2(uint, crc32_update, uint, val_string_ptrlen)
FUNC2(boolean, crcda_detect, val_string_ptrlen, val_string_ptrlen)
FUNC1(val_string, get_implementations_commasep, val_string_ptrlen)
/*
* These functions aren't part of PuTTY's own API, but are additions


@ -216,6 +216,31 @@ VOLATILE_WRAPPED_DEFN(static, size_t, looplimit, (size_t x))
return x;
}
#if HAVE_AES_NI
#define CIPHERS_AES_NI(X, Y) \
X(Y, ssh_aes256_sdctr_ni) \
X(Y, ssh_aes256_cbc_ni) \
X(Y, ssh_aes192_sdctr_ni) \
X(Y, ssh_aes192_cbc_ni) \
X(Y, ssh_aes128_sdctr_ni) \
X(Y, ssh_aes128_cbc_ni) \
/* end of list */
#else
#define CIPHERS_AES_NI(X, Y)
#endif
#if HAVE_NEON_CRYPTO
#define CIPHERS_AES_NEON(X, Y) \
X(Y, ssh_aes256_sdctr_neon) \
X(Y, ssh_aes256_cbc_neon) \
X(Y, ssh_aes192_sdctr_neon) \
X(Y, ssh_aes192_cbc_neon) \
X(Y, ssh_aes128_sdctr_neon) \
X(Y, ssh_aes128_cbc_neon) \
/* end of list */
#else
#define CIPHERS_AES_NEON(X, Y)
#endif
/* Ciphers that we expect to pass this test. Blowfish and Arcfour are
* intentionally omitted, because we already know they don't. */
#define CIPHERS(X, Y) \
@ -225,23 +250,19 @@ VOLATILE_WRAPPED_DEFN(static, size_t, looplimit, (size_t x))
X(Y, ssh_des) \
X(Y, ssh_des_sshcom_ssh2) \
X(Y, ssh_aes256_sdctr) \
X(Y, ssh_aes256_sdctr_hw) \
X(Y, ssh_aes256_sdctr_sw) \
X(Y, ssh_aes256_cbc) \
X(Y, ssh_aes256_cbc_hw) \
X(Y, ssh_aes256_cbc_sw) \
X(Y, ssh_aes192_sdctr) \
X(Y, ssh_aes192_sdctr_hw) \
X(Y, ssh_aes192_sdctr_sw) \
X(Y, ssh_aes192_cbc) \
X(Y, ssh_aes192_cbc_hw) \
X(Y, ssh_aes192_cbc_sw) \
X(Y, ssh_aes128_sdctr) \
X(Y, ssh_aes128_sdctr_hw) \
X(Y, ssh_aes128_sdctr_sw) \
X(Y, ssh_aes128_cbc) \
X(Y, ssh_aes128_cbc_hw) \
X(Y, ssh_aes256_sdctr_sw) \
X(Y, ssh_aes256_cbc_sw) \
X(Y, ssh_aes192_sdctr_sw) \
X(Y, ssh_aes192_cbc_sw) \
X(Y, ssh_aes128_sdctr_sw) \
X(Y, ssh_aes128_cbc_sw) \
CIPHERS_AES_NI(X, Y) \
CIPHERS_AES_NEON(X, Y) \
X(Y, ssh2_chacha20_poly1305) \
/* end of list */
@ -258,16 +279,35 @@ VOLATILE_WRAPPED_DEFN(static, size_t, looplimit, (size_t x))
#define MAC_TESTLIST(X, name) X(mac_ ## name)
#if HAVE_SHA_NI
#define HASH_SHA_NI(X, Y) X(Y, ssh_sha256_ni) X(Y, ssh_sha1_ni)
#else
#define HASH_SHA_NI(X, Y)
#endif
#if HAVE_NEON_CRYPTO
#define HASH_SHA_NEON(X, Y) X(Y, ssh_sha256_neon) X(Y, ssh_sha1_neon)
#else
#define HASH_SHA_NEON(X, Y)
#endif
#if HAVE_NEON_SHA512
#define HASH_SHA512_NEON(X, Y) X(Y, ssh_sha384_neon) X(Y, ssh_sha512_neon)
#else
#define HASH_SHA512_NEON(X, Y)
#endif
#define HASHES(X, Y) \
X(Y, ssh_md5) \
X(Y, ssh_sha1) \
X(Y, ssh_sha1_hw) \
X(Y, ssh_sha1_sw) \
X(Y, ssh_sha256) \
X(Y, ssh_sha256_hw) \
X(Y, ssh_sha256_sw) \
X(Y, ssh_sha384) \
X(Y, ssh_sha512) \
X(Y, ssh_sha384_sw) \
X(Y, ssh_sha512_sw) \
HASH_SHA_NI(X, Y) \
HASH_SHA_NEON(X, Y) \
HASH_SHA512_NEON(X, Y) \
X(Y, ssh_sha3_224) \
X(Y, ssh_sha3_256) \
X(Y, ssh_sha3_384) \


@ -10,7 +10,7 @@
#if defined __arm__ || defined __aarch64__
bool platform_aes_hw_available(void)
bool platform_aes_neon_available(void)
{
#if defined HWCAP_AES
return getauxval(AT_HWCAP) & HWCAP_AES;
@ -26,7 +26,7 @@ bool platform_aes_hw_available(void)
#endif
}
bool platform_sha256_hw_available(void)
bool platform_sha256_neon_available(void)
{
#if defined HWCAP_SHA2
return getauxval(AT_HWCAP) & HWCAP_SHA2;
@ -40,7 +40,7 @@ bool platform_sha256_hw_available(void)
#endif
}
bool platform_sha1_hw_available(void)
bool platform_sha1_neon_available(void)
{
#if defined HWCAP_SHA1
return getauxval(AT_HWCAP) & HWCAP_SHA1;
@ -54,7 +54,7 @@ bool platform_sha1_hw_available(void)
#endif
}
bool platform_sha512_hw_available(void)
bool platform_sha512_neon_available(void)
{
#if defined HWCAP_SHA512
return getauxval(AT_HWCAP) & HWCAP_SHA512;


@ -15,22 +15,22 @@
#define IsProcessorFeaturePresent(...) false
#endif
bool platform_aes_hw_available(void)
bool platform_aes_neon_available(void)
{
return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
}
bool platform_sha256_hw_available(void)
bool platform_sha256_neon_available(void)
{
return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
}
bool platform_sha1_hw_available(void)
bool platform_sha1_neon_available(void)
{
return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
}
bool platform_sha512_hw_available(void)
bool platform_sha512_neon_available(void)
{
/* As of 2020-12-24, as far as I can tell from docs.microsoft.com,
* Windows on Arm does not yet provide a PF_ARM_V8_* flag for the