1
0
mirror of https://git.tartarus.org/simon/putty.git synced 2025-07-01 11:32:48 -05:00

Move crypto into its own subdirectory.

Similarly to 'utils', I've moved all the stuff in the crypto
build-time library into a source directory of its own, and while I'm
at it, split up the monolithic sshauxcrypt.c into its various
unrelated parts.

This is also an opportunity to remove the annoying 'ssh' prefix from
the front of the file names, and give several of them less cryptic
names.
This commit is contained in:
Simon Tatham
2021-04-18 13:16:59 +01:00
parent 2b26ddf261
commit 5b30e6f7a6
32 changed files with 230 additions and 187 deletions

30
crypto/CMakeLists.txt Normal file
View File

@ -0,0 +1,30 @@
# Sources for the 'crypto' build-time library: PuTTY's cryptographic
# primitive implementations, roughly one algorithm (or key format) per
# file.
add_sources_from_current_dir(crypto
  aes.c
  arcfour.c
  argon2.c
  bcrypt.c
  blake2.c
  blowfish.c
  chacha20-poly1305.c
  crc32.c
  des.c
  diffie-hellman.c
  dsa.c
  ecc-arithmetic.c
  ecc-ssh.c
  hash_simple.c
  hmac.c
  mac.c
  mac_simple.c
  md5.c
  mpint.c
  prng.c
  pubkey-pem.c
  pubkey-ppk.c
  pubkey-ssh1.c
  rsa.c
  sha256.c
  sha512.c
  sha3.c
  sha1.c
  xdmauth.c)

1912
crypto/aes.c Normal file

File diff suppressed because it is too large Load Diff

141
crypto/arcfour.c Normal file
View File

@ -0,0 +1,141 @@
/*
* Arcfour (RC4) implementation for PuTTY.
*
* Coded from Schneier.
*/
#include <assert.h>
#include "ssh.h"
/*
 * Arcfour cipher state: the 256-byte permutation s and the two stream
 * indices i, j, wrapped around an ssh_cipher so that the vtable
 * methods below can recover the context via container_of.
 */
typedef struct {
    unsigned char i, j, s[256];
    ssh_cipher ciph;
} ArcfourContext;
/*
 * Core keystream step: XOR 'len' bytes at vblk with successive RC4
 * keystream bytes, advancing the cipher state held in 'handle'.
 */
static void arcfour_block(void *handle, void *vblk, int len)
{
    ArcfourContext *ctx = (ArcfourContext *)handle;
    unsigned char *data = (unsigned char *)vblk;
    unsigned char *state = ctx->s;
    unsigned char x = ctx->i, y = ctx->j;
    int pos;

    for (pos = 0; pos < len; pos++) {
        unsigned char swap;
        x = (x + 1) & 0xff;
        y = (y + state[x]) & 0xff;
        swap = state[x]; state[x] = state[y]; state[y] = swap;
        data[pos] ^= state[(state[x] + state[y]) & 0xff];
    }

    ctx->i = x;
    ctx->j = y;
}
/*
 * Standard RC4 key schedule: initialise the permutation to the
 * identity and perform one key-dependent mixing pass over it. The key
 * is replicated cyclically to fill 256 bytes.
 */
static void arcfour_setkey(ArcfourContext *ctx, unsigned char const *key,
                           unsigned keybytes)
{
    unsigned char tmp, k[256], *s;
    unsigned i, j;
    s = ctx->s;
    assert(keybytes <= 256);
    ctx->i = ctx->j = 0;
    for (i = 0; i < 256; i++) {
        s[i] = i;
        k[i] = key[i % keybytes];
    }
    j = 0;
    for (i = 0; i < 256; i++) {
        j = (j + s[i] + k[i]) & 0xff;
        tmp = s[i]; s[i] = s[j]; s[j] = tmp;
    }
    /* k[] holds an expanded copy of the secret key; wipe it before it
     * goes out of scope, matching the smemclr discipline used for key
     * material elsewhere in this file. */
    smemclr(k, sizeof(k));
}
/* -- Interface with PuTTY -- */
/*
* We don't implement Arcfour in SSH-1 because it's utterly insecure in
* several ways. See CERT Vulnerability Notes VU#25309, VU#665372,
* and VU#565052.
*
* We don't implement the "arcfour" algorithm in SSH-2 because it doesn't
* stir the cipher state before emitting keystream, and hence is likely
* to leak data about the key.
*/
/* Allocate a fresh Arcfour context and return its vtable handle. */
static ssh_cipher *arcfour_new(const ssh_cipheralg *alg)
{
    ArcfourContext *state = snew(ArcfourContext);

    state->ciph.vt = alg;
    return &state->ciph;
}
/* Wipe the key material and release the context. */
static void arcfour_free(ssh_cipher *cipher)
{
    ArcfourContext *state = container_of(cipher, ArcfourContext, ciph);

    smemclr(state, sizeof(*state));
    sfree(state);
}
/*
 * Discard the first 1536 bytes of keystream after key setup, by
 * encrypting (and then wiping) a scratch buffer of zeroes.
 */
static void arcfour_stir(ArcfourContext *ctx)
{
    const int discard = 1536;
    unsigned char *scratch = snewn(discard, unsigned char);

    memset(scratch, 0, discard);
    arcfour_block(ctx, scratch, discard);
    smemclr(scratch, discard);
    sfree(scratch);
}
static void arcfour_ssh2_setiv(ssh_cipher *cipher, const void *key)
{
    /* Deliberately empty: Arcfour is a pure stream cipher, so there
     * is no IV distinct from the key. */
}
/* SSH-2 key setup: run the RC4 key schedule for this variant's key
 * length, then discard the insecure initial keystream. */
static void arcfour_ssh2_setkey(ssh_cipher *cipher, const void *key)
{
    ArcfourContext *state = container_of(cipher, ArcfourContext, ciph);

    arcfour_setkey(state, key, state->ciph.vt->padded_keybytes);
    arcfour_stir(state);
}
/* Encrypt/decrypt (the operations are identical for a stream cipher). */
static void arcfour_ssh2_block(ssh_cipher *cipher, void *blk, int len)
{
    ArcfourContext *state = container_of(cipher, ArcfourContext, ciph);

    arcfour_block(state, blk, len);
}
/* Vtable for "arcfour128": RC4 with a 128-bit key, first 1536
 * keystream bytes discarded (see arcfour_stir above). */
const ssh_cipheralg ssh_arcfour128_ssh2 = {
    .new = arcfour_new,
    .free = arcfour_free,
    .setiv = arcfour_ssh2_setiv,
    .setkey = arcfour_ssh2_setkey,
    .encrypt = arcfour_ssh2_block,
    .decrypt = arcfour_ssh2_block,
    .ssh2_id = "arcfour128",
    .blksize = 1,
    .real_keybits = 128,
    .padded_keybytes = 16,
    .flags = 0,
    .text_name = "Arcfour-128",
};
/* Vtable for "arcfour256": identical except for the 256-bit key. */
const ssh_cipheralg ssh_arcfour256_ssh2 = {
    .new = arcfour_new,
    .free = arcfour_free,
    .setiv = arcfour_ssh2_setiv,
    .setkey = arcfour_ssh2_setkey,
    .encrypt = arcfour_ssh2_block,
    .decrypt = arcfour_ssh2_block,
    .ssh2_id = "arcfour256",
    .blksize = 1,
    .real_keybits = 256,
    .padded_keybytes = 32,
    .flags = 0,
    .text_name = "Arcfour-256",
};
/* Preference-ordered list exported to the SSH-2 transport layer:
 * stronger key first. */
static const ssh_cipheralg *const arcfour_list[] = {
    &ssh_arcfour256_ssh2,
    &ssh_arcfour128_ssh2,
};
const ssh2_ciphers ssh2_arcfour = { lenof(arcfour_list), arcfour_list };

565
crypto/argon2.c Normal file
View File

@ -0,0 +1,565 @@
/*
* Implementation of the Argon2 password hash function.
*
* My sources for the algorithm description and test vectors (the latter in
* test/cryptsuite.py) were the reference implementation on Github, and also
* the Internet-Draft description:
*
* https://github.com/P-H-C/phc-winner-argon2
* https://datatracker.ietf.org/doc/html/draft-irtf-cfrg-argon2-13
*/
#include <assert.h>
#include "putty.h"
#include "ssh.h"
#include "marshal.h"
/* ----------------------------------------------------------------------
* Argon2 uses data marshalling rules similar to SSH but with 32-bit integers
* stored little-endian. Start with some local BinarySink routines for storing
* a uint32 and a string in that fashion.
*/
/* Write a 32-bit integer to a BinarySink in little-endian order
 * (Argon2's convention, unlike SSH's big-endian marshalling). */
static void BinarySink_put_uint32_le(BinarySink *bs, unsigned long val)
{
    unsigned char buf[4];

    PUT_32BIT_LSB_FIRST(buf, val);
    bs->write(bs, buf, 4);
}
/* Write a length-prefixed string: a little-endian uint32 length
 * followed by the raw bytes. */
static void BinarySink_put_stringpl_le(BinarySink *bs, ptrlen pl)
{
    /* The length must fit in 32 bits; test via a 31-bit shift so we
     * never perform a shift of >= the width of size_t, which could be
     * implementation-defined. */
    assert((pl.len >> 31) < 2);

    BinarySink_put_uint32_le(bs, pl.len);
    bs->write(bs, pl.ptr, pl.len);
}
/* Convenience wrappers in the style of marshal.h, so the functions
 * above can be applied to anything implementing BinarySink. */
#define put_uint32_le(bs, val) \
    BinarySink_put_uint32_le(BinarySink_UPCAST(bs), val)
#define put_stringpl_le(bs, val) \
    BinarySink_put_stringpl_le(BinarySink_UPCAST(bs), val)
/* ----------------------------------------------------------------------
* Argon2 defines a hash-function family that's an extension of BLAKE2b to
* generate longer output digests, by repeatedly outputting half of a BLAKE2
* hash output and then re-hashing the whole thing until there are 64 or fewer
* bytes left to output. The spec calls this H' (a variant of the original
* hash it calls H, which is the unmodified BLAKE2b).
*/
/* Begin an H' computation producing 'length' bytes: a BLAKE2b hash
 * (capped at its 64-byte maximum) whose first input is the desired
 * output length. Pair with hprime_final below. */
static ssh_hash *hprime_new(unsigned length)
{
    unsigned first_hlen = length > 64 ? 64 : length;
    ssh_hash *h = blake2b_new_general(first_hlen);

    put_uint32_le(h, length);
    return h;
}
/*
 * Finish an H' computation begun by hprime_new, writing 'length'
 * bytes to vout. While more than 64 bytes of output remain, each
 * 64-byte BLAKE2b digest contributes only its first 32 bytes to the
 * output, and the whole digest is fed into a fresh hash to produce
 * the next stage; the final <= 64 bytes are emitted directly.
 */
static void hprime_final(ssh_hash *h, unsigned length, void *vout)
{
    uint8_t *out = (uint8_t *)vout;
    while (length > 64) {
        uint8_t hashbuf[64];
        ssh_hash_final(h, hashbuf);
        /* Emit half the digest... */
        memcpy(out, hashbuf, 32);
        out += 32;
        length -= 32;
        /* ...and re-hash all of it for the next stage. */
        h = blake2b_new_general(length > 64 ? 64 : length);
        put_data(h, hashbuf, 64);
        smemclr(hashbuf, sizeof(hashbuf));
    }
    ssh_hash_final(h, out);
}
/* Externally visible entry point for the long hash function. This is only
* used by testcrypt, so it would be overkill to set it up like a proper
* ssh_hash. */
/* Testcrypt-only entry point: run H' over 'data' and return the
 * 'length'-byte result in a freshly allocated strbuf. */
strbuf *argon2_long_hash(unsigned length, ptrlen data)
{
    strbuf *result = strbuf_new();
    ssh_hash *h = hprime_new(length);

    put_datapl(h, data);
    hprime_final(h, length, strbuf_append(result, length));
    return result;
}
/* ----------------------------------------------------------------------
* Argon2's own mixing function G, which operates on 1Kb blocks of data.
*
* The definition of G in the spec takes two 1Kb blocks as input and produces
* a 1Kb output block. The first thing that happens to the input blocks is
* that they get XORed together, and then only the XOR output is used, so you
* could perfectly well regard G as a 1Kb->1Kb function.
*/
/* Rotate a 64-bit word right by 'rotation' bits. Both shift counts
 * are reduced mod 64, so a rotation of 0 never triggers an undefined
 * shift by the full word width. */
static inline uint64_t ror(uint64_t x, unsigned rotation)
{
    unsigned right = rotation & 63;
    unsigned left = -rotation & 63;
    return (x >> right) | (x << left);
}
/* Keep only the low 32 bits of x (the spec's "x mod 2^32"). */
static inline uint64_t trunc32(uint64_t x)
{
    return (uint64_t)(uint32_t)x;
}
/* Internal function similar to the BLAKE2b round, which mixes up four 64-bit
* words */
static inline void GB(uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d)
{
    /* Like the BLAKE2b quarter-round, but each addition gains an
     * extra 2 * lo32(x) * lo32(y) term; the rotation amounts
     * (32, 24, 16, 63) match BLAKE2b's. Strictly sequential: each
     * statement depends on the previous one. */
    *a += *b + 2 * trunc32(*a) * trunc32(*b);
    *d = ror(*d ^ *a, 32);
    *c += *d + 2 * trunc32(*c) * trunc32(*d);
    *b = ror(*b ^ *c, 24);
    *a += *b + 2 * trunc32(*a) * trunc32(*b);
    *d = ror(*d ^ *a, 16);
    *c += *d + 2 * trunc32(*c) * trunc32(*d);
    *b = ror(*b ^ *c, 63);
}
/* Higher-level internal function which mixes up sixteen 64-bit words. This is
* applied to different subsets of the 128 words in a kilobyte block, and the
* API here is designed to make it easy to apply in the circumstances the spec
* requires. In every call, the sixteen words form eight pairs adjacent in
* memory, whose addresses are in arithmetic progression. So the 16 input
* words are in[0], in[1], in[instep], in[instep+1], ..., in[7*instep],
* in[7*instep+1], and the 16 output words similarly. */
static inline void P(uint64_t *out, unsigned outstep,
                     uint64_t *in, unsigned instep)
{
    /* Copy the sixteen input words into their output positions. */
    for (unsigned i = 0; i < 8; i++) {
        out[i*outstep] = in[i*instep];
        out[i*outstep+1] = in[i*instep+1];
    }
    /* First four GB calls mix the 'columns' of the 4x4 arrangement
     * of word pairs... */
    GB(out+0*outstep+0, out+2*outstep+0, out+4*outstep+0, out+6*outstep+0);
    GB(out+0*outstep+1, out+2*outstep+1, out+4*outstep+1, out+6*outstep+1);
    GB(out+1*outstep+0, out+3*outstep+0, out+5*outstep+0, out+7*outstep+0);
    GB(out+1*outstep+1, out+3*outstep+1, out+5*outstep+1, out+7*outstep+1);
    /* ...and the last four mix the 'diagonals'. */
    GB(out+0*outstep+0, out+2*outstep+1, out+5*outstep+0, out+7*outstep+1);
    GB(out+0*outstep+1, out+3*outstep+0, out+5*outstep+1, out+6*outstep+0);
    GB(out+1*outstep+0, out+3*outstep+1, out+4*outstep+0, out+6*outstep+1);
    GB(out+1*outstep+1, out+2*outstep+0, out+4*outstep+1, out+7*outstep+0);
}
/* The full G function, taking input blocks X and Y. The result of G is most
* often XORed into an existing output block, so this API is designed with
* that in mind: the mixing function's output is always XORed into whatever
* 1Kb of data is already at 'out'. */
static void G_xor(uint8_t *out, const uint8_t *X, const uint8_t *Y)
{
    uint64_t R[128], Q[128], Z[128];
    /* R = X xor Y, decoded into little-endian 64-bit words. */
    for (unsigned i = 0; i < 128; i++)
        R[i] = GET_64BIT_LSB_FIRST(X + 8*i) ^ GET_64BIT_LSB_FIRST(Y + 8*i);
    /* Apply P to each run of 16 consecutive words of R (a 'row')... */
    for (unsigned i = 0; i < 8; i++)
        P(Q+16*i, 2, R+16*i, 2);
    /* ...then to each stride-16 set of word pairs (a 'column'). */
    for (unsigned i = 0; i < 8; i++)
        P(Z+2*i, 16, Q+2*i, 16);
    /* XOR the final result R ^ Z over whatever is already in 'out'. */
    for (unsigned i = 0; i < 128; i++)
        PUT_64BIT_LSB_FIRST(out + 8*i,
                            GET_64BIT_LSB_FIRST(out + 8*i) ^ R[i] ^ Z[i]);
    /* All three scratch arrays are derived from secret data. */
    smemclr(R, sizeof(R));
    smemclr(Q, sizeof(Q));
    smemclr(Z, sizeof(Z));
}
/* ----------------------------------------------------------------------
* The main Argon2 function.
*/
/*
 * The core Argon2 computation. Parameters (matching the wrapper
 * 'argon2' below, which supplies them in a friendlier order):
 *   p - parallelism, i.e. number of independent lanes of blocks
 *   T - length in bytes of the output tag written to 'out'
 *   m - approximate total memory usage, in 1Kb blocks
 *   t - number of passes over the whole memory array
 *   y - flavour: 0 = Argon2d (data-dependent addressing throughout),
 *       1 = Argon2i (data-independent), 2 = Argon2id (hybrid)
 *   P, S, K, X - password, salt, optional key, optional extra data
 *   out - receives T bytes of output.
 */
static void argon2_internal(uint32_t p, uint32_t T, uint32_t m, uint32_t t,
                            uint32_t y, ptrlen P, ptrlen S, ptrlen K, ptrlen X,
                            uint8_t *out)
{
    /*
     * Start by hashing all the input data together: the four string arguments
     * (password P, salt S, optional secret key K, optional associated data
     * X), plus all the parameters for the function's memory and time usage.
     *
     * The output of this hash is the sole input to the subsequent mixing
     * step: Argon2 does not preserve any more entropy from the inputs, it
     * just makes it extra painful to get the final answer.
     */
    uint8_t h0[64];
    {
        ssh_hash *h = blake2b_new_general(64);
        put_uint32_le(h, p);
        put_uint32_le(h, T);
        put_uint32_le(h, m);
        put_uint32_le(h, t);
        put_uint32_le(h, 0x13);        /* hash function version number */
        put_uint32_le(h, y);
        put_stringpl_le(h, P);
        put_stringpl_le(h, S);
        put_stringpl_le(h, K);
        put_stringpl_le(h, X);
        ssh_hash_final(h, h0);
    }

    struct blk { uint8_t data[1024]; };

    /*
     * Array of 1Kb blocks. The total size is (approximately) m, the
     * caller-specified parameter for how much memory to use; the blocks are
     * regarded as a rectangular array of p rows ('lanes') by q columns, where
     * p is the 'parallelism' input parameter (the lanes can be processed
     * concurrently up to a point) and q is whatever makes the product pq come
     * to m.
     *
     * Additionally, each row is divided into four equal 'segments', which are
     * important to the way the algorithm decides which blocks to use as input
     * to each step of the function.
     *
     * The term 'slice' refers to a whole set of vertically aligned segments,
     * i.e. slice 0 is the whole left quarter of the array, and slice 3 the
     * whole right quarter.
     *
     * (Throughout, block (lane i, column j) lives at B[i + p*j], i.e.
     * the array is stored column-major.)
     */
    size_t SL = m / (4*p); /* segment length: # of 1Kb blocks in a segment */
    size_t q = 4 * SL;     /* width of the array: 4 segments times SL */
    size_t mprime = q * p; /* total size of the array, approximately m */

    /* Allocate the memory. */
    struct blk *B = snewn(mprime, struct blk);
    memset(B, 0, mprime * sizeof(struct blk));

    /*
     * Initial setup: fill the first two full columns of the array with data
     * expanded from the starting hash h0. Each block is the result of using
     * the long-output hash function H' to hash h0 itself plus the block's
     * coordinates in the array.
     */
    for (size_t i = 0; i < p; i++) {
        ssh_hash *h = hprime_new(1024);
        put_data(h, h0, 64);
        put_uint32_le(h, 0);           /* column number */
        put_uint32_le(h, i);           /* lane number */
        hprime_final(h, 1024, B[i].data);
    }
    for (size_t i = 0; i < p; i++) {
        ssh_hash *h = hprime_new(1024);
        put_data(h, h0, 64);
        put_uint32_le(h, 1);
        put_uint32_le(h, i);
        hprime_final(h, 1024, B[i+p].data);
    }

    /*
     * Declarations for the main loop.
     *
     * The basic structure of the main loop is going to involve processing the
     * array one whole slice (vertically divided quarter) at a time. Usually
     * we'll write a new value into every single block in the slice, except
     * that in the initial slice on the first pass, we've already written
     * values into the first two columns during the initial setup above. So
     * 'jstart' indicates the starting index in each segment we process; it
     * starts off as 2 so that we don't overwrite the initial setup, and then
     * after the first slice is done, we set it to 0, and it stays there.
     *
     * d_mode indicates whether we're being data-dependent (true) or
     * data-independent (false). In the hybrid Argon2id mode, we start off
     * independent, and then once we've mixed things up enough, switch over to
     * dependent mode to force long serial chains of computation.
     */
    size_t jstart = 2;
    bool d_mode = (y == 0);            /* Argon2d is data-dependent from the start */
    struct blk out2i, tmp2i, in2i;     /* scratch blocks for the 2i hashing below */

    /* Outermost loop: t whole passes from left to right over the array */
    for (size_t pass = 0; pass < t; pass++) {

        /* Within that, we process the array in its four main slices */
        for (unsigned slice = 0; slice < 4; slice++) {

            /* In Argon2id mode, if we're half way through the first pass,
             * this is the moment to switch d_mode from false to true */
            if (pass == 0 && slice == 2 && y == 2)
                d_mode = true;

            /* Loop over every segment in the slice (i.e. every row). So i is
             * the y-coordinate of each block we process. */
            for (size_t i = 0; i < p; i++) {

                /* And within that segment, process the blocks from left to
                 * right, starting at 'jstart' (usually 0, but 2 in the first
                 * slice). */
                for (size_t jpre = jstart; jpre < SL; jpre++) {

                    /* j is the x-coordinate of each block we process, made up
                     * of the slice number and the index 'jpre' within the
                     * segment. */
                    size_t j = slice * SL + jpre;

                    /* jm1 is j-1 (mod q) */
                    uint32_t jm1 = (j == 0 ? q-1 : j-1);

                    /*
                     * Construct two 32-bit pseudorandom integers J1 and J2.
                     * This is the part of the algorithm that varies between
                     * the data-dependent and independent modes.
                     */
                    uint32_t J1, J2;
                    if (d_mode) {
                        /*
                         * Data-dependent: grab the first 64 bits of the block
                         * to the left of this one.
                         */
                        J1 = GET_32BIT_LSB_FIRST(B[i + p * jm1].data);
                        J2 = GET_32BIT_LSB_FIRST(B[i + p * jm1].data + 4);
                    } else {
                        /*
                         * Data-independent: generate pseudorandom data by
                         * hashing a sequence of preimage blocks that include
                         * all our input parameters, plus the coordinates of
                         * this point in the algorithm (array position and
                         * pass number) to make all the hash outputs distinct.
                         *
                         * The hash we use is G itself, applied twice. So we
                         * generate 1Kb of data at a time, which is enough for
                         * 128 (J1,J2) pairs. Hence we only need to do the
                         * hashing if our index within the segment is a
                         * multiple of 128, or if we're at the very start of
                         * the algorithm (in which case we started at 2 rather
                         * than 0). After that we can just keep picking data
                         * out of our most recent hash output.
                         */
                        if (jpre == jstart || jpre % 128 == 0) {
                            /*
                             * Hash preimage is mostly zeroes, with a
                             * collection of assorted integer values we had
                             * anyway.
                             */
                            memset(in2i.data, 0, sizeof(in2i.data));
                            PUT_64BIT_LSB_FIRST(in2i.data +  0, pass);
                            PUT_64BIT_LSB_FIRST(in2i.data +  8, i);
                            PUT_64BIT_LSB_FIRST(in2i.data + 16, slice);
                            PUT_64BIT_LSB_FIRST(in2i.data + 24, mprime);
                            PUT_64BIT_LSB_FIRST(in2i.data + 32, t);
                            PUT_64BIT_LSB_FIRST(in2i.data + 40, y);
                            PUT_64BIT_LSB_FIRST(in2i.data + 48, jpre / 128 + 1);

                            /*
                             * Now apply G twice to generate the hash output
                             * in out2i.
                             */
                            memset(tmp2i.data, 0, sizeof(tmp2i.data));
                            G_xor(tmp2i.data, tmp2i.data, in2i.data);
                            memset(out2i.data, 0, sizeof(out2i.data));
                            G_xor(out2i.data, out2i.data, tmp2i.data);
                        }

                        /*
                         * Extract J1 and J2 from the most recent hash output
                         * (whether we've just computed it or not).
                         */
                        J1 = GET_32BIT_LSB_FIRST(
                            out2i.data + 8 * (jpre % 128));
                        J2 = GET_32BIT_LSB_FIRST(
                            out2i.data + 8 * (jpre % 128) + 4);
                    }

                    /*
                     * Now convert J1 and J2 into the index of an existing
                     * block of the array to use as input to this step. This
                     * is fairly fiddly.
                     *
                     * The easy part: the y-coordinate of the input block is
                     * obtained by reducing J2 mod p, except that at the very
                     * start of the algorithm (processing the first slice on
                     * the first pass) we simply use the same y-coordinate as
                     * our output block.
                     *
                     * Note that it's safe to use the ordinary % operator
                     * here, without any concern for timing side channels: in
                     * data-independent mode J2 is not correlated to any
                     * secrets, and in data-dependent mode we're going to be
                     * giving away side-channel data _anyway_ when we use it
                     * as an array index (and by assumption we don't care,
                     * because it's already massively randomised from the real
                     * inputs).
                     */
                    uint32_t index_l = (pass == 0 && slice == 0) ? i : J2 % p;

                    /*
                     * The hard part: which block in this array row do we use?
                     *
                     * First, we decide what the possible candidates are. This
                     * requires some case analysis, and depends on whether the
                     * array row is the same one we're writing into or not.
                     *
                     * If it's not the same row: we can't use any block from
                     * the current slice (because the segments within a slice
                     * have to be processable in parallel, so in a concurrent
                     * implementation those blocks are potentially in the
                     * process of being overwritten by other threads). But the
                     * other three slices are fair game, except that in the
                     * first pass, slices to the right of us won't have had
                     * any values written into them yet at all.
                     *
                     * If it is the same row, we _are_ allowed to use blocks
                     * from the current slice, but only the ones before our
                     * current position.
                     *
                     * In both cases, we also exclude the individual _column_
                     * just to the left of the current one. (The block
                     * immediately to our left is going to be the _other_
                     * input to G, but the spec also says that we avoid that
                     * column even in a different row.)
                     *
                     * All of this means that we end up choosing from a
                     * cyclically contiguous interval of blocks within this
                     * lane, but the start and end points require some thought
                     * to get them right.
                     */

                    /* Start position is the beginning of the _next_ slice
                     * (containing data from the previous pass), unless we're
                     * on pass 0, where the start position has to be 0. */
                    uint32_t Wstart = (pass == 0 ? 0 : (slice + 1) % 4 * SL);

                    /* End position splits up by cases. */
                    uint32_t Wend;
                    if (index_l == i) {
                        /* Same lane as output: we can use anything up to (but
                         * not including) the block immediately left of us. */
                        Wend = jm1;
                    } else {
                        /* Different lane from output: we can use anything up
                         * to the previous slice boundary, or one less than
                         * that if we're at the very left edge of our slice
                         * right now. */
                        Wend = SL * slice;
                        if (jpre == 0)
                            Wend = (Wend + q-1) % q;
                    }

                    /* Total number of blocks available to choose from */
                    uint32_t Wsize = (Wend + q - Wstart) % q;

                    /* Fiddly computation from the spec that chooses from the
                     * available blocks, in a deliberately non-uniform
                     * fashion, using J1 as pseudorandom input data. Output is
                     * zz which is the index within our contiguous interval.
                     * (NB the locals x and y here shadow nothing important,
                     * except that y shadows the flavour parameter, which is
                     * not used again within this loop body.) */
                    uint32_t x = ((uint64_t)J1 * J1) >> 32;
                    uint32_t y = ((uint64_t)Wsize * x) >> 32;
                    uint32_t zz = Wsize - 1 - y;

                    /* And index_z is the actual x coordinate of the block we
                     * want. */
                    uint32_t index_z = (Wstart + zz) % q;

                    /* Phew! Combine that block with the one immediately to
                     * our left, and XOR over the top of whatever is already
                     * in our current output block. */
                    G_xor(B[i + p * j].data, B[i + p * jm1].data,
                          B[index_l + p * index_z].data);
                }
            }

            /* We've finished processing a slice. Reset jstart to 0. It will
             * only _not_ have been 0 if this was pass 0 slice 0, in which
             * case it still had its initial value of 2 to avoid the starting
             * data. */
            jstart = 0;
        }
    }

    /*
     * The main output is all done. Final output works by taking the XOR of
     * all the blocks in the rightmost column of the array, and then using
     * that as input to our long hash H'. The output of _that_ is what we
     * deliver to the caller.
     */

    struct blk C = B[p * (q-1)];
    for (size_t i = 1; i < p; i++)
        memxor(C.data, C.data, B[i + p * (q-1)].data, 1024);

    {
        ssh_hash *h = hprime_new(T);
        put_data(h, C.data, 1024);
        hprime_final(h, T, out);
    }

    /*
     * Clean up.
     */
    smemclr(out2i.data, sizeof(out2i.data));
    smemclr(tmp2i.data, sizeof(tmp2i.data));
    smemclr(in2i.data, sizeof(in2i.data));
    smemclr(C.data, sizeof(C.data));
    smemclr(B, mprime * sizeof(struct blk));
    sfree(B);
}
/*
* Wrapper function that appends to a strbuf (which sshpubk.c will want).
*/
/* Public wrapper: run Argon2 and append the taglen-byte result to a
 * strbuf (the form sshpubk.c wants). */
void argon2(Argon2Flavour flavour, uint32_t mem, uint32_t passes,
            uint32_t parallel, uint32_t taglen,
            ptrlen P, ptrlen S, ptrlen K, ptrlen X, strbuf *out)
{
    /* Reserve taglen bytes at the end of the strbuf and have the core
     * routine write the tag straight into them. */
    void *tag = strbuf_append(out, taglen);
    argon2_internal(parallel, taglen, mem, passes, flavour,
                    P, S, K, X, tag);
}
/*
* Wrapper function which dynamically chooses the number of passes to run in
* order to hit an approximate total amount of CPU time. Writes the result
* into 'passes'.
*/
/*
 * Run Argon2 with increasing pass counts until one invocation takes
 * at least 'milliseconds' of wall-clock time; report the chosen pass
 * count in *passes and leave that invocation's output in 'out'.
 */
void argon2_choose_passes(
    Argon2Flavour flavour, uint32_t mem,
    uint32_t milliseconds, uint32_t *passes,
    uint32_t parallel, uint32_t taglen,
    ptrlen P, ptrlen S, ptrlen K, ptrlen X,
    strbuf *out)
{
    /* Convert the millisecond budget into GETTICKCOUNT units. */
    unsigned long desired_time = (TICKSPERSEC * milliseconds) / 1000;

    /*
     * We only need the time taken to be approximately right, so we
     * scale up the number of passes geometrically, which avoids
     * taking O(t^2) time to find a pass count taking time t.
     *
     * Using the Fibonacci numbers is slightly nicer than the obvious
     * approach of powers of 2, because it's still very easy to
     * compute, and grows less fast (powers of 1.6 instead of 2), so
     * you get just a touch more precision.
     */
    uint32_t a = 1, b = 1;

    while (true) {
        unsigned long start_time = GETTICKCOUNT();
        argon2(flavour, mem, b, parallel, taglen, P, S, K, X, out);
        unsigned long ticks = GETTICKCOUNT() - start_time;

        /* But just in case computers get _too_ fast, we have to cap
         * the growth before it gets past the uint32_t upper bound! So
         * if computing a+b would overflow, stop here. */

        if (ticks >= desired_time || a > (uint32_t)~b) {
            /* 'out' already holds the hash computed with b passes. */
            *passes = b;
            return;
        } else {
            /* Too quick: discard this attempt's output and try a
             * larger pass count. */
            strbuf_clear(out);

            /* Next Fibonacci number: replace (a, b) with (b, a+b) */
            b += a;
            a = b - a;
        }
    }
}

119
crypto/bcrypt.c Normal file
View File

@ -0,0 +1,119 @@
/*
* 'bcrypt' password hash function, for PuTTY's import/export of
* OpenSSH encrypted private key files.
*
* This is not really the same as the original bcrypt; OpenSSH has
* modified it in various ways, and of course we have to do the same.
*/
#include <stddef.h>
#include <string.h>
#include "ssh.h"
#include "sshblowf.h"
/*
 * The 'eksblowfish' key setup: an expensive Blowfish key schedule
 * alternating key- and salt-dependent expansion rounds. Caller owns
 * (and must free) the returned context.
 */
BlowfishContext *bcrypt_setup(const unsigned char *key, int keybytes,
                              const unsigned char *salt, int saltbytes)
{
    BlowfishContext *ctx = blowfish_make_context();
    int round;

    blowfish_initkey(ctx);
    blowfish_expandkey(ctx, key, keybytes, salt, saltbytes);

    /* Original bcrypt replaces this fixed loop count with the
     * variable cost. OpenSSH instead iterates the whole thing more
     * than once if it wants extra rounds. */
    for (round = 0; round < 64; round++) {
        blowfish_expandkey(ctx, salt, saltbytes, NULL, 0);
        blowfish_expandkey(ctx, key, keybytes, NULL, 0);
    }

    return ctx;
}
/*
 * One complete bcrypt hash: key setup followed by 64 ECB encryptions
 * of a fixed 32-byte magic string, producing 32 bytes of output.
 */
void bcrypt_hash(const unsigned char *key, int keybytes,
                 const unsigned char *salt, int saltbytes,
                 unsigned char output[32])
{
    int round;
    BlowfishContext *ctx = bcrypt_setup(key, keybytes, salt, saltbytes);

    /* The magic constant, pre-byte-swapped to compensate for the
     * little-endian Blowfish routines (it reads better undamaged). */
    memcpy(output, "cyxOmorhcitawolBhsiftawSanyDetim", 32);

    for (round = 0; round < 64; round++)
        blowfish_lsb_encrypt_ecb(output, 32, ctx);

    blowfish_free_context(ctx);
}
/*
 * Generate one 32-byte output block for the KDF: derive a 64-byte
 * salt by SHA-512 hashing the input salt (with the block counter
 * appended when nonzero), then bcrypt the pre-hashed passphrase
 * with it.
 */
void bcrypt_genblock(int counter,
                     const unsigned char hashed_passphrase[64],
                     const unsigned char *salt, int saltbytes,
                     unsigned char output[32])
{
    unsigned char hashed_salt[64];
    ssh_hash *h = ssh_hash_new(&ssh_sha512);

    put_data(h, salt, saltbytes);
    if (counter)
        put_uint32(h, counter);
    ssh_hash_final(h, hashed_salt);

    bcrypt_hash(hashed_passphrase, 64, hashed_salt, 64, output);

    /* The derived salt is keyed material; wipe it. */
    smemclr(&hashed_salt, sizeof(hashed_salt));
}
/*
 * Top-level key derivation in the style of OpenSSH's bcrypt KDF:
 * derive 'outbytes' bytes of key material from a passphrase and salt,
 * iterating the bcrypt construction 'rounds' times per output block.
 */
void openssh_bcrypt(const char *passphrase,
                    const unsigned char *salt, int saltbytes,
                    int rounds, unsigned char *out, int outbytes)
{
    unsigned char hashed_passphrase[64];
    unsigned char block[32], outblock[32];
    const unsigned char *thissalt;
    int thissaltbytes;
    int modulus, residue, i, j, round;

    /* Hash the passphrase to get the bcrypt key material */
    hash_simple(&ssh_sha512, ptrlen_from_asciz(passphrase), hashed_passphrase);

    /* We output key bytes in a scattered fashion to meld all output
     * key blocks into all parts of the output. To do this, we pick a
     * modulus, and we output the key bytes to indices of out[] in the
     * following order: first the indices that are multiples of the
     * modulus, then the ones congruent to 1 mod modulus, etc. Each of
     * those passes consumes exactly one block output from
     * bcrypt_genblock, so we must pick a modulus large enough that at
     * most 32 bytes are used in the pass. */
    modulus = (outbytes + 31) / 32;

    for (residue = 0; residue < modulus; residue++) {
        /* Our output block of data is the XOR of all blocks generated
         * by bcrypt in the following loop */
        memset(outblock, 0, sizeof(outblock));

        thissalt = salt;
        thissaltbytes = saltbytes;
        for (round = 0; round < rounds; round++) {
            /* Counter is residue+1 on the first round only (0 means
             * "no counter" in bcrypt_genblock). */
            bcrypt_genblock(round == 0 ? residue+1 : 0,
                            hashed_passphrase,
                            thissalt, thissaltbytes, block);
            /* Each subsequent bcrypt call reuses the previous one's
             * output as its salt */
            thissalt = block;
            thissaltbytes = 32;

            for (i = 0; i < 32; i++)
                outblock[i] ^= block[i];
        }

        /* Scatter this pass's 32 bytes across out[] at stride
         * 'modulus', starting at index 'residue'. */
        for (i = residue, j = 0; i < outbytes; i += modulus, j++)
            out[i] = outblock[j];
    }

    /* Wipe the SHA-512 of the passphrase (key-equivalent material). */
    smemclr(&hashed_passphrase, sizeof(hashed_passphrase));
}

223
crypto/blake2.c Normal file
View File

@ -0,0 +1,223 @@
/*
* BLAKE2 (RFC 7693) implementation for PuTTY.
*
* The BLAKE2 hash family includes BLAKE2s, in which the hash state is
* operated on as a collection of 32-bit integers, and BLAKE2b, based
* on 64-bit integers. At present this code implements BLAKE2b only.
*/
#include <assert.h>
#include "ssh.h"
/* Rotate a 64-bit word right by 'rotation' bits. Shift counts are
 * reduced mod 64 so rotation == 0 never shifts by a full 64 bits
 * (which would be undefined behaviour). */
static inline uint64_t ror(uint64_t x, unsigned rotation)
{
    unsigned right = rotation & 63;
    unsigned left = -rotation & 63;
    return (x >> right) | (x << left);
}
/* Rotation counts for the G function: RFC 7693 section 2.1 */
enum { R1 = 32, R2 = 24, R3 = 16, R4 = 63 };
/* Initialisation vector: RFC 7693 section 2.6 */
static const uint64_t iv[] = {
    0x6a09e667f3bcc908,                /* floor(2^64 * frac(sqrt(2)))  */
    0xbb67ae8584caa73b,                /* floor(2^64 * frac(sqrt(3)))  */
    0x3c6ef372fe94f82b,                /* floor(2^64 * frac(sqrt(5)))  */
    0xa54ff53a5f1d36f1,                /* floor(2^64 * frac(sqrt(7)))  */
    0x510e527fade682d1,                /* floor(2^64 * frac(sqrt(11))) */
    0x9b05688c2b3e6c1f,                /* floor(2^64 * frac(sqrt(13))) */
    0x1f83d9abfb41bd6b,                /* floor(2^64 * frac(sqrt(17))) */
    0x5be0cd19137e2179,                /* floor(2^64 * frac(sqrt(19))) */
};
/* Message-word permutation schedule: RFC 7693 section 2.7 */
static const unsigned char sigma[][16] = {
    { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15},
    {14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3},
    {11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4},
    { 7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8},
    { 9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13},
    { 2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9},
    {12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11},
    {13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10},
    { 6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5},
    {10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0},
    /* This array recycles if you have more than 10 rounds. BLAKE2b
     * has 12, so we repeat the first two rows again. */
    { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15},
    {14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3},
};
/* Half of the BLAKE2b G function: one message-word injection followed
 * by two of the four add/xor/rotate steps. */
static inline void g_half(uint64_t v[16], unsigned a, unsigned b, unsigned c,
                          unsigned d, uint64_t x, unsigned r1, unsigned r2)
{
    v[a] += v[b] + x;
    v[d] = ror(v[d] ^ v[a], r1);
    v[c] += v[d];
    v[b] = ror(v[b] ^ v[c], r2);
}
/* The full BLAKE2b G function: two half-steps, one per message word,
 * using the standard rotation amounts R1..R4. */
static inline void g(uint64_t v[16], unsigned a, unsigned b, unsigned c,
                     unsigned d, uint64_t x, uint64_t y)
{
    g_half(v, a, b, c, d, x, R1, R2);
    g_half(v, a, b, c, d, y, R3, R4);
}
/*
 * The BLAKE2b compression function F: fold the 16-word message block
 * m into the 8-word chaining state h. offset_hi/offset_lo form the
 * 128-bit count of input bytes so far, and 'final' is nonzero only
 * for the last block.
 */
static inline void f(uint64_t h[8], uint64_t m[16], uint64_t offset_hi,
                     uint64_t offset_lo, unsigned final)
{
    uint64_t v[16];
    /* Working vector: current state in the low half, IV in the high. */
    memcpy(v, h, 8 * sizeof(*v));
    memcpy(v + 8, iv, 8 * sizeof(*v));
    /* Fold in the byte counter, and an all-ones mask iff this is the
     * final block. */
    v[12] ^= offset_lo;
    v[13] ^= offset_hi;
    v[14] ^= -(uint64_t)final;
    for (unsigned round = 0; round < 12; round++) {
        const unsigned char *s = sigma[round];
        /* Column step... */
        g(v,  0,  4,  8, 12, m[s[ 0]], m[s[ 1]]);
        g(v,  1,  5,  9, 13, m[s[ 2]], m[s[ 3]]);
        g(v,  2,  6, 10, 14, m[s[ 4]], m[s[ 5]]);
        g(v,  3,  7, 11, 15, m[s[ 6]], m[s[ 7]]);
        /* ...then diagonal step. */
        g(v,  0,  5, 10, 15, m[s[ 8]], m[s[ 9]]);
        g(v,  1,  6, 11, 12, m[s[10]], m[s[11]]);
        g(v,  2,  7,  8, 13, m[s[12]], m[s[13]]);
        g(v,  3,  4,  9, 14, m[s[14]], m[s[15]]);
    }
    /* Feed-forward: XOR both halves of the working vector back into h. */
    for (unsigned i = 0; i < 8; i++)
        h[i] ^= v[i] ^ v[i+8];
    smemclr(v, sizeof(v));
}
/* Decode a 128-byte input block into sixteen little-endian 64-bit
 * message words, run the compression function, and wipe the decoded
 * copy of the (possibly secret) input. */
static inline void f_outer(uint64_t h[8], uint8_t blk[128], uint64_t offset_hi,
                           uint64_t offset_lo, unsigned final)
{
    uint64_t words[16];
    unsigned w;

    for (w = 0; w < 16; w++)
        words[w] = GET_64BIT_LSB_FIRST(blk + 8*w);
    f(h, words, offset_hi, offset_lo, final);
    smemclr(words, sizeof(words));
}
typedef struct blake2b {
    uint64_t h[8];                     /* chaining state */
    unsigned hashlen;                  /* requested digest length, <= 64 */
    uint8_t block[128];                /* buffer for the current input block */
    size_t used;                       /* # of valid bytes in block[] */
    uint64_t lenhi, lenlo;             /* 128-bit count of bytes hashed */
    BinarySink_IMPLEMENTATION;
    ssh_hash hash;                     /* vtable wrapper handed to callers */
} blake2b;
static void blake2b_write(BinarySink *bs, const void *vp, size_t len);
/* Common allocation path for both constructors below. Does not reset
 * the hash state; callers do that themselves. */
static ssh_hash *blake2b_new_inner(unsigned hashlen)
{
    blake2b *state;

    assert(hashlen <= ssh_blake2b.hlen);

    state = snew(blake2b);
    state->hash.vt = &ssh_blake2b;
    state->hashlen = hashlen;
    BinarySink_INIT(state, blake2b_write);
    BinarySink_DELEGATE_INIT(&state->hash, state);
    return &state->hash;
}
/* Vtable constructor: always uses the full 64-byte digest length. */
static ssh_hash *blake2b_new(const ssh_hashalg *alg)
{
    return blake2b_new_inner(alg->hlen);
}
/* Public constructor permitting an arbitrary digest length up to 64
 * bytes (needed by Argon2). Returns the hash already reset. */
ssh_hash *blake2b_new_general(unsigned hashlen)
{
    ssh_hash *result = blake2b_new_inner(hashlen);

    ssh_hash_reset(result);
    return result;
}
/* Reinitialise the hash state for a fresh computation. */
static void blake2b_reset(ssh_hash *hash)
{
    blake2b *state = container_of(hash, blake2b, hash);

    /* Start from the standard IV... */
    memcpy(state->h, iv, sizeof(state->h));

    /* ...then XOR in the parameter block: secret key length (here
     * always 0) in byte 1, and hash length in byte 0. */
    state->h[0] ^= 0x01010000 ^ state->hashlen;

    state->used = 0;
    state->lenlo = 0;
    state->lenhi = 0;
}
/* Clone an in-progress hash: bulk-copy the state, then re-point the
 * BinarySink delegation at the new object. */
static void blake2b_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
    blake2b *dst = container_of(hcopy, blake2b, hash);
    blake2b *src = container_of(horig, blake2b, hash);

    *dst = *src;
    BinarySink_COPIED(dst);
    BinarySink_DELEGATE_INIT(&dst->hash, dst);
}
/* Wipe the (possibly secret) hash state before releasing it. */
static void blake2b_free(ssh_hash *hash)
{
    blake2b *state = container_of(hash, blake2b, hash);

    smemclr(state, sizeof(*state));
    sfree(state);
}
/*
 * BinarySink write routine: buffer incoming data into 128-byte
 * blocks. A full block is only compressed when more data arrives
 * after it, so that the final block (full or not) is still buffered
 * for blake2b_digest to compress with the 'final' flag set.
 */
static void blake2b_write(BinarySink *bs, const void *vp, size_t len)
{
    blake2b *s = BinarySink_DOWNCAST(bs, blake2b);
    const uint8_t *p = vp;
    while (len > 0) {
        /* Lazily flush the previous block now that we know it wasn't
         * the last one. */
        if (s->used == sizeof(s->block)) {
            f_outer(s->h, s->block, s->lenhi, s->lenlo, 0);
            s->used = 0;
        }
        size_t chunk = sizeof(s->block) - s->used;
        if (chunk > len)
            chunk = len;
        memcpy(s->block + s->used, p, chunk);
        s->used += chunk;
        p += chunk;
        len -= chunk;
        /* Maintain the 128-bit byte counter, with a manual carry from
         * the low word into the high one. */
        s->lenlo += chunk;
        s->lenhi += (s->lenlo < chunk);
    }
}
/* Finalise: zero-pad the last block, compress it with the 'final'
 * flag set, and emit the first 'hashlen' bytes of the state encoded
 * as little-endian 64-bit words. */
static void blake2b_digest(ssh_hash *hash, uint8_t *digest)
{
    blake2b *s = container_of(hash, blake2b, hash);

    memset(s->block + s->used, 0, sizeof(s->block) - s->used);
    f_outer(s->h, s->block, s->lenhi, s->lenlo, 1);

    /* Serialise the whole 64-byte state, then truncate to hashlen and
     * wipe the temporary copy. */
    uint8_t hash_pre[128];
    for (unsigned i = 0; i < 8; i++)
        PUT_64BIT_LSB_FIRST(hash_pre + 8*i, s->h[i]);
    memcpy(digest, hash_pre, s->hashlen);
    smemclr(hash_pre, sizeof(hash_pre));
}
/* Hash algorithm vtable for BLAKE2b with its full 64-byte output. */
const ssh_hashalg ssh_blake2b = {
    .new = blake2b_new,
    .reset = blake2b_reset,
    .copyfrom = blake2b_copyfrom,
    .digest = blake2b_digest,
    .free = blake2b_free,
    .hlen = 64,
    .blocklen = 128,
    HASHALG_NAMES_BARE("BLAKE2b-64"),
};

699
crypto/blowfish.c Normal file
View File

@ -0,0 +1,699 @@
/*
* Blowfish implementation for PuTTY.
*
* Coded from scratch from the algorithm description.
*/
#include <assert.h>
#include <stdio.h>
#include "ssh.h"
#include "sshblowf.h"
struct BlowfishContext {
    uint32_t S0[256], S1[256], S2[256], S3[256], P[18]; /* expanded key */
    uint32_t iv0, iv1;                 /* for CBC mode */
};
/*
* The Blowfish init data: hex digits of the fractional part of pi.
* (ie pi as a hex fraction is 3.243F6A8885A308D3...)
*
* If you have Simon Tatham's 'spigot' exact real calculator
* available, or any other method of generating 8336 fractional hex
* digits of pi on standard output, you can regenerate these tables
* exactly as below using the following Perl script (adjusting the
* first line or two if your pi-generator is not spigot).
open my $spig, "spigot -n -B16 -d8336 pi |";
read $spig, $ignore, 2; # throw away the leading "3."
for my $name ("parray", "sbox0".."sbox3") {
print "static const uint32_t ${name}[] = {\n";
my $len = $name eq "parray" ? 18 : 256;
for my $i (1..$len) {
read $spig, $word, 8;
printf "%s0x%s,", ($i%6==1 ? " " : " "), uc $word;
print "\n" if ($i == $len || $i%6 == 0);
}
print "};\n\n";
}
close $spig;
*/
static const uint32_t parray[] = {
0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344, 0xA4093822, 0x299F31D0,
0x082EFA98, 0xEC4E6C89, 0x452821E6, 0x38D01377, 0xBE5466CF, 0x34E90C6C,
0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917, 0x9216D5D9, 0x8979FB1B,
};
static const uint32_t sbox0[] = {
0xD1310BA6, 0x98DFB5AC, 0x2FFD72DB, 0xD01ADFB7, 0xB8E1AFED, 0x6A267E96,
0xBA7C9045, 0xF12C7F99, 0x24A19947, 0xB3916CF7, 0x0801F2E2, 0x858EFC16,
0x636920D8, 0x71574E69, 0xA458FEA3, 0xF4933D7E, 0x0D95748F, 0x728EB658,
0x718BCD58, 0x82154AEE, 0x7B54A41D, 0xC25A59B5, 0x9C30D539, 0x2AF26013,
0xC5D1B023, 0x286085F0, 0xCA417918, 0xB8DB38EF, 0x8E79DCB0, 0x603A180E,
0x6C9E0E8B, 0xB01E8A3E, 0xD71577C1, 0xBD314B27, 0x78AF2FDA, 0x55605C60,
0xE65525F3, 0xAA55AB94, 0x57489862, 0x63E81440, 0x55CA396A, 0x2AAB10B6,
0xB4CC5C34, 0x1141E8CE, 0xA15486AF, 0x7C72E993, 0xB3EE1411, 0x636FBC2A,
0x2BA9C55D, 0x741831F6, 0xCE5C3E16, 0x9B87931E, 0xAFD6BA33, 0x6C24CF5C,
0x7A325381, 0x28958677, 0x3B8F4898, 0x6B4BB9AF, 0xC4BFE81B, 0x66282193,
0x61D809CC, 0xFB21A991, 0x487CAC60, 0x5DEC8032, 0xEF845D5D, 0xE98575B1,
0xDC262302, 0xEB651B88, 0x23893E81, 0xD396ACC5, 0x0F6D6FF3, 0x83F44239,
0x2E0B4482, 0xA4842004, 0x69C8F04A, 0x9E1F9B5E, 0x21C66842, 0xF6E96C9A,
0x670C9C61, 0xABD388F0, 0x6A51A0D2, 0xD8542F68, 0x960FA728, 0xAB5133A3,
0x6EEF0B6C, 0x137A3BE4, 0xBA3BF050, 0x7EFB2A98, 0xA1F1651D, 0x39AF0176,
0x66CA593E, 0x82430E88, 0x8CEE8619, 0x456F9FB4, 0x7D84A5C3, 0x3B8B5EBE,
0xE06F75D8, 0x85C12073, 0x401A449F, 0x56C16AA6, 0x4ED3AA62, 0x363F7706,
0x1BFEDF72, 0x429B023D, 0x37D0D724, 0xD00A1248, 0xDB0FEAD3, 0x49F1C09B,
0x075372C9, 0x80991B7B, 0x25D479D8, 0xF6E8DEF7, 0xE3FE501A, 0xB6794C3B,
0x976CE0BD, 0x04C006BA, 0xC1A94FB6, 0x409F60C4, 0x5E5C9EC2, 0x196A2463,
0x68FB6FAF, 0x3E6C53B5, 0x1339B2EB, 0x3B52EC6F, 0x6DFC511F, 0x9B30952C,
0xCC814544, 0xAF5EBD09, 0xBEE3D004, 0xDE334AFD, 0x660F2807, 0x192E4BB3,
0xC0CBA857, 0x45C8740F, 0xD20B5F39, 0xB9D3FBDB, 0x5579C0BD, 0x1A60320A,
0xD6A100C6, 0x402C7279, 0x679F25FE, 0xFB1FA3CC, 0x8EA5E9F8, 0xDB3222F8,
0x3C7516DF, 0xFD616B15, 0x2F501EC8, 0xAD0552AB, 0x323DB5FA, 0xFD238760,
0x53317B48, 0x3E00DF82, 0x9E5C57BB, 0xCA6F8CA0, 0x1A87562E, 0xDF1769DB,
0xD542A8F6, 0x287EFFC3, 0xAC6732C6, 0x8C4F5573, 0x695B27B0, 0xBBCA58C8,
0xE1FFA35D, 0xB8F011A0, 0x10FA3D98, 0xFD2183B8, 0x4AFCB56C, 0x2DD1D35B,
0x9A53E479, 0xB6F84565, 0xD28E49BC, 0x4BFB9790, 0xE1DDF2DA, 0xA4CB7E33,
0x62FB1341, 0xCEE4C6E8, 0xEF20CADA, 0x36774C01, 0xD07E9EFE, 0x2BF11FB4,
0x95DBDA4D, 0xAE909198, 0xEAAD8E71, 0x6B93D5A0, 0xD08ED1D0, 0xAFC725E0,
0x8E3C5B2F, 0x8E7594B7, 0x8FF6E2FB, 0xF2122B64, 0x8888B812, 0x900DF01C,
0x4FAD5EA0, 0x688FC31C, 0xD1CFF191, 0xB3A8C1AD, 0x2F2F2218, 0xBE0E1777,
0xEA752DFE, 0x8B021FA1, 0xE5A0CC0F, 0xB56F74E8, 0x18ACF3D6, 0xCE89E299,
0xB4A84FE0, 0xFD13E0B7, 0x7CC43B81, 0xD2ADA8D9, 0x165FA266, 0x80957705,
0x93CC7314, 0x211A1477, 0xE6AD2065, 0x77B5FA86, 0xC75442F5, 0xFB9D35CF,
0xEBCDAF0C, 0x7B3E89A0, 0xD6411BD3, 0xAE1E7E49, 0x00250E2D, 0x2071B35E,
0x226800BB, 0x57B8E0AF, 0x2464369B, 0xF009B91E, 0x5563911D, 0x59DFA6AA,
0x78C14389, 0xD95A537F, 0x207D5BA2, 0x02E5B9C5, 0x83260376, 0x6295CFA9,
0x11C81968, 0x4E734A41, 0xB3472DCA, 0x7B14A94A, 0x1B510052, 0x9A532915,
0xD60F573F, 0xBC9BC6E4, 0x2B60A476, 0x81E67400, 0x08BA6FB5, 0x571BE91F,
0xF296EC6B, 0x2A0DD915, 0xB6636521, 0xE7B9F9B6, 0xFF34052E, 0xC5855664,
0x53B02D5D, 0xA99F8FA1, 0x08BA4799, 0x6E85076A,
};
static const uint32_t sbox1[] = {
0x4B7A70E9, 0xB5B32944, 0xDB75092E, 0xC4192623, 0xAD6EA6B0, 0x49A7DF7D,
0x9CEE60B8, 0x8FEDB266, 0xECAA8C71, 0x699A17FF, 0x5664526C, 0xC2B19EE1,
0x193602A5, 0x75094C29, 0xA0591340, 0xE4183A3E, 0x3F54989A, 0x5B429D65,
0x6B8FE4D6, 0x99F73FD6, 0xA1D29C07, 0xEFE830F5, 0x4D2D38E6, 0xF0255DC1,
0x4CDD2086, 0x8470EB26, 0x6382E9C6, 0x021ECC5E, 0x09686B3F, 0x3EBAEFC9,
0x3C971814, 0x6B6A70A1, 0x687F3584, 0x52A0E286, 0xB79C5305, 0xAA500737,
0x3E07841C, 0x7FDEAE5C, 0x8E7D44EC, 0x5716F2B8, 0xB03ADA37, 0xF0500C0D,
0xF01C1F04, 0x0200B3FF, 0xAE0CF51A, 0x3CB574B2, 0x25837A58, 0xDC0921BD,
0xD19113F9, 0x7CA92FF6, 0x94324773, 0x22F54701, 0x3AE5E581, 0x37C2DADC,
0xC8B57634, 0x9AF3DDA7, 0xA9446146, 0x0FD0030E, 0xECC8C73E, 0xA4751E41,
0xE238CD99, 0x3BEA0E2F, 0x3280BBA1, 0x183EB331, 0x4E548B38, 0x4F6DB908,
0x6F420D03, 0xF60A04BF, 0x2CB81290, 0x24977C79, 0x5679B072, 0xBCAF89AF,
0xDE9A771F, 0xD9930810, 0xB38BAE12, 0xDCCF3F2E, 0x5512721F, 0x2E6B7124,
0x501ADDE6, 0x9F84CD87, 0x7A584718, 0x7408DA17, 0xBC9F9ABC, 0xE94B7D8C,
0xEC7AEC3A, 0xDB851DFA, 0x63094366, 0xC464C3D2, 0xEF1C1847, 0x3215D908,
0xDD433B37, 0x24C2BA16, 0x12A14D43, 0x2A65C451, 0x50940002, 0x133AE4DD,
0x71DFF89E, 0x10314E55, 0x81AC77D6, 0x5F11199B, 0x043556F1, 0xD7A3C76B,
0x3C11183B, 0x5924A509, 0xF28FE6ED, 0x97F1FBFA, 0x9EBABF2C, 0x1E153C6E,
0x86E34570, 0xEAE96FB1, 0x860E5E0A, 0x5A3E2AB3, 0x771FE71C, 0x4E3D06FA,
0x2965DCB9, 0x99E71D0F, 0x803E89D6, 0x5266C825, 0x2E4CC978, 0x9C10B36A,
0xC6150EBA, 0x94E2EA78, 0xA5FC3C53, 0x1E0A2DF4, 0xF2F74EA7, 0x361D2B3D,
0x1939260F, 0x19C27960, 0x5223A708, 0xF71312B6, 0xEBADFE6E, 0xEAC31F66,
0xE3BC4595, 0xA67BC883, 0xB17F37D1, 0x018CFF28, 0xC332DDEF, 0xBE6C5AA5,
0x65582185, 0x68AB9802, 0xEECEA50F, 0xDB2F953B, 0x2AEF7DAD, 0x5B6E2F84,
0x1521B628, 0x29076170, 0xECDD4775, 0x619F1510, 0x13CCA830, 0xEB61BD96,
0x0334FE1E, 0xAA0363CF, 0xB5735C90, 0x4C70A239, 0xD59E9E0B, 0xCBAADE14,
0xEECC86BC, 0x60622CA7, 0x9CAB5CAB, 0xB2F3846E, 0x648B1EAF, 0x19BDF0CA,
0xA02369B9, 0x655ABB50, 0x40685A32, 0x3C2AB4B3, 0x319EE9D5, 0xC021B8F7,
0x9B540B19, 0x875FA099, 0x95F7997E, 0x623D7DA8, 0xF837889A, 0x97E32D77,
0x11ED935F, 0x16681281, 0x0E358829, 0xC7E61FD6, 0x96DEDFA1, 0x7858BA99,
0x57F584A5, 0x1B227263, 0x9B83C3FF, 0x1AC24696, 0xCDB30AEB, 0x532E3054,
0x8FD948E4, 0x6DBC3128, 0x58EBF2EF, 0x34C6FFEA, 0xFE28ED61, 0xEE7C3C73,
0x5D4A14D9, 0xE864B7E3, 0x42105D14, 0x203E13E0, 0x45EEE2B6, 0xA3AAABEA,
0xDB6C4F15, 0xFACB4FD0, 0xC742F442, 0xEF6ABBB5, 0x654F3B1D, 0x41CD2105,
0xD81E799E, 0x86854DC7, 0xE44B476A, 0x3D816250, 0xCF62A1F2, 0x5B8D2646,
0xFC8883A0, 0xC1C7B6A3, 0x7F1524C3, 0x69CB7492, 0x47848A0B, 0x5692B285,
0x095BBF00, 0xAD19489D, 0x1462B174, 0x23820E00, 0x58428D2A, 0x0C55F5EA,
0x1DADF43E, 0x233F7061, 0x3372F092, 0x8D937E41, 0xD65FECF1, 0x6C223BDB,
0x7CDE3759, 0xCBEE7460, 0x4085F2A7, 0xCE77326E, 0xA6078084, 0x19F8509E,
0xE8EFD855, 0x61D99735, 0xA969A7AA, 0xC50C06C2, 0x5A04ABFC, 0x800BCADC,
0x9E447A2E, 0xC3453484, 0xFDD56705, 0x0E1E9EC9, 0xDB73DBD3, 0x105588CD,
0x675FDA79, 0xE3674340, 0xC5C43465, 0x713E38D8, 0x3D28F89E, 0xF16DFF20,
0x153E21E7, 0x8FB03D4A, 0xE6E39F2B, 0xDB83ADF7,
};
static const uint32_t sbox2[] = {
0xE93D5A68, 0x948140F7, 0xF64C261C, 0x94692934, 0x411520F7, 0x7602D4F7,
0xBCF46B2E, 0xD4A20068, 0xD4082471, 0x3320F46A, 0x43B7D4B7, 0x500061AF,
0x1E39F62E, 0x97244546, 0x14214F74, 0xBF8B8840, 0x4D95FC1D, 0x96B591AF,
0x70F4DDD3, 0x66A02F45, 0xBFBC09EC, 0x03BD9785, 0x7FAC6DD0, 0x31CB8504,
0x96EB27B3, 0x55FD3941, 0xDA2547E6, 0xABCA0A9A, 0x28507825, 0x530429F4,
0x0A2C86DA, 0xE9B66DFB, 0x68DC1462, 0xD7486900, 0x680EC0A4, 0x27A18DEE,
0x4F3FFEA2, 0xE887AD8C, 0xB58CE006, 0x7AF4D6B6, 0xAACE1E7C, 0xD3375FEC,
0xCE78A399, 0x406B2A42, 0x20FE9E35, 0xD9F385B9, 0xEE39D7AB, 0x3B124E8B,
0x1DC9FAF7, 0x4B6D1856, 0x26A36631, 0xEAE397B2, 0x3A6EFA74, 0xDD5B4332,
0x6841E7F7, 0xCA7820FB, 0xFB0AF54E, 0xD8FEB397, 0x454056AC, 0xBA489527,
0x55533A3A, 0x20838D87, 0xFE6BA9B7, 0xD096954B, 0x55A867BC, 0xA1159A58,
0xCCA92963, 0x99E1DB33, 0xA62A4A56, 0x3F3125F9, 0x5EF47E1C, 0x9029317C,
0xFDF8E802, 0x04272F70, 0x80BB155C, 0x05282CE3, 0x95C11548, 0xE4C66D22,
0x48C1133F, 0xC70F86DC, 0x07F9C9EE, 0x41041F0F, 0x404779A4, 0x5D886E17,
0x325F51EB, 0xD59BC0D1, 0xF2BCC18F, 0x41113564, 0x257B7834, 0x602A9C60,
0xDFF8E8A3, 0x1F636C1B, 0x0E12B4C2, 0x02E1329E, 0xAF664FD1, 0xCAD18115,
0x6B2395E0, 0x333E92E1, 0x3B240B62, 0xEEBEB922, 0x85B2A20E, 0xE6BA0D99,
0xDE720C8C, 0x2DA2F728, 0xD0127845, 0x95B794FD, 0x647D0862, 0xE7CCF5F0,
0x5449A36F, 0x877D48FA, 0xC39DFD27, 0xF33E8D1E, 0x0A476341, 0x992EFF74,
0x3A6F6EAB, 0xF4F8FD37, 0xA812DC60, 0xA1EBDDF8, 0x991BE14C, 0xDB6E6B0D,
0xC67B5510, 0x6D672C37, 0x2765D43B, 0xDCD0E804, 0xF1290DC7, 0xCC00FFA3,
0xB5390F92, 0x690FED0B, 0x667B9FFB, 0xCEDB7D9C, 0xA091CF0B, 0xD9155EA3,
0xBB132F88, 0x515BAD24, 0x7B9479BF, 0x763BD6EB, 0x37392EB3, 0xCC115979,
0x8026E297, 0xF42E312D, 0x6842ADA7, 0xC66A2B3B, 0x12754CCC, 0x782EF11C,
0x6A124237, 0xB79251E7, 0x06A1BBE6, 0x4BFB6350, 0x1A6B1018, 0x11CAEDFA,
0x3D25BDD8, 0xE2E1C3C9, 0x44421659, 0x0A121386, 0xD90CEC6E, 0xD5ABEA2A,
0x64AF674E, 0xDA86A85F, 0xBEBFE988, 0x64E4C3FE, 0x9DBC8057, 0xF0F7C086,
0x60787BF8, 0x6003604D, 0xD1FD8346, 0xF6381FB0, 0x7745AE04, 0xD736FCCC,
0x83426B33, 0xF01EAB71, 0xB0804187, 0x3C005E5F, 0x77A057BE, 0xBDE8AE24,
0x55464299, 0xBF582E61, 0x4E58F48F, 0xF2DDFDA2, 0xF474EF38, 0x8789BDC2,
0x5366F9C3, 0xC8B38E74, 0xB475F255, 0x46FCD9B9, 0x7AEB2661, 0x8B1DDF84,
0x846A0E79, 0x915F95E2, 0x466E598E, 0x20B45770, 0x8CD55591, 0xC902DE4C,
0xB90BACE1, 0xBB8205D0, 0x11A86248, 0x7574A99E, 0xB77F19B6, 0xE0A9DC09,
0x662D09A1, 0xC4324633, 0xE85A1F02, 0x09F0BE8C, 0x4A99A025, 0x1D6EFE10,
0x1AB93D1D, 0x0BA5A4DF, 0xA186F20F, 0x2868F169, 0xDCB7DA83, 0x573906FE,
0xA1E2CE9B, 0x4FCD7F52, 0x50115E01, 0xA70683FA, 0xA002B5C4, 0x0DE6D027,
0x9AF88C27, 0x773F8641, 0xC3604C06, 0x61A806B5, 0xF0177A28, 0xC0F586E0,
0x006058AA, 0x30DC7D62, 0x11E69ED7, 0x2338EA63, 0x53C2DD94, 0xC2C21634,
0xBBCBEE56, 0x90BCB6DE, 0xEBFC7DA1, 0xCE591D76, 0x6F05E409, 0x4B7C0188,
0x39720A3D, 0x7C927C24, 0x86E3725F, 0x724D9DB9, 0x1AC15BB4, 0xD39EB8FC,
0xED545578, 0x08FCA5B5, 0xD83D7CD3, 0x4DAD0FC4, 0x1E50EF5E, 0xB161E6F8,
0xA28514D9, 0x6C51133C, 0x6FD5C7E7, 0x56E14EC4, 0x362ABFCE, 0xDDC6C837,
0xD79A3234, 0x92638212, 0x670EFA8E, 0x406000E0,
};
static const uint32_t sbox3[] = {
0x3A39CE37, 0xD3FAF5CF, 0xABC27737, 0x5AC52D1B, 0x5CB0679E, 0x4FA33742,
0xD3822740, 0x99BC9BBE, 0xD5118E9D, 0xBF0F7315, 0xD62D1C7E, 0xC700C47B,
0xB78C1B6B, 0x21A19045, 0xB26EB1BE, 0x6A366EB4, 0x5748AB2F, 0xBC946E79,
0xC6A376D2, 0x6549C2C8, 0x530FF8EE, 0x468DDE7D, 0xD5730A1D, 0x4CD04DC6,
0x2939BBDB, 0xA9BA4650, 0xAC9526E8, 0xBE5EE304, 0xA1FAD5F0, 0x6A2D519A,
0x63EF8CE2, 0x9A86EE22, 0xC089C2B8, 0x43242EF6, 0xA51E03AA, 0x9CF2D0A4,
0x83C061BA, 0x9BE96A4D, 0x8FE51550, 0xBA645BD6, 0x2826A2F9, 0xA73A3AE1,
0x4BA99586, 0xEF5562E9, 0xC72FEFD3, 0xF752F7DA, 0x3F046F69, 0x77FA0A59,
0x80E4A915, 0x87B08601, 0x9B09E6AD, 0x3B3EE593, 0xE990FD5A, 0x9E34D797,
0x2CF0B7D9, 0x022B8B51, 0x96D5AC3A, 0x017DA67D, 0xD1CF3ED6, 0x7C7D2D28,
0x1F9F25CF, 0xADF2B89B, 0x5AD6B472, 0x5A88F54C, 0xE029AC71, 0xE019A5E6,
0x47B0ACFD, 0xED93FA9B, 0xE8D3C48D, 0x283B57CC, 0xF8D56629, 0x79132E28,
0x785F0191, 0xED756055, 0xF7960E44, 0xE3D35E8C, 0x15056DD4, 0x88F46DBA,
0x03A16125, 0x0564F0BD, 0xC3EB9E15, 0x3C9057A2, 0x97271AEC, 0xA93A072A,
0x1B3F6D9B, 0x1E6321F5, 0xF59C66FB, 0x26DCF319, 0x7533D928, 0xB155FDF5,
0x03563482, 0x8ABA3CBB, 0x28517711, 0xC20AD9F8, 0xABCC5167, 0xCCAD925F,
0x4DE81751, 0x3830DC8E, 0x379D5862, 0x9320F991, 0xEA7A90C2, 0xFB3E7BCE,
0x5121CE64, 0x774FBE32, 0xA8B6E37E, 0xC3293D46, 0x48DE5369, 0x6413E680,
0xA2AE0810, 0xDD6DB224, 0x69852DFD, 0x09072166, 0xB39A460A, 0x6445C0DD,
0x586CDECF, 0x1C20C8AE, 0x5BBEF7DD, 0x1B588D40, 0xCCD2017F, 0x6BB4E3BB,
0xDDA26A7E, 0x3A59FF45, 0x3E350A44, 0xBCB4CDD5, 0x72EACEA8, 0xFA6484BB,
0x8D6612AE, 0xBF3C6F47, 0xD29BE463, 0x542F5D9E, 0xAEC2771B, 0xF64E6370,
0x740E0D8D, 0xE75B1357, 0xF8721671, 0xAF537D5D, 0x4040CB08, 0x4EB4E2CC,
0x34D2466A, 0x0115AF84, 0xE1B00428, 0x95983A1D, 0x06B89FB4, 0xCE6EA048,
0x6F3F3B82, 0x3520AB82, 0x011A1D4B, 0x277227F8, 0x611560B1, 0xE7933FDC,
0xBB3A792B, 0x344525BD, 0xA08839E1, 0x51CE794B, 0x2F32C9B7, 0xA01FBAC9,
0xE01CC87E, 0xBCC7D1F6, 0xCF0111C3, 0xA1E8AAC7, 0x1A908749, 0xD44FBD9A,
0xD0DADECB, 0xD50ADA38, 0x0339C32A, 0xC6913667, 0x8DF9317C, 0xE0B12B4F,
0xF79E59B7, 0x43F5BB3A, 0xF2D519FF, 0x27D9459C, 0xBF97222C, 0x15E6FC2A,
0x0F91FC71, 0x9B941525, 0xFAE59361, 0xCEB69CEB, 0xC2A86459, 0x12BAA8D1,
0xB6C1075E, 0xE3056A0C, 0x10D25065, 0xCB03A442, 0xE0EC6E0E, 0x1698DB3B,
0x4C98A0BE, 0x3278E964, 0x9F1F9532, 0xE0D392DF, 0xD3A0342B, 0x8971F21E,
0x1B0A7441, 0x4BA3348C, 0xC5BE7120, 0xC37632D8, 0xDF359F8D, 0x9B992F2E,
0xE60B6F47, 0x0FE3F11D, 0xE54CDA54, 0x1EDAD891, 0xCE6279CF, 0xCD3E7E6F,
0x1618B166, 0xFD2C1D05, 0x848FD2C5, 0xF6FB2299, 0xF523F357, 0xA6327623,
0x93A83531, 0x56CCCD02, 0xACF08162, 0x5A75EBB5, 0x6E163697, 0x88D273CC,
0xDE966292, 0x81B949D0, 0x4C50901B, 0x71C65614, 0xE6C6C7BD, 0x327A140A,
0x45E1D006, 0xC3F27B9A, 0xC9AA53FD, 0x62A80F00, 0xBB25BFE2, 0x35BDD2F6,
0x71126905, 0xB2040222, 0xB6CBCF7C, 0xCD769C2B, 0x53113EC0, 0x1640E3D3,
0x38ABBD60, 0x2547ADF0, 0xBA38209C, 0xF746CE76, 0x77AFA1C5, 0x20756060,
0x85CBFE4E, 0x8AE88DD8, 0x7AAAF9B0, 0x4CF9AA7E, 0x1948C25C, 0x02FB8A8C,
0x01C36AE4, 0xD6EBE1F9, 0x90D4F869, 0xA65CDEA0, 0x3F09252D, 0xC208E69F,
0xB74E6132, 0xCE77E25B, 0x578FDFE3, 0x3AC372E6,
};
/* Blowfish's F function: combine one byte-indexed entry from each
 * key-dependent S-box. */
#define Fprime(a,b,c,d) ( ( (S0[a] + S1[b]) ^ S2[c] ) + S3[d] )
#define F(x) Fprime( ((x>>24)&0xFF), ((x>>16)&0xFF), ((x>>8)&0xFF), (x&0xFF) )
/* One Feistel round: mix subkey P[n] into xL, apply F, swap halves. */
#define ROUND(n) ( xL ^= P[n], t = xL, xL = F(xL) ^ xR, xR = t )
/*
 * Encrypt one 64-bit block (xL, xR) with the expanded key in ctx,
 * writing the result into output[0..1]. 16 Feistel rounds consuming
 * subkeys P[0]..P[15], then output whitening with P[16], P[17].
 */
static void blowfish_encrypt(uint32_t xL, uint32_t xR, uint32_t *output,
                             BlowfishContext * ctx)
{
    uint32_t *S0 = ctx->S0;
    uint32_t *S1 = ctx->S1;
    uint32_t *S2 = ctx->S2;
    uint32_t *S3 = ctx->S3;
    uint32_t *P = ctx->P;
    uint32_t t;
    int i;

    for (i = 0; i < 16; i++)
        ROUND(i);

    xL ^= P[16];
    xR ^= P[17];

    /* ROUND leaves the halves swapped; undo that in the output order. */
    output[0] = xR;
    output[1] = xL;
}
/*
 * Decrypt one 64-bit block: identical structure to blowfish_encrypt,
 * but consuming the subkeys in reverse order P[17]..P[2], with the
 * final whitening using P[1], P[0].
 */
static void blowfish_decrypt(uint32_t xL, uint32_t xR, uint32_t *output,
                             BlowfishContext * ctx)
{
    uint32_t *S0 = ctx->S0;
    uint32_t *S1 = ctx->S1;
    uint32_t *S2 = ctx->S2;
    uint32_t *S3 = ctx->S3;
    uint32_t *P = ctx->P;
    uint32_t t;
    int i;

    for (i = 17; i >= 2; i--)
        ROUND(i);

    xL ^= P[1];
    xR ^= P[0];

    /* ROUND leaves the halves swapped; undo that in the output order. */
    output[0] = xR;
    output[1] = xL;
}
/*
 * CBC-encrypt a run of whole 8-byte blocks, reading and writing each
 * block's two 32-bit words little-endian (the SSH-1 convention). The
 * chaining value is carried in ctx->iv0/iv1 across calls.
 */
static void blowfish_lsb_encrypt_cbc(unsigned char *blk, int len,
                                     BlowfishContext * ctx)
{
    uint32_t pt0, pt1, enc[2], c0, c1;

    assert((len & 7) == 0);

    c0 = ctx->iv0;
    c1 = ctx->iv1;
    while (len > 0) {
        pt0 = GET_32BIT_LSB_FIRST(blk);
        pt1 = GET_32BIT_LSB_FIRST(blk + 4);
        /* CBC: XOR the plaintext into the previous ciphertext block,
         * then encrypt. */
        c0 ^= pt0;
        c1 ^= pt1;
        blowfish_encrypt(c0, c1, enc, ctx);
        c0 = enc[0];
        c1 = enc[1];
        PUT_32BIT_LSB_FIRST(blk, c0);
        PUT_32BIT_LSB_FIRST(blk + 4, c1);
        blk += 8;
        len -= 8;
    }
    ctx->iv0 = c0;
    ctx->iv1 = c1;
}
/*
 * ECB-encrypt a run of whole 8-byte blocks in place, little-endian
 * word order. (Used by code outside this file, hence not static.)
 */
void blowfish_lsb_encrypt_ecb(void *vblk, int len, BlowfishContext * ctx)
{
    unsigned char *b = (unsigned char *)vblk;
    uint32_t enc[2];

    assert((len & 7) == 0);

    for (; len > 0; len -= 8, b += 8) {
        uint32_t w0 = GET_32BIT_LSB_FIRST(b);
        uint32_t w1 = GET_32BIT_LSB_FIRST(b + 4);
        blowfish_encrypt(w0, w1, enc, ctx);
        PUT_32BIT_LSB_FIRST(b, enc[0]);
        PUT_32BIT_LSB_FIRST(b + 4, enc[1]);
    }
}
/*
 * CBC-decrypt a run of whole 8-byte blocks, little-endian word order.
 * Each block's raw ciphertext becomes the chaining value for the next.
 */
static void blowfish_lsb_decrypt_cbc(unsigned char *blk, int len,
                                     BlowfishContext * ctx)
{
    uint32_t ct0, ct1, dec[2], prev0, prev1;

    assert((len & 7) == 0);

    prev0 = ctx->iv0;
    prev1 = ctx->iv1;
    while (len > 0) {
        ct0 = GET_32BIT_LSB_FIRST(blk);
        ct1 = GET_32BIT_LSB_FIRST(blk + 4);
        blowfish_decrypt(ct0, ct1, dec, ctx);
        /* CBC: XOR the previous ciphertext block into the decryption. */
        PUT_32BIT_LSB_FIRST(blk, prev0 ^ dec[0]);
        PUT_32BIT_LSB_FIRST(blk + 4, prev1 ^ dec[1]);
        prev0 = ct0;
        prev1 = ct1;
        blk += 8;
        len -= 8;
    }
    ctx->iv0 = prev0;
    ctx->iv1 = prev1;
}
/*
 * CBC-encrypt whole 8-byte blocks, big-endian word order (the SSH-2
 * convention). Chaining value carried in ctx->iv0/iv1 across calls.
 */
static void blowfish_msb_encrypt_cbc(unsigned char *blk, int len,
                                     BlowfishContext * ctx)
{
    uint32_t pt0, pt1, enc[2], c0, c1;

    assert((len & 7) == 0);

    c0 = ctx->iv0;
    c1 = ctx->iv1;
    while (len > 0) {
        pt0 = GET_32BIT_MSB_FIRST(blk);
        pt1 = GET_32BIT_MSB_FIRST(blk + 4);
        /* CBC: XOR plaintext into the previous ciphertext, then encrypt. */
        c0 ^= pt0;
        c1 ^= pt1;
        blowfish_encrypt(c0, c1, enc, ctx);
        c0 = enc[0];
        c1 = enc[1];
        PUT_32BIT_MSB_FIRST(blk, c0);
        PUT_32BIT_MSB_FIRST(blk + 4, c1);
        blk += 8;
        len -= 8;
    }
    ctx->iv0 = c0;
    ctx->iv1 = c1;
}
/*
 * CBC-decrypt whole 8-byte blocks, big-endian word order. Each block's
 * raw ciphertext becomes the chaining value for the next.
 */
static void blowfish_msb_decrypt_cbc(unsigned char *blk, int len,
                                     BlowfishContext * ctx)
{
    uint32_t ct0, ct1, dec[2], prev0, prev1;

    assert((len & 7) == 0);

    prev0 = ctx->iv0;
    prev1 = ctx->iv1;
    while (len > 0) {
        ct0 = GET_32BIT_MSB_FIRST(blk);
        ct1 = GET_32BIT_MSB_FIRST(blk + 4);
        blowfish_decrypt(ct0, ct1, dec, ctx);
        /* CBC: XOR the previous ciphertext block into the decryption. */
        PUT_32BIT_MSB_FIRST(blk, prev0 ^ dec[0]);
        PUT_32BIT_MSB_FIRST(blk + 4, prev1 ^ dec[1]);
        prev0 = ct0;
        prev1 = ct1;
        blk += 8;
        len -= 8;
    }
    ctx->iv0 = prev0;
    ctx->iv1 = prev1;
}
/*
 * SDCTR mode: encrypt a 64-bit counter (kept in ctx->iv0:iv1, high
 * word first) to produce keystream, XOR it over the data, and
 * increment the counter per block. Encrypt and decrypt are the same
 * operation.
 */
static void blowfish_msb_sdctr(unsigned char *blk, int len,
                               BlowfishContext * ctx)
{
    uint32_t keystream[2], ctr_hi, ctr_lo, word;

    assert((len & 7) == 0);

    ctr_hi = ctx->iv0;
    ctr_lo = ctx->iv1;
    while (len > 0) {
        blowfish_encrypt(ctr_hi, ctr_lo, keystream, ctx);
        word = GET_32BIT_MSB_FIRST(blk);
        PUT_32BIT_MSB_FIRST(blk, word ^ keystream[0]);
        word = GET_32BIT_MSB_FIRST(blk + 4);
        PUT_32BIT_MSB_FIRST(blk + 4, word ^ keystream[1]);
        /* 64-bit increment across the two uint32_t halves (the masking
         * in the original was redundant on an exact-width type). */
        ctr_lo++;
        if (ctr_lo == 0)
            ctr_hi++;
        blk += 8;
        len -= 8;
    }
    ctx->iv0 = ctr_hi;
    ctx->iv1 = ctr_lo;
}
/*
 * Reset the subkey array and all four S-boxes to their standard
 * initial values (hex digits of pi, from the tables above), ready for
 * blowfish_expandkey to mix the real key in.
 */
void blowfish_initkey(BlowfishContext *ctx)
{
    int j;

    for (j = 0; j < 18; j++)
        ctx->P[j] = parray[j];

    for (j = 0; j < 256; j++) {
        ctx->S0[j] = sbox0[j];
        ctx->S1[j] = sbox1[j];
        ctx->S2[j] = sbox2[j];
        ctx->S3[j] = sbox3[j];
    }
}
/*
 * Blowfish key schedule: XOR the (cyclically repeated) key into the
 * P-array, then regenerate the whole P-array and all four S-boxes by
 * repeatedly encrypting an evolving 64-bit value, optionally mixing a
 * cyclically repeated salt into that value before each encryption
 * (the salted variant is what bcrypt uses; plain Blowfish passes
 * salt == NULL). Assumes blowfish_initkey has already run on ctx.
 *
 * 'keybytes' must be positive, and 'saltbytes' must be positive
 * whenever 'vsalt' is non-NULL: both are used as divisors below, so a
 * zero count would be division-by-zero UB rather than a clean error.
 */
void blowfish_expandkey(BlowfishContext * ctx,
                        const void *vkey, short keybytes,
                        const void *vsalt, short saltbytes)
{
    const unsigned char *key = (const unsigned char *)vkey;
    const unsigned char *salt = (const unsigned char *)vsalt;
    uint32_t *S0 = ctx->S0;
    uint32_t *S1 = ctx->S1;
    uint32_t *S2 = ctx->S2;
    uint32_t *S3 = ctx->S3;
    uint32_t *P = ctx->P;
    uint32_t str[2];
    int i, j;
    int saltpos;
    unsigned char dummysalt[1];

    assert(keybytes > 0);            /* key[... % keybytes] below */
    assert(!salt || saltbytes > 0);  /* salt[... % saltbytes] below */

    saltpos = 0;
    if (!salt) {
        /* Unsalted key schedule: substitute a single zero salt byte,
         * which XORs in nothing. */
        saltbytes = 1;
        salt = dummysalt;
        dummysalt[0] = 0;
    }

    /* Phase 1: XOR the key, repeated cyclically, into the P-array. */
    for (i = 0; i < 18; i++) {
        P[i] ^=
            ((uint32_t) (unsigned char) (key[(i * 4 + 0) % keybytes])) << 24;
        P[i] ^=
            ((uint32_t) (unsigned char) (key[(i * 4 + 1) % keybytes])) << 16;
        P[i] ^=
            ((uint32_t) (unsigned char) (key[(i * 4 + 2) % keybytes])) << 8;
        P[i] ^= ((uint32_t) (unsigned char) (key[(i * 4 + 3) % keybytes]));
    }

    /* Phase 2: regenerate P and the S-boxes, two words at a time, from
     * successive encryptions of an evolving block, salting the block
     * before each encryption. */
    str[0] = str[1] = 0;

    for (i = 0; i < 18; i += 2) {
        for (j = 0; j < 8; j++)
            str[j/4] ^= ((uint32_t)salt[saltpos++ % saltbytes]) << (24-8*(j%4));
        blowfish_encrypt(str[0], str[1], str, ctx);
        P[i] = str[0];
        P[i + 1] = str[1];
    }

    /* The four S-boxes go through exactly the same process in turn,
     * with the salt position and evolving block carried on from the
     * P-array loop above. */
    uint32_t *const sboxes[4] = { S0, S1, S2, S3 };
    for (int box = 0; box < 4; box++) {
        uint32_t *S = sboxes[box];
        for (i = 0; i < 256; i += 2) {
            for (j = 0; j < 8; j++)
                str[j/4] ^=
                    ((uint32_t)salt[saltpos++ % saltbytes]) << (24-8*(j%4));
            blowfish_encrypt(str[0], str[1], str, ctx);
            S[i] = str[0];
            S[i + 1] = str[1];
        }
    }
}
/* Standard (unsalted) key setup: initialise from the pi tables, then
 * run the key schedule with no salt. */
static void blowfish_setkey(BlowfishContext *ctx,
                            const unsigned char *key, short keybytes)
{
    blowfish_initkey(ctx);
    blowfish_expandkey(ctx, key, keybytes, NULL, 0);
}
/* -- Interface with PuTTY -- */

#define SSH1_SESSION_KEY_LENGTH 32

/* Allocate a raw BlowfishContext for callers outside the ssh_cipher
 * abstraction. The context is uninitialised; pair with
 * blowfish_free_context. */
BlowfishContext *blowfish_make_context(void)
{
    return snew(BlowfishContext);
}
/*
 * Free a context allocated by blowfish_make_context. The context
 * holds expanded key material, so wipe it before handing the memory
 * back to the allocator — matching what blowfish_free does for the
 * ssh_cipher wrapper; the original omitted the wipe here.
 */
void blowfish_free_context(BlowfishContext *ctx)
{
    smemclr(ctx, sizeof(*ctx));
    sfree(ctx);
}
/* Load an 8-byte IV (or SDCTR counter) into the context, big-endian
 * word order (SSH-2 convention). */
static void blowfish_iv_be(BlowfishContext *ctx, const void *viv)
{
    const unsigned char *iv = (const unsigned char *)viv;
    ctx->iv0 = GET_32BIT_MSB_FIRST(iv);
    ctx->iv1 = GET_32BIT_MSB_FIRST(iv + 4);
}

/* Load an 8-byte IV into the context, little-endian word order
 * (SSH-1 convention). */
static void blowfish_iv_le(BlowfishContext *ctx, const void *viv)
{
    const unsigned char *iv = (const unsigned char *)viv;
    ctx->iv0 = GET_32BIT_LSB_FIRST(iv);
    ctx->iv1 = GET_32BIT_LSB_FIRST(iv + 4);
}
/* Wrapper pairing a BlowfishContext with the generic ssh_cipher
 * interface object. */
struct blowfish_ctx {
    BlowfishContext context;
    ssh_cipher ciph;
};

/* ssh_cipheralg 'new' method: allocate an uninitialised cipher
 * instance for whichever Blowfish variant 'alg' describes. */
static ssh_cipher *blowfish_new(const ssh_cipheralg *alg)
{
    struct blowfish_ctx *ctx = snew(struct blowfish_ctx);
    ctx->ciph.vt = alg;
    return &ctx->ciph;
}
/* Destructor: wipe the expanded key material before freeing. */
static void blowfish_free(ssh_cipher *cipher)
{
    struct blowfish_ctx *ctx = container_of(cipher, struct blowfish_ctx, ciph);
    smemclr(ctx, sizeof(*ctx));
    sfree(ctx);
}
/*
 * Glue methods adapting the raw Blowfish routines above to the
 * ssh_cipher vtable interface. The ssh1_* variants use little-endian
 * word order; the ssh2_* variants use big-endian.
 */

/* Key setup; key length comes from the variant's vtable. */
static void blowfish_ssh_setkey(ssh_cipher *cipher, const void *key)
{
    struct blowfish_ctx *ctx = container_of(cipher, struct blowfish_ctx, ciph);
    blowfish_setkey(&ctx->context, key, ctx->ciph.vt->padded_keybytes);
}

static void blowfish_ssh1_setiv(ssh_cipher *cipher, const void *iv)
{
    struct blowfish_ctx *ctx = container_of(cipher, struct blowfish_ctx, ciph);
    blowfish_iv_le(&ctx->context, iv);
}

static void blowfish_ssh2_setiv(ssh_cipher *cipher, const void *iv)
{
    struct blowfish_ctx *ctx = container_of(cipher, struct blowfish_ctx, ciph);
    blowfish_iv_be(&ctx->context, iv);
}

static void blowfish_ssh1_encrypt_blk(ssh_cipher *cipher, void *blk, int len)
{
    struct blowfish_ctx *ctx = container_of(cipher, struct blowfish_ctx, ciph);
    blowfish_lsb_encrypt_cbc(blk, len, &ctx->context);
}

static void blowfish_ssh1_decrypt_blk(ssh_cipher *cipher, void *blk, int len)
{
    struct blowfish_ctx *ctx = container_of(cipher, struct blowfish_ctx, ciph);
    blowfish_lsb_decrypt_cbc(blk, len, &ctx->context);
}

static void blowfish_ssh2_encrypt_blk(ssh_cipher *cipher, void *blk, int len)
{
    struct blowfish_ctx *ctx = container_of(cipher, struct blowfish_ctx, ciph);
    blowfish_msb_encrypt_cbc(blk, len, &ctx->context);
}

static void blowfish_ssh2_decrypt_blk(ssh_cipher *cipher, void *blk, int len)
{
    struct blowfish_ctx *ctx = container_of(cipher, struct blowfish_ctx, ciph);
    blowfish_msb_decrypt_cbc(blk, len, &ctx->context);
}

/* SDCTR keystream generation is its own inverse, so this serves as
 * both the encrypt and decrypt method below. */
static void blowfish_ssh2_sdctr(ssh_cipher *cipher, void *blk, int len)
{
    struct blowfish_ctx *ctx = container_of(cipher, struct blowfish_ctx, ciph);
    blowfish_msb_sdctr(blk, len, &ctx->context);
}
/* SSH-1 variant: CBC, little-endian words, keyed with the full
 * 32-byte SSH-1 session key.
 * NOTE(review): real_keybits is 128 even though 32 key bytes are
 * consumed — confirm that's the intended strength estimate. */
const ssh_cipheralg ssh_blowfish_ssh1 = {
    .new = blowfish_new,
    .free = blowfish_free,
    .setiv = blowfish_ssh1_setiv,
    .setkey = blowfish_ssh_setkey,
    .encrypt = blowfish_ssh1_encrypt_blk,
    .decrypt = blowfish_ssh1_decrypt_blk,
    .blksize = 8,
    .real_keybits = 128,
    .padded_keybytes = SSH1_SESSION_KEY_LENGTH,
    .flags = SSH_CIPHER_IS_CBC,
    .text_name = "Blowfish-256 CBC",
};

/* SSH-2 "blowfish-cbc": CBC, big-endian words, 128-bit key. */
const ssh_cipheralg ssh_blowfish_ssh2 = {
    .new = blowfish_new,
    .free = blowfish_free,
    .setiv = blowfish_ssh2_setiv,
    .setkey = blowfish_ssh_setkey,
    .encrypt = blowfish_ssh2_encrypt_blk,
    .decrypt = blowfish_ssh2_decrypt_blk,
    .ssh2_id = "blowfish-cbc",
    .blksize = 8,
    .real_keybits = 128,
    .padded_keybytes = 16,
    .flags = SSH_CIPHER_IS_CBC,
    .text_name = "Blowfish-128 CBC",
};

/* SSH-2 "blowfish-ctr": SDCTR mode, 256-bit key; encrypt and decrypt
 * share one method. */
const ssh_cipheralg ssh_blowfish_ssh2_ctr = {
    .new = blowfish_new,
    .free = blowfish_free,
    .setiv = blowfish_ssh2_setiv,
    .setkey = blowfish_ssh_setkey,
    .encrypt = blowfish_ssh2_sdctr,
    .decrypt = blowfish_ssh2_sdctr,
    .ssh2_id = "blowfish-ctr",
    .blksize = 8,
    .real_keybits = 256,
    .padded_keybytes = 32,
    .flags = 0,
    .text_name = "Blowfish-256 SDCTR",
};

/* SSH-2 preference list: CTR mode first, CBC as fallback. */
static const ssh_cipheralg *const blowfish_list[] = {
    &ssh_blowfish_ssh2_ctr,
    &ssh_blowfish_ssh2
};

const ssh2_ciphers ssh2_blowfish = { lenof(blowfish_list), blowfish_list };

1062
crypto/chacha20-poly1305.c Normal file

File diff suppressed because it is too large Load Diff

113
crypto/crc32.c Normal file
View File

@ -0,0 +1,113 @@
/*
* CRC32 implementation, as used in SSH-1.
*
* (This is not, of course, a cryptographic function! It lives in the
* 'crypto' directory because SSH-1 uses it _as if_ it was crypto: it
* handles sensitive data, and we implement it with care for side
* channels.)
*
* This particular form of the CRC uses the polynomial
* P(x) = x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x^1+1
* and represents polynomials in bit-reversed form, so that the x^0
* coefficient (constant term) appears in the bit with place value
* 2^31, and the x^31 coefficient in the bit with place value 2^0. In
* this representation, (x^32 mod P) = 0xEDB88320, so multiplying the
* current state by x is done by shifting right by one bit, and XORing
* that constant into the result if the bit shifted out was 1.
*
* There's a bewildering array of subtly different variants of CRC out
* there, using different polynomials, both bit orders, and varying
* the start and end conditions. There are catalogue websites such as
* http://reveng.sourceforge.net/crc-catalogue/ , which generally seem
* to have the convention of indexing CRCs by their 'check value',
* defined as whatever you get if you hash the 9-byte test string
* "123456789".
*
* The crc32_rfc1662() function below, which starts off the CRC state
* at 0xFFFFFFFF and complements it after feeding all the data, gives
* the check value 0xCBF43926, and matches the hash function that the
* above catalogue refers to as "CRC-32/ISO-HDLC"; among other things,
* it's also the "FCS-32" checksum described in RFC 1662 section C.3
* (hence the name I've given it here).
*
* The crc32_ssh1() function implements the variant form used by
* SSH-1, which uses the same update function, but starts the state at
* zero and doesn't complement it at the end of the computation. The
* check value for that version is 0x2DFD2D88, which that CRC
* catalogue doesn't list at all.
*/
#include <stdint.h>
#include <stdlib.h>
#include "ssh.h"
/*
* Multiply a CRC value by x^4. This implementation strategy avoids
* using a lookup table (which would be a side-channel hazard, since
* SSH-1 applies this CRC to decrypted session data).
*
* The basic idea is that you'd like to "multiply" the shifted-out 4
* bits by the CRC polynomial value 0xEDB88320, or rather by that
* value shifted right 3 bits (since you want the _last_ bit shifted
* out, i.e. the one originally at the 2^3 position, to generate
* 0xEDB88320 itself). But the scare-quoted "multiply" would have to
* be a multiplication of polynomials over GF(2), which differs from
* integer multiplication in that you don't have any carries. In other
* words, you make a copy of one input shifted left by the index of
* each set bit in the other, so that adding them all together would
* give you the ordinary integer product, and then you XOR them
* together instead.
*
* With a 4-bit multiplier, the two kinds of multiplication coincide
* provided the multiplicand has no two set bits at positions
* differing by less than 4, because then no two copies of the
* multiplier can overlap to generate a carry. So I break up the
* intended multiplicand K = 0xEDB88320 >> 3 into three sub-constants
* a,b,c with that property, such that a^b^c = K. Then I can multiply
* m by each of them separately, and XOR together the results.
*/
/*
 * Multiply the CRC state by x^4 in GF(2)[x] mod P, with no lookup
 * table (a table indexed by secret-derived data would be a
 * side-channel hazard). The shifted-out low nibble is multiplied by
 * the constant (0xEDB88320 >> 3), split into three addends a, b, c
 * whose set bits are each at least 4 apart: with a 4-bit multiplier,
 * that spacing prevents carries, so ordinary integer multiplication
 * coincides with carry-less GF(2) multiplication, and XORing the
 * three partial products gives the exact result.
 */
static inline uint32_t crc32_shift_4(uint32_t v)
{
    const uint32_t a = 0x11111044, b = 0x08840020, c = 0x04220000;
    uint32_t nibble = v & 0xF;
    return (v >> 4) ^ (a * nibble) ^ (b * nibble) ^ (c * nibble);
}

/*
 * Multiply by x^8 — two applications of the x^4 step — used once per
 * absorbed input byte.
 */
static inline uint32_t crc32_shift_8(uint32_t v)
{
    uint32_t halfway = crc32_shift_4(v);
    return crc32_shift_4(halfway);
}
/*
 * Fold a run of bytes into an existing CRC state, one byte at a time.
 */
uint32_t crc32_update(uint32_t crc, ptrlen data)
{
    const uint8_t *bytes = (const uint8_t *)data.ptr;
    size_t remaining = data.len;

    while (remaining > 0) {
        crc = crc32_shift_8(crc ^ *bytes++);
        remaining--;
    }
    return crc;
}
/*
 * The SSH-1 variant of CRC-32: state starts at zero and is not
 * complemented at the end (check value 0x2DFD2D88; see the file
 * header comment).
 */
uint32_t crc32_ssh1(ptrlen data)
{
    return crc32_update(0, data);
}

/*
 * The official version of CRC-32 (CRC-32/ISO-HDLC: initial state
 * 0xFFFFFFFF, final complement). Nothing in PuTTY proper uses this,
 * but it's useful to expose it to testcrypt so that we can implement
 * standard test vectors.
 */
uint32_t crc32_rfc1662(ptrlen data)
{
    return crc32_update(0xFFFFFFFF, data) ^ 0xFFFFFFFF;
}

1048
crypto/des.c Normal file

File diff suppressed because it is too large Load Diff

272
crypto/diffie-hellman.c Normal file
View File

@ -0,0 +1,272 @@
/*
* Diffie-Hellman implementation for PuTTY.
*/
#include <assert.h>
#include "ssh.h"
#include "misc.h"
#include "mpint.h"
/*
 * State for one Diffie-Hellman key exchange: the group parameters
 * (modulus p, generator g, and q = floor(p/2) computed in dh_init and
 * used to bound the private exponent), our secret exponent x, and our
 * public value e.
 */
struct dh_ctx {
    mp_int *x, *e, *p, *q, *g;
};

/* Per-kex-method data hung off ssh_kex.extra: whether the method is
 * group-exchange (server supplies the group), and if not, how to
 * populate the fixed group parameters. */
struct dh_extra {
    bool gex;
    void (*construct)(dh_ctx *ctx);
};
/* Fill in the fixed 1024-bit prime and generator 2 used by the
 * 'group1' key exchange methods. */
static void dh_group1_construct(dh_ctx *ctx)
{
    ctx->p = MP_LITERAL(0xFFFFFFFFFFFFFFFFC90FDAA22168C234C4C6628B80DC1CD129024E088A67CC74020BBEA63B139B22514A08798E3404DDEF9519B3CD3A431B302B0A6DF25F14374FE1356D6D51C245E485B576625E7EC6F44C42E9A637ED6B0BFF5CB6F406B7EDEE386BFB5A899FA5AE9F24117C4B1FE649286651ECE65381FFFFFFFFFFFFFFFF);
    ctx->g = mp_from_integer(2);
}

/* Fill in the fixed 2048-bit prime and generator 2 used by the
 * 'group14' key exchange methods. */
static void dh_group14_construct(dh_ctx *ctx)
{
    ctx->p = MP_LITERAL(0xFFFFFFFFFFFFFFFFC90FDAA22168C234C4C6628B80DC1CD129024E088A67CC74020BBEA63B139B22514A08798E3404DDEF9519B3CD3A431B302B0A6DF25F14374FE1356D6D51C245E485B576625E7EC6F44C42E9A637ED6B0BFF5CB6F406B7EDEE386BFB5A899FA5AE9F24117C4B1FE649286651ECE45B3DC2007CB8A163BF0598DA48361C55D39A69163FA8FD24CF5F83655D23DCA3AD961C62F356208552BB9ED529077096966D670C354E4ABC9804F1746C08CA18217C32905E462E36CE3BE39E772C180E86039B2783A2EC07A28FB5C55DF06F4C52C9DE2BCBF6955817183995497CEA956AE515D2261898FA051015728E5A8AACAA68FFFFFFFFFFFFFFFF);
    ctx->g = mp_from_integer(2);
}
/* Fixed-group method: group1 with SHA-1. */
static const struct dh_extra extra_group1 = {
    false, dh_group1_construct,
};

static const ssh_kex ssh_diffiehellman_group1_sha1 = {
    "diffie-hellman-group1-sha1", "group1",
    KEXTYPE_DH, &ssh_sha1, &extra_group1,
};

static const ssh_kex *const group1_list[] = {
    &ssh_diffiehellman_group1_sha1
};

const ssh_kexes ssh_diffiehellman_group1 = { lenof(group1_list), group1_list };

/* Fixed-group methods: group14 with SHA-256 (preferred) or SHA-1. */
static const struct dh_extra extra_group14 = {
    false, dh_group14_construct,
};

static const ssh_kex ssh_diffiehellman_group14_sha256 = {
    "diffie-hellman-group14-sha256", "group14",
    KEXTYPE_DH, &ssh_sha256, &extra_group14,
};

static const ssh_kex ssh_diffiehellman_group14_sha1 = {
    "diffie-hellman-group14-sha1", "group14",
    KEXTYPE_DH, &ssh_sha1, &extra_group14,
};

static const ssh_kex *const group14_list[] = {
    &ssh_diffiehellman_group14_sha256,
    &ssh_diffiehellman_group14_sha1
};

const ssh_kexes ssh_diffiehellman_group14 = {
    lenof(group14_list), group14_list
};

/* Group-exchange methods: no fixed group, so no construct function. */
static const struct dh_extra extra_gex = { true };

static const ssh_kex ssh_diffiehellman_gex_sha256 = {
    "diffie-hellman-group-exchange-sha256", NULL,
    KEXTYPE_DH, &ssh_sha256, &extra_gex,
};

static const ssh_kex ssh_diffiehellman_gex_sha1 = {
    "diffie-hellman-group-exchange-sha1", NULL,
    KEXTYPE_DH, &ssh_sha1, &extra_gex,
};

static const ssh_kex *const gex_list[] = {
    &ssh_diffiehellman_gex_sha256,
    &ssh_diffiehellman_gex_sha1
};

const ssh_kexes ssh_diffiehellman_gex = { lenof(gex_list), gex_list };
/*
 * Suffix on GSSAPI SSH protocol identifiers that indicates Kerberos 5
 * as the mechanism.
 *
 * This suffix is the base64-encoded MD5 hash of the byte sequence
 * 06 09 2A 86 48 86 F7 12 01 02 02, which in turn is the ASN.1 DER
 * encoding of the object ID 1.2.840.113554.1.2.2 which designates
 * Kerberos v5.
 *
 * (The same encoded OID, minus the two-byte DER header, is defined in
 * pgssapi.c as GSS_MECH_KRB5.)
 */
#define GSS_KRB5_OID_HASH "toWM5Slw5Ew8Mqkay+al2g=="
/* GSS variants of the kex methods above: same DH maths (they reuse
 * extra_gex / extra_group14 / extra_group1) but KEXTYPE_GSS. */
static const ssh_kex ssh_gssk5_diffiehellman_gex_sha1 = {
    "gss-gex-sha1-" GSS_KRB5_OID_HASH, NULL,
    KEXTYPE_GSS, &ssh_sha1, &extra_gex,
};
static const ssh_kex ssh_gssk5_diffiehellman_group14_sha1 = {
    "gss-group14-sha1-" GSS_KRB5_OID_HASH, "group14",
    KEXTYPE_GSS, &ssh_sha1, &extra_group14,
};
static const ssh_kex ssh_gssk5_diffiehellman_group1_sha1 = {
    "gss-group1-sha1-" GSS_KRB5_OID_HASH, "group1",
    KEXTYPE_GSS, &ssh_sha1, &extra_group1,
};
static const ssh_kex *const gssk5_sha1_kex_list[] = {
    &ssh_gssk5_diffiehellman_gex_sha1,
    &ssh_gssk5_diffiehellman_group14_sha1,
    &ssh_gssk5_diffiehellman_group1_sha1
};
const ssh_kexes ssh_gssk5_sha1_kex = {
    lenof(gssk5_sha1_kex_list), gssk5_sha1_kex_list
};
/*
 * Common DH initialisation: fill in the derived fields of a context
 * whose p and g have already been set up.
 */
static void dh_init(dh_ctx *ctx)
{
    /* For the safe primes SSH uses (see dh_create_e), q = (p-1)/2 is
     * just p shifted right by one bit. */
    ctx->q = mp_rshift_fixed(ctx->p, 1);
    /* No private exponent or public value generated yet. */
    ctx->x = NULL;
    ctx->e = NULL;
}
/* Query whether a kex method is group-exchange (server-supplied group). */
bool dh_is_gex(const ssh_kex *kex)
{
    return ((const struct dh_extra *)kex->extra)->gex;
}
/*
 * Initialise DH for a standard (fixed) group. The kex method must not
 * be a group-exchange one; those go through dh_setup_gex() instead.
 */
dh_ctx *dh_setup_group(const ssh_kex *kex)
{
    const struct dh_extra *extra = (const struct dh_extra *)kex->extra;
    assert(!extra->gex);
    dh_ctx *ctx = snew(dh_ctx);
    extra->construct(ctx);      /* fills in p and g for this group */
    dh_init(ctx);               /* derives q, clears x and e */
    return ctx;
}
/*
 * Initialise DH for a server-supplied group, taking private copies of
 * the modulus and generator the server sent.
 */
dh_ctx *dh_setup_gex(mp_int *pval, mp_int *gval)
{
    dh_ctx *ctx = snew(dh_ctx);
    ctx->g = mp_copy(gval);
    ctx->p = mp_copy(pval);
    dh_init(ctx);
    return ctx;
}
/*
* Return size of DH modulus p.
*/
int dh_modulus_bit_size(const dh_ctx *ctx)
{
return mp_get_nbits(ctx->p);
}
/*
 * Clean up and free a DH context, disposing of every mp_int it owns.
 */
void dh_cleanup(dh_ctx *ctx)
{
    /* x and e may still be NULL if dh_create_e was never called;
     * p, g, q are set by the setup functions. */
    mp_int *members[] = { ctx->x, ctx->e, ctx->p, ctx->g, ctx->q };
    for (size_t i = 0; i < lenof(members); i++) {
        if (members[i])
            mp_free(members[i]);
    }
    sfree(ctx);
}
/*
 * DH stage 1: invent a number x between 1 and q, and compute e =
 * g^x mod p. Return e.
 *
 * If `nbits' is greater than zero, it is used as an upper limit
 * for the number of bits in x. This is safe provided that (a) you
 * use twice as many bits in x as the number of bits you expect to
 * use in your session key, and (b) the DH group is a safe prime
 * (which SSH demands that it must be).
 *
 * P. C. van Oorschot, M. J. Wiener
 * "On Diffie-Hellman Key Agreement with Short Exponents".
 * Advances in Cryptology: Proceedings of Eurocrypt '96
 * Springer-Verlag, May 1996.
 *
 * The returned mp_int is owned by the context (stored in ctx->e);
 * the caller must not free it separately.
 */
mp_int *dh_create_e(dh_ctx *ctx, int nbits)
{
    /*
     * Lower limit is just 2.
     */
    mp_int *lo = mp_from_integer(2);
    /*
     * Upper limit.
     */
    mp_int *hi = mp_copy(ctx->q);
    mp_sub_integer_into(hi, hi, 1);
    if (nbits) {
        /* Clamp the exponent range to 2^(nbits+1) if that's smaller. */
        mp_int *pow2 = mp_power_2(nbits+1);
        mp_min_into(pow2, pow2, hi);
        mp_free(hi);
        hi = pow2;
    }
    /*
     * Make a random number in that range.
     */
    ctx->x = mp_random_in_range(lo, hi);
    mp_free(lo);
    mp_free(hi);
    /*
     * Now compute e = g^x mod p.
     */
    ctx->e = mp_modpow(ctx->g, ctx->x, ctx->p);
    return ctx->e;
}
/*
 * DH stage 2-epsilon: given a number f, validate it to ensure it's in
 * range. (RFC 4253 section 8: "Values of 'e' or 'f' that are not in
 * the range [1, p-1] MUST NOT be sent or accepted by either side."
 * Also, we rule out 1 and p-1 too, since that's easy to do and since
 * they lead to obviously weak keys that even a passive eavesdropper
 * can figure out.)
 *
 * Returns NULL on success, or a static error-message string.
 */
const char *dh_validate_f(dh_ctx *ctx, mp_int *f)
{
    /* Reject anything below 2 (i.e. 0 or 1). */
    if (!mp_hs_integer(f, 2))
        return "f value received is too small";

    /* Reject anything at or above p-1. */
    mp_int *pm1 = mp_copy(ctx->p);
    mp_sub_integer_into(pm1, pm1, 1);
    unsigned too_large = mp_cmp_hs(f, pm1);
    mp_free(pm1);
    if (too_large)
        return "f value received is too large";

    return NULL;
}
/*
 * DH stage 2: given a number f, compute the shared secret
 * K = f^x mod p. Caller owns (and must free) the returned value.
 */
mp_int *dh_find_K(dh_ctx *ctx, mp_int *f)
{
    mp_int *K = mp_modpow(f, ctx->x, ctx->p);
    return K;
}

503
crypto/dsa.c Normal file
View File

@ -0,0 +1,503 @@
/*
* Digital Signature Algorithm implementation for PuTTY.
*/
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include "ssh.h"
#include "mpint.h"
#include "misc.h"
static void dss_freekey(ssh_key *key); /* forward reference */
/*
 * Construct a DSA public key from an SSH-2 "ssh-dss" public key blob:
 * the string "ssh-dss" followed by mpints p, q, g, y. Returns NULL if
 * the blob is malformed or p or q is zero; the private half x is left
 * NULL.
 */
static ssh_key *dss_new_pub(const ssh_keyalg *self, ptrlen data)
{
    BinarySource src[1];
    struct dss_key *dss;
    BinarySource_BARE_INIT_PL(src, data);
    if (!ptrlen_eq_string(get_string(src), "ssh-dss"))
        return NULL;
    dss = snew(struct dss_key);
    dss->sshk.vt = &ssh_dss;
    dss->p = get_mp_ssh2(src);
    dss->q = get_mp_ssh2(src);
    dss->g = get_mp_ssh2(src);
    dss->y = get_mp_ssh2(src);
    dss->x = NULL;
    if (get_err(src) ||
        mp_eq_integer(dss->p, 0) || mp_eq_integer(dss->q, 0)) {
        /* Invalid key. */
        dss_freekey(&dss->sshk);
        return NULL;
    }
    return &dss->sshk;
}
/* Free a DSA key and all of its (possibly partially filled-in) mpints. */
static void dss_freekey(ssh_key *key)
{
    struct dss_key *dss = container_of(key, struct dss_key, sshk);
    mp_int *fields[] = { dss->p, dss->q, dss->g, dss->y, dss->x };
    for (size_t i = 0; i < lenof(fields); i++) {
        if (fields[i])
            mp_free(fields[i]);
    }
    sfree(dss);
}
/* Append "0x<hex of x>" to sb, comma-separated from any previous entry. */
static void append_hex_to_strbuf(strbuf *sb, mp_int *x)
{
    if (sb->len > 0)
        put_byte(sb, ',');
    put_data(sb, "0x", 2);
    char *digits = mp_get_hex(x);
    size_t ndigits = strlen(digits);
    put_data(sb, digits, ndigits);
    /* The hex string held key material: wipe it before freeing. */
    smemclr(digits, ndigits);
    sfree(digits);
}
/*
 * Produce the host-key-cache string form of a DSA public key:
 * "0xP,0xQ,0xG,0xY". Returns NULL for a key with no public data.
 */
static char *dss_cache_str(ssh_key *key)
{
    struct dss_key *dss = container_of(key, struct dss_key, sshk);
    if (!dss->p)
        return NULL;
    strbuf *sb = strbuf_new();
    mp_int *parts[] = { dss->p, dss->q, dss->g, dss->y };
    for (size_t i = 0; i < lenof(parts); i++)
        append_hex_to_strbuf(sb, parts[i]);
    return strbuf_to_str(sb);
}
/*
 * Break a DSA key into named components for display (e.g. in the
 * PuTTYgen UI). Includes the private x only if it's present.
 */
static key_components *dss_components(ssh_key *key)
{
    struct dss_key *dss = container_of(key, struct dss_key, sshk);
    key_components *kc = key_components_new();
    key_components_add_text(kc, "key_type", "DSA");
    assert(dss->p);
    key_components_add_mp(kc, "p", dss->p);
    key_components_add_mp(kc, "q", dss->q);
    key_components_add_mp(kc, "g", dss->g);
    key_components_add_mp(kc, "public_y", dss->y);
    if (dss->x)
        key_components_add_mp(kc, "private_x", dss->x);
    return kc;
}
/* Validity check hook: DSA keys are never rejected, so always NULL. */
static char *dss_invalid(ssh_key *key, unsigned flags)
{
    /* No validity criterion will stop us from using a DSA key at all */
    return NULL;
}
/*
 * Verify a DSA signature over 'data' (hashed with SHA-1) against this
 * public key, per the standard DSA verification equation. Returns
 * true iff the signature is well-formed, in range, and valid.
 */
static bool dss_verify(ssh_key *key, ptrlen sig, ptrlen data)
{
    struct dss_key *dss = container_of(key, struct dss_key, sshk);
    BinarySource src[1];
    unsigned char hash[20];
    bool toret;
    if (!dss->p)
        return false;
    BinarySource_BARE_INIT_PL(src, sig);
    /*
     * Commercial SSH (2.0.13) and OpenSSH disagree over the format
     * of a DSA signature. OpenSSH is in line with RFC 4253:
     * it uses a string "ssh-dss", followed by a 40-byte string
     * containing two 160-bit integers end-to-end. Commercial SSH
     * can't be bothered with the header bit, and considers a DSA
     * signature blob to be _just_ the 40-byte string containing
     * the two 160-bit integers. We tell them apart by measuring
     * the length: length 40 means the commercial-SSH bug, anything
     * else is assumed to be RFC-compliant.
     */
    if (sig.len != 40) {    /* bug not present; read admin fields */
        ptrlen type = get_string(src);
        sig = get_string(src);
        if (get_err(src) || !ptrlen_eq_string(type, "ssh-dss") ||
            sig.len != 40)
            return false;
    }
    /* Now we're sitting on a 40-byte string for sure. */
    mp_int *r = mp_from_bytes_be(make_ptrlen(sig.ptr, 20));
    mp_int *s = mp_from_bytes_be(make_ptrlen((const char *)sig.ptr + 20, 20));
    if (!r || !s) {
        if (r)
            mp_free(r);
        if (s)
            mp_free(s);
        return false;
    }
    /* Basic sanity checks: 0 < r,s < q */
    unsigned invalid = 0;
    invalid |= mp_eq_integer(r, 0);
    invalid |= mp_eq_integer(s, 0);
    invalid |= mp_cmp_hs(r, dss->q);
    invalid |= mp_cmp_hs(s, dss->q);
    if (invalid) {
        mp_free(r);
        mp_free(s);
        return false;
    }
    /*
     * Step 1. w <- s^-1 mod q.
     */
    mp_int *w = mp_invert(s, dss->q);
    if (!w) {
        /* s has no inverse mod q, so the signature can't be valid. */
        mp_free(r);
        mp_free(s);
        return false;
    }
    /*
     * Step 2. u1 <- SHA(message) * w mod q.
     */
    hash_simple(&ssh_sha1, data, hash);
    mp_int *sha = mp_from_bytes_be(make_ptrlen(hash, 20));
    mp_int *u1 = mp_modmul(sha, w, dss->q);
    /*
     * Step 3. u2 <- r * w mod q.
     */
    mp_int *u2 = mp_modmul(r, w, dss->q);
    /*
     * Step 4. v <- (g^u1 * y^u2 mod p) mod q.
     */
    mp_int *gu1p = mp_modpow(dss->g, u1, dss->p);
    mp_int *yu2p = mp_modpow(dss->y, u2, dss->p);
    mp_int *gu1yu2p = mp_modmul(gu1p, yu2p, dss->p);
    mp_int *v = mp_mod(gu1yu2p, dss->q);
    /*
     * Step 5. v should now be equal to r.
     */
    toret = mp_cmp_eq(v, r);
    mp_free(w);
    mp_free(sha);
    mp_free(u1);
    mp_free(u2);
    mp_free(gu1p);
    mp_free(yu2p);
    mp_free(gu1yu2p);
    mp_free(v);
    mp_free(r);
    mp_free(s);
    return toret;
}
/* Serialise the public half in the SSH-2 "ssh-dss" wire format. */
static void dss_public_blob(ssh_key *key, BinarySink *bs)
{
    struct dss_key *dss = container_of(key, struct dss_key, sshk);
    put_stringz(bs, "ssh-dss");
    put_mp_ssh2(bs, dss->p);
    put_mp_ssh2(bs, dss->q);
    put_mp_ssh2(bs, dss->g);
    put_mp_ssh2(bs, dss->y);
}
/* Serialise the private half: just the exponent x, as one mpint. */
static void dss_private_blob(ssh_key *key, BinarySink *bs)
{
    struct dss_key *dss = container_of(key, struct dss_key, sshk);
    put_mp_ssh2(bs, dss->x);
}
/*
 * Construct a full DSA key from a public blob plus a private blob
 * (mpint x, optionally followed by a legacy 20-byte hash). Performs
 * two consistency checks before accepting the key: the old-format
 * hash (if present) and the relation y = g^x mod p. Returns NULL on
 * any failure.
 */
static ssh_key *dss_new_priv(const ssh_keyalg *self, ptrlen pub, ptrlen priv)
{
    BinarySource src[1];
    ssh_key *sshk;
    struct dss_key *dss;
    ptrlen hash;
    unsigned char digest[20];
    mp_int *ytest;
    sshk = dss_new_pub(self, pub);
    if (!sshk)
        return NULL;
    dss = container_of(sshk, struct dss_key, sshk);
    BinarySource_BARE_INIT_PL(src, priv);
    dss->x = get_mp_ssh2(src);
    if (get_err(src)) {
        dss_freekey(&dss->sshk);
        return NULL;
    }
    /*
     * Check the obsolete hash in the old DSS key format.
     */
    hash = get_string(src);
    if (hash.len == 20) {
        /* Expected hash: SHA-1 of p, q, g in mpint encoding. */
        ssh_hash *h = ssh_hash_new(&ssh_sha1);
        put_mp_ssh2(h, dss->p);
        put_mp_ssh2(h, dss->q);
        put_mp_ssh2(h, dss->g);
        ssh_hash_final(h, digest);
        if (!smemeq(hash.ptr, digest, 20)) {
            dss_freekey(&dss->sshk);
            return NULL;
        }
    }
    /*
     * Now ensure g^x mod p really is y.
     */
    ytest = mp_modpow(dss->g, dss->x, dss->p);
    if (!mp_cmp_eq(ytest, dss->y)) {
        mp_free(ytest);
        dss_freekey(&dss->sshk);
        return NULL;
    }
    mp_free(ytest);
    return &dss->sshk;
}
/*
 * Construct a DSA key from the OpenSSH private-key layout: mpints
 * p, q, g, y, x read straight from 'src'. Returns NULL if the source
 * underruns or p or q is zero.
 */
static ssh_key *dss_new_priv_openssh(const ssh_keyalg *self,
                                     BinarySource *src)
{
    struct dss_key *dss;
    dss = snew(struct dss_key);
    dss->sshk.vt = &ssh_dss;
    dss->p = get_mp_ssh2(src);
    dss->q = get_mp_ssh2(src);
    dss->g = get_mp_ssh2(src);
    dss->y = get_mp_ssh2(src);
    dss->x = get_mp_ssh2(src);
    if (get_err(src) ||
        mp_eq_integer(dss->q, 0) || mp_eq_integer(dss->p, 0)) {
        /* Invalid key. */
        dss_freekey(&dss->sshk);
        return NULL;
    }
    return &dss->sshk;
}
/* Serialise the whole key in the OpenSSH layout: p, q, g, y, x. */
static void dss_openssh_blob(ssh_key *key, BinarySink *bs)
{
    struct dss_key *dss = container_of(key, struct dss_key, sshk);
    put_mp_ssh2(bs, dss->p);
    put_mp_ssh2(bs, dss->q);
    put_mp_ssh2(bs, dss->g);
    put_mp_ssh2(bs, dss->y);
    put_mp_ssh2(bs, dss->x);
}
/*
 * Report the key size (bit length of the modulus p) of a public key
 * blob, or -1 if the blob doesn't parse as a DSA public key.
 */
static int dss_pubkey_bits(const ssh_keyalg *self, ptrlen pub)
{
    ssh_key *sshk = dss_new_pub(self, pub);
    if (!sshk)
        return -1;
    struct dss_key *dss = container_of(sshk, struct dss_key, sshk);
    int bits = mp_get_nbits(dss->p);
    dss_freekey(&dss->sshk);
    return bits;
}
/*
 * Generate the per-signature nonce k, deterministically, from the
 * private key and the message hash. 'modulus' is the group order q;
 * 'id_string' is a domain-separation label mixed into the hash.
 * Returns k in [2, q), caller frees.
 */
mp_int *dss_gen_k(const char *id_string, mp_int *modulus,
                  mp_int *private_key,
                  unsigned char *digest, int digest_len)
{
    /*
     * The basic DSS signing algorithm is:
     *
     *  - invent a random k between 1 and q-1 (exclusive).
     *  - Compute r = (g^k mod p) mod q.
     *  - Compute s = k^-1 * (hash + x*r) mod q.
     *
     * This has the dangerous properties that:
     *
     *  - if an attacker in possession of the public key _and_ the
     *    signature (for example, the host you just authenticated
     *    to) can guess your k, he can reverse the computation of s
     *    and work out x = r^-1 * (s*k - hash) mod q. That is, he
     *    can deduce the private half of your key, and masquerade
     *    as you for as long as the key is still valid.
     *
     *  - since r is a function purely of k and the public key, if
     *    the attacker only has a _range of possibilities_ for k
     *    it's easy for him to work through them all and check each
     *    one against r; he'll never be unsure of whether he's got
     *    the right one.
     *
     *  - if you ever sign two different hashes with the same k, it
     *    will be immediately obvious because the two signatures
     *    will have the same r, and moreover an attacker in
     *    possession of both signatures (and the public key of
     *    course) can compute k = (hash1-hash2) * (s1-s2)^-1 mod q,
     *    and from there deduce x as before.
     *
     *  - the Bleichenbacher attack on DSA makes use of methods of
     *    generating k which are significantly non-uniformly
     *    distributed; in particular, generating a 160-bit random
     *    number and reducing it mod q is right out.
     *
     * For this reason we must be pretty careful about how we
     * generate our k. Since this code runs on Windows, with no
     * particularly good system entropy sources, we can't trust our
     * RNG itself to produce properly unpredictable data. Hence, we
     * use a totally different scheme instead.
     *
     * What we do is to take a SHA-512 (_big_) hash of the private
     * key x, and then feed this into another SHA-512 hash that
     * also includes the message hash being signed. That is:
     *
     *   proto_k = SHA512 ( SHA512(x) || SHA160(message) )
     *
     * This number is 512 bits long, so reducing it mod q won't be
     * noticeably non-uniform. So
     *
     *   k = proto_k mod q
     *
     * This has the interesting property that it's _deterministic_:
     * signing the same hash twice with the same key yields the
     * same signature.
     *
     * Despite this determinism, it's still not predictable to an
     * attacker, because in order to repeat the SHA-512
     * construction that created it, the attacker would have to
     * know the private key value x - and by assumption he doesn't,
     * because if he knew that he wouldn't be attacking k!
     *
     * (This trick doesn't, _per se_, protect against reuse of k.
     * Reuse of k is left to chance; all it does is prevent
     * _excessively high_ chances of reuse of k due to entropy
     * problems.)
     *
     * Thanks to Colin Plumb for the general idea of using x to
     * ensure k is hard to guess, and to the Cambridge University
     * Computer Security Group for helping to argue out all the
     * fine details.
     */
    ssh_hash *h;
    unsigned char digest512[64];
    /*
     * Hash some identifying text plus x.
     */
    h = ssh_hash_new(&ssh_sha512);
    put_asciz(h, id_string);
    put_mp_ssh2(h, private_key);
    ssh_hash_digest(h, digest512);
    /*
     * Now hash that digest plus the message hash.
     */
    ssh_hash_reset(h);
    put_data(h, digest512, sizeof(digest512));
    put_data(h, digest, digest_len);
    ssh_hash_final(h, digest512);
    /*
     * Now convert the result into a bignum, and coerce it to the
     * range [2,q), which we do by reducing it mod q-2 and adding 2.
     */
    mp_int *modminus2 = mp_copy(modulus);
    mp_sub_integer_into(modminus2, modminus2, 2);
    mp_int *proto_k = mp_from_bytes_be(make_ptrlen(digest512, 64));
    mp_int *k = mp_mod(proto_k, modminus2);
    mp_free(proto_k);
    mp_free(modminus2);
    mp_add_integer_into(k, k, 2);
    /* Wipe the intermediate hash output, which is key-derived. */
    smemclr(digest512, sizeof(digest512));
    return k;
}
/*
 * Produce an RFC 4253 "ssh-dss" signature over 'data': the algorithm
 * name string followed by a 40-byte string holding r and s as two
 * 20-byte big-endian integers. k comes from the deterministic
 * generator above.
 */
static void dss_sign(ssh_key *key, ptrlen data, unsigned flags, BinarySink *bs)
{
    struct dss_key *dss = container_of(key, struct dss_key, sshk);
    unsigned char digest[20];
    int i;
    hash_simple(&ssh_sha1, data, digest);
    mp_int *k = dss_gen_k("DSA deterministic k generator", dss->q, dss->x,
                          digest, sizeof(digest));
    mp_int *kinv = mp_invert(k, dss->q);       /* k^-1 mod q */
    /*
     * Now we have k, so just go ahead and compute the signature.
     */
    mp_int *gkp = mp_modpow(dss->g, k, dss->p); /* g^k mod p */
    mp_int *r = mp_mod(gkp, dss->q);        /* r = (g^k mod p) mod q */
    mp_free(gkp);
    mp_int *hash = mp_from_bytes_be(make_ptrlen(digest, 20));
    mp_int *xr = mp_mul(dss->x, r);
    mp_int *hxr = mp_add(xr, hash);         /* hash + x*r */
    mp_int *s = mp_modmul(kinv, hxr, dss->q); /* s = k^-1 * (hash+x*r) mod q */
    mp_free(hxr);
    mp_free(xr);
    mp_free(kinv);
    mp_free(k);
    mp_free(hash);
    put_stringz(bs, "ssh-dss");
    put_uint32(bs, 40);
    /* r and s, each as exactly 20 big-endian bytes. */
    for (i = 0; i < 20; i++)
        put_byte(bs, mp_get_byte(r, 19 - i));
    for (i = 0; i < 20; i++)
        put_byte(bs, mp_get_byte(s, 19 - i));
    mp_free(r);
    mp_free(s);
}
/* Method table tying the functions above into the ssh_keyalg API. */
const ssh_keyalg ssh_dss = {
    .new_pub = dss_new_pub,
    .new_priv = dss_new_priv,
    .new_priv_openssh = dss_new_priv_openssh,
    .freekey = dss_freekey,
    .invalid = dss_invalid,
    .sign = dss_sign,
    .verify = dss_verify,
    .public_blob = dss_public_blob,
    .private_blob = dss_private_blob,
    .openssh_blob = dss_openssh_blob,
    .cache_str = dss_cache_str,
    .components = dss_components,
    .pubkey_bits = dss_pubkey_bits,
    .ssh_id = "ssh-dss",
    .cache_id = "dss",
};

1171
crypto/ecc-arithmetic.c Normal file

File diff suppressed because it is too large Load Diff

1698
crypto/ecc-ssh.c Normal file

File diff suppressed because it is too large Load Diff

13
crypto/hash_simple.c Normal file
View File

@ -0,0 +1,13 @@
/*
* Convenience function to hash a single piece of data, wrapping up
* the faff of making and freeing an ssh_hash.
*/
#include "ssh.h"
/* One-shot hash of 'data' into 'output' (which must hold alg->hlen
 * bytes): construct, feed, and finalise (which also frees the hash). */
void hash_simple(const ssh_hashalg *alg, ptrlen data, void *output)
{
    ssh_hash *h = ssh_hash_new(alg);
    put_datapl(h, data);
    ssh_hash_final(h, output);
}

257
crypto/hmac.c Normal file
View File

@ -0,0 +1,257 @@
/*
* Implementation of HMAC (RFC 2104) for PuTTY, in a general form that
* can wrap any underlying hash function.
*/
#include "ssh.h"
struct hmac {
    const ssh_hashalg *hashalg;  /* underlying hash (resolved, not selector) */
    /* h_outer/h_inner hold the keyed outer/inner pad states; h_live
     * is the working copy re-seeded from h_inner for each message. */
    ssh_hash *h_outer, *h_inner, *h_live;
    uint8_t *digest;             /* hlen-sized scratch for the full digest */
    strbuf *text_name;           /* human-readable algorithm description */
    ssh2_mac mac;                /* vtable wrapper handed to callers */
};
struct hmac_extra {
    const ssh_hashalg *hashalg_base;  /* hash to wrap (may be a selector) */
    const char *suffix, *annotation;  /* name suffix (e.g. "-96") and note */
};
/*
 * Construct an HMAC instance wrapping the hash named in alg->extra.
 * 'cipher' is unused here (part of the generic ssh2_macalg API).
 */
static ssh2_mac *hmac_new(const ssh2_macalg *alg, ssh_cipher *cipher)
{
    struct hmac *ctx = snew(struct hmac);
    const struct hmac_extra *extra = (const struct hmac_extra *)alg->extra;
    ctx->h_outer = ssh_hash_new(extra->hashalg_base);
    /* In case that hashalg was a selector vtable, we'll now switch to
     * using whatever real one it selected, for all future purposes. */
    ctx->hashalg = ssh_hash_alg(ctx->h_outer);
    ctx->h_inner = ssh_hash_new(ctx->hashalg);
    ctx->h_live = ssh_hash_new(ctx->hashalg);
    /*
     * HMAC is not well defined as a wrapper on an absolutely general
     * hash function; it expects that the function it's wrapping will
     * consume data in fixed-size blocks, and it's partially defined
     * in terms of that block size. So we insist that the hash we're
     * given must have defined a meaningful block size.
     */
    assert(ctx->hashalg->blocklen);
    ctx->digest = snewn(ctx->hashalg->hlen, uint8_t);
    /* Build the display name, e.g. "HMAC-SHA-1-96 (bug-compatible)". */
    ctx->text_name = strbuf_new();
    strbuf_catf(ctx->text_name, "HMAC-%s%s",
                ctx->hashalg->text_basename, extra->suffix);
    if (extra->annotation || ctx->hashalg->annotation) {
        strbuf_catf(ctx->text_name, " (");
        const char *sep = "";
        if (extra->annotation) {
            strbuf_catf(ctx->text_name, "%s%s", sep, extra->annotation);
            sep = ", ";
        }
        if (ctx->hashalg->annotation) {
            strbuf_catf(ctx->text_name, "%s%s", sep, ctx->hashalg->annotation);
            sep = ", ";
        }
        strbuf_catf(ctx->text_name, ")");
    }
    ctx->mac.vt = alg;
    /* Data written to the MAC object is forwarded to the live hash. */
    BinarySink_DELEGATE_INIT(&ctx->mac, ctx->h_live);
    return &ctx->mac;
}
/* Free an HMAC instance, scrubbing the digest buffer and the context
 * itself (both can contain key-derived material). */
static void hmac_free(ssh2_mac *mac)
{
    struct hmac *ctx = container_of(mac, struct hmac, mac);
    ssh_hash_free(ctx->h_outer);
    ssh_hash_free(ctx->h_inner);
    ssh_hash_free(ctx->h_live);
    smemclr(ctx->digest, ctx->hashalg->hlen);
    sfree(ctx->digest);
    strbuf_free(ctx->text_name);
    smemclr(ctx, sizeof(*ctx));
    sfree(ctx);
}
/* XOR pads from RFC 2104: opad and ipad. */
#define PAD_OUTER 0x5C
#define PAD_INNER 0x36
/*
 * Set (or reset) the MAC key: precompute the outer and inner keyed
 * hash states, so that each message only has to continue from the
 * inner state.
 */
static void hmac_key(ssh2_mac *mac, ptrlen key)
{
    struct hmac *ctx = container_of(mac, struct hmac, mac);
    const uint8_t *kp;
    size_t klen;
    strbuf *sb = NULL;
    if (key.len > ctx->hashalg->blocklen) {
        /*
         * RFC 2104 section 2: if the key exceeds the block length of
         * the underlying hash, then we start by hashing the key, and
         * use that hash as the 'true' key for the HMAC construction.
         */
        sb = strbuf_new_nm();
        strbuf_append(sb, ctx->hashalg->hlen);
        hash_simple(ctx->hashalg, key, sb->u);
        kp = sb->u;
        klen = sb->len;
    } else {
        /*
         * A short enough key is used as is.
         */
        kp = (const uint8_t *)key.ptr;
        klen = key.len;
    }
    /* Outer state: hash of (key XOR opad), zero-padded to blocklen. */
    ssh_hash_reset(ctx->h_outer);
    for (size_t i = 0; i < klen; i++)
        put_byte(ctx->h_outer, PAD_OUTER ^ kp[i]);
    for (size_t i = klen; i < ctx->hashalg->blocklen; i++)
        put_byte(ctx->h_outer, PAD_OUTER);
    /* Inner state: likewise with ipad. */
    ssh_hash_reset(ctx->h_inner);
    for (size_t i = 0; i < klen; i++)
        put_byte(ctx->h_inner, PAD_INNER ^ kp[i]);
    for (size_t i = klen; i < ctx->hashalg->blocklen; i++)
        put_byte(ctx->h_inner, PAD_INNER);
    if (sb)
        strbuf_free(sb);
}
/* Begin a new message: reload the live hash from the precomputed
 * inner keyed state. */
static void hmac_start(ssh2_mac *mac)
{
    struct hmac *hm = container_of(mac, struct hmac, mac);
    ssh_hash_copyfrom(hm->h_live, hm->h_inner);
}
/*
 * Produce the MAC for the data written so far, non-destructively
 * (the live state survives, so more data can be appended and the
 * result regenerated).
 */
static void hmac_genresult(ssh2_mac *mac, unsigned char *output)
{
    struct hmac *ctx = container_of(mac, struct hmac, mac);
    ssh_hash *htmp;
    /* Leave h_live and h_outer in place, so that the SSH-2 BPP can
     * continue regenerating test results from different-length
     * prefixes of the packet */
    ssh_hash_digest_nondestructive(ctx->h_live, ctx->digest);
    /* Outer pass: hash the inner digest under the outer keyed state. */
    htmp = ssh_hash_copy(ctx->h_outer);
    put_data(htmp, ctx->digest, ctx->hashalg->hlen);
    ssh_hash_final(htmp, ctx->digest);
    /*
     * Some instances of HMAC truncate the output hash, so instead of
     * writing it directly to 'output' we wrote it to our own
     * full-length buffer, and now we copy the required amount.
     */
    memcpy(output, ctx->digest, mac->vt->len);
    smemclr(ctx->digest, ctx->hashalg->hlen);
}
/* Return the human-readable name built at construction time. */
static const char *hmac_text_name(ssh2_mac *mac)
{
    struct hmac *hm = container_of(mac, struct hmac, mac);
    return hm->text_name->s;
}
/* Concrete HMAC algorithm instances. '.len' is the (possibly
 * truncated) MAC length on the wire; '.keylen' the key bytes used. */
static const struct hmac_extra ssh_hmac_sha256_extra = { &ssh_sha256, "" };
const ssh2_macalg ssh_hmac_sha256 = {
    .new = hmac_new,
    .free = hmac_free,
    .setkey = hmac_key,
    .start = hmac_start,
    .genresult = hmac_genresult,
    .text_name = hmac_text_name,
    .name = "hmac-sha2-256",
    .etm_name = "hmac-sha2-256-etm@openssh.com",
    .len = 32,
    .keylen = 32,
    .extra = &ssh_hmac_sha256_extra,
};
static const struct hmac_extra ssh_hmac_md5_extra = { &ssh_md5, "" };
const ssh2_macalg ssh_hmac_md5 = {
    .new = hmac_new,
    .free = hmac_free,
    .setkey = hmac_key,
    .start = hmac_start,
    .genresult = hmac_genresult,
    .text_name = hmac_text_name,
    .name = "hmac-md5",
    .etm_name = "hmac-md5-etm@openssh.com",
    .len = 16,
    .keylen = 16,
    .extra = &ssh_hmac_md5_extra,
};
static const struct hmac_extra ssh_hmac_sha1_extra = { &ssh_sha1, "" };
const ssh2_macalg ssh_hmac_sha1 = {
    .new = hmac_new,
    .free = hmac_free,
    .setkey = hmac_key,
    .start = hmac_start,
    .genresult = hmac_genresult,
    .text_name = hmac_text_name,
    .name = "hmac-sha1",
    .etm_name = "hmac-sha1-etm@openssh.com",
    .len = 20,
    .keylen = 20,
    .extra = &ssh_hmac_sha1_extra,
};
/* hmac-sha1-96: full SHA-1 HMAC truncated to 96 bits on the wire. */
static const struct hmac_extra ssh_hmac_sha1_96_extra = { &ssh_sha1, "-96" };
const ssh2_macalg ssh_hmac_sha1_96 = {
    .new = hmac_new,
    .free = hmac_free,
    .setkey = hmac_key,
    .start = hmac_start,
    .genresult = hmac_genresult,
    .text_name = hmac_text_name,
    .name = "hmac-sha1-96",
    .etm_name = "hmac-sha1-96-etm@openssh.com",
    .len = 12,
    .keylen = 20,
    .extra = &ssh_hmac_sha1_96_extra,
};
/* "bug-compatible" variants: same wire name but only 16 key bytes,
 * and no ETM counterpart. */
static const struct hmac_extra ssh_hmac_sha1_buggy_extra = {
    &ssh_sha1, "", "bug-compatible"
};
const ssh2_macalg ssh_hmac_sha1_buggy = {
    .new = hmac_new,
    .free = hmac_free,
    .setkey = hmac_key,
    .start = hmac_start,
    .genresult = hmac_genresult,
    .text_name = hmac_text_name,
    .name = "hmac-sha1",
    .len = 20,
    .keylen = 16,
    .extra = &ssh_hmac_sha1_buggy_extra,
};
static const struct hmac_extra ssh_hmac_sha1_96_buggy_extra = {
    &ssh_sha1, "-96", "bug-compatible"
};
const ssh2_macalg ssh_hmac_sha1_96_buggy = {
    .new = hmac_new,
    .free = hmac_free,
    .setkey = hmac_key,
    .start = hmac_start,
    .genresult = hmac_genresult,
    .text_name = hmac_text_name,
    .name = "hmac-sha1-96",
    .len = 12,
    .keylen = 16,
    .extra = &ssh_hmac_sha1_96_buggy_extra,
};

43
crypto/mac.c Normal file
View File

@ -0,0 +1,43 @@
/*
* Centralised parts of the SSH-2 MAC API, which don't need to vary
* with the MAC implementation.
*/
#include <assert.h>
#include "ssh.h"
/*
 * Compare a candidate MAC against the one this object would generate
 * for the data already fed in. Uses constant-time comparison
 * (smemeq) and scrubs the computed value afterwards.
 */
bool ssh2_mac_verresult(ssh2_mac *mac, const void *candidate)
{
    unsigned char correct[64]; /* at least as big as all known MACs */
    bool toret;
    assert(mac->vt->len <= sizeof(correct));
    ssh2_mac_genresult(mac, correct);
    toret = smemeq(correct, candidate, mac->vt->len);
    smemclr(correct, sizeof(correct));
    return toret;
}
/* Feed an SSH-2 packet into the MAC: 32-bit sequence number first,
 * then the packet data, per the SSH-2 MAC input format. */
static void ssh2_mac_prepare(ssh2_mac *mac, const void *blk, int len,
                             unsigned long seq)
{
    ssh2_mac_start(mac);
    put_uint32(mac, seq);
    put_data(mac, blk, len);
}
/* MAC a packet, writing the MAC to the 'len' bytes after the data. */
void ssh2_mac_generate(ssh2_mac *mac, void *blk, int len, unsigned long seq)
{
    ssh2_mac_prepare(mac, blk, len, seq);
    ssh2_mac_genresult(mac, (unsigned char *)blk + len);
}
/* Verify a packet whose received MAC follows the data in memory. */
bool ssh2_mac_verify(
    ssh2_mac *mac, const void *blk, int len, unsigned long seq)
{
    ssh2_mac_prepare(mac, blk, len, seq);
    return ssh2_mac_verresult(mac, (const unsigned char *)blk + len);
}

16
crypto/mac_simple.c Normal file
View File

@ -0,0 +1,16 @@
/*
* Convenience function to MAC a single piece of data, wrapping up
* the faff of making and freeing an ssh_mac.
*/
#include "ssh.h"
/* One-shot MAC of 'data' under 'key', writing alg->len bytes to
 * 'output'; constructs and frees a temporary ssh2_mac internally. */
void mac_simple(const ssh2_macalg *alg, ptrlen key, ptrlen data, void *output)
{
    ssh2_mac *m = ssh2_mac_new(alg, NULL);
    ssh2_mac_setkey(m, key);
    ssh2_mac_start(m);
    put_datapl(m, data);
    ssh2_mac_genresult(m, output);
    ssh2_mac_free(m);
}

245
crypto/md5.c Normal file
View File

@ -0,0 +1,245 @@
/*
* MD5 implementation for PuTTY. Written directly from the spec by
* Simon Tatham.
*/
#include <assert.h>
#include "ssh.h"
/* MD5 initial chaining values A-D (RFC 1321 section 3.3). */
static const uint32_t md5_initial_state[] = {
    0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476,
};
/* Per-step parameters for the 64 MD5 steps: additive constant,
 * left-rotation amount, and which message word to mix in. */
static const struct md5_round_constant {
    uint32_t addition, rotation, msg_index;
} md5_round_constants[] = {
    { 0xd76aa478, 7, 0 }, { 0xe8c7b756, 12, 1 },
    { 0x242070db, 17, 2 }, { 0xc1bdceee, 22, 3 },
    { 0xf57c0faf, 7, 4 }, { 0x4787c62a, 12, 5 },
    { 0xa8304613, 17, 6 }, { 0xfd469501, 22, 7 },
    { 0x698098d8, 7, 8 }, { 0x8b44f7af, 12, 9 },
    { 0xffff5bb1, 17, 10 }, { 0x895cd7be, 22, 11 },
    { 0x6b901122, 7, 12 }, { 0xfd987193, 12, 13 },
    { 0xa679438e, 17, 14 }, { 0x49b40821, 22, 15 },
    { 0xf61e2562, 5, 1 }, { 0xc040b340, 9, 6 },
    { 0x265e5a51, 14, 11 }, { 0xe9b6c7aa, 20, 0 },
    { 0xd62f105d, 5, 5 }, { 0x02441453, 9, 10 },
    { 0xd8a1e681, 14, 15 }, { 0xe7d3fbc8, 20, 4 },
    { 0x21e1cde6, 5, 9 }, { 0xc33707d6, 9, 14 },
    { 0xf4d50d87, 14, 3 }, { 0x455a14ed, 20, 8 },
    { 0xa9e3e905, 5, 13 }, { 0xfcefa3f8, 9, 2 },
    { 0x676f02d9, 14, 7 }, { 0x8d2a4c8a, 20, 12 },
    { 0xfffa3942, 4, 5 }, { 0x8771f681, 11, 8 },
    { 0x6d9d6122, 16, 11 }, { 0xfde5380c, 23, 14 },
    { 0xa4beea44, 4, 1 }, { 0x4bdecfa9, 11, 4 },
    { 0xf6bb4b60, 16, 7 }, { 0xbebfbc70, 23, 10 },
    { 0x289b7ec6, 4, 13 }, { 0xeaa127fa, 11, 0 },
    { 0xd4ef3085, 16, 3 }, { 0x04881d05, 23, 6 },
    { 0xd9d4d039, 4, 9 }, { 0xe6db99e5, 11, 12 },
    { 0x1fa27cf8, 16, 15 }, { 0xc4ac5665, 23, 2 },
    { 0xf4292244, 6, 0 }, { 0x432aff97, 10, 7 },
    { 0xab9423a7, 15, 14 }, { 0xfc93a039, 21, 5 },
    { 0x655b59c3, 6, 12 }, { 0x8f0ccc92, 10, 3 },
    { 0xffeff47d, 15, 10 }, { 0x85845dd1, 21, 1 },
    { 0x6fa87e4f, 6, 8 }, { 0xfe2ce6e0, 10, 15 },
    { 0xa3014314, 15, 6 }, { 0x4e0811a1, 21, 13 },
    { 0xf7537e82, 6, 4 }, { 0xbd3af235, 10, 11 },
    { 0x2ad7d2bb, 15, 2 }, { 0xeb86d391, 21, 9 },
};
typedef struct md5_block md5_block;
/* Accumulates input into 64-byte chunks for the compression function. */
struct md5_block {
    uint8_t block[64];  /* partial block being assembled */
    size_t used;        /* bytes of 'block' currently filled */
    uint64_t len;       /* total bytes hashed so far (for final padding) */
};
/* Reset the block buffer to the start-of-message state. */
static inline void md5_block_setup(md5_block *blk)
{
    blk->used = 0;
    blk->len = 0;
}
/*
 * Consume up to one block's worth of data from (*vdata, *len),
 * advancing both. Returns true when a full 64-byte block has been
 * assembled, in which case the caller must process blk->block before
 * calling again ('used' is reset to 0 here).
 */
static inline bool md5_block_write(
    md5_block *blk, const void **vdata, size_t *len)
{
    size_t blkleft = sizeof(blk->block) - blk->used;
    size_t chunk = *len < blkleft ? *len : blkleft;
    const uint8_t *p = *vdata;
    memcpy(blk->block + blk->used, p, chunk);
    *vdata = p + chunk;
    *len -= chunk;
    blk->used += chunk;
    blk->len += chunk;
    if (blk->used == sizeof(blk->block)) {
        blk->used = 0;
        return true;
    }
    return false;
}
/*
 * Append the standard MD5 final padding via 'bs' (which feeds back
 * into the hash): a 0x80 byte, zeroes up to 56 mod 64, then the total
 * message length in bits as a 64-bit little-endian integer.
 */
static inline void md5_block_pad(md5_block *blk, BinarySink *bs)
{
    uint64_t final_len = blk->len << 3;  /* message length in bits */
    size_t pad = 63 & (55 - blk->used);  /* zeroes needed after the 0x80 */
    put_byte(bs, 0x80);
    put_padding(bs, pad, 0);
    unsigned char buf[8];
    PUT_64BIT_LSB_FIRST(buf, final_len);
    put_data(bs, buf, 8);
    smemclr(buf, 8);
    assert(blk->used == 0 && "Should have exactly hit a block boundary");
}
/*
 * Rotate a 32-bit word left by y bits (y taken mod 32). Written so
 * that no shift count ever reaches the full word width, which would
 * be undefined behaviour in C.
 */
static inline uint32_t rol(uint32_t x, unsigned y)
{
    unsigned sh = y & 31;
    return sh ? ((x << sh) | (x >> (32 - sh))) : x;
}
/*
 * Bitwise choice: each output bit is the corresponding bit of 'if1'
 * where 'ctrl' has a 1 bit, and of 'if0' where it has a 0 bit.
 */
static inline uint32_t Ch(uint32_t ctrl, uint32_t if1, uint32_t if0)
{
    return (ctrl & if1) | (~ctrl & if0);
}
/* Parameter functions for the four MD5 round types (RFC 1321 sec 3.4):
 * F and G are bit-selections expressed via Ch; H is parity; I is the
 * fourth-round mixing function. */
static inline uint32_t F(uint32_t x, uint32_t y, uint32_t z)
{ return Ch(x, y, z); }
static inline uint32_t G(uint32_t x, uint32_t y, uint32_t z)
{ return Ch(z, x, y); }
static inline uint32_t H(uint32_t x, uint32_t y, uint32_t z)
{ return x ^ y ^ z; }
static inline uint32_t I(uint32_t x, uint32_t y, uint32_t z)
{ return y ^ (x | ~z); }
/*
 * One MD5 step: mix a message word and round constant into *a using
 * round function f, rotate, and add *b (per RFC 1321). The caller
 * rotates which of a,b,c,d plays each role from step to step.
 */
static inline void md5_round(
    unsigned round_index, const uint32_t *message,
    uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
    uint32_t (*f)(uint32_t, uint32_t, uint32_t))
{
    struct md5_round_constant rc = md5_round_constants[round_index];
    *a = *b + rol(*a + f(*b, *c, *d) + message[rc.msg_index] + rc.addition,
                  rc.rotation);
}
/*
 * The MD5 compression function: process one 64-byte block, updating
 * the four-word chaining state 'core' in place.
 */
static void md5_do_block(uint32_t *core, const uint8_t *block)
{
    /* Decode the block into 16 little-endian 32-bit words. */
    uint32_t message_words[16];
    for (size_t i = 0; i < 16; i++)
        message_words[i] = GET_32BIT_LSB_FIRST(block + 4*i);
    uint32_t a = core[0], b = core[1], c = core[2], d = core[3];
    /* Four rounds of 16 steps each, one round per function F,G,H,I;
     * 't' walks through the 64 entries of md5_round_constants. */
    size_t t = 0;
    for (size_t u = 0; u < 4; u++) {
        md5_round(t++, message_words, &a, &b, &c, &d, F);
        md5_round(t++, message_words, &d, &a, &b, &c, F);
        md5_round(t++, message_words, &c, &d, &a, &b, F);
        md5_round(t++, message_words, &b, &c, &d, &a, F);
    }
    for (size_t u = 0; u < 4; u++) {
        md5_round(t++, message_words, &a, &b, &c, &d, G);
        md5_round(t++, message_words, &d, &a, &b, &c, G);
        md5_round(t++, message_words, &c, &d, &a, &b, G);
        md5_round(t++, message_words, &b, &c, &d, &a, G);
    }
    for (size_t u = 0; u < 4; u++) {
        md5_round(t++, message_words, &a, &b, &c, &d, H);
        md5_round(t++, message_words, &d, &a, &b, &c, H);
        md5_round(t++, message_words, &c, &d, &a, &b, H);
        md5_round(t++, message_words, &b, &c, &d, &a, H);
    }
    for (size_t u = 0; u < 4; u++) {
        md5_round(t++, message_words, &a, &b, &c, &d, I);
        md5_round(t++, message_words, &d, &a, &b, &c, I);
        md5_round(t++, message_words, &c, &d, &a, &b, I);
        md5_round(t++, message_words, &b, &c, &d, &a, I);
    }
    /* Feed-forward: add the round outputs back into the chain state. */
    core[0] += a;
    core[1] += b;
    core[2] += c;
    core[3] += d;
    smemclr(message_words, sizeof(message_words));
}
/* Full MD5 hash object implementing the ssh_hash interface. */
typedef struct md5 {
    uint32_t core[4];   /* chaining state A-D */
    md5_block blk;      /* input buffering */
    BinarySink_IMPLEMENTATION;
    ssh_hash hash;
} md5;
static void md5_write(BinarySink *bs, const void *vp, size_t len);
/* Allocate an MD5 ssh_hash. Note the state is not initialised here;
 * the ssh_hash API calls reset before use. */
static ssh_hash *md5_new(const ssh_hashalg *alg)
{
    md5 *s = snew(md5);
    s->hash.vt = alg;
    BinarySink_INIT(s, md5_write);
    BinarySink_DELEGATE_INIT(&s->hash, s);
    return &s->hash;
}
/* Reset to the start-of-message state: load the standard MD5 initial
 * values and empty the block buffer. */
static void md5_reset(ssh_hash *hash)
{
    md5 *ctx = container_of(hash, md5, hash);
    memcpy(ctx->core, md5_initial_state, sizeof(ctx->core));
    md5_block_setup(&ctx->blk);
}
/* Clone the full hash state of horig into hcopy, then repair the
 * internal self-pointers that the raw memcpy duplicated. */
static void md5_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
    md5 *copy = container_of(hcopy, md5, hash);
    md5 *orig = container_of(horig, md5, hash);
    memcpy(copy, orig, sizeof(*copy));
    BinarySink_COPIED(copy);
    BinarySink_DELEGATE_INIT(&copy->hash, copy);
}
/* Dispose of the hash object, scrubbing the state first since it may
 * hold sensitive partially-hashed data. */
static void md5_free(ssh_hash *hash)
{
    md5 *ctx = container_of(hash, md5, hash);
    smemclr(ctx, sizeof(*ctx));
    sfree(ctx);
}
/* BinarySink write hook: buffer incoming data and run the compression
 * function each time a full 64-byte block is assembled. */
static void md5_write(BinarySink *bs, const void *vp, size_t len)
{
    md5 *s = BinarySink_DOWNCAST(bs, md5);
    while (len > 0)
        if (md5_block_write(&s->blk, &vp, &len))
            md5_do_block(s->core, s->blk.block);
}
/* Finalise: append the MD5 padding (writing back through our own
 * BinarySink processes any blocks it completes), then emit the state
 * as 16 little-endian bytes. */
static void md5_digest(ssh_hash *hash, uint8_t *digest)
{
    md5 *s = container_of(hash, md5, hash);
    md5_block_pad(&s->blk, BinarySink_UPCAST(s));
    for (size_t i = 0; i < 4; i++)
        PUT_32BIT_LSB_FIRST(digest + 4*i, s->core[i]);
}
/* Method table exposing MD5 through the generic ssh_hashalg API. */
const ssh_hashalg ssh_md5 = {
    .new = md5_new,
    .reset = md5_reset,
    .copyfrom = md5_copyfrom,
    .digest = md5_digest,
    .free = md5_free,
    .hlen = 16,         /* 128-bit digest */
    .blocklen = 64,
    HASHALG_NAMES_BARE("MD5"),
};

2650
crypto/mpint.c Normal file

File diff suppressed because it is too large Load Diff

287
crypto/prng.c Normal file
View File

@ -0,0 +1,287 @@
/*
* PuTTY's cryptographic pseudorandom number generator.
*
* This module just defines the PRNG object type and its methods. The
* usual global instance of it is managed by sshrand.c.
*/
#include "putty.h"
#include "ssh.h"
#include "mpint_i.h"
#ifdef PRNG_DIAGNOSTICS
#define prngdebug debug
#else
#define prngdebug(...) ((void)0)
#endif
/*
* This random number generator is based on the 'Fortuna' design by
* Niels Ferguson and Bruce Schneier. The biggest difference is that I
* use SHA-256 in place of a block cipher: the generator side of the
* system works by computing HASH(key || counter) instead of
* ENCRYPT(counter, key).
*
* Rationale: the Fortuna description itself suggests that using
* SHA-256 would be nice but people wouldn't accept it because it's
* too slow - but PuTTY isn't a heavy enough user of random numbers to
* make that a serious worry. In fact even with SHA-256 this generator
* is faster than the one we previously used. Also the Fortuna
* description worries about periodic rekeying to avoid the barely
* detectable pattern of never repeating a cipher block - but with
* SHA-256, even that shouldn't be a worry, because the output
* 'blocks' are twice the size, and also SHA-256 has no guarantee of
* bijectivity, so it surely _could_ be possible to generate the same
* block from two counter values. Thirdly, Fortuna has to have a hash
* function anyway, for reseeding and entropy collection, so reusing
* the same one means it only depends on one underlying primitive and
* can be easily reinstantiated with a larger hash function if you
* decide you'd like to do that on a particular occasion.
*/
#define NCOLLECTORS 32
#define RESEED_DATA_SIZE 64
/* Internal state of one Fortuna-style PRNG instance. */
typedef struct prng_impl prng_impl;
struct prng_impl {
    prng Prng;                      /* public-facing part, handed to clients */
    const ssh_hashalg *hashalg;     /* hash used for all three roles below */
    /*
     * Generation side:
     *
     * 'generator' is a hash object with the current key preloaded
     * into it. The counter-mode generation is achieved by copying
     * that hash object, appending the counter value to the copy, and
     * calling ssh_hash_final.
     */
    ssh_hash *generator;
    BignumInt counter[128 / BIGNUM_INT_BITS];
    /*
     * When re-seeding the generator, you call prng_seed_begin(),
     * which sets up a hash object in 'keymaker'. You write your new
     * seed data into it (which you can do by calling put_data on the
     * PRNG object itself) and then call prng_seed_finish(), which
     * finalises this hash and uses the output to set up the new
     * generator.
     *
     * The keymaker hash preimage includes the previous key, so if you
     * just want to change keys for the sake of not keeping the same
     * one for too long, you don't have to put any extra seed data in
     * at all.
     */
    ssh_hash *keymaker;
    /*
     * Collection side:
     *
     * There are NCOLLECTORS hash objects collecting entropy. Each
     * separately numbered entropy source puts its output into those
     * hash objects in the order 0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,...,
     * that is to say, each entropy source has a separate counter
     * which is incremented every time that source generates an event,
     * and the event data is added to the collector corresponding to
     * the index of the lowest set bit in the current counter value.
     *
     * Whenever collector #0 has at least RESEED_DATA_SIZE bytes (and
     * it's been at least 100ms since the last reseed), the PRNG is
     * reseeded, with seed data on reseed #n taken from the first j
     * collectors, where j is one more than the number of factors of 2
     * in n. That is, collector #0 is used in every reseed; #1 in
     * every other one, #2 in every fourth, etc.
     *
     * 'until_reseed' counts the amount of data that still needs to be
     * added to collector #0 before a reseed will be triggered.
     */
    uint32_t source_counters[NOISE_MAX_SOURCES];
    ssh_hash *collectors[NCOLLECTORS];
    size_t until_reseed;
    uint32_t reseeds;               /* total reseeds so far */
    uint64_t last_reseed_time;      /* for the 100ms rate limit */
};
static void prng_seed_BinarySink_write(
BinarySink *bs, const void *data, size_t len);
prng *prng_new(const ssh_hashalg *hashalg)
{
    /*
     * Create a PRNG in its initial, unseeded state: no generator or
     * keymaker hash yet, a zero counter, and one empty collector hash
     * per Fortuna pool. Callers must seed it before reading.
     */
    prng_impl *p = snew(prng_impl);
    memset(p, 0, sizeof(prng_impl));

    p->hashalg = hashalg;
    p->generator = NULL;
    p->keymaker = NULL;
    p->until_reseed = 0;
    memset(p->counter, 0, sizeof(p->counter));

    for (size_t i = 0; i < NCOLLECTORS; i++)
        p->collectors[i] = ssh_hash_new(p->hashalg);

    BinarySink_INIT(&p->Prng, prng_seed_BinarySink_write);
    p->Prng.savesize = p->hashalg->hlen * 4;
    return &p->Prng;
}
void prng_free(prng *pr)
{
    /* Tear down a PRNG, wiping all counter and key material on the way. */
    prng_impl *pi = container_of(pr, prng_impl, Prng);
    smemclr(pi->counter, sizeof(pi->counter));
    for (size_t i = 0; i < NCOLLECTORS; i++)
        ssh_hash_free(pi->collectors[i]);
    if (pi->generator)
        ssh_hash_free(pi->generator);
    if (pi->keymaker)
        ssh_hash_free(pi->keymaker);
    smemclr(pi, sizeof(*pi));
    sfree(pi);
}
void prng_seed_begin(prng *pr)
{
    /* Start a reseed: set up 'keymaker' to receive new seed data. */
    prng_impl *pi = container_of(pr, prng_impl, Prng);
    assert(!pi->keymaker);
    prngdebug("prng: reseed begin\n");
    /*
     * Make a hash instance that will generate the key for the new one.
     */
    if (pi->generator) {
        /* Reuse the old generator, so the previous key contributes to
         * the new one. */
        pi->keymaker = pi->generator;
        pi->generator = NULL;
    } else {
        pi->keymaker = ssh_hash_new(pi->hashalg);
    }
    /* 'R' domain-separates rekey preimages from the 'G' used when
     * generating output in prng_generate. */
    put_byte(pi->keymaker, 'R');
}
static void prng_seed_BinarySink_write(
    BinarySink *bs, const void *data, size_t len)
{
    /* BinarySink backend for the PRNG object: seed data written to the
     * PRNG goes straight into the keymaker hash. Only valid between
     * prng_seed_begin and prng_seed_finish. */
    prng *pr = BinarySink_DOWNCAST(bs, prng);
    prng_impl *pi = container_of(pr, prng_impl, Prng);
    assert(pi->keymaker);
    prngdebug("prng: got %"SIZEu" bytes of seed\n", len);
    put_data(pi->keymaker, data, len);
}
void prng_seed_finish(prng *pr)
{
    /* Finish a reseed: derive the new key and install a new generator. */
    prng_impl *pi = container_of(pr, prng_impl, Prng);
    unsigned char buf[MAX_HASH_LEN];
    assert(pi->keymaker);
    prngdebug("prng: reseed finish\n");
    /*
     * Actually generate the key.
     */
    ssh_hash_final(pi->keymaker, buf);
    pi->keymaker = NULL;
    /*
     * Load that key into a fresh hash instance, which will become the
     * new generator.
     */
    assert(!pi->generator);
    pi->generator = ssh_hash_new(pi->hashalg);
    put_data(pi->generator, buf, pi->hashalg->hlen);
    /* Re-arm the reseed threshold and the 100ms rate-limit timer. */
    pi->until_reseed = RESEED_DATA_SIZE;
    pi->last_reseed_time = prng_reseed_time_ms();
    smemclr(buf, sizeof(buf));
}
static inline void prng_generate(prng_impl *pi, void *outbuf)
{
    /* One counter-mode step: outbuf = HASH(key || 'G' || counter),
     * then increment the 128-bit counter. outbuf must hold hlen bytes. */
    ssh_hash *h = ssh_hash_copy(pi->generator);
    prngdebug("prng_generate\n");
    put_byte(h, 'G');
    /* Serialise the counter one byte at a time, low bits first. */
    for (unsigned i = 0; i < 128; i += 8)
        put_byte(h, pi->counter[i/BIGNUM_INT_BITS] >> (i%BIGNUM_INT_BITS));
    BignumCarry c = 1;
    for (unsigned i = 0; i < lenof(pi->counter); i++)
        BignumADC(pi->counter[i], c, pi->counter[i], 0, c);
    ssh_hash_final(h, outbuf);
}
void prng_read(prng *pr, void *vout, size_t size)
{
    /* Fill vout with 'size' output bytes, one hash-length block at a time. */
    prng_impl *pi = container_of(pr, prng_impl, Prng);
    unsigned char buf[MAX_HASH_LEN];
    assert(!pi->keymaker);
    prngdebug("prng_read %"SIZEu"\n", size);
    uint8_t *out = (uint8_t *)vout;
    while (size > 0) {
        prng_generate(pi, buf);
        size_t to_use = size > pi->hashalg->hlen ? pi->hashalg->hlen : size;
        memcpy(out, buf, to_use);
        out += to_use;
        size -= to_use;
    }
    smemclr(buf, sizeof(buf));
    /* Rekey after every read, so a later compromise of the state can't
     * reconstruct output we've already handed out. */
    prng_seed_begin(&pi->Prng);
    prng_seed_finish(&pi->Prng);
}
void prng_add_entropy(prng *pr, unsigned source_id, ptrlen data)
{
    /*
     * Route one entropy event from 'source_id' into a collector pool,
     * and trigger a Fortuna reseed when collector #0 has accumulated
     * enough data (rate-limited to once per 100ms).
     */
    prng_impl *pi = container_of(pr, prng_impl, Prng);

    assert(source_id < NOISE_MAX_SOURCES);
    uint32_t counter = ++pi->source_counters[source_id];

    /* The lowest set bit of this source's event counter selects the
     * destination collector (the 0,1,0,2,0,1,0,3,... schedule). */
    size_t index = 0;
    while (index+1 < NCOLLECTORS && !(counter & 1)) {
        counter >>= 1;
        index++;
    }

    /* 'index' is a size_t, so %zu is the matching conversion (the %zi
     * previously used here is the signed 'z' conversion). */
    prngdebug("prng_add_entropy source=%u size=%"SIZEu" -> collector %zu\n",
              source_id, data.len, index);

    put_datapl(pi->collectors[index], data);

    if (index == 0)
        pi->until_reseed = (pi->until_reseed < data.len ? 0 :
                            pi->until_reseed - data.len);

    if (pi->until_reseed == 0 &&
        prng_reseed_time_ms() - pi->last_reseed_time >= 100) {
        prng_seed_begin(&pi->Prng);

        unsigned char buf[MAX_HASH_LEN];
        uint32_t reseed_index = ++pi->reseeds;
        prngdebug("prng entropy reseed #%"PRIu32"\n", reseed_index);
        /* Reseed #n empties collector #i for every power of two 2^i
         * dividing n: #0 every time, #1 every other time, and so on. */
        for (size_t i = 0; i < NCOLLECTORS; i++) {
            prngdebug("emptying collector %"SIZEu"\n", i);
            ssh_hash_digest(pi->collectors[i], buf);
            put_data(&pi->Prng, buf, pi->hashalg->hlen);
            ssh_hash_reset(pi->collectors[i]);
            if (reseed_index & 1)
                break;
            reseed_index >>= 1;
        }

        smemclr(buf, sizeof(buf));
        prng_seed_finish(&pi->Prng);
    }
}
size_t prng_seed_bits(prng *pr)
{
    /* Seed size we advertise to callers: one full hash output, in bits. */
    prng_impl *impl = container_of(pr, prng_impl, Prng);
    return 8 * impl->hashalg->hlen;
}

32
crypto/pubkey-pem.c Normal file
View File

@ -0,0 +1,32 @@
/*
* Convenience functions to encrypt and decrypt OpenSSH PEM format for
* SSH-2 private key files. This uses triple-DES in SSH-2 style (one
* CBC layer), with three distinct keys, and an IV also generated from
* the passphrase.
*/
#include "ssh.h"
static ssh_cipher *des3_pubkey_ossh_cipher(const void *vkey, const void *viv)
{
    /* Build a single-CBC-layer 3DES cipher, keyed and IV'd for PEM use. */
    ssh_cipher *cipher = ssh_cipher_new(&ssh_3des_ssh2);
    ssh_cipher_setkey(cipher, vkey);
    ssh_cipher_setiv(cipher, viv);
    return cipher;
}
void des3_decrypt_pubkey_ossh(const void *vkey, const void *viv,
                              void *vblk, int len)
{
    /* One-shot decryption: make the cipher, run it once, destroy it. */
    ssh_cipher *cipher = des3_pubkey_ossh_cipher(vkey, viv);
    ssh_cipher_decrypt(cipher, vblk, len);
    ssh_cipher_free(cipher);
}
void des3_encrypt_pubkey_ossh(const void *vkey, const void *viv,
                              void *vblk, int len)
{
    /* One-shot encryption: make the cipher, run it once, destroy it. */
    ssh_cipher *cipher = des3_pubkey_ossh_cipher(vkey, viv);
    ssh_cipher_encrypt(cipher, vblk, len);
    ssh_cipher_free(cipher);
}

29
crypto/pubkey-ppk.c Normal file
View File

@ -0,0 +1,29 @@
/*
* Convenience functions to encrypt and decrypt PuTTY's own .PPK
* format for SSH-2 private key files, which uses 256-bit AES in CBC
* mode.
*/
#include "ssh.h"
static ssh_cipher *aes256_pubkey_cipher(const void *key, const void *iv)
{
    /* Build the AES-256-CBC cipher used for .PPK payload encryption. */
    ssh_cipher *c = ssh_cipher_new(&ssh_aes256_cbc);
    ssh_cipher_setkey(c, key);
    ssh_cipher_setiv(c, iv);
    return c;
}
void aes256_encrypt_pubkey(const void *key, const void *iv, void *blk, int len)
{
    /* One-shot encryption: make the cipher, run it once, destroy it. */
    ssh_cipher *cipher = aes256_pubkey_cipher(key, iv);
    ssh_cipher_encrypt(cipher, blk, len);
    ssh_cipher_free(cipher);
}
void aes256_decrypt_pubkey(const void *key, const void *iv, void *blk, int len)
{
    /* One-shot decryption: make the cipher, run it once, destroy it. */
    ssh_cipher *cipher = aes256_pubkey_cipher(key, iv);
    ssh_cipher_decrypt(cipher, blk, len);
    ssh_cipher_free(cipher);
}

38
crypto/pubkey-ssh1.c Normal file
View File

@ -0,0 +1,38 @@
/*
* Convenience functions to encrypt and decrypt the standard format
* for SSH-1 private key files. This uses triple-DES in SSH-1 style
* (three separate CBC layers), but the same key is used for the first
 * and third layers.
*/
#include "ssh.h"
static ssh_cipher *des3_pubkey_cipher(const void *vkey)
{
    ssh_cipher *c = ssh_cipher_new(&ssh_3des_ssh1);
    uint8_t keys3[24], iv[8];
    /* SSH-1 key files supply only 16 key bytes; the first 8 bytes are
     * reused as the third DES key (i.e. EDE with K3 == K1). */
    memcpy(keys3, vkey, 16);
    memcpy(keys3 + 16, vkey, 8);
    ssh_cipher_setkey(c, keys3);
    smemclr(keys3, sizeof(keys3));  /* wipe key material from the stack */
    memset(iv, 0, 8);               /* this format uses an all-zero IV */
    ssh_cipher_setiv(c, iv);
    return c;
}
void des3_decrypt_pubkey(const void *vkey, void *vblk, int len)
{
    /* One-shot decryption of an SSH-1 private key blob. */
    ssh_cipher *cipher = des3_pubkey_cipher(vkey);
    ssh_cipher_decrypt(cipher, vblk, len);
    ssh_cipher_free(cipher);
}
void des3_encrypt_pubkey(const void *vkey, void *vblk, int len)
{
    /* One-shot encryption of an SSH-1 private key blob. */
    ssh_cipher *cipher = des3_pubkey_cipher(vkey);
    ssh_cipher_encrypt(cipher, vblk, len);
    ssh_cipher_free(cipher);
}

1109
crypto/rsa.c Normal file

File diff suppressed because it is too large Load Diff

933
crypto/sha1.c Normal file
View File

@ -0,0 +1,933 @@
/*
* SHA-1 algorithm as described at
*
* http://csrc.nist.gov/cryptval/shs.html
*/
#include "ssh.h"
#include <assert.h>
/*
* Start by deciding whether we can support hardware SHA at all.
*/
#define HW_SHA1_NONE 0
#define HW_SHA1_NI 1
#define HW_SHA1_NEON 2
#ifdef _FORCE_SHA_NI
# define HW_SHA1 HW_SHA1_NI
#elif defined(__clang__)
# if __has_attribute(target) && __has_include(<wmmintrin.h>) && \
(defined(__x86_64__) || defined(__i386))
# define HW_SHA1 HW_SHA1_NI
# endif
#elif defined(__GNUC__)
# if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)) && \
(defined(__x86_64__) || defined(__i386))
# define HW_SHA1 HW_SHA1_NI
# endif
#elif defined (_MSC_VER)
# if (defined(_M_X64) || defined(_M_IX86)) && _MSC_FULL_VER >= 150030729
# define HW_SHA1 HW_SHA1_NI
# endif
#endif
#ifdef _FORCE_SHA_NEON
# define HW_SHA1 HW_SHA1_NEON
#elif defined __BYTE_ORDER__ && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
/* Arm can potentially support both endiannesses, but this code
* hasn't been tested on anything but little. If anyone wants to
* run big-endian, they'll need to fix it first. */
#elif defined __ARM_FEATURE_CRYPTO
/* If the Arm crypto extension is available already, we can
* support NEON SHA without having to enable anything by hand */
# define HW_SHA1 HW_SHA1_NEON
#elif defined(__clang__)
# if __has_attribute(target) && __has_include(<arm_neon.h>) && \
(defined(__aarch64__))
/* clang can enable the crypto extension in AArch64 using
* __attribute__((target)) */
# define HW_SHA1 HW_SHA1_NEON
# define USE_CLANG_ATTR_TARGET_AARCH64
# endif
#elif defined _MSC_VER
/* Visual Studio supports the crypto extension when targeting
* AArch64, but as of VS2017, the AArch32 header doesn't quite
* manage it (declaring the shae/shad intrinsics without a round
* key operand). */
# if defined _M_ARM64
# define HW_SHA1 HW_SHA1_NEON
# if defined _M_ARM64
# define USE_ARM64_NEON_H /* unusual header name in this case */
# endif
# endif
#endif
#if defined _FORCE_SOFTWARE_SHA || !defined HW_SHA1
# undef HW_SHA1
# define HW_SHA1 HW_SHA1_NONE
#endif
/*
* The actual query function that asks if hardware acceleration is
* available.
*/
static bool sha1_hw_available(void);
/*
* The top-level selection function, caching the results of
* sha1_hw_available() so it only has to run once.
*/
static bool sha1_hw_available_cached(void)
{
    /* Probe for hardware SHA-1 once, then cache the answer in
     * function-local statics for every later call. */
    static bool initialised = false;
    static bool hw_available;
    if (!initialised) {
        hw_available = sha1_hw_available();
        initialised = true;
    }
    return hw_available;
}
static ssh_hash *sha1_select(const ssh_hashalg *alg)
{
    /* Instantiate whichever real SHA-1 implementation is best on this
     * machine; 'alg' (the selector vtable) is deliberately unused. */
    const ssh_hashalg *real_alg =
        sha1_hw_available_cached() ? &ssh_sha1_hw : &ssh_sha1_sw;
    return ssh_hash_new(real_alg);
}
/* Selector vtable: its 'new' picks the hw or sw implementation at run
 * time, so the returned object never actually uses this vtable. */
const ssh_hashalg ssh_sha1 = {
    .new = sha1_select,
    .hlen = 20,
    .blocklen = 64,
    HASHALG_NAMES_ANNOTATED("SHA-1", "dummy selector vtable"),
};
/* ----------------------------------------------------------------------
* Definitions likely to be helpful to multiple implementations.
*/
/* Initial chaining values H0..H4 from FIPS 180-4 section 5.3.1. */
static const uint32_t sha1_initial_state[] = {
    0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0,
};
#define SHA1_ROUNDS_PER_STAGE 20
#define SHA1_STAGE0_CONSTANT 0x5a827999
#define SHA1_STAGE1_CONSTANT 0x6ed9eba1
#define SHA1_STAGE2_CONSTANT 0x8f1bbcdc
#define SHA1_STAGE3_CONSTANT 0xca62c1d6
#define SHA1_ROUNDS (4 * SHA1_ROUNDS_PER_STAGE)
/* Shared 512-bit block buffer used by all SHA-1 implementations. */
typedef struct sha1_block sha1_block;
struct sha1_block {
    uint8_t block[64];  /* message block being accumulated */
    size_t used;        /* bytes of 'block' currently occupied */
    uint64_t len;       /* total message length so far, in bytes */
};
static inline void sha1_block_setup(sha1_block *blk)
{
    /* Reset the buffer to empty and the running length to zero. */
    blk->len = 0;
    blk->used = 0;
}
static inline bool sha1_block_write(
    sha1_block *blk, const void **vdata, size_t *len)
{
    /* Copy as much of (*vdata, *len) as fits into the buffer, advancing
     * the caller's pointer and length. Returns true when the buffer has
     * just filled (caller should compress blk->block), with 'used'
     * already reset ready for the next block. */
    size_t blkleft = sizeof(blk->block) - blk->used;
    size_t chunk = *len < blkleft ? *len : blkleft;
    const uint8_t *p = *vdata;
    memcpy(blk->block + blk->used, p, chunk);
    *vdata = p + chunk;
    *len -= chunk;
    blk->used += chunk;
    blk->len += chunk;
    if (blk->used == sizeof(blk->block)) {
        blk->used = 0;
        return true;
    }
    return false;
}
static inline void sha1_block_pad(sha1_block *blk, BinarySink *bs)
{
    uint64_t final_len = blk->len << 3;  /* message length in bits */
    /* One 0x80 byte, then zeroes, until exactly 8 bytes remain in the
     * final block for the big-endian 64-bit length field. */
    size_t pad = 1 + (63 & (55 - blk->used));
    put_byte(bs, 0x80);
    for (size_t i = 1; i < pad; i++)
        put_byte(bs, 0);
    put_uint64(bs, final_len);
    assert(blk->used == 0 && "Should have exactly hit a block boundary");
}
/* ----------------------------------------------------------------------
* Software implementation of SHA-1.
*/
static inline uint32_t rol(uint32_t x, unsigned y)
{
    /* Rotate x left by y bits; y is taken mod 32, and y == 0 (or any
     * multiple of 32) returns x unchanged without a UB-sized shift. */
    unsigned left = y & 31;
    unsigned right = (32 - left) & 31;
    return (x << left) | (x >> right);
}
static inline uint32_t Ch(uint32_t ctrl, uint32_t if1, uint32_t if0)
{
    /* Bitwise multiplexer ("choose"): where a ctrl bit is 1 take the
     * corresponding bit of if1, otherwise the bit of if0. */
    return (ctrl & if1) | (~ctrl & if0);
}
static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z)
{
    /* Bitwise majority: each output bit is 1 iff at least two of the
     * three input bits are 1 (standard XOR-form of the identity). */
    return (x & y) ^ (x & z) ^ (y & z);
}
static inline uint32_t Par(uint32_t x, uint32_t y, uint32_t z)
{
    /* Bitwise parity of the three inputs. */
    uint32_t xy = x ^ y;
    return xy ^ z;
}
static inline void sha1_sw_round(
    unsigned round_index, const uint32_t *schedule,
    uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d, uint32_t *e,
    uint32_t f, uint32_t constant)
{
    /* One SHA-1 round. Instead of shuffling the five state words here,
     * the caller rotates the roles of a..e between successive calls;
     * 'f' is the already-computed Ch/Par/Maj value for this round. */
    *e = rol(*a, 5) + f + *e + schedule[round_index] + constant;
    *b = rol(*b, 30);
}
/* Compress one 64-byte block into the five-word SHA-1 state 'core'. */
static void sha1_sw_block(uint32_t *core, const uint8_t *block)
{
    uint32_t w[SHA1_ROUNDS];
    uint32_t a,b,c,d,e;
    /* Message schedule: 16 big-endian input words, expanded to 80. */
    for (size_t t = 0; t < 16; t++)
        w[t] = GET_32BIT_MSB_FIRST(block + 4*t);
    for (size_t t = 16; t < SHA1_ROUNDS; t++)
        w[t] = rol(w[t - 3] ^ w[t - 8] ^ w[t - 14] ^ w[t - 16], 1);
    a = core[0]; b = core[1]; c = core[2]; d = core[3];
    e = core[4];
    size_t t = 0;
    /* Stage 0, rounds 0-19: Ch function. Each five-round group cycles
     * the variable roles back to their starting positions. */
    for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) {
        sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Ch(b,c,d), SHA1_STAGE0_CONSTANT);
        sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Ch(a,b,c), SHA1_STAGE0_CONSTANT);
        sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Ch(e,a,b), SHA1_STAGE0_CONSTANT);
        sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Ch(d,e,a), SHA1_STAGE0_CONSTANT);
        sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Ch(c,d,e), SHA1_STAGE0_CONSTANT);
    }
    /* Stage 1, rounds 20-39: parity function. */
    for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) {
        sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Par(b,c,d), SHA1_STAGE1_CONSTANT);
        sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Par(a,b,c), SHA1_STAGE1_CONSTANT);
        sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Par(e,a,b), SHA1_STAGE1_CONSTANT);
        sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Par(d,e,a), SHA1_STAGE1_CONSTANT);
        sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Par(c,d,e), SHA1_STAGE1_CONSTANT);
    }
    /* Stage 2, rounds 40-59: majority function. */
    for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) {
        sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Maj(b,c,d), SHA1_STAGE2_CONSTANT);
        sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Maj(a,b,c), SHA1_STAGE2_CONSTANT);
        sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Maj(e,a,b), SHA1_STAGE2_CONSTANT);
        sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Maj(d,e,a), SHA1_STAGE2_CONSTANT);
        sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Maj(c,d,e), SHA1_STAGE2_CONSTANT);
    }
    /* Stage 3, rounds 60-79: parity again, different constant. */
    for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) {
        sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Par(b,c,d), SHA1_STAGE3_CONSTANT);
        sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Par(a,b,c), SHA1_STAGE3_CONSTANT);
        sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Par(e,a,b), SHA1_STAGE3_CONSTANT);
        sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Par(d,e,a), SHA1_STAGE3_CONSTANT);
        sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Par(c,d,e), SHA1_STAGE3_CONSTANT);
    }
    core[0] += a; core[1] += b; core[2] += c; core[3] += d; core[4] += e;
    smemclr(w, sizeof(w));  /* wipe expanded message from the stack */
}
/* In-progress software SHA-1 computation. */
typedef struct sha1_sw {
    uint32_t core[5];   /* chaining state a..e */
    sha1_block blk;     /* partial-block buffer and running length */
    BinarySink_IMPLEMENTATION;
    ssh_hash hash;      /* vtable wrapper handed out to clients */
} sha1_sw;
static void sha1_sw_write(BinarySink *bs, const void *vp, size_t len);
static ssh_hash *sha1_sw_new(const ssh_hashalg *alg)
{
    /* Allocate a software SHA-1 context and wire up its BinarySink
     * plumbing and vtable. State initialisation is sha1_sw_reset's job. */
    sha1_sw *ctx = snew(sha1_sw);
    BinarySink_INIT(ctx, sha1_sw_write);
    BinarySink_DELEGATE_INIT(&ctx->hash, ctx);
    ctx->hash.vt = alg;
    return &ctx->hash;
}
static void sha1_sw_reset(ssh_hash *hash)
{
    /* Begin a new hash: load the FIPS 180 IV and empty the buffer. */
    sha1_sw *s = container_of(hash, sha1_sw, hash);
    memcpy(s->core, sha1_initial_state, sizeof(s->core));
    sha1_block_setup(&s->blk);
}
static void sha1_sw_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
    /* Clone the whole state, then repair the internal self-pointers so
     * the copy delegates to itself rather than to the original. */
    sha1_sw *copy = container_of(hcopy, sha1_sw, hash);
    sha1_sw *orig = container_of(horig, sha1_sw, hash);
    memcpy(copy, orig, sizeof(*copy));
    BinarySink_COPIED(copy);
    BinarySink_DELEGATE_INIT(&copy->hash, copy);
}
static void sha1_sw_free(ssh_hash *hash)
{
    sha1_sw *s = container_of(hash, sha1_sw, hash);
    smemclr(s, sizeof(*s));    /* wipe hash state before freeing */
    sfree(s);
}
static void sha1_sw_write(BinarySink *bs, const void *vp, size_t len)
{
    /* Feed data into the block buffer, compressing each time a full
     * 64-byte block becomes ready. */
    sha1_sw *s = BinarySink_DOWNCAST(bs, sha1_sw);
    while (len > 0)
        if (sha1_block_write(&s->blk, &vp, &len))
            sha1_sw_block(s->core, s->blk.block);
}
static void sha1_sw_digest(ssh_hash *hash, uint8_t *digest)
{
    sha1_sw *s = container_of(hash, sha1_sw, hash);
    sha1_block_pad(&s->blk, BinarySink_UPCAST(s));
    /* SHA-1 output is big-endian. */
    for (size_t i = 0; i < 5; i++)
        PUT_32BIT_MSB_FIRST(digest + 4*i, s->core[i]);
}
/* Vtable for the portable software SHA-1 implementation. */
const ssh_hashalg ssh_sha1_sw = {
    .new = sha1_sw_new,
    .reset = sha1_sw_reset,
    .copyfrom = sha1_sw_copyfrom,
    .digest = sha1_sw_digest,
    .free = sha1_sw_free,
    .hlen = 20,
    .blocklen = 64,
    HASHALG_NAMES_ANNOTATED("SHA-1", "unaccelerated"),
};
/* ----------------------------------------------------------------------
* Hardware-accelerated implementation of SHA-1 using x86 SHA-NI.
*/
#if HW_SHA1 == HW_SHA1_NI
/*
* Set target architecture for Clang and GCC
*/
#if defined(__clang__) || defined(__GNUC__)
# define FUNC_ISA __attribute__ ((target("sse4.1,sha")))
#if !defined(__clang__)
# pragma GCC target("sha")
# pragma GCC target("sse4.1")
#endif
#else
# define FUNC_ISA
#endif
#include <wmmintrin.h>
#include <smmintrin.h>
#include <immintrin.h>
#if defined(__clang__) || defined(__GNUC__)
#include <shaintrin.h>
#endif
#if defined(__clang__) || defined(__GNUC__)
#include <cpuid.h>
#define GET_CPU_ID_0(out) \
__cpuid(0, (out)[0], (out)[1], (out)[2], (out)[3])
#define GET_CPU_ID_7(out) \
__cpuid_count(7, 0, (out)[0], (out)[1], (out)[2], (out)[3])
#else
#define GET_CPU_ID_0(out) __cpuid(out, 0)
#define GET_CPU_ID_7(out) __cpuidex(out, 7, 0)
#endif
static bool sha1_hw_available(void)
{
    /* Detect the x86 SHA extensions via CPUID. */
    unsigned int CPUInfo[4];
    GET_CPU_ID_0(CPUInfo);
    if (CPUInfo[0] < 7)  /* CPU doesn't support leaf 7 at all */
        return false;
    GET_CPU_ID_7(CPUInfo);
    return CPUInfo[1] & (1 << 29); /* Check SHA: leaf 7 EBX bit 29 */
}
/* SHA1 implementation using new instructions
The code is based on Jeffrey Walton's SHA1 implementation:
https://github.com/noloader/SHA-Intrinsics
*/
/* Compress one 64-byte block using the SHA-NI instructions. 'core' is
 * the two-vector state layout described on struct sha1_ni below. */
FUNC_ISA
static inline void sha1_ni_block(__m128i *core, const uint8_t *p)
{
    __m128i ABCD, E0, E1, MSG0, MSG1, MSG2, MSG3;
    /* Shuffle mask converting the big-endian input words to the lane
     * order the SHA-NI instructions expect. */
    const __m128i MASK = _mm_set_epi64x(
        0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL);
    const __m128i *block = (const __m128i *)p;
    /* Load initial values */
    ABCD = core[0];
    E0 = core[1];
    /* Rounds 0-3 */
    MSG0 = _mm_loadu_si128(block);
    MSG0 = _mm_shuffle_epi8(MSG0, MASK);
    E0 = _mm_add_epi32(E0, MSG0);
    E1 = ABCD;
    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
    /* Rounds 4-7 */
    MSG1 = _mm_loadu_si128(block + 1);
    MSG1 = _mm_shuffle_epi8(MSG1, MASK);
    E1 = _mm_sha1nexte_epu32(E1, MSG1);
    E0 = ABCD;
    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
    MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
    /* Rounds 8-11 */
    MSG2 = _mm_loadu_si128(block + 2);
    MSG2 = _mm_shuffle_epi8(MSG2, MASK);
    E0 = _mm_sha1nexte_epu32(E0, MSG2);
    E1 = ABCD;
    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
    MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
    MSG0 = _mm_xor_si128(MSG0, MSG2);
    /* Rounds 12-15 */
    MSG3 = _mm_loadu_si128(block + 3);
    MSG3 = _mm_shuffle_epi8(MSG3, MASK);
    E1 = _mm_sha1nexte_epu32(E1, MSG3);
    E0 = ABCD;
    MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
    MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
    MSG1 = _mm_xor_si128(MSG1, MSG3);
    /* Rounds 16-19 */
    E0 = _mm_sha1nexte_epu32(E0, MSG0);
    E1 = ABCD;
    MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
    MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
    MSG2 = _mm_xor_si128(MSG2, MSG0);
    /* Rounds 20-23 */
    E1 = _mm_sha1nexte_epu32(E1, MSG1);
    E0 = ABCD;
    MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
    MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
    MSG3 = _mm_xor_si128(MSG3, MSG1);
    /* Rounds 24-27 */
    E0 = _mm_sha1nexte_epu32(E0, MSG2);
    E1 = ABCD;
    MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
    MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
    MSG0 = _mm_xor_si128(MSG0, MSG2);
    /* Rounds 28-31 */
    E1 = _mm_sha1nexte_epu32(E1, MSG3);
    E0 = ABCD;
    MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
    MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
    MSG1 = _mm_xor_si128(MSG1, MSG3);
    /* Rounds 32-35 */
    E0 = _mm_sha1nexte_epu32(E0, MSG0);
    E1 = ABCD;
    MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
    MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
    MSG2 = _mm_xor_si128(MSG2, MSG0);
    /* Rounds 36-39 */
    E1 = _mm_sha1nexte_epu32(E1, MSG1);
    E0 = ABCD;
    MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
    MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
    MSG3 = _mm_xor_si128(MSG3, MSG1);
    /* Rounds 40-43 */
    E0 = _mm_sha1nexte_epu32(E0, MSG2);
    E1 = ABCD;
    MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
    MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
    MSG0 = _mm_xor_si128(MSG0, MSG2);
    /* Rounds 44-47 */
    E1 = _mm_sha1nexte_epu32(E1, MSG3);
    E0 = ABCD;
    MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
    MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
    MSG1 = _mm_xor_si128(MSG1, MSG3);
    /* Rounds 48-51 */
    E0 = _mm_sha1nexte_epu32(E0, MSG0);
    E1 = ABCD;
    MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
    MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
    MSG2 = _mm_xor_si128(MSG2, MSG0);
    /* Rounds 52-55 */
    E1 = _mm_sha1nexte_epu32(E1, MSG1);
    E0 = ABCD;
    MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
    MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
    MSG3 = _mm_xor_si128(MSG3, MSG1);
    /* Rounds 56-59 */
    E0 = _mm_sha1nexte_epu32(E0, MSG2);
    E1 = ABCD;
    MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
    MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
    MSG0 = _mm_xor_si128(MSG0, MSG2);
    /* Rounds 60-63 */
    E1 = _mm_sha1nexte_epu32(E1, MSG3);
    E0 = ABCD;
    MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
    MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
    MSG1 = _mm_xor_si128(MSG1, MSG3);
    /* Rounds 64-67 */
    E0 = _mm_sha1nexte_epu32(E0, MSG0);
    E1 = ABCD;
    MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
    MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
    MSG2 = _mm_xor_si128(MSG2, MSG0);
    /* Rounds 68-71 */
    E1 = _mm_sha1nexte_epu32(E1, MSG1);
    E0 = ABCD;
    MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
    MSG3 = _mm_xor_si128(MSG3, MSG1);
    /* Rounds 72-75 */
    E0 = _mm_sha1nexte_epu32(E0, MSG2);
    E1 = ABCD;
    MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
    /* Rounds 76-79 */
    E1 = _mm_sha1nexte_epu32(E1, MSG3);
    E0 = ABCD;
    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
    /* Combine state */
    core[0] = _mm_add_epi32(ABCD, core[0]);
    core[1] = _mm_sha1nexte_epu32(E0, core[1]);
}
typedef struct sha1_ni {
    /*
     * core[0] stores the first four words of the SHA-1 state. core[1]
     * stores just the fifth word, in the vector lane at the highest
     * address.
     */
    __m128i core[2];
    sha1_block blk;
    /* Original unaligned allocation; see sha1_ni_alloc. */
    void *pointer_to_free;
    BinarySink_IMPLEMENTATION;
    ssh_hash hash;
} sha1_ni;
static void sha1_ni_write(BinarySink *bs, const void *vp, size_t len);
static sha1_ni *sha1_ni_alloc(void)
{
    /*
     * The __m128i variables in the context structure need to be
     * 16-byte aligned, but not all malloc implementations that this
     * code has to work with will guarantee to return a 16-byte
     * aligned pointer. So we over-allocate, manually realign the
     * pointer ourselves, and store the original one inside the
     * context so we know how to free it later.
     */
    void *allocation = smalloc(sizeof(sha1_ni) + 15);
    uintptr_t alloc_address = (uintptr_t)allocation;
    uintptr_t aligned_address = (alloc_address + 15) & ~15;  /* round up to 16 */
    sha1_ni *s = (sha1_ni *)aligned_address;
    s->pointer_to_free = allocation;
    return s;
}
static ssh_hash *sha1_ni_new(const ssh_hashalg *alg)
{
    /* Refuse to construct at all if SHA-NI is absent, so this vtable
     * can only produce a usable object on capable hardware. */
    if (!sha1_hw_available_cached())
        return NULL;
    sha1_ni *s = sha1_ni_alloc();
    s->hash.vt = alg;
    BinarySink_INIT(s, sha1_ni_write);
    BinarySink_DELEGATE_INIT(&s->hash, s);
    return &s->hash;
}
FUNC_ISA static void sha1_ni_reset(ssh_hash *hash)
{
    sha1_ni *s = container_of(hash, sha1_ni, hash);
    /* Initialise the core vectors in their storage order: the standard
     * H0..H3 packed into core[0], H4 alone in core[1]'s top lane. */
    s->core[0] = _mm_set_epi64x(
        0x67452301efcdab89ULL, 0x98badcfe10325476ULL);
    s->core[1] = _mm_set_epi32(0xc3d2e1f0, 0, 0, 0);
    sha1_block_setup(&s->blk);
}
static void sha1_ni_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
    /* Clone the state, but keep the copy's own allocation bookkeeping
     * pointer, and repair the BinarySink self-pointers. */
    sha1_ni *copy = container_of(hcopy, sha1_ni, hash);
    sha1_ni *orig = container_of(horig, sha1_ni, hash);
    void *ptf_save = copy->pointer_to_free;
    *copy = *orig; /* structure copy */
    copy->pointer_to_free = ptf_save;
    BinarySink_COPIED(copy);
    BinarySink_DELEGATE_INIT(&copy->hash, copy);
}
static void sha1_ni_free(ssh_hash *hash)
{
    sha1_ni *s = container_of(hash, sha1_ni, hash);
    /* Save the real allocation pointer before wiping the struct. */
    void *ptf = s->pointer_to_free;
    smemclr(s, sizeof(*s));
    sfree(ptf);
}
static void sha1_ni_write(BinarySink *bs, const void *vp, size_t len)
{
    /* Feed data into the block buffer, compressing each time a full
     * 64-byte block becomes ready. */
    sha1_ni *s = BinarySink_DOWNCAST(bs, sha1_ni);
    while (len > 0)
        if (sha1_block_write(&s->blk, &vp, &len))
            sha1_ni_block(s->core, s->blk.block);
}
FUNC_ISA static void sha1_ni_digest(ssh_hash *hash, uint8_t *digest)
{
    sha1_ni *s = container_of(hash, sha1_ni, hash);
    sha1_block_pad(&s->blk, BinarySink_UPCAST(s));
    /* Rearrange the first vector into its output order */
    __m128i abcd = _mm_shuffle_epi32(s->core[0], 0x1B);
    /* Byte-swap it into the output endianness */
    const __m128i mask = _mm_setr_epi8(3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12);
    abcd = _mm_shuffle_epi8(abcd, mask);
    /* And store it */
    _mm_storeu_si128((__m128i *)digest, abcd);
    /* Finally, store the leftover word (H4 lives in core[1]'s top lane) */
    uint32_t e = _mm_extract_epi32(s->core[1], 3);
    PUT_32BIT_MSB_FIRST(digest + 16, e);
}
/* Vtable for the SHA-NI-accelerated implementation. */
const ssh_hashalg ssh_sha1_hw = {
    .new = sha1_ni_new,
    .reset = sha1_ni_reset,
    .copyfrom = sha1_ni_copyfrom,
    .digest = sha1_ni_digest,
    .free = sha1_ni_free,
    .hlen = 20,
    .blocklen = 64,
    HASHALG_NAMES_ANNOTATED("SHA-1", "SHA-NI accelerated"),
};
/* ----------------------------------------------------------------------
* Hardware-accelerated implementation of SHA-1 using Arm NEON.
*/
#elif HW_SHA1 == HW_SHA1_NEON
/*
* Manually set the target architecture, if we decided above that we
* need to.
*/
#ifdef USE_CLANG_ATTR_TARGET_AARCH64
/*
* A spot of cheating: redefine some ACLE feature macros before
* including arm_neon.h. Otherwise we won't get the SHA intrinsics
* defined by that header, because it will be looking at the settings
* for the whole translation unit rather than the ones we're going to
* put on some particular functions using __attribute__((target)).
*/
#define __ARM_NEON 1
#define __ARM_FEATURE_CRYPTO 1
#define FUNC_ISA __attribute__ ((target("neon,crypto")))
#endif /* USE_CLANG_ATTR_TARGET_AARCH64 */
#ifndef FUNC_ISA
#define FUNC_ISA
#endif
#ifdef USE_ARM64_NEON_H
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
static bool sha1_hw_available(void)
{
    /*
     * For Arm, we delegate to a per-platform detection function (see
     * explanation in aes.c).
     */
    return platform_sha1_hw_available();
}
/* NEON representation of the SHA-1 state: a..d in one vector, e apart. */
typedef struct sha1_neon_core sha1_neon_core;
struct sha1_neon_core {
    uint32x4_t abcd;
    uint32_t e;
};
/* Load 16 input bytes, byte-swapping each 32-bit lane to host order. */
FUNC_ISA
static inline uint32x4_t sha1_neon_load_input(const uint8_t *p)
{
    return vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(p)));
}
/* Advance the message schedule by four words using the two-stage
 * vsha1su0/vsha1su1 instruction pair. */
FUNC_ISA
static inline uint32x4_t sha1_neon_schedule_update(
    uint32x4_t m4, uint32x4_t m3, uint32x4_t m2, uint32x4_t m1)
{
    return vsha1su1q_u32(vsha1su0q_u32(m4, m3, m2), m1);
}
/*
* SHA-1 has three different kinds of round, differing in whether they
* use the Ch, Maj or Par functions defined above. Each one uses a
* separate NEON instruction, so we define three inline functions for
* the different round types using this macro.
*
* The two batches of Par-type rounds also use a different constant,
* but that's passed in as an operand, so we don't need a fourth
* inline function just for that.
*/
#define SHA1_NEON_ROUND_FN(type)                                        \
    FUNC_ISA static inline sha1_neon_core sha1_neon_round4_##type(      \
        sha1_neon_core old, uint32x4_t sched, uint32x4_t constant)      \
    {                                                                   \
        sha1_neon_core new;                                             \
        uint32x4_t round_input = vaddq_u32(sched, constant);            \
        new.abcd = vsha1##type##q_u32(old.abcd, old.e, round_input);    \
        new.e = vsha1h_u32(vget_lane_u32(vget_low_u32(old.abcd), 0));   \
        return new;                                                     \
    }
/* Instantiate the Ch-, Par- and Maj-type four-round functions. */
SHA1_NEON_ROUND_FN(c)
SHA1_NEON_ROUND_FN(p)
SHA1_NEON_ROUND_FN(m)
/* Compress one 64-byte block using the Arm SHA-1 NEON instructions. */
FUNC_ISA
static inline void sha1_neon_block(sha1_neon_core *core, const uint8_t *p)
{
    uint32x4_t constant, s0, s1, s2, s3;
    sha1_neon_core cr = *core;
    /* Rounds 0-19: Ch function. */
    constant = vdupq_n_u32(SHA1_STAGE0_CONSTANT);
    s0 = sha1_neon_load_input(p);
    cr = sha1_neon_round4_c(cr, s0, constant);
    s1 = sha1_neon_load_input(p + 16);
    cr = sha1_neon_round4_c(cr, s1, constant);
    s2 = sha1_neon_load_input(p + 32);
    cr = sha1_neon_round4_c(cr, s2, constant);
    s3 = sha1_neon_load_input(p + 48);
    cr = sha1_neon_round4_c(cr, s3, constant);
    s0 = sha1_neon_schedule_update(s0, s1, s2, s3);
    cr = sha1_neon_round4_c(cr, s0, constant);
    /* Rounds 20-39: parity function. */
    constant = vdupq_n_u32(SHA1_STAGE1_CONSTANT);
    s1 = sha1_neon_schedule_update(s1, s2, s3, s0);
    cr = sha1_neon_round4_p(cr, s1, constant);
    s2 = sha1_neon_schedule_update(s2, s3, s0, s1);
    cr = sha1_neon_round4_p(cr, s2, constant);
    s3 = sha1_neon_schedule_update(s3, s0, s1, s2);
    cr = sha1_neon_round4_p(cr, s3, constant);
    s0 = sha1_neon_schedule_update(s0, s1, s2, s3);
    cr = sha1_neon_round4_p(cr, s0, constant);
    s1 = sha1_neon_schedule_update(s1, s2, s3, s0);
    cr = sha1_neon_round4_p(cr, s1, constant);
    /* Rounds 40-59: majority function. */
    constant = vdupq_n_u32(SHA1_STAGE2_CONSTANT);
    s2 = sha1_neon_schedule_update(s2, s3, s0, s1);
    cr = sha1_neon_round4_m(cr, s2, constant);
    s3 = sha1_neon_schedule_update(s3, s0, s1, s2);
    cr = sha1_neon_round4_m(cr, s3, constant);
    s0 = sha1_neon_schedule_update(s0, s1, s2, s3);
    cr = sha1_neon_round4_m(cr, s0, constant);
    s1 = sha1_neon_schedule_update(s1, s2, s3, s0);
    cr = sha1_neon_round4_m(cr, s1, constant);
    s2 = sha1_neon_schedule_update(s2, s3, s0, s1);
    cr = sha1_neon_round4_m(cr, s2, constant);
    /* Rounds 60-79: parity again, different constant. */
    constant = vdupq_n_u32(SHA1_STAGE3_CONSTANT);
    s3 = sha1_neon_schedule_update(s3, s0, s1, s2);
    cr = sha1_neon_round4_p(cr, s3, constant);
    s0 = sha1_neon_schedule_update(s0, s1, s2, s3);
    cr = sha1_neon_round4_p(cr, s0, constant);
    s1 = sha1_neon_schedule_update(s1, s2, s3, s0);
    cr = sha1_neon_round4_p(cr, s1, constant);
    s2 = sha1_neon_schedule_update(s2, s3, s0, s1);
    cr = sha1_neon_round4_p(cr, s2, constant);
    s3 = sha1_neon_schedule_update(s3, s0, s1, s2);
    cr = sha1_neon_round4_p(cr, s3, constant);
    core->abcd = vaddq_u32(core->abcd, cr.abcd);
    core->e += cr.e;
}
/* In-progress NEON SHA-1 computation. */
typedef struct sha1_neon {
    sha1_neon_core core;  /* chaining state */
    sha1_block blk;       /* partial-block buffer and running length */
    BinarySink_IMPLEMENTATION;
    ssh_hash hash;        /* vtable wrapper handed out to clients */
} sha1_neon;
static void sha1_neon_write(BinarySink *bs, const void *vp, size_t len);
/*
 * Construct a NEON SHA-1 context, or return NULL if the CPU turns
 * out not to support the crypto extension at run time.
 */
static ssh_hash *sha1_neon_new(const ssh_hashalg *alg)
{
    if (!sha1_hw_available_cached())
        return NULL;

    sha1_neon *ctx = snew(sha1_neon);
    ctx->hash.vt = alg;
    BinarySink_INIT(ctx, sha1_neon_write);
    BinarySink_DELEGATE_INIT(&ctx->hash, ctx);
    return &ctx->hash;
}
/*
 * Reset the hash to the standard SHA-1 initial state and empty the
 * input block buffer.
 */
static void sha1_neon_reset(ssh_hash *hash)
{
    sha1_neon *s = container_of(hash, sha1_neon, hash);

    /* First four state words live in one NEON register; e is scalar. */
    s->core.abcd = vld1q_u32(sha1_initial_state);
    s->core.e = sha1_initial_state[4];

    sha1_block_setup(&s->blk);
}
/*
 * Clone one hash context into another, fixing up the internal
 * self-pointers so the copy is independent of the original.
 */
static void sha1_neon_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
    sha1_neon *dst = container_of(hcopy, sha1_neon, hash);
    sha1_neon *src = container_of(horig, sha1_neon, hash);

    *dst = *src;                /* clone all state wholesale */
    BinarySink_COPIED(dst);     /* then re-point the delegate links */
    BinarySink_DELEGATE_INIT(&dst->hash, dst);
}
/*
 * Destroy a context, scrubbing the hash state before the memory is
 * returned to the allocator.
 */
static void sha1_neon_free(ssh_hash *hash)
{
    sha1_neon *ctx = container_of(hash, sha1_neon, hash);

    smemclr(ctx, sizeof(*ctx));
    sfree(ctx);
}
/*
 * BinarySink write callback: buffer incoming bytes and run the NEON
 * compression function on each completed 64-byte block.
 */
static void sha1_neon_write(BinarySink *bs, const void *vp, size_t len)
{
    sha1_neon *ctx = BinarySink_DOWNCAST(bs, sha1_neon);

    while (len > 0) {
        bool block_full = sha1_block_write(&ctx->blk, &vp, &len);
        if (block_full)
            sha1_neon_block(&ctx->core, ctx->blk.block);
    }
}
/*
 * Finalise: apply SHA-1 padding, then emit the 20-byte big-endian
 * digest.
 */
static void sha1_neon_digest(ssh_hash *hash, uint8_t *digest)
{
    sha1_neon *s = container_of(hash, sha1_neon, hash);

    sha1_block_pad(&s->blk, BinarySink_UPCAST(s));

    /* Bytes 0-15: a,b,c,d byte-reversed to big-endian; bytes 16-19: e. */
    vst1q_u8(digest, vrev32q_u8(vreinterpretq_u8_u32(s->core.abcd)));
    PUT_32BIT_MSB_FIRST(digest + 16, s->core.e);
}
/*
 * Vtable for the NEON-accelerated SHA-1 implementation.
 */
const ssh_hashalg ssh_sha1_hw = {
    .new = sha1_neon_new,
    .reset = sha1_neon_reset,
    .copyfrom = sha1_neon_copyfrom,
    .digest = sha1_neon_digest,
    .free = sha1_neon_free,
    .hlen = 20,                 /* 160-bit digest */
    .blocklen = 64,             /* 512-bit message blocks */
    HASHALG_NAMES_ANNOTATED("SHA-1", "NEON accelerated"),
};
/* ----------------------------------------------------------------------
* Stub functions if we have no hardware-accelerated SHA-1. In this
* case, sha1_hw_new returns NULL (though it should also never be
* selected by sha1_select, so the only thing that should even be
* _able_ to call it is testcrypt). As a result, the remaining vtable
* functions should never be called at all.
*/
#elif HW_SHA1 == HW_SHA1_NONE

/* With no hardware implementation compiled in, the availability
 * query is always negative. */
static bool sha1_hw_available(void)
{
    return false;
}

/* Constructor returns NULL, so sha1_select never chooses this
 * implementation; only testcrypt can even attempt to. */
static ssh_hash *sha1_stub_new(const ssh_hashalg *alg)
{
    return NULL;
}

/* Shared body for the vtable methods that can never be reached. */
#define STUB_BODY { unreachable("Should never be called"); }

static void sha1_stub_reset(ssh_hash *hash) STUB_BODY
static void sha1_stub_copyfrom(ssh_hash *hash, ssh_hash *orig) STUB_BODY
static void sha1_stub_free(ssh_hash *hash) STUB_BODY
static void sha1_stub_digest(ssh_hash *hash, uint8_t *digest) STUB_BODY

/* Placeholder vtable so the ssh_sha1_hw symbol still exists. */
const ssh_hashalg ssh_sha1_hw = {
    .new = sha1_stub_new,
    .reset = sha1_stub_reset,
    .copyfrom = sha1_stub_copyfrom,
    .digest = sha1_stub_digest,
    .free = sha1_stub_free,
    .hlen = 20,
    .blocklen = 64,
    HASHALG_NAMES_ANNOTATED("SHA-1", "!NONEXISTENT ACCELERATED VERSION!"),
};

#endif /* HW_SHA1 */

939
crypto/sha256.c Normal file
View File

@ -0,0 +1,939 @@
/*
* SHA-256 algorithm as described at
*
* http://csrc.nist.gov/cryptval/shs.html
*/
#include "ssh.h"
#include <assert.h>
/*
* Start by deciding whether we can support hardware SHA at all.
*/
#define HW_SHA256_NONE 0
#define HW_SHA256_NI 1
#define HW_SHA256_NEON 2
#ifdef _FORCE_SHA_NI
# define HW_SHA256 HW_SHA256_NI
#elif defined(__clang__)
# if __has_attribute(target) && __has_include(<wmmintrin.h>) && \
(defined(__x86_64__) || defined(__i386))
# define HW_SHA256 HW_SHA256_NI
# endif
#elif defined(__GNUC__)
# if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)) && \
(defined(__x86_64__) || defined(__i386))
# define HW_SHA256 HW_SHA256_NI
# endif
#elif defined (_MSC_VER)
# if (defined(_M_X64) || defined(_M_IX86)) && _MSC_FULL_VER >= 150030729
# define HW_SHA256 HW_SHA256_NI
# endif
#endif
#ifdef _FORCE_SHA_NEON
# define HW_SHA256 HW_SHA256_NEON
#elif defined __BYTE_ORDER__ && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
/* Arm can potentially support both endiannesses, but this code
* hasn't been tested on anything but little. If anyone wants to
* run big-endian, they'll need to fix it first. */
#elif defined __ARM_FEATURE_CRYPTO
/* If the Arm crypto extension is available already, we can
* support NEON SHA without having to enable anything by hand */
# define HW_SHA256 HW_SHA256_NEON
#elif defined(__clang__)
# if __has_attribute(target) && __has_include(<arm_neon.h>) && \
(defined(__aarch64__))
/* clang can enable the crypto extension in AArch64 using
* __attribute__((target)) */
# define HW_SHA256 HW_SHA256_NEON
# define USE_CLANG_ATTR_TARGET_AARCH64
# endif
#elif defined _MSC_VER
/* Visual Studio supports the crypto extension when targeting
* AArch64, but as of VS2017, the AArch32 header doesn't quite
* manage it (declaring the shae/shad intrinsics without a round
* key operand). */
# if defined _M_ARM64
# define HW_SHA256 HW_SHA256_NEON
# if defined _M_ARM64
# define USE_ARM64_NEON_H /* unusual header name in this case */
# endif
# endif
#endif
#if defined _FORCE_SOFTWARE_SHA || !defined HW_SHA256
# undef HW_SHA256
# define HW_SHA256 HW_SHA256_NONE
#endif
/*
* The actual query function that asks if hardware acceleration is
* available.
*/
static bool sha256_hw_available(void);
/*
* The top-level selection function, caching the results of
* sha256_hw_available() so it only has to run once.
*/
/*
 * Run the hardware-availability probe once, and remember its answer
 * for all subsequent calls.
 */
static bool sha256_hw_available_cached(void)
{
    static bool probed = false;
    static bool answer;

    if (!probed) {
        answer = sha256_hw_available();
        probed = true;
    }
    return answer;
}
/*
 * Selector constructor: instantiate the hardware SHA-256 if the CPU
 * supports it, otherwise fall back to the portable software version.
 */
static ssh_hash *sha256_select(const ssh_hashalg *alg)
{
    if (sha256_hw_available_cached())
        return ssh_hash_new(&ssh_sha256_hw);
    else
        return ssh_hash_new(&ssh_sha256_sw);
}
/*
 * Public selector vtable: only .new is real; the object it creates
 * carries one of the concrete vtables below.
 */
const ssh_hashalg ssh_sha256 = {
    .new = sha256_select,
    .hlen = 32,                 /* 256-bit digest */
    .blocklen = 64,             /* 512-bit message blocks */
    HASHALG_NAMES_ANNOTATED("SHA-256", "dummy selector vtable"),
};
/* ----------------------------------------------------------------------
* Definitions likely to be helpful to multiple implementations.
*/
/* Initial hash value H(0), per FIPS 180-4 section 5.3.3. */
static const uint32_t sha256_initial_state[] = {
    0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
    0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
};

/* The 64 round constants K, per FIPS 180-4 section 4.2.2. */
static const uint32_t sha256_round_constants[] = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
};
/* SHA-256 performs 64 rounds per 512-bit message block. */
#define SHA256_ROUNDS 64

/*
 * Shared input-buffering structure used by all the implementations
 * below.
 */
typedef struct sha256_block sha256_block;
struct sha256_block {
    uint8_t block[64];          /* accumulates one 512-bit block */
    size_t used;                /* bytes of 'block' currently valid */
    uint64_t len;               /* total message length so far, in bytes */
};
/* Reset a block buffer to "empty, nothing hashed yet". */
static inline void sha256_block_setup(sha256_block *blk)
{
    blk->len = 0;
    blk->used = 0;
}
/*
 * Copy as much of (*vdata, *len) as fits into the block buffer,
 * advancing both. Returns true exactly when the buffer has filled
 * up, in which case it is also marked empty again so the caller can
 * process blk->block and keep looping.
 */
static inline bool sha256_block_write(
    sha256_block *blk, const void **vdata, size_t *len)
{
    const uint8_t *in = *vdata;
    size_t space = sizeof(blk->block) - blk->used;
    size_t take = *len < space ? *len : space;

    memcpy(blk->block + blk->used, in, take);
    blk->used += take;
    blk->len += take;
    *vdata = in + take;
    *len -= take;

    if (blk->used < sizeof(blk->block))
        return false;

    blk->used = 0;              /* buffer full: reset for next block */
    return true;
}
/*
 * Append the standard SHA-256 padding: a 0x80 byte, enough zero
 * bytes to leave 8 bytes free in the final block, then the total
 * message length in bits, big-endian. The bytes are written through
 * 'bs', which feeds back into sha256_block_write, so the final block
 * gets compressed as a side effect.
 */
static inline void sha256_block_pad(sha256_block *blk, BinarySink *bs)
{
    uint64_t final_len = blk->len << 3;         /* length in bits */
    size_t pad = 1 + (63 & (55 - blk->used));   /* 1..64 bytes, so used ends at 56 */

    put_byte(bs, 0x80);
    for (size_t i = 1; i < pad; i++)
        put_byte(bs, 0);
    put_uint64(bs, final_len);

    assert(blk->used == 0 && "Should have exactly hit a block boundary");
}
/* ----------------------------------------------------------------------
* Software implementation of SHA-256.
*/
/*
 * Rotate a 32-bit word right by y bits. Both shift amounts are
 * reduced mod 32, so no individual shift count can reach the width
 * of the type (which would be undefined behaviour), and y == 0 gives
 * x unchanged.
 */
static inline uint32_t ror(uint32_t x, unsigned y)
{
    unsigned right = y & 31;
    unsigned left = (32 - y) & 31;
    return (x >> right) | (x << left);
}
/* Bitwise choose: each bit of ctrl selects the corresponding bit of
 * if1 (ctrl bit set) or if0 (ctrl bit clear). */
static inline uint32_t Ch(uint32_t ctrl, uint32_t if1, uint32_t if0)
{
    return (ctrl & if1) | (~ctrl & if0);
}
/* Bitwise majority: each output bit is set iff at least two of the
 * corresponding input bits are set. */
static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z)
{
    return (x & y) ^ (x & z) ^ (y & z);
}
/* FIPS 180-4 upper-case Sigma_0: XOR of three rotations of x. */
static inline uint32_t Sigma_0(uint32_t x)
{
    uint32_t r1 = ror(x, 2), r2 = ror(x, 13), r3 = ror(x, 22);
    return r1 ^ r2 ^ r3;
}
/* FIPS 180-4 upper-case Sigma_1: XOR of three rotations of x. */
static inline uint32_t Sigma_1(uint32_t x)
{
    uint32_t r1 = ror(x, 6), r2 = ror(x, 11), r3 = ror(x, 25);
    return r1 ^ r2 ^ r3;
}
/* FIPS 180-4 lower-case sigma_0: two rotations and a plain shift. */
static inline uint32_t sigma_0(uint32_t x)
{
    uint32_t r1 = ror(x, 7), r2 = ror(x, 18), sh = x >> 3;
    return r1 ^ r2 ^ sh;
}
/* FIPS 180-4 lower-case sigma_1: two rotations and a plain shift. */
static inline uint32_t sigma_1(uint32_t x)
{
    uint32_t r1 = ror(x, 17), r2 = ror(x, 19), sh = x >> 10;
    return r1 ^ r2 ^ sh;
}
/*
 * One SHA-256 round. The caller permutes the eight pointers between
 * calls instead of rotating the values, so only the two variables
 * that really change (*d and *h) are written here.
 */
static inline void sha256_sw_round(
    unsigned round_index, const uint32_t *schedule,
    uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
    uint32_t *e, uint32_t *f, uint32_t *g, uint32_t *h)
{
    uint32_t sum1 = *h + Sigma_1(*e) + Ch(*e, *f, *g) +
        sha256_round_constants[round_index] + schedule[round_index];
    uint32_t sum2 = Sigma_0(*a) + Maj(*a, *b, *c);

    *d += sum1;                 /* new e = d + T1 */
    *h = sum1 + sum2;           /* new a = T1 + T2 */
}
/*
 * Compress one 64-byte block into the 8-word chaining state 'core',
 * updating it in place.
 */
static void sha256_sw_block(uint32_t *core, const uint8_t *block)
{
    uint32_t w[SHA256_ROUNDS];
    uint32_t a,b,c,d,e,f,g,h;

    /* Message schedule: 16 words read straight from the block... */
    for (size_t t = 0; t < 16; t++)
        w[t] = GET_32BIT_MSB_FIRST(block + 4*t);

    /* ...and 48 more derived by the sigma recurrence. */
    for (size_t t = 16; t < SHA256_ROUNDS; t++)
        w[t] = sigma_1(w[t-2]) + w[t-7] + sigma_0(w[t-15]) + w[t-16];

    a = core[0]; b = core[1]; c = core[2]; d = core[3];
    e = core[4]; f = core[5]; g = core[6]; h = core[7];

    /* 64 rounds, eight at a time: each call rotates the argument
     * order instead of the eight working variables themselves. */
    for (size_t t = 0; t < SHA256_ROUNDS; t += 8) {
        sha256_sw_round(t+0, w, &a,&b,&c,&d,&e,&f,&g,&h);
        sha256_sw_round(t+1, w, &h,&a,&b,&c,&d,&e,&f,&g);
        sha256_sw_round(t+2, w, &g,&h,&a,&b,&c,&d,&e,&f);
        sha256_sw_round(t+3, w, &f,&g,&h,&a,&b,&c,&d,&e);
        sha256_sw_round(t+4, w, &e,&f,&g,&h,&a,&b,&c,&d);
        sha256_sw_round(t+5, w, &d,&e,&f,&g,&h,&a,&b,&c);
        sha256_sw_round(t+6, w, &c,&d,&e,&f,&g,&h,&a,&b);
        sha256_sw_round(t+7, w, &b,&c,&d,&e,&f,&g,&h,&a);
    }

    /* Feed-forward: add this block's result into the chaining state. */
    core[0] += a; core[1] += b; core[2] += c; core[3] += d;
    core[4] += e; core[5] += f; core[6] += g; core[7] += h;

    smemclr(w, sizeof(w));      /* don't leave message data on the stack */
}
/*
 * Context structure for the software SHA-256 implementation.
 */
typedef struct sha256_sw {
    uint32_t core[8];           /* chaining state H0..H7 */
    sha256_block blk;           /* input block buffer */
    BinarySink_IMPLEMENTATION;  /* lets callers feed data via put_* */
    ssh_hash hash;              /* public handle returned to callers */
} sha256_sw;

static void sha256_sw_write(BinarySink *bs, const void *vp, size_t len);
/* Allocate a software SHA-256 context (never fails to be available). */
static ssh_hash *sha256_sw_new(const ssh_hashalg *alg)
{
    sha256_sw *ctx = snew(sha256_sw);

    ctx->hash.vt = alg;
    BinarySink_INIT(ctx, sha256_sw_write);
    BinarySink_DELEGATE_INIT(&ctx->hash, ctx);
    return &ctx->hash;
}
/* Reset to the standard SHA-256 initial state with an empty buffer. */
static void sha256_sw_reset(ssh_hash *hash)
{
    sha256_sw *ctx = container_of(hash, sha256_sw, hash);

    for (size_t i = 0; i < 8; i++)
        ctx->core[i] = sha256_initial_state[i];
    sha256_block_setup(&ctx->blk);
}
/* Clone one context into another, fixing up the self-pointers. */
static void sha256_sw_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
    sha256_sw *dst = container_of(hcopy, sha256_sw, hash);
    sha256_sw *src = container_of(horig, sha256_sw, hash);

    *dst = *src;                /* structure copy of all state */
    BinarySink_COPIED(dst);
    BinarySink_DELEGATE_INIT(&dst->hash, dst);
}
/* Destroy a context, scrubbing hash state before freeing. */
static void sha256_sw_free(ssh_hash *hash)
{
    sha256_sw *ctx = container_of(hash, sha256_sw, hash);

    smemclr(ctx, sizeof(*ctx));
    sfree(ctx);
}
/* BinarySink callback: buffer bytes, compress each full block. */
static void sha256_sw_write(BinarySink *bs, const void *vp, size_t len)
{
    sha256_sw *ctx = BinarySink_DOWNCAST(bs, sha256_sw);

    while (len > 0) {
        bool block_full = sha256_block_write(&ctx->blk, &vp, &len);
        if (block_full)
            sha256_sw_block(ctx->core, ctx->blk.block);
    }
}
/* Finalise: pad, then emit the 8 state words big-endian (32 bytes). */
static void sha256_sw_digest(ssh_hash *hash, uint8_t *digest)
{
    sha256_sw *ctx = container_of(hash, sha256_sw, hash);

    sha256_block_pad(&ctx->blk, BinarySink_UPCAST(ctx));
    for (size_t word = 0; word < 8; word++, digest += 4)
        PUT_32BIT_MSB_FIRST(digest, ctx->core[word]);
}
/*
 * Vtable for the portable software SHA-256 implementation.
 */
const ssh_hashalg ssh_sha256_sw = {
    .new = sha256_sw_new,
    .reset = sha256_sw_reset,
    .copyfrom = sha256_sw_copyfrom,
    .digest = sha256_sw_digest,
    .free = sha256_sw_free,
    .hlen = 32,                 /* 256-bit digest */
    .blocklen = 64,             /* 512-bit message blocks */
    HASHALG_NAMES_ANNOTATED("SHA-256", "unaccelerated"),
};
/* ----------------------------------------------------------------------
* Hardware-accelerated implementation of SHA-256 using x86 SHA-NI.
*/
#if HW_SHA256 == HW_SHA256_NI
/*
* Set target architecture for Clang and GCC
*/
#if defined(__clang__) || defined(__GNUC__)
# define FUNC_ISA __attribute__ ((target("sse4.1,sha")))
#if !defined(__clang__)
# pragma GCC target("sha")
# pragma GCC target("sse4.1")
#endif
#else
# define FUNC_ISA
#endif
#include <wmmintrin.h>
#include <smmintrin.h>
#include <immintrin.h>
#if defined(__clang__) || defined(__GNUC__)
#include <shaintrin.h>
#endif
#if defined(__clang__) || defined(__GNUC__)
#include <cpuid.h>
#define GET_CPU_ID_0(out) \
__cpuid(0, (out)[0], (out)[1], (out)[2], (out)[3])
#define GET_CPU_ID_7(out) \
__cpuid_count(7, 0, (out)[0], (out)[1], (out)[2], (out)[3])
#else
#define GET_CPU_ID_0(out) __cpuid(out, 0)
#define GET_CPU_ID_7(out) __cpuidex(out, 7, 0)
#endif
/*
 * Runtime probe for the x86 SHA extensions: CPUID leaf 7, EBX bit 29.
 */
static bool sha256_hw_available(void)
{
    unsigned int CPUInfo[4];

    /* Leaf 0 reports the highest supported leaf; bail out if leaf 7
     * doesn't exist at all. */
    GET_CPU_ID_0(CPUInfo);
    if (CPUInfo[0] < 7)
        return false;

    GET_CPU_ID_7(CPUInfo);
    return CPUInfo[1] & (1 << 29); /* Check SHA */
}
/* SHA256 implementation using new instructions
The code is based on Jeffrey Walton's SHA256 implementation:
https://github.com/noloader/SHA-Intrinsics
*/
FUNC_ISA
/*
 * Compress one 64-byte block using the SHA-NI instructions.
 *
 * core[0] holds the state words A,B,E,F and core[1] holds C,D,G,H
 * (the operand layout _mm_sha256rnds2_epu32 expects). Each
 * _mm_sha256rnds2_epu32 performs two rounds, so each four-round group
 * below issues it twice, with the schedule+constant vector's high
 * half moved down by the 0x0E shuffle in between. The
 * sha256msg1/msg2 intrinsics and the alignr/add steps compute the
 * message schedule incrementally, four words at a time.
 */
static inline void sha256_ni_block(__m128i *core, const uint8_t *p)
{
    __m128i STATE0, STATE1;
    __m128i MSG, TMP;
    __m128i MSG0, MSG1, MSG2, MSG3;
    const __m128i *block = (const __m128i *)p;
    /* Shuffle mask converting big-endian message bytes into the
     * word order the round instructions want. */
    const __m128i MASK = _mm_set_epi64x(
        0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);

    /* Load initial values */
    STATE0 = core[0];
    STATE1 = core[1];

    /* Rounds 0-3 */
    MSG = _mm_loadu_si128(block);
    MSG0 = _mm_shuffle_epi8(MSG, MASK);
    MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(
                            0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

    /* Rounds 4-7 */
    MSG1 = _mm_loadu_si128(block + 1);
    MSG1 = _mm_shuffle_epi8(MSG1, MASK);
    MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(
                            0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);

    /* Rounds 8-11 */
    MSG2 = _mm_loadu_si128(block + 2);
    MSG2 = _mm_shuffle_epi8(MSG2, MASK);
    MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(
                            0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);

    /* Rounds 12-15 */
    MSG3 = _mm_loadu_si128(block + 3);
    MSG3 = _mm_shuffle_epi8(MSG3, MASK);
    MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(
                            0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
    MSG0 = _mm_add_epi32(MSG0, TMP);
    MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);

    /* Rounds 16-19 */
    MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(
                            0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
    MSG1 = _mm_add_epi32(MSG1, TMP);
    MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);

    /* Rounds 20-23 */
    MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(
                            0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
    MSG2 = _mm_add_epi32(MSG2, TMP);
    MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);

    /* Rounds 24-27 */
    MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(
                            0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
    MSG3 = _mm_add_epi32(MSG3, TMP);
    MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);

    /* Rounds 28-31 */
    MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(
                            0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
    MSG0 = _mm_add_epi32(MSG0, TMP);
    MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);

    /* Rounds 32-35 */
    MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(
                            0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
    MSG1 = _mm_add_epi32(MSG1, TMP);
    MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);

    /* Rounds 36-39 */
    MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(
                            0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
    MSG2 = _mm_add_epi32(MSG2, TMP);
    MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);

    /* Rounds 40-43 */
    MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(
                            0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
    MSG3 = _mm_add_epi32(MSG3, TMP);
    MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);

    /* Rounds 44-47 */
    MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(
                            0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
    MSG0 = _mm_add_epi32(MSG0, TMP);
    MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);

    /* Rounds 48-51 */
    MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(
                            0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
    MSG1 = _mm_add_epi32(MSG1, TMP);
    MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);

    /* Rounds 52-55 */
    MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(
                            0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
    MSG2 = _mm_add_epi32(MSG2, TMP);
    MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

    /* Rounds 56-59 */
    MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(
                            0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
    MSG3 = _mm_add_epi32(MSG3, TMP);
    MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

    /* Rounds 60-63 */
    MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(
                            0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

    /* Combine state */
    core[0] = _mm_add_epi32(STATE0, core[0]);
    core[1] = _mm_add_epi32(STATE1, core[1]);
}
typedef struct sha256_ni {
    /*
     * These two vectors store the 8 words of the SHA-256 state, but
     * not in the same order they appear in the spec: the first word
     * holds A,B,E,F and the second word C,D,G,H.
     */
    __m128i core[2];
    sha256_block blk;           /* input block buffer */
    void *pointer_to_free;      /* original (unaligned) allocation */
    BinarySink_IMPLEMENTATION;
    ssh_hash hash;              /* public handle returned to callers */
} sha256_ni;

static void sha256_ni_write(BinarySink *bs, const void *vp, size_t len);
static sha256_ni *sha256_ni_alloc(void)
{
    /*
     * The __m128i variables in the context structure need to be
     * 16-byte aligned, but not all malloc implementations that this
     * code has to work with will guarantee to return a 16-byte
     * aligned pointer. So we over-allocate, manually realign the
     * pointer ourselves, and store the original one inside the
     * context so we know how to free it later.
     */
    void *allocation = smalloc(sizeof(sha256_ni) + 15);
    uintptr_t alloc_address = (uintptr_t)allocation;
    /* Round up to the next multiple of 16. */
    uintptr_t aligned_address = (alloc_address + 15) & ~15;
    sha256_ni *s = (sha256_ni *)aligned_address;
    s->pointer_to_free = allocation;    /* remember the real base pointer */
    return s;
}
/*
 * Construct a SHA-NI context, or return NULL if the CPU lacks the
 * SHA extensions at run time.
 */
static ssh_hash *sha256_ni_new(const ssh_hashalg *alg)
{
    if (!sha256_hw_available_cached())
        return NULL;

    sha256_ni *ctx = sha256_ni_alloc();
    ctx->hash.vt = alg;
    BinarySink_INIT(ctx, sha256_ni_write);
    BinarySink_DELEGATE_INIT(&ctx->hash, ctx);
    return &ctx->hash;
}
FUNC_ISA static void sha256_ni_reset(ssh_hash *hash)
{
    sha256_ni *s = container_of(hash, sha256_ni, hash);

    /* Initialise the core vectors in their storage order: core[0]
     * holds A,B,E,F and core[1] holds C,D,G,H of the standard
     * SHA-256 initial state. */
    s->core[0] = _mm_set_epi64x(
        0x6a09e667bb67ae85ULL, 0x510e527f9b05688cULL);
    s->core[1] = _mm_set_epi64x(
        0x3c6ef372a54ff53aULL, 0x1f83d9ab5be0cd19ULL);

    sha256_block_setup(&s->blk);
}
/* Clone one context into another, preserving the copy's own
 * allocation pointer and fixing up the self-pointers. */
static void sha256_ni_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
    sha256_ni *dst = container_of(hcopy, sha256_ni, hash);
    sha256_ni *src = container_of(horig, sha256_ni, hash);

    /* The destination keeps its own over-allocation bookkeeping
     * pointer: stash it, clone everything, then put it back. */
    void *own_allocation = dst->pointer_to_free;
    *dst = *src;
    dst->pointer_to_free = own_allocation;

    BinarySink_COPIED(dst);
    BinarySink_DELEGATE_INIT(&dst->hash, dst);
}
/* Destroy a context: scrub the state, then free via the original
 * (unaligned) allocation pointer. */
static void sha256_ni_free(ssh_hash *hash)
{
    sha256_ni *ctx = container_of(hash, sha256_ni, hash);
    void *base = ctx->pointer_to_free;  /* save before the wipe */

    smemclr(ctx, sizeof(*ctx));
    sfree(base);
}
/* BinarySink callback: buffer bytes, compress each full block. */
static void sha256_ni_write(BinarySink *bs, const void *vp, size_t len)
{
    sha256_ni *ctx = BinarySink_DOWNCAST(bs, sha256_ni);

    while (len > 0) {
        bool block_full = sha256_block_write(&ctx->blk, &vp, &len);
        if (block_full)
            sha256_ni_block(ctx->core, ctx->blk.block);
    }
}
/*
 * Finalise: pad, unscramble the (ABEF, CDGH) register layout back
 * into A..H order, and emit 32 big-endian bytes.
 */
FUNC_ISA static void sha256_ni_digest(ssh_hash *hash, uint8_t *digest)
{
    sha256_ni *s = container_of(hash, sha256_ni, hash);

    sha256_block_pad(&s->blk, BinarySink_UPCAST(s));

    /* Rearrange the words into the output order */
    __m128i feba = _mm_shuffle_epi32(s->core[0], 0x1B);
    __m128i dchg = _mm_shuffle_epi32(s->core[1], 0xB1);
    __m128i dcba = _mm_blend_epi16(feba, dchg, 0xF0);
    __m128i hgfe = _mm_alignr_epi8(dchg, feba, 8);

    /* Byte-swap them into the output endianness */
    const __m128i mask = _mm_setr_epi8(3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12);
    dcba = _mm_shuffle_epi8(dcba, mask);
    hgfe = _mm_shuffle_epi8(hgfe, mask);

    /* And store them */
    __m128i *output = (__m128i *)digest;
    _mm_storeu_si128(output, dcba);
    _mm_storeu_si128(output+1, hgfe);
}
/*
 * Vtable for the SHA-NI accelerated SHA-256 implementation.
 */
const ssh_hashalg ssh_sha256_hw = {
    .new = sha256_ni_new,
    .reset = sha256_ni_reset,
    .copyfrom = sha256_ni_copyfrom,
    .digest = sha256_ni_digest,
    .free = sha256_ni_free,
    .hlen = 32,                 /* 256-bit digest */
    .blocklen = 64,             /* 512-bit message blocks */
    HASHALG_NAMES_ANNOTATED("SHA-256", "SHA-NI accelerated"),
};
/* ----------------------------------------------------------------------
* Hardware-accelerated implementation of SHA-256 using Arm NEON.
*/
#elif HW_SHA256 == HW_SHA256_NEON
/*
* Manually set the target architecture, if we decided above that we
* need to.
*/
#ifdef USE_CLANG_ATTR_TARGET_AARCH64
/*
* A spot of cheating: redefine some ACLE feature macros before
* including arm_neon.h. Otherwise we won't get the SHA intrinsics
* defined by that header, because it will be looking at the settings
* for the whole translation unit rather than the ones we're going to
* put on some particular functions using __attribute__((target)).
*/
#define __ARM_NEON 1
#define __ARM_FEATURE_CRYPTO 1
#define FUNC_ISA __attribute__ ((target("neon,crypto")))
#endif /* USE_CLANG_ATTR_TARGET_AARCH64 */
#ifndef FUNC_ISA
#define FUNC_ISA
#endif
#ifdef USE_ARM64_NEON_H
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
static bool sha256_hw_available(void)
{
    /*
     * For Arm, we delegate to a per-platform detection function (see
     * explanation in sshaes.c).
     */
    return platform_sha256_hw_available();
}
typedef struct sha256_neon_core sha256_neon_core;
/* Chaining state as two NEON registers: abcd holds words A..D and
 * efgh holds E..H. */
struct sha256_neon_core {
    uint32x4_t abcd, efgh;
};

FUNC_ISA
/* Load 16 big-endian message bytes as a vector of four 32-bit words. */
static inline uint32x4_t sha256_neon_load_input(const uint8_t *p)
{
    return vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(p)));
}

FUNC_ISA
/* Message-schedule step: derive the next four W words from the
 * previous sixteen. */
static inline uint32x4_t sha256_neon_schedule_update(
    uint32x4_t m4, uint32x4_t m3, uint32x4_t m2, uint32x4_t m1)
{
    return vsha256su1q_u32(vsha256su0q_u32(m4, m3), m2, m1);
}
FUNC_ISA
/*
 * Perform four SHA-256 rounds: 'sched' is the four W words for these
 * rounds and 'round' indexes the matching four round constants.
 */
static inline sha256_neon_core sha256_neon_round4(
    sha256_neon_core old, uint32x4_t sched, unsigned round)
{
    sha256_neon_core new;

    uint32x4_t round_input = vaddq_u32(
        sched, vld1q_u32(sha256_round_constants + round));
    new.abcd = vsha256hq_u32 (old.abcd, old.efgh, round_input);
    new.efgh = vsha256h2q_u32(old.efgh, old.abcd, round_input);
    return new;
}
FUNC_ISA
/*
 * Compress one 64-byte block: load the 16 message words, then run
 * 64 rounds in groups of four, generating the remaining 48 schedule
 * words on the fly, and finally add the result back into the
 * chaining state.
 */
static inline void sha256_neon_block(sha256_neon_core *core, const uint8_t *p)
{
    uint32x4_t s0, s1, s2, s3;
    sha256_neon_core cr = *core;

    /* Rounds 0-15: schedule words straight from the message block. */
    s0 = sha256_neon_load_input(p);
    cr = sha256_neon_round4(cr, s0, 0);
    s1 = sha256_neon_load_input(p+16);
    cr = sha256_neon_round4(cr, s1, 4);
    s2 = sha256_neon_load_input(p+32);
    cr = sha256_neon_round4(cr, s2, 8);
    s3 = sha256_neon_load_input(p+48);
    cr = sha256_neon_round4(cr, s3, 12);

    /* Rounds 16-63: schedule words derived by the recurrence. */
    s0 = sha256_neon_schedule_update(s0, s1, s2, s3);
    cr = sha256_neon_round4(cr, s0, 16);
    s1 = sha256_neon_schedule_update(s1, s2, s3, s0);
    cr = sha256_neon_round4(cr, s1, 20);
    s2 = sha256_neon_schedule_update(s2, s3, s0, s1);
    cr = sha256_neon_round4(cr, s2, 24);
    s3 = sha256_neon_schedule_update(s3, s0, s1, s2);
    cr = sha256_neon_round4(cr, s3, 28);
    s0 = sha256_neon_schedule_update(s0, s1, s2, s3);
    cr = sha256_neon_round4(cr, s0, 32);
    s1 = sha256_neon_schedule_update(s1, s2, s3, s0);
    cr = sha256_neon_round4(cr, s1, 36);
    s2 = sha256_neon_schedule_update(s2, s3, s0, s1);
    cr = sha256_neon_round4(cr, s2, 40);
    s3 = sha256_neon_schedule_update(s3, s0, s1, s2);
    cr = sha256_neon_round4(cr, s3, 44);
    s0 = sha256_neon_schedule_update(s0, s1, s2, s3);
    cr = sha256_neon_round4(cr, s0, 48);
    s1 = sha256_neon_schedule_update(s1, s2, s3, s0);
    cr = sha256_neon_round4(cr, s1, 52);
    s2 = sha256_neon_schedule_update(s2, s3, s0, s1);
    cr = sha256_neon_round4(cr, s2, 56);
    s3 = sha256_neon_schedule_update(s3, s0, s1, s2);
    cr = sha256_neon_round4(cr, s3, 60);

    /* Feed-forward into the chaining state. */
    core->abcd = vaddq_u32(core->abcd, cr.abcd);
    core->efgh = vaddq_u32(core->efgh, cr.efgh);
}
/*
 * Context structure for the NEON-accelerated SHA-256 implementation.
 */
typedef struct sha256_neon {
    sha256_neon_core core;      /* chaining state as NEON registers */
    sha256_block blk;           /* input block buffer */
    BinarySink_IMPLEMENTATION;
    ssh_hash hash;              /* public handle returned to callers */
} sha256_neon;

static void sha256_neon_write(BinarySink *bs, const void *vp, size_t len);
/*
 * Construct a NEON SHA-256 context, or return NULL if the crypto
 * extension is absent at run time.
 */
static ssh_hash *sha256_neon_new(const ssh_hashalg *alg)
{
    if (!sha256_hw_available_cached())
        return NULL;

    sha256_neon *ctx = snew(sha256_neon);
    ctx->hash.vt = alg;
    BinarySink_INIT(ctx, sha256_neon_write);
    BinarySink_DELEGATE_INIT(&ctx->hash, ctx);
    return &ctx->hash;
}
/* Reset to the standard SHA-256 initial state, empty buffer. */
static void sha256_neon_reset(ssh_hash *hash)
{
    sha256_neon *s = container_of(hash, sha256_neon, hash);

    /* First four state words in one register, last four in the other. */
    s->core.abcd = vld1q_u32(sha256_initial_state);
    s->core.efgh = vld1q_u32(sha256_initial_state + 4);

    sha256_block_setup(&s->blk);
}
/* Clone one context into another, fixing up the self-pointers. */
static void sha256_neon_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
    sha256_neon *dst = container_of(hcopy, sha256_neon, hash);
    sha256_neon *src = container_of(horig, sha256_neon, hash);

    *dst = *src;                /* clone all state wholesale */
    BinarySink_COPIED(dst);
    BinarySink_DELEGATE_INIT(&dst->hash, dst);
}
/* Destroy a context, scrubbing the state before freeing. */
static void sha256_neon_free(ssh_hash *hash)
{
    sha256_neon *ctx = container_of(hash, sha256_neon, hash);

    smemclr(ctx, sizeof(*ctx));
    sfree(ctx);
}
/* BinarySink callback: buffer bytes, compress each full block. */
static void sha256_neon_write(BinarySink *bs, const void *vp, size_t len)
{
    sha256_neon *ctx = BinarySink_DOWNCAST(bs, sha256_neon);

    while (len > 0) {
        bool block_full = sha256_block_write(&ctx->blk, &vp, &len);
        if (block_full)
            sha256_neon_block(&ctx->core, ctx->blk.block);
    }
}
/* Finalise: pad, then emit the state big-endian, 16 bytes per vector. */
static void sha256_neon_digest(ssh_hash *hash, uint8_t *digest)
{
    sha256_neon *s = container_of(hash, sha256_neon, hash);

    sha256_block_pad(&s->blk, BinarySink_UPCAST(s));

    /* vrev32q_u8 byte-swaps each 32-bit word to big-endian. */
    vst1q_u8(digest,      vrev32q_u8(vreinterpretq_u8_u32(s->core.abcd)));
    vst1q_u8(digest + 16, vrev32q_u8(vreinterpretq_u8_u32(s->core.efgh)));
}
/*
 * Vtable for the NEON-accelerated SHA-256 implementation.
 */
const ssh_hashalg ssh_sha256_hw = {
    .new = sha256_neon_new,
    .reset = sha256_neon_reset,
    .copyfrom = sha256_neon_copyfrom,
    .digest = sha256_neon_digest,
    .free = sha256_neon_free,
    .hlen = 32,                 /* 256-bit digest */
    .blocklen = 64,             /* 512-bit message blocks */
    HASHALG_NAMES_ANNOTATED("SHA-256", "NEON accelerated"),
};
/* ----------------------------------------------------------------------
* Stub functions if we have no hardware-accelerated SHA-256. In this
* case, sha256_hw_new returns NULL (though it should also never be
* selected by sha256_select, so the only thing that should even be
* _able_ to call it is testcrypt). As a result, the remaining vtable
* functions should never be called at all.
*/
#elif HW_SHA256 == HW_SHA256_NONE

/* With no hardware implementation compiled in, the availability
 * query is always negative. */
static bool sha256_hw_available(void)
{
    return false;
}

/* Constructor returns NULL, so sha256_select never chooses this
 * implementation; only testcrypt can even attempt to. */
static ssh_hash *sha256_stub_new(const ssh_hashalg *alg)
{
    return NULL;
}

/* Shared body for the vtable methods that can never be reached. */
#define STUB_BODY { unreachable("Should never be called"); }

static void sha256_stub_reset(ssh_hash *hash) STUB_BODY
static void sha256_stub_copyfrom(ssh_hash *hash, ssh_hash *orig) STUB_BODY
static void sha256_stub_free(ssh_hash *hash) STUB_BODY
static void sha256_stub_digest(ssh_hash *hash, uint8_t *digest) STUB_BODY

/* Placeholder vtable so the ssh_sha256_hw symbol still exists. */
const ssh_hashalg ssh_sha256_hw = {
    .new = sha256_stub_new,
    .reset = sha256_stub_reset,
    .copyfrom = sha256_stub_copyfrom,
    .digest = sha256_stub_digest,
    .free = sha256_stub_free,
    .hlen = 32,
    .blocklen = 64,
    HASHALG_NAMES_ANNOTATED("SHA-256", "!NONEXISTENT ACCELERATED VERSION!"),
};

#endif /* HW_SHA256 */

329
crypto/sha3.c Normal file
View File

@ -0,0 +1,329 @@
/*
* SHA-3, as defined in FIPS PUB 202.
*/
#include <assert.h>
#include <string.h>
#include "ssh.h"
/*
 * Rotate a 64-bit word left by 'shift' bits. Both shift counts are
 * reduced mod 64, so neither can equal the type width (which would
 * be undefined behaviour), and shift == 0 returns x unchanged.
 */
static inline uint64_t rol(uint64_t x, unsigned shift)
{
    unsigned left = shift & 63;
    unsigned right = (64 - shift) & 63;
    return (x << left) | (x >> right);
}
/*
 * General Keccak is defined such that its state is a 5x5 array of
 * words which can be any power-of-2 size from 1 up to 64. SHA-3 fixes
 * on 64, and so do we.
 *
 * The number of rounds is defined as 12 + 2k if the word size is 2^k.
 * Here we have 64-bit words only, so k=6, so 24 rounds always.
 */
typedef uint64_t keccak_core_state[5][5];
#define NROUNDS 24 /* would differ for other word sizes */

/* Forward declarations; the actual tables are defined further down. */
static const uint64_t round_constants[NROUNDS];
static const unsigned rotation_counts[5][5];
/*
* Core Keccak transform: just squodge the state around internally,
* without adding or extracting any data from it.
*/
static void keccak_transform(keccak_core_state A)
{
    union {
        uint64_t C[5];          /* theta column parities */
        uint64_t B[5][5];       /* rho/pi intermediate state */
    } u;

    for (unsigned round = 0; round < NROUNDS; round++) {
        /* theta step: XOR each lane with a function of two column
         * parities */
        for (unsigned x = 0; x < 5; x++)
            u.C[x] = A[x][0] ^ A[x][1] ^ A[x][2] ^ A[x][3] ^ A[x][4];
        for (unsigned x = 0; x < 5; x++) {
            uint64_t D = rol(u.C[(x+1) % 5], 1) ^ u.C[(x+4) % 5];
            for (unsigned y = 0; y < 5; y++)
                A[x][y] ^= D;
        }

        /* rho and pi steps: rotate each lane and permute their
         * positions in one pass */
        for (unsigned x = 0; x < 5; x++)
            for (unsigned y = 0; y < 5; y++)
                u.B[y][(2*x+3*y) % 5] = rol(A[x][y], rotation_counts[x][y]);

        /* chi step: the only nonlinear operation, mixing each lane
         * with two others in its row */
        for (unsigned x = 0; x < 5; x++)
            for (unsigned y = 0; y < 5; y++)
                A[x][y] = u.B[x][y] ^ (u.B[(x+2)%5][y] & ~u.B[(x+1)%5][y]);

        /* iota step: break symmetry with a per-round constant */
        A[0][0] ^= round_constants[round];
    }

    smemclr(&u, sizeof(u));     /* don't leave state data on the stack */
}
typedef struct {
    keccak_core_state A;        /* the 5x5 array of 64-bit lanes */
    unsigned char bytes[25*8];  /* staging buffer for partial input */
    unsigned char first_pad_byte; /* domain-separation + first pad bit */
    /* bytes_got: how much of 'bytes' is filled; bytes_wanted: how
     * many bytes are absorbed per permutation (the sponge rate);
     * hash_bytes: output length. */
    size_t bytes_got, bytes_wanted, hash_bytes;
} keccak_state;
/*
* Keccak accumulation function: given a piece of message, add it to
* the hash.
*/
static void keccak_accumulate(keccak_state *s, const void *vdata, size_t len)
{
    const unsigned char *data = (const unsigned char *)vdata;

    /* Absorb as many complete rate-sized chunks as the input allows. */
    while (len >= s->bytes_wanted - s->bytes_got) {
        size_t b = s->bytes_wanted - s->bytes_got;
        memcpy(s->bytes + s->bytes_got, data, b);
        len -= b;
        data += b;

        /* XOR the buffered bytes into the state as little-endian
         * lanes, in x-then-y order, stopping after bytes_wanted. */
        size_t n = 0;
        for (unsigned y = 0; y < 5; y++) {
            for (unsigned x = 0; x < 5; x++) {
                if (n >= s->bytes_wanted)
                    break;
                s->A[x][y] ^= GET_64BIT_LSB_FIRST(s->bytes + n);
                n += 8;
            }
        }
        keccak_transform(s->A);

        s->bytes_got = 0;
    }

    /* Stash whatever is left for a later call. */
    memcpy(s->bytes + s->bytes_got, data, len);
    s->bytes_got += len;
}
/*
 * Keccak output function.
 *
 * Appends the pad10*1 padding (with the algorithm's domain-separation
 * bits folded into the first pad byte) and extracts hash_bytes of
 * output from the state, lanes stored little-endian.
 *
 * Note that no extra permutation is run between output lanes, so this
 * can produce at most one rate's worth of output; every algorithm
 * defined in this file satisfies that constraint. Also, this mutates
 * the state (via the final accumulate), so it finalises the hash.
 */
static void keccak_output(keccak_state *s, void *voutput)
{
    unsigned char *output = (unsigned char *)voutput;

    /*
     * Add message padding.
     */
    {
        unsigned char padding[25*8];
        size_t len = s->bytes_wanted - s->bytes_got;
        if (len == 0)
            len = s->bytes_wanted;      /* never emit an empty pad */
        memset(padding, 0, len);
        padding[0] |= s->first_pad_byte; /* domain bits + first '1' bit */
        padding[len-1] |= 0x80;          /* final '1' bit of pad10*1 */
        keccak_accumulate(s, padding, len);
    }

    size_t n = 0;
    for (unsigned y = 0; y < 5; y++) {
        for (unsigned x = 0; x < 5; x++) {
            size_t to_copy = s->hash_bytes - n;
            if (to_copy == 0)
                break;
            if (to_copy > 8)
                to_copy = 8;            /* final partial lane */
            unsigned char outbytes[8];
            PUT_64BIT_LSB_FIRST(outbytes, s->A[x][y]);
            memcpy(output + n, outbytes, to_copy);
            n += to_copy;
        }
    }
}
/*
 * Initialise a Keccak sponge.
 *
 * 'hashbits' is the output length in bits. Note that, despite its
 * name, 'ratebits' is actually the sponge's *capacity* c in bits: the
 * rate used below is 1600 - c. 'first_pad_byte' carries the
 * domain-separation bits for the particular algorithm variant.
 */
static void keccak_init(keccak_state *s, unsigned hashbits, unsigned ratebits,
                        unsigned char first_pad_byte)
{
    assert(hashbits % 8 == 0);
    assert(ratebits % 8 == 0);

    s->hash_bytes = hashbits / 8;
    s->bytes_wanted = (25 * 64 - ratebits) / 8;
    s->first_pad_byte = first_pad_byte;
    s->bytes_got = 0;

    assert(s->bytes_wanted % 8 == 0);

    /* Start with an all-zero sponge state */
    for (unsigned column = 0; column < 5; column++)
        for (unsigned row = 0; row < 5; row++)
            s->A[column][row] = 0;
}
/* SHA-3: capacity = 2 x output length; domain suffix '01' (byte 0x06
 * once combined with the first padding bit), per FIPS 202. */
static void keccak_sha3_init(keccak_state *s, int hashbits)
{
    keccak_init(s, hashbits, hashbits * 2, 0x06);
}

/* SHAKE: capacity = 2 x security parameter, independent of the output
 * length; domain suffix '1111' (byte 0x1f with the padding bit). */
static void keccak_shake_init(keccak_state *s, int parambits, int hashbits)
{
    keccak_init(s, hashbits, parambits * 2, 0x1f);
}
/*
* Keccak round constants, generated via the LFSR specified in the
* Keccak reference by the following piece of Python:
import textwrap
from functools import reduce
rbytes = [1]
while len(rbytes) < 7*24:
k = rbytes[-1] * 2
rbytes.append(k ^ (0x171 * (k >> 8)))
rbits = [byte & 1 for byte in rbytes]
rwords = [sum(rbits[i+j] << ((1 << j) - 1) for j in range(7))
for i in range(0, len(rbits), 7)]
print(textwrap.indent("\n".join(textwrap.wrap(", ".join(
map("0x{:016x}".format, rwords)))), " "*4))
*/
/* Output of the LFSR generator script quoted in the comment above. */
static const uint64_t round_constants[24] = {
    0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
    0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
    0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
    0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
    0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
    0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
    0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
    0x8000000000008080, 0x0000000080000001, 0x8000000080008008
};
/*
* Keccak per-element rotation counts, generated from the matrix
* formula in the Keccak reference by the following piece of Python:
coords = [1, 0]
while len(coords) < 26:
coords.append((2*coords[-2] + 3*coords[-1]) % 5)
matrix = { (coords[i], coords[i+1]) : i for i in range(24) }
matrix[0,0] = -1
f = lambda t: (t+1) * (t+2) // 2 % 64
for y in range(5):
print(" {{{}}},".format(", ".join("{:2d}".format(f(matrix[y,x]))
for x in range(5))))
*/
/* Per-lane rotation counts for the rho step, indexed [y][x] here as
 * generated by the script in the comment above. */
static const unsigned rotation_counts[5][5] = {
    { 0, 36,  3, 41, 18},
    { 1, 44, 10, 45,  2},
    {62,  6, 43, 15, 61},
    {28, 55, 25, 21, 56},
    {27, 20, 39,  8, 14},
};
/*
* The PuTTY ssh_hashalg abstraction.
*/
/* Wrapper binding a keccak_state into PuTTY's ssh_hash vtable system. */
struct keccak_hash {
    keccak_state state;
    ssh_hash hash;              /* the vtable-facing object */
    BinarySink_IMPLEMENTATION;  /* so callers can put_data() into the hash */
};

/* BinarySink write method: feed incoming bytes straight to the sponge. */
static void keccak_BinarySink_write(BinarySink *bs, const void *p, size_t len)
{
    struct keccak_hash *kh = BinarySink_DOWNCAST(bs, struct keccak_hash);
    keccak_accumulate(&kh->state, p, len);
}
/*
 * Allocate a new hash object for any Keccak-family algorithm. The
 * variant-specific setup happens in the vtable's reset method, which
 * we invoke via ssh_hash_reset before returning.
 */
static ssh_hash *keccak_new(const ssh_hashalg *alg)
{
    struct keccak_hash *kh = snew(struct keccak_hash);
    kh->hash.vt = alg;
    BinarySink_INIT(kh, keccak_BinarySink_write);
    BinarySink_DELEGATE_INIT(&kh->hash, kh);
    return ssh_hash_reset(&kh->hash);
}
/* Free a hash object, scrubbing the state first since it is
 * key/message-derived secret data. */
static void keccak_free(ssh_hash *hash)
{
    struct keccak_hash *kh = container_of(hash, struct keccak_hash, hash);
    smemclr(kh, sizeof(*kh));
    sfree(kh);
}

/* Clone the sponge state of one hash object into another (both must
 * already exist; only the keccak_state is copied). */
static void keccak_copyfrom(ssh_hash *hnew, ssh_hash *hold)
{
    struct keccak_hash *khold = container_of(hold, struct keccak_hash, hash);
    struct keccak_hash *khnew = container_of(hnew, struct keccak_hash, hash);
    khnew->state = khold->state;
}

/* Finalise the hash and write vt->hlen bytes of output. */
static void keccak_digest(ssh_hash *hash, unsigned char *output)
{
    struct keccak_hash *kh = container_of(hash, struct keccak_hash, hash);
    keccak_output(&kh->state, output);
}
/* Reset method for the SHA-3 variants: output length comes from the
 * vtable, and the capacity is derived from it inside keccak_sha3_init. */
static void sha3_reset(ssh_hash *hash)
{
    struct keccak_hash *kh = container_of(hash, struct keccak_hash, hash);
    keccak_sha3_init(&kh->state, hash->vt->hlen * 8);
}

/* Instantiate one SHA-3 vtable. blocklen is the sponge rate in bytes,
 * 200 - 2*hlen, which is what HMAC would use as the block size. */
#define DEFINE_SHA3(bits)                       \
    const ssh_hashalg ssh_sha3_##bits = {       \
        .new = keccak_new,                      \
        .reset = sha3_reset,                    \
        .copyfrom = keccak_copyfrom,            \
        .digest = keccak_digest,                \
        .free = keccak_free,                    \
        .hlen = bits/8,                         \
        .blocklen = 200 - 2*(bits/8),           \
        HASHALG_NAMES_BARE("SHA3-" #bits),      \
    }

DEFINE_SHA3(224);
DEFINE_SHA3(256);
DEFINE_SHA3(384);
DEFINE_SHA3(512);
/* Reset method for SHAKE256: fixed 256-bit security parameter, with
 * the (byte-denominated, see comment below) output length from the
 * vtable. */
static void shake256_reset(ssh_hash *hash)
{
    struct keccak_hash *kh = container_of(hash, struct keccak_hash, hash);
    keccak_shake_init(&kh->state, 256, hash->vt->hlen * 8);
}
/*
* There is some confusion over the output length parameter for the
* SHAKE functions. By my reading, FIPS PUB 202 defines SHAKE256(M,d)
* to generate d _bits_ of output. But RFC 8032 (defining Ed448) talks
* about "SHAKE256(x,114)" in a context where it definitely means
* generating 114 _bytes_ of output.
*
* Our internal ID therefore suffixes the output length with "bytes",
 * to be clear which we're talking about.
*/
/* Instantiate one SHAKE vtable. blocklen is left 0: SHAKE is used as
 * an XOF here, not as an HMAC-style block hash. */
#define DEFINE_SHAKE(param, hashbytes)                          \
    const ssh_hashalg ssh_shake##param##_##hashbytes##bytes = { \
        .new = keccak_new,                                      \
        .reset = shake##param##_reset,                          \
        .copyfrom = keccak_copyfrom,                            \
        .digest = keccak_digest,                                \
        .free = keccak_free,                                    \
        .hlen = hashbytes,                                      \
        .blocklen = 0,                                          \
        HASHALG_NAMES_BARE("SHAKE" #param),                     \
    }

DEFINE_SHAKE(256, 114); /* the output length Ed448 (RFC 8032) needs */

836
crypto/sha512.c Normal file
View File

@ -0,0 +1,836 @@
/*
* SHA-512 algorithm as described at
*
* http://csrc.nist.gov/cryptval/shs.html
*
* Modifications made for SHA-384 also
*/
#include <assert.h>
#include "ssh.h"
/*
* Start by deciding whether we can support hardware SHA at all.
*/
#define HW_SHA512_NONE 0
#define HW_SHA512_NEON 1

#ifdef _FORCE_SHA512_NEON
#   define HW_SHA512 HW_SHA512_NEON
#elif defined __BYTE_ORDER__ && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    /* Arm can potentially support both endiannesses, but this code
     * hasn't been tested on anything but little. If anyone wants to
     * run big-endian, they'll need to fix it first.
     * (Deliberately defining nothing here: HW_SHA512 stays undefined,
     * and the catch-all below selects HW_SHA512_NONE.) */
#elif defined __ARM_FEATURE_SHA512
    /* If the Arm SHA-512 extension is available already, we can
     * support NEON SHA without having to enable anything by hand */
#   define HW_SHA512 HW_SHA512_NEON
#elif defined(__clang__)
#   if __has_attribute(target) && __has_include(<arm_neon.h>) &&        \
    (defined(__aarch64__))
        /* clang can enable the crypto extension in AArch64 using
         * __attribute__((target)) */
#       define HW_SHA512 HW_SHA512_NEON
#       define USE_CLANG_ATTR_TARGET_AARCH64
#   endif
#endif

/* Fall back to pure software if no hardware path was chosen above. */
#if defined _FORCE_SOFTWARE_SHA || !defined HW_SHA512
#   undef HW_SHA512
#   define HW_SHA512 HW_SHA512_NONE
#endif
/*
* The actual query function that asks if hardware acceleration is
* available.
*/
static bool sha512_hw_available(void);
/*
* The top-level selection function, caching the results of
* sha512_hw_available() so it only has to run once.
*/
static bool sha512_hw_available_cached(void)
{
    /* NOTE(review): this lazy-init cache is not thread-safe; it
     * presumably relies on PuTTY's single-threaded use — confirm
     * before calling from multiple threads. */
    static bool initialised = false;
    static bool hw_available;
    if (!initialised) {
        hw_available = sha512_hw_available();
        initialised = true;
    }
    return hw_available;
}
/* Pair of concrete vtables a dummy selector vtable can choose from. */
struct sha512_select_options {
    const ssh_hashalg *hw, *sw;
};

/* 'new' method of the selector vtables: pick the hardware or software
 * implementation (via the options stashed in vt->extra) and construct
 * an instance of that instead. */
static ssh_hash *sha512_select(const ssh_hashalg *alg)
{
    const struct sha512_select_options *options =
        (const struct sha512_select_options *)alg->extra;

    const ssh_hashalg *real_alg =
        sha512_hw_available_cached() ? options->hw : options->sw;

    return ssh_hash_new(real_alg);
}
const struct sha512_select_options ssh_sha512_select_options = {
    &ssh_sha512_hw, &ssh_sha512_sw,
};
const struct sha512_select_options ssh_sha384_select_options = {
    &ssh_sha384_hw, &ssh_sha384_sw,
};

/* Public-facing vtables: only .new is real; it delegates to whichever
 * concrete implementation sha512_select chooses at run time. */
const ssh_hashalg ssh_sha512 = {
    .new = sha512_select,
    .hlen = 64,
    .blocklen = 128,
    HASHALG_NAMES_ANNOTATED("SHA-512", "dummy selector vtable"),
    .extra = &ssh_sha512_select_options,
};

const ssh_hashalg ssh_sha384 = {
    .new = sha512_select,
    .hlen = 48,
    .blocklen = 128,
    HASHALG_NAMES_ANNOTATED("SHA-384", "dummy selector vtable"),
    .extra = &ssh_sha384_select_options,
};
/* ----------------------------------------------------------------------
* Definitions likely to be helpful to multiple implementations.
*/
/* Initial hash value H(0) for SHA-512 (FIPS 180-4, section 5.3.5). */
static const uint64_t sha512_initial_state[] = {
    0x6a09e667f3bcc908ULL,
    0xbb67ae8584caa73bULL,
    0x3c6ef372fe94f82bULL,
    0xa54ff53a5f1d36f1ULL,
    0x510e527fade682d1ULL,
    0x9b05688c2b3e6c1fULL,
    0x1f83d9abfb41bd6bULL,
    0x5be0cd19137e2179ULL,
};

/* Initial hash value H(0) for SHA-384 (FIPS 180-4, section 5.3.4). */
static const uint64_t sha384_initial_state[] = {
    0xcbbb9d5dc1059ed8ULL,
    0x629a292a367cd507ULL,
    0x9159015a3070dd17ULL,
    0x152fecd8f70e5939ULL,
    0x67332667ffc00b31ULL,
    0x8eb44a8768581511ULL,
    0xdb0c2e0d64f98fa7ULL,
    0x47b5481dbefa4fa4ULL,
};

/* The 80 round constants K, shared by SHA-512 and SHA-384. */
static const uint64_t sha512_round_constants[] = {
    0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL,
    0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
    0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
    0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL,
    0xd807aa98a3030242ULL, 0x12835b0145706fbeULL,
    0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
    0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL,
    0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL,
    0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
    0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL,
    0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL,
    0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
    0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL,
    0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL,
    0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
    0x06ca6351e003826fULL, 0x142929670a0e6e70ULL,
    0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL,
    0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
    0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL,
    0x81c2c92e47edaee6ULL, 0x92722c851482353bULL,
    0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
    0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL,
    0xd192e819d6ef5218ULL, 0xd69906245565a910ULL,
    0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
    0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL,
    0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL,
    0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
    0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL,
    0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL,
    0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
    0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL,
    0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL,
    0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
    0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL,
    0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL,
    0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
    0x28db77f523047d84ULL, 0x32caab7b40c72493ULL,
    0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL,
    0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
    0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL,
};
#define SHA512_ROUNDS 80

/* Shared block-buffering machinery: accumulates input into 128-byte
 * blocks and tracks the total message length as a 128-bit bit count
 * (lenhi:lenlo), as required by the SHA-512 padding rules. */
typedef struct sha512_block sha512_block;
struct sha512_block {
    uint8_t block[128];
    size_t used;                /* bytes of 'block' currently occupied */
    uint64_t lenhi, lenlo;      /* total message length so far, in bits */
};

static inline void sha512_block_setup(sha512_block *blk)
{
    blk->used = 0;
    blk->lenhi = blk->lenlo = 0;
}

/* Consume input from (*vdata, *len) into the block buffer, advancing
 * both. Returns true when blk->block holds a complete 128-byte block
 * ready to be compressed (and resets 'used' for the next one). */
static inline bool sha512_block_write(
    sha512_block *blk, const void **vdata, size_t *len)
{
    size_t blkleft = sizeof(blk->block) - blk->used;
    size_t chunk = *len < blkleft ? *len : blkleft;

    const uint8_t *p = *vdata;
    memcpy(blk->block + blk->used, p, chunk);
    *vdata = p + chunk;
    *len -= chunk;
    blk->used += chunk;

    /* 128-bit add of chunk*8 bits, with carry into lenhi */
    size_t chunkbits = chunk << 3;
    blk->lenlo += chunkbits;
    blk->lenhi += (blk->lenlo < chunkbits);

    if (blk->used == sizeof(blk->block)) {
        blk->used = 0;
        return true;
    }

    return false;
}
/*
 * Append the SHA-512 padding: a 0x80 byte, then zeroes, then the
 * 128-bit bit count, sized so the total message is a whole number of
 * 128-byte blocks. The pad bytes are pushed back through the
 * BinarySink so they go through the normal block machinery.
 */
static inline void sha512_block_pad(sha512_block *blk, BinarySink *bs)
{
    /* Snapshot the length first: writing the padding itself would
     * otherwise perturb it */
    uint64_t final_lenhi = blk->lenhi;
    uint64_t final_lenlo = blk->lenlo;
    /* 111 = 128 - (1 byte of 0x80 + 16 bytes of length); the & 127
     * wraps negative values into the next block */
    size_t pad = 127 & (111 - blk->used);

    put_byte(bs, 0x80);
    put_padding(bs, pad, 0);
    put_uint64(bs, final_lenhi);
    put_uint64(bs, final_lenlo);

    assert(blk->used == 0 && "Should have exactly hit a block boundary");
}
/* ----------------------------------------------------------------------
* Software implementation of SHA-512.
*/
/*
 * The six logical functions of FIPS 180-4 section 4.1.3, plus a
 * 64-bit right-rotate primitive.
 */

/* Rotate right by y bits. Both shift counts are masked into 0..63, so
 * no individual shift is ever by the full word width (which would be
 * undefined behaviour); together they still cover all 64 bits. */
static inline uint64_t ror(uint64_t x, unsigned y)
{
    unsigned rshift = y & 63, lshift = -y & 63;
    return (x >> rshift) | (x << lshift);
}

/* Bitwise choice: each output bit comes from if1 where the
 * corresponding ctrl bit is 1, and from if0 where it is 0. */
static inline uint64_t Ch(uint64_t ctrl, uint64_t if1, uint64_t if0)
{
    return (ctrl & if1) | (~ctrl & if0);
}

/* Bitwise majority vote of the three inputs. */
static inline uint64_t Maj(uint64_t x, uint64_t y, uint64_t z)
{
    return (x & y) ^ (x & z) ^ (y & z);
}

/* The two 'big Sigma' functions, used on the working variables. */
static inline uint64_t Sigma_0(uint64_t x)
{
    return ror(x, 39) ^ ror(x, 34) ^ ror(x, 28);
}

static inline uint64_t Sigma_1(uint64_t x)
{
    return ror(x, 41) ^ ror(x, 18) ^ ror(x, 14);
}

/* The two 'small sigma' functions, used in the message schedule. */
static inline uint64_t sigma_0(uint64_t x)
{
    return (x >> 7) ^ ror(x, 8) ^ ror(x, 1);
}

static inline uint64_t sigma_1(uint64_t x)
{
    return (x >> 6) ^ ror(x, 61) ^ ror(x, 19);
}
/*
 * One round of the SHA-512 compression function. The caller rotates
 * which working variables it passes for a..h on successive calls, so
 * only the two registers a real round modifies (d and h) are written
 * here; everything else just shifts role.
 */
static inline void sha512_sw_round(
    unsigned round_index, const uint64_t *schedule,
    uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d,
    uint64_t *e, uint64_t *f, uint64_t *g, uint64_t *h)
{
    uint64_t t1 = *h + Sigma_1(*e) + Ch(*e,*f,*g) +
        sha512_round_constants[round_index] + schedule[round_index];

    uint64_t t2 = Sigma_0(*a) + Maj(*a,*b,*c);

    *d += t1;
    *h = t1 + t2;
}
/*
 * Compress one 128-byte block into the 8-word hash state 'core':
 * expand the block into the 80-word message schedule, run all 80
 * rounds (unrolled 8 at a time with rotating register roles, so the
 * variables never need physically shuffling), then add the working
 * variables back into the state.
 */
static void sha512_sw_block(uint64_t *core, const uint8_t *block)
{
    uint64_t w[SHA512_ROUNDS];
    uint64_t a,b,c,d,e,f,g,h;

    /* Message schedule: first 16 words straight from the block
     * (big-endian), the rest derived per FIPS 180-4 */
    for (t = 0; t < 16; t++)
        w[t] = GET_64BIT_MSB_FIRST(block + 8*t);

    for (t = 16; t < SHA512_ROUNDS; t++)
        w[t] = w[t-16] + w[t-7] + sigma_0(w[t-15]) + sigma_1(w[t-2]);

    a = core[0]; b = core[1]; c = core[2]; d = core[3];
    e = core[4]; f = core[5]; g = core[6]; h = core[7];

    for (t = 0; t < SHA512_ROUNDS; t+=8) {
        sha512_sw_round(t+0, w, &a,&b,&c,&d,&e,&f,&g,&h);
        sha512_sw_round(t+1, w, &h,&a,&b,&c,&d,&e,&f,&g);
        sha512_sw_round(t+2, w, &g,&h,&a,&b,&c,&d,&e,&f);
        sha512_sw_round(t+3, w, &f,&g,&h,&a,&b,&c,&d,&e);
        sha512_sw_round(t+4, w, &e,&f,&g,&h,&a,&b,&c,&d);
        sha512_sw_round(t+5, w, &d,&e,&f,&g,&h,&a,&b,&c);
        sha512_sw_round(t+6, w, &c,&d,&e,&f,&g,&h,&a,&b);
        sha512_sw_round(t+7, w, &b,&c,&d,&e,&f,&g,&h,&a);
    }

    core[0] += a; core[1] += b; core[2] += c; core[3] += d;
    core[4] += e; core[5] += f; core[6] += g; core[7] += h;

    /* The schedule is derived from message data, so scrub it */
    smemclr(w, sizeof(w));
}
/* Software SHA-512/384 instance: hash state, block buffer, and the
 * vtable-facing ssh_hash with BinarySink for data input. */
typedef struct sha512_sw {
    uint64_t core[8];
    sha512_block blk;
    BinarySink_IMPLEMENTATION;
    ssh_hash hash;
} sha512_sw;

static void sha512_sw_write(BinarySink *bs, const void *vp, size_t len);

static ssh_hash *sha512_sw_new(const ssh_hashalg *alg)
{
    sha512_sw *s = snew(sha512_sw);

    s->hash.vt = alg;
    BinarySink_INIT(s, sha512_sw_write);
    BinarySink_DELEGATE_INIT(&s->hash, s);
    /* NOTE(review): core/blk are not initialised here; presumably the
     * caller (ssh_hash_new) invokes the reset method — confirm. */
    return &s->hash;
}
/* Reset to the algorithm's initial state (SHA-512 or SHA-384 IV,
 * selected by the vtable's extra pointer). */
static void sha512_sw_reset(ssh_hash *hash)
{
    sha512_sw *s = container_of(hash, sha512_sw, hash);

    /* The 'extra' field in the ssh_hashalg indicates which
     * initialisation vector we're using */
    memcpy(s->core, hash->vt->extra, sizeof(s->core));
    sha512_block_setup(&s->blk);
}

/* Duplicate the full hash state into another existing instance. */
static void sha512_sw_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
    sha512_sw *copy = container_of(hcopy, sha512_sw, hash);
    sha512_sw *orig = container_of(horig, sha512_sw, hash);

    memcpy(copy, orig, sizeof(*copy));
    /* Re-point the internal BinarySink plumbing at the copy */
    BinarySink_COPIED(copy);
    BinarySink_DELEGATE_INIT(&copy->hash, copy);
}

/* Free an instance, scrubbing the secret hash state first. */
static void sha512_sw_free(ssh_hash *hash)
{
    sha512_sw *s = container_of(hash, sha512_sw, hash);

    smemclr(s, sizeof(*s));
    sfree(s);
}

/* BinarySink write method: buffer input, compressing each full block. */
static void sha512_sw_write(BinarySink *bs, const void *vp, size_t len)
{
    sha512_sw *s = BinarySink_DOWNCAST(bs, sha512_sw);

    while (len > 0)
        if (sha512_block_write(&s->blk, &vp, &len))
            sha512_sw_block(s->core, s->blk.block);
}

/* Finalise: pad, then emit hlen bytes (so SHA-384 naturally truncates
 * to the first 6 state words). */
static void sha512_sw_digest(ssh_hash *hash, uint8_t *digest)
{
    sha512_sw *s = container_of(hash, sha512_sw, hash);

    sha512_block_pad(&s->blk, BinarySink_UPCAST(s));
    for (size_t i = 0; i < hash->vt->hlen / 8; i++)
        PUT_64BIT_MSB_FIRST(digest + 8*i, s->core[i]);
}
/* Vtables for the pure-software implementations; .extra selects the
 * initialisation vector used by sha512_sw_reset. */
const ssh_hashalg ssh_sha512_sw = {
    .new = sha512_sw_new,
    .reset = sha512_sw_reset,
    .copyfrom = sha512_sw_copyfrom,
    .digest = sha512_sw_digest,
    .free = sha512_sw_free,
    .hlen = 64,
    .blocklen = 128,
    HASHALG_NAMES_ANNOTATED("SHA-512", "unaccelerated"),
    .extra = sha512_initial_state,
};

const ssh_hashalg ssh_sha384_sw = {
    .new = sha512_sw_new,
    .reset = sha512_sw_reset,
    .copyfrom = sha512_sw_copyfrom,
    .digest = sha512_sw_digest,
    .free = sha512_sw_free,
    .hlen = 48,
    .blocklen = 128,
    HASHALG_NAMES_ANNOTATED("SHA-384", "unaccelerated"),
    .extra = sha384_initial_state,
};
/* ----------------------------------------------------------------------
* Hardware-accelerated implementation of SHA-512 using Arm NEON.
*/
#if HW_SHA512 == HW_SHA512_NEON
/*
* Manually set the target architecture, if we decided above that we
* need to.
*/
#ifdef USE_CLANG_ATTR_TARGET_AARCH64
/*
* A spot of cheating: redefine some ACLE feature macros before
* including arm_neon.h. Otherwise we won't get the SHA intrinsics
* defined by that header, because it will be looking at the settings
* for the whole translation unit rather than the ones we're going to
* put on some particular functions using __attribute__((target)).
*/
#define __ARM_NEON 1
#define __ARM_FEATURE_CRYPTO 1
#define FUNC_ISA __attribute__ ((target("neon,sha3")))
#endif /* USE_CLANG_ATTR_TARGET_AARCH64 */
#ifndef FUNC_ISA
#define FUNC_ISA
#endif
#ifdef USE_ARM64_NEON_H
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
static bool sha512_hw_available(void)
{
    /*
     * For Arm, we delegate to a per-platform detection function (see
     * explanation in aes.c).
     */
    return platform_sha512_hw_available();
}
#if defined __clang__
/*
* As of 2020-12-24, I've found that clang doesn't provide the SHA-512
* NEON intrinsics. So I define my own set using inline assembler, and
* use #define to effectively rename them over the top of the standard
* names.
*
* The aim of that #define technique is that it should avoid a build
* failure if these intrinsics _are_ defined in <arm_neon.h>.
* Obviously it would be better in that situation to switch back to
* using the real intrinsics, but until I see a version of clang that
* supports them, I won't know what version number to test in the
* ifdef.
*/
/* Inline-asm stand-ins for the four ACLE SHA-512 intrinsics, with the
 * same signatures; the "+w" constraint makes the first operand both
 * input and output, matching the instructions' accumulator behaviour. */
static inline FUNC_ISA
uint64x2_t vsha512su0q_u64_asm(uint64x2_t x, uint64x2_t y) {
    __asm__("sha512su0 %0.2D,%1.2D" : "+w" (x) : "w" (y));
    return x;
}

static inline FUNC_ISA
uint64x2_t vsha512su1q_u64_asm(uint64x2_t x, uint64x2_t y, uint64x2_t z) {
    __asm__("sha512su1 %0.2D,%1.2D,%2.2D" : "+w" (x) : "w" (y), "w" (z));
    return x;
}

static inline FUNC_ISA
uint64x2_t vsha512hq_u64_asm(uint64x2_t x, uint64x2_t y, uint64x2_t z) {
    __asm__("sha512h %0,%1,%2.2D" : "+w" (x) : "w" (y), "w" (z));
    return x;
}

static inline FUNC_ISA
uint64x2_t vsha512h2q_u64_asm(uint64x2_t x, uint64x2_t y, uint64x2_t z) {
    __asm__("sha512h2 %0,%1,%2.2D" : "+w" (x) : "w" (y), "w" (z));
    return x;
}

/* Shadow the standard intrinsic names with the asm versions. */
#undef vsha512su0q_u64
#define vsha512su0q_u64 vsha512su0q_u64_asm
#undef vsha512su1q_u64
#define vsha512su1q_u64 vsha512su1q_u64_asm
#undef vsha512hq_u64
#define vsha512hq_u64 vsha512hq_u64_asm
#undef vsha512h2q_u64
#define vsha512h2q_u64 vsha512h2q_u64_asm
#endif /* defined __clang__ */
/* The 8-word hash state held as four 128-bit vectors, each packing
 * one adjacent pair of the a..h working variables. */
typedef struct sha512_neon_core sha512_neon_core;
struct sha512_neon_core {
    uint64x2_t ab, cd, ef, gh;
};

/* Load 16 bytes of message and byte-swap each 64-bit lane, since
 * SHA-512 treats the message as big-endian words. */
FUNC_ISA
static inline uint64x2_t sha512_neon_load_input(const uint8_t *p)
{
    return vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(p)));
}
/* Compute the next two message-schedule words from five earlier
 * 2-word chunks (named by how far back they sit in the schedule). */
FUNC_ISA
static inline uint64x2_t sha512_neon_schedule_update(
    uint64x2_t m8, uint64x2_t m7, uint64x2_t m4, uint64x2_t m3, uint64x2_t m1)
{
    /*
     * vsha512su0q_u64() takes words from a long way back in the
     * schedule and performs the sigma_0 half of the computation of
     * the next two 64-bit message-schedule words.
     *
     * vsha512su1q_u64() combines the result of that with the sigma_1
     * steps, to output the finished version of those two words. The
     * total amount of input data it requires fits nicely into three
     * 128-bit vector registers, but one of those registers is
     * misaligned compared to the 128-bit chunks that the message
     * schedule is stored in. So we use vextq_u64 to make one of its
     * input words out of the second half of m4 and the first half of
     * m3.
     */
    return vsha512su1q_u64(vsha512su0q_u64(m8, m7), m1, vextq_u64(m4, m3, 1));
}
/* Perform two consecutive rounds of SHA-512 on the state held in
 * (ab,cd,ef,gh), consuming one 2-word vector of the schedule. */
FUNC_ISA
static inline void sha512_neon_round2(
    unsigned round_index, uint64x2_t schedule_words,
    uint64x2_t *ab, uint64x2_t *cd, uint64x2_t *ef, uint64x2_t *gh)
{
    /*
     * vsha512hq_u64 performs the Sigma_1 and Ch half of the
     * computation of two rounds of SHA-512 (including feeding back
     * one of the outputs from the first of those half-rounds into the
     * second one).
     *
     * vsha512h2q_u64 combines the result of that with the Sigma_0 and
     * Maj steps, and outputs one 128-bit vector that replaces the gh
     * piece of the input hash state, and a second that updates cd by
     * addition.
     *
     * Similarly to vsha512su1q_u64 above, some of the input registers
     * expected by these instructions are misaligned by 64 bits
     * relative to the chunks we've divided the hash state into, so we
     * have to start by making 'de' and 'fg' words out of our input
     * cd,ef,gh, using vextq_u64.
     *
     * Also, one of the inputs to vsha512hq_u64 is expected to contain
     * the results of summing gh + two round constants + two words of
     * message schedule, but the two words of the message schedule
     * have to be the opposite way round in the vector register from
     * the way that vsha512su1q_u64 output them. Hence, there's
     * another vextq_u64 in here that swaps the two halves of the
     * initial_sum vector register.
     *
     * (This also means that I don't have to prepare a specially
     * reordered version of the sha512_round_constants[] array: as
     * long as I'm unavoidably doing a swap at run time _anyway_, I
     * can load from the normally ordered version of that array, and
     * just take care to fold in that data _before_ the swap rather
     * than after.)
     */

    /* Load two round constants, with the first one in the low half */
    uint64x2_t round_constants = vld1q_u64(
        sha512_round_constants + round_index);

    /* Add schedule words to round constants */
    uint64x2_t initial_sum = vaddq_u64(schedule_words, round_constants);

    /* Swap that sum around so the word used in the first of the two
     * rounds is in the _high_ half of the vector, matching where h
     * lives in the gh vector */
    uint64x2_t swapped_initial_sum = vextq_u64(initial_sum, initial_sum, 1);

    /* Add gh to that, now that they're matching ways round */
    uint64x2_t sum = vaddq_u64(swapped_initial_sum, *gh);

    /* Make the misaligned de and fg words */
    uint64x2_t de = vextq_u64(*cd, *ef, 1);
    uint64x2_t fg = vextq_u64(*ef, *gh, 1);

    /* Now we're ready to put all the pieces together. The output from
     * vsha512h2q_u64 can be used directly as the new gh, and the
     * output from vsha512hq_u64 is simultaneously the intermediate
     * value passed to h2 and the thing you have to add on to cd. */
    uint64x2_t intermed = vsha512hq_u64(sum, fg, de);
    *gh = vsha512h2q_u64(intermed, *cd, *ab);
    *cd = vaddq_u64(*cd, intermed);
}
/*
 * Compress one 128-byte block. The 16 loaded schedule vectors are
 * consumed two rounds at a time (with the register roles rotating by
 * one vector per call, like the software version's unrolled loop),
 * and the schedule is extended in place for the remaining 64 rounds:
 * 40 round2 calls = 80 rounds total.
 */
FUNC_ISA
static inline void sha512_neon_block(sha512_neon_core *core, const uint8_t *p)
{
    uint64x2_t s0, s1, s2, s3, s4, s5, s6, s7;

    uint64x2_t ab = core->ab, cd = core->cd, ef = core->ef, gh = core->gh;

    /* Rounds 0-15: schedule straight from the message block */
    s0 = sha512_neon_load_input(p + 16*0);
    sha512_neon_round2(0, s0, &ab, &cd, &ef, &gh);
    s1 = sha512_neon_load_input(p + 16*1);
    sha512_neon_round2(2, s1, &gh, &ab, &cd, &ef);
    s2 = sha512_neon_load_input(p + 16*2);
    sha512_neon_round2(4, s2, &ef, &gh, &ab, &cd);
    s3 = sha512_neon_load_input(p + 16*3);
    sha512_neon_round2(6, s3, &cd, &ef, &gh, &ab);
    s4 = sha512_neon_load_input(p + 16*4);
    sha512_neon_round2(8, s4, &ab, &cd, &ef, &gh);
    s5 = sha512_neon_load_input(p + 16*5);
    sha512_neon_round2(10, s5, &gh, &ab, &cd, &ef);
    s6 = sha512_neon_load_input(p + 16*6);
    sha512_neon_round2(12, s6, &ef, &gh, &ab, &cd);
    s7 = sha512_neon_load_input(p + 16*7);
    sha512_neon_round2(14, s7, &cd, &ef, &gh, &ab);

    /* Rounds 16-79: extend the schedule as we go */
    s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
    sha512_neon_round2(16, s0, &ab, &cd, &ef, &gh);
    s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
    sha512_neon_round2(18, s1, &gh, &ab, &cd, &ef);
    s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
    sha512_neon_round2(20, s2, &ef, &gh, &ab, &cd);
    s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
    sha512_neon_round2(22, s3, &cd, &ef, &gh, &ab);
    s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
    sha512_neon_round2(24, s4, &ab, &cd, &ef, &gh);
    s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
    sha512_neon_round2(26, s5, &gh, &ab, &cd, &ef);
    s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
    sha512_neon_round2(28, s6, &ef, &gh, &ab, &cd);
    s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
    sha512_neon_round2(30, s7, &cd, &ef, &gh, &ab);
    s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
    sha512_neon_round2(32, s0, &ab, &cd, &ef, &gh);
    s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
    sha512_neon_round2(34, s1, &gh, &ab, &cd, &ef);
    s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
    sha512_neon_round2(36, s2, &ef, &gh, &ab, &cd);
    s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
    sha512_neon_round2(38, s3, &cd, &ef, &gh, &ab);
    s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
    sha512_neon_round2(40, s4, &ab, &cd, &ef, &gh);
    s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
    sha512_neon_round2(42, s5, &gh, &ab, &cd, &ef);
    s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
    sha512_neon_round2(44, s6, &ef, &gh, &ab, &cd);
    s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
    sha512_neon_round2(46, s7, &cd, &ef, &gh, &ab);
    s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
    sha512_neon_round2(48, s0, &ab, &cd, &ef, &gh);
    s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
    sha512_neon_round2(50, s1, &gh, &ab, &cd, &ef);
    s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
    sha512_neon_round2(52, s2, &ef, &gh, &ab, &cd);
    s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
    sha512_neon_round2(54, s3, &cd, &ef, &gh, &ab);
    s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
    sha512_neon_round2(56, s4, &ab, &cd, &ef, &gh);
    s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
    sha512_neon_round2(58, s5, &gh, &ab, &cd, &ef);
    s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
    sha512_neon_round2(60, s6, &ef, &gh, &ab, &cd);
    s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
    sha512_neon_round2(62, s7, &cd, &ef, &gh, &ab);
    s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
    sha512_neon_round2(64, s0, &ab, &cd, &ef, &gh);
    s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
    sha512_neon_round2(66, s1, &gh, &ab, &cd, &ef);
    s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
    sha512_neon_round2(68, s2, &ef, &gh, &ab, &cd);
    s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
    sha512_neon_round2(70, s3, &cd, &ef, &gh, &ab);
    s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
    sha512_neon_round2(72, s4, &ab, &cd, &ef, &gh);
    s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
    sha512_neon_round2(74, s5, &gh, &ab, &cd, &ef);
    s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
    sha512_neon_round2(76, s6, &ef, &gh, &ab, &cd);
    s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
    sha512_neon_round2(78, s7, &cd, &ef, &gh, &ab);

    /* Feed-forward: add the working state back into the chaining value */
    core->ab = vaddq_u64(core->ab, ab);
    core->cd = vaddq_u64(core->cd, cd);
    core->ef = vaddq_u64(core->ef, ef);
    core->gh = vaddq_u64(core->gh, gh);
}
/* NEON-accelerated SHA-512/384 instance. */
typedef struct sha512_neon {
    sha512_neon_core core;
    sha512_block blk;
    BinarySink_IMPLEMENTATION;
    ssh_hash hash;
} sha512_neon;

static void sha512_neon_write(BinarySink *bs, const void *vp, size_t len);

/* Returns NULL when the hardware extension is absent, so that only
 * things which bypass the selector (e.g. testcrypt) can observe it. */
static ssh_hash *sha512_neon_new(const ssh_hashalg *alg)
{
    if (!sha512_hw_available_cached())
        return NULL;

    sha512_neon *s = snew(sha512_neon);

    s->hash.vt = alg;
    BinarySink_INIT(s, sha512_neon_write);
    BinarySink_DELEGATE_INIT(&s->hash, s);
    return &s->hash;
}
/* Load the algorithm's IV (from vt->extra) into the vector state. */
static void sha512_neon_reset(ssh_hash *hash)
{
    sha512_neon *s = container_of(hash, sha512_neon, hash);
    const uint64_t *iv = (const uint64_t *)hash->vt->extra;

    s->core.ab = vld1q_u64(iv);
    s->core.cd = vld1q_u64(iv+2);
    s->core.ef = vld1q_u64(iv+4);
    s->core.gh = vld1q_u64(iv+6);

    sha512_block_setup(&s->blk);
}

/* Duplicate the full hash state into another existing instance. */
static void sha512_neon_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
    sha512_neon *copy = container_of(hcopy, sha512_neon, hash);
    sha512_neon *orig = container_of(horig, sha512_neon, hash);

    *copy = *orig; /* structure copy */

    /* Re-point the internal BinarySink plumbing at the copy */
    BinarySink_COPIED(copy);
    BinarySink_DELEGATE_INIT(&copy->hash, copy);
}

/* Free an instance, scrubbing the secret hash state first. */
static void sha512_neon_free(ssh_hash *hash)
{
    sha512_neon *s = container_of(hash, sha512_neon, hash);
    smemclr(s, sizeof(*s));
    sfree(s);
}

/* BinarySink write method: buffer input, compressing each full block. */
static void sha512_neon_write(BinarySink *bs, const void *vp, size_t len)
{
    sha512_neon *s = BinarySink_DOWNCAST(bs, sha512_neon);

    while (len > 0)
        if (sha512_block_write(&s->blk, &vp, &len))
            sha512_neon_block(&s->core, s->blk.block);
}
/* Finalise and emit all 64 bytes, byte-swapping each lane back to
 * big-endian output order. */
static void sha512_neon_digest(ssh_hash *hash, uint8_t *digest)
{
    sha512_neon *s = container_of(hash, sha512_neon, hash);

    sha512_block_pad(&s->blk, BinarySink_UPCAST(s));

    vst1q_u8(digest,    vrev64q_u8(vreinterpretq_u8_u64(s->core.ab)));
    vst1q_u8(digest+16, vrev64q_u8(vreinterpretq_u8_u64(s->core.cd)));
    vst1q_u8(digest+32, vrev64q_u8(vreinterpretq_u8_u64(s->core.ef)));
    vst1q_u8(digest+48, vrev64q_u8(vreinterpretq_u8_u64(s->core.gh)));
}

/* SHA-384 variant: identical except that only the first 48 bytes
 * (ab, cd, ef) are emitted. */
static void sha384_neon_digest(ssh_hash *hash, uint8_t *digest)
{
    sha512_neon *s = container_of(hash, sha512_neon, hash);

    sha512_block_pad(&s->blk, BinarySink_UPCAST(s));

    vst1q_u8(digest,    vrev64q_u8(vreinterpretq_u8_u64(s->core.ab)));
    vst1q_u8(digest+16, vrev64q_u8(vreinterpretq_u8_u64(s->core.cd)));
    vst1q_u8(digest+32, vrev64q_u8(vreinterpretq_u8_u64(s->core.ef)));
}
/* Vtables for the NEON implementations; .extra selects the IV loaded
 * by sha512_neon_reset. */
const ssh_hashalg ssh_sha512_hw = {
    .new = sha512_neon_new,
    .reset = sha512_neon_reset,
    .copyfrom = sha512_neon_copyfrom,
    .digest = sha512_neon_digest,
    .free = sha512_neon_free,
    .hlen = 64,
    .blocklen = 128,
    HASHALG_NAMES_ANNOTATED("SHA-512", "NEON accelerated"),
    .extra = sha512_initial_state,
};

const ssh_hashalg ssh_sha384_hw = {
    .new = sha512_neon_new,
    .reset = sha512_neon_reset,
    .copyfrom = sha512_neon_copyfrom,
    .digest = sha384_neon_digest,
    .free = sha512_neon_free,
    .hlen = 48,
    .blocklen = 128,
    HASHALG_NAMES_ANNOTATED("SHA-384", "NEON accelerated"),
    .extra = sha384_initial_state,
};
/* ----------------------------------------------------------------------
* Stub functions if we have no hardware-accelerated SHA-512. In this
* case, sha512_hw_new returns NULL (though it should also never be
* selected by sha512_select, so the only thing that should even be
* _able_ to call it is testcrypt). As a result, the remaining vtable
* functions should never be called at all.
*/
#elif HW_SHA512 == HW_SHA512_NONE

static bool sha512_hw_available(void)
{
    return false;
}

/* 'new' returns NULL, signalling that no accelerated instance can be
 * constructed; the other methods are unreachable stubs. */
static ssh_hash *sha512_stub_new(const ssh_hashalg *alg)
{
    return NULL;
}

#define STUB_BODY { unreachable("Should never be called"); }

static void sha512_stub_reset(ssh_hash *hash) STUB_BODY
static void sha512_stub_copyfrom(ssh_hash *hash, ssh_hash *orig) STUB_BODY
static void sha512_stub_free(ssh_hash *hash) STUB_BODY
static void sha512_stub_digest(ssh_hash *hash, uint8_t *digest) STUB_BODY

const ssh_hashalg ssh_sha512_hw = {
    .new = sha512_stub_new,
    .reset = sha512_stub_reset,
    .copyfrom = sha512_stub_copyfrom,
    .digest = sha512_stub_digest,
    .free = sha512_stub_free,
    .hlen = 64,
    .blocklen = 128,
    HASHALG_NAMES_ANNOTATED("SHA-512", "!NONEXISTENT ACCELERATED VERSION!"),
};

const ssh_hashalg ssh_sha384_hw = {
    .new = sha512_stub_new,
    .reset = sha512_stub_reset,
    .copyfrom = sha512_stub_copyfrom,
    .digest = sha512_stub_digest,
    .free = sha512_stub_free,
    .hlen = 48,
    .blocklen = 128,
    HASHALG_NAMES_ANNOTATED("SHA-384", "!NONEXISTENT ACCELERATED VERSION!"),
};
#endif /* HW_SHA512 */

53
crypto/xdmauth.c Normal file
View File

@ -0,0 +1,53 @@
/*
* Convenience functions to encrypt and decrypt the cookies used in
* XDM-AUTHORIZATION-1.
*/
#include "ssh.h"
static ssh_cipher *des_xdmauth_cipher(const void *vkeydata)
{
    /*
     * XDM-AUTHORIZATION-1 uses single-DES, but packs the key into 7
     * bytes, so here we have to repack it manually into the canonical
     * form where it occupies 8 bytes each with the low bit unused.
     */
    const unsigned char *keydata = (const unsigned char *)vkeydata;
    unsigned char key[8];
    int i, nbits, j;
    unsigned int bits;

    /* Shift register: pull in 8 input bits when fewer than 7 remain,
     * and emit 7 bits per output byte, left-shifted by one so the
     * parity-bit position (bit 0) stays clear. */
    bits = 0;
    nbits = 0;
    j = 0;
    for (i = 0; i < 8; i++) {
        if (nbits < 7) {
            bits = (bits << 8) | keydata[j];
            nbits += 8;
            j++;
        }
        key[i] = (bits >> (nbits - 7)) << 1;
        bits &= ~(0x7F << (nbits - 7));
        nbits -= 7;
    }

    ssh_cipher *c = ssh_cipher_new(&ssh_des);
    ssh_cipher_setkey(c, key);
    smemclr(key, sizeof(key));
    /* Deliberate reuse of the freshly zeroed key buffer: this sets an
     * all-zeroes IV without needing a separate zero block. */
    ssh_cipher_setiv(c, key);
    return c;
}
/* One-shot CBC encryption of an XDM-AUTHORIZATION-1 cookie with a
 * packed 7-byte key; a fresh cipher instance is built and destroyed
 * per call. 'len' must be a multiple of the DES block size. */
void des_encrypt_xdmauth(const void *keydata, void *blk, int len)
{
    ssh_cipher *c = des_xdmauth_cipher(keydata);
    ssh_cipher_encrypt(c, blk, len);
    ssh_cipher_free(c);
}

/* Inverse of des_encrypt_xdmauth. */
void des_decrypt_xdmauth(const void *keydata, void *blk, int len)
{
    ssh_cipher *c = des_xdmauth_cipher(keydata);
    ssh_cipher_decrypt(c, blk, len);
    ssh_cipher_free(c);
}