diff --git a/Recipe b/Recipe
index c332577c..802eebe8 100644
--- a/Recipe
+++ b/Recipe
@@ -252,7 +252,7 @@ NONSSH   = telnet raw rlogin supdup ldisc pinger
 
 # SSH back end (putty, plink, pscp, psftp).
 ARITH    = mpint ecc
-SSHCRYPTO = ARITH sshmd5 sshsha sshsh256 sshsh512 sshsha3 sshblake2
+SSHCRYPTO = ARITH sshmd5 sshsha sshsh256 sshsh512 sshsha3 sshblake2 sshargon2
 	 + sshrsa sshdss sshecc
          + sshdes sshblowf sshaes sshccp ssharcf
          + sshdh sshcrc sshcrcda sshauxcrypt
diff --git a/ssh.h b/ssh.h
index 56a2c82d..b25288a8 100644
--- a/ssh.h
+++ b/ssh.h
@@ -930,6 +930,18 @@ struct ssh2_userkey {
     char *comment;                     /* the key comment */
 };
 
+/* Argon2 password hash (P=password, S=salt, K=secret key, X=assoc. data) */
+typedef enum { Argon2d = 0, Argon2i = 1, Argon2id = 2 } Argon2Flavour;
+void argon2(Argon2Flavour, uint32_t mem, uint32_t passes, /* mem: 1Kb blocks */
+            uint32_t parallel, uint32_t taglen, /* lanes; output bytes */
+            ptrlen P, ptrlen S, ptrlen K, ptrlen X, strbuf *out);
+void argon2_choose_passes(
+    Argon2Flavour, uint32_t mem, uint32_t milliseconds, uint32_t *passes,
+    uint32_t parallel, uint32_t taglen, ptrlen P, ptrlen S, ptrlen K, ptrlen X,
+    strbuf *out);
+/* The H' long-output hash from Argon2, exposed just for testcrypt */
+strbuf *argon2_long_hash(unsigned length, ptrlen data);
+
 /* The maximum length of any hash algorithm. (bytes) */
 #define MAX_HASH_LEN (114) /* longest is SHAKE256 with 114-byte output */
 
diff --git a/sshargon2.c b/sshargon2.c
new file mode 100644
index 00000000..fe990359
--- /dev/null
+++ b/sshargon2.c
@@ -0,0 +1,582 @@
+/*
+ * Implementation of the Argon2 password hash function.
+ *
+ * My sources for the algorithm description and test vectors (the latter in
+ * test/cryptsuite.py) were the reference implementation on Github, and also
+ * the Internet-Draft description:
+ *
+ *   https://github.com/P-H-C/phc-winner-argon2
+ *   https://datatracker.ietf.org/doc/html/draft-irtf-cfrg-argon2-12
+ *
+ * Note on the spec: I believe draft-irtf-cfrg-argon2-12 has an error in the
+ * description. When making the pseudorandom data used for calculating Argon2i
+ * block indices, the spec in the Github repository says that you make a block
+ * of preimage data and then apply the block-mixing function G to it _twice_
+ * in iteration. But draft-irtf-cfrg-argon2-12 only mentions applying it once.
+ *
+ * The test vectors and reference implementation settle the difference: the
+ * reference implementation also applies G twice, and comes with a program
+ * that regenerates the test vectors as found in draft-irtf-cfrg-argon2-12. So
+ * draft-irtf-cfrg-argon2-12 is not consistent within itself - the algorithm
+ * with G applied just once does not pass its own test vectors. I'm convinced
+ * that the intention was to apply G twice.
+ */
+
+#include <assert.h>
+
+#include "putty.h"
+#include "ssh.h"
+#include "marshal.h"
+
+/* ----------------------------------------------------------------------
+ * Argon2 uses data marshalling rules similar to SSH but with 32-bit integers
+ * stored little-endian. Start with some local BinarySink routines for storing
+ * a uint32 and a string in that fashion.
+ */
+
+static void BinarySink_put_uint32_le(BinarySink *bs, unsigned long val)
+{
+    unsigned char le[4];               /* val as four bytes, low byte first */
+    PUT_32BIT_LSB_FIRST(le, val);
+    bs->write(bs, le, sizeof le);
+}
+
+static void BinarySink_put_stringpl_le(BinarySink *bs, ptrlen pl)
+{
+    /* The length prefix is a uint32, so the length must fit in one. Shift by
+     * 31, not 32, to avoid a potentially implementation-defined shift. */
+    assert((pl.len >> 31) <= 1);
+
+    BinarySink_put_uint32_le(bs, pl.len);   /* little-endian length prefix */
+    bs->write(bs, pl.ptr, pl.len);          /* followed by the raw bytes */
+}
+
+#define put_uint32_le(bs, val) \
+    BinarySink_put_uint32_le(BinarySink_UPCAST(bs), val)
+#define put_stringpl_le(bs, val) \
+    BinarySink_put_stringpl_le(BinarySink_UPCAST(bs), val)
+
+/* ----------------------------------------------------------------------
+ * Argon2 defines a hash-function family that's an extension of BLAKE2b to
+ * generate longer output digests, by repeatedly outputting half of a BLAKE2
+ * hash output and then re-hashing the whole thing until there are 64 or fewer
+ * bytes left to output. The spec calls this H' (a variant of the original
+ * hash it calls H, which is the unmodified BLAKE2b).
+ */
+
+static ssh_hash *hprime_new(unsigned length)
+{
+    ssh_hash *hash = blake2b_new_general(length <= 64 ? length : 64);
+    put_uint32_le(hash, length);   /* preimage starts with the total length */
+    return hash;
+}
+
+static void hprime_final(ssh_hash *h, unsigned length, void *vout)
+{
+    uint8_t *out = (uint8_t *)vout;
+
+    while (length > 64) {
+        uint8_t hashbuf[64];
+        ssh_hash_final(h, hashbuf);
+
+        /* Emit the first half of this hash. No clamping against 'length'
+         * is needed: the loop condition guarantees more than 64 remain. */
+        memcpy(out, hashbuf, 32);
+        out += 32;
+        length -= 32;
+
+        /* Chain: the next hash's input is the whole of this one's output. */
+        h = blake2b_new_general(length > 64 ? 64 : length);
+        put_data(h, hashbuf, 64);
+
+        smemclr(hashbuf, sizeof(hashbuf));
+    }
+
+    ssh_hash_final(h, out); /* <= 64 bytes left: the last hash is all of them */
+}
+
+/* Public wrapper round H': hash 'data' to 'length' bytes and return them in
+ * a newly made strbuf. Only testcrypt calls this, so it isn't worth dressing
+ * H' up as a proper ssh_hash. */
+strbuf *argon2_long_hash(unsigned length, ptrlen data)
+{
+    strbuf *result = strbuf_new();
+    ssh_hash *hp = hprime_new(length);
+    put_datapl(hp, data);
+    hprime_final(hp, length, strbuf_append(result, length));
+    return result;
+}
+
+/* ----------------------------------------------------------------------
+ * Argon2's own mixing function G, which operates on 1Kb blocks of data.
+ *
+ * The definition of G in the spec takes two 1Kb blocks as input and produces
+ * a 1Kb output block. The first thing that happens to the input blocks is
+ * that they get XORed together, and then only the XOR output is used, so you
+ * could perfectly well regard G as a 1Kb->1Kb function.
+ */
+
+static inline uint64_t ror(uint64_t x, unsigned rotation)
+{
+    unsigned down = rotation & 63, up = (0u - rotation) & 63;
+    return (x >> down) | (x << up);   /* rotation == 0 (mod 64) gives x|x */
+}
+
+static inline uint64_t trunc32(uint64_t x)
+{
+    return (uint64_t)(uint32_t)x;      /* keep only the low 32 bits */
+}
+
+/* Internal quarter-round, similar to the one in BLAKE2b, which mixes up four
+ * 64-bit words. Each update of a or c has the form x += y + 2*lo(x)*lo(y),
+ * lo() being the low 32 bits. The caller, P, always passes four distinct
+ * words, so working on local copies equals updating them in place. */
+static inline void GB(uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d)
+{
+    uint64_t wa = *a, wb = *b, wc = *c, wd = *d;
+    wa += wb + 2 * trunc32(wa) * trunc32(wb);  wd = ror(wd ^ wa, 32);
+    wc += wd + 2 * trunc32(wc) * trunc32(wd);  wb = ror(wb ^ wc, 24);
+    wa += wb + 2 * trunc32(wa) * trunc32(wb);  wd = ror(wd ^ wa, 16);
+    wc += wd + 2 * trunc32(wc) * trunc32(wd);  wb = ror(wb ^ wc, 63);
+    *a = wa; *b = wb; *c = wc; *d = wd;
+}
+
+/* Mixes sixteen 64-bit words, copying them from 'in' to 'out' first. Each
+ * side is eight adjacent pairs a fixed stride apart (pair i of 'in' is
+ * in[i*instep], in[i*instep+1]; 'out' likewise with outstep), so a caller
+ * picks a subset of a block's 128 words via base pointer and stride. */
+static inline void P(uint64_t *out, unsigned outstep,
+                     uint64_t *in, unsigned instep)
+{
+    uint64_t *v[16];                   /* v[2*i], v[2*i+1] point at pair i */
+    for (unsigned k = 0; k < 16; k++) {
+        v[k] = out + (k / 2) * outstep + (k % 2);
+        *v[k] = in[(k / 2) * instep + (k % 2)];
+    }
+
+    /* Regard v as a 4x4 matrix in row-major order: mix its four columns... */
+    GB(v[0], v[4], v[ 8], v[12]);
+    GB(v[1], v[5], v[ 9], v[13]);
+    GB(v[2], v[6], v[10], v[14]);
+    GB(v[3], v[7], v[11], v[15]);
+
+    /* ...and then its four (wrapping) diagonals. */
+    GB(v[0], v[5], v[10], v[15]);
+    GB(v[1], v[6], v[11], v[12]);
+    GB(v[2], v[7], v[ 8], v[13]);
+    GB(v[3], v[4], v[ 9], v[14]);
+}
+
+/* The full G function on 1Kb blocks X and Y. Rather than returning G(X,Y),
+ * it XORs G(X,Y) into the 1Kb already stored at 'out', since that is what is
+ * most often wanted. Both inputs are read in full before 'out' is written,
+ * and argon2_internal relies on that by passing 'out' as X as well. */
+static void G_xor(uint8_t *out, const uint8_t *X, const uint8_t *Y)
+{
+    uint64_t xored[128], rowmix[128], colmix[128];
+    unsigned w, k;
+
+    for (w = 0; w < 128; w++)      /* decode both inputs and combine them */
+        xored[w] = GET_64BIT_LSB_FIRST(X + 8*w) ^ GET_64BIT_LSB_FIRST(Y + 8*w);
+    for (k = 0; k < 8; k++)        /* P on each row: 16 consecutive words */
+        P(&rowmix[16*k], 2, &xored[16*k], 2);
+    for (k = 0; k < 8; k++)        /* P on each column: word pairs 16 apart */
+        P(&colmix[2*k], 16, &rowmix[2*k], 16);
+
+    for (w = 0; w < 128; w++) {
+        uint64_t old = GET_64BIT_LSB_FIRST(out + 8*w);
+        PUT_64BIT_LSB_FIRST(out + 8*w, old ^ xored[w] ^ colmix[w]);
+    }
+
+    smemclr(xored, sizeof(xored));
+    smemclr(rowmix, sizeof(rowmix));
+    smemclr(colmix, sizeof(colmix));
+}
+
+/* ----------------------------------------------------------------------
+ * The main Argon2 function.
+ */
+
+static void argon2_internal(uint32_t p, uint32_t T, uint32_t m, uint32_t t,
+                            uint32_t y, ptrlen P, ptrlen S, ptrlen K, ptrlen X,
+                            uint8_t *out)
+{
+    /*
+     * Parameters: p = parallelism (lanes), T = output length in bytes,
+     * m = memory in 1Kb blocks, t = passes, y = Argon2Flavour value.
+     *
+     * Start by hashing all the inputs together: the four strings (password
+     * P, salt S, optional secret key K, optional associated data X) and
+     * the parameters above. This hash, h0, is the sole input to the mixing
+     * below, so Argon2 preserves no more entropy than h0 contains.
+     */
+    uint8_t h0[64];
+    {
+        ssh_hash *h = blake2b_new_general(64);
+        put_uint32_le(h, p);
+        put_uint32_le(h, T);
+        put_uint32_le(h, m);
+        put_uint32_le(h, t);
+        put_uint32_le(h, 0x13);        /* hash function version number */
+        put_uint32_le(h, y);
+        put_stringpl_le(h, P);
+        put_stringpl_le(h, S);
+        put_stringpl_le(h, K);
+        put_stringpl_le(h, X);
+        ssh_hash_final(h, h0);
+    }
+
+    struct blk { uint8_t data[1024]; };
+
+    /*
+     * Array of 1Kb blocks. The total size is (approximately) m, the
+     * caller-specified parameter for how much memory to use; the blocks are
+     * regarded as a rectangular array of p rows ('lanes') by q columns, where
+     * p is the 'parallelism' input parameter (the lanes can be processed
+     * concurrently up to a point) and q is whatever makes the product pq come
+     * to m.
+     *
+     * Additionally, each row is divided into four equal 'segments', which are
+     * important to the way the algorithm decides which blocks to use as input
+     * to each step of the function.
+     *
+     * The term 'slice' refers to a whole set of vertically aligned segments,
+     * i.e. slice 0 is the whole left quarter of the array, and slice 3 the
+     * whole right quarter.
+     */
+    /* Spec requires 1 <= p <= m/8; then SL >= 2 and the setup below fits. */
+    assert(p > 0 && p <= m / 8);
+    size_t SL = m / (4*p); /* segment length: # of 1Kb blocks in a segment */
+    size_t q = 4 * SL;     /* width of the array: 4 segments times SL */
+    size_t mprime = q * p; /* total size of the array, approximately m */
+    struct blk *B = snewn(mprime, struct blk);
+    memset(B, 0, mprime * sizeof(struct blk));
+
+    /*
+     * Initial setup: fill the first two full columns of the array with data
+     * expanded from the starting hash h0. Each block is the result of using
+     * the long-output hash function H' to hash h0 itself plus the block's
+     * coordinates in the array.
+     */
+    for (size_t i = 0; i < p; i++) {
+        ssh_hash *h = hprime_new(1024);
+        put_data(h, h0, 64);
+        put_uint32_le(h, 0);
+        put_uint32_le(h, i);
+        hprime_final(h, 1024, B[i].data);
+    }
+    for (size_t i = 0; i < p; i++) {
+        ssh_hash *h = hprime_new(1024);
+        put_data(h, h0, 64);
+        put_uint32_le(h, 1);
+        put_uint32_le(h, i);
+        hprime_final(h, 1024, B[i+p].data);
+    }
+
+    /*
+     * Declarations for the main loop.
+     *
+     * The basic structure of the main loop is going to involve processing the
+     * array one whole slice (vertically divided quarter) at a time. Usually
+     * we'll write a new value into every single block in the slice, except
+     * that in the initial slice on the first pass, we've already written
+     * values into the first two columns during the initial setup above. So
+     * 'jstart' indicates the starting index in each segment we process; it
+     * starts off as 2 so that we don't overwrite the initial setup, and then
+     * after the first slice is done, we set it to 0, and it stays there.
+     *
+     * d_mode indicates whether we're being data-dependent (true) or
+     * data-independent (false). In the hybrid Argon2id mode, we start off
+     * independent, and then once we've mixed things up enough, switch over to
+     * dependent mode to force long serial chains of computation.
+     */
+    size_t jstart = 2;
+    bool d_mode = (y == 0);
+    struct blk out2i, tmp2i, in2i;
+
+    /* Outermost loop: t whole passes from left to right over the array */
+    for (size_t pass = 0; pass < t; pass++) {
+
+        /* Within that, we process the array in its four main slices */
+        for (unsigned slice = 0; slice < 4; slice++) {
+
+            /* In Argon2id mode, if we're half way through the first pass,
+             * this is the moment to switch d_mode from false to true */
+            if (pass == 0 && slice == 2 && y == 2)
+                d_mode = true;
+
+            /* Loop over every segment in the slice (i.e. every row). So i is
+             * the y-coordinate of each block we process. */
+            for (size_t i = 0; i < p; i++) {
+
+                /* And within that segment, process the blocks from left to
+                 * right, starting at 'jstart' (usually 0, but 2 in the first
+                 * slice). */
+                for (size_t jpre = jstart; jpre < SL; jpre++) {
+
+                    /* j is the x-coordinate of each block we process, made up
+                     * of the slice number and the index 'jpre' within the
+                     * segment. */
+                    size_t j = slice * SL + jpre;
+
+                    /* jm1 is j-1 (mod q) */
+                    uint32_t jm1 = (j == 0 ? q-1 : j-1);
+
+                    /*
+                     * Construct two 32-bit pseudorandom integers J1 and J2.
+                     * This is the part of the algorithm that varies between
+                     * the data-dependent and independent modes.
+                     */
+                    uint32_t J1, J2;
+                    if (d_mode) {
+                        /*
+                         * Data-dependent: grab the first 64 bits of the block
+                         * to the left of this one.
+                         */
+                        J1 = GET_32BIT_LSB_FIRST(B[i + p * jm1].data);
+                        J2 = GET_32BIT_LSB_FIRST(B[i + p * jm1].data + 4);
+                    } else {
+                        /*
+                         * Data-independent: generate pseudorandom data by
+                         * hashing a sequence of preimage blocks that include
+                         * all our input parameters, plus the coordinates of
+                         * this point in the algorithm (array position and
+                         * pass number) to make all the hash outputs distinct.
+                         *
+                         * The hash we use is G itself, applied twice (see
+                         * comment at top of file). So we generate 1Kb of data
+                         * at a time, which is enough for 128 (J1,J2) pairs.
+                         * Hence we only need to do the hashing if our index
+                         * within the segment is a multiple of 128, or if
+                         * we're at the very start of the algorithm (in which
+                         * case we started at 2 rather than 0). After that we
+                         * can just keep picking data out of our most recent
+                         * hash output.
+                         */
+                        if (jpre == jstart || jpre % 128 == 0) {
+                            /*
+                             * Hash preimage is mostly zeroes, with a
+                             * collection of assorted integer values we had
+                             * anyway.
+                             */
+                            memset(in2i.data, 0, sizeof(in2i.data));
+                            PUT_64BIT_LSB_FIRST(in2i.data +  0, pass);
+                            PUT_64BIT_LSB_FIRST(in2i.data +  8, i);
+                            PUT_64BIT_LSB_FIRST(in2i.data + 16, slice);
+                            PUT_64BIT_LSB_FIRST(in2i.data + 24, mprime);
+                            PUT_64BIT_LSB_FIRST(in2i.data + 32, t);
+                            PUT_64BIT_LSB_FIRST(in2i.data + 40, y);
+                            PUT_64BIT_LSB_FIRST(in2i.data + 48, jpre / 128 + 1);
+
+                            /*
+                             * Now apply G twice to generate the hash output
+                             * in out2i.
+                             */
+                            memset(tmp2i.data, 0, sizeof(tmp2i.data));
+                            G_xor(tmp2i.data, tmp2i.data, in2i.data);
+                            memset(out2i.data, 0, sizeof(out2i.data));
+                            G_xor(out2i.data, out2i.data, tmp2i.data);
+                        }
+
+                        /*
+                         * Extract J1 and J2 from the most recent hash output
+                         * (whether we've just computed it or not).
+                         */
+                        J1 = GET_32BIT_LSB_FIRST(
+                            out2i.data + 8 * (jpre % 128));
+                        J2 = GET_32BIT_LSB_FIRST(
+                            out2i.data + 8 * (jpre % 128) + 4);
+                    }
+
+                    /*
+                     * Now convert J1 and J2 into the index of an existing
+                     * block of the array to use as input to this step. This
+                     * is fairly fiddly.
+                     *
+                     * The easy part: the y-coordinate of the input block is
+                     * obtained by reducing J2 mod p, except that at the very
+                     * start of the algorithm (processing the first slice on
+                     * the first pass) we simply use the same y-coordinate as
+                     * our output block.
+                     *
+                     * Note that it's safe to use the ordinary % operator
+                     * here, without any concern for timing side channels: in
+                     * data-independent mode J2 is not correlated to any
+                     * secrets, and in data-dependent mode we're going to be
+                     * giving away side-channel data _anyway_ when we use it
+                     * as an array index (and by assumption we don't care,
+                     * because it's already massively randomised from the real
+                     * inputs).
+                     */
+                    uint32_t index_l = (pass == 0 && slice == 0) ? i : J2 % p;
+
+                    /*
+                     * The hard part: which block in this array row do we use?
+                     *
+                     * First, we decide what the possible candidates are. This
+                     * requires some case analysis, and depends on whether the
+                     * array row is the same one we're writing into or not.
+                     *
+                     * If it's not the same row: we can't use any block from
+                     * the current slice (because the segments within a slice
+                     * have to be processable in parallel, so in a concurrent
+                     * implementation those blocks are potentially in the
+                     * process of being overwritten by other threads). But the
+                     * other three slices are fair game, except that in the
+                     * first pass, slices to the right of us won't have had
+                     * any values written into them yet at all.
+                     *
+                     * If it is the same row, we _are_ allowed to use blocks
+                     * from the current slice, but only the ones before our
+                     * current position.
+                     *
+                     * In both cases, we also exclude the individual _column_
+                     * just to the left of the current one. (The block
+                     * immediately to our left is going to be the _other_
+                     * input to G, but the spec also says that we avoid that
+                     * column even in a different row.)
+                     *
+                     * All of this means that we end up choosing from a
+                     * cyclically contiguous interval of blocks within this
+                     * lane, but the start and end points require some thought
+                     * to get them right.
+                     */
+
+                    /* Start position is the beginning of the _next_ slice
+                     * (containing data from the previous pass), unless we're
+                     * on pass 0, where the start position has to be 0. */
+                    uint32_t Wstart = (pass == 0 ? 0 : (slice + 1) % 4 * SL);
+
+                    /* End position splits up by cases. */
+                    uint32_t Wend;
+                    if (index_l == i) {
+                        /* Same lane as output: we can use anything up to (but
+                         * not including) the block immediately left of us. */
+                        Wend = jm1;
+                    } else {
+                        /* Different lane from output: we can use anything up
+                         * to the previous slice boundary, or one less than
+                         * that if we're at the very left edge of our slice
+                         * right now. */
+                        Wend = SL * slice;
+                        if (jpre == 0)
+                            Wend = (Wend + q-1) % q;
+                    }
+
+                    /* Total number of blocks available to choose from */
+                    uint32_t Wsize = (Wend + q - Wstart) % q;
+
+                    /* Fiddly computation from the spec that chooses from the
+                     * available blocks, in a deliberately non-uniform
+                     * fashion, using J1 as pseudorandom input data. Output is
+                     * zz which is the index within our contiguous interval. */
+                    uint32_t xx = ((uint64_t)J1 * J1) >> 32;
+                    uint32_t yy = ((uint64_t)Wsize * xx) >> 32;
+                    uint32_t zz = Wsize - 1 - yy;
+
+                    /* And index_z is the actual x coordinate of the block we
+                     * want. */
+                    uint32_t index_z = (Wstart + zz) % q;
+
+                    /* Phew! Combine that block with the one immediately to
+                     * our left, and XOR over the top of whatever is already
+                     * in our current output block. */
+                    G_xor(B[i + p * j].data, B[i + p * jm1].data,
+                          B[index_l + p * index_z].data);
+                }
+            }
+
+            /* We've finished processing a slice. Reset jstart to 0. It will
+             * only _not_ have been 0 if this was pass 0 slice 0, in which
+             * case it still had its initial value of 2 to avoid the starting
+             * data. */
+            jstart = 0;
+        }
+    }
+
+    /*
+     * The main output is all done. Final output works by taking the XOR of
+     * all the blocks in the rightmost column of the array, and then using
+     * that as input to our long hash H'. The output of _that_ is what we
+     * deliver to the caller.
+     */
+
+    struct blk C = B[p * (q-1)];
+    for (size_t i = 1; i < p; i++)
+        memxor(C.data, C.data, B[i + p * (q-1)].data, 1024);
+
+    {
+        ssh_hash *h = hprime_new(T);
+        put_data(h, C.data, 1024);
+        hprime_final(h, T, out);
+    }
+
+    /* Clean up. That includes h0: as noted above, it is the sole input to
+     * the mixing step, so the whole output can be recomputed from it. */
+    smemclr(h0, sizeof(h0));
+    smemclr(out2i.data, sizeof(out2i.data));
+    smemclr(tmp2i.data, sizeof(tmp2i.data));
+    smemclr(in2i.data, sizeof(in2i.data));
+    smemclr(C.data, sizeof(C.data));
+    smemclr(B, mprime * sizeof(struct blk));
+    sfree(B);
+}
+
+/*
+ * Public wrapper: appends the taglen-byte result to 'out' (for sshpubk.c).
+ */
+void argon2(Argon2Flavour flavour, uint32_t mem, uint32_t passes,
+            uint32_t parallel, uint32_t taglen,
+            ptrlen P, ptrlen S, ptrlen K, ptrlen X, strbuf *out)
+{
+    uint8_t *tag = strbuf_append(out, taglen);
+    argon2_internal(parallel, taglen, mem, passes, flavour, P, S, K, X, tag);
+}
+
+/*
+ * Like argon2(), but chooses the number of passes at run time so that the
+ * computation takes approximately 'milliseconds' ms. The count chosen is
+ * stored in *passes, and the output for that count is appended to 'out'.
+ */
+void argon2_choose_passes(
+    Argon2Flavour flavour, uint32_t mem,
+    uint32_t milliseconds, uint32_t *passes,
+    uint32_t parallel, uint32_t taglen,
+    ptrlen P, ptrlen S, ptrlen K, ptrlen X,
+    strbuf *out)
+{
+    unsigned long desired_time = (TICKSPERSEC * milliseconds) / 1000;
+
+    /*
+     * Try pass counts 1, 2, 3, 5, 8, ... (consecutive Fibonacci numbers,
+     * held in 'prev' and 'cur'), running the whole of Argon2 afresh for
+     * each one and timing it.
+     *
+     * Growing geometrically avoids taking O(t^2) time to find a pass
+     * count taking time t; and Fibonacci numbers grow by a factor of
+     * about 1.6 per step instead of 2, which gives a touch more
+     * precision than powers of 2 would.
+     */
+    uint32_t prev = 1, cur = 1;
+
+    for (;;) {
+        unsigned long start_time = GETTICKCOUNT();
+        argon2(flavour, mem, cur, parallel, taglen, P, S, K, X, out);
+        unsigned long elapsed = GETTICKCOUNT() - start_time;
+
+        /* Accept this run if it was slow enough. Also accept it if
+         * prev+cur would not fit in a uint32_t, since then there is no
+         * larger pass count left to try. */
+        if (elapsed >= desired_time || prev > (uint32_t)~cur)
+            break;
+
+        /* Too fast: clear 'out' and step (prev, cur) on to
+         * (cur, prev+cur). */
+        strbuf_clear(out);
+        uint32_t next = prev + cur;
+        prev = cur;
+        cur = next;
+    }
+
+    *passes = cur;
+}
diff --git a/test/cryptsuite.py b/test/cryptsuite.py
index b3d6e623..e39e0d0d 100755
--- a/test/cryptsuite.py
+++ b/test/cryptsuite.py
@@ -1757,6 +1757,199 @@ culpa qui officia deserunt mollit anim id est laborum.
             "daafcf2bd6fccf976cbc234b71cd9f4f7d56fe0eb33a40018707089a215c44a8"
             "4b272d0329ae6d85a0f8acc7e964dc2facb715ba472bb6"))
 
+    def testArgon2LongHash(self):
+        # Unit-test the Argon2 long hash function H', which starts off
+        # the same as BLAKE2b, but comes with its own method of
+        # extending the output length past 64 bytes. Output lengths
+        # tested: 1, 2, and values at or adjacent to 64 and larger
+        # multiples of 32, up to 1024. The expected values came from a
+        # test program linked against the reference implementation's
+        # libargon2.a, calling its blake2b_long function.
+        preimage = b'hello, world'
+
+        self.assertEqualBin(argon2_long_hash(1, preimage), unhex("8b"))
+        self.assertEqualBin(argon2_long_hash(2, preimage), unhex("1ff9"))
+        self.assertEqualBin(argon2_long_hash(63, preimage), unhex(
+            "e2c997721f1d64aa8c25e588fb8ab19646ce6d5c2a431fa560fcb813e55dd481"
+            "322d2630d95ca6b1b63317b13d6b111e5816170c80c3ca7d5b4bf894096de4"))
+        self.assertEqualBin(argon2_long_hash(64, preimage), unhex(
+            "0c7ba7ee6d510b4bb5c9b69ac91e25e0b11aa30dd6234b8e61b0fe1537c037b8"
+            "8ed5aa59a277e8cc07095c81aff26d08967e4dfdabd32db8b6af6ceb78cf8c47"))
+        self.assertEqualBin(argon2_long_hash(65, preimage), unhex(
+            "680941abbd8fc80f28c38d623e90903f08709bf76575e2775d4ce01c31b192c8"
+            "73038d9a31af8991c8b1ad4f2b1991f4d15f73ab0f4f3add415c297a12eb9ddb"
+            "76"))
+        self.assertEqualBin(argon2_long_hash(95, preimage), unhex(
+            "4be28c51850fed70d9403e1406b6ba68a83d98cf222a4ee162beef60fd3384df"
+            "eba3fce9d95f646982eb384ac943ce5263cb03428fd8d261cc41ffdb7ba328fe"
+            "098526f2b49593f9e7f38188598ce4693b59f4dd32db30c1be9a9d35784fa0"))
+        self.assertEqualBin(argon2_long_hash(96, preimage), unhex(
+            "20295ea01e822cca113f668f33e5e481ed5879bfd7de6359ea42d497da97be52"
+            "2cdd518d34ae32c44cabd45249b4e697626b0b14b6a33a2bd138be0a4bceeaf4"
+            "9528f93acef01b093ee84d8d871d1ee6cf7c10e83ad0619631aed19345166f03"))
+        self.assertEqualBin(argon2_long_hash(97, preimage), unhex(
+            "d24b31f3ac0baad168d524efc4bafee55fef743fd60b14e28b860d7523e319c7"
+            "520e2d5457cc3d06dc1044530afdf6990fa12e38d5802eb642f8e77fcfee2c0b"
+            "1f84a28877f2f2f049ed9299e1e0230f98af3a161185970aad21f0ea0f5184cf"
+            "90"))
+        self.assertEqualBin(argon2_long_hash(127, preimage), unhex(
+            "5d1e8380450dbc985418ed1f3700b925ae0719e4486e29131c81bca7083ac6b8"
+            "f535c3398488e34d3dc1390de44097f1eee498f10ebe85b579e99a7672023b01"
+            "ca5c20e63c595b640e00d80f113a52e3773719889b266ab4c65269c11fb212e4"
+            "75f2b769bb26321bb60ecc0d490821e5056d7dfc9def3cd065d3ba90360764"))
+        self.assertEqualBin(argon2_long_hash(128, preimage), unhex(
+            "be15b316f3483c4d0d00f71a65b974894a2025f441b79b9fe461bc740cb0b039"
+            "c4fe914f61c05a612d63ebc50a662b2d59b1996091e5e3474340544ea46a46cb"
+            "25c41ff700fafcd96c4f12ddc698cd2426558f960696837ea8170fd2fe284b54"
+            "8f585f97919ef14f2b3cbb351eb98872add7ba6d08c1401232df6cc878fbeb22"))
+        self.assertEqualBin(argon2_long_hash(129, preimage), unhex(
+            "83da464c278dcb12c29b6685fee6d32f0b461337c155369ad0d56b58b0aa5f80"
+            "9aa7b56bd41b664c8d768957f8f0e40999fb0178eb53cf83f31d725bf92881bc"
+            "900774bce4cdf56b6386ad3de6891d11a0ccd4564a3431fc4c24105a02d0a6a2"
+            "434712b9a7471f3223c72a6e64912200d0a3d149a19d06fe9dc8ec09d7ed5a48"
+            "bb"))
+        self.assertEqualBin(argon2_long_hash(511, preimage), unhex(
+            "30c0c0d0467e7665368db0b40a2324a61fb569d35172de2df53a9739a8d18e60"
+            "b4f25d521c8855604be3e24ea56302566074323d94c0bd3a33d08f185d8ba5ac"
+            "a2bc3fb2e4c4e5ffec5778daea67c6b5913c9cac16f2e5c7b7818e757fa747b3"
+            "69e586d616010a752762f69c604238ed8738430366fbdb7493454fa02391a76b"
+            "30f241695b9fa8d3a3116227c6bb6f72d325cf104ab153d15f928b22767d467d"
+            "4bf7e16176aaa7315954b7872061933c12d548f1f93a8abb9d73791661bee521"
+            "b2ae51be373a229dfef32787234c1be5846d133563002b9a029178716ad41e70"
+            "1539d3fad300c77607c5217701e3e485d72c980f3f71d525c8148375a2f8d22c"
+            "a211ba165330a90b7e0e6baa6073833925c23bdd388ee904f38463c7e6b85475"
+            "09b810aae5c9ffc5dd902c2ffe049c338e3ae2c6416d3b874d6a9d384089564c"
+            "0d8e4dce9b6e47e1d5ec9087bf526cc9fa35aab1893a0588d31b77fea37e0799"
+            "468deacde47629d2960a3519b3bcd4e22364a9cccd3b128cba21cac27f140d53"
+            "f79c11e4157e4cb48272eecdf62f52084a27e5b0933bbe66ded17e2df6f8d398"
+            "f6c479c3c716457820ad177b8bd9334cb594e03d09fcc4f82d4385e141eacd7d"
+            "9ad1e1c4cb42788af70bac0509f0a891e662960955490abf2763373803e8c89c"
+            "df632579cb9c647634b30df214a3d67b92fd55d283c42c63b470a48a78cd5b"))
+        self.assertEqualBin(argon2_long_hash(512, preimage), unhex(
+            "79a6974e29a9a6c069e0156774d35c5014a409f5ffc60013725367a7208d4929"
+            "7d228637751768a31a59e27aa89372f1bcc095a6fa331198a5bd5ad053ba2ebb"
+            "cbcc501ea55cf142e8d95209228c9ab60cd104d5077472f2a9ecaa071aed6ee9"
+            "5de29e188b7399d5b6b7ed897b2bc4dd1ea745eb9974e39ca6fb983380cc537a"
+            "c04dfe6caefe85faf206b1613092ebadf791eaa8a5b814c9a79a73a5733b0505"
+            "a47163c10a0f7309df6663896df6079a7c88c6879bb591a40abd398c6deda792"
+            "1cc3986435b1c840a768b2fa507446f2f77a406b1b2f739f7795db24789c8927"
+            "24b4c84b7005445123154f8cd2ba63a7ede672af5d197f846700732025c9931d"
+            "1c67c5493417ca394a8f68ba532645815cf7b5102af134ecb4fd9e326f53779a"
+            "3039dbef6a0880db9e38b6b61d2f9ead969e4224c2d9c69b5897e5eeb7032e83"
+            "334e192ff50017056ccb84d4cc8eee3ab248d2614643d0174fe18c72186dd967"
+            "92d8545645ddf4a9b2c7a91c9a71857a399449d7154077a8e9580f1a2d20227d"
+            "671b455ccb897cba0491e50892120d7877f7776d653cfdb176fa3f64a9e6f848"
+            "cd681c487b488775aaf698294eec813b2cca90d68d63b5d886d61c1a8e922aaa"
+            "330fd658ede56e34bcd288048e845eba7b8e2e7cc22ba6c91b523e48017aa878"
+            "8ce4f91d0e6d6c6706762fb0cc7f465cee3916684fb21e337cfe1b583e0b1e92"))
+        self.assertEqualBin(argon2_long_hash(513, preimage), unhex(
+            "32243cfbd7eca582d60b3b8ea3ba3d93783537689c7cbcd1d1cbde46200b8c86"
+            "617fc00e8a9ae991a1e2f91c67e07d5f0a777d982c1461d0c5474e4e164b053c"
+            "2808559e2b8a5ac4a46a5fcbc825b1d5302c7b0611940194eb494d45ce7113a2"
+            "3424b51c199c6a5100ab159ff323eda5feffee4da4155a028a81da9d44e4286b"
+            "ac3dab4ffce43a80b6ce97a47ea0ac51ee16e8b4d3b68942afdc20e1c21747c4"
+            "94859c3d3883e7dc19ea416a393a3507683d9d03e6a3a91f8f1cb8a7d5d9892e"
+            "80c8fb0222527a73a1f59b9dd41770982f2af177a6e96093064534803edd0713"
+            "71ede53024cedc291d768325bb4e4def9af1b5569c349b64816496c37a8787b5"
+            "4fbe248372ebadb5ce20e03eaa935dc55ff4b8cbe5d6d844c7b71d4656fef22c"
+            "5a49f13d75a7a8368a2dbc1e78d732b879bfc5c9467eda2bf4918f0c59037ae3"
+            "dee7880a171409dd1a4e143c814e60301ac77237f261fa7519a04e68000530f9"
+            "708ed9fda5609d655560a9491f80f5875ad5725e3120686b73319c6a727932e3"
+            "20a2174422523498c38fea47aeb20d135ff9fd93c6fa6db0005e0001685d7577"
+            "33a82a4dc9dd6556b938f7b8dafd0d670846780b9931b815063708189b17877b"
+            "825533bcc250fb576a28be4caa107e6a3a6f7b0c60fb51b0def27008b7e272ac"
+            "95d610bfa912339799a2e537ce543d7862dddbe31bb224fda4ae283571847a28"
+            "54"))
+        self.assertEqualBin(argon2_long_hash(1024, preimage), unhex(
+            "951252f6fa152124f381266a358d9b78b88e469d08d5fc78e4ea32253c7fc26c"
+            "3ff1c93529ab4ee6fcf00acf29bbaba934a4014ce2625e0806601c55e6ce70d7"
+            "121fd82f0904f335c5c7ba07dc6e6adf7582c92f7f255072203ea85844b4fe54"
+            "817476a20bb742710ffc42750361be94332d0fc721b192309acfa70da43db6ae"
+            "1d0f0bbe8a3250966a4532b36728162073c9eb3e119ea4c1c187c775dbb25a5d"
+            "d883e3f65706a5fca897cdc4a8aa7b68ba3f57940c72f3a3396c417e758ba071"
+            "95be4afba325237c0e2738a74d96fd1350fb623cb2ad40ea8b1e070cf398b98c"
+            "2865ea40225b81f031f2b405409ca01dc5d9903d3d8e1d6381fbe7ccfc8f3dab"
+            "eadafd7c976c0ba84a936f78ff7df0f112c089ba88f82bed7f9a6e31a91e5fee"
+            "f675755454b948de22695660b243b9eca3bcc89608f83d2baa1d73dd6b8bd4f9"
+            "b995ed9cb0f1edc6e98a49ed841b506c1bf59b43f4b3457a376bbff116c1a4f6"
+            "07cc62381fc5c19953c68f300c1b51198d40784d812d25810ba404862f04b680"
+            "6039a074f612ad8b84e0941ba23c915c3e7162c225fbecffdb7dc1ab559b2b54"
+            "32fe8a498c32e918d8e7e33254ff75077f648827705e987f4d90fba971e78e1a"
+            "6896b4d775c7359dc950f1e964fa04621aacf3c0988969490f4c72c54caf79e8"
+            "481053cc0a27ffcd3580aabf9ef1268d498d8a18bd70e9b8402e011753bb7dc7"
+            "e856c00d988fca924ee7cf61979c38cda8a872e4cc4fbdc90c23a0ded71eb944"
+            "bb816ab22d9a4380e3e9d1cec818165c2fba6c5d51dcbf452c0cb1779a384937"
+            "64d695370e13a301eca7be68d4112d2177381514efbb36fe08fc5bc2970301b8"
+            "06f8e5a57a780e894d5276e2025bb775b6d1861e33c54ab6e3eb72947fbe6f91"
+            "8174ce24eb4682efbb3c4f01233dc7ce9ef44792e9e876bb03e6751b3d559047"
+            "d045127d976aa042fc55c690c9048e200065e7b7de19d9353aa9ac9b3e7611f0"
+            "d1c42d069a300455ca1f7420a352bace89215e705106927510c11b3b1c1486d9"
+            "f3ab006d2de2ee2c94574f760ce8c246bca229f98c66f06042b14f1fff9a16c0"
+            "1550237e16d108ce5597299b1eb406a9ee505a29a6e0fa526b3e6beafd336aea"
+            "138b2f31971586f67c5ffffbd6826d1c75666038c43d0bdff4edfc294e064a49"
+            "2eed43e2dc78d00abc4e85edcd9563b8251b66f57b0f4b6d17f5a3f35c87c488"
+            "dbeeb84fd720286197c2dec8290eccf3a313747de285b9cd3548e90cf81b3838"
+            "3ffcc8c2a7f582feb369d05cb96b9b224d05902b3e39e5b96536032e9dddeb9b"
+            "9d4f40a9c8f544ca37cf8d39d7c8c6a33880e9184ed017bd642db9590759bd10"
+            "7362048ede5c0257feecc4984584592c566f37fba8469c064015339fb4f03023"
+            "56ece37fd3655aae2bfc989b9b4c1384efc3503c8866db901802cb36eda9fb00"))
+
+    def testArgon2(self):
+        # Further Argon2 tests of my own, beyond the standard vectors.
+        # The expected tags were derived from the reference
+        # implementation.
+        pwd, salt = b"password", b"salt of at least 16 bytes"
+        secret, assoc = b"secret", b"associated data"
+
+        # Every case hashes the same four inputs, so only the flavour,
+        # the numeric parameters and the expected tag vary per call.
+        def check(flavour, mem, passes, parallel, taglen, tag_hex):
+            self.assertEqualBin(
+                argon2(flavour, mem, passes, parallel, taglen,
+                       pwd, salt, secret, assoc),
+                unhex(tag_hex))
+
+        # Smallest memory (8Kb) and parallelism (1) parameters the
+        # reference implementation will accept, but lots of passes
+        check('i', 8, 16, 1, 24,
+              "314da280240a3ca1eedd1f1db417a76eb0741e7df64b8cdf")
+        check('d', 8, 16, 1, 24,
+              "9cc961cf43e0f86c2d4e202b816dc5bc5b2177e68faa0b08")
+        check('id', 8, 16, 1, 24,
+              "6cd6c490c582fa597721d772d4e3de166987792491b48c51")
+
+        # Test a memory cost value that isn't a power of 2. This
+        # checks a wraparound case during the conversion of J1 to a
+        # block index, and is a regression test for a bug that nearly
+        # got past me during original development.
+        check('i', 104, 16, 2, 24,
+              "a561963623f1073c9aa8caecdb600c73ffc6de677ba8d97c")
+        check('d', 104, 16, 2, 24,
+              "a9014db7f1d468fb25b88fa7fc0deac0f2e7f27e25d2cf6e")
+        check('id', 104, 16, 2, 24,
+              "64f3212b1e7725ffcf9ae2d1753d63e763bcd6970061a435")
+
+        # Larger parameters that should exercise the pseudorandom
+        # block indexing reasonably thoroughly. Also generate plenty
+        # of output data.
+        check('i', 1024, 5, 16, 77,
+              "b008a685ff57730fad0e6f3ef3b9189282c0d9b05303675f43b5f3054724"
+              "733fcbe8e2639cc2c930535b31b723339041bcd703bf2483455acf86c0e6"
+              "9ed88c545ad40f1f2068855e4d61e99407")
+
+        check('d', 1024, 5, 16, 111,
+              "399ffbcd720c47745b9deb391ed0de7d5e0ffe53aef9f8ef7a7918cfa212"
+              "53df8cc577affbd5e0c0f8bf6d93c11b2f63973f8fc8f89dccd832fc587e"
+              "5d61717be6e88ca33eef5d1e168c028bae632a2a723c6c83f8e755f39171"
+              "5eda1c77c8e2fe06fbdd4e56d35262587e7df73cd7")
+
+        check('id', 1024, 5, 16, 123,
+              "6636807289cb9b9c032f48dcc31ffed1de4ca6c1b97e1ce768d690486341"
+              "2ac84b39d568a81dd01d9ee3ceec6cc23441d95e6abeb4a2024f1f540d56"
+              "9b799277c4037ddc7195ba783c9158a901adc7d4a5df8357b34a3869e5d6"
+              "aeae2a21201eef5e347de22c922192e8f46274b0c9d33e965155a91e7686"
+              "9d530e")
+
     def testRSAVerify(self):
         def blobs(n, e, d, p, q, iqmp):
             pubblob = ssh_string(b"ssh-rsa") + ssh2_mpint(e) + ssh2_mpint(n)
@@ -2471,6 +2664,24 @@ class standard_test_vectors(MyTestBase):
                 digest = ssh_hash_digest(h)
                 self.assertEqualBin(digest, unhex(vector['out']))
 
+    def testArgon2(self):
+        # Test vectors from draft-irtf-cfrg-argon2-12 section 5. All
+        # three flavours share the same parameters and inputs.
+        password, salt = b'\x01' * 32, b'\x02' * 16
+        secret, assoc = b'\x03' * 8, b'\x04' * 12
+        expected_tags = [
+            ('d', "512b391b6f1162975371d30919734294"
+                  "f868e3be3984f3c1a13a4db9fabe4acb"),
+            ('i', "c814d9d1dc7f37aa13f0d77f2494bda1"
+                  "c8de6b016dd388d29952a4c4672b6ce8"),
+            ('id', "0d640df58d78766c08c037a34a8b53c9"
+                   "d01ef0452d75b65eb52520e96b01e659"),
+        ]
+        for flavour, tag_hex in expected_tags:
+            self.assertEqualBin(
+                argon2(flavour, 32, 3, 4, 32, password, salt, secret, assoc),
+                unhex(tag_hex))
+
     def testHmacSHA(self):
         # Test cases from RFC 6234 section 8.5.
         def vector(key, message, s1=None, s256=None):
diff --git a/test/testcrypt.py b/test/testcrypt.py
index ba0160f2..973b90ee 100644
--- a/test/testcrypt.py
+++ b/test/testcrypt.py
@@ -177,7 +177,8 @@ def make_argword(arg, argtype, fnname, argindex, to_preserve):
         return "true" if arg else "false"
     if typename in {
             "hashalg", "macalg", "keyalg", "cipheralg",
-            "dh_group", "ecdh_alg", "rsaorder", "primegenpolicy"}:
+            "dh_group", "ecdh_alg", "rsaorder", "primegenpolicy",
+            "argon2flavour"}:
         arg = coerce_to_bytes(arg)
         if isinstance(arg, bytes) and b" " not in arg:
             return arg
diff --git a/testcrypt.c b/testcrypt.c
index 0113fbfc..f76417bc 100644
--- a/testcrypt.c
+++ b/testcrypt.c
@@ -403,6 +403,32 @@ static const PrimeGenerationPolicy *get_primegenpolicy(BinarySource *in)
     fatal_error("primegenpolicy '%.*s': not found", PTRLEN_PRINTF(name));
 }
 
+/*
+ * Read one word from 'in' and translate it into an Argon2Flavour.
+ * A word matching none of the known spellings goes to fatal_error().
+ */
+static Argon2Flavour get_argon2flavour(BinarySource *in)
+{
+    /* Accepted spellings: the bare suffix, or the full name with a
+     * lower- or upper-case initial letter. */
+    static const struct {
+        const char *spelling;
+        Argon2Flavour flavour;
+    } known[] = {
+        {"d", Argon2d}, {"argon2d", Argon2d}, {"Argon2d", Argon2d},
+        {"i", Argon2i}, {"argon2i", Argon2i}, {"Argon2i", Argon2i},
+        {"id", Argon2id}, {"argon2id", Argon2id}, {"Argon2id", Argon2id},
+    };
+
+    ptrlen word = get_word(in);
+    for (size_t idx = 0; idx < lenof(known); idx++) {
+        if (ptrlen_eq_string(word, known[idx].spelling))
+            return known[idx].flavour;
+    }
+
+    fatal_error("Argon2 flavour '%.*s': not found", PTRLEN_PRINTF(word));
+}
+
 static uintmax_t get_uint(BinarySource *in)
 {
     ptrlen word = get_word(in);
@@ -1219,6 +1245,16 @@ PockleStatus pockle_add_prime_wrapper(Pockle *pockle, mp_int *p,
 }
 #define pockle_add_prime pockle_add_prime_wrapper
 
+strbuf *argon2_wrapper(Argon2Flavour flavour, uint32_t mem, uint32_t passes,
+                       uint32_t parallel, uint32_t taglen,
+                       ptrlen P, ptrlen S, ptrlen K, ptrlen X)
+{
+    strbuf *tag = strbuf_new();  /* fresh buffer, handed back to the caller */
+    argon2(flavour, mem, passes, parallel, taglen, P, S, K, X, tag);
+    return tag;
+}
+#define argon2 argon2_wrapper  /* later 'argon2' tokens name the wrapper */
+
 #define OPTIONAL_PTR_FUNC(type)                                         \
     typedef TD_val_##type TD_opt_val_##type;                            \
     static TD_opt_val_##type get_opt_val_##type(BinarySource *in) {     \
@@ -1254,6 +1290,7 @@ typedef key_components *TD_keycomponents;
 typedef const PrimeGenerationPolicy *TD_primegenpolicy;
 typedef struct mpint_list TD_mpint_list;
 typedef PockleStatus TD_pocklestatus;
+typedef Argon2Flavour TD_argon2flavour;
 
 #define FUNC0(rettype, function)                                        \
     static void handle_##function(BinarySource *in, strbuf *out) {      \
diff --git a/testcrypt.h b/testcrypt.h
index e7624aa0..12e248b8 100644
--- a/testcrypt.h
+++ b/testcrypt.h
@@ -261,6 +261,12 @@ FUNC5(int, rsa1_load_s, val_string_binarysource, val_rsa, out_opt_val_string_asc
 FUNC3(val_string, ppk_save_sb, val_key, opt_val_string_asciz, opt_val_string_asciz)
 FUNC3(val_string, rsa1_save_sb, val_rsa, opt_val_string_asciz, opt_val_string_asciz)
 
+/*
+ * Password hashing: Argon2, plus its long hash (argon2_long_hash).
+ */
+FUNC9(val_string, argon2, argon2flavour, uint, uint, uint, uint, val_string_ptrlen, val_string_ptrlen, val_string_ptrlen, val_string_ptrlen)
+FUNC2(val_string, argon2_long_hash, uint, val_string_ptrlen)
+
 /*
  * Key generation functions.
  */
diff --git a/testsc.c b/testsc.c
index 5beeee95..6f9ee003 100644
--- a/testsc.c
+++ b/testsc.c
@@ -327,6 +327,7 @@ VOLATILE_WRAPPED_DEFN(static, size_t, looplimit, (size_t x))
     CIPHERS(CIPHER_TESTLIST, X)                 \
     MACS(MAC_TESTLIST, X)                       \
     HASHES(HASH_TESTLIST, X)                    \
+    X(argon2)                                   \
     /* end of list */
 
 static void test_mp_get_nbits(void)
@@ -1409,6 +1410,36 @@ struct test {
     void (*testfn)(void);
 };
 
+static void test_argon2(void)
+{
+    /* Only the Argon2i variant is exercised: the other two variants
+     * of Argon2 have _deliberate_ data-dependency, so they can't be
+     * expected to pass this stringent test for no data-dependency.
+     * All four inputs are slices of one randomised buffer. */
+    const size_t pwlen = 48, saltlen = 16, keylen = 24, adlen = 8;
+    size_t total = pwlen + saltlen + keylen + adlen;
+    uint8_t *buf = snewn(total, uint8_t);
+    ptrlen password = make_ptrlen(buf, pwlen);
+    ptrlen salt = make_ptrlen(buf + pwlen, saltlen);
+    ptrlen secret = make_ptrlen(buf + pwlen + saltlen, keylen);
+    ptrlen assoc = make_ptrlen(buf + pwlen + saltlen + keylen, adlen);
+
+    strbuf *tag = strbuf_new();
+    strbuf_append(tag, 256);
+
+    for (size_t iter = 0; iter < looplimit(16); iter++) {
+        strbuf_clear(tag);
+        random_read(buf, total);
+
+        log_start();
+        argon2(Argon2i, 32, 2, 2, 144, password, salt, secret, assoc, tag);
+        log_end();
+    }
+
+    sfree(buf);
+    strbuf_free(tag);
+}
+
 static const struct test tests[] = {
 #define STRUCT_TEST(X) { #X, test_##X },
 TESTLIST(STRUCT_TEST)