diff --git a/Recipe b/Recipe index c332577c..802eebe8 100644 --- a/Recipe +++ b/Recipe @@ -252,7 +252,7 @@ NONSSH = telnet raw rlogin supdup ldisc pinger # SSH back end (putty, plink, pscp, psftp). ARITH = mpint ecc -SSHCRYPTO = ARITH sshmd5 sshsha sshsh256 sshsh512 sshsha3 sshblake2 +SSHCRYPTO = ARITH sshmd5 sshsha sshsh256 sshsh512 sshsha3 sshblake2 sshargon2 + sshrsa sshdss sshecc + sshdes sshblowf sshaes sshccp ssharcf + sshdh sshcrc sshcrcda sshauxcrypt diff --git a/ssh.h b/ssh.h index 56a2c82d..b25288a8 100644 --- a/ssh.h +++ b/ssh.h @@ -930,6 +930,18 @@ struct ssh2_userkey { char *comment; /* the key comment */ }; +/* Argon2 password hashing function */ +typedef enum { Argon2d = 0, Argon2i = 1, Argon2id = 2 } Argon2Flavour; +void argon2(Argon2Flavour, uint32_t mem, uint32_t passes, + uint32_t parallel, uint32_t taglen, + ptrlen P, ptrlen S, ptrlen K, ptrlen X, strbuf *out); +void argon2_choose_passes( + Argon2Flavour, uint32_t mem, uint32_t milliseconds, uint32_t *passes, + uint32_t parallel, uint32_t taglen, ptrlen P, ptrlen S, ptrlen K, ptrlen X, + strbuf *out); +/* The H' hash defined in Argon2, exposed just for testcrypt */ +strbuf *argon2_long_hash(unsigned length, ptrlen data); + /* The maximum length of any hash algorithm. (bytes) */ #define MAX_HASH_LEN (114) /* longest is SHAKE256 with 114-byte output */ diff --git a/sshargon2.c b/sshargon2.c new file mode 100644 index 00000000..fe990359 --- /dev/null +++ b/sshargon2.c @@ -0,0 +1,582 @@ +/* + * Implementation of the Argon2 password hash function. + * + * My sources for the algorithm description and test vectors (the latter in + * test/cryptsuite.py) were the reference implementation on Github, and also + * the Internet-Draft description: + * + * https://github.com/P-H-C/phc-winner-argon2 + * https://datatracker.ietf.org/doc/html/draft-irtf-cfrg-argon2-12 + * + * Note on the spec: I believe draft-irtf-cfrg-argon2-12 has an error in the + * description. 
When making the pseudorandom data used for calculating Argon2i + * block indices, the spec in the Github repository says that you make a block + * of preimage data and then apply the block-mixing function G to it _twice_ + * in iteration. But draft-irtf-cfrg-argon2-12 only mentions applying it once. + * + * The test vectors and reference implementation settle the difference: the + * reference implementation also applies G twice, and comes with a program + * that regenerates the test vectors as found in draft-irtf-cfrg-argon2-12. So + * draft-irtf-cfrg-argon2-12 is not consistent within itself - the algorithm + * with G applied just once does not pass its own test vectors. I'm convinced + * that the intention was to apply G twice. + */ + +#include <assert.h> + +#include "putty.h" +#include "ssh.h" +#include "marshal.h" + +/* ---------------------------------------------------------------------- + * Argon2 uses data marshalling rules similar to SSH but with 32-bit integers + * stored little-endian. Start with some local BinarySink routines for storing + * a uint32 and a string in that fashion. 
+ */ + +static void BinarySink_put_uint32_le(BinarySink *bs, unsigned long val) +{ + unsigned char data[4]; + PUT_32BIT_LSB_FIRST(data, val); + bs->write(bs, data, sizeof(data)); +} + +static void BinarySink_put_stringpl_le(BinarySink *bs, ptrlen pl) +{ + /* Check that the string length fits in a uint32, without doing a + * potentially implementation-defined shift of more than 31 bits */ + assert((pl.len >> 31) < 2); + + BinarySink_put_uint32_le(bs, pl.len); + bs->write(bs, pl.ptr, pl.len); +} + +#define put_uint32_le(bs, val) \ + BinarySink_put_uint32_le(BinarySink_UPCAST(bs), val) +#define put_stringpl_le(bs, val) \ + BinarySink_put_stringpl_le(BinarySink_UPCAST(bs), val) + +/* ---------------------------------------------------------------------- + * Argon2 defines a hash-function family that's an extension of BLAKE2b to + * generate longer output digests, by repeatedly outputting half of a BLAKE2 + * hash output and then re-hashing the whole thing until there are 64 or fewer + * bytes left to output. The spec calls this H' (a variant of the original + * hash it calls H, which is the unmodified BLAKE2b). + */ + +static ssh_hash *hprime_new(unsigned length) +{ + ssh_hash *h = blake2b_new_general(length > 64 ? 64 : length); + put_uint32_le(h, length); + return h; +} + +static void hprime_final(ssh_hash *h, unsigned length, void *vout) +{ + uint8_t *out = (uint8_t *)vout; + + while (length > 64) { + uint8_t hashbuf[64]; + ssh_hash_final(h, hashbuf); + + unsigned chunk = 32; + if (chunk > length) + chunk = length; + memcpy(out, hashbuf, chunk); + out += chunk; + length -= chunk; + + h = blake2b_new_general(length > 64 ? 64 : length); + put_data(h, hashbuf, 64); + + smemclr(hashbuf, sizeof(hashbuf)); + } + + ssh_hash_final(h, out); +} + +/* Externally visible entry point for the long hash function. This is only + * used by testcrypt, so it would be overkill to set it up like a proper + * ssh_hash. 
*/ +strbuf *argon2_long_hash(unsigned length, ptrlen data) +{ + ssh_hash *h = hprime_new(length); + put_datapl(h, data); + strbuf *out = strbuf_new(); + hprime_final(h, length, strbuf_append(out, length)); + return out; +} + +/* ---------------------------------------------------------------------- + * Argon2's own mixing function G, which operates on 1Kb blocks of data. + * + * The definition of G in the spec takes two 1Kb blocks as input and produces + * a 1Kb output block. The first thing that happens to the input blocks is + * that they get XORed together, and then only the XOR output is used, so you + * could perfectly well regard G as a 1Kb->1Kb function. + */ + +static inline uint64_t ror(uint64_t x, unsigned rotation) +{ + unsigned lshift = 63 & -rotation, rshift = 63 & rotation; + return (x << lshift) | (x >> rshift); +} + +static inline uint64_t trunc32(uint64_t x) +{ + return x & 0xFFFFFFFF; +} + +/* Internal function similar to the BLAKE2b round, which mixes up four 64-bit + * words */ +static inline void GB(uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d) +{ + *a += *b + 2 * trunc32(*a) * trunc32(*b); + *d = ror(*d ^ *a, 32); + *c += *d + 2 * trunc32(*c) * trunc32(*d); + *b = ror(*b ^ *c, 24); + *a += *b + 2 * trunc32(*a) * trunc32(*b); + *d = ror(*d ^ *a, 16); + *c += *d + 2 * trunc32(*c) * trunc32(*d); + *b = ror(*b ^ *c, 63); +} + +/* Higher-level internal function which mixes up sixteen 64-bit words. This is + * applied to different subsets of the 128 words in a kilobyte block, and the + * API here is designed to make it easy to apply in the circumstances the spec + * requires. In every call, the sixteen words form eight pairs adjacent in + * memory, whose addresses are in arithmetic progression. So the 16 input + * words are in[0], in[1], in[instep], in[instep+1], ..., in[7*instep], + * in[7*instep+1], and the 16 output words similarly. 
*/
+static inline void P(uint64_t *out, unsigned outstep,
+                     uint64_t *in, unsigned instep)
+{
+    /* v[0..15] point at the sixteen output words in order: pair k of the
+     * progression supplies v[2k] and v[2k+1]. The input is copied across
+     * as the table is built, so all the mixing happens in 'out'. */
+    uint64_t *v[16];
+    for (unsigned pair = 0; pair < 8; pair++) {
+        uint64_t *dst = out + pair * outstep;
+        const uint64_t *src = in + pair * instep;
+        dst[0] = src[0];
+        dst[1] = src[1];
+        v[2 * pair] = dst;
+        v[2 * pair + 1] = dst + 1;
+    }
+
+    /* Four mixes down the columns of the 4x4 arrangement of words... */
+    GB(v[0], v[4], v[8], v[12]);
+    GB(v[1], v[5], v[9], v[13]);
+    GB(v[2], v[6], v[10], v[14]);
+    GB(v[3], v[7], v[11], v[15]);
+
+    /* ...followed by four along its diagonals. */
+    GB(v[0], v[5], v[10], v[15]);
+    GB(v[1], v[6], v[11], v[12]);
+    GB(v[2], v[7], v[8], v[13]);
+    GB(v[3], v[4], v[9], v[14]);
+}
+
+/* G itself, applied to the input blocks X and Y, each 1Kb. Callers nearly
+ * always want the result XORed into a block that already holds data, so
+ * that is what this does: whatever is at 'out' beforehand has G(X,Y) XORed
+ * over it. The inputs are read in full before 'out' is written, and some
+ * callers in this file pass the same buffer as both 'out' and X. */
+static void G_xor(uint8_t *out, const uint8_t *X, const uint8_t *Y)
+{
+    uint64_t xored[128], by_rows[128], by_cols[128];
+
+    /* Decode both inputs as little-endian 64-bit words and XOR them. */
+    for (unsigned w = 0; w < 128; w++) {
+        uint64_t xw = GET_64BIT_LSB_FIRST(X + 8*w);
+        uint64_t yw = GET_64BIT_LSB_FIRST(Y + 8*w);
+        xored[w] = xw ^ yw;
+    }
+
+    /* First round of P: eight runs of sixteen consecutive words. */
+    for (unsigned row = 0; row < 8; row++)
+        P(by_rows + 16*row, 2, xored + 16*row, 2);
+
+    /* Second round of P: eight groups, each taking an adjacent pair of
+     * words from every one of those runs. */
+    for (unsigned col = 0; col < 8; col++)
+        P(by_cols + 2*col, 16, by_rows + 2*col, 16);
+
+    /* Fold the XORed input and the mixed result into the output block. */
+    for (unsigned w = 0; w < 128; w++) {
+        uint64_t previous = GET_64BIT_LSB_FIRST(out + 8*w);
+        PUT_64BIT_LSB_FIRST(out + 8*w, previous ^ xored[w] ^ by_cols[w]);
+    }
+
+    smemclr(xored, sizeof(xored));
+    smemclr(by_rows, sizeof(by_rows));
+    smemclr(by_cols, sizeof(by_cols));
+}
+
+/* ----------------------------------------------------------------------
+ * The main Argon2 function. 
+ */
+
+/*
+ * Parameters, using the single-letter names from the Argon2 spec:
+ *
+ *   p  parallelism: the number of lanes (rows) in the block array
+ *   T  length in bytes of the output tag written to 'out'
+ *   m  memory to use, in 1Kb blocks; must be at least 8*p
+ *   t  number of passes over the array
+ *   y  flavour: 0 = Argon2d, 1 = Argon2i, 2 = Argon2id
+ *   P, S, K, X  password, salt, optional secret key, optional associated
+ *               data
+ *   out  receives the T-byte result
+ */
+static void argon2_internal(uint32_t p, uint32_t T, uint32_t m, uint32_t t,
+                            uint32_t y, ptrlen P, ptrlen S, ptrlen K, ptrlen X,
+                            uint8_t *out)
+{
+    /*
+     * Parameter sanity. With no lanes we would divide by zero below, and
+     * with fewer than 8 blocks per lane the segments are too short: a
+     * segment length of 0 makes the array empty, so that the initial setup
+     * writes outside it, and a segment length of 1 leaves no room for the
+     * two pre-filled columns in the first segment. The second check is
+     * phrased as a division so that 8*p cannot overflow; it also ensures
+     * that 4*p below fits in 32 bits.
+     */
+    assert(p > 0);
+    assert(m / 8 >= p);
+
+    /*
+     * Start by hashing all the input data together: the four string arguments
+     * (password P, salt S, optional secret key K, optional associated data
+     * X), plus all the parameters for the function's memory and time usage.
+     *
+     * The output of this hash is the sole input to the subsequent mixing
+     * step: Argon2 does not preserve any more entropy from the inputs, it
+     * just makes it extra painful to get the final answer.
+     */
+    uint8_t h0[64];
+    {
+        ssh_hash *h = blake2b_new_general(64);
+        put_uint32_le(h, p);
+        put_uint32_le(h, T);
+        put_uint32_le(h, m);
+        put_uint32_le(h, t);
+        put_uint32_le(h, 0x13);        /* hash function version number */
+        put_uint32_le(h, y);
+        put_stringpl_le(h, P);
+        put_stringpl_le(h, S);
+        put_stringpl_le(h, K);
+        put_stringpl_le(h, X);
+        ssh_hash_final(h, h0);
+    }
+
+    struct blk { uint8_t data[1024]; };
+
+    /*
+     * Array of 1Kb blocks. The total size is (approximately) m, the
+     * caller-specified parameter for how much memory to use; the blocks are
+     * regarded as a rectangular array of p rows ('lanes') by q columns, where
+     * p is the 'parallelism' input parameter (the lanes can be processed
+     * concurrently up to a point) and q is whatever makes the product pq come
+     * to m.
+     *
+     * Additionally, each row is divided into four equal 'segments', which are
+     * important to the way the algorithm decides which blocks to use as input
+     * to each step of the function.
+     *
+     * The term 'slice' refers to a whole set of vertically aligned segments,
+     * i.e. slice 0 is the whole left quarter of the array, and slice 3 the
+     * whole right quarter.
+     */
+    size_t SL = m / (4*p); /* segment length: # of 1Kb blocks in a segment */
+    size_t q = 4 * SL;     /* width of the array: 4 segments times SL */
+    size_t mprime = q * p; /* total size of the array, approximately m */
+
+    /* Allocate the memory. */
+    struct blk *B = snewn(mprime, struct blk);
+    memset(B, 0, mprime * sizeof(struct blk));
+
+    /*
+     * Initial setup: fill the first two full columns of the array with data
+     * expanded from the starting hash h0. Each block is the result of using
+     * the long-output hash function H' to hash h0 itself plus the block's
+     * coordinates in the array.
+     */
+    for (size_t i = 0; i < p; i++) {
+        ssh_hash *h = hprime_new(1024);
+        put_data(h, h0, 64);
+        put_uint32_le(h, 0);
+        put_uint32_le(h, i);
+        hprime_final(h, 1024, B[i].data);
+    }
+    for (size_t i = 0; i < p; i++) {
+        ssh_hash *h = hprime_new(1024);
+        put_data(h, h0, 64);
+        put_uint32_le(h, 1);
+        put_uint32_le(h, i);
+        hprime_final(h, 1024, B[i+p].data);
+    }
+
+    /*
+     * Declarations for the main loop.
+     *
+     * The basic structure of the main loop is going to involve processing the
+     * array one whole slice (vertically divided quarter) at a time. Usually
+     * we'll write a new value into every single block in the slice, except
+     * that in the initial slice on the first pass, we've already written
+     * values into the first two columns during the initial setup above. So
+     * 'jstart' indicates the starting index in each segment we process; it
+     * starts off as 2 so that we don't overwrite the initial setup, and then
+     * after the first slice is done, we set it to 0, and it stays there.
+     *
+     * d_mode indicates whether we're being data-dependent (true) or
+     * data-independent (false). In the hybrid Argon2id mode, we start off
+     * independent, and then once we've mixed things up enough, switch over to
+     * dependent mode to force long serial chains of computation.
+     */
+    size_t jstart = 2;
+    bool d_mode = (y == 0);
+    struct blk out2i, tmp2i, in2i;
+
+    /* Outermost loop: t whole passes from left to right over the array */
+    for (size_t pass = 0; pass < t; pass++) {
+
+        /* Within that, we process the array in its four main slices */
+        for (unsigned slice = 0; slice < 4; slice++) {
+
+            /* In Argon2id mode, if we're half way through the first pass,
+             * this is the moment to switch d_mode from false to true */
+            if (pass == 0 && slice == 2 && y == 2)
+                d_mode = true;
+
+            /* Loop over every segment in the slice (i.e. every row). So i is
+             * the y-coordinate of each block we process. */
+            for (size_t i = 0; i < p; i++) {
+
+                /* And within that segment, process the blocks from left to
+                 * right, starting at 'jstart' (usually 0, but 2 in the first
+                 * slice). */
+                for (size_t jpre = jstart; jpre < SL; jpre++) {
+
+                    /* j is the x-coordinate of each block we process, made up
+                     * of the slice number and the index 'jpre' within the
+                     * segment. */
+                    size_t j = slice * SL + jpre;
+
+                    /* jm1 is j-1 (mod q) */
+                    uint32_t jm1 = (j == 0 ? q-1 : j-1);
+
+                    /*
+                     * Construct two 32-bit pseudorandom integers J1 and J2.
+                     * This is the part of the algorithm that varies between
+                     * the data-dependent and independent modes.
+                     */
+                    uint32_t J1, J2;
+                    if (d_mode) {
+                        /*
+                         * Data-dependent: grab the first 64 bits of the block
+                         * to the left of this one.
+                         */
+                        J1 = GET_32BIT_LSB_FIRST(B[i + p * jm1].data);
+                        J2 = GET_32BIT_LSB_FIRST(B[i + p * jm1].data + 4);
+                    } else {
+                        /*
+                         * Data-independent: generate pseudorandom data by
+                         * hashing a sequence of preimage blocks that include
+                         * all our input parameters, plus the coordinates of
+                         * this point in the algorithm (array position and
+                         * pass number) to make all the hash outputs distinct.
+                         *
+                         * The hash we use is G itself, applied twice (see
+                         * comment at top of file). So we generate 1Kb of data
+                         * at a time, which is enough for 128 (J1,J2) pairs.
+                         * Hence we only need to do the hashing if our index
+                         * within the segment is a multiple of 128, or if
+                         * we're at the very start of the algorithm (in which
+                         * case we started at 2 rather than 0). After that we
+                         * can just keep picking data out of our most recent
+                         * hash output.
+                         */
+                        if (jpre == jstart || jpre % 128 == 0) {
+                            /*
+                             * Hash preimage is mostly zeroes, with a
+                             * collection of assorted integer values we had
+                             * anyway.
+                             */
+                            memset(in2i.data, 0, sizeof(in2i.data));
+                            PUT_64BIT_LSB_FIRST(in2i.data +  0, pass);
+                            PUT_64BIT_LSB_FIRST(in2i.data +  8, i);
+                            PUT_64BIT_LSB_FIRST(in2i.data + 16, slice);
+                            PUT_64BIT_LSB_FIRST(in2i.data + 24, mprime);
+                            PUT_64BIT_LSB_FIRST(in2i.data + 32, t);
+                            PUT_64BIT_LSB_FIRST(in2i.data + 40, y);
+                            PUT_64BIT_LSB_FIRST(in2i.data + 48, jpre / 128 + 1);
+
+                            /*
+                             * Now apply G twice to generate the hash output
+                             * in out2i.
+                             */
+                            memset(tmp2i.data, 0, sizeof(tmp2i.data));
+                            G_xor(tmp2i.data, tmp2i.data, in2i.data);
+                            memset(out2i.data, 0, sizeof(out2i.data));
+                            G_xor(out2i.data, out2i.data, tmp2i.data);
+                        }
+
+                        /*
+                         * Extract J1 and J2 from the most recent hash output
+                         * (whether we've just computed it or not).
+                         */
+                        J1 = GET_32BIT_LSB_FIRST(
+                            out2i.data + 8 * (jpre % 128));
+                        J2 = GET_32BIT_LSB_FIRST(
+                            out2i.data + 8 * (jpre % 128) + 4);
+                    }
+
+                    /*
+                     * Now convert J1 and J2 into the index of an existing
+                     * block of the array to use as input to this step. This
+                     * is fairly fiddly.
+                     *
+                     * The easy part: the y-coordinate of the input block is
+                     * obtained by reducing J2 mod p, except that at the very
+                     * start of the algorithm (processing the first slice on
+                     * the first pass) we simply use the same y-coordinate as
+                     * our output block.
+                     *
+                     * Note that it's safe to use the ordinary % operator
+                     * here, without any concern for timing side channels: in
+                     * data-independent mode J2 is not correlated to any
+                     * secrets, and in data-dependent mode we're going to be
+                     * giving away side-channel data _anyway_ when we use it
+                     * as an array index (and by assumption we don't care,
+                     * because it's already massively randomised from the real
+                     * inputs).
+                     */
+                    uint32_t index_l = (pass == 0 && slice == 0) ? i : J2 % p;
+
+                    /*
+                     * The hard part: which block in this array row do we use?
+                     *
+                     * First, we decide what the possible candidates are. This
+                     * requires some case analysis, and depends on whether the
+                     * array row is the same one we're writing into or not.
+                     *
+                     * If it's not the same row: we can't use any block from
+                     * the current slice (because the segments within a slice
+                     * have to be processable in parallel, so in a concurrent
+                     * implementation those blocks are potentially in the
+                     * process of being overwritten by other threads). But the
+                     * other three slices are fair game, except that in the
+                     * first pass, slices to the right of us won't have had
+                     * any values written into them yet at all.
+                     *
+                     * If it is the same row, we _are_ allowed to use blocks
+                     * from the current slice, but only the ones before our
+                     * current position.
+                     *
+                     * In both cases, we also exclude the individual _column_
+                     * just to the left of the current one. (The block
+                     * immediately to our left is going to be the _other_
+                     * input to G, but the spec also says that we avoid that
+                     * column even in a different row.)
+                     *
+                     * All of this means that we end up choosing from a
+                     * cyclically contiguous interval of blocks within this
+                     * lane, but the start and end points require some thought
+                     * to get them right.
+                     */
+
+                    /* Start position is the beginning of the _next_ slice
+                     * (containing data from the previous pass), unless we're
+                     * on pass 0, where the start position has to be 0. */
+                    uint32_t Wstart = (pass == 0 ? 0 : (slice + 1) % 4 * SL);
+
+                    /* End position splits up by cases. */
+                    uint32_t Wend;
+                    if (index_l == i) {
+                        /* Same lane as output: we can use anything up to (but
+                         * not including) the block immediately left of us. */
+                        Wend = jm1;
+                    } else {
+                        /* Different lane from output: we can use anything up
+                         * to the previous slice boundary, or one less than
+                         * that if we're at the very left edge of our slice
+                         * right now. */
+                        Wend = SL * slice;
+                        if (jpre == 0)
+                            Wend = (Wend + q-1) % q;
+                    }
+
+                    /* Total number of blocks available to choose from */
+                    uint32_t Wsize = (Wend + q - Wstart) % q;
+
+                    /* Fiddly computation from the spec that chooses from the
+                     * available blocks, in a deliberately non-uniform
+                     * fashion, using J1 as pseudorandom input data. Output is
+                     * zz which is the index within our contiguous interval.
+                     * (The intermediate names avoid reusing 'y', which is
+                     * this function's flavour parameter.) */
+                    uint32_t J1sq_hi = ((uint64_t)J1 * J1) >> 32;
+                    uint32_t back = ((uint64_t)Wsize * J1sq_hi) >> 32;
+                    uint32_t zz = Wsize - 1 - back;
+
+                    /* And index_z is the actual x coordinate of the block we
+                     * want. */
+                    uint32_t index_z = (Wstart + zz) % q;
+
+                    /* Phew! Combine that block with the one immediately to
+                     * our left, and XOR over the top of whatever is already
+                     * in our current output block. */
+                    G_xor(B[i + p * j].data, B[i + p * jm1].data,
+                          B[index_l + p * index_z].data);
+                }
+            }
+
+            /* We've finished processing a slice. Reset jstart to 0. It will
+             * only _not_ have been 0 if this was pass 0 slice 0, in which
+             * case it still had its initial value of 2 to avoid the starting
+             * data. */
+            jstart = 0;
+        }
+    }
+
+    /*
+     * The main output is all done. Final output works by taking the XOR of
+     * all the blocks in the rightmost column of the array, and then using
+     * that as input to our long hash H'. The output of _that_ is what we
+     * deliver to the caller.
+     */
+
+    struct blk C = B[p * (q-1)];
+    for (size_t i = 1; i < p; i++)
+        memxor(C.data, C.data, B[i + p * (q-1)].data, 1024);
+
+    {
+        ssh_hash *h = hprime_new(T);
+        put_data(h, C.data, 1024);
+        hprime_final(h, T, out);
+    }
+
+    /*
+     * Clean up. h0 is included: it is derived directly from the password
+     * and secret key, and everything else here is derived from it.
+     */
+    smemclr(h0, sizeof(h0));
+    smemclr(out2i.data, sizeof(out2i.data));
+    smemclr(tmp2i.data, sizeof(tmp2i.data));
+    smemclr(in2i.data, sizeof(in2i.data));
+    smemclr(C.data, sizeof(C.data));
+    smemclr(B, mprime * sizeof(struct blk));
+    sfree(B);
+}
+
+/*
+ * Wrapper function that appends to a strbuf (which sshpubk.c will want).
+ */
+void argon2(Argon2Flavour flavour, uint32_t mem, uint32_t passes,
+            uint32_t parallel, uint32_t taglen,
+            ptrlen P, ptrlen S, ptrlen K, ptrlen X, strbuf *out)
+{
+    argon2_internal(parallel, taglen, mem, passes, flavour,
+                    P, S, K, X, strbuf_append(out, taglen));
+}
+
+/*
+ * Wrapper function which dynamically chooses the number of passes to run in
+ * order to hit an approximate total amount of CPU time. Writes the result
+ * into 'passes'.
+ */
+void argon2_choose_passes(
+    Argon2Flavour flavour, uint32_t mem,
+    uint32_t milliseconds, uint32_t *passes,
+    uint32_t parallel, uint32_t taglen,
+    ptrlen P, ptrlen S, ptrlen K, ptrlen X,
+    strbuf *out)
+{
+    unsigned long desired_time = (TICKSPERSEC * milliseconds) / 1000;
+
+    /*
+     * We only need the time taken to be approximately right, so we
+     * scale up the number of passes geometrically, which avoids
+     * taking O(t^2) time to find a pass count taking time t.
+     *
+     * Using the Fibonacci numbers is slightly nicer than the obvious
+     * approach of powers of 2, because it's still very easy to
+     * compute, and grows less fast (powers of 1.6 instead of 2), so
+     * you get just a touch more precision. 
+ */ + uint32_t a = 1, b = 1; + + while (true) { + unsigned long start_time = GETTICKCOUNT(); + argon2(flavour, mem, b, parallel, taglen, P, S, K, X, out); + unsigned long ticks = GETTICKCOUNT() - start_time; + + /* But just in case computers get _too_ fast, we have to cap + * the growth before it gets past the uint32_t upper bound! So + * if computing a+b would overflow, stop here. */ + + if (ticks >= desired_time || a > (uint32_t)~b) { + *passes = b; + return; + } else { + strbuf_clear(out); + + /* Next Fibonacci number: replace (a, b) with (b, a+b) */ + b += a; + a = b - a; + } + } +} diff --git a/test/cryptsuite.py b/test/cryptsuite.py index b3d6e623..e39e0d0d 100755 --- a/test/cryptsuite.py +++ b/test/cryptsuite.py @@ -1757,6 +1757,199 @@ culpa qui officia deserunt mollit anim id est laborum. "daafcf2bd6fccf976cbc234b71cd9f4f7d56fe0eb33a40018707089a215c44a8" "4b272d0329ae6d85a0f8acc7e964dc2facb715ba472bb6")) + def testArgon2LongHash(self): + # Unit-test the Argon2 long hash function H', which starts off + # the same as BLAKE2b, but comes with its own method of + # extending the output length past 64 bytes. + # + # I generated these test values using a test program linked + # against the reference implementation's libargon2.a and + # calling its blake2b_long function. 
+ preimage = b'hello, world' + + self.assertEqualBin(argon2_long_hash(1, preimage), unhex("8b")) + self.assertEqualBin(argon2_long_hash(2, preimage), unhex("1ff9")) + self.assertEqualBin(argon2_long_hash(63, preimage), unhex( + "e2c997721f1d64aa8c25e588fb8ab19646ce6d5c2a431fa560fcb813e55dd481" + "322d2630d95ca6b1b63317b13d6b111e5816170c80c3ca7d5b4bf894096de4")) + self.assertEqualBin(argon2_long_hash(64, preimage), unhex( + "0c7ba7ee6d510b4bb5c9b69ac91e25e0b11aa30dd6234b8e61b0fe1537c037b8" + "8ed5aa59a277e8cc07095c81aff26d08967e4dfdabd32db8b6af6ceb78cf8c47")) + self.assertEqualBin(argon2_long_hash(65, preimage), unhex( + "680941abbd8fc80f28c38d623e90903f08709bf76575e2775d4ce01c31b192c8" + "73038d9a31af8991c8b1ad4f2b1991f4d15f73ab0f4f3add415c297a12eb9ddb" + "76")) + self.assertEqualBin(argon2_long_hash(95, preimage), unhex( + "4be28c51850fed70d9403e1406b6ba68a83d98cf222a4ee162beef60fd3384df" + "eba3fce9d95f646982eb384ac943ce5263cb03428fd8d261cc41ffdb7ba328fe" + "098526f2b49593f9e7f38188598ce4693b59f4dd32db30c1be9a9d35784fa0")) + self.assertEqualBin(argon2_long_hash(96, preimage), unhex( + "20295ea01e822cca113f668f33e5e481ed5879bfd7de6359ea42d497da97be52" + "2cdd518d34ae32c44cabd45249b4e697626b0b14b6a33a2bd138be0a4bceeaf4" + "9528f93acef01b093ee84d8d871d1ee6cf7c10e83ad0619631aed19345166f03")) + self.assertEqualBin(argon2_long_hash(97, preimage), unhex( + "d24b31f3ac0baad168d524efc4bafee55fef743fd60b14e28b860d7523e319c7" + "520e2d5457cc3d06dc1044530afdf6990fa12e38d5802eb642f8e77fcfee2c0b" + "1f84a28877f2f2f049ed9299e1e0230f98af3a161185970aad21f0ea0f5184cf" + "90")) + self.assertEqualBin(argon2_long_hash(127, preimage), unhex( + "5d1e8380450dbc985418ed1f3700b925ae0719e4486e29131c81bca7083ac6b8" + "f535c3398488e34d3dc1390de44097f1eee498f10ebe85b579e99a7672023b01" + "ca5c20e63c595b640e00d80f113a52e3773719889b266ab4c65269c11fb212e4" + "75f2b769bb26321bb60ecc0d490821e5056d7dfc9def3cd065d3ba90360764")) + self.assertEqualBin(argon2_long_hash(128, preimage), unhex( + 
"be15b316f3483c4d0d00f71a65b974894a2025f441b79b9fe461bc740cb0b039" + "c4fe914f61c05a612d63ebc50a662b2d59b1996091e5e3474340544ea46a46cb" + "25c41ff700fafcd96c4f12ddc698cd2426558f960696837ea8170fd2fe284b54" + "8f585f97919ef14f2b3cbb351eb98872add7ba6d08c1401232df6cc878fbeb22")) + self.assertEqualBin(argon2_long_hash(129, preimage), unhex( + "83da464c278dcb12c29b6685fee6d32f0b461337c155369ad0d56b58b0aa5f80" + "9aa7b56bd41b664c8d768957f8f0e40999fb0178eb53cf83f31d725bf92881bc" + "900774bce4cdf56b6386ad3de6891d11a0ccd4564a3431fc4c24105a02d0a6a2" + "434712b9a7471f3223c72a6e64912200d0a3d149a19d06fe9dc8ec09d7ed5a48" + "bb")) + self.assertEqualBin(argon2_long_hash(511, preimage), unhex( + "30c0c0d0467e7665368db0b40a2324a61fb569d35172de2df53a9739a8d18e60" + "b4f25d521c8855604be3e24ea56302566074323d94c0bd3a33d08f185d8ba5ac" + "a2bc3fb2e4c4e5ffec5778daea67c6b5913c9cac16f2e5c7b7818e757fa747b3" + "69e586d616010a752762f69c604238ed8738430366fbdb7493454fa02391a76b" + "30f241695b9fa8d3a3116227c6bb6f72d325cf104ab153d15f928b22767d467d" + "4bf7e16176aaa7315954b7872061933c12d548f1f93a8abb9d73791661bee521" + "b2ae51be373a229dfef32787234c1be5846d133563002b9a029178716ad41e70" + "1539d3fad300c77607c5217701e3e485d72c980f3f71d525c8148375a2f8d22c" + "a211ba165330a90b7e0e6baa6073833925c23bdd388ee904f38463c7e6b85475" + "09b810aae5c9ffc5dd902c2ffe049c338e3ae2c6416d3b874d6a9d384089564c" + "0d8e4dce9b6e47e1d5ec9087bf526cc9fa35aab1893a0588d31b77fea37e0799" + "468deacde47629d2960a3519b3bcd4e22364a9cccd3b128cba21cac27f140d53" + "f79c11e4157e4cb48272eecdf62f52084a27e5b0933bbe66ded17e2df6f8d398" + "f6c479c3c716457820ad177b8bd9334cb594e03d09fcc4f82d4385e141eacd7d" + "9ad1e1c4cb42788af70bac0509f0a891e662960955490abf2763373803e8c89c" + "df632579cb9c647634b30df214a3d67b92fd55d283c42c63b470a48a78cd5b")) + self.assertEqualBin(argon2_long_hash(512, preimage), unhex( + "79a6974e29a9a6c069e0156774d35c5014a409f5ffc60013725367a7208d4929" + "7d228637751768a31a59e27aa89372f1bcc095a6fa331198a5bd5ad053ba2ebb" + 
"cbcc501ea55cf142e8d95209228c9ab60cd104d5077472f2a9ecaa071aed6ee9" + "5de29e188b7399d5b6b7ed897b2bc4dd1ea745eb9974e39ca6fb983380cc537a" + "c04dfe6caefe85faf206b1613092ebadf791eaa8a5b814c9a79a73a5733b0505" + "a47163c10a0f7309df6663896df6079a7c88c6879bb591a40abd398c6deda792" + "1cc3986435b1c840a768b2fa507446f2f77a406b1b2f739f7795db24789c8927" + "24b4c84b7005445123154f8cd2ba63a7ede672af5d197f846700732025c9931d" + "1c67c5493417ca394a8f68ba532645815cf7b5102af134ecb4fd9e326f53779a" + "3039dbef6a0880db9e38b6b61d2f9ead969e4224c2d9c69b5897e5eeb7032e83" + "334e192ff50017056ccb84d4cc8eee3ab248d2614643d0174fe18c72186dd967" + "92d8545645ddf4a9b2c7a91c9a71857a399449d7154077a8e9580f1a2d20227d" + "671b455ccb897cba0491e50892120d7877f7776d653cfdb176fa3f64a9e6f848" + "cd681c487b488775aaf698294eec813b2cca90d68d63b5d886d61c1a8e922aaa" + "330fd658ede56e34bcd288048e845eba7b8e2e7cc22ba6c91b523e48017aa878" + "8ce4f91d0e6d6c6706762fb0cc7f465cee3916684fb21e337cfe1b583e0b1e92")) + self.assertEqualBin(argon2_long_hash(513, preimage), unhex( + "32243cfbd7eca582d60b3b8ea3ba3d93783537689c7cbcd1d1cbde46200b8c86" + "617fc00e8a9ae991a1e2f91c67e07d5f0a777d982c1461d0c5474e4e164b053c" + "2808559e2b8a5ac4a46a5fcbc825b1d5302c7b0611940194eb494d45ce7113a2" + "3424b51c199c6a5100ab159ff323eda5feffee4da4155a028a81da9d44e4286b" + "ac3dab4ffce43a80b6ce97a47ea0ac51ee16e8b4d3b68942afdc20e1c21747c4" + "94859c3d3883e7dc19ea416a393a3507683d9d03e6a3a91f8f1cb8a7d5d9892e" + "80c8fb0222527a73a1f59b9dd41770982f2af177a6e96093064534803edd0713" + "71ede53024cedc291d768325bb4e4def9af1b5569c349b64816496c37a8787b5" + "4fbe248372ebadb5ce20e03eaa935dc55ff4b8cbe5d6d844c7b71d4656fef22c" + "5a49f13d75a7a8368a2dbc1e78d732b879bfc5c9467eda2bf4918f0c59037ae3" + "dee7880a171409dd1a4e143c814e60301ac77237f261fa7519a04e68000530f9" + "708ed9fda5609d655560a9491f80f5875ad5725e3120686b73319c6a727932e3" + "20a2174422523498c38fea47aeb20d135ff9fd93c6fa6db0005e0001685d7577" + "33a82a4dc9dd6556b938f7b8dafd0d670846780b9931b815063708189b17877b" + 
"825533bcc250fb576a28be4caa107e6a3a6f7b0c60fb51b0def27008b7e272ac" + "95d610bfa912339799a2e537ce543d7862dddbe31bb224fda4ae283571847a28" + "54")) + self.assertEqualBin(argon2_long_hash(1024, preimage), unhex( + "951252f6fa152124f381266a358d9b78b88e469d08d5fc78e4ea32253c7fc26c" + "3ff1c93529ab4ee6fcf00acf29bbaba934a4014ce2625e0806601c55e6ce70d7" + "121fd82f0904f335c5c7ba07dc6e6adf7582c92f7f255072203ea85844b4fe54" + "817476a20bb742710ffc42750361be94332d0fc721b192309acfa70da43db6ae" + "1d0f0bbe8a3250966a4532b36728162073c9eb3e119ea4c1c187c775dbb25a5d" + "d883e3f65706a5fca897cdc4a8aa7b68ba3f57940c72f3a3396c417e758ba071" + "95be4afba325237c0e2738a74d96fd1350fb623cb2ad40ea8b1e070cf398b98c" + "2865ea40225b81f031f2b405409ca01dc5d9903d3d8e1d6381fbe7ccfc8f3dab" + "eadafd7c976c0ba84a936f78ff7df0f112c089ba88f82bed7f9a6e31a91e5fee" + "f675755454b948de22695660b243b9eca3bcc89608f83d2baa1d73dd6b8bd4f9" + "b995ed9cb0f1edc6e98a49ed841b506c1bf59b43f4b3457a376bbff116c1a4f6" + "07cc62381fc5c19953c68f300c1b51198d40784d812d25810ba404862f04b680" + "6039a074f612ad8b84e0941ba23c915c3e7162c225fbecffdb7dc1ab559b2b54" + "32fe8a498c32e918d8e7e33254ff75077f648827705e987f4d90fba971e78e1a" + "6896b4d775c7359dc950f1e964fa04621aacf3c0988969490f4c72c54caf79e8" + "481053cc0a27ffcd3580aabf9ef1268d498d8a18bd70e9b8402e011753bb7dc7" + "e856c00d988fca924ee7cf61979c38cda8a872e4cc4fbdc90c23a0ded71eb944" + "bb816ab22d9a4380e3e9d1cec818165c2fba6c5d51dcbf452c0cb1779a384937" + "64d695370e13a301eca7be68d4112d2177381514efbb36fe08fc5bc2970301b8" + "06f8e5a57a780e894d5276e2025bb775b6d1861e33c54ab6e3eb72947fbe6f91" + "8174ce24eb4682efbb3c4f01233dc7ce9ef44792e9e876bb03e6751b3d559047" + "d045127d976aa042fc55c690c9048e200065e7b7de19d9353aa9ac9b3e7611f0" + "d1c42d069a300455ca1f7420a352bace89215e705106927510c11b3b1c1486d9" + "f3ab006d2de2ee2c94574f760ce8c246bca229f98c66f06042b14f1fff9a16c0" + "1550237e16d108ce5597299b1eb406a9ee505a29a6e0fa526b3e6beafd336aea" + 
"138b2f31971586f67c5ffffbd6826d1c75666038c43d0bdff4edfc294e064a49" + "2eed43e2dc78d00abc4e85edcd9563b8251b66f57b0f4b6d17f5a3f35c87c488" + "dbeeb84fd720286197c2dec8290eccf3a313747de285b9cd3548e90cf81b3838" + "3ffcc8c2a7f582feb369d05cb96b9b224d05902b3e39e5b96536032e9dddeb9b" + "9d4f40a9c8f544ca37cf8d39d7c8c6a33880e9184ed017bd642db9590759bd10" + "7362048ede5c0257feecc4984584592c566f37fba8469c064015339fb4f03023" + "56ece37fd3655aae2bfc989b9b4c1384efc3503c8866db901802cb36eda9fb00")) + + def testArgon2(self): + # A few tests of my own of Argon2, derived from the reference + # implementation. Each argon2() call passes flavour, mem, passes, parallel, taglen, then pwd, salt, secret, assoc. + pwd = b"password" + salt = b"salt of at least 16 bytes" + secret = b"secret" + assoc = b"associated data" + + # Smallest memory (8Kb) and parallelism (1) parameters the + # reference implementation will accept, but lots of passes + self.assertEqualBin( + argon2('i', 8, 16, 1, 24, pwd, salt, secret, assoc), unhex( + "314da280240a3ca1eedd1f1db417a76eb0741e7df64b8cdf")) + self.assertEqualBin( + argon2('d', 8, 16, 1, 24, pwd, salt, secret, assoc), unhex( + "9cc961cf43e0f86c2d4e202b816dc5bc5b2177e68faa0b08")) + self.assertEqualBin( + argon2('id', 8, 16, 1, 24, pwd, salt, secret, assoc), unhex( + "6cd6c490c582fa597721d772d4e3de166987792491b48c51")) + + # Test a memory cost value that isn't a power of 2. This + # checks a wraparound case during the conversion of J1 to a + # block index, and is a regression test for a bug that nearly + # got past me during original development.
+ self.assertEqualBin( + argon2('i', 104, 16, 2, 24, pwd, salt, secret, assoc), unhex( + "a561963623f1073c9aa8caecdb600c73ffc6de677ba8d97c")) + self.assertEqualBin( + argon2('d', 104, 16, 2, 24, pwd, salt, secret, assoc), unhex( + "a9014db7f1d468fb25b88fa7fc0deac0f2e7f27e25d2cf6e")) + self.assertEqualBin( + argon2('id', 104, 16, 2, 24, pwd, salt, secret, assoc), unhex( + "64f3212b1e7725ffcf9ae2d1753d63e763bcd6970061a435")) + + # Larger parameters that should exercise the pseudorandom + # block indexing reasonably thoroughly. Also generate plenty + # of output data. + self.assertEqualBin( + argon2('i', 1024, 5, 16, 77, pwd, salt, secret, assoc), unhex( + "b008a685ff57730fad0e6f3ef3b9189282c0d9b05303675f43b5f3054724" + "733fcbe8e2639cc2c930535b31b723339041bcd703bf2483455acf86c0e6" + "9ed88c545ad40f1f2068855e4d61e99407")) + self.assertEqualBin( + argon2('d', 1024, 5, 16, 111, pwd, salt, secret, assoc), unhex( + "399ffbcd720c47745b9deb391ed0de7d5e0ffe53aef9f8ef7a7918cfa212" + "53df8cc577affbd5e0c0f8bf6d93c11b2f63973f8fc8f89dccd832fc587e" + "5d61717be6e88ca33eef5d1e168c028bae632a2a723c6c83f8e755f39171" + "5eda1c77c8e2fe06fbdd4e56d35262587e7df73cd7")) + self.assertEqualBin( + argon2('id', 1024, 5, 16, 123, pwd, salt, secret, assoc), unhex( + "6636807289cb9b9c032f48dcc31ffed1de4ca6c1b97e1ce768d690486341" + "2ac84b39d568a81dd01d9ee3ceec6cc23441d95e6abeb4a2024f1f540d56" + "9b799277c4037ddc7195ba783c9158a901adc7d4a5df8357b34a3869e5d6" + "aeae2a21201eef5e347de22c922192e8f46274b0c9d33e965155a91e7686" + "9d530e")) + def testRSAVerify(self): def blobs(n, e, d, p, q, iqmp): pubblob = ssh_string(b"ssh-rsa") + ssh2_mpint(e) + ssh2_mpint(n) @@ -2471,6 +2664,24 @@ class standard_test_vectors(MyTestBase): digest = ssh_hash_digest(h) self.assertEqualBin(digest, unhex(vector['out'])) + def testArgon2(self): + # draft-irtf-cfrg-argon2-12 section 5: one vector per flavour (d, i, id), each called with mem=32, passes=3, parallel=4, taglen=32 and the same constant-byte inputs. + self.assertEqualBin( + argon2('d', 32, 3, 4, 32, b'\x01' * 32, b'\x02' * 16, + b'\x03' * 8, b'\x04' * 12),
unhex("512b391b6f1162975371d30919734294" + "f868e3be3984f3c1a13a4db9fabe4acb")) + self.assertEqualBin( + argon2('i', 32, 3, 4, 32, b'\x01' * 32, b'\x02' * 16, + b'\x03' * 8, b'\x04' * 12), + unhex("c814d9d1dc7f37aa13f0d77f2494bda1" + "c8de6b016dd388d29952a4c4672b6ce8")) + self.assertEqualBin( + argon2('id', 32, 3, 4, 32, b'\x01' * 32, b'\x02' * 16, + b'\x03' * 8, b'\x04' * 12), + unhex("0d640df58d78766c08c037a34a8b53c9" + "d01ef0452d75b65eb52520e96b01e659")) + def testHmacSHA(self): # Test cases from RFC 6234 section 8.5. def vector(key, message, s1=None, s256=None): diff --git a/test/testcrypt.py b/test/testcrypt.py index ba0160f2..973b90ee 100644 --- a/test/testcrypt.py +++ b/test/testcrypt.py @@ -177,7 +177,8 @@ def make_argword(arg, argtype, fnname, argindex, to_preserve): return "true" if arg else "false" if typename in { "hashalg", "macalg", "keyalg", "cipheralg", - "dh_group", "ecdh_alg", "rsaorder", "primegenpolicy"}: + "dh_group", "ecdh_alg", "rsaorder", "primegenpolicy", + "argon2flavour"}: arg = coerce_to_bytes(arg) if isinstance(arg, bytes) and b" " not in arg: return arg diff --git a/testcrypt.c b/testcrypt.c index 0113fbfc..f76417bc 100644 --- a/testcrypt.c +++ b/testcrypt.c @@ -403,6 +403,32 @@ static const PrimeGenerationPolicy *get_primegenpolicy(BinarySource *in) fatal_error("primegenpolicy '%.*s': not found", PTRLEN_PRINTF(name)); } +static Argon2Flavour get_argon2flavour(BinarySource *in) /* Reads one word from the input source and looks it up in the algs[] table below; returns the matching Argon2Flavour, or calls fatal_error if no key matches. */ +{ + static const struct { + const char *key; + Argon2Flavour value; + } algs[] = { + {"d", Argon2d}, + {"i", Argon2i}, + {"id", Argon2id}, + /* I expect to forget which spelling I chose, so let's support many */ + {"argon2d", Argon2d}, + {"argon2i", Argon2i}, + {"argon2id", Argon2id}, + {"Argon2d", Argon2d}, + {"Argon2i", Argon2i}, + {"Argon2id", Argon2id}, + }; + + ptrlen name = get_word(in); + for (size_t i = 0; i < lenof(algs); i++) + if (ptrlen_eq_string(name, algs[i].key)) + return algs[i].value; + + fatal_error("Argon2 flavour '%.*s': not found",
PTRLEN_PRINTF(name)); +} + static uintmax_t get_uint(BinarySource *in) { ptrlen word = get_word(in); @@ -1219,6 +1245,16 @@ PockleStatus pockle_add_prime_wrapper(Pockle *pockle, mp_int *p, } #define pockle_add_prime pockle_add_prime_wrapper +strbuf *argon2_wrapper(Argon2Flavour flavour, uint32_t mem, uint32_t passes, + uint32_t parallel, uint32_t taglen, + ptrlen P, ptrlen S, ptrlen K, ptrlen X) /* Adapter for testcrypt: argon2() writes its result into a caller-supplied strbuf, so this allocates a new one, passes it as the output argument and returns it. The #define after this function makes later uses of the name argon2 in this file refer to the wrapper. */ +{ + strbuf *out = strbuf_new(); + argon2(flavour, mem, passes, parallel, taglen, P, S, K, X, out); + return out; +} +#define argon2 argon2_wrapper + #define OPTIONAL_PTR_FUNC(type) \ typedef TD_val_##type TD_opt_val_##type; \ static TD_opt_val_##type get_opt_val_##type(BinarySource *in) { \ @@ -1254,6 +1290,7 @@ typedef key_components *TD_keycomponents; typedef const PrimeGenerationPolicy *TD_primegenpolicy; typedef struct mpint_list TD_mpint_list; typedef PockleStatus TD_pocklestatus; +typedef Argon2Flavour TD_argon2flavour; #define FUNC0(rettype, function) \ static void handle_##function(BinarySource *in, strbuf *out) { \ diff --git a/testcrypt.h b/testcrypt.h index e7624aa0..12e248b8 100644 --- a/testcrypt.h +++ b/testcrypt.h @@ -261,6 +261,12 @@ FUNC5(int, rsa1_load_s, val_string_binarysource, val_rsa, out_opt_val_string_asc FUNC3(val_string, ppk_save_sb, val_key, opt_val_string_asciz, opt_val_string_asciz) FUNC3(val_string, rsa1_save_sb, val_rsa, opt_val_string_asciz, opt_val_string_asciz) +/* + * Password hashing. argon2 is declared with nine arguments (a flavour, four uints, four strings); argon2_long_hash with a length and a string. + */ +FUNC9(val_string, argon2, argon2flavour, uint, uint, uint, uint, val_string_ptrlen, val_string_ptrlen, val_string_ptrlen, val_string_ptrlen) +FUNC2(val_string, argon2_long_hash, uint, val_string_ptrlen) + /* * Key generation functions.
*/ diff --git a/testsc.c b/testsc.c index 5beeee95..6f9ee003 100644 --- a/testsc.c +++ b/testsc.c @@ -327,6 +327,7 @@ VOLATILE_WRAPPED_DEFN(static, size_t, looplimit, (size_t x)) CIPHERS(CIPHER_TESTLIST, X) \ MACS(MAC_TESTLIST, X) \ HASHES(HASH_TESTLIST, X) \ + X(argon2) \ /* end of list */ static void test_mp_get_nbits(void) @@ -1409,6 +1410,36 @@ struct test { void (*testfn)(void); }; +static void test_argon2(void) /* Calls argon2(Argon2i, ...) once per loop iteration, up to looplimit(16) times, bracketing each call with log_start() and log_end(). */ +{ + /* + * We can only expect the Argon2i variant to pass this stringent + * test for no data-dependency, because the other two variants of + * Argon2 have _deliberate_ data-dependency. + */ + size_t inlen = 48+16+24+8; + uint8_t *indata = snewn(inlen, uint8_t); + ptrlen password = make_ptrlen(indata, 48); + ptrlen salt = make_ptrlen(indata+48, 16); + ptrlen secret = make_ptrlen(indata+48+16, 24); + ptrlen assoc = make_ptrlen(indata+48+16+24, 8); /* the four inputs are consecutive slices of indata: 48 + 16 + 24 + 8 = inlen bytes */ + + strbuf *outdata = strbuf_new(); + strbuf_append(outdata, 256); /* NOTE(review): presumably reserves buffer space up front so the 144-byte output needs no allocation between log_start() and log_end() - confirm against the strbuf implementation */ + + for (size_t i = 0; i < looplimit(16); i++) { + strbuf_clear(outdata); + random_read(indata, inlen); + + log_start(); + argon2(Argon2i, 32, 2, 2, 144, password, salt, secret, assoc, outdata); + log_end(); + } + + sfree(indata); + strbuf_free(outdata); +} + static const struct test tests[] = { #define STRUCT_TEST(X) { #X, test_##X }, TESTLIST(STRUCT_TEST)