From 8581676ee9c4ebd6365444d3d98973bf8fe33ccb Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Sun, 7 Jun 2015 12:26:26 +0100 Subject: [PATCH] Dedicated routines for poly1305 arithmetic. Rather than doing arithmetic mod 2^130-5 using the general-purpose Bignum library, which requires lots of mallocs and frees per operation and also uses a general-purpose divide routine for each modular reduction, we now have some dedicated routines in sshccp.c to do arithmetic mod 2^130-5 in a more efficient way, and hopefully also with data-independent performance. Because PuTTY's target platforms don't all use the same size of bignum component, I've arranged to auto-generate the arithmetic functions using a Python script living in the 'contrib' directory. As and when we need to support an extra BignumInt size, that script should still be around to re-run with different arguments. --- contrib/make1305.py | 267 +++++++++++++++ sshccp.c | 804 ++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 1011 insertions(+), 60 deletions(-) create mode 100755 contrib/make1305.py diff --git a/contrib/make1305.py b/contrib/make1305.py new file mode 100755 index 00000000..ede220df --- /dev/null +++ b/contrib/make1305.py @@ -0,0 +1,267 @@ +#!/usr/bin/env python + +import sys + +class Output(object): + def __init__(self, bignum_int_bits): + self.bignum_int_bits = bignum_int_bits + self.text = "" + self.vars = [] + def stmt(self, statement): + self.text += " %s;\n" % statement + def register_var(self, var): + self.vars.append(var) + def finalise(self): + for var in self.vars: + assert var.maxval == 0, "Variable not clear: %s" % var.name + return self.text + +class Variable(object): + def __init__(self, out, name): + self.out = out + self.maxval = 0 + self.name = name + self.placeval = None + self.out.stmt("BignumDblInt %s" % (self.name)) + self.out.register_var(self) + def clear(self, placeval): + self.maxval = 0 + self.placeval = placeval + self.out.stmt("%s = 0" % (self.name)) + def 
set_word(self, name, limit=None): + if limit is not None: + self.maxval = limit-1 + else: + self.maxval = (1 << self.out.bignum_int_bits) - 1 + assert self.maxval < (1 << 2*self.out.bignum_int_bits) + self.out.stmt("%s = %s" % (self.name, name)) + def add_word(self, name, limit=None): + if limit is not None: + self.maxval += limit-1 + else: + self.maxval += (1 << self.out.bignum_int_bits) - 1 + assert self.maxval < (1 << 2*self.out.bignum_int_bits) + self.out.stmt("%s += %s" % (self.name, name)) + def add_input_word(self, fmt, wordpos, limit=None): + assert self.placeval == wordpos * self.out.bignum_int_bits + self.add_word(fmt % wordpos, limit) + def set_to_product(self, a, b, placeval): + self.maxval = ((1 << self.out.bignum_int_bits) - 1) ** 2 + assert self.maxval < (1 << 2*self.out.bignum_int_bits) + self.out.stmt("%s = (BignumDblInt)(%s) * (%s)" % (self.name, a, b)) + self.placeval = placeval + def add_bottom_half(self, srcvar): + self.add_word("%s & BIGNUM_INT_MASK" % (srcvar.name)) + def add_top_half(self, srcvar): + self.add_word("%s >> %d" % (srcvar.name, self.out.bignum_int_bits)) + def unload_into(self, topvar, botvar): + assert botvar.placeval == self.placeval + botvar.add_bottom_half(self) + assert topvar.placeval == self.placeval + self.out.bignum_int_bits + topvar.add_top_half(self) + self.maxval = 0 + def output_word(self, bitpos, bits, destfmt, destwordpos): + assert bitpos == 0 + assert self.placeval == destwordpos * self.out.bignum_int_bits + dest = destfmt % destwordpos + if bits == self.out.bignum_int_bits: + self.out.stmt("%s = %s" % (dest, self.name)) + else: + self.out.stmt("%s = %s & (((BignumInt)1 << %d)-1)" % + (dest, self.name, bits)) + def transfer_to_next_acc(self, bitpos, bits, pow5, destvar): + destbitpos = self.placeval + bitpos - 130 * pow5 - destvar.placeval + #print "transfer", "*%d" % 5**pow5, self.name, self.placeval, bitpos, destvar.name, destvar.placeval, destbitpos, bits + assert 0 <= bitpos < bitpos+bits <= 
self.out.bignum_int_bits + assert 0 <= destbitpos < destbitpos+bits <= self.out.bignum_int_bits + expr = self.name + if bitpos > 0: + expr = "(%s >> %d)" % (expr, bitpos) + expr = "(%s & (((BignumInt)1 << %d)-1))" % (expr, bits) + self.out.stmt("%s += %s * ((BignumDblInt)%d << %d)" % + (destvar.name, expr, 5**pow5, destbitpos)) + destvar.maxval += (((1 << bits)-1) << destbitpos) * (5**pow5) + def shift_down_from(self, top): + if top is not None: + self.out.stmt("%s = %s + (%s >> %d)" % + (self.name, top.name, self.name, + self.out.bignum_int_bits)) + topmaxval = top.maxval + else: + self.out.stmt("%s >>= %d" % (self.name, self.out.bignum_int_bits)) + topmaxval = 0 + self.maxval = topmaxval + self.maxval >> self.out.bignum_int_bits + assert self.maxval < (1 << 2*self.out.bignum_int_bits) + if top is not None: + assert self.placeval + self.out.bignum_int_bits == top.placeval + top.clear(top.placeval + self.out.bignum_int_bits) + self.placeval += self.out.bignum_int_bits + +def gen_add(bignum_int_bits): + out = Output(bignum_int_bits) + + inbits = 130 + inwords = (inbits + bignum_int_bits - 1) / bignum_int_bits + + # This is an addition _without_ reduction mod p, so that it can be + # used both during accumulation of the polynomial and for adding + # on the encrypted nonce at the end (which is mod 2^128, not mod + # p). + # + # Because one of the inputs will have come from our + # not-completely-reducing multiplication function, we expect up to + # 3 extra bits of input. 
+ acclo = Variable(out, "acclo") + + acclo.clear(0) + + for wordpos in range(inwords): + limit = min(1 << bignum_int_bits, 1 << (130 - wordpos*bignum_int_bits)) + acclo.add_input_word("a->w[%d]", wordpos, limit) + acclo.add_input_word("b->w[%d]", wordpos, limit) + acclo.output_word(0, bignum_int_bits, "r->w[%d]", wordpos) + acclo.shift_down_from(None) + + return out.finalise() + +def gen_mul_1305(bignum_int_bits): + out = Output(bignum_int_bits) + + inbits = 130 + inwords = (inbits + bignum_int_bits - 1) / bignum_int_bits + + # The inputs are not 100% reduced mod p. Specifically, we can get + # a full 130-bit number from the pow5==0 pass, and then a 130-bit + # number times 5 from the pow5==1 pass, plus a possible carry. The + # total of that can be easily bounded above by 2^130 * 8, so we + # need to assume we're multiplying two 133-bit numbers. + outbits = (inbits + 3) * 2 + outwords = (outbits + bignum_int_bits - 1) / bignum_int_bits + 1 + + tmp = Variable(out, "tmp") + acclo = Variable(out, "acclo") + acchi = Variable(out, "acchi") + acc2lo = Variable(out, "acc2lo") + + pow5, bits_at_pow5 = 0, inbits + + acclo.clear(0) + acchi.clear(bignum_int_bits) + bits_needed_in_acc2 = bignum_int_bits + + for outwordpos in range(outwords): + for a in range(inwords): + b = outwordpos - a + if 0 <= b < inwords: + tmp.set_to_product("a->w[%d]" % a, "b->w[%d]" % b, + outwordpos * bignum_int_bits) + tmp.unload_into(acchi, acclo) + + bits_in_word = bignum_int_bits + bitpos = 0 + #print "begin output" + while bits_in_word > 0: + chunk = min(bits_in_word, bits_at_pow5) + if pow5 > 0: + chunk = min(chunk, bits_needed_in_acc2) + if pow5 == 0: + acclo.output_word(bitpos, chunk, "r->w[%d]", outwordpos) + else: + acclo.transfer_to_next_acc(bitpos, chunk, pow5, acc2lo) + bits_needed_in_acc2 -= chunk + if bits_needed_in_acc2 == 0: + assert acc2lo.placeval % bignum_int_bits == 0 + other_outwordpos = acc2lo.placeval / bignum_int_bits + acc2lo.add_input_word("r->w[%d]", other_outwordpos) + 
acc2lo.output_word(bitpos, bignum_int_bits, "r->w[%d]", + other_outwordpos) + acc2lo.shift_down_from(None) + bits_needed_in_acc2 = bignum_int_bits + bits_in_word -= chunk + bits_at_pow5 -= chunk + bitpos += chunk + if bits_at_pow5 == 0: + if pow5 > 0: + assert acc2lo.placeval % bignum_int_bits == 0 + other_outwordpos = acc2lo.placeval / bignum_int_bits + acc2lo.add_input_word("r->w[%d]", other_outwordpos) + acc2lo.output_word(0, bignum_int_bits, "r->w[%d]", + other_outwordpos) + pow5 += 1 + bits_at_pow5 = inbits + acc2lo.clear(0) + bits_needed_in_acc2 = bignum_int_bits + acclo.shift_down_from(acchi) + + while acc2lo.maxval > 0: + other_outwordpos = acc2lo.placeval / bignum_int_bits + bitsleft = inbits - other_outwordpos * bignum_int_bits + limit = 1<<bitsleft if bitsleft < bignum_int_bits else None + acc2lo.add_input_word("r->w[%d]", other_outwordpos, limit=limit) + acc2lo.output_word(0, bignum_int_bits, "r->w[%d]", other_outwordpos) + acc2lo.shift_down_from(None) + + return out.finalise() + +def gen_final_reduce_1305(bignum_int_bits): + out = Output(bignum_int_bits) + + inbits = 130 + inwords = (inbits + bignum_int_bits - 1) / bignum_int_bits + + # We take our input number n, and compute k = 5 + 5*(n >> 130). + # Then k >> 130 is precisely the multiple of p that needs to be + # subtracted from n to reduce it to strictly less than p. + + acclo = Variable(out, "acclo") + + acclo.clear(0) + # Hopefully all the bits we're shifting down fit in the same word. + assert 130 / bignum_int_bits == (130 + 3 - 1) / bignum_int_bits + acclo.add_word("5 * ((n->w[%d] >> %d) + 1)" % + (130 / bignum_int_bits, 130 % bignum_int_bits), + limit = 5 * (7 + 1)) + for wordpos in range(inwords): + acclo.add_input_word("n->w[%d]", wordpos) + # Notionally, we could call acclo.output_word here to store + # our adjusted value k. But we don't need to, because all we + # actually want is the very top word of it. + if wordpos == 130 / bignum_int_bits: + break + acclo.shift_down_from(None) + + # Now we can find the right multiple of p to subtract.
We actually + # subtract it by adding 5 times it, and then finally discarding + # the top bits of the output. + + # Hopefully all the bits we're shifting down fit in the same word. + assert 130 / bignum_int_bits == (130 + 3 - 1) / bignum_int_bits + acclo.set_word("5 * (acclo >> %d)" % (130 % bignum_int_bits), + limit = 5 * (7 + 1)) + acclo.placeval = 0 + for wordpos in range(inwords): + acclo.add_input_word("n->w[%d]", wordpos) + acclo.output_word(0, bignum_int_bits, "n->w[%d]", wordpos) + acclo.shift_down_from(None) + + out.stmt("n->w[%d] &= (1 << %d) - 1" % + (130 / bignum_int_bits, 130 % bignum_int_bits)) + + # Here we don't call out.finalise(), because that will complain + # that there are bits of output we never dealt with. This is true, + # but all the bits in question are above 2^130, so they're bits + # we're discarding anyway. + return out.text # not out.finalise() + +ops = { "mul" : gen_mul_1305, + "add" : gen_add, + "final_reduce" : gen_final_reduce_1305 } + +args = sys.argv[1:] +if len(args) != 2 or args[0] not in ops: + sys.stderr.write("usage: make1305.py (%s) <bits>\n" % (" | ".join(sorted(ops)))) + sys.exit(1) + +sys.stdout.write(" /* ./contrib/make1305.py %s %s */\n" % tuple(args)) +s = ops[args[0]](int(args[1])) +sys.stdout.write(s) diff --git a/sshccp.c b/sshccp.c index 51a71450..df973e63 100644 --- a/sshccp.c +++ b/sshccp.c @@ -30,6 +30,7 @@ */ #include "ssh.h" +#include "sshbn.h" #ifndef INLINE #define INLINE @@ -179,44 +180,746 @@ static INLINE void chacha20_decrypt(struct chacha20 *ctx, /* Poly1305 implementation (no AES, nonce is not encrypted) */ +#define NWORDS ((130 + BIGNUM_INT_BITS-1) / BIGNUM_INT_BITS) +typedef struct bigval { + BignumInt w[NWORDS]; +} bigval; + +static void bigval_clear(bigval *r) +{ + int i; + for (i = 0; i < NWORDS; i++) + r->w[i] = 0; +} + +static void bigval_import_le(bigval *r, const void *vdata, int len) +{ + const unsigned char *data = (const unsigned char *)vdata; + int i; + bigval_clear(r); + for (i = 0; i < len;
i++) + r->w[i / BIGNUM_INT_BYTES] |= data[i] << (8 * (i % BIGNUM_INT_BYTES)); +} + +static void bigval_export_le(const bigval *r, void *vdata, int len) +{ + unsigned char *data = (unsigned char *)vdata; + int i; + for (i = 0; i < len; i++) + data[i] = r->w[i / BIGNUM_INT_BYTES] >> (8 * (i % BIGNUM_INT_BYTES)); +} + +/* + * Addition of bigvals, not mod p. + */ +static void bigval_add(bigval *r, const bigval *a, const bigval *b) +{ +#if BIGNUM_INT_BITS == 32 + /* ./contrib/make1305.py add 32 */ + BignumDblInt acclo; + acclo = 0; + acclo += a->w[0]; + acclo += b->w[0]; + r->w[0] = acclo; + acclo >>= 32; + acclo += a->w[1]; + acclo += b->w[1]; + r->w[1] = acclo; + acclo >>= 32; + acclo += a->w[2]; + acclo += b->w[2]; + r->w[2] = acclo; + acclo >>= 32; + acclo += a->w[3]; + acclo += b->w[3]; + r->w[3] = acclo; + acclo >>= 32; + acclo += a->w[4]; + acclo += b->w[4]; + r->w[4] = acclo; + acclo >>= 32; +#elif BIGNUM_INT_BITS == 16 + /* ./contrib/make1305.py add 16 */ + BignumDblInt acclo; + acclo = 0; + acclo += a->w[0]; + acclo += b->w[0]; + r->w[0] = acclo; + acclo >>= 16; + acclo += a->w[1]; + acclo += b->w[1]; + r->w[1] = acclo; + acclo >>= 16; + acclo += a->w[2]; + acclo += b->w[2]; + r->w[2] = acclo; + acclo >>= 16; + acclo += a->w[3]; + acclo += b->w[3]; + r->w[3] = acclo; + acclo >>= 16; + acclo += a->w[4]; + acclo += b->w[4]; + r->w[4] = acclo; + acclo >>= 16; + acclo += a->w[5]; + acclo += b->w[5]; + r->w[5] = acclo; + acclo >>= 16; + acclo += a->w[6]; + acclo += b->w[6]; + r->w[6] = acclo; + acclo >>= 16; + acclo += a->w[7]; + acclo += b->w[7]; + r->w[7] = acclo; + acclo >>= 16; + acclo += a->w[8]; + acclo += b->w[8]; + r->w[8] = acclo; + acclo >>= 16; +#else +#error Run contrib/make1305.py again with a different bit count +#endif +} + +/* + * Multiplication of bigvals mod p. Uses r as temporary storage, so + * don't pass r aliasing a or b. 
+ */ +static void bigval_mul_mod_p(bigval *r, const bigval *a, const bigval *b) +{ +#if BIGNUM_INT_BITS == 32 + /* ./contrib/make1305.py mul 32 */ + BignumDblInt tmp; + BignumDblInt acclo; + BignumDblInt acchi; + BignumDblInt acc2lo; + acclo = 0; + acchi = 0; + tmp = (BignumDblInt)(a->w[0]) * (b->w[0]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 32; + r->w[0] = acclo; + acclo = acchi + (acclo >> 32); + acchi = 0; + tmp = (BignumDblInt)(a->w[0]) * (b->w[1]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 32; + tmp = (BignumDblInt)(a->w[1]) * (b->w[0]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 32; + r->w[1] = acclo; + acclo = acchi + (acclo >> 32); + acchi = 0; + tmp = (BignumDblInt)(a->w[0]) * (b->w[2]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 32; + tmp = (BignumDblInt)(a->w[1]) * (b->w[1]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 32; + tmp = (BignumDblInt)(a->w[2]) * (b->w[0]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 32; + r->w[2] = acclo; + acclo = acchi + (acclo >> 32); + acchi = 0; + tmp = (BignumDblInt)(a->w[0]) * (b->w[3]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 32; + tmp = (BignumDblInt)(a->w[1]) * (b->w[2]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 32; + tmp = (BignumDblInt)(a->w[2]) * (b->w[1]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 32; + tmp = (BignumDblInt)(a->w[3]) * (b->w[0]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 32; + r->w[3] = acclo; + acclo = acchi + (acclo >> 32); + acchi = 0; + tmp = (BignumDblInt)(a->w[0]) * (b->w[4]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 32; + tmp = (BignumDblInt)(a->w[1]) * (b->w[3]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 32; + tmp = (BignumDblInt)(a->w[2]) * (b->w[2]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 32; + tmp = (BignumDblInt)(a->w[3]) * (b->w[1]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 32; + tmp = (BignumDblInt)(a->w[4]) * (b->w[0]); + acclo += tmp & 
BIGNUM_INT_MASK; + acchi += tmp >> 32; + r->w[4] = acclo & (((BignumInt)1 << 2)-1); + acc2lo = 0; + acc2lo += ((acclo >> 2) & (((BignumInt)1 << 30)-1)) * ((BignumDblInt)5 << 0); + acclo = acchi + (acclo >> 32); + acchi = 0; + tmp = (BignumDblInt)(a->w[1]) * (b->w[4]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 32; + tmp = (BignumDblInt)(a->w[2]) * (b->w[3]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 32; + tmp = (BignumDblInt)(a->w[3]) * (b->w[2]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 32; + tmp = (BignumDblInt)(a->w[4]) * (b->w[1]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 32; + acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 30); + acc2lo += r->w[0]; + r->w[0] = acc2lo; + acc2lo >>= 32; + acc2lo += ((acclo >> 2) & (((BignumInt)1 << 30)-1)) * ((BignumDblInt)5 << 0); + acclo = acchi + (acclo >> 32); + acchi = 0; + tmp = (BignumDblInt)(a->w[2]) * (b->w[4]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 32; + tmp = (BignumDblInt)(a->w[3]) * (b->w[3]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 32; + tmp = (BignumDblInt)(a->w[4]) * (b->w[2]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 32; + acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 30); + acc2lo += r->w[1]; + r->w[1] = acc2lo; + acc2lo >>= 32; + acc2lo += ((acclo >> 2) & (((BignumInt)1 << 30)-1)) * ((BignumDblInt)5 << 0); + acclo = acchi + (acclo >> 32); + acchi = 0; + tmp = (BignumDblInt)(a->w[3]) * (b->w[4]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 32; + tmp = (BignumDblInt)(a->w[4]) * (b->w[3]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 32; + acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 30); + acc2lo += r->w[2]; + r->w[2] = acc2lo; + acc2lo >>= 32; + acc2lo += ((acclo >> 2) & (((BignumInt)1 << 30)-1)) * ((BignumDblInt)5 << 0); + acclo = acchi + (acclo >> 32); + acchi = 0; + tmp = (BignumDblInt)(a->w[4]) * (b->w[4]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp 
>> 32; + acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 30); + acc2lo += r->w[3]; + r->w[3] = acc2lo; + acc2lo >>= 32; + acc2lo += ((acclo >> 2) & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 0); + acc2lo += r->w[4]; + r->w[4] = acc2lo; + acc2lo = 0; + acc2lo += ((acclo >> 4) & (((BignumInt)1 << 28)-1)) * ((BignumDblInt)25 << 0); + acclo = acchi + (acclo >> 32); + acchi = 0; + acc2lo += (acclo & (((BignumInt)1 << 4)-1)) * ((BignumDblInt)25 << 28); + acc2lo += r->w[0]; + r->w[0] = acc2lo; + acc2lo >>= 32; + acc2lo += ((acclo >> 4) & (((BignumInt)1 << 28)-1)) * ((BignumDblInt)25 << 0); + acclo = acchi + (acclo >> 32); + acchi = 0; + acc2lo += r->w[1]; + r->w[1] = acc2lo; + acc2lo >>= 32; + acc2lo += r->w[2]; + r->w[2] = acc2lo; + acc2lo >>= 32; + acc2lo += r->w[3]; + r->w[3] = acc2lo; + acc2lo >>= 32; + acc2lo += r->w[4]; + r->w[4] = acc2lo; + acc2lo >>= 32; +#elif BIGNUM_INT_BITS == 16 + /* ./contrib/make1305.py mul 16 */ + BignumDblInt tmp; + BignumDblInt acclo; + BignumDblInt acchi; + BignumDblInt acc2lo; + acclo = 0; + acchi = 0; + tmp = (BignumDblInt)(a->w[0]) * (b->w[0]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + r->w[0] = acclo; + acclo = acchi + (acclo >> 16); + acchi = 0; + tmp = (BignumDblInt)(a->w[0]) * (b->w[1]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[1]) * (b->w[0]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + r->w[1] = acclo; + acclo = acchi + (acclo >> 16); + acchi = 0; + tmp = (BignumDblInt)(a->w[0]) * (b->w[2]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[1]) * (b->w[1]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[2]) * (b->w[0]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + r->w[2] = acclo; + acclo = acchi + (acclo >> 16); + acchi = 0; + tmp = (BignumDblInt)(a->w[0]) * (b->w[3]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = 
(BignumDblInt)(a->w[1]) * (b->w[2]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[2]) * (b->w[1]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[3]) * (b->w[0]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + r->w[3] = acclo; + acclo = acchi + (acclo >> 16); + acchi = 0; + tmp = (BignumDblInt)(a->w[0]) * (b->w[4]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[1]) * (b->w[3]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[2]) * (b->w[2]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[3]) * (b->w[1]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[4]) * (b->w[0]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + r->w[4] = acclo; + acclo = acchi + (acclo >> 16); + acchi = 0; + tmp = (BignumDblInt)(a->w[0]) * (b->w[5]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[1]) * (b->w[4]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[2]) * (b->w[3]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[3]) * (b->w[2]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[4]) * (b->w[1]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[5]) * (b->w[0]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + r->w[5] = acclo; + acclo = acchi + (acclo >> 16); + acchi = 0; + tmp = (BignumDblInt)(a->w[0]) * (b->w[6]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[1]) * (b->w[5]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[2]) * (b->w[4]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[3]) * (b->w[3]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp 
>> 16; + tmp = (BignumDblInt)(a->w[4]) * (b->w[2]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[5]) * (b->w[1]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[6]) * (b->w[0]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + r->w[6] = acclo; + acclo = acchi + (acclo >> 16); + acchi = 0; + tmp = (BignumDblInt)(a->w[0]) * (b->w[7]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[1]) * (b->w[6]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[2]) * (b->w[5]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[3]) * (b->w[4]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[4]) * (b->w[3]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[5]) * (b->w[2]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[6]) * (b->w[1]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[7]) * (b->w[0]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + r->w[7] = acclo; + acclo = acchi + (acclo >> 16); + acchi = 0; + tmp = (BignumDblInt)(a->w[0]) * (b->w[8]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[1]) * (b->w[7]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[2]) * (b->w[6]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[3]) * (b->w[5]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[4]) * (b->w[4]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[5]) * (b->w[3]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[6]) * (b->w[2]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[7]) * (b->w[1]); 
+ acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[8]) * (b->w[0]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + r->w[8] = acclo & (((BignumInt)1 << 2)-1); + acc2lo = 0; + acc2lo += ((acclo >> 2) & (((BignumInt)1 << 14)-1)) * ((BignumDblInt)5 << 0); + acclo = acchi + (acclo >> 16); + acchi = 0; + tmp = (BignumDblInt)(a->w[1]) * (b->w[8]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[2]) * (b->w[7]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[3]) * (b->w[6]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[4]) * (b->w[5]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[5]) * (b->w[4]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[6]) * (b->w[3]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[7]) * (b->w[2]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[8]) * (b->w[1]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 14); + acc2lo += r->w[0]; + r->w[0] = acc2lo; + acc2lo >>= 16; + acc2lo += ((acclo >> 2) & (((BignumInt)1 << 14)-1)) * ((BignumDblInt)5 << 0); + acclo = acchi + (acclo >> 16); + acchi = 0; + tmp = (BignumDblInt)(a->w[2]) * (b->w[8]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[3]) * (b->w[7]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[4]) * (b->w[6]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[5]) * (b->w[5]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[6]) * (b->w[4]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[7]) * (b->w[3]); + acclo += tmp & BIGNUM_INT_MASK; + acchi 
+= tmp >> 16; + tmp = (BignumDblInt)(a->w[8]) * (b->w[2]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 14); + acc2lo += r->w[1]; + r->w[1] = acc2lo; + acc2lo >>= 16; + acc2lo += ((acclo >> 2) & (((BignumInt)1 << 14)-1)) * ((BignumDblInt)5 << 0); + acclo = acchi + (acclo >> 16); + acchi = 0; + tmp = (BignumDblInt)(a->w[3]) * (b->w[8]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[4]) * (b->w[7]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[5]) * (b->w[6]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[6]) * (b->w[5]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[7]) * (b->w[4]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[8]) * (b->w[3]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 14); + acc2lo += r->w[2]; + r->w[2] = acc2lo; + acc2lo >>= 16; + acc2lo += ((acclo >> 2) & (((BignumInt)1 << 14)-1)) * ((BignumDblInt)5 << 0); + acclo = acchi + (acclo >> 16); + acchi = 0; + tmp = (BignumDblInt)(a->w[4]) * (b->w[8]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[5]) * (b->w[7]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[6]) * (b->w[6]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[7]) * (b->w[5]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[8]) * (b->w[4]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 14); + acc2lo += r->w[3]; + r->w[3] = acc2lo; + acc2lo >>= 16; + acc2lo += ((acclo >> 2) & (((BignumInt)1 << 14)-1)) * ((BignumDblInt)5 << 0); + acclo = acchi + (acclo >> 16); + acchi = 0; + 
tmp = (BignumDblInt)(a->w[5]) * (b->w[8]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[6]) * (b->w[7]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[7]) * (b->w[6]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[8]) * (b->w[5]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 14); + acc2lo += r->w[4]; + r->w[4] = acc2lo; + acc2lo >>= 16; + acc2lo += ((acclo >> 2) & (((BignumInt)1 << 14)-1)) * ((BignumDblInt)5 << 0); + acclo = acchi + (acclo >> 16); + acchi = 0; + tmp = (BignumDblInt)(a->w[6]) * (b->w[8]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[7]) * (b->w[7]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[8]) * (b->w[6]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 14); + acc2lo += r->w[5]; + r->w[5] = acc2lo; + acc2lo >>= 16; + acc2lo += ((acclo >> 2) & (((BignumInt)1 << 14)-1)) * ((BignumDblInt)5 << 0); + acclo = acchi + (acclo >> 16); + acchi = 0; + tmp = (BignumDblInt)(a->w[7]) * (b->w[8]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + tmp = (BignumDblInt)(a->w[8]) * (b->w[7]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 14); + acc2lo += r->w[6]; + r->w[6] = acc2lo; + acc2lo >>= 16; + acc2lo += ((acclo >> 2) & (((BignumInt)1 << 14)-1)) * ((BignumDblInt)5 << 0); + acclo = acchi + (acclo >> 16); + acchi = 0; + tmp = (BignumDblInt)(a->w[8]) * (b->w[8]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 16; + acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 14); + acc2lo += r->w[7]; + r->w[7] = acc2lo; + acc2lo >>= 16; + acc2lo += ((acclo >> 2) & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 0); + acc2lo 
+= r->w[8]; + r->w[8] = acc2lo; + acc2lo = 0; + acc2lo += ((acclo >> 4) & (((BignumInt)1 << 12)-1)) * ((BignumDblInt)25 << 0); + acclo = acchi + (acclo >> 16); + acchi = 0; + acc2lo += (acclo & (((BignumInt)1 << 4)-1)) * ((BignumDblInt)25 << 12); + acc2lo += r->w[0]; + r->w[0] = acc2lo; + acc2lo >>= 16; + acc2lo += ((acclo >> 4) & (((BignumInt)1 << 12)-1)) * ((BignumDblInt)25 << 0); + acclo = acchi + (acclo >> 16); + acchi = 0; + acc2lo += r->w[1]; + r->w[1] = acc2lo; + acc2lo >>= 16; + acc2lo += r->w[2]; + r->w[2] = acc2lo; + acc2lo >>= 16; + acc2lo += r->w[3]; + r->w[3] = acc2lo; + acc2lo >>= 16; + acc2lo += r->w[4]; + r->w[4] = acc2lo; + acc2lo >>= 16; + acc2lo += r->w[5]; + r->w[5] = acc2lo; + acc2lo >>= 16; + acc2lo += r->w[6]; + r->w[6] = acc2lo; + acc2lo >>= 16; + acc2lo += r->w[7]; + r->w[7] = acc2lo; + acc2lo >>= 16; + acc2lo += r->w[8]; + r->w[8] = acc2lo; + acc2lo >>= 16; +#else +#error Run contrib/make1305.py again with a different bit count +#endif +} + +static void bigval_final_reduce(bigval *n) +{ +#if BIGNUM_INT_BITS == 32 + /* ./contrib/make1305.py final_reduce 32 */ + BignumDblInt acclo; + acclo = 0; + acclo += 5 * ((n->w[4] >> 2) + 1); + acclo += n->w[0]; + acclo >>= 32; + acclo += n->w[1]; + acclo >>= 32; + acclo += n->w[2]; + acclo >>= 32; + acclo += n->w[3]; + acclo >>= 32; + acclo += n->w[4]; + acclo = 5 * (acclo >> 2); + acclo += n->w[0]; + n->w[0] = acclo; + acclo >>= 32; + acclo += n->w[1]; + n->w[1] = acclo; + acclo >>= 32; + acclo += n->w[2]; + n->w[2] = acclo; + acclo >>= 32; + acclo += n->w[3]; + n->w[3] = acclo; + acclo >>= 32; + acclo += n->w[4]; + n->w[4] = acclo; + acclo >>= 32; + n->w[4] &= (1 << 2) - 1; +#elif BIGNUM_INT_BITS == 16 + /* ./contrib/make1305.py final_reduce 16 */ + BignumDblInt acclo; + acclo = 0; + acclo += 5 * ((n->w[8] >> 2) + 1); + acclo += n->w[0]; + acclo >>= 16; + acclo += n->w[1]; + acclo >>= 16; + acclo += n->w[2]; + acclo >>= 16; + acclo += n->w[3]; + acclo >>= 16; + acclo += n->w[4]; + acclo >>= 16; + 
acclo += n->w[5]; + acclo >>= 16; + acclo += n->w[6]; + acclo >>= 16; + acclo += n->w[7]; + acclo >>= 16; + acclo += n->w[8]; + acclo = 5 * (acclo >> 2); + acclo += n->w[0]; + n->w[0] = acclo; + acclo >>= 16; + acclo += n->w[1]; + n->w[1] = acclo; + acclo >>= 16; + acclo += n->w[2]; + n->w[2] = acclo; + acclo >>= 16; + acclo += n->w[3]; + n->w[3] = acclo; + acclo >>= 16; + acclo += n->w[4]; + n->w[4] = acclo; + acclo >>= 16; + acclo += n->w[5]; + n->w[5] = acclo; + acclo >>= 16; + acclo += n->w[6]; + n->w[6] = acclo; + acclo >>= 16; + acclo += n->w[7]; + n->w[7] = acclo; + acclo >>= 16; + acclo += n->w[8]; + n->w[8] = acclo; + acclo >>= 16; + n->w[8] &= (1 << 2) - 1; +#else +#error Run contrib/make1305.py again with a different bit count +#endif +} + struct poly1305 { unsigned char nonce[16]; - Bignum modulo; - Bignum r; - Bignum h; + bigval r; + bigval h; /* Buffer in case we get less that a multiple of 16 bytes */ unsigned char buffer[16]; int bufferIndex; }; -static void poly1305_make(struct poly1305 *ctx) +static void poly1305_init(struct poly1305 *ctx) { - static const unsigned char p[] = { - 0x03, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfb - }; - - ctx->modulo = bignum_from_bytes(p, sizeof(p)); - ctx->r = NULL; - ctx->h = NULL; memset(ctx->nonce, 0, 16); ctx->bufferIndex = 0; -} - -static void poly1305_free(struct poly1305 *ctx) -{ - if (ctx->modulo) { - freebn(ctx->modulo); - } - if (ctx->r) { - freebn(ctx->r); - } - if (ctx->h) { - freebn(ctx->h); - } - smemclr(ctx, sizeof(struct poly1305)); + bigval_clear(&ctx->h); } /* Takes a 256 bit key */ @@ -235,10 +938,7 @@ static void poly1305_key(struct poly1305 *ctx, const unsigned char *key) key_copy[4] &= 0xfc; key_copy[8] &= 0xfc; key_copy[12] &= 0xfc; - if (ctx->r) { - freebn(ctx->r); - } - ctx->r = bignum_from_bytes_le(key_copy, 16); + bigval_import_le(&ctx->r, key_copy, 16); smemclr(key_copy, sizeof(key_copy)); /* Use second 128 bits are the nonce */ @@ 
-249,21 +949,11 @@ static void poly1305_key(struct poly1305 *ctx, const unsigned char *key) static void poly1305_feed_chunk(struct poly1305 *ctx, const unsigned char *chunk, int len) { - Bignum tmp, tmp2; - Bignum c = bignum_from_bytes_le(chunk, len); - tmp = bignum_lshift(One, 8 * len); - tmp2 = bigadd(c, tmp); - freebn(tmp); - freebn(c); - if (ctx->h) { - tmp = bigadd(ctx->h, tmp2); - freebn(tmp2); - freebn(ctx->h); - } else { - tmp = tmp2; - } - ctx->h = modmul(tmp, ctx->r, ctx->modulo); - freebn(tmp); + bigval c; + bigval_import_le(&c, chunk, len); + c.w[len / BIGNUM_INT_BYTES] |= 1 << (8 * (len % BIGNUM_INT_BYTES)); + bigval_add(&c, &c, &ctx->h); + bigval_mul_mod_p(&ctx->h, &c, &ctx->r); } static void poly1305_feed(struct poly1305 *ctx, @@ -299,21 +989,16 @@ static void poly1305_feed(struct poly1305 *ctx, /* Finalise and populate buffer with 16 byte with MAC */ static void poly1305_finalise(struct poly1305 *ctx, unsigned char *mac) { - Bignum tmp, tmp2; - int i; + bigval tmp; if (ctx->bufferIndex) { poly1305_feed_chunk(ctx, ctx->buffer, ctx->bufferIndex); } - tmp = bignum_from_bytes_le(ctx->nonce, 16); - - tmp2 = bigadd(ctx->h, tmp); - freebn(tmp); - for (i = 0; i < 16; ++i) { - mac[i] = bignum_byte(tmp2, i); - } - freebn(tmp2); + bigval_import_le(&tmp, ctx->nonce, 16); + bigval_final_reduce(&ctx->h); + bigval_add(&tmp, &tmp, &ctx->h); + bigval_export_le(&tmp, mac, 16); } /* SSH-2 wrapper */ @@ -351,8 +1036,7 @@ static void poly_start(void *handle) ctx->mac_initialised = 0; memset(ctx->mac_iv, 0, 8); - poly1305_free(&ctx->mac); - poly1305_make(&ctx->mac); + poly1305_init(&ctx->mac); } static void poly_bytes(void *handle, unsigned char const *blk, int len) @@ -445,7 +1129,7 @@ static void *ccp_make_context(void) { struct ccp_context *ctx = snew(struct ccp_context); if (ctx) { - poly1305_make(&ctx->mac); + poly1305_init(&ctx->mac); } return ctx; } @@ -455,7 +1139,7 @@ static void ccp_free_context(void *vctx) struct ccp_context *ctx = (struct ccp_context *)vctx; 
smemclr(&ctx->a_cipher, sizeof(ctx->a_cipher)); smemclr(&ctx->b_cipher, sizeof(ctx->b_cipher)); - poly1305_free(&ctx->mac); + smemclr(&ctx->mac, sizeof(ctx->mac)); sfree(ctx); }