From c2ec13c7e98a2dd0c40161e5d16284bcaf6ec62b Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Wed, 16 Dec 2015 14:12:26 +0000 Subject: [PATCH] Relegate BignumDblInt to an implementation detail of sshbn.h. As I mentioned in the previous commit, I'm going to want PuTTY to be able to run sensibly when compiled with 64-bit Visual Studio, including handling bignums in 64-bit chunks for speed. Unfortunately, 64-bit VS does not provide any type we can use as BignumDblInt in that situation (unlike 64-bit gcc and clang, which give us __uint128_t). The only facilities it provides are compiler intrinsics to access an add-with-carry operation and a 64x64->128 multiplication (the latter delivering its product in two separate 64-bit output chunks). Hence, here's a substantial rework of the bignum code to make it implement everything in terms of _those_ primitives, rather than depending throughout on having BignumDblInt available to use ad-hoc. BignumDblInt does still exist, for the moment, but now it's an internal implementation detail of sshbn.h, only declared inside a new set of macros implementing arithmetic primitives, and not accessible to any code outside sshbn.h (which confirms that I really did catch all uses of it and remove them). The resulting code is surprisingly nice-looking, actually. You'd expect more hassle and roundabout circumlocutions when you drop down to using a more basic set of primitive operations, but actually, in many cases it's turned out shorter to write things in terms of the new BignumADC and BignumMUL macros - because almost all my uses of BignumDblInt were implementing those operations anyway, taking several lines at a time, and now they can do each thing in just one line. The biggest headache was Poly1305: I wasn't able to find any sensible way to adapt the existing Python script that generates the various per-int-size implementations of arithmetic mod 2^130-5, and so I had to rewrite it from scratch instead, with nothing in common with the old version beyond a handful of comments. But even that seems to have worked out nicely: the new version has much more legible descriptions of the high-level algorithms, by virtue of having a 'Multiprecision' type which wraps up the division into words, and yet Multiprecision's range analysis allows it to automatically drop out special cases such as multiplication by 5 being much easier than multiplication by another multi-word integer. --- contrib/make1305.py | 577 +++++++++++-------- sshbn.c | 281 +++++----- sshbn.h | 205 +++++-- sshccp.c | 1305 +++++++++++++++++-------------------------- 4 files changed, 1159 insertions(+), 1209 deletions(-) diff --git a/contrib/make1305.py b/contrib/make1305.py index ede220df..c8597040 100755 --- a/contrib/make1305.py +++ b/contrib/make1305.py @@ -1,108 +1,300 @@ #!/usr/bin/env python import sys +import string +from collections import namedtuple -class Output(object): - def __init__(self, bignum_int_bits): - self.bignum_int_bits = bignum_int_bits - self.text = "" - self.vars = [] - def stmt(self, statement): - self.text += " %s;\n" % statement - def register_var(self, var): - self.vars.append(var) - def finalise(self): - for var in self.vars: - assert var.maxval == 0, "Variable not clear: %s" % var.name - return self.text +class Multiprecision(object): + def __init__(self, target, minval, maxval, words): + self.target = target + self.minval = minval + self.maxval = maxval + self.words = words + assert 0 <= self.minval + assert self.minval <= self.maxval + assert self.target.nwords(self.maxval) == len(words) -class Variable(object): - def __init__(self, out, name): - self.out = out - self.maxval = 0 - self.name = name - self.placeval = None - self.out.stmt("BignumDblInt %s" % (self.name)) - self.out.register_var(self) - def clear(self, placeval): - self.maxval = 0 - self.placeval = placeval - self.out.stmt("%s = 0" % (self.name)) - def set_word(self, name, limit=None): - if limit is not None: - self.maxval = limit-1 - else: - self.maxval = (1 << self.out.bignum_int_bits) - 1 - assert self.maxval < (1 << 2*self.out.bignum_int_bits) - self.out.stmt("%s = %s" % (self.name, name)) - def add_word(self, name, limit=None): - if limit is not None: - self.maxval += limit-1 - else: - self.maxval += (1 << self.out.bignum_int_bits) - 1 - assert self.maxval < (1 << 2*self.out.bignum_int_bits) - self.out.stmt("%s += %s" % (self.name, name)) - def add_input_word(self, fmt, wordpos, limit=None): - assert self.placeval == wordpos * self.out.bignum_int_bits - self.add_word(fmt % wordpos, limit) - def set_to_product(self, a, b, placeval): - self.maxval = ((1 << self.out.bignum_int_bits) - 1) ** 2 - assert self.maxval < (1 << 2*self.out.bignum_int_bits) - self.out.stmt("%s = (BignumDblInt)(%s) * (%s)" % (self.name, a, b)) - self.placeval = placeval - def add_bottom_half(self, srcvar): - self.add_word("%s & BIGNUM_INT_MASK" % (srcvar.name)) - def add_top_half(self, srcvar): - self.add_word("%s >> %d" % (srcvar.name, self.out.bignum_int_bits)) - def unload_into(self, topvar, botvar): - assert botvar.placeval == self.placeval - botvar.add_bottom_half(self) - assert topvar.placeval == self.placeval + self.out.bignum_int_bits - topvar.add_top_half(self) - self.maxval = 0 - def output_word(self, bitpos, bits, destfmt, destwordpos): - assert bitpos == 0 - assert self.placeval == destwordpos * self.out.bignum_int_bits - dest = destfmt % destwordpos - if bits == self.out.bignum_int_bits: - self.out.stmt("%s = %s" % (dest, self.name)) - else: - self.out.stmt("%s = %s & (((BignumInt)1 << %d)-1)" % - (dest, self.name, bits)) - def transfer_to_next_acc(self, bitpos, bits, pow5, destvar): - destbitpos = self.placeval + bitpos - 130 * pow5 - destvar.placeval - #print "transfer", "*%d" % 5**pow5, self.name, self.placeval, bitpos, destvar.name, destvar.placeval, destbitpos, bits - assert 0 <= bitpos < bitpos+bits <= self.out.bignum_int_bits - assert 0 <= destbitpos < destbitpos+bits <= self.out.bignum_int_bits - expr = self.name - if bitpos > 0: - expr = "(%s >> %d)" % (expr, bitpos) - expr = "(%s & (((BignumInt)1 << %d)-1))" % (expr, bits) - self.out.stmt("%s += %s * ((BignumDblInt)%d << %d)" % - (destvar.name, expr, 5**pow5, destbitpos)) - destvar.maxval += (((1 << bits)-1) << destbitpos) * (5**pow5) - def shift_down_from(self, top): - if top is not None: - self.out.stmt("%s = %s + (%s >> %d)" % - (self.name, top.name, self.name, - self.out.bignum_int_bits)) - topmaxval = top.maxval - else: - self.out.stmt("%s >>= %d" % (self.name, self.out.bignum_int_bits)) - topmaxval = 0 - self.maxval = topmaxval + self.maxval >> self.out.bignum_int_bits - assert self.maxval < (1 << 2*self.out.bignum_int_bits) - if top is not None: - assert self.placeval + self.out.bignum_int_bits == top.placeval - top.clear(top.placeval + self.out.bignum_int_bits) - self.placeval += self.out.bignum_int_bits + def getword(self, n): + return self.words[n] if n < len(self.words) else "0" -def gen_add(bignum_int_bits): - out = Output(bignum_int_bits) + def __add__(self, rhs): + newmin = self.minval + rhs.minval + newmax = self.maxval + rhs.maxval + nwords = self.target.nwords(newmax) + words = [] - inbits = 130 - inwords = (inbits + bignum_int_bits - 1) / bignum_int_bits + addfn = self.target.add + for i in range(nwords): + words.append(addfn(self.getword(i), rhs.getword(i))) + addfn = self.target.adc + return Multiprecision(self.target, newmin, newmax, words) + + def __mul__(self, rhs): + newmin = self.minval * rhs.minval + newmax = self.maxval * rhs.maxval + nwords = self.target.nwords(newmax) + words = [] + + # There are basically two strategies we could take for + # multiplying two multiprecision integers. One is to enumerate + # the space of pairs of word indices in lexicographic order, + # essentially computing a*b[i] for each i and adding them + # together; the other is to enumerate in diagonal order, + # computing everything together that belongs at a particular + # output word index. + # + # For the moment, I've gone for the former. + + sprev = [] + for i, sword in enumerate(self.words): + rprev = None + sthis = sprev[:i] + for j, rword in enumerate(rhs.words): + prevwords = [] + if i+j < len(sprev): + prevwords.append(sprev[i+j]) + if rprev is not None: + prevwords.append(rprev) + vhi, vlo = self.target.muladd(sword, rword, *prevwords) + sthis.append(vlo) + rprev = vhi + sthis.append(rprev) + sprev = sthis + + # Remove unneeded words from the top of the output, if we can + # prove by range analysis that they'll always be zero. + sprev = sprev[:self.target.nwords(newmax)] + + return Multiprecision(self.target, newmin, newmax, sprev) + + def extract_bits(self, start, bits=None): + if bits is None: + bits = (self.maxval >> start).bit_length() + + # Overly thorough range analysis: if min and max have the same + # *quotient* by 2^bits, then the result of reducing anything + # in the range [min,max] mod 2^bits has to fall within the + # obvious range. But if they have different quotients, then + # you can wrap round the modulus and so any value mod 2^bits + # is possible. + newmin = self.minval >> start + newmax = self.maxval >> start + if (newmin >> bits) != (newmax >> bits): + newmin = 0 + newmax = (1 << bits) - 1 + + nwords = self.target.nwords(newmax) + words = [] + for i in range(nwords): + srcpos = i * self.target.bits + start + maxbits = min(self.target.bits, start + bits - srcpos) + wordindex = srcpos / self.target.bits + if srcpos % self.target.bits == 0: + word = self.getword(srcpos / self.target.bits) + elif (wordindex+1 >= len(self.words) or + srcpos % self.target.bits + maxbits < self.target.bits): + word = self.target.new_value( + "(%%s) >> %d" % (srcpos % self.target.bits), + self.getword(srcpos / self.target.bits)) + else: + word = self.target.new_value( + "((%%s) >> %d) | ((%%s) << %d)" % ( + srcpos % self.target.bits, + self.target.bits - (srcpos % self.target.bits)), + self.getword(srcpos / self.target.bits), + self.getword(srcpos / self.target.bits + 1)) + if maxbits < self.target.bits and maxbits < bits: + word = self.target.new_value( + "(%%s) & ((((BignumInt)1) << %d)-1)" % maxbits, + word) + words.append(word) + + return Multiprecision(self.target, newmin, newmax, words) + +# Each Statement has a list of variables it reads, and a list of ones +# it writes. 'forms' is a list of multiple actual C statements it +# could be generated as, depending on which of its output variables is +# actually used (e.g. no point calling BignumADC if the generated +# carry in a particular case is unused, or BignumMUL if nobody needs +# the top half). It is indexed by a bitmap whose bits correspond to +# the entries in wvars, with wvars[0] the MSB and wvars[-1] the LSB. +Statement = namedtuple("Statement", "rvars wvars forms") + +class CodegenTarget(object): + def __init__(self, bits): + self.bits = bits + self.valindex = 0 + self.stmts = [] + self.generators = {} + self.bv_words = (130 + self.bits - 1) / self.bits + self.carry_index = 0 + + def nwords(self, maxval): + return (maxval.bit_length() + self.bits - 1) / self.bits + + def stmt(self, stmt, needed=False): + index = len(self.stmts) + self.stmts.append([needed, stmt]) + for val in stmt.wvars: + self.generators[val] = index + + def new_value(self, formatstr=None, *deps): + name = "v%d" % self.valindex + self.valindex += 1 + if formatstr is not None: + self.stmt(Statement( + rvars=deps, wvars=[name], + forms=[None, name + " = " + formatstr % deps])) + return name + + def bigval_input(self, name, bits): + words = (bits + self.bits - 1) / self.bits + # Expect not to require an entire extra word + assert words == self.bv_words + + return Multiprecision(self, 0, (1<w[%d]" % (name, i)) for i in range(words)]) + + def const(self, value): + # We only support constants small enough to both fit in a + # BignumInt (of any size supported) _and_ be expressible in C + # with no weird integer literal syntax like a trailing LL. + # + # Supporting larger constants would be possible - you could + # break 'value' up into word-sized pieces on the Python side, + # and generate a legal C expression for each piece by + # splitting it further into pieces within the + # standards-guaranteed 'unsigned long' limit of 32 bits and + # then casting those to BignumInt before combining them with + # shifts. But it would be a lot of effort, and since the + # application for this code doesn't even need it, there's no + # point in bothering. + assert value < 2**16 + return Multiprecision(self, value, value, ["%d" % value]) + + def current_carry(self): + return "carry%d" % self.carry_index + + def add(self, a1, a2): + ret = self.new_value() + adcform = "BignumADC(%s, carry, %s, %s, 0)" % (ret, a1, a2) + plainform = "%s = %s + %s" % (ret, a1, a2) + self.carry_index += 1 + carryout = self.current_carry() + self.stmt(Statement( + rvars=[a1,a2], wvars=[ret,carryout], + forms=[None, adcform, plainform, adcform])) + return ret + + def adc(self, a1, a2): + ret = self.new_value() + adcform = "BignumADC(%s, carry, %s, %s, carry)" % (ret, a1, a2) + plainform = "%s = %s + %s + carry" % (ret, a1, a2) + carryin = self.current_carry() + self.carry_index += 1 + carryout = self.current_carry() + self.stmt(Statement( + rvars=[a1,a2,carryin], wvars=[ret,carryout], + forms=[None, adcform, plainform, adcform])) + return ret + + def muladd(self, m1, m2, *addends): + rlo = self.new_value() + rhi = self.new_value() + wideform = "BignumMUL%s(%s)" % ( + { 0:"", 1:"ADD", 2:"ADD2" }[len(addends)], + ", ".join([rhi, rlo, m1, m2] + list(addends))) + narrowform = " + ".join(["%s = %s * %s" % (rlo, m1, m2)] + + list(addends)) + self.stmt(Statement( + rvars=[m1,m2]+list(addends), wvars=[rhi,rlo], + forms=[None, narrowform, wideform, wideform])) + return rhi, rlo + + def write_bigval(self, name, val): + for i in range(self.bv_words): + word = val.getword(i) + self.stmt(Statement( + rvars=[word], wvars=[], + forms=["%s->w[%d] = %s" % (name, i, word)]), + needed=True) + + def compute_needed(self): + used_vars = set() + + self.queue = [stmt for (needed,stmt) in self.stmts if needed] + while len(self.queue) > 0: + stmt = self.queue.pop(0) + deps = [] + for var in stmt.rvars: + if var[0] in string.digits: + continue # constant + deps.append(self.generators[var]) + used_vars.add(var) + for index in deps: + if not self.stmts[index][0]: + self.stmts[index][0] = True + self.queue.append(self.stmts[index][1]) + + forms = [] + for i, (needed, stmt) in enumerate(self.stmts): + if needed: + formindex = 0 + for (j, var) in enumerate(stmt.wvars): + formindex *= 2 + if var in used_vars: + formindex += 1 + forms.append(stmt.forms[formindex]) + + # Now we must check whether this form of the statement + # also writes some variables we _don't_ actually need + # (e.g. if you only wanted the top half from a mul, or + # only the carry from an adc, you'd be forced to + # generate the other output too). Easiest way to do + # this is to look for an identical statement form + # later in the array. + maxindex = max(i for i in range(len(stmt.forms)) + if stmt.forms[i] == stmt.forms[formindex]) + extra_vars = maxindex & ~formindex + bitpos = 0 + while extra_vars != 0: + if extra_vars & (1 << bitpos): + extra_vars &= ~(1 << bitpos) + var = stmt.wvars[-1-bitpos] + used_vars.add(var) + # Also, write out a cast-to-void for each + # subsequently unused value, to prevent gcc + # warnings when the output code is compiled. + forms.append("(void)" + var) + bitpos += 1 + + used_carry = any(v.startswith("carry") for v in used_vars) + used_vars = [v for v in used_vars if v.startswith("v")] + used_vars.sort(key=lambda v: int(v[1:])) + + return used_carry, used_vars, forms + + def text(self): + used_carry, values, forms = self.compute_needed() + + ret = "" + while len(values) > 0: + prefix, sep, suffix = " BignumInt ", ", ", ";" + currline = values.pop(0) + while (len(values) > 0 and + len(prefix+currline+sep+values[0]+suffix) < 79): + currline += sep + values.pop(0) + ret += prefix + currline + suffix + "\n" + if used_carry: + ret += " BignumCarry carry;\n" + if ret != "": + ret += "\n" + for stmtform in forms: + ret += " %s;\n" % stmtform + return ret + +def gen_add(target): # This is an addition _without_ reduction mod p, so that it can be # used both during accumulation of the polynomial and for adding # on the encrypted nonce at the end (which is mod 2^128, not mod @@ -111,157 +303,66 @@ def gen_add(bignum_int_bits): # Because one of the inputs will have come from our # not-completely-reducing multiplication function, we expect up to # 3 extra bits of input. - acclo = Variable(out, "acclo") - acclo.clear(0) - - for wordpos in range(inwords): - limit = min(1 << bignum_int_bits, 1 << (130 - wordpos*bignum_int_bits)) - acclo.add_input_word("a->w[%d]", wordpos, limit) - acclo.add_input_word("b->w[%d]", wordpos, limit) - acclo.output_word(0, bignum_int_bits, "r->w[%d]", wordpos) - acclo.shift_down_from(None) - - return out.finalise() - -def gen_mul_1305(bignum_int_bits): - out = Output(bignum_int_bits) - - inbits = 130 - inwords = (inbits + bignum_int_bits - 1) / bignum_int_bits + a = target.bigval_input("a", 133) + b = target.bigval_input("b", 133) + ret = a + b + target.write_bigval("r", ret) + return """\ +static void bigval_add(bigval *r, const bigval *a, const bigval *b) +{ +%s} +\n""" % target.text() +def gen_mul(target): # The inputs are not 100% reduced mod p. Specifically, we can get # a full 130-bit number from the pow5==0 pass, and then a 130-bit # number times 5 from the pow5==1 pass, plus a possible carry. The # total of that can be easily bounded above by 2^130 * 8, so we # need to assume we're multiplying two 133-bit numbers. - outbits = (inbits + 3) * 2 - outwords = (outbits + bignum_int_bits - 1) / bignum_int_bits + 1 - tmp = Variable(out, "tmp") - acclo = Variable(out, "acclo") - acchi = Variable(out, "acchi") - acc2lo = Variable(out, "acc2lo") + a = target.bigval_input("a", 133) + b = target.bigval_input("b", 133) + ab = a * b + ab0 = ab.extract_bits(0, 130) + ab1 = ab.extract_bits(130, 130) + ab2 = ab.extract_bits(260) + ab1_5 = target.const(5) * ab1 + ab2_25 = target.const(25) * ab2 + ret = ab0 + ab1_5 + ab2_25 + target.write_bigval("r", ret) + return """\ +static void bigval_mul_mod_p(bigval *r, const bigval *a, const bigval *b) +{ +%s} +\n""" % target.text() - pow5, bits_at_pow5 = 0, inbits - - acclo.clear(0) - acchi.clear(bignum_int_bits) - bits_needed_in_acc2 = bignum_int_bits - - for outwordpos in range(outwords): - for a in range(inwords): - b = outwordpos - a - if 0 <= b < inwords: - tmp.set_to_product("a->w[%d]" % a, "b->w[%d]" % b, - outwordpos * bignum_int_bits) - tmp.unload_into(acchi, acclo) - - bits_in_word = bignum_int_bits - bitpos = 0 - #print "begin output" - while bits_in_word > 0: - chunk = min(bits_in_word, bits_at_pow5) - if pow5 > 0: - chunk = min(chunk, bits_needed_in_acc2) - if pow5 == 0: - acclo.output_word(bitpos, chunk, "r->w[%d]", outwordpos) - else: - acclo.transfer_to_next_acc(bitpos, chunk, pow5, acc2lo) - bits_needed_in_acc2 -= chunk - if bits_needed_in_acc2 == 0: - assert acc2lo.placeval % bignum_int_bits == 0 - other_outwordpos = acc2lo.placeval / bignum_int_bits - acc2lo.add_input_word("r->w[%d]", other_outwordpos) - acc2lo.output_word(bitpos, bignum_int_bits, "r->w[%d]", - other_outwordpos) - acc2lo.shift_down_from(None) - bits_needed_in_acc2 = bignum_int_bits - bits_in_word -= chunk - bits_at_pow5 -= chunk - bitpos += chunk - if bits_at_pow5 == 0: - if pow5 > 0: - assert acc2lo.placeval % bignum_int_bits == 0 - other_outwordpos = acc2lo.placeval / bignum_int_bits - acc2lo.add_input_word("r->w[%d]", other_outwordpos) - acc2lo.output_word(0, bignum_int_bits, "r->w[%d]", - other_outwordpos) - pow5 += 1 - bits_at_pow5 = inbits - acc2lo.clear(0) - bits_needed_in_acc2 = bignum_int_bits - acclo.shift_down_from(acchi) - - while acc2lo.maxval > 0: - other_outwordpos = acc2lo.placeval / bignum_int_bits - bitsleft = inbits - other_outwordpos * bignum_int_bits - limit = 1<w[%d]", other_outwordpos, limit=limit) - acc2lo.output_word(0, bignum_int_bits, "r->w[%d]", other_outwordpos) - acc2lo.shift_down_from(None) - - return out.finalise() - -def gen_final_reduce_1305(bignum_int_bits): - out = Output(bignum_int_bits) - - inbits = 130 - inwords = (inbits + bignum_int_bits - 1) / bignum_int_bits - - # We take our input number n, and compute k = 5 + 5*(n >> 130). +def gen_final_reduce(target): + # We take our input number n, and compute k = n + 5*(n >> 130). # Then k >> 130 is precisely the multiple of p that needs to be # subtracted from n to reduce it to strictly less than p. - acclo = Variable(out, "acclo") + a = target.bigval_input("n", 133) + a1 = a.extract_bits(130, 130) + k = a + target.const(5) * a1 + q = k.extract_bits(130) + adjusted = a + target.const(5) * q + ret = adjusted.extract_bits(0, 130) + target.write_bigval("n", ret) + return """\ +static void bigval_final_reduce(bigval *n) +{ +%s} +\n""" % target.text() - acclo.clear(0) - # Hopefully all the bits we're shifting down fit in the same word. - assert 130 / bignum_int_bits == (130 + 3 - 1) / bignum_int_bits - acclo.add_word("5 * ((n->w[%d] >> %d) + 1)" % - (130 / bignum_int_bits, 130 % bignum_int_bits), - limit = 5 * (7 + 1)) - for wordpos in range(inwords): - acclo.add_input_word("n->w[%d]", wordpos) - # Notionally, we could call acclo.output_word here to store - # our adjusted value k. But we don't need to, because all we - # actually want is the very top word of it. - if wordpos == 130 / bignum_int_bits: - break - acclo.shift_down_from(None) - - # Now we can find the right multiple of p to subtract. We actually - # subtract it by adding 5 times it, and then finally discarding - # the top bits of the output. - - # Hopefully all the bits we're shifting down fit in the same word. - assert 130 / bignum_int_bits == (130 + 3 - 1) / bignum_int_bits - acclo.set_word("5 * (acclo >> %d)" % (130 % bignum_int_bits), - limit = 5 * (7 + 1)) - acclo.placeval = 0 - for wordpos in range(inwords): - acclo.add_input_word("n->w[%d]", wordpos) - acclo.output_word(0, bignum_int_bits, "n->w[%d]", wordpos) - acclo.shift_down_from(None) - - out.stmt("n->w[%d] &= (1 << %d) - 1" % - (130 / bignum_int_bits, 130 % bignum_int_bits)) - - # Here we don't call out.finalise(), because that will complain - # that there are bits of output we never dealt with. This is true, - # but all the bits in question are above 2^130, so they're bits - # we're discarding anyway. - return out.text # not out.finalise() - -ops = { "mul" : gen_mul_1305, - "add" : gen_add, - "final_reduce" : gen_final_reduce_1305 } - -args = sys.argv[1:] -if len(args) != 2 or args[0] not in ops: - sys.stderr.write("usage: make1305.py (%s) \n" % (" | ".join(sorted(ops)))) - sys.exit(1) - -sys.stdout.write(" /* ./contrib/make1305.py %s %s */\n" % tuple(args)) -s = ops[args[0]](int(args[1])) -sys.stdout.write(s) +pp_keyword = "#if" +for bits in [16, 32, 64]: + sys.stdout.write("%s BIGNUM_INT_BITS == %d\n\n" % (pp_keyword, bits)) + pp_keyword = "#elif" + sys.stdout.write(gen_add(CodegenTarget(bits))) + sys.stdout.write(gen_mul(CodegenTarget(bits))) + sys.stdout.write(gen_final_reduce(CodegenTarget(bits))) +sys.stdout.write("""#else +#error Add another bit count to contrib/make1305.py and rerun it +#endif +""") diff --git a/sshbn.c b/sshbn.c index 455aa57a..4f27dc9b 100644 --- a/sshbn.c +++ b/sshbn.c @@ -87,20 +87,17 @@ Bignum bn_power_2(int n) /* * Internal addition. Sets c = a - b, where 'a', 'b' and 'c' are all - * big-endian arrays of 'len' BignumInts. Returns a BignumInt carried - * off the top. + * big-endian arrays of 'len' BignumInts. Returns the carry off the + * top. */ -static BignumInt internal_add(const BignumInt *a, const BignumInt *b, - BignumInt *c, int len) +static BignumCarry internal_add(const BignumInt *a, const BignumInt *b, + BignumInt *c, int len) { int i; - BignumDblInt carry = 0; + BignumCarry carry = 0; - for (i = len-1; i >= 0; i--) { - carry += (BignumDblInt)a[i] + b[i]; - c[i] = (BignumInt)carry; - carry >>= BIGNUM_INT_BITS; - } + for (i = len-1; i >= 0; i--) + BignumADC(c[i], carry, a[i], b[i], carry); return (BignumInt)carry; } @@ -114,13 +111,10 @@ static void internal_sub(const BignumInt *a, const BignumInt *b, BignumInt *c, int len) { int i; - BignumDblInt carry = 1; + BignumCarry carry = 1; - for (i = len-1; i >= 0; i--) { - carry += (BignumDblInt)a[i] + (b[i] ^ BIGNUM_INT_MASK); - c[i] = (BignumInt)carry; - carry >>= BIGNUM_INT_BITS; - } + for (i = len-1; i >= 0; i--) + BignumADC(c[i], carry, a[i], ~b[i], carry); } /* @@ -184,7 +178,7 @@ static void internal_mul(const BignumInt *a, const BignumInt *b, int toplen = len/2, botlen = len - toplen; /* botlen is the bigger */ int midlen = botlen + 1; - BignumDblInt carry; + BignumCarry carry; #ifdef KARA_DEBUG int i; #endif @@ -313,9 +307,7 @@ static void internal_mul(const BignumInt *a, const BignumInt *b, i = 2*len - botlen - 2*midlen - 1; while (carry) { assert(i >= 0); - carry += c[i]; - c[i] = (BignumInt)carry; - carry >>= BIGNUM_INT_BITS; + BignumADC(c[i], carry, c[i], 0, carry); i--; } #ifdef KARA_DEBUG @@ -329,7 +321,6 @@ static void internal_mul(const BignumInt *a, const BignumInt *b, } else { int i; BignumInt carry; - BignumDblInt t; const BignumInt *ap, *bp; BignumInt *cp, *cps; @@ -342,11 +333,8 @@ static void internal_mul(const BignumInt *a, const BignumInt *b, for (cps = c + 2*len, ap = a + len; ap-- > a; cps--) { carry = 0; - for (cp = cps, bp = b + len; cp--, bp-- > b ;) { - t = (MUL_WORD(*ap, *bp) + carry) + *cp; - *cp = (BignumInt) t; - carry = (BignumInt)(t >> BIGNUM_INT_BITS); - } + for (cp = cps, bp = b + len; cp--, bp-- > b ;) + BignumMULADD2(carry, *cp, *ap, *bp, *cp, carry); *cp = carry; } } @@ -431,7 +419,6 @@ static void internal_mul_low(const BignumInt *a, const BignumInt *b, } else { int i; BignumInt carry; - BignumDblInt t; const BignumInt *ap, *bp; BignumInt *cp, *cps; @@ -444,11 +431,8 @@ static void internal_mul_low(const BignumInt *a, const BignumInt *b, for (cps = c + len, ap = a + len; ap-- > a; cps--) { carry = 0; - for (cp = cps, bp = b + len; bp--, cp-- > c ;) { - t = (MUL_WORD(*ap, *bp) + carry) + *cp; - *cp = (BignumInt) t; - carry = (BignumInt)(t >> BIGNUM_INT_BITS); - } + for (cp = cps, bp = b + len; bp--, cp-- > c ;) + BignumMULADD2(carry, *cp, *ap, *bp, *cp, carry); } } } @@ -519,15 +503,23 @@ static void internal_add_shifted(BignumInt *number, { int word = 1 + (shift / BIGNUM_INT_BITS); int bshift = shift % BIGNUM_INT_BITS; - BignumDblInt addend; + BignumInt addendh, addendl; + BignumCarry carry; - addend = (BignumDblInt)n << bshift; + addendl = n << bshift; + addendh = (bshift == 0 ? 0 : n >> (BIGNUM_INT_BITS - bshift)); - while (addend) { + assert(word <= number[0]); + BignumADC(number[word], carry, number[word], addendl, 0); + word++; + if (!addendh && !carry) + return; + assert(word <= number[0]); + BignumADC(number[word], carry, number[word], addendh, carry); + word++; + while (carry) { assert(word <= number[0]); - addend += number[word]; - number[word] = (BignumInt) addend & BIGNUM_INT_MASK; - addend >>= BIGNUM_INT_BITS; + BignumADC(number[word], carry, number[word], 0, carry); word++; } } @@ -555,8 +547,7 @@ static int bn_clz(BignumInt x) static BignumInt reciprocal_word(BignumInt d) { - BignumInt dshort, recip; - BignumDblInt product; + BignumInt dshort, recip, prodh, prodl; int corrections; /* @@ -600,15 +591,16 @@ static BignumInt reciprocal_word(BignumInt d) * iteration, and the initial division above already gave us half * the output word, so it's only worth doing one iteration. */ - product = MUL_WORD(recip, d); - product += recip; - product = -product; /* the 2K shifts just off the top */ - product &= (((BignumDblInt)BIGNUM_INT_MASK << BIGNUM_INT_BITS) + - BIGNUM_INT_MASK); - product >>= BIGNUM_INT_BITS; - product = MUL_WORD(product, recip); - product >>= (BIGNUM_INT_BITS-1); - recip = (BignumInt)product; + BignumMULADD(prodh, prodl, recip, d, recip); + prodl = ~prodl; + prodh = ~prodh; + { + BignumCarry c; + BignumADC(prodl, c, prodl, 1, 0); + prodh += c; + } + BignumMUL(prodh, prodl, prodh, recip); + recip = (prodh << 1) | (prodl >> (BIGNUM_INT_BITS-1)); /* * Now make sure we have the best possible reciprocal estimate, @@ -616,18 +608,24 @@ static BignumInt reciprocal_word(BignumInt d) * way - not enough to bother with any better-thought-out kind of * correction loop. */ - product = MUL_WORD(recip, d); - product += recip; + BignumMULADD(prodh, prodl, recip, d, recip); corrections = 0; - if (product >= ((BignumDblInt)1 << (2*BIGNUM_INT_BITS-1))) { + if (prodh >= BIGNUM_TOP_BIT) { do { - product -= d; + BignumCarry c = 1; + BignumADC(prodl, c, prodl, ~d, c); prodh += BIGNUM_INT_MASK + c; recip--; corrections++; - } while (product >= ((BignumDblInt)1 << (2*BIGNUM_INT_BITS-1))); + } while (prodh >= ((BignumInt)1 << (BIGNUM_INT_BITS-1))); } else { - while (product < ((BignumDblInt)1 << (2*BIGNUM_INT_BITS-1)) - d) { - product += d; + while (1) { + BignumInt newprodh, newprodl; + BignumCarry c = 0; + BignumADC(newprodl, c, prodl, d, c); newprodh = prodh + c; + if (newprodh >= BIGNUM_TOP_BIT) + break; + prodh = newprodh; + prodl = newprodl; recip++; corrections++; } @@ -679,7 +677,7 @@ static void internal_mod(BignumInt *a, int alen, * here. */ for (i = 0; i <= alen - mlen ;) { - BignumDblInt product, subtmp, t; + BignumInt product; BignumInt aword, q; int shift, full_bitoffset, bitoffset, wordoffset; @@ -707,8 +705,11 @@ static void internal_mod(BignumInt *a, int alen, if (shift > 0 && i+1 < alen) aword |= a[i+1] >> (BIGNUM_INT_BITS - shift); - t = MUL_WORD(recip, aword); - q = (BignumInt)(t >> BIGNUM_INT_BITS); + { + BignumInt unused; + BignumMUL(q, unused, recip, aword); + (void)unused; + } #ifdef DIVISION_DEBUG printf("i=%d, aword=%#0*llx, shift=%d, q=%#0*llx\n", @@ -784,27 +785,22 @@ static void internal_mod(BignumInt *a, int alen, wordoffset = alen - mlen - wordoffset; if (bitoffset == 0) { - BignumInt c = 1; + BignumCarry c = 1; BignumInt prev_hi_word = 0; for (k = mlen - 1; wordoffset+k >= i; k--) { BignumInt mword = k<0 ? 0 : m[k]; - product = MUL_WORD(q, mword); - product += prev_hi_word; - prev_hi_word = product >> BIGNUM_INT_BITS; + BignumMULADD(prev_hi_word, product, q, mword, prev_hi_word); #ifdef DIVISION_DEBUG printf(" aligned sub: product word for m[%d] = %#0*llx\n", k, BIGNUM_INT_BITS/4, - (unsigned long long)(BignumInt)product); + (unsigned long long)product); #endif #ifdef DIVISION_DEBUG printf(" aligned sub: subtrahend for a[%d] = %#0*llx\n", wordoffset+k, BIGNUM_INT_BITS/4, - (unsigned long long)(BignumInt)product); + (unsigned long long)product); #endif - subtmp = (BignumDblInt)a[wordoffset+k] + - ((BignumInt)product ^ BIGNUM_INT_MASK) + c; - a[wordoffset+k] = (BignumInt)subtmp; - c = subtmp >> BIGNUM_INT_BITS; + BignumADC(a[wordoffset+k], c, a[wordoffset+k], ~product, c); } } else { BignumInt add_word = 0; @@ -812,28 +808,23 @@ static void internal_mod(BignumInt *a, int alen, BignumInt prev_hi_word = 0; for (k = mlen - 1; wordoffset+k >= i; k--) { BignumInt mword = k<0 ? 0 : m[k]; - product = MUL_WORD(q, mword); - product += prev_hi_word; - prev_hi_word = product >> BIGNUM_INT_BITS; + BignumMULADD(prev_hi_word, product, q, mword, prev_hi_word); #ifdef DIVISION_DEBUG printf(" unaligned sub: product word for m[%d] = %#0*llx\n", k, BIGNUM_INT_BITS/4, - (unsigned long long)(BignumInt)product); + (unsigned long long)product); #endif - add_word |= (BignumInt)product << bitoffset; + add_word |= product << bitoffset; #ifdef DIVISION_DEBUG printf(" unaligned sub: subtrahend for a[%d] = %#0*llx\n", wordoffset+k, BIGNUM_INT_BITS/4, (unsigned long long)add_word); #endif - subtmp = (BignumDblInt)a[wordoffset+k] + - (add_word ^ BIGNUM_INT_MASK) + c; - a[wordoffset+k] = (BignumInt)subtmp; - c = subtmp >> BIGNUM_INT_BITS; + BignumADC(a[wordoffset+k], c, a[wordoffset+k], ~add_word, c); - add_word = (BignumInt)product >> (BIGNUM_INT_BITS - bitoffset); + add_word = product >> (BIGNUM_INT_BITS - bitoffset); } } @@ -917,14 +908,11 @@ static void internal_mod(BignumInt *a, int alen, * subtract m, and increment the quotient. */ { - BignumInt c = 1; + BignumCarry c = 1; for (i = alen - 1; i >= 0; i--) { int mindex = mlen-alen+i; BignumInt mword = mindex < 0 ? 0 : m[mindex]; - BignumDblInt subtmp = (BignumDblInt)a[i] + - ((BignumInt)mword ^ BIGNUM_INT_MASK) + c; - a[i] = (BignumInt)subtmp; - c = subtmp >> BIGNUM_INT_BITS; + BignumADC(a[i], c, a[i], ~mword, c); } } if (quot) @@ -1767,12 +1755,11 @@ Bignum bigmuladd(Bignum a, Bignum b, Bignum addend) /* now add in the addend, if any */ if (addend) { - BignumDblInt carry = 0; + BignumCarry carry = 0; for (i = 1; i <= rlen; i++) { - carry += (i <= (int)ret[0] ? ret[i] : 0); - carry += (i <= (int)addend[0] ? addend[i] : 0); - ret[i] = (BignumInt) carry & BIGNUM_INT_MASK; - carry >>= BIGNUM_INT_BITS; + BignumInt retword = (i <= (int)ret[0] ? ret[i] : 0); + BignumInt addword = (i <= (int)addend[0] ? addend[i] : 0); + BignumADC(ret[i], carry, retword, addword, carry); if (ret[i] != 0 && i > maxspot) maxspot = i; } @@ -1801,17 +1788,16 @@ Bignum bigadd(Bignum a, Bignum b) int rlen = (alen > blen ? alen : blen) + 1; int i, maxspot; Bignum ret; - BignumDblInt carry; + BignumCarry carry; ret = newbn(rlen); carry = 0; maxspot = 0; for (i = 1; i <= rlen; i++) { - carry += (i <= (int)a[0] ? a[i] : 0); - carry += (i <= (int)b[0] ? b[i] : 0); - ret[i] = (BignumInt) carry & BIGNUM_INT_MASK; - carry >>= BIGNUM_INT_BITS; + BignumInt aword = (i <= (int)a[0] ? a[i] : 0); + BignumInt bword = (i <= (int)b[0] ? b[i] : 0); + BignumADC(ret[i], carry, aword, bword, carry); if (ret[i] != 0 && i > maxspot) maxspot = i; } @@ -1831,17 +1817,16 @@ Bignum bigsub(Bignum a, Bignum b) int rlen = (alen > blen ? alen : blen); int i, maxspot; Bignum ret; - BignumDblInt carry; + BignumCarry carry; ret = newbn(rlen); carry = 1; maxspot = 0; for (i = 1; i <= rlen; i++) { - carry += (i <= (int)a[0] ? a[i] : 0); - carry += (i <= (int)b[0] ? b[i] ^ BIGNUM_INT_MASK : BIGNUM_INT_MASK); - ret[i] = (BignumInt) carry & BIGNUM_INT_MASK; - carry >>= BIGNUM_INT_BITS; + BignumInt aword = (i <= (int)a[0] ? a[i] : 0); + BignumInt bword = (i <= (int)b[0] ? b[i] : 0); + BignumADC(ret[i], carry, aword, ~bword, carry); if (ret[i] != 0 && i > maxspot) maxspot = i; } @@ -1881,40 +1866,52 @@ Bignum bignum_bitmask(Bignum n) } /* - * Convert a (max 32-bit) long into a bignum. + * Convert an unsigned long into a bignum. */ -Bignum bignum_from_long(unsigned long nn) +Bignum bignum_from_long(unsigned long n) { + const int maxwords = + (sizeof(unsigned long) + sizeof(BignumInt) - 1) / sizeof(BignumInt); Bignum ret; - BignumDblInt n = nn; + int i; + + ret = newbn(maxwords); + ret[0] = 0; + for (i = 0; i < maxwords; i++) { + ret[i+1] = n >> (i * BIGNUM_INT_BITS); + if (ret[i+1] != 0) + ret[0] = i+1; + } - ret = newbn(3); - ret[1] = (BignumInt)(n & BIGNUM_INT_MASK); - ret[2] = (BignumInt)((n >> BIGNUM_INT_BITS) & BIGNUM_INT_MASK); - ret[3] = 0; - ret[0] = (ret[2] ? 2 : 1); return ret; } /* * Add a long to a bignum. */ -Bignum bignum_add_long(Bignum number, unsigned long addendx) +Bignum bignum_add_long(Bignum number, unsigned long n) { - Bignum ret = newbn(number[0] + 1); - int i, maxspot = 0; - BignumDblInt carry = 0, addend = addendx; + const int maxwords = + (sizeof(unsigned long) + sizeof(BignumInt) - 1) / sizeof(BignumInt); + Bignum ret; + int words, i; + BignumCarry carry; - for (i = 1; i <= (int)ret[0]; i++) { - carry += addend & BIGNUM_INT_MASK; - carry += (i <= (int)number[0] ? number[i] : 0); - addend >>= BIGNUM_INT_BITS; - ret[i] = (BignumInt) carry & BIGNUM_INT_MASK; - carry >>= BIGNUM_INT_BITS; - if (ret[i] != 0) - maxspot = i; + words = number[0]; + if (words < maxwords) + words = maxwords; + words++; + ret = newbn(words); + + carry = 0; + ret[0] = 0; + for (i = 0; i < words; i++) { + BignumInt nword = (i < maxwords ? n >> (i * BIGNUM_INT_BITS) : 0); + BignumInt numword = (i < number[0] ? number[i+1] : 0); + BignumADC(ret[i+1], carry, numword, nword, carry); + if (ret[i+1] != 0) + ret[0] = i+1; } - ret[0] = maxspot; return ret; } @@ -1923,13 +1920,17 @@ Bignum bignum_add_long(Bignum number, unsigned long addendx) */ unsigned short bignum_mod_short(Bignum number, unsigned short modulus) { - BignumDblInt mod, r; + unsigned long mod = modulus, r = 0; + /* Precompute (BIGNUM_INT_MASK+1) % mod */ + unsigned long base_r = (BIGNUM_INT_MASK - modulus + 1) % mod; int i; - r = 0; - mod = modulus; - for (i = number[0]; i > 0; i--) - r = (r * (BIGNUM_TOP_BIT % mod) * 2 + number[i] % mod) % mod; + for (i = number[0]; i > 0; i--) { + /* + * Conceptually, ((r << BIGNUM_INT_BITS) + number[i]) % mod + */ + r = ((r * base_r) + (number[i] % mod)) % mod; + } return (unsigned short) r; } @@ -2087,7 +2088,7 @@ char *bignum_decimal(Bignum x) { int ndigits, ndigit; int i, iszero; - BignumDblInt carry; + BignumInt carry; char *ret; BignumInt *workspace; @@ -2134,11 +2135,33 @@ char *bignum_decimal(Bignum x) iszero = 1; carry = 0; for (i = 0; i < (int)x[0]; i++) { - carry = (carry << BIGNUM_INT_BITS) + workspace[i]; - workspace[i] = (BignumInt) (carry / 10); + /* + * Conceptually, we want to compute + * + * (carry << BIGNUM_INT_BITS) + workspace[i] + * ----------------------------------------- + * 10 + * + * but we don't have an integer type longer than BignumInt + * to work with. So we have to do it in pieces. + */ + + BignumInt q, r; + q = workspace[i] / 10; + r = workspace[i] % 10; + + /* I want (BIGNUM_INT_MASK+1)/10 but can't say so directly! */ + q += carry * ((BIGNUM_INT_MASK-9) / 10 + 1); + r += carry * ((BIGNUM_INT_MASK-9) % 10); + + q += r / 10; + r %= 10; + + workspace[i] = q; + carry = r; + if (workspace[i]) iszero = 0; - carry %= 10; } ret[--ndigit] = (char) (carry + '0'); } while (!iszero); diff --git a/sshbn.h b/sshbn.h index fc0e6b5a..6f5877b0 100644 --- a/sshbn.h +++ b/sshbn.h @@ -3,58 +3,171 @@ * multiply macros used throughout the bignum code to treat numbers as * arrays of the most conveniently sized word for the target machine. * Exported so that other code (e.g. poly1305) can use it too. + * + * This file must export, in whatever ifdef branch it ends up in: + * + * - two types: 'BignumInt' and 'BignumCarry'. BignumInt is an + * unsigned integer type which will be used as the base word size + * for all bignum operations. BignumCarry is an unsigned integer + * type used to hold the carry flag taken as input and output by + * the BignumADC macro (see below). + * + * - four constant macros: BIGNUM_INT_BITS, BIGNUM_INT_BYTES, + * BIGNUM_TOP_BIT, BIGNUM_INT_MASK. These should be more or less + * self-explanatory, but just in case, they give the number of bits + * in BignumInt, the number of bytes that works out to, the + * BignumInt value consisting of only the top bit, and the + * BignumInt value with all bits set. + * + * - four statement macros: BignumADC, BignumMUL, BignumMULADD, + * BignumMULADD2. These do various kinds of multi-word arithmetic, + * and all produce two output values. + * * BignumADC(ret,retc,a,b,c) takes input BignumInt values a,b + * and a BignumCarry c, and outputs a BignumInt ret = a+b+c and + * a BignumCarry retc which is the carry off the top of that + * addition. + * * BignumMUL(rh,rl,a,b) returns the two halves of the + * double-width product a*b. + * * BignumMULADD(rh,rl,a,b,addend) returns the two halves of the + * double-width value a*b + addend. + * * BignumMULADD2(rh,rl,a,b,addend1,addend2) returns the two + * halves of the double-width value a*b + addend1 + addend2. + * + * Every branch of the main ifdef below defines the type BignumInt and + * the value BIGNUM_INT_BITS. The other three constant macros are + * filled in by common code further down. + * + * Most branches also define a macro DEFINE_BIGNUMDBLINT containing a + * typedef statement which declares a type _twice_ the length of a + * BignumInt. This causes the common code further down to produce a + * default implementation of the four statement macros in terms of + * that double-width type, and also to defined BignumCarry to be + * BignumInt. + * + * However, if a particular compile target does not have a type twice + * the length of the BignumInt you want to use but it does provide + * some alternative means of doing add-with-carry and double-word + * multiply, then the ifdef branch in question can just define + * BignumCarry and the four statement macros itself, and that's fine + * too. */ #if defined __SIZEOF_INT128__ -/* gcc and clang both provide a __uint128_t type on 64-bit targets - * (and, when they do, indicate its presence by the above macro), - * using the same 'two machine registers' kind of code generation that - * 32-bit targets use for 64-bit ints. If we have one of these, we can - * use a 64-bit BignumInt and a 128-bit BignumDblInt. */ -typedef unsigned long long BignumInt; -typedef __uint128_t BignumDblInt; -#define BIGNUM_INT_MASK 0xFFFFFFFFFFFFFFFFULL -#define BIGNUM_TOP_BIT 0x8000000000000000ULL -#define BIGNUM_INT_BITS 64 -#define MUL_WORD(w1, w2) ((BignumDblInt)w1 * w2) -#elif defined __GNUC__ && defined __i386__ -typedef unsigned long BignumInt; -typedef unsigned long long BignumDblInt; -#define BIGNUM_INT_MASK 0xFFFFFFFFUL -#define BIGNUM_TOP_BIT 0x80000000UL -#define BIGNUM_INT_BITS 32 -#define MUL_WORD(w1, w2) ((BignumDblInt)w1 * w2) + + /* + * 64-bit BignumInt using gcc/clang style 128-bit BignumDblInt. + * + * gcc and clang both provide a __uint128_t type on 64-bit targets + * (and, when they do, indicate its presence by the above macro), + * using the same 'two machine registers' kind of code generation + * that 32-bit targets use for 64-bit ints. + */ + + typedef unsigned long long BignumInt; + #define BIGNUM_INT_BITS 64 + #define DEFINE_BIGNUMDBLINT typedef __uint128_t BignumDblInt + +#elif defined __GNUC__ || defined _LLP64 || __STDC__ >= 199901L + + /* 32-bit BignumInt, using C99 unsigned long long as BignumDblInt */ + + typedef unsigned int BignumInt; + #define BIGNUM_INT_BITS 32 + #define DEFINE_BIGNUMDBLINT typedef unsigned long long BignumDblInt + #elif defined _MSC_VER && defined _M_IX86 -typedef unsigned __int32 BignumInt; -typedef unsigned __int64 BignumDblInt; -#define BIGNUM_INT_MASK 0xFFFFFFFFUL -#define BIGNUM_TOP_BIT 0x80000000UL -#define BIGNUM_INT_BITS 32 -#define MUL_WORD(w1, w2) ((BignumDblInt)w1 * w2) + + /* 32-bit BignumInt, using Visual Studio __int64 as BignumDblInt */ + + typedef unsigned int BignumInt; + #define BIGNUM_INT_BITS 32 + #define DEFINE_BIGNUMDBLINT typedef unsigned __int64 BignumDblInt + #elif defined _LP64 -/* 64-bit architectures can do 32x32->64 chunks at a time */ -typedef unsigned int BignumInt; -typedef unsigned long BignumDblInt; -#define BIGNUM_INT_MASK 0xFFFFFFFFU -#define BIGNUM_TOP_BIT 0x80000000U -#define BIGNUM_INT_BITS 32 -#define MUL_WORD(w1, w2) ((BignumDblInt)w1 * w2) -#elif defined _LLP64 -/* 64-bit architectures in which unsigned long is 32 bits, not 64 */ -typedef unsigned long BignumInt; -typedef unsigned long long BignumDblInt; -#define BIGNUM_INT_MASK 0xFFFFFFFFUL -#define BIGNUM_TOP_BIT 0x80000000UL -#define BIGNUM_INT_BITS 32 -#define MUL_WORD(w1, w2) ((BignumDblInt)w1 * w2) + + /* + * 32-bit BignumInt, using unsigned long itself as BignumDblInt. + * + * Only for platforms where long is 64 bits, of course. + */ + + typedef unsigned int BignumInt; + #define BIGNUM_INT_BITS 32 + #define DEFINE_BIGNUMDBLINT typedef unsigned long BignumDblInt + #else -/* Fallback for all other cases */ -typedef unsigned short BignumInt; -typedef unsigned long BignumDblInt; -#define BIGNUM_INT_MASK 0xFFFFU -#define BIGNUM_TOP_BIT 0x8000U -#define BIGNUM_INT_BITS 16 -#define MUL_WORD(w1, w2) ((BignumDblInt)w1 * w2) + + /* + * 16-bit BignumInt, using unsigned long as BignumDblInt. + * + * This is the final fallback for real emergencies: C89 guarantees + * unsigned short/long to be at least the required sizes, so this + * should work on any C implementation at all. But it'll be + * noticeably slow, so if you find yourself in this case you + * probably want to move heaven and earth to find an alternative! + */ + + typedef unsigned short BignumInt; + #define BIGNUM_INT_BITS 16 + #define DEFINE_BIGNUMDBLINT typedef unsigned long BignumDblInt + #endif +/* + * Common code across all branches of that ifdef: define the three + * easy constant macros in terms of BIGNUM_INT_BITS. + */ #define BIGNUM_INT_BYTES (BIGNUM_INT_BITS / 8) +#define BIGNUM_TOP_BIT (((BignumInt)1) << (BIGNUM_INT_BITS-1)) +#define BIGNUM_INT_MASK (BIGNUM_TOP_BIT | (BIGNUM_TOP_BIT-1)) + +/* + * Common code across _most_ branches of the ifdef: define a set of + * statement macros in terms of the BignumDblInt type provided. In + * this case, we also define BignumCarry to be the same thing as + * BignumInt, for simplicity. + */ +#ifdef DEFINE_BIGNUMDBLINT + + typedef BignumInt BignumCarry; + #define BignumADC(ret, retc, a, b, c) do \ + { \ + DEFINE_BIGNUMDBLINT; \ + BignumDblInt ADC_temp = (BignumInt)(a); \ + ADC_temp += (BignumInt)(b); \ + ADC_temp += (c); \ + (ret) = (BignumInt)ADC_temp; \ + (retc) = (BignumCarry)(ADC_temp >> BIGNUM_INT_BITS); \ + } while (0) + + #define BignumMUL(rh, rl, a, b) do \ + { \ + DEFINE_BIGNUMDBLINT; \ + BignumDblInt MUL_temp = (BignumInt)(a); \ + MUL_temp *= (BignumInt)(b); \ + (rh) = (BignumInt)(MUL_temp >> BIGNUM_INT_BITS); \ + (rl) = (BignumInt)(MUL_temp); \ + } while (0) + + #define BignumMULADD(rh, rl, a, b, addend) do \ + { \ + DEFINE_BIGNUMDBLINT; \ + BignumDblInt MUL_temp = (BignumInt)(a); \ + MUL_temp *= (BignumInt)(b); \ + MUL_temp += (BignumInt)(addend); \ + (rh) = (BignumInt)(MUL_temp >> BIGNUM_INT_BITS); \ + (rl) = (BignumInt)(MUL_temp); \ + } while (0) + + #define BignumMULADD2(rh, rl, a, b, addend1, addend2) do \ + { \ + DEFINE_BIGNUMDBLINT; \ + BignumDblInt MUL_temp = (BignumInt)(a); \ + MUL_temp *= (BignumInt)(b); \ + MUL_temp += (BignumInt)(addend1); \ + MUL_temp += (BignumInt)(addend2); \ + (rh) = (BignumInt)(MUL_temp >> BIGNUM_INT_BITS); \ + (rl) = (BignumInt)(MUL_temp); \ + } while (0) + +#endif /* DEFINE_BIGNUMDBLINT */ diff --git a/sshccp.c b/sshccp.c index 0b3dc573..d0a61775 100644 --- a/sshccp.c +++ b/sshccp.c @@ -211,815 +211,528 @@ static void bigval_export_le(const bigval *r, void *vdata, int len) } /* - * Addition of bigvals, not mod p. + * Core functions to do arithmetic mod p = 2^130-5. The whole + * collection of these, up to and including the surrounding #if, are + * generated automatically for various sizes of BignumInt by + * contrib/make1305.py. */ + +#if BIGNUM_INT_BITS == 16 + static void bigval_add(bigval *r, const bigval *a, const bigval *b) { -#if BIGNUM_INT_BITS == 64 - /* ./contrib/make1305.py add 64 */ - BignumDblInt acclo; - acclo = 0; - acclo += a->w[0]; - acclo += b->w[0]; - r->w[0] = acclo; - acclo >>= 64; - acclo += a->w[1]; - acclo += b->w[1]; - r->w[1] = acclo; - acclo >>= 64; - acclo += a->w[2]; - acclo += b->w[2]; - r->w[2] = acclo; - acclo >>= 64; -#elif BIGNUM_INT_BITS == 32 - /* ./contrib/make1305.py add 32 */ - BignumDblInt acclo; - acclo = 0; - acclo += a->w[0]; - acclo += b->w[0]; - r->w[0] = acclo; - acclo >>= 32; - acclo += a->w[1]; - acclo += b->w[1]; - r->w[1] = acclo; - acclo >>= 32; - acclo += a->w[2]; - acclo += b->w[2]; - r->w[2] = acclo; - acclo >>= 32; - acclo += a->w[3]; - acclo += b->w[3]; - r->w[3] = acclo; - acclo >>= 32; - acclo += a->w[4]; - acclo += b->w[4]; - r->w[4] = acclo; - acclo >>= 32; -#elif BIGNUM_INT_BITS == 16 - /* ./contrib/make1305.py add 16 */ - BignumDblInt acclo; - acclo = 0; - acclo += a->w[0]; - acclo += b->w[0]; - r->w[0] = acclo; - acclo >>= 16; - acclo += a->w[1]; - acclo += b->w[1]; - r->w[1] = acclo; - acclo >>= 16; - acclo += a->w[2]; - acclo += b->w[2]; - r->w[2] = acclo; - acclo >>= 16; - acclo += a->w[3]; - acclo += b->w[3]; - r->w[3] = acclo; - acclo >>= 16; - acclo += a->w[4]; - acclo += b->w[4]; - r->w[4] = acclo; - acclo >>= 16; - acclo += a->w[5]; - acclo += b->w[5]; - r->w[5] = acclo; - acclo >>= 16; - acclo += a->w[6]; - acclo += b->w[6]; - r->w[6] = acclo; - acclo >>= 16; - acclo += a->w[7]; - acclo += b->w[7]; - r->w[7] = acclo; - acclo >>= 16; - acclo += a->w[8]; - acclo += b->w[8]; - r->w[8] = acclo; - acclo >>= 16; -#else -#error Run contrib/make1305.py again with a different bit count -#endif + BignumInt v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14; + BignumInt v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26; + BignumCarry carry; + + v0 = a->w[0]; + v1 = a->w[1]; + v2 = a->w[2]; + v3 = a->w[3]; + v4 = a->w[4]; + v5 = a->w[5]; + v6 = a->w[6]; + v7 = a->w[7]; + v8 = a->w[8]; + v9 = b->w[0]; + v10 = b->w[1]; + v11 = b->w[2]; + v12 = b->w[3]; + v13 = b->w[4]; + v14 = b->w[5]; + v15 = b->w[6]; + v16 = b->w[7]; + v17 = b->w[8]; + BignumADC(v18, carry, v0, v9, 0); + BignumADC(v19, carry, v1, v10, carry); + BignumADC(v20, carry, v2, v11, carry); + BignumADC(v21, carry, v3, v12, carry); + BignumADC(v22, carry, v4, v13, carry); + BignumADC(v23, carry, v5, v14, carry); + BignumADC(v24, carry, v6, v15, carry); + BignumADC(v25, carry, v7, v16, carry); + v26 = v8 + v17 + carry; + r->w[0] = v18; + r->w[1] = v19; + r->w[2] = v20; + r->w[3] = v21; + r->w[4] = v22; + r->w[5] = v23; + r->w[6] = v24; + r->w[7] = v25; + r->w[8] = v26; } -/* - * Multiplication of bigvals mod p. Uses r as temporary storage, so - * don't pass r aliasing a or b. - */ static void bigval_mul_mod_p(bigval *r, const bigval *a, const bigval *b) { -#if BIGNUM_INT_BITS == 64 - /* ./contrib/make1305.py mul 64 */ - BignumDblInt tmp; - BignumDblInt acclo; - BignumDblInt acchi; - BignumDblInt acc2lo; - acclo = 0; - acchi = 0; - tmp = (BignumDblInt)(a->w[0]) * (b->w[0]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 64; - r->w[0] = acclo; - acclo = acchi + (acclo >> 64); - acchi = 0; - tmp = (BignumDblInt)(a->w[0]) * (b->w[1]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 64; - tmp = (BignumDblInt)(a->w[1]) * (b->w[0]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 64; - r->w[1] = acclo; - acclo = acchi + (acclo >> 64); - acchi = 0; - tmp = (BignumDblInt)(a->w[0]) * (b->w[2]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 64; - tmp = (BignumDblInt)(a->w[1]) * (b->w[1]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 64; - tmp = (BignumDblInt)(a->w[2]) * (b->w[0]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 64; - r->w[2] = acclo & (((BignumInt)1 << 2)-1); - acc2lo = 0; - acc2lo += ((acclo >> 2) & (((BignumInt)1 << 62)-1)) * ((BignumDblInt)5 << 0); - acclo = acchi + (acclo >> 64); - acchi = 0; - tmp = (BignumDblInt)(a->w[1]) * (b->w[2]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 64; - tmp = (BignumDblInt)(a->w[2]) * (b->w[1]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 64; - acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 62); - acc2lo += r->w[0]; - r->w[0] = acc2lo; - acc2lo >>= 64; - acc2lo += ((acclo >> 2) & (((BignumInt)1 << 62)-1)) * ((BignumDblInt)5 << 0); - acclo = acchi + (acclo >> 64); - acchi = 0; - tmp = (BignumDblInt)(a->w[2]) * (b->w[2]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 64; - acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 62); - acc2lo += r->w[1]; - r->w[1] = acc2lo; - acc2lo >>= 64; - acc2lo += ((acclo >> 2) & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 0); - acc2lo += r->w[2]; - r->w[2] = acc2lo; - acc2lo = 0; - acc2lo += ((acclo >> 4) & (((BignumInt)1 << 60)-1)) * ((BignumDblInt)25 << 0); - acclo = acchi + (acclo >> 64); - acchi = 0; - acc2lo += (acclo & (((BignumInt)1 << 4)-1)) * ((BignumDblInt)25 << 60); - acc2lo += r->w[0]; - r->w[0] = acc2lo; - acc2lo >>= 64; - acc2lo += ((acclo >> 4) & (((BignumInt)1 << 60)-1)) * ((BignumDblInt)25 << 0); - acclo = acchi + (acclo >> 64); - acchi = 0; - acc2lo += r->w[1]; - r->w[1] = acc2lo; - acc2lo >>= 64; - acc2lo += r->w[2]; - r->w[2] = acc2lo; - acc2lo >>= 64; -#elif BIGNUM_INT_BITS == 32 - /* ./contrib/make1305.py mul 32 */ - BignumDblInt tmp; - BignumDblInt acclo; - BignumDblInt acchi; - BignumDblInt acc2lo; - acclo = 0; - acchi = 0; - tmp = (BignumDblInt)(a->w[0]) * (b->w[0]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 32; - r->w[0] = acclo; - acclo = acchi + (acclo >> 32); - acchi = 0; - tmp = (BignumDblInt)(a->w[0]) * (b->w[1]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 32; - tmp = (BignumDblInt)(a->w[1]) * (b->w[0]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 32; - r->w[1] = acclo; - acclo = acchi + (acclo >> 32); - acchi = 0; - tmp = (BignumDblInt)(a->w[0]) * (b->w[2]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 32; - tmp = (BignumDblInt)(a->w[1]) * (b->w[1]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 32; - tmp = (BignumDblInt)(a->w[2]) * (b->w[0]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 32; - r->w[2] = acclo; - acclo = acchi + (acclo >> 32); - acchi = 0; - tmp = (BignumDblInt)(a->w[0]) * (b->w[3]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 32; - tmp = (BignumDblInt)(a->w[1]) * (b->w[2]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 32; - tmp = (BignumDblInt)(a->w[2]) * (b->w[1]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 32; - tmp = (BignumDblInt)(a->w[3]) * (b->w[0]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 32; - r->w[3] = acclo; - acclo = acchi + (acclo >> 32); - acchi = 0; - tmp = (BignumDblInt)(a->w[0]) * (b->w[4]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 32; - tmp = (BignumDblInt)(a->w[1]) * (b->w[3]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 32; - tmp = (BignumDblInt)(a->w[2]) * (b->w[2]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 32; - tmp = (BignumDblInt)(a->w[3]) * (b->w[1]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 32; - tmp = (BignumDblInt)(a->w[4]) * (b->w[0]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 32; - r->w[4] = acclo & (((BignumInt)1 << 2)-1); - acc2lo = 0; - acc2lo += ((acclo >> 2) & (((BignumInt)1 << 30)-1)) * ((BignumDblInt)5 << 0); - acclo = acchi + (acclo >> 32); - acchi = 0; - tmp = (BignumDblInt)(a->w[1]) * (b->w[4]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 32; - tmp = (BignumDblInt)(a->w[2]) * (b->w[3]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 32; - tmp = (BignumDblInt)(a->w[3]) * (b->w[2]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 32; - tmp = (BignumDblInt)(a->w[4]) * (b->w[1]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 32; - acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 30); - acc2lo += r->w[0]; - r->w[0] = acc2lo; - acc2lo >>= 32; - acc2lo += ((acclo >> 2) & (((BignumInt)1 << 30)-1)) * ((BignumDblInt)5 << 0); - acclo = acchi + (acclo >> 32); - acchi = 0; - tmp = (BignumDblInt)(a->w[2]) * (b->w[4]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 32; - tmp = (BignumDblInt)(a->w[3]) * (b->w[3]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 32; - tmp = (BignumDblInt)(a->w[4]) * (b->w[2]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 32; - acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 30); - acc2lo += r->w[1]; - r->w[1] = acc2lo; - acc2lo >>= 32; - acc2lo += ((acclo >> 2) & (((BignumInt)1 << 30)-1)) * ((BignumDblInt)5 << 0); - acclo = acchi + (acclo >> 32); - acchi = 0; - tmp = (BignumDblInt)(a->w[3]) * (b->w[4]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 32; - tmp = (BignumDblInt)(a->w[4]) * (b->w[3]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 32; - acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 30); - acc2lo += r->w[2]; - r->w[2] = acc2lo; - acc2lo >>= 32; - acc2lo += ((acclo >> 2) & (((BignumInt)1 << 30)-1)) * ((BignumDblInt)5 << 0); - acclo = acchi + (acclo >> 32); - acchi = 0; - tmp = (BignumDblInt)(a->w[4]) * (b->w[4]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 32; - acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 30); - acc2lo += r->w[3]; - r->w[3] = acc2lo; - acc2lo >>= 32; - acc2lo += ((acclo >> 2) & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 0); - acc2lo += r->w[4]; - r->w[4] = acc2lo; - acc2lo = 0; - acc2lo += ((acclo >> 4) & (((BignumInt)1 << 28)-1)) * ((BignumDblInt)25 << 0); - acclo = acchi + (acclo >> 32); - acchi = 0; - acc2lo += (acclo & (((BignumInt)1 << 4)-1)) * ((BignumDblInt)25 << 28); - acc2lo += r->w[0]; - r->w[0] = acc2lo; - acc2lo >>= 32; - acc2lo += ((acclo >> 4) & (((BignumInt)1 << 28)-1)) * ((BignumDblInt)25 << 0); - acclo = acchi + (acclo >> 32); - acchi = 0; - acc2lo += r->w[1]; - r->w[1] = acc2lo; - acc2lo >>= 32; - acc2lo += r->w[2]; - r->w[2] = acc2lo; - acc2lo >>= 32; - acc2lo += r->w[3]; - r->w[3] = acc2lo; - acc2lo >>= 32; - acc2lo += r->w[4]; - r->w[4] = acc2lo; - acc2lo >>= 32; -#elif BIGNUM_INT_BITS == 16 - /* ./contrib/make1305.py mul 16 */ - BignumDblInt tmp; - BignumDblInt acclo; - BignumDblInt acchi; - BignumDblInt acc2lo; - acclo = 0; - acchi = 0; - tmp = (BignumDblInt)(a->w[0]) * (b->w[0]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - r->w[0] = acclo; - acclo = acchi + (acclo >> 16); - acchi = 0; - tmp = (BignumDblInt)(a->w[0]) * (b->w[1]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[1]) * (b->w[0]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - r->w[1] = acclo; - acclo = acchi + (acclo >> 16); - acchi = 0; - tmp = (BignumDblInt)(a->w[0]) * (b->w[2]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[1]) * (b->w[1]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[2]) * (b->w[0]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - r->w[2] = acclo; - acclo = acchi + (acclo >> 16); - acchi = 0; - tmp = (BignumDblInt)(a->w[0]) * (b->w[3]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[1]) * (b->w[2]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[2]) * (b->w[1]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[3]) * (b->w[0]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - r->w[3] = acclo; - acclo = acchi + (acclo >> 16); - acchi = 0; - tmp = (BignumDblInt)(a->w[0]) * (b->w[4]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[1]) * (b->w[3]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[2]) * (b->w[2]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[3]) * (b->w[1]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[4]) * (b->w[0]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - r->w[4] = acclo; - acclo = acchi + (acclo >> 16); - acchi = 0; - tmp = (BignumDblInt)(a->w[0]) * (b->w[5]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[1]) * (b->w[4]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[2]) * (b->w[3]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[3]) * (b->w[2]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[4]) * (b->w[1]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[5]) * (b->w[0]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - r->w[5] = acclo; - acclo = acchi + (acclo >> 16); - acchi = 0; - tmp = (BignumDblInt)(a->w[0]) * (b->w[6]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[1]) * (b->w[5]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[2]) * (b->w[4]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[3]) * (b->w[3]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[4]) * (b->w[2]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[5]) * (b->w[1]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[6]) * (b->w[0]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - r->w[6] = acclo; - acclo = acchi + (acclo >> 16); - acchi = 0; - tmp = (BignumDblInt)(a->w[0]) * (b->w[7]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[1]) * (b->w[6]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[2]) * (b->w[5]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[3]) * (b->w[4]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[4]) * (b->w[3]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[5]) * (b->w[2]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[6]) * (b->w[1]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[7]) * (b->w[0]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - r->w[7] = acclo; - acclo = acchi + (acclo >> 16); - acchi = 0; - tmp = (BignumDblInt)(a->w[0]) * (b->w[8]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[1]) * (b->w[7]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[2]) * (b->w[6]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[3]) * (b->w[5]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[4]) * (b->w[4]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[5]) * (b->w[3]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[6]) * (b->w[2]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[7]) * (b->w[1]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[8]) * (b->w[0]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - r->w[8] = acclo & (((BignumInt)1 << 2)-1); - acc2lo = 0; - acc2lo += ((acclo >> 2) & (((BignumInt)1 << 14)-1)) * ((BignumDblInt)5 << 0); - acclo = acchi + (acclo >> 16); - acchi = 0; - tmp = (BignumDblInt)(a->w[1]) * (b->w[8]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[2]) * (b->w[7]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[3]) * (b->w[6]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[4]) * (b->w[5]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[5]) * (b->w[4]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[6]) * (b->w[3]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[7]) * (b->w[2]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[8]) * (b->w[1]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 14); - acc2lo += r->w[0]; - r->w[0] = acc2lo; - acc2lo >>= 16; - acc2lo += ((acclo >> 2) & (((BignumInt)1 << 14)-1)) * ((BignumDblInt)5 << 0); - acclo = acchi + (acclo >> 16); - acchi = 0; - tmp = (BignumDblInt)(a->w[2]) * (b->w[8]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[3]) * (b->w[7]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[4]) * (b->w[6]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[5]) * (b->w[5]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[6]) * (b->w[4]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[7]) * (b->w[3]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[8]) * (b->w[2]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 14); - acc2lo += r->w[1]; - r->w[1] = acc2lo; - acc2lo >>= 16; - acc2lo += ((acclo >> 2) & (((BignumInt)1 << 14)-1)) * ((BignumDblInt)5 << 0); - acclo = acchi + (acclo >> 16); - acchi = 0; - tmp = (BignumDblInt)(a->w[3]) * (b->w[8]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[4]) * (b->w[7]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[5]) * (b->w[6]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[6]) * (b->w[5]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[7]) * (b->w[4]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[8]) * (b->w[3]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 14); - acc2lo += r->w[2]; - r->w[2] = acc2lo; - acc2lo >>= 16; - acc2lo += ((acclo >> 2) & (((BignumInt)1 << 14)-1)) * ((BignumDblInt)5 << 0); - acclo = acchi + (acclo >> 16); - acchi = 0; - tmp = (BignumDblInt)(a->w[4]) * (b->w[8]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[5]) * (b->w[7]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[6]) * (b->w[6]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[7]) * (b->w[5]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[8]) * (b->w[4]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 14); - acc2lo += r->w[3]; - r->w[3] = acc2lo; - acc2lo >>= 16; - acc2lo += ((acclo >> 2) & (((BignumInt)1 << 14)-1)) * ((BignumDblInt)5 << 0); - acclo = acchi + (acclo >> 16); - acchi = 0; - tmp = (BignumDblInt)(a->w[5]) * (b->w[8]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[6]) * (b->w[7]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[7]) * (b->w[6]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[8]) * (b->w[5]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 14); - acc2lo += r->w[4]; - r->w[4] = acc2lo; - acc2lo >>= 16; - acc2lo += ((acclo >> 2) & (((BignumInt)1 << 14)-1)) * ((BignumDblInt)5 << 0); - acclo = acchi + (acclo >> 16); - acchi = 0; - tmp = (BignumDblInt)(a->w[6]) * (b->w[8]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[7]) * (b->w[7]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[8]) * (b->w[6]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 14); - acc2lo += r->w[5]; - r->w[5] = acc2lo; - acc2lo >>= 16; - acc2lo += ((acclo >> 2) & (((BignumInt)1 << 14)-1)) * ((BignumDblInt)5 << 0); - acclo = acchi + (acclo >> 16); - acchi = 0; - tmp = (BignumDblInt)(a->w[7]) * (b->w[8]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - tmp = (BignumDblInt)(a->w[8]) * (b->w[7]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 14); - acc2lo += r->w[6]; - r->w[6] = acc2lo; - acc2lo >>= 16; - acc2lo += ((acclo >> 2) & (((BignumInt)1 << 14)-1)) * ((BignumDblInt)5 << 0); - acclo = acchi + (acclo >> 16); - acchi = 0; - tmp = (BignumDblInt)(a->w[8]) * (b->w[8]); - acclo += tmp & BIGNUM_INT_MASK; - acchi += tmp >> 16; - acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 14); - acc2lo += r->w[7]; - r->w[7] = acc2lo; - acc2lo >>= 16; - acc2lo += ((acclo >> 2) & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 0); - acc2lo += r->w[8]; - r->w[8] = acc2lo; - acc2lo = 0; - acc2lo += ((acclo >> 4) & (((BignumInt)1 << 12)-1)) * ((BignumDblInt)25 << 0); - acclo = acchi + (acclo >> 16); - acchi = 0; - acc2lo += (acclo & (((BignumInt)1 << 4)-1)) * ((BignumDblInt)25 << 12); - acc2lo += r->w[0]; - r->w[0] = acc2lo; - acc2lo >>= 16; - acc2lo += ((acclo >> 4) & (((BignumInt)1 << 12)-1)) * ((BignumDblInt)25 << 0); - acclo = acchi + (acclo >> 16); - acchi = 0; - acc2lo += r->w[1]; - r->w[1] = acc2lo; - acc2lo >>= 16; - acc2lo += r->w[2]; - r->w[2] = acc2lo; - acc2lo >>= 16; - acc2lo += r->w[3]; - r->w[3] = acc2lo; - acc2lo >>= 16; - acc2lo += r->w[4]; - r->w[4] = acc2lo; - acc2lo >>= 16; - acc2lo += r->w[5]; - r->w[5] = acc2lo; - acc2lo >>= 16; - acc2lo += r->w[6]; - r->w[6] = acc2lo; - acc2lo >>= 16; - acc2lo += r->w[7]; - r->w[7] = acc2lo; - acc2lo >>= 16; - acc2lo += r->w[8]; - r->w[8] = acc2lo; - acc2lo >>= 16; -#else -#error Run contrib/make1305.py again with a different bit count -#endif + BignumInt v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14; + BignumInt v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27; + BignumInt v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40; + BignumInt v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53; + BignumInt v54, v55, v56, v57, v58, v59, v60, v61, v62, v63, v64, v65, v66; + BignumInt v67, v68, v69, v70, v71, v72, v73, v74, v75, v76, v77, v78, v79; + BignumInt v80, v81, v82, v83, v84, v85, v86, v87, v88, v89, v90, v91, v92; + BignumInt v93, v94, v95, v96, v97, v98, v99, v100, v101, v102, v103, v104; + BignumInt v105, v106, v107, v108, v109, v110, v111, v112, v113, v114; + BignumInt v115, v116, v117, v118, v119, v120, v121, v122, v123, v124; + BignumInt v125, v126, v127, v128, v129, v130, v131, v132, v133, v134; + BignumInt v135, v136, v137, v138, v139, v140, v141, v142, v143, v144; + BignumInt v145, v146, v147, v148, v149, v150, v151, v152, v153, v154; + BignumInt v155, v156, v157, v158, v159, v160, v161, v162, v163, v164; + BignumInt v165, v166, v167, v168, v169, v170, v171, v172, v173, v174; + BignumInt v175, v176, v177, v178, v180, v181, v182, v183, v184, v185; + BignumInt v186, v187, v188, v189, v190, v191, v192, v193, v194, v195; + BignumInt v196, v197, v198, v199, v200, v201, v202, v203, v204, v205; + BignumInt v206, v207, v208, v210, v212, v213, v214, v215, v216, v217; + BignumInt v218, v219, v220, v221, v222, v223, v224, v225, v226, v227; + BignumInt v228, v229; + BignumCarry carry; + + v0 = a->w[0]; + v1 = a->w[1]; + v2 = a->w[2]; + v3 = a->w[3]; + v4 = a->w[4]; + v5 = a->w[5]; + v6 = a->w[6]; + v7 = a->w[7]; + v8 = a->w[8]; + v9 = b->w[0]; + v10 = b->w[1]; + v11 = b->w[2]; + v12 = b->w[3]; + v13 = b->w[4]; + v14 = b->w[5]; + v15 = b->w[6]; + v16 = b->w[7]; + v17 = b->w[8]; + BignumMUL(v19, v18, v0, v9); + BignumMULADD(v21, v20, v0, v10, v19); + BignumMULADD(v23, v22, v0, v11, v21); + BignumMULADD(v25, v24, v0, v12, v23); + BignumMULADD(v27, v26, v0, v13, v25); + BignumMULADD(v29, v28, v0, v14, v27); + BignumMULADD(v31, v30, v0, v15, v29); + BignumMULADD(v33, v32, v0, v16, v31); + BignumMULADD(v35, v34, v0, v17, v33); + BignumMULADD(v37, v36, v1, v9, v20); + BignumMULADD2(v39, v38, v1, v10, v22, v37); + BignumMULADD2(v41, v40, v1, v11, v24, v39); + BignumMULADD2(v43, v42, v1, v12, v26, v41); + BignumMULADD2(v45, v44, v1, v13, v28, v43); + BignumMULADD2(v47, v46, v1, v14, v30, v45); + BignumMULADD2(v49, v48, v1, v15, v32, v47); + BignumMULADD2(v51, v50, v1, v16, v34, v49); + BignumMULADD2(v53, v52, v1, v17, v35, v51); + BignumMULADD(v55, v54, v2, v9, v38); + BignumMULADD2(v57, v56, v2, v10, v40, v55); + BignumMULADD2(v59, v58, v2, v11, v42, v57); + BignumMULADD2(v61, v60, v2, v12, v44, v59); + BignumMULADD2(v63, v62, v2, v13, v46, v61); + BignumMULADD2(v65, v64, v2, v14, v48, v63); + BignumMULADD2(v67, v66, v2, v15, v50, v65); + BignumMULADD2(v69, v68, v2, v16, v52, v67); + BignumMULADD2(v71, v70, v2, v17, v53, v69); + BignumMULADD(v73, v72, v3, v9, v56); + BignumMULADD2(v75, v74, v3, v10, v58, v73); + BignumMULADD2(v77, v76, v3, v11, v60, v75); + BignumMULADD2(v79, v78, v3, v12, v62, v77); + BignumMULADD2(v81, v80, v3, v13, v64, v79); + BignumMULADD2(v83, v82, v3, v14, v66, v81); + BignumMULADD2(v85, v84, v3, v15, v68, v83); + BignumMULADD2(v87, v86, v3, v16, v70, v85); + BignumMULADD2(v89, v88, v3, v17, v71, v87); + BignumMULADD(v91, v90, v4, v9, v74); + BignumMULADD2(v93, v92, v4, v10, v76, v91); + BignumMULADD2(v95, v94, v4, v11, v78, v93); + BignumMULADD2(v97, v96, v4, v12, v80, v95); + BignumMULADD2(v99, v98, v4, v13, v82, v97); + BignumMULADD2(v101, v100, v4, v14, v84, v99); + BignumMULADD2(v103, v102, v4, v15, v86, v101); + BignumMULADD2(v105, v104, v4, v16, v88, v103); + BignumMULADD2(v107, v106, v4, v17, v89, v105); + BignumMULADD(v109, v108, v5, v9, v92); + BignumMULADD2(v111, v110, v5, v10, v94, v109); + BignumMULADD2(v113, v112, v5, v11, v96, v111); + BignumMULADD2(v115, v114, v5, v12, v98, v113); + BignumMULADD2(v117, v116, v5, v13, v100, v115); + BignumMULADD2(v119, v118, v5, v14, v102, v117); + BignumMULADD2(v121, v120, v5, v15, v104, v119); + BignumMULADD2(v123, v122, v5, v16, v106, v121); + BignumMULADD2(v125, v124, v5, v17, v107, v123); + BignumMULADD(v127, v126, v6, v9, v110); + BignumMULADD2(v129, v128, v6, v10, v112, v127); + BignumMULADD2(v131, v130, v6, v11, v114, v129); + BignumMULADD2(v133, v132, v6, v12, v116, v131); + BignumMULADD2(v135, v134, v6, v13, v118, v133); + BignumMULADD2(v137, v136, v6, v14, v120, v135); + BignumMULADD2(v139, v138, v6, v15, v122, v137); + BignumMULADD2(v141, v140, v6, v16, v124, v139); + BignumMULADD2(v143, v142, v6, v17, v125, v141); + BignumMULADD(v145, v144, v7, v9, v128); + BignumMULADD2(v147, v146, v7, v10, v130, v145); + BignumMULADD2(v149, v148, v7, v11, v132, v147); + BignumMULADD2(v151, v150, v7, v12, v134, v149); + BignumMULADD2(v153, v152, v7, v13, v136, v151); + BignumMULADD2(v155, v154, v7, v14, v138, v153); + BignumMULADD2(v157, v156, v7, v15, v140, v155); + BignumMULADD2(v159, v158, v7, v16, v142, v157); + BignumMULADD2(v161, v160, v7, v17, v143, v159); + BignumMULADD(v163, v162, v8, v9, v146); + BignumMULADD2(v165, v164, v8, v10, v148, v163); + BignumMULADD2(v167, v166, v8, v11, v150, v165); + BignumMULADD2(v169, v168, v8, v12, v152, v167); + BignumMULADD2(v171, v170, v8, v13, v154, v169); + BignumMULADD2(v173, v172, v8, v14, v156, v171); + BignumMULADD2(v175, v174, v8, v15, v158, v173); + BignumMULADD2(v177, v176, v8, v16, v160, v175); + v178 = v8 * v17 + v161 + v177; + v180 = (v162) & ((((BignumInt)1) << 2)-1); + v181 = ((v162) >> 2) | ((v164) << 14); + v182 = ((v164) >> 2) | ((v166) << 14); + v183 = ((v166) >> 2) | ((v168) << 14); + v184 = ((v168) >> 2) | ((v170) << 14); + v185 = ((v170) >> 2) | ((v172) << 14); + v186 = ((v172) >> 2) | ((v174) << 14); + v187 = ((v174) >> 2) | ((v176) << 14); + v188 = ((v176) >> 2) | ((v178) << 14); + v189 = (v178) >> 2; + v190 = (v189) & ((((BignumInt)1) << 2)-1); + v191 = (v178) >> 4; + BignumMUL(v193, v192, 5, v181); + BignumMULADD(v195, v194, 5, v182, v193); + BignumMULADD(v197, v196, 5, v183, v195); + BignumMULADD(v199, v198, 5, v184, v197); + BignumMULADD(v201, v200, 5, v185, v199); + BignumMULADD(v203, v202, 5, v186, v201); + BignumMULADD(v205, v204, 5, v187, v203); + BignumMULADD(v207, v206, 5, v188, v205); + v208 = 5 * v190 + v207; + v210 = 25 * v191; + BignumADC(v212, carry, v18, v192, 0); + BignumADC(v213, carry, v36, v194, carry); + BignumADC(v214, carry, v54, v196, carry); + BignumADC(v215, carry, v72, v198, carry); + BignumADC(v216, carry, v90, v200, carry); + BignumADC(v217, carry, v108, v202, carry); + BignumADC(v218, carry, v126, v204, carry); + BignumADC(v219, carry, v144, v206, carry); + v220 = v180 + v208 + carry; + BignumADC(v221, carry, v212, v210, 0); + BignumADC(v222, carry, v213, 0, carry); + BignumADC(v223, carry, v214, 0, carry); + BignumADC(v224, carry, v215, 0, carry); + BignumADC(v225, carry, v216, 0, carry); + BignumADC(v226, carry, v217, 0, carry); + BignumADC(v227, carry, v218, 0, carry); + BignumADC(v228, carry, v219, 0, carry); + v229 = v220 + 0 + carry; + r->w[0] = v221; + r->w[1] = v222; + r->w[2] = v223; + r->w[3] = v224; + r->w[4] = v225; + r->w[5] = v226; + r->w[6] = v227; + r->w[7] = v228; + r->w[8] = v229; } static void bigval_final_reduce(bigval *n) { -#if BIGNUM_INT_BITS == 64 - /* ./contrib/make1305.py final_reduce 64 */ - BignumDblInt acclo; - acclo = 0; - acclo += 5 * ((n->w[2] >> 2) + 1); - acclo += n->w[0]; - acclo >>= 64; - acclo += n->w[1]; - acclo >>= 64; - acclo += n->w[2]; - acclo = 5 * (acclo >> 2); - acclo += n->w[0]; - n->w[0] = acclo; - acclo >>= 64; - acclo += n->w[1]; - n->w[1] = acclo; - acclo >>= 64; - acclo += n->w[2]; - n->w[2] = acclo; - acclo >>= 64; - n->w[2] &= (1 << 2) - 1; -#elif BIGNUM_INT_BITS == 32 - /* ./contrib/make1305.py final_reduce 32 */ - BignumDblInt acclo; - acclo = 0; - acclo += 5 * ((n->w[4] >> 2) + 1); - acclo += n->w[0]; - acclo >>= 32; - acclo += n->w[1]; - acclo >>= 32; - acclo += n->w[2]; - acclo >>= 32; - acclo += n->w[3]; - acclo >>= 32; - acclo += n->w[4]; - acclo = 5 * (acclo >> 2); - acclo += n->w[0]; - n->w[0] = acclo; - acclo >>= 32; - acclo += n->w[1]; - n->w[1] = acclo; - acclo >>= 32; - acclo += n->w[2]; - n->w[2] = acclo; - acclo >>= 32; - acclo += n->w[3]; - n->w[3] = acclo; - acclo >>= 32; - acclo += n->w[4]; - n->w[4] = acclo; - acclo >>= 32; - n->w[4] &= (1 << 2) - 1; -#elif BIGNUM_INT_BITS == 16 - /* ./contrib/make1305.py final_reduce 16 */ - BignumDblInt acclo; - acclo = 0; - acclo += 5 * ((n->w[8] >> 2) + 1); - acclo += n->w[0]; - acclo >>= 16; - acclo += n->w[1]; - acclo >>= 16; - acclo += n->w[2]; - acclo >>= 16; - acclo += n->w[3]; - acclo >>= 16; - acclo += n->w[4]; - acclo >>= 16; - acclo += n->w[5]; - acclo >>= 16; - acclo += n->w[6]; - acclo >>= 16; - acclo += n->w[7]; - acclo >>= 16; - acclo += n->w[8]; - acclo = 5 * (acclo >> 2); - acclo += n->w[0]; - n->w[0] = acclo; - acclo >>= 16; - acclo += n->w[1]; - n->w[1] = acclo; - acclo >>= 16; - acclo += n->w[2]; - n->w[2] = acclo; - acclo >>= 16; - acclo += n->w[3]; - n->w[3] = acclo; - acclo >>= 16; - acclo += n->w[4]; - n->w[4] = acclo; - acclo >>= 16; - acclo += n->w[5]; - n->w[5] = acclo; - acclo >>= 16; - acclo += n->w[6]; - n->w[6] = acclo; - acclo >>= 16; - acclo += n->w[7]; - n->w[7] = acclo; - acclo >>= 16; - acclo += n->w[8]; - n->w[8] = acclo; - acclo >>= 16; - n->w[8] &= (1 << 2) - 1; -#else -#error Run contrib/make1305.py again with a different bit count -#endif + BignumInt v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v12, v13, v14, v15; + BignumInt v16, v17, v18, v19, v20, v21, v22, v24, v25, v26, v27, v28, v29; + BignumInt v30, v31, v32, v33; + BignumCarry carry; + + v0 = n->w[0]; + v1 = n->w[1]; + v2 = n->w[2]; + v3 = n->w[3]; + v4 = n->w[4]; + v5 = n->w[5]; + v6 = n->w[6]; + v7 = n->w[7]; + v8 = n->w[8]; + v9 = (v8) >> 2; + v10 = 5 * v9; + BignumADC(v12, carry, v0, v10, 0); + (void)v12; + BignumADC(v13, carry, v1, 0, carry); + (void)v13; + BignumADC(v14, carry, v2, 0, carry); + (void)v14; + BignumADC(v15, carry, v3, 0, carry); + (void)v15; + BignumADC(v16, carry, v4, 0, carry); + (void)v16; + BignumADC(v17, carry, v5, 0, carry); + (void)v17; + BignumADC(v18, carry, v6, 0, carry); + (void)v18; + BignumADC(v19, carry, v7, 0, carry); + (void)v19; + v20 = v8 + 0 + carry; + v21 = (v20) >> 2; + v22 = 5 * v21; + BignumADC(v24, carry, v0, v22, 0); + BignumADC(v25, carry, v1, 0, carry); + BignumADC(v26, carry, v2, 0, carry); + BignumADC(v27, carry, v3, 0, carry); + BignumADC(v28, carry, v4, 0, carry); + BignumADC(v29, carry, v5, 0, carry); + BignumADC(v30, carry, v6, 0, carry); + BignumADC(v31, carry, v7, 0, carry); + v32 = v8 + 0 + carry; + v33 = (v32) & ((((BignumInt)1) << 2)-1); + n->w[0] = v24; + n->w[1] = v25; + n->w[2] = v26; + n->w[3] = v27; + n->w[4] = v28; + n->w[5] = v29; + n->w[6] = v30; + n->w[7] = v31; + n->w[8] = v33; } +#elif BIGNUM_INT_BITS == 32 + +static void bigval_add(bigval *r, const bigval *a, const bigval *b) +{ + BignumInt v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14; + BignumCarry carry; + + v0 = a->w[0]; + v1 = a->w[1]; + v2 = a->w[2]; + v3 = a->w[3]; + v4 = a->w[4]; + v5 = b->w[0]; + v6 = b->w[1]; + v7 = b->w[2]; + v8 = b->w[3]; + v9 = b->w[4]; + BignumADC(v10, carry, v0, v5, 0); + BignumADC(v11, carry, v1, v6, carry); + BignumADC(v12, carry, v2, v7, carry); + BignumADC(v13, carry, v3, v8, carry); + v14 = v4 + v9 + carry; + r->w[0] = v10; + r->w[1] = v11; + r->w[2] = v12; + r->w[3] = v13; + r->w[4] = v14; +} + +static void bigval_mul_mod_p(bigval *r, const bigval *a, const bigval *b) +{ + BignumInt v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14; + BignumInt v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27; + BignumInt v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40; + BignumInt v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53; + BignumInt v54, v55, v56, v57, v58, v60, v61, v62, v63, v64, v65, v66, v67; + BignumInt v68, v69, v70, v71, v72, v73, v74, v75, v76, v78, v80, v81, v82; + BignumInt v83, v84, v85, v86, v87, v88, v89; + BignumCarry carry; + + v0 = a->w[0]; + v1 = a->w[1]; + v2 = a->w[2]; + v3 = a->w[3]; + v4 = a->w[4]; + v5 = b->w[0]; + v6 = b->w[1]; + v7 = b->w[2]; + v8 = b->w[3]; + v9 = b->w[4]; + BignumMUL(v11, v10, v0, v5); + BignumMULADD(v13, v12, v0, v6, v11); + BignumMULADD(v15, v14, v0, v7, v13); + BignumMULADD(v17, v16, v0, v8, v15); + BignumMULADD(v19, v18, v0, v9, v17); + BignumMULADD(v21, v20, v1, v5, v12); + BignumMULADD2(v23, v22, v1, v6, v14, v21); + BignumMULADD2(v25, v24, v1, v7, v16, v23); + BignumMULADD2(v27, v26, v1, v8, v18, v25); + BignumMULADD2(v29, v28, v1, v9, v19, v27); + BignumMULADD(v31, v30, v2, v5, v22); + BignumMULADD2(v33, v32, v2, v6, v24, v31); + BignumMULADD2(v35, v34, v2, v7, v26, v33); + BignumMULADD2(v37, v36, v2, v8, v28, v35); + BignumMULADD2(v39, v38, v2, v9, v29, v37); + BignumMULADD(v41, v40, v3, v5, v32); + BignumMULADD2(v43, v42, v3, v6, v34, v41); + BignumMULADD2(v45, v44, v3, v7, v36, v43); + BignumMULADD2(v47, v46, v3, v8, v38, v45); + BignumMULADD2(v49, v48, v3, v9, v39, v47); + BignumMULADD(v51, v50, v4, v5, v42); + BignumMULADD2(v53, v52, v4, v6, v44, v51); + BignumMULADD2(v55, v54, v4, v7, v46, v53); + BignumMULADD2(v57, v56, v4, v8, v48, v55); + v58 = v4 * v9 + v49 + v57; + v60 = (v50) & ((((BignumInt)1) << 2)-1); + v61 = ((v50) >> 2) | ((v52) << 30); + v62 = ((v52) >> 2) | ((v54) << 30); + v63 = ((v54) >> 2) | ((v56) << 30); + v64 = ((v56) >> 2) | ((v58) << 30); + v65 = (v58) >> 2; + v66 = (v65) & ((((BignumInt)1) << 2)-1); + v67 = (v58) >> 4; + BignumMUL(v69, v68, 5, v61); + BignumMULADD(v71, v70, 5, v62, v69); + BignumMULADD(v73, v72, 5, v63, v71); + BignumMULADD(v75, v74, 5, v64, v73); + v76 = 5 * v66 + v75; + v78 = 25 * v67; + BignumADC(v80, carry, v10, v68, 0); + BignumADC(v81, carry, v20, v70, carry); + BignumADC(v82, carry, v30, v72, carry); + BignumADC(v83, carry, v40, v74, carry); + v84 = v60 + v76 + carry; + BignumADC(v85, carry, v80, v78, 0); + BignumADC(v86, carry, v81, 0, carry); + BignumADC(v87, carry, v82, 0, carry); + BignumADC(v88, carry, v83, 0, carry); + v89 = v84 + 0 + carry; + r->w[0] = v85; + r->w[1] = v86; + r->w[2] = v87; + r->w[3] = v88; + r->w[4] = v89; +} + +static void bigval_final_reduce(bigval *n) +{ + BignumInt v0, v1, v2, v3, v4, v5, v6, v8, v9, v10, v11, v12, v13, v14; + BignumInt v16, v17, v18, v19, v20, v21; + BignumCarry carry; + + v0 = n->w[0]; + v1 = n->w[1]; + v2 = n->w[2]; + v3 = n->w[3]; + v4 = n->w[4]; + v5 = (v4) >> 2; + v6 = 5 * v5; + BignumADC(v8, carry, v0, v6, 0); + (void)v8; + BignumADC(v9, carry, v1, 0, carry); + (void)v9; + BignumADC(v10, carry, v2, 0, carry); + (void)v10; + BignumADC(v11, carry, v3, 0, carry); + (void)v11; + v12 = v4 + 0 + carry; + v13 = (v12) >> 2; + v14 = 5 * v13; + BignumADC(v16, carry, v0, v14, 0); + BignumADC(v17, carry, v1, 0, carry); + BignumADC(v18, carry, v2, 0, carry); + BignumADC(v19, carry, v3, 0, carry); + v20 = v4 + 0 + carry; + v21 = (v20) & ((((BignumInt)1) << 2)-1); + n->w[0] = v16; + n->w[1] = v17; + n->w[2] = v18; + n->w[3] = v19; + n->w[4] = v21; +} + +#elif BIGNUM_INT_BITS == 64 + +static void bigval_add(bigval *r, const bigval *a, const bigval *b) +{ + BignumInt v0, v1, v2, v3, v4, v5, v6, v7, v8; + BignumCarry carry; + + v0 = a->w[0]; + v1 = a->w[1]; + v2 = a->w[2]; + v3 = b->w[0]; + v4 = b->w[1]; + v5 = b->w[2]; + BignumADC(v6, carry, v0, v3, 0); + BignumADC(v7, carry, v1, v4, carry); + v8 = v2 + v5 + carry; + r->w[0] = v6; + r->w[1] = v7; + r->w[2] = v8; +} + +static void bigval_mul_mod_p(bigval *r, const bigval *a, const bigval *b) +{ + BignumInt v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14; + BignumInt v15, v16, v17, v18, v19, v20, v21, v22, v24, v25, v26, v27, v28; + BignumInt v29, v30, v31, v32, v33, v34, v36, v38, v39, v40, v41, v42, v43; + BignumCarry carry; + + v0 = a->w[0]; + v1 = a->w[1]; + v2 = a->w[2]; + v3 = b->w[0]; + v4 = b->w[1]; + v5 = b->w[2]; + BignumMUL(v7, v6, v0, v3); + BignumMULADD(v9, v8, v0, v4, v7); + BignumMULADD(v11, v10, v0, v5, v9); + BignumMULADD(v13, v12, v1, v3, v8); + BignumMULADD2(v15, v14, v1, v4, v10, v13); + BignumMULADD2(v17, v16, v1, v5, v11, v15); + BignumMULADD(v19, v18, v2, v3, v14); + BignumMULADD2(v21, v20, v2, v4, v16, v19); + v22 = v2 * v5 + v17 + v21; + v24 = (v18) & ((((BignumInt)1) << 2)-1); + v25 = ((v18) >> 2) | ((v20) << 62); + v26 = ((v20) >> 2) | ((v22) << 62); + v27 = (v22) >> 2; + v28 = (v27) & ((((BignumInt)1) << 2)-1); + v29 = (v22) >> 4; + BignumMUL(v31, v30, 5, v25); + BignumMULADD(v33, v32, 5, v26, v31); + v34 = 5 * v28 + v33; + v36 = 25 * v29; + BignumADC(v38, carry, v6, v30, 0); + BignumADC(v39, carry, v12, v32, carry); + v40 = v24 + v34 + carry; + BignumADC(v41, carry, v38, v36, 0); + BignumADC(v42, carry, v39, 0, carry); + v43 = v40 + 0 + carry; + r->w[0] = v41; + r->w[1] = v42; + r->w[2] = v43; +} + +static void bigval_final_reduce(bigval *n) +{ + BignumInt v0, v1, v2, v3, v4, v6, v7, v8, v9, v10, v12, v13, v14, v15; + BignumCarry carry; + + v0 = n->w[0]; + v1 = n->w[1]; + v2 = n->w[2]; + v3 = (v2) >> 2; + v4 = 5 * v3; + BignumADC(v6, carry, v0, v4, 0); + (void)v6; + BignumADC(v7, carry, v1, 0, carry); + (void)v7; + v8 = v2 + 0 + carry; + v9 = (v8) >> 2; + v10 = 5 * v9; + BignumADC(v12, carry, v0, v10, 0); + BignumADC(v13, carry, v1, 0, carry); + v14 = v2 + 0 + carry; + v15 = (v14) & ((((BignumInt)1) << 2)-1); + n->w[0] = v12; + n->w[1] = v13; + n->w[2] = v15; +} + +#else +#error Add another bit count to contrib/make1305.py and rerun it +#endif + struct poly1305 { unsigned char nonce[16]; bigval r;