Relegate BignumDblInt to an implementation detail of sshbn.h.

As I mentioned in the previous commit, I'm going to want PuTTY to be able to run sensibly when compiled with 64-bit Visual Studio, including handling bignums in 64-bit chunks for speed. Unfortunately, 64-bit VS does not provide any type we can use as BignumDblInt in that situation (unlike 64-bit gcc and clang, which give us __uint128_t). The only facilities it provides are compiler intrinsics to access an add-with-carry operation and a 64x64->128 multiplication (the latter delivering its product in two separate 64-bit output chunks). Hence, here's a substantial rework of the bignum code to make it implement everything in terms of _those_ primitives, rather than depending throughout on having BignumDblInt available to use ad-hoc. BignumDblInt does still exist, for the moment, but now it's an internal implementation detail of sshbn.h, only declared inside a new set of macros implementing arithmetic primitives, and not accessible to any code outside sshbn.h (which confirms that I really did catch all uses of it and remove them). The resulting code is surprisingly nice-looking, actually. You'd expect more hassle and roundabout circumlocutions when you drop down to using a more basic set of primitive operations, but actually, in many cases it's turned out shorter to write things in terms of the new BignumADC and BignumMUL macros - because almost all my uses of BignumDblInt were implementing those operations anyway, taking several lines at a time, and now they can do each thing in just one line. The biggest headache was Poly1305: I wasn't able to find any sensible way to adapt the existing Python script that generates the various per-int-size implementations of arithmetic mod 2^130-5, and so I had to rewrite it from scratch instead, with nothing in common with the old version beyond a handful of comments. 
But even that seems to have worked out nicely: the new version has much more legible descriptions of the high-level algorithms, by virtue of having a 'Multiprecision' type which wraps up the division into words, and yet Multiprecision's range analysis allows it to automatically drop out special cases such as multiplication by 5 being much easier than multiplication by another multi-word integer.
2025-07-14 17:47:33 -05:00 · 2015-12-16 14:12:26 +00:00
parent 482b4ab872
commit c2ec13c7e9
4 changed files with 1159 additions and 1209 deletions
--- a/contrib/make1305.py
+++ b/contrib/make1305.py
@ -1,108 +1,300 @@
 #!/usr/bin/env python
 import sys
 import string
 from collections import namedtuple
-class Output(object):
+class Multiprecision(object):
-    def __init__(self, bignum_int_bits):
+    def __init__(self, target, minval, maxval, words):
-        self.bignum_int_bits = bignum_int_bits
+        self.target = target
-        self.text = ""
+        self.minval = minval
-        self.vars = []
+        self.maxval = maxval
-    def stmt(self, statement):
+        self.words = words
-        self.text += "    %s;\n" % statement
+        assert 0 <= self.minval
-    def register_var(self, var):
+        assert self.minval <= self.maxval
-        self.vars.append(var)
+        assert self.target.nwords(self.maxval) == len(words)
    def finalise(self):
        for var in self.vars:
            assert var.maxval == 0, "Variable not clear: %s" % var.name
        return self.text
-class Variable(object):
+    def getword(self, n):
-    def __init__(self, out, name):
+        return self.words[n] if n < len(self.words) else "0"
        self.out = out
        self.maxval = 0
        self.name = name
        self.placeval = None
        self.out.stmt("BignumDblInt %s" % (self.name))
        self.out.register_var(self)
    def clear(self, placeval):
        self.maxval = 0
        self.placeval = placeval
        self.out.stmt("%s = 0" % (self.name))
    def set_word(self, name, limit=None):
        if limit is not None:
            self.maxval = limit-1
        else:
            self.maxval = (1 << self.out.bignum_int_bits) - 1
        assert self.maxval < (1 << 2*self.out.bignum_int_bits)
        self.out.stmt("%s = %s" % (self.name, name))
    def add_word(self, name, limit=None):
        if limit is not None:
            self.maxval += limit-1
        else:
            self.maxval += (1 << self.out.bignum_int_bits) - 1
        assert self.maxval < (1 << 2*self.out.bignum_int_bits)
        self.out.stmt("%s += %s" % (self.name, name))
    def add_input_word(self, fmt, wordpos, limit=None):
        assert self.placeval == wordpos * self.out.bignum_int_bits
        self.add_word(fmt % wordpos, limit)
    def set_to_product(self, a, b, placeval):
        self.maxval = ((1 << self.out.bignum_int_bits) - 1) ** 2
        assert self.maxval < (1 << 2*self.out.bignum_int_bits)        
        self.out.stmt("%s = (BignumDblInt)(%s) * (%s)" % (self.name, a, b))
        self.placeval = placeval
    def add_bottom_half(self, srcvar):
        self.add_word("%s & BIGNUM_INT_MASK" % (srcvar.name))
    def add_top_half(self, srcvar):
        self.add_word("%s >> %d" % (srcvar.name, self.out.bignum_int_bits))
    def unload_into(self, topvar, botvar):
        assert botvar.placeval == self.placeval
        botvar.add_bottom_half(self)
        assert topvar.placeval == self.placeval + self.out.bignum_int_bits
        topvar.add_top_half(self)
        self.maxval = 0
    def output_word(self, bitpos, bits, destfmt, destwordpos):
        assert bitpos == 0
        assert self.placeval == destwordpos * self.out.bignum_int_bits
        dest = destfmt % destwordpos
        if bits == self.out.bignum_int_bits:
            self.out.stmt("%s = %s" % (dest, self.name))
        else:
            self.out.stmt("%s = %s & (((BignumInt)1 << %d)-1)" %
                          (dest, self.name, bits))
    def transfer_to_next_acc(self, bitpos, bits, pow5, destvar):
        destbitpos = self.placeval + bitpos - 130 * pow5 - destvar.placeval
        #print "transfer", "*%d" % 5**pow5, self.name, self.placeval, bitpos, destvar.name, destvar.placeval, destbitpos, bits
        assert 0 <= bitpos < bitpos+bits <= self.out.bignum_int_bits
        assert 0 <= destbitpos < destbitpos+bits <= self.out.bignum_int_bits
        expr = self.name
        if bitpos > 0:
            expr = "(%s >> %d)" % (expr, bitpos)
        expr = "(%s & (((BignumInt)1 << %d)-1))" % (expr, bits)
        self.out.stmt("%s += %s * ((BignumDblInt)%d << %d)" %
                      (destvar.name, expr, 5**pow5, destbitpos))
        destvar.maxval += (((1 << bits)-1) << destbitpos) * (5**pow5)
    def shift_down_from(self, top):
        if top is not None:
            self.out.stmt("%s = %s + (%s >> %d)" %
                          (self.name, top.name, self.name,
                           self.out.bignum_int_bits))
            topmaxval = top.maxval
        else:
            self.out.stmt("%s >>= %d" % (self.name, self.out.bignum_int_bits))
            topmaxval = 0
        self.maxval = topmaxval + self.maxval >> self.out.bignum_int_bits
        assert self.maxval < (1 << 2*self.out.bignum_int_bits)
        if top is not None:
            assert self.placeval + self.out.bignum_int_bits == top.placeval
            top.clear(top.placeval + self.out.bignum_int_bits)
        self.placeval += self.out.bignum_int_bits
-def gen_add(bignum_int_bits):
+    def __add__(self, rhs):
-    out = Output(bignum_int_bits)
+        newmin = self.minval + rhs.minval
        newmax = self.maxval + rhs.maxval
        nwords = self.target.nwords(newmax)
        words = []
-    inbits = 130
+        addfn = self.target.add
-    inwords = (inbits + bignum_int_bits - 1) / bignum_int_bits
+        for i in range(nwords):
            words.append(addfn(self.getword(i), rhs.getword(i)))
            addfn = self.target.adc
        return Multiprecision(self.target, newmin, newmax, words)
    def __mul__(self, rhs):
        """Return the product of self and rhs as a new Multiprecision.

        The result's range is [minval*minval, maxval*maxval]. The
        word-level work is emitted through self.target.muladd, which
        returns a (high word, low word) pair for each partial product.

        rhs is expected to be a Multiprecision built on the same
        target (it must provide minval, maxval and words).
        """
        newmin = self.minval * rhs.minval
        newmax = self.maxval * rhs.maxval
        # There are basically two strategies we could take for
        # multiplying two multiprecision integers. One is to enumerate
        # the space of pairs of word indices in lexicographic order,
        # essentially computing a*b[i] for each i and adding them
        # together; the other is to enumerate in diagonal order,
        # computing everything together that belongs at a particular
        # output word index.
        #
        # For the moment, I've gone for the former.
        #
        # 'sprev' holds the running sum of the rows processed so far
        # (least significant word first); 'sthis' is that sum with the
        # current row added in.
        sprev = []
        for i, sword in enumerate(self.words):
            # High half of the previous partial product in this row,
            # to be folded into the next one up.
            rprev = None
            # Words below index i are not touched by row i, so they
            # carry over unchanged.
            sthis = sprev[:i]
            for j, rword in enumerate(rhs.words):
                prevwords = []
                if i+j < len(sprev):
                    prevwords.append(sprev[i+j])
                if rprev is not None:
                    prevwords.append(rprev)
                vhi, vlo = self.target.muladd(sword, rword, *prevwords)
                sthis.append(vlo)
                rprev = vhi
            # The final high half becomes the new top word of the sum.
            sthis.append(rprev)
            sprev = sthis
        # Remove unneeded words from the top of the output, if we can
        # prove by range analysis that they'll always be zero.
        sprev = sprev[:self.target.nwords(newmax)]
        return Multiprecision(self.target, newmin, newmax, sprev)
    def extract_bits(self, start, bits=None):
        """Return bits [start, start+bits) of this value as a Multiprecision.

        start -- index of the lowest bit to extract (0 = least
                 significant).
        bits  -- width of the field. If None, everything from 'start'
                 upwards is taken, sized from the known maximum value.

        Any shift/mask statements needed are emitted via
        self.target.new_value; words that line up exactly with a source
        word are reused directly.
        """
        wordbits = self.target.bits
        if bits is None:
            bits = (self.maxval >> start).bit_length()
        # Overly thorough range analysis: if min and max have the same
        # *quotient* by 2^bits, then the result of reducing anything
        # in the range [min,max] mod 2^bits has to fall within the
        # obvious range. But if they have different quotients, then
        # you can wrap round the modulus and so any value mod 2^bits
        # is possible.
        fieldmask = (1 << bits) - 1
        newmin = self.minval >> start
        newmax = self.maxval >> start
        if (newmin >> bits) != (newmax >> bits):
            newmin = 0
            newmax = fieldmask
        else:
            # Same quotient: the 'obvious range' is the pair of
            # remainders, so actually reduce both ends mod 2^bits.
            # (A no-op whenever the shared quotient is zero.)
            newmin &= fieldmask
            newmax &= fieldmask
        # True if the source can have set bits above the extracted
        # field, in which case a partial word must be masked even when
        # the whole field fits in one output word.
        overhang = (self.maxval >> (start + bits)) != 0
        words = []
        for i in range(self.target.nwords(newmax)):
            srcpos = i * wordbits + start
            maxbits = min(wordbits, start + bits - srcpos)
            # divmod keeps the word index an int under Python 3, where
            # '/' would produce a float and break list indexing.
            wordindex, bitoffset = divmod(srcpos, wordbits)
            if bitoffset == 0:
                # Aligned with a source word: reuse it as is.
                word = self.getword(wordindex)
            elif (wordindex+1 >= len(self.words) or
                  bitoffset + maxbits < wordbits):
                # Everything we need lives in a single source word.
                word = self.target.new_value(
                    "(%%s) >> %d" % bitoffset,
                    self.getword(wordindex))
            else:
                # Straddles two source words: stitch them together.
                word = self.target.new_value(
                    "((%%s) >> %d) | ((%%s) << %d)" % (
                        bitoffset,
                        wordbits - bitoffset),
                    self.getword(wordindex),
                    self.getword(wordindex + 1))
            if maxbits < wordbits and (maxbits < bits or overhang):
                word = self.target.new_value(
                    "(%%s) & ((((BignumInt)1) << %d)-1)" % maxbits,
                    word)
            words.append(word)
        return Multiprecision(self.target, newmin, newmax, words)
 # Each Statement has a list of variables it reads, and a list of ones
 # it writes. 'forms' is a list of multiple actual C statements it
 # could be generated as, depending on which of its output variables is
 # actually used (e.g. no point calling BignumADC if the generated
 # carry in a particular case is unused, or BignumMUL if nobody needs
 # the top half). It is indexed by a bitmap whose bits correspond to
 # the entries in wvars, with wvars[0] the MSB and wvars[-1] the LSB.
 Statement = namedtuple("Statement", "rvars wvars forms")
 class CodegenTarget(object):
    """Collects C statements for one BignumInt word size.

    Statements are recorded in order via stmt(); text() later emits
    only those that a 'needed' statement transitively depends on,
    choosing for each the form that matches which of its outputs are
    actually read.
    """
    def __init__(self, bits):
        """Set up an empty target for 'bits'-bit words."""
        self.bits = bits
        self.valindex = 0    # numeric suffix of the next 'vN' temporary
        self.stmts = []      # list of [needed, Statement] pairs, in order
        self.generators = {} # variable name -> index in stmts of its writer
        # Words needed to hold a 130-bit value, rounded up. Floor
        # division keeps this an int under Python 3 as well as 2.
        self.bv_words = (130 + self.bits - 1) // self.bits
        self.carry_index = 0
    def nwords(self, maxval):
        """Return how many words are needed to hold values up to maxval."""
        return (maxval.bit_length() + self.bits - 1) // self.bits
    def stmt(self, stmt, needed=False):
        """Append a Statement, recording it as the writer of its wvars.

        needed=True marks the statement as one that must always be
        output (see write_bigval).
        """
        index = len(self.stmts)
        self.stmts.append([needed, stmt])
        for val in stmt.wvars:
            self.generators[val] = index
    def new_value(self, formatstr=None, *deps):
        """Allocate a fresh temporary name 'vN' and return it.

        If formatstr is given, also record the statement
        'vN = <formatstr % deps>' reading the variables in deps.
        """
        name = "v%d" % self.valindex
        self.valindex += 1
        if formatstr is not None:
            self.stmt(Statement(
                    rvars=deps, wvars=[name],
                    forms=[None, name + " = " + formatstr % deps]))
        return name
    def bigval_input(self, name, bits):
        """Return a Multiprecision reading words name->w[i], of 'bits' bits."""
        words = (bits + self.bits - 1) // self.bits
        # Expect not to require an entire extra word
        assert words == self.bv_words
        return Multiprecision(self, 0, (1<<bits)-1, [
                self.new_value("%s->w[%d]" % (name, i)) for i in range(words)])
    def const(self, value):
        """Return a single-word Multiprecision for a small constant."""
        # We only support constants small enough to both fit in a
        # BignumInt (of any size supported) _and_ be expressible in C
        # with no weird integer literal syntax like a trailing LL.
        #
        # Supporting larger constants would be possible - you could
        # break 'value' up into word-sized pieces on the Python side,
        # and generate a legal C expression for each piece by
        # splitting it further into pieces within the
        # standards-guaranteed 'unsigned long' limit of 32 bits and
        # then casting those to BignumInt before combining them with
        # shifts. But it would be a lot of effort, and since the
        # application for this code doesn't even need it, there's no
        # point in bothering.
        assert value < 2**16
        return Multiprecision(self, value, value, ["%d" % value])
    def current_carry(self):
        """Return the tracking name of the most recently written carry."""
        return "carry%d" % self.carry_index
    def add(self, a1, a2):
        """Record a1 + a2 with no carry in; return the result's name.

        The carry out is tracked under a fresh 'carryN' name so that
        dependency analysis can tell whether anything consumes it.
        """
        ret = self.new_value()
        adcform = "BignumADC(%s, carry, %s, %s, 0)" % (ret, a1, a2)
        plainform = "%s = %s + %s" % (ret, a1, a2)
        self.carry_index += 1
        carryout = self.current_carry()
        # Form index bit 1 = result used, bit 0 = carry used: the
        # plain '+' form is only valid when nobody wants the carry.
        self.stmt(Statement(
                rvars=[a1,a2], wvars=[ret,carryout],
                forms=[None, adcform, plainform, adcform]))
        return ret
    def adc(self, a1, a2):
        """Record a1 + a2 + the current carry; return the result's name."""
        ret = self.new_value()
        adcform = "BignumADC(%s, carry, %s, %s, carry)" % (ret, a1, a2)
        plainform = "%s = %s + %s + carry" % (ret, a1, a2)
        # Read the carry name before bumping the index for our own output.
        carryin = self.current_carry()
        self.carry_index += 1
        carryout = self.current_carry()
        self.stmt(Statement(
                rvars=[a1,a2,carryin], wvars=[ret,carryout],
                forms=[None, adcform, plainform, adcform]))
        return ret
    def muladd(self, m1, m2, *addends):
        """Record m1 * m2 plus up to two addends; return (high, low) names."""
        rlo = self.new_value()
        rhi = self.new_value()
        wideform = "BignumMUL%s(%s)" % (
            { 0:"", 1:"ADD", 2:"ADD2" }[len(addends)],
            ", ".join([rhi, rlo, m1, m2] + list(addends)))
        narrowform = " + ".join(["%s = %s * %s" % (rlo, m1, m2)] +
                                list(addends))
        # Form index bit 1 = high word used, bit 0 = low word used: a
        # plain C multiply suffices when only the low word is wanted.
        self.stmt(Statement(
                rvars=[m1,m2]+list(addends), wvars=[rhi,rlo],
                forms=[None, narrowform, wideform, wideform]))
        return rhi, rlo
    def write_bigval(self, name, val):
        """Record stores of val's words into name->w[0..bv_words-1].

        These are the statements marked as needed, from which
        compute_needed traces dependencies.
        """
        for i in range(self.bv_words):
            word = val.getword(i)
            self.stmt(Statement(
                    rvars=[word], wvars=[],
                    forms=["%s->w[%d] = %s" % (name, i, word)]),
                      needed=True)
    def compute_needed(self):
        """Work out which statements are required and in which form.

        Returns (used_carry, used_vars, forms): whether any carry is
        involved, the sorted list of 'vN' temporaries to declare, and
        the C statement strings to output, in order.
        """
        used_vars = set()
        # Worklist of statements whose inputs still need tracing.
        self.queue = [stmt for (needed,stmt) in self.stmts if needed]
        while len(self.queue) > 0:
            stmt = self.queue.pop(0)
            deps = []
            for var in stmt.rvars:
                if var[0] in string.digits:
                    continue # constant
                deps.append(self.generators[var])
                used_vars.add(var)
            for index in deps:
                if not self.stmts[index][0]:
                    self.stmts[index][0] = True
                    self.queue.append(self.stmts[index][1])
        forms = []
        for i, (needed, stmt) in enumerate(self.stmts):
            if needed:
                # Build the bitmap of which outputs are read, with
                # wvars[0] ending up as the most significant bit.
                formindex = 0
                for var in stmt.wvars:
                    formindex *= 2
                    if var in used_vars:
                        formindex += 1
                forms.append(stmt.forms[formindex])
                # Now we must check whether this form of the statement
                # also writes some variables we _don't_ actually need
                # (e.g. if you only wanted the top half from a mul, or
                # only the carry from an adc, you'd be forced to
                # generate the other output too). Easiest way to do
                # this is to look for an identical statement form
                # later in the array.
                maxindex = max(i for i in range(len(stmt.forms))
                               if stmt.forms[i] == stmt.forms[formindex])
                extra_vars = maxindex & ~formindex
                bitpos = 0
                while extra_vars != 0:
                    if extra_vars & (1 << bitpos):
                        extra_vars &= ~(1 << bitpos)
                        var = stmt.wvars[-1-bitpos]
                        used_vars.add(var)
                        # Also, write out a cast-to-void for each
                        # subsequently unused value, to prevent gcc
                        # warnings when the output code is compiled.
                        forms.append("(void)" + var)
                    bitpos += 1
        used_carry = any(v.startswith("carry") for v in used_vars)
        used_vars = [v for v in used_vars if v.startswith("v")]
        used_vars.sort(key=lambda v: int(v[1:]))
        return used_carry, used_vars, forms
    def text(self):
        """Return the C function body: declarations, then statements."""
        used_carry, values, forms = self.compute_needed()
        ret = ""
        # Pack the temporaries into declaration lines under 79 columns.
        while len(values) > 0:
            prefix, sep, suffix = "    BignumInt ", ", ", ";"
            currline = values.pop(0)
            while (len(values) > 0 and
                   len(prefix+currline+sep+values[0]+suffix) < 79):
                currline += sep + values.pop(0)
            ret += prefix + currline + suffix + "\n"
        if used_carry:
            ret += "    BignumCarry carry;\n"
        if ret != "":
            ret += "\n"
        for stmtform in forms:
            ret += "    %s;\n" % stmtform
        return ret
 def gen_add(target):
    # This is an addition _without_ reduction mod p, so that it can be
    # used both during accumulation of the polynomial and for adding
    # on the encrypted nonce at the end (which is mod 2^128, not mod
@ -111,157 +303,66 @@ def gen_add(bignum_int_bits):
    # Because one of the inputs will have come from our
    # not-completely-reducing multiplication function, we expect up to
    # 3 extra bits of input.
    acclo = Variable(out, "acclo")
-    acclo.clear(0)
+    a = target.bigval_input("a", 133)
-
+    b = target.bigval_input("b", 133)
-    for wordpos in range(inwords):
+    ret = a + b
-        limit = min(1 << bignum_int_bits, 1 << (130 - wordpos*bignum_int_bits))
+    target.write_bigval("r", ret)
-        acclo.add_input_word("a->w[%d]", wordpos, limit)
+    return """\
-        acclo.add_input_word("b->w[%d]", wordpos, limit)
+static void bigval_add(bigval *r, const bigval *a, const bigval *b)
-        acclo.output_word(0, bignum_int_bits, "r->w[%d]", wordpos)
+{
-        acclo.shift_down_from(None)
+%s}
-
+\n""" % target.text()
    return out.finalise()
 def gen_mul_1305(bignum_int_bits):
    out = Output(bignum_int_bits)
    inbits = 130
    inwords = (inbits + bignum_int_bits - 1) / bignum_int_bits
 def gen_mul(target):
    # The inputs are not 100% reduced mod p. Specifically, we can get
    # a full 130-bit number from the pow5==0 pass, and then a 130-bit
    # number times 5 from the pow5==1 pass, plus a possible carry. The
    # total of that can be easily bounded above by 2^130 * 8, so we
    # need to assume we're multiplying two 133-bit numbers.
    outbits = (inbits + 3) * 2
    outwords = (outbits + bignum_int_bits - 1) / bignum_int_bits + 1
-    tmp = Variable(out, "tmp")
+    a = target.bigval_input("a", 133)
-    acclo = Variable(out, "acclo")
+    b = target.bigval_input("b", 133)
-    acchi = Variable(out, "acchi")
+    ab = a * b
-    acc2lo = Variable(out, "acc2lo")
+    ab0 = ab.extract_bits(0, 130)
    ab1 = ab.extract_bits(130, 130)
    ab2 = ab.extract_bits(260)
    ab1_5 = target.const(5) * ab1
    ab2_25 = target.const(25) * ab2
    ret = ab0 + ab1_5 + ab2_25
    target.write_bigval("r", ret)
    return """\
 static void bigval_mul_mod_p(bigval *r, const bigval *a, const bigval *b)
 {
 %s}
 \n""" % target.text()
-    pow5, bits_at_pow5 = 0, inbits
+def gen_final_reduce(target):
-
+    # We take our input number n, and compute k = n + 5*(n >> 130).
    acclo.clear(0)
    acchi.clear(bignum_int_bits)
    bits_needed_in_acc2 = bignum_int_bits
    for outwordpos in range(outwords):
        for a in range(inwords):
            b = outwordpos - a
            if 0 <= b < inwords:
                tmp.set_to_product("a->w[%d]" % a, "b->w[%d]" % b,
                                   outwordpos * bignum_int_bits)
                tmp.unload_into(acchi, acclo)
        bits_in_word = bignum_int_bits
        bitpos = 0
        #print "begin output"
        while bits_in_word > 0:
            chunk = min(bits_in_word, bits_at_pow5)
            if pow5 > 0:
                chunk = min(chunk, bits_needed_in_acc2)
            if pow5 == 0:
                acclo.output_word(bitpos, chunk, "r->w[%d]", outwordpos)
            else:
                acclo.transfer_to_next_acc(bitpos, chunk, pow5, acc2lo)
                bits_needed_in_acc2 -= chunk
                if bits_needed_in_acc2 == 0:
                    assert acc2lo.placeval % bignum_int_bits == 0
                    other_outwordpos = acc2lo.placeval / bignum_int_bits
                    acc2lo.add_input_word("r->w[%d]", other_outwordpos)
                    acc2lo.output_word(bitpos, bignum_int_bits, "r->w[%d]",
                                       other_outwordpos)
                    acc2lo.shift_down_from(None)
                    bits_needed_in_acc2 = bignum_int_bits
            bits_in_word -= chunk
            bits_at_pow5 -= chunk
            bitpos += chunk
            if bits_at_pow5 == 0:
                if pow5 > 0:
                    assert acc2lo.placeval % bignum_int_bits == 0
                    other_outwordpos = acc2lo.placeval / bignum_int_bits
                    acc2lo.add_input_word("r->w[%d]", other_outwordpos)
                    acc2lo.output_word(0, bignum_int_bits, "r->w[%d]",
                                       other_outwordpos)
                pow5 += 1
                bits_at_pow5 = inbits
                acc2lo.clear(0)
                bits_needed_in_acc2 = bignum_int_bits
        acclo.shift_down_from(acchi)
    while acc2lo.maxval > 0:
        other_outwordpos = acc2lo.placeval / bignum_int_bits
        bitsleft = inbits - other_outwordpos * bignum_int_bits
        limit = 1<<bitsleft if bitsleft < bignum_int_bits else None
        acc2lo.add_input_word("r->w[%d]", other_outwordpos, limit=limit)
        acc2lo.output_word(0, bignum_int_bits, "r->w[%d]", other_outwordpos)
        acc2lo.shift_down_from(None)
    return out.finalise()
 def gen_final_reduce_1305(bignum_int_bits):
    out = Output(bignum_int_bits)
    inbits = 130
    inwords = (inbits + bignum_int_bits - 1) / bignum_int_bits
    # We take our input number n, and compute k = 5 + 5*(n >> 130).
    # Then k >> 130 is precisely the multiple of p that needs to be
    # subtracted from n to reduce it to strictly less than p.
-    acclo = Variable(out, "acclo")
+    a = target.bigval_input("n", 133)
    a1 = a.extract_bits(130, 130)
    k = a + target.const(5) * a1
    q = k.extract_bits(130)
    adjusted = a + target.const(5) * q
    ret = adjusted.extract_bits(0, 130)
    target.write_bigval("n", ret)
    return """\
 static void bigval_final_reduce(bigval *n)
 {
 %s}
 \n""" % target.text()
-    acclo.clear(0)
+pp_keyword = "#if"
-    # Hopefully all the bits we're shifting down fit in the same word.
+for bits in [16, 32, 64]:
-    assert 130 / bignum_int_bits == (130 + 3 - 1) / bignum_int_bits
+    sys.stdout.write("%s BIGNUM_INT_BITS == %d\n\n" % (pp_keyword, bits))
-    acclo.add_word("5 * ((n->w[%d] >> %d) + 1)" %
+    pp_keyword = "#elif"
-                   (130 / bignum_int_bits, 130 % bignum_int_bits),
+    sys.stdout.write(gen_add(CodegenTarget(bits)))
-                   limit = 5 * (7 + 1))
+    sys.stdout.write(gen_mul(CodegenTarget(bits)))
-    for wordpos in range(inwords):
+    sys.stdout.write(gen_final_reduce(CodegenTarget(bits)))
-        acclo.add_input_word("n->w[%d]", wordpos)
+sys.stdout.write("""#else
-        # Notionally, we could call acclo.output_word here to store
+#error Add another bit count to contrib/make1305.py and rerun it
-        # our adjusted value k. But we don't need to, because all we
+#endif
-        # actually want is the very top word of it.
+""")
        if wordpos == 130 / bignum_int_bits:
            break
        acclo.shift_down_from(None)
    # Now we can find the right multiple of p to subtract. We actually
    # subtract it by adding 5 times it, and then finally discarding
    # the top bits of the output.
    # Hopefully all the bits we're shifting down fit in the same word.
    assert 130 / bignum_int_bits == (130 + 3 - 1) / bignum_int_bits
    acclo.set_word("5 * (acclo >> %d)" % (130 % bignum_int_bits),
                   limit = 5 * (7 + 1))
    acclo.placeval = 0
    for wordpos in range(inwords):
        acclo.add_input_word("n->w[%d]", wordpos)
        acclo.output_word(0, bignum_int_bits, "n->w[%d]", wordpos)
        acclo.shift_down_from(None)
    out.stmt("n->w[%d] &= (1 << %d) - 1" %
             (130 / bignum_int_bits, 130 % bignum_int_bits))
    # Here we don't call out.finalise(), because that will complain
    # that there are bits of output we never dealt with. This is true,
    # but all the bits in question are above 2^130, so they're bits
    # we're discarding anyway.
    return out.text # not out.finalise()
 ops = { "mul" : gen_mul_1305,
        "add" : gen_add,
        "final_reduce" : gen_final_reduce_1305 }
 args = sys.argv[1:]
 if len(args) != 2 or args[0] not in ops:
    sys.stderr.write("usage: make1305.py (%s) <bits>\n" % (" | ".join(sorted(ops))))
    sys.exit(1)
 sys.stdout.write("    /* ./contrib/make1305.py %s %s */\n" % tuple(args))
 s = ops[args[0]](int(args[1]))
 sys.stdout.write(s)
--- a/sshbn.c
+++ b/sshbn.c
@ -87,20 +87,17 @@ Bignum bn_power_2(int n)
 /*
 * Internal addition. Sets c = a + b, where 'a', 'b' and 'c' are all
- * big-endian arrays of 'len' BignumInts. Returns a BignumInt carried
+ * big-endian arrays of 'len' BignumInts. Returns the carry off the
- * off the top.
+ * top.
 */
-static BignumInt internal_add(const BignumInt *a, const BignumInt *b,
+static BignumCarry internal_add(const BignumInt *a, const BignumInt *b,
-                              BignumInt *c, int len)
+                                BignumInt *c, int len)
 {
    int i;
-    BignumDblInt carry = 0;
+    BignumCarry carry = 0;
-    for (i = len-1; i >= 0; i--) {
+    for (i = len-1; i >= 0; i--)
-        carry += (BignumDblInt)a[i] + b[i];
+        BignumADC(c[i], carry, a[i], b[i], carry);
        c[i] = (BignumInt)carry;
        carry >>= BIGNUM_INT_BITS;
    }
    return (BignumInt)carry;
 }
@ -114,13 +111,10 @@ static void internal_sub(const BignumInt *a, const BignumInt *b,
                         BignumInt *c, int len)
 {
    int i;
-    BignumDblInt carry = 1;
+    BignumCarry carry = 1;
-    for (i = len-1; i >= 0; i--) {
+    for (i = len-1; i >= 0; i--)
-        carry += (BignumDblInt)a[i] + (b[i] ^ BIGNUM_INT_MASK);
+        BignumADC(c[i], carry, a[i], ~b[i], carry);
        c[i] = (BignumInt)carry;
        carry >>= BIGNUM_INT_BITS;
    }
 }
 /*
@ -184,7 +178,7 @@ static void internal_mul(const BignumInt *a, const BignumInt *b,
        int toplen = len/2, botlen = len - toplen; /* botlen is the bigger */
        int midlen = botlen + 1;
-        BignumDblInt carry;
+        BignumCarry carry;
 #ifdef KARA_DEBUG
        int i;
 #endif
@ -313,9 +307,7 @@ static void internal_mul(const BignumInt *a, const BignumInt *b,
        i = 2*len - botlen - 2*midlen - 1;
        while (carry) {
            assert(i >= 0);
-            carry += c[i];
+            BignumADC(c[i], carry, c[i], 0, carry);
            c[i] = (BignumInt)carry;
            carry >>= BIGNUM_INT_BITS;
            i--;
        }
 #ifdef KARA_DEBUG
@ -329,7 +321,6 @@ static void internal_mul(const BignumInt *a, const BignumInt *b,
    } else {
        int i;
        BignumInt carry;
        BignumDblInt t;
        const BignumInt *ap, *bp;
        BignumInt *cp, *cps;
@ -342,11 +333,8 @@ static void internal_mul(const BignumInt *a, const BignumInt *b,
        for (cps = c + 2*len, ap = a + len; ap-- > a; cps--) {
            carry = 0;
-            for (cp = cps, bp = b + len; cp--, bp-- > b ;) {
+            for (cp = cps, bp = b + len; cp--, bp-- > b ;)
-                t = (MUL_WORD(*ap, *bp) + carry) + *cp;
+                BignumMULADD2(carry, *cp, *ap, *bp, *cp, carry);
                *cp = (BignumInt) t;
                carry = (BignumInt)(t >> BIGNUM_INT_BITS);
            }
            *cp = carry;
        }
    }
@ -431,7 +419,6 @@ static void internal_mul_low(const BignumInt *a, const BignumInt *b,
    } else {
        int i;
        BignumInt carry;
        BignumDblInt t;
        const BignumInt *ap, *bp;
        BignumInt *cp, *cps;
@ -444,11 +431,8 @@ static void internal_mul_low(const BignumInt *a, const BignumInt *b,
        for (cps = c + len, ap = a + len; ap-- > a; cps--) {
            carry = 0;
-            for (cp = cps, bp = b + len; bp--, cp-- > c ;) {
+            for (cp = cps, bp = b + len; bp--, cp-- > c ;)
-                t = (MUL_WORD(*ap, *bp) + carry) + *cp;
+                BignumMULADD2(carry, *cp, *ap, *bp, *cp, carry);
                *cp = (BignumInt) t;
                carry = (BignumInt)(t >> BIGNUM_INT_BITS);
            }
        }
    }
 }
@ -519,15 +503,23 @@ static void internal_add_shifted(BignumInt *number,
 {
    int word = 1 + (shift / BIGNUM_INT_BITS);
    int bshift = shift % BIGNUM_INT_BITS;
-    BignumDblInt addend;
+    BignumInt addendh, addendl;
    BignumCarry carry;
-    addend = (BignumDblInt)n << bshift;
+    addendl = n << bshift;
    addendh = (bshift == 0 ? 0 : n >> (BIGNUM_INT_BITS - bshift));
-    while (addend) {
+    assert(word <= number[0]);
    BignumADC(number[word], carry, number[word], addendl, 0);
    word++;
    if (!addendh && !carry)
        return;
    assert(word <= number[0]);
    BignumADC(number[word], carry, number[word], addendh, carry);
    word++;
    while (carry) {
        assert(word <= number[0]);
-	addend += number[word];
+        BignumADC(number[word], carry, number[word], 0, carry);
 	number[word] = (BignumInt) addend & BIGNUM_INT_MASK;
 	addend >>= BIGNUM_INT_BITS;
 	word++;
    }
 }
@ -555,8 +547,7 @@ static int bn_clz(BignumInt x)
 static BignumInt reciprocal_word(BignumInt d)
 {
-    BignumInt dshort, recip;
+    BignumInt dshort, recip, prodh, prodl;
    BignumDblInt product;
    int corrections;
    /*
@ -600,15 +591,16 @@ static BignumInt reciprocal_word(BignumInt d)
     * iteration, and the initial division above already gave us half
     * the output word, so it's only worth doing one iteration.
     */
-    product = MUL_WORD(recip, d);
+    BignumMULADD(prodh, prodl, recip, d, recip);
-    product += recip;
+    prodl = ~prodl;
-    product = -product;                /* the 2K shifts just off the top */
+    prodh = ~prodh;
-    product &= (((BignumDblInt)BIGNUM_INT_MASK << BIGNUM_INT_BITS) +
+    {
-                BIGNUM_INT_MASK);
+        BignumCarry c;
-    product >>= BIGNUM_INT_BITS;
+        BignumADC(prodl, c, prodl, 1, 0);
-    product = MUL_WORD(product, recip);
+        prodh += c;
-    product >>= (BIGNUM_INT_BITS-1);
+    }
-    recip = (BignumInt)product;
+    BignumMUL(prodh, prodl, prodh, recip);
    recip = (prodh << 1) | (prodl >> (BIGNUM_INT_BITS-1));
    /*
     * Now make sure we have the best possible reciprocal estimate,
@ -616,18 +608,24 @@ static BignumInt reciprocal_word(BignumInt d)
     * way - not enough to bother with any better-thought-out kind of
     * correction loop.
     */
-    product = MUL_WORD(recip, d);
+    BignumMULADD(prodh, prodl, recip, d, recip);
    product += recip;
    corrections = 0;
-    if (product >= ((BignumDblInt)1 << (2*BIGNUM_INT_BITS-1))) {
+    if (prodh >= BIGNUM_TOP_BIT) {
        do {
-            product -= d;
+            BignumCarry c = 1;
            BignumADC(prodl, c, prodl, ~d, c); prodh += BIGNUM_INT_MASK + c;
            recip--;
            corrections++;
-        } while (product >= ((BignumDblInt)1 << (2*BIGNUM_INT_BITS-1)));
+        } while (prodh >= ((BignumInt)1 << (BIGNUM_INT_BITS-1)));
    } else {
-        while (product < ((BignumDblInt)1 << (2*BIGNUM_INT_BITS-1)) - d) {
+        while (1) {
-            product += d;
+            BignumInt newprodh, newprodl;
            BignumCarry c = 0;
            BignumADC(newprodl, c, prodl, d, c); newprodh = prodh + c;
            if (newprodh >= BIGNUM_TOP_BIT)
                break;
            prodh = newprodh;
            prodl = newprodl;
            recip++;
            corrections++;
        }
@ -679,7 +677,7 @@ static void internal_mod(BignumInt *a, int alen,
     * here.
     */
    for (i = 0; i <= alen - mlen ;) {
-	BignumDblInt product, subtmp, t;
+	BignumInt product;
        BignumInt aword, q;
        int shift, full_bitoffset, bitoffset, wordoffset;
@ -707,8 +705,11 @@ static void internal_mod(BignumInt *a, int alen,
        if (shift > 0 && i+1 < alen)
            aword |= a[i+1] >> (BIGNUM_INT_BITS - shift);
-        t = MUL_WORD(recip, aword);
+        {
-        q = (BignumInt)(t >> BIGNUM_INT_BITS);
+            BignumInt unused;
            BignumMUL(q, unused, recip, aword);
            (void)unused;
        }
 #ifdef DIVISION_DEBUG
        printf("i=%d, aword=%#0*llx, shift=%d, q=%#0*llx\n",
@ -784,27 +785,22 @@ static void internal_mod(BignumInt *a, int alen,
        wordoffset = alen - mlen - wordoffset;
        if (bitoffset == 0) {
-            BignumInt c = 1;
+            BignumCarry c = 1;
            BignumInt prev_hi_word = 0;
            for (k = mlen - 1; wordoffset+k >= i; k--) {
                BignumInt mword = k<0 ? 0 : m[k];
-                product = MUL_WORD(q, mword);
+                BignumMULADD(prev_hi_word, product, q, mword, prev_hi_word);
                product += prev_hi_word;
                prev_hi_word = product >> BIGNUM_INT_BITS;
 #ifdef DIVISION_DEBUG
                printf("  aligned sub: product word for m[%d] = %#0*llx\n",
                       k, BIGNUM_INT_BITS/4,
-                       (unsigned long long)(BignumInt)product);
+                       (unsigned long long)product);
 #endif
 #ifdef DIVISION_DEBUG
                printf("  aligned sub: subtrahend for a[%d] = %#0*llx\n",
                       wordoffset+k, BIGNUM_INT_BITS/4,
-                       (unsigned long long)(BignumInt)product);
+                       (unsigned long long)product);
 #endif
-                subtmp = (BignumDblInt)a[wordoffset+k] +
+                BignumADC(a[wordoffset+k], c, a[wordoffset+k], ~product, c);
                    ((BignumInt)product ^ BIGNUM_INT_MASK) + c;
                a[wordoffset+k] = (BignumInt)subtmp;
                c = subtmp >> BIGNUM_INT_BITS;
            }
        } else {
            BignumInt add_word = 0;
@ -812,28 +808,23 @@ static void internal_mod(BignumInt *a, int alen,
            BignumInt prev_hi_word = 0;
            for (k = mlen - 1; wordoffset+k >= i; k--) {
                BignumInt mword = k<0 ? 0 : m[k];
-                product = MUL_WORD(q, mword);
+                BignumMULADD(prev_hi_word, product, q, mword, prev_hi_word);
                product += prev_hi_word;
                prev_hi_word = product >> BIGNUM_INT_BITS;
 #ifdef DIVISION_DEBUG
                printf("  unaligned sub: product word for m[%d] = %#0*llx\n",
                       k, BIGNUM_INT_BITS/4,
-                       (unsigned long long)(BignumInt)product);
+                       (unsigned long long)product);
 #endif
-                add_word |= (BignumInt)product << bitoffset;
+                add_word |= product << bitoffset;
 #ifdef DIVISION_DEBUG
                printf("  unaligned sub: subtrahend for a[%d] = %#0*llx\n",
                       wordoffset+k,
                       BIGNUM_INT_BITS/4, (unsigned long long)add_word);
 #endif
-                subtmp = (BignumDblInt)a[wordoffset+k] +
+                BignumADC(a[wordoffset+k], c, a[wordoffset+k], ~add_word, c);
                    (add_word ^ BIGNUM_INT_MASK) + c;
                a[wordoffset+k] = (BignumInt)subtmp;
                c = subtmp >> BIGNUM_INT_BITS;
-                add_word = (BignumInt)product >> (BIGNUM_INT_BITS - bitoffset);
+                add_word = product >> (BIGNUM_INT_BITS - bitoffset);
            }
        }
@ -917,14 +908,11 @@ static void internal_mod(BignumInt *a, int alen,
     * subtract m, and increment the quotient.
     */
    {
-        BignumInt c = 1;
+        BignumCarry c = 1;
        for (i = alen - 1; i >= 0; i--) {
            int mindex = mlen-alen+i;
            BignumInt mword = mindex < 0 ? 0 : m[mindex];
-            BignumDblInt subtmp = (BignumDblInt)a[i] +
+            BignumADC(a[i], c, a[i], ~mword, c);
                ((BignumInt)mword ^ BIGNUM_INT_MASK) + c;
            a[i] = (BignumInt)subtmp;
            c = subtmp >> BIGNUM_INT_BITS;
        }
    }
    if (quot)
@ -1767,12 +1755,11 @@ Bignum bigmuladd(Bignum a, Bignum b, Bignum addend)
    /* now add in the addend, if any */
    if (addend) {
-	BignumDblInt carry = 0;
+	BignumCarry carry = 0;
 	for (i = 1; i <= rlen; i++) {
-	    carry += (i <= (int)ret[0] ? ret[i] : 0);
+            BignumInt retword = (i <= (int)ret[0] ? ret[i] : 0);
-	    carry += (i <= (int)addend[0] ? addend[i] : 0);
+            BignumInt addword = (i <= (int)addend[0] ? addend[i] : 0);
-	    ret[i] = (BignumInt) carry & BIGNUM_INT_MASK;
+            BignumADC(ret[i], carry, retword, addword, carry);
 	    carry >>= BIGNUM_INT_BITS;
 	    if (ret[i] != 0 && i > maxspot)
 		maxspot = i;
 	}
@ -1801,17 +1788,16 @@ Bignum bigadd(Bignum a, Bignum b)
    int rlen = (alen > blen ? alen : blen) + 1;
    int i, maxspot;
    Bignum ret;
-    BignumDblInt carry;
+    BignumCarry carry;
    ret = newbn(rlen);
    carry = 0;
    maxspot = 0;
    for (i = 1; i <= rlen; i++) {
-        carry += (i <= (int)a[0] ? a[i] : 0);
+        BignumInt aword = (i <= (int)a[0] ? a[i] : 0);
-        carry += (i <= (int)b[0] ? b[i] : 0);
+        BignumInt bword = (i <= (int)b[0] ? b[i] : 0);
-        ret[i] = (BignumInt) carry & BIGNUM_INT_MASK;
+        BignumADC(ret[i], carry, aword, bword, carry);
        carry >>= BIGNUM_INT_BITS;
        if (ret[i] != 0 && i > maxspot)
            maxspot = i;
    }
@ -1831,17 +1817,16 @@ Bignum bigsub(Bignum a, Bignum b)
    int rlen = (alen > blen ? alen : blen);
    int i, maxspot;
    Bignum ret;
-    BignumDblInt carry;
+    BignumCarry carry;
    ret = newbn(rlen);
    carry = 1;
    maxspot = 0;
    for (i = 1; i <= rlen; i++) {
-        carry += (i <= (int)a[0] ? a[i] : 0);
+        BignumInt aword = (i <= (int)a[0] ? a[i] : 0);
-        carry += (i <= (int)b[0] ? b[i] ^ BIGNUM_INT_MASK : BIGNUM_INT_MASK);
+        BignumInt bword = (i <= (int)b[0] ? b[i] : 0);
-        ret[i] = (BignumInt) carry & BIGNUM_INT_MASK;
+        BignumADC(ret[i], carry, aword, ~bword, carry);
        carry >>= BIGNUM_INT_BITS;
        if (ret[i] != 0 && i > maxspot)
            maxspot = i;
    }
@ -1881,40 +1866,52 @@ Bignum bignum_bitmask(Bignum n)
 }
 /*
- * Convert a (max 32-bit) long into a bignum.
+ * Convert an unsigned long into a bignum.
 */
-Bignum bignum_from_long(unsigned long nn)
+Bignum bignum_from_long(unsigned long n)
 {
    const int maxwords =
        (sizeof(unsigned long) + sizeof(BignumInt) - 1) / sizeof(BignumInt);
    Bignum ret;
-    BignumDblInt n = nn;
+    int i;
    ret = newbn(maxwords);
    ret[0] = 0;
    for (i = 0; i < maxwords; i++) {
        ret[i+1] = n >> (i * BIGNUM_INT_BITS);
        if (ret[i+1] != 0)
            ret[0] = i+1;
    }
    ret = newbn(3);
    ret[1] = (BignumInt)(n & BIGNUM_INT_MASK);
    ret[2] = (BignumInt)((n >> BIGNUM_INT_BITS) & BIGNUM_INT_MASK);
    ret[3] = 0;
    ret[0] = (ret[2]  ? 2 : 1);
    return ret;
 }
 /*
 * Add a long to a bignum.
 */
-Bignum bignum_add_long(Bignum number, unsigned long addendx)
+Bignum bignum_add_long(Bignum number, unsigned long n)
 {
-    Bignum ret = newbn(number[0] + 1);
+    const int maxwords =
-    int i, maxspot = 0;
+        (sizeof(unsigned long) + sizeof(BignumInt) - 1) / sizeof(BignumInt);
-    BignumDblInt carry = 0, addend = addendx;
+    Bignum ret;
    int words, i;
    BignumCarry carry;
-    for (i = 1; i <= (int)ret[0]; i++) {
+    words = number[0];
-	carry += addend & BIGNUM_INT_MASK;
+    if (words < maxwords)
-	carry += (i <= (int)number[0] ? number[i] : 0);
+        words = maxwords;
-	addend >>= BIGNUM_INT_BITS;
+    words++;
-	ret[i] = (BignumInt) carry & BIGNUM_INT_MASK;
+    ret = newbn(words);
-	carry >>= BIGNUM_INT_BITS;
+
-	if (ret[i] != 0)
+    carry = 0;
-	    maxspot = i;
+    ret[0] = 0;
    for (i = 0; i < words; i++) {
        BignumInt nword = (i < maxwords ? n >> (i * BIGNUM_INT_BITS) : 0);
        BignumInt numword = (i < number[0] ? number[i+1] : 0);
        BignumADC(ret[i+1], carry, numword, nword, carry);
 	if (ret[i+1] != 0)
            ret[0] = i+1;
    }
    ret[0] = maxspot;
    return ret;
 }
@ -1923,13 +1920,17 @@ Bignum bignum_add_long(Bignum number, unsigned long addendx)
 */
 unsigned short bignum_mod_short(Bignum number, unsigned short modulus)
 {
-    BignumDblInt mod, r;
+    unsigned long mod = modulus, r = 0;
    /* Precompute (BIGNUM_INT_MASK+1) % mod */
    unsigned long base_r = (BIGNUM_INT_MASK - modulus + 1) % mod;
    int i;
-    r = 0;
+    for (i = number[0]; i > 0; i--) {
-    mod = modulus;
+        /*
-    for (i = number[0]; i > 0; i--)
+         * Conceptually, ((r << BIGNUM_INT_BITS) + number[i]) % mod
-	r = (r * (BIGNUM_TOP_BIT % mod) * 2 + number[i] % mod) % mod;
+         */
        r = ((r * base_r) + (number[i] % mod)) % mod;
    }
    return (unsigned short) r;
 }
@ -2087,7 +2088,7 @@ char *bignum_decimal(Bignum x)
 {
    int ndigits, ndigit;
    int i, iszero;
-    BignumDblInt carry;
+    BignumInt carry;
    char *ret;
    BignumInt *workspace;
@ -2134,11 +2135,33 @@ char *bignum_decimal(Bignum x)
 	iszero = 1;
 	carry = 0;
 	for (i = 0; i < (int)x[0]; i++) {
-	    carry = (carry << BIGNUM_INT_BITS) + workspace[i];
+            /*
-	    workspace[i] = (BignumInt) (carry / 10);
+             * Conceptually, we want to compute
             *
             *   (carry << BIGNUM_INT_BITS) + workspace[i]
             *   -----------------------------------------
             *                      10
             *
             * but we don't have an integer type longer than BignumInt
             * to work with. So we have to do it in pieces.
             */
            BignumInt q, r;
            q = workspace[i] / 10;
            r = workspace[i] % 10;
            /* I want (BIGNUM_INT_MASK+1)/10 but can't say so directly! */
            q += carry * ((BIGNUM_INT_MASK-9) / 10 + 1);
            r += carry * ((BIGNUM_INT_MASK-9) % 10);
            q += r / 10;
            r %= 10;
 	    workspace[i] = q;
 	    carry = r;
 	    if (workspace[i])
 		iszero = 0;
 	    carry %= 10;
 	}
 	ret[--ndigit] = (char) (carry + '0');
    } while (!iszero);
--- a/sshbn.h
+++ b/sshbn.h
@ -3,58 +3,171 @@
 * multiply macros used throughout the bignum code to treat numbers as
 * arrays of the most conveniently sized word for the target machine.
 * Exported so that other code (e.g. poly1305) can use it too.
 *
 * This file must export, in whatever ifdef branch it ends up in:
 *
 *  - two types: 'BignumInt' and 'BignumCarry'. BignumInt is an
 *    unsigned integer type which will be used as the base word size
 *    for all bignum operations. BignumCarry is an unsigned integer
 *    type used to hold the carry flag taken as input and output by
 *    the BignumADC macro (see below).
 *
 *  - four constant macros: BIGNUM_INT_BITS, BIGNUM_INT_BYTES,
 *    BIGNUM_TOP_BIT, BIGNUM_INT_MASK. These should be more or less
 *    self-explanatory, but just in case, they give the number of bits
 *    in BignumInt, the number of bytes that works out to, the
 *    BignumInt value consisting of only the top bit, and the
 *    BignumInt value with all bits set.
 *
 *  - four statement macros: BignumADC, BignumMUL, BignumMULADD,
 *    BignumMULADD2. These do various kinds of multi-word arithmetic,
 *    and all produce two output values.
 *     * BignumADC(ret,retc,a,b,c) takes input BignumInt values a,b
 *       and a BignumCarry c, and outputs a BignumInt ret = a+b+c and
 *       a BignumCarry retc which is the carry off the top of that
 *       addition.
 *     * BignumMUL(rh,rl,a,b) returns the two halves of the
 *       double-width product a*b.
 *     * BignumMULADD(rh,rl,a,b,addend) returns the two halves of the
 *       double-width value a*b + addend.
 *     * BignumMULADD2(rh,rl,a,b,addend1,addend2) returns the two
 *       halves of the double-width value a*b + addend1 + addend2.
 *
 * Every branch of the main ifdef below defines the type BignumInt and
 * the value BIGNUM_INT_BITS. The other three constant macros are
 * filled in by common code further down.
 *
 * Most branches also define a macro DEFINE_BIGNUMDBLINT containing a
 * typedef statement which declares a type _twice_ the length of a
 * BignumInt. This causes the common code further down to produce a
 * default implementation of the four statement macros in terms of
 * that double-width type, and also to define BignumCarry to be
 * BignumInt.
 *
 * However, if a particular compile target does not have a type twice
 * the length of the BignumInt you want to use but it does provide
 * some alternative means of doing add-with-carry and double-word
 * multiply, then the ifdef branch in question can just define
 * BignumCarry and the four statement macros itself, and that's fine
 * too.
 */
 #if defined __SIZEOF_INT128__
-/* gcc and clang both provide a __uint128_t type on 64-bit targets
+
- * (and, when they do, indicate its presence by the above macro),
+  /*
- * using the same 'two machine registers' kind of code generation that
+   * 64-bit BignumInt using gcc/clang style 128-bit BignumDblInt.
- * 32-bit targets use for 64-bit ints. If we have one of these, we can
+   *
- * use a 64-bit BignumInt and a 128-bit BignumDblInt. */
+   * gcc and clang both provide a __uint128_t type on 64-bit targets
-typedef unsigned long long BignumInt;
+   * (and, when they do, indicate its presence by the above macro),
-typedef __uint128_t BignumDblInt;
+   * using the same 'two machine registers' kind of code generation
-#define BIGNUM_INT_MASK  0xFFFFFFFFFFFFFFFFULL
+   * that 32-bit targets use for 64-bit ints.
-#define BIGNUM_TOP_BIT   0x8000000000000000ULL
+   */
-#define BIGNUM_INT_BITS  64
+
-#define MUL_WORD(w1, w2) ((BignumDblInt)w1 * w2)
+  typedef unsigned long long BignumInt;
-#elif defined __GNUC__ && defined __i386__
+  #define BIGNUM_INT_BITS 64
-typedef unsigned long BignumInt;
+  #define DEFINE_BIGNUMDBLINT typedef __uint128_t BignumDblInt
-typedef unsigned long long BignumDblInt;
+
-#define BIGNUM_INT_MASK  0xFFFFFFFFUL
+#elif defined __GNUC__ || defined _LLP64 || __STDC__ >= 199901L
-#define BIGNUM_TOP_BIT   0x80000000UL
+
-#define BIGNUM_INT_BITS  32
+  /* 32-bit BignumInt, using C99 unsigned long long as BignumDblInt */
-#define MUL_WORD(w1, w2) ((BignumDblInt)w1 * w2)
+
  typedef unsigned int BignumInt;
  #define BIGNUM_INT_BITS 32
  #define DEFINE_BIGNUMDBLINT typedef unsigned long long BignumDblInt
 #elif defined _MSC_VER && defined _M_IX86
-typedef unsigned __int32 BignumInt;
+
-typedef unsigned __int64 BignumDblInt;
+  /* 32-bit BignumInt, using Visual Studio __int64 as BignumDblInt */
-#define BIGNUM_INT_MASK  0xFFFFFFFFUL
+
-#define BIGNUM_TOP_BIT   0x80000000UL
+  typedef unsigned int BignumInt;
-#define BIGNUM_INT_BITS  32
+  #define BIGNUM_INT_BITS  32
-#define MUL_WORD(w1, w2) ((BignumDblInt)w1 * w2)
+  #define DEFINE_BIGNUMDBLINT typedef unsigned __int64 BignumDblInt
 #elif defined _LP64
-/* 64-bit architectures can do 32x32->64 chunks at a time */
+
-typedef unsigned int BignumInt;
+  /*
-typedef unsigned long BignumDblInt;
+   * 32-bit BignumInt, using unsigned long itself as BignumDblInt.
-#define BIGNUM_INT_MASK  0xFFFFFFFFU
+   *
-#define BIGNUM_TOP_BIT   0x80000000U
+   * Only for platforms where long is 64 bits, of course.
-#define BIGNUM_INT_BITS  32
+   */
-#define MUL_WORD(w1, w2) ((BignumDblInt)w1 * w2)
+
-#elif defined _LLP64
+  typedef unsigned int BignumInt;
-/* 64-bit architectures in which unsigned long is 32 bits, not 64 */
+  #define BIGNUM_INT_BITS  32
-typedef unsigned long BignumInt;
+  #define DEFINE_BIGNUMDBLINT typedef unsigned long BignumDblInt
-typedef unsigned long long BignumDblInt;
+
 #define BIGNUM_INT_MASK  0xFFFFFFFFUL
 #define BIGNUM_TOP_BIT   0x80000000UL
 #define BIGNUM_INT_BITS  32
 #define MUL_WORD(w1, w2) ((BignumDblInt)w1 * w2)
 #else
-/* Fallback for all other cases */
+
-typedef unsigned short BignumInt;
+  /*
-typedef unsigned long BignumDblInt;
+   * 16-bit BignumInt, using unsigned long as BignumDblInt.
-#define BIGNUM_INT_MASK  0xFFFFU
+   *
-#define BIGNUM_TOP_BIT   0x8000U
+   * This is the final fallback for real emergencies: C89 guarantees
-#define BIGNUM_INT_BITS  16
+   * unsigned short/long to be at least the required sizes, so this
-#define MUL_WORD(w1, w2) ((BignumDblInt)w1 * w2)
+   * should work on any C implementation at all. But it'll be
   * noticeably slow, so if you find yourself in this case you
   * probably want to move heaven and earth to find an alternative!
   */
  typedef unsigned short BignumInt;
  #define BIGNUM_INT_BITS  16
  #define DEFINE_BIGNUMDBLINT typedef unsigned long BignumDblInt
 #endif
 /*
 * Common code across all branches of that ifdef: define the three
 * easy constant macros in terms of BIGNUM_INT_BITS.
 */
 #define BIGNUM_INT_BYTES (BIGNUM_INT_BITS / 8)
 #define BIGNUM_TOP_BIT (((BignumInt)1) << (BIGNUM_INT_BITS-1))
 #define BIGNUM_INT_MASK (BIGNUM_TOP_BIT | (BIGNUM_TOP_BIT-1))
 /*
 * Common code across _most_ branches of the ifdef: define a set of
 * statement macros in terms of the BignumDblInt type provided. In
 * this case, we also define BignumCarry to be the same thing as
 * BignumInt, for simplicity.
 */
 #ifdef DEFINE_BIGNUMDBLINT
  typedef BignumInt BignumCarry;
  #define BignumADC(ret, retc, a, b, c) do                        \
      {                                                           \
          DEFINE_BIGNUMDBLINT;                                    \
          BignumDblInt ADC_temp = (BignumInt)(a);                 \
          ADC_temp += (BignumInt)(b);                             \
          ADC_temp += (c);                                        \
          (ret) = (BignumInt)ADC_temp;                            \
          (retc) = (BignumCarry)(ADC_temp >> BIGNUM_INT_BITS);    \
      } while (0)
  #define BignumMUL(rh, rl, a, b) do                              \
      {                                                           \
          DEFINE_BIGNUMDBLINT;                                    \
          BignumDblInt MUL_temp = (BignumInt)(a);                 \
          MUL_temp *= (BignumInt)(b);                             \
          (rh) = (BignumInt)(MUL_temp >> BIGNUM_INT_BITS);        \
          (rl) = (BignumInt)(MUL_temp);                           \
      } while (0)
  #define BignumMULADD(rh, rl, a, b, addend) do                   \
      {                                                           \
          DEFINE_BIGNUMDBLINT;                                    \
          BignumDblInt MUL_temp = (BignumInt)(a);                 \
          MUL_temp *= (BignumInt)(b);                             \
          MUL_temp += (BignumInt)(addend);                        \
          (rh) = (BignumInt)(MUL_temp >> BIGNUM_INT_BITS);        \
          (rl) = (BignumInt)(MUL_temp);                           \
      } while (0)
  #define BignumMULADD2(rh, rl, a, b, addend1, addend2) do        \
      {                                                           \
          DEFINE_BIGNUMDBLINT;                                    \
          BignumDblInt MUL_temp = (BignumInt)(a);                 \
          MUL_temp *= (BignumInt)(b);                             \
          MUL_temp += (BignumInt)(addend1);                       \
          MUL_temp += (BignumInt)(addend2);                       \
          (rh) = (BignumInt)(MUL_temp >> BIGNUM_INT_BITS);        \
          (rl) = (BignumInt)(MUL_temp);                           \
      } while (0)
 #endif /* DEFINE_BIGNUMDBLINT */
--- a/sshccp.c
+++ b/sshccp.c