From f8b27925eee6a37df107a7cd2e718e997a52516e Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Mon, 8 Jun 2015 19:24:58 +0100 Subject: [PATCH] Use 64-bit BignumInt wherever __uint128_t is available. gcc and clang both provide a type called __uint128_t when compiling for 64-bit targets, code-generated more or less similarly to the way 64-bit long longs are handled on 32-bit targets (spanning two registers, using ADD/ADC, that sort of thing). Where this is available (and they also provide a handy macro to make it easy to detect), we should obviously use it, so that we can handle bignums a larger chunk at a time and make use of the full width of the hardware's multiplier. Preliminary benchmarking using 'testbn' suggests a factor of about 2.5 improvement. I've added the new possibility to the ifdefs in sshbn.h, and also re-run contrib/make1305.py to generate a set of variants of the poly1305 arithmetic for the new size of BignumInt. --- sshbn.h | 19 ++++++++- sshccp.c | 120 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 135 insertions(+), 4 deletions(-) diff --git a/sshbn.h b/sshbn.h index 3d15b948..a043241e 100644 --- a/sshbn.h +++ b/sshbn.h @@ -20,7 +20,24 @@ * The C variant won't give the right answer, either. */ -#if defined __GNUC__ && defined __i386__ +#if defined __SIZEOF_INT128__ +/* gcc and clang both provide a __uint128_t type on 64-bit targets + * (and, when they do, indicate its presence by the above macro), + * using the same 'two machine registers' kind of code generation that + * 32-bit targets use for 64-bit ints. If we have one of these, we can + * use a 64-bit BignumInt and a 128-bit BignumDblInt. */ +typedef __uint64_t BignumInt; +typedef __uint128_t BignumDblInt; +#define BIGNUM_INT_MASK 0xFFFFFFFFFFFFFFFFULL +#define BIGNUM_TOP_BIT 0x8000000000000000ULL +#define BIGNUM_INT_BITS 64 +#define MUL_WORD(w1, w2) ((BignumDblInt)w1 * w2) +#define DIVMOD_WORD(q, r, hi, lo, w) do { \ + BignumDblInt n = (((BignumDblInt)hi) << BIGNUM_INT_BITS) | lo; \ + q = n / w; \ + r = n % w; \ +} while (0) +#elif defined __GNUC__ && defined __i386__ typedef unsigned long BignumInt; typedef unsigned long long BignumDblInt; #define BIGNUM_INT_MASK 0xFFFFFFFFUL diff --git a/sshccp.c b/sshccp.c index 71fde427..35aa43fb 100644 --- a/sshccp.c +++ b/sshccp.c @@ -215,7 +215,23 @@ static void bigval_export_le(const bigval *r, void *vdata, int len) */ static void bigval_add(bigval *r, const bigval *a, const bigval *b) { -#if BIGNUM_INT_BITS == 32 +#if BIGNUM_INT_BITS == 64 + /* ./contrib/make1305.py add 64 */ + BignumDblInt acclo; + acclo = 0; + acclo += a->w[0]; + acclo += b->w[0]; + r->w[0] = acclo; + acclo >>= 64; + acclo += a->w[1]; + acclo += b->w[1]; + r->w[1] = acclo; + acclo >>= 64; + acclo += a->w[2]; + acclo += b->w[2]; + r->w[2] = acclo; + acclo >>= 64; +#elif BIGNUM_INT_BITS == 32 /* ./contrib/make1305.py add 32 */ BignumDblInt acclo; acclo = 0; @@ -290,7 +306,84 @@ static void bigval_add(bigval *r, const bigval *a, const bigval *b) */ static void bigval_mul_mod_p(bigval *r, const bigval *a, const bigval *b) { -#if BIGNUM_INT_BITS == 32 +#if BIGNUM_INT_BITS == 64 + /* ./contrib/make1305.py mul 64 */ + BignumDblInt tmp; + BignumDblInt acclo; + BignumDblInt acchi; + BignumDblInt acc2lo; + acclo = 0; + acchi = 0; + tmp = (BignumDblInt)(a->w[0]) * (b->w[0]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 64; + r->w[0] = acclo; + acclo = acchi + (acclo >> 64); + acchi = 0; + tmp = (BignumDblInt)(a->w[0]) * (b->w[1]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 64; + tmp = (BignumDblInt)(a->w[1]) * (b->w[0]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 64; + r->w[1] = acclo; + acclo = acchi + (acclo >> 64); + acchi = 0; + tmp = (BignumDblInt)(a->w[0]) * (b->w[2]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 64; + tmp = (BignumDblInt)(a->w[1]) * (b->w[1]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 64; + tmp = (BignumDblInt)(a->w[2]) * (b->w[0]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 64; + r->w[2] = acclo & (((BignumInt)1 << 2)-1); + acc2lo = 0; + acc2lo += ((acclo >> 2) & (((BignumInt)1 << 62)-1)) * ((BignumDblInt)5 << 0); + acclo = acchi + (acclo >> 64); + acchi = 0; + tmp = (BignumDblInt)(a->w[1]) * (b->w[2]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 64; + tmp = (BignumDblInt)(a->w[2]) * (b->w[1]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 64; + acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 62); + acc2lo += r->w[0]; + r->w[0] = acc2lo; + acc2lo >>= 64; + acc2lo += ((acclo >> 2) & (((BignumInt)1 << 62)-1)) * ((BignumDblInt)5 << 0); + acclo = acchi + (acclo >> 64); + acchi = 0; + tmp = (BignumDblInt)(a->w[2]) * (b->w[2]); + acclo += tmp & BIGNUM_INT_MASK; + acchi += tmp >> 64; + acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 62); + acc2lo += r->w[1]; + r->w[1] = acc2lo; + acc2lo >>= 64; + acc2lo += ((acclo >> 2) & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 0); + acc2lo += r->w[2]; + r->w[2] = acc2lo; + acc2lo = 0; + acc2lo += ((acclo >> 4) & (((BignumInt)1 << 60)-1)) * ((BignumDblInt)25 << 0); + acclo = acchi + (acclo >> 64); + acchi = 0; + acc2lo += (acclo & (((BignumInt)1 << 4)-1)) * ((BignumDblInt)25 << 60); + acc2lo += r->w[0]; + r->w[0] = acc2lo; + acc2lo >>= 64; + acc2lo += ((acclo >> 4) & (((BignumInt)1 << 60)-1)) * ((BignumDblInt)25 << 0); + acclo = acchi + (acclo >> 64); + acchi = 0; + acc2lo += r->w[1]; + r->w[1] = acc2lo; + acc2lo >>= 64; + acc2lo += r->w[2]; + r->w[2] = acc2lo; + acc2lo >>= 64; +#elif BIGNUM_INT_BITS == 32 /* ./contrib/make1305.py mul 32 */ BignumDblInt tmp; BignumDblInt acclo; @@ -819,7 +912,28 @@ static void bigval_mul_mod_p(bigval *r, const bigval *a, const bigval *b) static void bigval_final_reduce(bigval *n) { -#if BIGNUM_INT_BITS == 32 +#if BIGNUM_INT_BITS == 64 + /* ./contrib/make1305.py final_reduce 64 */ + BignumDblInt acclo; + acclo = 0; + acclo += 5 * ((n->w[2] >> 2) + 1); + acclo += n->w[0]; + acclo >>= 64; + acclo += n->w[1]; + acclo >>= 64; + acclo += n->w[2]; + acclo = 5 * (acclo >> 2); + acclo += n->w[0]; + n->w[0] = acclo; + acclo >>= 64; + acclo += n->w[1]; + n->w[1] = acclo; + acclo >>= 64; + acclo += n->w[2]; + n->w[2] = acclo; + acclo >>= 64; + n->w[2] &= (1 << 2) - 1; +#elif BIGNUM_INT_BITS == 32 /* ./contrib/make1305.py final_reduce 32 */ BignumDblInt acclo; acclo = 0;