1
0
mirror of https://git.tartarus.org/simon/putty.git synced 2025-01-09 17:38:00 +00:00

Use 64-bit BignumInt wherever __uint128_t is available.

gcc and clang both provide a type called __uint128_t when compiling
for 64-bit targets, code-generated more or less similarly to the way
64-bit long longs are handled on 32-bit targets (spanning two
registers, using ADD/ADC, that sort of thing). Where this is available
(and they also provide a handy macro to make it easy to detect), we
should obviously use it, so that we can handle bignums a larger chunk
at a time and make use of the full width of the hardware's multiplier.
Preliminary benchmarking using 'testbn' suggests a factor of about 2.5
improvement.

I've added the new possibility to the ifdefs in sshbn.h, and also
re-run contrib/make1305.py to generate a set of variants of the
poly1305 arithmetic for the new size of BignumInt.
This commit is contained in:
Simon Tatham 2015-06-08 19:24:58 +01:00
parent e28b35b0a3
commit f8b27925ee
2 changed files with 135 additions and 4 deletions

19
sshbn.h
View File

@ -20,7 +20,24 @@
* The C variant won't give the right answer, either.
*/
#if defined __GNUC__ && defined __i386__
#if defined __SIZEOF_INT128__
/* gcc and clang both provide a __uint128_t type on 64-bit targets
* (and, when they do, indicate its presence by the above macro),
* using the same 'two machine registers' kind of code generation that
* 32-bit targets use for 64-bit ints. If we have one of these, we can
* use a 64-bit BignumInt and a 128-bit BignumDblInt. */
typedef __uint64_t BignumInt;
typedef __uint128_t BignumDblInt;
#define BIGNUM_INT_MASK 0xFFFFFFFFFFFFFFFFULL
#define BIGNUM_TOP_BIT 0x8000000000000000ULL
#define BIGNUM_INT_BITS 64
/*
 * MUL_WORD(w1, w2): product of two words, computed in BignumDblInt.
 * The first operand is widened before the multiplication so that the
 * result keeps all 128 bits. Both arguments are parenthesized so that
 * compound expressions (e.g. 'a + b') can be passed safely.
 */
#define MUL_WORD(w1, w2) ((BignumDblInt)(w1) * (w2))
/*
 * DIVMOD_WORD(q, r, hi, lo, w): form the double-width value
 * (hi << BIGNUM_INT_BITS) | lo, divide it by w, and assign the
 * quotient to q and the remainder to r.
 *
 * q and r must be assignable lvalues; the results are converted to
 * whatever type they have. w is evaluated twice and must be nonzero.
 * The internal temporary has a deliberately unusual name so that it
 * cannot capture a caller's variable appearing in the arguments.
 */
#define DIVMOD_WORD(q, r, hi, lo, w) do { \
    BignumDblInt bn_divmod_num_ = \
        (((BignumDblInt)(hi)) << BIGNUM_INT_BITS) | (lo); \
    (q) = bn_divmod_num_ / (w); \
    (r) = bn_divmod_num_ % (w); \
} while (0)
#elif defined __GNUC__ && defined __i386__
typedef unsigned long BignumInt;
typedef unsigned long long BignumDblInt;
#define BIGNUM_INT_MASK 0xFFFFFFFFUL

120
sshccp.c
View File

@ -215,7 +215,23 @@ static void bigval_export_le(const bigval *r, void *vdata, int len)
*/
static void bigval_add(bigval *r, const bigval *a, const bigval *b)
{
#if BIGNUM_INT_BITS == 32
#if BIGNUM_INT_BITS == 64
/* ./contrib/make1305.py add 64 */
BignumDblInt acclo;
acclo = 0;
acclo += a->w[0];
acclo += b->w[0];
r->w[0] = acclo;
acclo >>= 64;
acclo += a->w[1];
acclo += b->w[1];
r->w[1] = acclo;
acclo >>= 64;
acclo += a->w[2];
acclo += b->w[2];
r->w[2] = acclo;
acclo >>= 64;
#elif BIGNUM_INT_BITS == 32
/* ./contrib/make1305.py add 32 */
BignumDblInt acclo;
acclo = 0;
@ -290,7 +306,84 @@ static void bigval_add(bigval *r, const bigval *a, const bigval *b)
*/
static void bigval_mul_mod_p(bigval *r, const bigval *a, const bigval *b)
{
#if BIGNUM_INT_BITS == 32
#if BIGNUM_INT_BITS == 64
/* ./contrib/make1305.py mul 64 */
BignumDblInt tmp;
BignumDblInt acclo;
BignumDblInt acchi;
BignumDblInt acc2lo;
acclo = 0;
acchi = 0;
tmp = (BignumDblInt)(a->w[0]) * (b->w[0]);
acclo += tmp & BIGNUM_INT_MASK;
acchi += tmp >> 64;
r->w[0] = acclo;
acclo = acchi + (acclo >> 64);
acchi = 0;
tmp = (BignumDblInt)(a->w[0]) * (b->w[1]);
acclo += tmp & BIGNUM_INT_MASK;
acchi += tmp >> 64;
tmp = (BignumDblInt)(a->w[1]) * (b->w[0]);
acclo += tmp & BIGNUM_INT_MASK;
acchi += tmp >> 64;
r->w[1] = acclo;
acclo = acchi + (acclo >> 64);
acchi = 0;
tmp = (BignumDblInt)(a->w[0]) * (b->w[2]);
acclo += tmp & BIGNUM_INT_MASK;
acchi += tmp >> 64;
tmp = (BignumDblInt)(a->w[1]) * (b->w[1]);
acclo += tmp & BIGNUM_INT_MASK;
acchi += tmp >> 64;
tmp = (BignumDblInt)(a->w[2]) * (b->w[0]);
acclo += tmp & BIGNUM_INT_MASK;
acchi += tmp >> 64;
r->w[2] = acclo & (((BignumInt)1 << 2)-1);
acc2lo = 0;
acc2lo += ((acclo >> 2) & (((BignumInt)1 << 62)-1)) * ((BignumDblInt)5 << 0);
acclo = acchi + (acclo >> 64);
acchi = 0;
tmp = (BignumDblInt)(a->w[1]) * (b->w[2]);
acclo += tmp & BIGNUM_INT_MASK;
acchi += tmp >> 64;
tmp = (BignumDblInt)(a->w[2]) * (b->w[1]);
acclo += tmp & BIGNUM_INT_MASK;
acchi += tmp >> 64;
acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 62);
acc2lo += r->w[0];
r->w[0] = acc2lo;
acc2lo >>= 64;
acc2lo += ((acclo >> 2) & (((BignumInt)1 << 62)-1)) * ((BignumDblInt)5 << 0);
acclo = acchi + (acclo >> 64);
acchi = 0;
tmp = (BignumDblInt)(a->w[2]) * (b->w[2]);
acclo += tmp & BIGNUM_INT_MASK;
acchi += tmp >> 64;
acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 62);
acc2lo += r->w[1];
r->w[1] = acc2lo;
acc2lo >>= 64;
acc2lo += ((acclo >> 2) & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 0);
acc2lo += r->w[2];
r->w[2] = acc2lo;
acc2lo = 0;
acc2lo += ((acclo >> 4) & (((BignumInt)1 << 60)-1)) * ((BignumDblInt)25 << 0);
acclo = acchi + (acclo >> 64);
acchi = 0;
acc2lo += (acclo & (((BignumInt)1 << 4)-1)) * ((BignumDblInt)25 << 60);
acc2lo += r->w[0];
r->w[0] = acc2lo;
acc2lo >>= 64;
acc2lo += ((acclo >> 4) & (((BignumInt)1 << 60)-1)) * ((BignumDblInt)25 << 0);
acclo = acchi + (acclo >> 64);
acchi = 0;
acc2lo += r->w[1];
r->w[1] = acc2lo;
acc2lo >>= 64;
acc2lo += r->w[2];
r->w[2] = acc2lo;
acc2lo >>= 64;
#elif BIGNUM_INT_BITS == 32
/* ./contrib/make1305.py mul 32 */
BignumDblInt tmp;
BignumDblInt acclo;
@ -819,7 +912,28 @@ static void bigval_mul_mod_p(bigval *r, const bigval *a, const bigval *b)
static void bigval_final_reduce(bigval *n)
{
#if BIGNUM_INT_BITS == 32
#if BIGNUM_INT_BITS == 64
/* ./contrib/make1305.py final_reduce 64 */
BignumDblInt acclo;
acclo = 0;
acclo += 5 * ((n->w[2] >> 2) + 1);
acclo += n->w[0];
acclo >>= 64;
acclo += n->w[1];
acclo >>= 64;
acclo += n->w[2];
acclo = 5 * (acclo >> 2);
acclo += n->w[0];
n->w[0] = acclo;
acclo >>= 64;
acclo += n->w[1];
n->w[1] = acclo;
acclo >>= 64;
acclo += n->w[2];
n->w[2] = acclo;
acclo >>= 64;
n->w[2] &= (1 << 2) - 1;
#elif BIGNUM_INT_BITS == 32
/* ./contrib/make1305.py final_reduce 32 */
BignumDblInt acclo;
acclo = 0;