1
0
mirror of https://git.tartarus.org/simon/putty.git synced 2025-06-30 19:12:48 -05:00

Rewrite the core divide function to not use DIVMOD_WORD.

DIVMOD_WORD is a portability hazard, because implementing it requires
either a way to get direct access to the x86 DIV instruction or
equivalent (be it inline assembler or a compiler intrinsic), or else
an integer type we can use as BignumDblInt. But I'm starting to think
about porting to 64-bit Visual Studio with a 64-bit BignumInt, and in
that situation neither of those options will be available.

I could write a piece of _out_-of-line x86-64 assembler in a separate
source file and put a function call in DIVMOD_WORD, but instead I've
decided to solve the problem in a more futureproof way: remove
DIVMOD_WORD totally and write a division function that doesn't need it
at all, solving not only today's porting headache but all future ones
in this area.

The new implementation works by precomputing (a good enough
approximation to) the leading word of the reciprocal of the modulus,
and then getting each word of quotient by multiplying by that
reciprocal, where we previously used DIVMOD_WORD to divide by the
leading word of the actual modulus. The reciprocal itself is computed
outside internal_mod() and passed in as a parameter, allowing me to
save time by only computing it once when I'm about to do a modpow.

To some extent this complicates the implementation: the advantage of
DIVMOD_WORD was that it yielded a full word q of quotient every time
it was used, so the subtraction of q*m from the input could be done in
a nicely word-aligned way. But the reciprocal multiply approach yields
_almost_ a full word of quotient, because you have to make the
reciprocal a bit short to avoid overflow at multiplication time. For a
start, this means we have to do fractionally more iterations of the
main loop; but more painfully, we can no longer depend on the
subtraction of q*m at every step being word-aligned, and instead we
have to be prepared to do it at any bit shift.

But the flip side is that once we've implemented that, the rest of the
algorithm becomes a lot less full of horrible special cases: in
particular, we can now completely throw away the horribleness at all
the call sites where we shift the modulus up by a fractional word to
set its top bit, and then have to do a little dance to get the last
few bits of quotient involving a second call to internal_mod.

So there are points both for and against the new implementation in
simplicity terms; but I think on balance it's more comprehensible than
the old one, and a quick timing test suggests it also ends up a touch
faster overall - the new testbn gets through the output of
testdata/bignum.py in 4.034s where the old one took 4.392s.
This commit is contained in:
Simon Tatham
2015-12-13 14:46:43 +00:00
parent 984792e9f4
commit 482b4ab872
2 changed files with 449 additions and 207 deletions

55
sshbn.h
View File

@ -1,23 +1,8 @@
/*
* sshbn.h: the assorted conditional definitions of BignumInt and
* multiply/divide macros used throughout the bignum code to treat
* numbers as arrays of the most conveniently sized word for the
* target machine. Exported so that other code (e.g. poly1305) can use
* it too.
*/
/*
* Usage notes:
* * Do not call the DIVMOD_WORD macro with expressions such as array
* subscripts, as some implementations object to this (see below).
* * Note that none of the division methods below will cope if the
* quotient won't fit into BIGNUM_INT_BITS. Callers should be careful
* to avoid this case.
* If this condition occurs, in the case of the x86 DIV instruction,
* an overflow exception will occur, which (according to a correspondent)
* will manifest on Windows as something like
* 0xC0000095: Integer overflow
* The C variant won't give the right answer, either.
* multiply macros used throughout the bignum code to treat numbers as
* arrays of the most conveniently sized word for the target machine.
* Exported so that other code (e.g. poly1305) can use it too.
*/
#if defined __SIZEOF_INT128__
@ -32,11 +17,6 @@ typedef __uint128_t BignumDblInt;
#define BIGNUM_TOP_BIT 0x8000000000000000ULL
#define BIGNUM_INT_BITS 64
#define MUL_WORD(w1, w2) ((BignumDblInt)w1 * w2)
#define DIVMOD_WORD(q, r, hi, lo, w) do { \
BignumDblInt n = (((BignumDblInt)hi) << BIGNUM_INT_BITS) | lo; \
q = n / w; \
r = n % w; \
} while (0)
#elif defined __GNUC__ && defined __i386__
typedef unsigned long BignumInt;
typedef unsigned long long BignumDblInt;
@ -44,10 +24,6 @@ typedef unsigned long long BignumDblInt;
#define BIGNUM_TOP_BIT 0x80000000UL
#define BIGNUM_INT_BITS 32
#define MUL_WORD(w1, w2) ((BignumDblInt)w1 * w2)
#define DIVMOD_WORD(q, r, hi, lo, w) \
__asm__("div %2" : \
"=d" (r), "=a" (q) : \
"r" (w), "d" (hi), "a" (lo))
#elif defined _MSC_VER && defined _M_IX86
typedef unsigned __int32 BignumInt;
typedef unsigned __int64 BignumDblInt;
@ -55,16 +31,6 @@ typedef unsigned __int64 BignumDblInt;
#define BIGNUM_TOP_BIT 0x80000000UL
#define BIGNUM_INT_BITS 32
#define MUL_WORD(w1, w2) ((BignumDblInt)w1 * w2)
/* Note: MASM interprets array subscripts in the macro arguments as
* assembler syntax, which gives the wrong answer. Don't supply them.
* <http://msdn2.microsoft.com/en-us/library/bf1dw62z.aspx> */
#define DIVMOD_WORD(q, r, hi, lo, w) do { \
__asm mov edx, hi \
__asm mov eax, lo \
__asm div w \
__asm mov r, edx \
__asm mov q, eax \
} while(0)
#elif defined _LP64
/* 64-bit architectures can do 32x32->64 chunks at a time */
typedef unsigned int BignumInt;
@ -73,11 +39,6 @@ typedef unsigned long BignumDblInt;
#define BIGNUM_TOP_BIT 0x80000000U
#define BIGNUM_INT_BITS 32
#define MUL_WORD(w1, w2) ((BignumDblInt)w1 * w2)
#define DIVMOD_WORD(q, r, hi, lo, w) do { \
BignumDblInt n = (((BignumDblInt)hi) << BIGNUM_INT_BITS) | lo; \
q = n / w; \
r = n % w; \
} while (0)
#elif defined _LLP64
/* 64-bit architectures in which unsigned long is 32 bits, not 64 */
typedef unsigned long BignumInt;
@ -86,11 +47,6 @@ typedef unsigned long long BignumDblInt;
#define BIGNUM_TOP_BIT 0x80000000UL
#define BIGNUM_INT_BITS 32
#define MUL_WORD(w1, w2) ((BignumDblInt)w1 * w2)
#define DIVMOD_WORD(q, r, hi, lo, w) do { \
BignumDblInt n = (((BignumDblInt)hi) << BIGNUM_INT_BITS) | lo; \
q = n / w; \
r = n % w; \
} while (0)
#else
/* Fallback for all other cases */
typedef unsigned short BignumInt;
@ -99,11 +55,6 @@ typedef unsigned long BignumDblInt;
#define BIGNUM_TOP_BIT 0x8000U
#define BIGNUM_INT_BITS 16
#define MUL_WORD(w1, w2) ((BignumDblInt)w1 * w2)
#define DIVMOD_WORD(q, r, hi, lo, w) do { \
BignumDblInt n = (((BignumDblInt)hi) << BIGNUM_INT_BITS) | lo; \
q = n / w; \
r = n % w; \
} while (0)
#endif
#define BIGNUM_INT_BYTES (BIGNUM_INT_BITS / 8)