diff --git a/crypto/mpint.c b/crypto/mpint.c
index f015bd09..fba9431d 100644
--- a/crypto/mpint.c
+++ b/crypto/mpint.c
@@ -1539,41 +1539,112 @@ mp_int *monty_export(MontyContext *mc, mp_int *x)
     return toret;
 }
 
-static void monty_reduce(MontyContext *mc, mp_int *x)
-{
-    mp_int reduced = monty_reduce_internal(mc, x, *mc->scratch);
-    mp_copy_into(x, &reduced);
-    mp_clear(mc->scratch);
-}
-
+#define MODPOW_LOG2_WINDOW_SIZE 5
+#define MODPOW_WINDOW_SIZE (1 << MODPOW_LOG2_WINDOW_SIZE)
 mp_int *monty_pow(MontyContext *mc, mp_int *base, mp_int *exponent)
 {
-    /* square builds up powers of the form base^{2^i}. */
-    mp_int *square = mp_copy(base);
-    size_t i = 0;
+    /*
+     * Modular exponentiation is done from the top down, using a
+     * fixed-window technique.
+     *
+     * We have a table storing every power of the base from base^0 up
+     * to base^{w-1}, where w is a small power of 2, say 2^k. (k is
+     * defined above as MODPOW_LOG2_WINDOW_SIZE, and w = 2^k is
+     * defined as MODPOW_WINDOW_SIZE.)
+     *
+     * We break the exponent up into k-bit chunks, from the bottom up,
+     * that is
+     *
+     *   exponent = c_0 + 2^k c_1 + 2^{2k} c_2 + ... + 2^{nk} c_n
+     *
+     * and we compute base^exponent by computing in turn
+     *
+     *   base^{c_n}
+     *   base^{2^k c_n + c_{n-1}}
+     *   base^{2^{2k} c_n + 2^k c_{n-1} + c_{n-2}}
+     *   ...
+     *
+     * where each line is obtained by raising the previous line to the
+     * power 2^k (i.e. squaring it k times) and then multiplying in
+     * a value base^{c_i}, which we can look up in our table.
+     *
+     * Side-channel considerations: the exponent is secret, so
+     * actually doing a single table lookup by using a chunk of
+     * exponent bits as an array index would be an obvious leak of
+     * secret information into the cache. So instead, in each
+     * iteration, we read _all_ the table entries, and do a sequence
+     * of mp_select operations to leave just the one we wanted in the
+     * variable that will go into the multiplication. In other
+     * contexts (like software AES) that technique is so prohibitively
+     * slow that it makes you choose a strategy that doesn't use table
+     * lookups at all (we do bitslicing in preference); but here, this
+     * iteration through 2^k table elements is replacing k-1 bignum
+     * _multiplications_ that you'd have to use instead if you did
+     * simple square-and-multiply, and that makes it still a win.
+     */
 
-    /* out accumulates the output value. Starts at 1 (in Montgomery
-     * representation) and we multiply in each base^{2^i}. */
-    mp_int *out = mp_copy(mc->powers_of_r_mod_m[0]);
+    /* Table that holds base^0, ..., base^{w-1} */
+    mp_int *table[MODPOW_WINDOW_SIZE];
+    table[0] = mp_copy(monty_identity(mc));
+    for (size_t i = 1; i < MODPOW_WINDOW_SIZE; i++)
+        table[i] = monty_mul(mc, table[i-1], base);
 
-    /* tmp holds each product we compute and reduce. */
-    mp_int *tmp = mp_make_sized(mc->rw * 2);
+    /* out accumulates the output value */
+    mp_int *out = mp_make_sized(mc->rw);
+    mp_copy_into(out, monty_identity(mc));
+
+    /* table_entry will hold each value we get out of the table */
+    mp_int *table_entry = mp_make_sized(mc->rw);
+
+    /* Bit index of the chunk of bits we're working on. Start with the
+     * highest multiple of k strictly less than the size of our
+     * bignum, i.e. the highest-index chunk of bits that might
+     * conceivably contain any nonzero bit. */
+    size_t i = (exponent->nw * BIGNUM_INT_BITS) - 1;
+    i -= i % MODPOW_LOG2_WINDOW_SIZE;
+
+    bool first_iteration = true;
 
     while (true) {
-        mp_mul_into(tmp, out, square);
-        monty_reduce(mc, tmp);
-        mp_select_into(out, out, tmp, mp_get_bit(exponent, i));
+        /* Construct the table index from exponent bits i .. i+k-1 */
+        unsigned table_index = 0;
+        for (size_t j = 0; j < MODPOW_LOG2_WINDOW_SIZE; j++)
+            table_index |= mp_get_bit(exponent, i+j) << j;
 
-        if (++i >= exponent->nw * BIGNUM_INT_BITS)
+        /* Side-channel-safe lookup of table_entry = table[table_index]:
+         * visit every j; not_this_one is 0 iff j == table_index, else 1 */
+        mp_copy_into(table_entry, table[0]);
+        for (size_t j = 1; j < MODPOW_WINDOW_SIZE; j++) {
+            unsigned not_this_one =
+                ((table_index ^ j) + MODPOW_WINDOW_SIZE - 1)
+                >> MODPOW_LOG2_WINDOW_SIZE;
+            mp_select_into(table_entry, table[j], table_entry, not_this_one);
+        }
+
+        if (!first_iteration) {
+            /* Multiply into the output */
+            monty_mul_into(mc, out, out, table_entry);
+        } else {
+            /* On the first iteration, we can save one multiplication
+             * by just copying */
+            mp_copy_into(out, table_entry);
+            first_iteration = false;
+        }
+
+        /* If that was the bottommost chunk of bits, we're done */
+        if (i == 0)
             break;
 
-        mp_mul_into(tmp, square, square);
-        monty_reduce(mc, tmp);
-        mp_copy_into(square, tmp);
+        /* Otherwise, square k times and go round again. */
+        for (size_t j = 0; j < MODPOW_LOG2_WINDOW_SIZE; j++)
+            monty_mul_into(mc, out, out, out);
+
+        i-= MODPOW_LOG2_WINDOW_SIZE;
     }
 
-    mp_free(square);
-    mp_free(tmp);
+    for (size_t i = 0; i < MODPOW_WINDOW_SIZE; i++)
+        mp_free(table[i]);
+    mp_free(table_entry);
     mp_clear(mc->scratch);
     return out;
 }