diff --git a/sshkeygen.h b/sshkeygen.h index a965d3a6..f8c3a370 100644 --- a/sshkeygen.h +++ b/sshkeygen.h @@ -261,6 +261,9 @@ static inline mp_int *primegen_generate( { return ctx->vt->generate(ctx, pcs, prog); } extern const PrimeGenerationPolicy primegen_probabilistic; +extern const PrimeGenerationPolicy primegen_provable_fast; +extern const PrimeGenerationPolicy primegen_provable_maurer_simple; +extern const PrimeGenerationPolicy primegen_provable_maurer_complex; /* ---------------------------------------------------------------------- * The overall top-level API for generating entire key pairs. diff --git a/sshprime.c b/sshprime.c index 4431204c..30ad4b14 100644 --- a/sshprime.c +++ b/sshprime.c @@ -110,6 +110,588 @@ const PrimeGenerationPolicy primegen_probabilistic = { probprime_generate, }; +/* ---------------------------------------------------------------------- + * Alternative provable-prime algorithm, based on the following paper: + * + * [MAURER] Maurer, U.M. Fast generation of prime numbers and secure + * public-key cryptographic parameters. J. Cryptology 8, 123–155 + * (1995). https://doi.org/10.1007/BF00202269 + */ + +typedef enum SubprimePolicy { + SPP_FAST, + SPP_MAURER_SIMPLE, + SPP_MAURER_COMPLEX, +} SubprimePolicy; + +typedef struct ProvablePrimePolicyExtra { + SubprimePolicy spp; +} ProvablePrimePolicyExtra; + +typedef struct ProvablePrimeContext ProvablePrimeContext; +struct ProvablePrimeContext { + Pockle *pockle; + PrimeGenerationContext pgc; + const ProvablePrimePolicyExtra *extra; +}; + +static PrimeGenerationContext *provableprime_new_context( + const PrimeGenerationPolicy *policy) +{ + ProvablePrimeContext *ppc = snew(ProvablePrimeContext); + ppc->pgc.vt = policy; + ppc->pockle = pockle_new(); + ppc->extra = policy->extra; + return &ppc->pgc; +} + +static void provableprime_free_context(PrimeGenerationContext *ctx) +{ + ProvablePrimeContext *ppc = container_of(ctx, ProvablePrimeContext, pgc); + pockle_free(ppc->pockle); + sfree(ppc); +} + +static ProgressPhase provableprime_add_progress_phase( + const PrimeGenerationPolicy *policy, + ProgressReceiver *prog, unsigned bits) +{ + /* + * Estimating the cost of making a _provable_ prime is difficult + * because of all the recursions to smaller sizes. + * + * Once you have enough factors of p-1 to certify primality of p, + * the remaining work in provable prime generation is not very + * different from probabilistic: you generate a random candidate, + * test its primality probabilistically, and use the witness value + * generated as a byproduct of that test for the full Pocklington + * verification. The expensive part, as usual, is made of modpows. + * + * The Pocklington test needs at least two modpows (one for the + * Fermat check, and one per known factor of p-1). + * + * The prior M-R step needs an unknown number, because we iterate + * until we find a value whose order is divisible by the largest + * power of 2 that divides p-1, say 2^j. That excludes half the + * possible witness values (specifically, the quadratic residues), + * so we expect to need on average two M-R operations to find one. + * But that's only if the number _is_ prime - as usual, it's also + * possible that we hit a non-prime and have to try again. + * + * So, if we were only estimating the cost of that final step, it + * would look a lot like the probabilistic version: we'd have to + * estimate the expected total number of modexps by knowing + * something about the density of primes among our candidate + * integers, and then multiply that by estimate_modexp_cost(bits). + * But the problem is that we also have to _find_ a smaller prime, + * so we have to recurse. + * + * In the MAURER_SIMPLE version of the algorithm, you recurse to + * any one of a range of possible smaller sizes i, each with + * probability proportional to 1/i. So your expected time to + * generate an n-bit prime is given by a horrible recurrence of + * the form E_n = S_n + (sum E_i/i) / (sum 1/i), in which S_n is + * the expected cost of the final step once you have your smaller + * primes, and both sums are over ceil(n/2) <= i <= n-20. + * + * At this point I ran out of effort to actually do the maths + * rigorously, so instead I did the empirical experiment of + * generating that sequence in Python and plotting it on a graph. + * My Python code is here, in case I need it again: + +from math import log + +alpha = log(3)/log(2) + 1 # exponent for modexp using Karatsuba mult + +E = [1] * 16 # assume generating tiny primes is trivial + +for n in range(len(E), 4096): + + # Expected time for sub-generations, as a weighted mean of prior + # values of the same sequence. + lo = (n+1)//2 + hi = n-20 + if lo <= hi: + subrange = range(lo, hi+1) + num = sum(E[i]/i for i in subrange) + den = sum(1/i for i in subrange) + else: + num, den = 0, 1 + + # Constant term (cost of final step). + # Similar to probprime_add_progress_phase. + winnow_factor = 1 if n < 32 else 19.76 + prob = winnow_factor / (n * log(2)) + cost = 4 * n**alpha / prob + + E.append(cost + num / den) + +for i, p in enumerate(E): + try: + print(log(i), log(p)) + except ValueError: + continue + + * The output loop prints the logs of both i and E_i, so that when + * I plot the resulting data file in gnuplot I get a log-log + * diagram. That showed me some early noise and then a very + * straight-looking line; feeding the straight part of the graph + * to linear-regression analysis reported that it fits the line + * + * log E_n = -1.7901825337965498 + 3.6199197179662517 * log(n) + * => E_n = 0.16692969657466802 * n^3.6199197179662517 + * + * So my somewhat empirical estimate is that Maurer prime + * generation costs about 0.167 * bits^3.62, in the same arbitrary + * time units used by estimate_modexp_cost. + */ + + return progress_add_linear(prog, 0.167 * pow(bits, 3.62)); +} + +static mp_int *primegen_small(Pockle *pockle, PrimeCandidateSource *pcs) +{ + assert(pcs_get_bits(pcs) <= 32); + + pcs_ready(pcs); + + while (true) { + mp_int *p = pcs_generate(pcs); + if (pockle_add_small_prime(pockle, p) == POCKLE_OK) { + pcs_free(pcs); + return p; + } + mp_free(p); + } +} + +#ifdef DEBUG_PRIMEGEN +static void timestamp(FILE *fp) +{ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + fprintf(fp, "%lu.%09lu: ", (unsigned long)ts.tv_sec, + (unsigned long)ts.tv_nsec); +} +static PRINTF_LIKE(1, 2) void debug_f(const char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + timestamp(stderr); + vfprintf(stderr, fmt, ap); + fputc('\n', stderr); + va_end(ap); +} +static void debug_f_mp(const char *fmt, mp_int *x, ...) +{ + va_list ap; + va_start(ap, x); + timestamp(stderr); + vfprintf(stderr, fmt, ap); + mp_dump(stderr, "", x, "\n"); + va_end(ap); +} +#else +#define debug_f(...) ((void)0) +#define debug_f_mp(...) ((void)0) +#endif + +static double uniform_random_double(void) +{ + unsigned char randbuf[8]; + random_read(randbuf, 8); + return GET_64BIT_MSB_FIRST(randbuf) * 0x1.0p-64; +} + +static mp_int *mp_ceil_div(mp_int *n, mp_int *d) +{ + mp_int *nplus = mp_add(n, d); + mp_sub_integer_into(nplus, nplus, 1); + mp_int *toret = mp_div(nplus, d); + mp_free(nplus); + return toret; +} + +static mp_int *provableprime_generate_inner( + ProvablePrimeContext *ppc, PrimeCandidateSource *pcs, + ProgressReceiver *prog, double progress_origin, double progress_scale) +{ + unsigned bits = pcs_get_bits(pcs); + assert(bits > 1); + + if (bits <= 32) { + debug_f("ppgi(%u) -> small", bits); + return primegen_small(ppc->pockle, pcs); + } + + unsigned min_bits_needed, max_bits_needed; + { + /* + * Find the product of all the prime factors we already know + * about. + */ + mp_int *size_got = mp_from_integer(1); + size_t nfactors; + mp_int **factors = pcs_get_known_prime_factors(pcs, &nfactors); + for (size_t i = 0; i < nfactors; i++) { + mp_int *to_free = size_got; + size_got = mp_unsafe_shrink(mp_mul(size_got, factors[i])); + mp_free(to_free); + } + + /* + * Find the largest cofactor we might be able to use, and the + * smallest one we can get away with. + */ + mp_int *upperbound = pcs_get_upper_bound(pcs); + mp_int *size_needed = mp_nthroot(upperbound, 3, NULL); + debug_f_mp("upperbound = ", upperbound); + { + mp_int *to_free = upperbound; + upperbound = mp_unsafe_shrink(mp_div(upperbound, size_got)); + mp_free(to_free); + } + debug_f_mp("size_needed = ", size_needed); + { + mp_int *to_free = size_needed; + size_needed = mp_unsafe_shrink(mp_ceil_div(size_needed, size_got)); + mp_free(to_free); + } + + max_bits_needed = mp_get_nbits(upperbound); + + /* + * We need a prime that is greater than or equal to + * 'size_needed' in order for the product of all our known + * factors of p-1 to exceed the cube root of the largest value + * p might take. + * + * Since pcs_new wants a size specified in bits, we must count + * the bits in size_needed and then add 1. Otherwise we might + * get a value with the same bit count as size_needed but + * slightly smaller than it. + * + * An exception is if size_needed = 1. In that case the + * product of existing known factors is _already_ enough, so + * we don't need to generate an extra factor at all. + */ + if (mp_hs_integer(size_needed, 2)) { + min_bits_needed = mp_get_nbits(size_needed) + 1; + } else { + min_bits_needed = 0; + } + + mp_free(upperbound); + mp_free(size_needed); + mp_free(size_got); + } + + double progress = 0.0; + + if (min_bits_needed) { + debug_f("ppgi(%u) recursing, need [%u,%u] more bits", + bits, min_bits_needed, max_bits_needed); + + unsigned *sizes = NULL; + size_t nsizes = 0, sizesize = 0; + + unsigned real_min = max_bits_needed / 2; + unsigned real_max = (max_bits_needed >= 20 ? + max_bits_needed - 20 : 0); + if (real_min < min_bits_needed) + real_min = min_bits_needed; + if (real_max < real_min) + real_max = real_min; + debug_f("ppgi(%u) revised bits interval = [%u,%u]", + bits, real_min, real_max); + + switch (ppc->extra->spp) { + case SPP_FAST: + /* + * Always pick the smallest subsidiary prime we can get + * away with: just over n/3 bits. + * + * This is not a good mode for cryptographic prime + * generation, because it skews the distribution of primes + * greatly, and worse, it skews them in a direction that + * heads away from the properties crypto algorithms tend + * to like. + * + * (For both discrete-log systems and RSA, people have + * tended to recommend in the past that p-1 should have a + * _large_ factor if possible. There's some disagreement + * on which algorithms this is really necessary for, but + * certainly I've never seen anyone recommend arranging a + * _small_ factor on purpose.) + * + * I originally implemented this mode because it was + * convenient for debugging - it wastes as little time as + * possible on finding a sub-prime and lets you get to the + * interesting part! And I leave it in the code because it + * might still be useful for _something_. Because it's + * cryptographically questionable, it's not selectable in + * the UI of either version of PuTTYgen proper; but it can + * be accessed through testcrypt, and if for some reason a + * definite prime is needed for non-crypto purposes, it + * may still be the fastest way to put your hands on one. + */ + debug_f("ppgi(%u) fast mode, just ask for %u bits", + bits, min_bits_needed); + sgrowarray(sizes, sizesize, nsizes); + sizes[nsizes++] = min_bits_needed; + break; + case SPP_MAURER_SIMPLE: { + /* + * Select the size of the subsidiary prime at random from + * sqrt(outputprime) up to outputprime/2^20, in such a way + * that the probability distribution matches that of the + * largest prime factor of a random n-bit number. + * + * Per [MAURER] section 3.4, the cumulative distribution + * function of this relative size is 1+log2(x), for x in + * [1/2,1]. You can generate a value from the distribution + * given by a cdf by applying the inverse cdf to a uniform + * value in [0,1]. Simplifying that in this case, what we + * have to do is raise 2 to the power of a random real + * number between -1 and 0. (And that gives you the number + * of _bits_ in the sub-prime, as a factor of the desired + * output number of bits.) + * + * We also require that the subsidiary prime q is at least + * 20 bits smaller than the output one, to give us a + * fighting chance of there being _any_ prime we can find + * such that q | p-1. + * + * (But these rules have to be applied in an order that + * still leaves us _some_ interval of possible sizes we + * can pick!) + */ + maurer_simple: + debug_f("ppgi(%u) Maurer simple mode", bits); + + unsigned sub_bits; + do { + double uniform = uniform_random_double(); + sub_bits = real_max * pow(2.0, uniform - 1) + 0.5; + debug_f(" ... %.6f -> %u?", uniform, sub_bits); + } while (!(real_min <= sub_bits && sub_bits <= real_max)); + + debug_f("ppgi(%u) asking for %u bits", bits, sub_bits); + sgrowarray(sizes, sizesize, nsizes); + sizes[nsizes++] = sub_bits; + + break; + } + case SPP_MAURER_COMPLEX: { + /* + * In this mode, we may generate multiple factors of p-1 + * which between them add up to at least n/2 bits, in such + * a way that those are guaranteed to be the largest + * factors of p-1 and that they have the same probability + * distribution as the largest k factors would have in a + * random integer. The idea is that this more elaborate + * procedure gets as close as possible to the same + * probability distribution you'd get by selecting a + * completely random prime (if you feasibly could). + * + * Algorithm from Appendix 1 of [MAURER]: we generate + * random real numbers that sum to at most 1, by choosing + * each one uniformly from the range [0, 1 - sum of all + * the previous ones]. We maintain them in a list in + * decreasing order, and we stop as soon as we find an + * initial subsequence of the list s_1,...,s_r such that + * s_1 + ... + s_{r-1} + 2 s_r > 1. In particular, this + * guarantees that the sum of that initial subsequence is + * at least 1/2, so we end up with enough factors to + * satisfy Pocklington. + */ + + if (max_bits_needed / 2 + 1 > real_max) { + /* Early exit path in the case where this algorithm + * can't possibly generate a value in the range we + * need. In that situation, fall back to Maurer + * simple. */ + debug_f("ppgi(%u) skipping GenerateSizeList, " + "real_max too small", bits); + goto maurer_simple; /* sorry! */ + } + + double *s = NULL; + size_t ns, ssize = 0; + + while (true) { + debug_f("ppgi(%u) starting GenerateSizeList", bits); + ns = 0; + double range = 1.0; + while (true) { + /* Generate the next number */ + double u = uniform_random_double() * range; + range -= u; + debug_f(" u_%"SIZEu" = %g", ns, u); + + /* Insert it in the list */ + sgrowarray(s, ssize, ns); + size_t i; + for (i = ns; i > 0 && s[i-1] < u; i--) + s[i] = s[i-1]; + s[i] = u; + ns++; + debug_f(" inserting as s[%"SIZEu"]", i); + + /* Look for a suitable initial subsequence */ + double sum = 0; + for (i = 0; i < ns; i++) { + sum += s[i]; + if (sum + s[i] > 1.0) { + debug_f(" s[0..%"SIZEu"] works!", i); + + /* Truncate the sequence here, and stop + * generating random real numbers. */ + ns = i+1; + goto got_list; + } + } + } + + got_list:; + /* + * Now translate those real numbers into actual bit + * counts, and do a last-minute check to make sure we + * haven't generated one too close to the final output + * size. + */ + nsizes = 0; + + unsigned total = 1; /* account for leading 1 */ + + for (size_t i = 0; i < ns; i++) { + /* These sizes are measured in actual entropy, so + * add 1 bit each time to account for the + * zero-information leading 1 */ + unsigned this_size = max_bits_needed * s[i] + 1; + debug_f(" bits[%"SIZEu"] = %u", i, this_size); + sgrowarray(sizes, sizesize, nsizes); + sizes[nsizes++] = this_size; + + total += this_size - 1; + } + + debug_f(" total bits = %u", total); + if (total < real_min || total > real_max) { + debug_f(" total out of range, try again"); + } else { + debug_f(" success! %"SIZEu" sub-primes totalling %u bits", + nsizes, total); + break; + } + } + + smemclr(s, ssize * sizeof(*s)); + sfree(s); + break; + } + default: + unreachable("bad subprime policy"); + } + + for (size_t i = 0; i < nsizes; i++) { + unsigned sub_bits = sizes[i]; + double progress_in_this_prime = (double)sub_bits / bits; + mp_int *q = provableprime_generate_inner( + ppc, pcs_new(sub_bits), + prog, progress_origin + progress_scale * progress, + progress_scale * progress_in_this_prime); + progress += progress_in_this_prime; + assert(q); + debug_f_mp("ppgi(%u) got factor ", q, bits); + pcs_require_residue_1_mod_prime(pcs, q); + mp_free(q); + } + + smemclr(sizes, sizesize * sizeof(*sizes)); + sfree(sizes); + } else { + debug_f("ppgi(%u) no need to recurse", bits); + } + + debug_f("ppgi(%u) ready", bits); + pcs_ready(pcs); + + while (true) { + mp_int *p = pcs_generate(pcs); + + debug_f_mp("provable_step p=", p); + + MillerRabin *mr = miller_rabin_new(p); + debug_f("provable_step mr setup done"); + mp_int *witness = miller_rabin_find_potential_primitive_root(mr); + miller_rabin_free(mr); + + if (!witness) { + debug_f("provable_step mr failed"); + mp_free(p); + continue; + } + + size_t nfactors; + mp_int **factors = pcs_get_known_prime_factors(pcs, &nfactors); + PockleStatus st = pockle_add_prime( + ppc->pockle, p, factors, nfactors, witness); + + if (st != POCKLE_OK) { + debug_f("provable_step proof failed %d", (int)st); + + /* + * Check by assertion that the error status is not one of + * the ones we ought to have ruled out already by + * construction. If there's a bug in this code that means + * we can _never_ pass this test (e.g. picking products of + * factors that never quite reach cbrt(n)), we'd rather + * fail an assertion than loop forever. + */ + assert(st == POCKLE_DISCRIMINANT_IS_SQUARE || + st == POCKLE_WITNESS_POWER_IS_1 || + st == POCKLE_WITNESS_POWER_NOT_COPRIME); + + mp_free(p); + if (witness) + mp_free(witness); + continue; + } + + mp_free(witness); + pcs_free(pcs); + debug_f_mp("ppgi(%u) done, got ", p, bits); + progress_report(prog, progress_origin + progress_scale); + return p; + } +} + +static mp_int *provableprime_generate( + PrimeGenerationContext *ctx, + PrimeCandidateSource *pcs, ProgressReceiver *prog) +{ + ProvablePrimeContext *ppc = container_of(ctx, ProvablePrimeContext, pgc); + mp_int *p = provableprime_generate_inner(ppc, pcs, prog, 0.0, 1.0); + + return p; +} + +#define DECLARE_POLICY(name, policy) \ + static const struct ProvablePrimePolicyExtra \ + pppextra_##name = {policy}; \ + const PrimeGenerationPolicy name = { \ + provableprime_add_progress_phase, \ + provableprime_new_context, \ + provableprime_free_context, \ + provableprime_generate, \ + &pppextra_##name, \ + } + +DECLARE_POLICY(primegen_provable_fast, SPP_FAST); +DECLARE_POLICY(primegen_provable_maurer_simple, SPP_MAURER_SIMPLE); +DECLARE_POLICY(primegen_provable_maurer_complex, SPP_MAURER_COMPLEX); + /* ---------------------------------------------------------------------- * Reusable null implementation of the progress-reporting API. */ diff --git a/testcrypt.c b/testcrypt.c index 40108704..8f6990ee 100644 --- a/testcrypt.c +++ b/testcrypt.c @@ -378,6 +378,9 @@ static const PrimeGenerationPolicy *get_primegenpolicy(BinarySource *in) const PrimeGenerationPolicy *value; } algs[] = { {"probabilistic", &primegen_probabilistic}, + {"provable_fast", &primegen_provable_fast}, + {"provable_maurer_simple", &primegen_provable_maurer_simple}, + {"provable_maurer_complex", &primegen_provable_maurer_complex}, }; ptrlen name = get_word(in);