Move key-generation code into its own subdir.

Including mpunsafe.{h,c}, which should be an extra defence against inadvertently using it outside the keygen library.
2025-07-02 03:52:49 -05:00 · 2021-04-22 17:59:16 +01:00
parent 83fa43497f
commit 8f0f5b69c0
12 changed files with 11 additions and 2 deletions
--- a/keygen/CMakeLists.txt
+++ b/keygen/CMakeLists.txt
@ -0,0 +1,10 @@
+add_sources_from_current_dir(keygen
+  dsa.c
+  ecdsa.c
+  millerrabin.c
+  mpunsafe.c
+  pockle.c
+  prime.c
+  primecandidate.c
+  rsa.c
+  smallprimes.c)
--- a/keygen/dsa.c
+++ b/keygen/dsa.c
@ -0,0 +1,103 @@
+/*
+ * DSS key generation.
+ */
+
+#include "misc.h"
+#include "ssh.h"
+#include "sshkeygen.h"
+#include "mpint.h"
+
+int dsa_generate(struct dss_key *key, int bits, PrimeGenerationContext *pgc,
+                 ProgressReceiver *prog)
+{
+    /*
+     * Progress-reporting setup.
+     *
+     * DSA generation involves three potentially long jobs: inventing
+     * the small prime q, the large prime p, and finding an order-q
+     * element of the multiplicative group of p.
+     *
+     * The latter is done by finding an element whose order is
+     * _divisible_ by q and raising it to the power of (p-1)/q. Every
+     * element whose order is not divisible by q is a qth power of q
+     * distinct elements whose order _is_ divisible by q, so the
+     * probability of not finding a suitable element on the first try
+     * is in the region of 1/q, i.e. at most 2^-159.
+     *
+     * (So the probability of success will end up indistinguishable
+     * from 1 in IEEE standard floating point! But what can you do.)
+     */
+    ProgressPhase phase_q = primegen_add_progress_phase(pgc, prog, 160);
+    ProgressPhase phase_p = primegen_add_progress_phase(pgc, prog, bits);
+    double g_failure_probability = 1.0
+        / (double)(1ULL << 53)
+        / (double)(1ULL << 53)
+        / (double)(1ULL << 53);
+    ProgressPhase phase_g = progress_add_probabilistic(
+        prog, estimate_modexp_cost(bits), 1.0 - g_failure_probability);
+    progress_ready(prog);
+
+    PrimeCandidateSource *pcs;
+
+    /*
+     * Generate q: a prime of length 160.
+     */
+    progress_start_phase(prog, phase_q);
+    pcs = pcs_new(160);
+    mp_int *q = primegen_generate(pgc, pcs, prog);
+    progress_report_phase_complete(prog);
+
+    /*
+     * Now generate p: a prime of length `bits', such that p-1 is
+     * divisible by q.
+     */
+    progress_start_phase(prog, phase_p);
+    pcs = pcs_new(bits);
+    pcs_require_residue_1_mod_prime(pcs, q);
+    mp_int *p = primegen_generate(pgc, pcs, prog);
+    progress_report_phase_complete(prog);
+
+    /*
+     * Next we need g. Raise 2 to the power (p-1)/q modulo p, and
+     * if that comes out to one then try 3, then 4 and so on. As
+     * soon as we hit a non-unit (and non-zero!) one, that'll do
+     * for g.
+     */
+    progress_start_phase(prog, phase_g);
+    mp_int *power = mp_div(p, q); /* this is floor(p/q) == (p-1)/q */
+    mp_int *h = mp_from_integer(2);
+    mp_int *g;
+    while (1) {
+        progress_report_attempt(prog);
+        g = mp_modpow(h, power, p);
+        if (mp_hs_integer(g, 2))
+            break;                     /* got one */
+        mp_free(g);
+        mp_add_integer_into(h, h, 1);
+    }
+    mp_free(h);
+    mp_free(power);
+    progress_report_phase_complete(prog);
+
+    /*
+     * Now we're nearly done. All we need now is our private key x,
+     * which should be a number between 1 and q-1 exclusive, and
+     * our public key y = g^x mod p.
+     */
+    mp_int *two = mp_from_integer(2);
+    mp_int *qm1 = mp_copy(q);
+    mp_sub_integer_into(qm1, qm1, 1);
+    mp_int *x = mp_random_in_range(two, qm1);
+    mp_free(two);
+    mp_free(qm1);
+
+    key->sshk.vt = &ssh_dss;
+
+    key->p = p;
+    key->q = q;
+    key->g = g;
+    key->x = x;
+    key->y = mp_modpow(key->g, key->x, key->p);
+
+    return 1;
+}
--- a/keygen/ecdsa.c
+++ b/keygen/ecdsa.c
@ -0,0 +1,37 @@
+/*
+ * EC key generation.
+ */
+
+#include "ssh.h"
+#include "sshkeygen.h"
+#include "mpint.h"
+
+int ecdsa_generate(struct ecdsa_key *ek, int bits)
+{
+    if (!ec_nist_alg_and_curve_by_bits(bits, &ek->curve, &ek->sshk.vt))
+        return 0;
+
+    mp_int *one = mp_from_integer(1);
+    ek->privateKey = mp_random_in_range(one, ek->curve->w.G_order);
+    mp_free(one);
+
+    ek->publicKey = ecdsa_public(ek->privateKey, ek->sshk.vt);
+
+    return 1;
+}
+
+int eddsa_generate(struct eddsa_key *ek, int bits)
+{
+    if (!ec_ed_alg_and_curve_by_bits(bits, &ek->curve, &ek->sshk.vt))
+        return 0;
+
+    /* EdDSA secret keys are just 32 bytes of hash preimage; the
+     * 64-byte SHA-512 hash of that key will be used when signing,
+     * but the form of the key stored on disk is the preimage
+     * only. */
+    ek->privateKey = mp_random_bits(bits);
+
+    ek->publicKey = eddsa_public(ek->privateKey, ek->sshk.vt);
+
+    return 1;
+}
--- a/keygen/millerrabin.c
+++ b/keygen/millerrabin.c
@ -0,0 +1,214 @@
+/*
+ * millerrabin.c: Miller-Rabin probabilistic primality testing, as
+ * declared in sshkeygen.h.
+ */
+
+#include <assert.h>
+#include "ssh.h"
+#include "sshkeygen.h"
+#include "mpint.h"
+#include "mpunsafe.h"
+
+/*
+ * The Miller-Rabin primality test is an extension to the Fermat
+ * test. The Fermat test just checks that a^(p-1) == 1 mod p; this
+ * is vulnerable to Carmichael numbers. Miller-Rabin considers how
+ * that 1 is derived as well.
+ *
+ * Lemma: if a^2 == 1 (mod p), and p is prime, then either a == 1
+ * or a == -1 (mod p).
+ *
+ *   Proof: p divides a^2-1, i.e. p divides (a+1)(a-1). Hence,
+ *   since p is prime, either p divides (a+1) or p divides (a-1).
+ *   But this is the same as saying that either a is congruent to
+ *   -1 mod p or a is congruent to +1 mod p. []
+ *
+ *   Comment: This fails when p is not prime. Consider p=mn, so
+ *   that mn divides (a+1)(a-1). Now we could have m dividing (a+1)
+ *   and n dividing (a-1), without the whole of mn dividing either.
+ *   For example, consider a=10 and p=99. 99 = 9 * 11; 9 divides
+ *   10-1 and 11 divides 10+1, so a^2 is congruent to 1 mod p
+ *   without a having to be congruent to either 1 or -1.
+ *
+ * So the Miller-Rabin test, as well as considering a^(p-1),
+ * considers a^((p-1)/2), a^((p-1)/4), and so on as far as it can
+ * go. In other words. we write p-1 as q * 2^k, with k as large as
+ * possible (i.e. q must be odd), and we consider the powers
+ *
+ *       a^(q*2^0)      a^(q*2^1)          ...  a^(q*2^(k-1))  a^(q*2^k)
+ * i.e.  a^((n-1)/2^k)  a^((n-1)/2^(k-1))  ...  a^((n-1)/2)    a^(n-1)
+ *
+ * If p is to be prime, the last of these must be 1. Therefore, by
+ * the above lemma, the one before it must be either 1 or -1. And
+ * _if_ it's 1, then the one before that must be either 1 or -1,
+ * and so on ... In other words, we expect to see a trailing chain
+ * of 1s preceded by a -1. (If we're unlucky, our trailing chain of
+ * 1s will be as long as the list so we'll never get to see what
+ * lies before it. This doesn't count as a test failure because it
+ * hasn't _proved_ that p is not prime.)
+ *
+ * For example, consider a=2 and p=1729. 1729 is a Carmichael
+ * number: although it's not prime, it satisfies a^(p-1) == 1 mod p
+ * for any a coprime to it. So the Fermat test wouldn't have a
+ * problem with it at all, unless we happened to stumble on an a
+ * which had a common factor.
+ *
+ * So. 1729 - 1 equals 27 * 2^6. So we look at
+ *
+ *     2^27 mod 1729 == 645
+ *    2^108 mod 1729 == 1065
+ *    2^216 mod 1729 == 1
+ *    2^432 mod 1729 == 1
+ *    2^864 mod 1729 == 1
+ *   2^1728 mod 1729 == 1
+ *
+ * We do have a trailing string of 1s, so the Fermat test would
+ * have been happy. But this trailing string of 1s is preceded by
+ * 1065; whereas if 1729 were prime, we'd expect to see it preceded
+ * by -1 (i.e. 1728.). Guards! Seize this impostor.
+ *
+ * (If we were unlucky, we might have tried a=16 instead of a=2;
+ * now 16^27 mod 1729 == 1, so we would have seen a long string of
+ * 1s and wouldn't have seen the thing _before_ the 1s. So, just
+ * like the Fermat test, for a given p there may well exist values
+ * of a which fail to show up its compositeness. So we try several,
+ * just like the Fermat test. The difference is that Miller-Rabin
+ * is not _in general_ fooled by Carmichael numbers.)
+ *
+ * Put simply, then, the Miller-Rabin test requires us to:
+ *
+ *  1. write p-1 as q * 2^k, with q odd
+ *  2. compute z = (a^q) mod p.
+ *  3. report success if z == 1 or z == -1.
+ *  4. square z at most k-1 times, and report success if it becomes
+ *     -1 at any point.
+ *  5. report failure otherwise.
+ *
+ * (We expect z to become -1 after at most k-1 squarings, because
+ * if it became -1 after k squarings then a^(p-1) would fail to be
+ * 1. And we don't need to investigate what happens after we see a
+ * -1, because we _know_ that -1 squared is 1 modulo anything at
+ * all, so after we've seen a -1 we can be sure of seeing nothing
+ * but 1s.)
+ */
+
+struct MillerRabin {
+    MontyContext *mc;
+
+    size_t k;
+    mp_int *q;
+
+    mp_int *two, *pm1, *m_pm1;
+};
+
+MillerRabin *miller_rabin_new(mp_int *p)
+{
+    MillerRabin *mr = snew(MillerRabin);
+
+    assert(mp_hs_integer(p, 2));
+    assert(mp_get_bit(p, 0) == 1);
+
+    mr->k = 1;
+    while (!mp_get_bit(p, mr->k))
+        mr->k++;
+    mr->q = mp_rshift_safe(p, mr->k);
+
+    mr->two = mp_from_integer(2);
+
+    mr->pm1 = mp_unsafe_copy(p);
+    mp_sub_integer_into(mr->pm1, mr->pm1, 1);
+
+    mr->mc = monty_new(p);
+    mr->m_pm1 = monty_import(mr->mc, mr->pm1);
+
+    return mr;
+}
+
+void miller_rabin_free(MillerRabin *mr)
+{
+    mp_free(mr->q);
+    mp_free(mr->two);
+    mp_free(mr->pm1);
+    mp_free(mr->m_pm1);
+    monty_free(mr->mc);
+    smemclr(mr, sizeof(*mr));
+    sfree(mr);
+}
+
+struct mr_result {
+    bool passed;
+    bool potential_primitive_root;
+};
+
+static struct mr_result miller_rabin_test_inner(MillerRabin *mr, mp_int *w)
+{
+    /*
+     * Compute w^q mod p.
+     */
+    mp_int *wqp = monty_pow(mr->mc, w, mr->q);
+
+    /*
+     * See if this is 1, or if it is -1, or if it becomes -1
+     * when squared at most k-1 times.
+     */
+    struct mr_result result;
+    result.passed = false;
+    result.potential_primitive_root = false;
+
+    if (mp_cmp_eq(wqp, monty_identity(mr->mc))) {
+        result.passed = true;
+    } else {
+        for (size_t i = 0; i < mr->k; i++) {
+            if (mp_cmp_eq(wqp, mr->m_pm1)) {
+                result.passed = true;
+                result.potential_primitive_root = (i == mr->k - 1);
+                break;
+            }
+            if (i == mr->k - 1)
+                break;
+            monty_mul_into(mr->mc, wqp, wqp, wqp);
+        }
+    }
+
+    mp_free(wqp);
+
+    return result;
+}
+
+bool miller_rabin_test_random(MillerRabin *mr)
+{
+    mp_int *mw = mp_random_in_range(mr->two, mr->pm1);
+    struct mr_result result = miller_rabin_test_inner(mr, mw);
+    mp_free(mw);
+    return result.passed;
+}
+
+mp_int *miller_rabin_find_potential_primitive_root(MillerRabin *mr)
+{
+    while (true) {
+        mp_int *mw = mp_unsafe_shrink(mp_random_in_range(mr->two, mr->pm1));
+        struct mr_result result = miller_rabin_test_inner(mr, mw);
+
+        if (result.passed && result.potential_primitive_root) {
+            mp_int *pr = monty_export(mr->mc, mw);
+            mp_free(mw);
+            return pr;
+        }
+
+        mp_free(mw);
+
+        if (!result.passed) {
+            return NULL;
+        }
+    }
+}
+
+unsigned miller_rabin_checks_needed(unsigned bits)
+{
+    /* Table 4.4 from Handbook of Applied Cryptography */
+    return (bits >= 1300 ?  2 : bits >= 850 ?  3 : bits >= 650 ?  4 :
+            bits >=  550 ?  5 : bits >= 450 ?  6 : bits >= 400 ?  7 :
+            bits >=  350 ?  8 : bits >= 300 ?  9 : bits >= 250 ? 12 :
+            bits >=  200 ? 15 : bits >= 150 ? 18 : 27);
+}
+
--- a/keygen/mpunsafe.c
+++ b/keygen/mpunsafe.c
@ -0,0 +1,57 @@
+#include <assert.h>
+#include <limits.h>
+#include <stdio.h>
+
+#include "defs.h"
+#include "misc.h"
+#include "puttymem.h"
+
+#include "mpint.h"
+#include "crypto/mpint_i.h"
+
+/*
+ * This global symbol is also defined in ssh/kex2-client.c, to ensure
+ * that these unsafe non-constant-time mp_int functions can't end up
+ * accidentally linked in to any PuTTY tool that actually makes an SSH
+ * client connection.
+ *
+ * (Only _client_ connections, however. Uppity, being a test server
+ * only, is exempt.)
+ */
+const int deliberate_symbol_clash = 12345;
+
+static size_t mp_unsafe_words_needed(mp_int *x)
+{
+    size_t words = x->nw;
+    while (words > 1 && !x->w[words-1])
+        words--;
+    return words;
+}
+
+mp_int *mp_unsafe_shrink(mp_int *x)
+{
+    x->nw = mp_unsafe_words_needed(x);
+    /* This potentially leaves some allocated words between the new
+     * and old values of x->nw, which won't be wiped by mp_free now
+     * that x->nw doesn't mention that they exist. But we've just
+     * checked they're all zero, so we don't need to wipe them now
+     * either. */
+    return x;
+}
+
+mp_int *mp_unsafe_copy(mp_int *x)
+{
+    mp_int *copy = mp_make_sized(mp_unsafe_words_needed(x));
+    mp_copy_into(copy, x);
+    return copy;
+}
+
+uint32_t mp_unsafe_mod_integer(mp_int *x, uint32_t modulus)
+{
+    uint64_t accumulator = 0;
+    for (size_t i = mp_max_bytes(x); i-- > 0 ;) {
+        accumulator = 0x100 * accumulator + mp_get_byte(x, i);
+        accumulator %= modulus;
+    }
+    return accumulator;
+}
--- a/keygen/mpunsafe.h
+++ b/keygen/mpunsafe.h
@ -0,0 +1,46 @@
+/*
+ * mpunsafe.h: functions that deal with mp_ints in ways that are *not*
+ * expected to be constant-time. Used during key generation, in which
+ * constant run time is a lost cause anyway.
+ *
+ * These functions are in a separate header, so that you can easily
+ * check that you're not calling them in the wrong context. They're
+ * also defined in a separate source file, which is only linked in to
+ * the key generation tools. Furthermore, that source file also
+ * defines a global symbol that intentionally conflicts with one
+ * defined in the SSH client code, so that any attempt to put these
+ * functions into the same binary as the live SSH client
+ * implementation will cause a link-time failure. They should only be
+ * linked into PuTTYgen and auxiliary test programs.
+ *
+ * Also, just in case those precautions aren't enough, all the unsafe
+ * functions have 'unsafe' in the name.
+ */
+
+#ifndef PUTTY_MPINT_UNSAFE_H
+#define PUTTY_MPINT_UNSAFE_H
+
+/*
+ * The most obvious unsafe thing you want to do with an mp_int is to
+ * get rid of leading zero words in its representation, so that its
+ * nominal size is as close as possible to its true size, and you
+ * don't waste any time processing it.
+ *
+ * mp_unsafe_shrink performs this operation in place, mutating the
+ * size field of the mp_int it's given. It returns the same pointer it
+ * was given.
+ *
+ * mp_unsafe_copy leaves the original mp_int alone and makes a new one
+ * with the minimal size.
+ */
+mp_int *mp_unsafe_shrink(mp_int *m);
+mp_int *mp_unsafe_copy(mp_int *m);
+
+/*
+ * Compute the residue of x mod m. This is implemented in the most
+ * obvious way using the C % operator, which won't be constant-time on
+ * many C implementations.
+ */
+uint32_t mp_unsafe_mod_integer(mp_int *x, uint32_t m);
+
+#endif /* PUTTY_MPINT_UNSAFE_H */
--- a/keygen/pockle.c
+++ b/keygen/pockle.c
@ -0,0 +1,450 @@
+#include <assert.h>
+#include "ssh.h"
+#include "sshkeygen.h"
+#include "mpint.h"
+#include "mpunsafe.h"
+#include "tree234.h"
+
+typedef struct PocklePrimeRecord PocklePrimeRecord;
+
+struct Pockle {
+    tree234 *tree;
+
+    PocklePrimeRecord **list;
+    size_t nlist, listsize;
+};
+
+struct PocklePrimeRecord {
+    mp_int *prime;
+    PocklePrimeRecord **factors;
+    size_t nfactors;
+    mp_int *witness;
+
+    size_t index; /* index in pockle->list */
+};
+
+static int ppr_cmp(void *av, void *bv)
+{
+    PocklePrimeRecord *a = (PocklePrimeRecord *)av;
+    PocklePrimeRecord *b = (PocklePrimeRecord *)bv;
+    return mp_cmp_hs(a->prime, b->prime) - mp_cmp_hs(b->prime, a->prime);
+}
+
+static int ppr_find(void *av, void *bv)
+{
+    mp_int *a = (mp_int *)av;
+    PocklePrimeRecord *b = (PocklePrimeRecord *)bv;
+    return mp_cmp_hs(a, b->prime) - mp_cmp_hs(b->prime, a);
+}
+
+Pockle *pockle_new(void)
+{
+    Pockle *pockle = snew(Pockle);
+    pockle->tree = newtree234(ppr_cmp);
+    pockle->list = NULL;
+    pockle->nlist = pockle->listsize = 0;
+    return pockle;
+}
+
+void pockle_free(Pockle *pockle)
+{
+    pockle_release(pockle, 0);
+    assert(count234(pockle->tree) == 0);
+    freetree234(pockle->tree);
+    sfree(pockle->list);
+    sfree(pockle);
+}
+
+static PockleStatus pockle_insert(Pockle *pockle, mp_int *p, mp_int **factors,
+                                  size_t nfactors, mp_int *w)
+{
+    PocklePrimeRecord *pr = snew(PocklePrimeRecord);
+    pr->prime = mp_copy(p);
+
+    PocklePrimeRecord *found = add234(pockle->tree, pr);
+    if (pr != found) {
+        /* it was already in there */
+        mp_free(pr->prime);
+        sfree(pr);
+        return POCKLE_OK;
+    }
+
+    if (w) {
+        pr->factors = snewn(nfactors, PocklePrimeRecord *);
+        for (size_t i = 0; i < nfactors; i++) {
+            pr->factors[i] = find234(pockle->tree, factors[i], ppr_find);
+            assert(pr->factors[i]);
+        }
+        pr->nfactors = nfactors;
+        pr->witness = mp_copy(w);
+    } else {
+        pr->factors = NULL;
+        pr->nfactors = 0;
+        pr->witness = NULL;
+    }
+    pr->index = pockle->nlist;
+
+    sgrowarray(pockle->list, pockle->listsize, pockle->nlist);
+    pockle->list[pockle->nlist++] = pr;
+    return POCKLE_OK;
+}
+
+size_t pockle_mark(Pockle *pockle)
+{
+    return pockle->nlist;
+}
+
+void pockle_release(Pockle *pockle, size_t mark)
+{
+    while (pockle->nlist > mark) {
+        PocklePrimeRecord *pr = pockle->list[--pockle->nlist];
+        del234(pockle->tree, pr);
+        mp_free(pr->prime);
+        if (pr->witness)
+            mp_free(pr->witness);
+        sfree(pr->factors);
+        sfree(pr);
+    }
+}
+
+PockleStatus pockle_add_small_prime(Pockle *pockle, mp_int *p)
+{
+    if (mp_hs_integer(p, (1ULL << 32)))
+        return POCKLE_SMALL_PRIME_NOT_SMALL;
+
+    uint32_t val = mp_get_integer(p);
+
+    if (val < 2)
+        return POCKLE_PRIME_SMALLER_THAN_2;
+
+    init_smallprimes();
+    for (size_t i = 0; i < NSMALLPRIMES; i++) {
+        if (val == smallprimes[i])
+            break; /* success */
+        if (val % smallprimes[i] == 0)
+            return POCKLE_SMALL_PRIME_NOT_PRIME;
+    }
+
+    return pockle_insert(pockle, p, NULL, 0, NULL);
+}
+
+PockleStatus pockle_add_prime(Pockle *pockle, mp_int *p,
+                              mp_int **factors, size_t nfactors,
+                              mp_int *witness)
+{
+    MontyContext *mc = NULL;
+    mp_int *x = NULL, *f = NULL, *w = NULL;
+    PockleStatus status;
+
+    /*
+     * We're going to try to verify that p is prime by using
+     * Pocklington's theorem. The idea is that we're given w such that
+     *          w^{p-1} == 1 (mod p)             (1)
+     * and for a collection of primes q | p-1,
+     *       w^{(p-1)/q} - 1 is coprime to p.    (2)
+     *
+     * Suppose r is a prime factor of p itself. Consider the
+     * multiplicative order of w mod r. By (1), r | w^{p-1}-1. But by
+     * (2), r does not divide w^{(p-1)/q}-1. So the order of w mod r
+     * is a factor of p-1, but not a factor of (p-1)/q. Hence, the
+     * largest power of q that divides p-1 must also divide ord w.
+     *
+     * Repeating this reasoning for all q, we find that the product of
+     * all the q (which we'll denote f) must divide ord w, which in
+     * turn divides r-1. So f | r-1 for any r | p.
+     *
+     * In particular, this means f < r. That is, all primes r | p are
+     * bigger than f. So if f > sqrt(p), then we've shown p is prime,
+     * because otherwise it would have to be the product of at least
+     * two factors bigger than its own square root.
+     *
+     * With an extra check, we can also show p to be prime even if
+     * we're only given enough factors to make f > cbrt(p). See below
+     * for that part, when we come to it.
+     */
+
+    /*
+     * Start by checking p > 1. It certainly can't be prime otherwise!
+     * (And since we're going to prove it prime by showing all its
+     * prime factors are large, we do also have to know it _has_ at
+     * least one prime factor for that to tell us anything.)
+     */
+    if (!mp_hs_integer(p, 2))
+        return POCKLE_PRIME_SMALLER_THAN_2;
+
+    /*
+     * Check that all the factors we've been given really are primes
+     * (in the sense that we already had them in our index). Make the
+     * product f, and check it really does divide p-1.
+     */
+    x = mp_copy(p);
+    mp_sub_integer_into(x, x, 1);
+    f = mp_from_integer(1);
+    for (size_t i = 0; i < nfactors; i++) {
+        mp_int *q = factors[i];
+
+        if (!find234(pockle->tree, q, ppr_find)) {
+            status = POCKLE_FACTOR_NOT_KNOWN_PRIME;
+            goto out;
+        }
+
+        mp_int *quotient = mp_new(mp_max_bits(x));
+        mp_int *residue = mp_new(mp_max_bits(q));
+        mp_divmod_into(x, q, quotient, residue);
+
+        unsigned exact = mp_eq_integer(residue, 0);
+        mp_free(residue);
+
+        mp_free(x);
+        x = quotient;
+
+        if (!exact) {
+            status = POCKLE_FACTOR_NOT_A_FACTOR;
+            goto out;
+        }
+
+        mp_int *tmp = f;
+        f = mp_unsafe_shrink(mp_mul(tmp, q));
+        mp_free(tmp);
+    }
+
+    /*
+     * Check that f > cbrt(p).
+     */
+    mp_int *f2 = mp_mul(f, f);
+    mp_int *f3 = mp_mul(f2, f);
+    bool too_big = mp_cmp_hs(p, f3);
+    mp_free(f3);
+    mp_free(f2);
+    if (too_big) {
+        status = POCKLE_PRODUCT_OF_FACTORS_TOO_SMALL;
+        goto out;
+    }
+
+    /*
+     * Now do the extra check that allows us to get away with only
+     * having f > cbrt(p) instead of f > sqrt(p).
+     *
+     * If we can show that f | r-1 for any r | p, then we've ruled out
+     * p being a product of _more_ than two primes (because then it
+     * would be the product of at least three things bigger than its
+     * own cube root). But we still have to rule out it being a
+     * product of exactly two.
+     *
+     * Suppose for the sake of contradiction that p is the product of
+     * two prime factors. We know both of those factors would have to
+     * be congruent to 1 mod f. So we'd have to have
+     *
+     *    p = (uf+1)(vf+1) = (uv)f^2 + (u+v)f + 1        (3)
+     *
+     * We can't have uv >= f, or else that expression would come to at
+     * least f^3, i.e. it would exceed p. So uv < f. Hence, u,v < f as
+     * well.
+     *
+     * Can we have u+v >= f? If we did, then we could write v >= f-u,
+     * and hence f > uv >= u(f-u). That can be rearranged to show that
+     * u^2 > (u-1)f; decrementing the LHS makes the inequality no
+     * longer necessarily strict, so we have u^2-1 >= (u-1)f, and
+     * dividing off u-1 gives u+1 >= f. But we know u < f, so the only
+     * way this could happen would be if u=f-1, which makes v=1. But
+     * _then_ (3) gives us p = (f-1)f^2 + f^2 + 1 = f^3+1. But that
+     * can't be true if f^3 > p. So we can't have u+v >= f either, by
+     * contradiction.
+     *
+     * After all that, what have we shown? We've shown that we can
+     * write p = (uv)f^2 + (u+v)f + 1, with both uv and u+v strictly
+     * less than f. In other words, if you write down p in base f, it
+     * has exactly three digits, and they are uv, u+v and 1.
+     *
+     * But that means we can _find_ u and v: we know p and f, so we
+     * can just extract those digits of p's base-f representation.
+     * Once we've done so, they give the sum and product of the
+     * potential u,v. And given the sum and product of two numbers,
+     * you can make a quadratic which has those numbers as roots.
+     *
+     * We don't actually have to _solve_ the quadratic: all we have to
+     * do is check if its discriminant is a perfect square. If not,
+     * we'll know that no integers u,v can match this description.
+     */
+    {
+        /* We already have x = (p-1)/f. So we just need to write x in
+         * the form aF + b, and then we have a=uv and b=u+v. */
+        mp_int *a = mp_new(mp_max_bits(x));
+        mp_int *b = mp_new(mp_max_bits(f));
+        mp_divmod_into(x, f, a, b);
+        assert(!mp_cmp_hs(a, f));
+        assert(!mp_cmp_hs(b, f));
+
+        /* If a=0, then that means p < f^2, so we don't need to do
+         * this check at all: the straightforward Pocklington theorem
+         * is all we need. */
+        if (!mp_eq_integer(a, 0)) {
+            unsigned perfect_square = 0;
+
+            mp_int *bsq = mp_mul(b, b);
+            mp_lshift_fixed_into(a, a, 2);
+
+            if (mp_cmp_hs(bsq, a)) {
+                /* b^2-4a is non-negative, so it might be a square.
+                 * Check it. */
+                mp_int *discriminant = mp_sub(bsq, a);
+                mp_int *remainder = mp_new(mp_max_bits(discriminant));
+                mp_int *root = mp_nthroot(discriminant, 2, remainder);
+                perfect_square = mp_eq_integer(remainder, 0);
+                mp_free(discriminant);
+                mp_free(root);
+                mp_free(remainder);
+            }
+
+            mp_free(bsq);
+
+            if (perfect_square) {
+                mp_free(b);
+                mp_free(a);
+                status = POCKLE_DISCRIMINANT_IS_SQUARE;
+                goto out;
+            }
+        }
+        mp_free(b);
+        mp_free(a);
+    }
+
+    /*
+     * Now we've done all the checks that are cheaper than a modpow,
+     * so we've ruled out as many things as possible before having to
+     * do any hard work. But there's nothing for it now: make a
+     * MontyContext.
+     */
+    mc = monty_new(p);
+    w = monty_import(mc, witness);
+
+    /*
+     * The initial Fermat check: is w^{p-1} itself congruent to 1 mod
+     * p?
+     */
+    {
+        mp_int *pm1 = mp_copy(p);
+        mp_sub_integer_into(pm1, pm1, 1);
+        mp_int *power = monty_pow(mc, w, pm1);
+        unsigned fermat_pass = mp_cmp_eq(power, monty_identity(mc));
+        mp_free(power);
+        mp_free(pm1);
+
+        if (!fermat_pass) {
+            status = POCKLE_FERMAT_TEST_FAILED;
+            goto out;
+        }
+    }
+
+    /*
+     * And now, for each factor q, is w^{(p-1)/q}-1 coprime to p?
+     */
+    for (size_t i = 0; i < nfactors; i++) {
+        mp_int *q = factors[i];
+        mp_int *exponent = mp_unsafe_shrink(mp_div(p, q));
+        mp_int *power = monty_pow(mc, w, exponent);
+        mp_int *power_extracted = monty_export(mc, power);
+        mp_sub_integer_into(power_extracted, power_extracted, 1);
+
+        unsigned coprime = mp_coprime(power_extracted, p);
+        if (!coprime) {
+            /*
+             * If w^{(p-1)/q}-1 is not coprime to p, the test has
+             * failed. But it makes a difference why. If the power of
+             * w turned out to be 1, so that we took gcd(1-1,p) =
+             * gcd(0,p) = p, that's like an inconclusive Fermat or M-R
+             * test: it might just mean you picked a witness integer
+             * that wasn't a primitive root. But if the power is any
+             * _other_ value mod p that is not coprime to p, it means
+             * we've detected that the number is *actually not prime*!
+             */
+            if (mp_eq_integer(power_extracted, 0))
+                status = POCKLE_WITNESS_POWER_IS_1;
+            else
+                status = POCKLE_WITNESS_POWER_NOT_COPRIME;
+        }
+
+        mp_free(exponent);
+        mp_free(power);
+        mp_free(power_extracted);
+
+        if (!coprime)
+            goto out; /* with the status we set up above */
+    }
+
+    /*
+     * Success! p is prime. Insert it into our tree234 of known
+     * primes, so that future calls to this function can cite it in
+     * evidence of larger numbers' primality.
+     */
+    status = pockle_insert(pockle, p, factors, nfactors, witness);
+
+  out:
+    if (x)
+        mp_free(x);
+    if (f)
+        mp_free(f);
+    if (w)
+        mp_free(w);
+    if (mc)
+        monty_free(mc);
+    return status;
+}
+
+static void mp_write_decimal(strbuf *sb, mp_int *x)
+{
+    char *s = mp_get_decimal(x);
+    ptrlen pl = ptrlen_from_asciz(s);
+    put_datapl(sb, pl);
+    smemclr(s, pl.len);
+    sfree(s);
+}
+
+strbuf *pockle_mpu(Pockle *pockle, mp_int *p)
+{
+    strbuf *sb = strbuf_new_nm();
+    PocklePrimeRecord *pr = find234(pockle->tree, p, ppr_find);
+    assert(pr);
+
+    bool *needed = snewn(pockle->nlist, bool);
+    memset(needed, 0, pockle->nlist * sizeof(bool));
+    needed[pr->index] = true;
+
+    strbuf_catf(sb, "[MPU - Primality Certificate]\nVersion 1.0\nBase 10\n\n"
+                "Proof for:\nN  ");
+    mp_write_decimal(sb, p);
+    strbuf_catf(sb, "\n");
+
+    for (size_t index = pockle->nlist; index-- > 0 ;) {
+        if (!needed[index])
+            continue;
+        pr = pockle->list[index];
+
+        if (mp_get_nbits(pr->prime) <= 64) {
+            strbuf_catf(sb, "\nType Small\nN  ");
+            mp_write_decimal(sb, pr->prime);
+            strbuf_catf(sb, "\n");
+        } else {
+            assert(pr->witness);
+            strbuf_catf(sb, "\nType BLS5\nN  ");
+            mp_write_decimal(sb, pr->prime);
+            strbuf_catf(sb, "\n");
+            for (size_t i = 0; i < pr->nfactors; i++) {
+                strbuf_catf(sb, "Q[%"SIZEu"]  ", i+1);
+                mp_write_decimal(sb, pr->factors[i]->prime);
+                assert(pr->factors[i]->index < index);
+                needed[pr->factors[i]->index] = true;
+                strbuf_catf(sb, "\n");
+            }
+            for (size_t i = 0; i < pr->nfactors + 1; i++) {
+                strbuf_catf(sb, "A[%"SIZEu"]  ", i);
+                mp_write_decimal(sb, pr->witness);
+                strbuf_catf(sb, "\n");
+            }
+            strbuf_catf(sb, "----\n");
+        }
+    }
+    sfree(needed);
+
+    return sb;
+}
--- a/keygen/prime.c
+++ b/keygen/prime.c
@ -0,0 +1,762 @@
+/*
+ * Prime generation.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "ssh.h"
+#include "mpint.h"
+#include "mpunsafe.h"
+#include "sshkeygen.h"
+
+/* ----------------------------------------------------------------------
+ * Standard probabilistic prime-generation algorithm:
+ *
+ *  - get a number from our PrimeCandidateSource which will at least
+ *    avoid being divisible by any prime under 2^16
+ *
+ *  - perform the Miller-Rabin primality test enough times to
+ *    ensure the probability of it being composite is 2^-80 or
+ *    less
+ *
+ *  - go back to square one if any M-R test fails.
+ */
+
+static PrimeGenerationContext *probprime_new_context(
+    const PrimeGenerationPolicy *policy)
+{
+    PrimeGenerationContext *ctx = snew(PrimeGenerationContext);
+    ctx->vt = policy;
+    return ctx;
+}
+
+static void probprime_free_context(PrimeGenerationContext *ctx)
+{
+    sfree(ctx);
+}
+
+static ProgressPhase probprime_add_progress_phase(
+    const PrimeGenerationPolicy *policy,
+    ProgressReceiver *prog, unsigned bits)
+{
+    /*
+     * The density of primes near x is 1/(log x). When x is about 2^b,
+     * that's 1/(b log 2).
+     *
+     * But we're only doing the expensive part of the process (the M-R
+     * checks) for a number that passes the initial winnowing test of
+     * having no factor less than 2^16 (at least, unless the prime is
+     * so small that PrimeCandidateSource gives up on that winnowing).
+     * The density of _those_ numbers is about 1/19.76. So the odds of
+     * hitting a prime per expensive attempt are boosted by a factor
+     * of 19.76.
+     */
+    const double log_2 = 0.693147180559945309417232121458;
+    double winnow_factor = (bits < 32 ? 1.0 : 19.76);
+    double prob = winnow_factor / (bits * log_2);
+
+    /*
+     * Estimate the cost of prime generation as the cost of the M-R
+     * modexps.
+     */
+    double cost = (miller_rabin_checks_needed(bits) *
+                   estimate_modexp_cost(bits));
+    return progress_add_probabilistic(prog, cost, prob);
+}
+
+static mp_int *probprime_generate(
+    PrimeGenerationContext *ctx,
+    PrimeCandidateSource *pcs, ProgressReceiver *prog)
+{
+    pcs_ready(pcs);
+
+    while (true) {
+        progress_report_attempt(prog);
+
+        mp_int *p = pcs_generate(pcs);
+        if (!p) {
+            pcs_free(pcs);
+            return NULL;
+        }
+
+        MillerRabin *mr = miller_rabin_new(p);
+        bool known_bad = false;
+        unsigned nchecks = miller_rabin_checks_needed(mp_get_nbits(p));
+        for (unsigned check = 0; check < nchecks; check++) {
+            if (!miller_rabin_test_random(mr)) {
+                known_bad = true;
+                break;
+            }
+        }
+        miller_rabin_free(mr);
+
+        if (!known_bad) {
+            /*
+             * We have a prime!
+             */
+            pcs_free(pcs);
+            return p;
+        }
+
+        mp_free(p);
+    }
+}
+
+static strbuf *null_mpu_certificate(PrimeGenerationContext *ctx, mp_int *p)
+{
+    return NULL;
+}
+
+const PrimeGenerationPolicy primegen_probabilistic = {
+    probprime_add_progress_phase,
+    probprime_new_context,
+    probprime_free_context,
+    probprime_generate,
+    null_mpu_certificate,
+};
+
+/* ----------------------------------------------------------------------
+ * Alternative provable-prime algorithm, based on the following paper:
+ *
+ * [MAURER] Maurer, U.M. Fast generation of prime numbers and secure
+ * public-key cryptographic parameters. J. Cryptology 8, 123–155
+ * (1995). https://doi.org/10.1007/BF00202269
+ */
+
+typedef enum SubprimePolicy {
+    SPP_FAST,
+    SPP_MAURER_SIMPLE,
+    SPP_MAURER_COMPLEX,
+} SubprimePolicy;
+
+typedef struct ProvablePrimePolicyExtra {
+    SubprimePolicy spp;
+} ProvablePrimePolicyExtra;
+
+typedef struct ProvablePrimeContext ProvablePrimeContext;
+struct ProvablePrimeContext {
+    Pockle *pockle;
+    PrimeGenerationContext pgc;
+    const ProvablePrimePolicyExtra *extra;
+};
+
+static PrimeGenerationContext *provableprime_new_context(
+    const PrimeGenerationPolicy *policy)
+{
+    ProvablePrimeContext *ppc = snew(ProvablePrimeContext);
+    ppc->pgc.vt = policy;
+    ppc->pockle = pockle_new();
+    ppc->extra = policy->extra;
+    return &ppc->pgc;
+}
+
+static void provableprime_free_context(PrimeGenerationContext *ctx)
+{
+    ProvablePrimeContext *ppc = container_of(ctx, ProvablePrimeContext, pgc);
+    pockle_free(ppc->pockle);
+    sfree(ppc);
+}
+
+static ProgressPhase provableprime_add_progress_phase(
+    const PrimeGenerationPolicy *policy,
+    ProgressReceiver *prog, unsigned bits)
+{
+    /*
+     * Estimating the cost of making a _provable_ prime is difficult
+     * because of all the recursions to smaller sizes.
+     *
+     * Once you have enough factors of p-1 to certify primality of p,
+     * the remaining work in provable prime generation is not very
+     * different from probabilistic: you generate a random candidate,
+     * test its primality probabilistically, and use the witness value
+     * generated as a byproduct of that test for the full Pocklington
+     * verification. The expensive part, as usual, is made of modpows.
+     *
+     * The Pocklington test needs at least two modpows (one for the
+     * Fermat check, and one per known factor of p-1).
+     *
+     * The prior M-R step needs an unknown number, because we iterate
+     * until we find a value whose order is divisible by the largest
+     * power of 2 that divides p-1, say 2^j. That excludes half the
+     * possible witness values (specifically, the quadratic residues),
+     * so we expect to need on average two M-R operations to find one.
+     * But that's only if the number _is_ prime - as usual, it's also
+     * possible that we hit a non-prime and have to try again.
+     *
+     * So, if we were only estimating the cost of that final step, it
+     * would look a lot like the probabilistic version: we'd have to
+     * estimate the expected total number of modexps by knowing
+     * something about the density of primes among our candidate
+     * integers, and then multiply that by estimate_modexp_cost(bits).
+     * But the problem is that we also have to _find_ a smaller prime,
+     * so we have to recurse.
+     *
+     * In the MAURER_SIMPLE version of the algorithm, you recurse to
+     * any one of a range of possible smaller sizes i, each with
+     * probability proportional to 1/i. So your expected time to
+     * generate an n-bit prime is given by a horrible recurrence of
+     * the form E_n = S_n + (sum E_i/i) / (sum 1/i), in which S_n is
+     * the expected cost of the final step once you have your smaller
+     * primes, and both sums are over ceil(n/2) <= i <= n-20.
+     *
+     * At this point I ran out of effort to actually do the maths
+     * rigorously, so instead I did the empirical experiment of
+     * generating that sequence in Python and plotting it on a graph.
+     * My Python code is here, in case I need it again:
+
+from math import log
+
+alpha = log(3)/log(2) + 1 # exponent for modexp using Karatsuba mult
+
+E = [1] * 16 # assume generating tiny primes is trivial
+
+for n in range(len(E), 4096):
+
+    # Expected time for sub-generations, as a weighted mean of prior
+    # values of the same sequence.
+    lo = (n+1)//2
+    hi = n-20
+    if lo <= hi:
+        subrange = range(lo, hi+1)
+        num = sum(E[i]/i for i in subrange)
+        den = sum(1/i for i in subrange)
+    else:
+        num, den = 0, 1
+
+    # Constant term (cost of final step).
+    # Similar to probprime_add_progress_phase.
+    winnow_factor = 1 if n < 32 else 19.76
+    prob = winnow_factor / (n * log(2))
+    cost = 4 * n**alpha / prob
+
+    E.append(cost + num / den)
+
+for i, p in enumerate(E):
+    try:
+        print(log(i), log(p))
+    except ValueError:
+        continue
+
+     * The output loop prints the logs of both i and E_i, so that when
+     * I plot the resulting data file in gnuplot I get a log-log
+     * diagram. That showed me some early noise and then a very
+     * straight-looking line; feeding the straight part of the graph
+     * to linear-regression analysis reported that it fits the line
+     *
+     *     log E_n = -1.7901825337965498 + 3.6199197179662517 * log(n)
+     * =>      E_n = 0.16692969657466802 * n^3.6199197179662517
+     *
+     * So my somewhat empirical estimate is that Maurer prime
+     * generation costs about 0.167 * bits^3.62, in the same arbitrary
+     * time units used by estimate_modexp_cost.
+     */
+
+    return progress_add_linear(prog, 0.167 * pow(bits, 3.62));
+}
+
+static mp_int *primegen_small(Pockle *pockle, PrimeCandidateSource *pcs)
+{
+    assert(pcs_get_bits(pcs) <= 32);
+
+    pcs_ready(pcs);
+
+    while (true) {
+        mp_int *p = pcs_generate(pcs);
+        if (!p) {
+            pcs_free(pcs);
+            return NULL;
+        }
+        if (pockle_add_small_prime(pockle, p) == POCKLE_OK) {
+            pcs_free(pcs);
+            return p;
+        }
+        mp_free(p);
+    }
+}
+
+#ifdef DEBUG_PRIMEGEN
+static void timestamp(FILE *fp)
+{
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    fprintf(fp, "%lu.%09lu: ", (unsigned long)ts.tv_sec,
+            (unsigned long)ts.tv_nsec);
+}
+static PRINTF_LIKE(1, 2) void debug_f(const char *fmt, ...)
+{
+    va_list ap;
+    va_start(ap, fmt);
+    timestamp(stderr);
+    vfprintf(stderr, fmt, ap);
+    fputc('\n', stderr);
+    va_end(ap);
+}
+static void debug_f_mp(const char *fmt, mp_int *x, ...)
+{
+    va_list ap;
+    va_start(ap, x);
+    timestamp(stderr);
+    vfprintf(stderr, fmt, ap);
+    mp_dump(stderr, "", x, "\n");
+    va_end(ap);
+}
+#else
+#define debug_f(...) ((void)0)
+#define debug_f_mp(...) ((void)0)
+#endif
+
+static double uniform_random_double(void)
+{
+    unsigned char randbuf[8];
+    random_read(randbuf, 8);
+    return GET_64BIT_MSB_FIRST(randbuf) * 0x1.0p-64;
+}
+
+static mp_int *mp_ceil_div(mp_int *n, mp_int *d)
+{
+    mp_int *nplus = mp_add(n, d);
+    mp_sub_integer_into(nplus, nplus, 1);
+    mp_int *toret = mp_div(nplus, d);
+    mp_free(nplus);
+    return toret;
+}
+
+static mp_int *provableprime_generate_inner(
+    ProvablePrimeContext *ppc, PrimeCandidateSource *pcs,
+    ProgressReceiver *prog, double progress_origin, double progress_scale)
+{
+    unsigned bits = pcs_get_bits(pcs);
+    assert(bits > 1);
+
+    if (bits <= 32) {
+        debug_f("ppgi(%u) -> small", bits);
+        return primegen_small(ppc->pockle, pcs);
+    }
+
+    unsigned min_bits_needed, max_bits_needed;
+    {
+        /*
+         * Find the product of all the prime factors we already know
+         * about.
+         */
+        mp_int *size_got = mp_from_integer(1);
+        size_t nfactors;
+        mp_int **factors = pcs_get_known_prime_factors(pcs, &nfactors);
+        for (size_t i = 0; i < nfactors; i++) {
+            mp_int *to_free = size_got;
+            size_got = mp_unsafe_shrink(mp_mul(size_got, factors[i]));
+            mp_free(to_free);
+        }
+
+        /*
+         * Find the largest cofactor we might be able to use, and the
+         * smallest one we can get away with.
+         */
+        mp_int *upperbound = pcs_get_upper_bound(pcs);
+        mp_int *size_needed = mp_nthroot(upperbound, 3, NULL);
+        debug_f_mp("upperbound = ", upperbound);
+        {
+            mp_int *to_free = upperbound;
+            upperbound = mp_unsafe_shrink(mp_div(upperbound, size_got));
+            mp_free(to_free);
+        }
+        debug_f_mp("size_needed = ", size_needed);
+        {
+            mp_int *to_free = size_needed;
+            size_needed = mp_unsafe_shrink(mp_ceil_div(size_needed, size_got));
+            mp_free(to_free);
+        }
+
+        max_bits_needed = pcs_get_bits_remaining(pcs);
+
+        /*
+         * We need a prime that is greater than or equal to
+         * 'size_needed' in order for the product of all our known
+         * factors of p-1 to exceed the cube root of the largest value
+         * p might take.
+         *
+         * Since pcs_new wants a size specified in bits, we must count
+         * the bits in size_needed and then add 1. Otherwise we might
+         * get a value with the same bit count as size_needed but
+         * slightly smaller than it.
+         *
+         * An exception is if size_needed = 1. In that case the
+         * product of existing known factors is _already_ enough, so
+         * we don't need to generate an extra factor at all.
+         */
+        if (mp_hs_integer(size_needed, 2)) {
+            min_bits_needed = mp_get_nbits(size_needed) + 1;
+        } else {
+            min_bits_needed = 0;
+        }
+
+        mp_free(upperbound);
+        mp_free(size_needed);
+        mp_free(size_got);
+    }
+
+    double progress = 0.0;
+
+    if (min_bits_needed) {
+        debug_f("ppgi(%u) recursing, need [%u,%u] more bits",
+                bits, min_bits_needed, max_bits_needed);
+
+        unsigned *sizes = NULL;
+        size_t nsizes = 0, sizesize = 0;
+
+        unsigned real_min = max_bits_needed / 2;
+        unsigned real_max = (max_bits_needed >= 20 ?
+                             max_bits_needed - 20 : 0);
+        if (real_min < min_bits_needed)
+            real_min = min_bits_needed;
+        if (real_max < real_min)
+            real_max = real_min;
+        debug_f("ppgi(%u) revised bits interval = [%u,%u]",
+                bits, real_min, real_max);
+
+        switch (ppc->extra->spp) {
+          case SPP_FAST:
+            /*
+             * Always pick the smallest subsidiary prime we can get
+             * away with: just over n/3 bits.
+             *
+             * This is not a good mode for cryptographic prime
+             * generation, because it skews the distribution of primes
+             * greatly, and worse, it skews them in a direction that
+             * heads away from the properties crypto algorithms tend
+             * to like.
+             *
+             * (For both discrete-log systems and RSA, people have
+             * tended to recommend in the past that p-1 should have a
+             * _large_ factor if possible. There's some disagreement
+             * on which algorithms this is really necessary for, but
+             * certainly I've never seen anyone recommend arranging a
+             * _small_ factor on purpose.)
+             *
+             * I originally implemented this mode because it was
+             * convenient for debugging - it wastes as little time as
+             * possible on finding a sub-prime and lets you get to the
+             * interesting part! And I leave it in the code because it
+             * might still be useful for _something_. Because it's
+             * cryptographically questionable, it's not selectable in
+             * the UI of either version of PuTTYgen proper; but it can
+             * be accessed through testcrypt, and if for some reason a
+             * definite prime is needed for non-crypto purposes, it
+             * may still be the fastest way to put your hands on one.
+             */
+            debug_f("ppgi(%u) fast mode, just ask for %u bits",
+                    bits, min_bits_needed);
+            sgrowarray(sizes, sizesize, nsizes);
+            sizes[nsizes++] = min_bits_needed;
+            break;
+          case SPP_MAURER_SIMPLE: {
+            /*
+             * Select the size of the subsidiary prime at random from
+             * sqrt(outputprime) up to outputprime/2^20, in such a way
+             * that the probability distribution matches that of the
+             * largest prime factor of a random n-bit number.
+             *
+             * Per [MAURER] section 3.4, the cumulative distribution
+             * function of this relative size is 1+log2(x), for x in
+             * [1/2,1]. You can generate a value from the distribution
+             * given by a cdf by applying the inverse cdf to a uniform
+             * value in [0,1]. Simplifying that in this case, what we
+             * have to do is raise 2 to the power of a random real
+             * number between -1 and 0. (And that gives you the number
+             * of _bits_ in the sub-prime, as a factor of the desired
+             * output number of bits.)
+             *
+             * We also require that the subsidiary prime q is at least
+             * 20 bits smaller than the output one, to give us a
+             * fighting chance of there being _any_ prime we can find
+             * such that q | p-1.
+             *
+             * (But these rules have to be applied in an order that
+             * still leaves us _some_ interval of possible sizes we
+             * can pick!)
+             */
+          maurer_simple:
+            debug_f("ppgi(%u) Maurer simple mode", bits);
+
+            unsigned sub_bits;
+            do {
+                double uniform = uniform_random_double();
+                sub_bits = real_max * pow(2.0, uniform - 1) + 0.5;
+                debug_f(" ... %.6f -> %u?", uniform, sub_bits);
+            } while (!(real_min <= sub_bits && sub_bits <= real_max));
+
+            debug_f("ppgi(%u) asking for %u bits", bits, sub_bits);
+            sgrowarray(sizes, sizesize, nsizes);
+            sizes[nsizes++] = sub_bits;
+
+            break;
+          }
+          case SPP_MAURER_COMPLEX: {
+            /*
+             * In this mode, we may generate multiple factors of p-1
+             * which between them add up to at least n/2 bits, in such
+             * a way that those are guaranteed to be the largest
+             * factors of p-1 and that they have the same probability
+             * distribution as the largest k factors would have in a
+             * random integer. The idea is that this more elaborate
+             * procedure gets as close as possible to the same
+             * probability distribution you'd get by selecting a
+             * completely random prime (if you feasibly could).
+             *
+             * Algorithm from Appendix 1 of [MAURER]: we generate
+             * random real numbers that sum to at most 1, by choosing
+             * each one uniformly from the range [0, 1 - sum of all
+             * the previous ones]. We maintain them in a list in
+             * decreasing order, and we stop as soon as we find an
+             * initial subsequence of the list s_1,...,s_r such that
+             * s_1 + ... + s_{r-1} + 2 s_r > 1. In particular, this
+             * guarantees that the sum of that initial subsequence is
+             * at least 1/2, so we end up with enough factors to
+             * satisfy Pocklington.
+             */
+
+            if (max_bits_needed / 2 + 1 > real_max) {
+                /* Early exit path in the case where this algorithm
+                 * can't possibly generate a value in the range we
+                 * need. In that situation, fall back to Maurer
+                 * simple. */
+                debug_f("ppgi(%u) skipping GenerateSizeList, "
+                        "real_max too small", bits);
+                goto maurer_simple;    /* sorry! */
+            }
+
+            double *s = NULL;
+            size_t ns, ssize = 0;
+
+            while (true) {
+                debug_f("ppgi(%u) starting GenerateSizeList", bits);
+                ns = 0;
+                double range = 1.0;
+                while (true) {
+                    /* Generate the next number */
+                    double u = uniform_random_double() * range;
+                    range -= u;
+                    debug_f("  u_%"SIZEu" = %g", ns, u);
+
+                    /* Insert it in the list */
+                    sgrowarray(s, ssize, ns);
+                    size_t i;
+                    for (i = ns; i > 0 && s[i-1] < u; i--)
+                        s[i] = s[i-1];
+                    s[i] = u;
+                    ns++;
+                    debug_f("    inserting as s[%"SIZEu"]", i);
+
+                    /* Look for a suitable initial subsequence */
+                    double sum = 0;
+                    for (i = 0; i < ns; i++) {
+                        sum += s[i];
+                        if (sum + s[i] > 1.0) {
+                            debug_f("  s[0..%"SIZEu"] works!", i);
+
+                            /* Truncate the sequence here, and stop
+                             * generating random real numbers. */
+                            ns = i+1;
+                            goto got_list;
+                        }
+                    }
+                }
+
+              got_list:;
+                /*
+                 * Now translate those real numbers into actual bit
+                 * counts, and do a last-minute check to make sure
+                 * their product is going to be in range.
+                 *
+                 * We have to check both the min and max sizes of the
+                 * total. A b-bit number is in [2^{b-1},2^b). So the
+                 * product of numbers of sizes b_1,...,b_k is at least
+                 * 2^{\sum (b_i-1)}, and less than 2^{\sum b_i}.
+                 */
+                nsizes = 0;
+
+                unsigned min_total = 0, max_total = 0;
+
+                for (size_t i = 0; i < ns; i++) {
+                    /* These sizes are measured in actual entropy, so
+                     * add 1 bit each time to account for the
+                     * zero-information leading 1 */
+                    unsigned this_size = max_bits_needed * s[i] + 1;
+                    debug_f("  bits[%"SIZEu"] = %u", i, this_size);
+                    sgrowarray(sizes, sizesize, nsizes);
+                    sizes[nsizes++] = this_size;
+
+                    min_total += this_size - 1;
+                    max_total += this_size;
+                }
+
+                debug_f("  total bits = [%u,%u)", min_total, max_total);
+                if (min_total < real_min || max_total > real_max+1) {
+                    debug_f("  total out of range, try again");
+                } else {
+                    debug_f("  success! %"SIZEu" sub-primes totalling [%u,%u) "
+                            "bits", nsizes, min_total, max_total);
+                    break;
+                }
+            }
+
+            smemclr(s, ssize * sizeof(*s));
+            sfree(s);
+            break;
+          }
+          default:
+            unreachable("bad subprime policy");
+        }
+
+        for (size_t i = 0; i < nsizes; i++) {
+            unsigned sub_bits = sizes[i];
+            double progress_in_this_prime = (double)sub_bits / bits;
+            mp_int *q = provableprime_generate_inner(
+                ppc, pcs_new(sub_bits),
+                prog, progress_origin + progress_scale * progress,
+                progress_scale * progress_in_this_prime);
+            progress += progress_in_this_prime;
+            assert(q);
+            debug_f_mp("ppgi(%u) got factor ", q, bits);
+            pcs_require_residue_1_mod_prime(pcs, q);
+            mp_free(q);
+        }
+
+        smemclr(sizes, sizesize * sizeof(*sizes));
+        sfree(sizes);
+    } else {
+        debug_f("ppgi(%u) no need to recurse", bits);
+    }
+
+    debug_f("ppgi(%u) ready, %u bits remaining",
+            bits, pcs_get_bits_remaining(pcs));
+    pcs_ready(pcs);
+
+    while (true) {
+        mp_int *p = pcs_generate(pcs);
+        if (!p) {
+            pcs_free(pcs);
+            return NULL;
+        }
+
+        debug_f_mp("provable_step p=", p);
+
+        MillerRabin *mr = miller_rabin_new(p);
+        debug_f("provable_step mr setup done");
+        mp_int *witness = miller_rabin_find_potential_primitive_root(mr);
+        miller_rabin_free(mr);
+
+        if (!witness) {
+            debug_f("provable_step mr failed");
+            mp_free(p);
+            continue;
+        }
+
+        size_t nfactors;
+        mp_int **factors = pcs_get_known_prime_factors(pcs, &nfactors);
+        PockleStatus st = pockle_add_prime(
+            ppc->pockle, p, factors, nfactors, witness);
+
+        if (st != POCKLE_OK) {
+            debug_f("provable_step proof failed %d", (int)st);
+
+            /*
+             * Check by assertion that the error status is not one of
+             * the ones we ought to have ruled out already by
+             * construction. If there's a bug in this code that means
+             * we can _never_ pass this test (e.g. picking products of
+             * factors that never quite reach cbrt(n)), we'd rather
+             * fail an assertion than loop forever.
+             */
+            assert(st == POCKLE_DISCRIMINANT_IS_SQUARE ||
+                   st == POCKLE_WITNESS_POWER_IS_1 ||
+                   st == POCKLE_WITNESS_POWER_NOT_COPRIME);
+
+            mp_free(p);
+            if (witness)
+                mp_free(witness);
+            continue;
+        }
+
+        mp_free(witness);
+        pcs_free(pcs);
+        debug_f_mp("ppgi(%u) done, got ", p, bits);
+        progress_report(prog, progress_origin + progress_scale);
+        return p;
+    }
+}
+
+static mp_int *provableprime_generate(
+    PrimeGenerationContext *ctx,
+    PrimeCandidateSource *pcs, ProgressReceiver *prog)
+{
+    ProvablePrimeContext *ppc = container_of(ctx, ProvablePrimeContext, pgc);
+    mp_int *p = provableprime_generate_inner(ppc, pcs, prog, 0.0, 1.0);
+
+    return p;
+}
+
+static inline strbuf *provableprime_mpu_certificate(
+    PrimeGenerationContext *ctx, mp_int *p)
+{
+    ProvablePrimeContext *ppc = container_of(ctx, ProvablePrimeContext, pgc);
+    return pockle_mpu(ppc->pockle, p);
+}
+
+#define DECLARE_POLICY(name, policy)                                    \
+    static const struct ProvablePrimePolicyExtra                        \
+        pppextra_##name = {policy};                                     \
+    const PrimeGenerationPolicy name = {                                \
+        provableprime_add_progress_phase,                               \
+        provableprime_new_context,                                      \
+        provableprime_free_context,                                     \
+        provableprime_generate,                                         \
+        provableprime_mpu_certificate,                                  \
+        &pppextra_##name,                                               \
+    }
+
+DECLARE_POLICY(primegen_provable_fast, SPP_FAST);
+DECLARE_POLICY(primegen_provable_maurer_simple, SPP_MAURER_SIMPLE);
+DECLARE_POLICY(primegen_provable_maurer_complex, SPP_MAURER_COMPLEX);
+
+/* ----------------------------------------------------------------------
+ * Reusable null implementation of the progress-reporting API.
+ */
+
+static inline ProgressPhase null_progress_add(void) {
+    ProgressPhase ph = { .n = 0 };
+    return ph;
+}
+ProgressPhase null_progress_add_linear(
+    ProgressReceiver *prog, double c) { return null_progress_add(); }
+ProgressPhase null_progress_add_probabilistic(
+    ProgressReceiver *prog, double c, double p) { return null_progress_add(); }
+void null_progress_ready(ProgressReceiver *prog) {}
+void null_progress_start_phase(ProgressReceiver *prog, ProgressPhase phase) {}
+void null_progress_report(ProgressReceiver *prog, double progress) {}
+void null_progress_report_attempt(ProgressReceiver *prog) {}
+void null_progress_report_phase_complete(ProgressReceiver *prog) {}
+const ProgressReceiverVtable null_progress_vt = {
+    .add_linear = null_progress_add_linear,
+    .add_probabilistic = null_progress_add_probabilistic,
+    .ready = null_progress_ready,
+    .start_phase = null_progress_start_phase,
+    .report = null_progress_report,
+    .report_attempt = null_progress_report_attempt,
+    .report_phase_complete = null_progress_report_phase_complete,
+};
+
+/* ----------------------------------------------------------------------
+ * Helper function for progress estimation.
+ */
+
+double estimate_modexp_cost(unsigned bits)
+{
+    /*
+     * A modexp of n bits goes roughly like O(n^2.58), on the grounds
+     * that our modmul is O(n^1.58) (Karatsuba) and you need O(n) of
+     * them in a modexp.
+     */
+    return pow(bits, 2.58);
+}
--- a/keygen/primecandidate.c
+++ b/keygen/primecandidate.c
@ -0,0 +1,445 @@
+/*
+ * primecandidate.c: implementation of the PrimeCandidateSource
+ * abstraction declared in sshkeygen.h.
+ */
+
+#include <assert.h>
+#include "ssh.h"
+#include "mpint.h"
+#include "mpunsafe.h"
+#include "sshkeygen.h"
+
+struct avoid {
+    unsigned mod, res;
+};
+
+struct PrimeCandidateSource {
+    unsigned bits;
+    bool ready, try_sophie_germain;
+    bool one_shot, thrown_away_my_shot;
+
+    /* We'll start by making up a random number strictly less than this ... */
+    mp_int *limit;
+
+    /* ... then we'll multiply by 'factor', and add 'addend'. */
+    mp_int *factor, *addend;
+
+    /* Then we'll try to add a small multiple of 'factor' to it to
+     * avoid it being a multiple of any small prime. Also, for RSA, we
+     * may need to avoid it being _this_ multiple of _this_: */
+    unsigned avoid_residue, avoid_modulus;
+
+    /* Once we're actually running, this will be the complete list of
+     * (modulus, residue) pairs we want to avoid. */
+    struct avoid *avoids;
+    size_t navoids, avoidsize;
+
+    /* List of known primes that our number will be congruent to 1 modulo */
+    mp_int **kps;
+    size_t nkps, kpsize;
+};
+
+PrimeCandidateSource *pcs_new_with_firstbits(unsigned bits,
+                                             unsigned first, unsigned nfirst)
+{
+    PrimeCandidateSource *s = snew(PrimeCandidateSource);
+
+    assert(first >> (nfirst-1) == 1);
+
+    s->bits = bits;
+    s->ready = false;
+    s->try_sophie_germain = false;
+    s->one_shot = false;
+    s->thrown_away_my_shot = false;
+
+    s->kps = NULL;
+    s->nkps = s->kpsize = 0;
+
+    s->avoids = NULL;
+    s->navoids = s->avoidsize = 0;
+
+    /* Make the number that's the lower limit of our range */
+    mp_int *firstmp = mp_from_integer(first);
+    mp_int *base = mp_lshift_fixed(firstmp, bits - nfirst);
+    mp_free(firstmp);
+
+    /* Set the low bit of that, because all (nontrivial) primes are odd */
+    mp_set_bit(base, 0, 1);
+
+    /* That's our addend. Now initialise factor to 2, to ensure we
+     * only generate odd numbers */
+    s->factor = mp_from_integer(2);
+    s->addend = base;
+
+    /* And that means the limit of our random numbers must be one
+     * factor of two _less_ than the position of the low bit of
+     * 'first', because we'll be multiplying the random number by
+     * 2 immediately afterwards. */
+    s->limit = mp_power_2(bits - nfirst - 1);
+
+    /* avoid_modulus == 0 signals that there's no extra residue to avoid */
+    s->avoid_residue = 1;
+    s->avoid_modulus = 0;
+
+    return s;
+}
+
+PrimeCandidateSource *pcs_new(unsigned bits)
+{
+    return pcs_new_with_firstbits(bits, 1, 1);
+}
+
+void pcs_free(PrimeCandidateSource *s)
+{
+    mp_free(s->limit);
+    mp_free(s->factor);
+    mp_free(s->addend);
+    for (size_t i = 0; i < s->nkps; i++)
+        mp_free(s->kps[i]);
+    sfree(s->avoids);
+    sfree(s->kps);
+    sfree(s);
+}
+
+void pcs_try_sophie_germain(PrimeCandidateSource *s)
+{
+    s->try_sophie_germain = true;
+}
+
+void pcs_set_oneshot(PrimeCandidateSource *s)
+{
+    s->one_shot = true;
+}
+
+static void pcs_require_residue_inner(PrimeCandidateSource *s,
+                                      mp_int *mod, mp_int *res)
+{
+    /*
+     * We already have a factor and addend. Ensure this one doesn't
+     * contradict it.
+     */
+    mp_int *gcd = mp_gcd(mod, s->factor);
+    mp_int *test1 = mp_mod(s->addend, gcd);
+    mp_int *test2 = mp_mod(res, gcd);
+    assert(mp_cmp_eq(test1, test2));
+    mp_free(test1);
+    mp_free(test2);
+
+    /*
+     * Reduce our input factor and addend, which are constraints on
+     * the ultimate output number, so that they're constraints on the
+     * initial cofactor we're going to make up.
+     *
+     * If we're generating x and we want to ensure ax+b == r (mod m),
+     * how does that work? We've already checked that b == r modulo g
+     * = gcd(a,m), i.e. r-b is a multiple of g, and so are a and m. So
+     * let's write a=gA, m=gM, (r-b)=gR, and then we can start by
+     * dividing that off:
+     *
+     *      ax == r-b (mod m )
+     * =>  gAx == gR  (mod gM)
+     * =>   Ax ==  R  (mod  M)
+     *
+     * Now the moduli A,M are coprime, which makes things easier.
+     *
+     * We're going to need to generate the x in this equation by
+     * generating a new smaller value y, multiplying it by M, and
+     * adding some constant K. So we have x = My + K, and we need to
+     * work out what K will satisfy the above equation. In other
+     * words, we need A(My+K) == R (mod M), and the AMy term vanishes,
+     * so we just need AK == R (mod M). So our congruence is solved by
+     * setting K to be R * A^{-1} mod M.
+     */
+    mp_int *A = mp_div(s->factor, gcd);
+    mp_int *M = mp_div(mod, gcd);
+    mp_int *Rpre = mp_modsub(res, s->addend, mod);
+    mp_int *R = mp_div(Rpre, gcd);
+    mp_int *Ainv = mp_invert(A, M);
+    mp_int *K = mp_modmul(R, Ainv, M);
+
+    mp_free(gcd);
+    mp_free(Rpre);
+    mp_free(Ainv);
+    mp_free(A);
+    mp_free(R);
+
+    /*
+     * So we know we have to transform our existing (factor, addend)
+     * pair into (factor * M, addend * factor * K). Now we just need
+     * to work out what the limit should be on the random value we're
+     * generating.
+     *
+     * If we need My+K < old_limit, then y < (old_limit-K)/M. But the
+     * RHS is a fraction, so in integers, we need y < ceil of it.
+     */
+    assert(!mp_cmp_hs(K, s->limit));
+    mp_int *dividend = mp_add(s->limit, M);
+    mp_sub_integer_into(dividend, dividend, 1);
+    mp_sub_into(dividend, dividend, K);
+    mp_free(s->limit);
+    s->limit = mp_div(dividend, M);
+    mp_free(dividend);
+
+    /*
+     * Now just update the real factor and addend, and we're done.
+     */
+
+    mp_int *addend_old = s->addend;
+    mp_int *tmp = mp_mul(s->factor, K); /* use the _old_ value of factor */
+    s->addend = mp_add(s->addend, tmp);
+    mp_free(tmp);
+    mp_free(addend_old);
+
+    mp_int *factor_old = s->factor;
+    s->factor = mp_mul(s->factor, M);
+    mp_free(factor_old);
+
+    mp_free(M);
+    mp_free(K);
+    s->factor = mp_unsafe_shrink(s->factor);
+    s->addend = mp_unsafe_shrink(s->addend);
+    s->limit = mp_unsafe_shrink(s->limit);
+}
+
+void pcs_require_residue(PrimeCandidateSource *s,
+                         mp_int *mod, mp_int *res_orig)
+{
+    /*
+     * Reduce the input residue to its least non-negative value, in
+     * case it was given as a larger equivalent value.
+     */
+    mp_int *res_reduced = mp_mod(res_orig, mod);
+    pcs_require_residue_inner(s, mod, res_reduced);
+    mp_free(res_reduced);
+}
+
+void pcs_require_residue_1(PrimeCandidateSource *s, mp_int *mod)
+{
+    mp_int *res = mp_from_integer(1);
+    pcs_require_residue(s, mod, res);
+    mp_free(res);
+}
+
+void pcs_require_residue_1_mod_prime(PrimeCandidateSource *s, mp_int *mod)
+{
+    pcs_require_residue_1(s, mod);
+
+    sgrowarray(s->kps, s->kpsize, s->nkps);
+    s->kps[s->nkps++] = mp_copy(mod);
+}
+
+void pcs_avoid_residue_small(PrimeCandidateSource *s,
+                             unsigned mod, unsigned res)
+{
+    assert(!s->avoid_modulus);         /* can't cope with more than one */
+    s->avoid_modulus = mod;
+    s->avoid_residue = res % mod;      /* reduce, just in case */
+}
+
+static int avoid_cmp(const void *av, const void *bv)
+{
+    const struct avoid *a = (const struct avoid *)av;
+    const struct avoid *b = (const struct avoid *)bv;
+    return a->mod < b->mod ? -1 : a->mod > b->mod ? +1 : 0;
+}
+
+static uint64_t invert(uint64_t a, uint64_t m)
+{
+    int64_t v0 = a, i0 = 1;
+    int64_t v1 = m, i1 = 0;
+    while (v0) {
+        int64_t tmp, q = v1 / v0;
+        tmp = v0; v0 = v1 - q*v0; v1 = tmp;
+        tmp = i0; i0 = i1 - q*i0; i1 = tmp;
+    }
+    assert(v1 == 1 || v1 == -1);
+    return i1 * v1;
+}
+
+void pcs_ready(PrimeCandidateSource *s)
+{
+    /*
+     * List all the small (modulus, residue) pairs we want to avoid.
+     */
+
+    init_smallprimes();
+
+#define ADD_AVOID(newmod, newres) do {                          \
+        sgrowarray(s->avoids, s->avoidsize, s->navoids);        \
+        s->avoids[s->navoids].mod = (newmod);                   \
+        s->avoids[s->navoids].res = (newres);                   \
+        s->navoids++;                                           \
+    } while (0)
+
+    unsigned limit = (mp_hs_integer(s->addend, 65536) ? 65536 :
+                      mp_get_integer(s->addend));
+
+    /*
+     * Don't be divisible by any small prime, or at least, any prime
+     * smaller than our output number might actually manage to be. (If
+     * asked to generate a really small prime, it would be
+     * embarrassing to rule out legitimate answers on the grounds that
+     * they were divisible by themselves.)
+     */
+    for (size_t i = 0; i < NSMALLPRIMES && smallprimes[i] < limit; i++)
+        ADD_AVOID(smallprimes[i], 0);
+
+    if (s->try_sophie_germain) {
+        /*
+         * If we're aiming to generate a Sophie Germain prime (i.e. p
+         * such that 2p+1 is also prime), then we also want to ensure
+         * 2p+1 is not congruent to 0 mod any small prime, because if
+         * it is, we'll waste a lot of time generating a p for which
+         * 2p+1 can't possibly work. So we have to avoid an extra
+         * residue mod each odd q.
+         *
+         * We can simplify:        2p+1 ==  0      (mod q)
+         *                    =>     2p == -1      (mod q)
+         *                    =>      p == -2^{-1} (mod q)
+         *
+         * There's no need to do Euclid's algorithm to compute those
+         * inverses, because for any odd q, the modular inverse of -2
+         * mod q is just (q-1)/2. (Proof: multiplying it by -2 gives
+         * 1-q, which is congruent to 1 mod q.)
+         */
+        for (size_t i = 0; i < NSMALLPRIMES && smallprimes[i] < limit; i++)
+            if (smallprimes[i] != 2)
+                ADD_AVOID(smallprimes[i], (smallprimes[i] - 1) / 2);
+    }
+
+    /*
+     * Finally, if there's a particular modulus and residue we've been
+     * told to avoid, put it on the list.
+     */
+    if (s->avoid_modulus)
+        ADD_AVOID(s->avoid_modulus, s->avoid_residue);
+
+#undef ADD_AVOID
+
+    /*
+     * Sort our to-avoid list by modulus. Partly this is so that we'll
+     * check the smaller moduli first during the live runs, which lets
+     * us spot most failing cases earlier rather than later. Also, it
+     * brings equal moduli together, so that we can reuse the residue
+     * we computed from a previous one.
+     */
+    qsort(s->avoids, s->navoids, sizeof(*s->avoids), avoid_cmp);
+
+    /*
+     * Next, adjust each of these moduli to take account of our factor
+     * and addend. If we want factor*x+addend to avoid being congruent
+     * to 'res' modulo 'mod', then x itself must avoid being congruent
+     * to (res - addend) * factor^{-1}.
+     *
+     * If factor == 0 modulo mod, then the answer will have a fixed
+     * residue anyway, so we can discard it from our list to test.
+     */
+    int64_t factor_m = 0, addend_m = 0, last_mod = 0;
+
+    size_t out = 0;
+    for (size_t i = 0; i < s->navoids; i++) {
+        int64_t mod = s->avoids[i].mod, res = s->avoids[i].res;
+        if (mod != last_mod) {
+            last_mod = mod;
+            addend_m = mp_unsafe_mod_integer(s->addend, mod);
+            factor_m = mp_unsafe_mod_integer(s->factor, mod);
+        }
+
+        if (factor_m == 0) {
+            assert(res != addend_m);
+            continue;
+        }
+
+        res = (res - addend_m) * invert(factor_m, mod);
+        res %= mod;
+        if (res < 0)
+            res += mod;
+
+        s->avoids[out].mod = mod;
+        s->avoids[out].res = res;
+        out++;
+    }
+
+    s->navoids = out;
+
+    s->ready = true;
+}
+
+mp_int *pcs_generate(PrimeCandidateSource *s)
+{
+    assert(s->ready);
+    if (s->one_shot) {
+        if (s->thrown_away_my_shot)
+            return NULL;
+        s->thrown_away_my_shot = true;
+    }
+
+    while (true) {
+        mp_int *x = mp_random_upto(s->limit);
+
+        int64_t x_res = 0, last_mod = 0;
+        bool ok = true;
+
+        for (size_t i = 0; i < s->navoids; i++) {
+            int64_t mod = s->avoids[i].mod, avoid_res = s->avoids[i].res;
+
+            if (mod != last_mod) {
+                last_mod = mod;
+                x_res = mp_unsafe_mod_integer(x, mod);
+            }
+
+            if (x_res == avoid_res) {
+                ok = false;
+                break;
+            }
+        }
+
+        if (!ok) {
+            mp_free(x);
+            continue; /* try a new x */
+        }
+
+        /*
+         * We've found a viable x. Make the final output value.
+         */
+        mp_int *toret = mp_new(s->bits);
+        mp_mul_into(toret, x, s->factor);
+        mp_add_into(toret, toret, s->addend);
+        mp_free(x);
+        return toret;
+    }
+}
+
+void pcs_inspect(PrimeCandidateSource *pcs, mp_int **limit_out,
+                 mp_int **factor_out, mp_int **addend_out)
+{
+    *limit_out = mp_copy(pcs->limit);
+    *factor_out = mp_copy(pcs->factor);
+    *addend_out = mp_copy(pcs->addend);
+}
+
+unsigned pcs_get_bits(PrimeCandidateSource *pcs)
+{
+    return pcs->bits;
+}
+
+unsigned pcs_get_bits_remaining(PrimeCandidateSource *pcs)
+{
+    return mp_get_nbits(pcs->limit);
+}
+
+mp_int *pcs_get_upper_bound(PrimeCandidateSource *pcs)
+{
+    /* Compute (limit-1) * factor + addend */
+    mp_int *tmp = mp_mul(pcs->limit, pcs->factor);
+    mp_int *bound = mp_add(tmp, pcs->addend);
+    mp_free(tmp);
+    mp_sub_into(bound, bound, pcs->factor);
+    return bound;
+}
+
+mp_int **pcs_get_known_prime_factors(PrimeCandidateSource *pcs, size_t *nout)
+{
+    *nout = pcs->nkps;
+    return pcs->kps;
+}
--- a/keygen/rsa.c
+++ b/keygen/rsa.c
@ -0,0 +1,292 @@
+/*
+ * RSA key generation.
+ */
+
+#include <assert.h>
+
+#include "ssh.h"
+#include "sshkeygen.h"
+#include "mpint.h"
+
+#define RSA_EXPONENT 65537
+
+#define NFIRSTBITS 13
+static void invent_firstbits(unsigned *one, unsigned *two,
+                             unsigned min_separation);
+
+typedef struct RSAPrimeDetails RSAPrimeDetails;
+struct RSAPrimeDetails {
+    bool strong;
+    int bits, bitsm1m1, bitsm1, bitsp1;
+    unsigned firstbits;
+    ProgressPhase phase_main, phase_m1m1, phase_m1, phase_p1;
+};
+
+#define STRONG_MARGIN (20 + NFIRSTBITS)
+
+static RSAPrimeDetails setup_rsa_prime(
+    int bits, bool strong, PrimeGenerationContext *pgc, ProgressReceiver *prog)
+{
+    RSAPrimeDetails pd;
+    pd.bits = bits;
+    if (strong) {
+        pd.bitsm1 = (bits - STRONG_MARGIN) / 2;
+        pd.bitsp1 = (bits - STRONG_MARGIN) - pd.bitsm1;
+        pd.bitsm1m1 = (pd.bitsm1 - STRONG_MARGIN) / 2;
+        if (pd.bitsm1m1 < STRONG_MARGIN) {
+            /* Absurdly small prime, but we should at least not crash. */
+            strong = false;
+        }
+    }
+    pd.strong = strong;
+
+    if (pd.strong) {
+        pd.phase_m1m1 = primegen_add_progress_phase(pgc, prog, pd.bitsm1m1);
+        pd.phase_m1 = primegen_add_progress_phase(pgc, prog, pd.bitsm1);
+        pd.phase_p1 = primegen_add_progress_phase(pgc, prog, pd.bitsp1);
+    }
+    pd.phase_main = primegen_add_progress_phase(pgc, prog, pd.bits);
+
+    return pd;
+}
+
+static mp_int *generate_rsa_prime(
+    RSAPrimeDetails pd, PrimeGenerationContext *pgc, ProgressReceiver *prog)
+{
+    mp_int *m1m1 = NULL, *m1 = NULL, *p1 = NULL, *p = NULL;
+    PrimeCandidateSource *pcs;
+
+    if (pd.strong) {
+        progress_start_phase(prog, pd.phase_m1m1);
+        pcs = pcs_new_with_firstbits(pd.bitsm1m1, pd.firstbits, NFIRSTBITS);
+        m1m1 = primegen_generate(pgc, pcs, prog);
+        progress_report_phase_complete(prog);
+
+        progress_start_phase(prog, pd.phase_m1);
+        pcs = pcs_new_with_firstbits(pd.bitsm1, pd.firstbits, NFIRSTBITS);
+        pcs_require_residue_1_mod_prime(pcs, m1m1);
+        m1 = primegen_generate(pgc, pcs, prog);
+        progress_report_phase_complete(prog);
+
+        progress_start_phase(prog, pd.phase_p1);
+        pcs = pcs_new_with_firstbits(pd.bitsp1, pd.firstbits, NFIRSTBITS);
+        p1 = primegen_generate(pgc, pcs, prog);
+        progress_report_phase_complete(prog);
+    }
+
+    progress_start_phase(prog, pd.phase_main);
+    pcs = pcs_new_with_firstbits(pd.bits, pd.firstbits, NFIRSTBITS);
+    pcs_avoid_residue_small(pcs, RSA_EXPONENT, 1);
+    if (pd.strong) {
+        pcs_require_residue_1_mod_prime(pcs, m1);
+        mp_int *p1_minus_1 = mp_copy(p1);
+        mp_sub_integer_into(p1_minus_1, p1, 1);
+        pcs_require_residue(pcs, p1, p1_minus_1);
+        mp_free(p1_minus_1);
+    }
+    p = primegen_generate(pgc, pcs, prog);
+    progress_report_phase_complete(prog);
+
+    if (m1m1)
+        mp_free(m1m1);
+    if (m1)
+        mp_free(m1);
+    if (p1)
+        mp_free(p1);
+
+    return p;
+}
+
+int rsa_generate(RSAKey *key, int bits, bool strong,
+                 PrimeGenerationContext *pgc, ProgressReceiver *prog)
+{
+    key->sshk.vt = &ssh_rsa;
+
+    /*
+     * We don't generate e; we just use a standard one always.
+     */
+    mp_int *exponent = mp_from_integer(RSA_EXPONENT);
+
+    /*
+     * Generate p and q: primes with combined length `bits', not
+     * congruent to 1 modulo e. (Strictly speaking, we wanted (p-1)
+     * and e to be coprime, and (q-1) and e to be coprime, but in
+     * general that's slightly more fiddly to arrange. By choosing
+     * a prime e, we can simplify the criterion.)
+     *
+     * We give a min_separation of 2 to invent_firstbits(), ensuring
+     * that the two primes won't be very close to each other. (The
+     * chance of them being _dangerously_ close is negligible - even
+     * more so than an attacker guessing a whole 256-bit session key -
+     * but it doesn't cost much to make sure.)
+     */
+    int qbits = bits / 2;
+    int pbits = bits - qbits;
+    assert(pbits >= qbits);
+
+    RSAPrimeDetails pd = setup_rsa_prime(pbits, strong, pgc, prog);
+    RSAPrimeDetails qd = setup_rsa_prime(qbits, strong, pgc, prog);
+    progress_ready(prog);
+
+    invent_firstbits(&pd.firstbits, &qd.firstbits, 2);
+
+    mp_int *p = generate_rsa_prime(pd, pgc, prog);
+    mp_int *q = generate_rsa_prime(qd, pgc, prog);
+
+    /*
+     * Ensure p > q, by swapping them if not.
+     *
+     * We only need to do this if the two primes were generated with
+     * the same number of bits (i.e. if the requested key size is
+     * even) - otherwise it's already guaranteed!
+     */
+    if (pbits == qbits) {
+        mp_cond_swap(p, q, mp_cmp_hs(q, p));
+    } else {
+        assert(mp_cmp_hs(p, q));
+    }
+
+    /*
+     * Now we have p, q and e. All we need to do now is work out
+     * the other helpful quantities: n=pq, d=e^-1 mod (p-1)(q-1),
+     * and (q^-1 mod p).
+     */
+    mp_int *modulus = mp_mul(p, q);
+    mp_int *pm1 = mp_copy(p);
+    mp_sub_integer_into(pm1, pm1, 1);
+    mp_int *qm1 = mp_copy(q);
+    mp_sub_integer_into(qm1, qm1, 1);
+    mp_int *phi_n = mp_mul(pm1, qm1);
+    mp_free(pm1);
+    mp_free(qm1);
+    mp_int *private_exponent = mp_invert(exponent, phi_n);
+    mp_free(phi_n);
+    mp_int *iqmp = mp_invert(q, p);
+
+    /*
+     * Populate the returned structure.
+     */
+    key->modulus = modulus;
+    key->exponent = exponent;
+    key->private_exponent = private_exponent;
+    key->p = p;
+    key->q = q;
+    key->iqmp = iqmp;
+
+    key->bits = mp_get_nbits(modulus);
+    key->bytes = (key->bits + 7) / 8;
+
+    return 1;
+}
+
+/*
+ * Invent a pair of values suitable for use as the 'firstbits' values
+ * for the two RSA primes, such that their product is at least 2, and
+ * such that their difference is also at least min_separation.
+ *
+ * This is used for generating RSA keys which have exactly the
+ * specified number of bits rather than one fewer - if you generate an
+ * a-bit and a b-bit number completely at random and multiply them
+ * together, you could end up with either an (ab-1)-bit number or an
+ * (ab)-bit number. The former happens log(2)*2-1 of the time (about
+ * 39%) and, though actually harmless, every time it occurs it has a
+ * non-zero probability of sparking a user email along the lines of
+ * 'Hey, I asked PuTTYgen for a 2048-bit key and I only got 2047 bits!
+ * Bug!'
+ */
+static inline unsigned firstbits_b_min(
+    unsigned a, unsigned lo, unsigned hi, unsigned min_separation)
+{
+    /* To get a large enough product, b must be at least this much */
+    unsigned b_min = (2*lo*lo + a - 1) / a;
+    /* Now enforce a<b, optionally with minimum separation */
+    if (b_min < a + min_separation)
+        b_min = a + min_separation;
+    /* And cap at the upper limit */
+    if (b_min > hi)
+        b_min = hi;
+    return b_min;
+}
+
+static void invent_firstbits(unsigned *one, unsigned *two,
+                             unsigned min_separation)
+{
+    /*
+     * We'll pick 12 initial bits (number selected at random) for each
+     * prime, not counting the leading 1. So we want to return two
+     * values in the range [2^12,2^13) whose product is at least 2^25.
+     *
+     * Strategy: count up all the viable pairs, then select a random
+     * number in that range and use it to pick a pair.
+     *
+     * To keep things simple, we'll ensure a < b, and randomly swap
+     * them at the end.
+     */
+    const unsigned lo = 1<<12, hi = 1<<13, minproduct = 2*lo*lo;
+    unsigned a, b;
+
+    /*
+     * Count up the number of prefixes of b that would be valid for
+     * each prefix of a.
+     */
+    mp_int *total = mp_new(32);
+    for (a = lo; a < hi; a++) {
+        unsigned b_min = firstbits_b_min(a, lo, hi, min_separation);
+        mp_add_integer_into(total, total, hi - b_min);
+    }
+
+    /*
+     * Make up a random number in the range [0,2*total).
+     */
+    mp_int *mlo = mp_from_integer(0), *mhi = mp_new(32);
+    mp_lshift_fixed_into(mhi, total, 1);
+    mp_int *randval = mp_random_in_range(mlo, mhi);
+    mp_free(mlo);
+    mp_free(mhi);
+
+    /*
+     * Use the low bit of randval as our swap indicator, leaving the
+     * rest of it in the range [0,total).
+     */
+    unsigned swap = mp_get_bit(randval, 0);
+    mp_rshift_fixed_into(randval, randval, 1);
+
+    /*
+     * Now do the same counting loop again to make the actual choice.
+     */
+    a = b = 0;
+    for (unsigned a_candidate = lo; a_candidate < hi; a_candidate++) {
+        unsigned b_min = firstbits_b_min(a_candidate, lo, hi, min_separation);
+        unsigned limit = hi - b_min;
+
+        unsigned b_candidate = b_min + mp_get_integer(randval);
+        unsigned use_it = 1 ^ mp_hs_integer(randval, limit);
+        a ^= (a ^ a_candidate) & -use_it;
+        b ^= (b ^ b_candidate) & -use_it;
+
+        mp_sub_integer_into(randval, randval, limit);
+    }
+
+    mp_free(randval);
+    mp_free(total);
+
+    /*
+     * Check everything came out right.
+     */
+    assert(lo <= a);
+    assert(a < hi);
+    assert(lo <= b);
+    assert(b < hi);
+    assert(a * b >= minproduct);
+    assert(b >= a + min_separation);
+
+    /*
+     * Last-minute optional swap of a and b.
+     */
+    unsigned diff = (a ^ b) & (-swap);
+    a ^= diff;
+    b ^= diff;
+
+    *one = a;
+    *two = b;
+}
--- a/keygen/smallprimes.c
+++ b/keygen/smallprimes.c
@ -0,0 +1,44 @@
+/*
+ * smallprimes.c: implementation of the array of small primes defined
+ * in sshkeygen.h.
+ */
+
+#include <assert.h>
+#include "ssh.h"
+#include "sshkeygen.h"
+
+/* The real array that stores the primes. It has to be writable in
+ * this module, but outside this module, we only expose the
+ * const-qualified pointer 'smallprimes' so that nobody else can
+ * accidentally overwrite it. */
+static unsigned short smallprimes_array[NSMALLPRIMES];
+
+const unsigned short *const smallprimes = smallprimes_array;
+
+void init_smallprimes(void)
+{
+    if (smallprimes_array[0])
+        return;                        /* already done */
+
+    bool A[65536];
+
+    for (size_t i = 2; i < lenof(A); i++)
+        A[i] = true;
+
+    for (size_t i = 2; i < lenof(A); i++) {
+        if (!A[i])
+            continue;
+        for (size_t j = 2*i; j < lenof(A); j += i)
+            A[j] = false;
+    }
+
+    size_t pos = 0;
+    for (size_t i = 2; i < lenof(A); i++) {
+        if (A[i]) {
+            assert(pos < NSMALLPRIMES);
+            smallprimes_array[pos++] = i;
+        }
+    }
+
+    assert(pos == NSMALLPRIMES);
+}