From fcdc804b4f4cee6c5b6a1b436fbc6ec7db06ea3e Mon Sep 17 00:00:00 2001
From: Simon Tatham <anakin@pobox.com>
Date: Sun, 1 Dec 2024 09:55:39 +0000
Subject: [PATCH] Move some NTRU helper routines into a header file.

I'm going to want to use these again for ML-KEM, so let's put one copy
of them where both algorithms can use it.
---
 crypto/ntru.c        | 56 +++++---------------------------------------
 crypto/smallmoduli.h | 54 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+), 50 deletions(-)
 create mode 100644 crypto/smallmoduli.h

diff --git a/crypto/ntru.c b/crypto/ntru.c
index 60c37e2e..a7e53122 100644
--- a/crypto/ntru.c
+++ b/crypto/ntru.c
@@ -79,58 +79,14 @@
 #include "ssh.h"
 #include "mpint.h"
 #include "ntru.h"
-
-/* ----------------------------------------------------------------------
- * Preliminaries: we're going to need to do modular arithmetic on
- * small values (considerably smaller than 2^16), and we need to do it
- * without using integer division which might not be time-safe.
- *
- * The strategy for this is the same as I used in
- * mp_mod_known_integer: see there for the proofs. The basic idea is
- * that we precompute the reciprocal of our modulus as a fixed-point
- * number, and use that to get an approximate quotient which we
- * subtract off. For these integer sizes, precomputing a fixed-point
- * reciprocal of the form (2^48 / modulus) leaves us at most off by 1
- * in the quotient, so there's a single (time-safe) trial subtraction
- * at the end.
- *
- * (It's possible that some speed could be gained by not reducing
- * fully at every step. But then you'd have to carefully identify all
- * the places in the algorithm where things are compared to zero. This
- * was the easiest way to get it all working in the first place.)
- */
-
-/* Precompute the reciprocal */
-static uint64_t reciprocal_for_reduction(uint16_t q)
-{
-    return ((uint64_t)1 << 48) / q;
-}
-
-/* Reduce x mod q, assuming qrecip == reciprocal_for_reduction(q) */
-static uint16_t reduce(uint32_t x, uint16_t q, uint64_t qrecip)
-{
-    uint64_t unshifted_quot = x * qrecip;
-    uint64_t quot = unshifted_quot >> 48;
-    uint16_t reduced = x - quot * q;
-    reduced -= q * (1 & ((q-1 - reduced) >> 15));
-    return reduced;
-}
-
-/* Reduce x mod q as above, but also return the quotient */
-static uint16_t reduce_with_quot(uint32_t x, uint32_t *quot_out,
-                                 uint16_t q, uint64_t qrecip)
-{
-    uint64_t unshifted_quot = x * qrecip;
-    uint64_t quot = unshifted_quot >> 48;
-    uint16_t reduced = x - quot * q;
-    uint64_t extraquot = (1 & ((q-1 - reduced) >> 15));
-    reduced -= extraquot * q;
-    *quot_out = quot + extraquot;
-    return reduced;
-}
+#include "smallmoduli.h"
 
 /* Invert x mod q, assuming it's nonzero. (For time-safety, no check
- * is made for zero; it just returns 0.) */
+ * is made for zero; it just returns 0.)
+ *
+ * Expects qrecip == reciprocal_for_reduction(q). (But it's passed in
+ * as a parameter to save recomputing it, on the theory that the
+ * caller will have had it lying around already in most cases.) */
 static uint16_t invert(uint16_t x, uint16_t q, uint64_t qrecip)
 {
     /* Fermat inversion: compute x^(q-2), since x^(q-1) == 1. */
diff --git a/crypto/smallmoduli.h b/crypto/smallmoduli.h
new file mode 100644
index 00000000..b452b410
--- /dev/null
+++ b/crypto/smallmoduli.h
@@ -0,0 +1,54 @@
+/*
+ * Shared code between algorithms whose state consists of a large
+ * collection of residues mod a small prime.
+ */
+
+/*
+ * We need to do modular arithmetic on small values (considerably
+ * smaller than 2^16), and we need to do it without using integer
+ * division which might not be time-safe. Input values might not fit
+ * in a 16-bit int, because we'll also be multiplying mod q.
+ *
+ * The strategy for this is the same as I used in
+ * mp_mod_known_integer: see there for the proofs. The basic idea is
+ * that we precompute the reciprocal of our modulus as a fixed-point
+ * number, and use that to get an approximate quotient which we
+ * subtract off. For these integer sizes, precomputing a fixed-point
+ * reciprocal of the form (2^48 / modulus) leaves us at most off by 1
+ * in the quotient, so there's a single (time-safe) trial subtraction
+ * at the end.
+ *
+ * (It's possible that some speed could be gained by not reducing
+ * fully at every step. But then you'd have to carefully identify all
+ * the places in the algorithm where things are compared to zero. This
+ * was the easiest way to get it all working in the first place.)
+ */
+
+/* Precompute floor(2^48 / q) for use as qrecip below; q must be nonzero */
+static inline uint64_t reciprocal_for_reduction(uint16_t q)
+{
+    return ((uint64_t)1 << 48) / q;
+}
+
+/* Reduce x mod q, assuming qrecip == reciprocal_for_reduction(q) */
+static inline uint16_t reduce(uint32_t x, uint16_t q, uint64_t qrecip)
+{
+    uint64_t unshifted_quot = x * qrecip; /* multiply done in 64 bits */
+    uint64_t quot = unshifted_quot >> 48; /* approximate x / q */
+    uint16_t reduced = x - quot * q; /* truncated to 16 bits; may be >= q */
+    reduced -= q * (1 & ((q-1 - reduced) >> 15)); /* -q if >= q, no branch */
+    return reduced;
+}
+
+/* Reduce x mod q as above, also writing the quotient to *quot_out */
+static inline uint16_t reduce_with_quot(uint32_t x, uint32_t *quot_out,
+                                        uint16_t q, uint64_t qrecip)
+{
+    uint64_t unshifted_quot = x * qrecip; /* multiply done in 64 bits */
+    uint64_t quot = unshifted_quot >> 48; /* approximate x / q */
+    uint16_t reduced = x - quot * q; /* truncated to 16 bits; may be >= q */
+    uint64_t extraquot = (1 & ((q-1 - reduced) >> 15)); /* 1 if reduced >= q */
+    reduced -= extraquot * q; /* branch-free trial subtraction */
+    *quot_out = quot + extraquot; /* quotient after the same correction */
+    return reduced;
+}