Mirror of https://git.tartarus.org/simon/putty.git, synced 2025-07-01 03:22:48 -05:00
Implement Unicode normalisation.
A new module in 'utils' computes NFC and NFD, via a new set of data tables generated by read_ucd.py. The new module comes with a new test program, which can read the NormalizationTest.txt that appears in the Unicode Character Database. All the tests pass, as of Unicode 15.
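The new API entry point in this file is utf8_to_nfc(), which takes a UTF-8 string as a ptrlen and returns its NFC form in a freshly allocated strbuf. A minimal usage sketch (illustrative only, not part of the commit; 'passphrase' is a hypothetical NUL-terminated UTF-8 string):

    #include "misc.h"

    /* Normalise before hashing or comparing, so the same text typed on
     * systems that emit NFC vs NFD compares equal. */
    strbuf *norm = utf8_to_nfc(ptrlen_from_asciz(passphrase));
    /* ... use norm->s (UTF-8, NUL-terminated) and norm->len ... */
    strbuf_free(norm);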
@@ -65,6 +65,7 @@ add_sources_from_current_dir(utils
   stripctrl.c
   tempseat.c
   tree234.c
+  unicode-norm.c
   validate_manual_hostkey.c
   version.c
   wcwidth.c
utils/unicode-norm.c (new file, 446 lines)
@@ -0,0 +1,446 @@
#include <stdio.h>
#include <string.h>
#include "misc.h"

typedef uint32_t uchar;
typedef int cclass_t;

/* A local uchar-oriented analogue of strbuf */
typedef struct ucharbuf {
    uchar *buf;
    size_t len, size;
} ucharbuf;

static ucharbuf *ucharbuf_new(void)
{
    ucharbuf *ub = snew(ucharbuf);
    ub->buf = NULL;
    ub->len = ub->size = 0;
    return ub;
}

static void ucharbuf_append(ucharbuf *ub, uchar c)
{
    /* Use the _nm variant because this is used for passphrases */
    sgrowarray_nm(ub->buf, ub->size, ub->len);
    ub->buf[ub->len++] = c;
}

static void ucharbuf_free(ucharbuf *ub)
{
    if (ub->buf) {
        memset(ub->buf, 0, ub->size * sizeof(*ub->buf));
        sfree(ub->buf);
    }
    sfree(ub);
}

/*
 * Constants relating to the arithmetic decomposition mapping of
 * Hangul to jamo, from section 3.12 of Unicode 15.0.0. The following
 * constant names match those in the spec.
 */
static const uchar SBase = 0xAC00;
static const uchar LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
static const uchar LCount = 19, VCount = 21, TCount = 28;
static const uchar NCount = VCount * TCount, SCount = LCount * NCount;

static cclass_t combining_class(uchar c)
{
    struct range {
        uchar start, end;
        cclass_t cclass;
    };
    static const struct range ranges[] = {
#include "unicode/combining_classes.h"
    };

    const struct range *start = ranges, *end = start + lenof(ranges);

    while (end > start) {
        const struct range *curr = start + (end-start) / 2;
        if (c < curr->start)
            end = curr;
        else if (c > curr->end)
            start = curr + 1;
        else
            return curr->cclass;
    }

    return 0;
}
static unsigned decompose_char(uchar c, uchar *out)
{
    struct decomp {
        uchar composed, dec0, dec1;
    };
    static const struct decomp decomps[] = {
#include "unicode/canonical_decomp.h"
    };

    if (c - SBase < SCount) {
        /* Arithmetically decompose a Hangul character into jamo */
        uchar SIndex = c - SBase;
        uchar LIndex = SIndex / NCount;
        uchar VIndex = SIndex % NCount / TCount;
        uchar TIndex = SIndex % TCount;

        unsigned n = 0;
        out[n++] = LBase + LIndex;
        out[n++] = VBase + VIndex;
        if (TIndex)
            out[n++] = TBase + TIndex;
        return n;
    }

    const struct decomp *start = decomps, *end = start + lenof(decomps);

    while (end > start) {
        const struct decomp *curr = start + (end-start) / 2;
        if (c < curr->composed)
            end = curr;
        else if (c > curr->composed)
            start = curr + 1;
        else {
            out[0] = curr->dec0;
            if (curr->dec1) {
                out[1] = curr->dec1;
                return 2;
            } else {
                return 1;
            }
        }
    }

    return 0;
}

static uchar compose_chars(uchar S, uchar C)
{
    struct comp {
        uchar dec0, dec1, composed;
    };
    static const struct comp comps[] = {
#include "unicode/canonical_comp.h"
    };

    if (S - LBase < LCount && C - VBase < VCount) {
        /* Arithmetically compose an L and V jamo into a Hangul LV
         * character */
        return SBase + (S - LBase) * NCount + (C - VBase) * TCount;
    }

    if (S - SBase < SCount && (S - SBase) % TCount == 0 &&
        C != TBase && C - TBase < TCount) {
        /* Arithmetically compose an LV Hangul character and a T jamo
         * into a Hangul LVT character (C == TBase itself is excluded,
         * since it is not a T jamo) */
        return S + C - TBase;
    }

    const struct comp *start = comps, *end = start + lenof(comps);

    while (end > start) {
        const struct comp *curr = start + (end-start) / 2;
        if (S < curr->dec0)
            end = curr;
        else if (S > curr->dec0)
            start = curr + 1;
        else if (C < curr->dec1)
            end = curr;
        else if (C > curr->dec1)
            start = curr + 1;
        else
            return curr->composed;
    }

    return 0;
}
/*
 * Recursively decompose a sequence of Unicode characters. The output
 * is written to 'out', as a sequence of native-byte-order uchar.
 */
static void recursively_decompose(const uchar *str, size_t len, ucharbuf *out)
{
    uchar decomposed[3];

    while (len-- > 0) {
        uchar c = *str++;
        unsigned n = decompose_char(c, decomposed);
        if (n == 0) {
            /* This character is indecomposable */
            ucharbuf_append(out, c);
        } else {
            /* This character has been decomposed into up to 3
             * characters, so we must now recursively decompose those */
            recursively_decompose(decomposed, n, out);
        }
    }
}

/*
 * Reorder combining marks according to the Canonical Ordering
 * Algorithm (definition D109 in Unicode 15.0.0 section 3.11).
 *
 * The algorithm is phrased mechanistically, but the essence is: among
 * any contiguous sequence of combining marks (that is, characters
 * with cclass > 0), sort them by their cclass - but _stably_, i.e.
 * breaking ties in cclass by preserving the original order of the
 * characters in question.
 */
static void canonical_ordering(uchar *str, size_t len)
{
    for (size_t i = 1; i < len; i++) {
        cclass_t cclass = combining_class(str[i]);
        if (cclass == 0)
            continue;

        size_t j = i;
        while (j > 0 && combining_class(str[j-1]) > cclass) {
            uchar tmp = str[j-1];
            str[j-1] = str[j];
            str[j] = tmp;

            j--;
        }
    }
}

/*
 * Canonically recompose characters according to the Canonical
 * Composition Algorithm (definition D117 in Unicode 15.0.0 section
 * 3.11).
 */
static size_t canonical_composition(uchar *str, size_t len)
{
    const uchar *in = str;
    uchar *out = str;
    uchar *last_starter = NULL;
    cclass_t highest_cclass_between = -1;

    while (len > 0) {
        len--;
        uchar c = *in++;
        cclass_t cclass = combining_class(c);

        if (last_starter && highest_cclass_between < cclass) {
            uchar composed = compose_chars(*last_starter, c);
            if (composed) {
                *last_starter = composed;
                continue;
            }
        }

        if (cclass == 0) {
            last_starter = out;
            highest_cclass_between = -1;
        } else if (cclass > highest_cclass_between) {
            highest_cclass_between = cclass;
        }

        *out++ = c;
    }

    return out - str;
}
/*
 * Render a string into NFD.
 */
static ucharbuf *nfd(ucharbuf *input)
{
    ucharbuf *output = ucharbuf_new();

    /*
     * Definition D118 in Unicode 15.0.0 section 3.11, referring to
     * D68 in section 3.7: recursively decompose characters, then
     * reorder combining marks.
     */
    recursively_decompose(input->buf, input->len, output);
    canonical_ordering(output->buf, output->len);

    return output;
}

/*
 * Render a string into NFC.
 */
static ucharbuf *nfc(ucharbuf *input)
{
    /*
     * Definition D120 in Unicode 15.0.0 section 3.11: render the
     * string into NFD, then apply the canonical composition algorithm.
     */
    ucharbuf *output = nfd(input);
    output->len = canonical_composition(output->buf, output->len);

    return output;
}

/*
 * Convert a UTF-8 string into NFC, returning it as UTF-8 again.
 */
strbuf *utf8_to_nfc(ptrlen input)
{
    BinarySource src[1];
    BinarySource_BARE_INIT_PL(src, input);

    ucharbuf *inbuf = ucharbuf_new();
    while (get_avail(src))
        ucharbuf_append(inbuf, decode_utf8(src));

    ucharbuf *outbuf = nfc(inbuf);

    strbuf *output = strbuf_new_nm();
    for (size_t i = 0; i < outbuf->len; i++)
        put_utf8_char(output, outbuf->buf[i]);

    ucharbuf_free(inbuf);
    ucharbuf_free(outbuf);

    return output;
}
#ifdef TEST
void out_of_memory(void)
{
    fprintf(stderr, "out of memory!\n");
    exit(2);
}

static int pass, fail;

static void subtest(const char *filename, int lineno, const char *subdesc,
                    char nftype, ucharbuf *input, ucharbuf *expected)
{
    /*
     * Convert input into either NFC or NFD, and check it's equal to
     * expected
     */
    ucharbuf *nf;
    switch (nftype) {
      case 'C':
        nf = nfc(input);
        break;
      case 'D':
        nf = nfd(input);
        break;
      default:
        unreachable("bad nftype");
    }

    if (nf->len == expected->len &&
        !memcmp(nf->buf, expected->buf, nf->len * sizeof(uchar))) {
        pass++;
    } else {
        printf("%s:%d: failed %s: NF%c([", filename, lineno, subdesc, nftype);
        for (size_t pos = 0; pos < input->len; pos++)
            printf("%s%04X", pos ? " " : "", (unsigned)input->buf[pos]);
        printf("]) -> [");
        for (size_t pos = 0; pos < nf->len; pos++)
            printf("%s%04X", pos ? " " : "", (unsigned)nf->buf[pos]);
        printf("] != [");
        for (size_t pos = 0; pos < expected->len; pos++)
            printf("%s%04X", pos ? " " : "", (unsigned)expected->buf[pos]);
        printf("]\n");
        fail++;
    }

    ucharbuf_free(nf);
}
static void run_tests(const char *filename, FILE *fp)
{
    for (int lineno = 1;; lineno++) {
        char *line = chomp(fgetline(fp));
        if (!line)
            break;

        /* Strip section dividers which begin with @ */
        if (*line == '@') {
            sfree(line);
            continue;
        }

        /* Strip comments, if any */
        ptrlen pl = ptrlen_from_asciz(line);
        {
            const char *p = memchr(pl.ptr, '#', pl.len);
            if (p)
                pl.len = p - (const char *)pl.ptr;
        }

        /* Strip trailing space */
        while (pl.len > 0 &&
               (((char *)pl.ptr)[pl.len-1] == ' ' ||
                ((char *)pl.ptr)[pl.len-1] == '\t'))
            pl.len--;

        /* Skip empty lines */
        if (!pl.len) {
            sfree(line);
            continue;
        }

        /* Break up at semicolons, expecting five fields, each of
         * which we decode into hex code points */
        ucharbuf *fields[5];
        for (size_t i = 0; i < lenof(fields); i++) {
            ptrlen field = ptrlen_get_word(&pl, ";");
            fields[i] = ucharbuf_new();

            ptrlen chr;
            while ((chr = ptrlen_get_word(&field, " ")).len) {
                char *chrstr = mkstr(chr);
                uchar c = strtoul(chrstr, NULL, 16);
                sfree(chrstr);
                ucharbuf_append(fields[i], c);
            }
        }

        subtest(filename, lineno, "NFC(c1) = c2", 'C', fields[0], fields[1]);
        subtest(filename, lineno, "NFC(c2) = c2", 'C', fields[1], fields[1]);
        subtest(filename, lineno, "NFC(c3) = c2", 'C', fields[2], fields[1]);
        subtest(filename, lineno, "NFC(c4) = c4", 'C', fields[3], fields[3]);
        subtest(filename, lineno, "NFC(c5) = c4", 'C', fields[4], fields[3]);
        subtest(filename, lineno, "NFD(c1) = c3", 'D', fields[0], fields[2]);
        subtest(filename, lineno, "NFD(c2) = c3", 'D', fields[1], fields[2]);
        subtest(filename, lineno, "NFD(c3) = c3", 'D', fields[2], fields[2]);
        subtest(filename, lineno, "NFD(c4) = c5", 'D', fields[3], fields[4]);
        subtest(filename, lineno, "NFD(c5) = c5", 'D', fields[4], fields[4]);

        for (size_t i = 0; i < lenof(fields); i++)
            ucharbuf_free(fields[i]);

        sfree(line);
    }
}

int main(int argc, char **argv)
{
    if (argc != 2) {
        fprintf(stderr, "test_unicode_norm: give an input file "
                "of tests or '-'\n");
        return 1;
    }

    const char *filename = argv[1];

    if (!strcmp(filename, "-")) {
        run_tests("<standard input>", stdin);
    } else {
        FILE *fp = fopen(filename, "r");
        if (!fp) {
            fprintf(stderr, "test_unicode_norm: unable to open '%s'\n",
                    filename);
            return 1;
        }
        run_tests(filename, fp);
        fclose(fp);
    }

    printf("pass %d fail %d total %d\n", pass, fail, pass + fail);

    return fail != 0;
}
#endif
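When this file is compiled with TEST defined, it becomes the standalone test_unicode_norm program referred to in the commit message. It accepts either a file in the five-field NormalizationTest.txt format from the Unicode Character Database, or '-' to read that format from standard input; it prints a single summary line of the form 'pass N fail M total N+M' and exits nonzero if any subtest failed, so either of these invocations (illustrative) can be dropped straight into a test harness:

    ./test_unicode_norm NormalizationTest.txt
    ./test_unicode_norm - < NormalizationTest.txt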