New script to generate Unicode data tables.

This will replace the various pieces of Perl scattered throughout the code base in comments above long boring data tables. The idea is that those long boring tables will move into header files in the new 'unicode' directory, and will be #included from the source files that use the tables. One benefit is that I won't have to page tediously past the tables to get to the actual code I want to edit. But more importantly, it should now become easy to update to a new version of Unicode, by re-running just one script and committing the changed versions of all the headers in the 'unicode' subdir. This version of the script regenerates six Unicode-derived tables in the existing source code in a byte-for-byte identical form. In the next commits I'll clean it up, commit the output, and delete the tables from their previous locations. (One table I _haven't_ incorporated into this system is the Arabic shaping table in bidi.c, because my attempt to regenerate it came out not matching the original at all. That _might_ be because the table is based on an old Unicode standard and desperately needs updating, but it might also be because I misunderstood how it works. So I'll leave sorting that out for another time.)
2025-07-02 20:12:48 -05:00 · 2022-11-08 18:11:44 +00:00
parent 69e217d23a
commit b72c9aba28
1 changed files with 295 additions and 0 deletions
--- a/unicode/read_ucd.py
+++ b/unicode/read_ucd.py
@ -0,0 +1,295 @@
+#!/usr/bin/env python3
+
+# Tool to read various files from the Unicode character database and
+# generate headers containing derived arrays and lookup tables needed
+# by PuTTY.
+#
+# The aim is to have this be a single tool which you can easily re-run
+# against a new version of Unicode, simply by pointing it at an
+# appropriate UCD.zip or a directory containing the same files
+# unpacked.
+
+import argparse
+import collections
+import io
+import os
+import sys
+import zipfile
+
+UCDRecord = collections.namedtuple('UCDRecord', [
+    'c',
+    'General_Category',
+    'Bidi_Class',
+    'Decomposition_Mapping',
+])
+
+def to_ranges(iterable):
+    """Collect together adjacent ranges in a list of (key, value) pairs.
+
+    The input iterable should deliver a sequence of (key, value) pairs
+    in which the keys are integers in sorted order. The output is a
+    sequence of tuples with structure ((start, end), value), each
+    indicating that all the keys [start, start+1, ..., end] go with
+    that value.
+    """
+    start = end = val = None
+
+    for k, v in iterable:
+        if (k-1, v) == (end, val):
+            end = k
+        else:
+            if start is not None:
+                yield (start, end), val
+            start, end, val = k, k, v
+
+    if start is not None:
+        yield (start, end), val
+
+def map_to_ranges(m):
+    """Convert an integer-keyed map into a list of (range, value) pairs."""
+    yield from to_ranges(sorted(m.items()))
+
+def set_to_ranges(s):
+    """Convert a set into a list of ranges."""
+    for r, _ in to_ranges((x, None) for x in sorted(s)):
+        yield r
+
+def lines(iterable, keep_comments=False):
+    """Deliver the lines of a Unicode data file.
+
+    The input iterable should yield raw lines of the file: for
+    example, it can be the file handle itself. The output values have
+    their newlines removed, comments and trailing spaces deleted, and
+    blank lines discarded.
+    """
+    for line in iter(iterable):
+        line = line.rstrip("\r\n")
+        if not keep_comments:
+            line = line.split("#", 1)[0]
+        line = line.rstrip(" \t")
+        if line == "":
+            continue
+        yield line
+
+class Main:
+    def run(self):
+        "Parse arguments and generate all the output files."
+
+        parser = argparse.ArgumentParser(
+            description='Build UCD-derived source files.')
+        parser.add_argument("ucd", help="UCD to work from, either UCD.zip or "
+                            "a directory full of unpacked files.")
+        args = parser.parse_args()
+
+        if os.path.isdir(args.ucd):
+            ucd_dir = args.ucd
+            self.open_ucd_file = lambda filename: (
+                open(os.path.join(ucd_dir, filename)))
+        else:
+            ucd_zip = zipfile.ZipFile(args.ucd)
+            self.open_ucd_file = lambda filename: (
+                io.TextIOWrapper(ucd_zip.open(filename)))
+
+        with open("bidi_type.h", "w") as fh:
+            self.write_bidi_type_table(fh)
+        with open("bidi_mirror.h", "w") as fh:
+            self.write_bidi_mirroring_table(fh)
+        with open("bidi_brackets.h", "w") as fh:
+            self.write_bidi_brackets_table(fh)
+        with open("nonspacing_chars.h", "w") as fh:
+            self.write_nonspacing_chars_list(fh)
+        with open("wide_chars.h", "w") as fh:
+            self.write_wide_chars_list(fh)
+        with open("ambiguous_wide_chars.h", "w") as fh:
+            self.write_ambiguous_wide_chars_list(fh)
+
+    @property
+    def UnicodeData(self):
+        """Records from UnicodeData.txt.
+
+        Each yielded item is a UCDRecord tuple.
+        """
+        with self.open_ucd_file("UnicodeData.txt") as fh:
+            for line in lines(fh):
+                # Split up the line into its raw fields.
+                (
+                    codepoint, name, category, cclass, bidiclass, decomp,
+                    num6, num7, num8, bidimirrored, obsolete_unicode1_name,
+                    obsolete_comment, uppercase, lowercase, titlecase,
+                ) = line.split(";")
+
+                # By default, we expect that this record describes
+                # just one code point.
+                codepoints = [int(codepoint, 16)]
+
+                # Spot the special markers where consecutive lines say
+                # <Foo, First> and <Foo, Last>, indicating that the
+                # entire range of code points in between are treated
+                # the same. If so, we replace 'codepoints' with a
+                # range object.
+                if "<" in name:
+                    assert name.startswith("<") and name.endswith(">"), (
+                        "Confusing < in character name: {!r}".format(line))
+                    name_pieces = [piece.strip(" \t") for piece in
+                                   name.lstrip("<").rstrip(">").split(",")]
+                    if "First" in name_pieces:
+                        assert isinstance(codepoints, list)
+                        prev_line_was_first = True
+                        prev_codepoint = codepoints[0]
+                        continue
+                    elif "Last" in name_pieces:
+                        assert prev_line_was_first
+                        codepoints = range(prev_codepoint, codepoints[0]+1)
+                        del prev_codepoint
+                prev_line_was_first = False
+
+                # Decode some of the raw fields into more cooked
+                # forms.
+
+                # For the moment, we only care about decomposition
+                # mappings that consist of a single hex number (i.e.
+                # are singletons and not compatibility mappings)
+                try:
+                    dm = [int(decomp, 16)]
+                except ValueError:
+                    dm = []
+
+                # And yield a UCDRecord for each code point in our
+                # range.
+                for codepoint in codepoints:
+                    yield UCDRecord(
+                        c=codepoint,
+                        General_Category=category,
+                        Bidi_Class=bidiclass,
+                        Decomposition_Mapping=dm,
+                    )
+
+    @property
+    def BidiMirroring(self):
+        """Parsed character pairs from BidiMirroring.txt.
+
+        Each yielded tuple is a pair of Unicode code points.
+        """
+        with self.open_ucd_file("BidiMirroring.txt") as fh:
+            for line in lines(fh):
+                cs1, cs2 = line.split(";")
+                c1 = int(cs1, 16)
+                c2 = int(cs2, 16)
+                yield c1, c2
+
+    @property
+    def BidiBrackets(self):
+        """Bracket pairs from BidiBrackets.txt.
+
+        Each yielded tuple is a pair of Unicode code points, followed
+        by either 'o', 'c' or 'n' to indicate whether the first one is
+        an open or closing parenthesis or neither.
+        """
+        with self.open_ucd_file("BidiBrackets.txt") as fh:
+            for line in lines(fh):
+                cs1, cs2, kind = line.split(";")
+                c1 = int(cs1, 16)
+                c2 = int(cs2, 16)
+                kind = kind.strip(" \t")
+                yield c1, c2, kind
+
+    @property
+    def EastAsianWidth(self):
+        """East Asian width types from EastAsianWidth.txt.
+
+        Each yielded tuple is (code point, width type).
+        """
+        with self.open_ucd_file("EastAsianWidth.txt") as fh:
+            for line in lines(fh):
+                fields = line.split(";")
+                if ".." in fields[0]:
+                    start, end = [int(s, 16) for s in fields[0].split("..")]
+                    cs = range(start, end+1)
+                else:
+                    cs = [int(fields[0], 16)]
+                for c in cs:
+                    yield c, fields[1]
+
+    def write_bidi_type_table(self, fh):
+        types = {}
+
+        for rec in self.UnicodeData:
+            if rec.Bidi_Class != "ON":
+                types[rec.c] = rec.Bidi_Class
+
+        for (start, end), t in map_to_ranges(types):
+            print(f"        {{0x{start:04x}, 0x{end:04x}, {t}}},", file=fh)
+
+    def write_bidi_mirroring_table(self, fh):
+        bidi_mirror = {}
+        for c1, c2 in self.BidiMirroring:
+            assert bidi_mirror.get(c1, c2) == c2, f"Clash at {c1:%04X}"
+            bidi_mirror[c1] = c2
+            assert bidi_mirror.get(c2, c1) == c1, f"Clash at {c2:%04X}"
+            bidi_mirror[c2] = c1
+
+        for c1, c2 in sorted(bidi_mirror.items()):
+            print("        {{0x{:04x}, 0x{:04x}}},".format(c1, c2), file=fh)
+
+    def write_bidi_brackets_table(self, fh):
+        bracket_map = {}
+        for c1, c2, kind in self.BidiBrackets:
+            bracket_map[c1] = kind, c2
+
+        equivalents = {}
+        for rec in self.UnicodeData:
+            if len(rec.Decomposition_Mapping) == 1:
+                c = rec.c
+                c2 = rec.Decomposition_Mapping[0]
+                equivalents[c] = c2
+                equivalents[c2] = c
+
+        for src, (kind, dst) in sorted(bracket_map.items()):
+            dsteq = equivalents.get(dst, 0)
+            # UCD claims there's an 'n' kind possible, but as of UCD
+            # 14, no instances of it exist
+            enumval = {'o': 'BT_OPEN', 'c': 'BT_CLOSE'}[kind]
+            print("        {{0x{:04x}, {{0x{:04x}, 0x{:04x}, {}}}}},".format(
+                src, dst, dsteq, enumval), file=fh)
+
+    def write_nonspacing_chars_list(self, fh):
+        cs = set()
+
+        for rec in self.UnicodeData:
+            nonspacing = rec.General_Category in {"Me", "Mn", "Cf"}
+            if rec.c == 0xAD:
+                # In typography this is a SOFT HYPHEN and counts as
+                # discardable. But it's also an ISO 8859-1 printing
+                # character, and all of those occupy one character
+                # cell in a terminal.
+                nonspacing = False
+            if 0x1160 <= rec.c <= 0x11FF:
+                # Medial (vowel) and final (consonant) jamo for
+                # decomposed Hangul characters. These are regarded as
+                # non-spacing on the grounds that they compose with
+                # the preceding initial consonant.
+                nonspacing = True
+            if nonspacing:
+                cs.add(rec.c)
+
+        for start, end in set_to_ranges(cs):
+            print(f"    {{ 0x{start:04X}, 0x{end:04X} }},", file=fh)
+
+    def write_width_table(self, fh, accept):
+        cs = set()
+
+        for c, wid in self.EastAsianWidth:
+            if wid in accept:
+                cs.add(c)
+
+        for start, end in set_to_ranges(cs):
+            print(f"    {{0x{start:04X}, 0x{end:04X}}},", file=fh)
+
+    def write_wide_chars_list(self, fh):
+        self.write_width_table(fh, {'W', 'F'})
+
+    def write_ambiguous_wide_chars_list(self, fh):
+        self.write_width_table(fh, {'A'})
+
+if __name__ == '__main__':
+    Main().run()