Polish the output of read_ucd.py.

The initial outputs were all deliberately inconsistent with each other, so that each one exactly matched the existing table I was trying to replace. Now I've done that check, I can clean them up. Normalised spacing and case to be consistent; removed pointless indentation (these are now include files, so they don't have to be indented to the same level as the array declaration surrounding each one's #include); added a header comment in each autogenerated file, saying that it's autogenerated, what it's for, and who it's used by. The currently supported version number of Unicode is also exposed in a header file, so that I can put it in diagnostics.
2025-07-07 06:22:47 -05:00 · 2022-11-08 18:04:46 +00:00
parent b72c9aba28
commit 430af47a38
1 changed file with 103 additions and 5 deletions
--- a/unicode/read_ucd.py
+++ b/unicode/read_ucd.py
@ -13,6 +13,7 @@ import argparse
 import collections
 import io
 import os
+import string
 import sys
 import zipfile

@ -90,6 +91,10 @@ class Main:
            self.open_ucd_file = lambda filename: (
                io.TextIOWrapper(ucd_zip.open(filename)))

+        self.find_unicode_version()
+
+        with open("version.h", "w") as fh:
+            self.write_version_header(fh)
        with open("bidi_type.h", "w") as fh:
            self.write_bidi_type_table(fh)
        with open("bidi_mirror.h", "w") as fh:
@ -103,6 +108,22 @@ class Main:
        with open("ambiguous_wide_chars.h", "w") as fh:
            self.write_ambiguous_wide_chars_list(fh)

+    def find_unicode_version(self):
+        """Find out the version of Unicode.
+
+        This is read from the top of NamesList.txt, which has the
+        closest thing to a machine-readable statement of the version
+        number that I found in the whole collection of files.
+
+        Sets self.unicode_version_full to the text following the
+        first "@@@<TAB>" marker found, and self.unicode_version_short
+        to just those space-separated words of that text which
+        contain at least one ASCII digit.
+
+        Raises ValueError if no such line exists, rather than
+        returning with the version attributes unset and leaving a
+        later method to fail with a confusing AttributeError.
+        """
+        marker = "@@@\t"
+        with self.open_ucd_file("NamesList.txt") as fh:
+            for line in lines(fh):
+                if line.startswith(marker):
+                    self.unicode_version_full = line[len(marker):]
+                    self.unicode_version_short = " ".join(
+                        w for w in self.unicode_version_full.split(" ")
+                        if any(c in string.digits for c in w))
+                    return
+
+        raise ValueError(
+            "NamesList.txt contains no '@@@' title line giving the "
+            "Unicode version")
+
    @property
    def UnicodeData(self):
        """Records from UnicodeData.txt.
@ -210,7 +231,41 @@ class Main:
                for c in cs:
                    yield c, fields[1]

+    def write_file_header_comment(self, fh, description):
+        """Write the standard comment block that opens a generated file.
+
+        The block is a C comment stating that the file was
+        autogenerated by read_ucd.py from self.unicode_version_full,
+        followed by the lines of 'description'. Leading and trailing
+        newlines of 'description' are dropped; an empty line inside
+        it becomes a bare " *" so that no trailing space is written.
+        A blank line is written after the closing " */".
+        """
+        def emit(text=""):
+            print(text, file=fh)
+
+        emit("/*")
+        emit("{} {}".format(" * Autogenerated by read_ucd.py from",
+                            self.unicode_version_full))
+        emit(" *")
+        for text in description.strip("\n").split("\n"):
+            emit(" * " + text if text else " *")
+        emit(" */")
+        emit()
+
+    def write_version_header(self, fh):
+        """Write #defines giving the supported Unicode version to fh.
+
+        Emits the standard autogenerated-file comment, then
+        UNICODE_VERSION_FULL and UNICODE_VERSION_SHORT, each defined
+        as a C string literal holding the corresponding
+        self.unicode_version_full / self.unicode_version_short text.
+        """
+        self.write_file_header_comment(fh, """
+
+String literals giving the currently supported version of Unicode.
+Useful for error messages and 'about' boxes.
+
+""")
+        # The version text is pasted verbatim between double quotes,
+        # so it must be printable ASCII containing nothing that is
+        # special inside a C string literal: a '"' would end the
+        # literal early, and a '\' would begin an escape sequence.
+        assert all(0x20 <= ord(c) < 0x7F and c not in '"\\'
+                   for c in self.unicode_version_full)
+
+        print("#define UNICODE_VERSION_FULL \"{}\"".format(
+            self.unicode_version_full), file=fh)
+        print("#define UNICODE_VERSION_SHORT \"{}\"".format(
+            self.unicode_version_short), file=fh)
+
    def write_bidi_type_table(self, fh):
+        self.write_file_header_comment(fh, """
+
+Bidirectional type of every Unicode character, excluding those with
+type ON.
+
+Used by terminal/bidi.c, whose associated lookup function returns ON
+by default for anything not in this list.
+
+""")
        types = {}

        for rec in self.UnicodeData:
@ -218,9 +273,17 @@ class Main:
                types[rec.c] = rec.Bidi_Class

        for (start, end), t in map_to_ranges(types):
-            print(f"        {{0x{start:04x}, 0x{end:04x}, {t}}},", file=fh)
+            print(f"{{0x{start:04x}, 0x{end:04x}, {t}}},", file=fh)

    def write_bidi_mirroring_table(self, fh):
+        self.write_file_header_comment(fh, """
+
+Map each Unicode character to its mirrored form when printing right to
+left.
+
+Used by terminal/bidi.c.
+
+""")
        bidi_mirror = {}
        for c1, c2 in self.BidiMirroring:
            assert bidi_mirror.get(c1, c2) == c2, f"Clash at {c1:%04X}"
@ -229,9 +292,19 @@ class Main:
            bidi_mirror[c2] = c1

        for c1, c2 in sorted(bidi_mirror.items()):
-            print("        {{0x{:04x}, 0x{:04x}}},".format(c1, c2), file=fh)
+            print("{{0x{:04x}, 0x{:04x}}},".format(c1, c2), file=fh)

    def write_bidi_brackets_table(self, fh):
+        self.write_file_header_comment(fh, """
+
+Identify Unicode characters that count as brackets for the purposes of
+bidirectional text layout. For each one, indicate whether it's an open
+or closed bracket, and identify up to two characters that can act as
+its counterpart.
+
+Used by terminal/bidi.c.
+
+""")
        bracket_map = {}
        for c1, c2, kind in self.BidiBrackets:
            bracket_map[c1] = kind, c2
@ -249,10 +322,18 @@ class Main:
            # UCD claims there's an 'n' kind possible, but as of UCD
            # 14, no instances of it exist
            enumval = {'o': 'BT_OPEN', 'c': 'BT_CLOSE'}[kind]
-            print("        {{0x{:04x}, {{0x{:04x}, 0x{:04x}, {}}}}},".format(
+            print("{{0x{:04x}, {{0x{:04x}, 0x{:04x}, {}}}}},".format(
                src, dst, dsteq, enumval), file=fh)

    def write_nonspacing_chars_list(self, fh):
+        self.write_file_header_comment(fh, """
+
+Identify Unicode characters that occupy no character cells of a
+terminal.
+
+Used by utils/wcwidth.c.
+
+""")
        cs = set()

        for rec in self.UnicodeData:
@ -273,7 +354,7 @@ class Main:
                cs.add(rec.c)

        for start, end in set_to_ranges(cs):
-            print(f"    {{ 0x{start:04X}, 0x{end:04X} }},", file=fh)
+            print(f"{{0x{start:04x}, 0x{end:04x}}},", file=fh)

    def write_width_table(self, fh, accept):
        cs = set()
@ -283,12 +364,29 @@ class Main:
                cs.add(c)

        for start, end in set_to_ranges(cs):
-            print(f"    {{0x{start:04X}, 0x{end:04X}}},", file=fh)
+            print(f"{{0x{start:04x}, 0x{end:04x}}},", file=fh)

    def write_wide_chars_list(self, fh):
+        """Write the wide-characters include file to fh.
+
+        Writes the autogenerated-file header comment, then hands
+        over to write_width_table with {'W', 'F'} as its 'accept'
+        argument to produce the table itself.
+        """
+        # NOTE(review): 'W' and 'F' are presumably the East Asian
+        # Width classes Wide and Fullwidth -- confirm against
+        # write_width_table.
+        self.write_file_header_comment(fh, """
+
+Identify Unicode characters that occupy two adjacent character cells
+in a terminal.
+
+Used by utils/wcwidth.c.
+
+""")
        self.write_width_table(fh, {'W', 'F'})

    def write_ambiguous_wide_chars_list(self, fh):
+        """Write the width-ambiguous-characters include file to fh.
+
+        Writes the autogenerated-file header comment, then hands
+        over to write_width_table with {'A'} as its 'accept'
+        argument to produce the table itself.
+        """
+        # NOTE(review): 'A' is presumably the East Asian Width class
+        # Ambiguous -- confirm against write_width_table.
+        self.write_file_header_comment(fh, """
+
+Identify Unicode characters that are width-ambiguous: some regimes
+regard them as occupying two adjacent character cells in a terminal,
+and others do not.
+
+Used by utils/wcwidth.c.
+
+""")
        self.write_width_table(fh, {'A'})

 if __name__ == '__main__':