diff --git a/unicode/read_ucd.py b/unicode/read_ucd.py index 51828ceb..050f6eb1 100755 --- a/unicode/read_ucd.py +++ b/unicode/read_ucd.py @@ -13,6 +13,7 @@ import argparse import collections import io import os +import string import sys import zipfile @@ -90,6 +91,10 @@ class Main: self.open_ucd_file = lambda filename: ( io.TextIOWrapper(ucd_zip.open(filename))) + self.find_unicode_version() + + with open("version.h", "w") as fh: + self.write_version_header(fh) with open("bidi_type.h", "w") as fh: self.write_bidi_type_table(fh) with open("bidi_mirror.h", "w") as fh: @@ -103,6 +108,22 @@ class Main: with open("ambiguous_wide_chars.h", "w") as fh: self.write_ambiguous_wide_chars_list(fh) + def find_unicode_version(self): + """Find out the version of Unicode. + + This is read from the top of NamesList.txt, which has the + closest thing to a machine-readable statement of the version + number that I found in the whole collection of files. + """ + with self.open_ucd_file("NamesList.txt") as fh: + for line in lines(fh): + if line.startswith("@@@\t"): + self.unicode_version_full = line[4:] + self.unicode_version_short = " ".join( + w for w in self.unicode_version_full.split(" ") + if any(c in string.digits for c in w)) + return + @property def UnicodeData(self): """Records from UnicodeData.txt. @@ -210,7 +231,41 @@ class Main: for c in cs: yield c, fields[1] + def write_file_header_comment(self, fh, description): + print("/*", file=fh) + print(" * Autogenerated by read_ucd.py from", + self.unicode_version_full, file=fh) + print(" *", file=fh) + for line in description.strip("\n").split("\n"): + print(" *" + (" " if line != "" else "") + line, file=fh) + print(" */", file=fh) + print(file=fh) + + def write_version_header(self, fh): + self.write_file_header_comment(fh, """ + +String literals giving the currently supported version of Unicode. +Useful for error messages and 'about' boxes. + +""") + assert all(0x20 <= ord(c) < 0x7F and c != '"' + for c in self.unicode_version_full) + + print("#define UNICODE_VERSION_FULL \"{}\"".format( + self.unicode_version_full), file=fh) + print("#define UNICODE_VERSION_SHORT \"{}\"".format( + self.unicode_version_short), file=fh) + def write_bidi_type_table(self, fh): + self.write_file_header_comment(fh, """ + +Bidirectional type of every Unicode character, excluding those with +type ON. + +Used by terminal/bidi.c, whose associated lookup function returns ON +by default for anything not in this list. + +""") types = {} for rec in self.UnicodeData: @@ -218,9 +273,17 @@ class Main: types[rec.c] = rec.Bidi_Class for (start, end), t in map_to_ranges(types): - print(f" {{0x{start:04x}, 0x{end:04x}, {t}}},", file=fh) + print(f"{{0x{start:04x}, 0x{end:04x}, {t}}},", file=fh) def write_bidi_mirroring_table(self, fh): + self.write_file_header_comment(fh, """ + +Map each Unicode character to its mirrored form when printing right to +left. + +Used by terminal/bidi.c. + +""") bidi_mirror = {} for c1, c2 in self.BidiMirroring: assert bidi_mirror.get(c1, c2) == c2, f"Clash at {c1:%04X}" @@ -229,9 +292,19 @@ class Main: bidi_mirror[c2] = c1 for c1, c2 in sorted(bidi_mirror.items()): - print(" {{0x{:04x}, 0x{:04x}}},".format(c1, c2), file=fh) + print("{{0x{:04x}, 0x{:04x}}},".format(c1, c2), file=fh) def write_bidi_brackets_table(self, fh): + self.write_file_header_comment(fh, """ + +Identify Unicode characters that count as brackets for the purposes of +bidirectional text layout. For each one, indicate whether it's an open +or closed bracket, and identify up to two characters that can act as +its counterpart. + +Used by terminal/bidi.c. + +""") bracket_map = {} for c1, c2, kind in self.BidiBrackets: bracket_map[c1] = kind, c2 @@ -249,10 +322,18 @@ class Main: # UCD claims there's an 'n' kind possible, but as of UCD # 14, no instances of it exist enumval = {'o': 'BT_OPEN', 'c': 'BT_CLOSE'}[kind] - print(" {{0x{:04x}, {{0x{:04x}, 0x{:04x}, {}}}}},".format( + print("{{0x{:04x}, {{0x{:04x}, 0x{:04x}, {}}}}},".format( src, dst, dsteq, enumval), file=fh) def write_nonspacing_chars_list(self, fh): + self.write_file_header_comment(fh, """ + +Identify Unicode characters that occupy no character cells of a +terminal. + +Used by utils/wcwidth.c. + +""") cs = set() for rec in self.UnicodeData: @@ -273,7 +354,7 @@ class Main: cs.add(rec.c) for start, end in set_to_ranges(cs): - print(f" {{ 0x{start:04X}, 0x{end:04X} }},", file=fh) + print(f"{{0x{start:04x}, 0x{end:04x}}},", file=fh) def write_width_table(self, fh, accept): cs = set() @@ -283,12 +364,29 @@ class Main: cs.add(c) for start, end in set_to_ranges(cs): - print(f" {{0x{start:04X}, 0x{end:04X}}},", file=fh) + print(f"{{0x{start:04x}, 0x{end:04x}}},", file=fh) def write_wide_chars_list(self, fh): + self.write_file_header_comment(fh, """ + +Identify Unicode characters that occupy two adjacent character cells +in a terminal. + +Used by utils/wcwidth.c. + +""") self.write_width_table(fh, {'W', 'F'}) def write_ambiguous_wide_chars_list(self, fh): + self.write_file_header_comment(fh, """ + +Identify Unicode characters that are width-ambiguous: some regimes +regard them as occupying two adjacent character cells in a terminal, +and others do not. + +Used by utils/wcwidth.c. + +""") self.write_width_table(fh, {'A'}) if __name__ == '__main__':