1
0
mirror of https://git.tartarus.org/simon/putty.git synced 2025-01-10 01:48:00 +00:00

Polish the output of read_ucd.py.

The initial outputs were all deliberately inconsistent with each
other, so that each one exactly matched the existing table I was
trying to replace.

Now I've done that check, I can clean them up. Normalised spacing and
case to be consistent; removed pointless indentation (these are now
include files, so they don't have to be indented to the same level as
the array declaration surrounding each one's #include); added a header
comment in each autogenerated file, saying that it's autogenerated,
what it's for, and who it's used by.

The currently supported version number of Unicode is also exposed in a
header file, so that I can put it in diagnostics.
This commit is contained in:
Simon Tatham 2022-11-08 18:04:46 +00:00
parent b72c9aba28
commit 430af47a38

View File

@ -13,6 +13,7 @@ import argparse
import collections
import io
import os
import string
import sys
import zipfile
@ -90,6 +91,10 @@ class Main:
self.open_ucd_file = lambda filename: (
io.TextIOWrapper(ucd_zip.open(filename)))
self.find_unicode_version()
with open("version.h", "w") as fh:
self.write_version_header(fh)
with open("bidi_type.h", "w") as fh:
self.write_bidi_type_table(fh)
with open("bidi_mirror.h", "w") as fh:
@ -103,6 +108,22 @@ class Main:
with open("ambiguous_wide_chars.h", "w") as fh:
self.write_ambiguous_wide_chars_list(fh)
def find_unicode_version(self):
    """Find out the version of Unicode.

    This is read from the top of NamesList.txt, which has the
    closest thing to a machine-readable statement of the version
    number that I found in the whole collection of files.

    The first line beginning with '@@@' followed by a tab is used.
    Everything after that 4-character prefix is stored in
    self.unicode_version_full; self.unicode_version_short keeps only
    the space-separated words of it that contain an ASCII digit.

    Raises ValueError if no such line exists, rather than returning
    with the two attributes unset and failing later with an
    unrelated AttributeError.
    """
    with self.open_ucd_file("NamesList.txt") as fh:
        # NOTE(review): line[4:] is taken verbatim, so this assumes the
        # lines() helper (defined elsewhere in the file) yields lines
        # without their trailing newline -- confirm.
        for line in lines(fh):
            if line.startswith("@@@\t"):
                self.unicode_version_full = line[4:]
                self.unicode_version_short = " ".join(
                    w for w in self.unicode_version_full.split(" ")
                    if any(c in string.digits for c in w))
                return
    raise ValueError(
        "NamesList.txt contains no line starting with '@@@<TAB>'; "
        "cannot determine the Unicode version")
@property
def UnicodeData(self):
"""Records from UnicodeData.txt.
@ -210,7 +231,41 @@ class Main:
for c in cs:
yield c, fields[1]
def write_file_header_comment(self, fh, description):
    """Write the C block comment that heads an autogenerated file.

    The comment names read_ucd.py and self.unicode_version_full as the
    origin of the file, then reproduces 'description' line by line.
    Leading and trailing newlines of 'description' are discarded first.
    A single empty line is written after the closing ' */'.
    """
    def emit(*parts):
        print(*parts, file=fh)

    emit("/*")
    emit(" * Autogenerated by read_ucd.py from", self.unicode_version_full)
    emit(" *")
    for text in description.strip("\n").split("\n"):
        if text == "":
            # Blank description line: bare ' *', no trailing space.
            emit(" *")
        else:
            emit(" * " + text)
    emit(" */")
    emit()
def write_version_header(self, fh):
    """Write the header defining the supported Unicode version.

    Emits the standard autogenerated-file comment, followed by two
    #defines: UNICODE_VERSION_FULL and UNICODE_VERSION_SHORT, each a C
    string literal holding the corresponding self.unicode_version_*
    attribute.
    """
    self.write_file_header_comment(fh, """
String literals giving the currently supported version of Unicode.
Useful for error messages and 'about' boxes.
""")
    # The version text is pasted verbatim between double quotes in C
    # source, so it must be printable ASCII and must not contain a
    # double quote or a backslash (which would begin an escape
    # sequence). unicode_version_short is built from words of the full
    # string, so checking the full string covers both.
    assert all(0x20 <= ord(c) < 0x7F and c not in '"\\'
               for c in self.unicode_version_full)
    print("#define UNICODE_VERSION_FULL \"{}\"".format(
        self.unicode_version_full), file=fh)
    print("#define UNICODE_VERSION_SHORT \"{}\"".format(
        self.unicode_version_short), file=fh)
def write_bidi_type_table(self, fh):
self.write_file_header_comment(fh, """
Bidirectional type of every Unicode character, excluding those with
type ON.
Used by terminal/bidi.c, whose associated lookup function returns ON
by default for anything not in this list.
""")
types = {}
for rec in self.UnicodeData:
@ -218,9 +273,17 @@ class Main:
types[rec.c] = rec.Bidi_Class
for (start, end), t in map_to_ranges(types):
print(f" {{0x{start:04x}, 0x{end:04x}, {t}}},", file=fh)
print(f"{{0x{start:04x}, 0x{end:04x}, {t}}},", file=fh)
def write_bidi_mirroring_table(self, fh):
self.write_file_header_comment(fh, """
Map each Unicode character to its mirrored form when printing right to
left.
Used by terminal/bidi.c.
""")
bidi_mirror = {}
for c1, c2 in self.BidiMirroring:
assert bidi_mirror.get(c1, c2) == c2, f"Clash at {c1:%04X}"
@ -229,9 +292,19 @@ class Main:
bidi_mirror[c2] = c1
for c1, c2 in sorted(bidi_mirror.items()):
print(" {{0x{:04x}, 0x{:04x}}},".format(c1, c2), file=fh)
print("{{0x{:04x}, 0x{:04x}}},".format(c1, c2), file=fh)
def write_bidi_brackets_table(self, fh):
self.write_file_header_comment(fh, """
Identify Unicode characters that count as brackets for the purposes of
bidirectional text layout. For each one, indicate whether it's an open
or closed bracket, and identify up to two characters that can act as
its counterpart.
Used by terminal/bidi.c.
""")
bracket_map = {}
for c1, c2, kind in self.BidiBrackets:
bracket_map[c1] = kind, c2
@ -249,10 +322,18 @@ class Main:
# UCD claims there's an 'n' kind possible, but as of UCD
# 14, no instances of it exist
enumval = {'o': 'BT_OPEN', 'c': 'BT_CLOSE'}[kind]
print(" {{0x{:04x}, {{0x{:04x}, 0x{:04x}, {}}}}},".format(
print("{{0x{:04x}, {{0x{:04x}, 0x{:04x}, {}}}}},".format(
src, dst, dsteq, enumval), file=fh)
def write_nonspacing_chars_list(self, fh):
self.write_file_header_comment(fh, """
Identify Unicode characters that occupy no character cells of a
terminal.
Used by utils/wcwidth.c.
""")
cs = set()
for rec in self.UnicodeData:
@ -273,7 +354,7 @@ class Main:
cs.add(rec.c)
for start, end in set_to_ranges(cs):
print(f" {{ 0x{start:04X}, 0x{end:04X} }},", file=fh)
print(f"{{0x{start:04x}, 0x{end:04x}}},", file=fh)
def write_width_table(self, fh, accept):
cs = set()
@ -283,12 +364,29 @@ class Main:
cs.add(c)
for start, end in set_to_ranges(cs):
print(f" {{0x{start:04X}, 0x{end:04X}}},", file=fh)
print(f"{{0x{start:04x}, 0x{end:04x}}},", file=fh)
def write_wide_chars_list(self, fh):
    """Write the list of characters two terminal cells wide.

    Emits the autogenerated-file comment, then delegates to
    write_width_table with the accept set {'W', 'F'}.
    """
    description = """
Identify Unicode characters that occupy two adjacent character cells
in a terminal.
Used by utils/wcwidth.c.
"""
    accepted = {'W', 'F'}
    self.write_file_header_comment(fh, description)
    self.write_width_table(fh, accepted)
def write_ambiguous_wide_chars_list(self, fh):
    """Write the list of width-ambiguous characters.

    Emits the autogenerated-file comment, then delegates to
    write_width_table with the accept set {'A'}.
    """
    description = """
Identify Unicode characters that are width-ambiguous: some regimes
regard them as occupying two adjacent character cells in a terminal,
and others do not.
Used by utils/wcwidth.c.
"""
    accepted = {'A'}
    self.write_file_header_comment(fh, description)
    self.write_width_table(fh, accepted)
if __name__ == '__main__':