mirror of
https://git.tartarus.org/simon/putty.git
synced 2025-01-10 01:48:00 +00:00
Polish the output of read_ucd.py.
The initial outputs were all deliberately inconsistent with each other, so that each one exactly matched the existing table I was trying to replace. Now I've done that check, I can clean them up. Normalised spacing and case to be consistent; removed pointless indentation (these are now include files, so they don't have to be indented to the same level as the array declaration surrounding each one's #include); added a header comment in each autogenerated file, saying that it's autogenerated, what it's for, and who it's used by. The currently supported version number of Unicode is also exposed in a header file, so that I can put it in diagnostics.
This commit is contained in:
parent
b72c9aba28
commit
430af47a38
@ -13,6 +13,7 @@ import argparse
|
||||
import collections
|
||||
import io
|
||||
import os
|
||||
import string
|
||||
import sys
|
||||
import zipfile
|
||||
|
||||
@ -90,6 +91,10 @@ class Main:
|
||||
self.open_ucd_file = lambda filename: (
|
||||
io.TextIOWrapper(ucd_zip.open(filename)))
|
||||
|
||||
self.find_unicode_version()
|
||||
|
||||
with open("version.h", "w") as fh:
|
||||
self.write_version_header(fh)
|
||||
with open("bidi_type.h", "w") as fh:
|
||||
self.write_bidi_type_table(fh)
|
||||
with open("bidi_mirror.h", "w") as fh:
|
||||
@ -103,6 +108,22 @@ class Main:
|
||||
with open("ambiguous_wide_chars.h", "w") as fh:
|
||||
self.write_ambiguous_wide_chars_list(fh)
|
||||
|
||||
def find_unicode_version(self):
    """Find out the version of Unicode.

    This is read from the top of NamesList.txt, which has the
    closest thing to a machine-readable statement of the version
    number that I found in the whole collection of files.

    The first line beginning with '@@@' followed by a tab supplies
    the version. Two attributes are set from it:

    unicode_version_full is everything on that line after the
    marker.

    unicode_version_short is the space-separated words of the full
    string that contain at least one ASCII digit, rejoined with
    single spaces.

    Raises ValueError if the file has no such line, rather than
    leaving both attributes unset for a later method to trip over.
    """
    marker = "@@@\t"
    with self.open_ucd_file("NamesList.txt") as fh:
        for line in lines(fh):
            if not line.startswith(marker):
                continue
            full = line[len(marker):]
            self.unicode_version_full = full
            # Keep only the words that look like part of a version
            # number, i.e. the ones with a digit in them.
            self.unicode_version_short = " ".join(
                w for w in full.split(" ")
                if any(c in string.digits for c in w))
            return
    raise ValueError(
        "no '@@@<TAB>' title line found in NamesList.txt; "
        "cannot determine the Unicode version")
|
||||
|
||||
@property
|
||||
def UnicodeData(self):
|
||||
"""Records from UnicodeData.txt.
|
||||
@ -210,7 +231,41 @@ class Main:
|
||||
for c in cs:
|
||||
yield c, fields[1]
|
||||
|
||||
def write_file_header_comment(self, fh, description):
    """Write the standard C block comment that heads each output file.

    The comment states that the file was autogenerated by
    read_ucd.py from self.unicode_version_full, then gives the
    lines of 'description' (with leading and trailing newlines
    removed), and is followed by one blank line.

    fh is the text file object to write to. description is the
    free-text explanation; its empty lines become a bare ' *' so
    that the output has no trailing whitespace.
    """
    body = [
        " * " + text if text != "" else " *"
        for text in description.strip("\n").split("\n")
    ]
    output = [
        "/*",
        f" * Autogenerated by read_ucd.py from {self.unicode_version_full}",
        " *",
        *body,
        " */",
        # Blank separator line between the comment and the data.
        "",
    ]
    for text in output:
        print(text, file=fh)
|
||||
|
||||
def write_version_header(self, fh):
    """Write the header exposing the supported Unicode version.

    After the standard header comment, two lines are written to fh:
    '#define UNICODE_VERSION_FULL "..."' from
    self.unicode_version_full, and
    '#define UNICODE_VERSION_SHORT "..."' from
    self.unicode_version_short.

    Raises ValueError if unicode_version_full contains a character
    that cannot be pasted verbatim between double quotes in C:
    anything outside printable ASCII, a double quote, or a
    backslash. unicode_version_short is not checked separately.
    """
    self.write_file_header_comment(fh, """

String literals giving the currently supported version of Unicode.
Useful for error messages and 'about' boxes.

""")

    # The strings are emitted without any escaping, so refuse
    # anything that would end the literal early ('"') or start an
    # escape sequence ('\\'), as well as non-printable characters.
    for c in self.unicode_version_full:
        if not 0x20 <= ord(c) < 0x7F or c in '"\\':
            raise ValueError(
                "Unicode version string {!r} contains {!r}, which cannot "
                "appear unescaped in a C string literal".format(
                    self.unicode_version_full, c))

    print("#define UNICODE_VERSION_FULL \"{}\"".format(
        self.unicode_version_full), file=fh)
    print("#define UNICODE_VERSION_SHORT \"{}\"".format(
        self.unicode_version_short), file=fh)
|
||||
|
||||
def write_bidi_type_table(self, fh):
|
||||
self.write_file_header_comment(fh, """
|
||||
|
||||
Bidirectional type of every Unicode character, excluding those with
|
||||
type ON.
|
||||
|
||||
Used by terminal/bidi.c, whose associated lookup function returns ON
|
||||
by default for anything not in this list.
|
||||
|
||||
""")
|
||||
types = {}
|
||||
|
||||
for rec in self.UnicodeData:
|
||||
@ -218,9 +273,17 @@ class Main:
|
||||
types[rec.c] = rec.Bidi_Class
|
||||
|
||||
for (start, end), t in map_to_ranges(types):
|
||||
print(f" {{0x{start:04x}, 0x{end:04x}, {t}}},", file=fh)
|
||||
print(f"{{0x{start:04x}, 0x{end:04x}, {t}}},", file=fh)
|
||||
|
||||
def write_bidi_mirroring_table(self, fh):
|
||||
self.write_file_header_comment(fh, """
|
||||
|
||||
Map each Unicode character to its mirrored form when printing right to
|
||||
left.
|
||||
|
||||
Used by terminal/bidi.c.
|
||||
|
||||
""")
|
||||
bidi_mirror = {}
|
||||
for c1, c2 in self.BidiMirroring:
|
||||
assert bidi_mirror.get(c1, c2) == c2, f"Clash at {c1:%04X}"
|
||||
@ -229,9 +292,19 @@ class Main:
|
||||
bidi_mirror[c2] = c1
|
||||
|
||||
for c1, c2 in sorted(bidi_mirror.items()):
|
||||
print(" {{0x{:04x}, 0x{:04x}}},".format(c1, c2), file=fh)
|
||||
print("{{0x{:04x}, 0x{:04x}}},".format(c1, c2), file=fh)
|
||||
|
||||
def write_bidi_brackets_table(self, fh):
|
||||
self.write_file_header_comment(fh, """
|
||||
|
||||
Identify Unicode characters that count as brackets for the purposes of
|
||||
bidirectional text layout. For each one, indicate whether it's an open
|
||||
or closed bracket, and identify up to two characters that can act as
|
||||
its counterpart.
|
||||
|
||||
Used by terminal/bidi.c.
|
||||
|
||||
""")
|
||||
bracket_map = {}
|
||||
for c1, c2, kind in self.BidiBrackets:
|
||||
bracket_map[c1] = kind, c2
|
||||
@ -249,10 +322,18 @@ class Main:
|
||||
# UCD claims there's an 'n' kind possible, but as of UCD
|
||||
# 14, no instances of it exist
|
||||
enumval = {'o': 'BT_OPEN', 'c': 'BT_CLOSE'}[kind]
|
||||
print(" {{0x{:04x}, {{0x{:04x}, 0x{:04x}, {}}}}},".format(
|
||||
print("{{0x{:04x}, {{0x{:04x}, 0x{:04x}, {}}}}},".format(
|
||||
src, dst, dsteq, enumval), file=fh)
|
||||
|
||||
def write_nonspacing_chars_list(self, fh):
|
||||
self.write_file_header_comment(fh, """
|
||||
|
||||
Identify Unicode characters that occupy no character cells of a
|
||||
terminal.
|
||||
|
||||
Used by utils/wcwidth.c.
|
||||
|
||||
""")
|
||||
cs = set()
|
||||
|
||||
for rec in self.UnicodeData:
|
||||
@ -273,7 +354,7 @@ class Main:
|
||||
cs.add(rec.c)
|
||||
|
||||
for start, end in set_to_ranges(cs):
|
||||
print(f" {{ 0x{start:04X}, 0x{end:04X} }},", file=fh)
|
||||
print(f"{{0x{start:04x}, 0x{end:04x}}},", file=fh)
|
||||
|
||||
def write_width_table(self, fh, accept):
|
||||
cs = set()
|
||||
@ -283,12 +364,29 @@ class Main:
|
||||
cs.add(c)
|
||||
|
||||
for start, end in set_to_ranges(cs):
|
||||
print(f" {{0x{start:04X}, 0x{end:04X}}},", file=fh)
|
||||
print(f"{{0x{start:04x}, 0x{end:04x}}},", file=fh)
|
||||
|
||||
def write_wide_chars_list(self, fh):
    """Write the list of double-width characters to fh.

    Emits the standard header comment, then delegates to
    write_width_table with the accepted set {'W', 'F'}.
    """
    description = """

Identify Unicode characters that occupy two adjacent character cells
in a terminal.

Used by utils/wcwidth.c.

"""
    accepted = {'W', 'F'}
    self.write_file_header_comment(fh, description)
    self.write_width_table(fh, accepted)
|
||||
|
||||
def write_ambiguous_wide_chars_list(self, fh):
    """Write the list of width-ambiguous characters to fh.

    Emits the standard header comment, then delegates to
    write_width_table with the accepted set {'A'}.
    """
    description = """

Identify Unicode characters that are width-ambiguous: some regimes
regard them as occupying two adjacent character cells in a terminal,
and others do not.

Used by utils/wcwidth.c.

"""
    accepted = {'A'}
    self.write_file_header_comment(fh, description)
    self.write_width_table(fh, accepted)
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
Loading…
Reference in New Issue
Block a user