1
0
mirror of https://git.tartarus.org/simon/putty.git synced 2025-07-18 11:31:00 -05:00

Implement Unicode normalisation.

A new module in 'utils' computes NFC and NFD, via a new set of data
tables generated by read_ucd.py.

The new module comes with a new test program, which can read the
NormalizationTest.txt that appears in the Unicode Character Database.
All the tests pass, as of Unicode 15.
This commit is contained in:
Simon Tatham
2022-11-09 19:28:51 +00:00
parent 4cb429e3f4
commit b35d23f699
8 changed files with 3984 additions and 9 deletions

View File

@ -20,7 +20,9 @@ import zipfile
UCDRecord = collections.namedtuple('UCDRecord', [
'c',
'General_Category',
'Canonical_Combining_Class',
'Bidi_Class',
'Decomposition_Type',
'Decomposition_Mapping',
])
@ -107,6 +109,12 @@ class Main:
self.write_wide_chars_list(fh)
with open("ambiguous_wide_chars.h", "w") as fh:
self.write_ambiguous_wide_chars_list(fh)
with open("combining_classes.h", "w") as fh:
self.write_combining_class_table(fh)
with open("canonical_decomp.h", "w") as fh:
self.write_canonical_decomp_table(fh)
with open("canonical_comp.h", "w") as fh:
self.write_canonical_comp_table(fh)
def find_unicode_version(self):
"""Find out the version of Unicode.
@ -166,14 +174,21 @@ class Main:
# Decode some of the raw fields into more cooked
# forms.
cclass = int(cclass)
# For the moment, we only care about decomposition
# mappings that consist of a single hex number (i.e.
# are singletons and not compatibility mappings)
try:
dm = [int(decomp, 16)]
except ValueError:
dm = []
# Separate the decomposition field into decomposition
# type and mapping.
if decomp == "":
dtype = decomp = None
elif "<" not in decomp:
dtype = 'canonical'
else:
assert decomp.startswith("<")
dtype, decomp = decomp[1:].split(">", 1)
decomp = decomp.lstrip(" ")
# And decode the mapping part from hex strings to integers.
if decomp is not None:
decomp = [int(w, 16) for w in decomp.split(" ")]
# And yield a UCDRecord for each code point in our
# range.
@ -181,8 +196,10 @@ class Main:
yield UCDRecord(
c=codepoint,
General_Category=category,
Canonical_Combining_Class=cclass,
Bidi_Class=bidiclass,
Decomposition_Mapping=dm,
Decomposition_Type=dtype,
Decomposition_Mapping=decomp,
)
@property
@ -231,6 +248,16 @@ class Main:
for c in cs:
yield c, fields[1]
@property
def CompositionExclusions(self):
"""Composition exclusions from CompositionExclusions.txt.
Each yielded item is just a code point.
"""
with self.open_ucd_file("CompositionExclusions.txt") as fh:
for line in lines(fh):
yield int(line, 16)
def write_file_header_comment(self, fh, description):
print("/*", file=fh)
print(" * Autogenerated by read_ucd.py from",
@ -311,7 +338,8 @@ Used by terminal/bidi.c.
equivalents = {}
for rec in self.UnicodeData:
if len(rec.Decomposition_Mapping) == 1:
if (rec.Decomposition_Type == 'canonical' and
len(rec.Decomposition_Mapping) == 1):
c = rec.c
c2 = rec.Decomposition_Mapping[0]
equivalents[c] = c2
@ -389,5 +417,78 @@ Used by utils/wcwidth.c.
""")
self.write_width_table(fh, {'A'})
def write_combining_class_table(self, fh):
self.write_file_header_comment(fh, """
List the canonical combining class of each Unicode character, if it is
not zero. This controls how combining marks can be reordered by the
Unicode normalisation algorithms.
Used by utils/unicode-norm.c.
""")
cclasses = {}
for rec in self.UnicodeData:
cc = rec.Canonical_Combining_Class
if cc != 0:
cclasses[rec.c] = cc
for (start, end), cclass in map_to_ranges(cclasses):
print(f"{{0x{start:04x}, 0x{end:04x}, {cclass:d}}},", file=fh)
def write_canonical_decomp_table(self, fh):
self.write_file_header_comment(fh, """
List the canonical decomposition of every Unicode character that has
one. This consists of up to two characters, but those may need
decomposition in turn.
Used by utils/unicode-norm.c.
""")
decomps = {}
for rec in self.UnicodeData:
if rec.Decomposition_Type != 'canonical':
continue
# Fill in a zero code point as the second character, if
# it's only one character long
decomps[rec.c] = (rec.Decomposition_Mapping + [0])[:2]
for c, (d1, d2) in sorted(decomps.items()):
d2s = f"0x{d2:04x}" if d2 else "0"
print(f"{{0x{c:04x}, 0x{d1:04x}, {d2s}}},", file=fh)
def write_canonical_comp_table(self, fh):
self.write_file_header_comment(fh, """
List the pairs of Unicode characters that canonically recompose to a
single character in NFC.
Used by utils/unicode-norm.c.
""")
exclusions = set(self.CompositionExclusions)
nonstarters = set(rec.c for rec in self.UnicodeData
if rec.Canonical_Combining_Class != 0)
decomps = {}
for rec in self.UnicodeData:
if rec.Decomposition_Type != 'canonical':
continue # we don't want compatibility decompositions
if len(rec.Decomposition_Mapping) != 2:
continue # we don't want singletons either
if rec.c in exclusions:
continue # we don't want anything explicitly excluded
if (rec.c in nonstarters or
rec.Decomposition_Mapping[0] in nonstarters):
continue # we don't want non-starter decompositions
decomps[tuple(rec.Decomposition_Mapping)] = rec.c
for (d0, d1), c in sorted(decomps.items()):
print(f"{{0x{d0:04x}, 0x{d1:04x}, 0x{c:04x}}},", file=fh)
if __name__ == '__main__':
Main().run()