#!/usr/bin/env python3 # Tool to read various files from the Unicode character database and # generate headers containing derived arrays and lookup tables needed # by PuTTY. # # The aim is to have this be a single tool which you can easily re-run # against a new version of Unicode, simply by pointing it at an # appropriate UCD.zip or a directory containing the same files # unpacked. import argparse import collections import io import os import sys import zipfile UCDRecord = collections.namedtuple('UCDRecord', [ 'c', 'General_Category', 'Bidi_Class', 'Decomposition_Mapping', ]) def to_ranges(iterable): """Collect together adjacent ranges in a list of (key, value) pairs. The input iterable should deliver a sequence of (key, value) pairs in which the keys are integers in sorted order. The output is a sequence of tuples with structure ((start, end), value), each indicating that all the keys [start, start+1, ..., end] go with that value. """ start = end = val = None for k, v in iterable: if (k-1, v) == (end, val): end = k else: if start is not None: yield (start, end), val start, end, val = k, k, v if start is not None: yield (start, end), val def map_to_ranges(m): """Convert an integer-keyed map into a list of (range, value) pairs.""" yield from to_ranges(sorted(m.items())) def set_to_ranges(s): """Convert a set into a list of ranges.""" for r, _ in to_ranges((x, None) for x in sorted(s)): yield r def lines(iterable, keep_comments=False): """Deliver the lines of a Unicode data file. The input iterable should yield raw lines of the file: for example, it can be the file handle itself. The output values have their newlines removed, comments and trailing spaces deleted, and blank lines discarded. """ for line in iter(iterable): line = line.rstrip("\r\n") if not keep_comments: line = line.split("#", 1)[0] line = line.rstrip(" \t") if line == "": continue yield line class Main: def run(self): "Parse arguments and generate all the output files." parser = argparse.ArgumentParser( description='Build UCD-derived source files.') parser.add_argument("ucd", help="UCD to work from, either UCD.zip or " "a directory full of unpacked files.") args = parser.parse_args() if os.path.isdir(args.ucd): ucd_dir = args.ucd self.open_ucd_file = lambda filename: ( open(os.path.join(ucd_dir, filename))) else: ucd_zip = zipfile.ZipFile(args.ucd) self.open_ucd_file = lambda filename: ( io.TextIOWrapper(ucd_zip.open(filename))) with open("bidi_type.h", "w") as fh: self.write_bidi_type_table(fh) with open("bidi_mirror.h", "w") as fh: self.write_bidi_mirroring_table(fh) with open("bidi_brackets.h", "w") as fh: self.write_bidi_brackets_table(fh) with open("nonspacing_chars.h", "w") as fh: self.write_nonspacing_chars_list(fh) with open("wide_chars.h", "w") as fh: self.write_wide_chars_list(fh) with open("ambiguous_wide_chars.h", "w") as fh: self.write_ambiguous_wide_chars_list(fh) @property def UnicodeData(self): """Records from UnicodeData.txt. Each yielded item is a UCDRecord tuple. """ with self.open_ucd_file("UnicodeData.txt") as fh: for line in lines(fh): # Split up the line into its raw fields. ( codepoint, name, category, cclass, bidiclass, decomp, num6, num7, num8, bidimirrored, obsolete_unicode1_name, obsolete_comment, uppercase, lowercase, titlecase, ) = line.split(";") # By default, we expect that this record describes # just one code point. codepoints = [int(codepoint, 16)] # Spot the special markers where consecutive lines say # and , indicating that the # entire range of code points in between are treated # the same. If so, we replace 'codepoints' with a # range object. if "<" in name: assert name.startswith("<") and name.endswith(">"), ( "Confusing < in character name: {!r}".format(line)) name_pieces = [piece.strip(" \t") for piece in name.lstrip("<").rstrip(">").split(",")] if "First" in name_pieces: assert isinstance(codepoints, list) prev_line_was_first = True prev_codepoint = codepoints[0] continue elif "Last" in name_pieces: assert prev_line_was_first codepoints = range(prev_codepoint, codepoints[0]+1) del prev_codepoint prev_line_was_first = False # Decode some of the raw fields into more cooked # forms. # For the moment, we only care about decomposition # mappings that consist of a single hex number (i.e. # are singletons and not compatibility mappings) try: dm = [int(decomp, 16)] except ValueError: dm = [] # And yield a UCDRecord for each code point in our # range. for codepoint in codepoints: yield UCDRecord( c=codepoint, General_Category=category, Bidi_Class=bidiclass, Decomposition_Mapping=dm, ) @property def BidiMirroring(self): """Parsed character pairs from BidiMirroring.txt. Each yielded tuple is a pair of Unicode code points. """ with self.open_ucd_file("BidiMirroring.txt") as fh: for line in lines(fh): cs1, cs2 = line.split(";") c1 = int(cs1, 16) c2 = int(cs2, 16) yield c1, c2 @property def BidiBrackets(self): """Bracket pairs from BidiBrackets.txt. Each yielded tuple is a pair of Unicode code points, followed by either 'o', 'c' or 'n' to indicate whether the first one is an open or closing parenthesis or neither. """ with self.open_ucd_file("BidiBrackets.txt") as fh: for line in lines(fh): cs1, cs2, kind = line.split(";") c1 = int(cs1, 16) c2 = int(cs2, 16) kind = kind.strip(" \t") yield c1, c2, kind @property def EastAsianWidth(self): """East Asian width types from EastAsianWidth.txt. Each yielded tuple is (code point, width type). """ with self.open_ucd_file("EastAsianWidth.txt") as fh: for line in lines(fh): fields = line.split(";") if ".." in fields[0]: start, end = [int(s, 16) for s in fields[0].split("..")] cs = range(start, end+1) else: cs = [int(fields[0], 16)] for c in cs: yield c, fields[1] def write_bidi_type_table(self, fh): types = {} for rec in self.UnicodeData: if rec.Bidi_Class != "ON": types[rec.c] = rec.Bidi_Class for (start, end), t in map_to_ranges(types): print(f" {{0x{start:04x}, 0x{end:04x}, {t}}},", file=fh) def write_bidi_mirroring_table(self, fh): bidi_mirror = {} for c1, c2 in self.BidiMirroring: assert bidi_mirror.get(c1, c2) == c2, f"Clash at {c1:%04X}" bidi_mirror[c1] = c2 assert bidi_mirror.get(c2, c1) == c1, f"Clash at {c2:%04X}" bidi_mirror[c2] = c1 for c1, c2 in sorted(bidi_mirror.items()): print(" {{0x{:04x}, 0x{:04x}}},".format(c1, c2), file=fh) def write_bidi_brackets_table(self, fh): bracket_map = {} for c1, c2, kind in self.BidiBrackets: bracket_map[c1] = kind, c2 equivalents = {} for rec in self.UnicodeData: if len(rec.Decomposition_Mapping) == 1: c = rec.c c2 = rec.Decomposition_Mapping[0] equivalents[c] = c2 equivalents[c2] = c for src, (kind, dst) in sorted(bracket_map.items()): dsteq = equivalents.get(dst, 0) # UCD claims there's an 'n' kind possible, but as of UCD # 14, no instances of it exist enumval = {'o': 'BT_OPEN', 'c': 'BT_CLOSE'}[kind] print(" {{0x{:04x}, {{0x{:04x}, 0x{:04x}, {}}}}},".format( src, dst, dsteq, enumval), file=fh) def write_nonspacing_chars_list(self, fh): cs = set() for rec in self.UnicodeData: nonspacing = rec.General_Category in {"Me", "Mn", "Cf"} if rec.c == 0xAD: # In typography this is a SOFT HYPHEN and counts as # discardable. But it's also an ISO 8859-1 printing # character, and all of those occupy one character # cell in a terminal. nonspacing = False if 0x1160 <= rec.c <= 0x11FF: # Medial (vowel) and final (consonant) jamo for # decomposed Hangul characters. These are regarded as # non-spacing on the grounds that they compose with # the preceding initial consonant. nonspacing = True if nonspacing: cs.add(rec.c) for start, end in set_to_ranges(cs): print(f" {{ 0x{start:04X}, 0x{end:04X} }},", file=fh) def write_width_table(self, fh, accept): cs = set() for c, wid in self.EastAsianWidth: if wid in accept: cs.add(c) for start, end in set_to_ranges(cs): print(f" {{0x{start:04X}, 0x{end:04X}}},", file=fh) def write_wide_chars_list(self, fh): self.write_width_table(fh, {'W', 'F'}) def write_ambiguous_wide_chars_list(self, fh): self.write_width_table(fh, {'A'}) if __name__ == '__main__': Main().run()