#!/usr/bin/env python3

# Tool to read various files from the Unicode character database and
# generate headers containing derived arrays and lookup tables needed
# by PuTTY.
#
# The aim is to have this be a single tool which you can easily re-run
# against a new version of Unicode, simply by pointing it at an
# appropriate UCD.zip or a directory containing the same files
# unpacked.

import argparse
import collections
import io
import os
import sys
import zipfile

# One character's worth of data taken from UnicodeData.txt:
#   c                      the code point, as an integer
#   General_Category       the General_Category field, as read from the file
#   Bidi_Class             the Bidi_Class field, as read from the file
#   Decomposition_Mapping  a one-element list holding the code point of a
#                          single-number decomposition, or an empty list
UCDRecord = collections.namedtuple(
    'UCDRecord',
    ('c', 'General_Category', 'Bidi_Class', 'Decomposition_Mapping'),
)

def to_ranges(iterable):
    """Merge runs of consecutive keys that carry the same value.

    'iterable' must deliver (key, value) pairs whose keys are integers
    in ascending order. Each maximal run of keys k, k+1, ..., k+n that
    all map to an equal value is reported as a single tuple
    ((k, k+n), value). Nothing is yielded for an empty input.
    """
    have_run = False
    first = last = payload = None

    for key, value in iterable:
        if (key - 1, value) == (last, payload):
            # This pair extends the run currently being built.
            last = key
            continue

        # Otherwise, flush the finished run (if any) and start afresh.
        if have_run:
            yield (first, last), payload
        have_run = True
        first = last = key
        payload = value

    if have_run:
        yield (first, last), payload

def map_to_ranges(m):
    """Yield ((start, end), value) pairs covering an integer-keyed map.

    The items of 'm' are sorted and then handed to to_ranges, so that
    adjacent keys with equal values are merged into a single range.
    """
    ordered_items = sorted(m.items())
    for span, value in to_ranges(ordered_items):
        yield span, value

def set_to_ranges(s):
    """Yield (start, end) pairs covering the integers in a set.

    Each element is paired with a dummy value of None so that
    to_ranges can do the merging of adjacent elements; only the range
    part of each result is passed on.
    """
    keyed = ((member, None) for member in sorted(s))
    yield from (span for span, _ in to_ranges(keyed))

def lines(iterable, keep_comments=False):
    """Yield the meaningful lines of a Unicode data file.

    'iterable' supplies the raw lines, e.g. an open text file handle.
    Each line has its line ending removed; unless keep_comments is
    true, everything from the first '#' onwards is dropped; then
    trailing spaces and tabs are trimmed. Lines left empty by this
    process are skipped.
    """
    for raw in iterable:
        text = raw.rstrip("\r\n")
        if not keep_comments:
            text, _, _ = text.partition("#")
        text = text.rstrip(" \t")
        if text:
            yield text

class Main:
    def run(self):
        """Parse arguments and generate all the output files.

        The single command-line argument names either a UCD.zip file
        or a directory containing the same files unpacked. This method
        sets self.open_ucd_file to a function that opens a named UCD
        file for reading as text, and then writes each generated
        header file into the current directory.
        """

        parser = argparse.ArgumentParser(
            description='Build UCD-derived source files.')
        parser.add_argument("ucd", help="UCD to work from, either UCD.zip or "
                            "a directory full of unpacked files.")
        args = parser.parse_args()

        # The UCD data files are encoded in UTF-8, so decode them as
        # such explicitly instead of relying on the locale's default
        # encoding, which varies between platforms.
        if os.path.isdir(args.ucd):
            ucd_dir = args.ucd
            self.open_ucd_file = lambda filename: (
                open(os.path.join(ucd_dir, filename), encoding="utf-8"))
        else:
            ucd_zip = zipfile.ZipFile(args.ucd)
            self.open_ucd_file = lambda filename: (
                io.TextIOWrapper(ucd_zip.open(filename), encoding="utf-8"))

        # Each output file, paired with the method that fills it in.
        outputs = [
            ("bidi_type.h", self.write_bidi_type_table),
            ("bidi_mirror.h", self.write_bidi_mirroring_table),
            ("bidi_brackets.h", self.write_bidi_brackets_table),
            ("nonspacing_chars.h", self.write_nonspacing_chars_list),
            ("wide_chars.h", self.write_wide_chars_list),
            ("ambiguous_wide_chars.h", self.write_ambiguous_wide_chars_list),
        ]
        for filename, writer in outputs:
            with open(filename, "w") as fh:
                writer(fh)

    @property
    def UnicodeData(self):
        """Records from UnicodeData.txt.

        Each yielded item is a UCDRecord tuple. A pair of consecutive
        lines named <Foo, First> and <Foo, Last> is expanded into one
        record per code point in the range they delimit.

        Raises AssertionError if a '<' appears in a character name
        that is not of the form <...>, or if a <Foo, Last> line is not
        immediately preceded by a <Foo, First> line.
        """
        # State carried over from a <Foo, First> line to the line
        # after it. It is initialised before the loop so that a Last
        # marker with no First before it fails the assertion below
        # rather than tripping over an unbound local variable.
        prev_line_was_first = False
        prev_codepoint = None

        with self.open_ucd_file("UnicodeData.txt") as fh:
            for line in lines(fh):
                # Split up the line into its raw fields. The unpacking
                # also checks that the line has exactly the expected
                # number of fields.
                (
                    codepoint, name, category, cclass, bidiclass, decomp,
                    num6, num7, num8, bidimirrored, obsolete_unicode1_name,
                    obsolete_comment, uppercase, lowercase, titlecase,
                ) = line.split(";")

                # By default, we expect that this record describes
                # just one code point.
                codepoints = [int(codepoint, 16)]

                # Spot the special markers where consecutive lines say
                # <Foo, First> and <Foo, Last>, indicating that the
                # entire range of code points in between are treated
                # the same. If so, we replace 'codepoints' with a
                # range object.
                if "<" in name:
                    assert name.startswith("<") and name.endswith(">"), (
                        "Confusing < in character name: {!r}".format(line))
                    name_pieces = [piece.strip(" \t") for piece in
                                   name.lstrip("<").rstrip(">").split(",")]
                    if "First" in name_pieces:
                        # Remember where the range starts, and wait
                        # for the Last line before yielding anything.
                        prev_line_was_first = True
                        prev_codepoint = codepoints[0]
                        continue
                    elif "Last" in name_pieces:
                        assert prev_line_was_first, (
                            "Range end without a range start: {!r}".format(
                                line))
                        codepoints = range(prev_codepoint, codepoints[0]+1)
                        prev_codepoint = None
                prev_line_was_first = False

                # Decode some of the raw fields into more cooked
                # forms.

                # For the moment, we only care about decomposition
                # mappings that consist of a single hex number (i.e.
                # are singletons and not compatibility mappings)
                try:
                    dm = [int(decomp, 16)]
                except ValueError:
                    dm = []

                # And yield a UCDRecord for each code point in our
                # range.
                for c in codepoints:
                    yield UCDRecord(
                        c=c,
                        General_Category=category,
                        Bidi_Class=bidiclass,
                        Decomposition_Mapping=dm,
                    )

    @property
    def BidiMirroring(self):
        """Mirrored character pairs read from BidiMirroring.txt.

        Every non-blank, non-comment line of the file must consist of
        two hex numbers separated by a semicolon. Each is yielded as
        a tuple of two integer code points.
        """
        with self.open_ucd_file("BidiMirroring.txt") as fh:
            for entry in lines(fh):
                left, right = entry.split(";")
                yield int(left, 16), int(right, 16)

    @property
    def BidiBrackets(self):
        """Bracket pairs read from BidiBrackets.txt.

        Each line of the file has three semicolon-separated fields:
        two hex code points and a bracket type. The yielded tuples
        hold the two code points as integers, followed by the type
        with surrounding spaces and tabs removed. The type is 'o',
        'c' or 'n', saying whether the first code point is an opening
        bracket, a closing bracket or neither.
        """
        with self.open_ucd_file("BidiBrackets.txt") as fh:
            for entry in lines(fh):
                first_hex, second_hex, bracket_type = entry.split(";")
                first = int(first_hex, 16)
                second = int(second_hex, 16)
                yield first, second, bracket_type.strip(" \t")

    @property
    def EastAsianWidth(self):
        """East Asian width types from EastAsianWidth.txt.

        Each yielded tuple is (code point, width type). A line whose
        first field is a range 'start..end' produces one tuple for
        every code point in that range, inclusive of both ends.
        """
        with self.open_ucd_file("EastAsianWidth.txt") as fh:
            for line in lines(fh):
                fields = line.split(";")

                # Fields may be padded with spaces around the
                # semicolons. The width type is compared against
                # exact strings such as 'W' by the table writers, so
                # the padding must be removed here, in the same way
                # that BidiBrackets strips its bracket-type field.
                chars = fields[0].strip(" \t")
                width = fields[1].strip(" \t")

                if ".." in chars:
                    start, end = [int(s, 16) for s in chars.split("..")]
                    cs = range(start, end+1)
                else:
                    cs = [int(chars, 16)]
                for c in cs:
                    yield c, width

    def write_bidi_type_table(self, fh):
        """Write the bidi class table to fh.

        Code points whose Bidi_Class is "ON" are left out of the
        table. The rest are merged into ranges of adjacent code
        points sharing a class, and one '{start, end, class},' line is
        written per range.
        """
        types = {
            rec.c: rec.Bidi_Class
            for rec in self.UnicodeData
            if rec.Bidi_Class != "ON"
        }

        for (start, end), t in map_to_ranges(types):
            print(f"        {{0x{start:04x}, 0x{end:04x}, {t}}},", file=fh)

    def write_bidi_mirroring_table(self, fh):
        bidi_mirror = {}
        for c1, c2 in self.BidiMirroring:
            assert bidi_mirror.get(c1, c2) == c2, f"Clash at {c1:%04X}"
            bidi_mirror[c1] = c2
            assert bidi_mirror.get(c2, c1) == c1, f"Clash at {c2:%04X}"
            bidi_mirror[c2] = c1

        for c1, c2 in sorted(bidi_mirror.items()):
            print("        {{0x{:04x}, 0x{:04x}}},".format(c1, c2), file=fh)

    def write_bidi_brackets_table(self, fh):
        bracket_map = {}
        for c1, c2, kind in self.BidiBrackets:
            bracket_map[c1] = kind, c2

        equivalents = {}
        for rec in self.UnicodeData:
            if len(rec.Decomposition_Mapping) == 1:
                c = rec.c
                c2 = rec.Decomposition_Mapping[0]
                equivalents[c] = c2
                equivalents[c2] = c

        for src, (kind, dst) in sorted(bracket_map.items()):
            dsteq = equivalents.get(dst, 0)
            # UCD claims there's an 'n' kind possible, but as of UCD
            # 14, no instances of it exist
            enumval = {'o': 'BT_OPEN', 'c': 'BT_CLOSE'}[kind]
            print("        {{0x{:04x}, {{0x{:04x}, 0x{:04x}, {}}}}},".format(
                src, dst, dsteq, enumval), file=fh)

    def write_nonspacing_chars_list(self, fh):
        """Write the list of non-spacing character ranges to fh.

        A character counts as non-spacing if its General_Category is
        Me, Mn or Cf, with two special cases described in the code
        below. The result is written as one '{ start, end },' line
        per range of adjacent code points.
        """
        nonspacing_categories = {"Me", "Mn", "Cf"}
        cs = set()

        for rec in self.UnicodeData:
            if 0x1160 <= rec.c <= 0x11FF:
                # Medial (vowel) and final (consonant) jamo for
                # decomposed Hangul characters. These are regarded as
                # non-spacing on the grounds that they compose with
                # the preceding initial consonant.
                cs.add(rec.c)
            elif rec.c == 0xAD:
                # In typography this is a SOFT HYPHEN and counts as
                # discardable. But it's also an ISO 8859-1 printing
                # character, and all of those occupy one character
                # cell in a terminal. So it is never listed.
                continue
            elif rec.General_Category in nonspacing_categories:
                cs.add(rec.c)

        for start, end in set_to_ranges(cs):
            print(f"    {{ 0x{start:04X}, 0x{end:04X} }},", file=fh)

    def write_width_table(self, fh, accept):
        """Write the ranges of characters with given East Asian widths.

        'accept' is a collection of width type strings. Every code
        point whose width type from EastAsianWidth.txt is in 'accept'
        is included, and the result is written to fh as one
        '{start, end},' line per range of adjacent code points.
        """
        cs = {c for c, wid in self.EastAsianWidth if wid in accept}

        for start, end in set_to_ranges(cs):
            print(f"    {{0x{start:04X}, 0x{end:04X}}},", file=fh)

    def write_wide_chars_list(self, fh):
        """Write to fh the ranges of characters of width type W or F."""
        wide_types = {'W', 'F'}
        self.write_width_table(fh, accept=wide_types)

    def write_ambiguous_wide_chars_list(self, fh):
        """Write to fh the ranges of characters of width type A."""
        ambiguous_types = {'A'}
        self.write_width_table(fh, accept=ambiguous_types)

# Generate the output files only when this file is run as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    generator = Main()
    generator.run()