1
0
mirror of https://git.tartarus.org/simon/putty.git synced 2025-01-25 01:02:24 +00:00

Implement Unicode normalisation.

A new module in 'utils' computes NFC and NFD, via a new set of data
tables generated by read_ucd.py.

The new module comes with a new test program, which can read the
NormalizationTest.txt that appears in the Unicode Character Database.
All the tests pass, as of Unicode 15.
This commit is contained in:
Simon Tatham 2022-11-09 19:28:51 +00:00
parent 4cb429e3f4
commit b35d23f699
8 changed files with 3984 additions and 9 deletions

View File

@ -85,6 +85,11 @@ add_executable(test_decode_utf8
target_compile_definitions(test_decode_utf8 PRIVATE TEST)
target_link_libraries(test_decode_utf8 utils ${platform_libraries})
# Standalone test program for the Unicode normalisation module: builds
# utils/unicode-norm.c on its own with the TEST macro defined
# (presumably enabling a test harness inside that file - confirm) and
# links it against the 'utils' library plus the platform libraries.
add_executable(test_unicode_norm
utils/unicode-norm.c)
target_compile_definitions(test_unicode_norm PRIVATE TEST)
target_link_libraries(test_unicode_norm utils ${platform_libraries})
add_executable(test_tree234
utils/tree234.c)
target_compile_definitions(test_tree234 PRIVATE TEST)

3
misc.h
View File

@ -265,6 +265,9 @@ unsigned decode_utf8(BinarySource *src);
* number written. */
size_t decode_utf8_to_wchar(BinarySource *src, wchar_t *out);
/* Normalise a UTF-8 string into Normalisation Form C.
 *
 * 'input' is the UTF-8 text, passed as a ptrlen; the normalised text
 * comes back in a strbuf.
 *
 * NOTE(review): presumably the returned strbuf is newly allocated and
 * the caller is responsible for freeing it - confirm against the
 * implementation in utils/unicode-norm.c. */
strbuf *utf8_to_nfc(ptrlen input);
/* Write a string out in C string-literal format. */
void write_c_string_literal(FILE *fp, ptrlen str);

950
unicode/canonical_comp.h Normal file
View File

@ -0,0 +1,950 @@
/*
* Autogenerated by read_ucd.py from The Unicode Standard 15.0.0
*
* List the pairs of Unicode characters that canonically recompose to a
* single character in NFC.
*
* Used by utils/unicode-norm.c.
*/
{0x003c, 0x0338, 0x226e},
{0x003d, 0x0338, 0x2260},
{0x003e, 0x0338, 0x226f},
{0x0041, 0x0300, 0x00c0},
{0x0041, 0x0301, 0x00c1},
{0x0041, 0x0302, 0x00c2},
{0x0041, 0x0303, 0x00c3},
{0x0041, 0x0304, 0x0100},
{0x0041, 0x0306, 0x0102},
{0x0041, 0x0307, 0x0226},
{0x0041, 0x0308, 0x00c4},
{0x0041, 0x0309, 0x1ea2},
{0x0041, 0x030a, 0x00c5},
{0x0041, 0x030c, 0x01cd},
{0x0041, 0x030f, 0x0200},
{0x0041, 0x0311, 0x0202},
{0x0041, 0x0323, 0x1ea0},
{0x0041, 0x0325, 0x1e00},
{0x0041, 0x0328, 0x0104},
{0x0042, 0x0307, 0x1e02},
{0x0042, 0x0323, 0x1e04},
{0x0042, 0x0331, 0x1e06},
{0x0043, 0x0301, 0x0106},
{0x0043, 0x0302, 0x0108},
{0x0043, 0x0307, 0x010a},
{0x0043, 0x030c, 0x010c},
{0x0043, 0x0327, 0x00c7},
{0x0044, 0x0307, 0x1e0a},
{0x0044, 0x030c, 0x010e},
{0x0044, 0x0323, 0x1e0c},
{0x0044, 0x0327, 0x1e10},
{0x0044, 0x032d, 0x1e12},
{0x0044, 0x0331, 0x1e0e},
{0x0045, 0x0300, 0x00c8},
{0x0045, 0x0301, 0x00c9},
{0x0045, 0x0302, 0x00ca},
{0x0045, 0x0303, 0x1ebc},
{0x0045, 0x0304, 0x0112},
{0x0045, 0x0306, 0x0114},
{0x0045, 0x0307, 0x0116},
{0x0045, 0x0308, 0x00cb},
{0x0045, 0x0309, 0x1eba},
{0x0045, 0x030c, 0x011a},
{0x0045, 0x030f, 0x0204},
{0x0045, 0x0311, 0x0206},
{0x0045, 0x0323, 0x1eb8},
{0x0045, 0x0327, 0x0228},
{0x0045, 0x0328, 0x0118},
{0x0045, 0x032d, 0x1e18},
{0x0045, 0x0330, 0x1e1a},
{0x0046, 0x0307, 0x1e1e},
{0x0047, 0x0301, 0x01f4},
{0x0047, 0x0302, 0x011c},
{0x0047, 0x0304, 0x1e20},
{0x0047, 0x0306, 0x011e},
{0x0047, 0x0307, 0x0120},
{0x0047, 0x030c, 0x01e6},
{0x0047, 0x0327, 0x0122},
{0x0048, 0x0302, 0x0124},
{0x0048, 0x0307, 0x1e22},
{0x0048, 0x0308, 0x1e26},
{0x0048, 0x030c, 0x021e},
{0x0048, 0x0323, 0x1e24},
{0x0048, 0x0327, 0x1e28},
{0x0048, 0x032e, 0x1e2a},
{0x0049, 0x0300, 0x00cc},
{0x0049, 0x0301, 0x00cd},
{0x0049, 0x0302, 0x00ce},
{0x0049, 0x0303, 0x0128},
{0x0049, 0x0304, 0x012a},
{0x0049, 0x0306, 0x012c},
{0x0049, 0x0307, 0x0130},
{0x0049, 0x0308, 0x00cf},
{0x0049, 0x0309, 0x1ec8},
{0x0049, 0x030c, 0x01cf},
{0x0049, 0x030f, 0x0208},
{0x0049, 0x0311, 0x020a},
{0x0049, 0x0323, 0x1eca},
{0x0049, 0x0328, 0x012e},
{0x0049, 0x0330, 0x1e2c},
{0x004a, 0x0302, 0x0134},
{0x004b, 0x0301, 0x1e30},
{0x004b, 0x030c, 0x01e8},
{0x004b, 0x0323, 0x1e32},
{0x004b, 0x0327, 0x0136},
{0x004b, 0x0331, 0x1e34},
{0x004c, 0x0301, 0x0139},
{0x004c, 0x030c, 0x013d},
{0x004c, 0x0323, 0x1e36},
{0x004c, 0x0327, 0x013b},
{0x004c, 0x032d, 0x1e3c},
{0x004c, 0x0331, 0x1e3a},
{0x004d, 0x0301, 0x1e3e},
{0x004d, 0x0307, 0x1e40},
{0x004d, 0x0323, 0x1e42},
{0x004e, 0x0300, 0x01f8},
{0x004e, 0x0301, 0x0143},
{0x004e, 0x0303, 0x00d1},
{0x004e, 0x0307, 0x1e44},
{0x004e, 0x030c, 0x0147},
{0x004e, 0x0323, 0x1e46},
{0x004e, 0x0327, 0x0145},
{0x004e, 0x032d, 0x1e4a},
{0x004e, 0x0331, 0x1e48},
{0x004f, 0x0300, 0x00d2},
{0x004f, 0x0301, 0x00d3},
{0x004f, 0x0302, 0x00d4},
{0x004f, 0x0303, 0x00d5},
{0x004f, 0x0304, 0x014c},
{0x004f, 0x0306, 0x014e},
{0x004f, 0x0307, 0x022e},
{0x004f, 0x0308, 0x00d6},
{0x004f, 0x0309, 0x1ece},
{0x004f, 0x030b, 0x0150},
{0x004f, 0x030c, 0x01d1},
{0x004f, 0x030f, 0x020c},
{0x004f, 0x0311, 0x020e},
{0x004f, 0x031b, 0x01a0},
{0x004f, 0x0323, 0x1ecc},
{0x004f, 0x0328, 0x01ea},
{0x0050, 0x0301, 0x1e54},
{0x0050, 0x0307, 0x1e56},
{0x0052, 0x0301, 0x0154},
{0x0052, 0x0307, 0x1e58},
{0x0052, 0x030c, 0x0158},
{0x0052, 0x030f, 0x0210},
{0x0052, 0x0311, 0x0212},
{0x0052, 0x0323, 0x1e5a},
{0x0052, 0x0327, 0x0156},
{0x0052, 0x0331, 0x1e5e},
{0x0053, 0x0301, 0x015a},
{0x0053, 0x0302, 0x015c},
{0x0053, 0x0307, 0x1e60},
{0x0053, 0x030c, 0x0160},
{0x0053, 0x0323, 0x1e62},
{0x0053, 0x0326, 0x0218},
{0x0053, 0x0327, 0x015e},
{0x0054, 0x0307, 0x1e6a},
{0x0054, 0x030c, 0x0164},
{0x0054, 0x0323, 0x1e6c},
{0x0054, 0x0326, 0x021a},
{0x0054, 0x0327, 0x0162},
{0x0054, 0x032d, 0x1e70},
{0x0054, 0x0331, 0x1e6e},
{0x0055, 0x0300, 0x00d9},
{0x0055, 0x0301, 0x00da},
{0x0055, 0x0302, 0x00db},
{0x0055, 0x0303, 0x0168},
{0x0055, 0x0304, 0x016a},
{0x0055, 0x0306, 0x016c},
{0x0055, 0x0308, 0x00dc},
{0x0055, 0x0309, 0x1ee6},
{0x0055, 0x030a, 0x016e},
{0x0055, 0x030b, 0x0170},
{0x0055, 0x030c, 0x01d3},
{0x0055, 0x030f, 0x0214},
{0x0055, 0x0311, 0x0216},
{0x0055, 0x031b, 0x01af},
{0x0055, 0x0323, 0x1ee4},
{0x0055, 0x0324, 0x1e72},
{0x0055, 0x0328, 0x0172},
{0x0055, 0x032d, 0x1e76},
{0x0055, 0x0330, 0x1e74},
{0x0056, 0x0303, 0x1e7c},
{0x0056, 0x0323, 0x1e7e},
{0x0057, 0x0300, 0x1e80},
{0x0057, 0x0301, 0x1e82},
{0x0057, 0x0302, 0x0174},
{0x0057, 0x0307, 0x1e86},
{0x0057, 0x0308, 0x1e84},
{0x0057, 0x0323, 0x1e88},
{0x0058, 0x0307, 0x1e8a},
{0x0058, 0x0308, 0x1e8c},
{0x0059, 0x0300, 0x1ef2},
{0x0059, 0x0301, 0x00dd},
{0x0059, 0x0302, 0x0176},
{0x0059, 0x0303, 0x1ef8},
{0x0059, 0x0304, 0x0232},
{0x0059, 0x0307, 0x1e8e},
{0x0059, 0x0308, 0x0178},
{0x0059, 0x0309, 0x1ef6},
{0x0059, 0x0323, 0x1ef4},
{0x005a, 0x0301, 0x0179},
{0x005a, 0x0302, 0x1e90},
{0x005a, 0x0307, 0x017b},
{0x005a, 0x030c, 0x017d},
{0x005a, 0x0323, 0x1e92},
{0x005a, 0x0331, 0x1e94},
{0x0061, 0x0300, 0x00e0},
{0x0061, 0x0301, 0x00e1},
{0x0061, 0x0302, 0x00e2},
{0x0061, 0x0303, 0x00e3},
{0x0061, 0x0304, 0x0101},
{0x0061, 0x0306, 0x0103},
{0x0061, 0x0307, 0x0227},
{0x0061, 0x0308, 0x00e4},
{0x0061, 0x0309, 0x1ea3},
{0x0061, 0x030a, 0x00e5},
{0x0061, 0x030c, 0x01ce},
{0x0061, 0x030f, 0x0201},
{0x0061, 0x0311, 0x0203},
{0x0061, 0x0323, 0x1ea1},
{0x0061, 0x0325, 0x1e01},
{0x0061, 0x0328, 0x0105},
{0x0062, 0x0307, 0x1e03},
{0x0062, 0x0323, 0x1e05},
{0x0062, 0x0331, 0x1e07},
{0x0063, 0x0301, 0x0107},
{0x0063, 0x0302, 0x0109},
{0x0063, 0x0307, 0x010b},
{0x0063, 0x030c, 0x010d},
{0x0063, 0x0327, 0x00e7},
{0x0064, 0x0307, 0x1e0b},
{0x0064, 0x030c, 0x010f},
{0x0064, 0x0323, 0x1e0d},
{0x0064, 0x0327, 0x1e11},
{0x0064, 0x032d, 0x1e13},
{0x0064, 0x0331, 0x1e0f},
{0x0065, 0x0300, 0x00e8},
{0x0065, 0x0301, 0x00e9},
{0x0065, 0x0302, 0x00ea},
{0x0065, 0x0303, 0x1ebd},
{0x0065, 0x0304, 0x0113},
{0x0065, 0x0306, 0x0115},
{0x0065, 0x0307, 0x0117},
{0x0065, 0x0308, 0x00eb},
{0x0065, 0x0309, 0x1ebb},
{0x0065, 0x030c, 0x011b},
{0x0065, 0x030f, 0x0205},
{0x0065, 0x0311, 0x0207},
{0x0065, 0x0323, 0x1eb9},
{0x0065, 0x0327, 0x0229},
{0x0065, 0x0328, 0x0119},
{0x0065, 0x032d, 0x1e19},
{0x0065, 0x0330, 0x1e1b},
{0x0066, 0x0307, 0x1e1f},
{0x0067, 0x0301, 0x01f5},
{0x0067, 0x0302, 0x011d},
{0x0067, 0x0304, 0x1e21},
{0x0067, 0x0306, 0x011f},
{0x0067, 0x0307, 0x0121},
{0x0067, 0x030c, 0x01e7},
{0x0067, 0x0327, 0x0123},
{0x0068, 0x0302, 0x0125},
{0x0068, 0x0307, 0x1e23},
{0x0068, 0x0308, 0x1e27},
{0x0068, 0x030c, 0x021f},
{0x0068, 0x0323, 0x1e25},
{0x0068, 0x0327, 0x1e29},
{0x0068, 0x032e, 0x1e2b},
{0x0068, 0x0331, 0x1e96},
{0x0069, 0x0300, 0x00ec},
{0x0069, 0x0301, 0x00ed},
{0x0069, 0x0302, 0x00ee},
{0x0069, 0x0303, 0x0129},
{0x0069, 0x0304, 0x012b},
{0x0069, 0x0306, 0x012d},
{0x0069, 0x0308, 0x00ef},
{0x0069, 0x0309, 0x1ec9},
{0x0069, 0x030c, 0x01d0},
{0x0069, 0x030f, 0x0209},
{0x0069, 0x0311, 0x020b},
{0x0069, 0x0323, 0x1ecb},
{0x0069, 0x0328, 0x012f},
{0x0069, 0x0330, 0x1e2d},
{0x006a, 0x0302, 0x0135},
{0x006a, 0x030c, 0x01f0},
{0x006b, 0x0301, 0x1e31},
{0x006b, 0x030c, 0x01e9},
{0x006b, 0x0323, 0x1e33},
{0x006b, 0x0327, 0x0137},
{0x006b, 0x0331, 0x1e35},
{0x006c, 0x0301, 0x013a},
{0x006c, 0x030c, 0x013e},
{0x006c, 0x0323, 0x1e37},
{0x006c, 0x0327, 0x013c},
{0x006c, 0x032d, 0x1e3d},
{0x006c, 0x0331, 0x1e3b},
{0x006d, 0x0301, 0x1e3f},
{0x006d, 0x0307, 0x1e41},
{0x006d, 0x0323, 0x1e43},
{0x006e, 0x0300, 0x01f9},
{0x006e, 0x0301, 0x0144},
{0x006e, 0x0303, 0x00f1},
{0x006e, 0x0307, 0x1e45},
{0x006e, 0x030c, 0x0148},
{0x006e, 0x0323, 0x1e47},
{0x006e, 0x0327, 0x0146},
{0x006e, 0x032d, 0x1e4b},
{0x006e, 0x0331, 0x1e49},
{0x006f, 0x0300, 0x00f2},
{0x006f, 0x0301, 0x00f3},
{0x006f, 0x0302, 0x00f4},
{0x006f, 0x0303, 0x00f5},
{0x006f, 0x0304, 0x014d},
{0x006f, 0x0306, 0x014f},
{0x006f, 0x0307, 0x022f},
{0x006f, 0x0308, 0x00f6},
{0x006f, 0x0309, 0x1ecf},
{0x006f, 0x030b, 0x0151},
{0x006f, 0x030c, 0x01d2},
{0x006f, 0x030f, 0x020d},
{0x006f, 0x0311, 0x020f},
{0x006f, 0x031b, 0x01a1},
{0x006f, 0x0323, 0x1ecd},
{0x006f, 0x0328, 0x01eb},
{0x0070, 0x0301, 0x1e55},
{0x0070, 0x0307, 0x1e57},
{0x0072, 0x0301, 0x0155},
{0x0072, 0x0307, 0x1e59},
{0x0072, 0x030c, 0x0159},
{0x0072, 0x030f, 0x0211},
{0x0072, 0x0311, 0x0213},
{0x0072, 0x0323, 0x1e5b},
{0x0072, 0x0327, 0x0157},
{0x0072, 0x0331, 0x1e5f},
{0x0073, 0x0301, 0x015b},
{0x0073, 0x0302, 0x015d},
{0x0073, 0x0307, 0x1e61},
{0x0073, 0x030c, 0x0161},
{0x0073, 0x0323, 0x1e63},
{0x0073, 0x0326, 0x0219},
{0x0073, 0x0327, 0x015f},
{0x0074, 0x0307, 0x1e6b},
{0x0074, 0x0308, 0x1e97},
{0x0074, 0x030c, 0x0165},
{0x0074, 0x0323, 0x1e6d},
{0x0074, 0x0326, 0x021b},
{0x0074, 0x0327, 0x0163},
{0x0074, 0x032d, 0x1e71},
{0x0074, 0x0331, 0x1e6f},
{0x0075, 0x0300, 0x00f9},
{0x0075, 0x0301, 0x00fa},
{0x0075, 0x0302, 0x00fb},
{0x0075, 0x0303, 0x0169},
{0x0075, 0x0304, 0x016b},
{0x0075, 0x0306, 0x016d},
{0x0075, 0x0308, 0x00fc},
{0x0075, 0x0309, 0x1ee7},
{0x0075, 0x030a, 0x016f},
{0x0075, 0x030b, 0x0171},
{0x0075, 0x030c, 0x01d4},
{0x0075, 0x030f, 0x0215},
{0x0075, 0x0311, 0x0217},
{0x0075, 0x031b, 0x01b0},
{0x0075, 0x0323, 0x1ee5},
{0x0075, 0x0324, 0x1e73},
{0x0075, 0x0328, 0x0173},
{0x0075, 0x032d, 0x1e77},
{0x0075, 0x0330, 0x1e75},
{0x0076, 0x0303, 0x1e7d},
{0x0076, 0x0323, 0x1e7f},
{0x0077, 0x0300, 0x1e81},
{0x0077, 0x0301, 0x1e83},
{0x0077, 0x0302, 0x0175},
{0x0077, 0x0307, 0x1e87},
{0x0077, 0x0308, 0x1e85},
{0x0077, 0x030a, 0x1e98},
{0x0077, 0x0323, 0x1e89},
{0x0078, 0x0307, 0x1e8b},
{0x0078, 0x0308, 0x1e8d},
{0x0079, 0x0300, 0x1ef3},
{0x0079, 0x0301, 0x00fd},
{0x0079, 0x0302, 0x0177},
{0x0079, 0x0303, 0x1ef9},
{0x0079, 0x0304, 0x0233},
{0x0079, 0x0307, 0x1e8f},
{0x0079, 0x0308, 0x00ff},
{0x0079, 0x0309, 0x1ef7},
{0x0079, 0x030a, 0x1e99},
{0x0079, 0x0323, 0x1ef5},
{0x007a, 0x0301, 0x017a},
{0x007a, 0x0302, 0x1e91},
{0x007a, 0x0307, 0x017c},
{0x007a, 0x030c, 0x017e},
{0x007a, 0x0323, 0x1e93},
{0x007a, 0x0331, 0x1e95},
{0x00a8, 0x0300, 0x1fed},
{0x00a8, 0x0301, 0x0385},
{0x00a8, 0x0342, 0x1fc1},
{0x00c2, 0x0300, 0x1ea6},
{0x00c2, 0x0301, 0x1ea4},
{0x00c2, 0x0303, 0x1eaa},
{0x00c2, 0x0309, 0x1ea8},
{0x00c4, 0x0304, 0x01de},
{0x00c5, 0x0301, 0x01fa},
{0x00c6, 0x0301, 0x01fc},
{0x00c6, 0x0304, 0x01e2},
{0x00c7, 0x0301, 0x1e08},
{0x00ca, 0x0300, 0x1ec0},
{0x00ca, 0x0301, 0x1ebe},
{0x00ca, 0x0303, 0x1ec4},
{0x00ca, 0x0309, 0x1ec2},
{0x00cf, 0x0301, 0x1e2e},
{0x00d4, 0x0300, 0x1ed2},
{0x00d4, 0x0301, 0x1ed0},
{0x00d4, 0x0303, 0x1ed6},
{0x00d4, 0x0309, 0x1ed4},
{0x00d5, 0x0301, 0x1e4c},
{0x00d5, 0x0304, 0x022c},
{0x00d5, 0x0308, 0x1e4e},
{0x00d6, 0x0304, 0x022a},
{0x00d8, 0x0301, 0x01fe},
{0x00dc, 0x0300, 0x01db},
{0x00dc, 0x0301, 0x01d7},
{0x00dc, 0x0304, 0x01d5},
{0x00dc, 0x030c, 0x01d9},
{0x00e2, 0x0300, 0x1ea7},
{0x00e2, 0x0301, 0x1ea5},
{0x00e2, 0x0303, 0x1eab},
{0x00e2, 0x0309, 0x1ea9},
{0x00e4, 0x0304, 0x01df},
{0x00e5, 0x0301, 0x01fb},
{0x00e6, 0x0301, 0x01fd},
{0x00e6, 0x0304, 0x01e3},
{0x00e7, 0x0301, 0x1e09},
{0x00ea, 0x0300, 0x1ec1},
{0x00ea, 0x0301, 0x1ebf},
{0x00ea, 0x0303, 0x1ec5},
{0x00ea, 0x0309, 0x1ec3},
{0x00ef, 0x0301, 0x1e2f},
{0x00f4, 0x0300, 0x1ed3},
{0x00f4, 0x0301, 0x1ed1},
{0x00f4, 0x0303, 0x1ed7},
{0x00f4, 0x0309, 0x1ed5},
{0x00f5, 0x0301, 0x1e4d},
{0x00f5, 0x0304, 0x022d},
{0x00f5, 0x0308, 0x1e4f},
{0x00f6, 0x0304, 0x022b},
{0x00f8, 0x0301, 0x01ff},
{0x00fc, 0x0300, 0x01dc},
{0x00fc, 0x0301, 0x01d8},
{0x00fc, 0x0304, 0x01d6},
{0x00fc, 0x030c, 0x01da},
{0x0102, 0x0300, 0x1eb0},
{0x0102, 0x0301, 0x1eae},
{0x0102, 0x0303, 0x1eb4},
{0x0102, 0x0309, 0x1eb2},
{0x0103, 0x0300, 0x1eb1},
{0x0103, 0x0301, 0x1eaf},
{0x0103, 0x0303, 0x1eb5},
{0x0103, 0x0309, 0x1eb3},
{0x0112, 0x0300, 0x1e14},
{0x0112, 0x0301, 0x1e16},
{0x0113, 0x0300, 0x1e15},
{0x0113, 0x0301, 0x1e17},
{0x014c, 0x0300, 0x1e50},
{0x014c, 0x0301, 0x1e52},
{0x014d, 0x0300, 0x1e51},
{0x014d, 0x0301, 0x1e53},
{0x015a, 0x0307, 0x1e64},
{0x015b, 0x0307, 0x1e65},
{0x0160, 0x0307, 0x1e66},
{0x0161, 0x0307, 0x1e67},
{0x0168, 0x0301, 0x1e78},
{0x0169, 0x0301, 0x1e79},
{0x016a, 0x0308, 0x1e7a},
{0x016b, 0x0308, 0x1e7b},
{0x017f, 0x0307, 0x1e9b},
{0x01a0, 0x0300, 0x1edc},
{0x01a0, 0x0301, 0x1eda},
{0x01a0, 0x0303, 0x1ee0},
{0x01a0, 0x0309, 0x1ede},
{0x01a0, 0x0323, 0x1ee2},
{0x01a1, 0x0300, 0x1edd},
{0x01a1, 0x0301, 0x1edb},
{0x01a1, 0x0303, 0x1ee1},
{0x01a1, 0x0309, 0x1edf},
{0x01a1, 0x0323, 0x1ee3},
{0x01af, 0x0300, 0x1eea},
{0x01af, 0x0301, 0x1ee8},
{0x01af, 0x0303, 0x1eee},
{0x01af, 0x0309, 0x1eec},
{0x01af, 0x0323, 0x1ef0},
{0x01b0, 0x0300, 0x1eeb},
{0x01b0, 0x0301, 0x1ee9},
{0x01b0, 0x0303, 0x1eef},
{0x01b0, 0x0309, 0x1eed},
{0x01b0, 0x0323, 0x1ef1},
{0x01b7, 0x030c, 0x01ee},
{0x01ea, 0x0304, 0x01ec},
{0x01eb, 0x0304, 0x01ed},
{0x0226, 0x0304, 0x01e0},
{0x0227, 0x0304, 0x01e1},
{0x0228, 0x0306, 0x1e1c},
{0x0229, 0x0306, 0x1e1d},
{0x022e, 0x0304, 0x0230},
{0x022f, 0x0304, 0x0231},
{0x0292, 0x030c, 0x01ef},
{0x0391, 0x0300, 0x1fba},
{0x0391, 0x0301, 0x0386},
{0x0391, 0x0304, 0x1fb9},
{0x0391, 0x0306, 0x1fb8},
{0x0391, 0x0313, 0x1f08},
{0x0391, 0x0314, 0x1f09},
{0x0391, 0x0345, 0x1fbc},
{0x0395, 0x0300, 0x1fc8},
{0x0395, 0x0301, 0x0388},
{0x0395, 0x0313, 0x1f18},
{0x0395, 0x0314, 0x1f19},
{0x0397, 0x0300, 0x1fca},
{0x0397, 0x0301, 0x0389},
{0x0397, 0x0313, 0x1f28},
{0x0397, 0x0314, 0x1f29},
{0x0397, 0x0345, 0x1fcc},
{0x0399, 0x0300, 0x1fda},
{0x0399, 0x0301, 0x038a},
{0x0399, 0x0304, 0x1fd9},
{0x0399, 0x0306, 0x1fd8},
{0x0399, 0x0308, 0x03aa},
{0x0399, 0x0313, 0x1f38},
{0x0399, 0x0314, 0x1f39},
{0x039f, 0x0300, 0x1ff8},
{0x039f, 0x0301, 0x038c},
{0x039f, 0x0313, 0x1f48},
{0x039f, 0x0314, 0x1f49},
{0x03a1, 0x0314, 0x1fec},
{0x03a5, 0x0300, 0x1fea},
{0x03a5, 0x0301, 0x038e},
{0x03a5, 0x0304, 0x1fe9},
{0x03a5, 0x0306, 0x1fe8},
{0x03a5, 0x0308, 0x03ab},
{0x03a5, 0x0314, 0x1f59},
{0x03a9, 0x0300, 0x1ffa},
{0x03a9, 0x0301, 0x038f},
{0x03a9, 0x0313, 0x1f68},
{0x03a9, 0x0314, 0x1f69},
{0x03a9, 0x0345, 0x1ffc},
{0x03ac, 0x0345, 0x1fb4},
{0x03ae, 0x0345, 0x1fc4},
{0x03b1, 0x0300, 0x1f70},
{0x03b1, 0x0301, 0x03ac},
{0x03b1, 0x0304, 0x1fb1},
{0x03b1, 0x0306, 0x1fb0},
{0x03b1, 0x0313, 0x1f00},
{0x03b1, 0x0314, 0x1f01},
{0x03b1, 0x0342, 0x1fb6},
{0x03b1, 0x0345, 0x1fb3},
{0x03b5, 0x0300, 0x1f72},
{0x03b5, 0x0301, 0x03ad},
{0x03b5, 0x0313, 0x1f10},
{0x03b5, 0x0314, 0x1f11},
{0x03b7, 0x0300, 0x1f74},
{0x03b7, 0x0301, 0x03ae},
{0x03b7, 0x0313, 0x1f20},
{0x03b7, 0x0314, 0x1f21},
{0x03b7, 0x0342, 0x1fc6},
{0x03b7, 0x0345, 0x1fc3},
{0x03b9, 0x0300, 0x1f76},
{0x03b9, 0x0301, 0x03af},
{0x03b9, 0x0304, 0x1fd1},
{0x03b9, 0x0306, 0x1fd0},
{0x03b9, 0x0308, 0x03ca},
{0x03b9, 0x0313, 0x1f30},
{0x03b9, 0x0314, 0x1f31},
{0x03b9, 0x0342, 0x1fd6},
{0x03bf, 0x0300, 0x1f78},
{0x03bf, 0x0301, 0x03cc},
{0x03bf, 0x0313, 0x1f40},
{0x03bf, 0x0314, 0x1f41},
{0x03c1, 0x0313, 0x1fe4},
{0x03c1, 0x0314, 0x1fe5},
{0x03c5, 0x0300, 0x1f7a},
{0x03c5, 0x0301, 0x03cd},
{0x03c5, 0x0304, 0x1fe1},
{0x03c5, 0x0306, 0x1fe0},
{0x03c5, 0x0308, 0x03cb},
{0x03c5, 0x0313, 0x1f50},
{0x03c5, 0x0314, 0x1f51},
{0x03c5, 0x0342, 0x1fe6},
{0x03c9, 0x0300, 0x1f7c},
{0x03c9, 0x0301, 0x03ce},
{0x03c9, 0x0313, 0x1f60},
{0x03c9, 0x0314, 0x1f61},
{0x03c9, 0x0342, 0x1ff6},
{0x03c9, 0x0345, 0x1ff3},
{0x03ca, 0x0300, 0x1fd2},
{0x03ca, 0x0301, 0x0390},
{0x03ca, 0x0342, 0x1fd7},
{0x03cb, 0x0300, 0x1fe2},
{0x03cb, 0x0301, 0x03b0},
{0x03cb, 0x0342, 0x1fe7},
{0x03ce, 0x0345, 0x1ff4},
{0x03d2, 0x0301, 0x03d3},
{0x03d2, 0x0308, 0x03d4},
{0x0406, 0x0308, 0x0407},
{0x0410, 0x0306, 0x04d0},
{0x0410, 0x0308, 0x04d2},
{0x0413, 0x0301, 0x0403},
{0x0415, 0x0300, 0x0400},
{0x0415, 0x0306, 0x04d6},
{0x0415, 0x0308, 0x0401},
{0x0416, 0x0306, 0x04c1},
{0x0416, 0x0308, 0x04dc},
{0x0417, 0x0308, 0x04de},
{0x0418, 0x0300, 0x040d},
{0x0418, 0x0304, 0x04e2},
{0x0418, 0x0306, 0x0419},
{0x0418, 0x0308, 0x04e4},
{0x041a, 0x0301, 0x040c},
{0x041e, 0x0308, 0x04e6},
{0x0423, 0x0304, 0x04ee},
{0x0423, 0x0306, 0x040e},
{0x0423, 0x0308, 0x04f0},
{0x0423, 0x030b, 0x04f2},
{0x0427, 0x0308, 0x04f4},
{0x042b, 0x0308, 0x04f8},
{0x042d, 0x0308, 0x04ec},
{0x0430, 0x0306, 0x04d1},
{0x0430, 0x0308, 0x04d3},
{0x0433, 0x0301, 0x0453},
{0x0435, 0x0300, 0x0450},
{0x0435, 0x0306, 0x04d7},
{0x0435, 0x0308, 0x0451},
{0x0436, 0x0306, 0x04c2},
{0x0436, 0x0308, 0x04dd},
{0x0437, 0x0308, 0x04df},
{0x0438, 0x0300, 0x045d},
{0x0438, 0x0304, 0x04e3},
{0x0438, 0x0306, 0x0439},
{0x0438, 0x0308, 0x04e5},
{0x043a, 0x0301, 0x045c},
{0x043e, 0x0308, 0x04e7},
{0x0443, 0x0304, 0x04ef},
{0x0443, 0x0306, 0x045e},
{0x0443, 0x0308, 0x04f1},
{0x0443, 0x030b, 0x04f3},
{0x0447, 0x0308, 0x04f5},
{0x044b, 0x0308, 0x04f9},
{0x044d, 0x0308, 0x04ed},
{0x0456, 0x0308, 0x0457},
{0x0474, 0x030f, 0x0476},
{0x0475, 0x030f, 0x0477},
{0x04d8, 0x0308, 0x04da},
{0x04d9, 0x0308, 0x04db},
{0x04e8, 0x0308, 0x04ea},
{0x04e9, 0x0308, 0x04eb},
{0x0627, 0x0653, 0x0622},
{0x0627, 0x0654, 0x0623},
{0x0627, 0x0655, 0x0625},
{0x0648, 0x0654, 0x0624},
{0x064a, 0x0654, 0x0626},
{0x06c1, 0x0654, 0x06c2},
{0x06d2, 0x0654, 0x06d3},
{0x06d5, 0x0654, 0x06c0},
{0x0928, 0x093c, 0x0929},
{0x0930, 0x093c, 0x0931},
{0x0933, 0x093c, 0x0934},
{0x09c7, 0x09be, 0x09cb},
{0x09c7, 0x09d7, 0x09cc},
{0x0b47, 0x0b3e, 0x0b4b},
{0x0b47, 0x0b56, 0x0b48},
{0x0b47, 0x0b57, 0x0b4c},
{0x0b92, 0x0bd7, 0x0b94},
{0x0bc6, 0x0bbe, 0x0bca},
{0x0bc6, 0x0bd7, 0x0bcc},
{0x0bc7, 0x0bbe, 0x0bcb},
{0x0c46, 0x0c56, 0x0c48},
{0x0cbf, 0x0cd5, 0x0cc0},
{0x0cc6, 0x0cc2, 0x0cca},
{0x0cc6, 0x0cd5, 0x0cc7},
{0x0cc6, 0x0cd6, 0x0cc8},
{0x0cca, 0x0cd5, 0x0ccb},
{0x0d46, 0x0d3e, 0x0d4a},
{0x0d46, 0x0d57, 0x0d4c},
{0x0d47, 0x0d3e, 0x0d4b},
{0x0dd9, 0x0dca, 0x0dda},
{0x0dd9, 0x0dcf, 0x0ddc},
{0x0dd9, 0x0ddf, 0x0dde},
{0x0ddc, 0x0dca, 0x0ddd},
{0x1025, 0x102e, 0x1026},
{0x1b05, 0x1b35, 0x1b06},
{0x1b07, 0x1b35, 0x1b08},
{0x1b09, 0x1b35, 0x1b0a},
{0x1b0b, 0x1b35, 0x1b0c},
{0x1b0d, 0x1b35, 0x1b0e},
{0x1b11, 0x1b35, 0x1b12},
{0x1b3a, 0x1b35, 0x1b3b},
{0x1b3c, 0x1b35, 0x1b3d},
{0x1b3e, 0x1b35, 0x1b40},
{0x1b3f, 0x1b35, 0x1b41},
{0x1b42, 0x1b35, 0x1b43},
{0x1e36, 0x0304, 0x1e38},
{0x1e37, 0x0304, 0x1e39},
{0x1e5a, 0x0304, 0x1e5c},
{0x1e5b, 0x0304, 0x1e5d},
{0x1e62, 0x0307, 0x1e68},
{0x1e63, 0x0307, 0x1e69},
{0x1ea0, 0x0302, 0x1eac},
{0x1ea0, 0x0306, 0x1eb6},
{0x1ea1, 0x0302, 0x1ead},
{0x1ea1, 0x0306, 0x1eb7},
{0x1eb8, 0x0302, 0x1ec6},
{0x1eb9, 0x0302, 0x1ec7},
{0x1ecc, 0x0302, 0x1ed8},
{0x1ecd, 0x0302, 0x1ed9},
{0x1f00, 0x0300, 0x1f02},
{0x1f00, 0x0301, 0x1f04},
{0x1f00, 0x0342, 0x1f06},
{0x1f00, 0x0345, 0x1f80},
{0x1f01, 0x0300, 0x1f03},
{0x1f01, 0x0301, 0x1f05},
{0x1f01, 0x0342, 0x1f07},
{0x1f01, 0x0345, 0x1f81},
{0x1f02, 0x0345, 0x1f82},
{0x1f03, 0x0345, 0x1f83},
{0x1f04, 0x0345, 0x1f84},
{0x1f05, 0x0345, 0x1f85},
{0x1f06, 0x0345, 0x1f86},
{0x1f07, 0x0345, 0x1f87},
{0x1f08, 0x0300, 0x1f0a},
{0x1f08, 0x0301, 0x1f0c},
{0x1f08, 0x0342, 0x1f0e},
{0x1f08, 0x0345, 0x1f88},
{0x1f09, 0x0300, 0x1f0b},
{0x1f09, 0x0301, 0x1f0d},
{0x1f09, 0x0342, 0x1f0f},
{0x1f09, 0x0345, 0x1f89},
{0x1f0a, 0x0345, 0x1f8a},
{0x1f0b, 0x0345, 0x1f8b},
{0x1f0c, 0x0345, 0x1f8c},
{0x1f0d, 0x0345, 0x1f8d},
{0x1f0e, 0x0345, 0x1f8e},
{0x1f0f, 0x0345, 0x1f8f},
{0x1f10, 0x0300, 0x1f12},
{0x1f10, 0x0301, 0x1f14},
{0x1f11, 0x0300, 0x1f13},
{0x1f11, 0x0301, 0x1f15},
{0x1f18, 0x0300, 0x1f1a},
{0x1f18, 0x0301, 0x1f1c},
{0x1f19, 0x0300, 0x1f1b},
{0x1f19, 0x0301, 0x1f1d},
{0x1f20, 0x0300, 0x1f22},
{0x1f20, 0x0301, 0x1f24},
{0x1f20, 0x0342, 0x1f26},
{0x1f20, 0x0345, 0x1f90},
{0x1f21, 0x0300, 0x1f23},
{0x1f21, 0x0301, 0x1f25},
{0x1f21, 0x0342, 0x1f27},
{0x1f21, 0x0345, 0x1f91},
{0x1f22, 0x0345, 0x1f92},
{0x1f23, 0x0345, 0x1f93},
{0x1f24, 0x0345, 0x1f94},
{0x1f25, 0x0345, 0x1f95},
{0x1f26, 0x0345, 0x1f96},
{0x1f27, 0x0345, 0x1f97},
{0x1f28, 0x0300, 0x1f2a},
{0x1f28, 0x0301, 0x1f2c},
{0x1f28, 0x0342, 0x1f2e},
{0x1f28, 0x0345, 0x1f98},
{0x1f29, 0x0300, 0x1f2b},
{0x1f29, 0x0301, 0x1f2d},
{0x1f29, 0x0342, 0x1f2f},
{0x1f29, 0x0345, 0x1f99},
{0x1f2a, 0x0345, 0x1f9a},
{0x1f2b, 0x0345, 0x1f9b},
{0x1f2c, 0x0345, 0x1f9c},
{0x1f2d, 0x0345, 0x1f9d},
{0x1f2e, 0x0345, 0x1f9e},
{0x1f2f, 0x0345, 0x1f9f},
{0x1f30, 0x0300, 0x1f32},
{0x1f30, 0x0301, 0x1f34},
{0x1f30, 0x0342, 0x1f36},
{0x1f31, 0x0300, 0x1f33},
{0x1f31, 0x0301, 0x1f35},
{0x1f31, 0x0342, 0x1f37},
{0x1f38, 0x0300, 0x1f3a},
{0x1f38, 0x0301, 0x1f3c},
{0x1f38, 0x0342, 0x1f3e},
{0x1f39, 0x0300, 0x1f3b},
{0x1f39, 0x0301, 0x1f3d},
{0x1f39, 0x0342, 0x1f3f},
{0x1f40, 0x0300, 0x1f42},
{0x1f40, 0x0301, 0x1f44},
{0x1f41, 0x0300, 0x1f43},
{0x1f41, 0x0301, 0x1f45},
{0x1f48, 0x0300, 0x1f4a},
{0x1f48, 0x0301, 0x1f4c},
{0x1f49, 0x0300, 0x1f4b},
{0x1f49, 0x0301, 0x1f4d},
{0x1f50, 0x0300, 0x1f52},
{0x1f50, 0x0301, 0x1f54},
{0x1f50, 0x0342, 0x1f56},
{0x1f51, 0x0300, 0x1f53},
{0x1f51, 0x0301, 0x1f55},
{0x1f51, 0x0342, 0x1f57},
{0x1f59, 0x0300, 0x1f5b},
{0x1f59, 0x0301, 0x1f5d},
{0x1f59, 0x0342, 0x1f5f},
{0x1f60, 0x0300, 0x1f62},
{0x1f60, 0x0301, 0x1f64},
{0x1f60, 0x0342, 0x1f66},
{0x1f60, 0x0345, 0x1fa0},
{0x1f61, 0x0300, 0x1f63},
{0x1f61, 0x0301, 0x1f65},
{0x1f61, 0x0342, 0x1f67},
{0x1f61, 0x0345, 0x1fa1},
{0x1f62, 0x0345, 0x1fa2},
{0x1f63, 0x0345, 0x1fa3},
{0x1f64, 0x0345, 0x1fa4},
{0x1f65, 0x0345, 0x1fa5},
{0x1f66, 0x0345, 0x1fa6},
{0x1f67, 0x0345, 0x1fa7},
{0x1f68, 0x0300, 0x1f6a},
{0x1f68, 0x0301, 0x1f6c},
{0x1f68, 0x0342, 0x1f6e},
{0x1f68, 0x0345, 0x1fa8},
{0x1f69, 0x0300, 0x1f6b},
{0x1f69, 0x0301, 0x1f6d},
{0x1f69, 0x0342, 0x1f6f},
{0x1f69, 0x0345, 0x1fa9},
{0x1f6a, 0x0345, 0x1faa},
{0x1f6b, 0x0345, 0x1fab},
{0x1f6c, 0x0345, 0x1fac},
{0x1f6d, 0x0345, 0x1fad},
{0x1f6e, 0x0345, 0x1fae},
{0x1f6f, 0x0345, 0x1faf},
{0x1f70, 0x0345, 0x1fb2},
{0x1f74, 0x0345, 0x1fc2},
{0x1f7c, 0x0345, 0x1ff2},
{0x1fb6, 0x0345, 0x1fb7},
{0x1fbf, 0x0300, 0x1fcd},
{0x1fbf, 0x0301, 0x1fce},
{0x1fbf, 0x0342, 0x1fcf},
{0x1fc6, 0x0345, 0x1fc7},
{0x1ff6, 0x0345, 0x1ff7},
{0x1ffe, 0x0300, 0x1fdd},
{0x1ffe, 0x0301, 0x1fde},
{0x1ffe, 0x0342, 0x1fdf},
{0x2190, 0x0338, 0x219a},
{0x2192, 0x0338, 0x219b},
{0x2194, 0x0338, 0x21ae},
{0x21d0, 0x0338, 0x21cd},
{0x21d2, 0x0338, 0x21cf},
{0x21d4, 0x0338, 0x21ce},
{0x2203, 0x0338, 0x2204},
{0x2208, 0x0338, 0x2209},
{0x220b, 0x0338, 0x220c},
{0x2223, 0x0338, 0x2224},
{0x2225, 0x0338, 0x2226},
{0x223c, 0x0338, 0x2241},
{0x2243, 0x0338, 0x2244},
{0x2245, 0x0338, 0x2247},
{0x2248, 0x0338, 0x2249},
{0x224d, 0x0338, 0x226d},
{0x2261, 0x0338, 0x2262},
{0x2264, 0x0338, 0x2270},
{0x2265, 0x0338, 0x2271},
{0x2272, 0x0338, 0x2274},
{0x2273, 0x0338, 0x2275},
{0x2276, 0x0338, 0x2278},
{0x2277, 0x0338, 0x2279},
{0x227a, 0x0338, 0x2280},
{0x227b, 0x0338, 0x2281},
{0x227c, 0x0338, 0x22e0},
{0x227d, 0x0338, 0x22e1},
{0x2282, 0x0338, 0x2284},
{0x2283, 0x0338, 0x2285},
{0x2286, 0x0338, 0x2288},
{0x2287, 0x0338, 0x2289},
{0x2291, 0x0338, 0x22e2},
{0x2292, 0x0338, 0x22e3},
{0x22a2, 0x0338, 0x22ac},
{0x22a8, 0x0338, 0x22ad},
{0x22a9, 0x0338, 0x22ae},
{0x22ab, 0x0338, 0x22af},
{0x22b2, 0x0338, 0x22ea},
{0x22b3, 0x0338, 0x22eb},
{0x22b4, 0x0338, 0x22ec},
{0x22b5, 0x0338, 0x22ed},
{0x3046, 0x3099, 0x3094},
{0x304b, 0x3099, 0x304c},
{0x304d, 0x3099, 0x304e},
{0x304f, 0x3099, 0x3050},
{0x3051, 0x3099, 0x3052},
{0x3053, 0x3099, 0x3054},
{0x3055, 0x3099, 0x3056},
{0x3057, 0x3099, 0x3058},
{0x3059, 0x3099, 0x305a},
{0x305b, 0x3099, 0x305c},
{0x305d, 0x3099, 0x305e},
{0x305f, 0x3099, 0x3060},
{0x3061, 0x3099, 0x3062},
{0x3064, 0x3099, 0x3065},
{0x3066, 0x3099, 0x3067},
{0x3068, 0x3099, 0x3069},
{0x306f, 0x3099, 0x3070},
{0x306f, 0x309a, 0x3071},
{0x3072, 0x3099, 0x3073},
{0x3072, 0x309a, 0x3074},
{0x3075, 0x3099, 0x3076},
{0x3075, 0x309a, 0x3077},
{0x3078, 0x3099, 0x3079},
{0x3078, 0x309a, 0x307a},
{0x307b, 0x3099, 0x307c},
{0x307b, 0x309a, 0x307d},
{0x309d, 0x3099, 0x309e},
{0x30a6, 0x3099, 0x30f4},
{0x30ab, 0x3099, 0x30ac},
{0x30ad, 0x3099, 0x30ae},
{0x30af, 0x3099, 0x30b0},
{0x30b1, 0x3099, 0x30b2},
{0x30b3, 0x3099, 0x30b4},
{0x30b5, 0x3099, 0x30b6},
{0x30b7, 0x3099, 0x30b8},
{0x30b9, 0x3099, 0x30ba},
{0x30bb, 0x3099, 0x30bc},
{0x30bd, 0x3099, 0x30be},
{0x30bf, 0x3099, 0x30c0},
{0x30c1, 0x3099, 0x30c2},
{0x30c4, 0x3099, 0x30c5},
{0x30c6, 0x3099, 0x30c7},
{0x30c8, 0x3099, 0x30c9},
{0x30cf, 0x3099, 0x30d0},
{0x30cf, 0x309a, 0x30d1},
{0x30d2, 0x3099, 0x30d3},
{0x30d2, 0x309a, 0x30d4},
{0x30d5, 0x3099, 0x30d6},
{0x30d5, 0x309a, 0x30d7},
{0x30d8, 0x3099, 0x30d9},
{0x30d8, 0x309a, 0x30da},
{0x30db, 0x3099, 0x30dc},
{0x30db, 0x309a, 0x30dd},
{0x30ef, 0x3099, 0x30f7},
{0x30f0, 0x3099, 0x30f8},
{0x30f1, 0x3099, 0x30f9},
{0x30f2, 0x3099, 0x30fa},
{0x30fd, 0x3099, 0x30fe},
{0x11099, 0x110ba, 0x1109a},
{0x1109b, 0x110ba, 0x1109c},
{0x110a5, 0x110ba, 0x110ab},
{0x11131, 0x11127, 0x1112e},
{0x11132, 0x11127, 0x1112f},
{0x11347, 0x1133e, 0x1134b},
{0x11347, 0x11357, 0x1134c},
{0x114b9, 0x114b0, 0x114bc},
{0x114b9, 0x114ba, 0x114bb},
{0x114b9, 0x114bd, 0x114be},
{0x115b8, 0x115af, 0x115ba},
{0x115b9, 0x115af, 0x115bb},
{0x11935, 0x11930, 0x11938},

2071
unicode/canonical_decomp.h Normal file

File diff suppressed because it is too large Load Diff

398
unicode/combining_classes.h Normal file
View File

@ -0,0 +1,398 @@
/*
* Autogenerated by read_ucd.py from The Unicode Standard 15.0.0
*
* List the canonical combining class of each Unicode character, if it is
* not zero. This controls how combining marks can be reordered by the
* Unicode normalisation algorithms.
*
* Used by utils/unicode-norm.c.
*/
{0x0300, 0x0314, 230},
{0x0315, 0x0315, 232},
{0x0316, 0x0319, 220},
{0x031a, 0x031a, 232},
{0x031b, 0x031b, 216},
{0x031c, 0x0320, 220},
{0x0321, 0x0322, 202},
{0x0323, 0x0326, 220},
{0x0327, 0x0328, 202},
{0x0329, 0x0333, 220},
{0x0334, 0x0338, 1},
{0x0339, 0x033c, 220},
{0x033d, 0x0344, 230},
{0x0345, 0x0345, 240},
{0x0346, 0x0346, 230},
{0x0347, 0x0349, 220},
{0x034a, 0x034c, 230},
{0x034d, 0x034e, 220},
{0x0350, 0x0352, 230},
{0x0353, 0x0356, 220},
{0x0357, 0x0357, 230},
{0x0358, 0x0358, 232},
{0x0359, 0x035a, 220},
{0x035b, 0x035b, 230},
{0x035c, 0x035c, 233},
{0x035d, 0x035e, 234},
{0x035f, 0x035f, 233},
{0x0360, 0x0361, 234},
{0x0362, 0x0362, 233},
{0x0363, 0x036f, 230},
{0x0483, 0x0487, 230},
{0x0591, 0x0591, 220},
{0x0592, 0x0595, 230},
{0x0596, 0x0596, 220},
{0x0597, 0x0599, 230},
{0x059a, 0x059a, 222},
{0x059b, 0x059b, 220},
{0x059c, 0x05a1, 230},
{0x05a2, 0x05a7, 220},
{0x05a8, 0x05a9, 230},
{0x05aa, 0x05aa, 220},
{0x05ab, 0x05ac, 230},
{0x05ad, 0x05ad, 222},
{0x05ae, 0x05ae, 228},
{0x05af, 0x05af, 230},
{0x05b0, 0x05b0, 10},
{0x05b1, 0x05b1, 11},
{0x05b2, 0x05b2, 12},
{0x05b3, 0x05b3, 13},
{0x05b4, 0x05b4, 14},
{0x05b5, 0x05b5, 15},
{0x05b6, 0x05b6, 16},
{0x05b7, 0x05b7, 17},
{0x05b8, 0x05b8, 18},
{0x05b9, 0x05ba, 19},
{0x05bb, 0x05bb, 20},
{0x05bc, 0x05bc, 21},
{0x05bd, 0x05bd, 22},
{0x05bf, 0x05bf, 23},
{0x05c1, 0x05c1, 24},
{0x05c2, 0x05c2, 25},
{0x05c4, 0x05c4, 230},
{0x05c5, 0x05c5, 220},
{0x05c7, 0x05c7, 18},
{0x0610, 0x0617, 230},
{0x0618, 0x0618, 30},
{0x0619, 0x0619, 31},
{0x061a, 0x061a, 32},
{0x064b, 0x064b, 27},
{0x064c, 0x064c, 28},
{0x064d, 0x064d, 29},
{0x064e, 0x064e, 30},
{0x064f, 0x064f, 31},
{0x0650, 0x0650, 32},
{0x0651, 0x0651, 33},
{0x0652, 0x0652, 34},
{0x0653, 0x0654, 230},
{0x0655, 0x0656, 220},
{0x0657, 0x065b, 230},
{0x065c, 0x065c, 220},
{0x065d, 0x065e, 230},
{0x065f, 0x065f, 220},
{0x0670, 0x0670, 35},
{0x06d6, 0x06dc, 230},
{0x06df, 0x06e2, 230},
{0x06e3, 0x06e3, 220},
{0x06e4, 0x06e4, 230},
{0x06e7, 0x06e8, 230},
{0x06ea, 0x06ea, 220},
{0x06eb, 0x06ec, 230},
{0x06ed, 0x06ed, 220},
{0x0711, 0x0711, 36},
{0x0730, 0x0730, 230},
{0x0731, 0x0731, 220},
{0x0732, 0x0733, 230},
{0x0734, 0x0734, 220},
{0x0735, 0x0736, 230},
{0x0737, 0x0739, 220},
{0x073a, 0x073a, 230},
{0x073b, 0x073c, 220},
{0x073d, 0x073d, 230},
{0x073e, 0x073e, 220},
{0x073f, 0x0741, 230},
{0x0742, 0x0742, 220},
{0x0743, 0x0743, 230},
{0x0744, 0x0744, 220},
{0x0745, 0x0745, 230},
{0x0746, 0x0746, 220},
{0x0747, 0x0747, 230},
{0x0748, 0x0748, 220},
{0x0749, 0x074a, 230},
{0x07eb, 0x07f1, 230},
{0x07f2, 0x07f2, 220},
{0x07f3, 0x07f3, 230},
{0x07fd, 0x07fd, 220},
{0x0816, 0x0819, 230},
{0x081b, 0x0823, 230},
{0x0825, 0x0827, 230},
{0x0829, 0x082d, 230},
{0x0859, 0x085b, 220},
{0x0898, 0x0898, 230},
{0x0899, 0x089b, 220},
{0x089c, 0x089f, 230},
{0x08ca, 0x08ce, 230},
{0x08cf, 0x08d3, 220},
{0x08d4, 0x08e1, 230},
{0x08e3, 0x08e3, 220},
{0x08e4, 0x08e5, 230},
{0x08e6, 0x08e6, 220},
{0x08e7, 0x08e8, 230},
{0x08e9, 0x08e9, 220},
{0x08ea, 0x08ec, 230},
{0x08ed, 0x08ef, 220},
{0x08f0, 0x08f0, 27},
{0x08f1, 0x08f1, 28},
{0x08f2, 0x08f2, 29},
{0x08f3, 0x08f5, 230},
{0x08f6, 0x08f6, 220},
{0x08f7, 0x08f8, 230},
{0x08f9, 0x08fa, 220},
{0x08fb, 0x08ff, 230},
{0x093c, 0x093c, 7},
{0x094d, 0x094d, 9},
{0x0951, 0x0951, 230},
{0x0952, 0x0952, 220},
{0x0953, 0x0954, 230},
{0x09bc, 0x09bc, 7},
{0x09cd, 0x09cd, 9},
{0x09fe, 0x09fe, 230},
{0x0a3c, 0x0a3c, 7},
{0x0a4d, 0x0a4d, 9},
{0x0abc, 0x0abc, 7},
{0x0acd, 0x0acd, 9},
{0x0b3c, 0x0b3c, 7},
{0x0b4d, 0x0b4d, 9},
{0x0bcd, 0x0bcd, 9},
{0x0c3c, 0x0c3c, 7},
{0x0c4d, 0x0c4d, 9},
{0x0c55, 0x0c55, 84},
{0x0c56, 0x0c56, 91},
{0x0cbc, 0x0cbc, 7},
{0x0ccd, 0x0ccd, 9},
{0x0d3b, 0x0d3c, 9},
{0x0d4d, 0x0d4d, 9},
{0x0dca, 0x0dca, 9},
{0x0e38, 0x0e39, 103},
{0x0e3a, 0x0e3a, 9},
{0x0e48, 0x0e4b, 107},
{0x0eb8, 0x0eb9, 118},
{0x0eba, 0x0eba, 9},
{0x0ec8, 0x0ecb, 122},
{0x0f18, 0x0f19, 220},
{0x0f35, 0x0f35, 220},
{0x0f37, 0x0f37, 220},
{0x0f39, 0x0f39, 216},
{0x0f71, 0x0f71, 129},
{0x0f72, 0x0f72, 130},
{0x0f74, 0x0f74, 132},
{0x0f7a, 0x0f7d, 130},
{0x0f80, 0x0f80, 130},
{0x0f82, 0x0f83, 230},
{0x0f84, 0x0f84, 9},
{0x0f86, 0x0f87, 230},
{0x0fc6, 0x0fc6, 220},
{0x1037, 0x1037, 7},
{0x1039, 0x103a, 9},
{0x108d, 0x108d, 220},
{0x135d, 0x135f, 230},
{0x1714, 0x1715, 9},
{0x1734, 0x1734, 9},
{0x17d2, 0x17d2, 9},
{0x17dd, 0x17dd, 230},
{0x18a9, 0x18a9, 228},
{0x1939, 0x1939, 222},
{0x193a, 0x193a, 230},
{0x193b, 0x193b, 220},
{0x1a17, 0x1a17, 230},
{0x1a18, 0x1a18, 220},
{0x1a60, 0x1a60, 9},
{0x1a75, 0x1a7c, 230},
{0x1a7f, 0x1a7f, 220},
{0x1ab0, 0x1ab4, 230},
{0x1ab5, 0x1aba, 220},
{0x1abb, 0x1abc, 230},
{0x1abd, 0x1abd, 220},
{0x1abf, 0x1ac0, 220},
{0x1ac1, 0x1ac2, 230},
{0x1ac3, 0x1ac4, 220},
{0x1ac5, 0x1ac9, 230},
{0x1aca, 0x1aca, 220},
{0x1acb, 0x1ace, 230},
{0x1b34, 0x1b34, 7},
{0x1b44, 0x1b44, 9},
{0x1b6b, 0x1b6b, 230},
{0x1b6c, 0x1b6c, 220},
{0x1b6d, 0x1b73, 230},
{0x1baa, 0x1bab, 9},
{0x1be6, 0x1be6, 7},
{0x1bf2, 0x1bf3, 9},
{0x1c37, 0x1c37, 7},
{0x1cd0, 0x1cd2, 230},
{0x1cd4, 0x1cd4, 1},
{0x1cd5, 0x1cd9, 220},
{0x1cda, 0x1cdb, 230},
{0x1cdc, 0x1cdf, 220},
{0x1ce0, 0x1ce0, 230},
{0x1ce2, 0x1ce8, 1},
{0x1ced, 0x1ced, 220},
{0x1cf4, 0x1cf4, 230},
{0x1cf8, 0x1cf9, 230},
{0x1dc0, 0x1dc1, 230},
{0x1dc2, 0x1dc2, 220},
{0x1dc3, 0x1dc9, 230},
{0x1dca, 0x1dca, 220},
{0x1dcb, 0x1dcc, 230},
{0x1dcd, 0x1dcd, 234},
{0x1dce, 0x1dce, 214},
{0x1dcf, 0x1dcf, 220},
{0x1dd0, 0x1dd0, 202},
{0x1dd1, 0x1df5, 230},
{0x1df6, 0x1df6, 232},
{0x1df7, 0x1df8, 228},
{0x1df9, 0x1df9, 220},
{0x1dfa, 0x1dfa, 218},
{0x1dfb, 0x1dfb, 230},
{0x1dfc, 0x1dfc, 233},
{0x1dfd, 0x1dfd, 220},
{0x1dfe, 0x1dfe, 230},
{0x1dff, 0x1dff, 220},
{0x20d0, 0x20d1, 230},
{0x20d2, 0x20d3, 1},
{0x20d4, 0x20d7, 230},
{0x20d8, 0x20da, 1},
{0x20db, 0x20dc, 230},
{0x20e1, 0x20e1, 230},
{0x20e5, 0x20e6, 1},
{0x20e7, 0x20e7, 230},
{0x20e8, 0x20e8, 220},
{0x20e9, 0x20e9, 230},
{0x20ea, 0x20eb, 1},
{0x20ec, 0x20ef, 220},
{0x20f0, 0x20f0, 230},
{0x2cef, 0x2cf1, 230},
{0x2d7f, 0x2d7f, 9},
{0x2de0, 0x2dff, 230},
{0x302a, 0x302a, 218},
{0x302b, 0x302b, 228},
{0x302c, 0x302c, 232},
{0x302d, 0x302d, 222},
{0x302e, 0x302f, 224},
{0x3099, 0x309a, 8},
{0xa66f, 0xa66f, 230},
{0xa674, 0xa67d, 230},
{0xa69e, 0xa69f, 230},
{0xa6f0, 0xa6f1, 230},
{0xa806, 0xa806, 9},
{0xa82c, 0xa82c, 9},
{0xa8c4, 0xa8c4, 9},
{0xa8e0, 0xa8f1, 230},
{0xa92b, 0xa92d, 220},
{0xa953, 0xa953, 9},
{0xa9b3, 0xa9b3, 7},
{0xa9c0, 0xa9c0, 9},
{0xaab0, 0xaab0, 230},
{0xaab2, 0xaab3, 230},
{0xaab4, 0xaab4, 220},
{0xaab7, 0xaab8, 230},
{0xaabe, 0xaabf, 230},
{0xaac1, 0xaac1, 230},
{0xaaf6, 0xaaf6, 9},
{0xabed, 0xabed, 9},
{0xfb1e, 0xfb1e, 26},
{0xfe20, 0xfe26, 230},
{0xfe27, 0xfe2d, 220},
{0xfe2e, 0xfe2f, 230},
{0x101fd, 0x101fd, 220},
{0x102e0, 0x102e0, 220},
{0x10376, 0x1037a, 230},
{0x10a0d, 0x10a0d, 220},
{0x10a0f, 0x10a0f, 230},
{0x10a38, 0x10a38, 230},
{0x10a39, 0x10a39, 1},
{0x10a3a, 0x10a3a, 220},
{0x10a3f, 0x10a3f, 9},
{0x10ae5, 0x10ae5, 230},
{0x10ae6, 0x10ae6, 220},
{0x10d24, 0x10d27, 230},
{0x10eab, 0x10eac, 230},
{0x10efd, 0x10eff, 220},
{0x10f46, 0x10f47, 220},
{0x10f48, 0x10f4a, 230},
{0x10f4b, 0x10f4b, 220},
{0x10f4c, 0x10f4c, 230},
{0x10f4d, 0x10f50, 220},
{0x10f82, 0x10f82, 230},
{0x10f83, 0x10f83, 220},
{0x10f84, 0x10f84, 230},
{0x10f85, 0x10f85, 220},
{0x11046, 0x11046, 9},
{0x11070, 0x11070, 9},
{0x1107f, 0x1107f, 9},
{0x110b9, 0x110b9, 9},
{0x110ba, 0x110ba, 7},
{0x11100, 0x11102, 230},
{0x11133, 0x11134, 9},
{0x11173, 0x11173, 7},
{0x111c0, 0x111c0, 9},
{0x111ca, 0x111ca, 7},
{0x11235, 0x11235, 9},
{0x11236, 0x11236, 7},
{0x112e9, 0x112e9, 7},
{0x112ea, 0x112ea, 9},
{0x1133b, 0x1133c, 7},
{0x1134d, 0x1134d, 9},
{0x11366, 0x1136c, 230},
{0x11370, 0x11374, 230},
{0x11442, 0x11442, 9},
{0x11446, 0x11446, 7},
{0x1145e, 0x1145e, 230},
{0x114c2, 0x114c2, 9},
{0x114c3, 0x114c3, 7},
{0x115bf, 0x115bf, 9},
{0x115c0, 0x115c0, 7},
{0x1163f, 0x1163f, 9},
{0x116b6, 0x116b6, 9},
{0x116b7, 0x116b7, 7},
{0x1172b, 0x1172b, 9},
{0x11839, 0x11839, 9},
{0x1183a, 0x1183a, 7},
{0x1193d, 0x1193e, 9},
{0x11943, 0x11943, 7},
{0x119e0, 0x119e0, 9},
{0x11a34, 0x11a34, 9},
{0x11a47, 0x11a47, 9},
{0x11a99, 0x11a99, 9},
{0x11c3f, 0x11c3f, 9},
{0x11d42, 0x11d42, 7},
{0x11d44, 0x11d45, 9},
{0x11d97, 0x11d97, 9},
{0x11f41, 0x11f42, 9},
{0x16af0, 0x16af4, 1},
{0x16b30, 0x16b36, 230},
{0x16ff0, 0x16ff1, 6},
{0x1bc9e, 0x1bc9e, 1},
{0x1d165, 0x1d166, 216},
{0x1d167, 0x1d169, 1},
{0x1d16d, 0x1d16d, 226},
{0x1d16e, 0x1d172, 216},
{0x1d17b, 0x1d182, 220},
{0x1d185, 0x1d189, 230},
{0x1d18a, 0x1d18b, 220},
{0x1d1aa, 0x1d1ad, 230},
{0x1d242, 0x1d244, 230},
{0x1e000, 0x1e006, 230},
{0x1e008, 0x1e018, 230},
{0x1e01b, 0x1e021, 230},
{0x1e023, 0x1e024, 230},
{0x1e026, 0x1e02a, 230},
{0x1e08f, 0x1e08f, 230},
{0x1e130, 0x1e136, 230},
{0x1e2ae, 0x1e2ae, 230},
{0x1e2ec, 0x1e2ef, 230},
{0x1e4ec, 0x1e4ed, 232},
{0x1e4ee, 0x1e4ee, 220},
{0x1e4ef, 0x1e4ef, 230},
{0x1e8d0, 0x1e8d6, 220},
{0x1e944, 0x1e949, 230},
{0x1e94a, 0x1e94a, 7},

View File

@ -20,7 +20,9 @@ import zipfile
# Record type for one code point's worth of character properties.
# 'c' is the code point itself; the remaining fields are named after
# the Unicode properties they hold.
UCDRecord = collections.namedtuple(
    'UCDRecord',
    [
        'c',
        'General_Category',
        'Canonical_Combining_Class',
        'Bidi_Class',
        'Decomposition_Type',
        'Decomposition_Mapping',
    ],
)
@ -107,6 +109,12 @@ class Main:
self.write_wide_chars_list(fh)
with open("ambiguous_wide_chars.h", "w") as fh:
self.write_ambiguous_wide_chars_list(fh)
with open("combining_classes.h", "w") as fh:
self.write_combining_class_table(fh)
with open("canonical_decomp.h", "w") as fh:
self.write_canonical_decomp_table(fh)
with open("canonical_comp.h", "w") as fh:
self.write_canonical_comp_table(fh)
def find_unicode_version(self):
"""Find out the version of Unicode.
@ -166,14 +174,21 @@ class Main:
# Decode some of the raw fields into more cooked
# forms.
cclass = int(cclass)
# For the moment, we only care about decomposition
# mappings that consist of a single hex number (i.e.
# are singletons and not compatibility mappings)
try:
dm = [int(decomp, 16)]
except ValueError:
dm = []
# Separate the decomposition field into decomposition
# type and mapping.
if decomp == "":
dtype = decomp = None
elif "<" not in decomp:
dtype = 'canonical'
else:
assert decomp.startswith("<")
dtype, decomp = decomp[1:].split(">", 1)
decomp = decomp.lstrip(" ")
# And decode the mapping part from hex strings to integers.
if decomp is not None:
decomp = [int(w, 16) for w in decomp.split(" ")]
# And yield a UCDRecord for each code point in our
# range.
@ -181,8 +196,10 @@ class Main:
yield UCDRecord(
c=codepoint,
General_Category=category,
Canonical_Combining_Class=cclass,
Bidi_Class=bidiclass,
Decomposition_Mapping=dm,
Decomposition_Type=dtype,
Decomposition_Mapping=decomp,
)
@property
@ -231,6 +248,16 @@ class Main:
for c in cs:
yield c, fields[1]
@property
def CompositionExclusions(self):
    """Composition exclusions from CompositionExclusions.txt.

    Each yielded item is just a code point, as an integer.
    """
    with self.open_ucd_file("CompositionExclusions.txt") as fh:
        # Every line handed back by lines() is parsed as a bare hex
        # code point.
        for line in lines(fh):
            yield int(line, 16)
def write_file_header_comment(self, fh, description):
print("/*", file=fh)
print(" * Autogenerated by read_ucd.py from",
@ -311,7 +338,8 @@ Used by terminal/bidi.c.
equivalents = {}
for rec in self.UnicodeData:
if len(rec.Decomposition_Mapping) == 1:
if (rec.Decomposition_Type == 'canonical' and
len(rec.Decomposition_Mapping) == 1):
c = rec.c
c2 = rec.Decomposition_Mapping[0]
equivalents[c] = c2
@ -389,5 +417,78 @@ Used by utils/wcwidth.c.
""")
self.write_width_table(fh, {'A'})
def write_combining_class_table(self, fh):
    """Write the table of nonzero canonical combining classes to fh.

    Each output line is a C initialiser of the form
    {start, end, cclass}, one per range produced by map_to_ranges.
    """
    self.write_file_header_comment(fh, """

List the canonical combining class of each Unicode character, if it is
not zero. This controls how combining marks can be reordered by the
Unicode normalisation algorithms.

Used by utils/unicode-norm.c.

""")

    # Collect only the characters with a nonzero class: zero is the
    # default, so it is represented by absence from the table.
    cclasses = {}
    for rec in self.UnicodeData:
        cc = rec.Canonical_Combining_Class
        if cc != 0:
            cclasses[rec.c] = cc

    for (start, end), cclass in map_to_ranges(cclasses):
        print(f"{{0x{start:04x}, 0x{end:04x}, {cclass:d}}},", file=fh)
def write_canonical_decomp_table(self, fh):
    """Write the table of canonical decompositions to fh.

    Each output line is a C initialiser of the form
    {composed, first, second}, sorted by the composed code point. For
    a decomposition to a single character, 'second' is written as 0.
    """
    self.write_file_header_comment(fh, """

List the canonical decomposition of every Unicode character that has
one. This consists of up to two characters, but those may need
decomposition in turn.

Used by utils/unicode-norm.c.

""")

    decomps = {}
    for rec in self.UnicodeData:
        if rec.Decomposition_Type != 'canonical':
            continue
        mapping = rec.Decomposition_Mapping
        # The table format only has room for two characters. Fail
        # loudly rather than silently truncating if that ever stops
        # being enough.
        assert 1 <= len(mapping) <= 2, (
            f"canonical decomposition of U+{rec.c:04X} "
            f"has {len(mapping)} characters")
        # Fill in a zero code point as the second character, if
        # it's only one character long
        decomps[rec.c] = (mapping + [0])[:2]

    for c, (d1, d2) in sorted(decomps.items()):
        d2s = f"0x{d2:04x}" if d2 else "0"
        print(f"{{0x{c:04x}, 0x{d1:04x}, {d2s}}},", file=fh)
def write_canonical_comp_table(self, fh):
    """Write the table of canonical compositions to fh.

    Each output line is a C initialiser of the form
    {first, second, composed}, sorted by the (first, second) pair.
    """
    self.write_file_header_comment(fh, """

List the pairs of Unicode characters that canonically recompose to a
single character in NFC.

Used by utils/unicode-norm.c.

""")

    exclusions = set(self.CompositionExclusions)

    # Read the character database once and reuse it for both passes
    # below.
    records = list(self.UnicodeData)
    nonstarters = set(rec.c for rec in records
                      if rec.Canonical_Combining_Class != 0)

    decomps = {}
    for rec in records:
        if rec.Decomposition_Type != 'canonical':
            continue # we don't want compatibility decompositions
        if len(rec.Decomposition_Mapping) != 2:
            continue # we don't want singletons either
        if rec.c in exclusions:
            continue # we don't want anything explicitly excluded
        if (rec.c in nonstarters or
            rec.Decomposition_Mapping[0] in nonstarters):
            continue # we don't want non-starter decompositions
        decomps[tuple(rec.Decomposition_Mapping)] = rec.c

    for (d0, d1), c in sorted(decomps.items()):
        print(f"{{0x{d0:04x}, 0x{d1:04x}, 0x{c:04x}}},", file=fh)
# Run the generator only when executed as a script, not on import.
if __name__ == '__main__':
    Main().run()

View File

@ -65,6 +65,7 @@ add_sources_from_current_dir(utils
stripctrl.c
tempseat.c
tree234.c
unicode-norm.c
validate_manual_hostkey.c
version.c
wcwidth.c

446
utils/unicode-norm.c Normal file
View File

@ -0,0 +1,446 @@
#include <stdio.h>
#include <string.h>
#include "misc.h"
/* A single Unicode code point. */
typedef uint32_t uchar;

/* A canonical combining class. This is a signed type so that -1 can
 * be used as an out-of-band value. */
typedef int cclass_t;

/* A local uchar-oriented analogue of strbuf */
typedef struct ucharbuf {
    uchar *buf;      /* the stored code points, or NULL if none yet */
    size_t len;      /* number of code points currently stored */
    size_t size;     /* allocated size of buf, counted in code points */
} ucharbuf;
/*
 * Allocate a new, empty ucharbuf. It starts with no storage for
 * characters at all (buf is NULL and both counts are zero).
 */
static ucharbuf *ucharbuf_new(void)
{
    ucharbuf *fresh = snew(ucharbuf);

    fresh->len = 0;
    fresh->size = 0;
    fresh->buf = NULL;

    return fresh;
}
/*
 * Add one code point to the end of a ucharbuf, enlarging its storage
 * first if necessary.
 */
static void ucharbuf_append(ucharbuf *ub, uchar c)
{
    /* Use the _nm variant because this is used for passphrases */
    sgrowarray_nm(ub->buf, ub->size, ub->len);

    ub->buf[ub->len] = c;
    ub->len += 1;
}
/*
 * Free a ucharbuf together with its character storage. The whole
 * allocated storage is wiped before being freed, because these
 * buffers are used for passphrases.
 */
static void ucharbuf_free(ucharbuf *ub)
{
    if (ub->buf) {
        /* Use smemclr rather than memset: a memset of memory that is
         * about to be freed is a dead store, which an optimiser is
         * entitled to remove */
        smemclr(ub->buf, ub->size * sizeof(*ub->buf));
        sfree(ub->buf);
    }
    sfree(ub);
}
/*
 * Constants relating to the arithmetic decomposition mapping of
 * Hangul to jamo, from section 3.12 of Unicode 15.0.0. The following
 * constant names match those in the spec.
 *
 * These are enumeration constants rather than 'static const'
 * variables, because NCount and SCount are computed from the others.
 * ISO C does not count the value of a const-qualified variable as a
 * constant expression, so a file-scope variable initialised from one
 * is only accepted by compilers that allow it as an extension.
 */
enum {
    SBase = 0xAC00,
    LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7,
    LCount = 19, VCount = 21, TCount = 28,
    NCount = VCount * TCount,
    SCount = LCount * NCount
};
/*
 * Look up the canonical combining class of a character.
 *
 * The table lists inclusive ranges of code points sharing a nonzero
 * class, and is searched by bisection, so it must be sorted by code
 * point. Any character not covered by the table has class 0.
 */
static cclass_t combining_class(uchar c)
{
    struct range {
        uchar start, end;
        cclass_t cclass;
    };
    static const struct range ranges[] = {
        #include "unicode/combining_classes.h"
    };

    const struct range *start = ranges, *end = start + lenof(ranges);

    while (end > start) {
        const struct range *curr = start + (end-start) / 2;
        if (c < curr->start)
            end = curr;
        else if (c > curr->end)
            start = curr + 1;
        else
            return curr->cclass;
    }

    return 0;
}
/*
 * Apply one step of canonical decomposition to a single character.
 *
 * The decomposition is written to 'out', which must have room for 3
 * characters (the most that a Hangul syllable can produce). The
 * return value is the number of characters written, or 0 if c has no
 * decomposition, in which case 'out' is not written to.
 *
 * This is a single step only: the output characters may themselves
 * be decomposable.
 */
static unsigned decompose_char(uchar c, uchar *out)
{
    struct decomp {
        uchar composed, dec0, dec1;
    };
    static const struct decomp decomps[] = {
        #include "unicode/canonical_decomp.h"
    };

    /* Unsigned subtraction: a value of c below SBase wraps round to
     * something huge and fails the test */
    if (c - SBase < SCount) {
        /* Arithmetically decompose a Hangul character into jamo */
        uchar SIndex = c - SBase;
        uchar LIndex = SIndex / NCount;
        uchar VIndex = SIndex % NCount / TCount;
        uchar TIndex = SIndex % TCount;

        unsigned n = 0;
        out[n++] = LBase + LIndex;
        out[n++] = VBase + VIndex;
        if (TIndex)
            out[n++] = TBase + TIndex;
        return n;
    }

    /* Otherwise, bisect the table, which is sorted by composed
     * character */
    const struct decomp *start = decomps, *end = start + lenof(decomps);

    while (end > start) {
        const struct decomp *curr = start + (end-start) / 2;
        if (c < curr->composed)
            end = curr;
        else if (c > curr->composed)
            start = curr + 1;
        else {
            out[0] = curr->dec0;
            /* A zero second entry means a one-character decomposition */
            if (curr->dec1) {
                out[1] = curr->dec1;
                return 2;
            } else {
                return 1;
            }
        }
    }

    return 0;
}
/*
 * Try to canonically compose a pair of characters: S, the earlier
 * one, and C, the later one. Returns the composed character, or 0 if
 * the pair does not compose.
 */
static uchar compose_chars(uchar S, uchar C)
{
    struct comp {
        uchar dec0, dec1, composed;
    };
    static const struct comp comps[] = {
        #include "unicode/canonical_comp.h"
    };

    /* The range checks below use unsigned subtraction, so that a
     * value below the base wraps round to something huge and fails */

    if (S - LBase < LCount && C - VBase < VCount) {
        /* Arithmetically compose an L and V jamo into a Hangul LV
         * character */
        return SBase + (S - LBase) * NCount + (C - VBase) * TCount;
    }

    /*
     * TBase is one _less_ than the first T jamo, so that a T index
     * of zero can mean 'no trailing consonant'. Hence C must be
     * strictly greater than TBase here. If C == TBase were accepted,
     * the pair would 'compose' to S unchanged, and the character
     * U+11A7 would vanish from the string.
     */
    if (S - SBase < SCount && (S - SBase) % TCount == 0 &&
        C > TBase && C - TBase < TCount) {
        /* Arithmetically compose an LV Hangul character and a T jamo
         * into a Hangul LVT character */
        return S + C - TBase;
    }

    /* Otherwise, bisect the table, which is sorted by first
     * character and then by second */
    const struct comp *start = comps, *end = start + lenof(comps);

    while (end > start) {
        const struct comp *curr = start + (end-start) / 2;
        if (S < curr->dec0)
            end = curr;
        else if (S > curr->dec0)
            start = curr + 1;
        else if (C < curr->dec1)
            end = curr;
        else if (C > curr->dec1)
            start = curr + 1;
        else
            return curr->composed;
    }

    return 0;
}
/*
 * Fully decompose a sequence of Unicode characters, by decomposing
 * each one and then decomposing the results in turn until nothing
 * decomposable is left. The output is appended to 'out', as a
 * sequence of native-byte-order uchar.
 */
static void recursively_decompose(const uchar *str, size_t len, ucharbuf *out)
{
    for (size_t i = 0; i < len; i++) {
        uchar pieces[3];
        unsigned npieces = decompose_char(str[i], pieces);

        if (npieces > 0) {
            /* We got up to 3 characters back, each of which might
             * need decomposing further */
            recursively_decompose(pieces, npieces, out);
        } else {
            /* Nothing to decompose: emit the character as it is */
            ucharbuf_append(out, str[i]);
        }
    }
}
/*
 * Put combining marks into canonical order, as specified by the
 * Canonical Ordering Algorithm (definition D109 in Unicode 15.0.0
 * section 3.11).
 *
 * In effect this is a stable sort by cclass of each maximal run of
 * characters with cclass > 0: marks of equal cclass keep their
 * original relative order, and nothing ever moves past a character
 * of cclass 0.
 *
 * It is done here as an insertion sort, moving each mark backwards
 * past any neighbours of strictly greater cclass.
 */
static void canonical_ordering(uchar *str, size_t len)
{
    for (size_t pos = 1; pos < len; pos++) {
        cclass_t cc = combining_class(str[pos]);

        if (cc != 0) {
            /* The strict '>' is what keeps the sort stable, and also
             * what stops us at a character of cclass 0 */
            for (size_t k = pos;
                 k > 0 && combining_class(str[k-1]) > cc; k--) {
                uchar moved = str[k];
                str[k] = str[k-1];
                str[k-1] = moved;
            }
        }
    }
}
/*
 * Canonically recompose characters according to the Canonical
 * Composition Algorithm (definition D117 in Unicode 15.0.0 section
 * 3.11).
 *
 * The string is rewritten in place, and the return value is its new
 * length, which is never more than the old one.
 */
static size_t canonical_composition(uchar *str, size_t len)
{
    size_t nout = 0;         /* number of characters written so far */
    uchar *starter = NULL;   /* most recent cclass-0 character output */
    cclass_t blocker = -1;   /* highest cclass output since 'starter' */

    /* The write position never overtakes the read position, so it's
     * safe to read and write the same array */
    for (size_t i = 0; i < len; i++) {
        uchar c = str[i];
        cclass_t cc = combining_class(c);

        /* c may combine with the last starter only if nothing output
         * since then has a cclass at least as high as c's */
        if (starter && blocker < cc) {
            uchar merged = compose_chars(*starter, c);
            if (merged) {
                /* Absorb c into the starter, and output nothing */
                *starter = merged;
                continue;
            }
        }

        if (cc == 0) {
            /* c is about to be written at str[nout]; remember it as
             * the new starter */
            starter = &str[nout];
            blocker = -1;
        } else if (cc > blocker) {
            blocker = cc;
        }

        str[nout++] = c;
    }

    return nout;
}
/*
 * Render a string into NFD. The input is not modified; the result is
 * a newly allocated ucharbuf.
 */
static ucharbuf *nfd(ucharbuf *input)
{
    /*
     * Definition D118 in Unicode 15.0.0 section 3.11, referring to
     * D68 in section 3.7: recursively decompose characters, then
     * reorder combining marks.
     */
    ucharbuf *result = ucharbuf_new();

    recursively_decompose(input->buf, input->len, result);
    canonical_ordering(result->buf, result->len);

    return result;
}
/*
 * Render a string into NFC. The input is not modified; the result is
 * a newly allocated ucharbuf.
 */
static ucharbuf *nfc(ucharbuf *input)
{
    /*
     * Definition D120 in Unicode 15.0.0 section 3.11: render the
     * string into NFD, then apply the canonical composition algorithm.
     */
    ucharbuf *result = nfd(input);

    /* Composition works in place and returns the shortened length */
    size_t composed_len = canonical_composition(result->buf, result->len);
    result->len = composed_len;

    return result;
}
/*
 * Convert a UTF-8 string into NFC, returning it as UTF-8 again, in a
 * newly allocated strbuf.
 */
strbuf *utf8_to_nfc(ptrlen input)
{
    BinarySource src[1];
    BinarySource_BARE_INIT_PL(src, input);

    /* Decode the whole input into code points */
    ucharbuf *decoded = ucharbuf_new();
    while (get_avail(src) > 0) {
        uchar c = decode_utf8(src);
        ucharbuf_append(decoded, c);
    }

    /* Normalise, after which the decoded input is finished with */
    ucharbuf *normalised = nfc(decoded);
    ucharbuf_free(decoded);

    /* Re-encode the result as UTF-8 */
    strbuf *output = strbuf_new_nm();
    size_t pos = 0;
    while (pos < normalised->len) {
        put_utf8_char(output, normalised->buf[pos]);
        pos++;
    }
    ucharbuf_free(normalised);

    return output;
}
#ifdef TEST
/*
 * Out-of-memory handler for the standalone test program: report the
 * problem on standard error and exit with status 2.
 */
void out_of_memory(void)
{
    fputs("out of memory!\n", stderr);
    exit(2);
}
/* Running totals of subtests passed and failed, reported at the end
 * of main() */
static int pass, fail;
static void subtest(const char *filename, int lineno, const char *subdesc,
char nftype, ucharbuf *input, ucharbuf *expected)
{
/*
* Convert input into either NFC or NFD, and check it's equal to
* expected
*/
ucharbuf *nf;
switch (nftype) {
case 'C':
nf = nfc(input);
break;
case 'D':
nf = nfd(input);
break;
default:
unreachable("bad nftype");
}
if (nf->len == expected->len && !memcmp(nf->buf, expected->buf, nf->len)) {
pass++;
} else {
printf("%s:%d: failed %s: NF%c([", filename, lineno, subdesc, nftype);
for (size_t pos = 0; pos < input->len; pos += sizeof(uchar))
printf("%s%04X", pos ? " " : "", (unsigned)input->buf[pos]);
printf("]) -> [");
for (size_t pos = 0; pos < nf->len; pos += sizeof(uchar))
printf("%s%04X", pos ? " " : "", (unsigned)nf->buf[pos]);
printf("] != [");
for (size_t pos = 0; pos < expected->len; pos += sizeof(uchar))
printf("%s%04X", pos ? " " : "", (unsigned)expected->buf[pos]);
printf("]\n");
fail++;
}
ucharbuf_free(nf);
}
/*
 * Read test cases from fp, in the format of NormalizationTest.txt
 * from the Unicode Character Database, and run them all. 'filename'
 * is used only to identify the file in failure messages.
 */
static void run_tests(const char *filename, FILE *fp)
{
    /*
     * The checks made for each test line, in the order they are run.
     * 'in' and 'want' are zero-based indices into the line's five
     * fields, which the descriptions refer to as c1..c5.
     */
    static const struct {
        const char *desc;
        char nftype;
        unsigned char in, want;
    } checks[] = {
        {"NFC(c1) = c2", 'C', 0, 1},
        {"NFC(c2) = c2", 'C', 1, 1},
        {"NFC(c3) = c2", 'C', 2, 1},
        {"NFC(c4) = c4", 'C', 3, 3},
        {"NFC(c5) = c4", 'C', 4, 3},
        {"NFD(c1) = c3", 'D', 0, 2},
        {"NFD(c2) = c3", 'D', 1, 2},
        {"NFD(c3) = c3", 'D', 2, 2},
        {"NFD(c4) = c5", 'D', 3, 4},
        {"NFD(c5) = c5", 'D', 4, 4},
    };

    int lineno = 0;
    char *line;

    while ((line = chomp(fgetline(fp))) != NULL) {
        /* Every line read counts, including the ones we skip */
        lineno++;

        /* Section dividers, which begin with @, carry no test data */
        if (*line != '@') {
            ptrlen pl = ptrlen_from_asciz(line);
            const char *text = pl.ptr;

            /* Cut the line short at a comment, if it has one */
            const char *hash = memchr(text, '#', pl.len);
            if (hash)
                pl.len = hash - text;

            /* Drop trailing spaces and tabs */
            while (pl.len > 0 &&
                   (text[pl.len-1] == ' ' || text[pl.len-1] == '\t'))
                pl.len--;

            /* If anything is left, it's a test */
            if (pl.len > 0) {
                /* Break up at semicolons, expecting five fields,
                 * each of which we decode into hex code points */
                ucharbuf *fields[5];
                for (size_t f = 0; f < lenof(fields); f++) {
                    ptrlen field = ptrlen_get_word(&pl, ";");
                    fields[f] = ucharbuf_new();

                    for (ptrlen chr = ptrlen_get_word(&field, " ");
                         chr.len;
                         chr = ptrlen_get_word(&field, " ")) {
                        char *chrstr = mkstr(chr);
                        uchar c = strtoul(chrstr, NULL, 16);
                        sfree(chrstr);
                        ucharbuf_append(fields[f], c);
                    }
                }

                for (size_t k = 0; k < lenof(checks); k++)
                    subtest(filename, lineno, checks[k].desc,
                            checks[k].nftype, fields[checks[k].in],
                            fields[checks[k].want]);

                for (size_t f = 0; f < lenof(fields); f++)
                    ucharbuf_free(fields[f]);
            }
        }

        sfree(line);
    }
}
/*
 * Test program entry point. Takes exactly one argument, which is
 * either the name of a file of tests or '-' to read the tests from
 * standard input. Prints a summary of the results, and returns
 * nonzero if any test failed or the arguments were unusable.
 */
int main(int argc, char **argv)
{
    if (argc != 2) {
        fprintf(stderr, "test_unicode_norm: give an input file "
                "of tests or '-'\n");
        return 1;
    }

    const char *filename = argv[1];
    int use_stdin = (strcmp(filename, "-") == 0);

    /* Work out where the tests come from, and what to call it in
     * failure messages */
    FILE *fp;
    const char *label;
    if (use_stdin) {
        fp = stdin;
        label = "<standard input>";
    } else {
        fp = fopen(filename, "r");
        label = filename;
        if (fp == NULL) {
            fprintf(stderr, "test_unicode_norm: unable to open '%s'\n",
                    filename);
            return 1;
        }
    }

    run_tests(label, fp);

    /* Only close what we opened ourselves */
    if (!use_stdin)
        fclose(fp);

    printf("pass %d fail %d total %d\n", pass, fail, pass + fail);
    return fail != 0;
}
#endif