From d3e186e81b1d30c8b0c42ac98ef0a2e15a4838ec Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Wed, 9 Nov 2022 08:56:11 +0000 Subject: [PATCH] Function to check a UTF-8 string for unknown characters. So we can reject things we don't know how to NFC yet. --- misc.h | 4 + unicode/known_chars.h | 716 ++++++++++++++++++++++++++++++++++++++++++ unicode/read_ucd.py | 16 + utils/CMakeLists.txt | 1 + utils/unicode-known.c | 53 ++++ 5 files changed, 790 insertions(+) create mode 100644 unicode/known_chars.h create mode 100644 utils/unicode-known.c diff --git a/misc.h b/misc.h index ab8c137f..a0e686a3 100644 --- a/misc.h +++ b/misc.h @@ -268,6 +268,10 @@ size_t decode_utf8_to_wchar(BinarySource *src, wchar_t *out); /* Normalise a UTF-8 string into Normalisation Form C. */ strbuf *utf8_to_nfc(ptrlen input); +/* Determine if a UTF-8 string contains any characters unknown to our + * supported version of Unicode. */ +char *utf8_unknown_char(ptrlen input); + /* Write a string out in C string-literal format. */ void write_c_string_literal(FILE *fp, ptrlen str); diff --git a/unicode/known_chars.h b/unicode/known_chars.h new file mode 100644 index 00000000..de1b313c --- /dev/null +++ b/unicode/known_chars.h @@ -0,0 +1,716 @@ +/* + * Autogenerated by read_ucd.py from The Unicode Standard 15.0.0 + * + * List the Unicode code points that are known to this version of the + * standard at all. + * + * Used by utils/unicode-known.c. 
+ */ + +{0x0000, 0x0377}, +{0x037a, 0x037f}, +{0x0384, 0x038a}, +{0x038c, 0x038c}, +{0x038e, 0x03a1}, +{0x03a3, 0x052f}, +{0x0531, 0x0556}, +{0x0559, 0x058a}, +{0x058d, 0x058f}, +{0x0591, 0x05c7}, +{0x05d0, 0x05ea}, +{0x05ef, 0x05f4}, +{0x0600, 0x070d}, +{0x070f, 0x074a}, +{0x074d, 0x07b1}, +{0x07c0, 0x07fa}, +{0x07fd, 0x082d}, +{0x0830, 0x083e}, +{0x0840, 0x085b}, +{0x085e, 0x085e}, +{0x0860, 0x086a}, +{0x0870, 0x088e}, +{0x0890, 0x0891}, +{0x0898, 0x0983}, +{0x0985, 0x098c}, +{0x098f, 0x0990}, +{0x0993, 0x09a8}, +{0x09aa, 0x09b0}, +{0x09b2, 0x09b2}, +{0x09b6, 0x09b9}, +{0x09bc, 0x09c4}, +{0x09c7, 0x09c8}, +{0x09cb, 0x09ce}, +{0x09d7, 0x09d7}, +{0x09dc, 0x09dd}, +{0x09df, 0x09e3}, +{0x09e6, 0x09fe}, +{0x0a01, 0x0a03}, +{0x0a05, 0x0a0a}, +{0x0a0f, 0x0a10}, +{0x0a13, 0x0a28}, +{0x0a2a, 0x0a30}, +{0x0a32, 0x0a33}, +{0x0a35, 0x0a36}, +{0x0a38, 0x0a39}, +{0x0a3c, 0x0a3c}, +{0x0a3e, 0x0a42}, +{0x0a47, 0x0a48}, +{0x0a4b, 0x0a4d}, +{0x0a51, 0x0a51}, +{0x0a59, 0x0a5c}, +{0x0a5e, 0x0a5e}, +{0x0a66, 0x0a76}, +{0x0a81, 0x0a83}, +{0x0a85, 0x0a8d}, +{0x0a8f, 0x0a91}, +{0x0a93, 0x0aa8}, +{0x0aaa, 0x0ab0}, +{0x0ab2, 0x0ab3}, +{0x0ab5, 0x0ab9}, +{0x0abc, 0x0ac5}, +{0x0ac7, 0x0ac9}, +{0x0acb, 0x0acd}, +{0x0ad0, 0x0ad0}, +{0x0ae0, 0x0ae3}, +{0x0ae6, 0x0af1}, +{0x0af9, 0x0aff}, +{0x0b01, 0x0b03}, +{0x0b05, 0x0b0c}, +{0x0b0f, 0x0b10}, +{0x0b13, 0x0b28}, +{0x0b2a, 0x0b30}, +{0x0b32, 0x0b33}, +{0x0b35, 0x0b39}, +{0x0b3c, 0x0b44}, +{0x0b47, 0x0b48}, +{0x0b4b, 0x0b4d}, +{0x0b55, 0x0b57}, +{0x0b5c, 0x0b5d}, +{0x0b5f, 0x0b63}, +{0x0b66, 0x0b77}, +{0x0b82, 0x0b83}, +{0x0b85, 0x0b8a}, +{0x0b8e, 0x0b90}, +{0x0b92, 0x0b95}, +{0x0b99, 0x0b9a}, +{0x0b9c, 0x0b9c}, +{0x0b9e, 0x0b9f}, +{0x0ba3, 0x0ba4}, +{0x0ba8, 0x0baa}, +{0x0bae, 0x0bb9}, +{0x0bbe, 0x0bc2}, +{0x0bc6, 0x0bc8}, +{0x0bca, 0x0bcd}, +{0x0bd0, 0x0bd0}, +{0x0bd7, 0x0bd7}, +{0x0be6, 0x0bfa}, +{0x0c00, 0x0c0c}, +{0x0c0e, 0x0c10}, +{0x0c12, 0x0c28}, +{0x0c2a, 0x0c39}, +{0x0c3c, 0x0c44}, +{0x0c46, 0x0c48}, +{0x0c4a, 0x0c4d}, +{0x0c55, 
0x0c56}, +{0x0c58, 0x0c5a}, +{0x0c5d, 0x0c5d}, +{0x0c60, 0x0c63}, +{0x0c66, 0x0c6f}, +{0x0c77, 0x0c8c}, +{0x0c8e, 0x0c90}, +{0x0c92, 0x0ca8}, +{0x0caa, 0x0cb3}, +{0x0cb5, 0x0cb9}, +{0x0cbc, 0x0cc4}, +{0x0cc6, 0x0cc8}, +{0x0cca, 0x0ccd}, +{0x0cd5, 0x0cd6}, +{0x0cdd, 0x0cde}, +{0x0ce0, 0x0ce3}, +{0x0ce6, 0x0cef}, +{0x0cf1, 0x0cf3}, +{0x0d00, 0x0d0c}, +{0x0d0e, 0x0d10}, +{0x0d12, 0x0d44}, +{0x0d46, 0x0d48}, +{0x0d4a, 0x0d4f}, +{0x0d54, 0x0d63}, +{0x0d66, 0x0d7f}, +{0x0d81, 0x0d83}, +{0x0d85, 0x0d96}, +{0x0d9a, 0x0db1}, +{0x0db3, 0x0dbb}, +{0x0dbd, 0x0dbd}, +{0x0dc0, 0x0dc6}, +{0x0dca, 0x0dca}, +{0x0dcf, 0x0dd4}, +{0x0dd6, 0x0dd6}, +{0x0dd8, 0x0ddf}, +{0x0de6, 0x0def}, +{0x0df2, 0x0df4}, +{0x0e01, 0x0e3a}, +{0x0e3f, 0x0e5b}, +{0x0e81, 0x0e82}, +{0x0e84, 0x0e84}, +{0x0e86, 0x0e8a}, +{0x0e8c, 0x0ea3}, +{0x0ea5, 0x0ea5}, +{0x0ea7, 0x0ebd}, +{0x0ec0, 0x0ec4}, +{0x0ec6, 0x0ec6}, +{0x0ec8, 0x0ece}, +{0x0ed0, 0x0ed9}, +{0x0edc, 0x0edf}, +{0x0f00, 0x0f47}, +{0x0f49, 0x0f6c}, +{0x0f71, 0x0f97}, +{0x0f99, 0x0fbc}, +{0x0fbe, 0x0fcc}, +{0x0fce, 0x0fda}, +{0x1000, 0x10c5}, +{0x10c7, 0x10c7}, +{0x10cd, 0x10cd}, +{0x10d0, 0x1248}, +{0x124a, 0x124d}, +{0x1250, 0x1256}, +{0x1258, 0x1258}, +{0x125a, 0x125d}, +{0x1260, 0x1288}, +{0x128a, 0x128d}, +{0x1290, 0x12b0}, +{0x12b2, 0x12b5}, +{0x12b8, 0x12be}, +{0x12c0, 0x12c0}, +{0x12c2, 0x12c5}, +{0x12c8, 0x12d6}, +{0x12d8, 0x1310}, +{0x1312, 0x1315}, +{0x1318, 0x135a}, +{0x135d, 0x137c}, +{0x1380, 0x1399}, +{0x13a0, 0x13f5}, +{0x13f8, 0x13fd}, +{0x1400, 0x169c}, +{0x16a0, 0x16f8}, +{0x1700, 0x1715}, +{0x171f, 0x1736}, +{0x1740, 0x1753}, +{0x1760, 0x176c}, +{0x176e, 0x1770}, +{0x1772, 0x1773}, +{0x1780, 0x17dd}, +{0x17e0, 0x17e9}, +{0x17f0, 0x17f9}, +{0x1800, 0x1819}, +{0x1820, 0x1878}, +{0x1880, 0x18aa}, +{0x18b0, 0x18f5}, +{0x1900, 0x191e}, +{0x1920, 0x192b}, +{0x1930, 0x193b}, +{0x1940, 0x1940}, +{0x1944, 0x196d}, +{0x1970, 0x1974}, +{0x1980, 0x19ab}, +{0x19b0, 0x19c9}, +{0x19d0, 0x19da}, +{0x19de, 0x1a1b}, +{0x1a1e, 0x1a5e}, +{0x1a60, 
0x1a7c}, +{0x1a7f, 0x1a89}, +{0x1a90, 0x1a99}, +{0x1aa0, 0x1aad}, +{0x1ab0, 0x1ace}, +{0x1b00, 0x1b4c}, +{0x1b50, 0x1b7e}, +{0x1b80, 0x1bf3}, +{0x1bfc, 0x1c37}, +{0x1c3b, 0x1c49}, +{0x1c4d, 0x1c88}, +{0x1c90, 0x1cba}, +{0x1cbd, 0x1cc7}, +{0x1cd0, 0x1cfa}, +{0x1d00, 0x1f15}, +{0x1f18, 0x1f1d}, +{0x1f20, 0x1f45}, +{0x1f48, 0x1f4d}, +{0x1f50, 0x1f57}, +{0x1f59, 0x1f59}, +{0x1f5b, 0x1f5b}, +{0x1f5d, 0x1f5d}, +{0x1f5f, 0x1f7d}, +{0x1f80, 0x1fb4}, +{0x1fb6, 0x1fc4}, +{0x1fc6, 0x1fd3}, +{0x1fd6, 0x1fdb}, +{0x1fdd, 0x1fef}, +{0x1ff2, 0x1ff4}, +{0x1ff6, 0x1ffe}, +{0x2000, 0x2064}, +{0x2066, 0x2071}, +{0x2074, 0x208e}, +{0x2090, 0x209c}, +{0x20a0, 0x20c0}, +{0x20d0, 0x20f0}, +{0x2100, 0x218b}, +{0x2190, 0x2426}, +{0x2440, 0x244a}, +{0x2460, 0x2b73}, +{0x2b76, 0x2b95}, +{0x2b97, 0x2cf3}, +{0x2cf9, 0x2d25}, +{0x2d27, 0x2d27}, +{0x2d2d, 0x2d2d}, +{0x2d30, 0x2d67}, +{0x2d6f, 0x2d70}, +{0x2d7f, 0x2d96}, +{0x2da0, 0x2da6}, +{0x2da8, 0x2dae}, +{0x2db0, 0x2db6}, +{0x2db8, 0x2dbe}, +{0x2dc0, 0x2dc6}, +{0x2dc8, 0x2dce}, +{0x2dd0, 0x2dd6}, +{0x2dd8, 0x2dde}, +{0x2de0, 0x2e5d}, +{0x2e80, 0x2e99}, +{0x2e9b, 0x2ef3}, +{0x2f00, 0x2fd5}, +{0x2ff0, 0x2ffb}, +{0x3000, 0x303f}, +{0x3041, 0x3096}, +{0x3099, 0x30ff}, +{0x3105, 0x312f}, +{0x3131, 0x318e}, +{0x3190, 0x31e3}, +{0x31f0, 0x321e}, +{0x3220, 0xa48c}, +{0xa490, 0xa4c6}, +{0xa4d0, 0xa62b}, +{0xa640, 0xa6f7}, +{0xa700, 0xa7ca}, +{0xa7d0, 0xa7d1}, +{0xa7d3, 0xa7d3}, +{0xa7d5, 0xa7d9}, +{0xa7f2, 0xa82c}, +{0xa830, 0xa839}, +{0xa840, 0xa877}, +{0xa880, 0xa8c5}, +{0xa8ce, 0xa8d9}, +{0xa8e0, 0xa953}, +{0xa95f, 0xa97c}, +{0xa980, 0xa9cd}, +{0xa9cf, 0xa9d9}, +{0xa9de, 0xa9fe}, +{0xaa00, 0xaa36}, +{0xaa40, 0xaa4d}, +{0xaa50, 0xaa59}, +{0xaa5c, 0xaac2}, +{0xaadb, 0xaaf6}, +{0xab01, 0xab06}, +{0xab09, 0xab0e}, +{0xab11, 0xab16}, +{0xab20, 0xab26}, +{0xab28, 0xab2e}, +{0xab30, 0xab6b}, +{0xab70, 0xabed}, +{0xabf0, 0xabf9}, +{0xac00, 0xd7a3}, +{0xd7b0, 0xd7c6}, +{0xd7cb, 0xd7fb}, +{0xd800, 0xfa6d}, +{0xfa70, 0xfad9}, +{0xfb00, 0xfb06}, +{0xfb13, 
0xfb17}, +{0xfb1d, 0xfb36}, +{0xfb38, 0xfb3c}, +{0xfb3e, 0xfb3e}, +{0xfb40, 0xfb41}, +{0xfb43, 0xfb44}, +{0xfb46, 0xfbc2}, +{0xfbd3, 0xfd8f}, +{0xfd92, 0xfdc7}, +{0xfdcf, 0xfdcf}, +{0xfdf0, 0xfe19}, +{0xfe20, 0xfe52}, +{0xfe54, 0xfe66}, +{0xfe68, 0xfe6b}, +{0xfe70, 0xfe74}, +{0xfe76, 0xfefc}, +{0xfeff, 0xfeff}, +{0xff01, 0xffbe}, +{0xffc2, 0xffc7}, +{0xffca, 0xffcf}, +{0xffd2, 0xffd7}, +{0xffda, 0xffdc}, +{0xffe0, 0xffe6}, +{0xffe8, 0xffee}, +{0xfff9, 0xfffd}, +{0x10000, 0x1000b}, +{0x1000d, 0x10026}, +{0x10028, 0x1003a}, +{0x1003c, 0x1003d}, +{0x1003f, 0x1004d}, +{0x10050, 0x1005d}, +{0x10080, 0x100fa}, +{0x10100, 0x10102}, +{0x10107, 0x10133}, +{0x10137, 0x1018e}, +{0x10190, 0x1019c}, +{0x101a0, 0x101a0}, +{0x101d0, 0x101fd}, +{0x10280, 0x1029c}, +{0x102a0, 0x102d0}, +{0x102e0, 0x102fb}, +{0x10300, 0x10323}, +{0x1032d, 0x1034a}, +{0x10350, 0x1037a}, +{0x10380, 0x1039d}, +{0x1039f, 0x103c3}, +{0x103c8, 0x103d5}, +{0x10400, 0x1049d}, +{0x104a0, 0x104a9}, +{0x104b0, 0x104d3}, +{0x104d8, 0x104fb}, +{0x10500, 0x10527}, +{0x10530, 0x10563}, +{0x1056f, 0x1057a}, +{0x1057c, 0x1058a}, +{0x1058c, 0x10592}, +{0x10594, 0x10595}, +{0x10597, 0x105a1}, +{0x105a3, 0x105b1}, +{0x105b3, 0x105b9}, +{0x105bb, 0x105bc}, +{0x10600, 0x10736}, +{0x10740, 0x10755}, +{0x10760, 0x10767}, +{0x10780, 0x10785}, +{0x10787, 0x107b0}, +{0x107b2, 0x107ba}, +{0x10800, 0x10805}, +{0x10808, 0x10808}, +{0x1080a, 0x10835}, +{0x10837, 0x10838}, +{0x1083c, 0x1083c}, +{0x1083f, 0x10855}, +{0x10857, 0x1089e}, +{0x108a7, 0x108af}, +{0x108e0, 0x108f2}, +{0x108f4, 0x108f5}, +{0x108fb, 0x1091b}, +{0x1091f, 0x10939}, +{0x1093f, 0x1093f}, +{0x10980, 0x109b7}, +{0x109bc, 0x109cf}, +{0x109d2, 0x10a03}, +{0x10a05, 0x10a06}, +{0x10a0c, 0x10a13}, +{0x10a15, 0x10a17}, +{0x10a19, 0x10a35}, +{0x10a38, 0x10a3a}, +{0x10a3f, 0x10a48}, +{0x10a50, 0x10a58}, +{0x10a60, 0x10a9f}, +{0x10ac0, 0x10ae6}, +{0x10aeb, 0x10af6}, +{0x10b00, 0x10b35}, +{0x10b39, 0x10b55}, +{0x10b58, 0x10b72}, +{0x10b78, 0x10b91}, +{0x10b99, 0x10b9c}, 
+{0x10ba9, 0x10baf}, +{0x10c00, 0x10c48}, +{0x10c80, 0x10cb2}, +{0x10cc0, 0x10cf2}, +{0x10cfa, 0x10d27}, +{0x10d30, 0x10d39}, +{0x10e60, 0x10e7e}, +{0x10e80, 0x10ea9}, +{0x10eab, 0x10ead}, +{0x10eb0, 0x10eb1}, +{0x10efd, 0x10f27}, +{0x10f30, 0x10f59}, +{0x10f70, 0x10f89}, +{0x10fb0, 0x10fcb}, +{0x10fe0, 0x10ff6}, +{0x11000, 0x1104d}, +{0x11052, 0x11075}, +{0x1107f, 0x110c2}, +{0x110cd, 0x110cd}, +{0x110d0, 0x110e8}, +{0x110f0, 0x110f9}, +{0x11100, 0x11134}, +{0x11136, 0x11147}, +{0x11150, 0x11176}, +{0x11180, 0x111df}, +{0x111e1, 0x111f4}, +{0x11200, 0x11211}, +{0x11213, 0x11241}, +{0x11280, 0x11286}, +{0x11288, 0x11288}, +{0x1128a, 0x1128d}, +{0x1128f, 0x1129d}, +{0x1129f, 0x112a9}, +{0x112b0, 0x112ea}, +{0x112f0, 0x112f9}, +{0x11300, 0x11303}, +{0x11305, 0x1130c}, +{0x1130f, 0x11310}, +{0x11313, 0x11328}, +{0x1132a, 0x11330}, +{0x11332, 0x11333}, +{0x11335, 0x11339}, +{0x1133b, 0x11344}, +{0x11347, 0x11348}, +{0x1134b, 0x1134d}, +{0x11350, 0x11350}, +{0x11357, 0x11357}, +{0x1135d, 0x11363}, +{0x11366, 0x1136c}, +{0x11370, 0x11374}, +{0x11400, 0x1145b}, +{0x1145d, 0x11461}, +{0x11480, 0x114c7}, +{0x114d0, 0x114d9}, +{0x11580, 0x115b5}, +{0x115b8, 0x115dd}, +{0x11600, 0x11644}, +{0x11650, 0x11659}, +{0x11660, 0x1166c}, +{0x11680, 0x116b9}, +{0x116c0, 0x116c9}, +{0x11700, 0x1171a}, +{0x1171d, 0x1172b}, +{0x11730, 0x11746}, +{0x11800, 0x1183b}, +{0x118a0, 0x118f2}, +{0x118ff, 0x11906}, +{0x11909, 0x11909}, +{0x1190c, 0x11913}, +{0x11915, 0x11916}, +{0x11918, 0x11935}, +{0x11937, 0x11938}, +{0x1193b, 0x11946}, +{0x11950, 0x11959}, +{0x119a0, 0x119a7}, +{0x119aa, 0x119d7}, +{0x119da, 0x119e4}, +{0x11a00, 0x11a47}, +{0x11a50, 0x11aa2}, +{0x11ab0, 0x11af8}, +{0x11b00, 0x11b09}, +{0x11c00, 0x11c08}, +{0x11c0a, 0x11c36}, +{0x11c38, 0x11c45}, +{0x11c50, 0x11c6c}, +{0x11c70, 0x11c8f}, +{0x11c92, 0x11ca7}, +{0x11ca9, 0x11cb6}, +{0x11d00, 0x11d06}, +{0x11d08, 0x11d09}, +{0x11d0b, 0x11d36}, +{0x11d3a, 0x11d3a}, +{0x11d3c, 0x11d3d}, +{0x11d3f, 0x11d47}, +{0x11d50, 0x11d59}, 
+{0x11d60, 0x11d65}, +{0x11d67, 0x11d68}, +{0x11d6a, 0x11d8e}, +{0x11d90, 0x11d91}, +{0x11d93, 0x11d98}, +{0x11da0, 0x11da9}, +{0x11ee0, 0x11ef8}, +{0x11f00, 0x11f10}, +{0x11f12, 0x11f3a}, +{0x11f3e, 0x11f59}, +{0x11fb0, 0x11fb0}, +{0x11fc0, 0x11ff1}, +{0x11fff, 0x12399}, +{0x12400, 0x1246e}, +{0x12470, 0x12474}, +{0x12480, 0x12543}, +{0x12f90, 0x12ff2}, +{0x13000, 0x13455}, +{0x14400, 0x14646}, +{0x16800, 0x16a38}, +{0x16a40, 0x16a5e}, +{0x16a60, 0x16a69}, +{0x16a6e, 0x16abe}, +{0x16ac0, 0x16ac9}, +{0x16ad0, 0x16aed}, +{0x16af0, 0x16af5}, +{0x16b00, 0x16b45}, +{0x16b50, 0x16b59}, +{0x16b5b, 0x16b61}, +{0x16b63, 0x16b77}, +{0x16b7d, 0x16b8f}, +{0x16e40, 0x16e9a}, +{0x16f00, 0x16f4a}, +{0x16f4f, 0x16f87}, +{0x16f8f, 0x16f9f}, +{0x16fe0, 0x16fe4}, +{0x16ff0, 0x16ff1}, +{0x17000, 0x187f7}, +{0x18800, 0x18cd5}, +{0x18d00, 0x18d08}, +{0x1aff0, 0x1aff3}, +{0x1aff5, 0x1affb}, +{0x1affd, 0x1affe}, +{0x1b000, 0x1b122}, +{0x1b132, 0x1b132}, +{0x1b150, 0x1b152}, +{0x1b155, 0x1b155}, +{0x1b164, 0x1b167}, +{0x1b170, 0x1b2fb}, +{0x1bc00, 0x1bc6a}, +{0x1bc70, 0x1bc7c}, +{0x1bc80, 0x1bc88}, +{0x1bc90, 0x1bc99}, +{0x1bc9c, 0x1bca3}, +{0x1cf00, 0x1cf2d}, +{0x1cf30, 0x1cf46}, +{0x1cf50, 0x1cfc3}, +{0x1d000, 0x1d0f5}, +{0x1d100, 0x1d126}, +{0x1d129, 0x1d1ea}, +{0x1d200, 0x1d245}, +{0x1d2c0, 0x1d2d3}, +{0x1d2e0, 0x1d2f3}, +{0x1d300, 0x1d356}, +{0x1d360, 0x1d378}, +{0x1d400, 0x1d454}, +{0x1d456, 0x1d49c}, +{0x1d49e, 0x1d49f}, +{0x1d4a2, 0x1d4a2}, +{0x1d4a5, 0x1d4a6}, +{0x1d4a9, 0x1d4ac}, +{0x1d4ae, 0x1d4b9}, +{0x1d4bb, 0x1d4bb}, +{0x1d4bd, 0x1d4c3}, +{0x1d4c5, 0x1d505}, +{0x1d507, 0x1d50a}, +{0x1d50d, 0x1d514}, +{0x1d516, 0x1d51c}, +{0x1d51e, 0x1d539}, +{0x1d53b, 0x1d53e}, +{0x1d540, 0x1d544}, +{0x1d546, 0x1d546}, +{0x1d54a, 0x1d550}, +{0x1d552, 0x1d6a5}, +{0x1d6a8, 0x1d7cb}, +{0x1d7ce, 0x1da8b}, +{0x1da9b, 0x1da9f}, +{0x1daa1, 0x1daaf}, +{0x1df00, 0x1df1e}, +{0x1df25, 0x1df2a}, +{0x1e000, 0x1e006}, +{0x1e008, 0x1e018}, +{0x1e01b, 0x1e021}, +{0x1e023, 0x1e024}, +{0x1e026, 0x1e02a}, 
+{0x1e030, 0x1e06d}, +{0x1e08f, 0x1e08f}, +{0x1e100, 0x1e12c}, +{0x1e130, 0x1e13d}, +{0x1e140, 0x1e149}, +{0x1e14e, 0x1e14f}, +{0x1e290, 0x1e2ae}, +{0x1e2c0, 0x1e2f9}, +{0x1e2ff, 0x1e2ff}, +{0x1e4d0, 0x1e4f9}, +{0x1e7e0, 0x1e7e6}, +{0x1e7e8, 0x1e7eb}, +{0x1e7ed, 0x1e7ee}, +{0x1e7f0, 0x1e7fe}, +{0x1e800, 0x1e8c4}, +{0x1e8c7, 0x1e8d6}, +{0x1e900, 0x1e94b}, +{0x1e950, 0x1e959}, +{0x1e95e, 0x1e95f}, +{0x1ec71, 0x1ecb4}, +{0x1ed01, 0x1ed3d}, +{0x1ee00, 0x1ee03}, +{0x1ee05, 0x1ee1f}, +{0x1ee21, 0x1ee22}, +{0x1ee24, 0x1ee24}, +{0x1ee27, 0x1ee27}, +{0x1ee29, 0x1ee32}, +{0x1ee34, 0x1ee37}, +{0x1ee39, 0x1ee39}, +{0x1ee3b, 0x1ee3b}, +{0x1ee42, 0x1ee42}, +{0x1ee47, 0x1ee47}, +{0x1ee49, 0x1ee49}, +{0x1ee4b, 0x1ee4b}, +{0x1ee4d, 0x1ee4f}, +{0x1ee51, 0x1ee52}, +{0x1ee54, 0x1ee54}, +{0x1ee57, 0x1ee57}, +{0x1ee59, 0x1ee59}, +{0x1ee5b, 0x1ee5b}, +{0x1ee5d, 0x1ee5d}, +{0x1ee5f, 0x1ee5f}, +{0x1ee61, 0x1ee62}, +{0x1ee64, 0x1ee64}, +{0x1ee67, 0x1ee6a}, +{0x1ee6c, 0x1ee72}, +{0x1ee74, 0x1ee77}, +{0x1ee79, 0x1ee7c}, +{0x1ee7e, 0x1ee7e}, +{0x1ee80, 0x1ee89}, +{0x1ee8b, 0x1ee9b}, +{0x1eea1, 0x1eea3}, +{0x1eea5, 0x1eea9}, +{0x1eeab, 0x1eebb}, +{0x1eef0, 0x1eef1}, +{0x1f000, 0x1f02b}, +{0x1f030, 0x1f093}, +{0x1f0a0, 0x1f0ae}, +{0x1f0b1, 0x1f0bf}, +{0x1f0c1, 0x1f0cf}, +{0x1f0d1, 0x1f0f5}, +{0x1f100, 0x1f1ad}, +{0x1f1e6, 0x1f202}, +{0x1f210, 0x1f23b}, +{0x1f240, 0x1f248}, +{0x1f250, 0x1f251}, +{0x1f260, 0x1f265}, +{0x1f300, 0x1f6d7}, +{0x1f6dc, 0x1f6ec}, +{0x1f6f0, 0x1f6fc}, +{0x1f700, 0x1f776}, +{0x1f77b, 0x1f7d9}, +{0x1f7e0, 0x1f7eb}, +{0x1f7f0, 0x1f7f0}, +{0x1f800, 0x1f80b}, +{0x1f810, 0x1f847}, +{0x1f850, 0x1f859}, +{0x1f860, 0x1f887}, +{0x1f890, 0x1f8ad}, +{0x1f8b0, 0x1f8b1}, +{0x1f900, 0x1fa53}, +{0x1fa60, 0x1fa6d}, +{0x1fa70, 0x1fa7c}, +{0x1fa80, 0x1fa88}, +{0x1fa90, 0x1fabd}, +{0x1fabf, 0x1fac5}, +{0x1face, 0x1fadb}, +{0x1fae0, 0x1fae8}, +{0x1faf0, 0x1faf8}, +{0x1fb00, 0x1fb92}, +{0x1fb94, 0x1fbca}, +{0x1fbf0, 0x1fbf9}, +{0x20000, 0x2a6df}, +{0x2a700, 0x2b739}, +{0x2b740, 0x2b81d}, 
+{0x2b820, 0x2cea1}, +{0x2ceb0, 0x2ebe0}, +{0x2f800, 0x2fa1d}, +{0x30000, 0x3134a}, +{0x31350, 0x323af}, +{0xe0001, 0xe0001}, +{0xe0020, 0xe007f}, +{0xe0100, 0xe01ef}, +{0xf0000, 0xffffd}, +{0x100000, 0x10fffd}, diff --git a/unicode/read_ucd.py b/unicode/read_ucd.py index 03bc5006..ba9f0f31 100755 --- a/unicode/read_ucd.py +++ b/unicode/read_ucd.py @@ -109,6 +109,8 @@ class Main: self.write_wide_chars_list(fh) with open("ambiguous_wide_chars.h", "w") as fh: self.write_ambiguous_wide_chars_list(fh) + with open("known_chars.h", "w") as fh: + self.write_known_chars_table(fh) with open("combining_classes.h", "w") as fh: self.write_combining_class_table(fh) with open("canonical_decomp.h", "w") as fh: @@ -417,6 +419,20 @@ Used by utils/wcwidth.c. """) self.write_width_table(fh, {'A'}) + def write_known_chars_table(self, fh): + self.write_file_header_comment(fh, """ + +List the Unicode code points that are known to this version of the +standard at all. + +Used by utils/unicode-known.c. + +""") + chars = set(rec.c for rec in self.UnicodeData) + + for start, end in set_to_ranges(chars): + print(f"{{0x{start:04x}, 0x{end:04x}}},", file=fh) + def write_combining_class_table(self, fh): self.write_file_header_comment(fh, """ diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt index 1f64225f..3cf1cbd9 100644 --- a/utils/CMakeLists.txt +++ b/utils/CMakeLists.txt @@ -65,6 +65,7 @@ add_sources_from_current_dir(utils stripctrl.c tempseat.c tree234.c + unicode-known.c unicode-norm.c validate_manual_hostkey.c version.c diff --git a/utils/unicode-known.c b/utils/unicode-known.c new file mode 100644 index 00000000..bfa63d70 --- /dev/null +++ b/utils/unicode-known.c @@ -0,0 +1,53 @@ +/* + * Check a UTF-8 string to ensure every character in it is part of the + * version of Unicode that we understand. 
+ *
+ * (If it isn't, then we don't know what combining properties it has,
+ * so we can't safely NFC it and rely on the result not changing when
+ * we later update our Unicode version.)
+ */
+
+#include "misc.h"
+#include "unicode/version.h"
+
+/*
+ * Return true if code point c lies inside one of the inclusive
+ * {start, end} ranges listed in unicode/known_chars.h, and false
+ * otherwise.
+ *
+ * The lookup is a binary search, so it depends on that table being
+ * sorted in ascending order with no overlapping ranges.
+ */
+static bool known(unsigned c)
+{
+    struct range {
+        unsigned start, end;           /* both ends are inclusive */
+    };
+    static const struct range ranges[] = {
+        /* The included file consists solely of {start, end}, entries. */
+        #include "unicode/known_chars.h"
+    };
+
+    /* Half-open search window [start, end) over the table. */
+    const struct range *start = ranges, *end = start + lenof(ranges);
+
+    while (end > start) {
+        const struct range *curr = start + (end-start) / 2;
+        if (c < curr->start)
+            end = curr;                /* c can only be in the lower half */
+        else if (c > curr->end)
+            start = curr + 1;          /* c can only be in the upper half */
+        else
+            return true;               /* curr->start <= c <= curr->end */
+    }
+
+    /* The window shrank to nothing without finding a containing range. */
+    return false;
+}
+
+/*
+ * Scan a UTF-8 string for code points that are not listed in our
+ * table of known characters.
+ *
+ * Returns NULL if every decoded code point is known. Otherwise
+ * returns a message built by dupprintf describing the first unknown
+ * code point, its zero-based character index, a byte position, and
+ * the Unicode version we support.
+ *
+ * NOTE(review): the returned string is presumably dynamically
+ * allocated by dupprintf and owned by the caller - confirm.
+ * NOTE(review): how decode_utf8 reports malformed input is not
+ * visible here; whatever value it yields is simply looked up in the
+ * table like any other code point.
+ */
+char *utf8_unknown_char(ptrlen input)
+{
+    BinarySource src[1];
+    BinarySource_BARE_INIT_PL(src, input);
+
+    /* nchars counts the characters decoded before the current one. */
+    for (size_t nchars = 0; get_avail(src); nchars++) {
+        unsigned c = decode_utf8(src);
+        /*
+         * NOTE(review): src->pos is read after decode_utf8 has
+         * consumed the character, so the byte position reported below
+         * looks like the offset just past the offending character
+         * rather than its first byte - confirm this is intended.
+         */
+        if (!known(c))
+            return dupprintf(
+                "cannot stably normalise this string: code point %04X "
+                "(at character position %"SIZEu", byte position %"SIZEu") "
+                "is not in Unicode %s", c, nchars, src->pos,
+                UNICODE_VERSION_SHORT);
+    }
+
+    return NULL;
+}