/* * Check a UTF-8 string to ensure every character in it is part of the * version of Unicode that we understand. * * (If it isn't, then we don't know what combining properties it has, * so we can't safely NFC it and rely on the result not changing when * we later update our Unicode version.) */ #include "misc.h" #include "unicode/version.h" static bool known(unsigned c) { struct range { unsigned start, end; }; static const struct range ranges[] = { #include "unicode/known_chars.h" }; const struct range *start = ranges, *end = start + lenof(ranges); while (end > start) { const struct range *curr = start + (end-start) / 2; if (c < curr->start) end = curr; else if (c > curr->end) start = curr + 1; else return true; } return false; }; char *utf8_unknown_char(ptrlen input) { BinarySource src[1]; BinarySource_BARE_INIT_PL(src, input); for (size_t nchars = 0; get_avail(src); nchars++) { DecodeUTF8Failure err; unsigned c = decode_utf8(src, &err); if (err != DUTF8_SUCCESS) return dupprintf( "cannot normalise this string: UTF-8 decoding error " "at character position %"SIZEu", byte position %"SIZEu": %s", nchars, src->pos, decode_utf8_error_strings[err]); if (!known(c)) return dupprintf( "cannot stably normalise this string: code point %04X " "(at character position %"SIZEu", byte position %"SIZEu") " "is not in Unicode %s", c, nchars, src->pos, UNICODE_VERSION_SHORT); } return NULL; }