1
0
mirror of https://git.tartarus.org/simon/putty.git synced 2025-01-09 17:38:00 +00:00
putty-source/utils/unicode-known.c

60 lines
1.7 KiB
C
Raw Normal View History

/*
* Check a UTF-8 string to ensure every character in it is part of the
* version of Unicode that we understand.
*
* (If it isn't, then we don't know what combining properties it has,
* so we can't safely NFC it and rely on the result not changing when
* we later update our Unicode version.)
*/
#include "misc.h"
#include "unicode/version.h"
/*
 * Report whether code point c lies inside one of the inclusive
 * [start, end] ranges listed in unicode/known_chars.h.
 *
 * Returns true if some range contains c, false otherwise.
 *
 * The lookup is a binary search, so it relies on the included table
 * being sorted in ascending order with non-overlapping ranges. That
 * property is not visible from this file -- confirm against whatever
 * generates known_chars.h.
 */
static bool known(unsigned c)
{
    struct range {
        unsigned start, end;
    };
    static const struct range ranges[] = {
        #include "unicode/known_chars.h"
    };

    /* Half-open search window [lo, hi) over the table. */
    const struct range *lo = ranges, *hi = ranges + lenof(ranges);

    while (hi > lo) {
        const struct range *mid = lo + (hi - lo) / 2;

        if (c < mid->start) {
            hi = mid;           /* c can only be in an earlier range */
        } else if (c > mid->end) {
            lo = mid + 1;       /* c can only be in a later range */
        } else {
            return true;        /* mid->start <= c <= mid->end */
        }
    }

    return false;
}
/*
 * Walk a UTF-8 string one code point at a time and check that each
 * one is accepted by known().
 *
 * Returns NULL if every character decodes successfully and is known.
 * Otherwise returns the string built by dupprintf() describing the
 * first problem found: either a UTF-8 decoding error, or a code point
 * outside the supported Unicode version. (Presumably that string is
 * dynamically allocated and owned by the caller -- confirm against
 * dupprintf's declaration.)
 *
 * In both messages the character position is the zero-based count of
 * characters successfully processed before the offending one, while
 * the byte position is the source's read position as it stands
 * after the decode_utf8() call for the offending character.
 */
char *utf8_unknown_char(ptrlen input)
{
    BinarySource bs[1];
    size_t nchars = 0;

    BinarySource_BARE_INIT_PL(bs, input);

    while (get_avail(bs)) {
        DecodeUTF8Failure failure;
        unsigned codepoint = decode_utf8(bs, &failure);

        if (failure != DUTF8_SUCCESS) {
            return dupprintf(
                "cannot normalise this string: UTF-8 decoding error "
                "at character position %"SIZEu", byte position %"SIZEu": %s",
                nchars, bs->pos, decode_utf8_error_strings[failure]);
        }

        if (!known(codepoint)) {
            return dupprintf(
                "cannot stably normalise this string: code point %04X "
                "(at character position %"SIZEu", byte position %"SIZEu") "
                "is not in Unicode %s", codepoint, nchars, bs->pos,
                UNICODE_VERSION_SHORT);
        }

        /* This character was fine; move on to the next one. */
        nchars++;
    }

    return NULL;
}