1
0
mirror of https://git.tartarus.org/simon/putty.git synced 2025-01-09 17:38:00 +00:00
putty-source/utils/unicode-norm.c
Simon Tatham 9e01de7c2b decode_utf8: add an enumeration of failure reasons.
Now you can optionally get back an enum value indicating whether the
character was successfully decoded, or whether U+FFFD was substituted
due to some kind of problem, and if the latter, what problem.

For a start, this allows distinguishing 'real' U+FFFD (encoded
legitimately in the input) from one invented by the decoder. Also, it
allows the recipient of the decode to treat failures differently,
either by passing on a useful error report to the user (as
utf8_unknown_char now does) or by doing something special.

In particular, there are two distinct error codes for a truncated
UTF-8 encoding, depending on whether it was truncated by the end of
the input or by encountering a non-continuation byte. The former code
means that the string is not legal UTF-8 _as it is_, but doesn't rule
out it being a (bytewise) prefix of a legal UTF-8 string - so if a
client is receiving UTF-8 data a byte at a time, they can treat that
error code specially and not make it a fatal error.
2023-02-17 17:16:54 +00:00

454 lines
12 KiB
C

#include <stdio.h>
#include <string.h>
#include "misc.h"
typedef uint32_t uchar;
typedef int cclass_t;
/* A local uchar-oriented analogue of strbuf */
typedef struct ucharbuf {
uchar *buf;
size_t len, size;
} ucharbuf;
static ucharbuf *ucharbuf_new(void)
{
ucharbuf *ub = snew(ucharbuf);
ub->buf = NULL;
ub->len = ub->size = 0;
return ub;
}
static void ucharbuf_append(ucharbuf *ub, uchar c)
{
/* Use the _nm variant because this is used for passphrases */
sgrowarray_nm(ub->buf, ub->size, ub->len);
ub->buf[ub->len++] = c;
}
static void ucharbuf_free(ucharbuf *ub)
{
if (ub->buf) {
memset(ub->buf, 0, ub->size * sizeof(*ub->buf));
sfree(ub->buf);
}
sfree(ub);
}
/*
* Constants relating to the arithmetic decomposition mapping of
* Hangul to jamo, from section 3.12 of Unicode 15.0.0. The following
* constant names match those in the spec.
*/
enum {
SBase = 0xAC00, /* base index for precomposed Hangul */
LBase = 0x1100, /* base index for L (leading consonant) jamo */
VBase = 0x1161, /* base index for V (vowel) jamo */
TBase = 0x11A7, /* base index for T (trailing consonant) jamo */
LCount = 19, /* number of L jamo */
VCount = 21, /* number of V jamo */
TCount = 28, /* number of T jamo, including not having one at all */
NCount = VCount * TCount, /* number of Hangul for each L jamo */
SCount = LCount * NCount, /* number of Hangul in total */
};
static cclass_t combining_class(uchar c)
{
struct range {
uchar start, end;
cclass_t cclass;
};
static const struct range ranges[] = {
#include "unicode/combining_classes.h"
};
const struct range *start = ranges, *end = start + lenof(ranges);
while (end > start) {
const struct range *curr = start + (end-start) / 2;
if (c < curr->start)
end = curr;
else if (c > curr->end)
start = curr + 1;
else
return curr->cclass;
}
return 0;
};
static unsigned decompose_char(uchar c, uchar *out)
{
struct decomp {
uchar composed, dec0, dec1;
};
static const struct decomp decomps[] = {
#include "unicode/canonical_decomp.h"
};
if (c - SBase < SCount) {
/* Arithmetically decompose a Hangul character into jamo */
uchar SIndex = c - SBase;
uchar LIndex = SIndex / NCount;
uchar VIndex = SIndex % NCount / TCount;
uchar TIndex = SIndex % TCount;
unsigned n = 0;
out[n++] = LBase + LIndex;
out[n++] = VBase + VIndex;
if (TIndex)
out[n++] = TBase + TIndex;
return n;
}
const struct decomp *start = decomps, *end = start + lenof(decomps);
while (end > start) {
const struct decomp *curr = start + (end-start) / 2;
if (c < curr->composed)
end = curr;
else if (c > curr->composed)
start = curr + 1;
else {
out[0] = curr->dec0;
if (curr->dec1) {
out[1] = curr->dec1;
return 2;
} else {
return 1;
}
}
}
return 0;
};
static uchar compose_chars(uchar S, uchar C)
{
struct comp {
uchar dec0, dec1, composed;
};
static const struct comp comps[] = {
#include "unicode/canonical_comp.h"
};
if (S - LBase < LCount && C - VBase < VCount) {
/* Arithmetically compose an L and V jamo into a Hangul LV
* character */
return SBase + (S - LBase) * NCount + (C - VBase) * TCount;
}
if (S - SBase < SCount && (S - SBase) % TCount == 0 &&
C - TBase < TCount) {
/* Arithmetically compose an LV Hangul character and a T jamo
* into a Hangul LVT character */
return S + C - TBase;
}
const struct comp *start = comps, *end = start + lenof(comps);
while (end > start) {
const struct comp *curr = start + (end-start) / 2;
if (S < curr->dec0)
end = curr;
else if (S > curr->dec0)
start = curr + 1;
else if (C < curr->dec1)
end = curr;
else if (C > curr->dec1)
start = curr + 1;
else
return curr->composed;
}
return 0;
};
/*
* Recursively decompose a sequence of Unicode characters. The output
* is written to 'out', as a sequence of native-byte-order uchar.
*/
static void recursively_decompose(const uchar *str, size_t len, ucharbuf *out)
{
uchar decomposed[3];
while (len-- > 0) {
uchar c = *str++;
unsigned n = decompose_char(c, decomposed);
if (n == 0) {
/* This character is indecomposable */
ucharbuf_append(out, c);
} else {
/* This character has been decomposed into up to 3
* characters, so we must now recursively decompose those */
recursively_decompose(decomposed, n, out);
}
}
}
/*
* Reorder combining marks according to the Canonical Ordering
* Algorithm (definition D109 in Unicode 15.0.0 section 3.11).
*
* The algorithm is phrased mechanistically, but the essence is: among
* any contiguous sequence of combining marks (that is, characters
* with cclass > 0), sort them by their cclass - but _stably_, i.e.
* breaking ties in cclass by preserving the original order of the
* characters in question.
*/
static void canonical_ordering(uchar *str, size_t len)
{
for (size_t i = 1; i < len; i++) {
cclass_t cclass = combining_class(str[i]);
if (cclass == 0)
continue;
size_t j = i;
while (j > 0 && combining_class(str[j-1]) > cclass) {
uchar tmp = str[j-1];
str[j-1] = str[j];
str[j] = tmp;
j--;
}
}
}
/*
* Canonically recompose characters according to the Canonical
* Composition Algorithm (definition D117 in Unicode 15.0.0 section
* 3.11).
*/
static size_t canonical_composition(uchar *str, size_t len)
{
const uchar *in = str;
uchar *out = str;
uchar *last_starter = NULL;
cclass_t highest_cclass_between = -1;
while (len > 0) {
len--;
uchar c = *in++;
cclass_t cclass = combining_class(c);
if (last_starter && highest_cclass_between < cclass) {
uchar composed = compose_chars(*last_starter, c);
if (composed) {
*last_starter = composed;
continue;
}
}
if (cclass == 0) {
last_starter = out;
highest_cclass_between = -1;
} else if (cclass > highest_cclass_between) {
highest_cclass_between = cclass;
}
*out++ = c;
}
return out - str;
}
/*
* Render a string into NFD.
*/
static ucharbuf *nfd(ucharbuf *input)
{
ucharbuf *output = ucharbuf_new();
/*
* Definition D118 in Unicode 15.0.0 section 3.11, referring to
* D68 in section 3.7: recursively decompose characters, then
* reorder combining marks.
*/
recursively_decompose(input->buf, input->len, output);
canonical_ordering(output->buf, output->len);
return output;
}
/*
* Render a string into NFC.
*/
static ucharbuf *nfc(ucharbuf *input)
{
/*
* Definition D120 in Unicode 15.0.0 section 3.11: render the
* string into NFD, then apply the canonical composition algorithm.
*/
ucharbuf *output = nfd(input);
output->len = canonical_composition(output->buf, output->len);
return output;
}
/*
* Convert a UTF-8 string into NFC, returning it as UTF-8 again.
*/
strbuf *utf8_to_nfc(ptrlen input)
{
BinarySource src[1];
BinarySource_BARE_INIT_PL(src, input);
ucharbuf *inbuf = ucharbuf_new();
while (get_avail(src))
ucharbuf_append(inbuf, decode_utf8(src, NULL));
ucharbuf *outbuf = nfc(inbuf);
strbuf *output = strbuf_new_nm();
for (size_t i = 0; i < outbuf->len; i++)
put_utf8_char(output, outbuf->buf[i]);
ucharbuf_free(inbuf);
ucharbuf_free(outbuf);
return output;
}
#ifdef TEST
void out_of_memory(void)
{
fprintf(stderr, "out of memory!\n");
exit(2);
}
static int pass, fail;
static void subtest(const char *filename, int lineno, const char *subdesc,
char nftype, ucharbuf *input, ucharbuf *expected)
{
/*
* Convert input into either NFC or NFD, and check it's equal to
* expected
*/
ucharbuf *nf;
switch (nftype) {
case 'C':
nf = nfc(input);
break;
case 'D':
nf = nfd(input);
break;
default:
unreachable("bad nftype");
}
if (nf->len == expected->len && !memcmp(nf->buf, expected->buf, nf->len)) {
pass++;
} else {
printf("%s:%d: failed %s: NF%c([", filename, lineno, subdesc, nftype);
for (size_t pos = 0; pos < input->len; pos += sizeof(uchar))
printf("%s%04X", pos ? " " : "", (unsigned)input->buf[pos]);
printf("]) -> [");
for (size_t pos = 0; pos < nf->len; pos += sizeof(uchar))
printf("%s%04X", pos ? " " : "", (unsigned)nf->buf[pos]);
printf("] != [");
for (size_t pos = 0; pos < expected->len; pos += sizeof(uchar))
printf("%s%04X", pos ? " " : "", (unsigned)expected->buf[pos]);
printf("]\n");
fail++;
}
ucharbuf_free(nf);
}
static void run_tests(const char *filename, FILE *fp)
{
for (int lineno = 1;; lineno++) {
char *line = chomp(fgetline(fp));
if (!line)
break;
/* Strip section dividers which begin with @ */
if (*line == '@') {
sfree(line);
continue;
}
/* Strip comments, if any */
ptrlen pl = ptrlen_from_asciz(line);
{
const char *p = memchr(pl.ptr, '#', pl.len);
if (p)
pl.len = p - (const char *)pl.ptr;
}
/* Strip trailing space */
while (pl.len > 0 &&
(((char *)pl.ptr)[pl.len-1] == ' ' ||
((char *)pl.ptr)[pl.len-1] == '\t'))
pl.len--;
/* Skip empty lines */
if (!pl.len) {
sfree(line);
continue;
}
/* Break up at semicolons, expecting five fields, each of
* which we decode into hex code points */
ucharbuf *fields[5];
for (size_t i = 0; i < lenof(fields); i++) {
ptrlen field = ptrlen_get_word(&pl, ";");
fields[i] = ucharbuf_new();
ptrlen chr;
while ((chr = ptrlen_get_word(&field, " ")).len) {
char *chrstr = mkstr(chr);
uchar c = strtoul(chrstr, NULL, 16);
sfree(chrstr);
ucharbuf_append(fields[i], c);
}
}
subtest(filename, lineno, "NFC(c1) = c2", 'C', fields[0], fields[1]);
subtest(filename, lineno, "NFC(c2) = c2", 'C', fields[1], fields[1]);
subtest(filename, lineno, "NFC(c3) = c2", 'C', fields[2], fields[1]);
subtest(filename, lineno, "NFC(c4) = c4", 'C', fields[3], fields[3]);
subtest(filename, lineno, "NFC(c5) = c4", 'C', fields[4], fields[3]);
subtest(filename, lineno, "NFD(c1) = c3", 'D', fields[0], fields[2]);
subtest(filename, lineno, "NFD(c2) = c3", 'D', fields[1], fields[2]);
subtest(filename, lineno, "NFD(c3) = c3", 'D', fields[2], fields[2]);
subtest(filename, lineno, "NFD(c4) = c5", 'D', fields[3], fields[4]);
subtest(filename, lineno, "NFD(c5) = c5", 'D', fields[4], fields[4]);
for (size_t i = 0; i < lenof(fields); i++)
ucharbuf_free(fields[i]);
sfree(line);
}
}
int main(int argc, char **argv)
{
if (argc != 2) {
fprintf(stderr, "test_unicode_norm: give an input file "
"of tests or '-'\n");
return 1;
}
const char *filename = argv[1];
if (!strcmp(filename, "-")) {
run_tests("<standard input>", stdin);
} else {
FILE *fp = fopen(filename, "r");
if (!fp) {
fprintf(stderr, "test_unicode_norm: unable to open '%s'\n",
filename);
return 1;
}
run_tests(filename, fp);
fclose(fp);
}
printf("pass %d fail %d total %d\n", pass, fail, pass + fail);
return fail != 0;
}
#endif