mirror of
https://git.tartarus.org/simon/putty.git
synced 2025-01-09 01:18:00 +00:00
9e01de7c2b
Now you can optionally get back an enum value indicating whether the character was successfully decoded, or whether U+FFFD was substituted due to some kind of problem, and if the latter, what problem. For a start, this allows distinguishing 'real' U+FFFD (encoded legitimately in the input) from one invented by the decoder. Also, it allows the recipient of the decode to treat failures differently, either by passing on a useful error report to the user (as utf8_unknown_char now does) or by doing something special. In particular, there are two distinct error codes for a truncated UTF-8 encoding, depending on whether it was truncated by the end of the input or by encountering a non-continuation byte. The former code means that the string is not legal UTF-8 _as it is_, but doesn't rule out it being a (bytewise) prefix of a legal UTF-8 string - so if a client is receiving UTF-8 data a byte at a time, they can treat that error code specially and not make it a fatal error.
454 lines
12 KiB
C
454 lines
12 KiB
C
#include <stdio.h>
|
|
#include <string.h>
|
|
#include "misc.h"
|
|
|
|
typedef uint32_t uchar;
|
|
typedef int cclass_t;
|
|
|
|
/* A local uchar-oriented analogue of strbuf */
|
|
typedef struct ucharbuf {
|
|
uchar *buf;
|
|
size_t len, size;
|
|
} ucharbuf;
|
|
|
|
static ucharbuf *ucharbuf_new(void)
|
|
{
|
|
ucharbuf *ub = snew(ucharbuf);
|
|
ub->buf = NULL;
|
|
ub->len = ub->size = 0;
|
|
return ub;
|
|
}
|
|
|
|
static void ucharbuf_append(ucharbuf *ub, uchar c)
|
|
{
|
|
/* Use the _nm variant because this is used for passphrases */
|
|
sgrowarray_nm(ub->buf, ub->size, ub->len);
|
|
ub->buf[ub->len++] = c;
|
|
}
|
|
|
|
static void ucharbuf_free(ucharbuf *ub)
|
|
{
|
|
if (ub->buf) {
|
|
memset(ub->buf, 0, ub->size * sizeof(*ub->buf));
|
|
sfree(ub->buf);
|
|
}
|
|
sfree(ub);
|
|
}
|
|
|
|
/*
|
|
* Constants relating to the arithmetic decomposition mapping of
|
|
* Hangul to jamo, from section 3.12 of Unicode 15.0.0. The following
|
|
* constant names match those in the spec.
|
|
*/
|
|
enum {
|
|
SBase = 0xAC00, /* base index for precomposed Hangul */
|
|
LBase = 0x1100, /* base index for L (leading consonant) jamo */
|
|
VBase = 0x1161, /* base index for V (vowel) jamo */
|
|
TBase = 0x11A7, /* base index for T (trailing consonant) jamo */
|
|
LCount = 19, /* number of L jamo */
|
|
VCount = 21, /* number of V jamo */
|
|
TCount = 28, /* number of T jamo, including not having one at all */
|
|
NCount = VCount * TCount, /* number of Hangul for each L jamo */
|
|
SCount = LCount * NCount, /* number of Hangul in total */
|
|
};
|
|
|
|
static cclass_t combining_class(uchar c)
|
|
{
|
|
struct range {
|
|
uchar start, end;
|
|
cclass_t cclass;
|
|
};
|
|
static const struct range ranges[] = {
|
|
#include "unicode/combining_classes.h"
|
|
};
|
|
|
|
const struct range *start = ranges, *end = start + lenof(ranges);
|
|
|
|
while (end > start) {
|
|
const struct range *curr = start + (end-start) / 2;
|
|
if (c < curr->start)
|
|
end = curr;
|
|
else if (c > curr->end)
|
|
start = curr + 1;
|
|
else
|
|
return curr->cclass;
|
|
}
|
|
|
|
return 0;
|
|
};
|
|
|
|
static unsigned decompose_char(uchar c, uchar *out)
|
|
{
|
|
struct decomp {
|
|
uchar composed, dec0, dec1;
|
|
};
|
|
static const struct decomp decomps[] = {
|
|
#include "unicode/canonical_decomp.h"
|
|
};
|
|
|
|
if (c - SBase < SCount) {
|
|
/* Arithmetically decompose a Hangul character into jamo */
|
|
uchar SIndex = c - SBase;
|
|
uchar LIndex = SIndex / NCount;
|
|
uchar VIndex = SIndex % NCount / TCount;
|
|
uchar TIndex = SIndex % TCount;
|
|
|
|
unsigned n = 0;
|
|
out[n++] = LBase + LIndex;
|
|
out[n++] = VBase + VIndex;
|
|
if (TIndex)
|
|
out[n++] = TBase + TIndex;
|
|
return n;
|
|
}
|
|
|
|
const struct decomp *start = decomps, *end = start + lenof(decomps);
|
|
|
|
while (end > start) {
|
|
const struct decomp *curr = start + (end-start) / 2;
|
|
if (c < curr->composed)
|
|
end = curr;
|
|
else if (c > curr->composed)
|
|
start = curr + 1;
|
|
else {
|
|
out[0] = curr->dec0;
|
|
if (curr->dec1) {
|
|
out[1] = curr->dec1;
|
|
return 2;
|
|
} else {
|
|
return 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
};
|
|
|
|
static uchar compose_chars(uchar S, uchar C)
|
|
{
|
|
struct comp {
|
|
uchar dec0, dec1, composed;
|
|
};
|
|
static const struct comp comps[] = {
|
|
#include "unicode/canonical_comp.h"
|
|
};
|
|
|
|
if (S - LBase < LCount && C - VBase < VCount) {
|
|
/* Arithmetically compose an L and V jamo into a Hangul LV
|
|
* character */
|
|
return SBase + (S - LBase) * NCount + (C - VBase) * TCount;
|
|
}
|
|
|
|
if (S - SBase < SCount && (S - SBase) % TCount == 0 &&
|
|
C - TBase < TCount) {
|
|
/* Arithmetically compose an LV Hangul character and a T jamo
|
|
* into a Hangul LVT character */
|
|
return S + C - TBase;
|
|
}
|
|
|
|
const struct comp *start = comps, *end = start + lenof(comps);
|
|
|
|
while (end > start) {
|
|
const struct comp *curr = start + (end-start) / 2;
|
|
if (S < curr->dec0)
|
|
end = curr;
|
|
else if (S > curr->dec0)
|
|
start = curr + 1;
|
|
else if (C < curr->dec1)
|
|
end = curr;
|
|
else if (C > curr->dec1)
|
|
start = curr + 1;
|
|
else
|
|
return curr->composed;
|
|
}
|
|
|
|
return 0;
|
|
};
|
|
|
|
/*
|
|
* Recursively decompose a sequence of Unicode characters. The output
|
|
* is written to 'out', as a sequence of native-byte-order uchar.
|
|
*/
|
|
static void recursively_decompose(const uchar *str, size_t len, ucharbuf *out)
|
|
{
|
|
uchar decomposed[3];
|
|
|
|
while (len-- > 0) {
|
|
uchar c = *str++;
|
|
unsigned n = decompose_char(c, decomposed);
|
|
if (n == 0) {
|
|
/* This character is indecomposable */
|
|
ucharbuf_append(out, c);
|
|
} else {
|
|
/* This character has been decomposed into up to 3
|
|
* characters, so we must now recursively decompose those */
|
|
recursively_decompose(decomposed, n, out);
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Reorder combining marks according to the Canonical Ordering
|
|
* Algorithm (definition D109 in Unicode 15.0.0 section 3.11).
|
|
*
|
|
* The algorithm is phrased mechanistically, but the essence is: among
|
|
* any contiguous sequence of combining marks (that is, characters
|
|
* with cclass > 0), sort them by their cclass - but _stably_, i.e.
|
|
* breaking ties in cclass by preserving the original order of the
|
|
* characters in question.
|
|
*/
|
|
static void canonical_ordering(uchar *str, size_t len)
|
|
{
|
|
for (size_t i = 1; i < len; i++) {
|
|
cclass_t cclass = combining_class(str[i]);
|
|
if (cclass == 0)
|
|
continue;
|
|
|
|
size_t j = i;
|
|
while (j > 0 && combining_class(str[j-1]) > cclass) {
|
|
uchar tmp = str[j-1];
|
|
str[j-1] = str[j];
|
|
str[j] = tmp;
|
|
|
|
j--;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Canonically recompose characters according to the Canonical
|
|
* Composition Algorithm (definition D117 in Unicode 15.0.0 section
|
|
* 3.11).
|
|
*/
|
|
static size_t canonical_composition(uchar *str, size_t len)
|
|
{
|
|
const uchar *in = str;
|
|
uchar *out = str;
|
|
uchar *last_starter = NULL;
|
|
cclass_t highest_cclass_between = -1;
|
|
|
|
while (len > 0) {
|
|
len--;
|
|
uchar c = *in++;
|
|
cclass_t cclass = combining_class(c);
|
|
|
|
if (last_starter && highest_cclass_between < cclass) {
|
|
uchar composed = compose_chars(*last_starter, c);
|
|
if (composed) {
|
|
*last_starter = composed;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
if (cclass == 0) {
|
|
last_starter = out;
|
|
highest_cclass_between = -1;
|
|
} else if (cclass > highest_cclass_between) {
|
|
highest_cclass_between = cclass;
|
|
}
|
|
|
|
*out++ = c;
|
|
}
|
|
|
|
return out - str;
|
|
}
|
|
|
|
/*
|
|
* Render a string into NFD.
|
|
*/
|
|
static ucharbuf *nfd(ucharbuf *input)
|
|
{
|
|
ucharbuf *output = ucharbuf_new();
|
|
|
|
/*
|
|
* Definition D118 in Unicode 15.0.0 section 3.11, referring to
|
|
* D68 in section 3.7: recursively decompose characters, then
|
|
* reorder combining marks.
|
|
*/
|
|
recursively_decompose(input->buf, input->len, output);
|
|
canonical_ordering(output->buf, output->len);
|
|
|
|
return output;
|
|
}
|
|
|
|
/*
|
|
* Render a string into NFC.
|
|
*/
|
|
static ucharbuf *nfc(ucharbuf *input)
|
|
{
|
|
/*
|
|
* Definition D120 in Unicode 15.0.0 section 3.11: render the
|
|
* string into NFD, then apply the canonical composition algorithm.
|
|
*/
|
|
ucharbuf *output = nfd(input);
|
|
output->len = canonical_composition(output->buf, output->len);
|
|
|
|
return output;
|
|
}
|
|
|
|
/*
|
|
* Convert a UTF-8 string into NFC, returning it as UTF-8 again.
|
|
*/
|
|
strbuf *utf8_to_nfc(ptrlen input)
|
|
{
|
|
BinarySource src[1];
|
|
BinarySource_BARE_INIT_PL(src, input);
|
|
|
|
ucharbuf *inbuf = ucharbuf_new();
|
|
while (get_avail(src))
|
|
ucharbuf_append(inbuf, decode_utf8(src, NULL));
|
|
|
|
ucharbuf *outbuf = nfc(inbuf);
|
|
|
|
strbuf *output = strbuf_new_nm();
|
|
for (size_t i = 0; i < outbuf->len; i++)
|
|
put_utf8_char(output, outbuf->buf[i]);
|
|
|
|
ucharbuf_free(inbuf);
|
|
ucharbuf_free(outbuf);
|
|
|
|
return output;
|
|
}
|
|
|
|
#ifdef TEST
|
|
void out_of_memory(void)
|
|
{
|
|
fprintf(stderr, "out of memory!\n");
|
|
exit(2);
|
|
}
|
|
|
|
static int pass, fail;
|
|
|
|
static void subtest(const char *filename, int lineno, const char *subdesc,
|
|
char nftype, ucharbuf *input, ucharbuf *expected)
|
|
{
|
|
/*
|
|
* Convert input into either NFC or NFD, and check it's equal to
|
|
* expected
|
|
*/
|
|
ucharbuf *nf;
|
|
switch (nftype) {
|
|
case 'C':
|
|
nf = nfc(input);
|
|
break;
|
|
case 'D':
|
|
nf = nfd(input);
|
|
break;
|
|
default:
|
|
unreachable("bad nftype");
|
|
}
|
|
|
|
if (nf->len == expected->len && !memcmp(nf->buf, expected->buf, nf->len)) {
|
|
pass++;
|
|
} else {
|
|
printf("%s:%d: failed %s: NF%c([", filename, lineno, subdesc, nftype);
|
|
for (size_t pos = 0; pos < input->len; pos += sizeof(uchar))
|
|
printf("%s%04X", pos ? " " : "", (unsigned)input->buf[pos]);
|
|
printf("]) -> [");
|
|
for (size_t pos = 0; pos < nf->len; pos += sizeof(uchar))
|
|
printf("%s%04X", pos ? " " : "", (unsigned)nf->buf[pos]);
|
|
printf("] != [");
|
|
for (size_t pos = 0; pos < expected->len; pos += sizeof(uchar))
|
|
printf("%s%04X", pos ? " " : "", (unsigned)expected->buf[pos]);
|
|
printf("]\n");
|
|
fail++;
|
|
}
|
|
|
|
ucharbuf_free(nf);
|
|
}
|
|
|
|
static void run_tests(const char *filename, FILE *fp)
|
|
{
|
|
for (int lineno = 1;; lineno++) {
|
|
char *line = chomp(fgetline(fp));
|
|
if (!line)
|
|
break;
|
|
|
|
/* Strip section dividers which begin with @ */
|
|
if (*line == '@') {
|
|
sfree(line);
|
|
continue;
|
|
}
|
|
|
|
/* Strip comments, if any */
|
|
ptrlen pl = ptrlen_from_asciz(line);
|
|
{
|
|
const char *p = memchr(pl.ptr, '#', pl.len);
|
|
if (p)
|
|
pl.len = p - (const char *)pl.ptr;
|
|
}
|
|
|
|
/* Strip trailing space */
|
|
while (pl.len > 0 &&
|
|
(((char *)pl.ptr)[pl.len-1] == ' ' ||
|
|
((char *)pl.ptr)[pl.len-1] == '\t'))
|
|
pl.len--;
|
|
|
|
/* Skip empty lines */
|
|
if (!pl.len) {
|
|
sfree(line);
|
|
continue;
|
|
}
|
|
|
|
/* Break up at semicolons, expecting five fields, each of
|
|
* which we decode into hex code points */
|
|
ucharbuf *fields[5];
|
|
for (size_t i = 0; i < lenof(fields); i++) {
|
|
ptrlen field = ptrlen_get_word(&pl, ";");
|
|
fields[i] = ucharbuf_new();
|
|
|
|
ptrlen chr;
|
|
while ((chr = ptrlen_get_word(&field, " ")).len) {
|
|
char *chrstr = mkstr(chr);
|
|
uchar c = strtoul(chrstr, NULL, 16);
|
|
sfree(chrstr);
|
|
ucharbuf_append(fields[i], c);
|
|
}
|
|
}
|
|
|
|
subtest(filename, lineno, "NFC(c1) = c2", 'C', fields[0], fields[1]);
|
|
subtest(filename, lineno, "NFC(c2) = c2", 'C', fields[1], fields[1]);
|
|
subtest(filename, lineno, "NFC(c3) = c2", 'C', fields[2], fields[1]);
|
|
subtest(filename, lineno, "NFC(c4) = c4", 'C', fields[3], fields[3]);
|
|
subtest(filename, lineno, "NFC(c5) = c4", 'C', fields[4], fields[3]);
|
|
subtest(filename, lineno, "NFD(c1) = c3", 'D', fields[0], fields[2]);
|
|
subtest(filename, lineno, "NFD(c2) = c3", 'D', fields[1], fields[2]);
|
|
subtest(filename, lineno, "NFD(c3) = c3", 'D', fields[2], fields[2]);
|
|
subtest(filename, lineno, "NFD(c4) = c5", 'D', fields[3], fields[4]);
|
|
subtest(filename, lineno, "NFD(c5) = c5", 'D', fields[4], fields[4]);
|
|
|
|
for (size_t i = 0; i < lenof(fields); i++)
|
|
ucharbuf_free(fields[i]);
|
|
|
|
sfree(line);
|
|
}
|
|
}
|
|
|
|
int main(int argc, char **argv)
|
|
{
|
|
if (argc != 2) {
|
|
fprintf(stderr, "test_unicode_norm: give an input file "
|
|
"of tests or '-'\n");
|
|
return 1;
|
|
}
|
|
|
|
const char *filename = argv[1];
|
|
|
|
if (!strcmp(filename, "-")) {
|
|
run_tests("<standard input>", stdin);
|
|
} else {
|
|
FILE *fp = fopen(filename, "r");
|
|
if (!fp) {
|
|
fprintf(stderr, "test_unicode_norm: unable to open '%s'\n",
|
|
filename);
|
|
return 1;
|
|
}
|
|
run_tests(filename, fp);
|
|
fclose(fp);
|
|
}
|
|
|
|
printf("pass %d fail %d total %d\n", pass, fail, pass + fail);
|
|
|
|
return fail != 0;
|
|
}
|
|
#endif
|