Mirror of https://git.tartarus.org/simon/putty.git, synced 2025-07-01 03:22:48 -05:00
Implement Unicode normalisation.
A new module in 'utils' computes NFC and NFD, via a new set of data tables generated by read_ucd.py. The new module comes with a new test program, which can read the NormalizationTest.txt that appears in the Unicode Character Database. All the tests pass, as of Unicode 15.
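The new API entry point in this file is utf8_to_nfc(), which takes a UTF-8 string as a ptrlen and returns its NFC form in a freshly allocated strbuf. A minimal usage sketch (illustrative only, not part of the commit; 'passphrase' is a hypothetical NUL-terminated UTF-8 string):

    #include "misc.h"

    /* Normalise before hashing or comparing, so the same text typed on
     * systems that emit NFC vs NFD compares equal. */
    strbuf *norm = utf8_to_nfc(ptrlen_from_asciz(passphrase));
    /* ... use norm->s (UTF-8, NUL-terminated) and norm->len ... */
    strbuf_free(norm);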
@@ -65,6 +65,7 @@ add_sources_from_current_dir(utils
   stripctrl.c
   tempseat.c
   tree234.c
+  unicode-norm.c
   validate_manual_hostkey.c
   version.c
   wcwidth.c
utils/unicode-norm.c (new file, 446 lines)
@@ -0,0 +1,446 @@
#include <stdio.h>
#include <string.h>
#include "misc.h"

typedef uint32_t uchar;
typedef int cclass_t;

/* A local uchar-oriented analogue of strbuf */
typedef struct ucharbuf {
    uchar *buf;
    size_t len, size;
} ucharbuf;

static ucharbuf *ucharbuf_new(void)
{
    ucharbuf *ub = snew(ucharbuf);
    ub->buf = NULL;
    ub->len = ub->size = 0;
    return ub;
}

static void ucharbuf_append(ucharbuf *ub, uchar c)
{
    /* Use the _nm variant because this is used for passphrases */
    sgrowarray_nm(ub->buf, ub->size, ub->len);
    ub->buf[ub->len++] = c;
}

static void ucharbuf_free(ucharbuf *ub)
{
    if (ub->buf) {
        memset(ub->buf, 0, ub->size * sizeof(*ub->buf));
        sfree(ub->buf);
    }
    sfree(ub);
}

/*
 * Constants relating to the arithmetic decomposition mapping of
 * Hangul to jamo, from section 3.12 of Unicode 15.0.0. The following
 * constant names match those in the spec.
 */
static const uchar SBase = 0xAC00;
static const uchar LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
static const uchar LCount = 19, VCount = 21, TCount = 28;
static const uchar NCount = VCount * TCount, SCount = LCount * NCount;

static cclass_t combining_class(uchar c)
{
    struct range {
        uchar start, end;
        cclass_t cclass;
    };
    static const struct range ranges[] = {
#include "unicode/combining_classes.h"
    };

    const struct range *start = ranges, *end = start + lenof(ranges);

    while (end > start) {
        const struct range *curr = start + (end-start) / 2;
        if (c < curr->start)
            end = curr;
        else if (c > curr->end)
            start = curr + 1;
        else
            return curr->cclass;
    }

    return 0;
}
static unsigned decompose_char(uchar c, uchar *out)
{
    struct decomp {
        uchar composed, dec0, dec1;
    };
    static const struct decomp decomps[] = {
#include "unicode/canonical_decomp.h"
    };

    if (c - SBase < SCount) {
        /* Arithmetically decompose a Hangul character into jamo */
        uchar SIndex = c - SBase;
        uchar LIndex = SIndex / NCount;
        uchar VIndex = SIndex % NCount / TCount;
        uchar TIndex = SIndex % TCount;

        unsigned n = 0;
        out[n++] = LBase + LIndex;
        out[n++] = VBase + VIndex;
        if (TIndex)
            out[n++] = TBase + TIndex;
        return n;
    }

    const struct decomp *start = decomps, *end = start + lenof(decomps);

    while (end > start) {
        const struct decomp *curr = start + (end-start) / 2;
        if (c < curr->composed)
            end = curr;
        else if (c > curr->composed)
            start = curr + 1;
        else {
            out[0] = curr->dec0;
            if (curr->dec1) {
                out[1] = curr->dec1;
                return 2;
            } else {
                return 1;
            }
        }
    }

    return 0;
}

static uchar compose_chars(uchar S, uchar C)
{
    struct comp {
        uchar dec0, dec1, composed;
    };
    static const struct comp comps[] = {
#include "unicode/canonical_comp.h"
    };

    if (S - LBase < LCount && C - VBase < VCount) {
        /* Arithmetically compose an L and V jamo into a Hangul LV
         * character */
        return SBase + (S - LBase) * NCount + (C - VBase) * TCount;
    }

    if (S - SBase < SCount && (S - SBase) % TCount == 0 &&
        C != TBase && C - TBase < TCount) {
        /* Arithmetically compose an LV Hangul character and a T jamo
         * into a Hangul LVT character (C == TBase itself is excluded,
         * since it is not a T jamo) */
        return S + C - TBase;
    }

    const struct comp *start = comps, *end = start + lenof(comps);

    while (end > start) {
        const struct comp *curr = start + (end-start) / 2;
        if (S < curr->dec0)
            end = curr;
        else if (S > curr->dec0)
            start = curr + 1;
        else if (C < curr->dec1)
            end = curr;
        else if (C > curr->dec1)
            start = curr + 1;
        else
            return curr->composed;
    }

    return 0;
}
/*
 * Recursively decompose a sequence of Unicode characters. The output
 * is written to 'out', as a sequence of native-byte-order uchar.
 */
static void recursively_decompose(const uchar *str, size_t len, ucharbuf *out)
{
    uchar decomposed[3];

    while (len-- > 0) {
        uchar c = *str++;
        unsigned n = decompose_char(c, decomposed);
        if (n == 0) {
            /* This character is indecomposable */
            ucharbuf_append(out, c);
        } else {
            /* This character has been decomposed into up to 3
             * characters, so we must now recursively decompose those */
            recursively_decompose(decomposed, n, out);
        }
    }
}

/*
 * Reorder combining marks according to the Canonical Ordering
 * Algorithm (definition D109 in Unicode 15.0.0 section 3.11).
 *
 * The algorithm is phrased mechanistically, but the essence is: among
 * any contiguous sequence of combining marks (that is, characters
 * with cclass > 0), sort them by their cclass - but _stably_, i.e.
 * breaking ties in cclass by preserving the original order of the
 * characters in question.
 */
static void canonical_ordering(uchar *str, size_t len)
{
    for (size_t i = 1; i < len; i++) {
        cclass_t cclass = combining_class(str[i]);
        if (cclass == 0)
            continue;

        size_t j = i;
        while (j > 0 && combining_class(str[j-1]) > cclass) {
            uchar tmp = str[j-1];
            str[j-1] = str[j];
            str[j] = tmp;

            j--;
        }
    }
}

/*
 * Canonically recompose characters according to the Canonical
 * Composition Algorithm (definition D117 in Unicode 15.0.0 section
 * 3.11).
 */
static size_t canonical_composition(uchar *str, size_t len)
{
    const uchar *in = str;
    uchar *out = str;
    uchar *last_starter = NULL;
    cclass_t highest_cclass_between = -1;

    while (len > 0) {
        len--;
        uchar c = *in++;
        cclass_t cclass = combining_class(c);

        if (last_starter && highest_cclass_between < cclass) {
            uchar composed = compose_chars(*last_starter, c);
            if (composed) {
                *last_starter = composed;
                continue;
            }
        }

        if (cclass == 0) {
            last_starter = out;
            highest_cclass_between = -1;
        } else if (cclass > highest_cclass_between) {
            highest_cclass_between = cclass;
        }

        *out++ = c;
    }

    return out - str;
}
/*
 * Render a string into NFD.
 */
static ucharbuf *nfd(ucharbuf *input)
{
    ucharbuf *output = ucharbuf_new();

    /*
     * Definition D118 in Unicode 15.0.0 section 3.11, referring to
     * D68 in section 3.7: recursively decompose characters, then
     * reorder combining marks.
     */
    recursively_decompose(input->buf, input->len, output);
    canonical_ordering(output->buf, output->len);

    return output;
}

/*
 * Render a string into NFC.
 */
static ucharbuf *nfc(ucharbuf *input)
{
    /*
     * Definition D120 in Unicode 15.0.0 section 3.11: render the
     * string into NFD, then apply the canonical composition algorithm.
     */
    ucharbuf *output = nfd(input);
    output->len = canonical_composition(output->buf, output->len);

    return output;
}

/*
 * Convert a UTF-8 string into NFC, returning it as UTF-8 again.
 */
strbuf *utf8_to_nfc(ptrlen input)
{
    BinarySource src[1];
    BinarySource_BARE_INIT_PL(src, input);

    ucharbuf *inbuf = ucharbuf_new();
    while (get_avail(src))
        ucharbuf_append(inbuf, decode_utf8(src));

    ucharbuf *outbuf = nfc(inbuf);

    strbuf *output = strbuf_new_nm();
    for (size_t i = 0; i < outbuf->len; i++)
        put_utf8_char(output, outbuf->buf[i]);

    ucharbuf_free(inbuf);
    ucharbuf_free(outbuf);

    return output;
}
#ifdef TEST
void out_of_memory(void)
{
    fprintf(stderr, "out of memory!\n");
    exit(2);
}

static int pass, fail;

static void subtest(const char *filename, int lineno, const char *subdesc,
                    char nftype, ucharbuf *input, ucharbuf *expected)
{
    /*
     * Convert input into either NFC or NFD, and check it's equal to
     * expected
     */
    ucharbuf *nf;
    switch (nftype) {
      case 'C':
        nf = nfc(input);
        break;
      case 'D':
        nf = nfd(input);
        break;
      default:
        unreachable("bad nftype");
    }

    if (nf->len == expected->len &&
        !memcmp(nf->buf, expected->buf, nf->len * sizeof(uchar))) {
        pass++;
    } else {
        printf("%s:%d: failed %s: NF%c([", filename, lineno, subdesc, nftype);
        for (size_t pos = 0; pos < input->len; pos++)
            printf("%s%04X", pos ? " " : "", (unsigned)input->buf[pos]);
        printf("]) -> [");
        for (size_t pos = 0; pos < nf->len; pos++)
            printf("%s%04X", pos ? " " : "", (unsigned)nf->buf[pos]);
        printf("] != [");
        for (size_t pos = 0; pos < expected->len; pos++)
            printf("%s%04X", pos ? " " : "", (unsigned)expected->buf[pos]);
        printf("]\n");
        fail++;
    }

    ucharbuf_free(nf);
}
static void run_tests(const char *filename, FILE *fp)
{
    for (int lineno = 1;; lineno++) {
        char *line = chomp(fgetline(fp));
        if (!line)
            break;

        /* Strip section dividers which begin with @ */
        if (*line == '@') {
            sfree(line);
            continue;
        }

        /* Strip comments, if any */
        ptrlen pl = ptrlen_from_asciz(line);
        {
            const char *p = memchr(pl.ptr, '#', pl.len);
            if (p)
                pl.len = p - (const char *)pl.ptr;
        }

        /* Strip trailing space */
        while (pl.len > 0 &&
               (((char *)pl.ptr)[pl.len-1] == ' ' ||
                ((char *)pl.ptr)[pl.len-1] == '\t'))
            pl.len--;

        /* Skip empty lines */
        if (!pl.len) {
            sfree(line);
            continue;
        }

        /* Break up at semicolons, expecting five fields, each of
         * which we decode into hex code points */
        ucharbuf *fields[5];
        for (size_t i = 0; i < lenof(fields); i++) {
            ptrlen field = ptrlen_get_word(&pl, ";");
            fields[i] = ucharbuf_new();

            ptrlen chr;
            while ((chr = ptrlen_get_word(&field, " ")).len) {
                char *chrstr = mkstr(chr);
                uchar c = strtoul(chrstr, NULL, 16);
                sfree(chrstr);
                ucharbuf_append(fields[i], c);
            }
        }

        subtest(filename, lineno, "NFC(c1) = c2", 'C', fields[0], fields[1]);
        subtest(filename, lineno, "NFC(c2) = c2", 'C', fields[1], fields[1]);
        subtest(filename, lineno, "NFC(c3) = c2", 'C', fields[2], fields[1]);
        subtest(filename, lineno, "NFC(c4) = c4", 'C', fields[3], fields[3]);
        subtest(filename, lineno, "NFC(c5) = c4", 'C', fields[4], fields[3]);
        subtest(filename, lineno, "NFD(c1) = c3", 'D', fields[0], fields[2]);
        subtest(filename, lineno, "NFD(c2) = c3", 'D', fields[1], fields[2]);
        subtest(filename, lineno, "NFD(c3) = c3", 'D', fields[2], fields[2]);
        subtest(filename, lineno, "NFD(c4) = c5", 'D', fields[3], fields[4]);
        subtest(filename, lineno, "NFD(c5) = c5", 'D', fields[4], fields[4]);

        for (size_t i = 0; i < lenof(fields); i++)
            ucharbuf_free(fields[i]);

        sfree(line);
    }
}

int main(int argc, char **argv)
{
    if (argc != 2) {
        fprintf(stderr, "test_unicode_norm: give an input file "
                "of tests or '-'\n");
        return 1;
    }

    const char *filename = argv[1];

    if (!strcmp(filename, "-")) {
        run_tests("<standard input>", stdin);
    } else {
        FILE *fp = fopen(filename, "r");
        if (!fp) {
            fprintf(stderr, "test_unicode_norm: unable to open '%s'\n",
                    filename);
            return 1;
        }
        run_tests(filename, fp);
        fclose(fp);
    }

    printf("pass %d fail %d total %d\n", pass, fail, pass + fail);

    return fail != 0;
}
#endif
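When this file is compiled with TEST defined, it becomes the standalone test_unicode_norm program referred to in the commit message. It accepts either a file in the five-field NormalizationTest.txt format from the Unicode Character Database, or '-' to read that format from standard input; it prints a single summary line of the form 'pass N fail M total N+M' and exits nonzero if any subtest failed, so either of these invocations (illustrative) can be dropped straight into a test harness:

    ./test_unicode_norm NormalizationTest.txt
    ./test_unicode_norm - < NormalizationTest.txt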