From 9e01de7c2b2903412822f3285da1d692d1474524 Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Fri, 17 Feb 2023 16:39:09 +0000 Subject: [PATCH] decode_utf8: add an enumeration of failure reasons. Now you can optionally get back an enum value indicating whether the character was successfully decoded, or whether U+FFFD was substituted due to some kind of problem, and if the latter, what problem. For a start, this allows distinguishing 'real' U+FFFD (encoded legitimately in the input) from one invented by the decoder. Also, it allows the recipient of the decode to treat failures differently, either by passing on a useful error report to the user (as utf8_unknown_char now does) or by doing something special. In particular, there are two distinct error codes for a truncated UTF-8 encoding, depending on whether it was truncated by the end of the input or by encountering a non-continuation byte. The former code means that the string is not legal UTF-8 _as it is_, but doesn't rule out it being a (bytewise) prefix of a legal UTF-8 string - so if a client is receiving UTF-8 data a byte at a time, they can treat that error code specially and not make it a fatal error. --- misc.h | 24 +++++- utils/decode_utf8.c | 151 ++++++++++++++++++++++++++--------- utils/decode_utf8_to_wchar.c | 5 +- utils/unicode-known.c | 8 +- utils/unicode-norm.c | 2 +- windows/unicode.c | 2 +- 6 files changed, 147 insertions(+), 45 deletions(-) diff --git a/misc.h b/misc.h index e47f19f3..c4f0e14c 100644 --- a/misc.h +++ b/misc.h @@ -258,14 +258,34 @@ char *encode_wide_string_as_utf8(const wchar_t *wstr); /* Decode a single UTF-8 character. Returns U+FFFD for any of the * illegal cases. If the source is empty, returns L'\0' (and sets the * error indicator on the source, of course). 
*/ -unsigned decode_utf8(BinarySource *src); +#define DECODE_UTF8_FAILURE_LIST(X) \ + X(DUTF8_SUCCESS, "success") \ + X(DUTF8_SPURIOUS_CONTINUATION, "spurious continuation byte") \ + X(DUTF8_ILLEGAL_BYTE, "illegal UTF-8 byte value") \ + X(DUTF8_E_OUT_OF_DATA, "unfinished multibyte encoding at end of string") \ + X(DUTF8_TRUNCATED_SEQUENCE, "multibyte encoding interrupted by " \ + "non-continuation byte") \ + X(DUTF8_OVERLONG_ENCODING, "overlong encoding") \ + X(DUTF8_ENCODED_SURROGATE, "Unicode surrogate character encoded in " \ + "UTF-8") \ + X(DUTF8_CODE_POINT_TOO_BIG, "code point outside the Unicode range") \ + /* end of list */ +typedef enum DecodeUTF8Failure { + #define ENUM_DECL(sym, string) sym, + DECODE_UTF8_FAILURE_LIST(ENUM_DECL) + #undef ENUM_DECL + DUTF8_N_FAILURE_CODES +} DecodeUTF8Failure; +unsigned decode_utf8(BinarySource *src, DecodeUTF8Failure *err); +extern const char *const decode_utf8_error_strings[DUTF8_N_FAILURE_CODES]; /* Decode a single UTF-8 character to an output buffer of the * platform's wchar_t. May write a pair of surrogates if * sizeof(wchar_t) == 2, assuming that in that case the wide string is * encoded in UTF-16. Otherwise, writes one character. Returns the * number written. */ -size_t decode_utf8_to_wchar(BinarySource *src, wchar_t *out); +size_t decode_utf8_to_wchar(BinarySource *src, wchar_t *out, + DecodeUTF8Failure *err); /* Normalise a UTF-8 string into Normalisation Form C. 
*/ strbuf *utf8_to_nfc(ptrlen input); diff --git a/utils/decode_utf8.c b/utils/decode_utf8.c index 3e648a77..38b2001a 100644 --- a/utils/decode_utf8.c +++ b/utils/decode_utf8.c @@ -5,17 +5,23 @@ #include "putty.h" #include "misc.h" -unsigned decode_utf8(BinarySource *src) +unsigned decode_utf8(BinarySource *src, DecodeUTF8Failure *err) { + /* Permit user to pass NULL as the err pointer */ + DecodeUTF8Failure dummy; + if (!err) err = &dummy; + /* If the source has no byte available, this will return 0, which * we'll return immediately and is a reasonable error return anyway */ unsigned char c = get_byte(src); /* One-byte cases. */ if (c < 0x80) { + *err = DUTF8_SUCCESS; return c; } else if (c < 0xC0) { - return 0xFFFD; /* spurious continuation byte */ + *err = DUTF8_SPURIOUS_CONTINUATION; + return 0xFFFD; } unsigned long wc, min; @@ -31,30 +37,47 @@ unsigned decode_utf8(BinarySource *src) } else if (c < 0xFE) { wc = c & 0x01; ncont = 5; min = 0x4000000; } else { - return 0xFFFD; /* FE or FF illegal bytes */ + *err = DUTF8_ILLEGAL_BYTE; /* FE or FF */ + return 0xFFFD; } while (ncont-- > 0) { - if (!get_avail(src)) - return 0xFFFD; /* sequence terminated by end of data */ + if (!get_avail(src)) { + *err = DUTF8_E_OUT_OF_DATA; + return 0xFFFD; + } unsigned char cont = get_byte(src); if (!(0x80 <= cont && cont < 0xC0)) { BinarySource_REWIND_TO(src, src->pos - 1); - return 0xFFFD; /* short sequence */ + *err = DUTF8_TRUNCATED_SEQUENCE; + return 0xFFFD; } wc = (wc << 6) | (cont & 0x3F); } - if (wc < min) - return 0xFFFD; /* overlong encoding */ - if (0xD800 <= wc && wc < 0xE000) - return 0xFFFD; /* UTF-8 encoding of surrogate */ - if (wc > 0x10FFFF) + if (wc < min) { + *err = DUTF8_OVERLONG_ENCODING; + return 0xFFFD; + } + if (0xD800 <= wc && wc < 0xE000) { + *err = DUTF8_ENCODED_SURROGATE; + return 0xFFFD; + } + if (wc > 0x10FFFF) { + *err = DUTF8_CODE_POINT_TOO_BIG; return 0xFFFD; /* outside Unicode range */ + } + *err = DUTF8_SUCCESS; return wc; } +const char *const 
decode_utf8_error_strings[DUTF8_N_FAILURE_CODES] = { + #define MSG_ENTRY(sym, string) string, + DECODE_UTF8_FAILURE_LIST(MSG_ENTRY) + #undef MSG_ENTRY +}; + #ifdef TEST #include <stdio.h> @@ -65,6 +88,12 @@ void out_of_memory(void) exit(2); } +static const char *const decode_utf8_error_syms[DUTF8_N_FAILURE_CODES] = { + #define SYM_ENTRY(sym, string) #sym, + DECODE_UTF8_FAILURE_LIST(SYM_ENTRY) + #undef SYM_ENTRY +}; + bool dotest(const char *file, int line, const char *input, size_t ninput, const unsigned long *chars, size_t nchars) { @@ -76,12 +105,13 @@ bool dotest(const char *file, int line, const char *input, size_t ninput, while (get_avail(src)) { size_t before = src->pos; - unsigned long wc = decode_utf8(src); + DecodeUTF8Failure err; + unsigned long wc = decode_utf8(src, &err); printf("%s:%d in+%"SIZEu" out+%"SIZEu":", file, line, before, noutput); while (before < src->pos) printf(" %02x", (unsigned)(unsigned char)(input[before++])); - printf(" -> U-%08lx\n", wc); + printf(" -> U-%08lx %s\n", wc, decode_utf8_error_syms[err]); if (noutput >= nchars) { printf("%s:%d: FAIL: expected no further output\n", file, line); @@ -95,6 +125,22 @@ bool dotest(const char *file, int line, const char *input, size_t ninput, } noutput++; + + DecodeUTF8Failure expected_err; + if (wc == 0xFFFD) { + /* In the 'chars' array, any occurrence of 0xFFFD is followed + * by the expected error code */ + assert(noutput < nchars && "bad test data"); + expected_err = chars[noutput++]; + } else { + /* Expect success status to go with any non-FFFD character */ + expected_err = DUTF8_SUCCESS; + } + if (err != expected_err) { + printf("%s:%d: FAIL: expected %s\n", file, line, + decode_utf8_error_syms[expected_err]); + return false; + } } if (noutput < nchars) { @@ -126,56 +172,85 @@ int main(void) DOTEST("\xC2\x80", 0x0080); DOTEST("\xE0\xA0\x80", 0x0800); DOTEST("\xF0\x90\x80\x80", 0x00010000); - DOTEST("\xF8\x88\x80\x80\x80", 0xFFFD); /* would be 0x00200000 */ - DOTEST("\xFC\x84\x80\x80\x80\x80",
0xFFFD); /* would be 0x04000000 */ + DOTEST("\xF8\x88\x80\x80\x80", + 0xFFFD, DUTF8_CODE_POINT_TOO_BIG); /* would be 0x00200000 */ + DOTEST("\xFC\x84\x80\x80\x80\x80", + 0xFFFD, DUTF8_CODE_POINT_TOO_BIG); /* would be 0x04000000 */ /* Last sequence of each length */ DOTEST("\x7F", 0x007F); DOTEST("\xDF\xBF", 0x07FF); DOTEST("\xEF\xBF\xBF", 0xFFFF); - DOTEST("\xF7\xBF\xBF\xBF", 0xFFFD); /* would be 0x001FFFFF */ - DOTEST("\xFB\xBF\xBF\xBF\xBF", 0xFFFD); /* would be 0x03FFFFFF */ - DOTEST("\xFD\xBF\xBF\xBF\xBF\xBF", 0xFFFD); /* would be 0x7FFFFFFF */ + DOTEST("\xF7\xBF\xBF\xBF", + 0xFFFD, DUTF8_CODE_POINT_TOO_BIG); /* would be 0x001FFFFF */ + DOTEST("\xFB\xBF\xBF\xBF\xBF", + 0xFFFD, DUTF8_CODE_POINT_TOO_BIG); /* would be 0x03FFFFFF */ + DOTEST("\xFD\xBF\xBF\xBF\xBF\xBF", + 0xFFFD, DUTF8_CODE_POINT_TOO_BIG); /* would be 0x7FFFFFFF */ /* Endpoints of the surrogate range */ DOTEST("\xED\x9F\xBF", 0xD7FF); - DOTEST("\xED\xA0\x80", 0xFFFD); /* would be 0xD800 */ - DOTEST("\xED\xBF\xBF", 0xFFFD); /* would be 0xDFFF */ + DOTEST("\xED\xA0\x80", 0xFFFD, DUTF8_ENCODED_SURROGATE); /* 0xD800 */ + DOTEST("\xED\xBF\xBF", 0xFFFD, DUTF8_ENCODED_SURROGATE); /* 0xDFFF */ DOTEST("\xEE\x80\x80", 0xE000); /* REPLACEMENT CHARACTER itself */ - DOTEST("\xEF\xBF\xBD", 0xFFFD); + DOTEST("\xEF\xBF\xBD", 0xFFFD, DUTF8_SUCCESS); /* FFFD but no error! 
*/ /* Endpoints of the legal Unicode range */ DOTEST("\xF4\x8F\xBF\xBF", 0x0010FFFF); - DOTEST("\xF4\x90\x80\x80", 0xFFFD); /* would be 0x00110000 */ + DOTEST("\xF4\x90\x80\x80", 0xFFFD, + DUTF8_CODE_POINT_TOO_BIG); /* would be 0x00110000 */ /* Spurious continuation bytes, each shown as a separate failure */ DOTEST("\x80 \x81\x82 \xBD\xBE\xBF", - 0xFFFD, 0x0020, 0xFFFD, 0xFFFD, 0x0020, 0xFFFD, 0xFFFD, 0xFFFD); + 0xFFFD, DUTF8_SPURIOUS_CONTINUATION, + 0x0020, + 0xFFFD, DUTF8_SPURIOUS_CONTINUATION, + 0xFFFD, DUTF8_SPURIOUS_CONTINUATION, + 0x0020, + 0xFFFD, DUTF8_SPURIOUS_CONTINUATION, + 0xFFFD, DUTF8_SPURIOUS_CONTINUATION, + 0xFFFD, DUTF8_SPURIOUS_CONTINUATION); - /* Truncated sequences, each shown as just one failure */ + /* Truncated sequences, each shown as just one failure. The last + * one gets a different error code because the sequence is + * interrupted by the end of the string instead of another + * character, so that if the string were a prefix of a longer + * chunk of data then that would not _necessarily_ indicate an + * error */ DOTEST("\xC2\xE0\xA0\xF0\x90\x80\xF8\x88\x80\x80\xFC\x84\x80\x80\x80", - 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD); + 0xFFFD, DUTF8_TRUNCATED_SEQUENCE, + 0xFFFD, DUTF8_TRUNCATED_SEQUENCE, + 0xFFFD, DUTF8_TRUNCATED_SEQUENCE, + 0xFFFD, DUTF8_TRUNCATED_SEQUENCE, + 0xFFFD, DUTF8_E_OUT_OF_DATA); DOTEST("\xC2 \xE0\xA0 \xF0\x90\x80 \xF8\x88\x80\x80 \xFC\x84\x80\x80\x80", - 0xFFFD, 0x0020, 0xFFFD, 0x0020, 0xFFFD, 0x0020, 0xFFFD, 0x0020, - 0xFFFD); + 0xFFFD, DUTF8_TRUNCATED_SEQUENCE, + 0x0020, + 0xFFFD, DUTF8_TRUNCATED_SEQUENCE, + 0x0020, + 0xFFFD, DUTF8_TRUNCATED_SEQUENCE, + 0x0020, + 0xFFFD, DUTF8_TRUNCATED_SEQUENCE, + 0x0020, + 0xFFFD, DUTF8_E_OUT_OF_DATA); /* Illegal bytes */ - DOTEST("\xFE\xFF", 0xFFFD, 0xFFFD); + DOTEST("\xFE\xFF", 0xFFFD, DUTF8_ILLEGAL_BYTE, 0xFFFD, DUTF8_ILLEGAL_BYTE); /* Overlong sequences */ - DOTEST("\xC1\xBF", 0xFFFD); - DOTEST("\xE0\x9F\xBF", 0xFFFD); - DOTEST("\xF0\x8F\xBF\xBF", 0xFFFD); - 
DOTEST("\xF8\x87\xBF\xBF\xBF", 0xFFFD); - DOTEST("\xFC\x83\xBF\xBF\xBF\xBF", 0xFFFD); + DOTEST("\xC1\xBF", 0xFFFD, DUTF8_OVERLONG_ENCODING); + DOTEST("\xE0\x9F\xBF", 0xFFFD, DUTF8_OVERLONG_ENCODING); + DOTEST("\xF0\x8F\xBF\xBF", 0xFFFD, DUTF8_OVERLONG_ENCODING); + DOTEST("\xF8\x87\xBF\xBF\xBF", 0xFFFD, DUTF8_OVERLONG_ENCODING); + DOTEST("\xFC\x83\xBF\xBF\xBF\xBF", 0xFFFD, DUTF8_OVERLONG_ENCODING); - DOTEST("\xC0\x80", 0xFFFD); - DOTEST("\xE0\x80\x80", 0xFFFD); - DOTEST("\xF0\x80\x80\x80", 0xFFFD); - DOTEST("\xF8\x80\x80\x80\x80", 0xFFFD); - DOTEST("\xFC\x80\x80\x80\x80\x80", 0xFFFD); + DOTEST("\xC0\x80", 0xFFFD, DUTF8_OVERLONG_ENCODING); + DOTEST("\xE0\x80\x80", 0xFFFD, DUTF8_OVERLONG_ENCODING); + DOTEST("\xF0\x80\x80\x80", 0xFFFD, DUTF8_OVERLONG_ENCODING); + DOTEST("\xF8\x80\x80\x80\x80", 0xFFFD, DUTF8_OVERLONG_ENCODING); + DOTEST("\xFC\x80\x80\x80\x80\x80", 0xFFFD, DUTF8_OVERLONG_ENCODING); printf("%d tests %d passed", ntest, npass); if (npass < ntest) { diff --git a/utils/decode_utf8_to_wchar.c b/utils/decode_utf8_to_wchar.c index 6b9be5c8..b21cbb5f 100644 --- a/utils/decode_utf8_to_wchar.c +++ b/utils/decode_utf8_to_wchar.c @@ -5,10 +5,11 @@ #include "putty.h" #include "misc.h" -size_t decode_utf8_to_wchar(BinarySource *src, wchar_t *out) +size_t decode_utf8_to_wchar(BinarySource *src, wchar_t *out, + DecodeUTF8Failure *err) { size_t outlen = 0; - unsigned wc = decode_utf8(src); + unsigned wc = decode_utf8(src, err); if (sizeof(wchar_t) > 2 || wc < 0x10000) { out[outlen++] = wc; } else { diff --git a/utils/unicode-known.c b/utils/unicode-known.c index bfa63d70..01b9b8a4 100644 --- a/utils/unicode-known.c +++ b/utils/unicode-known.c @@ -40,7 +40,13 @@ char *utf8_unknown_char(ptrlen input) BinarySource_BARE_INIT_PL(src, input); for (size_t nchars = 0; get_avail(src); nchars++) { - unsigned c = decode_utf8(src); + DecodeUTF8Failure err; + unsigned c = decode_utf8(src, &err); + if (err != DUTF8_SUCCESS) + return dupprintf( + "cannot normalise this string: UTF-8 
decoding error " + "at character position %"SIZEu", byte position %"SIZEu": %s", + nchars, src->pos, decode_utf8_error_strings[err]); if (!known(c)) return dupprintf( "cannot stably normalise this string: code point %04X " diff --git a/utils/unicode-norm.c b/utils/unicode-norm.c index 8b6ca8d2..af620fd3 100644 --- a/utils/unicode-norm.c +++ b/utils/unicode-norm.c @@ -295,7 +295,7 @@ strbuf *utf8_to_nfc(ptrlen input) ucharbuf *inbuf = ucharbuf_new(); while (get_avail(src)) - ucharbuf_append(inbuf, decode_utf8(src)); + ucharbuf_append(inbuf, decode_utf8(src, NULL)); ucharbuf *outbuf = nfc(inbuf); diff --git a/windows/unicode.c b/windows/unicode.c index 190a7460..4b18ef56 100644 --- a/windows/unicode.c +++ b/windows/unicode.c @@ -1365,7 +1365,7 @@ int mb_to_wc(int codepage, int flags, const char *mbstr, int mblen, while (get_avail(src)) { wchar_t wcbuf[2]; - size_t nwc = decode_utf8_to_wchar(src, wcbuf); + size_t nwc = decode_utf8_to_wchar(src, wcbuf, NULL); for (size_t i = 0; i < nwc; i++) { if (remaining > 0) {