1
0
mirror of https://git.tartarus.org/simon/putty.git synced 2025-01-09 01:18:00 +00:00

decode_utf8: add an enumeration of failure reasons.

Now you can optionally get back an enum value indicating whether the
character was successfully decoded, or whether U+FFFD was substituted
due to some kind of problem, and if the latter, what problem.

For a start, this allows distinguishing 'real' U+FFFD (encoded
legitimately in the input) from one invented by the decoder. Also, it
allows the recipient of the decode to treat failures differently,
either by passing on a useful error report to the user (as
utf8_unknown_char now does) or by doing something special.

In particular, there are two distinct error codes for a truncated
UTF-8 encoding, depending on whether it was truncated by the end of
the input or by encountering a non-continuation byte. The former code
means that the string is not legal UTF-8 _as it is_, but doesn't rule
out it being a (bytewise) prefix of a legal UTF-8 string - so if a
client is receiving UTF-8 data a byte at a time, they can treat that
error code specially and not make it a fatal error.
This commit is contained in:
Simon Tatham 2023-02-17 16:39:09 +00:00
parent 9d308b39da
commit 9e01de7c2b
6 changed files with 147 additions and 45 deletions

24
misc.h
View File

@ -258,14 +258,34 @@ char *encode_wide_string_as_utf8(const wchar_t *wstr);
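/* Decode a single UTF-8 character. Returns U+FFFD for any of the
* illegal cases. If the source is empty, returns L'\0' (and sets the
* error indicator on the source, of course).
*
* If err is non-NULL, *err is set to DUTF8_SUCCESS when a character
* was decoded normally, or otherwise to the DecodeUTF8Failure code
* saying why U+FFFD was substituted. This lets the caller tell a
* U+FFFD legitimately encoded in the input apart from one invented by
* the decoder. Passing NULL for err is permitted. */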
unsigned decode_utf8(BinarySource *src);
/*
 * Master list of the possible outcomes of decoding one UTF-8 character.
 * Each entry pairs an enumerator with a human-readable description.
 * The list is an X-macro: instantiate it with a two-argument macro
 * X(sym, string) to generate whatever is needed from each entry.
 *
 * The order of entries fixes the numeric values of the enumerators
 * below, so any table generated from this list is indexed by them.
 *
 * There are two separate codes for an incomplete multibyte sequence:
 * DUTF8_E_OUT_OF_DATA means the input simply ended part way through
 * (so it might still be a prefix of valid UTF-8), whereas
 * DUTF8_TRUNCATED_SEQUENCE means a non-continuation byte turned up
 * where a continuation byte was required.
 */
#define DECODE_UTF8_FAILURE_LIST(X)                                         \
    X(DUTF8_SUCCESS, "success")                                             \
    X(DUTF8_SPURIOUS_CONTINUATION, "spurious continuation byte")            \
    X(DUTF8_ILLEGAL_BYTE, "illegal UTF-8 byte value")                       \
    X(DUTF8_E_OUT_OF_DATA, "unfinished multibyte encoding at end of string") \
    X(DUTF8_TRUNCATED_SEQUENCE, "multibyte encoding interrupted by "        \
      "non-continuation byte")                                              \
    X(DUTF8_OVERLONG_ENCODING, "overlong encoding")                         \
    X(DUTF8_ENCODED_SURROGATE, "Unicode surrogate character encoded in "    \
      "UTF-8")                                                              \
    X(DUTF8_CODE_POINT_TOO_BIG, "code point outside the Unicode range")     \
    /* end of list */

/*
 * Status code reported alongside a decoded character. The enumerators
 * are generated from the list above; the final one is not a status in
 * its own right, but counts the real ones so it can size lookup tables.
 */
typedef enum DecodeUTF8Failure {
#define DUTF8_ENUMERATOR(sym, string) sym,
    DECODE_UTF8_FAILURE_LIST(DUTF8_ENUMERATOR)
#undef DUTF8_ENUMERATOR
    DUTF8_N_FAILURE_CODES
} DecodeUTF8Failure;
unsigned decode_utf8(BinarySource *src, DecodeUTF8Failure *err);
extern const char *const decode_utf8_error_strings[DUTF8_N_FAILURE_CODES];
/* Decode a single UTF-8 character to an output buffer of the
* platform's wchar_t. May write a pair of surrogates if
* sizeof(wchar_t) == 2, assuming that in that case the wide string is
* encoded in UTF-16. Otherwise, writes one character. Returns the
* number written. The err pointer is passed straight through to
* decode_utf8, so it receives the same status code and may likewise
* be NULL. */
size_t decode_utf8_to_wchar(BinarySource *src, wchar_t *out);
size_t decode_utf8_to_wchar(BinarySource *src, wchar_t *out,
DecodeUTF8Failure *err);
/* Normalise a UTF-8 string into Normalisation Form C. */
strbuf *utf8_to_nfc(ptrlen input);

View File

@ -5,17 +5,23 @@
#include "putty.h"
#include "misc.h"
unsigned decode_utf8(BinarySource *src)
unsigned decode_utf8(BinarySource *src, DecodeUTF8Failure *err)
{
/* Permit user to pass NULL as the err pointer */
DecodeUTF8Failure dummy;
if (!err) err = &dummy;
/* If the source has no byte available, this will return 0, which
* we'll return immediately and is a reasonable error return anyway */
unsigned char c = get_byte(src);
/* One-byte cases. */
if (c < 0x80) {
*err = DUTF8_SUCCESS;
return c;
} else if (c < 0xC0) {
return 0xFFFD; /* spurious continuation byte */
*err = DUTF8_SPURIOUS_CONTINUATION;
return 0xFFFD;
}
unsigned long wc, min;
@ -31,30 +37,47 @@ unsigned decode_utf8(BinarySource *src)
} else if (c < 0xFE) {
wc = c & 0x01; ncont = 5; min = 0x4000000;
} else {
return 0xFFFD; /* FE or FF illegal bytes */
*err = DUTF8_ILLEGAL_BYTE; /* FE or FF */
return 0xFFFD;
}
while (ncont-- > 0) {
if (!get_avail(src))
return 0xFFFD; /* sequence terminated by end of data */
if (!get_avail(src)) {
*err = DUTF8_E_OUT_OF_DATA;
return 0xFFFD;
}
unsigned char cont = get_byte(src);
if (!(0x80 <= cont && cont < 0xC0)) {
BinarySource_REWIND_TO(src, src->pos - 1);
return 0xFFFD; /* short sequence */
*err = DUTF8_TRUNCATED_SEQUENCE;
return 0xFFFD;
}
wc = (wc << 6) | (cont & 0x3F);
}
if (wc < min)
return 0xFFFD; /* overlong encoding */
if (0xD800 <= wc && wc < 0xE000)
return 0xFFFD; /* UTF-8 encoding of surrogate */
if (wc > 0x10FFFF)
if (wc < min) {
*err = DUTF8_OVERLONG_ENCODING;
return 0xFFFD;
}
if (0xD800 <= wc && wc < 0xE000) {
*err = DUTF8_ENCODED_SURROGATE;
return 0xFFFD;
}
if (wc > 0x10FFFF) {
*err = DUTF8_CODE_POINT_TOO_BIG;
return 0xFFFD; /* outside Unicode range */
}
*err = DUTF8_SUCCESS;
return wc;
}
const char *const decode_utf8_error_strings[DUTF8_N_FAILURE_CODES] = {
#define MSG_ENTRY(sym, string) string,
DECODE_UTF8_FAILURE_LIST(MSG_ENTRY)
#undef MSG_ENTRY
};
#ifdef TEST
#include <stdio.h>
@ -65,6 +88,12 @@ void out_of_memory(void)
exit(2);
}
static const char *const decode_utf8_error_syms[DUTF8_N_FAILURE_CODES] = {
#define SYM_ENTRY(sym, string) #sym,
DECODE_UTF8_FAILURE_LIST(SYM_ENTRY)
#undef SYM_ENTRY
};
bool dotest(const char *file, int line, const char *input, size_t ninput,
const unsigned long *chars, size_t nchars)
{
@ -76,12 +105,13 @@ bool dotest(const char *file, int line, const char *input, size_t ninput,
while (get_avail(src)) {
size_t before = src->pos;
unsigned long wc = decode_utf8(src);
DecodeUTF8Failure err;
unsigned long wc = decode_utf8(src, &err);
printf("%s:%d in+%"SIZEu" out+%"SIZEu":", file, line, before, noutput);
while (before < src->pos)
printf(" %02x", (unsigned)(unsigned char)(input[before++]));
printf(" -> U-%08lx\n", wc);
printf(" -> U-%08lx %s\n", wc, decode_utf8_error_syms[err]);
if (noutput >= nchars) {
printf("%s:%d: FAIL: expected no further output\n", file, line);
@ -95,6 +125,22 @@ bool dotest(const char *file, int line, const char *input, size_t ninput,
}
noutput++;
DecodeUTF8Failure expected_err;
if (wc == 0xFFFD) {
/* In the 'chars' array, any occurrence of 0xFFFD is followed
* by the expected error code */
assert(noutput < nchars && "bad test data");
expected_err = chars[noutput++];
} else {
/* Expect success status to go with any non-FFFD character */
expected_err = DUTF8_SUCCESS;
}
if (err != expected_err) {
printf("%s:%d: FAIL: expected %s\n", file, line,
decode_utf8_error_syms[expected_err]);
return false;
}
}
if (noutput < nchars) {
@ -126,56 +172,85 @@ int main(void)
DOTEST("\xC2\x80", 0x0080);
DOTEST("\xE0\xA0\x80", 0x0800);
DOTEST("\xF0\x90\x80\x80", 0x00010000);
DOTEST("\xF8\x88\x80\x80\x80", 0xFFFD); /* would be 0x00200000 */
DOTEST("\xFC\x84\x80\x80\x80\x80", 0xFFFD); /* would be 0x04000000 */
DOTEST("\xF8\x88\x80\x80\x80",
0xFFFD, DUTF8_CODE_POINT_TOO_BIG); /* would be 0x00200000 */
DOTEST("\xFC\x84\x80\x80\x80\x80",
0xFFFD, DUTF8_CODE_POINT_TOO_BIG); /* would be 0x04000000 */
/* Last sequence of each length */
DOTEST("\x7F", 0x007F);
DOTEST("\xDF\xBF", 0x07FF);
DOTEST("\xEF\xBF\xBF", 0xFFFF);
DOTEST("\xF7\xBF\xBF\xBF", 0xFFFD); /* would be 0x001FFFFF */
DOTEST("\xFB\xBF\xBF\xBF\xBF", 0xFFFD); /* would be 0x03FFFFFF */
DOTEST("\xFD\xBF\xBF\xBF\xBF\xBF", 0xFFFD); /* would be 0x7FFFFFFF */
DOTEST("\xF7\xBF\xBF\xBF",
0xFFFD, DUTF8_CODE_POINT_TOO_BIG); /* would be 0x001FFFFF */
DOTEST("\xFB\xBF\xBF\xBF\xBF",
0xFFFD, DUTF8_CODE_POINT_TOO_BIG); /* would be 0x03FFFFFF */
DOTEST("\xFD\xBF\xBF\xBF\xBF\xBF",
0xFFFD, DUTF8_CODE_POINT_TOO_BIG); /* would be 0x7FFFFFFF */
/* Endpoints of the surrogate range */
DOTEST("\xED\x9F\xBF", 0xD7FF);
DOTEST("\xED\xA0\x80", 0xFFFD); /* would be 0xD800 */
DOTEST("\xED\xBF\xBF", 0xFFFD); /* would be 0xDFFF */
DOTEST("\xED\xA0\x80", 0xFFFD, DUTF8_ENCODED_SURROGATE); /* 0xD800 */
DOTEST("\xED\xBF\xBF", 0xFFFD, DUTF8_ENCODED_SURROGATE); /* 0xDFFF */
DOTEST("\xEE\x80\x80", 0xE000);
/* REPLACEMENT CHARACTER itself */
DOTEST("\xEF\xBF\xBD", 0xFFFD);
DOTEST("\xEF\xBF\xBD", 0xFFFD, DUTF8_SUCCESS); /* FFFD but no error! */
/* Endpoints of the legal Unicode range */
DOTEST("\xF4\x8F\xBF\xBF", 0x0010FFFF);
DOTEST("\xF4\x90\x80\x80", 0xFFFD); /* would be 0x00110000 */
DOTEST("\xF4\x90\x80\x80", 0xFFFD,
DUTF8_CODE_POINT_TOO_BIG); /* would be 0x00110000 */
/* Spurious continuation bytes, each shown as a separate failure */
DOTEST("\x80 \x81\x82 \xBD\xBE\xBF",
0xFFFD, 0x0020, 0xFFFD, 0xFFFD, 0x0020, 0xFFFD, 0xFFFD, 0xFFFD);
0xFFFD, DUTF8_SPURIOUS_CONTINUATION,
0x0020,
0xFFFD, DUTF8_SPURIOUS_CONTINUATION,
0xFFFD, DUTF8_SPURIOUS_CONTINUATION,
0x0020,
0xFFFD, DUTF8_SPURIOUS_CONTINUATION,
0xFFFD, DUTF8_SPURIOUS_CONTINUATION,
0xFFFD, DUTF8_SPURIOUS_CONTINUATION);
/* Truncated sequences, each shown as just one failure */
/* Truncated sequences, each shown as just one failure. The last
* one gets a different error code because the sequence is
* interrupted by the end of the string instead of another
* character, so that if the string were a prefix of a longer
* chunk of data then that would not _necessarily_ indicate an
* error */
DOTEST("\xC2\xE0\xA0\xF0\x90\x80\xF8\x88\x80\x80\xFC\x84\x80\x80\x80",
0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD);
0xFFFD, DUTF8_TRUNCATED_SEQUENCE,
0xFFFD, DUTF8_TRUNCATED_SEQUENCE,
0xFFFD, DUTF8_TRUNCATED_SEQUENCE,
0xFFFD, DUTF8_TRUNCATED_SEQUENCE,
0xFFFD, DUTF8_E_OUT_OF_DATA);
DOTEST("\xC2 \xE0\xA0 \xF0\x90\x80 \xF8\x88\x80\x80 \xFC\x84\x80\x80\x80",
0xFFFD, 0x0020, 0xFFFD, 0x0020, 0xFFFD, 0x0020, 0xFFFD, 0x0020,
0xFFFD);
0xFFFD, DUTF8_TRUNCATED_SEQUENCE,
0x0020,
0xFFFD, DUTF8_TRUNCATED_SEQUENCE,
0x0020,
0xFFFD, DUTF8_TRUNCATED_SEQUENCE,
0x0020,
0xFFFD, DUTF8_TRUNCATED_SEQUENCE,
0x0020,
0xFFFD, DUTF8_E_OUT_OF_DATA);
/* Illegal bytes */
DOTEST("\xFE\xFF", 0xFFFD, 0xFFFD);
DOTEST("\xFE\xFF", 0xFFFD, DUTF8_ILLEGAL_BYTE, 0xFFFD, DUTF8_ILLEGAL_BYTE);
/* Overlong sequences */
DOTEST("\xC1\xBF", 0xFFFD);
DOTEST("\xE0\x9F\xBF", 0xFFFD);
DOTEST("\xF0\x8F\xBF\xBF", 0xFFFD);
DOTEST("\xF8\x87\xBF\xBF\xBF", 0xFFFD);
DOTEST("\xFC\x83\xBF\xBF\xBF\xBF", 0xFFFD);
DOTEST("\xC1\xBF", 0xFFFD, DUTF8_OVERLONG_ENCODING);
DOTEST("\xE0\x9F\xBF", 0xFFFD, DUTF8_OVERLONG_ENCODING);
DOTEST("\xF0\x8F\xBF\xBF", 0xFFFD, DUTF8_OVERLONG_ENCODING);
DOTEST("\xF8\x87\xBF\xBF\xBF", 0xFFFD, DUTF8_OVERLONG_ENCODING);
DOTEST("\xFC\x83\xBF\xBF\xBF\xBF", 0xFFFD, DUTF8_OVERLONG_ENCODING);
DOTEST("\xC0\x80", 0xFFFD);
DOTEST("\xE0\x80\x80", 0xFFFD);
DOTEST("\xF0\x80\x80\x80", 0xFFFD);
DOTEST("\xF8\x80\x80\x80\x80", 0xFFFD);
DOTEST("\xFC\x80\x80\x80\x80\x80", 0xFFFD);
DOTEST("\xC0\x80", 0xFFFD, DUTF8_OVERLONG_ENCODING);
DOTEST("\xE0\x80\x80", 0xFFFD, DUTF8_OVERLONG_ENCODING);
DOTEST("\xF0\x80\x80\x80", 0xFFFD, DUTF8_OVERLONG_ENCODING);
DOTEST("\xF8\x80\x80\x80\x80", 0xFFFD, DUTF8_OVERLONG_ENCODING);
DOTEST("\xFC\x80\x80\x80\x80\x80", 0xFFFD, DUTF8_OVERLONG_ENCODING);
printf("%d tests %d passed", ntest, npass);
if (npass < ntest) {

View File

@ -5,10 +5,11 @@
#include "putty.h"
#include "misc.h"
size_t decode_utf8_to_wchar(BinarySource *src, wchar_t *out)
size_t decode_utf8_to_wchar(BinarySource *src, wchar_t *out,
DecodeUTF8Failure *err)
{
size_t outlen = 0;
unsigned wc = decode_utf8(src);
unsigned wc = decode_utf8(src, err);
if (sizeof(wchar_t) > 2 || wc < 0x10000) {
out[outlen++] = wc;
} else {

View File

@ -40,7 +40,13 @@ char *utf8_unknown_char(ptrlen input)
BinarySource_BARE_INIT_PL(src, input);
for (size_t nchars = 0; get_avail(src); nchars++) {
unsigned c = decode_utf8(src);
DecodeUTF8Failure err;
unsigned c = decode_utf8(src, &err);
if (err != DUTF8_SUCCESS)
return dupprintf(
"cannot normalise this string: UTF-8 decoding error "
"at character position %"SIZEu", byte position %"SIZEu": %s",
nchars, src->pos, decode_utf8_error_strings[err]);
if (!known(c))
return dupprintf(
"cannot stably normalise this string: code point %04X "

View File

@ -295,7 +295,7 @@ strbuf *utf8_to_nfc(ptrlen input)
ucharbuf *inbuf = ucharbuf_new();
while (get_avail(src))
ucharbuf_append(inbuf, decode_utf8(src));
ucharbuf_append(inbuf, decode_utf8(src, NULL));
ucharbuf *outbuf = nfc(inbuf);

View File

@ -1365,7 +1365,7 @@ int mb_to_wc(int codepage, int flags, const char *mbstr, int mblen,
while (get_avail(src)) {
wchar_t wcbuf[2];
size_t nwc = decode_utf8_to_wchar(src, wcbuf);
size_t nwc = decode_utf8_to_wchar(src, wcbuf, NULL);
for (size_t i = 0; i < nwc; i++) {
if (remaining > 0) {