1
0
mirror of https://git.tartarus.org/simon/putty.git synced 2025-07-18 19:41:01 -05:00

decode_utf8: add an enumeration of failure reasons.

Now you can optionally get back an enum value indicating whether the
character was successfully decoded, or whether U+FFFD was substituted
due to some kind of problem, and if the latter, what problem.

For a start, this allows distinguishing 'real' U+FFFD (encoded
legitimately in the input) from one invented by the decoder. Also, it
allows the recipient of the decode to treat failures differently,
either by passing on a useful error report to the user (as
utf8_unknown_char now does) or by doing something special.

In particular, there are two distinct error codes for a truncated
UTF-8 encoding, depending on whether it was truncated by the end of
the input or by encountering a non-continuation byte. The former code
means that the string is not legal UTF-8 _as it is_, but doesn't rule
out it being a (bytewise) prefix of a legal UTF-8 string - so if a
client is receiving UTF-8 data a byte at a time, they can treat that
error code specially and not make it a fatal error.
This commit is contained in:
Simon Tatham
2023-02-17 16:39:09 +00:00
parent 9d308b39da
commit 9e01de7c2b
6 changed files with 147 additions and 45 deletions

24
misc.h
View File

@ -258,14 +258,34 @@ char *encode_wide_string_as_utf8(const wchar_t *wstr);
/* Decode a single UTF-8 character. Returns U+FFFD for any of the
* illegal cases. If the source is empty, returns L'\0' (and sets the
* error indicator on the source, of course). */
unsigned decode_utf8(BinarySource *src);
#define DECODE_UTF8_FAILURE_LIST(X) \
X(DUTF8_SUCCESS, "success") \
X(DUTF8_SPURIOUS_CONTINUATION, "spurious continuation byte") \
X(DUTF8_ILLEGAL_BYTE, "illegal UTF-8 byte value") \
X(DUTF8_E_OUT_OF_DATA, "unfinished multibyte encoding at end of string") \
X(DUTF8_TRUNCATED_SEQUENCE, "multibyte encoding interrupted by " \
"non-continuation byte") \
X(DUTF8_OVERLONG_ENCODING, "overlong encoding") \
X(DUTF8_ENCODED_SURROGATE, "Unicode surrogate character encoded in " \
"UTF-8") \
X(DUTF8_CODE_POINT_TOO_BIG, "code point outside the Unicode range") \
/* end of list */
typedef enum DecodeUTF8Failure {
#define ENUM_DECL(sym, string) sym,
DECODE_UTF8_FAILURE_LIST(ENUM_DECL)
#undef ENUM_DECL
DUTF8_N_FAILURE_CODES
} DecodeUTF8Failure;
unsigned decode_utf8(BinarySource *src, DecodeUTF8Failure *err);
extern const char *const decode_utf8_error_strings[DUTF8_N_FAILURE_CODES];
/* Decode a single UTF-8 character to an output buffer of the
* platform's wchar_t. May write a pair of surrogates if
* sizeof(wchar_t) == 2, assuming that in that case the wide string is
* encoded in UTF-16. Otherwise, writes one character. Returns the
* number written. */
size_t decode_utf8_to_wchar(BinarySource *src, wchar_t *out);
size_t decode_utf8_to_wchar(BinarySource *src, wchar_t *out,
DecodeUTF8Failure *err);
/* Normalise a UTF-8 string into Normalisation Form C. */
strbuf *utf8_to_nfc(ptrlen input);