mirror of
https://git.tartarus.org/simon/putty.git
synced 2025-07-18 19:41:01 -05:00
decode_utf8: add an enumeration of failure reasons.
Now you can optionally get back an enum value indicating whether the character was successfully decoded, or whether U+FFFD was substituted due to some kind of problem, and if the latter, what problem. For a start, this allows distinguishing 'real' U+FFFD (encoded legitimately in the input) from one invented by the decoder. Also, it allows the recipient of the decode to treat failures differently, either by passing on a useful error report to the user (as utf8_unknown_char now does) or by doing something special. In particular, there are two distinct error codes for a truncated UTF-8 encoding, depending on whether it was truncated by the end of the input or by encountering a non-continuation byte. The former code means that the string is not legal UTF-8 _as it is_, but doesn't rule out it being a (bytewise) prefix of a legal UTF-8 string - so if a client is receiving UTF-8 data a byte at a time, they can treat that error code specially and not make it a fatal error.
This commit is contained in:
24
misc.h
24
misc.h
@ -258,14 +258,34 @@ char *encode_wide_string_as_utf8(const wchar_t *wstr);
|
||||
/* Decode a single UTF-8 character. Returns U+FFFD for any of the
|
||||
* illegal cases. If the source is empty, returns L'\0' (and sets the
|
||||
* error indicator on the source, of course). */
|
||||
unsigned decode_utf8(BinarySource *src);
|
||||
/*
 * X-macro list of every outcome the UTF-8 decoder can report. Each
 * entry has the form X(enum symbol, description string); expand the
 * list with a two-argument macro to generate definitions that stay in
 * step with each other (the DecodeUTF8Failure enum is built this way).
 *
 * Two entries describe a truncated multibyte encoding:
 * DUTF8_E_OUT_OF_DATA means the input ended part way through a
 * character, so the data may still be a bytewise prefix of legal
 * UTF-8; DUTF8_TRUNCATED_SEQUENCE means the character was cut short
 * by a non-continuation byte.
 */
#define DECODE_UTF8_FAILURE_LIST(X) \
|
||||
X(DUTF8_SUCCESS, "success") \
|
||||
X(DUTF8_SPURIOUS_CONTINUATION, "spurious continuation byte") \
|
||||
X(DUTF8_ILLEGAL_BYTE, "illegal UTF-8 byte value") \
|
||||
X(DUTF8_E_OUT_OF_DATA, "unfinished multibyte encoding at end of string") \
|
||||
X(DUTF8_TRUNCATED_SEQUENCE, "multibyte encoding interrupted by " \
|
||||
"non-continuation byte") \
|
||||
X(DUTF8_OVERLONG_ENCODING, "overlong encoding") \
|
||||
X(DUTF8_ENCODED_SURROGATE, "Unicode surrogate character encoded in " \
|
||||
"UTF-8") \
|
||||
X(DUTF8_CODE_POINT_TOO_BIG, "code point outside the Unicode range") \
|
||||
/* end of list */
|
||||
/*
 * Result code reported by the UTF-8 decoder. The enumerators are
 * generated from DECODE_UTF8_FAILURE_LIST, one per entry and in list
 * order, so DUTF8_SUCCESS is the first of them.
 */
typedef enum DecodeUTF8Failure {
|
||||
#define ENUM_DECL(sym, string) sym,
|
||||
DECODE_UTF8_FAILURE_LIST(ENUM_DECL)
|
||||
#undef ENUM_DECL
|
||||
/* Not a result code: the count of codes above, usable as an array size. */
DUTF8_N_FAILURE_CODES
|
||||
} DecodeUTF8Failure;
|
||||
/*
 * Decode a single UTF-8 character from src. When the character cannot
 * be decoded, U+FFFD is returned and the reason is reported through
 * err; DUTF8_SUCCESS indicates the returned character (even a U+FFFD)
 * was legitimately encoded in the input.
 * NOTE(review): the change description calls the report optional, so
 * err may presumably be NULL - confirm against the definition.
 */
unsigned decode_utf8(BinarySource *src, DecodeUTF8Failure *err);
|
||||
/*
 * Table of strings with one slot per DecodeUTF8Failure code.
 * NOTE(review): presumably indexed by the enum value and filled with
 * the description strings from DECODE_UTF8_FAILURE_LIST - the
 * definition is not visible here; confirm.
 */
extern const char *const decode_utf8_error_strings[DUTF8_N_FAILURE_CODES];
|
||||
|
||||
/* Decode a single UTF-8 character to an output buffer of the
|
||||
* platform's wchar_t. May write a pair of surrogates if
|
||||
* sizeof(wchar_t) == 2, assuming that in that case the wide string is
|
||||
* encoded in UTF-16. Otherwise, writes one character. Returns the
|
||||
* number written. */
|
||||
size_t decode_utf8_to_wchar(BinarySource *src, wchar_t *out);
|
||||
/* NOTE(review): err presumably receives the same DecodeUTF8Failure
 * report that decode_utf8 provides - confirm against the definition. */
size_t decode_utf8_to_wchar(BinarySource *src, wchar_t *out,
|
||||
DecodeUTF8Failure *err);
|
||||
|
||||
/* Normalise a UTF-8 string into Normalisation Form C. */
|
||||
/* NOTE(review): ownership of the returned strbuf is not stated here -
 * presumably the caller is responsible for freeing it; confirm. */
strbuf *utf8_to_nfc(ptrlen input);
|
||||
|
Reference in New Issue
Block a user