decode_utf8: add an enumeration of failure reasons.

Now you can optionally get back an enum value indicating whether the character was successfully decoded, or whether U+FFFD was substituted due to some kind of problem, and if the latter, what problem. For a start, this allows distinguishing 'real' U+FFFD (encoded legitimately in the input) from one invented by the decoder. Also, it allows the recipient of the decode to treat failures differently, either by passing on a useful error report to the user (as utf8_unknown_char now does) or by doing something special. In particular, there are two distinct error codes for a truncated UTF-8 encoding, depending on whether it was truncated by the end of the input or by encountering a non-continuation byte. The former code means that the string is not legal UTF-8 _as it is_, but doesn't rule out it being a (bytewise) prefix of a legal UTF-8 string - so if a client is receiving UTF-8 data a byte at a time, they can treat that error code specially and not make it a fatal error.
2025-07-18 19:41:01 -05:00 · 2023-02-17 16:39:09 +00:00
parent 9d308b39da
commit 9e01de7c2b
6 changed files with 147 additions and 45 deletions
--- a/misc.h
+++ b/misc.h
@@ -258,14 +258,34 @@ char *encode_wide_string_as_utf8(const wchar_t *wstr);
 /* Decode a single UTF-8 character. Returns U+FFFD for any of the
 * illegal cases. If the source is empty, returns L'\0' (and sets the
 * error indicator on the source, of course). */
-unsigned decode_utf8(BinarySource *src);
+#define DECODE_UTF8_FAILURE_LIST(X) \
+    X(DUTF8_SUCCESS, "success")                                      \
+    X(DUTF8_SPURIOUS_CONTINUATION, "spurious continuation byte")     \
+    X(DUTF8_ILLEGAL_BYTE, "illegal UTF-8 byte value")                \
+    X(DUTF8_E_OUT_OF_DATA, "unfinished multibyte encoding at end of string") \
+    X(DUTF8_TRUNCATED_SEQUENCE, "multibyte encoding interrupted by " \
+      "non-continuation byte")                                       \
+    X(DUTF8_OVERLONG_ENCODING, "overlong encoding")                  \
+    X(DUTF8_ENCODED_SURROGATE, "Unicode surrogate character encoded in " \
+      "UTF-8")                                                       \
+    X(DUTF8_CODE_POINT_TOO_BIG, "code point outside the Unicode range") \
+    /* end of list */
+typedef enum DecodeUTF8Failure {
+    #define ENUM_DECL(sym, string) sym,
+    DECODE_UTF8_FAILURE_LIST(ENUM_DECL)
+    #undef ENUM_DECL
+    DUTF8_N_FAILURE_CODES
+} DecodeUTF8Failure;
+unsigned decode_utf8(BinarySource *src, DecodeUTF8Failure *err);
+extern const char *const decode_utf8_error_strings[DUTF8_N_FAILURE_CODES];

 /* Decode a single UTF-8 character to an output buffer of the
 * platform's wchar_t. May write a pair of surrogates if
 * sizeof(wchar_t) == 2, assuming that in that case the wide string is
 * encoded in UTF-16. Otherwise, writes one character. Returns the
 * number written. */
-size_t decode_utf8_to_wchar(BinarySource *src, wchar_t *out);
+size_t decode_utf8_to_wchar(BinarySource *src, wchar_t *out,
+                            DecodeUTF8Failure *err);

 /* Normalise a UTF-8 string into Normalisation Form C. */
 strbuf *utf8_to_nfc(ptrlen input);