Make decode_utf8() read from a BinarySource.

This enables it to handle data that isn't presented as a NUL-terminated string. In particular, the NUL byte can appear _within_ the string and be correctly translated to the NUL wide character. So I've been able to remove the awkwardness in the test rig of having to include the terminating NUL in every test to ensure NUL has been tested, and instead, insert a single explicit test for it. Similarly to the previous commit, the simplification at the (one) call site gives me a strong feeling of 'this is what the API should have been all along'!
2025-06-30 11:02:48 -05:00 · 2022-11-09 19:01:04 +00:00
parent d89f2bfc55
commit 69e217d23a
4 changed files with 41 additions and 35 deletions
--- a/misc.h
+++ b/misc.h
@@ -254,15 +254,16 @@ unsigned smemeq(const void *av, const void *bv, size_t len);
 char *encode_wide_string_as_utf8(const wchar_t *wstr);

 /* Decode a single UTF-8 character. Returns U+FFFD for any of the
- * illegal cases. */
-unsigned long decode_utf8(const char **utf8);
+ * illegal cases. If the source is empty, returns L'\0' (and sets the
+ * error indicator on the source, of course). */
+unsigned decode_utf8(BinarySource *src);

 /* Decode a single UTF-8 character to an output buffer of the
 * platform's wchar_t. May write a pair of surrogates if
 * sizeof(wchar_t) == 2, assuming that in that case the wide string is
 * encoded in UTF-16. Otherwise, writes one character. Returns the
 * number written. */
-size_t decode_utf8_to_wchar(const char **utf8, wchar_t *out);
+size_t decode_utf8_to_wchar(BinarySource *src, wchar_t *out);

 /* Write a string out in C string-literal format. */
 void write_c_string_literal(FILE *fp, ptrlen str);
--- a/utils/decode_utf8.c
+++ b/utils/decode_utf8.c
@@ -5,9 +5,11 @@
 #include "putty.h"
 #include "misc.h"

-unsigned long decode_utf8(const char **utf8)
+unsigned decode_utf8(BinarySource *src)
 {
-    unsigned char c = (unsigned char)*(*utf8)++;
+    /* If the source has no byte available, this will return 0, which
+     * we'll return immediately and is a reasonable error return anyway */
+    unsigned char c = get_byte(src);

    /* One-byte cases. */
    if (c < 0x80) {
@@ -33,10 +35,13 @@ unsigned long decode_utf8(const char **utf8)
    }

    while (ncont-- > 0) {
-        unsigned char cont = (unsigned char)**utf8;
-        if (!(0x80 <= cont && cont < 0xC0))
+        if (!get_avail(src))
+            return 0xFFFD;  /* sequence terminated by end of data */
+        unsigned char cont = get_byte(src);
+        if (!(0x80 <= cont && cont < 0xC0)) {
+            BinarySource_REWIND_TO(src, src->pos - 1);
            return 0xFFFD;             /* short sequence */
-        (*utf8)++;
+        }

        wc = (wc << 6) | (cont & 0x3F);
    }
@@ -54,23 +59,28 @@ unsigned long decode_utf8(const char **utf8)

 #include <stdio.h>

-bool dotest(const char *file, int line, const char *input,
+void out_of_memory(void)
+{
+    fprintf(stderr, "out of memory!\n");
+    exit(2);
+}
+
+bool dotest(const char *file, int line, const char *input, size_t ninput,
            const unsigned long *chars, size_t nchars)
 {
-    const char *start = input;
-    const char *end = input + strlen(input) + 1;
+    BinarySource src[1];
+    BinarySource_BARE_INIT(src, input, ninput);
    size_t noutput = 0;

    printf("%s:%d: test start\n", file, line);

-    while (input < end) {
-        const char *before = input;
-        unsigned long wc = decode_utf8(&input);
+    while (get_avail(src)) {
+        size_t before = src->pos;
+        unsigned long wc = decode_utf8(src);

-        printf("%s:%d in+%"SIZEu" out+%"SIZEu":",
-               file, line, (size_t)(before-start), noutput);
-        while (before < input)
-            printf(" %02x", (unsigned)(unsigned char)(*before++));
+        printf("%s:%d in+%"SIZEu" out+%"SIZEu":", file, line, before, noutput);
+        while (before < src->pos)
+            printf(" %02x", (unsigned)(unsigned char)(input[before++]));
        printf(" -> U-%08lx\n", wc);

        if (noutput >= nchars) {
@@ -97,9 +107,10 @@ bool dotest(const char *file, int line, const char *input,
 }

 #define DOTEST(input, ...) do {                                         \
-        static const unsigned long chars[] = { __VA_ARGS__, 0 };        \
+        static const unsigned long chars[] = { __VA_ARGS__ };           \
        ntest++;                                                        \
-        if (dotest(__FILE__, __LINE__, input, chars, lenof(chars)))     \
+        if (dotest(__FILE__, __LINE__, input, sizeof(input)-1,          \
+                   chars, lenof(chars)))                                \
            npass++;                                                    \
    } while (0)

@@ -110,8 +121,8 @@ int main(void)
    DOTEST("\xCE\xBA\xE1\xBD\xB9\xCF\x83\xCE\xBC\xCE\xB5",
           0x03BA, 0x1F79, 0x03C3, 0x03BC, 0x03B5);

-    /* First sequence of each length (not counting NUL, which is
-     * tested anyway by the string-termination handling in every test) */
+    /* First sequence of each length */
+    DOTEST("\x00", 0x0000);
    DOTEST("\xC2\x80", 0x0080);
    DOTEST("\xE0\xA0\x80", 0x0800);
    DOTEST("\xF0\x90\x80\x80", 0x00010000);
--- a/utils/decode_utf8_to_wchar.c
+++ b/utils/decode_utf8_to_wchar.c
@@ -5,10 +5,10 @@
 #include "putty.h"
 #include "misc.h"

-size_t decode_utf8_to_wchar(const char **utf8, wchar_t *out)
+size_t decode_utf8_to_wchar(BinarySource *src, wchar_t *out)
 {
    size_t outlen = 0;
-    unsigned wc = decode_utf8(utf8);
+    unsigned wc = decode_utf8(src);
    if (sizeof(wchar_t) > 2 || wc < 0x10000) {
        out[outlen++] = wc;
    } else {
--- a/windows/unicode.c
+++ b/windows/unicode.c
@@ -1357,18 +1357,15 @@ int mb_to_wc(int codepage, int flags, const char *mbstr, int mblen,
     * codepage is UTF-8, we can do the translation ourselves.
     */
    if (codepage == CP_UTF8 && mblen > 0 && wclen > 0) {
+        BinarySource src[1];
+        BinarySource_BARE_INIT(src, mbstr, mblen);
+
        size_t remaining = wclen;
        wchar_t *p = wcstr;

-        while (mblen > 0) {
-            char utfbuf[7];
-            int thissize = mblen < 6 ? mblen : 6;
-            memcpy(utfbuf, mbstr, thissize);
-            utfbuf[thissize] = '\0';
-
-            const char *utfptr = utfbuf;
+        while (get_avail(src)) {
            wchar_t wcbuf[2];
-            size_t nwc = decode_utf8_to_wchar(&utfptr, wcbuf);
+            size_t nwc = decode_utf8_to_wchar(src, wcbuf);

            for (size_t i = 0; i < nwc; i++) {
                if (remaining > 0) {
@@ -1378,9 +1375,6 @@ int mb_to_wc(int codepage, int flags, const char *mbstr, int mblen,
                    return p - wcstr;
                }
            }
-
-            mbstr += (utfptr - utfbuf);
-            mblen -= (utfptr - utfbuf);
        }

        return p - wcstr;