Rework Unicode conversion APIs to use a BinarySink.

The previous mb_to_wc and wc_to_mb had horrible and also buggy APIs. This commit introduces a fresh pair of functions to replace them, which generate output by writing to a BinarySink. So it's now up to the caller to decide whether it wants the output written to a fixed-size buffer with overflow checking (via buffer_sink), or dynamically allocated, or even written directly to some other output channel. The old functions are removed in this commit, and their direct callers are converted: for example, console_read_line_to_strbuf in windows/console.c now uses put_mb_to_wc and put_wc_to_mb, and get_unitab calls MultiByteToWideChar directly. What was wrong with the old APIs: they had that awkward undocumented Windows-specific 'flags' parameter that I described in the previous commit and took out of the dup_X_to_Y wrappers. But much worse, the semantics for buffer overflow were not just undocumented but actually inconsistent. dup_wc_to_mb() in utils assumed that the underlying wc_to_mb would fill the buffer nearly full and return the size of data it wrote. In fact, this was untrue in the case where wc_to_mb called WideCharToMultiByte: that returns straight-up failure, setting the Windows error code to ERROR_INSUFFICIENT_BUFFER. It _does_ partially fill the output buffer, but doesn't tell you how much it wrote! What's wrong with the new API: it's a bit awkward to write a sequence of wchar_t in native byte order to a byte-oriented BinarySink, so people using put_mb_to_wc directly have to do some annoying pointer casting. But I think that's less horrible than the previous APIs. Another change: in the new put_wc_to_mb API, defchr can be "", but not NULL.
2025-07-01 11:32:48 -05:00 · 2024-09-24 08:18:48 +01:00
parent 32b8da1177
commit 4f756d2a4d
10 changed files with 267 additions and 239 deletions
--- a/windows/console.c
+++ b/windows/console.c
@ -221,7 +221,7 @@ static bool console_read_line_to_strbuf(ConsoleIO *conio, bool echo,
        }

        if (conio->utf8) {
-            wchar_t wbuf[4096];
+            wchar_t wbuf[4097];
            size_t wlen;

            if (conio->hin_is_console) {
@ -245,17 +245,15 @@ static bool console_read_line_to_strbuf(ConsoleIO *conio, bool echo,
                if (!ReadFile(conio->hin, buf, lenof(buf), &nread, NULL))
                    goto out;

-                wlen = mb_to_wc(CP_ACP, 0, buf, nread, wbuf, lenof(wbuf));
+                buffer_sink bs[1];
+                buffer_sink_init(bs, wbuf, sizeof(wbuf) - sizeof(wchar_t));
+                put_mb_to_wc(bs, CP_ACP, buf, nread);
+                assert(!bs->overflowed);
+                wlen = (wchar_t *)bs->out - wbuf;
                smemclr(buf, sizeof(buf));
            }

-            /* Allocate the maximum space in the strbuf that might be
-             * needed for this data */
-            size_t oldlen = sb->len, maxout = wlen * 4;
-            void *outptr = strbuf_append(sb, maxout);
-            size_t newlen = oldlen + wc_to_mb(CP_UTF8, 0, wbuf, wlen,
-                                              outptr, maxout, NULL);
-            strbuf_shrink_to(sb, newlen);
+            put_wc_to_mb(sb, CP_UTF8, wbuf, wlen, "");
            smemclr(wbuf, sizeof(wbuf));
        } else {
            /*
--- a/windows/unicode.c
+++ b/windows/unicode.c
@ -1232,8 +1232,7 @@ void get_unitab(int codepage, wchar_t *unitab, int ftype)
        for (i = 0; i < max; i++) {
            tbuf[0] = i;

-            if (mb_to_wc(codepage, flg, tbuf, 1, unitab + i, 1)
-                != 1)
+            if (MultiByteToWideChar(codepage, flg, tbuf, 1, unitab+i, 1) != 1)
                unitab[i] = 0xFFFD;
        }
    } else {
@ -1245,151 +1244,192 @@ void get_unitab(int codepage, wchar_t *unitab, int ftype)
    }
 }

-int wc_to_mb(int codepage, int flags, const wchar_t *wcstr, int wclen,
-             char *mbstr, int mblen, const char *defchr)
+bool BinarySink_put_wc_to_mb(
+    BinarySink *bs, int codepage, const wchar_t *wcstr, int wclen,
+    const char *defchr)
 {
+    if (!wclen)
+        return true;
+
    reverse_mapping *rmap = get_reverse_mapping(codepage);

    if (rmap) {
+        size_t defchr_len = 0;
+        bool defchr_len_known = false;
+
        /* Do this by array lookup if we can. */
-        if (wclen < 0) {
-            for (wclen = 0; wcstr[wclen++] ;);   /* will include the NUL */
-        }
-        char *p;
-        int i;
-        for (p = mbstr, i = 0; i < wclen; i++) {
+        for (size_t i = 0; i < wclen; i++) {
            wchar_t ch = wcstr[i];
            int by;
-            const char *p1;
+            const char *blk;

-            #define WRITECH(chr) do             \
-            {                                   \
-                assert(p - mbstr < mblen);      \
-                *p++ = (char)(chr);             \
-            } while (0)
-
-            if ((p1 = rmap->blocks[(ch >> 8) & 0xFF]) != NULL &&
-                (by = p1[ch & 0xFF]) != '\0')
-                WRITECH(by);
+            if ((blk = rmap->blocks[(ch >> 8) & 0xFF]) != NULL &&
+                (by = blk[ch & 0xFF]) != '\0')
+                put_byte(bs, by);
            else if (ch < 0x80)
-                WRITECH(ch);
-            else if (defchr)
-                for (const char *q = defchr; *q; q++)
-                    WRITECH(*q);
-#if 1
-            else
-                WRITECH('.');
-#endif
-
-            #undef WRITECH
+                put_byte(bs, ch);
+            else if (defchr) {
+                if (!defchr_len_known) {
+                    defchr_len = strlen(defchr);
+                    defchr_len_known = true;
+                }
+                put_data(bs, defchr, defchr_len);
+            }
        }
-        return p - mbstr;
-    } else {
-        int defused, ret;
-        ret = WideCharToMultiByte(codepage, flags, wcstr, wclen,
-                                  mbstr, mblen, defchr, &defused);
-        if (ret)
-            return ret;
+        return true;
+    }
+
+    {
+        char internalbuf[2048];
+        char *allocbuf = NULL;
+        size_t allocsize = 0;
+        char *currbuf = internalbuf;
+        size_t currsize = lenof(internalbuf);
+        bool success;
+
+        BOOL defused = false;
+        BOOL *defusedp = &defused;
+
+        if (codepage == CP_UTF8 || !defchr[0]) {
+            /*
+             * The Win32 API spec says that defchr and defused must be
+             * NULL when doing a UTF-8 conversion, on pain of
+             * ERROR_INVALID_PARAMETER.
+             *
+             * Also, translate defchr="" on input to NULL in the Win32
+             * API.
+             */
+            defchr = NULL;
+            defusedp = NULL;
+        }
+
+        while (true) {
+            int ret = WideCharToMultiByte(
+                codepage, 0, wcstr, wclen, currbuf, currsize,
+                defchr, defusedp);
+
+            if (ret) {
+                put_data(bs, currbuf, ret);
+                success = true;
+                break;
+            } else if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+                success = false;
+                break;
+            } else {
+                sgrowarray_nm(allocbuf, allocsize, currsize);
+                currbuf = allocbuf;
+                currsize = allocsize;
+            }
+        }
+
+        smemclr(allocbuf, allocsize);
+        if (success)
+            return true;
+    }

 #ifdef LEGACY_WINDOWS
-        /*
-         * Fallback for legacy platforms too old to support UTF-8: if
-         * the codepage is UTF-8, we can do the translation ourselves.
-         */
-        if (codepage == CP_UTF8 && mblen > 0 && wclen > 0) {
-            buffer_sink bs[1];
-            buffer_sink_init(bs, mbstr, mblen);
-
-            while (wclen > 0) {
-                unsigned long wc = (wclen--, *wcstr++);
-                if (wclen > 0 && IS_SURROGATE_PAIR(wc, *wcstr)) {
-                    wc = FROM_SURROGATES(wc, *wcstr);
-                    wclen--, wcstr++;
-                }
-
-                const char *prev_ptr = bs->out;
-                put_utf8_char(bs, wc);
-                if (bs->overflowed)
-                    return prev_ptr - mbstr;
+    /*
+     * Fallback for legacy platforms too old to support UTF-8: if
+     * the codepage is UTF-8, we can do the translation ourselves.
+     */
+    if (codepage == CP_UTF8 && wclen > 0) {
+        while (wclen > 0) {
+            unsigned long wc = (wclen--, *wcstr++);
+            if (wclen > 0 && IS_SURROGATE_PAIR(wc, *wcstr)) {
+                wc = FROM_SURROGATES(wc, *wcstr);
+                wclen--, wcstr++;
            }
-
-            return bs->out - mbstr;
+            put_utf8_char(bs, wc);
        }
+
+        return true;
+    }
 #endif

-        /* No other fallbacks are available */
-        return 0;
-    }
+    /* No other fallbacks are available */
+    return false;
 }

-int mb_to_wc(int codepage, int flags, const char *mbstr, int mblen,
-             wchar_t *wcstr, int wclen)
+bool BinarySink_put_mb_to_wc(
+    BinarySink *bs, int codepage, const char *mbstr, int mblen)
 {
+    if (!mblen)
+        return true;
+
    if (codepage >= 65536) {
        /* Character set not known to Windows, so we'll have to
         * translate it ourself */
        size_t index = codepage - 65536;
        if (index >= lenof(cp_list))
-            return 0;
+            return false;
        const struct cp_list_item *cp = &cp_list[index];
        if (!cp->cp_table)
-            return 0;
+            return false;

-        size_t remaining = wclen;
-        wchar_t *p = wcstr;
        unsigned tablebase = 256 - cp->cp_size;

        while (mblen > 0) {
            mblen--;
            unsigned c = 0xFF & *mbstr++;
            wchar_t wc = (c < tablebase ? c : cp->cp_table[c - tablebase]);
-            if (remaining > 0) {
-                remaining--;
-                *p++ = wc;
+            put_data(bs, &wc, sizeof(wc));
+        }
+
+        return true;
+    }
+
+    {
+        wchar_t internalbuf[1024];
+        wchar_t *allocbuf = NULL;
+        size_t allocsize = 0;
+        wchar_t *currbuf = internalbuf;
+        size_t currsize = lenof(internalbuf);
+        bool success;
+
+        while (true) {
+            int ret = MultiByteToWideChar(
+                codepage, 0, mbstr, mblen, currbuf, currsize);
+
+            if (ret > 0) {
+                put_data(bs, currbuf, ret * sizeof(wchar_t));
+                success = true;
+                break;
+            } else if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+                success = false;
+                break;
            } else {
-                return p - wcstr;
+                sgrowarray_nm(allocbuf, allocsize, currsize);
+                currbuf = allocbuf;
+                currsize = allocsize;
            }
        }

-        return p - wcstr;
+        smemclr(allocbuf, allocsize * sizeof(wchar_t));
+        if (success)
+            return true;
    }

-    int ret = MultiByteToWideChar(codepage, flags, mbstr, mblen, wcstr, wclen);
-    if (ret)
-        return ret;
-
 #ifdef LEGACY_WINDOWS
    /*
     * Fallback for legacy platforms too old to support UTF-8: if the
     * codepage is UTF-8, we can do the translation ourselves.
     */
-    if (codepage == CP_UTF8 && mblen > 0 && wclen > 0) {
+    if (codepage == CP_UTF8 && mblen > 0) {
        BinarySource src[1];
        BinarySource_BARE_INIT(src, mbstr, mblen);

-        size_t remaining = wclen;
-        wchar_t *p = wcstr;
-
        while (get_avail(src)) {
            wchar_t wcbuf[2];
            size_t nwc = decode_utf8_to_wchar(src, wcbuf, NULL);
-
-            for (size_t i = 0; i < nwc; i++) {
-                if (remaining > 0) {
-                    remaining--;
-                    *p++ = wcbuf[i];
-                } else {
-                    return p - wcstr;
-                }
-            }
+            put_data(bs, wcbuf, nwc * sizeof(wchar_t));
        }

-        return p - wcstr;
+        return true;
    }
 #endif

    /* No other fallbacks are available */
-    return 0;
+    return false;
 }

 bool is_dbcs_leadbyte(int codepage, char byte)