From 4f756d2a4db767866af254a5bb733dcb89c27c88 Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Tue, 24 Sep 2024 08:18:48 +0100 Subject: [PATCH] Rework Unicode conversion APIs to use a BinarySink. The previous mb_to_wc and wc_to_mb had horrible and also buggy APIs. This commit introduces a fresh pair of functions to replace them, which generate output by writing to a BinarySink. So it's now up to the caller to decide whether it wants the output written to a fixed-size buffer with overflow checking (via buffer_sink), or dynamically allocated, or even written directly to some other output channel. Nothing uses the new functions yet. I plan to migrate things over in upcoming commits. What was wrong with the old APIs: they had that awkward undocumented Windows-specific 'flags' parameter that I described in the previous commit and took out of the dup_X_to_Y wrappers. But much worse, the semantics for buffer overflow were not just undocumented but actually inconsistent. dup_wc_to_mb() in utils assumed that the underlying wc_to_mb would fill the buffer nearly full and return the size of data it wrote. In fact, this was untrue in the case where wc_to_mb called WideCharToMultiByte: that returns straight-up failure, setting the Windows error code to ERROR_INSUFFICIENT_BUFFER. It _does_ partially fill the output buffer, but doesn't tell you how much it wrote! What's wrong with the new API: it's a bit awkward to write a sequence of wchar_t in native byte order to a byte-oriented BinarySink, so people using put_mb_to_wc directly have to do some annoying pointer casting. But I think that's less horrible than the previous APIs. Another change: in the new API for wc_to_mb, defchr can be "", but not NULL. 
--- marshal.h | 12 +++ putty.h | 5 +- terminal/terminal.c | 39 +++----- unix/unicode.c | 92 ++++++++++-------- unix/unifont.c | 28 +++--- unix/window.c | 43 ++++----- utils/dup_mb_to_wc.c | 24 ++--- utils/dup_wc_to_mb.c | 25 +---- windows/console.c | 16 ++-- windows/unicode.c | 222 +++++++++++++++++++++++++------------------ 10 files changed, 267 insertions(+), 239 deletions(-) diff --git a/marshal.h b/marshal.h index ab19ff0a..59b9ed2a 100644 --- a/marshal.h +++ b/marshal.h @@ -160,6 +160,12 @@ struct BinarySink { #define put_utf8_char(bs, c) \ BinarySink_put_utf8_char(BinarySink_UPCAST(bs), c) +/* More complicated functions still implemented in <platform>/unicode.c */ +#define put_mb_to_wc(bs, codepage, mbstr, mblen) \ + BinarySink_put_mb_to_wc(BinarySink_UPCAST(bs), codepage, mbstr, mblen) +#define put_wc_to_mb(bs, codepage, wcstr, wclen, def) \ + BinarySink_put_wc_to_mb(BinarySink_UPCAST(bs), codepage, wcstr, wclen, def) + /* * The underlying real C functions that implement most of those * macros. Generally you won't want to call these directly, because @@ -190,6 +196,12 @@ void BinarySink_put_fmt(BinarySink *, const char *fmt, ...) 
PRINTF_LIKE(2, 3); void BinarySink_put_fmtv(BinarySink *, const char *fmt, va_list ap); void BinarySink_put_c_string_literal(BinarySink *, ptrlen); void BinarySink_put_utf8_char(BinarySink *, unsigned); +/* put_mb_to_wc / put_wc_to_mb return false if the codepage is invalid */ +bool BinarySink_put_mb_to_wc( + BinarySink *bs, int codepage, const char *mbstr, int mblen); +bool BinarySink_put_wc_to_mb( + BinarySink *bs, int codepage, const wchar_t *wcstr, int wclen, + const char *defchr); /* ---------------------------------------------------------------------- */ diff --git a/putty.h b/putty.h index c97bb6ef..9483be20 100644 --- a/putty.h +++ b/putty.h @@ -2246,10 +2246,7 @@ extern const char commitid[]; */ /* void init_ucs(void); -- this is now in platform-specific headers */ bool is_dbcs_leadbyte(int codepage, char byte); -int mb_to_wc(int codepage, int flags, const char *mbstr, int mblen, - wchar_t *wcstr, int wclen); -int wc_to_mb(int codepage, int flags, const wchar_t *wcstr, int wclen, - char *mbstr, int mblen, const char *defchr); +/* For put_mb_to_wc / put_wc_to_mb, see marshal.h */ wchar_t xlat_uskbd2cyrllic(int ch); int check_compose(int first, int second); int decode_codepage(const char *cp_name); diff --git a/terminal/terminal.c b/terminal/terminal.c index 642202af..27068bf6 100644 --- a/terminal/terminal.c +++ b/terminal/terminal.c @@ -3497,11 +3497,7 @@ static strbuf *term_input_data_from_unicode( * (But also we must allow space for the trailing NUL that * wc_to_mb will write.) */ - char *bufptr = strbuf_append(buf, len + 1); - int rv; - rv = wc_to_mb(term->ucsdata->line_codepage, 0, widebuf, len, - bufptr, len + 1, NULL); - strbuf_shrink_to(buf, rv < 0 ? 
0 : rv); + put_wc_to_mb(buf, term->ucsdata->line_codepage, widebuf, len, ""); } return buf; @@ -3510,18 +3506,12 @@ static strbuf *term_input_data_from_unicode( static strbuf *term_input_data_from_charset( Terminal *term, int codepage, const char *str, size_t len) { - strbuf *buf; + strbuf *buf = strbuf_new(); - if (codepage < 0) { - buf = strbuf_new(); + if (codepage < 0) put_data(buf, str, len); - } else { - size_t widesize = len * 2; /* allow for UTF-16 surrogates */ - wchar_t *widebuf = snewn(widesize, wchar_t); - int widelen = mb_to_wc(codepage, 0, str, len, widebuf, widesize); - buf = term_input_data_from_unicode(term, widebuf, widelen); - sfree(widebuf); - } + else + put_mb_to_wc(buf, codepage, str, len); return buf; } @@ -6734,23 +6724,24 @@ static void clipme(Terminal *term, pos top, pos bottom, bool rect, bool desel, if (DIRECT_FONT(uc)) { if (c >= ' ' && c != 0x7F) { - char buf[4]; - WCHAR wbuf[4]; - int rv; + char buf[2]; + buffer_sink bs[1]; + buffer_sink_init(bs, cbuf, + sizeof(cbuf) - sizeof(wchar_t)); if (is_dbcs_leadbyte(term->ucsdata->font_codepage, (BYTE) c)) { buf[0] = c; buf[1] = (char) (0xFF & ldata->chars[top.x + 1].chr); - rv = mb_to_wc(term->ucsdata->font_codepage, 0, buf, 2, wbuf, 4); + put_mb_to_wc(bs, term->ucsdata->font_codepage, + buf, 2); top.x++; } else { buf[0] = c; - rv = mb_to_wc(term->ucsdata->font_codepage, 0, buf, 1, wbuf, 4); + put_mb_to_wc(bs, term->ucsdata->font_codepage, + buf, 1); } - if (rv > 0) { - memcpy(cbuf, wbuf, rv * sizeof(wchar_t)); - cbuf[rv] = 0; - } + assert(!bs->overflowed); + *(wchar_t *)bs->out = L'\0'; } } diff --git a/unix/unicode.c b/unix/unicode.c index 7be64a53..44d523e9 100644 --- a/unix/unicode.c +++ b/unix/unicode.c @@ -21,81 +21,99 @@ bool is_dbcs_leadbyte(int codepage, char byte) return false; /* we don't do DBCS */ } -int mb_to_wc(int codepage, int flags, const char *mbstr, int mblen, - wchar_t *wcstr, int wclen) +bool BinarySink_put_mb_to_wc( + BinarySink *bs, int codepage, const char *mbstr, int 
mblen) { if (codepage == DEFAULT_CODEPAGE) { - int n = 0; mbstate_t state; memset(&state, 0, sizeof state); while (mblen > 0) { - if (n >= wclen) - return n; - size_t i = mbrtowc(wcstr+n, mbstr, (size_t)mblen, &state); + wchar_t wc; + size_t i = mbrtowc(&wc, mbstr, (size_t)mblen, &state); if (i == (size_t)-1 || i == (size_t)-2) break; - n++; + put_data(bs, &wc, sizeof(wc)); mbstr += i; mblen -= i; } - - return n; } else if (codepage == CS_NONE) { - int n = 0; - while (mblen > 0) { - if (n >= wclen) - return n; - wcstr[n] = 0xD800 | (mbstr[0] & 0xFF); - n++; + wchar_t wc = 0xD800 | (mbstr[0] & 0xFF); + put_data(bs, &wc, sizeof(wc)); mbstr++; mblen--; } + } else { + wchar_t wbuf[1024]; + while (mblen > 0) { + int wlen = charset_to_unicode(&mbstr, &mblen, wbuf, lenof(wbuf), + codepage, NULL, NULL, 0); + put_data(bs, wbuf, wlen * sizeof(wchar_t)); + } + } - return n; - } else - return charset_to_unicode(&mbstr, &mblen, wcstr, wclen, codepage, - NULL, NULL, 0); + /* We never expect to receive invalid charset values on Unix, + * because we're not dependent on an externally defined space of + * OS-provided code pages */ + return true; } -int wc_to_mb(int codepage, int flags, const wchar_t *wcstr, int wclen, - char *mbstr, int mblen, const char *defchr) +bool BinarySink_put_wc_to_mb( + BinarySink *bs, int codepage, const wchar_t *wcstr, int wclen, + const char *defchr) { + size_t defchr_len = 0; + bool defchr_len_known = false; + if (codepage == DEFAULT_CODEPAGE) { char output[MB_LEN_MAX]; mbstate_t state; - int n = 0; memset(&state, 0, sizeof state); while (wclen > 0) { size_t i = wcrtomb(output, wcstr[0], &state); - if (i == (size_t)-1 || i > n - mblen) - break; - memcpy(mbstr+n, output, i); - n += i; + if (i == (size_t)-1) { + if (!defchr_len_known) { + defchr_len = strlen(defchr); + defchr_len_known = true; + } + put_data(bs, defchr, defchr_len); + } else { + put_data(bs, output, i); + } wcstr++; wclen--; } - - return n; } else if (codepage == CS_NONE) { - int n = 0; - 
while (wclen > 0 && n < mblen) { - if (*wcstr >= 0xD800 && *wcstr < 0xD900) - mbstr[n++] = (*wcstr & 0xFF); - else if (defchr) - mbstr[n++] = *defchr; + while (wclen > 0) { + if (*wcstr >= 0xD800 && *wcstr < 0xD900) { + put_byte(bs, *wcstr & 0xFF); + } else { + if (!defchr_len_known) { + defchr_len = strlen(defchr); + defchr_len_known = true; + } + put_data(bs, defchr, defchr_len); + } wcstr++; wclen--; } - return n; } else { - return charset_from_unicode(&wcstr, &wclen, mbstr, mblen, codepage, - NULL, defchr?defchr:NULL, defchr?1:0); + char buf[2048]; + defchr_len = strlen(defchr); + + while (wclen > 0) { + int len = charset_from_unicode( + &wcstr, &wclen, buf, lenof(buf), codepage, + NULL, defchr, defchr_len); + put_data(bs, buf, len); + } } + + return true; } /* diff --git a/unix/unifont.c b/unix/unifont.c index e9f8623a..c8256794 100644 --- a/unix/unifont.c +++ b/unix/unifont.c @@ -598,14 +598,14 @@ static bool x11font_has_glyph(unifont *font, wchar_t glyph) * This X font has 8-bit indices, so we must convert to the * appropriate character set. */ - char sbstring[2]; - int sblen = wc_to_mb(xfont->real_charset, 0, &glyph, 1, - sbstring, 2, ""); - if (sblen == 0 || !sbstring[0]) + char c = '\0'; + buffer_sink bs[1]; + buffer_sink_init(bs, &c, 1); + put_wc_to_mb(bs, xfont->real_charset, &glyph, 1, ""); + if (!c) return false; /* not even in the charset */ - return x11_font_has_glyph(xfont->fonts[0].xfs, 0, - (unsigned char)sbstring[0]); + return x11_font_has_glyph(xfont->fonts[0].xfs, 0, (unsigned char)c); } } @@ -953,14 +953,13 @@ static void x11font_draw_text(unifont_drawctx *ctx, unifont *font, * This X font has 8-bit indices, so we must convert to the * appropriate character set. 
*/ - char *sbstring = snewn(len+1, char); - int sblen = wc_to_mb(xfont->real_charset, 0, string, len, - sbstring, len+1, "."); + strbuf *sb = strbuf_new(); + put_wc_to_mb(sb, xfont->real_charset, string, len, "."); x11font_really_draw_text(x11font_drawfuncs + index + 0, ctx, &xfont->fonts[sfid], xfont->disp, x, y, - sbstring, sblen, shadowoffset, + sb->s, sb->len, shadowoffset, xfont->variable, cellwidth * mult); - sfree(sbstring); + strbuf_free(sb); } } @@ -1603,7 +1602,7 @@ static void pangofont_draw_internal(unifont_drawctx *ctx, unifont *font, PangoLayout *layout; PangoRectangle rect; char *utfstring, *utfptr; - int utflen; + size_t utflen; bool shadowbold = false; void (*draw_layout)(unifont_drawctx *ctx, gint x, gint y, PangoLayout *layout) = NULL; @@ -1642,12 +1641,11 @@ static void pangofont_draw_internal(unifont_drawctx *ctx, unifont *font, * Pango always expects UTF-8, so convert the input wide character * string to UTF-8. */ - utfstring = snewn(len*6+1, char); /* UTF-8 has max 6 bytes/char */ - utflen = wc_to_mb(CS_UTF8, 0, string, len, utfstring, len*6+1, "."); + utfstring = dup_wc_to_mb_c(CS_UTF8, string, len, "", &utflen); utfptr = utfstring; while (utflen > 0) { - int clen, n; + size_t clen, n; int desired = cellwidth * PANGO_SCALE; /* diff --git a/unix/window.c b/unix/window.c index 27696212..bb772d32 100644 --- a/unix/window.c +++ b/unix/window.c @@ -1603,10 +1603,12 @@ gint key_event(GtkWidget *widget, GdkEventKey *event, gpointer data) const wchar_t *wp; int wlen; int ulen; + buffer_sink bs[1]; - wlen = mb_to_wc(DEFAULT_CODEPAGE, 0, - event_string, strlen(event_string), - widedata, lenof(widedata)-1); + buffer_sink_init(bs, widedata, sizeof(widedata) - sizeof(wchar_t)); + put_mb_to_wc(bs, DEFAULT_CODEPAGE, + event_string, strlen(event_string)); + wlen = (wchar_t *)bs->out - widedata; #ifdef KEY_EVENT_DIAGNOSTICS { @@ -2954,16 +2956,12 @@ static void clipboard_text_received(GtkClipboard *clipboard, { GtkFrontend *inst = (GtkFrontend *)data; 
wchar_t *paste; - int paste_len; - int length; + size_t paste_len; if (!text) return; - length = strlen(text); - - paste = snewn(length, wchar_t); - paste_len = mb_to_wc(CS_UTF8, 0, text, length, paste, length); + paste = dup_mb_to_wc(CS_UTF8, text, length, &paste_len); term_do_paste(inst->term, paste, paste_len); @@ -3102,17 +3100,15 @@ static void gtkwin_clip_write( state->pasteout_data_ctext_len = 0; } - state->pasteout_data = snewn(len*6, char); - state->pasteout_data_len = len*6; - state->pasteout_data_len = wc_to_mb(inst->ucsdata.line_codepage, 0, - data, len, state->pasteout_data, - state->pasteout_data_len, NULL); - if (state->pasteout_data_len == 0) { - sfree(state->pasteout_data); - state->pasteout_data = NULL; - } else { - state->pasteout_data = - sresize(state->pasteout_data, state->pasteout_data_len, char); + { + size_t outlen; + state->pasteout_data = dup_wc_to_mb_c( + inst->ucsdata.line_codepage, data, len, "", &outlen); + /* We can't handle pastes larger than INT_MAX, because + * gtk_selection_data_set_text's length parameter is a gint */ + if (outlen > INT_MAX) + outlen = INT_MAX; + state->pasteout_data_len = outlen; } #ifndef NOT_X_WINDOWS @@ -3240,7 +3236,7 @@ static void selection_received(GtkWidget *widget, GtkSelectionData *seldata, const guchar *seldata_data = gtk_selection_data_get_data(seldata); gint seldata_length = gtk_selection_data_get_length(seldata); wchar_t *paste; - int paste_len; + size_t paste_len; struct clipboard_state *state = clipboard_from_atom( inst, gtk_selection_data_get_selection(seldata)); @@ -3333,11 +3329,8 @@ static void selection_received(GtkWidget *widget, GtkSelectionData *seldata, } } - paste = snewn(length, wchar_t); - paste_len = mb_to_wc(charset, 0, text, length, paste, length); - + paste = dup_mb_to_wc_c(charset, text, length, &paste_len); term_do_paste(inst->term, paste, paste_len); - sfree(paste); #ifndef NOT_X_WINDOWS diff --git a/utils/dup_mb_to_wc.c b/utils/dup_mb_to_wc.c index 6317ca93..0738ed27 100644 
--- a/utils/dup_mb_to_wc.c +++ b/utils/dup_mb_to_wc.c @@ -12,20 +12,16 @@ wchar_t *dup_mb_to_wc_c(int codepage, const char *string, size_t inlen, size_t *outlen_p) { - assert(inlen <= INT_MAX); - size_t mult; - for (mult = 1 ;; mult++) { - wchar_t *ret = snewn(mult*inlen + 2, wchar_t); - size_t outlen = mb_to_wc(codepage, 0, string, inlen, ret, - mult*inlen + 1); - if (outlen < mult*inlen+1) { - if (outlen_p) - *outlen_p = outlen; - ret[outlen] = L'\0'; - return ret; - } - sfree(ret); - } + strbuf *sb = strbuf_new(); + put_mb_to_wc(sb, codepage, string, inlen); + if (outlen_p) + *outlen_p = sb->len / sizeof(wchar_t); + + /* Append a trailing L'\0'. For this we only need to write one + * byte _fewer_ than sizeof(wchar_t), because strbuf will append a + * byte '\0' for us. */ + put_padding(sb, sizeof(wchar_t) - 1, 0); + return (wchar_t *)strbuf_to_str(sb); } wchar_t *dup_mb_to_wc(int codepage, const char *string) diff --git a/utils/dup_wc_to_mb.c b/utils/dup_wc_to_mb.c index 42780f73..3259d22e 100644 --- a/utils/dup_wc_to_mb.c +++ b/utils/dup_wc_to_mb.c @@ -14,26 +14,11 @@ char *dup_wc_to_mb_c(int codepage, const wchar_t *string, size_t inlen, const char *defchr, size_t *outlen_p) { - assert(inlen <= INT_MAX); - - size_t outsize = inlen+1; - char *out = snewn(outsize, char); - - while (true) { - size_t outlen = wc_to_mb(codepage, 0, string, inlen, out, outsize, - defchr); - /* We can only be sure we've consumed the whole input if the - * output is not within a multibyte-character-length of the - * end of the buffer! 
*/ - if (outlen < outsize && outsize - outlen > MB_LEN_MAX) { - if (outlen_p) - *outlen_p = outlen; - out[outlen] = '\0'; - return out; - } - - sgrowarray(out, outsize, outsize); - } + strbuf *sb = strbuf_new(); + put_wc_to_mb(sb, codepage, string, inlen, defchr); + if (outlen_p) + *outlen_p = sb->len; + return strbuf_to_str(sb); } char *dup_wc_to_mb(int codepage, const wchar_t *string, diff --git a/windows/console.c b/windows/console.c index be49dd89..099b8a29 100644 --- a/windows/console.c +++ b/windows/console.c @@ -221,7 +221,7 @@ static bool console_read_line_to_strbuf(ConsoleIO *conio, bool echo, } if (conio->utf8) { - wchar_t wbuf[4096]; + wchar_t wbuf[4097]; size_t wlen; if (conio->hin_is_console) { @@ -245,17 +245,15 @@ static bool console_read_line_to_strbuf(ConsoleIO *conio, bool echo, if (!ReadFile(conio->hin, buf, lenof(buf), &nread, NULL)) goto out; - wlen = mb_to_wc(CP_ACP, 0, buf, nread, wbuf, lenof(wbuf)); + buffer_sink bs[1]; + buffer_sink_init(bs, wbuf, sizeof(wbuf) - sizeof(wchar_t)); + put_mb_to_wc(bs, CP_ACP, buf, nread); + assert(!bs->overflowed); + wlen = (wchar_t *)bs->out - wbuf; smemclr(buf, sizeof(buf)); } - /* Allocate the maximum space in the strbuf that might be - * needed for this data */ - size_t oldlen = sb->len, maxout = wlen * 4; - void *outptr = strbuf_append(sb, maxout); - size_t newlen = oldlen + wc_to_mb(CP_UTF8, 0, wbuf, wlen, - outptr, maxout, NULL); - strbuf_shrink_to(sb, newlen); + put_wc_to_mb(sb, CP_UTF8, wbuf, wlen, ""); smemclr(wbuf, sizeof(wbuf)); } else { /* diff --git a/windows/unicode.c b/windows/unicode.c index 412db6ea..bbd07c52 100644 --- a/windows/unicode.c +++ b/windows/unicode.c @@ -1232,8 +1232,7 @@ void get_unitab(int codepage, wchar_t *unitab, int ftype) for (i = 0; i < max; i++) { tbuf[0] = i; - if (mb_to_wc(codepage, flg, tbuf, 1, unitab + i, 1) - != 1) + if (MultiByteToWideChar(codepage, flg, tbuf, 1, unitab+i, 1) != 1) unitab[i] = 0xFFFD; } } else { @@ -1245,151 +1244,192 @@ void get_unitab(int 
codepage, wchar_t *unitab, int ftype) } } -int wc_to_mb(int codepage, int flags, const wchar_t *wcstr, int wclen, - char *mbstr, int mblen, const char *defchr) +bool BinarySink_put_wc_to_mb( + BinarySink *bs, int codepage, const wchar_t *wcstr, int wclen, + const char *defchr) { + if (!wclen) + return true; + reverse_mapping *rmap = get_reverse_mapping(codepage); if (rmap) { + size_t defchr_len = 0; + bool defchr_len_known = false; + /* Do this by array lookup if we can. */ - if (wclen < 0) { - for (wclen = 0; wcstr[wclen++] ;); /* will include the NUL */ - } - char *p; - int i; - for (p = mbstr, i = 0; i < wclen; i++) { + for (size_t i = 0; i < wclen; i++) { wchar_t ch = wcstr[i]; int by; - const char *p1; + const char *blk; - #define WRITECH(chr) do \ - { \ - assert(p - mbstr < mblen); \ - *p++ = (char)(chr); \ - } while (0) - - if ((p1 = rmap->blocks[(ch >> 8) & 0xFF]) != NULL && - (by = p1[ch & 0xFF]) != '\0') - WRITECH(by); + if ((blk = rmap->blocks[(ch >> 8) & 0xFF]) != NULL && + (by = blk[ch & 0xFF]) != '\0') + put_byte(bs, by); else if (ch < 0x80) - WRITECH(ch); - else if (defchr) - for (const char *q = defchr; *q; q++) - WRITECH(*q); -#if 1 - else - WRITECH('.'); -#endif - - #undef WRITECH + put_byte(bs, ch); + else if (defchr) { + if (!defchr_len_known) { + defchr_len = strlen(defchr); + defchr_len_known = true; + } + put_data(bs, defchr, defchr_len); + } } - return p - mbstr; - } else { - int defused, ret; - ret = WideCharToMultiByte(codepage, flags, wcstr, wclen, - mbstr, mblen, defchr, &defused); - if (ret) - return ret; + return true; + } + + { + char internalbuf[2048]; + char *allocbuf = NULL; + size_t allocsize = 0; + char *currbuf = internalbuf; + size_t currsize = lenof(internalbuf); + bool success; + + BOOL defused = false; + BOOL *defusedp = &defused; + + if (codepage == CP_UTF8 || !defchr[0]) { + /* + * The Win32 API spec says that defchr and defused must be + * NULL when doing a UTF-8 conversion, on pain of + * ERROR_INVALID_PARAMETER. 
+ * + * Also, translate defchr="" on input to NULL in the Win32 + * API. + */ + defchr = NULL; + defusedp = NULL; + } + + while (true) { + int ret = WideCharToMultiByte( + codepage, 0, wcstr, wclen, currbuf, currsize, + defchr, defusedp); + + if (ret) { + put_data(bs, currbuf, ret); + success = true; + break; + } else if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) { + success = false; + break; + } else { + sgrowarray_nm(allocbuf, allocsize, currsize); + currbuf = allocbuf; + currsize = allocsize; + } + } + + smemclr(allocbuf, allocsize); + if (success) + return true; + } #ifdef LEGACY_WINDOWS - /* - * Fallback for legacy platforms too old to support UTF-8: if - * the codepage is UTF-8, we can do the translation ourselves. - */ - if (codepage == CP_UTF8 && mblen > 0 && wclen > 0) { - buffer_sink bs[1]; - buffer_sink_init(bs, mbstr, mblen); - - while (wclen > 0) { - unsigned long wc = (wclen--, *wcstr++); - if (wclen > 0 && IS_SURROGATE_PAIR(wc, *wcstr)) { - wc = FROM_SURROGATES(wc, *wcstr); - wclen--, wcstr++; - } - - const char *prev_ptr = bs->out; - put_utf8_char(bs, wc); - if (bs->overflowed) - return prev_ptr - mbstr; + /* + * Fallback for legacy platforms too old to support UTF-8: if + * the codepage is UTF-8, we can do the translation ourselves. 
+ */ + if (codepage == CP_UTF8 && wclen > 0) { + while (wclen > 0) { + unsigned long wc = (wclen--, *wcstr++); + if (wclen > 0 && IS_SURROGATE_PAIR(wc, *wcstr)) { + wc = FROM_SURROGATES(wc, *wcstr); + wclen--, wcstr++; } - - return bs->out - mbstr; + put_utf8_char(bs, wc); } + + return true; + } #endif - /* No other fallbacks are available */ - return 0; - } + /* No other fallbacks are available */ + return false; } -int mb_to_wc(int codepage, int flags, const char *mbstr, int mblen, - wchar_t *wcstr, int wclen) +bool BinarySink_put_mb_to_wc( + BinarySink *bs, int codepage, const char *mbstr, int mblen) { + if (!mblen) + return true; + if (codepage >= 65536) { /* Character set not known to Windows, so we'll have to * translate it ourself */ size_t index = codepage - 65536; if (index >= lenof(cp_list)) - return 0; + return false; const struct cp_list_item *cp = &cp_list[index]; if (!cp->cp_table) - return 0; + return false; - size_t remaining = wclen; - wchar_t *p = wcstr; unsigned tablebase = 256 - cp->cp_size; while (mblen > 0) { mblen--; unsigned c = 0xFF & *mbstr++; wchar_t wc = (c < tablebase ? 
c : cp->cp_table[c - tablebase]); - if (remaining > 0) { - remaining--; - *p++ = wc; + put_data(bs, &wc, sizeof(wc)); + } + + return true; + } + + { + wchar_t internalbuf[1024]; + wchar_t *allocbuf = NULL; + size_t allocsize = 0; + wchar_t *currbuf = internalbuf; + size_t currsize = lenof(internalbuf); + bool success; + + while (true) { + int ret = MultiByteToWideChar( + codepage, 0, mbstr, mblen, currbuf, currsize); + + if (ret > 0) { + put_data(bs, currbuf, ret * sizeof(wchar_t)); + success = true; + break; + } else if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) { + success = false; + break; } else { - return p - wcstr; + sgrowarray_nm(allocbuf, allocsize, currsize); + currbuf = allocbuf; + currsize = allocsize; } } - return p - wcstr; + smemclr(allocbuf, allocsize * sizeof(wchar_t)); + if (success) + return true; } - int ret = MultiByteToWideChar(codepage, flags, mbstr, mblen, wcstr, wclen); - if (ret) - return ret; - #ifdef LEGACY_WINDOWS /* * Fallback for legacy platforms too old to support UTF-8: if the * codepage is UTF-8, we can do the translation ourselves. */ - if (codepage == CP_UTF8 && mblen > 0 && wclen > 0) { + if (codepage == CP_UTF8 && mblen > 0) { BinarySource src[1]; BinarySource_BARE_INIT(src, mbstr, mblen); - size_t remaining = wclen; - wchar_t *p = wcstr; - while (get_avail(src)) { wchar_t wcbuf[2]; size_t nwc = decode_utf8_to_wchar(src, wcbuf, NULL); - - for (size_t i = 0; i < nwc; i++) { - if (remaining > 0) { - remaining--; - *p++ = wcbuf[i]; - } else { - return p - wcstr; - } - } + put_data(bs, wcbuf, nwc * sizeof(wchar_t)); } - return p - wcstr; + return true; } #endif /* No other fallbacks are available */ - return 0; + return false; } bool is_dbcs_leadbyte(int codepage, char byte)