Replace encode_utf8() with a BinarySink-based put_utf8_char().

encode_utf8() wrote one encoded character into a caller-supplied
buffer and returned the byte count. This patch removes it and adds
BinarySink_put_utf8_char() (macro wrapper: put_utf8_char), which emits
the encoded bytes straight to a BinarySink via put_byte(). The callers
in terminal.c, encode_wide_string_as_utf8.c, stripctrl.c and
windows/unicode.c drop their temporary 6-byte buffers accordingly.

NOTE(review): leading whitespace inside the hunks was reconstructed
from a copy whose line breaks had been collapsed; hunk line counts
match the @@ headers, but if context fails to match, apply with
whitespace-insensitive matching and confirm against the tree.

diff --git a/marshal.h b/marshal.h
index 34a0572f..ab19ff0a 100644
--- a/marshal.h
+++ b/marshal.h
@@ -156,6 +156,10 @@ struct BinarySink {
 #define put_c_string_literal(bs, str) \
     BinarySink_put_c_string_literal(BinarySink_UPCAST(bs), str)
 
+/* More complicated function implemented in encode_utf8.c */
+#define put_utf8_char(bs, c) \
+    BinarySink_put_utf8_char(BinarySink_UPCAST(bs), c)
+
 /*
  * The underlying real C functions that implement most of those
  * macros. Generally you won't want to call these directly, because
@@ -185,6 +189,7 @@ void BinarySink_put_mp_ssh2(BinarySink *bs, mp_int *x);
 void BinarySink_put_fmt(BinarySink *, const char *fmt, ...) PRINTF_LIKE(2, 3);
 void BinarySink_put_fmtv(BinarySink *, const char *fmt, va_list ap);
 void BinarySink_put_c_string_literal(BinarySink *, ptrlen);
+void BinarySink_put_utf8_char(BinarySink *, unsigned);
 
 /* ---------------------------------------------------------------------- */
 
diff --git a/misc.h b/misc.h
index 1b3d324a..f3b71a21 100644
--- a/misc.h
+++ b/misc.h
@@ -248,11 +248,6 @@ void smemclr(void *b, size_t len);
  * by the 'eq' in the name. */
 unsigned smemeq(const void *av, const void *bv, size_t len);
 
-/* Encode a single UTF-8 character. Assumes that illegal characters
- * (such as things in the surrogate range, or > 0x10FFFF) have already
- * been removed. */
-size_t encode_utf8(void *output, unsigned long ch);
-
 /* Encode a wide-character string into UTF-8. Tolerates surrogates if
  * sizeof(wchar_t) == 2, assuming that in that case the wide string is
  * encoded in UTF-16. */
diff --git a/terminal/terminal.c b/terminal/terminal.c
index 6bc6c514..cf658710 100644
--- a/terminal/terminal.c
+++ b/terminal/terminal.c
@@ -3398,8 +3398,7 @@ static strbuf *term_input_data_from_unicode(
                 }
             }
 
-            char utf8_chr[6];
-            put_data(buf, utf8_chr, encode_utf8(utf8_chr, ch));
+            put_utf8_char(buf, ch);
         }
     } else {
         /*
diff --git a/utils/encode_utf8.c b/utils/encode_utf8.c
index 731ab925..d24f0951 100644
--- a/utils/encode_utf8.c
+++ b/utils/encode_utf8.c
@@ -5,25 +5,22 @@
 #include "defs.h"
 #include "misc.h"
 
-size_t encode_utf8(void *output, unsigned long ch)
+void BinarySink_put_utf8_char(BinarySink *output, unsigned ch)
 {
-    unsigned char *start = (unsigned char *)output, *p = start;
-
     if (ch < 0x80) {
-        *p++ = ch;
+        put_byte(output, ch);
     } else if (ch < 0x800) {
-        *p++ = 0xC0 | (ch >> 6);
-        *p++ = 0x80 | (ch & 0x3F);
+        put_byte(output, 0xC0 | (ch >> 6));
+        put_byte(output, 0x80 | (ch & 0x3F));
     } else if (ch < 0x10000) {
-        *p++ = 0xE0 | (ch >> 12);
-        *p++ = 0x80 | ((ch >> 6) & 0x3F);
-        *p++ = 0x80 | (ch & 0x3F);
+        put_byte(output, 0xE0 | (ch >> 12));
+        put_byte(output, 0x80 | ((ch >> 6) & 0x3F));
+        put_byte(output, 0x80 | (ch & 0x3F));
     } else {
         assert(ch <= 0x10FFFF);
-        *p++ = 0xF0 | (ch >> 18);
-        *p++ = 0x80 | ((ch >> 12) & 0x3F);
-        *p++ = 0x80 | ((ch >> 6) & 0x3F);
-        *p++ = 0x80 | (ch & 0x3F);
+        put_byte(output, 0xF0 | (ch >> 18));
+        put_byte(output, 0x80 | ((ch >> 12) & 0x3F));
+        put_byte(output, 0x80 | ((ch >> 6) & 0x3F));
+        put_byte(output, 0x80 | (ch & 0x3F));
     }
-    return p - start;
 }
diff --git a/utils/encode_wide_string_as_utf8.c b/utils/encode_wide_string_as_utf8.c
index 870903d5..f5782888 100644
--- a/utils/encode_wide_string_as_utf8.c
+++ b/utils/encode_wide_string_as_utf8.c
@@ -17,9 +17,7 @@ char *encode_wide_string_as_utf8(const wchar_t *ws)
         } else if (IS_SURROGATE(ch)) {
             ch = 0xfffd; /* illegal UTF-16 -> REPLACEMENT CHARACTER */
         }
-        char utf8[6];
-        size_t size = encode_utf8(utf8, ch);
-        put_data(sb, utf8, size);
+        put_utf8_char(sb, ch);
     }
     return strbuf_to_str(sb);
 }
diff --git a/utils/stripctrl.c b/utils/stripctrl.c
index d723a079..16a8dce2 100644
--- a/utils/stripctrl.c
+++ b/utils/stripctrl.c
@@ -217,9 +217,6 @@ static inline void stripctrl_term_put_wc(
     if (prefix.len)
         put_datapl(scc->bs_out, prefix);
 
-    char outbuf[6];
-    size_t produced;
-
     /*
      * The Terminal implementation encodes 7-bit ASCII characters in
      * UTF-8 mode, and all printing characters in non-UTF-8 (i.e.
@@ -232,14 +229,10 @@ static inline void stripctrl_term_put_wc(
         wc &= 0xFF;
 
     if (in_utf(scc->term)) {
-        produced = encode_utf8(outbuf, wc);
+        put_utf8_char(scc->bs_out, wc);
     } else {
-        outbuf[0] = wc;
-        produced = 1;
+        put_byte(scc->bs_out, wc);
     }
-
-    if (produced > 0)
-        put_data(scc->bs_out, outbuf, produced);
 }
 
 static inline size_t stripctrl_locale_try_consume(
diff --git a/windows/unicode.c b/windows/unicode.c
index b3f6d802..421314f1 100644
--- a/windows/unicode.c
+++ b/windows/unicode.c
@@ -1290,8 +1290,8 @@ int wc_to_mb(int codepage, int flags, const wchar_t *wcstr, int wclen,
      * the codepage is UTF-8, we can do the translation ourselves.
      */
     if (codepage == CP_UTF8 && mblen > 0 && wclen > 0) {
-        size_t remaining = mblen;
-        char *p = mbstr;
+        buffer_sink bs[1];
+        buffer_sink_init(bs, mbstr, mblen);
 
         while (wclen > 0) {
             unsigned long wc = (wclen--, *wcstr++);
@@ -1300,18 +1300,13 @@ int wc_to_mb(int codepage, int flags, const wchar_t *wcstr, int wclen,
                 wclen--, wcstr++;
             }
 
-            char utfbuf[6];
-            size_t utflen = encode_utf8(utfbuf, wc);
-            if (utflen <= remaining) {
-                memcpy(p, utfbuf, utflen);
-                p += utflen;
-                remaining -= utflen;
-            } else {
-                return p - mbstr;
-            }
+            const char *prev_ptr = bs->out;
+            put_utf8_char(bs, wc);
+            if (bs->overflowed)
+                return prev_ptr - mbstr;
         }
 
-        return p - mbstr;
+        return bs->out - mbstr;
     }
 #endif
 