From 834b58e39b2c6eddc717532411d90e05746b9df2 Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Wed, 9 Nov 2022 18:56:51 +0000 Subject: [PATCH] Make encode_utf8() output to a BinarySink. Previously it output to an ordinary char buffer, and returned the number of bytes it had written. But three out of the four call sites immediately chucked the resulting bytes into a BinarySink anyway. The fourth, in windows/unicode.c, really is writing into successive locations of a fixed-size buffer - but we can make that into a BinarySink too, using the buffer_sink added in the previous commit. So now encode_utf8() is renamed put_utf8_char, and the call sites all look simpler than they started out. --- marshal.h | 5 +++++ misc.h | 5 ----- terminal/terminal.c | 3 +-- utils/encode_utf8.c | 25 +++++++++++-------------- utils/encode_wide_string_as_utf8.c | 4 +--- utils/stripctrl.c | 11 ++--------- windows/unicode.c | 19 +++++++------------ 7 files changed, 27 insertions(+), 45 deletions(-) diff --git a/marshal.h b/marshal.h index 34a0572f..ab19ff0a 100644 --- a/marshal.h +++ b/marshal.h @@ -156,6 +156,10 @@ struct BinarySink { #define put_c_string_literal(bs, str) \ BinarySink_put_c_string_literal(BinarySink_UPCAST(bs), str) +/* More complicated function implemented in encode_utf8.c */ +#define put_utf8_char(bs, c) \ + BinarySink_put_utf8_char(BinarySink_UPCAST(bs), c) + /* * The underlying real C functions that implement most of those * macros. Generally you won't want to call these directly, because @@ -185,6 +189,7 @@ void BinarySink_put_mp_ssh2(BinarySink *bs, mp_int *x); void BinarySink_put_fmt(BinarySink *, const char *fmt, ...) PRINTF_LIKE(2, 3); void BinarySink_put_fmtv(BinarySink *, const char *fmt, va_list ap); void BinarySink_put_c_string_literal(BinarySink *, ptrlen); +void BinarySink_put_utf8_char(BinarySink *, unsigned); /* ---------------------------------------------------------------------- */ diff --git a/misc.h b/misc.h index 1b3d324a..f3b71a21 100644 --- a/misc.h +++ b/misc.h @@ -248,11 +248,6 @@ void smemclr(void *b, size_t len); * by the 'eq' in the name. */ unsigned smemeq(const void *av, const void *bv, size_t len); -/* Encode a single UTF-8 character. Assumes that illegal characters - * (such as things in the surrogate range, or > 0x10FFFF) have already - * been removed. */ -size_t encode_utf8(void *output, unsigned long ch); - /* Encode a wide-character string into UTF-8. Tolerates surrogates if * sizeof(wchar_t) == 2, assuming that in that case the wide string is * encoded in UTF-16. */ diff --git a/terminal/terminal.c b/terminal/terminal.c index 6bc6c514..cf658710 100644 --- a/terminal/terminal.c +++ b/terminal/terminal.c @@ -3398,8 +3398,7 @@ static strbuf *term_input_data_from_unicode( } } - char utf8_chr[6]; - put_data(buf, utf8_chr, encode_utf8(utf8_chr, ch)); + put_utf8_char(buf, ch); } } else { /* diff --git a/utils/encode_utf8.c b/utils/encode_utf8.c index 731ab925..d24f0951 100644 --- a/utils/encode_utf8.c +++ b/utils/encode_utf8.c @@ -5,25 +5,22 @@ #include "defs.h" #include "misc.h" -size_t encode_utf8(void *output, unsigned long ch) +void BinarySink_put_utf8_char(BinarySink *output, unsigned ch) { - unsigned char *start = (unsigned char *)output, *p = start; - if (ch < 0x80) { - *p++ = ch; + put_byte(output, ch); } else if (ch < 0x800) { - *p++ = 0xC0 | (ch >> 6); - *p++ = 0x80 | (ch & 0x3F); + put_byte(output, 0xC0 | (ch >> 6)); + put_byte(output, 0x80 | (ch & 0x3F)); } else if (ch < 0x10000) { - *p++ = 0xE0 | (ch >> 12); - *p++ = 0x80 | ((ch >> 6) & 0x3F); - *p++ = 0x80 | (ch & 0x3F); + put_byte(output, 0xE0 | (ch >> 12)); + put_byte(output, 0x80 | ((ch >> 6) & 0x3F)); + put_byte(output, 0x80 | (ch & 0x3F)); } else { assert(ch <= 0x10FFFF); - *p++ = 0xF0 | (ch >> 18); - *p++ = 0x80 | ((ch >> 12) & 0x3F); - *p++ = 0x80 | ((ch >> 6) & 0x3F); - *p++ = 0x80 | (ch & 0x3F); + put_byte(output, 0xF0 | (ch >> 18)); + put_byte(output, 0x80 | ((ch >> 12) & 0x3F)); + put_byte(output, 0x80 | ((ch >> 6) & 0x3F)); + put_byte(output, 0x80 | (ch & 0x3F)); } - return p - start; } diff --git a/utils/encode_wide_string_as_utf8.c b/utils/encode_wide_string_as_utf8.c index 870903d5..f5782888 100644 --- a/utils/encode_wide_string_as_utf8.c +++ b/utils/encode_wide_string_as_utf8.c @@ -17,9 +17,7 @@ char *encode_wide_string_as_utf8(const wchar_t *ws) } else if (IS_SURROGATE(ch)) { ch = 0xfffd; /* illegal UTF-16 -> REPLACEMENT CHARACTER */ } - char utf8[6]; - size_t size = encode_utf8(utf8, ch); - put_data(sb, utf8, size); + put_utf8_char(sb, ch); } return strbuf_to_str(sb); } diff --git a/utils/stripctrl.c b/utils/stripctrl.c index d723a079..16a8dce2 100644 --- a/utils/stripctrl.c +++ b/utils/stripctrl.c @@ -217,9 +217,6 @@ static inline void stripctrl_term_put_wc( if (prefix.len) put_datapl(scc->bs_out, prefix); - char outbuf[6]; - size_t produced; - /* * The Terminal implementation encodes 7-bit ASCII characters in * UTF-8 mode, and all printing characters in non-UTF-8 (i.e. @@ -232,14 +229,10 @@ static inline void stripctrl_term_put_wc( wc &= 0xFF; if (in_utf(scc->term)) { - produced = encode_utf8(outbuf, wc); + put_utf8_char(scc->bs_out, wc); } else { - outbuf[0] = wc; - produced = 1; + put_byte(scc->bs_out, wc); } - - if (produced > 0) - put_data(scc->bs_out, outbuf, produced); } static inline size_t stripctrl_locale_try_consume( diff --git a/windows/unicode.c b/windows/unicode.c index b3f6d802..421314f1 100644 --- a/windows/unicode.c +++ b/windows/unicode.c @@ -1290,8 +1290,8 @@ int wc_to_mb(int codepage, int flags, const wchar_t *wcstr, int wclen, * the codepage is UTF-8, we can do the translation ourselves. */ if (codepage == CP_UTF8 && mblen > 0 && wclen > 0) { - size_t remaining = mblen; - char *p = mbstr; + buffer_sink bs[1]; + buffer_sink_init(bs, mbstr, mblen); while (wclen > 0) { unsigned long wc = (wclen--, *wcstr++); @@ -1300,18 +1300,13 @@ int wc_to_mb(int codepage, int flags, const wchar_t *wcstr, int wclen, wclen--, wcstr++; } - char utfbuf[6]; - size_t utflen = encode_utf8(utfbuf, wc); - if (utflen <= remaining) { - memcpy(p, utfbuf, utflen); - p += utflen; - remaining -= utflen; - } else { - return p - mbstr; - } + const char *prev_ptr = bs->out; + put_utf8_char(bs, wc); + if (bs->overflowed) + return prev_ptr - mbstr; } - return p - mbstr; + return bs->out - mbstr; } #endif