From 392be3e494dc68f1d41cd27cd300cfcac78c235e Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Mon, 29 May 2023 13:27:16 +0100 Subject: [PATCH] New utility function: decode_utf8_to_wide_string. We already had encode_wide_string_as_utf8, which treats the wide string as UTF-16 or UTF-32 as appropriate to the size of wchar_t. I'm about to need the inverse function, and was surprised that it didn't already exist (even though enough component parts did to make it easy). --- misc.h | 4 ++++ utils/CMakeLists.txt | 1 + utils/decode_utf8_to_wide_string.c | 35 ++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+) create mode 100644 utils/decode_utf8_to_wide_string.c diff --git a/misc.h b/misc.h index be9ff9c0..16b9b68e 100644 --- a/misc.h +++ b/misc.h @@ -256,6 +256,10 @@ unsigned smemeq(const void *av, const void *bv, size_t len); * encoded in UTF-16. */ char *encode_wide_string_as_utf8(const wchar_t *wstr); +/* Decode UTF-8 to a wide-character string, emitting UTF-16 surrogates + * if sizeof(wchar_t) == 2. */ +wchar_t *decode_utf8_to_wide_string(const char *ustr); + /* Decode a single UTF-8 character. Returns U+FFFD for any of the * illegal cases. If the source is empty, returns L'\0' (and sets the * error indicator on the source, of course). */ diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt index 38882059..30b1d72b 100644 --- a/utils/CMakeLists.txt +++ b/utils/CMakeLists.txt @@ -21,6 +21,7 @@ add_sources_from_current_dir(utils debug.c decode_utf8.c decode_utf8_to_wchar.c + decode_utf8_to_wide_string.c default_description.c dupcat.c dupprintf.c diff --git a/utils/decode_utf8_to_wide_string.c b/utils/decode_utf8_to_wide_string.c new file mode 100644 index 00000000..1ab9a8de --- /dev/null +++ b/utils/decode_utf8_to_wide_string.c @@ -0,0 +1,35 @@ +/* + * Decode a string of UTF-8 to a wchar_t string. 
+ */
+
+#include "misc.h"
+
+/* Returns s (NUL-terminated UTF-8) decoded into a wchar_t string built
+ * with sgrowarrayn/sresize; ill-formed input comes out as U+FFFD. */
+wchar_t *decode_utf8_to_wide_string(const char *s)
+{
+    /* One decoded character may need two units of space when wchar_t
+     * is 16 bits (a UTF-16 surrogate pair), but never more than one
+     * when wchar_t is 32-bit. */
+    const size_t units_per_char = (sizeof(wchar_t) < 4) ? 2 : 1;
+    wchar_t *out = NULL;
+    size_t used = 0, cap = 0;
+    DecodeUTF8Failure ignored;
+    BinarySource input[1];
+
+    BinarySource_BARE_INIT_PL(input, ptrlen_from_asciz(s));
+
+    while (get_avail(input) > 0) {
+        /* Make room before decoding, then advance by however many
+         * units were written. The failure code is deliberately
+         * unused: on bad input the decoder has already emitted
+         * U+FFFD, which is all we would do about it ourselves. */
+        sgrowarrayn(out, cap, used, units_per_char);
+        used += decode_utf8_to_wchar(input, out + used, &ignored);
+    }
+
+    /* Resize to the final length and add the terminating NUL. */
+    out = sresize(out, used + 1, wchar_t);
+    out[used] = L'\0';
+    return out;
+}