New utility function: decode_utf8_to_wide_string.

We already had encode_wide_string_as_utf8, which treats the wide string as UTF-16 or UTF-32 as appropriate to the size of wchar_t. I'm about to need the inverse function, and was surprised that it didn't already exist (even though enough component parts did to make it easy).
2025-07-01 19:42:48 -05:00 · 2023-05-29 13:27:16 +01:00
parent 36db93748e
commit 392be3e494
3 changed files with 40 additions and 0 deletions
--- a/misc.h
+++ b/misc.h
@@ -256,6 +256,10 @@ unsigned smemeq(const void *av, const void *bv, size_t len);
 * encoded in UTF-16. */
 char *encode_wide_string_as_utf8(const wchar_t *wstr);

+/* Decode UTF-8 to a newly allocated, NUL-terminated wide string, emitting
+ * UTF-16 surrogates if sizeof(wchar_t) == 2. Bad input becomes U+FFFD. */
+wchar_t *decode_utf8_to_wide_string(const char *ustr);
+
 /* Decode a single UTF-8 character. Returns U+FFFD for any of the
 * illegal cases. If the source is empty, returns L'\0' (and sets the
 * error indicator on the source, of course). */
--- a/utils/CMakeLists.txt
+++ b/utils/CMakeLists.txt
@@ -21,6 +21,7 @@ add_sources_from_current_dir(utils
  debug.c
  decode_utf8.c
  decode_utf8_to_wchar.c
+  decode_utf8_to_wide_string.c
  default_description.c
  dupcat.c
  dupprintf.c
--- a/utils/decode_utf8_to_wide_string.c
+++ b/utils/decode_utf8_to_wide_string.c
@@ -0,0 +1,35 @@
+/*
+ * Decode a string of UTF-8 to a wchar_t string.
+ */
+
+#include "misc.h"
+
+wchar_t *decode_utf8_to_wide_string(const char *s)
+{
+    /* A 16-bit wchar_t may need a UTF-16 surrogate pair (two units)
+     * per decoded character; a 32-bit one never needs more than one. */
+    const size_t max_units = (sizeof(wchar_t) < 4) ? 2 : 1;
+
+    /* Output buffer, its element count so far, and its allocated size */
+    wchar_t *out = NULL;
+    size_t used = 0, alloc = 0;
+
+    BinarySource in[1];
+    BinarySource_BARE_INIT_PL(in, ptrlen_from_asciz(s));
+
+    while (get_avail(in) > 0) {
+        /* Make sure the next character's output will fit. */
+        sgrowarrayn(out, alloc, used, max_units);
+        /* The failure code is deliberately discarded: on bad input
+         * the decoder has already emitted U+FFFD REPLACEMENT
+         * CHARACTER, which is all we would have done ourselves. */
+        DecodeUTF8Failure unused_err;
+        size_t emitted = decode_utf8_to_wchar(in, out + used, &unused_err);
+        used += emitted;
+    }
+
+    /* Trim to the exact length, leaving room for the terminating NUL */
+    out = sresize(out, used + 1, wchar_t);
+    out[used] = L'\0';
+    return out;
+}