From 392be3e494dc68f1d41cd27cd300cfcac78c235e Mon Sep 17 00:00:00 2001
From: Simon Tatham <anakin@pobox.com>
Date: Mon, 29 May 2023 13:27:16 +0100
Subject: [PATCH] New utility function: decode_utf8_to_wide_string.

We already had encode_wide_string_as_utf8, which treats the wide
string as UTF-16 or UTF-32 as appropriate to the size of wchar_t. I'm
about to need the inverse function, and was surprised that it didn't
already exist (even though enough component parts did to make it easy).
---
 misc.h                             |  4 ++++
 utils/CMakeLists.txt               |  1 +
 utils/decode_utf8_to_wide_string.c | 35 ++++++++++++++++++++++++++++++
 3 files changed, 40 insertions(+)
 create mode 100644 utils/decode_utf8_to_wide_string.c

diff --git a/misc.h b/misc.h
index be9ff9c0..16b9b68e 100644
--- a/misc.h
+++ b/misc.h
@@ -256,6 +256,10 @@ unsigned smemeq(const void *av, const void *bv, size_t len);
  * encoded in UTF-16. */
 char *encode_wide_string_as_utf8(const wchar_t *wstr);
 
+/* Decode a NUL-terminated UTF-8 string to a newly allocated wide string,
+ * emitting UTF-16 surrogates if sizeof(wchar_t) == 2. */
+wchar_t *decode_utf8_to_wide_string(const char *ustr);
+
 /* Decode a single UTF-8 character. Returns U+FFFD for any of the
  * illegal cases. If the source is empty, returns L'\0' (and sets the
  * error indicator on the source, of course). */
diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt
index 38882059..30b1d72b 100644
--- a/utils/CMakeLists.txt
+++ b/utils/CMakeLists.txt
@@ -21,6 +21,7 @@ add_sources_from_current_dir(utils
   debug.c
   decode_utf8.c
   decode_utf8_to_wchar.c
+  decode_utf8_to_wide_string.c
   default_description.c
   dupcat.c
   dupprintf.c
diff --git a/utils/decode_utf8_to_wide_string.c b/utils/decode_utf8_to_wide_string.c
new file mode 100644
index 00000000..1ab9a8de
--- /dev/null
+++ b/utils/decode_utf8_to_wide_string.c
@@ -0,0 +1,35 @@
+/*
+ * Decode a string of UTF-8 to a wchar_t string.
+ */
+
+#include "misc.h"
+
+wchar_t *decode_utf8_to_wide_string(const char *s)
+{
+    /* Worst-case wchar_t units per decoded character: a UTF-16
+     * surrogate pair (2) if wchar_t is under 4 bytes, otherwise 1. */
+    const size_t max_units = 1 + (sizeof(wchar_t) < 4);
+
+    wchar_t *out = NULL;
+    size_t outlen = 0, outsize = 0;
+
+    /* Wrap the input string in a BinarySource to consume it in steps */
+    BinarySource src[1];
+    BinarySource_BARE_INIT_PL(src, ptrlen_from_asciz(s));
+
+    while (get_avail(src) > 0) {
+        /* Make room for everything a single decode step may write */
+        sgrowarrayn(out, outsize, outlen, max_units);
+
+        /* 'err' is deliberately unused: on a decoding error the
+         * character decode function will already have emitted U+FFFD
+         * REPLACEMENT CHARACTER, which is the response wanted here. */
+        DecodeUTF8Failure err;
+        outlen += decode_utf8_to_wchar(src, out + outlen, &err);
+    }
+
+    /* Resize to the exact length plus one slot, then NUL-terminate */
+    out = sresize(out, outlen + 1, wchar_t);
+    out[outlen] = L'\0';
+    return out;
+}