Remove uni_tbl from struct unicode_data.

Instead of maintaining a single sparse table mapping Unicode to the currently selected code page, we now maintain a collection of such tables mapping Unicode to any code page we've so far found a need to work with, and we add code pages to that list as necessary, and never throw them away (since there are a limited number of them). This means that the wc_to_mb family of functions is effectively stateless: they no longer depend on a 'struct unicode_data' corresponding to the current terminal settings. So I've removed that parameter from all of them. This fills in the missing piece of yesterday's commit a216d86106: now wc_to_mb too should be able to handle internally-implemented character sets, by hastily making the reverse mapping table for such a character set if one doesn't already exist. (That was only a _latent_ bug, because the only use of wc_to_mb in the cross-platform or Windows code _did_ want to convert to the currently selected code page, so the old strategy worked in that case. But there was no protection against an unworkable use of it being added later.)
2025-07-12 00:33:53 -05:00 · 2022-06-01 08:35:12 +01:00
parent 8a907510dd
commit 5a28658a6d
9 changed files with 130 additions and 53 deletions
--- a/misc.h
+++ b/misc.h
@ -72,9 +72,9 @@ void strbuf_finalise_agent_query(strbuf *buf);
 wchar_t *dup_mb_to_wc_c(int codepage, int flags, const char *string, int len);
 wchar_t *dup_mb_to_wc(int codepage, int flags, const char *string);
 char *dup_wc_to_mb_c(int codepage, int flags, const wchar_t *string, int len,
-                     const char *defchr, struct unicode_data *ucsdata);
+                     const char *defchr);
 char *dup_wc_to_mb(int codepage, int flags, const wchar_t *string,
-                   const char *defchr, struct unicode_data *ucsdata);
+                   const char *defchr);

 static inline int toint(unsigned u)
 {
--- a/putty.h
+++ b/putty.h
@ -266,7 +266,6 @@ struct sesslist {
 };

 struct unicode_data {
-    char **uni_tbl;
    bool dbcs_screenfont;
    int font_codepage;
    int line_codepage;
@ -2436,8 +2435,7 @@ bool is_dbcs_leadbyte(int codepage, char byte);
 int mb_to_wc(int codepage, int flags, const char *mbstr, int mblen,
             wchar_t *wcstr, int wclen);
 int wc_to_mb(int codepage, int flags, const wchar_t *wcstr, int wclen,
-             char *mbstr, int mblen, const char *defchr,
-             struct unicode_data *ucsdata);
+             char *mbstr, int mblen, const char *defchr);
 wchar_t xlat_uskbd2cyrllic(int ch);
 int check_compose(int first, int second);
 int decode_codepage(const char *cp_name);
--- a/terminal/terminal.c
+++ b/terminal/terminal.c
@ -3416,7 +3416,7 @@ static strbuf *term_input_data_from_unicode(
        char *bufptr = strbuf_append(buf, len + 1);
        int rv;
        rv = wc_to_mb(term->ucsdata->line_codepage, 0, widebuf, len,
-                      bufptr, len + 1, NULL, term->ucsdata);
+                      bufptr, len + 1, NULL);
        strbuf_shrink_to(buf, rv < 0 ? 0 : rv);
    }

--- a/unix/unicode.c
+++ b/unix/unicode.c
@ -61,8 +61,7 @@ int mb_to_wc(int codepage, int flags, const char *mbstr, int mblen,
 }

 int wc_to_mb(int codepage, int flags, const wchar_t *wcstr, int wclen,
-             char *mbstr, int mblen, const char *defchr,
-             struct unicode_data *ucsdata)
+             char *mbstr, int mblen, const char *defchr)
 {
    if (codepage == DEFAULT_CODEPAGE) {
        char output[MB_LEN_MAX];
--- a/unix/unifont.c
+++ b/unix/unifont.c
@ -600,7 +600,7 @@ static bool x11font_has_glyph(unifont *font, wchar_t glyph)
         */
        char sbstring[2];
        int sblen = wc_to_mb(xfont->real_charset, 0, &glyph, 1,
-                             sbstring, 2, "", NULL);
+                             sbstring, 2, "");
        if (sblen == 0 || !sbstring[0])
            return false;              /* not even in the charset */

@ -956,7 +956,7 @@ static void x11font_draw_text(unifont_drawctx *ctx, unifont *font,
         */
        char *sbstring = snewn(len+1, char);
        int sblen = wc_to_mb(xfont->real_charset, 0, string, len,
-                             sbstring, len+1, ".", NULL);
+                             sbstring, len+1, ".");
        x11font_really_draw_text(x11font_drawfuncs + index + 0, ctx,
                                 &xfont->fonts[sfid], xfont->disp, x, y,
                                 sbstring, sblen, shadowoffset,
@ -1644,8 +1644,7 @@ static void pangofont_draw_internal(unifont_drawctx *ctx, unifont *font,
     * string to UTF-8.
     */
    utfstring = snewn(len*6+1, char); /* UTF-8 has max 6 bytes/char */
-    utflen = wc_to_mb(CS_UTF8, 0, string, len,
-                      utfstring, len*6+1, ".", NULL);
+    utflen = wc_to_mb(CS_UTF8, 0, string, len, utfstring, len*6+1, ".");

    utfptr = utfstring;
    while (utflen > 0) {
--- a/unix/window.c
+++ b/unix/window.c
@ -3056,8 +3056,7 @@ static void gtkwin_clip_write(
    state->pasteout_data_len = len*6;
    state->pasteout_data_len = wc_to_mb(inst->ucsdata.line_codepage, 0,
                                       data, len, state->pasteout_data,
-                                       state->pasteout_data_len,
-                                       NULL, NULL);
+                                       state->pasteout_data_len, NULL);
    if (state->pasteout_data_len == 0) {
        sfree(state->pasteout_data);
        state->pasteout_data = NULL;
--- a/utils/dup_wc_to_mb.c
+++ b/utils/dup_wc_to_mb.c
@ -11,14 +11,14 @@
 #include "misc.h"

 char *dup_wc_to_mb_c(int codepage, int flags, const wchar_t *string, int len,
-                     const char *defchr, struct unicode_data *ucsdata)
+                     const char *defchr)
 {
    size_t outsize = len+1;
    char *out = snewn(outsize, char);

    while (true) {
        size_t outlen = wc_to_mb(codepage, flags, string, len, out, outsize,
-                                 defchr, ucsdata);
+                                 defchr);
        /* We can only be sure we've consumed the whole input if the
         * output is not within a multibyte-character-length of the
         * end of the buffer! */
@ -32,8 +32,7 @@ char *dup_wc_to_mb_c(int codepage, int flags, const wchar_t *string, int len,
 }

 char *dup_wc_to_mb(int codepage, int flags, const wchar_t *string,
-                   const char *defchr, struct unicode_data *ucsdata)
+                   const char *defchr)
 {
-    return dup_wc_to_mb_c(codepage, flags, string, wcslen(string),
-                          defchr, ucsdata);
+    return dup_wc_to_mb_c(codepage, flags, string, wcslen(string), defchr);
 }
--- a/windows/unicode.c
+++ b/windows/unicode.c
@ -437,9 +437,114 @@ static const struct cp_list_item cp_list[] = {

 static void link_font(WCHAR * line_tbl, WCHAR * font_tbl, WCHAR attr);

+/*
+ * We keep a collection of reverse mappings from Unicode back to code pages,
+ * in the form of array[256] of array[256] of char. These live forever in a
+ * local tree234, and we just make a new one whenever we find a need.
+ *
+ * The outer array is indexed by the high byte of a Unicode value and the
+ * inner array by its low byte; the char stored there is the code page byte
+ * for that character. Inner arrays are allocated lazily and zero-filled, so
+ * a NULL outer entry or a zero inner entry is read by wc_to_mb as "no
+ * mapping recorded".
+ */
+typedef struct reverse_mapping {
+    int codepage;   /* code page this table converts to; tree sort key */
+    char **blocks;  /* 256 entries, each NULL or a 256-char block */
+} reverse_mapping;
+static tree234 *reverse_mappings = NULL;  /* created on first insertion */
+
+/*
+ * Sort comparison handed to newtree234: orders two reverse_mapping records
+ * by ascending code page number.
+ */
+static int reverse_mapping_cmp(void *av, void *bv)
+{
+    const reverse_mapping *lhs = av;
+    const reverse_mapping *rhs = bv;
+
+    if (lhs->codepage != rhs->codepage)
+        return lhs->codepage < rhs->codepage ? -1 : +1;
+    return 0;
+}
+
+/*
+ * Asymmetric comparison used with find234 to look a mapping up by bare
+ * code page number.
+ *
+ * tree234 passes the search key as the first argument and the stored tree
+ * element as the second, so av points at the int code page being looked
+ * for and bv at a reverse_mapping already in the tree. Interpreting them
+ * this way round means the lookup does not depend on 'codepage' happening
+ * to be the first member of struct reverse_mapping.
+ *
+ * Returns negative, zero or positive as the key sorts before, equal to or
+ * after the element, consistent with reverse_mapping_cmp.
+ */
+static int reverse_mapping_find(void *av, void *bv)
+{
+    int a_codepage = *(const int *)av;
+    const reverse_mapping *b = (const reverse_mapping *)bv;
+    if (a_codepage < b->codepage)
+        return -1;
+    if (a_codepage > b->codepage)
+        return +1;
+    return 0;
+}
+
+/*
+ * Look up an already-built reverse mapping for 'codepage' without creating
+ * anything. Returns NULL if the tree does not exist yet or find234 finds no
+ * matching entry.
+ */
+static reverse_mapping *get_existing_reverse_mapping(int codepage)
+{
+    return reverse_mappings ?
+        find234(reverse_mappings, &codepage, reverse_mapping_find) : NULL;
+}
+
+/*
+ * Build a reverse mapping for 'codepage' from its forward table and insert
+ * it into the reverse_mappings tree, creating the tree on first use.
+ *
+ * 'mapping' is read at indices 0..255: entry N is the Unicode value that
+ * code page byte N translates to. The caller must already have checked
+ * that no mapping for this code page is in the tree; that is asserted
+ * after insertion. Returns the newly added record.
+ */
+static reverse_mapping *make_reverse_mapping_inner(
+    int codepage, const wchar_t *mapping)
+{
+    reverse_mapping *rmap = snew(reverse_mapping);
+    rmap->codepage = codepage;
+    rmap->blocks = snewn(256, char *);
+    memset(rmap->blocks, 0, 256 * sizeof(char *));
+
+    for (size_t byte = 0; byte < 256; byte++) {
+        wchar_t wc = mapping[byte];
+
+        /* These special kinds of value correspond to no Unicode character */
+        if (DIRECT_CHAR(wc) || DIRECT_FONT(wc))
+            continue;
+
+        size_t hi = (size_t)wc >> 8;
+        size_t lo = (size_t)wc & 0xFF;
+
+        /* Allocate the 256-entry block for this high byte on first use */
+        char **slot = &rmap->blocks[hi];
+        if (!*slot) {
+            *slot = snewn(256, char);
+            memset(*slot, 0, 256);
+        }
+        (*slot)[lo] = byte;
+    }
+
+    if (!reverse_mappings)
+        reverse_mappings = newtree234(reverse_mapping_cmp);
+
+    reverse_mapping *added = add234(reverse_mappings, rmap);
+    assert(added == rmap); /* we already checked it wasn't already in there */
+    return added;
+}
+
+/*
+ * Ensure a reverse mapping exists for 'codepage', building one from the
+ * supplied forward table only if none has been made before.
+ */
+static void make_reverse_mapping(int codepage, const wchar_t *mapping)
+{
+    if (!get_existing_reverse_mapping(codepage))
+        make_reverse_mapping_inner(codepage, mapping);
+}
+
+/*
+ * Return the reverse mapping for 'codepage', building it on demand where
+ * possible.
+ *
+ * If a mapping is already in the tree it is returned directly. Otherwise
+ * one is built only when 'codepage' is 65536 plus a valid index into
+ * cp_list and that entry has a cp_table. Returns NULL in every other case,
+ * which wc_to_mb treats as "no array lookup available".
+ */
+static reverse_mapping *get_reverse_mapping(int codepage)
+{
+    /*
+     * Try harder to get a reverse mapping for a codepage we implement
+     * internally via a translation table, by hastily making it if it doesn't
+     * already exist.
+     */
+
+    reverse_mapping *rmap = get_existing_reverse_mapping(codepage);
+    if (rmap)
+        return rmap;
+
+    if (codepage < 65536)
+        return NULL;
+    /* Valid indices are 0 .. lenof(cp_list)-1, so the limit is exclusive */
+    if (codepage >= 65536 + lenof(cp_list))
+        return NULL;
+    const struct cp_list_item *cp = &cp_list[codepage - 65536];
+    if (!cp->cp_table)
+        return NULL;
+
+    wchar_t mapping[256];
+    get_unitab(codepage, mapping, 0);
+    return make_reverse_mapping_inner(codepage, mapping);
+}
+
 void init_ucs(Conf *conf, struct unicode_data *ucsdata)
 {
-    int i, j;
+    int i;
    bool used_dtf = false;
    int vtmode;

@ -522,31 +627,9 @@ void init_ucs(Conf *conf, struct unicode_data *ucsdata)
           sizeof(unitab_xterm_std));
    ucsdata->unitab_xterm['_'] = ' ';

-    /* Generate UCS ->line page table. */
-    if (ucsdata->uni_tbl) {
-        for (i = 0; i < 256; i++)
-            if (ucsdata->uni_tbl[i])
-                sfree(ucsdata->uni_tbl[i]);
-        sfree(ucsdata->uni_tbl);
-        ucsdata->uni_tbl = 0;
-    }
    if (!used_dtf) {
-        for (i = 0; i < 256; i++) {
-            if (DIRECT_CHAR(ucsdata->unitab_line[i]))
-                continue;
-            if (DIRECT_FONT(ucsdata->unitab_line[i]))
-                continue;
-            if (!ucsdata->uni_tbl) {
-                ucsdata->uni_tbl = snewn(256, char *);
-                memset(ucsdata->uni_tbl, 0, 256 * sizeof(char *));
-            }
-            j = ((ucsdata->unitab_line[i] >> 8) & 0xFF);
-            if (!ucsdata->uni_tbl[j]) {
-                ucsdata->uni_tbl[j] = snewn(256, char);
-                memset(ucsdata->uni_tbl[j], 0, 256 * sizeof(char));
-            }
-            ucsdata->uni_tbl[j][ucsdata->unitab_line[i] & 0xFF] = i;
-        }
+        /* Make sure a reverse mapping exists for this code page. */
+        make_reverse_mapping(ucsdata->line_codepage, ucsdata->unitab_line);
    }

    /* Find the line control characters. */
@ -1156,20 +1239,21 @@ void get_unitab(int codepage, wchar_t * unitab, int ftype)
 }

 int wc_to_mb(int codepage, int flags, const wchar_t *wcstr, int wclen,
-             char *mbstr, int mblen, const char *defchr,
-             struct unicode_data *ucsdata)
+             char *mbstr, int mblen, const char *defchr)
 {
-    char *p;
-    int i;
-    if (ucsdata && codepage == ucsdata->line_codepage && ucsdata->uni_tbl) {
+    reverse_mapping *rmap = get_reverse_mapping(codepage);
+
+    if (rmap) {
        /* Do this by array lookup if we can. */
        if (wclen < 0) {
            for (wclen = 0; wcstr[wclen++] ;);   /* will include the NUL */
        }
+        char *p;
+        int i;
        for (p = mbstr, i = 0; i < wclen; i++) {
            wchar_t ch = wcstr[i];
            int by;
-            char *p1;
+            const char *p1;

            #define WRITECH(chr) do             \
            {                                   \
@ -1177,8 +1261,7 @@ int wc_to_mb(int codepage, int flags, const wchar_t *wcstr, int wclen,
                *p++ = (char)(chr);             \
            } while (0)

-            if (ucsdata->uni_tbl &&
-                (p1 = ucsdata->uni_tbl[(ch >> 8) & 0xFF]) != NULL &&
+            if ((p1 = rmap->blocks[(ch >> 8) & 0xFF]) != NULL &&
                (by = p1[ch & 0xFF]) != '\0')
                WRITECH(by);
            else if (ch < 0x80)
--- a/windows/window.c
+++ b/windows/window.c
@ -473,7 +473,7 @@ static void sw_SetWindowText(HWND hwnd, wchar_t *text)
    if (unicode_window) {
        SetWindowTextW(hwnd, text);
    } else {
-        char *mb = dup_wc_to_mb(DEFAULT_CODEPAGE, 0, text, "?", &ucsdata);
+        char *mb = dup_wc_to_mb(DEFAULT_CODEPAGE, 0, text, "?");
        SetWindowTextA(hwnd, mb);
        sfree(mb);
    }