From 053d2ba6d1c35c3d3c77bbc48bfd31fd8628ed35 Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Fri, 17 Feb 2012 19:28:55 +0000 Subject: [PATCH] Patch from Yoshida Masato to fill in the missing pieces of Windows UTF-16 support. High Unicode characters in the terminal are now converted back into surrogates during copy and draw operations, and the Windows drawing code takes account of that when splitting up the UTF-16 string for display. Meanwhile, accidental uses of wchar_t have been replaced with 32-bit integers in parts of the cross-platform code which were expecting not to have to deal with UTF-16. [originally from svn r9409] --- ldiscucs.c | 7 ++-- minibidi.c | 2 +- putty.h | 35 +++++++++++++--- terminal.c | 24 ++++++++--- wcwidth.c | 10 ++--- windows/window.c | 104 +++++++++++++++++++++++++++++++++++++++++++---- 6 files changed, 155 insertions(+), 27 deletions(-) diff --git a/ldiscucs.c b/ldiscucs.c index c4fc726d..fe0da8a6 100644 --- a/ldiscucs.c +++ b/ldiscucs.c @@ -51,13 +51,12 @@ void luni_send(void *handle, wchar_t * widebuf, int len, int interactive) for (p = linebuffer, i = 0; i < len; i++) { unsigned long ch = widebuf[i]; - if ((ch & 0xF800) == 0xD800) { + if (IS_SURROGATE(ch)) { #ifdef PLATFORM_IS_UTF16 if (i+1 < len) { unsigned long ch2 = widebuf[i+1]; - if ((ch & 0xFC00) == 0xD800 && - (ch2 & 0xFC00) == 0xDC00) { - ch = 0x10000 + ((ch & 0x3FF) << 10) + (ch2 & 0x3FF); + if (IS_SURROGATE_PAIR(ch, ch2)) { + ch = FROM_SURROGATES(ch, ch2); i++; } } else diff --git a/minibidi.c b/minibidi.c index c0197a5e..f2c68c4f 100644 --- a/minibidi.c +++ b/minibidi.c @@ -58,7 +58,7 @@ shapetypes[(xh)-SHAPE_FIRST].type : SU) /*))*/ #define leastGreaterEven(x) ( ((x)+2) &~ 1 ) typedef struct bidi_char { - wchar_t origwc, wc; + unsigned int origwc, wc; unsigned short index; } bidi_char; diff --git a/putty.h b/putty.h index 4c338b4e..93b683e1 100644 --- a/putty.h +++ b/putty.h @@ -1126,10 +1126,10 @@ void get_unitab(int codepage, wchar_t * unitab, int ftype); /* * Exports 
from wcwidth.c */ -int mk_wcwidth(wchar_t ucs); -int mk_wcswidth(const wchar_t *pwcs, size_t n); -int mk_wcwidth_cjk(wchar_t ucs); -int mk_wcswidth_cjk(const wchar_t *pwcs, size_t n); +int mk_wcwidth(unsigned int ucs); +int mk_wcswidth(const unsigned int *pwcs, size_t n); +int mk_wcwidth_cjk(unsigned int ucs); +int mk_wcswidth_cjk(const unsigned int *pwcs, size_t n); /* * Exports from mscrypto.c @@ -1257,7 +1257,7 @@ void setup_config_box(struct controlbox *b, int midsession, * Exports from minibidi.c. */ typedef struct bidi_char { - wchar_t origwc, wc; + unsigned int origwc, wc; unsigned short index; } bidi_char; int do_bidi(bidi_char *line, int count); @@ -1399,4 +1399,29 @@ void timer_change_notify(long next); #define remove_session_from_jumplist(x) ((void)0) #endif +/* SURROGATE PAIR */ +#ifndef IS_HIGH_SURROGATE +#define HIGH_SURROGATE_START 0xd800 +#define HIGH_SURROGATE_END 0xdbff +#define LOW_SURROGATE_START 0xdc00 +#define LOW_SURROGATE_END 0xdfff + +#define IS_HIGH_SURROGATE(wch) (((wch) >= HIGH_SURROGATE_START) && \ + ((wch) <= HIGH_SURROGATE_END)) +#define IS_LOW_SURROGATE(wch) (((wch) >= LOW_SURROGATE_START) && \ + ((wch) <= LOW_SURROGATE_END)) +#define IS_SURROGATE_PAIR(hs, ls) (IS_HIGH_SURROGATE(hs) && \ + IS_LOW_SURROGATE(ls)) +#endif + + +#define IS_SURROGATE(wch) (((wch) >= HIGH_SURROGATE_START) && \ + ((wch) <= LOW_SURROGATE_END)) +#define HIGH_SURROGATE_OF(codept) \ + (HIGH_SURROGATE_START + (((codept) - 0x10000) >> 10)) +#define LOW_SURROGATE_OF(codept) \ + (LOW_SURROGATE_START + (((codept) - 0x10000) & 0x3FF)) +#define FROM_SURROGATES(wch1, wch2) \ + (0x10000 + (((wch1) & 0x3FF) << 10) + ((wch2) & 0x3FF)) + #endif diff --git a/terminal.c b/terminal.c index abb06e2a..d23cbe91 100644 --- a/terminal.c +++ b/terminal.c @@ -3016,8 +3016,8 @@ static void term_out(Terminal *term) width = 1; if (!width) width = (term->cjk_ambig_wide ? 
- mk_wcwidth_cjk((wchar_t) c) : - mk_wcwidth((wchar_t) c)); + mk_wcwidth_cjk((unsigned int) c) : + mk_wcwidth((unsigned int) c)); if (term->wrapnext && term->wrap && width > 0) { cline->lattr |= LATTR_WRAPPED; @@ -4692,7 +4692,7 @@ static termchar *term_bidi_line(Terminal *term, struct termline *ldata, } term->wcFrom[it].origwc = term->wcFrom[it].wc = - (wchar_t)uc; + (unsigned int)uc; term->wcFrom[it].index = it; } @@ -5067,10 +5067,17 @@ static void do_paint(Terminal *term, Context ctx, int may_optimise) dirty_run = TRUE; } - if (ccount >= chlen) { + if (ccount+2 > chlen) { chlen = ccount + 256; ch = sresize(ch, chlen, wchar_t); } + +#ifdef PLATFORM_IS_UTF16 + if (tchar >= 0x10000 && tchar < 0x110000) { /* U+10000 itself needs a surrogate pair */ + ch[ccount++] = (wchar_t) HIGH_SURROGATE_OF(tchar); + ch[ccount++] = (wchar_t) LOW_SURROGATE_OF(tchar); + } else +#endif /* PLATFORM_IS_UTF16 */ ch[ccount++] = (wchar_t) tchar; if (d->cc_next) { @@ -5094,10 +5101,17 @@ static void do_paint(Terminal *term, Context ctx, int may_optimise) break; } - if (ccount >= chlen) { + if (ccount+2 > chlen) { chlen = ccount + 256; ch = sresize(ch, chlen, wchar_t); } + +#ifdef PLATFORM_IS_UTF16 + if (schar >= 0x10000 && schar < 0x110000) { /* U+10000 itself needs a surrogate pair */ + ch[ccount++] = (wchar_t) HIGH_SURROGATE_OF(schar); + ch[ccount++] = (wchar_t) LOW_SURROGATE_OF(schar); + } else +#endif /* PLATFORM_IS_UTF16 */ ch[ccount++] = (wchar_t) schar; } diff --git a/wcwidth.c b/wcwidth.c index 94ecb540..46ac1e35 100644 --- a/wcwidth.c +++ b/wcwidth.c @@ -69,7 +69,7 @@ struct interval { }; /* auxiliary function for binary search in interval table */ -static int bisearch(wchar_t ucs, const struct interval *table, int max) { +static int bisearch(unsigned int ucs, const struct interval *table, int max) { int min = 0; int mid; @@ -121,7 +121,7 @@ static int bisearch(wchar_t ucs, const struct interval *table, int max) { * in ISO 10646.
*/ -int mk_wcwidth(wchar_t ucs) +int mk_wcwidth(unsigned int ucs) { /* sorted list of non-overlapping intervals of non-spacing characters */ /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */ @@ -198,7 +198,7 @@ int mk_wcwidth(wchar_t ucs) } -int mk_wcswidth(const wchar_t *pwcs, size_t n) +int mk_wcswidth(const unsigned int *pwcs, size_t n) { int w, width = 0; @@ -221,7 +221,7 @@ int mk_wcswidth(const wchar_t *pwcs, size_t n) * the traditional terminal character-width behaviour. It is not * otherwise recommended for general use. */ -int mk_wcwidth_cjk(wchar_t ucs) +int mk_wcwidth_cjk(unsigned int ucs) { /* sorted list of non-overlapping intervals of East Asian Ambiguous * characters, generated by "uniset +WIDTH-A -cat=Me -cat=Mn -cat=Cf c" */ @@ -289,7 +289,7 @@ int mk_wcwidth_cjk(wchar_t ucs) } -int mk_wcswidth_cjk(const wchar_t *pwcs, size_t n) +int mk_wcswidth_cjk(const unsigned int *pwcs, size_t n) { int w, width = 0; diff --git a/windows/window.c b/windows/window.c index a1d99bae..80ca7955 100644 --- a/windows/window.c +++ b/windows/window.c @@ -206,6 +206,12 @@ static int compose_state = 0; static UINT wm_mousewheel = WM_MOUSEWHEEL; +#define IS_HIGH_VARSEL(wch1, wch2) \ + ((wch1) == 0xDB40 && ((wch2) >= 0xDD00 && (wch2) <= 0xDDEF)) +#define IS_LOW_VARSEL(wch) \ + (((wch) >= 0x180B && (wch) <= 0x180D) || /* MONGOLIAN FREE VARIATION SELECTOR */ \ + ((wch) >= 0xFE00 && (wch) <= 0xFE0F)) /* VARIATION SELECTOR 1-16 */ + /* Dummy routine, only required in plink. */ void ldisc_update(void *frontend, int echo, int edit) { @@ -3105,9 +3111,20 @@ static LRESULT CALLBACK WndProc(HWND hwnd, UINT message, * instead we luni_send the characters one by one. 
*/ term_seen_key_event(term); - for (i = 0; i < n; i += 2) { - if (ldisc) + /* don't divide SURROGATE PAIR */ + if (ldisc) { + for (i = 0; i < n; i += 2) { + WCHAR hs = *(unsigned short *)(buff+i); + if (IS_HIGH_SURROGATE(hs) && i+2 < n) { + WCHAR ls = *(unsigned short *)(buff+i+2); + if (IS_LOW_SURROGATE(ls)) { + luni_send(ldisc, (unsigned short *)(buff+i), 2, 1); + i += 2; + continue; + } + } luni_send(ldisc, (unsigned short *)(buff+i), 1, 1); + } } free(buff); } @@ -3309,6 +3326,7 @@ void do_text_internal(Context ctx, int x, int y, wchar_t *text, int len, static int *lpDx = NULL; static int lpDx_len = 0; int *lpDx_maybe; + int len2; /* for SURROGATE PAIR */ lattr &= LATTR_MODE; @@ -3379,7 +3397,8 @@ void do_text_internal(Context ctx, int x, int y, wchar_t *text, int len, } /* Anything left as an original character set is unprintable. */ - if (DIRECT_CHAR(text[0])) { + if (DIRECT_CHAR(text[0]) && + (len < 2 || !IS_SURROGATE_PAIR(text[0], text[1]))) { int i; for (i = 0; i < len; i++) text[i] = 0xFFFD; @@ -3432,6 +3451,24 @@ void do_text_internal(Context ctx, int x, int y, wchar_t *text, int len, line_box.top = y; line_box.right = x + char_width * len; line_box.bottom = y + font_height; + /* adjust line_box.right for SURROGATE PAIR & VARIATION SELECTOR */ + { + int i; + int rc_width = 0; + for (i = 0; i < len ; i++) { + if (i+1 < len && IS_HIGH_VARSEL(text[i], text[i+1])) { + i++; + } else if (i+1 < len && IS_SURROGATE_PAIR(text[i], text[i+1])) { + rc_width += char_width; + i++; + } else if (IS_LOW_VARSEL(text[i])) { + /* do nothing */ + } else { + rc_width += char_width; + } + } + line_box.right = line_box.left + rc_width; + } /* Only want the left half of double width lines */ if (line_box.right > font_width*term->cols+offset_width) @@ -3462,8 +3499,19 @@ void do_text_internal(Context ctx, int x, int y, wchar_t *text, int len, opaque = TRUE; /* start by erasing the rectangle */ for (remaining = len; remaining > 0; - text += len, remaining -= len, x += char_width 
* len) { + text += len, remaining -= len, x += char_width * len2) { len = (maxlen < remaining ? maxlen : remaining); + /* don't split a SURROGATE PAIR or a VARIATION SELECTOR sequence across draw calls */ + len2 = len; + if (maxlen == 1) { + if (remaining >= 2 && IS_SURROGATE_PAIR(text[0], text[1])) /* text[1] must lie inside the run */ + len++; + if (remaining-len >= 1 && IS_LOW_VARSEL(text[len])) + len++; + else if (remaining-len >= 2 && + IS_HIGH_VARSEL(text[len], text[len+1])) + len += 2; + } if (len > lpDx_len) { if (len > lpDx_len) { @@ -3473,8 +3521,24 @@ void do_text_internal(Context ctx, int x, int y, wchar_t *text, int len, } { int i; - for (i = 0; i < len; i++) + /* only last char has dx width in SURROGATE PAIR and + * VARIATION sequence */ + for (i = 0; i < len; i++) { lpDx[i] = char_width; + if (i+1 < len && IS_HIGH_VARSEL(text[i], text[i+1])) { + if (i > 0) lpDx[i-1] = 0; + lpDx[i] = 0; + i++; + lpDx[i] = char_width; + } else if (i+1 < len && IS_SURROGATE_PAIR(text[i],text[i+1])) { + lpDx[i] = 0; + i++; + lpDx[i] = char_width; + } else if (IS_LOW_VARSEL(text[i])) { + if (i > 0) lpDx[i-1] = 0; + lpDx[i] = char_width; + } + } } /* We're using a private area for direct to font. (512 chars.)
*/ @@ -3623,9 +3687,35 @@ void do_text(Context ctx, int x, int y, wchar_t *text, int len, { if (attr & TATTR_COMBINING) { unsigned long a = 0; - attr &= ~TATTR_COMBINING; + int len0 = 1; + /* don't divide SURROGATE PAIR and VARIATION SELECTOR */ + if (len >= 2 && IS_SURROGATE_PAIR(text[0], text[1])) + len0 = 2; + if (len-len0 >= 1 && IS_LOW_VARSEL(text[len0])) { + attr &= ~TATTR_COMBINING; + do_text_internal(ctx, x, y, text, len0+1, attr, lattr); + text += len0+1; + len -= len0+1; + a = TATTR_COMBINING; + } else if (len-len0 >= 2 && IS_HIGH_VARSEL(text[len0], text[len0+1])) { + attr &= ~TATTR_COMBINING; + do_text_internal(ctx, x, y, text, len0+2, attr, lattr); + text += len0+2; + len -= len0+2; + a = TATTR_COMBINING; + } else { + attr &= ~TATTR_COMBINING; + } + while (len--) { - do_text_internal(ctx, x, y, text, 1, attr | a, lattr); + if (len >= 1 && IS_SURROGATE_PAIR(text[0], text[1])) { + do_text_internal(ctx, x, y, text, 2, attr | a, lattr); + len--; + text++; + } else { + do_text_internal(ctx, x, y, text, 1, attr | a, lattr); + } + text++; a = TATTR_COMBINING; }