mirror of
https://git.tartarus.org/simon/putty.git
synced 2025-01-25 01:02:24 +00:00
Factor out term_out's character set translation.
I've moved it into a subfunction term_translate(), which I'm about to reuse elsewhere. No functional change intended.
This commit is contained in:
parent
deafaa811e
commit
3cb846e70f
280
terminal.c
280
terminal.c
@ -2866,6 +2866,153 @@ static void term_display_graphic_char(Terminal *term, unsigned long c)
|
||||
seen_disp_event(term);
|
||||
}
|
||||
|
||||
/*
|
||||
* UCSINCOMPLETE is returned from term_translate if it's successfully
|
||||
* absorbed a byte but not emitted a complete character yet.
|
||||
* UCSTRUNCATED indicates a truncated multibyte sequence (so the
|
||||
* caller emits an error character and then calls term_translate again
|
||||
* with the same input byte). UCSINVALID indicates some other invalid
|
||||
* multibyte sequence, such as an overlong synonym, or a standalone
|
||||
* continuation byte, or a completely illegal thing like 0xFE. These
|
||||
* values are not stored in the terminal data structures at all.
|
||||
*/
|
||||
#define UCSINCOMPLETE 0x8000003FU /* '?' */
|
||||
#define UCSTRUNCATED 0x80000021U /* '!' */
|
||||
#define UCSINVALID 0x8000002AU /* '*' */
|
||||
|
||||
static unsigned long term_translate(Terminal *term, unsigned char c)
|
||||
{
|
||||
if (in_utf(term)) {
|
||||
switch (term->utf_state) {
|
||||
case 0:
|
||||
if (c < 0x80) {
|
||||
/* UTF-8 must be stateless so we ignore iso2022. */
|
||||
if (term->ucsdata->unitab_ctrl[c] != 0xFF) {
|
||||
return term->ucsdata->unitab_ctrl[c];
|
||||
} else if ((term->utf8linedraw) &&
|
||||
(term->cset_attr[term->cset] == CSET_LINEDRW)) {
|
||||
/* Linedraw characters are explicitly enabled */
|
||||
return c | CSET_LINEDRW;
|
||||
} else {
|
||||
return c | CSET_ASCII;
|
||||
}
|
||||
} else if ((c & 0xe0) == 0xc0) {
|
||||
term->utf_size = term->utf_state = 1;
|
||||
term->utf_char = (c & 0x1f);
|
||||
} else if ((c & 0xf0) == 0xe0) {
|
||||
term->utf_size = term->utf_state = 2;
|
||||
term->utf_char = (c & 0x0f);
|
||||
} else if ((c & 0xf8) == 0xf0) {
|
||||
term->utf_size = term->utf_state = 3;
|
||||
term->utf_char = (c & 0x07);
|
||||
} else if ((c & 0xfc) == 0xf8) {
|
||||
term->utf_size = term->utf_state = 4;
|
||||
term->utf_char = (c & 0x03);
|
||||
} else if ((c & 0xfe) == 0xfc) {
|
||||
term->utf_size = term->utf_state = 5;
|
||||
term->utf_char = (c & 0x01);
|
||||
} else {
|
||||
return UCSINVALID;
|
||||
}
|
||||
return UCSINCOMPLETE;
|
||||
case 1:
|
||||
case 2:
|
||||
case 3:
|
||||
case 4:
|
||||
case 5:
|
||||
if ((c & 0xC0) != 0x80) {
|
||||
term->utf_state = 0;
|
||||
return UCSTRUNCATED; /* caller will then give us the
|
||||
* same byte again */
|
||||
}
|
||||
term->utf_char = (term->utf_char << 6) | (c & 0x3f);
|
||||
if (--term->utf_state)
|
||||
return UCSINCOMPLETE;
|
||||
|
||||
unsigned long t = term->utf_char;
|
||||
|
||||
/* Is somebody trying to be evil! */
|
||||
if (t < 0x80 ||
|
||||
(t < 0x800 && term->utf_size >= 2) ||
|
||||
(t < 0x10000 && term->utf_size >= 3) ||
|
||||
(t < 0x200000 && term->utf_size >= 4) ||
|
||||
(t < 0x4000000 && term->utf_size >= 5))
|
||||
return UCSINVALID;
|
||||
|
||||
/* Unicode line separator and paragraph separator are CR-LF */
|
||||
if (t == 0x2028 || t == 0x2029)
|
||||
return 0x85;
|
||||
|
||||
/* High controls are probably a Baaad idea too. */
|
||||
if (t < 0xA0)
|
||||
return 0xFFFD;
|
||||
|
||||
/* The UTF-16 surrogates are not nice either. */
|
||||
/* The standard give the option of decoding these:
|
||||
* I don't want to! */
|
||||
if (t >= 0xD800 && t < 0xE000)
|
||||
return UCSINVALID;
|
||||
|
||||
/* ISO 10646 characters now limited to UTF-16 range. */
|
||||
if (t > 0x10FFFF)
|
||||
return UCSINVALID;
|
||||
|
||||
/* This is currently a TagPhobic application.. */
|
||||
if (t >= 0xE0000 && t <= 0xE007F)
|
||||
return UCSINCOMPLETE;
|
||||
|
||||
/* U+FEFF is best seen as a null. */
|
||||
if (t == 0xFEFF)
|
||||
return UCSINCOMPLETE;
|
||||
/* But U+FFFE is an error. */
|
||||
if (t == 0xFFFE || t == 0xFFFF)
|
||||
return UCSINVALID;
|
||||
|
||||
return t;
|
||||
}
|
||||
} else if (term->sco_acs &&
|
||||
(c!='\033' && c!='\012' && c!='\015' && c!='\b')) {
|
||||
/* Are we in the nasty ACS mode? Note: no sco in utf mode. */
|
||||
if (term->sco_acs == 2)
|
||||
c |= 0x80;
|
||||
|
||||
return c | CSET_SCOACS;
|
||||
} else {
|
||||
switch (term->cset_attr[term->cset]) {
|
||||
/*
|
||||
* Linedraw characters are different from 'ESC ( B'
|
||||
* only for a small range. For ones outside that
|
||||
* range, make sure we use the same font as well as
|
||||
* the same encoding.
|
||||
*/
|
||||
case CSET_LINEDRW:
|
||||
if (term->ucsdata->unitab_ctrl[c] != 0xFF)
|
||||
return term->ucsdata->unitab_ctrl[c];
|
||||
else
|
||||
return c | CSET_LINEDRW;
|
||||
break;
|
||||
|
||||
case CSET_GBCHR:
|
||||
/* If UK-ASCII, make the '#' a LineDraw Pound */
|
||||
if (c == '#')
|
||||
return '}' | CSET_LINEDRW;
|
||||
/* fall through */
|
||||
|
||||
case CSET_ASCII:
|
||||
if (term->ucsdata->unitab_ctrl[c] != 0xFF)
|
||||
return term->ucsdata->unitab_ctrl[c];
|
||||
else
|
||||
return c | CSET_ASCII;
|
||||
break;
|
||||
case CSET_SCOACS:
|
||||
if (c >= ' ')
|
||||
return c | CSET_SCOACS;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
/*
|
||||
* Remove everything currently in `inbuf' and stick it up on the
|
||||
* in-memory display. There's a big state machine in here to
|
||||
@ -2945,135 +3092,22 @@ static void term_out(Terminal *term)
|
||||
}
|
||||
}
|
||||
|
||||
/* First see about all those translations. */
|
||||
/* Do character-set translation. */
|
||||
if (term->termstate == TOPLEVEL) {
|
||||
if (in_utf(term))
|
||||
switch (term->utf_state) {
|
||||
case 0:
|
||||
if (c < 0x80) {
|
||||
/* UTF-8 must be stateless so we ignore iso2022. */
|
||||
if (term->ucsdata->unitab_ctrl[c] != 0xFF)
|
||||
c = term->ucsdata->unitab_ctrl[c];
|
||||
else if ((term->utf8linedraw) &&
|
||||
(term->cset_attr[term->cset] == CSET_LINEDRW))
|
||||
/* Linedraw characters are explicitly enabled */
|
||||
c = ((unsigned char) c) | CSET_LINEDRW;
|
||||
else c = ((unsigned char)c) | CSET_ASCII;
|
||||
break;
|
||||
} else if ((c & 0xe0) == 0xc0) {
|
||||
term->utf_size = term->utf_state = 1;
|
||||
term->utf_char = (c & 0x1f);
|
||||
} else if ((c & 0xf0) == 0xe0) {
|
||||
term->utf_size = term->utf_state = 2;
|
||||
term->utf_char = (c & 0x0f);
|
||||
} else if ((c & 0xf8) == 0xf0) {
|
||||
term->utf_size = term->utf_state = 3;
|
||||
term->utf_char = (c & 0x07);
|
||||
} else if ((c & 0xfc) == 0xf8) {
|
||||
term->utf_size = term->utf_state = 4;
|
||||
term->utf_char = (c & 0x03);
|
||||
} else if ((c & 0xfe) == 0xfc) {
|
||||
term->utf_size = term->utf_state = 5;
|
||||
term->utf_char = (c & 0x01);
|
||||
} else {
|
||||
c = UCSERR;
|
||||
break;
|
||||
}
|
||||
continue;
|
||||
case 1:
|
||||
case 2:
|
||||
case 3:
|
||||
case 4:
|
||||
case 5:
|
||||
if ((c & 0xC0) != 0x80) {
|
||||
unsigned long t = term_translate(term, c);
|
||||
switch (t) {
|
||||
case UCSINCOMPLETE:
|
||||
continue; /* didn't complete a multibyte char */
|
||||
case UCSTRUNCATED:
|
||||
unget = c;
|
||||
/* fall through */
|
||||
case UCSINVALID:
|
||||
c = UCSERR;
|
||||
term->utf_state = 0;
|
||||
break;
|
||||
}
|
||||
term->utf_char = (term->utf_char << 6) | (c & 0x3f);
|
||||
if (--term->utf_state)
|
||||
continue;
|
||||
|
||||
c = term->utf_char;
|
||||
|
||||
/* Is somebody trying to be evil! */
|
||||
if (c < 0x80 ||
|
||||
(c < 0x800 && term->utf_size >= 2) ||
|
||||
(c < 0x10000 && term->utf_size >= 3) ||
|
||||
(c < 0x200000 && term->utf_size >= 4) ||
|
||||
(c < 0x4000000 && term->utf_size >= 5))
|
||||
c = UCSERR;
|
||||
|
||||
/* Unicode line separator and paragraph separator are CR-LF */
|
||||
if (c == 0x2028 || c == 0x2029)
|
||||
c = 0x85;
|
||||
|
||||
/* High controls are probably a Baaad idea too. */
|
||||
if (c < 0xA0)
|
||||
c = 0xFFFD;
|
||||
|
||||
/* The UTF-16 surrogates are not nice either. */
|
||||
/* The standard give the option of decoding these:
|
||||
* I don't want to! */
|
||||
if (c >= 0xD800 && c < 0xE000)
|
||||
c = UCSERR;
|
||||
|
||||
/* ISO 10646 characters now limited to UTF-16 range. */
|
||||
if (c > 0x10FFFF)
|
||||
c = UCSERR;
|
||||
|
||||
/* This is currently a TagPhobic application.. */
|
||||
if (c >= 0xE0000 && c <= 0xE007F)
|
||||
continue;
|
||||
|
||||
/* U+FEFF is best seen as a null. */
|
||||
if (c == 0xFEFF)
|
||||
continue;
|
||||
/* But U+FFFE is an error. */
|
||||
if (c == 0xFFFE || c == 0xFFFF)
|
||||
c = UCSERR;
|
||||
|
||||
default:
|
||||
c = t;
|
||||
break;
|
||||
}
|
||||
/* Are we in the nasty ACS mode? Note: no sco in utf mode. */
|
||||
else if(term->sco_acs &&
|
||||
(c!='\033' && c!='\012' && c!='\015' && c!='\b'))
|
||||
{
|
||||
if (term->sco_acs == 2) c |= 0x80;
|
||||
c |= CSET_SCOACS;
|
||||
} else {
|
||||
switch (term->cset_attr[term->cset]) {
|
||||
/*
|
||||
* Linedraw characters are different from 'ESC ( B'
|
||||
* only for a small range. For ones outside that
|
||||
* range, make sure we use the same font as well as
|
||||
* the same encoding.
|
||||
*/
|
||||
case CSET_LINEDRW:
|
||||
if (term->ucsdata->unitab_ctrl[c] != 0xFF)
|
||||
c = term->ucsdata->unitab_ctrl[c];
|
||||
else
|
||||
c = ((unsigned char) c) | CSET_LINEDRW;
|
||||
break;
|
||||
|
||||
case CSET_GBCHR:
|
||||
/* If UK-ASCII, make the '#' a LineDraw Pound */
|
||||
if (c == '#') {
|
||||
c = '}' | CSET_LINEDRW;
|
||||
break;
|
||||
}
|
||||
/*FALLTHROUGH*/ case CSET_ASCII:
|
||||
if (term->ucsdata->unitab_ctrl[c] != 0xFF)
|
||||
c = term->ucsdata->unitab_ctrl[c];
|
||||
else
|
||||
c = ((unsigned char) c) | CSET_ASCII;
|
||||
break;
|
||||
case CSET_SCOACS:
|
||||
if (c>=' ') c = ((unsigned char)c) | CSET_SCOACS;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
Loading…
Reference in New Issue
Block a user