From 0dcdb1b5a35ab120e3b1114d58db42b9420b33d9 Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Mon, 4 Mar 2019 20:53:41 +0000 Subject: [PATCH] Expose term_translate outside terminal.c. Also, instead of insisting on modifying the UTF-8 decoding state inside the Terminal structure, it now takes a separate pointer to a small struct containing that decode state. The idea is that if a separate module wants to decode characters the same way the real terminal would, it can pass its own mutable state structure, but the same main Terminal pointer. --- defs.h | 1 + terminal.c | 59 +++++++++++++++++++++--------------------------------- terminal.h | 27 ++++++++++++++++++++++--- 3 files changed, 48 insertions(+), 39 deletions(-) diff --git a/defs.h b/defs.h index a07a3562..6c37b99c 100644 --- a/defs.h +++ b/defs.h @@ -28,6 +28,7 @@ typedef struct conf_tag Conf; typedef struct terminal_tag Terminal; +typedef struct term_utf8_decode term_utf8_decode; typedef struct Filename Filename; typedef struct FontSpec FontSpec; diff --git a/terminal.c b/terminal.c index 3a8d7e11..794ffa82 100644 --- a/terminal.c +++ b/terminal.c @@ -1285,7 +1285,7 @@ static void power_on(Terminal *term, bool clear) term->utf = false; term->save_utf = false; term->alt_save_utf = false; - term->utf_state = 0; + term->utf8.state = 0; term->alt_sco_acs = term->sco_acs = term->save_sco_acs = term->alt_save_sco_acs = 0; term->cset_attr[0] = term->cset_attr[1] = @@ -2866,24 +2866,11 @@ static void term_display_graphic_char(Terminal *term, unsigned long c) seen_disp_event(term); } -/* - * UCSINCOMPLETE is returned from term_translate if it's successfully - * absorbed a byte but not emitted a complete character yet. - * UCSTRUNCATED indicates a truncated multibyte sequence (so the - * caller emits an error character and then calls term_translate again - * with the same input byte). UCSINVALID indicates some other invalid - * multibyte sequence, such as an overlong synonym, or a standalone - * continuation byte, or a completely illegal thing like 0xFE. These - * values are not stored in the terminal data structures at all. - */ -#define UCSINCOMPLETE 0x8000003FU /* '?' */ -#define UCSTRUNCATED 0x80000021U /* '!' */ -#define UCSINVALID 0x8000002AU /* '*' */ - -static unsigned long term_translate(Terminal *term, unsigned char c) +unsigned long term_translate( + Terminal *term, struct term_utf8_decode *utf8, unsigned char c) { if (in_utf(term)) { - switch (term->utf_state) { + switch (utf8->state) { case 0: if (c < 0x80) { /* UTF-8 must be stateless so we ignore iso2022. */ @@ -2897,20 +2884,20 @@ static unsigned long term_translate(Terminal *term, unsigned char c) return c | CSET_ASCII; } } else if ((c & 0xe0) == 0xc0) { - term->utf_size = term->utf_state = 1; - term->utf_char = (c & 0x1f); + utf8->size = utf8->state = 1; + utf8->chr = (c & 0x1f); } else if ((c & 0xf0) == 0xe0) { - term->utf_size = term->utf_state = 2; - term->utf_char = (c & 0x0f); + utf8->size = utf8->state = 2; + utf8->chr = (c & 0x0f); } else if ((c & 0xf8) == 0xf0) { - term->utf_size = term->utf_state = 3; - term->utf_char = (c & 0x07); + utf8->size = utf8->state = 3; + utf8->chr = (c & 0x07); } else if ((c & 0xfc) == 0xf8) { - term->utf_size = term->utf_state = 4; - term->utf_char = (c & 0x03); + utf8->size = utf8->state = 4; + utf8->chr = (c & 0x03); } else if ((c & 0xfe) == 0xfc) { - term->utf_size = term->utf_state = 5; - term->utf_char = (c & 0x01); + utf8->size = utf8->state = 5; + utf8->chr = (c & 0x01); } else { return UCSINVALID; } @@ -2921,22 +2908,22 @@ static unsigned long term_translate(Terminal *term, unsigned char c) case 4: case 5: if ((c & 0xC0) != 0x80) { - term->utf_state = 0; + utf8->state = 0; return UCSTRUNCATED; /* caller will then give us the * same byte again */ } - term->utf_char = (term->utf_char << 6) | (c & 0x3f); - if (--term->utf_state) + utf8->chr = (utf8->chr << 6) | (c & 0x3f); + if (--utf8->state) return UCSINCOMPLETE; - unsigned long t = term->utf_char; + unsigned long t = utf8->chr; /* Is somebody trying to be evil! */ if (t < 0x80 || - (t < 0x800 && term->utf_size >= 2) || - (t < 0x10000 && term->utf_size >= 3) || - (t < 0x200000 && term->utf_size >= 4) || - (t < 0x4000000 && term->utf_size >= 5)) + (t < 0x800 && utf8->size >= 2) || + (t < 0x10000 && utf8->size >= 3) || + (t < 0x200000 && utf8->size >= 4) || + (t < 0x4000000 && utf8->size >= 5)) return UCSINVALID; /* Unicode line separator and paragraph separator are CR-LF */ @@ -3094,7 +3081,7 @@ static void term_out(Terminal *term) /* Do character-set translation. */ if (term->termstate == TOPLEVEL) { - unsigned long t = term_translate(term, c); + unsigned long t = term_translate(term, &term->utf8, c); switch (t) { case UCSINCOMPLETE: continue; /* didn't complete a multibyte char */ diff --git a/terminal.h b/terminal.h index 7981fd33..eae4c7fa 100644 --- a/terminal.h +++ b/terminal.h @@ -63,6 +63,12 @@ struct bidi_cache_entry { int *forward, *backward; /* the permutations of line positions */ }; +struct term_utf8_decode { + int state; /* Is there a pending UTF-8 character */ + int chr; /* and what is it so far? */ + int size; /* The size of the UTF character. */ +}; + struct terminal_tag { int compatibility_level; @@ -116,9 +122,7 @@ struct terminal_tag { int sco_acs, save_sco_acs; /* CSI 10,11,12m -> OEM charset */ bool vt52_bold; /* Force bold on non-bold colours */ bool utf; /* Are we in toggleable UTF-8 mode? */ - int utf_state; /* Is there a pending UTF-8 character */ - int utf_char; /* and what is it so far. */ - int utf_size; /* The size of the UTF character. */ + term_utf8_decode utf8; /* If so, here's our decoding state */ bool printing, only_printing; /* Are we doing ANSI printing? */ int print_state; /* state of print-end-sequence scan */ bufchain printer_buf; /* buffered data for printer */ @@ -335,4 +339,21 @@ static inline bool in_utf(Terminal *term) return term->utf || term->ucsdata->line_codepage == CP_UTF8; } +unsigned long term_translate( + Terminal *term, term_utf8_decode *utf8, unsigned char c); + +/* + * UCSINCOMPLETE is returned from term_translate if it's successfully + * absorbed a byte but not emitted a complete character yet. + * UCSTRUNCATED indicates a truncated multibyte sequence (so the + * caller emits an error character and then calls term_translate again + * with the same input byte). UCSINVALID indicates some other invalid + * multibyte sequence, such as an overlong synonym, or a standalone + * continuation byte, or a completely illegal thing like 0xFE. These + * values are not stored in the terminal data structures at all. + */ +#define UCSINCOMPLETE 0x8000003FU /* '?' */ +#define UCSTRUNCATED 0x80000021U /* '!' */ +#define UCSINVALID 0x8000002AU /* '*' */ + #endif