From 511eea450ab2552e9744fe378cee81383cdb5aa7 Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Tue, 5 Mar 2019 07:24:17 +0000 Subject: [PATCH] Factor out encode_utf8 from luni_send into utils.c. I knew there had to already be a UTF-8 encoder _somewhere_ in this code base, but it took me a while to find it! Now it's reusable in other contexts. --- ldiscucs.c | 16 +--------------- misc.h | 5 +++++ utils.c | 22 ++++++++++++++++++++++ 3 files changed, 28 insertions(+), 15 deletions(-) diff --git a/ldiscucs.c b/ldiscucs.c index 774368f3..d7a0e269 100644 --- a/ldiscucs.c +++ b/ldiscucs.c @@ -65,21 +65,7 @@ void luni_send(Ldisc *ldisc, const wchar_t *widebuf, int len, bool interactive) } } - if (ch < 0x80) { - *p++ = (char) (ch); - } else if (ch < 0x800) { - *p++ = (char) (0xC0 | (ch >> 6)); - *p++ = (char) (0x80 | (ch & 0x3F)); - } else if (ch < 0x10000) { - *p++ = (char) (0xE0 | (ch >> 12)); - *p++ = (char) (0x80 | ((ch >> 6) & 0x3F)); - *p++ = (char) (0x80 | (ch & 0x3F)); - } else { - *p++ = (char) (0xF0 | (ch >> 18)); - *p++ = (char) (0x80 | ((ch >> 12) & 0x3F)); - *p++ = (char) (0x80 | ((ch >> 6) & 0x3F)); - *p++ = (char) (0x80 | (ch & 0x3F)); - } + p += encode_utf8(p, ch); } } else { int rv; diff --git a/misc.h b/misc.h index f97f6730..d301e2ca 100644 --- a/misc.h +++ b/misc.h @@ -199,6 +199,11 @@ void smemclr(void *b, size_t len); * hinted at by the 'eq' in the name. */ bool smemeq(const void *av, const void *bv, size_t len); +/* Encode a single UTF-8 character. Assumes that illegal characters + * (such as things in the surrogate range, or > 0x10FFFF) have already + * been removed. 
/*
 * Encode one Unicode code point as UTF-8.
 *
 *   output  destination buffer. Between 1 and 4 bytes are stored, so
 *           the caller must supply room for at least 4. No
 *           terminating NUL is written.
 *   ch      the code point to encode.
 *
 * Returns the number of bytes stored in 'output'.
 *
 * A value above 0x10FFFF has no UTF-8 representation: the four-byte
 * form would produce either an out-of-range sequence or a lead byte
 * with bits lost to truncation. Such values are therefore replaced
 * with U+FFFD REPLACEMENT CHARACTER (EF BF BD) rather than emitted as
 * garbage.
 *
 * Values in the surrogate range (0xD800-0xDFFF) are NOT filtered
 * here; they go through the ordinary three-byte form, so callers
 * should still remove them beforehand.
 */
size_t encode_utf8(void *output, unsigned long ch)
{
    unsigned char *start = (unsigned char *)output, *p = start;

    if (ch > 0x10FFFF) {
        /* Not encodable: substitute the replacement character. */
        ch = 0xFFFD;
    }

    if (ch < 0x80) {
        /* 0xxxxxxx */
        *p++ = (unsigned char)ch;
    } else if (ch < 0x800) {
        /* 110xxxxx 10xxxxxx */
        *p++ = (unsigned char)(0xC0 | (ch >> 6));
        *p++ = (unsigned char)(0x80 | (ch & 0x3F));
    } else if (ch < 0x10000) {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        *p++ = (unsigned char)(0xE0 | (ch >> 12));
        *p++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3F));
        *p++ = (unsigned char)(0x80 | (ch & 0x3F));
    } else {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        *p++ = (unsigned char)(0xF0 | (ch >> 18));
        *p++ = (unsigned char)(0x80 | ((ch >> 12) & 0x3F));
        *p++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3F));
        *p++ = (unsigned char)(0x80 | (ch & 0x3F));
    }
    return (size_t)(p - start);
}