1
0
mirror of https://git.tartarus.org/simon/putty.git synced 2025-01-09 17:38:00 +00:00
putty-source/stripctrl.c
Simon Tatham e74790003c StripCtrlChars: option to provide a target Terminal.
If you use the new stripctrl_new_term() to construct a StripCtrlChars
instead of the existing stripctrl_new(), then the resulting object
will align itself with the character-set configuration of the Terminal
object you point it at. (In fact, it'll reuse the same actual
translation code, courtesy of the last few refactoring commits.) So it
will interpret things as control characters precisely if that Terminal
would also have done so.

The previous locale-based sanitisation is appropriate if you're
sending the sanitised output to an OS terminal device managed outside
this process - the LC_CTYPE setting has the best chance of knowing how
that terminal device will interpret a byte stream. But I want to start
using the same sanitisation system for data intended for PuTTY's own
internal terminal emulator, in which case there's no reason why
LC_CTYPE should be expected to match that terminal's configuration,
and no reason to need it to either since we can check the internal
terminal configuration directly.

One small bodge: stripctrl_new_term() is actually a macro, which
passes in the function pointer term_translate() to the underlying real
constructor. That's just so that console-only tools can link in
stripctrl.c without acquiring a dependency on terminal.c (similarly to
how we pass random_read in to the mp_random functions).
2019-03-06 20:31:26 +00:00

388 lines
12 KiB
C

/*
* stripctrl.c: a facility for stripping control characters out of a
* data stream (defined as any multibyte character in the system
* locale which is neither printable nor \n), using the standard C
* library multibyte character facilities.
*/
#include <assert.h>
#include <locale.h>
#include <string.h>
#include <wchar.h>
#include <wctype.h>
#include "putty.h"
#include "terminal.h"
#include "misc.h"
#include "marshal.h"
#define SCC_BUFSIZE 64
typedef struct StripCtrlCharsImpl StripCtrlCharsImpl;
struct StripCtrlCharsImpl {
mbstate_t mbs_in, mbs_out;
bool permit_cr;
wchar_t substitution;
char buf[SCC_BUFSIZE];
size_t buflen;
Terminal *term;
bool last_term_utf;
struct term_utf8_decode utf8;
unsigned long (*translate)(Terminal *, term_utf8_decode *, unsigned char);
BinarySink *bs_out;
StripCtrlChars public;
};
static void stripctrl_locale_BinarySink_write(
BinarySink *bs, const void *vp, size_t len);
static void stripctrl_term_BinarySink_write(
BinarySink *bs, const void *vp, size_t len);
static StripCtrlCharsImpl *stripctrl_new_common(
BinarySink *bs_out, bool permit_cr, wchar_t substitution)
{
StripCtrlCharsImpl *scc = snew(StripCtrlCharsImpl);
memset(scc, 0, sizeof(StripCtrlCharsImpl)); /* zeroes mbstates */
scc->bs_out = bs_out;
scc->permit_cr = permit_cr;
scc->substitution = substitution;
return scc;
}
StripCtrlChars *stripctrl_new(
BinarySink *bs_out, bool permit_cr, wchar_t substitution)
{
StripCtrlCharsImpl *scc = stripctrl_new_common(
bs_out, permit_cr, substitution);
BinarySink_INIT(&scc->public, stripctrl_locale_BinarySink_write);
return &scc->public;
}
StripCtrlChars *stripctrl_new_term_fn(
BinarySink *bs_out, bool permit_cr, wchar_t substitution,
Terminal *term, unsigned long (*translate)(
Terminal *, term_utf8_decode *, unsigned char))
{
StripCtrlCharsImpl *scc = stripctrl_new_common(
bs_out, permit_cr, substitution);
scc->term = term;
scc->translate = translate;
BinarySink_INIT(&scc->public, stripctrl_term_BinarySink_write);
return &scc->public;
}
void stripctrl_free(StripCtrlChars *sccpub)
{
StripCtrlCharsImpl *scc =
container_of(sccpub, StripCtrlCharsImpl, public);
smemclr(scc, sizeof(StripCtrlCharsImpl));
sfree(scc);
}
static inline bool stripctrl_ctrlchar_ok(StripCtrlCharsImpl *scc, wchar_t wc)
{
return wc == L'\n' || (wc == L'\r' && scc->permit_cr);
}
static inline void stripctrl_locale_put_wc(StripCtrlCharsImpl *scc, wchar_t wc)
{
if (iswprint(wc) || stripctrl_ctrlchar_ok(scc, wc)) {
/* Printable character, or one we're going to let through anyway. */
} else if (scc->substitution) {
wc = scc->substitution;
} else {
/* No defined substitution, so don't write any output wchar_t. */
return;
}
char outbuf[MB_LEN_MAX];
size_t produced = wcrtomb(outbuf, wc, &scc->mbs_out);
if (produced > 0)
put_data(scc->bs_out, outbuf, produced);
}
static inline void stripctrl_term_put_wc(
StripCtrlCharsImpl *scc, unsigned long wc)
{
if (!(wc & ~0x9F)) {
/* This is something the terminal interprets as a control
* character. */
if (!stripctrl_ctrlchar_ok(scc, wc)) {
if (!scc->substitution)
return;
else
wc = scc->substitution;
}
if (wc == '\012') {
/* Precede \n with \r, because our terminal will not
* generally be in the ONLCR mode where it assumes that
* internally, and any \r on input has been stripped
* out. */
put_datapl(scc->bs_out, PTRLEN_LITERAL("\r"));
}
}
char outbuf[6];
size_t produced;
/*
* The Terminal implementation encodes 7-bit ASCII characters in
* UTF-8 mode, and all printing characters in non-UTF-8 (i.e.
* single-byte character set) mode, as values in the surrogate
* range (a conveniently unused piece of space in this context)
* whose low byte is the original 1-byte representation of the
* character.
*/
if ((wc - 0xD800) < (0xE000 - 0xD800))
wc &= 0xFF;
if (in_utf(scc->term)) {
produced = encode_utf8(outbuf, wc);
} else {
outbuf[0] = wc;
produced = 1;
}
if (produced > 0)
put_data(scc->bs_out, outbuf, produced);
}
static inline size_t stripctrl_locale_try_consume(
StripCtrlCharsImpl *scc, const char *p, size_t len)
{
wchar_t wc;
mbstate_t mbs_orig = scc->mbs_in;
size_t consumed = mbrtowc(&wc, p, len, &scc->mbs_in);
if (consumed == (size_t)-2) {
/*
* The buffer is too short to see the end of the multibyte
* character that it appears to be starting with. We return 0
* for 'no data consumed', restore the conversion state from
* before consuming the partial character, and our caller will
* come back when it has more data available.
*/
scc->mbs_in = mbs_orig;
return 0;
}
if (consumed == (size_t)-1) {
/*
* The buffer contains an illegal multibyte sequence. There's
* no really good way to recover from this, so we'll just
* reset our input state, consume a single byte without
* emitting anything, and hope we can resynchronise to
* _something_ sooner or later.
*/
memset(&scc->mbs_in, 0, sizeof(scc->mbs_in));
return 1;
}
if (consumed == 0) {
/*
* A zero wide character is encoded by the data, but mbrtowc
* hasn't told us how many input bytes it takes. There isn't
* really anything good we can do here, so we just advance by
* one byte in the hope that that was the NUL.
*
* (If it wasn't - that is, if we're in a multibyte encoding
* in which the terminator of a normal C string is encoded in
* some way other than a single zero byte - then probably lots
* of other things will have gone wrong before we get here!)
*/
stripctrl_locale_put_wc(scc, L'\0');
return 1;
}
/*
* Otherwise, this is the easy case: consumed > 0, and we've eaten
* a valid multibyte character.
*/
stripctrl_locale_put_wc(scc, wc);
return consumed;
}
static void stripctrl_locale_BinarySink_write(
BinarySink *bs, const void *vp, size_t len)
{
StripCtrlChars *sccpub = BinarySink_DOWNCAST(bs, StripCtrlChars);
StripCtrlCharsImpl *scc =
container_of(sccpub, StripCtrlCharsImpl, public);
const char *p = (const char *)vp;
const char *previous_locale = setlocale(LC_CTYPE, NULL);
setlocale(LC_CTYPE, "");
/*
* Deal with any partial multibyte character buffered from last
* time.
*/
while (scc->buflen > 0) {
size_t to_copy = SCC_BUFSIZE - scc->buflen;
if (to_copy > len)
to_copy = len;
memcpy(scc->buf + scc->buflen, p, to_copy);
size_t consumed = stripctrl_locale_try_consume(
scc, scc->buf, scc->buflen + to_copy);
if (consumed >= scc->buflen) {
/*
* We've consumed a multibyte character that includes all
* the data buffered from last time. So we can clear our
* buffer and move on to processing the main input string
* in situ, having first discarded whatever initial
* segment of it completed our previous character.
*/
size_t consumed_from_main_string = consumed - scc->buflen;
assert(consumed_from_main_string <= len);
p += consumed_from_main_string;
len -= consumed_from_main_string;
scc->buflen = 0;
break;
}
if (consumed == 0) {
/*
* If we didn't manage to consume anything, i.e. the whole
* buffer contains an incomplete sequence, it had better
* be because our entire input string _this_ time plus
* whatever leftover data we had from _last_ time still
* comes to less than SCC_BUFSIZE. In other words, we've
* already copied all the new data on to the end of our
* buffer, and it still hasn't helped. So increment buflen
* to reflect the new data, and return.
*/
assert(to_copy == len);
scc->buflen += to_copy;
goto out;
}
/*
* Otherwise, we've somehow consumed _less_ data than we had
* buffered, and yet we weren't able to consume that data in
* the last call to this function. That sounds impossible, but
* I can think of one situation in which it could happen: if
* we had an incomplete MB sequence last time, and now more
* data has arrived, it turns out to be an _illegal_ one, so
* we consume one byte in the hope of resynchronising.
*
* Anyway, in this case we move the buffer up and go back
* round this initial loop.
*/
scc->buflen -= consumed;
memmove(scc->buf, scc->buf + consumed, scc->buflen);
}
/*
* Now charge along the main string.
*/
while (len > 0) {
size_t consumed = stripctrl_locale_try_consume(scc, p, len);
if (consumed == 0)
break;
assert(consumed <= len);
p += consumed;
len -= consumed;
}
/*
* Any data remaining should be copied into our buffer, to keep
* for next time.
*/
assert(len <= SCC_BUFSIZE);
memcpy(scc->buf, p, len);
scc->buflen = len;
out:
setlocale(LC_CTYPE, previous_locale);
}
static void stripctrl_term_BinarySink_write(
BinarySink *bs, const void *vp, size_t len)
{
StripCtrlChars *sccpub = BinarySink_DOWNCAST(bs, StripCtrlChars);
StripCtrlCharsImpl *scc =
container_of(sccpub, StripCtrlCharsImpl, public);
bool utf = in_utf(scc->term);
if (utf != scc->last_term_utf) {
scc->last_term_utf = utf;
scc->utf8.state = 0;
}
for (const unsigned char *p = (const unsigned char *)vp;
len > 0; len--, p++) {
unsigned long t = scc->translate(scc->term, &scc->utf8, *p);
if (t == UCSTRUNCATED) {
stripctrl_term_put_wc(scc, 0xFFFD);
/* go round again */
t = scc->translate(scc->term, &scc->utf8, *p);
}
if (t == UCSINCOMPLETE)
continue;
if (t == UCSINVALID)
t = 0xFFFD;
stripctrl_term_put_wc(scc, t);
}
}
char *stripctrl_string_ptrlen(ptrlen str)
{
strbuf *out = strbuf_new();
StripCtrlChars *scc = stripctrl_new(BinarySink_UPCAST(out), false, L'?');
put_datapl(scc, str);
stripctrl_free(scc);
return strbuf_to_str(out);
}
#ifdef STRIPCTRL_TEST
/*
gcc -DSTRIPCTRL_TEST -o scctest stripctrl.c marshal.c utils.c memory.c
*/
void out_of_memory(void) { fprintf(stderr, "out of memory\n"); abort(); }
void stripctrl_write(BinarySink *bs, const void *vdata, size_t len)
{
const uint8_t *p = vdata;
printf("[");
for (size_t i = 0; i < len; i++)
printf("%*s%02x", i?1:0, "", (unsigned)p[i]);
printf("]");
}
void stripctrl_test(StripCtrlChars *scc, ptrlen pl)
{
stripctrl_write(NULL, pl.ptr, pl.len);
printf(" -> ");
put_datapl(scc, pl);
printf("\n");
}
int main(void)
{
struct foo { BinarySink_IMPLEMENTATION; } foo;
BinarySink_INIT(&foo, stripctrl_write);
StripCtrlChars *scc = stripctrl_new(BinarySink_UPCAST(&foo));
stripctrl_test(scc, PTRLEN_LITERAL("a\033[1mb"));
stripctrl_test(scc, PTRLEN_LITERAL("a\xC2\x9B[1mb"));
stripctrl_test(scc, PTRLEN_LITERAL("a\xC2\xC2[1mb"));
stripctrl_test(scc, PTRLEN_LITERAL("\xC3"));
stripctrl_test(scc, PTRLEN_LITERAL("\xA9"));
stripctrl_test(scc, PTRLEN_LITERAL("\xE2\x80\x8F"));
stripctrl_test(scc, PTRLEN_LITERAL("a\0b"));
stripctrl_free(scc);
return 0;
}
#endif /* STRIPCTRL_TEST */