2022-03-12 15:53:04 +00:00
|
|
|
/*
|
|
|
|
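* Decode a single UTF-8 character to the platform's local wchar_t
* representation, which may take two wchar_t elements (a surrogate
* pair) when wchar_t is no wider than 2 bytes.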
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "putty.h"
|
|
|
|
#include "misc.h"
|
|
|
|
|
decode_utf8: add an enumeration of failure reasons.
Now you can optionally get back an enum value indicating whether the
character was successfully decoded, or whether U+FFFD was substituted
due to some kind of problem, and if the latter, what problem.
For a start, this allows distinguishing 'real' U+FFFD (encoded
legitimately in the input) from one invented by the decoder. Also, it
allows the recipient of the decode to treat failures differently,
either by passing on a useful error report to the user (as
utf8_unknown_char now does) or by doing something special.
In particular, there are two distinct error codes for a truncated
UTF-8 encoding, depending on whether it was truncated by the end of
the input or by encountering a non-continuation byte. The former code
means that the string is not legal UTF-8 _as it is_, but doesn't rule
out it being a (bytewise) prefix of a legal UTF-8 string - so if a
client is receiving UTF-8 data a byte at a time, they can treat that
error code specially and not make it a fatal error.
2023-02-17 16:39:09 +00:00
|
|
|
size_t decode_utf8_to_wchar(BinarySource *src, wchar_t *out,
|
|
|
|
DecodeUTF8Failure *err)
|
2022-03-12 15:53:04 +00:00
|
|
|
{
|
|
|
|
size_t outlen = 0;
|
decode_utf8: add an enumeration of failure reasons.
Now you can optionally get back an enum value indicating whether the
character was successfully decoded, or whether U+FFFD was substituted
due to some kind of problem, and if the latter, what problem.
For a start, this allows distinguishing 'real' U+FFFD (encoded
legitimately in the input) from one invented by the decoder. Also, it
allows the recipient of the decode to treat failures differently,
either by passing on a useful error report to the user (as
utf8_unknown_char now does) or by doing something special.
In particular, there are two distinct error codes for a truncated
UTF-8 encoding, depending on whether it was truncated by the end of
the input or by encountering a non-continuation byte. The former code
means that the string is not legal UTF-8 _as it is_, but doesn't rule
out it being a (bytewise) prefix of a legal UTF-8 string - so if a
client is receiving UTF-8 data a byte at a time, they can treat that
error code specially and not make it a fatal error.
2023-02-17 16:39:09 +00:00
|
|
|
unsigned wc = decode_utf8(src, err);
|
2022-03-12 15:53:04 +00:00
|
|
|
if (sizeof(wchar_t) > 2 || wc < 0x10000) {
|
|
|
|
out[outlen++] = wc;
|
|
|
|
} else {
|
|
|
|
unsigned wcoff = wc - 0x10000;
|
|
|
|
out[outlen++] = 0xD800 | (0x3FF & (wcoff >> 10));
|
|
|
|
out[outlen++] = 0xDC00 | (0x3FF & wcoff);
|
|
|
|
}
|
|
|
|
return outlen;
|
|
|
|
}
|