mirror of
https://git.tartarus.org/simon/putty.git
synced 2025-01-25 01:02:24 +00:00
First draft of Unicode support in pterm. It's pretty complete: it
does UTF-8 copy and paste (falling back to normal strings if necessary), it understands X font encodings and translates things accordingly so that if you have a Unicode font you can ask for virtually any single-byte encoding and get it (Mac-Roman pterm, anyone?), and so on. There's work left to be done (wide fonts for CJK spring to mind), but I reckon this is a pretty good start. [originally from svn r2395]
This commit is contained in:
parent
241570c04f
commit
ad2bbc52a4
5
Recipe
5
Recipe
@ -110,6 +110,9 @@ SFTP = sftp int64 logging
|
||||
WINMISC = misc version winstore settings tree234 winnet proxy cmdline
|
||||
UXMISC = misc version uxstore settings tree234 uxnet proxy cmdline
|
||||
|
||||
# Character set library, for use in pterm.
|
||||
CHARSET = sbcsdat slookup sbcs utf8 toucs fromucs xenc mimeenc
|
||||
|
||||
# Standard libraries, and the same with WinSocks 1 and 2.
|
||||
LIBS = advapi32.lib user32.lib gdi32.lib comctl32.lib comdlg32.lib
|
||||
+ shell32.lib winmm.lib imm32.lib winspool.lib
|
||||
@ -137,7 +140,7 @@ puttygen : [G] puttygen sshrsag sshdssg sshprime sshdes sshbn sshmd5 version
|
||||
+ sshpubk sshaes sshsh512 import winutils puttygen.res LIBS
|
||||
|
||||
pterm : [X] pterm terminal wcwidth uxucs uxmisc tree234 misc ldisc ldiscucs
|
||||
+ logging uxprint settings pty be_none uxstore signal
|
||||
+ logging uxprint settings pty be_none uxstore signal CHARSET
|
||||
|
||||
plink : [U] uxplink uxcons NONSSH UXSSH be_all logging UXMISC
|
||||
|
||||
|
1
charset/.cvsignore
Normal file
1
charset/.cvsignore
Normal file
@ -0,0 +1 @@
|
||||
sbcsdat.c
|
11
charset/README
Normal file
11
charset/README
Normal file
@ -0,0 +1,11 @@
|
||||
This subdirectory contains a general character-set conversion
|
||||
library, used in the Unix port of PuTTY, and available for use in
|
||||
other ports if it should happen to be useful.
|
||||
|
||||
I intend to use this same library in other programs at some future
|
||||
date. It is therefore a _strong_ design goal that this library
|
||||
should remain perfectly general, and not tied to particulars of
|
||||
PuTTY. It must not reference any code outside its own subdirectory;
|
||||
it should not have PuTTY-specific helper routines added to it unless
|
||||
they can be documented in a general manner which might make them
|
||||
useful in other circumstances as well.
|
120
charset/charset.h
Normal file
120
charset/charset.h
Normal file
@ -0,0 +1,120 @@
|
||||
/*
|
||||
* charset.h - header file for general character set conversion
|
||||
* routines.
|
||||
*/
|
||||
|
||||
#ifndef charset_charset_h
|
||||
#define charset_charset_h
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
/*
|
||||
* Enumeration that lists all the multibyte or single-byte
|
||||
* character sets known to this library.
|
||||
*/
|
||||
typedef enum {
    CS_NONE = 0,                       /* used for reporting errors, etc */

    /* ISO 8859 series */
    CS_ISO8859_1,
    CS_ISO8859_1_X11,                  /* X font encoding with VT100 glyphs */
    CS_ISO8859_2,
    CS_ISO8859_3,
    CS_ISO8859_4,
    CS_ISO8859_5,
    CS_ISO8859_6,
    CS_ISO8859_7,
    CS_ISO8859_8,
    CS_ISO8859_9,
    CS_ISO8859_10,
    CS_ISO8859_11,
    CS_ISO8859_13,
    CS_ISO8859_14,
    CS_ISO8859_15,
    CS_ISO8859_16,

    /* PC and Windows code pages */
    CS_CP437,
    CS_CP850,
    CS_CP1250,
    CS_CP1251,
    CS_CP1252,
    CS_CP1253,
    CS_CP1254,
    CS_CP1255,
    CS_CP1256,
    CS_CP1257,
    CS_CP1258,

    /* Other national and vendor character sets */
    CS_KOI8_R,
    CS_KOI8_U,
    CS_MAC_ROMAN,
    CS_VISCII,
    CS_HP_ROMAN8,
    CS_DEC_MCS,

    /* Multibyte encodings */
    CS_UTF8
} charset_t;

/*
 * Conversion state, threaded through charset_to_unicode() and
 * charset_from_unicode() by way of their `state' argument. It is a
 * single word which is handed on to the per-charset read and write
 * routines; charset_from_unicode() starts from s0 == 0 when the
 * caller supplies no state of its own.
 */
typedef struct {
    unsigned long s0;
} charset_state;
|
||||
|
||||
/*
|
||||
* Routine to convert a MB/SB character set to Unicode.
|
||||
*
|
||||
* This routine accepts some number of bytes, updates a state
|
||||
* variable, and outputs some number of Unicode characters. There
|
||||
* are no guarantees. You can't even guarantee that at most one
|
||||
* Unicode character will be output per byte you feed in; for
|
||||
* example, suppose you're reading UTF-8, you've seen E1 80, and
|
||||
* then you suddenly see FE. Now you need to output _two_ error
|
||||
* characters - one for the incomplete sequence E1 80, and one for
|
||||
* the completely invalid UTF-8 byte FE.
|
||||
*
|
||||
* Returns the number of wide characters output; will never output
|
||||
* more than the size of the buffer (as specified on input).
|
||||
* Advances the `input' pointer and decrements `inlen', to indicate
|
||||
* how far along the input string it got.
|
||||
*
|
||||
* The sequence of `errlen' wide characters pointed to by `errstr'
|
||||
* will be used to indicate a conversion error. If `errstr' is
|
||||
* NULL, `errlen' will be ignored, and the library will choose
|
||||
* something sensible to do on its own. For Unicode, this will be
|
||||
* U+FFFD (REPLACEMENT CHARACTER).
|
||||
*/
|
||||
|
||||
int charset_to_unicode(char **input, int *inlen, wchar_t *output, int outlen,
|
||||
int charset, charset_state *state,
|
||||
const wchar_t *errstr, int errlen);
|
||||
|
||||
/*
|
||||
* Routine to convert Unicode to an MB/SB character set.
|
||||
*
|
||||
* This routine accepts some number of Unicode characters, updates
|
||||
* a state variable, and outputs some number of bytes.
|
||||
*
|
||||
* Returns the number of bytes output; will never output
|
||||
* more than the size of the buffer (as specified on input), and
|
||||
* will never output a partial MB character. Advances the `input'
|
||||
* pointer and decrements `inlen', to indicate how far along the
|
||||
* input string it got.
|
||||
*
|
||||
* The sequence of `errlen' characters pointed to by `errstr' will
|
||||
* be used to indicate a conversion error. If `errstr' is NULL,
|
||||
* `errlen' will be ignored, and the library will choose something
|
||||
* sensible to do on its own (which will vary depending on the
|
||||
* output charset).
|
||||
*/
|
||||
|
||||
int charset_from_unicode(wchar_t **input, int *inlen, char *output, int outlen,
|
||||
int charset, charset_state *state,
|
||||
const char *errstr, int errlen);
|
||||
|
||||
/*
|
||||
* Convert X11 encoding names to and from our charset identifiers.
|
||||
*/
|
||||
const char *charset_to_xenc(int charset);
|
||||
int charset_from_xenc(const char *name);
|
||||
|
||||
/*
|
||||
* Convert MIME encoding names to and from our charset identifiers.
|
||||
*/
|
||||
const char *charset_to_mimeenc(int charset);
|
||||
int charset_from_mimeenc(const char *name);
|
||||
|
||||
#endif /* charset_charset_h */
|
19
charset/enum.c
Normal file
19
charset/enum.c
Normal file
@ -0,0 +1,19 @@
|
||||
/*
|
||||
* enum.c - enumerate all charsets defined by the library.
|
||||
*
|
||||
* This file maintains a list of every other source file which
|
||||
* contains ENUM_CHARSET definitions. It #includes each one with
|
||||
* ENUM_CHARSETS defined, which causes those source files to do
|
||||
* nothing at all except call the ENUM_CHARSET macro on each
|
||||
* charset they define.
|
||||
*
|
||||
* This file in turn is included from various other places, with
|
||||
* the ENUM_CHARSET macro defined to various different things. This
|
||||
* allows us to have multiple implementations of the master charset
|
||||
* lookup table (a static one and a dynamic one).
|
||||
*/
|
||||
|
||||
#define ENUM_CHARSETS
|
||||
#include "sbcsdat.c"
|
||||
#include "utf8.c"
|
||||
#undef ENUM_CHARSETS
|
91
charset/fromucs.c
Normal file
91
charset/fromucs.c
Normal file
@ -0,0 +1,91 @@
|
||||
/*
|
||||
* fromucs.c - convert Unicode to other character sets.
|
||||
*/
|
||||
|
||||
#include "charset.h"
|
||||
#include "internal.h"
|
||||
|
||||
struct charset_emit_param {
|
||||
char *output;
|
||||
int outlen;
|
||||
const char *errstr;
|
||||
int errlen;
|
||||
int stopped;
|
||||
};
|
||||
|
||||
static void charset_emit(void *ctx, long int output)
|
||||
{
|
||||
struct charset_emit_param *param = (struct charset_emit_param *)ctx;
|
||||
char outval;
|
||||
char const *p;
|
||||
int outlen;
|
||||
|
||||
if (output == ERROR) {
|
||||
p = param->errstr;
|
||||
outlen = param->errlen;
|
||||
} else {
|
||||
outval = output;
|
||||
p = &outval;
|
||||
outlen = 1;
|
||||
}
|
||||
|
||||
if (param->outlen >= outlen) {
|
||||
while (outlen > 0) {
|
||||
*param->output++ = *p++;
|
||||
param->outlen--;
|
||||
outlen--;
|
||||
}
|
||||
} else {
|
||||
param->stopped = 1;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Convert Unicode characters from *input into `charset', writing at
 * most `outlen' bytes to `output'.
 *
 * Each input character is passed to the charset's write routine,
 * which emits bytes via charset_emit(). If the bytes for a character
 * do not fit in the remaining output space, conversion stops before
 * that character: *input, *inlen and *state are left describing the
 * position just before it, and the count returned excludes it.
 *
 * `errstr'/`errlen' give the bytes to output for an unrepresentable
 * character; if `errstr' is NULL, `errlen' is ignored and "." is
 * used. `state' may be NULL, in which case conversion starts from a
 * zeroed state and the final state is discarded.
 *
 * Returns the number of bytes written to `output'.
 *
 * NOTE(review): `spec' is dereferenced without a check; confirm that
 * charset_find_spec() cannot return NULL for an unknown charset id.
 */
int charset_from_unicode(wchar_t **input, int *inlen, char *output, int outlen,
                         int charset, charset_state *state,
                         const char *errstr, int errlen)
{
    charset_spec const *spec = charset_find_spec(charset);
    charset_state localstate;
    struct charset_emit_param param;

    param.output = output;
    param.outlen = outlen;
    param.stopped = 0;

    /*
     * charset_emit will expect a valid errstr, so only take the
     * caller's one if it was actually supplied.
     */
    if (errstr) {
        param.errstr = errstr;
        param.errlen = errlen;
    } else {
        /* *shrug* this is good enough, and consistent across all SBCS... */
        param.errstr = ".";
        param.errlen = 1;
    }

    /*
     * Work on a private copy of the state, so that a character which
     * turns out not to fit leaves the caller's state untouched.
     * `state' itself must keep pointing at the caller's object (or
     * stay NULL) so the write-back below reaches it.
     */
    if (state)
        localstate = *state;           /* structure copy */
    else
        localstate.s0 = 0;

    while (*inlen > 0) {
        int lenbefore = param.output - output;

        spec->write(spec, **input, &localstate, charset_emit, &param);
        if (param.stopped) {
            /*
             * The emit function has _tried_ to output some
             * characters, but ran up against the end of the
             * buffer. Leave immediately, and return what happened
             * _before_ attempting to process this character.
             */
            return lenbefore;
        }

        /* This character was fully output: commit the new state. */
        if (state)
            *state = localstate;       /* structure copy */
        (*input)++;
        (*inlen)--;
    }

    return param.output - output;
}
|
89
charset/internal.h
Normal file
89
charset/internal.h
Normal file
@ -0,0 +1,89 @@
|
||||
/*
|
||||
* internal.h - internal header stuff for the charset library.
|
||||
*/
|
||||
|
||||
#ifndef charset_internal_h
|
||||
#define charset_internal_h
|
||||
|
||||
/* This invariably comes in handy */
|
||||
#define lenof(x) ( sizeof((x)) / sizeof(*(x)) )
|
||||
|
||||
/* This is an invalid Unicode value used to indicate an error. */
|
||||
#define ERROR 0xFFFFL /* Unicode value representing error */
|
||||
|
||||
typedef struct charset_spec charset_spec;
|
||||
typedef struct sbcs_data sbcs_data;
|
||||
|
||||
struct charset_spec {
|
||||
int charset; /* numeric identifier */
|
||||
|
||||
/*
|
||||
* A function to read the character set and output Unicode
|
||||
* characters. The `emit' function expects to get Unicode chars
|
||||
* passed to it; it should be sent ERROR for any encoding error
|
||||
* on the input.
|
||||
*/
|
||||
void (*read)(charset_spec const *charset, long int input_chr,
|
||||
charset_state *state,
|
||||
void (*emit)(void *ctx, long int output), void *emitctx);
|
||||
/*
|
||||
* A function to read Unicode characters and output in this
|
||||
* character set. The `emit' function expects to get byte
|
||||
* values passed to it; it should be sent ERROR for any
|
||||
* non-representable characters on the input.
|
||||
*/
|
||||
void (*write)(charset_spec const *charset, long int input_chr,
|
||||
charset_state *state,
|
||||
void (*emit)(void *ctx, long int output), void *emitctx);
|
||||
void const *data;
|
||||
};
|
||||
|
||||
/*
|
||||
* This is the format of `data' used by the SBCS read and write
|
||||
* functions; so it's the format used in all SBCS definitions.
|
||||
*/
|
||||
/*
 * NOTE(review): read_sbcs() and write_sbcs() in sbcs.c currently
 * cast `data' to a bare wchar_t table rather than to this structure;
 * confirm which layout the generated tables actually use.
 */
struct sbcs_data {
    /*
     * Forward mapping: the Unicode code point for each of the 256
     * byte values. A position holding ERROR marks a byte value
     * which the SBCS does not define, so that meeting it in input
     * is an error.
     */
    unsigned long sbcs2ucs[256];

    /*
     * Reverse mapping: the valid byte values of the SBCS, sorted by
     * their Unicode translation. To encode a Unicode value U,
     * binary-search this table, comparing sbcs2ucs[ucs2sbcs[X]]
     * against U at each probed position X.
     *
     * An SBCS may define fewer than 256 byte values, so `nvalid'
     * gives the number of entries of ucs2sbcs actually in use.
     */
    unsigned char ucs2sbcs[256];
    int nvalid;
};
|
||||
|
||||
/*
|
||||
* Prototypes for internal library functions.
|
||||
*/
|
||||
charset_spec const *charset_find_spec(int charset);
|
||||
void read_sbcs(charset_spec const *charset, long int input_chr,
|
||||
charset_state *state,
|
||||
void (*emit)(void *ctx, long int output), void *emitctx);
|
||||
void write_sbcs(charset_spec const *charset, long int input_chr,
|
||||
charset_state *state,
|
||||
void (*emit)(void *ctx, long int output), void *emitctx);
|
||||
|
||||
/*
|
||||
* Placate compiler warning about unused parameters, of which we
|
||||
* expect to have some in this library.
|
||||
*/
|
||||
#define UNUSEDARG(x) ( (x) = (x) )
|
||||
|
||||
#endif /* charset_internal_h */
|
209
charset/mimeenc.c
Normal file
209
charset/mimeenc.c
Normal file
@ -0,0 +1,209 @@
|
||||
/*
|
||||
* mimeenc.c - translate our internal character set codes to and
|
||||
* from MIME standard character-set names.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <ctype.h>
|
||||
#include "charset.h"
|
||||
#include "internal.h"
|
||||
|
||||
static const struct {
|
||||
const char *name;
|
||||
int charset;
|
||||
} mimeencs[] = {
|
||||
/*
|
||||
* These names are taken from
|
||||
*
|
||||
* http://www.iana.org/assignments/character-sets
|
||||
*
|
||||
* Where multiple encoding names map to the same encoding id
|
||||
* (such as the variety of aliases for ISO-8859-1), the first
|
||||
* is considered canonical and will be returned when
|
||||
* translating the id to a string.
|
||||
*/
|
||||
{ "ISO-8859-1", CS_ISO8859_1 },
|
||||
{ "iso-ir-100", CS_ISO8859_1 },
|
||||
{ "ISO_8859-1", CS_ISO8859_1 },
|
||||
{ "ISO_8859-1:1987", CS_ISO8859_1 },
|
||||
{ "latin1", CS_ISO8859_1 },
|
||||
{ "l1", CS_ISO8859_1 },
|
||||
{ "IBM819", CS_ISO8859_1 },
|
||||
{ "CP819", CS_ISO8859_1 },
|
||||
{ "csISOLatin1", CS_ISO8859_1 },
|
||||
|
||||
{ "ISO-8859-2", CS_ISO8859_2 },
|
||||
{ "ISO_8859-2:1987", CS_ISO8859_2 },
|
||||
{ "iso-ir-101", CS_ISO8859_2 },
|
||||
{ "ISO_8859-2", CS_ISO8859_2 },
|
||||
{ "latin2", CS_ISO8859_2 },
|
||||
{ "l2", CS_ISO8859_2 },
|
||||
{ "csISOLatin2", CS_ISO8859_2 },
|
||||
|
||||
{ "ISO-8859-3", CS_ISO8859_3 },
|
||||
{ "ISO_8859-3:1988", CS_ISO8859_3 },
|
||||
{ "iso-ir-109", CS_ISO8859_3 },
|
||||
{ "ISO_8859-3", CS_ISO8859_3 },
|
||||
{ "latin3", CS_ISO8859_3 },
|
||||
{ "l3", CS_ISO8859_3 },
|
||||
{ "csISOLatin3", CS_ISO8859_3 },
|
||||
|
||||
{ "ISO-8859-4", CS_ISO8859_4 },
|
||||
{ "ISO_8859-4:1988", CS_ISO8859_4 },
|
||||
{ "iso-ir-110", CS_ISO8859_4 },
|
||||
{ "ISO_8859-4", CS_ISO8859_4 },
|
||||
{ "latin4", CS_ISO8859_4 },
|
||||
{ "l4", CS_ISO8859_4 },
|
||||
{ "csISOLatin4", CS_ISO8859_4 },
|
||||
|
||||
{ "ISO-8859-5", CS_ISO8859_5 },
|
||||
{ "ISO_8859-5:1988", CS_ISO8859_5 },
|
||||
{ "iso-ir-144", CS_ISO8859_5 },
|
||||
{ "ISO_8859-5", CS_ISO8859_5 },
|
||||
{ "cyrillic", CS_ISO8859_5 },
|
||||
{ "csISOLatinCyrillic", CS_ISO8859_5 },
|
||||
|
||||
{ "ISO-8859-6", CS_ISO8859_6 },
|
||||
{ "ISO_8859-6:1987", CS_ISO8859_6 },
|
||||
{ "iso-ir-127", CS_ISO8859_6 },
|
||||
{ "ISO_8859-6", CS_ISO8859_6 },
|
||||
{ "ECMA-114", CS_ISO8859_6 },
|
||||
{ "ASMO-708", CS_ISO8859_6 },
|
||||
{ "arabic", CS_ISO8859_6 },
|
||||
{ "csISOLatinArabic", CS_ISO8859_6 },
|
||||
|
||||
{ "ISO-8859-7", CS_ISO8859_7 },
|
||||
{ "ISO_8859-7:1987", CS_ISO8859_7 },
|
||||
{ "iso-ir-126", CS_ISO8859_7 },
|
||||
{ "ISO_8859-7", CS_ISO8859_7 },
|
||||
{ "ELOT_928", CS_ISO8859_7 },
|
||||
{ "ECMA-118", CS_ISO8859_7 },
|
||||
{ "greek", CS_ISO8859_7 },
|
||||
{ "greek8", CS_ISO8859_7 },
|
||||
{ "csISOLatinGreek", CS_ISO8859_7 },
|
||||
|
||||
{ "ISO-8859-8", CS_ISO8859_8 },
|
||||
{ "ISO_8859-8:1988", CS_ISO8859_8 },
|
||||
{ "iso-ir-138", CS_ISO8859_8 },
|
||||
{ "ISO_8859-8", CS_ISO8859_8 },
|
||||
{ "hebrew", CS_ISO8859_8 },
|
||||
{ "csISOLatinHebrew", CS_ISO8859_8 },
|
||||
|
||||
{ "ISO-8859-9", CS_ISO8859_9 },
|
||||
{ "ISO_8859-9:1989", CS_ISO8859_9 },
|
||||
{ "iso-ir-148", CS_ISO8859_9 },
|
||||
{ "ISO_8859-9", CS_ISO8859_9 },
|
||||
{ "latin5", CS_ISO8859_9 },
|
||||
{ "l5", CS_ISO8859_9 },
|
||||
{ "csISOLatin5", CS_ISO8859_9 },
|
||||
|
||||
{ "ISO-8859-10", CS_ISO8859_10 },
|
||||
{ "iso-ir-157", CS_ISO8859_10 },
|
||||
{ "l6", CS_ISO8859_10 },
|
||||
{ "ISO_8859-10:1992", CS_ISO8859_10 },
|
||||
{ "csISOLatin6", CS_ISO8859_10 },
|
||||
{ "latin6", CS_ISO8859_10 },
|
||||
|
||||
{ "ISO-8859-13", CS_ISO8859_13 },
|
||||
|
||||
{ "ISO-8859-14", CS_ISO8859_14 },
|
||||
{ "iso-ir-199", CS_ISO8859_14 },
|
||||
{ "ISO_8859-14:1998", CS_ISO8859_14 },
|
||||
{ "ISO_8859-14", CS_ISO8859_14 },
|
||||
{ "latin8", CS_ISO8859_14 },
|
||||
{ "iso-celtic", CS_ISO8859_14 },
|
||||
{ "l8", CS_ISO8859_14 },
|
||||
|
||||
{ "ISO-8859-15", CS_ISO8859_15 },
|
||||
{ "ISO_8859-15", CS_ISO8859_15 },
|
||||
{ "Latin-9", CS_ISO8859_15 },
|
||||
|
||||
{ "ISO-8859-16", CS_ISO8859_16 },
|
||||
{ "iso-ir-226", CS_ISO8859_16 },
|
||||
{ "ISO_8859-16", CS_ISO8859_16 },
|
||||
{ "ISO_8859-16:2001", CS_ISO8859_16 },
|
||||
{ "latin10", CS_ISO8859_16 },
|
||||
{ "l10", CS_ISO8859_16 },
|
||||
|
||||
{ "IBM437", CS_CP437 },
|
||||
{ "cp437", CS_CP437 },
|
||||
{ "437", CS_CP437 },
|
||||
{ "csPC8CodePage437", CS_CP437 },
|
||||
|
||||
{ "IBM850", CS_CP850 },
|
||||
{ "cp850", CS_CP850 },
|
||||
{ "850", CS_CP850 },
|
||||
{ "csPC850Multilingual", CS_CP850 },
|
||||
|
||||
{ "windows-1250", CS_CP1250 },
|
||||
|
||||
{ "windows-1251", CS_CP1251 },
|
||||
|
||||
{ "windows-1252", CS_CP1252 },
|
||||
|
||||
{ "windows-1253", CS_CP1253 },
|
||||
|
||||
{ "windows-1254", CS_CP1254 },
|
||||
|
||||
{ "windows-1255", CS_CP1255 },
|
||||
|
||||
{ "windows-1256", CS_CP1256 },
|
||||
|
||||
{ "windows-1257", CS_CP1257 },
|
||||
|
||||
{ "windows-1258", CS_CP1258 },
|
||||
|
||||
{ "KOI8-R", CS_KOI8_R },
|
||||
{ "csKOI8R", CS_KOI8_R },
|
||||
|
||||
{ "KOI8-U", CS_KOI8_U },
|
||||
|
||||
{ "macintosh", CS_MAC_ROMAN },
|
||||
{ "mac", CS_MAC_ROMAN },
|
||||
{ "csMacintosh", CS_MAC_ROMAN },
|
||||
|
||||
{ "VISCII", CS_VISCII },
|
||||
{ "csVISCII", CS_VISCII },
|
||||
|
||||
{ "hp-roman8", CS_HP_ROMAN8 },
|
||||
{ "roman8", CS_HP_ROMAN8 },
|
||||
{ "r8", CS_HP_ROMAN8 },
|
||||
{ "csHPRoman8", CS_HP_ROMAN8 },
|
||||
|
||||
{ "DEC-MCS", CS_DEC_MCS },
|
||||
{ "dec", CS_DEC_MCS },
|
||||
{ "csDECMCS", CS_DEC_MCS },
|
||||
|
||||
{ "UTF-8", CS_UTF8 },
|
||||
};
|
||||
|
||||
const char *charset_to_mimeenc(int charset)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < (int)lenof(mimeencs); i++)
|
||||
if (charset == mimeencs[i].charset)
|
||||
return mimeencs[i].name;
|
||||
|
||||
return NULL; /* not found */
|
||||
}
|
||||
|
||||
int charset_from_mimeenc(const char *name)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < (int)lenof(mimeencs); i++) {
|
||||
const char *p, *q;
|
||||
p = name;
|
||||
q = mimeencs[i].name;
|
||||
while (*p || *q) {
|
||||
if (tolower(*p) != tolower(*q))
|
||||
break;
|
||||
p++; q++;
|
||||
}
|
||||
if (!*p && !*q)
|
||||
return mimeencs[i].charset;
|
||||
}
|
||||
|
||||
return CS_NONE; /* not found */
|
||||
}
|
45
charset/sbcs.c
Normal file
45
charset/sbcs.c
Normal file
@ -0,0 +1,45 @@
|
||||
/*
|
||||
* sbcs.c - routines to handle single-byte character sets.
|
||||
*/
|
||||
|
||||
#include "charset.h"
|
||||
#include "internal.h"
|
||||
|
||||
/*
|
||||
* The charset_spec for any single-byte character set should
|
||||
* provide read_sbcs() as its read function, and its `data' field
|
||||
* should be a wchar_t string constant containing the 256 entries
|
||||
* of the translation table.
|
||||
*/
|
||||
|
||||
void read_sbcs(charset_spec const *charset, long int input_chr,
|
||||
charset_state *state,
|
||||
void (*emit)(void *ctx, long int output), void *emitctx)
|
||||
{
|
||||
wchar_t const *table = (wchar_t const *)charset->data;
|
||||
|
||||
UNUSEDARG(state);
|
||||
|
||||
emit(emitctx, table[input_chr]);
|
||||
}
|
||||
|
||||
void write_sbcs(charset_spec const *charset, long int input_chr,
|
||||
charset_state *state,
|
||||
void (*emit)(void *ctx, long int output), void *emitctx)
|
||||
{
|
||||
wchar_t const *table = (wchar_t const *)charset->data;
|
||||
int i;
|
||||
|
||||
UNUSEDARG(state);
|
||||
|
||||
/*
|
||||
* FIXME: this should work, but it's ludicrously inefficient.
|
||||
* We should be using the ucs2sbcs table.
|
||||
*/
|
||||
for (i = 0; i < 256; i++)
|
||||
if (table[i] == input_chr) {
|
||||
emit(emitctx, i);
|
||||
return;
|
||||
}
|
||||
emit(emitctx, ERROR);
|
||||
}
|
698
charset/sbcs.dat
Normal file
698
charset/sbcs.dat
Normal file
@ -0,0 +1,698 @@
|
||||
Data file defining single-byte character sets.
|
||||
|
||||
All lines which begin with whitespace are considered comments.
|
||||
|
||||
To generate an SBCS table from a unicode.org mapping table:
|
||||
|
||||
gensbcs() {
|
||||
wget -q -O - "$1" | tr '\r' '\n' | \
|
||||
perl -ne '/^(0x.*)\s+(0x.*)\s+/ and $a[hex $1]=sprintf "%04x", hex $2;' \
|
||||
-e 'BEGIN{for($i=0;$i<256;$i++){$a[$i]="XXXX";' \
|
||||
-e ' if ($i < 32 or $i == 127) {$a[$i]=sprintf "%04x", $i}}}' \
|
||||
-e 'END{for($i=0;$i<256;$i++){printf"%s%s",$a[$i],$i%16==15?"\n":" "}}'
|
||||
}
|
||||
|
||||
(A couple of noteworthy ickinesses here. For a start, any
|
||||
undefined characters in the control-code regions (00-1F and 7F)
|
||||
are assumed to be the Unicode code point corresponding to their
|
||||
index, since the Mac Roman mapping table declines to define them
|
||||
but realistically you don't want to be messing with that sort of
|
||||
thing. Secondly, the Mac mapping tables are shipped with Mac line
|
||||
endings, so note the `tr' to turn them into something legible to
|
||||
Perl...)
|
||||
|
||||
Here are the ISO-8859-x tables, generated by this piece of Bourne
|
||||
shell:
|
||||
|
||||
for i in 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16; do
|
||||
echo charset CS_ISO8859_$i
|
||||
gensbcs http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-$i.TXT
|
||||
echo
|
||||
done
|
||||
|
||||
charset CS_ISO8859_1
|
||||
0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
|
||||
0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
|
||||
0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
|
||||
0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
|
||||
0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
|
||||
0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
|
||||
0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
|
||||
0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
|
||||
0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
|
||||
0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
|
||||
00a0 00a1 00a2 00a3 00a4 00a5 00a6 00a7 00a8 00a9 00aa 00ab 00ac 00ad 00ae 00af
|
||||
00b0 00b1 00b2 00b3 00b4 00b5 00b6 00b7 00b8 00b9 00ba 00bb 00bc 00bd 00be 00bf
|
||||
00c0 00c1 00c2 00c3 00c4 00c5 00c6 00c7 00c8 00c9 00ca 00cb 00cc 00cd 00ce 00cf
|
||||
00d0 00d1 00d2 00d3 00d4 00d5 00d6 00d7 00d8 00d9 00da 00db 00dc 00dd 00de 00df
|
||||
00e0 00e1 00e2 00e3 00e4 00e5 00e6 00e7 00e8 00e9 00ea 00eb 00ec 00ed 00ee 00ef
|
||||
00f0 00f1 00f2 00f3 00f4 00f5 00f6 00f7 00f8 00f9 00fa 00fb 00fc 00fd 00fe 00ff
|
||||
|
||||
charset CS_ISO8859_2
|
||||
0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
|
||||
0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
|
||||
0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
|
||||
0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
|
||||
0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
|
||||
0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
|
||||
0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
|
||||
0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
|
||||
0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
|
||||
0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
|
||||
00a0 0104 02d8 0141 00a4 013d 015a 00a7 00a8 0160 015e 0164 0179 00ad 017d 017b
|
||||
00b0 0105 02db 0142 00b4 013e 015b 02c7 00b8 0161 015f 0165 017a 02dd 017e 017c
|
||||
0154 00c1 00c2 0102 00c4 0139 0106 00c7 010c 00c9 0118 00cb 011a 00cd 00ce 010e
|
||||
0110 0143 0147 00d3 00d4 0150 00d6 00d7 0158 016e 00da 0170 00dc 00dd 0162 00df
|
||||
0155 00e1 00e2 0103 00e4 013a 0107 00e7 010d 00e9 0119 00eb 011b 00ed 00ee 010f
|
||||
0111 0144 0148 00f3 00f4 0151 00f6 00f7 0159 016f 00fa 0171 00fc 00fd 0163 02d9
|
||||
|
||||
charset CS_ISO8859_3
|
||||
0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
|
||||
0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
|
||||
0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
|
||||
0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
|
||||
0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
|
||||
0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
|
||||
0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
|
||||
0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
|
||||
0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
|
||||
0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
|
||||
00a0 0126 02d8 00a3 00a4 XXXX 0124 00a7 00a8 0130 015e 011e 0134 00ad XXXX 017b
|
||||
00b0 0127 00b2 00b3 00b4 00b5 0125 00b7 00b8 0131 015f 011f 0135 00bd XXXX 017c
|
||||
00c0 00c1 00c2 XXXX 00c4 010a 0108 00c7 00c8 00c9 00ca 00cb 00cc 00cd 00ce 00cf
|
||||
XXXX 00d1 00d2 00d3 00d4 0120 00d6 00d7 011c 00d9 00da 00db 00dc 016c 015c 00df
|
||||
00e0 00e1 00e2 XXXX 00e4 010b 0109 00e7 00e8 00e9 00ea 00eb 00ec 00ed 00ee 00ef
|
||||
XXXX 00f1 00f2 00f3 00f4 0121 00f6 00f7 011d 00f9 00fa 00fb 00fc 016d 015d 02d9
|
||||
|
||||
charset CS_ISO8859_4
|
||||
0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
|
||||
0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
|
||||
0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
|
||||
0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
|
||||
0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
|
||||
0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
|
||||
0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
|
||||
0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
|
||||
0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
|
||||
0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
|
||||
00a0 0104 0138 0156 00a4 0128 013b 00a7 00a8 0160 0112 0122 0166 00ad 017d 00af
|
||||
00b0 0105 02db 0157 00b4 0129 013c 02c7 00b8 0161 0113 0123 0167 014a 017e 014b
|
||||
0100 00c1 00c2 00c3 00c4 00c5 00c6 012e 010c 00c9 0118 00cb 0116 00cd 00ce 012a
|
||||
0110 0145 014c 0136 00d4 00d5 00d6 00d7 00d8 0172 00da 00db 00dc 0168 016a 00df
|
||||
0101 00e1 00e2 00e3 00e4 00e5 00e6 012f 010d 00e9 0119 00eb 0117 00ed 00ee 012b
|
||||
0111 0146 014d 0137 00f4 00f5 00f6 00f7 00f8 0173 00fa 00fb 00fc 0169 016b 02d9
|
||||
|
||||
charset CS_ISO8859_5
|
||||
0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
|
||||
0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
|
||||
0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
|
||||
0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
|
||||
0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
|
||||
0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
|
||||
0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
|
||||
0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
|
||||
0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
|
||||
0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
|
||||
00a0 0401 0402 0403 0404 0405 0406 0407 0408 0409 040a 040b 040c 00ad 040e 040f
|
||||
0410 0411 0412 0413 0414 0415 0416 0417 0418 0419 041a 041b 041c 041d 041e 041f
|
||||
0420 0421 0422 0423 0424 0425 0426 0427 0428 0429 042a 042b 042c 042d 042e 042f
|
||||
0430 0431 0432 0433 0434 0435 0436 0437 0438 0439 043a 043b 043c 043d 043e 043f
|
||||
0440 0441 0442 0443 0444 0445 0446 0447 0448 0449 044a 044b 044c 044d 044e 044f
|
||||
2116 0451 0452 0453 0454 0455 0456 0457 0458 0459 045a 045b 045c 00a7 045e 045f
|
||||
|
||||
charset CS_ISO8859_6
|
||||
0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
|
||||
0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
|
||||
0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
|
||||
0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
|
||||
0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
|
||||
0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
|
||||
0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
|
||||
0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
|
||||
0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
|
||||
0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
|
||||
00a0 XXXX XXXX XXXX 00a4 XXXX XXXX XXXX XXXX XXXX XXXX XXXX 060c 00ad XXXX XXXX
|
||||
XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX 061b XXXX XXXX XXXX 061f
|
||||
XXXX 0621 0622 0623 0624 0625 0626 0627 0628 0629 062a 062b 062c 062d 062e 062f
|
||||
0630 0631 0632 0633 0634 0635 0636 0637 0638 0639 063a XXXX XXXX XXXX XXXX XXXX
|
||||
0640 0641 0642 0643 0644 0645 0646 0647 0648 0649 064a 064b 064c 064d 064e 064f
|
||||
0650 0651 0652 XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX
|
||||
|
||||
charset CS_ISO8859_7
|
||||
0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
|
||||
0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
|
||||
0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
|
||||
0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
|
||||
0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
|
||||
0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
|
||||
0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
|
||||
0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
|
||||
0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
|
||||
0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
|
||||
00a0 2018 2019 00a3 XXXX XXXX 00a6 00a7 00a8 00a9 XXXX 00ab 00ac 00ad XXXX 2015
|
||||
00b0 00b1 00b2 00b3 0384 0385 0386 00b7 0388 0389 038a 00bb 038c 00bd 038e 038f
|
||||
0390 0391 0392 0393 0394 0395 0396 0397 0398 0399 039a 039b 039c 039d 039e 039f
|
||||
03a0 03a1 XXXX 03a3 03a4 03a5 03a6 03a7 03a8 03a9 03aa 03ab 03ac 03ad 03ae 03af
|
||||
03b0 03b1 03b2 03b3 03b4 03b5 03b6 03b7 03b8 03b9 03ba 03bb 03bc 03bd 03be 03bf
|
||||
03c0 03c1 03c2 03c3 03c4 03c5 03c6 03c7 03c8 03c9 03ca 03cb 03cc 03cd 03ce XXXX
|
||||
|
||||
charset CS_ISO8859_8
|
||||
0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
|
||||
0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
|
||||
0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
|
||||
0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
|
||||
0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
|
||||
0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
|
||||
0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
|
||||
0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
|
||||
0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
|
||||
0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
|
||||
00a0 XXXX 00a2 00a3 00a4 00a5 00a6 00a7 00a8 00a9 00d7 00ab 00ac 00ad 00ae 00af
|
||||
00b0 00b1 00b2 00b3 00b4 00b5 00b6 00b7 00b8 00b9 00f7 00bb 00bc 00bd 00be XXXX
|
||||
XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX
|
||||
XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX 2017
|
||||
05d0 05d1 05d2 05d3 05d4 05d5 05d6 05d7 05d8 05d9 05da 05db 05dc 05dd 05de 05df
|
||||
05e0 05e1 05e2 05e3 05e4 05e5 05e6 05e7 05e8 05e9 05ea XXXX XXXX 200e 200f XXXX
|
||||
|
||||
charset CS_ISO8859_9
|
||||
0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
|
||||
0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
|
||||
0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
|
||||
0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
|
||||
0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
|
||||
0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
|
||||
0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
|
||||
0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
|
||||
0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
|
||||
0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
|
||||
00a0 00a1 00a2 00a3 00a4 00a5 00a6 00a7 00a8 00a9 00aa 00ab 00ac 00ad 00ae 00af
|
||||
00b0 00b1 00b2 00b3 00b4 00b5 00b6 00b7 00b8 00b9 00ba 00bb 00bc 00bd 00be 00bf
|
||||
00c0 00c1 00c2 00c3 00c4 00c5 00c6 00c7 00c8 00c9 00ca 00cb 00cc 00cd 00ce 00cf
|
||||
011e 00d1 00d2 00d3 00d4 00d5 00d6 00d7 00d8 00d9 00da 00db 00dc 0130 015e 00df
|
||||
00e0 00e1 00e2 00e3 00e4 00e5 00e6 00e7 00e8 00e9 00ea 00eb 00ec 00ed 00ee 00ef
|
||||
011f 00f1 00f2 00f3 00f4 00f5 00f6 00f7 00f8 00f9 00fa 00fb 00fc 0131 015f 00ff
|
||||
|
||||
charset CS_ISO8859_10
|
||||
0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
|
||||
0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
|
||||
0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
|
||||
0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
|
||||
0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
|
||||
0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
|
||||
0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
|
||||
0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
|
||||
0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
|
||||
0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
|
||||
00a0 0104 0112 0122 012a 0128 0136 00a7 013b 0110 0160 0166 017d 00ad 016a 014a
|
||||
00b0 0105 0113 0123 012b 0129 0137 00b7 013c 0111 0161 0167 017e 2015 016b 014b
|
||||
0100 00c1 00c2 00c3 00c4 00c5 00c6 012e 010c 00c9 0118 00cb 0116 00cd 00ce 00cf
|
||||
00d0 0145 014c 00d3 00d4 00d5 00d6 0168 00d8 0172 00da 00db 00dc 00dd 00de 00df
|
||||
0101 00e1 00e2 00e3 00e4 00e5 00e6 012f 010d 00e9 0119 00eb 0117 00ed 00ee 00ef
|
||||
00f0 0146 014d 00f3 00f4 00f5 00f6 0169 00f8 0173 00fa 00fb 00fc 00fd 00fe 0138
|
||||
|
||||
charset CS_ISO8859_11
|
||||
0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
|
||||
0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
|
||||
0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
|
||||
0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
|
||||
0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
|
||||
0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
|
||||
0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
|
||||
0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
|
||||
0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
|
||||
0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
|
||||
00a0 0e01 0e02 0e03 0e04 0e05 0e06 0e07 0e08 0e09 0e0a 0e0b 0e0c 0e0d 0e0e 0e0f
|
||||
0e10 0e11 0e12 0e13 0e14 0e15 0e16 0e17 0e18 0e19 0e1a 0e1b 0e1c 0e1d 0e1e 0e1f
|
||||
0e20 0e21 0e22 0e23 0e24 0e25 0e26 0e27 0e28 0e29 0e2a 0e2b 0e2c 0e2d 0e2e 0e2f
|
||||
0e30 0e31 0e32 0e33 0e34 0e35 0e36 0e37 0e38 0e39 0e3a XXXX XXXX XXXX XXXX 0e3f
|
||||
0e40 0e41 0e42 0e43 0e44 0e45 0e46 0e47 0e48 0e49 0e4a 0e4b 0e4c 0e4d 0e4e 0e4f
|
||||
0e50 0e51 0e52 0e53 0e54 0e55 0e56 0e57 0e58 0e59 0e5a 0e5b XXXX XXXX XXXX XXXX
|
||||
|
||||
charset CS_ISO8859_13
|
||||
0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
|
||||
0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
|
||||
0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
|
||||
0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
|
||||
0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
|
||||
0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
|
||||
0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
|
||||
0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
|
||||
0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
|
||||
0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
|
||||
00a0 201d 00a2 00a3 00a4 201e 00a6 00a7 00d8 00a9 0156 00ab 00ac 00ad 00ae 00c6
|
||||
00b0 00b1 00b2 00b3 201c 00b5 00b6 00b7 00f8 00b9 0157 00bb 00bc 00bd 00be 00e6
|
||||
0104 012e 0100 0106 00c4 00c5 0118 0112 010c 00c9 0179 0116 0122 0136 012a 013b
|
||||
0160 0143 0145 00d3 014c 00d5 00d6 00d7 0172 0141 015a 016a 00dc 017b 017d 00df
|
||||
0105 012f 0101 0107 00e4 00e5 0119 0113 010d 00e9 017a 0117 0123 0137 012b 013c
|
||||
0161 0144 0146 00f3 014d 00f5 00f6 00f7 0173 0142 015b 016b 00fc 017c 017e 2019
|
||||
|
||||
charset CS_ISO8859_14
|
||||
0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
|
||||
0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
|
||||
0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
|
||||
0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
|
||||
0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
|
||||
0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
|
||||
0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
|
||||
0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
|
||||
0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
|
||||
0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
|
||||
00a0 1e02 1e03 00a3 010a 010b 1e0a 00a7 1e80 00a9 1e82 1e0b 1ef2 00ad 00ae 0178
|
||||
1e1e 1e1f 0120 0121 1e40 1e41 00b6 1e56 1e81 1e57 1e83 1e60 1ef3 1e84 1e85 1e61
|
||||
00c0 00c1 00c2 00c3 00c4 00c5 00c6 00c7 00c8 00c9 00ca 00cb 00cc 00cd 00ce 00cf
|
||||
0174 00d1 00d2 00d3 00d4 00d5 00d6 1e6a 00d8 00d9 00da 00db 00dc 00dd 0176 00df
|
||||
00e0 00e1 00e2 00e3 00e4 00e5 00e6 00e7 00e8 00e9 00ea 00eb 00ec 00ed 00ee 00ef
|
||||
0175 00f1 00f2 00f3 00f4 00f5 00f6 1e6b 00f8 00f9 00fa 00fb 00fc 00fd 0177 00ff
|
||||
|
||||
charset CS_ISO8859_15
|
||||
0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
|
||||
0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
|
||||
0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
|
||||
0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
|
||||
0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
|
||||
0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
|
||||
0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
|
||||
0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
|
||||
0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
|
||||
0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
|
||||
00a0 00a1 00a2 00a3 20ac 00a5 0160 00a7 0161 00a9 00aa 00ab 00ac 00ad 00ae 00af
|
||||
00b0 00b1 00b2 00b3 017d 00b5 00b6 00b7 017e 00b9 00ba 00bb 0152 0153 0178 00bf
|
||||
00c0 00c1 00c2 00c3 00c4 00c5 00c6 00c7 00c8 00c9 00ca 00cb 00cc 00cd 00ce 00cf
|
||||
00d0 00d1 00d2 00d3 00d4 00d5 00d6 00d7 00d8 00d9 00da 00db 00dc 00dd 00de 00df
|
||||
00e0 00e1 00e2 00e3 00e4 00e5 00e6 00e7 00e8 00e9 00ea 00eb 00ec 00ed 00ee 00ef
|
||||
00f0 00f1 00f2 00f3 00f4 00f5 00f6 00f7 00f8 00f9 00fa 00fb 00fc 00fd 00fe 00ff
|
||||
|
||||
charset CS_ISO8859_16
|
||||
0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
|
||||
0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
|
||||
0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
|
||||
0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
|
||||
0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
|
||||
0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
|
||||
0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
|
||||
0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
|
||||
0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
|
||||
0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
|
||||
00a0 0104 0105 0141 20ac 201e 0160 00a7 0161 00a9 0218 00ab 0179 00ad 017a 017b
|
||||
00b0 00b1 010c 0142 017d 201d 00b6 00b7 017e 010d 0219 00bb 0152 0153 0178 017c
|
||||
00c0 00c1 00c2 0102 00c4 0106 00c6 00c7 00c8 00c9 00ca 00cb 00cc 00cd 00ce 00cf
|
||||
0110 0143 00d2 00d3 00d4 0150 00d6 015a 0170 00d9 00da 00db 00dc 0118 021a 00df
|
||||
00e0 00e1 00e2 0103 00e4 0107 00e6 00e7 00e8 00e9 00ea 00eb 00ec 00ed 00ee 00ef
|
||||
0111 0144 00f2 00f3 00f4 0151 00f6 015b 0171 00f9 00fa 00fb 00fc 0119 021b 00ff
|
||||
|
||||
Some X fonts are encoded in a variant form of ISO8859-1:
|
||||
everything above 0x20 (space) is as normal, but the first 32
|
||||
characters contain the VT100 line drawing glyphs as they would
|
||||
appear from positions 0x5F to 0x7E inclusive. Here is the modified
|
||||
ISO8859-1 code table.
|
||||
|
||||
charset CS_ISO8859_1_X11
|
||||
0020 2666 2592 2409 240c 240d 240a 00b0 00b1 2424 240b 2518 2510 250c 2514 253c
|
||||
23ba 23bb 2500 23bc 23bd 251c 2524 2534 252c 2502 2264 2265 03c0 2260 00a3 00b7
|
||||
0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
|
||||
0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
|
||||
0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
|
||||
0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
|
||||
0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
|
||||
0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
|
||||
0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
|
||||
0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
|
||||
00a0 00a1 00a2 00a3 00a4 00a5 00a6 00a7 00a8 00a9 00aa 00ab 00ac 00ad 00ae 00af
|
||||
00b0 00b1 00b2 00b3 00b4 00b5 00b6 00b7 00b8 00b9 00ba 00bb 00bc 00bd 00be 00bf
|
||||
00c0 00c1 00c2 00c3 00c4 00c5 00c6 00c7 00c8 00c9 00ca 00cb 00cc 00cd 00ce 00cf
|
||||
00d0 00d1 00d2 00d3 00d4 00d5 00d6 00d7 00d8 00d9 00da 00db 00dc 00dd 00de 00df
|
||||
00e0 00e1 00e2 00e3 00e4 00e5 00e6 00e7 00e8 00e9 00ea 00eb 00ec 00ed 00ee 00ef
|
||||
00f0 00f1 00f2 00f3 00f4 00f5 00f6 00f7 00f8 00f9 00fa 00fb 00fc 00fd 00fe 00ff
|
||||
|
||||
Here are some PC (old DOS) code pages, generated by this piece of
|
||||
Bourne shell:
|
||||
|
||||
for i in 437 850; do
|
||||
echo charset CS_CP$i
|
||||
gensbcs http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP$i.TXT
|
||||
echo
|
||||
done
|
||||
|
||||
charset CS_CP437
|
||||
0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
|
||||
0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
|
||||
0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
|
||||
0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
|
||||
0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
|
||||
0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
|
||||
0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
|
||||
0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
|
||||
00c7 00fc 00e9 00e2 00e4 00e0 00e5 00e7 00ea 00eb 00e8 00ef 00ee 00ec 00c4 00c5
|
||||
00c9 00e6 00c6 00f4 00f6 00f2 00fb 00f9 00ff 00d6 00dc 00a2 00a3 00a5 20a7 0192
|
||||
00e1 00ed 00f3 00fa 00f1 00d1 00aa 00ba 00bf 2310 00ac 00bd 00bc 00a1 00ab 00bb
|
||||
2591 2592 2593 2502 2524 2561 2562 2556 2555 2563 2551 2557 255d 255c 255b 2510
|
||||
2514 2534 252c 251c 2500 253c 255e 255f 255a 2554 2569 2566 2560 2550 256c 2567
|
||||
2568 2564 2565 2559 2558 2552 2553 256b 256a 2518 250c 2588 2584 258c 2590 2580
|
||||
03b1 00df 0393 03c0 03a3 03c3 00b5 03c4 03a6 0398 03a9 03b4 221e 03c6 03b5 2229
|
||||
2261 00b1 2265 2264 2320 2321 00f7 2248 00b0 2219 00b7 221a 207f 00b2 25a0 00a0
|
||||
|
||||
charset CS_CP850
|
||||
0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
|
||||
0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
|
||||
0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
|
||||
0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
|
||||
0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
|
||||
0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
|
||||
0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
|
||||
0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
|
||||
00c7 00fc 00e9 00e2 00e4 00e0 00e5 00e7 00ea 00eb 00e8 00ef 00ee 00ec 00c4 00c5
|
||||
00c9 00e6 00c6 00f4 00f6 00f2 00fb 00f9 00ff 00d6 00dc 00f8 00a3 00d8 00d7 0192
|
||||
00e1 00ed 00f3 00fa 00f1 00d1 00aa 00ba 00bf 00ae 00ac 00bd 00bc 00a1 00ab 00bb
|
||||
2591 2592 2593 2502 2524 00c1 00c2 00c0 00a9 2563 2551 2557 255d 00a2 00a5 2510
|
||||
2514 2534 252c 251c 2500 253c 00e3 00c3 255a 2554 2569 2566 2560 2550 256c 00a4
|
||||
00f0 00d0 00ca 00cb 00c8 0131 00cd 00ce 00cf 2518 250c 2588 2584 00a6 00cc 2580
|
||||
00d3 00df 00d4 00d2 00f5 00d5 00b5 00fe 00de 00da 00db 00d9 00fd 00dd 00af 00b4
|
||||
00ad 00b1 2017 00be 00b6 00a7 00f7 00b8 00b0 00a8 00b7 00b9 00b3 00b2 25a0 00a0
|
||||
|
||||
Here are some Windows code pages, generated by this piece of
|
||||
Bourne shell:
|
||||
|
||||
for i in 1250 1251 1252 1253 1254 1255 1256 1257 1258; do
|
||||
echo charset CS_CP$i
|
||||
gensbcs http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP$i.TXT
|
||||
echo
|
||||
done
|
||||
|
||||
charset CS_CP1250
|
||||
0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
|
||||
0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
|
||||
0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
|
||||
0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
|
||||
0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
|
||||
0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
|
||||
0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
|
||||
0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
|
||||
20ac XXXX 201a XXXX 201e 2026 2020 2021 XXXX 2030 0160 2039 015a 0164 017d 0179
|
||||
XXXX 2018 2019 201c 201d 2022 2013 2014 XXXX 2122 0161 203a 015b 0165 017e 017a
|
||||
00a0 02c7 02d8 0141 00a4 0104 00a6 00a7 00a8 00a9 015e 00ab 00ac 00ad 00ae 017b
|
||||
00b0 00b1 02db 0142 00b4 00b5 00b6 00b7 00b8 0105 015f 00bb 013d 02dd 013e 017c
|
||||
0154 00c1 00c2 0102 00c4 0139 0106 00c7 010c 00c9 0118 00cb 011a 00cd 00ce 010e
|
||||
0110 0143 0147 00d3 00d4 0150 00d6 00d7 0158 016e 00da 0170 00dc 00dd 0162 00df
|
||||
0155 00e1 00e2 0103 00e4 013a 0107 00e7 010d 00e9 0119 00eb 011b 00ed 00ee 010f
|
||||
0111 0144 0148 00f3 00f4 0151 00f6 00f7 0159 016f 00fa 0171 00fc 00fd 0163 02d9
|
||||
|
||||
charset CS_CP1251
|
||||
0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
|
||||
0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
|
||||
0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
|
||||
0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
|
||||
0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
|
||||
0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
|
||||
0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
|
||||
0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
|
||||
0402 0403 201a 0453 201e 2026 2020 2021 20ac 2030 0409 2039 040a 040c 040b 040f
|
||||
0452 2018 2019 201c 201d 2022 2013 2014 XXXX 2122 0459 203a 045a 045c 045b 045f
|
||||
00a0 040e 045e 0408 00a4 0490 00a6 00a7 0401 00a9 0404 00ab 00ac 00ad 00ae 0407
|
||||
00b0 00b1 0406 0456 0491 00b5 00b6 00b7 0451 2116 0454 00bb 0458 0405 0455 0457
|
||||
0410 0411 0412 0413 0414 0415 0416 0417 0418 0419 041a 041b 041c 041d 041e 041f
|
||||
0420 0421 0422 0423 0424 0425 0426 0427 0428 0429 042a 042b 042c 042d 042e 042f
|
||||
0430 0431 0432 0433 0434 0435 0436 0437 0438 0439 043a 043b 043c 043d 043e 043f
|
||||
0440 0441 0442 0443 0444 0445 0446 0447 0448 0449 044a 044b 044c 044d 044e 044f
|
||||
|
||||
charset CS_CP1252
|
||||
0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
|
||||
0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
|
||||
0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
|
||||
0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
|
||||
0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
|
||||
0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
|
||||
0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
|
||||
0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
|
||||
20ac XXXX 201a 0192 201e 2026 2020 2021 02c6 2030 0160 2039 0152 XXXX 017d XXXX
|
||||
XXXX 2018 2019 201c 201d 2022 2013 2014 02dc 2122 0161 203a 0153 XXXX 017e 0178
|
||||
00a0 00a1 00a2 00a3 00a4 00a5 00a6 00a7 00a8 00a9 00aa 00ab 00ac 00ad 00ae 00af
|
||||
00b0 00b1 00b2 00b3 00b4 00b5 00b6 00b7 00b8 00b9 00ba 00bb 00bc 00bd 00be 00bf
|
||||
00c0 00c1 00c2 00c3 00c4 00c5 00c6 00c7 00c8 00c9 00ca 00cb 00cc 00cd 00ce 00cf
|
||||
00d0 00d1 00d2 00d3 00d4 00d5 00d6 00d7 00d8 00d9 00da 00db 00dc 00dd 00de 00df
|
||||
00e0 00e1 00e2 00e3 00e4 00e5 00e6 00e7 00e8 00e9 00ea 00eb 00ec 00ed 00ee 00ef
|
||||
00f0 00f1 00f2 00f3 00f4 00f5 00f6 00f7 00f8 00f9 00fa 00fb 00fc 00fd 00fe 00ff
|
||||
|
||||
charset CS_CP1253
|
||||
0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
|
||||
0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
|
||||
0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
|
||||
0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
|
||||
0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
|
||||
0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
|
||||
0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
|
||||
0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
|
||||
20ac XXXX 201a 0192 201e 2026 2020 2021 XXXX 2030 XXXX 2039 XXXX XXXX XXXX XXXX
|
||||
XXXX 2018 2019 201c 201d 2022 2013 2014 XXXX 2122 XXXX 203a XXXX XXXX XXXX XXXX
|
||||
00a0 0385 0386 00a3 00a4 00a5 00a6 00a7 00a8 00a9 XXXX 00ab 00ac 00ad 00ae 2015
|
||||
00b0 00b1 00b2 00b3 0384 00b5 00b6 00b7 0388 0389 038a 00bb 038c 00bd 038e 038f
|
||||
0390 0391 0392 0393 0394 0395 0396 0397 0398 0399 039a 039b 039c 039d 039e 039f
|
||||
03a0 03a1 XXXX 03a3 03a4 03a5 03a6 03a7 03a8 03a9 03aa 03ab 03ac 03ad 03ae 03af
|
||||
03b0 03b1 03b2 03b3 03b4 03b5 03b6 03b7 03b8 03b9 03ba 03bb 03bc 03bd 03be 03bf
|
||||
03c0 03c1 03c2 03c3 03c4 03c5 03c6 03c7 03c8 03c9 03ca 03cb 03cc 03cd 03ce XXXX
|
||||
|
||||
charset CS_CP1254
|
||||
0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
|
||||
0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
|
||||
0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
|
||||
0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
|
||||
0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
|
||||
0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
|
||||
0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
|
||||
0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
|
||||
20ac XXXX 201a 0192 201e 2026 2020 2021 02c6 2030 0160 2039 0152 XXXX XXXX XXXX
|
||||
XXXX 2018 2019 201c 201d 2022 2013 2014 02dc 2122 0161 203a 0153 XXXX XXXX 0178
|
||||
00a0 00a1 00a2 00a3 00a4 00a5 00a6 00a7 00a8 00a9 00aa 00ab 00ac 00ad 00ae 00af
|
||||
00b0 00b1 00b2 00b3 00b4 00b5 00b6 00b7 00b8 00b9 00ba 00bb 00bc 00bd 00be 00bf
|
||||
00c0 00c1 00c2 00c3 00c4 00c5 00c6 00c7 00c8 00c9 00ca 00cb 00cc 00cd 00ce 00cf
|
||||
011e 00d1 00d2 00d3 00d4 00d5 00d6 00d7 00d8 00d9 00da 00db 00dc 0130 015e 00df
|
||||
00e0 00e1 00e2 00e3 00e4 00e5 00e6 00e7 00e8 00e9 00ea 00eb 00ec 00ed 00ee 00ef
|
||||
011f 00f1 00f2 00f3 00f4 00f5 00f6 00f7 00f8 00f9 00fa 00fb 00fc 0131 015f 00ff
|
||||
|
||||
charset CS_CP1255
|
||||
0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
|
||||
0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
|
||||
0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
|
||||
0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
|
||||
0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
|
||||
0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
|
||||
0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
|
||||
0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
|
||||
20ac XXXX 201a 0192 201e 2026 2020 2021 02c6 2030 XXXX 2039 XXXX XXXX XXXX XXXX
|
||||
XXXX 2018 2019 201c 201d 2022 2013 2014 02dc 2122 XXXX 203a XXXX XXXX XXXX XXXX
|
||||
00a0 00a1 00a2 00a3 20aa 00a5 00a6 00a7 00a8 00a9 00d7 00ab 00ac 00ad 00ae 00af
|
||||
00b0 00b1 00b2 00b3 00b4 00b5 00b6 00b7 00b8 00b9 00f7 00bb 00bc 00bd 00be 00bf
|
||||
05b0 05b1 05b2 05b3 05b4 05b5 05b6 05b7 05b8 05b9 XXXX 05bb 05bc 05bd 05be 05bf
|
||||
05c0 05c1 05c2 05c3 05f0 05f1 05f2 05f3 05f4 XXXX XXXX XXXX XXXX XXXX XXXX XXXX
|
||||
05d0 05d1 05d2 05d3 05d4 05d5 05d6 05d7 05d8 05d9 05da 05db 05dc 05dd 05de 05df
|
||||
05e0 05e1 05e2 05e3 05e4 05e5 05e6 05e7 05e8 05e9 05ea XXXX XXXX 200e 200f XXXX
|
||||
|
||||
charset CS_CP1256
|
||||
0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
|
||||
0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
|
||||
0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
|
||||
0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
|
||||
0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
|
||||
0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
|
||||
0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
|
||||
0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
|
||||
20ac 067e 201a 0192 201e 2026 2020 2021 02c6 2030 0679 2039 0152 0686 0698 0688
|
||||
06af 2018 2019 201c 201d 2022 2013 2014 06a9 2122 0691 203a 0153 200c 200d 06ba
|
||||
00a0 060c 00a2 00a3 00a4 00a5 00a6 00a7 00a8 00a9 06be 00ab 00ac 00ad 00ae 00af
|
||||
00b0 00b1 00b2 00b3 00b4 00b5 00b6 00b7 00b8 00b9 061b 00bb 00bc 00bd 00be 061f
|
||||
06c1 0621 0622 0623 0624 0625 0626 0627 0628 0629 062a 062b 062c 062d 062e 062f
|
||||
0630 0631 0632 0633 0634 0635 0636 00d7 0637 0638 0639 063a 0640 0641 0642 0643
|
||||
00e0 0644 00e2 0645 0646 0647 0648 00e7 00e8 00e9 00ea 00eb 0649 064a 00ee 00ef
|
||||
064b 064c 064d 064e 00f4 064f 0650 00f7 0651 00f9 0652 00fb 00fc 200e 200f 06d2
|
||||
|
||||
charset CS_CP1257
|
||||
0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
|
||||
0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
|
||||
0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
|
||||
0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
|
||||
0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
|
||||
0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
|
||||
0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
|
||||
0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
|
||||
20ac XXXX 201a XXXX 201e 2026 2020 2021 XXXX 2030 XXXX 2039 XXXX 00a8 02c7 00b8
|
||||
XXXX 2018 2019 201c 201d 2022 2013 2014 XXXX 2122 XXXX 203a XXXX 00af 02db XXXX
|
||||
00a0 XXXX 00a2 00a3 00a4 XXXX 00a6 00a7 00d8 00a9 0156 00ab 00ac 00ad 00ae 00c6
|
||||
00b0 00b1 00b2 00b3 00b4 00b5 00b6 00b7 00f8 00b9 0157 00bb 00bc 00bd 00be 00e6
|
||||
0104 012e 0100 0106 00c4 00c5 0118 0112 010c 00c9 0179 0116 0122 0136 012a 013b
|
||||
0160 0143 0145 00d3 014c 00d5 00d6 00d7 0172 0141 015a 016a 00dc 017b 017d 00df
|
||||
0105 012f 0101 0107 00e4 00e5 0119 0113 010d 00e9 017a 0117 0123 0137 012b 013c
|
||||
0161 0144 0146 00f3 014d 00f5 00f6 00f7 0173 0142 015b 016b 00fc 017c 017e 02d9
|
||||
|
||||
charset CS_CP1258
|
||||
0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
|
||||
0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
|
||||
0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
|
||||
0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
|
||||
0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
|
||||
0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
|
||||
0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
|
||||
0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
|
||||
20ac XXXX 201a 0192 201e 2026 2020 2021 02c6 2030 XXXX 2039 0152 XXXX XXXX XXXX
|
||||
XXXX 2018 2019 201c 201d 2022 2013 2014 02dc 2122 XXXX 203a 0153 XXXX XXXX 0178
|
||||
00a0 00a1 00a2 00a3 00a4 00a5 00a6 00a7 00a8 00a9 00aa 00ab 00ac 00ad 00ae 00af
|
||||
00b0 00b1 00b2 00b3 00b4 00b5 00b6 00b7 00b8 00b9 00ba 00bb 00bc 00bd 00be 00bf
|
||||
00c0 00c1 00c2 0102 00c4 00c5 00c6 00c7 00c8 00c9 00ca 00cb 0300 00cd 00ce 00cf
|
||||
0110 00d1 0309 00d3 00d4 01a0 00d6 00d7 00d8 00d9 00da 00db 00dc 01af 0303 00df
|
||||
00e0 00e1 00e2 0103 00e4 00e5 00e6 00e7 00e8 00e9 00ea 00eb 0301 00ed 00ee 00ef
|
||||
0111 00f1 0323 00f3 00f4 01a1 00f6 00f7 00f8 00f9 00fa 00fb 00fc 01b0 20ab 00ff
|
||||
|
||||
KOI8-R, generated by this code:
|
||||
|
||||
{ echo charset CS_KOI8_R;
|
||||
gensbcs http://www.unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8-R.TXT; }
|
||||
|
||||
charset CS_KOI8_R
|
||||
0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
|
||||
0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
|
||||
0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
|
||||
0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
|
||||
0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
|
||||
0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
|
||||
0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
|
||||
0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
|
||||
2500 2502 250c 2510 2514 2518 251c 2524 252c 2534 253c 2580 2584 2588 258c 2590
|
||||
2591 2592 2593 2320 25a0 2219 221a 2248 2264 2265 00a0 2321 00b0 00b2 00b7 00f7
|
||||
2550 2551 2552 0451 2553 2554 2555 2556 2557 2558 2559 255a 255b 255c 255d 255e
|
||||
255f 2560 2561 0401 2562 2563 2564 2565 2566 2567 2568 2569 256a 256b 256c 00a9
|
||||
044e 0430 0431 0446 0434 0435 0444 0433 0445 0438 0439 043a 043b 043c 043d 043e
|
||||
043f 044f 0440 0441 0442 0443 0436 0432 044c 044b 0437 0448 044d 0449 0447 044a
|
||||
042e 0410 0411 0426 0414 0415 0424 0413 0425 0418 0419 041a 041b 041c 041d 041e
|
||||
041f 042f 0420 0421 0422 0423 0416 0412 042c 042b 0417 0428 042d 0429 0427 042a
|
||||
|
||||
KOI8-U: I can't find an easily machine-processable mapping table
|
||||
for this one, so I've created it by hand-editing the KOI8-R
|
||||
mapping table in accordance with the list of differences specified
|
||||
in RFC2319. Note that RFC2319 has an apparent error: position B4
|
||||
is listed as U+0404 in the main character set list, but as U+0403
|
||||
in Appendix A (differences from KOI8-R). Both agree that it should
|
||||
be CYRILLIC CAPITAL LETTER UKRAINIAN IE, however, and the Unicode
|
||||
character database says that therefore U+0404 is the correct value.
|
||||
|
||||
charset CS_KOI8_U
|
||||
0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
|
||||
0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
|
||||
0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
|
||||
0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
|
||||
0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
|
||||
0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
|
||||
0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
|
||||
0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
|
||||
2500 2502 250c 2510 2514 2518 251c 2524 252c 2534 253c 2580 2584 2588 258c 2590
|
||||
2591 2592 2593 2320 25a0 2219 221a 2248 2264 2265 00a0 2321 00b0 00b2 00b7 00f7
|
||||
2550 2551 2552 0451 0454 2554 0456 0457 2557 2558 2559 255a 255b 0491 255d 255e
|
||||
255f 2560 2561 0401 0404 2563 0406 0407 2566 2567 2568 2569 256a 0490 256c 00a9
|
||||
044e 0430 0431 0446 0434 0435 0444 0433 0445 0438 0439 043a 043b 043c 043d 043e
|
||||
043f 044f 0440 0441 0442 0443 0436 0432 044c 044b 0437 0448 044d 0449 0447 044a
|
||||
042e 0410 0411 0426 0414 0415 0424 0413 0425 0418 0419 041a 041b 041c 041d 041e
|
||||
041f 042f 0420 0421 0422 0423 0416 0412 042c 042b 0417 0428 042d 0429 0427 042a
|
||||
|
||||
Mac Roman, generated by this code:
|
||||
|
||||
{ echo charset CS_MAC_ROMAN;
|
||||
gensbcs http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/ROMAN.TXT; }
|
||||
|
||||
The code point F8FF at position F0 is an interesting one. In
|
||||
Unicode, it's the last of the Private Use section. The mapping
|
||||
table states that it should be an Apple logo. I suppose we should
|
||||
just leave it as it is; there's bound to be some software out
|
||||
there that understands U+F8FF to be an Apple logo!
|
||||
|
||||
charset CS_MAC_ROMAN
|
||||
0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
|
||||
0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
|
||||
0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
|
||||
0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
|
||||
0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
|
||||
0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
|
||||
0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
|
||||
0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
|
||||
00c4 00c5 00c7 00c9 00d1 00d6 00dc 00e1 00e0 00e2 00e4 00e3 00e5 00e7 00e9 00e8
|
||||
00ea 00eb 00ed 00ec 00ee 00ef 00f1 00f3 00f2 00f4 00f6 00f5 00fa 00f9 00fb 00fc
|
||||
2020 00b0 00a2 00a3 00a7 2022 00b6 00df 00ae 00a9 2122 00b4 00a8 2260 00c6 00d8
|
||||
221e 00b1 2264 2265 00a5 00b5 2202 2211 220f 03c0 222b 00aa 00ba 03a9 00e6 00f8
|
||||
00bf 00a1 00ac 221a 0192 2248 2206 00ab 00bb 2026 00a0 00c0 00c3 00d5 0152 0153
|
||||
2013 2014 201c 201d 2018 2019 00f7 25ca 00ff 0178 2044 20ac 2039 203a fb01 fb02
|
||||
2021 00b7 201a 201e 2030 00c2 00ca 00c1 00cb 00c8 00cd 00ce 00cf 00cc 00d3 00d4
|
||||
f8ff 00d2 00da 00db 00d9 0131 02c6 02dc 00af 02d8 02d9 02da 00b8 02dd 02db 02c7
|
||||
|
||||
Roman Czyborra's web site (http://czyborra.com/) has a variety of
|
||||
other useful mapping tables, in a slightly different format (and
|
||||
gzipped). Here's a shell/Perl function to generate an SBCS table
|
||||
from a Czyborra mapping table:
|
||||
|
||||
gensbcs_c() {
|
||||
wget -q -O - "$1" | gzip -d | \
|
||||
perl -ne '/^=(.*)\s+U\+(.*)\s+/ and $a[hex $1]=sprintf "%04x", hex $2;' \
|
||||
-e 'BEGIN{for($i=0;$i<256;$i++){$a[$i]="XXXX";' \
|
||||
-e 'if ($i < 32 or ($i >=127 and $i < 160)) {$a[$i]=sprintf "%04x", $i}}}' \
|
||||
-e 'END{for($i=0;$i<256;$i++){printf"%s%s",$a[$i],$i%16==15?"\n":" "}}'
|
||||
}
|
||||
|
||||
So here we have some character sets generated from Czyborra
|
||||
mapping tables: VISCII, HP-Roman8, and the DEC Multinational
|
||||
Character Set.
|
||||
|
||||
{ echo charset CS_VISCII;
|
||||
gensbcs_c http://czyborra.com/charsets/viscii.txt.gz; echo;
|
||||
echo charset CS_HP_ROMAN8;
|
||||
gensbcs_c http://czyborra.com/charsets/hp-roman8.txt.gz; echo;
|
||||
echo charset CS_DEC_MCS;
|
||||
gensbcs_c http://czyborra.com/charsets/dec-mcs.txt.gz; echo; }
|
||||
|
||||
charset CS_VISCII
|
||||
0000 0001 1eb2 0003 0004 1eb4 1eaa 0007 0008 0009 000a 000b 000c 000d 000e 000f
|
||||
0010 0011 0012 0013 1ef6 0015 0016 0017 0018 1ef8 001a 001b 001c 001d 1ef4 001f
|
||||
0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
|
||||
0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
|
||||
0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
|
||||
0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
|
||||
0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
|
||||
0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
|
||||
1ea0 1eae 1eb0 1eb6 1ea4 1ea6 1ea8 1eac 1ebc 1eb8 1ebe 1ec0 1ec2 1ec4 1ec6 1ed0
|
||||
1ed2 1ed4 1ed6 1ed8 1ee2 1eda 1edc 1ede 1eca 1ece 1ecc 1ec8 1ee6 0168 1ee4 1ef2
|
||||
00d5 1eaf 1eb1 1eb7 1ea5 1ea7 1ea9 1ead 1ebd 1eb9 1ebf 1ec1 1ec3 1ec5 1ec7 1ed1
|
||||
1ed3 1ed5 1ed7 1ee0 01a0 1ed9 1edd 1edf 1ecb 1ef0 1ee8 1eea 1eec 01a1 1edb 01af
|
||||
00c0 00c1 00c2 00c3 1ea2 0102 1eb3 1eb5 00c8 00c9 00ca 1eba 00cc 00cd 0128 1ef3
|
||||
0110 1ee9 00d2 00d3 00d4 1ea1 1ef7 1eeb 1eed 00d9 00da 1ef9 1ef5 00dd 1ee1 01b0
|
||||
00e0 00e1 00e2 00e3 1ea3 0103 1eef 1eab 00e8 00e9 00ea 1ebb 00ec 00ed 0129 1ec9
|
||||
0111 1ef1 00f2 00f3 00f4 00f5 1ecf 1ecd 1ee5 00f9 00fa 0169 1ee7 00fd 1ee3 1eee
|
||||
|
||||
charset CS_HP_ROMAN8
|
||||
0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
|
||||
0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
|
||||
0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
|
||||
0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
|
||||
0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
|
||||
0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
|
||||
0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
|
||||
0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
|
||||
0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
|
||||
0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
|
||||
00a0 00c0 00c2 00c8 00ca 00cb 00ce 00cf 00b4 02cb 02c6 00a8 02dc 00d9 00db 20a4
|
||||
00af 00dd 00fd 00b0 00c7 00e7 00d1 00f1 00a1 00bf 00a4 00a3 00a5 00a7 0192 00a2
|
||||
00e2 00ea 00f4 00fb 00e1 00e9 00f3 00fa 00e0 00e8 00f2 00f9 00e4 00eb 00f6 00fc
|
||||
00c5 00ee 00d8 00c6 00e5 00ed 00f8 00e6 00c4 00ec 00d6 00dc 00c9 00ef 00df 00d4
|
||||
00c1 00c3 00e3 00d0 00f0 00cd 00cc 00d3 00d2 00d5 00f5 0160 0161 00da 0178 00ff
|
||||
00de 00fe 00b7 00b5 00b6 00be 2014 00bc 00bd 00aa 00ba 00ab 25a0 00bb 00b1 XXXX
|
||||
|
||||
charset CS_DEC_MCS
|
||||
0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
|
||||
0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
|
||||
0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
|
||||
0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
|
||||
0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
|
||||
0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
|
||||
0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
|
||||
0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
|
||||
0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
|
||||
0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
|
||||
XXXX 00a1 00a2 00a3 XXXX 00a5 XXXX 00a7 00a4 00a9 00aa 00ab XXXX XXXX XXXX XXXX
|
||||
00b0 00b1 00b2 00b3 XXXX 00b5 00b6 00b7 XXXX 00b9 00ba 00bb 00bc 00bd XXXX 00bf
|
||||
00c0 00c1 00c2 00c3 00c4 00c5 00c6 00c7 00c8 00c9 00ca 00cb 00cc 00cd 00ce 00cf
|
||||
XXXX 00d1 00d2 00d3 00d4 00d5 00d6 0152 00d8 00d9 00da 00db 00dc 0178 XXXX 00df
|
||||
00e0 00e1 00e2 00e3 00e4 00e5 00e6 00e7 00e8 00e9 00ea 00eb 00ec 00ed 00ee 00ef
|
||||
XXXX 00f1 00f2 00f3 00f4 00f5 00f6 0153 00f8 00f9 00fa 00fb 00fc 00ff XXXX XXXX
|
95
charset/sbcsgen.pl
Normal file
95
charset/sbcsgen.pl
Normal file
@ -0,0 +1,95 @@
|
||||
#!/usr/bin/env perl

# This script generates sbcsdat.c (the data for all the SBCSes) from its
# source form sbcs.dat.

# Warnings are switched on here rather than with `-w' on the #! line:
# many kernels hand everything after the interpreter path to it as a
# single argument, so env would be asked to run a program literally
# called "perl -w".
use warnings;

$infile = "sbcs.dat";
$outfile = "sbcsdat.c";

# Refuse to carry on if either file cannot be opened; otherwise we
# would silently produce an empty or truncated sbcsdat.c.
open(FOO, "<", $infile) or die "$infile: unable to open for reading: $!\n";
open(BAR, ">", $outfile) or die "$outfile: unable to open for writing: $!\n";
select BAR;

print "/*\n";
print " * sbcsdat.c - data definitions for single-byte character sets.\n";
print " *\n";
print " * Generated by sbcsgen.pl from sbcs.dat.\n";
print " * You should edit those files rather than editing this one.\n";
print " */\n";
print "\n";
print "#ifndef ENUM_CHARSETS\n";
print "\n";
print "#include \"charset.h\"\n";
print "#include \"internal.h\"\n";
print "\n";

# Name and values of the table currently being accumulated.
my $charsetname = undef;
my @vals = ();

# Names of all completed tables, in file order, for the ENUM_CHARSET list.
my @charsetnames = ();

while (<FOO>) {
    chomp;
    if (/^charset (.*)$/) {
        # A `charset' line starts a new table and discards anything
        # accumulated so far.
        $charsetname = $1;
        @vals = ();
    } elsif (/^[0-9a-fA-FX]/) {
        # A row of table data: hex code points, or XXXX for `undefined'
        # (represented internally as -1).
        push @vals, map { $_ eq "XXXX" ? -1 : hex $_ } split / +/, $_;
        if (scalar @vals > 256) {
            die "$infile:$.: charset $charsetname has more than 256 values\n";
        } elsif (scalar @vals == 256) {
            &outcharset($charsetname, @vals);
            push @charsetnames, $charsetname;
            $charsetname = undef;
            @vals = ();
        }
    }
}

close FOO;

print "#else /* ENUM_CHARSETS */\n";
print "\n";

foreach $i (@charsetnames) {
    print "ENUM_CHARSET($i)\n";
}

print "\n";
print "#endif /* ENUM_CHARSETS */\n";

# Closing a write handle flushes it, so this is where a full disk or
# similar failure would show up.
close BAR or die "$outfile: error while writing: $!\n";
|
||||
|
||||
# Write the C definitions for one single-byte charset to the currently
# selected output handle.
#
#   $name  - the charset identifier (used in the data_ and charset_
#            symbol names)
#   @vals  - 256 values indexed by byte: a code point, or a negative
#            number for `no mapping'
#
# Two arrays are emitted: the forward table (byte -> code point, with
# ERROR for unmapped bytes), and the list of mapped bytes ordered by
# the code point each one maps to, followed by the length of that list.
sub outcharset($@) {
    my ($name, @vals) = @_;

    # Lay out a list of cells eight to a line: comma-separated, with
    # a line break after every eighth cell. An empty list gives "".
    my $layout = sub {
        my @cells = @_;
        my $text = "";
        foreach my $k (0 .. $#cells) {
            my $lead = ($k == 0) ? " " : ($k % 8 == 0) ? ",\n " : ", ";
            $text .= $lead . $cells[$k];
        }
        return $text;
    };

    # Forward table, one cell per byte value.
    my @forward = map {
        $vals[$_] < 0 ? "ERROR " : sprintf("0x%04x", $vals[$_])
    } 0 .. 255;

    # Bytes that have a mapping, reordered by the code point they map to.
    my @mapped = grep { $vals[$_] >= 0 } 0 .. 255;
    my @reverse = map { sprintf("0x%02x", $_) }
                  sort { $vals[$a] <=> $vals[$b] } @mapped;

    print "static const sbcs_data data_$name = {\n";
    print " {\n";
    print $layout->(@forward);
    print "\n },\n {\n";
    print $layout->(@reverse);
    printf "\n },\n %d\n", scalar @reverse;
    print "};\n";
    print "const charset_spec charset_$name = {\n" .
        " $name, read_sbcs, write_sbcs, &data_$name\n};\n\n";
}
|
29
charset/slookup.c
Normal file
29
charset/slookup.c
Normal file
@ -0,0 +1,29 @@
|
||||
/*
|
||||
* slookup.c - static lookup of character sets.
|
||||
*/
|
||||
|
||||
#include "charset.h"
|
||||
#include "internal.h"
|
||||
|
||||
/*
 * enum.c is included twice below, each time with a different
 * definition of ENUM_CHARSET(x). This first pass turns every entry
 * into an extern declaration of the corresponding charset_spec.
 */
#define ENUM_CHARSET(x) extern charset_spec const charset_##x;
#include "enum.c"
#undef ENUM_CHARSET

/*
 * The second pass turns every entry into a pointer to that
 * charset_spec, giving a table that can be searched at run time.
 */
static charset_spec const *const cs_table[] = {
#define ENUM_CHARSET(x) &charset_##x,
#include "enum.c"
#undef ENUM_CHARSET
};

/*
 * Find the charset_spec whose `charset' field equals the given
 * identifier, by linear search of cs_table. Returns NULL if no entry
 * matches.
 */
charset_spec const *charset_find_spec(int charset)
{
    int remaining = (int)lenof(cs_table);
    charset_spec const *const *entry = cs_table;

    while (remaining > 0) {
        if ((*entry)->charset == charset)
            return *entry;
        entry++;
        remaining--;
    }

    return NULL;
}
|
89
charset/toucs.c
Normal file
89
charset/toucs.c
Normal file
@ -0,0 +1,89 @@
|
||||
/*
|
||||
* toucs.c - convert charsets to Unicode.
|
||||
*/
|
||||
|
||||
#include "charset.h"
|
||||
#include "internal.h"
|
||||
|
||||
struct unicode_emit_param {
|
||||
wchar_t *output;
|
||||
int outlen;
|
||||
const wchar_t *errstr;
|
||||
int errlen;
|
||||
int stopped;
|
||||
};
|
||||
|
||||
static void unicode_emit(void *ctx, long int output)
|
||||
{
|
||||
struct unicode_emit_param *param = (struct unicode_emit_param *)ctx;
|
||||
wchar_t outval;
|
||||
wchar_t const *p;
|
||||
int outlen;
|
||||
|
||||
if (output == ERROR) {
|
||||
if (param->errstr) {
|
||||
p = param->errstr;
|
||||
outlen = param->errlen;
|
||||
} else {
|
||||
outval = 0xFFFD; /* U+FFFD REPLACEMENT CHARACTER */
|
||||
p = &outval;
|
||||
outlen = 1;
|
||||
}
|
||||
} else {
|
||||
outval = output;
|
||||
p = &outval;
|
||||
outlen = 1;
|
||||
}
|
||||
|
||||
if (param->outlen >= outlen) {
|
||||
while (outlen > 0) {
|
||||
*param->output++ = *p++;
|
||||
param->outlen--;
|
||||
outlen--;
|
||||
}
|
||||
} else {
|
||||
param->stopped = 1;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Convert bytes in the given charset to wide characters.
 *
 *  - *input / *inlen describe the input bytes; both are advanced past
 *    every byte that was completely processed.
 *  - output / outlen describe the output buffer, in wchar_t units.
 *  - state, if non-NULL, holds the conversion state between calls and
 *    is updated after each completely processed byte. If NULL, a
 *    fresh zero state is used for this call only.
 *  - errstr / errlen give the text to output for undecodable input;
 *    if errstr is NULL, U+FFFD is used.
 *
 * Returns the number of wchar_t written to `output'. If the output
 * for some input byte does not fit, conversion stops before that
 * byte: it is not consumed, and the stored state is not updated for
 * it.
 *
 * NOTE(review): the result of charset_find_spec() is used unchecked;
 * presumably callers only ever pass a known charset id -- confirm.
 */
int charset_to_unicode(char **input, int *inlen, wchar_t *output, int outlen,
                       int charset, charset_state *state,
                       const wchar_t *errstr, int errlen)
{
    charset_spec const *spec = charset_find_spec(charset);
    charset_state localstate;
    struct unicode_emit_param param;

    param.output = output;
    param.outlen = outlen;
    param.errstr = errstr;
    param.errlen = errlen;
    param.stopped = 0;

    if (!state) {
        localstate.s0 = 0;
    } else {
        localstate = *state;           /* structure copy */
    }

    while (*inlen > 0) {
        int lenbefore = param.output - output;
        spec->read(spec, (unsigned char)**input, &localstate,
                   unicode_emit, &param);
        if (param.stopped) {
            /*
             * The emit function has _tried_ to output some
             * characters, but ran up against the end of the
             * buffer. Leave immediately, and return what happened
             * _before_ attempting to process this character.
             */
            return lenbefore;
        }
        if (state)
            *state = localstate;       /* structure copy */
        (*input)++;
        (*inlen)--;
    }

    return param.output - output;
}
|
877
charset/utf8.c
Normal file
877
charset/utf8.c
Normal file
@ -0,0 +1,877 @@
|
||||
/*
|
||||
* utf8.c - routines to handle UTF-8.
|
||||
*/
|
||||
|
||||
#ifndef ENUM_CHARSETS
|
||||
|
||||
#include "charset.h"
|
||||
#include "internal.h"
|
||||
|
||||
/*
|
||||
* UTF-8 has no associated data, so `charset' may be ignored.
|
||||
*/
|
||||
|
||||
void read_utf8(charset_spec const *charset, long int input_chr,
|
||||
charset_state *state,
|
||||
void (*emit)(void *ctx, long int output), void *emitctx)
|
||||
{
|
||||
UNUSEDARG(charset);
|
||||
|
||||
/*
|
||||
* For reading UTF-8, the `state' word contains:
|
||||
*
|
||||
* - in bits 29-31, the number of bytes expected to be in the
|
||||
* current multibyte character (which we can tell instantly
|
||||
* from the first byte, of course).
|
||||
*
|
||||
* - in bits 26-28, the number of bytes _seen so far_ in the
|
||||
* current multibyte character.
|
||||
*
|
||||
* - in the remainder of the word, the current value of the
|
||||
* character, which is shifted upwards by 6 bits to
|
||||
* accommodate each new byte.
|
||||
*
|
||||
* As required, the state is zero when we are not in the middle
|
||||
* of a multibyte character at all.
|
||||
*
|
||||
* For example, when reading E9 8D 8B, starting at state=0:
|
||||
*
|
||||
* - after E9, the state is 0x64000009
|
||||
* - after 8D, the state is 0x6800024d
|
||||
* - after 8B, the state conceptually becomes 0x6c00934b, at
|
||||
* which point we notice we've got as many characters as we
|
||||
* were expecting, output U+934B, and reset the state to
|
||||
* zero.
|
||||
*
|
||||
* Note that the maximum number of bits we might need to store
|
||||
* in the character value field is 25 (U+7FFFFFFF contains 31
|
||||
* bits, but we will never actually store its full value
|
||||
* because when we receive the last 6 bits in the final
|
||||
* continuation byte we will output it and revert the state to
|
||||
* zero). Hence the character value field never collides with
|
||||
* the byte counts.
|
||||
*/
|
||||
|
||||
if (input_chr < 0x80) {
|
||||
/*
|
||||
* Single-byte character. If the state is nonzero before
|
||||
* coming here, output an error for an incomplete sequence.
|
||||
* Then output the character.
|
||||
*/
|
||||
if (state->s0 != 0) {
|
||||
emit(emitctx, ERROR);
|
||||
state->s0 = 0;
|
||||
}
|
||||
emit(emitctx, input_chr);
|
||||
} else if (input_chr == 0xFE || input_chr == 0xFF) {
|
||||
/*
|
||||
* FE and FF bytes should _never_ occur in UTF-8. They are
|
||||
* automatic errors; if the state was nonzero to start
|
||||
* with, output a further error for an incomplete sequence.
|
||||
*/
|
||||
if (state->s0 != 0) {
|
||||
emit(emitctx, ERROR);
|
||||
state->s0 = 0;
|
||||
}
|
||||
emit(emitctx, ERROR);
|
||||
} else if (input_chr >= 0x80 && input_chr < 0xC0) {
|
||||
/*
|
||||
* Continuation byte. Output an error for an unexpected
|
||||
* continuation byte, if the state is zero.
|
||||
*/
|
||||
if (state->s0 == 0) {
|
||||
emit(emitctx, ERROR);
|
||||
} else {
|
||||
unsigned long charval;
|
||||
unsigned long topstuff;
|
||||
int bytes;
|
||||
|
||||
/*
|
||||
* Otherwise, accumulate more of the character value.
|
||||
*/
|
||||
charval = state->s0 & 0x03ffffffL;
|
||||
charval = (charval << 6) | (input_chr & 0x3F);
|
||||
|
||||
/*
|
||||
* Check the byte counts; if we have not reached the
|
||||
* end of the character, update the state and return.
|
||||
*/
|
||||
topstuff = state->s0 & 0xfc000000L;
|
||||
topstuff += 0x04000000L; /* add one to the byte count */
|
||||
if (((topstuff << 3) ^ topstuff) & 0xe0000000L) {
|
||||
state->s0 = topstuff | charval;
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Now we know we've reached the end of the character.
|
||||
* `charval' is the Unicode value. We should check for
|
||||
* various invalid things, and then either output
|
||||
* charval or an error. In all cases we reset the state
|
||||
* to zero.
|
||||
*/
|
||||
bytes = topstuff >> 29;
|
||||
state->s0 = 0;
|
||||
|
||||
if (charval >= 0xD800 && charval < 0xE000) {
|
||||
/*
|
||||
* Surrogates (0xD800-0xDFFF) may never be encoded
|
||||
* in UTF-8. A surrogate pair in Unicode should
|
||||
* have been encoded as a single UTF-8 character
|
||||
* occupying more than three bytes.
|
||||
*/
|
||||
emit(emitctx, ERROR);
|
||||
} else if (charval == 0xFFFE || charval == 0xFFFF) {
|
||||
/*
|
||||
* U+FFFE and U+FFFF are invalid Unicode characters
|
||||
* and may never be encoded in UTF-8. (This is one
|
||||
* reason why U+FFFF is our way of signalling an
|
||||
* error to our `emit' function :-)
|
||||
*/
|
||||
emit(emitctx, ERROR);
|
||||
} else if ((charval <= 0x7FL /* && bytes > 1 */) ||
|
||||
(charval <= 0x7FFL && bytes > 2) ||
|
||||
(charval <= 0xFFFFL && bytes > 3) ||
|
||||
(charval <= 0x1FFFFFL && bytes > 4) ||
|
||||
(charval <= 0x3FFFFFFL && bytes > 5)) {
|
||||
/*
|
||||
* Overlong sequences are not to be tolerated,
|
||||
* under any circumstances.
|
||||
*/
|
||||
emit(emitctx, ERROR);
|
||||
} else {
|
||||
/*
|
||||
* Oh, all right. We'll let this one off.
|
||||
*/
|
||||
emit(emitctx, charval);
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
/*
|
||||
* Lead byte. First output an error for an incomplete
|
||||
* sequence, if the state is nonzero.
|
||||
*/
|
||||
if (state->s0 != 0)
|
||||
emit(emitctx, ERROR);
|
||||
|
||||
/*
|
||||
* Now deal with the lead byte: work out the number of
|
||||
* bytes we expect to see in this character, and extract
|
||||
* the initial bits of it too.
|
||||
*/
|
||||
if (input_chr >= 0xC0 && input_chr < 0xE0) {
|
||||
state->s0 = 0x44000000L | (input_chr & 0x1F);
|
||||
} else if (input_chr >= 0xE0 && input_chr < 0xF0) {
|
||||
state->s0 = 0x64000000L | (input_chr & 0x0F);
|
||||
} else if (input_chr >= 0xF0 && input_chr < 0xF8) {
|
||||
state->s0 = 0x84000000L | (input_chr & 0x07);
|
||||
} else if (input_chr >= 0xF8 && input_chr < 0xFC) {
|
||||
state->s0 = 0xa4000000L | (input_chr & 0x03);
|
||||
} else if (input_chr >= 0xFC && input_chr < 0xFE) {
|
||||
state->s0 = 0xc4000000L | (input_chr & 0x01);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* UTF-8 is a stateless multi-byte encoding (in the sense that just
|
||||
* after any character has been completed, the state is always the
|
||||
* same); hence when writing it, there is no need to use the
|
||||
* charset_state.
|
||||
*/
|
||||
|
||||
void write_utf8(charset_spec const *charset, long int input_chr,
|
||||
charset_state *state,
|
||||
void (*emit)(void *ctx, long int output), void *emitctx)
|
||||
{
|
||||
UNUSEDARG(charset);
|
||||
UNUSEDARG(state);
|
||||
|
||||
/*
|
||||
* Refuse to output any illegal code points.
|
||||
*/
|
||||
if (input_chr == 0xFFFE || input_chr == 0xFFFF ||
|
||||
(input_chr >= 0xD800 && input_chr < 0xE000)) {
|
||||
emit(emitctx, ERROR);
|
||||
} else if (input_chr < 0x80) { /* one-byte character */
|
||||
emit(emitctx, input_chr);
|
||||
} else if (input_chr < 0x800) { /* two-byte character */
|
||||
emit(emitctx, 0xC0 | (0x1F & (input_chr >> 6)));
|
||||
emit(emitctx, 0x80 | (0x3F & (input_chr )));
|
||||
} else if (input_chr < 0x10000) { /* three-byte character */
|
||||
emit(emitctx, 0xE0 | (0x0F & (input_chr >> 12)));
|
||||
emit(emitctx, 0x80 | (0x3F & (input_chr >> 6)));
|
||||
emit(emitctx, 0x80 | (0x3F & (input_chr )));
|
||||
} else if (input_chr < 0x200000) { /* four-byte character */
|
||||
emit(emitctx, 0xF0 | (0x07 & (input_chr >> 18)));
|
||||
emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
|
||||
emit(emitctx, 0x80 | (0x3F & (input_chr >> 6)));
|
||||
emit(emitctx, 0x80 | (0x3F & (input_chr )));
|
||||
} else if (input_chr < 0x4000000) {/* five-byte character */
|
||||
emit(emitctx, 0xF8 | (0x03 & (input_chr >> 24)));
|
||||
emit(emitctx, 0x80 | (0x3F & (input_chr >> 18)));
|
||||
emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
|
||||
emit(emitctx, 0x80 | (0x3F & (input_chr >> 6)));
|
||||
emit(emitctx, 0x80 | (0x3F & (input_chr )));
|
||||
} else { /* six-byte character */
|
||||
emit(emitctx, 0xFC | (0x01 & (input_chr >> 30)));
|
||||
emit(emitctx, 0x80 | (0x3F & (input_chr >> 24)));
|
||||
emit(emitctx, 0x80 | (0x3F & (input_chr >> 18)));
|
||||
emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
|
||||
emit(emitctx, 0x80 | (0x3F & (input_chr >> 6)));
|
||||
emit(emitctx, 0x80 | (0x3F & (input_chr )));
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef TESTMODE
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdarg.h>
|
||||
|
||||
int total_errs = 0;
|
||||
|
||||
/*
 * Emit callback used by the self-tests. `ctx' points at a wchar_t
 * write cursor; the value is stored at the cursor and the cursor is
 * advanced by one element. No bounds checking is performed.
 */
void utf8_emit(void *ctx, long output)
{
    wchar_t **cursor = (wchar_t **)ctx;
    wchar_t *slot = *cursor;

    *slot = output;
    *cursor = slot + 1;
}
|
||||
|
||||
void utf8_read_test(int line, char *input, int inlen, ...)
|
||||
{
|
||||
va_list ap;
|
||||
wchar_t *p, str[512];
|
||||
int i;
|
||||
charset_state state;
|
||||
unsigned long l;
|
||||
|
||||
state.s0 = 0;
|
||||
p = str;
|
||||
|
||||
for (i = 0; i < inlen; i++)
|
||||
read_utf8(NULL, input[i] & 0xFF, &state, utf8_emit, &p);
|
||||
|
||||
va_start(ap, inlen);
|
||||
l = 0;
|
||||
for (i = 0; i < p - str; i++) {
|
||||
l = va_arg(ap, long int);
|
||||
if (l == -1) {
|
||||
printf("%d: correct string shorter than output\n", line);
|
||||
total_errs++;
|
||||
break;
|
||||
}
|
||||
if (l != str[i]) {
|
||||
printf("%d: char %d came out as %08x, should be %08x\n",
|
||||
line, i, str[i], l);
|
||||
total_errs++;
|
||||
}
|
||||
}
|
||||
if (l != -1) {
|
||||
l = va_arg(ap, long int);
|
||||
if (l != -1) {
|
||||
printf("%d: correct string longer than output\n", line);
|
||||
total_errs++;
|
||||
}
|
||||
}
|
||||
va_end(ap);
|
||||
}
|
||||
|
||||
void utf8_write_test(int line, const long *input, int inlen, ...)
|
||||
{
|
||||
va_list ap;
|
||||
wchar_t *p, str[512];
|
||||
int i;
|
||||
charset_state state;
|
||||
unsigned long l;
|
||||
|
||||
state.s0 = 0;
|
||||
p = str;
|
||||
|
||||
for (i = 0; i < inlen; i++)
|
||||
write_utf8(NULL, input[i], &state, utf8_emit, &p);
|
||||
|
||||
va_start(ap, inlen);
|
||||
l = 0;
|
||||
for (i = 0; i < p - str; i++) {
|
||||
l = va_arg(ap, long int);
|
||||
if (l == -1) {
|
||||
printf("%d: correct string shorter than output\n", line);
|
||||
total_errs++;
|
||||
break;
|
||||
}
|
||||
if (l != str[i]) {
|
||||
printf("%d: char %d came out as %08x, should be %08x\n",
|
||||
line, i, str[i], l);
|
||||
total_errs++;
|
||||
}
|
||||
}
|
||||
if (l != -1) {
|
||||
l = va_arg(ap, long int);
|
||||
if (l != -1) {
|
||||
printf("%d: correct string longer than output\n", line);
|
||||
total_errs++;
|
||||
}
|
||||
}
|
||||
va_end(ap);
|
||||
}
|
||||
|
||||
/* Macro to concoct the first three parameters of utf8_read_test and utf8_write_test. */
|
||||
#define TESTSTR(x) __LINE__, x, lenof(x)
|
||||
|
||||
int main(void)
|
||||
{
|
||||
printf("read tests beginning\n");
|
||||
utf8_read_test(TESTSTR("\xCE\xBA\xE1\xBD\xB9\xCF\x83\xCE\xBC\xCE\xB5"),
|
||||
0x000003BA, /* GREEK SMALL LETTER KAPPA */
|
||||
0x00001F79, /* GREEK SMALL LETTER OMICRON WITH OXIA */
|
||||
0x000003C3, /* GREEK SMALL LETTER SIGMA */
|
||||
0x000003BC, /* GREEK SMALL LETTER MU */
|
||||
0x000003B5, /* GREEK SMALL LETTER EPSILON */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\x00"),
|
||||
0x00000000, /* <control> */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xC2\x80"),
|
||||
0x00000080, /* <control> */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xE0\xA0\x80"),
|
||||
0x00000800, /* <no name available> */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xF0\x90\x80\x80"),
|
||||
0x00010000, /* <no name available> */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xF8\x88\x80\x80\x80"),
|
||||
0x00200000, /* <no name available> */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xFC\x84\x80\x80\x80\x80"),
|
||||
0x04000000, /* <no name available> */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\x7F"),
|
||||
0x0000007F, /* <control> */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xDF\xBF"),
|
||||
0x000007FF, /* <no name available> */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xEF\xBF\xBD"),
|
||||
0x0000FFFD, /* REPLACEMENT CHARACTER */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xEF\xBF\xBF"),
|
||||
ERROR, /* <no name available> (invalid char) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xF7\xBF\xBF\xBF"),
|
||||
0x001FFFFF, /* <no name available> */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xFB\xBF\xBF\xBF\xBF"),
|
||||
0x03FFFFFF, /* <no name available> */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xFD\xBF\xBF\xBF\xBF\xBF"),
|
||||
0x7FFFFFFF, /* <no name available> */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xED\x9F\xBF"),
|
||||
0x0000D7FF, /* <no name available> */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xEE\x80\x80"),
|
||||
0x0000E000, /* <Private Use, First> */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xEF\xBF\xBD"),
|
||||
0x0000FFFD, /* REPLACEMENT CHARACTER */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xF4\x8F\xBF\xBF"),
|
||||
0x0010FFFF, /* <no name available> */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xF4\x90\x80\x80"),
|
||||
0x00110000, /* <no name available> */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\x80"),
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xBF"),
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\x80\xBF"),
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\x80\xBF\x80"),
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\x80\xBF\x80\xBF"),
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80"),
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80\xBF"),
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80\xBF\x80"),
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF"),
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xC0\x20\xC1\x20\xC2\x20\xC3\x20\xC4\x20\xC5\x20\xC6\x20\xC7\x20"),
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xE0\x20\xE1\x20\xE2\x20\xE3\x20\xE4\x20\xE5\x20\xE6\x20\xE7\x20\xE8\x20\xE9\x20\xEA\x20\xEB\x20\xEC\x20\xED\x20\xEE\x20\xEF\x20"),
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xF0\x20\xF1\x20\xF2\x20\xF3\x20\xF4\x20\xF5\x20\xF6\x20\xF7\x20"),
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xF8\x20\xF9\x20\xFA\x20\xFB\x20"),
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xFC\x20\xFD\x20"),
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xC0"),
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xE0\x80"),
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xF0\x80\x80"),
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xF8\x80\x80\x80"),
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80"),
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xDF"),
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xEF\xBF"),
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xF7\xBF\xBF"),
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xFB\xBF\xBF\xBF"),
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xFD\xBF\xBF\xBF\xBF"),
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xC0\xE0\x80\xF0\x80\x80\xF8\x80\x80\x80\xFC\x80\x80\x80\x80\xDF\xEF\xBF\xF7\xBF\xBF\xFB\xBF\xBF\xBF\xFD\xBF\xBF\xBF\xBF"),
|
||||
ERROR, /* (incomplete sequence) */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xFE"),
|
||||
ERROR, /* (invalid UTF-8 byte) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xFF"),
|
||||
ERROR, /* (invalid UTF-8 byte) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xFE\xFE\xFF\xFF"),
|
||||
ERROR, /* (invalid UTF-8 byte) */
|
||||
ERROR, /* (invalid UTF-8 byte) */
|
||||
ERROR, /* (invalid UTF-8 byte) */
|
||||
ERROR, /* (invalid UTF-8 byte) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xC0\xAF"),
|
||||
ERROR, /* SOLIDUS (overlong form of 2F) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xE0\x80\xAF"),
|
||||
ERROR, /* SOLIDUS (overlong form of 2F) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xF0\x80\x80\xAF"),
|
||||
ERROR, /* SOLIDUS (overlong form of 2F) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xF8\x80\x80\x80\xAF"),
|
||||
ERROR, /* SOLIDUS (overlong form of 2F) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80\xAF"),
|
||||
ERROR, /* SOLIDUS (overlong form of 2F) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xC1\xBF"),
|
||||
ERROR, /* <control> (overlong form of 7F) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xE0\x9F\xBF"),
|
||||
ERROR, /* <no name available> (overlong form of DF BF) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xF0\x8F\xBF\xBF"),
|
||||
ERROR, /* <no name available> (overlong form of EF BF BF) (invalid char) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xF8\x87\xBF\xBF\xBF"),
|
||||
ERROR, /* <no name available> (overlong form of F7 BF BF BF) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xFC\x83\xBF\xBF\xBF\xBF"),
|
||||
ERROR, /* <no name available> (overlong form of FB BF BF BF BF) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xC0\x80"),
|
||||
ERROR, /* <control> (overlong form of 00) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xE0\x80\x80"),
|
||||
ERROR, /* <control> (overlong form of 00) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xF0\x80\x80\x80"),
|
||||
ERROR, /* <control> (overlong form of 00) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xF8\x80\x80\x80\x80"),
|
||||
ERROR, /* <control> (overlong form of 00) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80\x80"),
|
||||
ERROR, /* <control> (overlong form of 00) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xED\xA0\x80"),
|
||||
ERROR, /* <Non Private Use High Surrogate, First> (surrogate) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xED\xAD\xBF"),
|
||||
ERROR, /* <Non Private Use High Surrogate, Last> (surrogate) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xED\xAE\x80"),
|
||||
ERROR, /* <Private Use High Surrogate, First> (surrogate) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xED\xAF\xBF"),
|
||||
ERROR, /* <Private Use High Surrogate, Last> (surrogate) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xED\xB0\x80"),
|
||||
ERROR, /* <Low Surrogate, First> (surrogate) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xED\xBE\x80"),
|
||||
ERROR, /* <no name available> (surrogate) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xED\xBF\xBF"),
|
||||
ERROR, /* <Low Surrogate, Last> (surrogate) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xED\xA0\x80\xED\xB0\x80"),
|
||||
ERROR, /* <Non Private Use High Surrogate, First> (surrogate) */
|
||||
ERROR, /* <Low Surrogate, First> (surrogate) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xED\xA0\x80\xED\xBF\xBF"),
|
||||
ERROR, /* <Non Private Use High Surrogate, First> (surrogate) */
|
||||
ERROR, /* <Low Surrogate, Last> (surrogate) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xED\xAD\xBF\xED\xB0\x80"),
|
||||
ERROR, /* <Non Private Use High Surrogate, Last> (surrogate) */
|
||||
ERROR, /* <Low Surrogate, First> (surrogate) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xED\xAD\xBF\xED\xBF\xBF"),
|
||||
ERROR, /* <Non Private Use High Surrogate, Last> (surrogate) */
|
||||
ERROR, /* <Low Surrogate, Last> (surrogate) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xED\xAE\x80\xED\xB0\x80"),
|
||||
ERROR, /* <Private Use High Surrogate, First> (surrogate) */
|
||||
ERROR, /* <Low Surrogate, First> (surrogate) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xED\xAE\x80\xED\xBF\xBF"),
|
||||
ERROR, /* <Private Use High Surrogate, First> (surrogate) */
|
||||
ERROR, /* <Low Surrogate, Last> (surrogate) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xED\xAF\xBF\xED\xB0\x80"),
|
||||
ERROR, /* <Private Use High Surrogate, Last> (surrogate) */
|
||||
ERROR, /* <Low Surrogate, First> (surrogate) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xED\xAF\xBF\xED\xBF\xBF"),
|
||||
ERROR, /* <Private Use High Surrogate, Last> (surrogate) */
|
||||
ERROR, /* <Low Surrogate, Last> (surrogate) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xEF\xBF\xBE"),
|
||||
ERROR, /* <no name available> (invalid char) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xEF\xBF\xBF"),
|
||||
ERROR, /* <no name available> (invalid char) */
|
||||
0, -1);
|
||||
printf("read tests completed\n");
|
||||
printf("write tests beginning\n");
|
||||
{
|
||||
const static long str[] =
|
||||
{0x03BAL, 0x1F79L, 0x03C3L, 0x03BCL, 0x03B5L, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
0xCE, 0xBA,
|
||||
0xE1, 0xBD, 0xB9,
|
||||
0xCF, 0x83,
|
||||
0xCE, 0xBC,
|
||||
0xCE, 0xB5,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0x0000L, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
0x00,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0x0080L, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
0xC2, 0x80,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0x0800L, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
0xE0, 0xA0, 0x80,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0x00010000L, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
0xF0, 0x90, 0x80, 0x80,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0x00200000L, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
0xF8, 0x88, 0x80, 0x80, 0x80,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0x04000000L, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
0xFC, 0x84, 0x80, 0x80, 0x80, 0x80,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0x007FL, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
0x7F,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0x07FFL, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
0xDF, 0xBF,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0xFFFDL, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
0xEF, 0xBF, 0xBD,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0xFFFFL, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
ERROR,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0x001FFFFFL, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
0xF7, 0xBF, 0xBF, 0xBF,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0x03FFFFFFL, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
0xFB, 0xBF, 0xBF, 0xBF, 0xBF,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0x7FFFFFFFL, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0xD7FFL, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
0xED, 0x9F, 0xBF,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0xD800L, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
ERROR,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0xD800L, 0xDC00L, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
ERROR,
|
||||
ERROR,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0xDFFFL, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
ERROR,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0xE000L, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
0xEE, 0x80, 0x80,
|
||||
0, -1);
|
||||
}
|
||||
printf("write tests completed\n");
|
||||
|
||||
printf("total: %d errors\n", total_errs);
|
||||
return (total_errs != 0);
|
||||
}
|
||||
#endif /* TESTMODE */
|
||||
|
||||
/*
 * Charset descriptor for UTF-8: associates the CS_UTF8 identifier
 * with the read_utf8 and write_utf8 conversion functions.
 * NOTE(review): the trailing NULL fills the fourth charset_spec
 * field, whose meaning is declared outside this file -- presumably
 * "no per-charset data"; confirm against the struct definition.
 */
const charset_spec charset_CS_UTF8 = {
|
||||
CS_UTF8, read_utf8, write_utf8, NULL
|
||||
};
|
||||
|
||||
#else /* ENUM_CHARSETS */
|
||||
|
||||
ENUM_CHARSET(CS_UTF8)
|
||||
|
||||
#endif /* ENUM_CHARSETS */
|
92
charset/xenc.c
Normal file
92
charset/xenc.c
Normal file
@ -0,0 +1,92 @@
|
||||
/*
|
||||
* xenc.c - translate our internal character set codes to and from
|
||||
* X11 character encoding names.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <ctype.h>
|
||||
#include "charset.h"
|
||||
#include "internal.h"
|
||||
|
||||
static const struct {
|
||||
const char *name;
|
||||
int charset;
|
||||
} xencs[] = {
|
||||
/*
|
||||
* Officially registered encoding names. This list is derived
|
||||
* from the font encodings section of
|
||||
*
|
||||
* http://ftp.x.org/pub/DOCS/registry
|
||||
*
|
||||
* Where multiple encoding names map to the same encoding id
|
||||
* (such as iso8859-15 and fcd8859-15), the first is considered
|
||||
* canonical and will be returned when translating the id to a
|
||||
* string.
|
||||
*/
|
||||
{ "iso8859-1", CS_ISO8859_1 },
|
||||
{ "iso8859-2", CS_ISO8859_2 },
|
||||
{ "iso8859-3", CS_ISO8859_3 },
|
||||
{ "iso8859-4", CS_ISO8859_4 },
|
||||
{ "iso8859-5", CS_ISO8859_5 },
|
||||
{ "iso8859-6", CS_ISO8859_6 },
|
||||
{ "iso8859-7", CS_ISO8859_7 },
|
||||
{ "iso8859-8", CS_ISO8859_8 },
|
||||
{ "iso8859-9", CS_ISO8859_9 },
|
||||
{ "iso8859-10", CS_ISO8859_10 },
|
||||
{ "iso8859-13", CS_ISO8859_13 },
|
||||
{ "iso8859-14", CS_ISO8859_14 },
|
||||
{ "iso8859-15", CS_ISO8859_15 },
|
||||
{ "fcd8859-15", CS_ISO8859_15 },
|
||||
{ "hp-roman8", CS_HP_ROMAN8 },
|
||||
{ "koi8-r", CS_KOI8_R },
|
||||
/*
|
||||
* Unofficial encoding names found in the wild.
|
||||
*/
|
||||
{ "iso8859-16", CS_ISO8859_16 },
|
||||
{ "koi8-u", CS_KOI8_U },
|
||||
{ "ibm-cp437", CS_CP437 },
|
||||
{ "ibm-cp850", CS_CP850 },
|
||||
{ "microsoft-cp1250", CS_CP1250 },
|
||||
{ "microsoft-cp1251", CS_CP1251 },
|
||||
{ "microsoft-cp1252", CS_CP1252 },
|
||||
{ "microsoft-cp1253", CS_CP1253 },
|
||||
{ "microsoft-cp1254", CS_CP1254 },
|
||||
{ "microsoft-cp1255", CS_CP1255 },
|
||||
{ "microsoft-cp1256", CS_CP1256 },
|
||||
{ "microsoft-cp1257", CS_CP1257 },
|
||||
{ "microsoft-cp1258", CS_CP1258 },
|
||||
{ "mac-roman", CS_MAC_ROMAN },
|
||||
{ "viscii1.1-1", CS_VISCII },
|
||||
{ "viscii1-1", CS_VISCII },
|
||||
};
|
||||
|
||||
const char *charset_to_xenc(int charset)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < (int)lenof(xencs); i++)
|
||||
if (charset == xencs[i].charset)
|
||||
return xencs[i].name;
|
||||
|
||||
return NULL; /* not found */
|
||||
}
|
||||
|
||||
int charset_from_xenc(const char *name)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < (int)lenof(xencs); i++) {
|
||||
const char *p, *q;
|
||||
p = name;
|
||||
q = xencs[i].name;
|
||||
while (*p || *q) {
|
||||
if (tolower(*p) != tolower(*q))
|
||||
break;
|
||||
p++; q++;
|
||||
}
|
||||
if (!*p && !*q)
|
||||
return xencs[i].charset;
|
||||
}
|
||||
|
||||
return CS_NONE; /* not found */
|
||||
}
|
@ -11,7 +11,12 @@ use FileHandle;
|
||||
|
||||
open IN, "Recipe" or die "unable to open Recipe file\n";
|
||||
|
||||
@incdirs = ("", "unix/", "mac/");
|
||||
# HACK: One of the source files in `charset' is auto-generated by
|
||||
# sbcsgen.pl. We need to generate that _now_, before attempting
|
||||
# dependency analysis.
|
||||
eval 'chdir "charset"; require "sbcsgen.pl"; chdir ".."';
|
||||
|
||||
@incdirs = ("", "charset/", "unix/", "mac/");
|
||||
|
||||
$help = ""; # list of newline-free lines of help text
|
||||
%programs = (); # maps prog name + type letter to listref of objects/resources
|
||||
@ -534,7 +539,7 @@ print
|
||||
"# TOOLPATH = /opt/gcc/bin\n".
|
||||
"CC = \$(TOOLPATH)cc\n".
|
||||
"\n".
|
||||
&splitline("CFLAGS = -Wall -g -I. -I.. `gtk-config --cflags`")."\n".
|
||||
&splitline("CFLAGS = -Wall -g -I. -I.. -I../charset `gtk-config --cflags`")."\n".
|
||||
"XLDFLAGS = `gtk-config --libs`\n".
|
||||
"ULDFLAGS =#\n".
|
||||
"INSTALL=install\n",
|
||||
|
22
unix/pterm.1
22
unix/pterm.1
@ -90,6 +90,20 @@ to specify it explicitly if you have changed the default using the
|
||||
.IP "\fB\-log\fP \fIfilename\fP"
|
||||
This option makes \fIpterm\fP log all the terminal output to a file
|
||||
as well as displaying it in the terminal.
|
||||
.IP "\fB\-cs\fP \fIcharset\fP"
|
||||
This option specifies the character set in which \fIpterm\fP should
|
||||
assume the session is operating. This character set will be used to
|
||||
interpret all the data received from the session, and all input you
|
||||
type or paste into \fIpterm\fP will be converted into this character
|
||||
set before being sent to the session.
|
||||
|
||||
Any character set name which is valid in a MIME header (and
|
||||
supported by \fIpterm\fP) should be valid here (examples are
|
||||
"ISO-8859-1", "windows-1252" or "UTF-8"). Also, any character
|
||||
encoding which is valid in an X logical font description should be
|
||||
valid ("ibm-cp437", for example).
|
||||
|
||||
Character set names are case-insensitive.
|
||||
.IP "\fB\-nethack\fP"
|
||||
Tells \fIpterm\fP to enable NetHack keypad mode, in which the
|
||||
numeric keypad generates the NetHack "hjklyubn" direction keys. This
|
||||
@ -385,6 +399,14 @@ reset to the very bottom.
|
||||
This option should be set to either 0 or 1; the default is 1. When
|
||||
set to 1, any activity in the display causes the position of the
|
||||
scrollback to be reset to the very bottom.
|
||||
.IP "\fBpterm.LineCodePage\fP"
|
||||
This option specifies the character set to be used for the session.
|
||||
This is the same as the \fB\-cs\fP command-line option.
|
||||
.IP "\fBpterm.NoRemoteCharset\fP"
|
||||
This option disables the terminal's ability to change its character
|
||||
set when it receives escape sequences telling it to. You might need
|
||||
to do this to interoperate with programs which incorrectly change
|
||||
the character set to something they think is sensible.
|
||||
.IP "\fBpterm.BCE\fP"
|
||||
This option should be set to either 0 or 1; the default is 1. When
|
||||
set to 1, the various control sequences that erase parts of the
|
||||
|
206
unix/pterm.c
206
unix/pterm.c
@ -24,6 +24,7 @@
|
||||
#include <X11/Xutil.h>
|
||||
|
||||
#define PUTTY_DO_GLOBALS /* actually _define_ globals */
|
||||
|
||||
#include "putty.h"
|
||||
#include "terminal.h"
|
||||
|
||||
@ -39,18 +40,22 @@ struct gui_data {
|
||||
GtkAdjustment *sbar_adjust;
|
||||
GdkPixmap *pixmap;
|
||||
GdkFont *fonts[2]; /* normal and bold (for now!) */
|
||||
struct {
|
||||
int charset;
|
||||
int is_wide;
|
||||
} fontinfo[2];
|
||||
GdkCursor *rawcursor, *textcursor, *blankcursor, *currcursor;
|
||||
GdkColor cols[NCOLOURS];
|
||||
GdkColormap *colmap;
|
||||
wchar_t *pastein_data;
|
||||
int pastein_data_len;
|
||||
char *pasteout_data;
|
||||
int pasteout_data_len;
|
||||
char *pasteout_data, *pasteout_data_utf8;
|
||||
int pasteout_data_len, pasteout_data_utf8_len;
|
||||
int font_width, font_height;
|
||||
int ignore_sbar;
|
||||
int mouseptr_visible;
|
||||
guint term_paste_idle_id;
|
||||
GdkAtom compound_text_atom;
|
||||
GdkAtom compound_text_atom, utf8_string_atom;
|
||||
int alt_keycode;
|
||||
int alt_digits;
|
||||
char wintitle[sizeof(((Config *)0)->wintitle)];
|
||||
@ -831,7 +836,19 @@ gint key_event(GtkWidget *widget, GdkEventKey *event, gpointer data)
|
||||
printf("\n");
|
||||
#endif
|
||||
|
||||
ldisc_send(inst->ldisc, output+start, end-start, 1);
|
||||
/*
|
||||
* The stuff we've just generated is assumed to be
|
||||
* ISO-8859-1! This sounds insane, but `man XLookupString'
|
||||
* agrees: strings of this type returned from the X server
|
||||
* are hardcoded to 8859-1. Strictly speaking we should be
|
||||
* doing this using some sort of GtkIMContext, which (if
|
||||
* we're lucky) would give us our data directly in Unicode;
|
||||
* but that's not supported in GTK 1.2 as far as I can
|
||||
* tell, and it's poorly documented even in 2.0, so it'll
|
||||
* have to wait.
|
||||
*/
|
||||
lpage_send(inst->ldisc, CS_ISO8859_1, output+start, end-start, 1);
|
||||
|
||||
show_mouseptr(inst, 0);
|
||||
term_seen_key_event(inst->term);
|
||||
term_out(inst->term);
|
||||
@ -1198,9 +1215,26 @@ void write_clip(void *frontend, wchar_t * data, int len, int must_deselect)
|
||||
struct gui_data *inst = (struct gui_data *)frontend;
|
||||
if (inst->pasteout_data)
|
||||
sfree(inst->pasteout_data);
|
||||
if (inst->pasteout_data_utf8)
|
||||
sfree(inst->pasteout_data_utf8);
|
||||
|
||||
inst->pasteout_data_utf8 = smalloc(len*6);
|
||||
inst->pasteout_data_utf8_len = len*6;
|
||||
{
|
||||
wchar_t *tmp = data;
|
||||
int tmplen = len;
|
||||
inst->pasteout_data_utf8_len =
|
||||
charset_from_unicode(&tmp, &tmplen, inst->pasteout_data_utf8,
|
||||
inst->pasteout_data_utf8_len,
|
||||
CS_UTF8, NULL, NULL, 0);
|
||||
inst->pasteout_data_utf8 =
|
||||
srealloc(inst->pasteout_data_utf8, inst->pasteout_data_utf8_len);
|
||||
}
|
||||
|
||||
inst->pasteout_data = smalloc(len);
|
||||
inst->pasteout_data_len = len;
|
||||
wc_to_mb(0, 0, data, len, inst->pasteout_data, inst->pasteout_data_len,
|
||||
wc_to_mb(line_codepage, 0, data, len,
|
||||
inst->pasteout_data, inst->pasteout_data_len,
|
||||
NULL, NULL);
|
||||
|
||||
if (gtk_selection_owner_set(inst->area, GDK_SELECTION_PRIMARY,
|
||||
@ -1209,6 +1243,8 @@ void write_clip(void *frontend, wchar_t * data, int len, int must_deselect)
|
||||
GDK_SELECTION_TYPE_STRING, 1);
|
||||
gtk_selection_add_target(inst->area, GDK_SELECTION_PRIMARY,
|
||||
inst->compound_text_atom, 1);
|
||||
gtk_selection_add_target(inst->area, GDK_SELECTION_PRIMARY,
|
||||
inst->utf8_string_atom, 1);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1216,8 +1252,13 @@ void selection_get(GtkWidget *widget, GtkSelectionData *seldata,
|
||||
guint info, guint time_stamp, gpointer data)
|
||||
{
|
||||
struct gui_data *inst = (struct gui_data *)data;
|
||||
gtk_selection_data_set(seldata, GDK_SELECTION_TYPE_STRING, 8,
|
||||
inst->pasteout_data, inst->pasteout_data_len);
|
||||
if (seldata->target == inst->utf8_string_atom)
|
||||
gtk_selection_data_set(seldata, seldata->target, 8,
|
||||
inst->pasteout_data_utf8,
|
||||
inst->pasteout_data_utf8_len);
|
||||
else
|
||||
gtk_selection_data_set(seldata, seldata->target, 8,
|
||||
inst->pasteout_data, inst->pasteout_data_len);
|
||||
}
|
||||
|
||||
gint selection_clear(GtkWidget *widget, GdkEventSelection *seldata,
|
||||
@ -1227,8 +1268,12 @@ gint selection_clear(GtkWidget *widget, GdkEventSelection *seldata,
|
||||
term_deselect(inst->term);
|
||||
if (inst->pasteout_data)
|
||||
sfree(inst->pasteout_data);
|
||||
if (inst->pasteout_data_utf8)
|
||||
sfree(inst->pasteout_data_utf8);
|
||||
inst->pasteout_data = NULL;
|
||||
inst->pasteout_data_len = 0;
|
||||
inst->pasteout_data_utf8 = NULL;
|
||||
inst->pasteout_data_utf8_len = 0;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
@ -1240,8 +1285,16 @@ void request_paste(void *frontend)
|
||||
* moment is to call gtk_selection_convert(), and when the data
|
||||
* comes back _then_ we can call term_do_paste().
|
||||
*/
|
||||
|
||||
/*
|
||||
* First we attempt to retrieve the selection as a UTF-8 string
|
||||
* (which we will convert to the correct code page before
|
||||
* sending to the session, of course). If that fails,
|
||||
* selection_received() will be informed and will fall back to
|
||||
* an ordinary string.
|
||||
*/
|
||||
gtk_selection_convert(inst->area, GDK_SELECTION_PRIMARY,
|
||||
GDK_SELECTION_TYPE_STRING, GDK_CURRENT_TIME);
|
||||
inst->utf8_string_atom, GDK_CURRENT_TIME);
|
||||
}
|
||||
|
||||
gint idle_paste_func(gpointer data); /* forward ref */
|
||||
@ -1251,8 +1304,22 @@ void selection_received(GtkWidget *widget, GtkSelectionData *seldata,
|
||||
{
|
||||
struct gui_data *inst = (struct gui_data *)data;
|
||||
|
||||
if (seldata->target == inst->utf8_string_atom && seldata->length <= 0) {
|
||||
/*
|
||||
* Failed to get a UTF-8 selection string. Try an ordinary
|
||||
* string.
|
||||
*/
|
||||
gtk_selection_convert(inst->area, GDK_SELECTION_PRIMARY,
|
||||
GDK_SELECTION_TYPE_STRING, GDK_CURRENT_TIME);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Any other failure should just go foom.
|
||||
*/
|
||||
if (seldata->length <= 0 ||
|
||||
seldata->type != GDK_SELECTION_TYPE_STRING)
|
||||
(seldata->type != GDK_SELECTION_TYPE_STRING &&
|
||||
seldata->type != inst->utf8_string_atom))
|
||||
return; /* Nothing happens. */
|
||||
|
||||
if (inst->pastein_data)
|
||||
@ -1260,8 +1327,11 @@ void selection_received(GtkWidget *widget, GtkSelectionData *seldata,
|
||||
|
||||
inst->pastein_data = smalloc(seldata->length * sizeof(wchar_t));
|
||||
inst->pastein_data_len = seldata->length;
|
||||
mb_to_wc(0, 0, seldata->data, seldata->length,
|
||||
inst->pastein_data, inst->pastein_data_len);
|
||||
inst->pastein_data_len =
|
||||
mb_to_wc((seldata->type == inst->utf8_string_atom ?
|
||||
CS_UTF8 : line_codepage),
|
||||
0, seldata->data, seldata->length,
|
||||
inst->pastein_data, inst->pastein_data_len);
|
||||
|
||||
term_do_paste(inst->term);
|
||||
|
||||
@ -1457,10 +1527,45 @@ void do_text_internal(Context ctx, int x, int y, char *text, int len,
|
||||
rlen*inst->font_width, inst->font_height);
|
||||
|
||||
gdk_gc_set_foreground(gc, &inst->cols[nfg]);
|
||||
gdk_draw_text(inst->pixmap, inst->fonts[fontid], gc,
|
||||
x*inst->font_width+cfg.window_border,
|
||||
y*inst->font_height+cfg.window_border+inst->fonts[0]->ascent,
|
||||
text, len);
|
||||
{
|
||||
GdkWChar *gwcs;
|
||||
gchar *gcs;
|
||||
wchar_t *wcs;
|
||||
int i;
|
||||
|
||||
wcs = smalloc(sizeof(wchar_t) * (len+1));
|
||||
for (i = 0; i < len; i++) {
|
||||
wcs[i] = (wchar_t) ((attr & CSET_MASK) + (text[i] & CHAR_MASK));
|
||||
}
|
||||
|
||||
if (inst->fontinfo[fontid].is_wide) {
|
||||
gwcs = smalloc(sizeof(GdkWChar) * (len+1));
|
||||
/*
|
||||
* FIXME: when we have a wide-char equivalent of
|
||||
* from_unicode, use it instead of this.
|
||||
*/
|
||||
for (i = 0; i <= len; i++)
|
||||
gwcs[i] = wcs[i];
|
||||
gdk_draw_text_wc(inst->pixmap, inst->fonts[fontid], gc,
|
||||
x*inst->font_width+cfg.window_border,
|
||||
y*inst->font_height+cfg.window_border+inst->fonts[0]->ascent,
|
||||
gwcs, len*2);
|
||||
sfree(gwcs);
|
||||
} else {
|
||||
wchar_t *wcstmp = wcs;
|
||||
int lentmp = len;
|
||||
gcs = smalloc(sizeof(GdkWChar) * (len+1));
|
||||
charset_from_unicode(&wcstmp, &lentmp, gcs, len,
|
||||
inst->fontinfo[fontid].charset,
|
||||
NULL, ".", 1);
|
||||
gdk_draw_text(inst->pixmap, inst->fonts[fontid], gc,
|
||||
x*inst->font_width+cfg.window_border,
|
||||
y*inst->font_height+cfg.window_border+inst->fonts[0]->ascent,
|
||||
gcs, len);
|
||||
sfree(gcs);
|
||||
}
|
||||
sfree(wcs);
|
||||
}
|
||||
|
||||
if (shadow) {
|
||||
gdk_draw_text(inst->pixmap, inst->fonts[fontid], gc,
|
||||
@ -1818,6 +1923,12 @@ int do_cmdline(int argc, char **argv, int do_everything)
|
||||
strncpy(cfg.boldfont, val, sizeof(cfg.boldfont));
|
||||
cfg.boldfont[sizeof(cfg.boldfont)-1] = '\0';
|
||||
|
||||
} else if (!strcmp(p, "-cs")) {
|
||||
EXPECTS_ARG;
|
||||
SECOND_PASS_ONLY;
|
||||
strncpy(cfg.line_codepage, val, sizeof(cfg.line_codepage));
|
||||
cfg.line_codepage[sizeof(cfg.line_codepage)-1] = '\0';
|
||||
|
||||
} else if (!strcmp(p, "-geometry")) {
|
||||
int flags, x, y, w, h;
|
||||
EXPECTS_ARG;
|
||||
@ -1955,6 +2066,68 @@ static void block_signal(int sig, int block_it) {
|
||||
}
|
||||
}
|
||||
|
||||
static void set_font_info(struct gui_data *inst, int fontid)
|
||||
{
|
||||
GdkFont *font = inst->fonts[fontid];
|
||||
XFontStruct *xfs = GDK_FONT_XFONT(font);
|
||||
Display *disp = GDK_FONT_XDISPLAY(font);
|
||||
Atom charset_registry, charset_encoding;
|
||||
unsigned long registry_ret, encoding_ret;
|
||||
charset_registry = XInternAtom(disp, "CHARSET_REGISTRY", False);
|
||||
charset_encoding = XInternAtom(disp, "CHARSET_ENCODING", False);
|
||||
inst->fontinfo[fontid].charset = CS_NONE;
|
||||
inst->fontinfo[fontid].is_wide = 0;
|
||||
if (XGetFontProperty(xfs, charset_registry, &registry_ret) &&
|
||||
XGetFontProperty(xfs, charset_encoding, &encoding_ret)) {
|
||||
char *reg, *enc;
|
||||
reg = XGetAtomName(disp, (Atom)registry_ret);
|
||||
enc = XGetAtomName(disp, (Atom)encoding_ret);
|
||||
if (reg && enc) {
|
||||
char *encoding = dupcat(reg, "-", enc, NULL);
|
||||
inst->fontinfo[fontid].charset = charset_from_xenc(encoding);
|
||||
/* FIXME: when libcharset supports wide encodings fix this. */
|
||||
if (!strcasecmp(encoding, "iso10646-1"))
|
||||
inst->fontinfo[fontid].is_wide = 1;
|
||||
|
||||
/*
|
||||
* Hack for X line-drawing characters: if the primary
|
||||
* font is encoded as ISO-8859-anything, and has valid
|
||||
* glyphs in the first 32 char positions, it is assumed
|
||||
* that those glyphs are the VT100 line-drawing
|
||||
* character set.
|
||||
*
|
||||
* Actually, we'll hack even harder by only checking
|
||||
* position 0x19 (vertical line, VT100 linedrawing
|
||||
* `x'). Then we can check it easily by seeing if the
|
||||
* ascent and descent differ.
|
||||
*/
|
||||
if (inst->fontinfo[fontid].charset == CS_ISO8859_1) {
|
||||
int lb, rb, wid, asc, desc;
|
||||
gchar text[2];
|
||||
|
||||
text[1] = '\0';
|
||||
text[0] = '\x12';
|
||||
gdk_string_extents(inst->fonts[fontid], text,
|
||||
&lb, &rb, &wid, &asc, &desc);
|
||||
if (asc != desc)
|
||||
inst->fontinfo[fontid].charset = CS_ISO8859_1_X11;
|
||||
}
|
||||
|
||||
/*
|
||||
* FIXME: this is a hack. Currently fonts with
|
||||
* incomprehensible encodings are dealt with by
|
||||
* pretending they're 8859-1. It's ugly, but it's good
|
||||
* enough to stop things crashing. Should do something
|
||||
* better here.
|
||||
*/
|
||||
if (inst->fontinfo[fontid].charset == CS_NONE)
|
||||
inst->fontinfo[fontid].charset = CS_ISO8859_1;
|
||||
|
||||
sfree(encoding);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
extern int pty_master_fd; /* declared in pty.c */
|
||||
@ -1987,6 +2160,7 @@ int main(int argc, char **argv)
|
||||
fprintf(stderr, "pterm: unable to load font \"%s\"\n", cfg.font);
|
||||
exit(1);
|
||||
}
|
||||
set_font_info(inst, 0);
|
||||
if (cfg.boldfont[0]) {
|
||||
inst->fonts[1] = gdk_font_load(cfg.boldfont);
|
||||
if (!inst->fonts[1]) {
|
||||
@ -1994,6 +2168,7 @@ int main(int argc, char **argv)
|
||||
cfg.boldfont);
|
||||
exit(1);
|
||||
}
|
||||
set_font_info(inst, 1);
|
||||
} else
|
||||
inst->fonts[1] = NULL;
|
||||
|
||||
@ -2001,6 +2176,7 @@ int main(int argc, char **argv)
|
||||
inst->font_height = inst->fonts[0]->ascent + inst->fonts[0]->descent;
|
||||
|
||||
inst->compound_text_atom = gdk_atom_intern("COMPOUND_TEXT", FALSE);
|
||||
inst->utf8_string_atom = gdk_atom_intern("UTF8_STRING", FALSE);
|
||||
|
||||
init_ucs();
|
||||
|
||||
|
13
unix/unix.h
13
unix/unix.h
@ -1,6 +1,8 @@
|
||||
#ifndef PUTTY_UNIX_H
|
||||
#define PUTTY_UNIX_H
|
||||
|
||||
#include "charset.h"
|
||||
|
||||
typedef void *Context; /* FIXME: probably needs changing */
|
||||
|
||||
extern Backend pty_backend;
|
||||
@ -47,7 +49,16 @@ int select_result(int fd, int event);
|
||||
int first_socket(int *state, int *rwx);
|
||||
int next_socket(int *state, int *rwx);
|
||||
|
||||
#define DEFAULT_CODEPAGE 0 /* FIXME: no idea how to do this */
|
||||
/*
|
||||
* In the Unix Unicode layer, DEFAULT_CODEPAGE is a special value
|
||||
* which causes mb_to_wc and wc_to_mb to call _libc_ rather than
|
||||
* libcharset. That way, we can interface the various charsets
|
||||
* supported by libcharset with the one supported by mbstowcs and
|
||||
* wcstombs (which will be the character set in which stuff read
|
||||
* from the command line or config files is assumed to be encoded).
|
||||
*/
|
||||
#define DEFAULT_CODEPAGE 0xFFFF
|
||||
#define CP_UTF8 CS_UTF8 /* from libcharset */
|
||||
|
||||
#define strnicmp strncasecmp
|
||||
#define stricmp strcasecmp
|
||||
|
166
unix/uxucs.c
166
unix/uxucs.c
@ -1,17 +1,18 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <ctype.h>
|
||||
#include <locale.h>
|
||||
#include <limits.h>
|
||||
#include <wchar.h>
|
||||
|
||||
#include <time.h>
|
||||
|
||||
#include "putty.h"
|
||||
#include "terminal.h"
|
||||
#include "misc.h"
|
||||
|
||||
/*
|
||||
* Unix Unicode-handling routines.
|
||||
*
|
||||
* FIXME: currently trivial stub versions assuming all codepages
|
||||
* are ISO8859-1.
|
||||
*/
|
||||
|
||||
int is_dbcs_leadbyte(int codepage, char byte)
|
||||
@ -22,48 +23,151 @@ int is_dbcs_leadbyte(int codepage, char byte)
|
||||
int mb_to_wc(int codepage, int flags, char *mbstr, int mblen,
|
||||
wchar_t *wcstr, int wclen)
|
||||
{
|
||||
int ret = 0;
|
||||
while (mblen > 0 && wclen > 0) {
|
||||
*wcstr++ = (unsigned char) *mbstr++;
|
||||
mblen--, wclen--, ret++;
|
||||
}
|
||||
return ret; /* FIXME: check error codes! */
|
||||
if (codepage == DEFAULT_CODEPAGE) {
|
||||
int n = 0;
|
||||
mbstate_t state = { 0 };
|
||||
|
||||
setlocale(LC_CTYPE, "");
|
||||
|
||||
while (mblen > 0) {
|
||||
size_t i = mbrtowc(wcstr+n, mbstr, (size_t)mblen, &state);
|
||||
if (i == (size_t)-1 || i == (size_t)-2)
|
||||
break;
|
||||
n++;
|
||||
mbstr += i;
|
||||
mblen -= i;
|
||||
}
|
||||
|
||||
setlocale(LC_CTYPE, "C");
|
||||
|
||||
return n;
|
||||
} else
|
||||
return charset_to_unicode(&mbstr, &mblen, wcstr, wclen, codepage,
|
||||
NULL, NULL, 0);
|
||||
}
|
||||
|
||||
int wc_to_mb(int codepage, int flags, wchar_t *wcstr, int wclen,
|
||||
char *mbstr, int mblen, char *defchr, int *defused)
|
||||
{
|
||||
int ret = 0;
|
||||
/* FIXME: we should remove the defused param completely... */
|
||||
if (defused)
|
||||
*defused = 0;
|
||||
while (mblen > 0 && wclen > 0) {
|
||||
if (*wcstr >= 0x100) {
|
||||
if (defchr)
|
||||
*mbstr++ = *defchr;
|
||||
else
|
||||
*mbstr++ = '.';
|
||||
if (defused)
|
||||
*defused = 1;
|
||||
} else
|
||||
*mbstr++ = (unsigned char) *wcstr;
|
||||
wcstr++;
|
||||
mblen--, wclen--, ret++;
|
||||
}
|
||||
return ret; /* FIXME: check error codes! */
|
||||
|
||||
if (codepage == DEFAULT_CODEPAGE) {
|
||||
char output[MB_LEN_MAX];
|
||||
mbstate_t state = { 0 };
|
||||
int n = 0;
|
||||
|
||||
setlocale(LC_CTYPE, "");
|
||||
|
||||
while (wclen > 0) {
|
||||
int i = wcrtomb(output, wcstr[0], &state);
|
||||
if (i == (size_t)-1 || i > n - mblen)
|
||||
break;
|
||||
memcpy(mbstr+n, output, i);
|
||||
n += i;
|
||||
wcstr++;
|
||||
wclen--;
|
||||
}
|
||||
|
||||
setlocale(LC_CTYPE, "C");
|
||||
|
||||
return n;
|
||||
} else
|
||||
return charset_from_unicode(&wcstr, &wclen, mbstr, mblen, codepage,
|
||||
NULL, NULL, 0);
|
||||
}
|
||||
|
||||
void init_ucs(void)
|
||||
{
|
||||
int i;
|
||||
/* Find the line control characters. FIXME: this is not right. */
|
||||
|
||||
/*
|
||||
* In the platform-independent parts of the code, font_codepage
|
||||
* is used only for system DBCS support - which we don't
|
||||
* support at all. So we set this to something which will never
|
||||
* be used.
|
||||
*/
|
||||
font_codepage = -1;
|
||||
|
||||
/*
|
||||
* line_codepage should be decoded from the specification in
|
||||
* cfg.
|
||||
*/
|
||||
line_codepage = charset_from_mimeenc(cfg.line_codepage);
|
||||
if (line_codepage == CS_NONE)
|
||||
line_codepage = charset_from_xenc(cfg.line_codepage);
|
||||
/* If it's still CS_NONE, we should assume direct-to-font. */
|
||||
|
||||
/* FIXME: this is a hack. Currently fonts with incomprehensible
|
||||
* encodings are dealt with by pretending they're 8859-1. It's
|
||||
* ugly, but it's good enough to stop things crashing. Should do
|
||||
* something better here. */
|
||||
if (line_codepage == CS_NONE)
|
||||
line_codepage = CS_ISO8859_1;
|
||||
|
||||
/*
|
||||
* Set up unitab_line, by translating each individual character
|
||||
* in the line codepage into Unicode.
|
||||
*/
|
||||
for (i = 0; i < 256; i++) {
|
||||
char c[1], *p;
|
||||
wchar_t wc[1];
|
||||
int len;
|
||||
c[0] = i;
|
||||
p = c;
|
||||
len = 1;
|
||||
if (1 == charset_to_unicode(&p,&len,wc,1,line_codepage,NULL,L"",0))
|
||||
unitab_line[i] = wc[0];
|
||||
else
|
||||
unitab_line[i] = 0xFFFD;
|
||||
}
|
||||
|
||||
/*
|
||||
* Set up unitab_xterm. This is the same as unitab_line except
|
||||
* in the line-drawing regions, where it follows the Unicode
|
||||
* encoding.
|
||||
*
|
||||
* (Note that the strange X encoding of line-drawing characters
|
||||
* in the bottom 32 glyphs of ISO8859-1 fonts is taken care of
|
||||
* by the font encoding, which will spot such a font and act as
|
||||
* if it were in a variant encoding of ISO8859-1.)
|
||||
*/
|
||||
for (i = 0; i < 256; i++) {
|
||||
static const wchar_t unitab_xterm_std[32] = {
|
||||
0x2666, 0x2592, 0x2409, 0x240c, 0x240d, 0x240a, 0x00b0, 0x00b1,
|
||||
0x2424, 0x240b, 0x2518, 0x2510, 0x250c, 0x2514, 0x253c, 0x23ba,
|
||||
0x23bb, 0x2500, 0x23bc, 0x23bd, 0x251c, 0x2524, 0x2534, 0x252c,
|
||||
0x2502, 0x2264, 0x2265, 0x03c0, 0x2260, 0x00a3, 0x00b7, 0x0020
|
||||
};
|
||||
if (i >= 0x5F && i < 0x7F)
|
||||
unitab_xterm[i] = unitab_xterm_std[i & 0x1F];
|
||||
else
|
||||
unitab_xterm[i] = unitab_line[i];
|
||||
}
|
||||
|
||||
/*
|
||||
* Set up unitab_scoacs. The SCO Alternate Character Set is
|
||||
* simply CP437.
|
||||
*/
|
||||
for (i = 0; i < 256; i++) {
|
||||
char c[1], *p;
|
||||
wchar_t wc[1];
|
||||
int len;
|
||||
c[0] = i;
|
||||
p = c;
|
||||
len = 1;
|
||||
if (1 == charset_to_unicode(&p,&len,wc,1,CS_CP437,NULL,L"",0))
|
||||
unitab_scoacs[i] = wc[0];
|
||||
else
|
||||
unitab_scoacs[i] = 0xFFFD;
|
||||
}
|
||||
|
||||
/* Find the line control characters. */
|
||||
for (i = 0; i < 256; i++)
|
||||
if (i < ' ' || (i >= 0x7F && i < 0xA0))
|
||||
if (unitab_line[i] < ' '
|
||||
|| (unitab_line[i] >= 0x7F && unitab_line[i] < 0xA0))
|
||||
unitab_ctrl[i] = i;
|
||||
else
|
||||
unitab_ctrl[i] = 0xFF;
|
||||
|
||||
for (i = 0; i < 256; i++) {
|
||||
unitab_line[i] = unitab_scoacs[i] = i;
|
||||
unitab_xterm[i] = (i >= 0x5F && i < 0x7F) ? ((i+1) & 0x1F) : i;
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user