1
0
mirror of https://git.tartarus.org/simon/putty.git synced 2025-06-30 11:02:48 -05:00

First draft of Unicode support in pterm. It's pretty complete: it
does UTF-8 copy and paste (falling back to normal strings if
necessary), it understands X font encodings and translates things
accordingly so that if you have a Unicode font you can ask for
virtually any single-byte encoding and get it (Mac-Roman pterm,
anyone?), and so on. There's work left to be done (wide fonts for
CJK spring to mind), but I reckon this is a pretty good start.

[originally from svn r2395]
This commit is contained in:
Simon Tatham
2002-12-31 12:20:34 +00:00
parent 241570c04f
commit ad2bbc52a4
20 changed files with 2836 additions and 50 deletions

View File

@ -90,6 +90,20 @@ to specify it explicitly if you have changed the default using the
.IP "\fB\-log\fP \fIfilename\fP"
This option makes \fIpterm\fP log all the terminal output to a file
as well as displaying it in the terminal.
.IP "\fB\-cs\fP \fIcharset\fP"
This option specifies the character set in which \fIpterm\fP should
assume the session is operating. This character set will be used to
interpret all the data received from the session, and all input you
type or paste into \fIpterm\fP will be converted into this character
set before being sent to the session.
Any character set name which is valid in a MIME header (and
supported by \fIpterm\fP) should be valid here (examples are
"ISO-8859-1", "windows-1252" or "UTF-8"). Also, any character
encoding which is valid in an X logical font description should be
valid ("ibm-cp437", for example).
Character set names are case-insensitive.
.IP "\fB\-nethack\fP"
Tells \fIpterm\fP to enable NetHack keypad mode, in which the
numeric keypad generates the NetHack "hjklyubn" direction keys. This
@ -385,6 +399,14 @@ reset to the very bottom.
This option should be set to either 0 or 1; the default is 1. When
set to 1, any activity in the display causes the position of the
scrollback to be reset to the very bottom.
.IP "\fBpterm.LineCodePage\fP"
This option specifies the character set to be used for the session.
This is the same as the \fI\-cs\fP command-line option.
.IP "\fBpterm.NoRemoteCharset\fP"
This option disables the terminal's ability to change its character
set when it receives escape sequences telling it to. You might need
to do this to interoperate with programs which incorrectly change
the character set to something they think is sensible.
.IP "\fBpterm.BCE\fP"
This option should be set to either 0 or 1; the default is 1. When
set to 1, the various control sequences that erase parts of the

View File

@ -24,6 +24,7 @@
#include <X11/Xutil.h>
#define PUTTY_DO_GLOBALS /* actually _define_ globals */
#include "putty.h"
#include "terminal.h"
@ -39,18 +40,22 @@ struct gui_data {
GtkAdjustment *sbar_adjust;
GdkPixmap *pixmap;
GdkFont *fonts[2]; /* normal and bold (for now!) */
struct {
int charset;
int is_wide;
} fontinfo[2];
GdkCursor *rawcursor, *textcursor, *blankcursor, *currcursor;
GdkColor cols[NCOLOURS];
GdkColormap *colmap;
wchar_t *pastein_data;
int pastein_data_len;
char *pasteout_data;
int pasteout_data_len;
char *pasteout_data, *pasteout_data_utf8;
int pasteout_data_len, pasteout_data_utf8_len;
int font_width, font_height;
int ignore_sbar;
int mouseptr_visible;
guint term_paste_idle_id;
GdkAtom compound_text_atom;
GdkAtom compound_text_atom, utf8_string_atom;
int alt_keycode;
int alt_digits;
char wintitle[sizeof(((Config *)0)->wintitle)];
@ -831,7 +836,19 @@ gint key_event(GtkWidget *widget, GdkEventKey *event, gpointer data)
printf("\n");
#endif
ldisc_send(inst->ldisc, output+start, end-start, 1);
/*
* The stuff we've just generated is assumed to be
* ISO-8859-1! This sounds insane, but `man XLookupString'
* agrees: strings of this type returned from the X server
* are hardcoded to 8859-1. Strictly speaking we should be
* doing this using some sort of GtkIMContext, which (if
* we're lucky) would give us our data directly in Unicode;
* but that's not supported in GTK 1.2 as far as I can
* tell, and it's poorly documented even in 2.0, so it'll
* have to wait.
*/
lpage_send(inst->ldisc, CS_ISO8859_1, output+start, end-start, 1);
show_mouseptr(inst, 0);
term_seen_key_event(inst->term);
term_out(inst->term);
@ -1198,9 +1215,26 @@ void write_clip(void *frontend, wchar_t * data, int len, int must_deselect)
struct gui_data *inst = (struct gui_data *)frontend;
if (inst->pasteout_data)
sfree(inst->pasteout_data);
if (inst->pasteout_data_utf8)
sfree(inst->pasteout_data_utf8);
inst->pasteout_data_utf8 = smalloc(len*6);
inst->pasteout_data_utf8_len = len*6;
{
wchar_t *tmp = data;
int tmplen = len;
inst->pasteout_data_utf8_len =
charset_from_unicode(&tmp, &tmplen, inst->pasteout_data_utf8,
inst->pasteout_data_utf8_len,
CS_UTF8, NULL, NULL, 0);
inst->pasteout_data_utf8 =
srealloc(inst->pasteout_data_utf8, inst->pasteout_data_utf8_len);
}
inst->pasteout_data = smalloc(len);
inst->pasteout_data_len = len;
wc_to_mb(0, 0, data, len, inst->pasteout_data, inst->pasteout_data_len,
wc_to_mb(line_codepage, 0, data, len,
inst->pasteout_data, inst->pasteout_data_len,
NULL, NULL);
if (gtk_selection_owner_set(inst->area, GDK_SELECTION_PRIMARY,
@ -1209,6 +1243,8 @@ void write_clip(void *frontend, wchar_t * data, int len, int must_deselect)
GDK_SELECTION_TYPE_STRING, 1);
gtk_selection_add_target(inst->area, GDK_SELECTION_PRIMARY,
inst->compound_text_atom, 1);
gtk_selection_add_target(inst->area, GDK_SELECTION_PRIMARY,
inst->utf8_string_atom, 1);
}
}
@ -1216,8 +1252,13 @@ void selection_get(GtkWidget *widget, GtkSelectionData *seldata,
guint info, guint time_stamp, gpointer data)
{
struct gui_data *inst = (struct gui_data *)data;
gtk_selection_data_set(seldata, GDK_SELECTION_TYPE_STRING, 8,
inst->pasteout_data, inst->pasteout_data_len);
if (seldata->target == inst->utf8_string_atom)
gtk_selection_data_set(seldata, seldata->target, 8,
inst->pasteout_data_utf8,
inst->pasteout_data_utf8_len);
else
gtk_selection_data_set(seldata, seldata->target, 8,
inst->pasteout_data, inst->pasteout_data_len);
}
gint selection_clear(GtkWidget *widget, GdkEventSelection *seldata,
@ -1227,8 +1268,12 @@ gint selection_clear(GtkWidget *widget, GdkEventSelection *seldata,
term_deselect(inst->term);
if (inst->pasteout_data)
sfree(inst->pasteout_data);
if (inst->pasteout_data_utf8)
sfree(inst->pasteout_data_utf8);
inst->pasteout_data = NULL;
inst->pasteout_data_len = 0;
inst->pasteout_data_utf8 = NULL;
inst->pasteout_data_utf8_len = 0;
return TRUE;
}
@ -1240,8 +1285,16 @@ void request_paste(void *frontend)
* moment is to call gtk_selection_convert(), and when the data
* comes back _then_ we can call term_do_paste().
*/
/*
* First we attempt to retrieve the selection as a UTF-8 string
* (which we will convert to the correct code page before
* sending to the session, of course). If that fails,
* selection_received() will be informed and will fall back to
* an ordinary string.
*/
gtk_selection_convert(inst->area, GDK_SELECTION_PRIMARY,
GDK_SELECTION_TYPE_STRING, GDK_CURRENT_TIME);
inst->utf8_string_atom, GDK_CURRENT_TIME);
}
gint idle_paste_func(gpointer data); /* forward ref */
@ -1251,8 +1304,22 @@ void selection_received(GtkWidget *widget, GtkSelectionData *seldata,
{
struct gui_data *inst = (struct gui_data *)data;
if (seldata->target == inst->utf8_string_atom && seldata->length <= 0) {
/*
* Failed to get a UTF-8 selection string. Try an ordinary
* string.
*/
gtk_selection_convert(inst->area, GDK_SELECTION_PRIMARY,
GDK_SELECTION_TYPE_STRING, GDK_CURRENT_TIME);
return;
}
/*
* Any other failure should just go foom.
*/
if (seldata->length <= 0 ||
seldata->type != GDK_SELECTION_TYPE_STRING)
(seldata->type != GDK_SELECTION_TYPE_STRING &&
seldata->type != inst->utf8_string_atom))
return; /* Nothing happens. */
if (inst->pastein_data)
@ -1260,8 +1327,11 @@ void selection_received(GtkWidget *widget, GtkSelectionData *seldata,
inst->pastein_data = smalloc(seldata->length * sizeof(wchar_t));
inst->pastein_data_len = seldata->length;
mb_to_wc(0, 0, seldata->data, seldata->length,
inst->pastein_data, inst->pastein_data_len);
inst->pastein_data_len =
mb_to_wc((seldata->type == inst->utf8_string_atom ?
CS_UTF8 : line_codepage),
0, seldata->data, seldata->length,
inst->pastein_data, inst->pastein_data_len);
term_do_paste(inst->term);
@ -1457,10 +1527,45 @@ void do_text_internal(Context ctx, int x, int y, char *text, int len,
rlen*inst->font_width, inst->font_height);
gdk_gc_set_foreground(gc, &inst->cols[nfg]);
gdk_draw_text(inst->pixmap, inst->fonts[fontid], gc,
x*inst->font_width+cfg.window_border,
y*inst->font_height+cfg.window_border+inst->fonts[0]->ascent,
text, len);
{
GdkWChar *gwcs;
gchar *gcs;
wchar_t *wcs;
int i;
wcs = smalloc(sizeof(wchar_t) * (len+1));
for (i = 0; i < len; i++) {
wcs[i] = (wchar_t) ((attr & CSET_MASK) + (text[i] & CHAR_MASK));
}
if (inst->fontinfo[fontid].is_wide) {
gwcs = smalloc(sizeof(GdkWChar) * (len+1));
/*
* FIXME: when we have a wide-char equivalent of
* from_unicode, use it instead of this.
*/
for (i = 0; i <= len; i++)
gwcs[i] = wcs[i];
gdk_draw_text_wc(inst->pixmap, inst->fonts[fontid], gc,
x*inst->font_width+cfg.window_border,
y*inst->font_height+cfg.window_border+inst->fonts[0]->ascent,
gwcs, len*2);
sfree(gwcs);
} else {
wchar_t *wcstmp = wcs;
int lentmp = len;
gcs = smalloc(sizeof(GdkWChar) * (len+1));
charset_from_unicode(&wcstmp, &lentmp, gcs, len,
inst->fontinfo[fontid].charset,
NULL, ".", 1);
gdk_draw_text(inst->pixmap, inst->fonts[fontid], gc,
x*inst->font_width+cfg.window_border,
y*inst->font_height+cfg.window_border+inst->fonts[0]->ascent,
gcs, len);
sfree(gcs);
}
sfree(wcs);
}
if (shadow) {
gdk_draw_text(inst->pixmap, inst->fonts[fontid], gc,
@ -1818,6 +1923,12 @@ int do_cmdline(int argc, char **argv, int do_everything)
strncpy(cfg.boldfont, val, sizeof(cfg.boldfont));
cfg.boldfont[sizeof(cfg.boldfont)-1] = '\0';
} else if (!strcmp(p, "-cs")) {
EXPECTS_ARG;
SECOND_PASS_ONLY;
strncpy(cfg.line_codepage, val, sizeof(cfg.line_codepage));
cfg.line_codepage[sizeof(cfg.line_codepage)-1] = '\0';
} else if (!strcmp(p, "-geometry")) {
int flags, x, y, w, h;
EXPECTS_ARG;
@ -1955,6 +2066,68 @@ static void block_signal(int sig, int block_it) {
}
}
/*
 * Work out which character encoding the font inst->fonts[fontid] is
 * in, and record the answer in inst->fontinfo[fontid].
 *
 * The encoding is read from the font's CHARSET_REGISTRY and
 * CHARSET_ENCODING properties, joined as "registry-encoding" and
 * looked up with charset_from_xenc(). On return:
 *
 *  - fontinfo[fontid].charset is the resulting charset id. It is
 *    never left as CS_NONE: a font whose encoding cannot be
 *    determined is treated as CS_ISO8859_1.
 *  - fontinfo[fontid].is_wide is 1 if the encoding is "iso10646-1"
 *    (compared case-insensitively), and 0 otherwise.
 *
 * inst->fonts[fontid] must already be a loaded font.
 */
static void set_font_info(struct gui_data *inst, int fontid)
{
    GdkFont *font = inst->fonts[fontid];
    XFontStruct *xfs = GDK_FONT_XFONT(font);
    Display *disp = GDK_FONT_XDISPLAY(font);
    Atom charset_registry, charset_encoding;
    unsigned long registry_ret, encoding_ret;

    charset_registry = XInternAtom(disp, "CHARSET_REGISTRY", False);
    charset_encoding = XInternAtom(disp, "CHARSET_ENCODING", False);

    inst->fontinfo[fontid].charset = CS_NONE;
    inst->fontinfo[fontid].is_wide = 0;

    if (XGetFontProperty(xfs, charset_registry, &registry_ret) &&
        XGetFontProperty(xfs, charset_encoding, &encoding_ret)) {
        char *reg, *enc;

        reg = XGetAtomName(disp, (Atom)registry_ret);
        enc = XGetAtomName(disp, (Atom)encoding_ret);

        if (reg && enc) {
            char *encoding = dupcat(reg, "-", enc, NULL);

            inst->fontinfo[fontid].charset = charset_from_xenc(encoding);

            /* FIXME: when libcharset supports wide encodings fix this. */
            if (!strcasecmp(encoding, "iso10646-1"))
                inst->fontinfo[fontid].is_wide = 1;

            /*
             * Hack for X line-drawing characters: if the font is
             * encoded as ISO-8859-1 and has a valid glyph in the
             * low (control-character) positions, it is assumed
             * that those glyphs are the VT100 line-drawing
             * character set.
             *
             * We hack even harder by probing only a single
             * position, and deciding whether a glyph is present
             * there by seeing if the ascent and descent reported
             * for it differ.
             *
             * NOTE(review): the original comment here said the
             * probed position was 0x19 (vertical line, VT100
             * linedrawing `x'), but the code has always probed
             * 0x12. The code is left unchanged; confirm which
             * position was intended.
             */
            if (inst->fontinfo[fontid].charset == CS_ISO8859_1) {
                int lb, rb, wid, asc, desc;
                gchar text[2];

                text[0] = '\x12';
                text[1] = '\0';
                gdk_string_extents(inst->fonts[fontid], text,
                                   &lb, &rb, &wid, &asc, &desc);
                if (asc != desc)
                    inst->fontinfo[fontid].charset = CS_ISO8859_1_X11;
            }

            sfree(encoding);
        }

        /*
         * Strings returned by XGetAtomName belong to the caller and
         * must be released with XFree. Either one may be NULL if
         * its lookup failed, so release them independently.
         */
        if (reg)
            XFree(reg);
        if (enc)
            XFree(enc);
    }

    /*
     * FIXME: this is a hack. Currently fonts with incomprehensible
     * encodings are dealt with by pretending they're 8859-1. It's
     * ugly, but it's good enough to stop things crashing. Should
     * do something better here.
     *
     * This is applied on every path, including fonts that lack the
     * CHARSET_* properties altogether, so that no caller ever sees
     * CS_NONE as a font charset. It is done after the line-drawing
     * probe above, so a font that only ends up as 8859-1 by this
     * fallback is not subjected to that probe.
     */
    if (inst->fontinfo[fontid].charset == CS_NONE)
        inst->fontinfo[fontid].charset = CS_ISO8859_1;
}
int main(int argc, char **argv)
{
extern int pty_master_fd; /* declared in pty.c */
@ -1987,6 +2160,7 @@ int main(int argc, char **argv)
fprintf(stderr, "pterm: unable to load font \"%s\"\n", cfg.font);
exit(1);
}
set_font_info(inst, 0);
if (cfg.boldfont[0]) {
inst->fonts[1] = gdk_font_load(cfg.boldfont);
if (!inst->fonts[1]) {
@ -1994,6 +2168,7 @@ int main(int argc, char **argv)
cfg.boldfont);
exit(1);
}
set_font_info(inst, 1);
} else
inst->fonts[1] = NULL;
@ -2001,6 +2176,7 @@ int main(int argc, char **argv)
inst->font_height = inst->fonts[0]->ascent + inst->fonts[0]->descent;
inst->compound_text_atom = gdk_atom_intern("COMPOUND_TEXT", FALSE);
inst->utf8_string_atom = gdk_atom_intern("UTF8_STRING", FALSE);
init_ucs();

View File

@ -1,6 +1,8 @@
#ifndef PUTTY_UNIX_H
#define PUTTY_UNIX_H
#include "charset.h"
typedef void *Context; /* FIXME: probably needs changing */
extern Backend pty_backend;
@ -47,7 +49,16 @@ int select_result(int fd, int event);
int first_socket(int *state, int *rwx);
int next_socket(int *state, int *rwx);
#define DEFAULT_CODEPAGE 0 /* FIXME: no idea how to do this */
/*
* In the Unix Unicode layer, DEFAULT_CODEPAGE is a special value
* which causes mb_to_wc and wc_to_mb to call _libc_ rather than
* libcharset. That way, we can interface the various charsets
* supported by libcharset with the one supported by mbstowcs and
* wcstombs (which will be the character set in which stuff read
* from the command line or config files is assumed to be encoded).
*/
#define DEFAULT_CODEPAGE 0xFFFF
#define CP_UTF8 CS_UTF8 /* from libcharset */
#define strnicmp strncasecmp
#define stricmp strcasecmp

View File

@ -1,17 +1,18 @@
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <locale.h>
#include <limits.h>
#include <wchar.h>
#include <time.h>
#include "putty.h"
#include "terminal.h"
#include "misc.h"
/*
* Unix Unicode-handling routines.
*
* FIXME: currently trivial stub versions assuming all codepages
* are ISO8859-1.
*/
int is_dbcs_leadbyte(int codepage, char byte)
@ -22,48 +23,151 @@ int is_dbcs_leadbyte(int codepage, char byte)
int mb_to_wc(int codepage, int flags, char *mbstr, int mblen,
wchar_t *wcstr, int wclen)
{
int ret = 0;
while (mblen > 0 && wclen > 0) {
*wcstr++ = (unsigned char) *mbstr++;
mblen--, wclen--, ret++;
}
return ret; /* FIXME: check error codes! */
if (codepage == DEFAULT_CODEPAGE) {
int n = 0;
mbstate_t state = { 0 };
setlocale(LC_CTYPE, "");
while (mblen > 0) {
size_t i = mbrtowc(wcstr+n, mbstr, (size_t)mblen, &state);
if (i == (size_t)-1 || i == (size_t)-2)
break;
n++;
mbstr += i;
mblen -= i;
}
setlocale(LC_CTYPE, "C");
return n;
} else
return charset_to_unicode(&mbstr, &mblen, wcstr, wclen, codepage,
NULL, NULL, 0);
}
int wc_to_mb(int codepage, int flags, wchar_t *wcstr, int wclen,
char *mbstr, int mblen, char *defchr, int *defused)
{
int ret = 0;
/* FIXME: we should remove the defused param completely... */
if (defused)
*defused = 0;
while (mblen > 0 && wclen > 0) {
if (*wcstr >= 0x100) {
if (defchr)
*mbstr++ = *defchr;
else
*mbstr++ = '.';
if (defused)
*defused = 1;
} else
*mbstr++ = (unsigned char) *wcstr;
wcstr++;
mblen--, wclen--, ret++;
}
return ret; /* FIXME: check error codes! */
if (codepage == DEFAULT_CODEPAGE) {
char output[MB_LEN_MAX];
mbstate_t state = { 0 };
int n = 0;
setlocale(LC_CTYPE, "");
while (wclen > 0) {
int i = wcrtomb(output, wcstr[0], &state);
if (i == (size_t)-1 || i > n - mblen)
break;
memcpy(mbstr+n, output, i);
n += i;
wcstr++;
wclen--;
}
setlocale(LC_CTYPE, "C");
return n;
} else
return charset_from_unicode(&wcstr, &wclen, mbstr, mblen, codepage,
NULL, NULL, 0);
}
void init_ucs(void)
{
int i;
/* Find the line control characters. FIXME: this is not right. */
/*
* In the platform-independent parts of the code, font_codepage
* is used only for system DBCS support - which we don't
* support at all. So we set this to something which will never
* be used.
*/
font_codepage = -1;
/*
* line_codepage should be decoded from the specification in
* cfg.
*/
line_codepage = charset_from_mimeenc(cfg.line_codepage);
if (line_codepage == CS_NONE)
line_codepage = charset_from_xenc(cfg.line_codepage);
/* If it's still CS_NONE, we should assume direct-to-font. */
/* FIXME: this is a hack. Currently fonts with incomprehensible
* encodings are dealt with by pretending they're 8859-1. It's
* ugly, but it's good enough to stop things crashing. Should do
* something better here. */
if (line_codepage == CS_NONE)
line_codepage = CS_ISO8859_1;
/*
* Set up unitab_line, by translating each individual character
* in the line codepage into Unicode.
*/
for (i = 0; i < 256; i++) {
char c[1], *p;
wchar_t wc[1];
int len;
c[0] = i;
p = c;
len = 1;
if (1 == charset_to_unicode(&p,&len,wc,1,line_codepage,NULL,L"",0))
unitab_line[i] = wc[0];
else
unitab_line[i] = 0xFFFD;
}
/*
* Set up unitab_xterm. This is the same as unitab_line except
* in the line-drawing regions, where it follows the Unicode
* encoding.
*
* (Note that the strange X encoding of line-drawing characters
* in the bottom 32 glyphs of ISO8859-1 fonts is taken care of
* by the font encoding, which will spot such a font and act as
* if it were in a variant encoding of ISO8859-1.)
*/
for (i = 0; i < 256; i++) {
static const wchar_t unitab_xterm_std[32] = {
0x2666, 0x2592, 0x2409, 0x240c, 0x240d, 0x240a, 0x00b0, 0x00b1,
0x2424, 0x240b, 0x2518, 0x2510, 0x250c, 0x2514, 0x253c, 0x23ba,
0x23bb, 0x2500, 0x23bc, 0x23bd, 0x251c, 0x2524, 0x2534, 0x252c,
0x2502, 0x2264, 0x2265, 0x03c0, 0x2260, 0x00a3, 0x00b7, 0x0020
};
if (i >= 0x5F && i < 0x7F)
unitab_xterm[i] = unitab_xterm_std[i & 0x1F];
else
unitab_xterm[i] = unitab_line[i];
}
/*
* Set up unitab_scoacs. The SCO Alternate Character Set is
* simply CP437.
*/
for (i = 0; i < 256; i++) {
char c[1], *p;
wchar_t wc[1];
int len;
c[0] = i;
p = c;
len = 1;
if (1 == charset_to_unicode(&p,&len,wc,1,CS_CP437,NULL,L"",0))
unitab_scoacs[i] = wc[0];
else
unitab_scoacs[i] = 0xFFFD;
}
/* Find the line control characters. */
for (i = 0; i < 256; i++)
if (i < ' ' || (i >= 0x7F && i < 0xA0))
if (unitab_line[i] < ' '
|| (unitab_line[i] >= 0x7F && unitab_line[i] < 0xA0))
unitab_ctrl[i] = i;
else
unitab_ctrl[i] = 0xFF;
for (i = 0; i < 256; i++) {
unitab_line[i] = unitab_scoacs[i] = i;
unitab_xterm[i] = (i >= 0x5F && i < 0x7F) ? ((i+1) & 0x1F) : i;
}
}