A better solution to the problem of duplicated positions in CS_ISO8859_1_X11

Where two SBCS positions map to the same Unicode code point, we now have a `sortpriority' hint which tells sbcsgen.pl which one it should preferentially generate when converting back to SBCS. [originally from svn r2427]
2025-05-28 23:34:49 -05:00 · 2003-01-02 16:56:29 +00:00 · 2003-01-02 16:56:29 +00:00 · a2afc03bdb
commit a2afc03bdb
parent 8de5682450
2 changed files with 30 additions and 14 deletions
--- a/charset/sbcs.dat
+++ b/charset/sbcs.dat
@ -306,13 +306,15 @@ charset CS_ISO8859_16
  appear from positions 0x5F to 0x7E inclusive. Here is the modified
  ISO8859-1 code table.

-  Note that position 0 is still 0000, not 0020 as it might plausibly
-  be, because I didn't like the idea that converting several words
-  in Unicode through this table would produce NULs in place of all
-  the spaces! In principle that works fine, but it makes me uneasy.
+  Since this table contains a few duplicated positions, we use the
+  `sortpriority' hint to indicate that things in the main part of
+  the code table (0x20-0xFF) should be generated preferentially when
+  converting _from_ Unicode. Hence, U+00b0 (for example) will yield
+  0xb0 rather than 0x07.

 charset CS_ISO8859_1_X11
-0000 2666 2592 2409 240c 240d 240a 00b0 00b1 2424 240b 2518 2510 250c 2514 253c
+sortpriority 00-1F -1
+0020 2666 2592 2409 240c 240d 240a 00b0 00b1 2424 240b 2518 2510 250c 2514 253c
 23ba 23bb 2500 23bc 23bd 251c 2524 2534 252c 2502 2264 2265 03c0 2260 00a3 00b7
 0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
 0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
--- a/charset/sbcsgen.pl
+++ b/charset/sbcsgen.pl
@ -27,21 +27,28 @@ my $charsetname = undef;
 my @vals = ();

 my @charsetnames = ();
+my @sortpriority = ();

 while (<FOO>) {
    chomp;
    if (/^charset (.*)$/) {
 	$charsetname = $1;
 	@vals = ();
+	@sortpriority = map { 0 } 0..255;
+    } elsif (/^sortpriority ([^-]*)-([^-]*) (.*)$/) {
+	for ($i = hex $1; $i <= hex $2; $i++) {
+	    $sortpriority[$i] += $3;
+	}
    } elsif (/^[0-9a-fA-FX]/) {
 	push @vals, map { $_ eq "XXXX" ? -1 : hex $_ } split / +/, $_;
 	if (scalar @vals > 256) {
 	    die "$infile:$.: charset $charsetname has more than 256 values\n";
 	} elsif (scalar @vals == 256) {
-	    &outcharset($charsetname, @vals);
+	    &outcharset($charsetname, \@vals, \@sortpriority);
 	    push @charsetnames, $charsetname;
 	    $charsetname = undef;
 	    @vals = ();
+	    @sortpriority = map { 0 } 0..255;
 	}
    }
 }
@ -56,8 +63,8 @@ foreach $i (@charsetnames) {
 print "\n";
 print "#endif /* ENUM_CHARSETS */\n";

-sub outcharset($@) {
-    my ($name, @vals) = @_;
+sub outcharset($$$) {
+    my ($name, $vals, $sortpriority) = @_;
    my ($prefix, $i, @sorted);

    print "static const sbcs_data data_$name = {\n";
@ -65,11 +72,12 @@ sub outcharset($@) {
    $prefix = "    ";
    @sorted = ();
    for ($i = 0; $i < 256; $i++) {
-	if ($vals[$i] < 0) {
+	if ($vals->[$i] < 0) {
 	    printf "%sERROR ", $prefix;
 	} else {
-	    printf "%s0x%04x", $prefix, $vals[$i];
-	    push @sorted, [$i, $vals[$i]];
+	    printf "%s0x%04x", $prefix, $vals->[$i];
+	    die "ooh? $i\n" unless defined $sortpriority->[$i];
+	    push @sorted, [$i, $vals->[$i], 0+$sortpriority->[$i]];
 	}
 	if ($i % 8 == 7) {
 	    $prefix = ",\n    ";
@ -78,15 +86,21 @@ sub outcharset($@) {
 	}
    }
    print "\n    },\n    {\n";
-    @sorted = sort { $a->[1] <=> $b->[1] } @sorted;
+    @sorted = sort { $a->[1] == $b->[1] ?
+	             $b->[2] <=> $a->[2] :
+	             $a->[1] <=> $b->[1] } @sorted;
    $prefix = "    ";
-    for ($i = 0; $i < scalar @sorted; $i++) {
+    $uval = -1;
+    for ($i = $j = 0; $i < scalar @sorted; $i++) {
+	next if ($uval == $sorted[$i]->[1]); # low-priority alternative
+	$uval = $sorted[$i]->[1];
 	printf "%s0x%02x", $prefix, $sorted[$i]->[0];
-	if ($i % 8 == 7) {
+	if ($j % 8 == 7) {
 	    $prefix = ",\n    ";
 	} else {
 	    $prefix = ", ";
 	}
+	$j++;
    }
    printf "\n    },\n    %d\n", scalar @sorted;
    print "};\n";