1
0
mirror of https://git.tartarus.org/simon/putty.git synced 2025-01-10 01:48:00 +00:00

Complete rewrite of the bidi algorithm.

A user reported that PuTTY's existing bidi algorithm will generate
misordered text in cases like this (assuming UTF-8):

  echo -e '12 A \xD7\x90\xD7\x91 B'

The hex codes in the middle are the Hebrew letters aleph and beth.
Appearing in the middle of a line whose primary direction is
left-to-right, those two letters should appear in the opposite order,
but not cause the rest of the line to move around. That is, you expect
the displayed text in this situation to be

  12 A <beth><aleph> B

But in fact, the digits '12' were erroneously reversed, so you would
actually see '21 A <beth><aleph> B'.

I tried to debug the existing bidi algorithm, but it was very hard,
because the Unicode bidi spec has been extensively changed since
Arabeyes contributed that code, and I couldn't even reliably work out
which version of the spec the code was intended to implement. I found
some problems, notably that the resolution phase was running once on
the whole line instead of separately on runs of characters at the same
level, and also that the 'sor' and 'eor' values were being wrongly
computed. But I had no way to test any fix to ensure it hadn't
introduced another bug somewhere else.

Unicode provides a set of conformance tests in the UCD. That was just
what I wanted - but they're too up-to-date to run against the old
algorithm and expect to pass!

So, paradoxically, it seemed to me that the _easiest_ way to fix this
bidi bug would be to bring absolutely everything up to date. But the
revised bidi algorithm is significantly more complicated, so I also
didn't think it would be sensible to try to gradually evolve the
existing code into it. Instead, I've done a complete rewrite of my
own.

The new code implements the full UAX#9 rev 44 algorithm, including in
particular support for the new 'directional isolate' control
characters, and also special handling for matched pairs of brackets in
the text (see rule N0 in the spec). I've managed to get it to pass the
entire UCD conformance test suite, so I'm reasonably confident it's
right, or at the very least a lot closer to right than the old
algorithm was.

So the upshot is: the test case shown at the top of this file now
passes, but also, other detailed bidi handling might have changed,
certainly some cases involving brackets, but perhaps also other things
that were either bugs in the old algorithm or updates to the standard.
This commit is contained in:
Simon Tatham 2021-10-10 14:51:17 +01:00
parent caa16deb1c
commit b8be01adca
2 changed files with 2798 additions and 1070 deletions

File diff suppressed because it is too large Load Diff

View File

@ -30,11 +30,15 @@ unsigned char bidi_getType(int ch);
X(L) \ X(L) \
X(LRE) \ X(LRE) \
X(LRO) \ X(LRO) \
X(LRI) \
X(R) \ X(R) \
X(AL) \ X(AL) \
X(RLE) \ X(RLE) \
X(RLO) \ X(RLO) \
X(RLI) \
X(PDF) \ X(PDF) \
X(PDI) \
X(FSI) \
X(EN) \ X(EN) \
X(ES) \ X(ES) \
X(ET) \ X(ET) \
@ -62,4 +66,69 @@ typedef enum { BIDI_CHAR_TYPE_LIST(ENUM_DECL) N_BIDI_TYPES } BidiType;
typedef enum { SHAPING_CHAR_TYPE_LIST(ENUM_DECL) N_SHAPING_TYPES } ShapingType; typedef enum { SHAPING_CHAR_TYPE_LIST(ENUM_DECL) N_SHAPING_TYPES } ShapingType;
#undef ENUM_DECL #undef ENUM_DECL
static inline bool typeIsStrong(BidiType t)
{
return ((1<<L) | (1<<R) | (1<<AL)) & (1 << t);
}
static inline bool typeIsWeak(BidiType t)
{
return ((1<<EN) | (1<<ES) | (1<<ET) | (1<<AN) |
(1<<CS) | (1<<NSM) | (1<<BN)) & (1 << t);
}
static inline bool typeIsNeutral(BidiType t)
{
return ((1<<B) | (1<<S) | (1<<WS) | (1<<ON)) & (1 << t);
}
static inline bool typeIsBidiActive(BidiType t)
{
return ((1<<R) | (1<<AL) | (1<<AN) | (1<<RLE) | (1<<LRE) | (1<<RLO) |
(1<<LRO) | (1<<PDF) | (1<<RLI)) & (1 << t);
}
static inline bool typeIsIsolateInitiator(BidiType t)
{
return ((1<<LRI) | (1<<RLI) | (1<<FSI)) & (1 << t);
}
static inline bool typeIsIsolateInitiatorOrPDI(BidiType t)
{
return ((1<<LRI) | (1<<RLI) | (1<<FSI) | (1<<PDI)) & (1 << t);
}
static inline bool typeIsEmbeddingInitiator(BidiType t)
{
return ((1<<LRE) | (1<<RLE) | (1<<LRO) | (1<<RLO)) & (1 << t);
}
static inline bool typeIsEmbeddingInitiatorOrPDF(BidiType t)
{
return ((1<<LRE) | (1<<RLE) | (1<<LRO) | (1<<RLO) | (1<<PDF)) & (1 << t);
}
static inline bool typeIsWeakSeparatorOrTerminator(BidiType t)
{
return ((1<<ES) | (1<<ET) | (1<<CS)) & (1 << t);
}
static inline bool typeIsNeutralOrIsolate(BidiType t)
{
return ((1<<S) | (1<<WS) | (1<<ON) | (1<<FSI) | (1<<LRI) | (1<<RLI) |
(1<<PDI)) & (1 << t);
}
static inline bool typeIsSegmentOrParaSeparator(BidiType t)
{
return ((1<<S) | (1<<B)) & (1 << t);
}
static inline bool typeIsWhitespaceOrIsolate(BidiType t)
{
return ((1<<WS) | (1<<FSI) | (1<<LRI) | (1<<RLI) | (1<<PDI)) & (1 << t);
}
static inline bool typeIsRemovedDuringProcessing(BidiType t)
{
return ((1<<RLE) | (1<<LRE) | (1<<RLO) | (1<<LRO) | (1<<PDF) |
(1<<BN)) & (1 << t);
}
static inline bool typeIsStrongOrNumber(BidiType t)
{
return ((1<<L) | (1<<R) | (1<<AL) | (1<<EN) | (1<<AN)) & (1 << t);
}
static inline bool typeIsETOrBN(BidiType t)
{
return ((1<<ET) | (1<<BN)) & (1 << t);
}
#endif /* PUTTY_BIDI_H */ #endif /* PUTTY_BIDI_H */