putty-source/terminal/bidi.h

/*
 * Header file shared between bidi.c and its tests. Not used by
 * anything outside the bidi subsystem.
 */

#ifndef PUTTY_BIDI_H
#define PUTTY_BIDI_H

/*
 * Bit masks for a value that carries an embedding level together with
 * an override flag. LMASK and OMASK are disjoint and between them cover
 * exactly the low eight bits (0x3F | 0xC0 == 0xFF); OMASK is the union
 * of the two individual override flags OISL and OISR.
 */
#define LMASK   0x3F    /* Embedding Level mask: the low six bits */
#define OMASK   0xC0    /* Override mask: OISL | OISR */
#define OISL    0x80    /* Override is L */
#define OISR    0x40    /* Override is R */

/*
 * Shaping helpers.
 *
 * STYPE and SISOLATED look a character up in the shapetypes[] table,
 * which is indexed relative to SHAPE_FIRST. STYPE range-checks its
 * argument first and yields SU for anything outside the inclusive range
 * SHAPE_FIRST..SHAPE_LAST; SISOLATED performs no such check.
 *
 * SFINAL, SINITIAL and SMEDIAL are plain offsets of +1, +2 and +3 from
 * their argument.
 *
 * STYPE evaluates its argument more than once, so do not hand it an
 * expression with side effects.
 */
#define STYPE(c) \
    (((c) >= SHAPE_FIRST && (c) <= SHAPE_LAST) ? \
     shapetypes[(c) - SHAPE_FIRST].type : SU)
#define SISOLATED(c) (shapetypes[(c) - SHAPE_FIRST].form_b)
#define SFINAL(c) ((c) + 1)
#define SINITIAL(c) ((c) + 2)
#define SMEDIAL(c) ((c) + 3)

/*
 * leastGreaterOdd(x) is the smallest odd integer strictly greater than
 * x; leastGreaterEven(x) is the smallest even integer strictly greater
 * than x. Each evaluates x exactly once.
 */
#define leastGreaterOdd(x)  ((((x) + 1)) | 1)
#define leastGreaterEven(x) ((((x) + 2)) & ~1)

/*
 * Function declarations used outside bidi.c.
 *
 * bidi_getType takes a character code and returns its type as an
 * unsigned char (presumably one of the BidiType values declared in
 * this header -- confirm against the definition in bidi.c).
 */
unsigned char bidi_getType(int ch);

/*
 * Bidi character types.
 *
 * This is a list macro: BIDI_CHAR_TYPE_LIST(X) expands to X(name) for
 * every type name below, in the order written. The caller supplies X to
 * decide what each entry turns into (an enumerator, a string, ...).
 *
 * The order of the entries is significant, because anything generated
 * from the list by position (such as an enum) takes its numbering from
 * it. Add new types by adding a line; every line except the final
 * comment must end in a backslash.
 */
#define BIDI_CHAR_TYPE_LIST(X) \
    X(L)                       \
    X(LRE)                     \
    X(LRO)                     \
    X(LRI)                     \
    X(R)                       \
    X(AL)                      \
    X(RLE)                     \
    X(RLO)                     \
    X(RLI)                     \
    X(PDF)                     \
    X(PDI)                     \
    X(FSI)                     \
    X(EN)                      \
    X(ES)                      \
    X(ET)                      \
    X(AN)                      \
    X(CS)                      \
    X(NSM)                     \
    X(BN)                      \
    X(B)                       \
    X(S)                       \
    X(WS)                      \
    X(ON)                      \
    /* end of list */

/*
 * Shaping types.
 *
 * A list macro in the same form as the bidi type list:
 * SHAPING_CHAR_TYPE_LIST(X) expands to X(name) for each joining class
 * below, in the order written, and the caller chooses what X does with
 * each name. The per-entry comments describe which presentation forms
 * a character of that class has.
 */
#define SHAPING_CHAR_TYPE_LIST(X)                                       \
    X(SL) /* Left-Joining, doesn't exist in U+0600 - U+06FF */          \
    X(SR) /* Right-Joining, ie has Isolated, Final */                   \
    X(SD) /* Dual-Joining, ie has Isolated, Final, Initial, Medial */   \
    X(SU) /* Non-Joining */                                             \
    X(SC) /* Join-Causing, like U+0640 (TATWEEL) */                     \
    /* end of list */

/*
 * Turn the two type lists into enums. ENUM_DECL expands each list entry
 * to an enumerator followed by a comma, so the enumerators are numbered
 * from zero in list order, and the extra trailing enumerator
 * (N_BIDI_TYPES / N_SHAPING_TYPES) ends up equal to the number of
 * entries in its list. ENUM_DECL is only needed for these two lines, so
 * it is undefined again straight afterwards.
 */
#define ENUM_DECL(name) name,
typedef enum { BIDI_CHAR_TYPE_LIST(ENUM_DECL) N_BIDI_TYPES } BidiType;
typedef enum { SHAPING_CHAR_TYPE_LIST(ENUM_DECL) N_SHAPING_TYPES } ShapingType;
#undef ENUM_DECL

/* Report whether t is one of the strong types: L, R or AL. */
static inline bool typeIsStrong(BidiType t)
{
    return t == L || t == R || t == AL;
}
/*
 * Report whether t is one of the weak types: EN, ES, ET, AN, CS, NSM
 * or BN.
 */
static inline bool typeIsWeak(BidiType t)
{
    switch (t) {
      case EN:
      case ES:
      case ET:
      case AN:
      case CS:
      case NSM:
      case BN:
        return true;
      default:
        return false;
    }
}
/* Report whether t is one of the neutral types: B, S, WS or ON. */
static inline bool typeIsNeutral(BidiType t)
{
    switch (t) {
      case B:
      case S:
      case WS:
      case ON:
        return true;
      default:
        return false;
    }
}
/*
 * Report whether t belongs to the 'bidi active' set: R, AL, AN, the
 * embedding and override controls RLE, LRE, RLO and LRO, PDF, and RLI.
 *
 * Of the isolate controls only RLI is a member; LRI, FSI and PDI are
 * not.
 */
static inline bool typeIsBidiActive(BidiType t)
{
    switch (t) {
      case R:
      case AL:
      case AN:
      case RLE:
      case LRE:
      case RLO:
      case LRO:
      case PDF:
      case RLI:
        return true;
      default:
        return false;
    }
}
/* Report whether t is an isolate initiator: LRI, RLI or FSI. */
static inline bool typeIsIsolateInitiator(BidiType t)
{
    return t == LRI || t == RLI || t == FSI;
}
/* Report whether t is an isolate initiator (LRI, RLI, FSI) or PDI. */
static inline bool typeIsIsolateInitiatorOrPDI(BidiType t)
{
    switch (t) {
      case LRI:
      case RLI:
      case FSI:
      case PDI:
        return true;
      default:
        return false;
    }
}
/*
 * Report whether t is an embedding or override initiator: LRE, RLE,
 * LRO or RLO.
 */
static inline bool typeIsEmbeddingInitiator(BidiType t)
{
    switch (t) {
      case LRE:
      case RLE:
      case LRO:
      case RLO:
        return true;
      default:
        return false;
    }
}
/*
 * Report whether t is an embedding or override initiator (LRE, RLE,
 * LRO, RLO) or PDF.
 */
static inline bool typeIsEmbeddingInitiatorOrPDF(BidiType t)
{
    switch (t) {
      case LRE:
      case RLE:
      case LRO:
      case RLO:
      case PDF:
        return true;
      default:
        return false;
    }
}
/* Report whether t is ES, ET or CS. */
static inline bool typeIsWeakSeparatorOrTerminator(BidiType t)
{
    return t == ES || t == ET || t == CS;
}
/*
 * Report whether t is one of S, WS, ON, or an isolate control (FSI,
 * LRI, RLI, PDI). Note that B is not a member of this set.
 */
static inline bool typeIsNeutralOrIsolate(BidiType t)
{
    switch (t) {
      case S:
      case WS:
      case ON:
      case FSI:
      case LRI:
      case RLI:
      case PDI:
        return true;
      default:
        return false;
    }
}
/* Report whether t is a separator type: S or B. */
static inline bool typeIsSegmentOrParaSeparator(BidiType t)
{
    return t == S || t == B;
}
/*
 * Report whether t is WS or one of the isolate controls FSI, LRI, RLI
 * and PDI.
 */
static inline bool typeIsWhitespaceOrIsolate(BidiType t)
{
    switch (t) {
      case WS:
      case FSI:
      case LRI:
      case RLI:
      case PDI:
        return true;
      default:
        return false;
    }
}
/*
 * Report whether t is in the set named 'removed during processing':
 * the embedding and override controls RLE, LRE, RLO and LRO, plus PDF
 * and BN.
 */
static inline bool typeIsRemovedDuringProcessing(BidiType t)
{
    switch (t) {
      case RLE:
      case LRE:
      case RLO:
      case LRO:
      case PDF:
      case BN:
        return true;
      default:
        return false;
    }
}
/*
 * Report whether t is a strong type (L, R, AL) or a number type (EN,
 * AN).
 */
static inline bool typeIsStrongOrNumber(BidiType t)
{
    switch (t) {
      case L:
      case R:
      case AL:
      case EN:
      case AN:
        return true;
      default:
        return false;
    }
}
/* Report whether t is ET or BN. */
static inline bool typeIsETOrBN(BidiType t)
{
    return t == ET || t == BN;
}

/*
 * More featureful interface to the bidi code, for use in bidi_test.c.
 *
 * It offers two things beyond the plain interface:
 *
 *  - The return value is a potentially different value of textlen (in
 *    case we're compiling in REMOVE_FORMATTING_CHARACTERS mode), so
 *    callers should use the returned length rather than the one they
 *    passed in.
 *
 *  - 'override' lets the caller force the paragraph direction, because
 *    many of the UCD conformance tests use one. It is 0 for no
 *    override, +1 for left-to-right, -1 for right-to-left.
 */
size_t do_bidi_test(BidiContext *ctx, bidi_char *text, size_t textlen,
                    int override);

#endif /* PUTTY_BIDI_H */
Move bidi gettype main() into its own file. That's what I've usually been doing with any main()s I find under ifdef; there's no reason this should be an exception. If we're keeping it in the code at all, we should ensure it carries on compiling. I've also created a new header file bidi.h, containing pieces of the bidi definitions shared between bidi.c and the new source file. 2021-10-10 13:31:04 +00:00			`/*`
			`* Header file shared between bidi.c and its tests. Not used by`
			`* anything outside the bidi subsystem.`
			`*/`

			`#ifndef PUTTY_BIDI_H`
			`#define PUTTY_BIDI_H`

			`#define LMASK 0x3F /* Embedding Level mask */`
			`#define OMASK 0xC0 /* Override mask */`
			`#define OISL 0x80 /* Override is L */`
			`#define OISR 0x40 /* Override is R */`

			`/* Shaping Helpers */`
			`#define STYPE(xh) ((((xh) >= SHAPE_FIRST) && ((xh) <= SHAPE_LAST)) ? \`
			`shapetypes[(xh)-SHAPE_FIRST].type : SU) /*))*/`
			`#define SISOLATED(xh) (shapetypes[(xh)-SHAPE_FIRST].form_b)`
			`#define SFINAL(xh) ((xh)+1)`
			`#define SINITIAL(xh) ((xh)+2)`
			`#define SMEDIAL(ch) ((ch)+3)`

			`#define leastGreaterOdd(x) ( ((x)+1) \| 1 )`
			`#define leastGreaterEven(x) ( ((x)+2) &~ 1 )`

			`/* Function declarations used outside bidi.c */`
			`unsigned char bidi_getType(int ch);`

Make bidi type enums into list macros. This makes it easier to create the matching array of type names in bidi_gettype.c, and eliminates the need for an assertion to check the array matched the enum. And I'm about to need to add more types, so let's start by making that trivially easy. 2021-10-10 13:32:03 +00:00			`/* Bidi character types */`
			`#define BIDI_CHAR_TYPE_LIST(X) \`
			`X(L) \`
			`X(LRE) \`
			`X(LRO) \`
Complete rewrite of the bidi algorithm. A user reported that PuTTY's existing bidi algorithm will generate misordered text in cases like this (assuming UTF-8): echo -e '12 A \xD7\x90\xD7\x91 B' The hex codes in the middle are the Hebrew letters aleph and beth. Appearing in the middle of a line whose primary direction is left-to-right, those two letters should appear in the opposite order, but not cause the rest of the line to move around. That is, you expect the displayed text in this situation to be 12 A <beth><aleph> B But in fact, the digits '12' were erroneously reversed, so you would actually see '21 A <beth><aleph> B'. I tried to debug the existing bidi algorithm, but it was very hard, because the Unicode bidi spec has been extensively changed since Arabeyes contributed that code, and I couldn't even reliably work out which version of the spec the code was intended to implement. I found some problems, notably that the resolution phase was running once on the whole line instead of separately on runs of characters at the same level, and also that the 'sor' and 'eor' values were being wrongly computed. But I had no way to test any fix to ensure it hadn't introduced another bug somewhere else. Unicode provides a set of conformance tests in the UCD. That was just what I wanted - but they're too up-to-date to run against the old algorithm and expect to pass! So, paradoxically, it seemed to me that the _easiest_ way to fix this bidi bug would be to bring absolutely everything up to date. But the revised bidi algorithm is significantly more complicated, so I also didn't think it would be sensible to try to gradually evolve the existing code into it. Instead, I've done a complete rewrite of my own. The new code implements the full UAX#9 rev 44 algorithm, including in particular support for the new 'directional isolate' control characters, and also special handling for matched pairs of brackets in the text (see rule N0 in the spec). 
I've managed to get it to pass the entire UCD conformance test suite, so I'm reasonably confident it's right, or at the very least a lot closer to right than the old algorithm was. So the upshot is: the test case shown at the top of this file now passes, but also, other detailed bidi handling might have changed, certainly some cases involving brackets, but perhaps also other things that were either bugs in the old algorithm or updates to the standard. 2021-10-10 13:51:17 +00:00			`X(LRI) \`
Make bidi type enums into list macros. This makes it easier to create the matching array of type names in bidi_gettype.c, and eliminates the need for an assertion to check the array matched the enum. And I'm about to need to add more types, so let's start by making that trivially easy. 2021-10-10 13:32:03 +00:00			`X(R) \`
			`X(AL) \`
			`X(RLE) \`
			`X(RLO) \`
Complete rewrite of the bidi algorithm. A user reported that PuTTY's existing bidi algorithm will generate misordered text in cases like this (assuming UTF-8): echo -e '12 A \xD7\x90\xD7\x91 B' The hex codes in the middle are the Hebrew letters aleph and beth. Appearing in the middle of a line whose primary direction is left-to-right, those two letters should appear in the opposite order, but not cause the rest of the line to move around. That is, you expect the displayed text in this situation to be 12 A <beth><aleph> B But in fact, the digits '12' were erroneously reversed, so you would actually see '21 A <beth><aleph> B'. I tried to debug the existing bidi algorithm, but it was very hard, because the Unicode bidi spec has been extensively changed since Arabeyes contributed that code, and I couldn't even reliably work out which version of the spec the code was intended to implement. I found some problems, notably that the resolution phase was running once on the whole line instead of separately on runs of characters at the same level, and also that the 'sor' and 'eor' values were being wrongly computed. But I had no way to test any fix to ensure it hadn't introduced another bug somewhere else. Unicode provides a set of conformance tests in the UCD. That was just what I wanted - but they're too up-to-date to run against the old algorithm and expect to pass! So, paradoxically, it seemed to me that the _easiest_ way to fix this bidi bug would be to bring absolutely everything up to date. But the revised bidi algorithm is significantly more complicated, so I also didn't think it would be sensible to try to gradually evolve the existing code into it. Instead, I've done a complete rewrite of my own. The new code implements the full UAX#9 rev 44 algorithm, including in particular support for the new 'directional isolate' control characters, and also special handling for matched pairs of brackets in the text (see rule N0 in the spec). 
I've managed to get it to pass the entire UCD conformance test suite, so I'm reasonably confident it's right, or at the very least a lot closer to right than the old algorithm was. So the upshot is: the test case shown at the top of this file now passes, but also, other detailed bidi handling might have changed, certainly some cases involving brackets, but perhaps also other things that were either bugs in the old algorithm or updates to the standard. 2021-10-10 13:51:17 +00:00			`X(RLI) \`
Make bidi type enums into list macros. This makes it easier to create the matching array of type names in bidi_gettype.c, and eliminates the need for an assertion to check the array matched the enum. And I'm about to need to add more types, so let's start by making that trivially easy. 2021-10-10 13:32:03 +00:00			`X(PDF) \`
Complete rewrite of the bidi algorithm. A user reported that PuTTY's existing bidi algorithm will generate misordered text in cases like this (assuming UTF-8): echo -e '12 A \xD7\x90\xD7\x91 B' The hex codes in the middle are the Hebrew letters aleph and beth. Appearing in the middle of a line whose primary direction is left-to-right, those two letters should appear in the opposite order, but not cause the rest of the line to move around. That is, you expect the displayed text in this situation to be 12 A <beth><aleph> B But in fact, the digits '12' were erroneously reversed, so you would actually see '21 A <beth><aleph> B'. I tried to debug the existing bidi algorithm, but it was very hard, because the Unicode bidi spec has been extensively changed since Arabeyes contributed that code, and I couldn't even reliably work out which version of the spec the code was intended to implement. I found some problems, notably that the resolution phase was running once on the whole line instead of separately on runs of characters at the same level, and also that the 'sor' and 'eor' values were being wrongly computed. But I had no way to test any fix to ensure it hadn't introduced another bug somewhere else. Unicode provides a set of conformance tests in the UCD. That was just what I wanted - but they're too up-to-date to run against the old algorithm and expect to pass! So, paradoxically, it seemed to me that the _easiest_ way to fix this bidi bug would be to bring absolutely everything up to date. But the revised bidi algorithm is significantly more complicated, so I also didn't think it would be sensible to try to gradually evolve the existing code into it. Instead, I've done a complete rewrite of my own. The new code implements the full UAX#9 rev 44 algorithm, including in particular support for the new 'directional isolate' control characters, and also special handling for matched pairs of brackets in the text (see rule N0 in the spec). 
I've managed to get it to pass the entire UCD conformance test suite, so I'm reasonably confident it's right, or at the very least a lot closer to right than the old algorithm was. So the upshot is: the test case shown at the top of this file now passes, but also, other detailed bidi handling might have changed, certainly some cases involving brackets, but perhaps also other things that were either bugs in the old algorithm or updates to the standard. 2021-10-10 13:51:17 +00:00			`X(PDI) \`
			`X(FSI) \`
Make bidi type enums into list macros. This makes it easier to create the matching array of type names in bidi_gettype.c, and eliminates the need for an assertion to check the array matched the enum. And I'm about to need to add more types, so let's start by making that trivially easy. 2021-10-10 13:32:03 +00:00			`X(EN) \`
			`X(ES) \`
			`X(ET) \`
			`X(AN) \`
			`X(CS) \`
			`X(NSM) \`
			`X(BN) \`
			`X(B) \`
			`X(S) \`
			`X(WS) \`
			`X(ON) \`
			`/* end of list */`
Move bidi gettype main() into its own file. That's what I've usually been doing with any main()s I find under ifdef; there's no reason this should be an exception. If we're keeping it in the code at all, we should ensure it carries on compiling. I've also created a new header file bidi.h, containing pieces of the bidi definitions shared between bidi.c and the new source file. 2021-10-10 13:31:04 +00:00
			`/* Shaping Types */`
Make bidi type enums into list macros. This makes it easier to create the matching array of type names in bidi_gettype.c, and eliminates the need for an assertion to check the array matched the enum. And I'm about to need to add more types, so let's start by making that trivially easy. 2021-10-10 13:32:03 +00:00			`#define SHAPING_CHAR_TYPE_LIST(X) \`
			`X(SL) /* Left-Joining, doesn't exist in U+0600 - U+06FF */ \`
			`X(SR) /* Right-Joining, ie has Isolated, Final */ \`
			`X(SD) /* Dual-Joining, ie has Isolated, Final, Initial, Medial */ \`
			`X(SU) /* Non-Joining */ \`
			`X(SC) /* Join-Causing, like U+0640 (TATWEEL) */ \`
			`/* end of list */`

			`#define ENUM_DECL(name) name,`
			`typedef enum { BIDI_CHAR_TYPE_LIST(ENUM_DECL) N_BIDI_TYPES } BidiType;`
			`typedef enum { SHAPING_CHAR_TYPE_LIST(ENUM_DECL) N_SHAPING_TYPES } ShapingType;`
			`#undef ENUM_DECL`
Move bidi gettype main() into its own file. That's what I've usually been doing with any main()s I find under ifdef; there's no reason this should be an exception. If we're keeping it in the code at all, we should ensure it carries on compiling. I've also created a new header file bidi.h, containing pieces of the bidi definitions shared between bidi.c and the new source file. 2021-10-10 13:31:04 +00:00
Complete rewrite of the bidi algorithm. A user reported that PuTTY's existing bidi algorithm will generate misordered text in cases like this (assuming UTF-8): echo -e '12 A \xD7\x90\xD7\x91 B' The hex codes in the middle are the Hebrew letters aleph and beth. Appearing in the middle of a line whose primary direction is left-to-right, those two letters should appear in the opposite order, but not cause the rest of the line to move around. That is, you expect the displayed text in this situation to be 12 A <beth><aleph> B But in fact, the digits '12' were erroneously reversed, so you would actually see '21 A <beth><aleph> B'. I tried to debug the existing bidi algorithm, but it was very hard, because the Unicode bidi spec has been extensively changed since Arabeyes contributed that code, and I couldn't even reliably work out which version of the spec the code was intended to implement. I found some problems, notably that the resolution phase was running once on the whole line instead of separately on runs of characters at the same level, and also that the 'sor' and 'eor' values were being wrongly computed. But I had no way to test any fix to ensure it hadn't introduced another bug somewhere else. Unicode provides a set of conformance tests in the UCD. That was just what I wanted - but they're too up-to-date to run against the old algorithm and expect to pass! So, paradoxically, it seemed to me that the _easiest_ way to fix this bidi bug would be to bring absolutely everything up to date. But the revised bidi algorithm is significantly more complicated, so I also didn't think it would be sensible to try to gradually evolve the existing code into it. Instead, I've done a complete rewrite of my own. The new code implements the full UAX#9 rev 44 algorithm, including in particular support for the new 'directional isolate' control characters, and also special handling for matched pairs of brackets in the text (see rule N0 in the spec). 
I've managed to get it to pass the entire UCD conformance test suite, so I'm reasonably confident it's right, or at the very least a lot closer to right than the old algorithm was. So the upshot is: the test case shown at the top of this file now passes, but also, other detailed bidi handling might have changed, certainly some cases involving brackets, but perhaps also other things that were either bugs in the old algorithm or updates to the standard. 2021-10-10 13:51:17 +00:00			`static inline bool typeIsStrong(BidiType t)`
			`{`
			`return ((1<<L) \| (1<<R) \| (1<<AL)) & (1 << t);`
			`}`
			`static inline bool typeIsWeak(BidiType t)`
			`{`
			`return ((1<<EN) \| (1<<ES) \| (1<<ET) \| (1<<AN) \|`
			`(1<<CS) \| (1<<NSM) \| (1<<BN)) & (1 << t);`
			`}`
			`static inline bool typeIsNeutral(BidiType t)`
			`{`
			`return ((1<<B) \| (1<<S) \| (1<<WS) \| (1<<ON)) & (1 << t);`
			`}`
			`static inline bool typeIsBidiActive(BidiType t)`
			`{`
			`return ((1<<R) \| (1<<AL) \| (1<<AN) \| (1<<RLE) \| (1<<LRE) \| (1<<RLO) \|`
			`(1<<LRO) \| (1<<PDF) \| (1<<RLI)) & (1 << t);`
			`}`
			`static inline bool typeIsIsolateInitiator(BidiType t)`
			`{`
			`return ((1<<LRI) \| (1<<RLI) \| (1<<FSI)) & (1 << t);`
			`}`
			`static inline bool typeIsIsolateInitiatorOrPDI(BidiType t)`
			`{`
			`return ((1<<LRI) \| (1<<RLI) \| (1<<FSI) \| (1<<PDI)) & (1 << t);`
			`}`
			`static inline bool typeIsEmbeddingInitiator(BidiType t)`
			`{`
			`return ((1<<LRE) \| (1<<RLE) \| (1<<LRO) \| (1<<RLO)) & (1 << t);`
			`}`
			`static inline bool typeIsEmbeddingInitiatorOrPDF(BidiType t)`
			`{`
			`return ((1<<LRE) \| (1<<RLE) \| (1<<LRO) \| (1<<RLO) \| (1<<PDF)) & (1 << t);`
			`}`
			`static inline bool typeIsWeakSeparatorOrTerminator(BidiType t)`
			`{`
			`return ((1<<ES) \| (1<<ET) \| (1<<CS)) & (1 << t);`
			`}`
			`static inline bool typeIsNeutralOrIsolate(BidiType t)`
			`{`
			`return ((1<<S) \| (1<<WS) \| (1<<ON) \| (1<<FSI) \| (1<<LRI) \| (1<<RLI) \|`
			`(1<<PDI)) & (1 << t);`
			`}`
			`static inline bool typeIsSegmentOrParaSeparator(BidiType t)`
			`{`
			`return ((1<<S) \| (1<<B)) & (1 << t);`
			`}`
			`static inline bool typeIsWhitespaceOrIsolate(BidiType t)`
			`{`
			`return ((1<<WS) \| (1<<FSI) \| (1<<LRI) \| (1<<RLI) \| (1<<PDI)) & (1 << t);`
			`}`
			`static inline bool typeIsRemovedDuringProcessing(BidiType t)`
			`{`
			`return ((1<<RLE) \| (1<<LRE) \| (1<<RLO) \| (1<<LRO) \| (1<<PDF) \|`
			`(1<<BN)) & (1 << t);`
			`}`
			`static inline bool typeIsStrongOrNumber(BidiType t)`
			`{`
			`return ((1<<L) \| (1<<R) \| (1<<AL) \| (1<<EN) \| (1<<AN)) & (1 << t);`
			`}`
			`static inline bool typeIsETOrBN(BidiType t)`
			`{`
			`return ((1<<ET) \| (1<<BN)) & (1 << t);`
			`}`

Test rig for the new bidi algorithm. This standalone CLI program runs the UCD bidi tests in the form provided in Unicode 14.0.0. You can run it by just saying bidi_test --class BidiTest.txt --char BidiCharacterTest.txt assuming those two UCD files are in the current directory. 2021-10-10 13:52:17 +00:00			`/*`
			`* More featureful interface to the bidi code, for use in bidi_test.c.`
			`* It returns a potentially different value of textlen (in case we're`
			`* compiling in REMOVE_FORMATTING_CHARACTERS mode), and also permits`
			`* you to pass in an override to the paragraph direction (because many`
			`* of the UCD conformance tests use one).`
			`*`
			`* 'override' is 0 for no override, +1 for left-to-right, -1 for`
			`* right-to-left.`
			`*/`
			`size_t do_bidi_test(BidiContext *ctx, bidi_char *text, size_t textlen,`
			`int override);`

Move bidi gettype main() into its own file. That's what I've usually been doing with any main()s I find under ifdef; there's no reason this should be an exception. If we're keeping it in the code at all, we should ensure it carries on compiling. I've also created a new header file bidi.h, containing pieces of the bidi definitions shared between bidi.c and the new source file. 2021-10-10 13:31:04 +00:00			`#endif /* PUTTY_BIDI_H */`