Braille patterns

This example is about encoding support in re2c. It’s a partial decoder from Grade-1 (uncontracted) Unicode English Braille to plain English. The input may be encoded in UTF-8, UTF-16, UTF-32, or UCS-2: all of these encodings are capable of representing Braille patterns (code points [0x2800 - 0x28ff]). We use the -r option to reuse the same block of re2c rules with different encodings.

So. The hardest part is to get some input. Here is a message out of the void:

⠠⠁⠇⠇⠀⠓⠥⠍⠁⠝⠀⠃⠑⠊⠝⠛⠎⠀⠁⠗⠑⠀⠃⠕⠗⠝⠀⠋⠗⠑⠑⠀⠁⠝⠙⠀⠑⠟⠥⠁⠇⠀⠊⠝⠀⠙⠊⠛⠝⠊⠞⠽⠀⠁⠝⠙⠀⠗⠊⠛⠓⠞⠎⠲⠀ ⠠⠞⠓⠑⠽⠀⠁⠗⠑⠀⠑⠝⠙⠕⠺⠑⠙⠀⠺⠊⠞⠓⠀⠗⠑⠁⠎⠕⠝⠀⠁⠝⠙⠀⠉⠕⠝⠎⠉⠊⠑⠝⠉⠑⠀⠁⠝⠙⠀⠎⠓⠕⠥⠇⠙⠀⠁⠉⠞⠀⠞⠕⠺⠁⠗⠙⠎⠀ ⠕⠝⠑⠀⠁⠝⠕⠞⠓⠑⠗⠀⠊⠝⠀⠁⠀⠎⠏⠊⠗⠊⠞⠀⠕⠋⠀⠃⠗⠕⠞⠓⠑⠗⠓⠕⠕⠙⠲

It appears to be UTF-8 encoded [braille.utf8.txt]; the commands and the program below expect it to be saved as 06_braille.utf8.txt. Let’s convert it into UTF-16, UTF-32, and UCS-2:

$ iconv -f utf8 -t utf16le 06_braille.utf8.txt > 06_braille.utf16.txt
$ iconv -f utf8 -t utf32le 06_braille.utf8.txt > 06_braille.utf32.txt
$ iconv -f utf8 -t ucs2 06_braille.utf8.txt > 06_braille.ucs2.txt

And the input is ready.

Grade-1 Braille is quite simple (compared to Grade-2 Braille). Patterns map directly to symbols (letters, digits, and punctuators) except for a few special patterns: the numeric mode indicator (⠼), the letter mode indicator (⠰), the capital letter indicator (⠠), and some others. For the sake of simplicity, we omit those other indicators here, along with a few ambiguous punctuation patterns. Grade-2 Braille allows contractions; those obey some rather complex rules (like those of a natural language) and are much harder to implement.

[braille.re]

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
#include <ctype.h>
#include <stdio.h>

// Owns the whole contents of a file, loaded as an array of `char_t` code
// units and followed by a single zero unit that the lexer uses as its
// end-of-input sentinel.
//
// `len` is the number of code units actually read (trailing bytes that do
// not fill a whole unit are dropped); `str` holds `len + 1` elements and
// `str[len]` is always 0.
template<typename char_t>
struct input_t {
    size_t len;
    char_t *str;

    // Reads all of `f`, starting from the beginning of the file.
    // If the file size cannot be determined, the buffer holds only the
    // sentinel and `len` is 0.
    input_t(FILE *f) : len(0), str(NULL)
    {
        long size = -1;
        if (fseek(f, 0, SEEK_END) == 0) {
            size = ftell(f);
        }
        // ftell() reports failure as -1; dividing that as an unsigned value
        // would request an absurdly large allocation.
        if (size < 0 || fseek(f, 0, SEEK_SET) != 0) {
            size = 0;
        }
        const size_t units = static_cast<size_t>(size) / sizeof(char_t);
        str = new char_t[units + 1];
        // Trust only what fread() delivered, so that a short read never
        // leaves uninitialized units in front of the sentinel.
        len = fread(str, sizeof(char_t), units, f);
        str[len] = 0;
    }
    ~input_t()
    {
        delete[] str;
    }

private:
    // The buffer has exactly one owner: a copy would free it twice.
    // Declared but not defined (C++98 way of forbidding copies).
    input_t(const input_t &);
    input_t &operator=(const input_t &);
};

// Input buffer types, one per supported encoding. The template argument is
// the element type the file is read into.
// NOTE(review): assumes unsigned short is 16 bits and unsigned int is 32 bits
// on the target platform — confirm.
// UTF-16 and UCS-2 share the same element type, so iutf16_t and iucs2_t name
// the same class.
typedef input_t<unsigned char>  iutf8_t;
typedef input_t<unsigned short> iutf16_t;
typedef input_t<unsigned int>   iutf32_t;
typedef input_t<unsigned short> iucs2_t;

// Output sink for decoded text: writes characters to stdout and remembers a
// pending "capitalize the next character" request.
struct out_t {
    // Set by the lexer's capital-letter rule; cleared by the next prt() call.
    bool caps;

    out_t() : caps(false) {}

    // Prints `c` (upper-cased when `caps` is set) and clears `caps`, so a
    // capital indicator affects exactly one printed character.
    void prt(char c)
    {
        // toupper() is only defined for arguments representable as
        // unsigned char (or EOF); a negative plain char would be undefined.
        const int ch = static_cast<unsigned char>(c);
        printf("%c", caps ? toupper(ch) : ch);
        caps = false;
    }

    // Reports input that the lexer could not decode.
    void err()
    {
        printf(" ... error\n");
    }
};

/*!rules:re2c
    // Shared rule set: definitions and rules that each use:re2c block below
    // instantiates for its own encoding.
    // Two conditions are used: l (letters) and n (numbers).
    re2c:yyfill:enable = 0;

    // letters
    // l is the letter mode indicator; la..lz are the patterns for a..z
    l = "\u2830";
    la = "\u2801"; lb = "\u2803"; lc = "\u2809"; ld = "\u2819"; le = "\u2811";
    lf = "\u280b"; lg = "\u281b"; lh = "\u2813"; li = "\u280a"; lj = "\u281a";
    lk = "\u2805"; ll = "\u2807"; lm = "\u280d"; ln = "\u281d"; lo = "\u2815";
    lp = "\u280f"; lq = "\u281f"; lr = "\u2817"; ls = "\u280e"; lt = "\u281e";
    lu = "\u2825"; lv = "\u2827"; lw = "\u283a"; lx = "\u282d"; ly = "\u283d";
    lz = "\u2835";

    // numbers
    // n is the numeric mode indicator; n1..n0 reuse the code points of la..lj,
    // so only the current condition tells a digit from a letter
    n = "\u283c";
    n1 = "\u2801"; n2 = "\u2803"; n3 = "\u2809"; n4 = "\u2819"; n5 = "\u2811";
    n6 = "\u280b"; n7 = "\u281b"; n8 = "\u2813"; n9 = "\u280a"; n0 = "\u281a";

    // punctuation
    // note: pdsh is printed as an apostrophe by its rule below
    pcom = "\u2802"; psem = "\u2806"; pcln = "\u2812";
    pdot = "\u2832"; pxcl = "\u2816"; pqst = "\u2826";
    past = "\u2814"; pdsh = "\u2804"; phyp = "\u2824";

    // formatting
    // fcp: capital letter indicator; fsp: Braille blank or ASCII space
    // NOTE(review): fnl accepts LF or LF followed by CR; a CR LF line ending
    // would fall through to the default (error) rule - confirm this is intended
    fcp = "\u2820"; fsp = "\u2800" | "\x20"; fnl = "\n" | "\n\r";

    // default rule: anything not matched by another rule is reported and stops decoding
    <*> *      { out.err(); return; }
    // a zero unit ends decoding; it is an error unless it is the sentinel at
    // in.str[in.len] (YYCURSOR then points one past it)
    <*> "\x00" { if (YYCURSOR != in.str + in.len + 1) out.err(); return; }

    // letter mode indicator: switch to condition l, no output
    <*> l :=> l
    // in condition l each pattern prints its letter and lexing continues in l
    <l> la { out.prt('a'); goto yyc_l; }
    <l> lb { out.prt('b'); goto yyc_l; }
    <l> lc { out.prt('c'); goto yyc_l; }
    <l> ld { out.prt('d'); goto yyc_l; }
    <l> le { out.prt('e'); goto yyc_l; }
    <l> lf { out.prt('f'); goto yyc_l; }
    <l> lg { out.prt('g'); goto yyc_l; }
    <l> lh { out.prt('h'); goto yyc_l; }
    <l> li { out.prt('i'); goto yyc_l; }
    <l> lj { out.prt('j'); goto yyc_l; }
    <l> lk { out.prt('k'); goto yyc_l; }
    <l> ll { out.prt('l'); goto yyc_l; }
    <l> lm { out.prt('m'); goto yyc_l; }
    <l> ln { out.prt('n'); goto yyc_l; }
    <l> lo { out.prt('o'); goto yyc_l; }
    <l> lp { out.prt('p'); goto yyc_l; }
    <l> lq { out.prt('q'); goto yyc_l; }
    <l> lr { out.prt('r'); goto yyc_l; }
    <l> ls { out.prt('s'); goto yyc_l; }
    <l> lt { out.prt('t'); goto yyc_l; }
    <l> lu { out.prt('u'); goto yyc_l; }
    <l> lv { out.prt('v'); goto yyc_l; }
    <l> lw { out.prt('w'); goto yyc_l; }
    <l> lx { out.prt('x'); goto yyc_l; }
    <l> ly { out.prt('y'); goto yyc_l; }
    <l> lz { out.prt('z'); goto yyc_l; }

    // numeric mode indicator: switch to condition n, no output
    <*> n :=> n
    // in condition n each pattern prints its digit and lexing continues in n
    <n> n1 { out.prt('1'); goto yyc_n; }
    <n> n2 { out.prt('2'); goto yyc_n; }
    <n> n3 { out.prt('3'); goto yyc_n; }
    <n> n4 { out.prt('4'); goto yyc_n; }
    <n> n5 { out.prt('5'); goto yyc_n; }
    <n> n6 { out.prt('6'); goto yyc_n; }
    <n> n7 { out.prt('7'); goto yyc_n; }
    <n> n8 { out.prt('8'); goto yyc_n; }
    <n> n9 { out.prt('9'); goto yyc_n; }
    <n> n0 { out.prt('0'); goto yyc_n; }

    // punctuation is accepted in both conditions and always continues in
    // condition l, so it also ends a run of digits
    <*> pcom { out.prt(','); goto yyc_l; }
    <*> psem { out.prt(';'); goto yyc_l; }
    <*> pcln { out.prt(':'); goto yyc_l; }
    <*> pdot { out.prt('.'); goto yyc_l; }
    <*> pxcl { out.prt('!'); goto yyc_l; }
    <*> pqst { out.prt('?'); goto yyc_l; }
    <*> past { out.prt('*'); goto yyc_l; }
    <*> pdsh { out.prt('\''); goto yyc_l; }
    <*> phyp { out.prt('-'); goto yyc_l; }

    // formatting: the capital indicator only sets a flag that the next
    // out.prt() call consumes; spaces and newlines are printed; all three
    // continue in condition l
    <*> fcp { out.caps = true; goto yyc_l; }
    <*> fsp { out.prt(' '); goto yyc_l; }
    <*> fnl { out.prt('\n'); goto yyc_l; }
*/

// Directive that makes re2c emit the condition type definitions here; the
// lexer functions below use its yycl value as their start condition.
/*!types:re2c*/

// Decodes the UTF-8 buffer `in` and prints the result through a local out_t.
// The lexer body is generated by re2c from the shared rules block; the rule
// actions refer to the locals `in`, `out` and YYCURSOR by name.
static void lex_utf8(const iutf8_t & in)
{
    // Current read position; starts at the first byte of the buffer.
    const unsigned char *YYCURSOR = in.str;
    // Backup position for the generated code. Only this function declares it;
    // presumably it is needed because a Braille pattern spans several bytes
    // in UTF-8 — confirm against the generated code.
    const unsigned char *YYMARKER;
    // Current condition, read and written through the YYGETCONDITION /
    // YYSETCONDITION definitions below; decoding starts in condition l.
    int c = yycl;
    out_t out;
    // No encoding flag is set in this block: the documented command line
    // (re2c -cr8) already selects UTF-8.
    /*!use:re2c
        re2c:define:YYCTYPE = "unsigned char";
        re2c:define:YYGETCONDITION = "c";
        re2c:define:YYGETCONDITION:naked = 1;
        re2c:define:YYSETCONDITION = "c = @@;";
        re2c:define:YYSETCONDITION:naked = 1;
    */
}

// Decodes the UTF-16 buffer `in` and prints the result through a local out_t.
// The lexer body is generated by re2c from the shared rules block; the rule
// actions refer to the locals `in`, `out` and YYCURSOR by name.
static void lex_utf16(const iutf16_t & in)
{
    // Current read position; starts at the first 16-bit unit of the buffer.
    const unsigned short *YYCURSOR = in.str;
    // Current condition; decoding starts in condition l.
    int c = yycl;
    out_t out;
    // YYCTYPE is wider than the buffer element type (unsigned short).
    // The flags switch the encoding from UTF-8 (flag 8, turned off here) to
    // UTF-16 (flag x).
    // NOTE(review): flag settings appear to carry over from one block to the
    // next, which is why the previous encoding is explicitly disabled — confirm.
    /*!use:re2c
        re2c:define:YYCTYPE = "unsigned int";
        re2c:define:YYGETCONDITION = "c";
        re2c:define:YYGETCONDITION:naked = 1;
        re2c:define:YYSETCONDITION = "c = @@;";
        re2c:define:YYSETCONDITION:naked = 1;
        re2c:flags:8 = 0;
        re2c:flags:x = 1;
    */
}

// Decodes the UTF-32 buffer `in` and prints the result through a local out_t.
// The lexer body is generated by re2c from the shared rules block; the rule
// actions refer to the locals `in`, `out` and YYCURSOR by name.
static void lex_utf32(const iutf32_t & in)
{
    // Current read position; starts at the first 32-bit unit of the buffer.
    const unsigned int *YYCURSOR = in.str;
    // Current condition; decoding starts in condition l.
    int c = yycl;
    out_t out;
    // The flags switch the encoding from UTF-16 (flag x, enabled by the
    // UTF-16 lexer's block and turned off here) to UTF-32 (flag u).
    // NOTE(review): this relies on the blocks being processed in file order —
    // confirm before reordering the lexer functions.
    /*!use:re2c
        re2c:define:YYCTYPE = "unsigned int";
        re2c:define:YYGETCONDITION = "c";
        re2c:define:YYGETCONDITION:naked = 1;
        re2c:define:YYSETCONDITION = "c = @@;";
        re2c:define:YYSETCONDITION:naked = 1;
        re2c:flags:x = 0;
        re2c:flags:u = 1;
    */
}

// Decodes the UCS-2 buffer `in` and prints the result through a local out_t.
// The lexer body is generated by re2c from the shared rules block; the rule
// actions refer to the locals `in`, `out` and YYCURSOR by name.
static void lex_ucs2(const iucs2_t & in)
{
    // Current read position; starts at the first 16-bit unit of the buffer.
    const unsigned short *YYCURSOR = in.str;
    // Current condition; decoding starts in condition l.
    int c = yycl;
    out_t out;
    // YYCTYPE is wider than the buffer element type (unsigned short).
    // The flags switch the encoding from UTF-32 (flag u, enabled by the
    // UTF-32 lexer's block and turned off here) to UCS-2 (flag w).
    // NOTE(review): this relies on the blocks being processed in file order —
    // confirm before reordering the lexer functions.
    /*!use:re2c
        re2c:define:YYCTYPE = "unsigned int";
        re2c:define:YYGETCONDITION = "c";
        re2c:define:YYGETCONDITION:naked = 1;
        re2c:define:YYSETCONDITION = "c = @@;";
        re2c:define:YYSETCONDITION:naked = 1;
        re2c:flags:u = 0;
        re2c:flags:w = 1;
    */
}

// Decodes one sample file: opens `path` in binary mode, prints `header`,
// loads the contents into an `input` buffer and hands it to `lex`.
// A file that cannot be opened is skipped without any output.
template<typename input>
static void decode_file(const char *path, const char *header,
                        void (*lex)(const input &))
{
    FILE *stream = fopen(path, "rb");
    if (!stream) {
        return;
    }

    printf("%s", header);
    input in(stream);
    lex(in);
    fclose(stream);
}

// Runs the decoder over the four sample files, one per encoding, in the
// order UTF-8, UTF-16, UTF-32, UCS-2. Always returns 0.
int main()
{
    decode_file<iutf8_t>("06_braille.utf8.txt", "utf8:\n", lex_utf8);
    decode_file<iutf16_t>("06_braille.utf16.txt", "utf16:\n", lex_utf16);
    decode_file<iutf32_t>("06_braille.utf32.txt", "utf32:\n", lex_utf32);
    decode_file<iucs2_t>("06_braille.ucs2.txt", "ucs2:\n", lex_ucs2);

    return 0;
}

Notes:

  • The reuse mode is enabled with the -r option.
  • In the reuse mode, re2c expects a single /*!rules:re2c ... */ block followed by multiple /*!use:re2c ... */ blocks. All blocks can have their own configurations, definitions, and rules.
  • Encoding can be enabled either with a command-line option or a configuration.
  • Each encoding needs the appropriate code unit type (YYCTYPE).
  • We use conditions to switch between numeric and normal modes.

Compile:

$ re2c -cr8 -o braille.cc braille.re
$ g++ -o braille braille.cc

Run:

$ ./braille
utf8:
All human beings are born free and equal in dignity and rights.
They are endowed with reason and conscience and should act towards
one another in a spirit of brotherhood.

utf16:
All human beings are born free and equal in dignity and rights.
They are endowed with reason and conscience and should act towards
one another in a spirit of brotherhood.

utf32:
All human beings are born free and equal in dignity and rights.
They are endowed with reason and conscience and should act towards
one another in a spirit of brotherhood.

ucs2:
All human beings are born free and equal in dignity and rights.
They are endowed with reason and conscience and should act towards
one another in a spirit of brotherhood.