Braille patterns
This example is about encoding support in re2c. It’s a partial decoder from Grade-1 (uncontracted) Unicode English Braille to plain English. Grade-1 Braille is quite simple (compared to Grade-2 Braille). Patterns map directly to symbols (letters, digits, and punctuators) except for a couple of special patterns: the numeric mode indicator (⠼), the letter mode indicator (⠰), the capital letter indicator (⠠) and some others, which we omit here for the sake of simplicity (as well as a few ambiguous punctuation patterns). Grade-2 Braille allows contractions; those obey some rather complex rules (like those of a natural language) and are much harder to implement.
The input may be encoded in UTF-8, UTF-16, UTF-32, or UCS-2: all of these encodings are capable of representing Braille patterns (code points [0x2800 - 0x28ff]).
We use the -r option to reuse the same block of re2c rules with different encodings.
The input to the example is the same Braille message in different encodings: braille.utf8.txt, braille.utf16.txt, braille.utf32.txt and braille.ucs2.txt:
⠠⠁⠇⠇⠀⠓⠥⠍⠁⠝⠀⠃⠑⠊⠝⠛⠎⠀⠁⠗⠑⠀⠃⠕⠗⠝⠀⠋⠗⠑⠑⠀⠁⠝⠙⠀⠑⠟⠥⠁⠇⠀⠊⠝⠀⠙⠊⠛⠝⠊⠞⠽⠀⠁⠝⠙⠀⠗⠊⠛⠓⠞⠎⠲⠀ ⠠⠞⠓⠑⠽⠀⠁⠗⠑⠀⠑⠝⠙⠕⠺⠑⠙⠀⠺⠊⠞⠓⠀⠗⠑⠁⠎⠕⠝⠀⠁⠝⠙⠀⠉⠕⠝⠎⠉⠊⠑⠝⠉⠑⠀⠁⠝⠙⠀⠎⠓⠕⠥⠇⠙⠀⠁⠉⠞⠀⠞⠕⠺⠁⠗⠙⠎⠀ ⠕⠝⠑⠀⠁⠝⠕⠞⠓⠑⠗⠀⠊⠝⠀⠁⠀⠎⠏⠊⠗⠊⠞⠀⠕⠋⠀⠃⠗⠕⠞⠓⠑⠗⠓⠕⠕⠙⠲
It means “All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood.”
// re2c $INPUT -o $OUTPUT -cri
#include <assert.h>
#include <ctype.h>
#include <stdio.h>
// Owns the contents of a file as an array of code units of type `char_t`,
// followed by a single zero code unit that acts as the end-of-input sentinel.
template<typename char_t>
struct input_t {
    size_t len;   // number of code units actually read (sentinel excluded)
    char_t *str;  // heap buffer holding `len` code units plus the sentinel

    // Reads `f` from its beginning to its end. The size is measured by
    // seeking to the end of the stream; trailing bytes that do not fill a
    // whole code unit are ignored. If the size cannot be determined the
    // buffer is left empty (only the sentinel).
    input_t(FILE *f) : len(0), str(NULL)
    {
        long bytes = -1;
        if (fseek(f, 0, SEEK_END) == 0) {
            bytes = ftell(f);
        }
        // ftell() reports failure as -1; dividing that by sizeof() would
        // wrap to an enormous unsigned length, so treat it as empty input.
        if (bytes < 0 || fseek(f, 0, SEEK_SET) != 0) {
            bytes = 0;
        }

        const size_t want = static_cast<size_t>(bytes) / sizeof(char_t);
        str = new char_t[want + 1];

        // Trust the count fread() reports rather than the count requested:
        // after a short read the sentinel must directly follow the last
        // valid code unit, not a run of uninitialised ones.
        len = fread(str, sizeof(char_t), want, f);
        str[len] = 0;
    }

    ~input_t() { delete[] str; }

private:
    // Not copyable: a member-wise copy would make two objects delete[] the
    // same buffer. Declared but intentionally left undefined.
    input_t(const input_t &);
    input_t &operator=(const input_t &);
};
// Input buffer types, one per supported encoding; they differ only in the
// code unit type used to slice the file.
// NOTE(review): this presumes unsigned short is 16 bits and unsigned int is
// 32 bits on the target platform — confirm before porting.
typedef input_t<unsigned char> iutf8_t;
typedef input_t<unsigned short> iutf16_t;
typedef input_t<unsigned int> iutf32_t;
// UCS-2 uses the same code unit type as UTF-16.
typedef input_t<unsigned short> iucs2_t;
// Output sink for the decoder: writes decoded characters to stderr and
// remembers whether the next character has to be capitalised.
struct out_t {
    bool caps;  // set by the capital indicator; consumed by the next prt()

    out_t() : caps(false) {}

    // Writes `c` to stderr, upper-cased when `caps` is set, then clears
    // `caps` so that the indicator affects exactly one character.
    void prt(char c)
    {
        // <ctype.h> functions are only defined for values representable as
        // unsigned char (or EOF); a negative plain char must not be passed.
        const int ch = static_cast<unsigned char>(c);
        fputc(caps ? toupper(ch) : ch, stderr);
        caps = false;
    }

    // Writes the decoder's error marker to stderr; `caps` is left untouched.
    void err()
    {
        fputs(" ... error\n", stderr);
    }
};
// Shared rule set for the Braille decoder. It is written once here and
// instantiated by each lexer function with its own code unit type and
// encoding. The actions refer to the lexer's local names `in`, `out`, `c`
// and YYCURSOR.
/*!rules:re2c
// No buffer refilling: the whole input is in memory and ends with a zero
// sentinel.
re2c:yyfill:enable = 0;
re2c:api:style = free-form;
re2c:encoding:utf8 = 1;
// The current condition lives in the lexer's local variable `c`.
re2c:define:YYGETCONDITION = "c";
re2c:define:YYSETCONDITION = "c = @@;";
// letters: `l` is the letter mode indicator, la..lz are the letter patterns
l = "\u2830";
la = "\u2801"; lb = "\u2803"; lc = "\u2809"; ld = "\u2819"; le = "\u2811";
lf = "\u280b"; lg = "\u281b"; lh = "\u2813"; li = "\u280a"; lj = "\u281a";
lk = "\u2805"; ll = "\u2807"; lm = "\u280d"; ln = "\u281d"; lo = "\u2815";
lp = "\u280f"; lq = "\u281f"; lr = "\u2817"; ls = "\u280e"; lt = "\u281e";
lu = "\u2825"; lv = "\u2827"; lw = "\u283a"; lx = "\u282d"; ly = "\u283d";
lz = "\u2835";
// numbers: `n` is the numeric mode indicator; n1..n0 use the same code
// points as la..lj, which is why digits and letters are told apart by
// condition rather than by pattern
n = "\u283c";
n1 = "\u2801"; n2 = "\u2803"; n3 = "\u2809"; n4 = "\u2819"; n5 = "\u2811";
n6 = "\u280b"; n7 = "\u281b"; n8 = "\u2813"; n9 = "\u280a"; n0 = "\u281a";
// punctuation
pcom = "\u2802"; psem = "\u2806"; pcln = "\u2812";
pdot = "\u2832"; pxcl = "\u2816"; pqst = "\u2826";
past = "\u2814"; pdsh = "\u2804"; phyp = "\u2824";
// formatting: capital indicator, space (blank Braille cell or ASCII space),
// newline
// NOTE(review): "\n\r" is LF followed by CR; a Windows line ending would be
// "\r\n" — confirm which one is intended.
fcp = "\u2820"; fsp = "\u2800" | "\x20"; fnl = "\n" | "\n\r";
// Default rule for every condition: a code unit no other rule accepts is
// reported as an error and decoding stops.
<*> * { out.err(); return; }
// A zero code unit ends decoding. It is the expected end of input only when
// the cursor has just stepped over the sentinel stored at in.str[in.len];
// a zero found anywhere earlier is reported as an error.
<*> "\x00" { if (YYCURSOR != in.str + in.len + 1) out.err(); return; }
// The letter indicator switches to condition `l` from any condition.
<*> l :=> l
// In condition `l` each letter pattern prints its letter and stays in `l`.
<l> la { out.prt('a'); goto yyc_l; }
<l> lb { out.prt('b'); goto yyc_l; }
<l> lc { out.prt('c'); goto yyc_l; }
<l> ld { out.prt('d'); goto yyc_l; }
<l> le { out.prt('e'); goto yyc_l; }
<l> lf { out.prt('f'); goto yyc_l; }
<l> lg { out.prt('g'); goto yyc_l; }
<l> lh { out.prt('h'); goto yyc_l; }
<l> li { out.prt('i'); goto yyc_l; }
<l> lj { out.prt('j'); goto yyc_l; }
<l> lk { out.prt('k'); goto yyc_l; }
<l> ll { out.prt('l'); goto yyc_l; }
<l> lm { out.prt('m'); goto yyc_l; }
<l> ln { out.prt('n'); goto yyc_l; }
<l> lo { out.prt('o'); goto yyc_l; }
<l> lp { out.prt('p'); goto yyc_l; }
<l> lq { out.prt('q'); goto yyc_l; }
<l> lr { out.prt('r'); goto yyc_l; }
<l> ls { out.prt('s'); goto yyc_l; }
<l> lt { out.prt('t'); goto yyc_l; }
<l> lu { out.prt('u'); goto yyc_l; }
<l> lv { out.prt('v'); goto yyc_l; }
<l> lw { out.prt('w'); goto yyc_l; }
<l> lx { out.prt('x'); goto yyc_l; }
<l> ly { out.prt('y'); goto yyc_l; }
<l> lz { out.prt('z'); goto yyc_l; }
// The numeric indicator switches to condition `n` from any condition.
<*> n :=> n
// In condition `n` each digit pattern prints its digit and stays in `n`.
<n> n1 { out.prt('1'); goto yyc_n; }
<n> n2 { out.prt('2'); goto yyc_n; }
<n> n3 { out.prt('3'); goto yyc_n; }
<n> n4 { out.prt('4'); goto yyc_n; }
<n> n5 { out.prt('5'); goto yyc_n; }
<n> n6 { out.prt('6'); goto yyc_n; }
<n> n7 { out.prt('7'); goto yyc_n; }
<n> n8 { out.prt('8'); goto yyc_n; }
<n> n9 { out.prt('9'); goto yyc_n; }
<n> n0 { out.prt('0'); goto yyc_n; }
// Punctuation is accepted in every condition and always continues at the
// letter-condition label, so it also ends a run of digits.
<*> pcom { out.prt(','); goto yyc_l; }
<*> psem { out.prt(';'); goto yyc_l; }
<*> pcln { out.prt(':'); goto yyc_l; }
<*> pdot { out.prt('.'); goto yyc_l; }
<*> pxcl { out.prt('!'); goto yyc_l; }
<*> pqst { out.prt('?'); goto yyc_l; }
<*> past { out.prt('*'); goto yyc_l; }
// Despite its name, pdsh is printed as an apostrophe.
<*> pdsh { out.prt('\''); goto yyc_l; }
<*> phyp { out.prt('-'); goto yyc_l; }
// The capital indicator prints nothing; it only arms out.caps for the next
// printed character.
<*> fcp { out.caps = true; goto yyc_l; }
<*> fsp { out.prt(' '); goto yyc_l; }
<*> fnl { out.prt('\n'); goto yyc_l; }
*/
// re2c generates the condition type here; it supplies the `yycl` value the
// lexers below start from.
/*!types:re2c*/
// Decodes the UTF-8 encoded Braille text in `in`, writing through a local
// out_t. The body is generated by re2c from the shared rules block; the
// local names below are the ones those rules and their actions refer to.
static void lex_utf8(const iutf8_t & in)
{
// YYMARKER is declared only in this lexer.
// NOTE(review): presumably only the multi-byte UTF-8 automaton needs a
// backtracking marker — confirm against the generated code.
const unsigned char *YYCURSOR = in.str, *YYMARKER;
// Current condition, accessed via YYGETCONDITION / YYSETCONDITION; decoding
// starts in the letter condition.
int c = yycl;
out_t out;
/*!use:re2c
// One byte per code unit.
re2c:define:YYCTYPE = "unsigned char";
re2c:encoding:utf8 = 1;
*/
}
// Decodes the UTF-16 encoded Braille text in `in`, writing through a local
// out_t. The body is generated by re2c from the shared rules block; the
// local names below are the ones those rules and their actions refer to.
static void lex_utf16(const iutf16_t & in)
{
const unsigned short *YYCURSOR = in.str;
// Current condition, accessed via YYGETCONDITION / YYSETCONDITION; decoding
// starts in the letter condition.
int c = yycl;
out_t out;
/*!use:re2c
// NOTE(review): YYCTYPE is wider than the unsigned short units YYCURSOR
// points at; presumably each unit is widened when read — confirm.
re2c:define:YYCTYPE = "unsigned int";
re2c:encoding:utf16 = 1;
*/
}
// Decodes the UTF-32 encoded Braille text in `in`, writing through a local
// out_t. The body is generated by re2c from the shared rules block; the
// local names below are the ones those rules and their actions refer to.
static void lex_utf32(const iutf32_t & in)
{
const unsigned int *YYCURSOR = in.str;
// Current condition, accessed via YYGETCONDITION / YYSETCONDITION; decoding
// starts in the letter condition.
int c = yycl;
out_t out;
/*!use:re2c
// The code unit type matches the element type YYCURSOR points at.
re2c:define:YYCTYPE = "unsigned int";
re2c:encoding:utf32 = 1;
*/
}
// Decodes the UCS-2 encoded Braille text in `in`, writing through a local
// out_t. The body is generated by re2c from the shared rules block; the
// local names below are the ones those rules and their actions refer to.
static void lex_ucs2(const iucs2_t & in)
{
const unsigned short *YYCURSOR = in.str;
// Current condition, accessed via YYGETCONDITION / YYSETCONDITION; decoding
// starts in the letter condition.
int c = yycl;
out_t out;
/*!use:re2c
// NOTE(review): YYCTYPE is wider than the unsigned short units YYCURSOR
// points at; presumably each unit is widened when read — confirm.
re2c:define:YYCTYPE = "unsigned int";
re2c:encoding:ucs2 = 1;
*/
}
// Opens `path` for binary reading. On failure the reason is reported on
// stderr and NULL is returned.
static FILE *open_input(const char *path)
{
    FILE *f = fopen(path, "rb");
    if (!f) {
        perror(path);
    }
    return f;
}

// Decodes the same Braille message from four files, one per encoding,
// printing a heading before each decoded text. Returns 0 on success and 1 as
// soon as an input file cannot be opened.
//
// The files are opened with explicit checks rather than inside assert():
// when NDEBUG is defined assert() discards its argument, so the fopen() call
// would vanish and `f` would be used uninitialised.
int main()
{
    FILE *f;

    f = open_input("braille.utf8.txt");
    if (!f) {
        return 1;
    }
    fprintf(stderr, "utf8:\n");
    iutf8_t in8(f);
    lex_utf8(in8);
    fclose(f);

    f = open_input("braille.utf16.txt");
    if (!f) {
        return 1;
    }
    fprintf(stderr, "utf16:\n");
    iutf16_t in16(f);
    lex_utf16(in16);
    fclose(f);

    f = open_input("braille.utf32.txt");
    if (!f) {
        return 1;
    }
    fprintf(stderr, "utf32:\n");
    iutf32_t in32(f);
    lex_utf32(in32);
    fclose(f);

    f = open_input("braille.ucs2.txt");
    if (!f) {
        return 1;
    }
    fprintf(stderr, "ucs2:\n");
    iucs2_t in2(f);
    lex_ucs2(in2);
    fclose(f);

    return 0;
}
Compile as re2c -cr -o braille.cc braille.re. Notes:
- The reuse mode is enabled with the -r option.
- In the reuse mode, re2c expects a single /*!rules:re2c ... */ block followed by multiple /*!use:re2c ... */ blocks. All blocks can have their own configurations, definitions, and rules.
- Encoding can be enabled either with a command-line option or a configuration.
- Each encoding needs the appropriate code unit type (YYCTYPE).
- We use conditions to switch between numeric and normal modes.