Parsing integers (conditions)

This example does exactly the same thing as the "Parsing integers (multiple re2c blocks)" example, but in a slightly different manner: it uses re2c conditions instead of blocks. Conditions make it possible to encode multiple interconnected lexers within a single re2c block.

[05_parsing_integers_conditions.re]

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#include <limits.h>
#include <stdio.h>

/*
 * Append one digit to an accumulated value: u = u * base + d.
 *
 * The update is performed only if the result fits into unsigned long.
 * Returns true on success; returns false (leaving u untouched) when
 * appending the digit would exceed ULONG_MAX.
 */
template<int base>
static bool adddgt(unsigned long &u, unsigned int d)
{
    /* Largest value of u for which u * base + d still fits. */
    const unsigned long limit = (ULONG_MAX - d) / base;

    if (u <= limit) {
        u = u * base + d;
        return true;
    }
    return false;
}

/*!types:re2c*/

/*
 * Parse the NUL-terminated string `s` as an unsigned integer literal and
 * store its value in `u`.
 *
 * The lexer is a single re2c block with five conditions:
 *   init - inspects the prefix and selects the radix:
 *            '0b' followed by [01]          -> bin
 *            "0"                            -> oct
 *            (nothing) followed by [1-9]    -> dec
 *            '0x' followed by [0-9a-fA-F]   -> hex
 *          The part after `/` is trailing context: it must be present but
 *          is left in the input for the next condition to consume.
 *   bin, oct, dec, hex - consume one digit per match, fold it into `u`
 *          with adddgt<radix>(), and loop by jumping back to the same
 *          condition label (yyc_bin, yyc_oct, ...).
 *
 * Returns true when the terminating "\x00" is reached in one of the digit
 * conditions. Returns false when any other input is seen (the `<*> *`
 * fallback rule) or when adddgt() reports that the next digit would
 * overflow unsigned long. `u` is reset to 0 on entry; on failure it holds
 * whatever was accumulated before the error.
 *
 * NOTE(review): single-quoted re2c literals ('0b', '0x') are expected to be
 * case-insensitive, which matches the sample run accepting "0B0" - confirm
 * against the re2c version in use.
 */
static bool lex(const char *s, unsigned long &u)
{
    /* Scratch pointers referenced by the code re2c generates; YYCTXMARKER is
       presumably needed because the init rules use `/` trailing context. */
    const char *YYMARKER;
    const char *YYCTXMARKER;
    /* Current condition; read and written through the YYGETCONDITION /
       YYSETCONDITION definitions configured below. Starts in `init`. */
    int c = yycinit;
    u = 0;
    /* In the digit rules `s` is YYCURSOR, so after a match it points just
       past the consumed character and s[-1] is the digit itself. */
    /*!re2c
        re2c:yyfill:enable = 0;
        re2c:define:YYCTYPE = char;
        re2c:define:YYCURSOR = s;
        re2c:define:YYGETCONDITION = "c";
        re2c:define:YYGETCONDITION:naked = 1;
        re2c:define:YYSETCONDITION = "c = @@;";
        re2c:define:YYSETCONDITION:naked = 1;

        <*> * { return false; }

        <init> '0b' / [01]        :=> bin
        <init> "0"                :=> oct
        <init> "" / [1-9]         :=> dec
        <init> '0x' / [0-9a-fA-F] :=> hex

        <bin, oct, dec, hex> "\x00" { return true; }
        <bin> [01]  { if (!adddgt<2>(u, s[-1] - '0')) return false; goto yyc_bin; }
        <oct> [0-7] { if (!adddgt<8>(u, s[-1] - '0')) return false; goto yyc_oct; }
        <dec> [0-9] { if (!adddgt<10>(u, s[-1] - '0')) return false; goto yyc_dec; }
        <hex> [0-9] { if (!adddgt<16>(u, s[-1] - '0'))      return false; goto yyc_hex; }
        <hex> [a-f] { if (!adddgt<16>(u, s[-1] - 'a' + 10)) return false; goto yyc_hex; }
        <hex> [A-F] { if (!adddgt<16>(u, s[-1] - 'A' + 10)) return false; goto yyc_hex; }
    */
}

/*
 * Run lex() on every command-line argument in order, printing the parsed
 * value on success or "error" when the argument is rejected.
 */
int main(int argc, char **argv)
{
    int idx = 1;
    while (idx < argc) {
        const char *arg = argv[idx];
        ++idx;

        unsigned long value;
        if (!lex(arg, value)) {
            printf("error\n");
            continue;
        }
        printf("%lu\n", value);
    }
    return 0;
}

Notes:

  • Conditions are enabled with -c option.
  • Conditions are only syntactic sugar; they can be translated into multiple blocks.
  • Each condition is a standalone lexer (DFA).
  • Each condition has a unique identifier: /*!types:re2c*/ tells re2c to generate an enumeration of all identifiers (names are prefixed with yyc by default). The lexer uses YYGETCONDITION to get the identifier of the current condition and YYSETCONDITION to set it.
  • Each condition has a unique label (prefixed with yyc_ by default).
  • Conditions are connected: transitions are allowed between the final states of one condition and the start state of another condition (but not between inner states of different conditions). The generated code starts with a dispatch on the current condition. Actions can either jump back to the initial dispatch or jump directly to any condition.
  • Rule <*> is merged into all conditions (low priority).
  • Rules with multiple conditions are merged into each listed condition (normal priority).
  • :=> jumps directly to the next condition (bypassing the initial dispatch).

Generate, compile and run:

$ re2c -c -o example.cc 05_parsing_integers_conditions.re
$ g++ -o example example.cc
$ ./example 0 12345678901234567890 0xFFFFffffFFFFffff 0x1FFFFffffFFFFffff 0xAbcDEf 0x00 007 0B0 0b110101010 ""
0
12345678901234567890
18446744073709551615
error
11259375
0
7
0
426
error