Push model

By default re2c generates pull-model lexers: it assumes that the lexer runs without interruption and calls YYFILL to “pull” more input whenever it needs it. In some cases it may be necessary to generate a push-model lexer instead: one that stops when it runs out of input and returns control to the outer program. Later, when the outer program has obtained more input, it resumes the lexer, which continues lexing from the point where it stopped.

In order to function in this manner, the lexer must be able to store its inner state before returning to the caller. This can be done with the re2c -f (--storable-state) option described here. The example below reads chunks of input from stdin and counts the number of words in it. Note that the parsing loop is located in the main function, and YYFILL merely returns instead of refilling the buffer. The lexer state is represented by the variables state, yych and yyaccept. Dispatch on state is generated with the help of the /*!getstate:re2c*/ directive. In this example explicit use of the directive is necessary, because we need to put entry code between the state dispatch and the lexer start. If the directive is omitted, re2c emits the state dispatch right before the lexer start (in this case yy0 should be used as the start label).

[push_model.re]

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#include <stdio.h>
#include <string.h>

// The directive below is processed by re2c; it is expected to expand into the
// definition of YYMAXFILL that the buffer code relies on (see the re2c manual).
/*!max:re2c*/
// Capacity of the input buffer in bytes, not counting the YYMAXFILL bytes
// reserved after it for end-of-input padding.
static const size_t SIZE = 4096;

// Input buffer together with the lexer state that has to survive between
// successive calls to the lexer.
struct input_t {
    char buf[SIZE + YYMAXFILL]; // SIZE data bytes plus room for YYMAXFILL padding bytes
    char *lim;                  // end of the buffered data (used as YYLIMIT)
    char *cur;                  // current read position (used as YYCURSOR)
    char *tok;                  // start of the token currently being lexed
    int state;                  // stored lexer state; -1 until the lexer sets it
    unsigned need;              // character count passed to the most recent YYFILL
    unsigned yyaccept;          // part of the stored lexer state (not referenced by the code shown here)
    char yych;                  // stored current character (bound via re2c:variable:yych)

    // Starts with no buffered data: lim, cur and tok all point at buf + SIZE,
    // so the first fill() frees the entire buffer.
    input_t()
        : buf()
        , lim(buf + SIZE)
        , cur(buf + SIZE)
        , tok(buf + SIZE)
        , state(-1)
        , need(0)
        , yyaccept(0)
        , yych(0)
    {}

    // Drops everything in front of `tok` and reads more bytes from stdin into
    // the space freed at the end of the buffer.
    //
    // Returns false, leaving the buffer untouched, when fewer than `need`
    // bytes can be freed; returns true otherwise.
    bool fill()
    {
        // Bytes in front of the current token can be discarded.
        const size_t shift = tok - buf;
        if (shift < need) {
            return false;
        }

        // Slide the data starting at tok down to the beginning of buf and
        // move all three pointers along with it.
        memmove(buf, tok, SIZE - shift);
        lim -= shift;
        cur -= shift;
        tok -= shift;

        // Read up to `shift` bytes into the vacated tail.
        const size_t got = fread(lim, 1, shift, stdin);
        lim += got;

        // Short read: append YYMAXFILL zero bytes after the data.
        if (lim < buf + SIZE) {
            memset(lim, 0, YYMAXFILL);
            lim += YYMAXFILL;
        }
        return true;
    }
};

// Result of one call to the lexer.
enum status_t {
    OK,             // the lexer reached the terminating NUL byte
    FAIL,           // the lexer hit input that matches no other rule
    NEED_MORE_INPUT // the lexer ran out of buffered input (returned from YYFILL)
};

// Lexes the input buffered in `in`, adding one to `words` for every run of
// ASCII letters it matches.
//
// Returns OK when a NUL byte is matched, FAIL when the input matches none of
// the other rules, and NEED_MORE_INPUT when the lexer invokes YYFILL; in that
// last case the requested character count is left in in.need and the lexer
// state in in.state, so the caller can refill the buffer and call lex() again.
static status_t lex(input_t &in, unsigned &words)
{
    // Lexer state is read from and written to in.state.
#   define YYGETSTATE()  in.state
#   define YYSETSTATE(s) in.state = s
    // Instead of refilling the buffer, record how much input was requested
    // and hand control back to the caller.
#   define YYFILL(n)     do { in.need = n; return NEED_MORE_INPUT; } while (0)
    // State dispatch is emitted here, ahead of the `loop` entry code below.
    /*!getstate:re2c*/
loop:
    // Mark the start of the next token; fill() keeps the data from tok onward.
    in.tok = in.cur;
    /*!re2c
        re2c:define:YYCTYPE  = char;
        re2c:define:YYCURSOR = in.cur;
        re2c:define:YYLIMIT  = in.lim;
        re2c:variable:yych   = in.yych;

        *         { return FAIL; }
        [\x00]    { return OK; }
        [\n ]+    { goto loop; }
        [a-zA-Z]+ { ++words; goto loop; }
    */
}

// Drives the lexer: calls lex() repeatedly, refilling the buffer from stdin
// whenever the lexer asks for more input.
//
// Exit status: 0 after printing the word count, 1 on a lexing error,
// 2 when the buffer cannot be refilled.
int main()
{
    input_t in;
    unsigned words = 0;

    for (;;) {
        const status_t st = lex(in, words);

        // end of input: print result
        if (st == OK) {
            printf("\nword count: %u\n", words);
            return 0;
        }

        // unexpected error: abort
        if (st == FAIL) {
            printf("\nerror\n");
            return 1;
        }

        // get more input and continue
        if (!in.fill()) {
            printf("\nsmall buffer\n");
            return 2;
        }
    }
}

Compile:

$ re2c -f -o push_model.cc push_model.re
$ g++ -o push_model push_model.cc

Run:

$ ./push_model
Lorem ipsum dolor sit amet^D
word count: 5