Strings in binaries

The program below searches for all strings starting with a double underscore in the given binary file. The same method can be used to search for arbitrary signatures or keywords. Since we are dealing with a binary file, we cannot use the sentinel method to check for the end of input: binary files can contain all kinds of characters, so no sentinel can be chosen. The usual way in such cases is to use YYLIMIT-based checking: it requires padding the input with YYMAXFILL fake characters, but this is not a problem since the input is buffered anyway.

However, this example takes another approach: it uses the generic API to override the default checking mechanism. First, it disables the usual mechanism: it suppresses the generation of YYLESSTHAN and YYFILL with the re2c:yyfill:enable = 0; configuration. Second, it redefines YYSKIP to perform checking before advancing to the next input character. In principle, this approach is less efficient: checking happens more frequently, as YYSKIP is invoked on each input character, while YYLESSTHAN is invoked only once per strongly connected component of the automaton. However, it makes it possible to avoid padding.

[binsyms.re]

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#include <stdlib.h>
#include <stdio.h>

/*
 * Scans the byte range [cur, lim) and prints every match of the rule
 * "__" [a-zA-Z0-9_]+ on its own line on stdout.
 *
 *   cur  first byte of the input
 *   lim  one past the last byte of the input
 *
 * The input does not have to be NUL-terminated or padded: YYFILL is
 * disabled in the re2c block and the end of input is detected inside
 * YYSKIP instead.
 *
 * NOTE(review): because YYSKIP leaves the function as soon as the limit
 * is reached, a match that extends to the very last byte of the input
 * appears to be dropped without running its action -- confirm against
 * the generated code if trailing symbols matter.
 */
static void lex(const char *cur, const char *lim)
{
    const char *mar, *tok;
#   define YYCTYPE     char
#   define YYPEEK()    *cur
    /* Advance one byte and stop lexing once the limit is reached. */
#   define YYSKIP()    if (++cur == lim) return;
#   define YYBACKUP()  mar = cur
#   define YYRESTORE() cur = mar

    /* YYSKIP tests for the limit only after advancing, so an empty range
       has to be rejected up front: otherwise YYPEEK would read the byte
       at lim and the equality test in YYSKIP would never become true. */
    if (cur == lim) return;
loop:
    /* Remember where the current token starts so the action can print it. */
    tok = cur;
    /*!re2c
        re2c:yyfill:enable = 0;

        * { goto loop; }
        "__" [a-zA-Z0-9_]+ {
            printf("%.*s\n", (int) (cur - tok), tok);
            goto loop;
        }
    */
}

/*
 * Reads the file named by argv[1] into memory and passes its contents to
 * lex(). Returns 0 on success and 1 if no file was given or if the file
 * could not be opened, measured, buffered or read.
 */
int main(int argc, char **argv)
{
    if (argc < 2) {
        fprintf(stderr, "no input files\n");
        return 1;
    }

    FILE *file = fopen(argv[1], "rb");
    if (file == NULL) {
        fprintf(stderr, "cannot open file\n");
        return 1;
    }

    int status = 1;
    char *buffer = NULL;

    /* Determine the file size by seeking to the end; ftell() reports
       failure as -1, which must not be turned into a huge size_t. */
    long fsize = -1;
    if (fseek(file, 0, SEEK_END) == 0) {
        fsize = ftell(file);
    }

    if (fsize < 0 || fseek(file, 0, SEEK_SET) != 0) {
        fprintf(stderr, "cannot determine file size\n");
    } else if (fsize == 0) {
        /* Empty file: nothing to scan. */
        status = 0;
    } else {
        buffer = (char*) malloc((size_t) fsize);
        if (buffer == NULL) {
            fprintf(stderr, "cannot allocate memory\n");
        } else {
            const size_t nread = fread(buffer, 1, (size_t) fsize, file);
            if (ferror(file)) {
                fprintf(stderr, "cannot read file\n");
            } else {
                /* Scan only the bytes that were actually read, and never
                   hand the lexer an empty range. */
                if (nread > 0) {
                    lex(buffer, buffer + nread);
                }
                status = 0;
            }
        }
    }

    free(buffer);
    fclose(file);
    return status;
}

Compile:

$ re2c --input custom -o binsyms.cc binsyms.re
$ g++ -o binsyms binsyms.cc

Run:

$ ./binsyms binsyms
__gmon_start__
__libc_start_main
__off_t
__cxx11
__gnu_cxx3divExx
__off64_t
__pad1
__pad2
__pad3
__pad4
__pad5
__compar_fn_t
__gnu_cxx
__init_array_start
__libc_csu_fini
__libc_csu_init
__init_array_end
__GNU_EH_FRAME_HDR
__init_array_end
__init_array_start
__libc_csu_fini
__gmon_start__
__libc_start_main
__data_start
__TMC_END__
__dso_handle
__libc_csu_init
__bss_start