From 9108275af57f2f82e18f807dc0b5221521c39bcd Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Tue, 22 Aug 2017 13:49:32 +0100 Subject: [PATCH] Added some examples of generic API usage. --- src/examples/15_binsyms.re.txt | 49 ++++++++++++++++++++ src/examples/16_fake_sentinel.re.txt | 31 +++++++++++++ src/examples/17_ifstream.re.txt | 30 +++++++++++++ src/examples/example_15.rst | 67 ++++++++++++++++++++++++++++ src/examples/example_16.rst | 38 ++++++++++++++++ src/examples/example_17.rst | 32 +++++++++++++ src/examples/examples.rst | 28 +++++++++--- src/examples/ifstream.re | 30 +++++++++++++ 8 files changed, 300 insertions(+), 5 deletions(-) create mode 100644 src/examples/15_binsyms.re.txt create mode 100644 src/examples/16_fake_sentinel.re.txt create mode 100644 src/examples/17_ifstream.re.txt create mode 100644 src/examples/example_15.rst create mode 100644 src/examples/example_16.rst create mode 100644 src/examples/example_17.rst create mode 100644 src/examples/ifstream.re diff --git a/src/examples/15_binsyms.re.txt b/src/examples/15_binsyms.re.txt new file mode 100644 index 00000000..ec1c2d6e --- /dev/null +++ b/src/examples/15_binsyms.re.txt @@ -0,0 +1,49 @@ +#include +#include + +static void lex(const char *cur, const char *lim) +{ + const char *mar, *tok; +# define YYCTYPE char +# define YYPEEK() *cur +# define YYSKIP() if (++cur == lim) return; +# define YYBACKUP() mar = cur +# define YYRESTORE() cur = mar +loop: + tok = cur; + /*!re2c + re2c:yyfill:enable = 0; + + * { goto loop; } + "__" [a-zA-Z0-9_]+ { + printf("%.*s\n", (int) (cur - tok), tok); + goto loop; + } + */ +} + +int main(int argc, char **argv) +{ + if (argc < 2) { + fprintf(stderr, "no input files\n"); + return 1; + } + + FILE *file = fopen(argv[1], "rb"); + if (file == NULL) { + fprintf(stderr, "cannot open file\n"); + return 1; + } + + fseek(file, 0, SEEK_END); + const size_t fsize = (size_t) ftell(file); + fseek(file, 0, SEEK_SET); + + char *buffer = (char*) malloc(fsize); + 
fread(buffer, 1, fsize, file); + lex(buffer, buffer + fsize); + + free(buffer); + fclose(file); + return 0; +} diff --git a/src/examples/16_fake_sentinel.re.txt b/src/examples/16_fake_sentinel.re.txt new file mode 100644 index 00000000..f06ffefb --- /dev/null +++ b/src/examples/16_fake_sentinel.re.txt @@ -0,0 +1,31 @@ +#include +#include + +static int lex(const char *cur, const char *lim) +{ + const char *mar, *tok = cur; +# define YYCTYPE char +# define YYPEEK() (cur < lim ? *cur : 0) +# define YYSKIP() ++cur +# define YYBACKUP() mar = cur +# define YYRESTORE() cur = mar + /*!re2c + re2c:yyfill:enable = 0; + + * { printf("error\n"); return 1; } + [0-9a-zA-Z]+ [;] [\x00] { + printf("%.*s\n", (int) (cur - tok) - 1, tok); + return 0; + } + */ +} + +int main(int argc, char **argv) +{ + if (argc != 2) return 1; + + char *s = argv[1]; + size_t l = strlen(s); + s[l] = ';'; // overwrite terminating NULL + return lex(s, s + l + 1); +} diff --git a/src/examples/17_ifstream.re.txt b/src/examples/17_ifstream.re.txt new file mode 100644 index 00000000..69b0f18a --- /dev/null +++ b/src/examples/17_ifstream.re.txt @@ -0,0 +1,30 @@ +#include + +static void conv(std::ifstream &in, std::ofstream &out) +{ + std::streampos mar; +# define YYCTYPE char +# define YYPEEK() in.peek() +# define YYSKIP() do { in.ignore(); if (in.eof()) return; } while(0) +# define YYBACKUP() mar = in.tellg() +# define YYRESTORE() in.seekg(mar) +loop: + /*!re2c + re2c:yyfill:enable = 0; + + * { out.put(yych); goto loop; } + "\r\n" { out.put('\n'); goto loop; } + */ +} + +int main(int argc, char **argv) +{ + if (argc != 3) return 1; + + std::ifstream in(argv[1], std::ios::binary); + std::ofstream out(argv[2], std::ios::binary); + if (in.fail() || out.fail()) return 2; + + conv(in, out); + return 0; +} diff --git a/src/examples/example_15.rst b/src/examples/example_15.rst new file mode 100644 index 00000000..528e311c --- /dev/null +++ b/src/examples/example_15.rst @@ -0,0 +1,67 @@ +Strings in binaries 
+------------------- + +The program below searches for all strings starting with a double underscore in the given binary file. +The same method can be used to search for arbitrary signatures or keywords. +Since we are dealing with a *binary* file, we cannot use the sentinel method to check for the end of input: +binary files can contain all kinds of characters, so no sentinel can be chosen. +The usual way in such cases is to use ``YYLIMIT``-based checking: it requires padding input with ``YYMAXFILL`` fake characters, +but it's not a problem since the input is buffered anyway. + +However, this example takes another approach: +it uses the generic API to override the default checking mechanism. +First, it disables the usual mechanism: suppresses the generation of ``YYLESSTHAN`` and ``YYFILL`` with ``re2c:yyfill:enable = 0;`` configuration. +Second, it redefines ``YYSKIP`` to perform checking before advancing to the next input character. +In principle, this approach is less efficient: +checking happens more frequently, as ``YYSKIP`` is invoked on each input character, +while ``YYLESSTHAN`` happens only once per strongly connected component of the automaton. +However, it avoids the need for padding. + +:download:`[binsyms.re] <15_binsyms.re.txt>` + +.. literalinclude:: 15_binsyms.re.txt + :language: cpp + :linenos: + +Compile: + +.. code-block:: bash + + $ re2c --input custom -o binsyms.cc binsyms.re + $ g++ -o binsyms binsyms.cc + +Run: + +.. 
code-block:: bash + + $ ./binsyms binsyms + __gmon_start__ + __libc_start_main + __off_t + __cxx11 + __gnu_cxx3divExx + __off64_t + __pad1 + __pad2 + __pad3 + __pad4 + __pad5 + __compar_fn_t + __gnu_cxx + __init_array_start + __libc_csu_fini + __libc_csu_init + __init_array_end + __GNU_EH_FRAME_HDR + __init_array_end + __init_array_start + __libc_csu_fini + __gmon_start__ + __libc_start_main + __data_start + __TMC_END__ + __dso_handle + __libc_csu_init + __bss_start + + diff --git a/src/examples/example_16.rst b/src/examples/example_16.rst new file mode 100644 index 00000000..918c9be7 --- /dev/null +++ b/src/examples/example_16.rst @@ -0,0 +1,38 @@ +Fake sentinel +------------- + +This example explores the case when we know the *length* of input, +but there is no terminating character and buffering is not possible. +In such cases we cannot use the usual sentinel method; and we cannot use the ``YYLIMIT``-based method as it requires ``YYMAXFILL`` padding. +The choice then is to use the generic API: +disable the default checking mechanism with ``re2c:yyfill:enable = 0;`` +and use one of the primitives ``YYPEEK`` and ``YYSKIP`` to check for the end of input. + +In this example we use ``YYPEEK`` to emulate a *fake sentinel*: +every time the lexer peeks a new character, it first checks for the end of input: +if it has already been reached, ``YYPEEK`` returns ``NULL`` (though the actual string has no terminating ``NULL``). +Checking on every ``YYPEEK`` is less efficient than the usual sentinel method +(which performs no checking at all), but it can be more efficient than copying input to a buffer and padding it with a real sentinel character. + +Note that the fake sentinel method also relies on the fact that the sentinel cannot appear in the middle of well-formed input. +If the input can contain arbitrary characters, then one should utilize ``YYSKIP`` as shown in `this example `_. + +:download:`[fake_sentinel.re] <16_fake_sentinel.re.txt>` + +.. 
literalinclude:: 16_fake_sentinel.re.txt + :language: cpp + :linenos: + +Compile: + +.. code-block:: bash + + $ re2c --input custom -o fake_sentinel.cc fake_sentinel.re + $ g++ -o fake_sentinel fake_sentinel.cc + +Run: + +.. code-block:: bash + + $ ./fake_sentinel somestring + somestring; diff --git a/src/examples/example_17.rst b/src/examples/example_17.rst new file mode 100644 index 00000000..d3cd85f8 --- /dev/null +++ b/src/examples/example_17.rst @@ -0,0 +1,32 @@ +std::ifstream +------------- + +This example shows how to override the re2c input mechanism: +instead of reading input characters from a buffer in memory, read them directly from a file using the STL ``std::ifstream`` class. +Note that we use ``tellg`` / ``seekg`` and rely on the ability to move backward and forward in the input stream: +this might not be possible, for example with the ``stdin`` stream. +The program below converts Windows-style line endings ``CR LF`` to Unix-style line endings ``LF``. + +This program uses a non-standard way of checking for the end of input: +it disables the usual checking mechanism with ``re2c:yyfill:enable = 0;`` +(this suppresses the generation of ``YYLESSTHAN`` and ``YYFILL``) +and puts the responsibility for checking on ``YYSKIP``. +This results in more frequent checks: ``YYSKIP`` happens on each input character, +while ``YYLESSTHAN`` happens only once per strongly connected component of the automaton. +However, this method avoids padding, which would require buffering input and nullify all advantages of direct-file input. + + + +:download:`[ifstream.re] <17_ifstream.re.txt>` + +.. literalinclude:: 17_ifstream.re.txt + :language: cpp + :linenos: + +Compile: + +.. 
code-block:: bash + + $ re2c --input custom -o ifstream.cc ifstream.re + $ g++ -o ifstream ifstream.cc + diff --git a/src/examples/examples.rst b/src/examples/examples.rst index a8f8e5c8..0dad5fca 100644 --- a/src/examples/examples.rst +++ b/src/examples/examples.rst @@ -19,17 +19,28 @@ Examples URI (RFC-3986) HTTP (RFC-7230) Braille patterns + Strings in binaries + Fake sentinel + std::ifstream Examples have been written with two goals in mind. First, they are practical: each example solves a distinct real-world problem, ranging from simple recognizers to complex parsers conforming to real-world standards and specifications. Second, examples show various aspects of using re2c API: - Checking for the end of input: - simple and efficient `sentinel method `_ - which should be used when it is possible to mark the end of input with a *sentinel character* that never appears in the middle of well-formed input; - and the more complex `general method `_ based on ``YYLIMIT`` - which requires apending ``YYMAXFILL`` characters of padding at the end of input. + Checking for the end of input: this can be done in a number of different ways. + The simplest and the most efficient way is the sentinel method demonstrated by `lexing numbers `_ example: + it should be used when there is a *sentinel character* that never appears in the middle of well-formed input, + such as the ``NULL`` character in null-terminated strings. + If the input is buffered, sentinel should be `appended at the end of buffer `_. + If appending is not possible, one can `emulate fake sentinel `_ using generic API. + Another, more general (but also less efficient) method is based on comparison of current input position and the end position: + that is, comparison of ``YYCURSOR`` and ``YYLIMIT`` as explained in `parsing strings `_ example, + or using ``YYLESSTHAN`` in case of generic API. 
+ By default, this method requires padding input with ``YYMAXFILL`` fake characters; + if padding is undesirable or impossible, one can override the checking mechanism using generic API + and `perform checks on each input character `_ + (also used in `std::ifstream `_ example). Handling `large input `_: how to organize buffering and how to refill buffer with ``YYFILL``. Some additional details of handling *tags* in ``YYFILL`` are illustrated @@ -45,6 +56,13 @@ Second, examples show various aspects of using re2c API: using *m-tags* to handle repeated submatch and store repeated values efficiently in the form of a prefix tree (outlined by parsing non-recursive `records and structures `_ and also used in parsing `HTTP messages `_). + Using `generic API `_, + either to override the input mechanism (outlined by `std::ifstream `_ example), + or to tweak it (as explained in + `fake sentinel `_ and + `strings in binaries `_ + examples). + Switching between different lexing *modes* using multiple interrelated sub-lexers: either in semi-automated manner with re2c `conditions <../manual/features/conditions/conditions.html>`_ feature (outlined by a simple example of `parsing integers `_ and also used in parsing `Braille patterns `_), diff --git a/src/examples/ifstream.re b/src/examples/ifstream.re new file mode 100644 index 00000000..69b0f18a --- /dev/null +++ b/src/examples/ifstream.re @@ -0,0 +1,30 @@ +#include + +static void conv(std::ifstream &in, std::ofstream &out) +{ + std::streampos mar; +# define YYCTYPE char +# define YYPEEK() in.peek() +# define YYSKIP() do { in.ignore(); if (in.eof()) return; } while(0) +# define YYBACKUP() mar = in.tellg() +# define YYRESTORE() in.seekg(mar) +loop: + /*!re2c + re2c:yyfill:enable = 0; + + * { out.put(yych); goto loop; } + "\r\n" { out.put('\n'); goto loop; } + */ +} + +int main(int argc, char **argv) +{ + if (argc != 3) return 1; + + std::ifstream in(argv[1], std::ios::binary); + std::ofstream out(argv[2], std::ios::binary); + if 
(in.fail() || out.fail()) return 2; + + conv(in, out); + return 0; +} -- 2.40.0