From ed2a435971a247543f8119147d69b9487a09f3ee Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Mon, 26 Oct 2015 13:16:45 +0000 Subject: [PATCH] Added example YYFILL example and fixed YYMAXFILL example. --- src/examples.rst | 201 ++++++++++++++++++----- src/examples/02_recognizing_strings.re | 6 +- src/examples/03_arbitrary_large_input.re | 107 ++++++++++++ src/manual.rst | 9 +- 4 files changed, 276 insertions(+), 47 deletions(-) create mode 100644 src/examples/03_arbitrary_large_input.re diff --git a/src/examples.rst b/src/examples.rst index 6f3be08a..627b200f 100644 --- a/src/examples.rst +++ b/src/examples.rst @@ -1,7 +1,7 @@ -.. _Recognizing integer literals: +.. _Recognizing integers: the sentinel method: -Recognizing integer literals ----------------------------- +Recognizing integers: the sentinel method +----------------------------------------- This program simply loops over its commad-line arguments and tries to match each argument against one of the four patterns: @@ -17,21 +17,22 @@ A couple of things should be noted: * Default case (when none of the patterns matched) is handled properly (line 17). * Check for the end of input is disabled (line 9). - In this case it is safe, because all arguments are ``NULL``-terminated - and none of the rules matches ``NULL`` in the middle: - when lexer reaches end of input, it will see ``NULL`` and stop. + In this case we can use ``NULL`` character as a sentinel: + all arguments are ``NULL``-terminated and none of the rules matches ``NULL`` in the middle. + Lexer will inevitably stop when it sees ``NULL``. It's a common practice to use ``re2c:yyfill:enable = 0;`` in cases when input character set is restricted and one special - character can be chosen to indicate end of input. - **But do make sure that the terminating character is not allowed in the middle of a rule!** + character can be chosen to indicate the end of input. + **But do make sure that the sentinel character is not allowed in the middle of a rule!** * ``YYMARKER`` (line 6) is needed because rules overlap: it backups input position of the longest successful match. Say, we have overlapping rules ``"a"`` and ``"abc"`` and input string ``"abd"``: - by the time ``"a"`` matches there's still a chance to match ``"abc"``. - But when lexer sees ``'d'`` it must rollback. - (You might wonder why the programmer has to bother with ``YYMARKER`` at all: couldn't re2c generate a local variable ``YYMARKER``? - The thing is, all input pointers must be updated syncronously by ``YYFILL``). + by the time ``"a"`` matches there's still a chance to match ``"abc"``, + but when lexer sees ``'d'`` it must rollback. + (You might wonder why ``YYMARKER`` is exposed at all: why not make it a local variable like ``yych``? + The reason is, all input pointers must be updated by ``YYFILL`` + as explained in example `Arbitrary large input and YYFILL`_.) Generate, compile and run: @@ -61,29 +62,40 @@ Recognizing strings: the need for YYMAXFILL ------------------------------------------- This example is about recognizing strings. -Strings may contain *any* characters in the range [0 - 0xFF] except quotes (quotes should be escaped). -It means that (unlike the previous example, `Recognizing integer literals`_) -we cannot use ``NULL`` or any other character as a terminator. -We must explicitely check for end of input. +Strings may contain *any* characters in the range ``[0 - 0xFF]`` except quotes ``"`` and escape ``\`` (they must be escaped). +It means that (unlike the previous example, `Recognizing integers: the sentinel method`_) +we cannot use ``NULL`` or any other character as a sentinel: +strings like ``"aha\0ha"\0`` are perfectly valid, but ``"aha\0`` is also possible and shouldn't crash lexer. -So how does it work? -The simplest possible way is to check on each character (right before advancing input position). -But this is very slow. -Instead, re2c estimates maximal lexeme length ``YYMAXFILL`` (disregarding loops) -and generates check if there's at least ``YYMAXFILL`` characters left: +By default re2c-generated lexers contain explicit checks for the end of input +(these checks can be suppressed with ``re2c:yyfill:enable = 0;`` configuration). +A naive approach is to check on each character (before advancing input position), but it's very slow. +Instead, re2c inserts checks only at certain points in the generated program. +Each check ensures that there is enough input to proceed until the next checkpoint. +If the check fails, lexer calls ``YYFILL``: -.. code-block:: cpp - - if ((YYLIMIT - YYCURSOR) < YYMAXFILL) - YYFILL(YYMAXFILL); + ``if ((YYLIMIT - YYCURSOR) < n) YYFILL(n);`` ``YYLIMIT`` must point at the end of input (so that ``YYLIMIT[-1]`` is the last input character). -These checks are inserted at start and before loops. -If there's not enough input characters, the generated lexer calls ``YYFILL(YYMAXFILL)`` -so that the programmer can supply more input or stop. - -The common practice is to pad input with ``YYMAXFILL`` fake characters. -**The padding should not form a valid lexeme suffix!** +``YYFILL(n)`` can either supply at least ``n`` more input characters or stop +(see example `Arbitrary large input and YYFILL`_ for details about ``YYFILL`` implementation). + +For those interested in the internal re2c algorithm used to determine checkpoints, +here is a quote from the original paper +`"RE2C: a more versatile scanner generator" <1994_bumbulis_cowan_re2c_a_more_versatile_scanner_generator.pdf>`_ +by Peter Bumbulis, Donald D. Cowan, 1994, ACM Letters on Programming Languages and Systems (LOPLAS): + + *A set of key states can be determined by discovering the strongly-connected components (SCCs) of the + DFA. An SCC is a maximal subset of states such that there exists a path from any state in the subset to any + other. The set of key states consists of all of the states in non-trivial SCCs, together with the start state. + Note that for each SCC S, we actually only have to include a subset of states of S such that when the subset + is removed, S becomes acyclic.* + +This approach reduces the number of checks significantly, but it has a downside. +Since the lexer checks for multiple characters at once, the last few input characters may become unreachable. +Common hack is to pad input with a few fake characters that **do not form a valid lexeme or lexeme suffix**. +The length of padding depends on the maximal argument to ``YYFILL`` +(this value is called ``YYMAXFILL`` and can be generated using ``/*!max:re2c*/`` directive). .. include:: examples/02_recognizing_strings.re :code: cpp @@ -91,15 +103,16 @@ The common practice is to pad input with ``YYMAXFILL`` fake characters. Notes: -* ``/*!max:re2c*/`` (line 4) tells re2c to generate ``#define YYMAXFILL ``. -* Input string is padded with ``YYMAXFILL`` zeroes (line 15). - Zeroes do not form a valid lexeme suffix (but padding with quotes would confuse the lexer ``;)``). +* ``/*!max:re2c*/`` (line 4) tells re2c to generate ``#define YYMAXFILL n``. +* Input string is padded with ``YYMAXFILL`` characters ``'a'`` (line 15). + Sequence of ``'a'`` does not form a valid lexeme suffix (but padding like ``"\0`` would cause false match on incorrect input like ``"aha``). +* ``YYLIMIT`` points to the end of padding (line 26). * ``YYFILL`` simply stops: there's nothing more to lex (line 30). * We have to use ``re2c:define:YYFILL:naked = 1;`` (line 31) in order to suppress passing parameter to ``YYFILL``. (It was an unfortunate idea to make ``YYFILL`` a call expression by default: ``YYFILL`` has to stop the lexer eventually, that's why it has to be a macro and not a function. - One should either set ``re2c:define:YYFILL:naked = 1;``, or define ``YYFILL(n)`` as a macro.) + One should either set ``re2c:define:YYFILL:naked = 1;`` or define ``YYFILL(n)`` as a macro.) Generate, compile and run: @@ -107,18 +120,128 @@ Generate, compile and run: $ re2c -o example.cc example.re $ g++ -o example example.cc - $ ./example \"a\ momentary\" \"\" \"lap\"se\" \"of \"rea\\\"son\" + $ ./example \"a\ momentary\" \"\" \"lap\"se\" \"of \"rea\\\"son\" "" str: "a momentary" str: "" err: "lap"se" err: "of str: "rea\"son" + err: .. _Arbitrary large input and YYFILL: Arbitrary large input and YYFILL -------------------------------- -Suppose the input cannot be mapped in memory at once. -The usual thing to do is to allocate a reasonably sized buffer and to read -input chunk by chunk. +Suppose that for some reason the whole input cannot be mapped in memory: +either it is very big or its size cannot be determined in advance. +The usual thing to do in such case is to allocate a buffer +and process input in chunks that fit into buffer. +For that we will need ``YYFILL``. + +See the previous example (`Recognizing strings: the need for YYMAXFILL`_) +for details about program points and conditions that trigger ``YYFILL`` invocation. +The idea of ``YYFILL`` is fairly simple: lexer is stuck upon the fact that +``(YYLIMIT - YYCURSOR) < n`` and ``YYFILL`` must either invert this condition or stop lexing. +Disaster will happen if ``YYFILL`` fails to provide at least ``n`` characters, yet resumes lexing. + +Technically ``YYFILL`` must somehow "extend" input for at least ``n`` characters: +after ``YYFILL`` all input pointers must point to exact same characters, +except ``YYLIMIT``: it must be advanced at least ``n`` positions. +Since we want to use a fixed amount of memory, we have to shift buffer contents: +discard characters that are already lexed, +move the remaining characters at the beginning of the buffer +and fill the vacant space with new characters. +All the pointers must be decreased by the length of discarded input, +except ``YYLIMIT`` (it must point at the end of buffer): + +.. code-block:: bash + + <--- discarded --> <----- n -----> + oxxxxxxxxxxxxxxxxxo----------o-----------------o--------o.....o..........o... (more input) + buffer lexeme YYMARKER YYCURSOR YYLIMIT * * + | * * * | * * + | * * * *| * + | * * * * | * + | * * * * | * + | * * * * |* + o-----------o-----------------o--------------o----------o + buffer, YYMARKER YYCURSOR YYLIMIT + lexeme + +End of input is a special case: as explained in the previous example (`Recognizing strings: the need for YYMAXFILL`_), +the input must be padded with ``YYMAXFILL`` fake characters. +In this case ``YYLIMIT`` must point at the end of padding: + +.. code-block:: bash + + <--- discarded --> <----- n -----> + oxxxxxxxxxxxxxxxxxo----------o-----------------o---o0000000000000000o + buffer lexeme YYMARKER YYCURSOR YYLIMIT * + | * * * * | * + | * * * * | * + | * * * * | * + | * * * * *| + | * * * <-- YYMAXFILL -->* | + o-----------o-----------------o---o0000000000000000o | + buffer, YYMARKER YYCURSOR YYLIMIT + lexeme + +Which part of input can be discarded? +The answer is, all input up to the leftmost meaningful pointer. +Intuitively it seems that it must be ``YYMARKER``: it backups input position of the latest match, +so it's always less than or equal to ``YYCURSOR``. +However, ``YYMARKER`` usage depends on the input: +not all control flow paths in lexer ever initialize it. +Thus for some inputs ``YYMARKER`` is meaningless +and should be used with care. + +In practice input rarely consists of one giant lexeme: it is usually a sequence of small lexemes. +In that case lexer runs in a loop and it is convenient to have a special "lexeme start" pointer. +It can be used as boundary in ``YYFILL``. + +Our example program will lex a file with strings (probably separated by whitespaces) +and count the total number of strings: + +.. include:: examples/03_arbitrary_large_input.re + :code: cpp + :number-lines: + +Notes: + +* ``YYMAXFILL`` bytes at the end of buffer are reserved for padding (line 9). + This memory is unused most of the time, but ``YYMAXFILL`` is usually negligably small compared to buffer size. + +* There is only one successsful way out (line 71): + lexer must recognize a standalone end of input lexeme ``'a'`` right at the beginning of padding. + Unlike the sentinel method, ``'a'`` in the middle of a rule is not recognized as end of input. + Standalone ``'a'`` in input (not in padding) is a lexing error. + ``YYFILL`` failure is also a lexing error: if the input was correct, + lexer should have stopped at the beginning of padding. + +* ``YYFILL`` may fail for two reasons: + either there is no more input (line 30), + or lexeme is too big: it occupies the whole buffer and nothing can be discarded (line 35). + We treat both cases as error, but a real-world program might handle them differently + (e.g. resize buffer in the second case). + +* ``@@`` in ``YYFILL`` definition (line 63) is a formal parameter: re2c substitutes it with the actual argument to ``YYFILL``. + +* There is a special ``tok`` pointer: it points at the beginning of lexeme (line 57) + and serves as a boundary in ``YYFILL`` (line 33). + +Generate, compile and run: + +.. code-block:: bash + + $ re2c -o example.cc example.re + $ g++ -o example example.cc + $ cat > input.txt + "a""momentary" + "lap\"se" "o\\f" + + "rea""son" + "" + $ ./example input.txt + glorious 7 strings! + diff --git a/src/examples/02_recognizing_strings.re b/src/examples/02_recognizing_strings.re index f0b39d3e..6346c9ce 100644 --- a/src/examples/02_recognizing_strings.re +++ b/src/examples/02_recognizing_strings.re @@ -8,11 +8,11 @@ struct input_t { char *str; input_t(const char *s) - : len(strlen(s)) + : len(strlen(s) + 1) , str(new char[len + YYMAXFILL]) { memcpy(str, s, len); - memset(str + len, 0, YYMAXFILL); + memset(str + len, 'a', YYMAXFILL); } ~input_t() { @@ -31,7 +31,7 @@ static const char *lex(const input_t & input) re2c:define:YYFILL:naked = 1; end = "\x00"; - str = "\"" ([^"] | "\\\"")* "\""; + str = "\"" ([^"\\] | "\\" ["\\])* "\""; * { return "err"; } str end { return "str"; } diff --git a/src/examples/03_arbitrary_large_input.re b/src/examples/03_arbitrary_large_input.re new file mode 100644 index 00000000..c3d8b562 --- /dev/null +++ b/src/examples/03_arbitrary_large_input.re @@ -0,0 +1,107 @@ +#include +#include +#include + +/*!max:re2c*/ +static const size_t SIZE = 1024; + +struct input_t { + char buf[SIZE + YYMAXFILL]; + char *lim; + char *cur; + char *mar; + char *tok; + bool eof; + + FILE *const file; + + input_t(FILE *f) + : buf() + , lim(buf + SIZE) + , cur(lim) + , mar(lim) + , tok(lim) + , eof(false) + , file(f) + {} + bool fill(size_t need) + { + if (eof) { + return false; + } + + const size_t free = tok - buf; + if (free < need) { + return false; + } + + memmove(buf, tok, lim - tok); + lim -= free; + cur -= free; + mar -= free; + tok -= free; + lim += fread(lim, 1, free, file); + if (lim < buf + SIZE) { + eof = true; + memset(lim, 'a', YYMAXFILL); + lim += YYMAXFILL; + } + return true; + } +}; + +static int lex(input_t & input) +{ + int count = 0; + for (;;) { + input.tok = input.cur; + /*!re2c + re2c:define:YYCTYPE = char; + re2c:define:YYCURSOR = input.cur; + re2c:define:YYMARKER = input.mar; + re2c:define:YYLIMIT = input.lim; + re2c:define:YYFILL = "if (!input.fill(@@)) return -1;"; + re2c:define:YYFILL:naked = 1; + + end = "a"; + str = "\"" ([^"\\] | "\\" ["\\])* "\""; + wsp = [ \t\n\r]+; + + * { return -1; } + end { return (input.lim - input.tok == YYMAXFILL) ? count : -1; } + wsp { continue; } + str { + if (count == INT_MAX) { + return -2; + } + ++count; + continue; + } + */ + } +} + +int main(int argc, char **argv) +{ + if (argc != 2) { + printf ("usage: ./example \n"); + return 1; + } + + FILE *file = fopen(argv[1], "rb"); + if (!file) { + printf("error: cannot open file: %s\n", argv[1]); + return 1; + } + + input_t input(file); + const int count = lex(input); + switch (count) { + case -1: printf("lexing error\n"); break; + case -2: printf("lines overflow\n"); break; + default: printf("glorious %d strings!\n", count); break; + } + + fclose(file); + return 0; +} diff --git a/src/manual.rst b/src/manual.rst index 2530ce07..0c0c5556 100644 --- a/src/manual.rst +++ b/src/manual.rst @@ -38,11 +38,10 @@ Authors ------- Originally written by Peter Bumbulis (peter@csg.uwaterloo.ca) -and described in research article *"RE2C: a more versatile scanner generator" -by Peter Bumbulis, Donald D. Cowan, -1994, -ACM Letters on Programming Languages and Systems (LOPLAS)* -`[PDF] <1994_bumbulis_cowan_re2c_a_more_versatile_scanner_generator.pdf>`_. +and described in research article +`"RE2C: a more versatile scanner generator" <1994_bumbulis_cowan_re2c_a_more_versatile_scanner_generator.pdf>`_ +by Peter Bumbulis, Donald D. Cowan, 1994, +ACM Letters on Programming Languages and Systems (LOPLAS). Since then many people have contributed to re2c: -- 2.40.0