From: Ulya Trofimovich Date: Fri, 26 Jul 2019 09:32:50 +0000 (+0100) Subject: Added paper "Efficient POSIX Submatch Extraction on NFA". X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=e30f9d5bafefe6fa59d862e0d20cc34f41569412;p=re2c Added paper "Efficient POSIX Submatch Extraction on NFA". --- diff --git a/src/about/2019_borsotti_trofimovich_efficient_posix_submatch_extraction_on_nfa.pdf b/src/about/2019_borsotti_trofimovich_efficient_posix_submatch_extraction_on_nfa.pdf new file mode 100644 index 00000000..6da28a4a Binary files /dev/null and b/src/about/2019_borsotti_trofimovich_efficient_posix_submatch_extraction_on_nfa.pdf differ diff --git a/src/manual/features/dot/utf8_any.re b/src/manual/features/dot/utf8_any.re new file mode 100644 index 00000000..96668a86 --- /dev/null +++ b/src/manual/features/dot/utf8_any.re @@ -0,0 +1,4 @@ +/*!re2c + * {} + [^] {} +*/ diff --git a/src/manual/features/eof/01_sentinel.re b/src/manual/features/eof/01_sentinel.re new file mode 100644 index 00000000..69accb28 --- /dev/null +++ b/src/manual/features/eof/01_sentinel.re @@ -0,0 +1,25 @@ +#include + +static int lex(const char *YYCURSOR) +{ + int count = 0; +loop: + /*!re2c + re2c:define:YYCTYPE = char; + re2c:yyfill:enable = 0; + + * { return -1; } + [\x00] { return count; } + [a-z]+ { ++count; goto loop; } + [ ]+ { goto loop; } + + */ +} + +int main() +{ + assert(lex("") == 0); + assert(lex("one two three") == 3); + assert(lex("one two 123?") == -1); + return 0; +} diff --git a/src/manual/features/eof/02_bounds_checking.re b/src/manual/features/eof/02_bounds_checking.re new file mode 100644 index 00000000..d155f7f6 --- /dev/null +++ b/src/manual/features/eof/02_bounds_checking.re @@ -0,0 +1,46 @@ +#include +#include +#include + +/*!max:re2c*/ + +static int lex(const char *str) +{ + const size_t len = strlen(str); + char *buf = malloc(len + YYMAXFILL); + memcpy(buf, str, len); + memset(buf + len, 0, YYMAXFILL); + + const char *YYCURSOR = buf; + const char 
*YYLIMIT = buf + len + YYMAXFILL; + int count = 0; + +loop: + /*!re2c + re2c:define:YYCTYPE = char; + re2c:define:YYFILL:naked = 1; + re2c:define:YYFILL = "goto error;"; + + * { goto error; } + [\x00] { if (YYCURSOR == YYLIMIT) goto end; else goto error; } + [a-z]+ { ++count; goto loop; } + ['] ([^'] | [\\]['])* ['] { ++count; goto loop; } + [ ]+ { goto loop; } + + */ +error: + count = -1; +end: + free(buf); + return count; +} + +int main() +{ + assert(lex("") == 0); + assert(lex("one two three") == 3); + assert(lex("one two 123?") == -1); + assert(lex("one 'two' 'th\\'ree' '123?' ''") == 5); + assert(lex("one 'two' 'three") == -1); + return 0; +} diff --git a/src/manual/features/eof/03_eof_rule.re b/src/manual/features/eof/03_eof_rule.re new file mode 100644 index 00000000..aed0e21d --- /dev/null +++ b/src/manual/features/eof/03_eof_rule.re @@ -0,0 +1,33 @@ +#include +#include + +static int lex(const char *str) +{ + const char *YYCURSOR = str; + const char *YYLIMIT = str + strlen(str); + int count = 0; + +loop: + /*!re2c + re2c:define:YYCTYPE = char; + re2c:yyfill:enable = 0; + re2c:eof = 0; + + * { return -1; } + $ { return count; } + [a-z]+ { ++count; goto loop; } + ['] ([^'] | [\\]['])* ['] { ++count; goto loop; } + [ ]+ { goto loop; } + + */ +} + +int main() +{ + assert(lex("") == 0); + assert(lex("one two three") == 3); + assert(lex("one two 123?") == -1); + assert(lex("one 'two' 'th\\'ree' '123?' 
''") == 5); + assert(lex("one 'two' 'three") == -1); + return 0; +} diff --git a/src/manual/features/eof/04_generic_api.re b/src/manual/features/eof/04_generic_api.re new file mode 100644 index 00000000..7afc6736 --- /dev/null +++ b/src/manual/features/eof/04_generic_api.re @@ -0,0 +1,35 @@ +#include +#include + +#define YYPEEK() *cur +#define YYSKIP() if (++cur > lim) return -1 +static int lex(const char *str) +{ + const char *cur = str; + const char *lim = str + strlen(str) + 1; + int count = 0; + +loop: + /*!re2c + re2c:define:YYCTYPE = char; + re2c:yyfill:enable = 0; + re2c:flags:input = custom; + + * { return -1; } + [\x00] { return cur == lim ? count : -1; } + [a-z]+ { ++count; goto loop; } + ['] ([^'] | [\\]['])* ['] { ++count; goto loop; } + [ ]+ { goto loop; } + + */ +} + +int main() +{ + assert(lex("") == 0); + assert(lex("one two three") == 3); + assert(lex("one two 123?") == -1); + assert(lex("one 'two' 'th\\'ree' '123?' ''") == 5); + assert(lex("one 'two' 'three") == -1); + return 0; +} diff --git a/src/manual/features/eof/eof.rst b/src/manual/features/eof/eof.rst new file mode 100644 index 00000000..f1fdebc7 --- /dev/null +++ b/src/manual/features/eof/eof.rst @@ -0,0 +1,131 @@ +Re2c provides a number of ways to handle the end-of-input situation. Which way to +use depends on the complexity of regular expressions, performance +considerations, the need for input buffering and various other factors. EOF +handling is probably the most complex part of re2c user interface --- it +definitely requires a bit of understanding of how the generated lexer works. +But in return it allows the user to customize lexer for a particular environment +and avoid the unnecessary overhead of generic methods when a simpler method is +sufficient. 
Roughly speaking, there are four main methods: + +- using sentinel symbol (simple and efficient, but limited) +- bounds checking with padding (generic, but complex) +- EOF rule: a combination of sentinel symbol and bounds checking (generic and + simple, can be more or less efficient than bounds checking with padding + depending on the grammar) +- using generic API (user-defined, so may be incorrect ;]) + +Using sentinel symbol +--------------------- +This is the simplest and the most efficient method. It is applicable in cases +when the input is small enough to fit into a continuous memory buffer and there +is a natural "sentinel" symbol --- a code unit that is not allowed by any of the +regular expressions in grammar (except possibly as a terminating character). +Sentinel symbol never appears in well-formed input, therefore it can be appended +at the end of input and used as a stop signal by the lexer. A good example of +such input is a null-terminated C-string, provided that the grammar does not +allow ``NULL`` in the middle of lexemes. Sentinel method is very efficient, +because the lexer does not need to perform any additional checks for the end of +input --- it comes naturally as a part of processing the next character. + +Below is an example of using sentinel method. Configuration +``re2c:yyfill:enable = 0;`` suppresses generation of end-of-input checks and +``YYFILL`` calls. + +.. literalinclude:: /manual/features/eof/01_sentinel.re + :language: c + +Bounds checking with padding +---------------------------- + +Bounds checking is a generic method: it can be used with any input grammar. +The basic idea is simple: we need to check for the end of input before reading +the next input character. However, if implemented in a straightforward way, this +would be quite inefficient: checking on each input character would cause a major +slowdown. 
Re2c avoids slowdown by generating checks only in certain key states +of the lexer, and letting it run without checks in-between the key states. +More precisely, re2c computes strongly connected components (SCCs) of +the underlying DFA (which roughly correspond to loops), and generates only a few +checks per each SCC (usually just one, but in general enough to make the SCC +acyclic). The check is of the form ``(YYLIMIT - YYCURSOR) < n``, where ``n`` +is the maximal length of a simple path in the corresponding SCC. If this +condition is true, the lexer calls ``YYFILL(n)``, which must either supply at +least ``n`` input characters, or not return. When the lexer continues after +the check, it is certain that the next ``n`` characters can be read safely +without checks. + +This approach reduces the number of checks significantly (and makes the lexer +much faster as a result), but it has a downside. Since the lexer checks for +multiple characters at once, it may end up in a situation when there are a few +remaining input characters (less than ``n``) corresponding to a short path in +the SCC, but the lexer cannot proceed because of the check, and ``YYFILL`` +cannot supply more characters because it is the end of input. To solve this +problem, re2c requires that additional padding consisting of fake characters is +appended at the end of input. The length of padding should be ``YYMAXFILL``, +which is equal to the maximum ``n`` parameter to ``YYFILL`` and must be generated +by re2c using ``/*!max:re2c*/`` directive. The fake characters should not form a +valid lexeme suffix, otherwise the lexer may be fooled into matching a fake +lexeme. Usually it's a good idea to use ``NULL`` characters for padding. + +Below is an example of using bounds checking with padding. Note that the grammar +rule for single-quoted strings allows arbitrary symbols in the middle of lexeme, +so there is no natural sentinel in the grammar. 
Strings like ``"aha\0ha"`` are +perfectly valid, but ill-formed strings like ``"aha\0`` are also possible and +shouldn’t crash the lexer. In this example we do not use buffer refilling, +therefore ``YYFILL`` definition simply returns an error. Note that ``YYFILL`` +will only be called after the lexer reaches padding, because only then will the +check condition be satisfied. + +.. literalinclude:: /manual/features/eof/02_bounds_checking.re + :language: c + +EOF rule +-------- + +EOF rule ``$`` was introduced in version 1.2. It is a hybrid approach that tries +to take the best of both worlds: simplicity and efficiency of the sentinel +method combined with the generality of bounds-checking method. The idea is to +appoint an arbitrary symbol to be the sentinel, and only perform further bounds +checking if the sentinel symbol matches (more precisely, if the symbol class that +contains it matches). The check is of the form ``YYLIMIT <= YYCURSOR``. +If this condition is not satisfied, then the sentinel is just an ordinary input +character and the lexer continues. Otherwise this is a real sentinel, and the +lexer calls ``YYFILL()``. If ``YYFILL`` returns zero, the lexer assumes that it +has more input and tries to re-match. Otherwise ``YYFILL`` returns non-zero and +the lexer knows that it has reached the end of input. At this point there are +three possibilities. First, it might have already matched a shorter lexeme --- +in this case it just rolls back to the last accepting state. Second, it might +have consumed some characters, but failed to match --- in this case it falls +back to default rule ``*``. Finally, it might be in the initial state --- in +this (and only this!) case it matches EOF rule ``$``. + +Below is an example of using EOF rule. Configuration ``re2c:yyfill:enable = 0;`` +suppresses generation of ``YYFILL`` calls (but not the bounds checks). + +.. 
literalinclude:: /manual/features/eof/03_eof_rule.re + :language: c + +Using generic API +----------------- + +Generic API can be used with any of the above methods. It also allows using a +user-defined method by placing EOF checks in one of the basic primitives. +Usually this is either ``YYSKIP`` (the check is performed when advancing to the +next input character), or ``YYPEEK`` (the check is performed when reading the +next input character). The resulting methods are inefficient, as they check on +each input character. However, they can be useful in cases when the input cannot +be buffered or padded and does not contain a sentinel character at the end. One +should be cautious when using such ad-hoc methods, as it is easy to overlook +some corner cases and come up with a method that only partially works. Also it +should be noted that not everything can be expressed via generic API: for +example, it is impossible to reimplement the way EOF rule works (in particular, +it is impossible to re-match the character after a successful ``YYFILL``). + +Below is an example of using ``YYSKIP`` to perform bounds checking without +padding. ``YYFILL`` generation is suppressed using ``re2c:yyfill:enable = 0;`` +configuration. Note that if the grammar was more complex, this method might not +work in case when two rules overlap and EOF check fails after a shorter lexeme +has already been matched (as it happens in our example, there are no overlapping +rules). + +.. 
literalinclude:: /manual/features/eof/04_generic_api.re + :language: c diff --git a/src/manual/features/fill/01_fill.re b/src/manual/features/fill/01_fill.re new file mode 100644 index 00000000..93966784 --- /dev/null +++ b/src/manual/features/fill/01_fill.re @@ -0,0 +1,74 @@ +#include +#include + +/*!max:re2c*/ +#define SIZE 4096 + +typedef struct { + FILE *file; + char buf[SIZE + YYMAXFILL], *lim, *cur, *tok; + int eof; +} Input; + +static int fill(Input *in, size_t need) +{ + if (in->eof) { + return 1; + } + const size_t free = in->tok - in->buf; + if (free < need) { + return 2; + } + memmove(in->buf, in->tok, in->lim - in->tok); + in->lim -= free; + in->cur -= free; + in->tok -= free; + in->lim += fread(in->lim, 1, free, in->file); + if (in->lim < in->buf + SIZE) { + in->eof = 1; + memset(in->lim, 0, YYMAXFILL); + in->lim += YYMAXFILL; + } + return 0; +} + +static void init(Input *in, FILE *file) +{ + in->file = file; + in->cur = in->tok = in->lim = in->buf + SIZE; + in->eof = 0; + fill(in, 1); +} + +#define YYFILL(n) if (fill(in, n) != 0) return -1 +static int lex(Input *in) +{ + int count = 0; +loop: + in->tok = in->cur; + /*!re2c + re2c:define:YYCTYPE = char; + re2c:define:YYCURSOR = in->cur; + re2c:define:YYLIMIT = in->lim; + + * { return -1; } + [\x00] { return (YYMAXFILL == in->lim - in->tok) ? 
count : -1; } + [a-z]+ { ++count; goto loop; } + ['] ([^'] | [\\]['])* ['] { ++count; goto loop; } + [ ]+ { goto loop; } + + */ +} + +int main() +{ + FILE *f = fopen("input.txt", "rb"); + if (!f) return 1; + + Input in; + init(&in, f); + printf("count: %d\n", lex(&in)); + + fclose(f); + return 0; +} diff --git a/src/manual/features/fill/02_fill.re b/src/manual/features/fill/02_fill.re new file mode 100644 index 00000000..d6dcafc4 --- /dev/null +++ b/src/manual/features/fill/02_fill.re @@ -0,0 +1,71 @@ +#include +#include + +#define SIZE 4096 + +typedef struct { + FILE *file; + char buf[SIZE + 1], *lim, *cur, *tok; + int eof; +} Input; + +static int fill(Input *in) +{ + if (in->eof) { + return 1; + } + const size_t free = in->tok - in->buf; + if (free < 1) { + return 2; + } + memmove(in->buf, in->tok, in->lim - in->tok); + in->lim -= free; + in->cur -= free; + in->tok -= free; + in->lim += fread(in->lim, 1, free, in->file); + in->lim[0] = 0; + in->eof |= in->lim < in->buf + SIZE; + return 0; +} + +static void init(Input *in, FILE *file) +{ + in->file = file; + in->cur = in->tok = in->lim = in->buf + SIZE; + in->eof = 0; + fill(in); +} + +#define YYFILL() fill(in) +static int lex(Input *in) +{ + int count = 0; +loop: + in->tok = in->cur; + /*!re2c + re2c:define:YYCTYPE = char; + re2c:define:YYCURSOR = in->cur; + re2c:define:YYLIMIT = in->lim; + re2c:eof = 0; + + * { return -1; } + $ { return count; } + [a-z]+ { ++count; goto loop; } + ['] ([^'] | [\\]['])* ['] { ++count; goto loop; } + [ ]+ { goto loop; } + + */ +} + +int main() +{ + FILE *f = fopen("input.txt", "rb"); + if (!f) return 1; + + Input in; + init(&in, f); + printf("count: %d\n", lex(&in)); + + fclose(f); + return 0; +} diff --git a/src/manual/features/fill/fill.rst b/src/manual/features/fill/fill.rst new file mode 100644 index 00000000..e06ceb87 --- /dev/null +++ b/src/manual/features/fill/fill.rst @@ -0,0 +1,123 @@ +The need for buffering arises when the input cannot be mapped in memory all at +once: 
either it is too large, or it comes in a streaming fashion (like reading +from a socket). The usual technique in such cases is to allocate a fixed-size +memory buffer and process input in chunks that fit into the buffer. When the +current chunk is processed, it is moved out and new data is moved in. In +practice it is somewhat more complex, because lexer state consists not of a +single input position, but a set of interrelated positions: + +- cursor: the next input character to be read (``YYCURSOR`` in default API or + ``YYSKIP``/``YYPEEK`` in generic API) + +- limit: the position after the last available input character (``YYLIMIT`` in + default API, implicitly handled by ``YYLESSTHAN`` in generic API) + +- marker: the position of the most recent match, if any (``YYMARKER`` in default + API or ``YYBACKUP``/``YYRESTORE`` in generic API) + +- token: the start of the current lexeme (implicit in re2c API, as it is not + needed for the normal lexer operation and can be defined and updated by the + user) + +- context marker: the position of the trailing context (``YYCTXMARKER`` in + default API or ``YYBACKUPCTX``/``YYRESTORECTX`` in generic API) + +- tag variables: submatch positions (defined with ``/*!stags:re2c*/`` and + ``/*!mtags:re2c*/`` directives and + ``YYSTAGP``/``YYSTAGN``/``YYMTAGP``/``YYMTAGN`` in generic API) + +Not all of these are used in every case, but if used, they must be updated by +``YYFILL``. All active positions are contained in the segment between token and +cursor, therefore everything between buffer start and token can be discarded, +the segment from token and up to limit should be moved to the beginning of +buffer, and the free space at the end of buffer should be filled with new data. +In order to avoid frequent ``YYFILL`` calls it is best to fill in as many input +characters as possible (even though fewer characters might suffice to resume the +lexer). 
The details of ``YYFILL`` implementation are slightly different +depending on which EOF handling method is used: the case of EOF rule is somewhat +simpler than the case of bounds-checking with padding. Also note that if +``-f --storable-state`` option is used, ``YYFILL`` has slightly different +semantics (described in the section about storable state). + +YYFILL with EOF rule +-------------------- + +If EOF rule is used, ``YYFILL`` is a function-like primitive that accepts +no arguments and returns a value which is checked against zero. ``YYFILL`` +invocation is triggered by condition ``YYLIMIT <= YYCURSOR`` in default API and +``YYLESSTHAN()`` in generic API. A non-zero return value means that ``YYFILL`` +has failed. A successful ``YYFILL`` call must supply at least one character and +adjust input positions accordingly. Limit must always be set to one after the +last input position in buffer, and the character at the limit position must be +the sentinel symbol specified by ``re2c:eof`` configuration. The pictures below +show the relative locations of input positions in buffer before and after +``YYFILL`` call (sentinel symbol is marked with ``#``, and the second picture +shows the case when there is not enough input to fill the whole buffer). + +.. code-block:: none + + <-- shift --> + >-A~~~~~~~~~~~~B~~~~~~~~~C~~~~~~~~~~~~~D#-----------E-> + buffer token marker limit, + cursor + >-A------------B~~~~~~~~~C~~~~~~~~~~~~~D~~~~~~~~~~~~E#-> + buffer, marker cursor limit + token + + <-- shift --> + >-A~~~~~~~~~~~~B~~~~~~~~~C~~~~~~~~~~~~~D#--E (EOF) + buffer token marker limit, + cursor + >-A------------B~~~~~~~~~C~~~~~~~~~~~~~D~~~E#........ + buffer, marker cursor limit + token + + +Here is an example of a program that reads input file ``input.txt`` in chunks of +4096 bytes and uses EOF rule. + +.. 
literalinclude:: /manual/features/fill/02_fill.re + :language: c + +YYFILL with padding +------------------- + +In the default case (when EOF rule is not used) ``YYFILL`` is a function-like +primitive that accepts a single argument and does not return any value. +``YYFILL`` invocation is triggered by condition ``(YYLIMIT - YYCURSOR) < n`` in +default API and ``YYLESSTHAN(n)`` in generic API. The argument passed to +``YYFILL`` is the minimal number of characters that must be supplied. If it +fails to do so, ``YYFILL`` must not return to the lexer (for that reason it is +best implemented as a macro that returns from the calling function on failure). +In case of a successful ``YYFILL`` invocation the limit position must be set +either to one after the last input position in buffer, or to the end of +``YYMAXFILL`` padding (in case ``YYFILL`` has successfully read at least ``n`` +characters, but not enough to fill the entire buffer). The pictures below show +the relative locations of input positions in buffer before and after ``YYFILL`` +invocation (``YYMAXFILL`` padding on the second picture is marked with ``#`` +symbols). + +.. code-block:: none + + <-- shift --> <-- need --> + >-A~~~~~~~~~~~~B~~~~~~~~~C~~~~~D~~~~~~~E---F--------G-> + buffer token marker cursor limit + + >-A------------B~~~~~~~~~C~~~~~D~~~~~~~E~~~F~~~~~~~~G-> + buffer, marker cursor limit + token + + <-- shift --> <-- need --> + >-A~~~~~~~~~~~~B~~~~~~~~~C~~~~~D~~~~~~~E-F (EOF) + buffer token marker cursor limit + + >-A------------B~~~~~~~~~C~~~~~D~~~~~~~E~F############### + buffer, marker cursor limit + token <- YYMAXFILL -> + +Here is an example of a program that reads input file ``input.txt`` in chunks of +4096 bytes and uses bounds-checking with padding. + +.. 
literalinclude:: /manual/features/fill/01_fill.re + :language: c + diff --git a/src/manual/features/fill/input.txt b/src/manual/features/fill/input.txt new file mode 100644 index 00000000..3e13503d --- /dev/null +++ b/src/manual/features/fill/input.txt @@ -0,0 +1 @@ +one two 'th\'ree' '123' '' \ No newline at end of file diff --git a/src/manual/features/submatch/mtags.re b/src/manual/features/submatch/mtags.re new file mode 100644 index 00000000..923d1e3d --- /dev/null +++ b/src/manual/features/submatch/mtags.re @@ -0,0 +1,59 @@ +#include +#include +#include + +static const int ROOT = -1; + +struct Mtag { + int pred; + const char *tag; +}; + +typedef std::vector MtagTree; +typedef std::vector Words; + +static void mtag(int *pt, const char *t, MtagTree *tree) +{ + Mtag m = {*pt, t}; + *pt = (int)tree->size(); + tree->push_back(m); +} + +static void unfold(const MtagTree &tree, int x, int y, Words &words) +{ + if (x == ROOT) return; + unfold(tree, tree[x].pred, tree[y].pred, words); + const char *px = tree[x].tag, *py = tree[y].tag; + words.push_back(std::string(px, py - px)); +} + +#define YYMTAGP(t) mtag(&t, YYCURSOR, &tree) +#define YYMTAGN(t) mtag(&t, NULL, &tree) +static bool lex(const char *YYCURSOR, Words &words) +{ + const char *YYMARKER; + /*!mtags:re2c format = "int @@ = ROOT;"; */ + MtagTree tree; + int x, y; + + /*!re2c + re2c:define:YYCTYPE = char; + re2c:yyfill:enable = 0; + re2c:flags:tags = 1; + + (#x [a-zA-Z0-9_]+ #y [;])+ { + words.clear(); + unfold(tree, x, y, words); + return true; + } + * { return false; } + + */ +} + +int main() +{ + Words w; + assert(lex("one;tw0;three;", w) && w == Words({"one", "tw0", "three"})); + return 0; +} diff --git a/src/manual/features/submatch/posix.re b/src/manual/features/submatch/posix.re new file mode 100644 index 00000000..c3aa045c --- /dev/null +++ b/src/manual/features/submatch/posix.re @@ -0,0 +1,45 @@ +#include +#include + +static uint32_t num(const char *s, const char *e) +{ + uint32_t n = 0; + for (; s < 
e; ++s) n = n * 10 + (*s - '0'); + return n; +} + +/*!maxnmatch:re2c*/ + +static uint32_t lex(const char *YYCURSOR) +{ + const char *YYMARKER; + const char *yypmatch[YYMAXNMATCH]; + uint32_t yynmatch; + /*!stags:re2c format = 'const char *@@;'; */ + + /*!re2c + re2c:define:YYCTYPE = char; + re2c:yyfill:enable = 0; + re2c:flags:posix-captures = 1; + + oct = [0-9]{1,3}; + dot = [.]; + + (oct) dot (oct) dot (oct) dot (oct) { + return num(yypmatch[8], yypmatch[9]) + + (num(yypmatch[6], yypmatch[7]) << 8) + + (num(yypmatch[4], yypmatch[5]) << 16) + + (num(yypmatch[2], yypmatch[3]) << 24); + } + * { return 0; } + + */ +} + +int main() +{ + assert(lex("1.2.3.4") == 0x01020304); + assert(lex("127.0.0.1") == 0x7f000001); + assert(lex("255.255.255.255") == 0xffffffff); + return 0; +} diff --git a/src/manual/features/submatch/stags.re b/src/manual/features/submatch/stags.re new file mode 100644 index 00000000..aa3b7bdf --- /dev/null +++ b/src/manual/features/submatch/stags.re @@ -0,0 +1,41 @@ +#include +#include + +static uint32_t num(const char *s, const char *e) +{ + uint32_t n = 0; + for (; s < e; ++s) n = n * 10 + (*s - '0'); + return n; +} + +static uint32_t lex(const char *YYCURSOR) +{ + const char *YYMARKER, *o1, *o2, *o3, *o4; + /*!stags:re2c format = 'const char *@@;'; */ + + /*!re2c + re2c:define:YYCTYPE = char; + re2c:yyfill:enable = 0; + re2c:flags:tags = 1; + + oct = [0-9]{1,3}; + dot = [.]; + + @o1 oct dot @o2 oct dot @o3 oct dot @o4 oct { + return num(o4, YYCURSOR) + + (num(o3, o4 - 1) << 8) + + (num(o2, o3 - 1) << 16) + + (num(o1, o2 - 1) << 24); + } + * { return 0; } + + */ +} + +int main() +{ + assert(lex("1.2.3.4") == 0x01020304); + assert(lex("127.0.0.1") == 0x7f000001); + assert(lex("255.255.255.255") == 0xffffffff); + return 0; +} diff --git a/src/manual/features/submatch/submatch.rst b/src/manual/features/submatch/submatch.rst index 052892d0..34024561 100644 --- a/src/manual/features/submatch/submatch.rst +++ b/src/manual/features/submatch/submatch.rst 
@@ -1,7 +1,87 @@ -Submatch --------- +Re2c has two options for submatch extraction. -.. toctree:: - :hidden: - -.. include:: submatch.rst_ +The first option is ``-T --tags``. With this option one can use standalone tags +of the form ``@stag`` and ``#mtag``, where ``stag`` and ``mtag`` are arbitrary +user-defined names. Tags can be used anywhere inside of a regular expression; +semantically they are just position markers. Tags of the form ``@stag`` are +called s-tags: they denote a single submatch value (the last input position +where this tag matched). Tags of the form ``#mtag`` are called m-tags: they +denote multiple submatch values (the whole history of repetitions of this tag). +All tags should be defined by the user as variables with the corresponding +names. With standalone tags re2c uses leftmost greedy disambiguation: submatch +positions correspond to the leftmost matching path through the regular +expression. + +The second option is ``-P --posix-captures``: it enables POSIX-compliant +capturing groups. In this mode parentheses in regular expressions denote the +beginning and the end of capturing groups; the whole regular expression is group +number zero. The number of groups for the matching rule is stored in a variable +``yynmatch``, and submatch results are stored in ``yypmatch`` array. Both +``yynmatch`` and ``yypmatch`` should be defined by the user, and ``yypmatch`` +size must be at least ``[yynmatch * 2]``. Re2c provides a directive +``/*!maxnmatch:re2c*/`` that defines ``YYMAXNMATCH``: a constant equal to the +maximal value of ``yynmatch`` among all rules. Note that re2c implements +POSIX-compliant disambiguation: each subexpression matches as long as possible, +and subexpressions that start earlier in regular expression have priority over +those starting later. Capturing groups are translated into s-tags under the +hood, therefore we use the word "tag" to describe them as well. 
+ +With both ``-P --posix-captures`` and ``-T --tags`` options re2c uses an efficient +submatch extraction algorithm described in the +`Tagged Deterministic Finite Automata with Lookahead <https://arxiv.org/abs/1907.08837>`_ +paper. The overhead on submatch extraction in the generated lexer grows with the +number of tags --- if this number is moderate, the overhead is barely +noticeable. In the lexer tags are implemented using a number of tag variables +generated by re2c. There is no one-to-one correspondence between tag variables +and tags: a single variable may be reused for different tags, and one tag may +require multiple variables to hold all its ambiguous values. Eventually +ambiguity is resolved, and only one final variable per tag survives. When a rule +matches, all its tags are set to the values of the corresponding tag variables. +The exact number of tag variables is unknown to the user; this number is +determined by re2c. However, tag variables should be defined by the user as a +part of the lexer state and updated by ``YYFILL``, therefore re2c provides +directives ``/*!stags:re2c*/`` and ``/*!mtags:re2c*/`` that can be used to +declare, initialize and manipulate tag variables. These directives have two +optional configurations: ``format = "@@";`` (specifies the template where ``@@`` +is substituted with the name of each tag variable), and ``separator = "";`` +(specifies the piece of code used to join the generated pieces for different +tag variables). 
+ +S-tags support the following operations: + +* save input position to an s-tag: ``t = YYCURSOR`` with default API or a + user-defined operation ``YYSTAGP(t)`` with generic API +* save default value to an s-tag: ``t = NULL`` with default API or a + user-defined operation ``YYSTAGN(t)`` with generic API +* copy one s-tag to another: ``t1 = t2`` + +M-tags support the following operations: + +* append input position to an m-tag: a user-defined operation ``YYMTAGP(t)`` + with both default and generic API +* append default value to an m-tag: a user-defined operation ``YYMTAGN(t)`` + with both default and generic API +* copy one m-tag to another: ``t1 = t2`` + +S-tags can be implemented as scalar values (pointers or offsets). M-tags need a +more complex representation, as they need to store a sequence of tag values. The +most naive and inefficient representation of an m-tag is a list (array, vector) +of tag values; a more efficient representation is to store all m-tags in a +prefix-tree represented as array of nodes ``(v, p)``, where ``v`` is tag value +and ``p`` is a pointer to parent node. + +Here is an example of using s-tags to parse an IPv4 address. + +.. literalinclude:: /manual/features/submatch/stags.re + :language: c + +Here is an example of using POSIX capturing groups to parse an IPv4 address. + +.. literalinclude:: /manual/features/submatch/posix.re + :language: c + +Here is an example of using m-tags to parse a semicolon-separated sequence of +words (C++). Tag variables are stored in a tree that is packed in a vector. + +.. 
literalinclude:: /manual/features/submatch/mtags.re + :language: c diff --git a/src/manual/warnings/condition_order/fixorder.re.txt b/src/manual/warnings/condition_order/fixorder.re.txt new file mode 100644 index 00000000..40a3c734 --- /dev/null +++ b/src/manual/warnings/condition_order/fixorder.re.txt @@ -0,0 +1,34 @@ +#include + +#ifdef REVERSED_CONDITION_ORDER +# define yyca 1 +# define yycb 0 +#else +# define yyca 0 +# define yycb 1 +#endif + +int main() +{ + const char * YYCURSOR = "aaaa,bbb!"; + int c = yyca; + for (;;) { + /*!re2c + re2c:define:YYCTYPE = char; + re2c:yyfill:enable = 0; + re2c:define:YYSETCONDITION = "c = @@;"; + re2c:define:YYSETCONDITION:naked = 1; + re2c:define:YYGETCONDITION = c; + re2c:define:YYGETCONDITION:naked = 1; + + <*> * { printf ("error\n"); break; } + + "a" { printf ("a"); continue; } + "," => b { printf (","); continue; } + + "!" { printf ("!\n"); break; } + "b" { printf ("b"); continue; } + */ + } + return 0; +} diff --git a/src/manual/warnings/match_empty_string/hang.re b/src/manual/warnings/match_empty_string/hang.re new file mode 100644 index 00000000..f4926d90 --- /dev/null +++ b/src/manual/warnings/match_empty_string/hang.re @@ -0,0 +1,17 @@ +#include + +int main(int argc, char **argv) +{ + for (int i = 1; i < argc; ++i) { + for (char *YYCURSOR = argv[i];;) { + /*!re2c + re2c:define:YYCTYPE = char; + re2c:yyfill:enable = 0; + "\x00" { break; } + [a-z]* { continue; } + */ + } + printf("argv[%d]: %s\n", i, argv[i]); + } + return 0; +}