Added paper "Efficient POSIX Submatch Extraction on NFA".

author Ulya Trofimovich <skvadrik@gmail.com>

Fri, 26 Jul 2019 09:32:50 +0000 (10:32 +0100)

committer Ulya Trofimovich <skvadrik@gmail.com>

Fri, 26 Jul 2019 09:32:50 +0000 (10:32 +0100)
author Ulya Trofimovich <skvadrik@gmail.com>
Fri, 26 Jul 2019 09:32:50 +0000 (10:32 +0100)
committer Ulya Trofimovich <skvadrik@gmail.com>
Fri, 26 Jul 2019 09:32:50 +0000 (10:32 +0100)
diff --git a/src/about/2019_borsotti_trofimovich_efficient_posix_submatch_extraction_on_nfa.pdf b/src/about/2019_borsotti_trofimovich_efficient_posix_submatch_extraction_on_nfa.pdf

new file mode 100644 (file)

index 0000000..6da28a4

Binary files /dev/null and b/src/about/2019_borsotti_trofimovich_efficient_posix_submatch_extraction_on_nfa.pdf differ
diff --git a/src/manual/features/dot/utf8_any.re b/src/manual/features/dot/utf8_any.re

new file mode 100644 (file)

index 0000000..96668a8
--- /dev/null
+++ b/src/manual/features/dot/utf8_any.re
@@ -0,0 +1,4 @@
+/*!re2c
+    *   {}
+    [^] {}
+*/
diff --git a/src/manual/features/eof/01_sentinel.re b/src/manual/features/eof/01_sentinel.re

new file mode 100644 (file)

index 0000000..69accb2
--- /dev/null
+++ b/src/manual/features/eof/01_sentinel.re
@@ -0,0 +1,25 @@
+#include <assert.h>
+
+static int lex(const char *YYCURSOR)
+{
+    int count = 0;
+loop:
+    /*!re2c
+    re2c:define:YYCTYPE = char;
+    re2c:yyfill:enable = 0;
+
+    *      { return -1; }
+    [\x00] { return count; }
+    [a-z]+ { ++count; goto loop; }
+    [ ]+   { goto loop; }
+
+    */
+}
+
+int main()
+{
+    assert(lex("") == 0);
+    assert(lex("one two three") == 3);
+    assert(lex("one two 123?") == -1);
+    return 0;
+}
diff --git a/src/manual/features/eof/02_bounds_checking.re b/src/manual/features/eof/02_bounds_checking.re

new file mode 100644 (file)

index 0000000..d155f7f
--- /dev/null
+++ b/src/manual/features/eof/02_bounds_checking.re
@@ -0,0 +1,46 @@
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*!max:re2c*/
+
+static int lex(const char *str)
+{
+    const size_t len = strlen(str);
+    char *buf = malloc(len + YYMAXFILL);
+    memcpy(buf, str, len);
+    memset(buf + len, 0, YYMAXFILL);
+
+    const char *YYCURSOR = buf;
+    const char *YYLIMIT = buf + len + YYMAXFILL;
+    int count = 0;
+
+loop:
+    /*!re2c
+    re2c:define:YYCTYPE = char;
+    re2c:define:YYFILL:naked = 1;
+    re2c:define:YYFILL = "goto error;";
+
+    *                         { goto error; }
+    [\x00]                    { if (YYCURSOR == YYLIMIT) goto end; else goto error; }
+    [a-z]+                    { ++count; goto loop; }
+    ['] ([^'] | [\\]['])* ['] { ++count; goto loop; }
+    [ ]+                      { goto loop; }
+
+    */
+error:
+    count = -1;
+end:
+    free(buf);
+    return count;
+}
+
+int main()
+{
+    assert(lex("") == 0);
+    assert(lex("one two three") == 3);
+    assert(lex("one two 123?") == -1);
+    assert(lex("one 'two' 'th\\'ree' '123?' ''") == 5);
+    assert(lex("one 'two' 'three") == -1);
+    return 0;
+}
diff --git a/src/manual/features/eof/03_eof_rule.re b/src/manual/features/eof/03_eof_rule.re

new file mode 100644 (file)

index 0000000..aed0e21
--- /dev/null
+++ b/src/manual/features/eof/03_eof_rule.re
@@ -0,0 +1,33 @@
+#include <assert.h>
+#include <string.h>
+
+static int lex(const char *str)
+{
+    const char *YYCURSOR = str;
+    const char *YYLIMIT = str + strlen(str);
+    int count = 0;
+
+loop:
+    /*!re2c
+    re2c:define:YYCTYPE = char;
+    re2c:yyfill:enable = 0;
+    re2c:eof = 0;
+
+    *                         { return -1; }
+    $                         { return count; }
+    [a-z]+                    { ++count; goto loop; }
+    ['] ([^'] | [\\]['])* ['] { ++count; goto loop; }
+    [ ]+                      { goto loop; }
+
+    */
+}
+
+int main()
+{
+    assert(lex("") == 0);
+    assert(lex("one two three") == 3);
+    assert(lex("one two 123?") == -1);
+    assert(lex("one 'two' 'th\\'ree' '123?' ''") == 5);
+    assert(lex("one 'two' 'three") == -1);
+    return 0;
+}
diff --git a/src/manual/features/eof/04_generic_api.re b/src/manual/features/eof/04_generic_api.re

new file mode 100644 (file)

index 0000000..7afc673
--- /dev/null
+++ b/src/manual/features/eof/04_generic_api.re
@@ -0,0 +1,35 @@
+#include <assert.h>
+#include <string.h>
+
+#define YYPEEK() *cur
+#define YYSKIP() if (++cur > lim) return -1
+static int lex(const char *str)
+{
+    const char *cur = str;
+    const char *lim = str + strlen(str) + 1;
+    int count = 0;
+
+loop:
+    /*!re2c
+    re2c:define:YYCTYPE = char;
+    re2c:yyfill:enable = 0;
+    re2c:flags:input = custom;
+
+    *                         { return -1; }
+    [\x00]                    { return cur == lim ? count : -1; }
+    [a-z]+                    { ++count; goto loop; }
+    ['] ([^'] | [\\]['])* ['] { ++count; goto loop; }
+    [ ]+                      { goto loop; }
+
+    */
+}
+
+int main()
+{
+    assert(lex("") == 0);
+    assert(lex("one two three") == 3);
+    assert(lex("one two 123?") == -1);
+    assert(lex("one 'two' 'th\\'ree' '123?' ''") == 5);
+    assert(lex("one 'two' 'three") == -1);
+    return 0;
+}
diff --git a/src/manual/features/eof/eof.rst b/src/manual/features/eof/eof.rst

new file mode 100644 (file)

index 0000000..f1fdebc
--- /dev/null
+++ b/src/manual/features/eof/eof.rst
@@ -0,0 +1,131 @@
+Re2c provides a number of ways to handle the end-of-input situation. Which way to
+use depends on the complexity of regular expressions, performance
+considerations, the need for input buffering and various other factors. EOF
+handling is probably the most complex part of re2c user interface --- it
+definitely requires a bit of understanding of how the generated lexer works.
+But in return it allows the user to customize lexer for a particular environment
+and avoid the unnecessary overhead of generic methods when a simpler method is
+sufficient. Roughly speaking, there are four main methods:
+
+- using sentinel symbol (simple and efficient, but limited)
+- bounds checking with padding (generic, but complex)
+- EOF rule: a combination of sentinel symbol and bounds checking (generic and
+  simple, can be more or less efficient than bounds checking with padding
+  depending on the grammar)
+- using generic API (user-defined, so may be incorrect ;])
+
+Using sentinel symbol
+---------------------
+This is the simplest and the most efficient method. It is applicable in cases
+when the input is small enough to fit into a continuous memory buffer and there
+is a natural "sentinel" symbol --- a code unit that is not allowed by any of the
+regular expressions in grammar (except possibly as a terminating character).
+Sentinel symbol never appears in well-formed input, therefore it can be appended
+at the end of input and used as a stop signal by the lexer. A good example of
+such input is a null-terminated C-string, provided that the grammar does not
+allow ``NUL`` in the middle of lexemes. Sentinel method is very efficient,
+because the lexer does not need to perform any additional checks for the end of
+input --- it comes naturally as a part of processing the next character.
+
+Below is an example of using sentinel method. Configuration
+``re2c:yyfill:enable = 0;`` suppresses generation of end-of-input checks and
+``YYFILL`` calls.
+
+.. literalinclude:: /manual/features/eof/01_sentinel.re
+    :language: c
+
+Bounds checking with padding
+----------------------------
+
+Bounds checking is a generic method: it can be used with any input grammar.
+The basic idea is simple: we need to check for the end of input before reading
+the next input character. However, if implemented in a straightforward way, this
+would be quite inefficient: checking on each input character would cause a major
+slowdown. Re2c avoids slowdown by generating checks only in certain key states
+of the lexer, and letting it run without checks in-between the key states.
+More precisely, re2c computes strongly connected components (SCCs) of
+the underlying DFA (which roughly correspond to loops), and generates only a few
+checks per each SCC (usually just one, but in general enough to make the SCC
+acyclic). The check is of the form ``(YYLIMIT - YYCURSOR) < n``, where ``n``
+is the maximal length of a simple path in the corresponding SCC. If this
+condition is true, the lexer calls ``YYFILL(n)``, which must either supply at
+least ``n`` input characters, or not return. When the lexer continues after
+the check, it is certain that the next ``n`` characters can be read safely
+without checks.
+
+This approach reduces the number of checks significantly (and makes the lexer
+much faster as a result), but it has a downside. Since the lexer checks for
+multiple characters at once, it may end up in a situation when there are a few
+remaining input characters (less than ``n``) corresponding to a short path in
+the SCC, but the lexer cannot proceed because of the check, and ``YYFILL``
+cannot supply more characters because it is the end of input. To solve this
+problem, re2c requires that additional padding consisting of fake characters is
+appended at the end of input. The length of padding should be ``YYMAXFILL``,
+which is equal to the maximum ``n`` parameter to ``YYFILL`` and must be generated
+by re2c using ``/*!max:re2c*/`` directive. The fake characters should not form a
+valid lexeme suffix, otherwise the lexer may be fooled into matching a fake
+lexeme. Usually it's a good idea to use ``NUL`` characters for padding.
+
+Below is an example of using bounds checking with padding. Note that the grammar
+rule for single-quoted strings allows arbitrary symbols in the middle of lexeme,
+so there is no natural sentinel in the grammar. Strings like ``"aha\0ha"`` are
+perfectly valid, but ill-formed strings like ``"aha\0`` are also possible and
+shouldn’t crash the lexer. In this example we do not use buffer refilling,
+therefore ``YYFILL`` definition simply returns an error. Note that ``YYFILL``
+will only be called after the lexer reaches padding, because only then will the
+check condition be satisfied.
+
+.. literalinclude:: /manual/features/eof/02_bounds_checking.re
+    :language: c
+
+EOF rule
+--------
+
+EOF rule ``$`` was introduced in version 1.2. It is a hybrid approach that tries
+to take the best of both worlds: simplicity and efficiency of the sentinel
+method combined with the generality of bounds-checking method. The idea is to
+appoint an arbitrary symbol to be the sentinel, and only perform further bounds
+checking if the sentinel symbol matches (more precisely, if the symbol class that
+contains it matches). The check is of the form ``YYLIMIT <= YYCURSOR``.
+If this condition is not satisfied, then the sentinel is just an ordinary input
+character and the lexer continues. Otherwise this is a real sentinel, and the
+lexer calls ``YYFILL()``. If ``YYFILL`` returns zero, the lexer assumes that it
+has more input and tries to re-match. Otherwise ``YYFILL`` returns non-zero and
+the lexer knows that it has reached the end of input. At this point there are
+three possibilities. First, it might have already matched a shorter lexeme ---
+in this case it just rolls back to the last accepting state. Second, it might
+have consumed some characters, but failed to match --- in this case it falls
+back to default rule ``*``. Finally, it might be in the initial state --- in
+this (and only this!) case it matches EOF rule ``$``.
+
+Below is an example of using EOF rule. Configuration ``re2c:yyfill:enable = 0;``
+suppresses generation of ``YYFILL`` calls (but not the bounds checks).
+
+.. literalinclude:: /manual/features/eof/03_eof_rule.re
+    :language: c
+
+Using generic API
+-----------------
+
+Generic API can be used with any of the above methods. It also allows one to use a
+user-defined method by placing EOF checks in one of the basic primitives.
+Usually this is either ``YYSKIP`` (the check is performed when advancing to the
+next input character), or ``YYPEEK`` (the check is performed when reading the
+next input character). The resulting methods are inefficient, as they check on
+each input character. However, they can be useful in cases when the input cannot
+be buffered or padded and does not contain a sentinel character at the end. One
+should be cautious when using such ad-hoc methods, as it is easy to overlook
+some corner cases and come up with a method that only partially works. Also it
+should be noted that not everything can be expressed via generic API: for
+example, it is impossible to reimplement the way EOF rule works (in particular,
+it is impossible to re-match the character after successful ``YYFILL``).
+
+Below is an example of using ``YYSKIP`` to perform bounds checking without
+padding. ``YYFILL`` generation is suppressed using ``re2c:yyfill:enable = 0;``
+configuration. Note that if the grammar was more complex, this method might not
+work in case when two rules overlap and EOF check fails after a shorter lexeme
+has already been matched (as it happens, in our example there are no overlapping
+rules).
+
+.. literalinclude:: /manual/features/eof/04_generic_api.re
+    :language: c
diff --git a/src/manual/features/fill/01_fill.re b/src/manual/features/fill/01_fill.re

new file mode 100644 (file)

index 0000000..9396678
--- /dev/null
+++ b/src/manual/features/fill/01_fill.re
@@ -0,0 +1,74 @@
+#include <stdio.h>
+#include <string.h>
+
+/*!max:re2c*/
+#define SIZE 4096
+
+typedef struct {
+    FILE *file;
+    char buf[SIZE + YYMAXFILL], *lim, *cur, *tok;
+    int eof;
+} Input;
+
+static int fill(Input *in, size_t need) /* refill buffer from in->file; 0 on success, nonzero on failure */
+{
+    if (in->eof) {
+        return 1; /* EOF was already reached by an earlier call */
+    }
+    const size_t free = in->tok - in->buf; /* bytes before the token start */
+    if (free < need) {
+        return 2; /* shifting to the token start frees fewer than `need` bytes */
+    }
+    memmove(in->buf, in->tok, in->lim - in->tok); /* move [tok, lim) to the buffer start */
+    in->lim -= free; /* rebase all positions by the shift */
+    in->cur -= free;
+    in->tok -= free;
+    in->lim += fread(in->lim, 1, free, in->file); /* read up to `free` new bytes */
+    if (in->lim < in->buf + SIZE) { /* buffer not filled up to SIZE: treat as end of input */
+        in->eof = 1;
+        memset(in->lim, 0, YYMAXFILL); /* zero padding of YYMAXFILL bytes */
+        in->lim += YYMAXFILL; /* limit now points past the padding */
+    }
+    return 0;
+}
+
+static void init(Input *in, FILE *file) /* bind `in` to `file` and perform the first read */
+{
+    in->file = file;
+    in->cur = in->tok = in->lim = in->buf + SIZE; /* empty buffer: fill() sees SIZE bytes as free */
+    in->eof = 0;
+    fill(in, 1); /* return value is ignored */
+}
+
+#define YYFILL(n) if (fill(in, n) != 0) return -1
+static int lex(Input *in)
+{
+    int count = 0;
+loop:
+    in->tok = in->cur;
+    /*!re2c
+    re2c:define:YYCTYPE = char;
+    re2c:define:YYCURSOR = in->cur;
+    re2c:define:YYLIMIT = in->lim;
+
+    *                         { return -1; }
+    [\x00]                    { return (YYMAXFILL == in->lim - in->tok) ? count : -1; }
+    [a-z]+                    { ++count; goto loop; }
+    ['] ([^'] | [\\]['])* ['] { ++count; goto loop; }
+    [ ]+                      { goto loop; }
+
+    */
+}
+
+int main()
+{
+    FILE *f = fopen("input.txt", "rb");
+    if (!f) return 1;
+
+    Input in;
+    init(&in, f);
+    printf("count: %d\n", lex(&in));
+
+    fclose(f);
+    return 0;
+}
diff --git a/src/manual/features/fill/02_fill.re b/src/manual/features/fill/02_fill.re

new file mode 100644 (file)

index 0000000..d6dcafc
--- /dev/null
+++ b/src/manual/features/fill/02_fill.re
@@ -0,0 +1,71 @@
+#include <stdio.h>
+#include <string.h>
+
+#define SIZE 4096
+
+typedef struct {
+    FILE *file;
+    char buf[SIZE + 1], *lim, *cur, *tok;
+    int eof;
+} Input;
+
+static int fill(Input *in) /* refill buffer from in->file; 0 on success, nonzero on failure */
+{
+    if (in->eof) {
+        return 1; /* an earlier read already came up short */
+    }
+    const size_t free = in->tok - in->buf; /* bytes before the token start */
+    if (free < 1) {
+        return 2; /* token starts at the buffer start: nothing can be discarded */
+    }
+    memmove(in->buf, in->tok, in->lim - in->tok); /* move [tok, lim) to the buffer start */
+    in->lim -= free; /* rebase all positions by the shift */
+    in->cur -= free;
+    in->tok -= free;
+    in->lim += fread(in->lim, 1, free, in->file); /* read up to `free` new bytes */
+    in->lim[0] = 0; /* zero byte at the limit position (buf has SIZE + 1 bytes) */
+    in->eof |= in->lim < in->buf + SIZE; /* buffer not filled up to SIZE: remember EOF */
+    return 0;
+}
+
+static void init(Input *in, FILE *file) /* bind `in` to `file` and perform the first read */
+{
+    in->file = file;
+    in->cur = in->tok = in->lim = in->buf + SIZE; /* empty buffer: fill() sees SIZE bytes as free */
+    in->eof = 0;
+    fill(in); /* return value is ignored */
+}
+
+#define YYFILL() fill(in)
+static int lex(Input *in)
+{
+    int count = 0;
+loop:
+    in->tok = in->cur;
+    /*!re2c
+    re2c:define:YYCTYPE = char;
+    re2c:define:YYCURSOR = in->cur;
+    re2c:define:YYLIMIT = in->lim;
+    re2c:eof = 0;
+
+    *                         { return -1; }
+    $                         { return count; }
+    [a-z]+                    { ++count; goto loop; }
+    ['] ([^'] | [\\]['])* ['] { ++count; goto loop; }
+    [ ]+                      { goto loop; }
+
+    */
+}
+
+int main()
+{
+    FILE *f = fopen("input.txt", "rb");
+    if (!f) return 1;
+
+    Input in;
+    init(&in, f);
+    printf("count: %d\n", lex(&in));
+
+    fclose(f);
+    return 0;
+}
diff --git a/src/manual/features/fill/fill.rst b/src/manual/features/fill/fill.rst

new file mode 100644 (file)

index 0000000..e06ceb8
--- /dev/null
+++ b/src/manual/features/fill/fill.rst
@@ -0,0 +1,123 @@
+The need for buffering arises when the input cannot be mapped in memory all at
+once: either it is too large, or it comes in a streaming fashion (like reading
+from a socket). The usual technique in such cases is to allocate a fixed-sized
+memory buffer and process input in chunks that fit into the buffer. When the
+current chunk is processed, it is moved out and new data is moved in. In
+practice it is somewhat more complex, because lexer state consists not of a
+single input position, but a set of interrelated positions:
+
+- cursor: the next input character to be read (``YYCURSOR`` in default API or
+  ``YYSKIP``/``YYPEEK`` in generic API)
+
+- limit: the position after the last available input character (``YYLIMIT`` in
+  default API, implicitly handled by ``YYLESSTHAN`` in generic API)
+
+- marker: the position of the most recent match, if any (``YYMARKER`` in default
+  API or ``YYBACKUP``/``YYRESTORE`` in generic API)
+
+- token: the start of the current lexeme (implicit in re2c API, as it is not
+  needed for the normal lexer operation and can be defined and updated by the
+  user)
+
+- context marker: the position of the trailing context (``YYCTXMARKER`` in
+  default API or ``YYBACKUPCTX``/``YYRESTORECTX`` in generic API)
+
+- tag variables: submatch positions (defined with ``/*!stags:re2c*/`` and
+  ``/*!mtags:re2c*/`` directives and
+  ``YYSTAGP``/``YYSTAGN``/``YYMTAGP``/``YYMTAGN`` in generic API)
+
+Not all these are used in every case, but if used, they must be updated by
+``YYFILL``. All active positions are contained in the segment between token and
+cursor, therefore everything between buffer start and token can be discarded,
+the segment from token and up to limit should be moved to the beginning of
+buffer, and the free space at the end of buffer should be filled with new data.
+In order to avoid frequent ``YYFILL`` calls it is best to fill in as many input
+characters as possible (even though fewer characters might suffice to resume the
+lexer). The details of ``YYFILL`` implementation are slightly different
+depending on which EOF handling method is used: the case of EOF rule is somewhat
+simpler than the case of bounds-checking with padding. Also note that if
+``-f --storable-state`` option is used, ``YYFILL`` has slightly different
+semantics (described in the section about storable state).
+
+YYFILL with EOF rule
+--------------------
+
+If EOF rule is used, ``YYFILL`` is a function-like primitive that accepts
+no arguments and returns a value which is checked against zero. ``YYFILL``
+invocation is triggered by condition ``YYLIMIT <= YYCURSOR`` in default API and
+``YYLESSTHAN()`` in generic API. A non-zero return value means that ``YYFILL``
+has failed. A successful ``YYFILL`` call must supply at least one character and
+adjust input positions accordingly. Limit must always be set to one after the
+last input position in buffer, and the character at the limit position must be
+the sentinel symbol specified by ``re2c:eof`` configuration. The pictures below
+show the relative locations of input positions in buffer before and after
+``YYFILL`` call (sentinel symbol is marked with ``#``, and the second picture
+shows the case when there is not enough input to fill the whole buffer).
+
+.. code-block:: none
+
+                   <-- shift -->
+                 >-A~~~~~~~~~~~~B~~~~~~~~~C~~~~~~~~~~~~~D#-----------E->
+                 buffer       token    marker         limit,
+                                                      cursor
+    >-A------------B~~~~~~~~~C~~~~~~~~~~~~~D~~~~~~~~~~~~E#->
+                 buffer,  marker        cursor        limit
+                 token
+
+                   <-- shift -->
+                 >-A~~~~~~~~~~~~B~~~~~~~~~C~~~~~~~~~~~~~D#--E (EOF)
+                 buffer       token    marker         limit,
+                                                      cursor
+    >-A------------B~~~~~~~~~C~~~~~~~~~~~~~D~~~E#........
+                 buffer,  marker       cursor limit
+                 token
+
+
+Here is an example of a program that reads input file ``input.txt`` in chunks of
+4096 bytes and uses EOF rule.
+
+.. literalinclude:: /manual/features/fill/02_fill.re
+    :language: c
+
+YYFILL with padding
+-------------------
+
+In the default case (when EOF rule is not used) ``YYFILL`` is a function-like
+primitive that accepts a single argument and does not return any value.
+``YYFILL`` invocation is triggered by condition ``(YYLIMIT - YYCURSOR) < n`` in
+default API and ``YYLESSTHAN(n)`` in generic API. The argument passed to
+``YYFILL`` is the minimal number of characters that must be supplied. If it
+fails to do so, ``YYFILL`` must not return to the lexer (for that reason it is
+best implemented as a macro that returns from the calling function on failure).
+In case of a successful ``YYFILL`` invocation the limit position must be set
+either to one after the last input position in buffer, or to the end of
+``YYMAXFILL`` padding (in case ``YYFILL`` has successfully read at least ``n``
+characters, but not enough to fill the entire buffer). The pictures below show
+the relative locations of input positions in buffer before and after ``YYFILL``
+invocation (``YYMAXFILL`` padding on the second picture is marked with ``#``
+symbols).
+
+.. code-block:: none
+
+                   <-- shift -->                 <-- need -->
+                 >-A~~~~~~~~~~~~B~~~~~~~~~C~~~~~D~~~~~~~E---F--------G->
+                 buffer       token    marker cursor  limit
+                                                         
+    >-A------------B~~~~~~~~~C~~~~~D~~~~~~~E~~~F~~~~~~~~G->
+                 buffer,  marker cursor               limit
+                 token
+
+                   <-- shift -->                 <-- need -->
+                 >-A~~~~~~~~~~~~B~~~~~~~~~C~~~~~D~~~~~~~E-F        (EOF)
+                 buffer       token    marker cursor  limit
+                                                         
+    >-A------------B~~~~~~~~~C~~~~~D~~~~~~~E~F###############
+                 buffer,  marker cursor                   limit
+                 token                        <- YYMAXFILL ->
+
+Here is an example of a program that reads input file ``input.txt`` in chunks of
+4096 bytes and uses bounds-checking with padding.
+
+.. literalinclude:: /manual/features/fill/01_fill.re
+    :language: c
+
diff --git a/src/manual/features/fill/input.txt b/src/manual/features/fill/input.txt

new file mode 100644 (file)

index 0000000..3e13503
--- /dev/null
+++ b/src/manual/features/fill/input.txt
@@ -0,0 +1 @@
+one two 'th\'ree' '123' ''
+\ No newline at end of file
diff --git a/src/manual/features/submatch/mtags.re b/src/manual/features/submatch/mtags.re

new file mode 100644 (file)

index 0000000..923d1e3
--- /dev/null
+++ b/src/manual/features/submatch/mtags.re
@@ -0,0 +1,59 @@
+#include <assert.h>
+#include <vector>
+#include <string>
+
+static const int ROOT = -1;
+
+struct Mtag {
+    int pred;
+    const char *tag;
+};
+
+typedef std::vector<Mtag> MtagTree;
+typedef std::vector<std::string> Words;
+
+static void mtag(int *pt, const char *t, MtagTree *tree) // append node (old *pt, t) to tree; *pt becomes its index
+{
+    Mtag m = {*pt, t}; // predecessor is the node *pt referred to so far
+    *pt = (int)tree->size(); // index the new node will get
+    tree->push_back(m);
+}
+
+static void unfold(const MtagTree &tree, int x, int y, Words &words) // follow chains x and y back to ROOT, appending one word per node pair
+{
+    if (x == ROOT) return; // end of chain x
+    unfold(tree, tree[x].pred, tree[y].pred, words); // predecessors first, so their words are appended earlier
+    const char *px = tree[x].tag, *py = tree[y].tag;
+    words.push_back(std::string(px, py - px)); // characters from px up to (not including) py
+}
+
+#define YYMTAGP(t) mtag(&t, YYCURSOR, &tree)
+#define YYMTAGN(t) mtag(&t, NULL,     &tree)
+static bool lex(const char *YYCURSOR, Words &words)
+{
+    const char *YYMARKER;
+    /*!mtags:re2c format = "int @@ = ROOT;"; */
+    MtagTree tree;
+    int x, y;
+
+    /*!re2c
+    re2c:define:YYCTYPE = char;
+    re2c:yyfill:enable = 0;
+    re2c:flags:tags = 1;
+
+    (#x [a-zA-Z0-9_]+ #y [;])+ {
+        words.clear();
+        unfold(tree, x, y, words);
+        return true;
+    }
+    * { return false; }
+
+    */
+}
+
+int main()
+{
+    Words w;
+    assert(lex("one;tw0;three;", w) && w == Words({"one", "tw0", "three"}));
+    return 0;
+}
diff --git a/src/manual/features/submatch/posix.re b/src/manual/features/submatch/posix.re

new file mode 100644 (file)

index 0000000..c3aa045
--- /dev/null
+++ b/src/manual/features/submatch/posix.re
@@ -0,0 +1,45 @@
+#include <assert.h>
+#include <stdint.h>
+
+static uint32_t num(const char *s, const char *e) /* decimal value of the characters in [s, e) */
+{
+    uint32_t n = 0;
+    for (; s < e; ++s) n = n * 10 + (*s - '0'); /* no check that *s is a digit */
+    return n;
+}
+
+/*!maxnmatch:re2c*/
+
+static uint32_t lex(const char *YYCURSOR) /* pack four dotted decimal fields into 32 bits; 0 if the input does not match */
+{
+    const char *YYMARKER;
+    const char *yypmatch[YYMAXNMATCH * 2]; /* two positions (start, end) per group; indices up to 9 are read below */
+    uint32_t yynmatch;
+    /*!stags:re2c format = 'const char *@@;'; */
+
+    /*!re2c
+    re2c:define:YYCTYPE = char;
+    re2c:yyfill:enable = 0;
+    re2c:flags:posix-captures = 1;
+
+    oct = [0-9]{1,3};
+    dot = [.];
+
+    (oct) dot (oct) dot (oct) dot (oct) {
+        return num(yypmatch[8], yypmatch[9])
+            + (num(yypmatch[6], yypmatch[7]) << 8)
+            + (num(yypmatch[4], yypmatch[5]) << 16)
+            + (num(yypmatch[2], yypmatch[3]) << 24);
+    }
+    * { return 0; }
+
+    */
+}
+
+int main()
+{
+    assert(lex("1.2.3.4") == 0x01020304);
+    assert(lex("127.0.0.1") == 0x7f000001);
+    assert(lex("255.255.255.255") == 0xffffffff);
+    return 0;
+}
diff --git a/src/manual/features/submatch/stags.re b/src/manual/features/submatch/stags.re

new file mode 100644 (file)

index 0000000..aa3b7bd
--- /dev/null
+++ b/src/manual/features/submatch/stags.re
@@ -0,0 +1,41 @@
+#include <assert.h>
+#include <stdint.h>
+
+static uint32_t num(const char *s, const char *e) /* decimal value of the characters in [s, e) */
+{
+    uint32_t n = 0;
+    for (; s < e; ++s) n = n * 10 + (*s - '0'); /* no check that *s is a digit */
+    return n;
+}
+
+static uint32_t lex(const char *YYCURSOR)
+{
+    const char *YYMARKER, *o1, *o2, *o3, *o4;
+    /*!stags:re2c format = 'const char *@@;'; */
+
+    /*!re2c
+    re2c:define:YYCTYPE = char;
+    re2c:yyfill:enable = 0;
+    re2c:flags:tags = 1;
+
+    oct = [0-9]{1,3};
+    dot = [.];
+
+    @o1 oct dot @o2 oct dot @o3 oct dot @o4 oct {
+        return num(o4, YYCURSOR)
+            + (num(o3, o4 - 1) << 8)
+            + (num(o2, o3 - 1) << 16)
+            + (num(o1, o2 - 1) << 24);
+    }
+    * { return 0; }
+
+    */
+}
+
+int main()
+{
+    assert(lex("1.2.3.4") == 0x01020304);
+    assert(lex("127.0.0.1") == 0x7f000001);
+    assert(lex("255.255.255.255") == 0xffffffff);
+    return 0;
+}
diff --git a/src/manual/features/submatch/submatch.rst b/src/manual/features/submatch/submatch.rst

index 052892d0cbdbc2c489ee86b7dd3655f207017a33..34024561f57a90f2d4c61050316d0dd6c919bfbe 100644 (file)
--- a/src/manual/features/submatch/submatch.rst
+++ b/src/manual/features/submatch/submatch.rst
@@ -1,7 +1,87 @@
-Submatch
---------
+Re2c has two options for submatch extraction.
  
-.. toctree::
-    :hidden:
-    
-.. include:: submatch.rst_
+The first option is ``-T --tags``. With this option one can use standalone tags
+of the form ``@stag`` and ``#mtag``, where ``stag`` and ``mtag`` are arbitrary
+user-defined names. Tags can be used anywhere inside of a regular expression;
+semantically they are just position markers. Tags of the form ``@stag`` are
+called s-tags: they denote a single submatch value (the last input position
+where this tag matched). Tags of the form ``#mtag`` are called m-tags: they
+denote multiple submatch values (the whole history of repetitions of this tag).
+All tags should be defined by the user as variables with the corresponding
+names. With standalone tags re2c uses leftmost greedy disambiguation: submatch
+positions correspond to the leftmost matching path through the regular
+expression.
+
+The second option is ``-P --posix-captures``: it enables POSIX-compliant
+capturing groups. In this mode parentheses in regular expressions denote the
+beginning and the end of capturing groups; the whole regular expression is group
+number zero. The number of groups for the matching rule is stored in a variable
+``yynmatch``, and submatch results are stored in ``yypmatch`` array. Both
+``yynmatch`` and ``yypmatch`` should be defined by the user, and ``yypmatch``
+size must be at least ``[yynmatch * 2]``. Re2c provides a directive
+``/*!maxnmatch:re2c*/`` that defines ``YYMAXNMATCH``: a constant equal to the
+maximal value of ``yynmatch`` among all rules. Note that re2c implements
+POSIX-compliant disambiguation: each subexpression matches as long as possible,
+and subexpressions that start earlier in regular expression have priority over
+those starting later. Capturing groups are translated into s-tags under the
+hood, therefore we use the word "tag" to describe them as well.
+
+With both ``-P --posix-captures`` and ``-T --tags`` options re2c uses efficient
+submatch extraction algorithm described in the
+`Tagged Deterministic Finite Automata with Lookahead <https://arxiv.org/abs/1907.08837>`_
+paper. The overhead on submatch extraction in the generated lexer grows with the
+number of tags --- if this number is moderate, the overhead is barely
+noticeable. In the lexer tags are implemented using a number of tag variables
+generated by re2c. There is no one-to-one correspondence between tag variables
+and tags: a single variable may be reused for different tags, and one tag may
+require multiple variables to hold all its ambiguous values. Eventually
+ambiguity is resolved, and only one final variable per tag survives. When a rule
+matches, all its tags are set to the values of the corresponding tag variables.
+The exact number of tag variables is unknown to the user; this number is
+determined by re2c. However, tag variables should be defined by the user as a
+part of the lexer state and updated by ``YYFILL``, therefore re2c provides
+directives ``/*!stags:re2c*/`` and ``/*!mtags:re2c*/`` that can be used to
+declare, initialize and manipulate tag variables. These directives have two
+optional configurations: ``format = "@@";`` (specifies the template where ``@@``
+is substituted with the name of each tag variable), and ``separator = "";``
+(specifies the piece of code used to join the generated pieces for different
+tag variables).
+
+S-tags support the following operations:
+
+* save input position to an s-tag: ``t = YYCURSOR`` with default API or a
+  user-defined operation ``YYSTAGP(t)`` with generic API
+* save default value to an s-tag: ``t = NULL`` with default API or a
+  user-defined operation ``YYSTAGN(t)`` with generic API
+* copy one s-tag to another: ``t1 = t2``
+
+M-tags support the following operations:
+
+* append input position to an m-tag: a user-defined operation ``YYMTAGP(t)``
+  with both default and generic API
+* append default value to an m-tag: a user-defined operation ``YYMTAGN(t)``
+  with both default and generic API
+* copy one m-tag to another: ``t1 = t2``
+
+S-tags can be implemented as scalar values (pointers or offsets). M-tags need a
+more complex representation, as they need to store a sequence of tag values. The
+most naive and inefficient representation of an m-tag is a list (array, vector)
+of tag values; a more efficient representation is to store all m-tags in a
+prefix-tree represented as array of nodes ``(v, p)``, where ``v`` is tag value
+and ``p`` is a pointer to parent node.
+
+Here is an example of using s-tags to parse an IPv4 address.
+
+.. literalinclude:: /manual/features/submatch/stags.re
+    :language: c
+
+Here is an example of using POSIX capturing groups to parse an IPv4 address.
+
+.. literalinclude:: /manual/features/submatch/posix.re
+    :language: c
+
+Here is an example of using m-tags to parse a semicolon-separated sequence of
+words (C++). Tag variables are stored in a tree that is packed in a vector.
+
+.. literalinclude:: /manual/features/submatch/mtags.re
+    :language: cpp
diff --git a/src/manual/warnings/condition_order/fixorder.re.txt b/src/manual/warnings/condition_order/fixorder.re.txt

new file mode 100644 (file)

index 0000000..40a3c73
--- /dev/null
+++ b/src/manual/warnings/condition_order/fixorder.re.txt
@@ -0,0 +1,34 @@
+#include <stdio.h>
+
+#ifdef REVERSED_CONDITION_ORDER
+#    define yyca 1
+#    define yycb 0
+#else
+#    define yyca 0
+#    define yycb 1
+#endif
+
+int main()
+{
+    const char * YYCURSOR = "aaaa,bbb!";
+    int c = yyca;
+    for (;;) {
+    /*!re2c
+        re2c:define:YYCTYPE = char;
+        re2c:yyfill:enable = 0;
+        re2c:define:YYSETCONDITION = "c = @@;";
+        re2c:define:YYSETCONDITION:naked = 1;
+        re2c:define:YYGETCONDITION = c;
+        re2c:define:YYGETCONDITION:naked = 1;
+
+        <*> * { printf ("error\n"); break; }
+
+        <a> "a"      { printf ("a"); continue; }
+        <a> "," => b { printf (","); continue; }
+
+        <b> "!" { printf ("!\n"); break; }
+        <b> "b" { printf ("b"); continue; }
+    */
+    }
+    return 0;
+}
diff --git a/src/manual/warnings/match_empty_string/hang.re b/src/manual/warnings/match_empty_string/hang.re

new file mode 100644 (file)

index 0000000..f4926d9
--- /dev/null
+++ b/src/manual/warnings/match_empty_string/hang.re
@@ -0,0 +1,17 @@
+#include <stdio.h>
+
+int main(int argc, char **argv)
+{
+    for (int i = 1; i < argc; ++i) {
+        for (char *YYCURSOR = argv[i];;) {
+        /*!re2c
+            re2c:define:YYCTYPE = char;
+            re2c:yyfill:enable = 0;
+            "\x00" { break; }
+            [a-z]* { continue; }
+        */
+        }
+        printf("argv[%d]: %s\n", i, argv[i]);
+    }
+    return 0;
+}
author	Ulya Trofimovich <skvadrik@gmail.com>
	Fri, 26 Jul 2019 09:32:50 +0000 (10:32 +0100)
committer	Ulya Trofimovich <skvadrik@gmail.com>
	Fri, 26 Jul 2019 09:32:50 +0000 (10:32 +0100)
src/about/2019_borsotti_trofimovich_efficient_posix_submatch_extraction_on_nfa.pdf	[new file with mode: 0644]	patch \| blob
src/manual/features/dot/utf8_any.re	[new file with mode: 0644]	patch \| blob
src/manual/features/eof/01_sentinel.re	[new file with mode: 0644]	patch \| blob
src/manual/features/eof/02_bounds_checking.re	[new file with mode: 0644]	patch \| blob
src/manual/features/eof/03_eof_rule.re	[new file with mode: 0644]	patch \| blob
src/manual/features/eof/04_generic_api.re	[new file with mode: 0644]	patch \| blob
src/manual/features/eof/eof.rst	[new file with mode: 0644]	patch \| blob
src/manual/features/fill/01_fill.re	[new file with mode: 0644]	patch \| blob
src/manual/features/fill/02_fill.re	[new file with mode: 0644]	patch \| blob
src/manual/features/fill/fill.rst	[new file with mode: 0644]	patch \| blob
src/manual/features/fill/input.txt	[new file with mode: 0644]	patch \| blob
src/manual/features/submatch/mtags.re	[new file with mode: 0644]	patch \| blob
src/manual/features/submatch/posix.re	[new file with mode: 0644]	patch \| blob
src/manual/features/submatch/stags.re	[new file with mode: 0644]	patch \| blob
src/manual/features/submatch/submatch.rst		patch \| blob \| history
src/manual/warnings/condition_order/fixorder.re.txt	[new file with mode: 0644]	patch \| blob
src/manual/warnings/match_empty_string/hang.re	[new file with mode: 0644]	patch \| blob