--- /dev/null
+/*!re2c
+ * {}
+ [^] {}
+*/
--- /dev/null
+#include <assert.h>
+/*
+ * Sentinel-method lexer. Scans the string at YYCURSOR with the rules below:
+ * every [a-z]+ match increments the counter, runs of spaces are skipped, a
+ * 0x00 byte ends the scan and returns the counter, and any other input hits
+ * the default rule and returns -1.
+ */
+static int lex(const char *YYCURSOR)
+{
+ int count = 0; /* number of [a-z]+ tokens matched so far */
+loop: /* re-entered after every word or run of spaces */
+ /*!re2c
+ re2c:define:YYCTYPE = char;
+ re2c:yyfill:enable = 0;
+
+ * { return -1; }
+ [\x00] { return count; }
+ [a-z]+ { ++count; goto loop; }
+ [ ]+ { goto loop; }
+
+ */
+}
+/*
+ * Self-test for lex(): expects 0 for the empty string, 3 for three words
+ * separated by spaces, and -1 for input containing characters that only the
+ * default rule accepts.
+ */
+int main()
+{
+ assert(lex("") == 0);
+ assert(lex("one two three") == 3);
+ assert(lex("one two 123?") == -1);
+ return 0;
+}
--- /dev/null
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*!max:re2c*/
+
+/*
+ * Lexer for the bounds-checking example. Copies `str` into a heap buffer
+ * followed by YYMAXFILL zero bytes of padding and counts the [a-z]+ words and
+ * single-quoted strings in it; runs of spaces are skipped.
+ *
+ * A 0x00 byte is accepted as end of input only if it is the first byte of the
+ * padding, i.e. exactly YYMAXFILL - 1 padding bytes remain after it. This
+ * holds for any value of YYMAXFILL, not just 1.
+ *
+ * Returns the number of tokens, or -1 on a lexing error or when the buffer
+ * cannot be allocated.
+ */
+static int lex(const char *str)
+{
+ const size_t len = strlen(str);
+ char *buf = malloc(len + YYMAXFILL);
+ if (buf == NULL) return -1; /* nothing to free yet */
+ memcpy(buf, str, len);
+ memset(buf + len, 0, YYMAXFILL);
+
+ const char *YYCURSOR = buf;
+ const char *YYLIMIT = buf + len + YYMAXFILL;
+ int count = 0;
+
+loop:
+ /*!re2c
+ re2c:define:YYCTYPE = char;
+ re2c:define:YYFILL:naked = 1;
+ re2c:define:YYFILL = "goto error;";
+
+ * { goto error; }
+ [\x00] { if (YYLIMIT - YYCURSOR == YYMAXFILL - 1) goto end; else goto error; }
+ [a-z]+ { ++count; goto loop; }
+ ['] ([^'] | [\\]['])* ['] { ++count; goto loop; }
+ [ ]+ { goto loop; }
+
+ */
+error:
+ count = -1;
+end:
+ free(buf);
+ return count;
+}
+/*
+ * Self-test for lex(): plain words, input with characters outside the
+ * grammar, quoted strings (including an escaped quote and an empty string),
+ * and an unterminated quoted string, which must be rejected with -1.
+ */
+int main()
+{
+ assert(lex("") == 0);
+ assert(lex("one two three") == 3);
+ assert(lex("one two 123?") == -1);
+ assert(lex("one 'two' 'th\\'ree' '123?' ''") == 5);
+ assert(lex("one 'two' 'three") == -1);
+ return 0;
+}
--- /dev/null
+#include <assert.h>
+#include <string.h>
+/*
+ * EOF-rule lexer. YYLIMIT is set to the terminating zero byte of `str`, and
+ * `re2c:eof = 0` declares 0 as the sentinel value, so the input is neither
+ * copied nor padded. Counts [a-z]+ words and single-quoted strings; runs of
+ * spaces are skipped.
+ *
+ * Returns the token count when the `$` (end of input) rule matches, or -1
+ * when the default rule matches.
+ */
+static int lex(const char *str)
+{
+ const char *YYCURSOR = str;
+ const char *YYLIMIT = str + strlen(str); /* address of the terminating zero byte */
+ int count = 0;
+
+loop:
+ /*!re2c
+ re2c:define:YYCTYPE = char;
+ re2c:yyfill:enable = 0;
+ re2c:eof = 0;
+
+ * { return -1; }
+ $ { return count; }
+ [a-z]+ { ++count; goto loop; }
+ ['] ([^'] | [\\]['])* ['] { ++count; goto loop; }
+ [ ]+ { goto loop; }
+
+ */
+}
+/*
+ * Self-test for lex(): plain words, input with characters outside the
+ * grammar, quoted strings (including an escaped quote and an empty string),
+ * and an unterminated quoted string, which must be rejected with -1.
+ */
+int main()
+{
+ assert(lex("") == 0);
+ assert(lex("one two three") == 3);
+ assert(lex("one two 123?") == -1);
+ assert(lex("one 'two' 'th\\'ree' '123?' ''") == 5);
+ assert(lex("one 'two' 'three") == -1);
+ return 0;
+}
--- /dev/null
+#include <assert.h>
+#include <string.h>
+/*
+ * Generic-API primitives and lexer. YYPEEK() reads the byte at `cur`;
+ * YYSKIP() advances `cur` and makes the enclosing function return -1 once
+ * `cur` has moved past `lim`. `lim` is str + strlen(str) + 1, i.e. one past
+ * the terminating zero byte, so a 0x00 byte is accepted as end of input only
+ * when `cur == lim` after it has been consumed.
+ *
+ * lex() returns the number of [a-z]+ words and single-quoted strings, or -1
+ * on error.
+ *
+ * NOTE(review): YYSKIP() only fails when cur > lim, so `cur` can equal `lim`
+ * while lexing continues (for instance inside an unterminated quoted string).
+ * If the generated code then evaluates YYPEEK(), it reads one byte past the
+ * terminator -- confirm against the re2c output.
+ */
+#define YYPEEK() *cur
+#define YYSKIP() if (++cur > lim) return -1
+static int lex(const char *str)
+{
+ const char *cur = str;
+ const char *lim = str + strlen(str) + 1;
+ int count = 0;
+
+loop:
+ /*!re2c
+ re2c:define:YYCTYPE = char;
+ re2c:yyfill:enable = 0;
+ re2c:flags:input = custom;
+
+ * { return -1; }
+ [\x00] { return cur == lim ? count : -1; }
+ [a-z]+ { ++count; goto loop; }
+ ['] ([^'] | [\\]['])* ['] { ++count; goto loop; }
+ [ ]+ { goto loop; }
+
+ */
+}
+/*
+ * Self-test for lex(): plain words, input with characters outside the
+ * grammar, quoted strings (including an escaped quote and an empty string),
+ * and an unterminated quoted string, which must be rejected with -1.
+ */
+int main()
+{
+ assert(lex("") == 0);
+ assert(lex("one two three") == 3);
+ assert(lex("one two 123?") == -1);
+ assert(lex("one 'two' 'th\\'ree' '123?' ''") == 5);
+ assert(lex("one 'two' 'three") == -1);
+ return 0;
+}
--- /dev/null
+Re2c provides a number of ways to handle end-of-input situation. Which way to
+use depends on the complexity of regular expressions, performance
+considerations, the need for input buffering and various other factors. EOF
+handling is probably the most complex part of re2c user interface --- it
+definitely requires a bit of understanding of how the generated lexer works.
+But in return it allows the user to customize lexer for a particular environment
+and avoid the unnecessary overhead of generic methods when a simpler method is
+sufficient. Roughly speaking, there are four main methods:
+
+- using sentinel symbol (simple and efficient, but limited)
+- bounds checking with padding (generic, but complex)
+- EOF rule: a combination of sentinel symbol and bounds checking (generic and
+ simple, can be more or less efficient than bounds checking with padding
+ depending on the grammar)
+- using generic API (user-defined, so may be incorrect ;])
+
+Using sentinel symbol
+---------------------
+This is the simplest and the most efficient method. It is applicable in cases
+when the input is small enough to fit into a contiguous memory buffer and there
+is a natural "sentinel" symbol --- a code unit that is not allowed by any of the
+regular expressions in grammar (except possibly as a terminating character).
+Sentinel symbol never appears in well-formed input, therefore it can be appended
+at the end of input and used as a stop signal by the lexer. A good example of
+such input is a null-terminated C-string, provided that the grammar does not
+allow ``NUL`` in the middle of lexemes. Sentinel method is very efficient,
+because the lexer does not need to perform any additional checks for the end of
+input --- it comes naturally as a part of processing the next character.
+
+Below is an example of using sentinel method. Configuration
+``re2c:yyfill:enable = 0;`` suppresses generation of end-of-input checks and
+``YYFILL`` calls.
+
+.. literalinclude:: /manual/features/eof/01_sentinel.re
+ :language: c
+
+Bounds checking with padding
+----------------------------
+
+Bounds checking is a generic method: it can be used with any input grammar.
+The basic idea is simple: we need to check for the end of input before reading
+the next input character. However, if implemented in a straightforward way, this
+would be quite inefficient: checking on each input character would cause a major
+slowdown. Re2c avoids slowdown by generating checks only in certain key states
+of the lexer, and letting it run without checks in-between the key states.
+More precisely, re2c computes strongly connected components (SCCs) of
+the underlying DFA (which roughly correspond to loops), and generates only a few
+checks per each SCC (usually just one, but in general enough to make the SCC
+acyclic). The check is of the form ``(YYLIMIT - YYCURSOR) < n``, where ``n``
+is the maximal length of a simple path in the corresponding SCC. If this
+condition is true, the lexer calls ``YYFILL(n)``, which must either supply at
+least ``n`` input characters, or not return. When the lexer continues after
+the check, it is certain that the next ``n`` characters can be read safely
+without checks.
+
+This approach reduces the number of checks significantly (and makes the lexer
+much faster as a result), but it has a downside. Since the lexer checks for
+multiple characters at once, it may end up in a situation when there are a few
+remaining input characters (less than ``n``) corresponding to a short path in
+the SCC, but the lexer cannot proceed because of the check, and ``YYFILL``
+cannot supply more characters because it is the end of input. To solve this
+problem, re2c requires that additional padding consisting of fake characters is
+appended at the end of input. The length of padding should be ``YYMAXFILL``,
+which equals the maximum ``n`` parameter to ``YYFILL`` and must be generated
+by re2c using ``/*!max:re2c*/`` directive. The fake characters should not form a
+valid lexeme suffix, otherwise the lexer may be fooled into matching a fake
+lexeme. Usually it's a good idea to use ``NUL`` characters for padding.
+
+Below is an example of using bounds checking with padding. Note that the grammar
+rule for single-quoted strings allows arbitrary symbols in the middle of lexeme,
+so there is no natural sentinel in the grammar. Strings like ``"aha\0ha"`` are
+perfectly valid, but ill-formed strings like ``"aha\0`` are also possible and
+shouldn’t crash the lexer. In this example we do not use buffer refilling,
+therefore ``YYFILL`` definition simply returns an error. Note that ``YYFILL``
+will only be called after the lexer reaches padding, because only then will the
+check condition be satisfied.
+
+.. literalinclude:: /manual/features/eof/02_bounds_checking.re
+ :language: c
+
+EOF rule
+--------
+
+EOF rule ``$`` was introduced in version 1.2. It is a hybrid approach that tries
+to take the best of both worlds: simplicity and efficiency of the sentinel
+method combined with the generality of bounds-checking method. The idea is to
+appoint an arbitrary symbol to be the sentinel, and only perform further bounds
+checking if the sentinel symbol matches (more precisely, if the symbol class that
+contains it matches). The check is of the form ``YYLIMIT <= YYCURSOR``.
+If this condition is not satisfied, then the sentinel is just an ordinary input
+character and the lexer continues. Otherwise this is a real sentinel, and the
+lexer calls ``YYFILL()``. If ``YYFILL`` returns zero, the lexer assumes that it
+has more input and tries to re-match. Otherwise ``YYFILL`` returns non-zero and
+the lexer knows that it has reached the end of input. At this point there are
+three possibilities. First, it might have already matched a shorter lexeme ---
+in this case it just rolls back to the last accepting state. Second, it might
+have consumed some characters, but failed to match --- in this case it falls
+back to default rule ``*``. Finally, it might be in the initial state --- in
+this (and only this!) case it matches EOF rule ``$``.
+
+Below is an example of using EOF rule. Configuration ``re2c:yyfill:enable = 0;``
+suppresses generation of ``YYFILL`` calls (but not the bounds checks).
+
+.. literalinclude:: /manual/features/eof/03_eof_rule.re
+ :language: c
+
+Using generic API
+-----------------
+
+Generic API can be used with any of the above methods. It also allows one to use a
+user-defined method by placing EOF checks in one of the basic primitives.
+Usually this is either ``YYSKIP`` (the check is performed when advancing to the
+next input character), or ``YYPEEK`` (the check is performed when reading the
+next input character). The resulting methods are inefficient, as they check on
+each input character. However, they can be useful in cases when the input cannot
+be buffered or padded and does not contain a sentinel character at the end. One
+should be cautious when using such ad-hoc methods, as it is easy to overlook
+some corner cases and come up with a method that only partially works. Also it
+should be noted that not everything can be expressed via generic API: for
+example, it is impossible to reimplement the way EOF rule works (in particular,
+it is impossible to re-match the character after successful ``YYFILL``).
+
+Below is an example of using ``YYSKIP`` to perform bounds checking without
+padding. ``YYFILL`` generation is suppressed using ``re2c:yyfill:enable = 0;``
+configuration. Note that if the grammar were more complex, this method might not
+work in the case when two rules overlap and the EOF check fails after a shorter
+lexeme has already been matched (as it happens, there are no overlapping rules
+in our example).
+
+.. literalinclude:: /manual/features/eof/04_generic_api.re
+ :language: c
--- /dev/null
+#include <stdio.h>
+#include <string.h>
+
+/*!max:re2c*/
+#define SIZE 4096
+/*
+ * Buffered input state used by fill(), init() and lex(). `buf` has room for
+ * SIZE bytes of input plus YYMAXFILL bytes of padding.
+ */
+typedef struct {
+ FILE *file; /* stream that fill() reads from */
+ char buf[SIZE + YYMAXFILL], *lim, *cur, *tok; /* limit, cursor and token-start positions inside buf */
+ int eof; /* set by fill() when a read leaves the buffer short of SIZE bytes */
+} Input;
+/*
+ * Discards the part of the buffer that precedes `tok` and reads more input
+ * from `in->file`. The bytes from `tok` up to `lim` are moved to the start
+ * of the buffer, `lim`, `cur` and `tok` are shifted back by the same amount,
+ * and the space freed at the end is filled with fread(). If the buffer then
+ * holds fewer than SIZE bytes, `eof` is set and YYMAXFILL zero bytes of
+ * padding are appended.
+ *
+ * Returns 0 on success, 1 if `eof` was already set, and 2 if fewer than
+ * `need` bytes precede `tok` (not enough room can be freed).
+ *
+ * NOTE(review): a short fread() is treated as end of input; there is no
+ * ferror() check, so read errors are not distinguished.
+ */
+static int fill(Input *in, size_t need)
+{
+ if (in->eof) {
+ return 1;
+ }
+ const size_t free = in->tok - in->buf; /* bytes before tok that can be discarded */
+ if (free < need) {
+ return 2;
+ }
+ memmove(in->buf, in->tok, in->lim - in->tok);
+ in->lim -= free;
+ in->cur -= free;
+ in->tok -= free;
+ in->lim += fread(in->lim, 1, free, in->file);
+ if (in->lim < in->buf + SIZE) {
+ in->eof = 1;
+ memset(in->lim, 0, YYMAXFILL);
+ in->lim += YYMAXFILL;
+ }
+ return 0;
+}
+/*
+ * Prepares `in` for reading from `file`: all three positions start at
+ * buf + SIZE, so the whole buffer counts as discardable, `eof` is cleared,
+ * and fill() is called once to load the first chunk. The result of that
+ * fill() call is not checked.
+ */
+static void init(Input *in, FILE *file)
+{
+ in->file = file;
+ in->cur = in->tok = in->lim = in->buf + SIZE;
+ in->eof = 0;
+ fill(in, 1);
+}
+/*
+ * YYFILL(n) calls fill() and makes the enclosing function return -1 if it
+ * fails.
+ *
+ * lex() counts the [a-z]+ words and single-quoted strings read through `in`;
+ * runs of spaces are skipped. A 0x00 byte ends the input successfully only
+ * when exactly YYMAXFILL bytes lie between `tok` and `lim`, i.e. when it is
+ * the first byte of the padding appended by fill(). Returns the token count,
+ * or -1 on error.
+ */
+#define YYFILL(n) if (fill(in, n) != 0) return -1
+static int lex(Input *in)
+{
+ int count = 0;
+loop:
+ in->tok = in->cur; /* the next lexeme starts at the current cursor */
+ /*!re2c
+ re2c:define:YYCTYPE = char;
+ re2c:define:YYCURSOR = in->cur;
+ re2c:define:YYLIMIT = in->lim;
+
+ * { return -1; }
+ [\x00] { return (YYMAXFILL == in->lim - in->tok) ? count : -1; }
+ [a-z]+ { ++count; goto loop; }
+ ['] ([^'] | [\\]['])* ['] { ++count; goto loop; }
+ [ ]+ { goto loop; }
+
+ */
+}
+/*
+ * Opens input.txt, runs lex() over it and prints the resulting count.
+ * Returns 1 if the file cannot be opened, 0 otherwise.
+ */
+int main()
+{
+ FILE *f = fopen("input.txt", "rb");
+ if (!f) return 1;
+
+ Input in;
+ init(&in, f);
+ printf("count: %d\n", lex(&in));
+
+ fclose(f);
+ return 0;
+}
--- /dev/null
+#include <stdio.h>
+#include <string.h>
+
+#define SIZE 4096
+/*
+ * Buffered input state used by fill(), init() and lex(). `buf` has room for
+ * SIZE bytes of input plus one extra byte for the zero sentinel that fill()
+ * stores at `lim`.
+ */
+typedef struct {
+ FILE *file; /* stream that fill() reads from */
+ char buf[SIZE + 1], *lim, *cur, *tok; /* limit, cursor and token-start positions inside buf */
+ int eof; /* set by fill() when a read leaves the buffer short of SIZE bytes */
+} Input;
+/*
+ * Discards the part of the buffer that precedes `tok` and reads more input
+ * from `in->file`. The bytes from `tok` up to `lim` are moved to the start
+ * of the buffer, `lim`, `cur` and `tok` are shifted back by the same amount,
+ * the space freed at the end is filled with fread(), and a zero sentinel is
+ * written at the new `lim`. `eof` is set if the buffer then holds fewer than
+ * SIZE bytes.
+ *
+ * Returns 0 on success, 1 if `eof` was already set, and 2 if `tok` is at the
+ * start of the buffer (nothing can be discarded).
+ *
+ * NOTE(review): a short fread() is treated as end of input; there is no
+ * ferror() check, so read errors are not distinguished.
+ */
+static int fill(Input *in)
+{
+ if (in->eof) {
+ return 1;
+ }
+ const size_t free = in->tok - in->buf; /* bytes before tok that can be discarded */
+ if (free < 1) {
+ return 2;
+ }
+ memmove(in->buf, in->tok, in->lim - in->tok);
+ in->lim -= free;
+ in->cur -= free;
+ in->tok -= free;
+ in->lim += fread(in->lim, 1, free, in->file);
+ in->lim[0] = 0; /* sentinel at the limit position */
+ in->eof |= in->lim < in->buf + SIZE;
+ return 0;
+}
+/*
+ * Prepares `in` for reading from `file`: all three positions start at
+ * buf + SIZE, so the whole buffer counts as discardable, `eof` is cleared,
+ * and fill() is called once to load the first chunk. The result of that
+ * fill() call is not checked.
+ */
+static void init(Input *in, FILE *file)
+{
+ in->file = file;
+ in->cur = in->tok = in->lim = in->buf + SIZE;
+ in->eof = 0;
+ fill(in);
+}
+/*
+ * YYFILL() forwards to fill() and yields its return value.
+ *
+ * lex() counts the [a-z]+ words and single-quoted strings read through `in`;
+ * runs of spaces are skipped. `re2c:eof = 0` declares 0 as the sentinel
+ * value, which is the byte fill() stores at `lim`. Returns the token count
+ * when the `$` (end of input) rule matches, or -1 when the default rule
+ * matches.
+ */
+#define YYFILL() fill(in)
+static int lex(Input *in)
+{
+ int count = 0;
+loop:
+ in->tok = in->cur; /* the next lexeme starts at the current cursor */
+ /*!re2c
+ re2c:define:YYCTYPE = char;
+ re2c:define:YYCURSOR = in->cur;
+ re2c:define:YYLIMIT = in->lim;
+ re2c:eof = 0;
+
+ * { return -1; }
+ $ { return count; }
+ [a-z]+ { ++count; goto loop; }
+ ['] ([^'] | [\\]['])* ['] { ++count; goto loop; }
+ [ ]+ { goto loop; }
+
+ */
+}
+/*
+ * Opens input.txt, runs lex() over it and prints the resulting count.
+ * Returns 1 if the file cannot be opened, 0 otherwise.
+ */
+int main()
+{
+ FILE *f = fopen("input.txt", "rb");
+ if (!f) return 1;
+
+ Input in;
+ init(&in, f);
+ printf("count: %d\n", lex(&in));
+
+ fclose(f);
+ return 0;
+}
--- /dev/null
+The need for buffering arises when the input cannot be mapped in memory all at
+once: either it is too large, or it comes in a streaming fashion (like reading
+from a socket). The usual technique in such cases is to allocate a fixed-sized
+memory buffer and process input in chunks that fit into the buffer. When the
+current chunk is processed, it is moved out and new data is moved in. In
+practice it is somewhat more complex, because lexer state consists not of a
+single input position, but a set of interrelated positions:
+
+- cursor: the next input character to be read (``YYCURSOR`` in default API or
+ ``YYSKIP``/``YYPEEK`` in generic API)
+
+- limit: the position after the last available input character (``YYLIMIT`` in
+ default API, implicitly handled by ``YYLESSTHAN`` in generic API)
+
+- marker: the position of the most recent match, if any (``YYMARKER`` in default
+ API or ``YYBACKUP``/``YYRESTORE`` in generic API)
+
+- token: the start of the current lexeme (implicit in re2c API, as it is not
+ needed for the normal lexer operation and can be defined and updated by the
+ user)
+
+- context marker: the position of the trailing context (``YYCTXMARKER`` in
+ default API or ``YYBACKUPCTX``/``YYRESTORECTX`` in generic API)
+
+- tag variables: submatch positions (defined with ``/*!stags:re2c*/`` and
+ ``/*!mtags:re2c*/`` directives and
+ ``YYSTAGP``/``YYSTAGN``/``YYMTAGP``/``YYMTAGN`` in generic API)
+
+Not all these are used in every case, but if used, they must be updated by
+``YYFILL``. All active positions are contained in the segment between token and
+cursor, therefore everything between buffer start and token can be discarded,
+the segment from token and up to limit should be moved to the beginning of
+buffer, and the free space at the end of buffer should be filled with new data.
+In order to avoid frequent ``YYFILL`` calls it is best to fill in as many input
+characters as possible (even though fewer characters might suffice to resume the
+lexer). The details of ``YYFILL`` implementation are slightly different
+depending on which EOF handling method is used: the case of EOF rule is somewhat
+simpler than the case of bounds-checking with padding. Also note that if
+``-f --storable-state`` option is used, ``YYFILL`` has slightly different
+semantics (described in the section about storable state).
+
+YYFILL with EOF rule
+--------------------
+
+If EOF rule is used, ``YYFILL`` is a function-like primitive that accepts
+no arguments and returns a value which is checked against zero. ``YYFILL``
+invocation is triggered by condition ``YYLIMIT <= YYCURSOR`` in default API and
+``YYLESSTHAN()`` in generic API. A non-zero return value means that ``YYFILL``
+has failed. A successful ``YYFILL`` call must supply at least one character and
+adjust input positions accordingly. Limit must always be set to one after the
+last input position in buffer, and the character at the limit position must be
+the sentinel symbol specified by ``re2c:eof`` configuration. The pictures below
+show the relative locations of input positions in buffer before and after
+``YYFILL`` call (sentinel symbol is marked with ``#``, and the second picture
+shows the case when there is not enough input to fill the whole buffer).
+
+.. code-block:: none
+
+ <-- shift -->
+ >-A~~~~~~~~~~~~B~~~~~~~~~C~~~~~~~~~~~~~D#-----------E->
+ buffer token marker limit,
+ cursor
+ >-A------------B~~~~~~~~~C~~~~~~~~~~~~~D~~~~~~~~~~~~E#->
+ buffer, marker cursor limit
+ token
+
+ <-- shift -->
+ >-A~~~~~~~~~~~~B~~~~~~~~~C~~~~~~~~~~~~~D#--E (EOF)
+ buffer token marker limit,
+ cursor
+ >-A------------B~~~~~~~~~C~~~~~~~~~~~~~D~~~E#........
+ buffer, marker cursor limit
+ token
+
+
+Here is an example of a program that reads input file ``input.txt`` in chunks of
+4096 bytes and uses EOF rule.
+
+.. literalinclude:: /manual/features/fill/02_fill.re
+ :language: c
+
+YYFILL with padding
+-------------------
+
+In the default case (when EOF rule is not used) ``YYFILL`` is a function-like
+primitive that accepts a single argument and does not return any value.
+``YYFILL`` invocation is triggered by condition ``(YYLIMIT - YYCURSOR) < n`` in
+default API and ``YYLESSTHAN(n)`` in generic API. The argument passed to
+``YYFILL`` is the minimal number of characters that must be supplied. If it
+fails to do so, ``YYFILL`` must not return to the lexer (for that reason it is
+best implemented as a macro that returns from the calling function on failure).
+In case of a successful ``YYFILL`` invocation the limit position must be set
+either to one after the last input position in buffer, or to the end of
+``YYMAXFILL`` padding (in case ``YYFILL`` has successfully read at least ``n``
+characters, but not enough to fill the entire buffer). The pictures below show
+the relative locations of input positions in buffer before and after ``YYFILL``
+invocation (``YYMAXFILL`` padding on the second picture is marked with ``#``
+symbols).
+
+.. code-block:: none
+
+ <-- shift --> <-- need -->
+ >-A~~~~~~~~~~~~B~~~~~~~~~C~~~~~D~~~~~~~E---F--------G->
+ buffer token marker cursor limit
+
+ >-A------------B~~~~~~~~~C~~~~~D~~~~~~~E~~~F~~~~~~~~G->
+ buffer, marker cursor limit
+ token
+
+ <-- shift --> <-- need -->
+ >-A~~~~~~~~~~~~B~~~~~~~~~C~~~~~D~~~~~~~E-F (EOF)
+ buffer token marker cursor limit
+
+ >-A------------B~~~~~~~~~C~~~~~D~~~~~~~E~F###############
+ buffer, marker cursor limit
+ token <- YYMAXFILL ->
+
+Here is an example of a program that reads input file ``input.txt`` in chunks of
+4096 bytes and uses bounds-checking with padding.
+
+.. literalinclude:: /manual/features/fill/01_fill.re
+ :language: c
+
--- /dev/null
+one two 'th\'ree' '123' ''
\ No newline at end of file
--- /dev/null
+#include <assert.h>
+#include <vector>
+#include <string>
+/*
+ * Data model for m-tag histories. Each Mtag node stores one recorded value
+ * (`tag`) and the index of the node recorded before it (`pred`); nodes live
+ * in an MtagTree vector and ROOT (-1) stands for an empty history.
+ */
+static const int ROOT = -1;
+
+struct Mtag {
+ int pred; /* index of the previous node of the same history, or ROOT */
+ const char *tag; /* recorded input position (NULL when recorded via YYMTAGN) */
+};
+
+typedef std::vector<Mtag> MtagTree;
+typedef std::vector<std::string> Words;
+/*
+ * Appends value `t` to the history whose latest node index is `*pt`: a new
+ * node with predecessor `*pt` is pushed onto `tree`, and `*pt` is updated to
+ * the index of that new node.
+ */
+static void mtag(int *pt, const char *t, MtagTree *tree)
+{
+ Mtag m = {*pt, t};
+ *pt = (int)tree->size(); /* index the new node will get */
+ tree->push_back(m);
+}
+/*
+ * Walks the two histories ending at nodes `x` and `y` back towards ROOT in
+ * lock-step and appends to `words`, oldest pair first, the text between each
+ * pair of recorded positions. Recursion stops when `x` reaches ROOT; `y` is
+ * not checked -- assumes both histories have the same length (TODO confirm).
+ */
+static void unfold(const MtagTree &tree, int x, int y, Words &words)
+{
+ if (x == ROOT) return;
+ unfold(tree, tree[x].pred, tree[y].pred, words); /* older entries first */
+ const char *px = tree[x].tag, *py = tree[y].tag;
+ words.push_back(std::string(px, py - px));
+}
+/*
+ * YYMTAGP(t) records the current cursor position in the history of m-tag
+ * `t`; YYMTAGN(t) records NULL instead. Both append to the local `tree`.
+ *
+ * lex() matches one or more words of [a-zA-Z0-9_]+ each followed by a
+ * semicolon. On a match it replaces the contents of `words` with the text
+ * between tags #x and #y of every repetition and returns true; otherwise it
+ * returns false and leaves `words` untouched.
+ */
+#define YYMTAGP(t) mtag(&t, YYCURSOR, &tree)
+#define YYMTAGN(t) mtag(&t, NULL, &tree)
+static bool lex(const char *YYCURSOR, Words &words)
+{
+ const char *YYMARKER;
+ /*!mtags:re2c format = "int @@ = ROOT;"; */
+ MtagTree tree;
+ int x, y;
+
+ /*!re2c
+ re2c:define:YYCTYPE = char;
+ re2c:yyfill:enable = 0;
+ re2c:flags:tags = 1;
+
+ (#x [a-zA-Z0-9_]+ #y [;])+ {
+ words.clear();
+ unfold(tree, x, y, words);
+ return true;
+ }
+ * { return false; }
+
+ */
+}
+/*
+ * Self-test: an input of three semicolon-terminated words must be split into
+ * exactly those three words.
+ */
+int main()
+{
+ Words w;
+ assert(lex("one;tw0;three;", w) && w == Words({"one", "tw0", "three"}));
+ return 0;
+}
--- /dev/null
+#include <assert.h>
+#include <stdint.h>
+/*
+ * Converts the characters in [s, e) to an unsigned integer, treating each
+ * one as a decimal digit. No validation is done; lex() only passes ranges
+ * matched by [0-9]{1,3}.
+ */
+static uint32_t num(const char *s, const char *e)
+{
+ uint32_t n = 0;
+ for (; s < e; ++s) n = n * 10 + (*s - '0');
+ return n;
+}
+
+/*!maxnmatch:re2c*/
+
+/*
+ * Parses a dotted-decimal IPv4 address with POSIX capturing groups and
+ * returns it as a 32-bit integer with the first octet in the most
+ * significant byte. Returns 0 if the input does not match.
+ *
+ * Each group takes two yypmatch slots (start and end), and group 0 is the
+ * whole match, so the four octet groups use slots 2..9 and the array needs
+ * YYMAXNMATCH * 2 elements.
+ */
+static uint32_t lex(const char *YYCURSOR)
+{
+ const char *YYMARKER;
+ const char *yypmatch[YYMAXNMATCH * 2];
+ uint32_t yynmatch;
+ /*!stags:re2c format = 'const char *@@;'; */
+
+ /*!re2c
+ re2c:define:YYCTYPE = char;
+ re2c:yyfill:enable = 0;
+ re2c:flags:posix-captures = 1;
+
+ oct = [0-9]{1,3};
+ dot = [.];
+
+ (oct) dot (oct) dot (oct) dot (oct) {
+ return num(yypmatch[8], yypmatch[9])
+ + (num(yypmatch[6], yypmatch[7]) << 8)
+ + (num(yypmatch[4], yypmatch[5]) << 16)
+ + (num(yypmatch[2], yypmatch[3]) << 24);
+ }
+ * { return 0; }
+
+ */
+}
+/*
+ * Self-test: each dotted-decimal address must come back as the 32-bit value
+ * with the first octet in the most significant byte.
+ */
+int main()
+{
+ assert(lex("1.2.3.4") == 0x01020304);
+ assert(lex("127.0.0.1") == 0x7f000001);
+ assert(lex("255.255.255.255") == 0xffffffff);
+ return 0;
+}
--- /dev/null
+#include <assert.h>
+#include <stdint.h>
+/*
+ * Converts the characters in [s, e) to an unsigned integer, treating each
+ * one as a decimal digit. No validation is done; lex() only passes ranges
+ * matched by [0-9]{1,3}.
+ */
+static uint32_t num(const char *s, const char *e)
+{
+ uint32_t n = 0;
+ for (; s < e; ++s) n = n * 10 + (*s - '0');
+ return n;
+}
+/*
+ * Parses a dotted-decimal IPv4 address using s-tags and returns it as a
+ * 32-bit integer with the first octet in the most significant byte. Returns
+ * 0 if the input does not match.
+ *
+ * Tags o1..o4 mark the start of each octet. An octet ends one character
+ * before the next tag (that character is the dot), and the last octet ends
+ * at YYCURSOR.
+ */
+static uint32_t lex(const char *YYCURSOR)
+{
+ const char *YYMARKER, *o1, *o2, *o3, *o4;
+ /*!stags:re2c format = 'const char *@@;'; */
+
+ /*!re2c
+ re2c:define:YYCTYPE = char;
+ re2c:yyfill:enable = 0;
+ re2c:flags:tags = 1;
+
+ oct = [0-9]{1,3};
+ dot = [.];
+
+ @o1 oct dot @o2 oct dot @o3 oct dot @o4 oct {
+ return num(o4, YYCURSOR)
+ + (num(o3, o4 - 1) << 8)
+ + (num(o2, o3 - 1) << 16)
+ + (num(o1, o2 - 1) << 24);
+ }
+ * { return 0; }
+
+ */
+}
+/*
+ * Self-test: each dotted-decimal address must come back as the 32-bit value
+ * with the first octet in the most significant byte.
+ */
+int main()
+{
+ assert(lex("1.2.3.4") == 0x01020304);
+ assert(lex("127.0.0.1") == 0x7f000001);
+ assert(lex("255.255.255.255") == 0xffffffff);
+ return 0;
+}
-Submatch
---------
+Re2c has two options for submatch extraction.
-.. toctree::
- :hidden:
-
-.. include:: submatch.rst_
+The first option is ``-T --tags``. With this option one can use standalone tags
+of the form ``@stag`` and ``#mtag``, where ``stag`` and ``mtag`` are arbitrary
+user-defined names. Tags can be used anywhere inside of a regular expression;
+semantically they are just position markers. Tags of the form ``@stag`` are
+called s-tags: they denote a single submatch value (the last input position
+where this tag matched). Tags of the form ``#mtag`` are called m-tags: they
+denote multiple submatch values (the whole history of repetitions of this tag).
+All tags should be defined by the user as variables with the corresponding
+names. With standalone tags re2c uses leftmost greedy disambiguation: submatch
+positions correspond to the leftmost matching path through the regular
+expression.
+
+The second option is ``-P --posix-captures``: it enables POSIX-compliant
+capturing groups. In this mode parentheses in regular expressions denote the
+beginning and the end of capturing groups; the whole regular expression is group
+number zero. The number of groups for the matching rule is stored in a variable
+``yynmatch``, and submatch results are stored in ``yypmatch`` array. Both
+``yynmatch`` and ``yypmatch`` should be defined by the user, and ``yypmatch``
+size must be at least ``[yynmatch * 2]``. Re2c provides a directive
+``/*!maxnmatch:re2c*/`` that defines ``YYMAXNMATCH``: a constant equal to the
+maximal value of ``yynmatch`` among all rules. Note that re2c implements
+POSIX-compliant disambiguation: each subexpression matches as long as possible,
+and subexpressions that start earlier in regular expression have priority over
+those starting later. Capturing groups are translated into s-tags under the
+hood, therefore we use the word "tag" to describe them as well.
+
+With both ``-P --posix-captures`` and ``-T --tags`` options re2c uses efficient
+submatch extraction algorithm described in the
+`Tagged Deterministic Finite Automata with Lookahead <https://arxiv.org/abs/1907.08837>`_
+paper. The overhead on submatch extraction in the generated lexer grows with the
+number of tags --- if this number is moderate, the overhead is barely
+noticeable. In the lexer tags are implemented using a number of tag variables
+generated by re2c. There is no one-to-one correspondence between tag variables
+and tags: a single variable may be reused for different tags, and one tag may
+require multiple variables to hold all its ambiguous values. Eventually
+ambiguity is resolved, and only one final variable per tag survives. When a rule
+matches, all its tags are set to the values of the corresponding tag variables.
+The exact number of tag variables is unknown to the user; this number is
+determined by re2c. However, tag variables should be defined by the user as a
+part of the lexer state and updated by ``YYFILL``, therefore re2c provides
+directives ``/*!stags:re2c*/`` and ``/*!mtags:re2c*/`` that can be used to
+declare, initialize and manipulate tag variables. These directives have two
+optional configurations: ``format = "@@";`` (specifies the template where ``@@``
+is substituted with the name of each tag variable), and ``separator = "";``
+(specifies the piece of code used to join the generated pieces for different
+tag variables).
+
+S-tags support the following operations:
+
+* save input position to an s-tag: ``t = YYCURSOR`` with default API or a
+ user-defined operation ``YYSTAGP(t)`` with generic API
+* save default value to an s-tag: ``t = NULL`` with default API or a
+ user-defined operation ``YYSTAGN(t)`` with generic API
+* copy one s-tag to another: ``t1 = t2``
+
+M-tags support the following operations:
+
+* append input position to an m-tag: a user-defined operation ``YYMTAGP(t)``
+ with both default and generic API
+* append default value to an m-tag: a user-defined operation ``YYMTAGN(t)``
+ with both default and generic API
+* copy one m-tag to another: ``t1 = t2``
+
+S-tags can be implemented as scalar values (pointers or offsets). M-tags need a
+more complex representation, as they need to store a sequence of tag values. The
+most naive and inefficient representation of an m-tag is a list (array, vector)
+of tag values; a more efficient representation is to store all m-tags in a
+prefix-tree represented as array of nodes ``(v, p)``, where ``v`` is tag value
+and ``p`` is a pointer to parent node.
+
+Here is an example of using s-tags to parse an IPv4 address.
+
+.. literalinclude:: /manual/features/submatch/stags.re
+ :language: c
+
+Here is an example of using POSIX capturing groups to parse an IPv4 address.
+
+.. literalinclude:: /manual/features/submatch/posix.re
+ :language: c
+
+Here is an example of using m-tags to parse a semicolon-separated sequence of
+words (C++). Tag variables are stored in a tree that is packed in a vector.
+
+.. literalinclude:: /manual/features/submatch/mtags.re
+ :language: c
--- /dev/null
+#include <stdio.h>
+/*
+ * Numeric identifiers yyca and yycb. Defining REVERSED_CONDITION_ORDER swaps
+ * the two values. NOTE(review): presumably these are the names re2c uses for
+ * conditions <a> and <b> in the lexer below -- confirm.
+ */
+#ifdef REVERSED_CONDITION_ORDER
+# define yyca 1
+# define yycb 0
+#else
+# define yyca 0
+# define yycb 1
+#endif
+/*
+ * Lexes a fixed input string with two conditions, starting in yyca. The
+ * current condition is kept in `c` (read through YYGETCONDITION, written
+ * through YYSETCONDITION). In condition a, each a is printed and a comma
+ * switches to condition b; in condition b, each b is printed and an
+ * exclamation mark ends the loop. Any other input prints an error message
+ * and ends the loop.
+ */
+int main()
+{
+ const char * YYCURSOR = "aaaa,bbb!";
+ int c = yyca;
+ for (;;) {
+ /*!re2c
+ re2c:define:YYCTYPE = char;
+ re2c:yyfill:enable = 0;
+ re2c:define:YYSETCONDITION = "c = @@;";
+ re2c:define:YYSETCONDITION:naked = 1;
+ re2c:define:YYGETCONDITION = c;
+ re2c:define:YYGETCONDITION:naked = 1;
+
+ <*> * { printf ("error\n"); break; }
+
+ <a> "a" { printf ("a"); continue; }
+ <a> "," => b { printf (","); continue; }
+
+ <b> "!" { printf ("!\n"); break; }
+ <b> "b" { printf ("b"); continue; }
+ */
+ }
+ return 0;
+}
--- /dev/null
+#include <stdio.h>
+/*
+ * Runs the lexer over every command-line argument and then prints the
+ * argument. The inner loop ends when the zero terminator is matched; each
+ * [a-z]* match restarts it.
+ *
+ * NOTE(review): there is no default rule, and [a-z]* can match the empty
+ * string, so an argument containing a byte outside a-z may never advance
+ * YYCURSOR or may have unspecified control flow -- confirm whether this
+ * sample is meant to demonstrate that.
+ */
+int main(int argc, char **argv)
+{
+ for (int i = 1; i < argc; ++i) {
+ for (char *YYCURSOR = argv[i];;) {
+ /*!re2c
+ re2c:define:YYCTYPE = char;
+ re2c:yyfill:enable = 0;
+ "\x00" { break; }
+ [a-z]* { continue; }
+ */
+ }
+ printf("argv[%d]: %s\n", i, argv[i]);
+ }
+ return 0;
+}