Recognizing strings: the need for YYMAXFILL
-------------------------------------------
-This example is about recognizing strings.
-Strings may contain *any* characters in the range ``[0 - 0xFF]`` except quotes ``"`` and escape ``\`` (they must be escaped).
+This example is about recognizing simple strings without escapes
+(strings with escapes are lexed in the `Parsing strings: multiple re2c blocks`_ example).
+
+Our strings are single-quoted and may contain any characters in the range ``[0 - 0xFF]``, except single quotes ``'``.
It means that (unlike the previous example, `Recognizing integers: the sentinel method`_)
we cannot use ``NULL`` or any other character as a sentinel:
-strings like ``"aha\0ha"\0`` are perfectly valid, but ``"aha\0`` is also possible and shouldn't crash lexer.
+input strings like ``'aha\0ha'\0`` are perfectly valid,
+but incorrect input like ``'aha\0`` is also possible and shouldn't crash the lexer.
-By default re2c-generated lexers contain explicit checks for the end of input
-(these checks can be suppressed with ``re2c:yyfill:enable = 0;`` configuration).
+By default re2c generates explicit checks for the end of input,
+so we must simply omit ``re2c:yyfill:enable = 0;`` configuration.
A naive approach is to check on each character (before advancing input position), but it's very slow.
Instead, re2c inserts checks only at certain points in the generated program.
Each check ensures that there is enough input to proceed until the next checkpoint.
$ re2c -o example.cc 02_recognizing_strings.re
$ g++ -o example example.cc
- $ ./example \"a\ momentary\" \"\" \"lap\"se\" \"of \"rea\\\"son\" ""
- str: "a momentary"
- str: ""
- err: "lap"se"
- err: "of
- str: "rea\"son"
+ $ ./example "'a momentary'" "''" "'lap'se'" "'of" "'" "'rea''son'" ""
+ str: 'a momentary'
+ str: ''
+ err: 'lap'se'
+ err: 'of
+ err: '
+ err: 'rea''son'
err:
.. _Arbitrary large input and YYFILL:
$ ./example input.txt
glorious 7 strings!
+.. _Parsing strings: multiple re2c blocks:
+
+Parsing strings: multiple re2c blocks
+-------------------------------------
+
+This example is based on the `Recognizing strings: the need for YYMAXFILL`_ example,
+only now we will fully parse double-quoted C-like strings
+rather than simply recognize single-quoted Shell strings.
+
+Our strings can contain:
+
+* any unescaped ASCII character except double quote ``"``, escape ``\`` and newline
+* simple escapes: ``\'``, ``\"``, ``\?``, ``\\``, ``\a``, ``\b``, ``\f``, ``\n``, ``\r``, ``\t``, ``\v``
+* octal escapes: ``\`` followed by one or more characters in range ``[0 - 7]``
+* hexadecimal escapes: ``\x`` followed by one or more characters in range ``[0 - 9]``, ``[a - f]`` or ``[A - F]``
+
+Octal and hexadecimal escapes are greedy: an escape covers as many characters as possible (without causing a lexical error).
+
+`[04_parsing_strings.re] <examples/04_parsing_strings.re>`_
+
+.. include:: examples/04_parsing_strings.re
+ :code: cpp
+ :number-lines:
+
+Notes:
+
+* Configurations and definitions (lines 30 - 38) are not scoped to a single re2c block -- they are global.
+ Each block may override configurations, but this affects global scope.
+* Blocks don't have to be in the same function: they can be in separate functions or elsewhere
+ as long as the exposed interface fits into lexical scope.
+* Overflows in octal and hexadecimal escapes are not handled.
+
+Generate, compile and run:
+
+.. code-block:: bash
+
+ $ re2c -o example.cc 04_parsing_strings.re
+ $ g++ -o example example.cc
+ $ ./example '"\23005 re2c, flex \x438 quex\n\t \x27f7\n\t\x431\x440\x430\x442\x44c\x44f \x43d\x430\x432\x435\x43a! \x2605"'
+ "★ re2c, flex и quex
+ ⟷
+ братья навек! ★"
+
--- /dev/null
+#include <locale.h>
+#include <stdio.h>
+#include <string.h>
+
+/*!max:re2c*/
+
+struct input_t {
+    // Owned copy of one input string, followed by YYMAXFILL bytes of padding.
+    size_t len; // number of bytes copied from the source string, terminating NUL included
+    char *str;  // heap buffer of len + YYMAXFILL bytes
+
+    // Copies s together with its NUL terminator and fills the padding with 'a'.
+    input_t(const char *s) : len(strlen(s) + 1), str(new char[len + YYMAXFILL])
+    {
+        char *const padding = str + len;
+        memset(padding, 'a', YYMAXFILL);
+        memcpy(str, s, len);
+    }
+
+    // Releases the buffer allocated by the constructor.
+    ~input_t() { delete[] str; }
+};
+
+static void lex(const input_t & input) // lexes one padded input; prints the decoded string or an error line
+{
+ const char *YYCURSOR = input.str; // lexing starts at the first byte of the buffer
+ const char *const YYLIMIT = input.str + input.len + YYMAXFILL; // one past the last padding byte allocated by input_t
+ const char *YYMARKER, *YYCTXMARKER; // left unassigned here; presumably set by the generated code (YYCTXMARKER for the `/` trailing contexts) -- TODO confirm
+ wchar_t c = '"'; // pending character to print; starts out as the opening quote
+ /*!re2c
+ re2c:define:YYCTYPE = char;
+ re2c:define:YYFILL = "goto err;";
+ re2c:define:YYFILL:naked = 1;
+
+ end = "\x00";
+ q = "\"";
+ e = "\\";
+ */
+
+ // The directives above define YYFILL as the bare statement "goto err;", so
+ // running out of input is reported as an error, and name three patterns:
+ // `end` (NUL byte), `q` (double quote) and `e` (backslash).
+
+ // Opening quote: anything other than a double quote is an error.
+ /*!re2c
+ * { goto err; }
+ q { goto str; }
+ */
+
+// Main loop: print the pending character, then match the next string element.
+// Every rule action in this function either jumps to a label or returns, so
+// control never falls through from one re2c block into the following label.
+str:
+ // NOTE(review): %lc is specified for a wint_t argument while c is a wchar_t --
+ // confirm this is acceptable on the targeted platforms.
+ printf("%lc", c);
+ // Rules below, in order: the default rule (error); an ordinary character
+ // (anything `.` matches other than backslash or double quote); the simple
+ // escapes; a backslash followed by an octal digit, or `\x` followed by a hex
+ // digit (the digit is trailing context, left for `oct`/`hex` to consume);
+ // and the closing quote followed by the NUL terminator, which ends lexing.
+ // NOTE(review): `c = YYCURSOR[-1]` converts a plain char; if char is signed,
+ // bytes >= 0x80 would become negative values -- presumably input is
+ // ASCII-only; verify against callers.
+ /*!re2c
+ * { goto err; }
+ . \ (e | q) { c = YYCURSOR[-1]; goto str; }
+ e "a" { c = '\a'; goto str; }
+ e "b" { c = '\b'; goto str; }
+ e "f" { c = '\f'; goto str; }
+ e "n" { c = '\n'; goto str; }
+ e "r" { c = '\r'; goto str; }
+ e "t" { c = '\t'; goto str; }
+ e "v" { c = '\v'; goto str; }
+ e e { c = '\\'; goto str; }
+ e "'" { c = '\''; goto str; }
+ e q { c = '"'; goto str; }
+ e "?" { c = '?'; goto str; }
+ e / [0-7] { c = 0; goto oct; }
+ e "x" / [0-9a-fA-F] { c = 0; goto hex; }
+ q end {
+ printf("\"\n");
+ return;
+ }
+ */
+
+// Octal escape: each digit is folded into c, three bits per digit. The empty
+// rule applies once the next character is not an octal digit and returns to
+// `str`, which prints the accumulated value. No overflow check is made.
+oct:
+ /*!re2c
+ "" { goto str; }
+ [0-7] { c = (c << 3) + (YYCURSOR[-1] - '0'); goto oct; }
+ */
+
+// Hexadecimal escape: same scheme as `oct`, four bits per digit, accepting
+// both lower-case and upper-case letters. No overflow check is made.
+hex:
+ /*!re2c
+ "" { goto str; }
+ [0-9] { c = (c << 4) + (YYCURSOR[-1] - '0'); goto hex; }
+ [a-f] { c = (c << 4) + (YYCURSOR[-1] - 'a' + 10); goto hex; }
+ [A-F] { c = (c << 4) + (YYCURSOR[-1] - 'A' + 10); goto hex; }
+ */
+
+// Reached only through `goto err` (the default rules or the YYFILL definition).
+// Characters already printed stay on the line; the message is appended to them.
+err:
+ printf("... error :[\n");
+}
+
+int main(int argc, char **argv)
+{
+    // Select the en_US.utf8 character-type locale before any wide-character output is produced.
+    const char *const locale = setlocale(LC_CTYPE, "en_US.utf8");
+    if (locale == NULL) {
+        printf("setlocale failed: en_US.utf8\n");
+        return 1;
+    }
+
+    // Lex each command-line argument independently, in order.
+    int i = 1;
+    while (i < argc) {
+        input_t arg(argv[i]);
+        lex(arg);
+        ++i;
+    }
+    return 0;
+}