From 8b21f34bdba74d8602cb6b9e016ebb4aa6387645 Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Tue, 27 Oct 2015 17:18:43 +0000 Subject: [PATCH] Added example of parsing strings with multiple blocks. --- src/examples.rst | 69 +++++++++++++++--- src/examples/02_recognizing_strings.re | 2 +- src/examples/04_parsing_strings.re | 98 ++++++++++++++++++++++++++ 3 files changed, 157 insertions(+), 12 deletions(-) create mode 100644 src/examples/04_parsing_strings.re diff --git a/src/examples.rst b/src/examples.rst index 5924d9c0..71764814 100644 --- a/src/examples.rst +++ b/src/examples.rst @@ -63,14 +63,17 @@ Generate, compile and run: Recognizing strings: the need for YYMAXFILL ------------------------------------------- -This example is about recognizing strings. -Strings may contain *any* characters in the range ``[0 - 0xFF]`` except quotes ``"`` and escape ``\`` (they must be escaped). +This example is about recognizing simple strings without escapes +(strings with escapes are lexed in `Parsing strings: multiple re2c blocks`_ example). + +Our strings are single-quoted and may contain any characters in the range ``[0 - 0xFF]``, except single quotes ``'``. It means that (unlike the previous example, `Recognizing integers: the sentinel method`_) we cannot use ``NULL`` or any other character as a sentinel: -strings like ``"aha\0ha"\0`` are perfectly valid, but ``"aha\0`` is also possible and shouldn't crash lexer. +input strings like ``'aha\0ha'\0`` are perfectly valid, +but incorrect input like ``'aha\0`` is also possible and shouldn't crash lexer. -By default re2c-generated lexers contain explicit checks for the end of input -(these checks can be suppressed with ``re2c:yyfill:enable = 0;`` configuration). +By default re2c generates explicit checks for the end of input, +so we must simply omit ``re2c:yyfill:enable = 0;`` configuration. A naive approach is to check on each character (before advancing input position), but it's very slow. 
Instead, re2c inserts checks only at certain points in the generated program. Each check ensures that there is enough input to proceed until the next checkpoint. @@ -124,12 +127,13 @@ Generate, compile and run: $ re2c -o example.cc 02_recognizing_strings.re $ g++ -o example example.cc - $ ./example \"a\ momentary\" \"\" \"lap\"se\" \"of \"rea\\\"son\" "" - str: "a momentary" - str: "" - err: "lap"se" - err: "of - str: "rea\"son" + $ ./example "'a momentary'" "''" "'lap'se'" "'of" "'" "'rea''son'" "" + str: 'a momentary' + str: '' + err: 'lap'se' + err: 'of + err: ' + err: 'rea''son' err: .. _Arbitrary large input and YYFILL: @@ -251,3 +255,46 @@ Generate, compile and run: $ ./example input.txt glorious 7 strings! +.. _Parsing strings: multiple re2c blocks: + +Parsing strings: multiple re2c blocks +------------------------------------- + +This example is based on `Recognizing strings: the need for YYMAXFILL`_ example, +only now we will fully parse double-quoted C-like strings +rather than simply recognize single-quoted Shell strings. + +Our strings can contain: + +* any unescaped ASCII character except double quote ``"``, escape ``\`` and newline +* simple escapes: ``\'``, ``\"``, ``\?``, ``\\``, ``\a``, ``\b``, ``\f``, ``\n``, ``\r``, ``\t``, ``\v`` +* octal escapes: ``\`` followed by one or more characters in range ``[0 - 7]`` +* hexadecimal escapes: ``\x`` followed by one or more characters in range ``[0 - 9]``, ``[a - f]`` or ``[A - F]`` + +Octal and hexadecimal escapes are greedy: escape covers as many characters as possible (without causing a lexical error). + +`[04_parsing_strings.re] <examples/04_parsing_strings.re>`_ + +.. include:: examples/04_parsing_strings.re + :code: cpp + :number-lines: + +Notes: + +* Configurations and definitions (lines 30 - 38) are not scoped to a single re2c block -- they are global. + Each block may override configurations, but this affects global scope. 
+* Blocks don't have to be in the same function: they can be in separate functions or elsewhere + as long as the exposed interface fits into lexical scope. +* Overflows in octal and hexadecimal escapes are not handled. + +Generate, compile and run: + +.. code-block:: bash + + $ re2c -o example.cc 04_parsing_strings.re + $ g++ -o example example.cc + $ ./example '"\23005 re2c, flex \x438 quex\n\t \x27f7\n\t\x431\x440\x430\x442\x44c\x44f \x43d\x430\x432\x435\x43a! \x2605"' + "★ re2c, flex и quex + ⟷ + братья навек! ★" + diff --git a/src/examples/02_recognizing_strings.re b/src/examples/02_recognizing_strings.re index 6346c9ce..83157319 100644 --- a/src/examples/02_recognizing_strings.re +++ b/src/examples/02_recognizing_strings.re @@ -31,7 +31,7 @@ static const char *lex(const input_t & input) re2c:define:YYFILL:naked = 1; end = "\x00"; - str = "\"" ([^"\\] | "\\" ["\\])* "\""; + str = "'" [^']* "'"; * { return "err"; } str end { return "str"; } diff --git a/src/examples/04_parsing_strings.re b/src/examples/04_parsing_strings.re new file mode 100644 index 00000000..0eb6d6ba --- /dev/null +++ b/src/examples/04_parsing_strings.re @@ -0,0 +1,98 @@ +#include <locale.h> +#include <stdio.h> +#include <string.h> + +/*!max:re2c*/ + +struct input_t { + size_t len; + char *str; + + input_t(const char *s) + : len(strlen(s) + 1) + , str(new char[len + YYMAXFILL]) + { + memcpy(str, s, len); + memset(str + len, 'a', YYMAXFILL); + } + ~input_t() + { + delete[]str; + } +}; + +static void lex(const input_t & input) +{ + const char *YYCURSOR = input.str; + const char *const YYLIMIT = input.str + input.len + YYMAXFILL; + const char *YYMARKER, *YYCTXMARKER; + wchar_t c = '"'; + /*!re2c + re2c:define:YYCTYPE = char; + re2c:define:YYFILL = "goto err;"; + re2c:define:YYFILL:naked = 1; + + end = "\x00"; + q = "\""; + e = "\\"; + */ + + /*!re2c + * { goto err; } + q { goto str; } + */ + +str: + printf("%lc", c); + /*!re2c + * { goto err; } + . 
\ (e | q) { c = YYCURSOR[-1]; goto str; } + e "a" { c = '\a'; goto str; } + e "b" { c = '\b'; goto str; } + e "f" { c = '\f'; goto str; } + e "n" { c = '\n'; goto str; } + e "r" { c = '\r'; goto str; } + e "t" { c = '\t'; goto str; } + e "v" { c = '\v'; goto str; } + e e { c = '\\'; goto str; } + e "'" { c = '\''; goto str; } + e q { c = '"'; goto str; } + e "?" { c = '?'; goto str; } + e / [0-7] { c = 0; goto oct; } + e "x" / [0-9a-fA-F] { c = 0; goto hex; } + q end { + printf("\"\n"); + return; + } + */ + +oct: + /*!re2c + "" { goto str; } + [0-7] { c = (c << 3) + (YYCURSOR[-1] - '0'); goto oct; } + */ + +hex: + /*!re2c + "" { goto str; } + [0-9] { c = (c << 4) + (YYCURSOR[-1] - '0'); goto hex; } + [a-f] { c = (c << 4) + (YYCURSOR[-1] - 'a' + 10); goto hex; } + [A-F] { c = (c << 4) + (YYCURSOR[-1] - 'A' + 10); goto hex; } + */ + +err: + printf("... error :[\n"); +} + +int main(int argc, char **argv) +{ + if (!setlocale(LC_CTYPE, "en_US.utf8")) { + printf("setlocale failed: en_US.utf8\n"); + return 1; + } + for (int i = 1; i < argc; ++i) { + input_t arg(argv[i]); + lex(arg); + } + return 0; +} -- 2.40.0