From: Ulya Trofimovich Date: Wed, 28 Oct 2015 17:36:57 +0000 (+0000) Subject: Changed multiple blocks example, added conditions example. X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=10f9b4c9e2d71f9d1cb66d36f3bbb6bc6be7749e;p=re2c Changed multiple blocks example, added conditions example. --- diff --git a/src/examples.rst b/src/examples.rst index 71764814..1ef1c8c9 100644 --- a/src/examples.rst +++ b/src/examples.rst @@ -42,11 +42,9 @@ Generate, compile and run: $ re2c -o example.cc 01_recognizing_integers.re $ g++ -o example example.cc - $ ./example 0 -12 +345 12345678901234567890 0xAbcDEf 0x00 007 0B0 0b110101010 0x 0b a ? "" - dec: 0 - dec: -12 - dec: +345 - dec: 12345678901234567890 + $ ./example 0 12345678901234567890 0xAbcDEf 0x00 007 0B0 0b110101010 0x 0b ? "" + oct: 0 + dec: 1234567890 hex: 0xAbcDEf hex: 0x00 oct: 007 @@ -54,7 +52,6 @@ Generate, compile and run: bin: 0b110101010 err: 0x err: 0b - err: a err: ? err: @@ -63,11 +60,9 @@ Generate, compile and run: Recognizing strings: the need for YYMAXFILL ------------------------------------------- -This example is about recognizing simple strings without escapes -(strings with escapes are lexed in `Parsing strings: multiple re2c blocks`_ example). - -Our strings are single-quoted and may contain any characters in the range ``[0 - 0xFF]``, except sinle quotes ``'``. -It means that (unlike the previous example, `Recognizing integers: the sentinel method`_) +This example is about recognizing strings. +Our strings are very simple: they are single-quoted and may contain any character in range ``[0 - 0xFF]``, except single quotes ``'``. +Yet this time (unlike the previous example, `Recognizing integers: the sentinel method`_) we cannot use ``NULL`` or any other character as a sentinel: input strings like ``'aha\0ha'\0`` are perfectly valid, but incorrect input like ``'aha\0`` is also possible and shouldn't crash lexer. 
@@ -255,46 +250,95 @@ Generate, compile and run: $ ./example input.txt glorious 7 strings! -.. _Parsing strings: multiple re2c blocks: - -Parsing strings: multiple re2c blocks -------------------------------------- - -This example is based on `Recognizing strings: the need for YYMAXFILL`_ example, -only now we will fully parse double-quoted C-like strings -rather than simply recognize single-quoted Shell strings. +.. _Parsing integers (multiple re2c blocks): -Our strings can contain: +Parsing integers (multiple re2c blocks) +--------------------------------------- -* any unescaped ASCII character except double quote ``"``, escape ``\`` and newline -* simple escapes: ``\’``, ``\"``, ``\?``, ``\\``, ``\a``, ``\b``, ``\f``, ``\n``, ``\r``, ``\t``, ``\v`` -* octal escapes: ``\`` followed by one or more characters in range ``[0 - 7]`` -* hexadecimal escapes: ``\`` followed by one or more characters in range ``[0 - 9]``, ``[a - f]`` or ``[A - F]`` +This example is based on `Recognizing integers: the sentinel method`_ example, +only now integer literals are parsed rather than simply recognized. +The aim of this example is to show how to use multiple re2c blocks, +not how to parse integers (overflows are not handled). ``:)`` -Octal and hexadecimal escapes are greedy: escape covers as many characters as possible (without causing a lexical error). +`[04_parsing_integers_blocks.re] `_ -`[04_parsing_strings.re] `_ - -.. include:: examples/04_parsing_strings.re +.. include:: examples/04_parsing_integers_blocks.re :code: cpp :number-lines: Notes: -* Configurations and definitions (lines 30 - 38) are not scoped to a single re2c block -- they are global. +* Configurations and definitions (lines 9 - 15) are not scoped to a single re2c block --- they are global. Each block may override configurations, but this affects global scope. * Blocks don't have to be in the same function: they can be in separate functions or elsewhere as long as the exposed interface fits into lexical scope. 
-* Overflows in octal and hexadecimal escapes are not handled. Generate, compile and run: .. code-block:: bash - $ re2c -o example.cc 04_parsing_strings.re + $ re2c -o example.cc 04_parsing_integers_blocks.re + $ g++ -o example example.cc + $ ./example "" 0 0b11100001 012345 67890 0xffE 0x 0b + error :[ + 0 + 225 + 5349 + 67890 + 4094 + error :[ + error :[ + + +.. _Parsing integers (conditions): + +Parsing integers (conditions) +----------------------------- + +This example does exactly the same as `Parsing integers (multiple re2c blocks)`_ example, +but in a slightly different manner: it uses re2c conditions instead of blocks. +Conditions allow to encode multiple interconnected lexers within a single re2c block. + +`[05_parsing_integers_conditions.re] `_ + +.. include:: examples/05_parsing_integers_conditions.re + :code: cpp + :number-lines: + +Notes: + +* Conditions are enabled with ``-c`` option. +* Conditions are only syntactic sugar, they can be translated into multiple blocks. +* Each condition is a standalone lexer (DFA). +* Conditions are interconnected: transitions are allowed between final states of one DFA + and start state of another DFA (but no transitions between inner states of different DFAs). +* Each condition has a unique identifier: ``/*!types:re2c*/`` directive (line 3) + tells re2c to generate enumeration of them (names are prefixed with ``yyc`` by default). + These identifiers are used in the initial dispatch on conditions: + lexer uses ``YYGETCONDITION`` to get current condition (line 16) + and ``YYSETCONDITION`` to set it (line 18). +* Each condition has a unique label (prefixed with ``yyc_`` by default). + Actions can use these labels to jump between conditions. + Alternatively the whole block may be enclosed in a loop: + then lexer will go through the initial dispatch on each iteration (but this might be slow). +* Star rule ``<*>`` (line 21) is merged to all conditions (low priority). 
+* Rule with multiple conditions (line 28) is merged to each listed condition (normal priority). +* ``:=>`` (lines 23, 24, 25, 26) implies immediate transition + (bypassing initial dispatch). + +Generate, compile and run: + +.. code-block:: bash + + $ re2c -c -o example.cc 05_parsing_integers_conditions.re $ g++ -o example example.cc - $ ./example '"\23005 re2c, flex \x438 quex\n\t \x27f7\n\t\x431\x440\x430\x442\x44c\x44f \x43d\x430\x432\x435\x43a! \x2605"' - "★ re2c, flex и quex - ⟷ - братья навек! ★" + $ ./example "" 0 0b11100001 012345 67890 0xffE 0x 0b + error :[ + 0 + 225 + 5349 + 67890 + 4094 + error :[ + error :[ diff --git a/src/examples/01_recognizing_integers.re b/src/examples/01_recognizing_integers.re index 99a41f28..73642b7e 100644 --- a/src/examples/01_recognizing_integers.re +++ b/src/examples/01_recognizing_integers.re @@ -9,10 +9,10 @@ static const char *lex(const char *input) re2c:yyfill:enable = 0; end = "\x00"; - bin = "0" [bB] [01]+; - oct = "0" [0-7]+; - dec = [+-]? 
("0" | [1-9][0-9]*); - hex = "0" [xX] [0-9a-fA-F]+; + bin = '0b' [01]+; + oct = "0" [0-7]*; + dec = [1-9][0-9]; + hex = '0x' [0-9a-fA-F]+; * { return "err"; } bin end { return "bin"; } diff --git a/src/examples/04_parsing_integers_blocks.re b/src/examples/04_parsing_integers_blocks.re new file mode 100644 index 00000000..0a591639 --- /dev/null +++ b/src/examples/04_parsing_integers_blocks.re @@ -0,0 +1,67 @@ +#include + +static int lex(const char *s) +{ + const char *YYMARKER; + const char *YYCTXMARKER; + int n = 0; + + /*!re2c + re2c:yyfill:enable = 0; + re2c:define:YYCTYPE = char; + re2c:define:YYCURSOR = s; + + end = "\x00"; + */ + + /*!re2c + * { return -1; } + '0b' / [01] { goto bin; } + "0" { goto oct; } + "" / [1-9] { goto dec; } + '0x' / [0-9a-fA-F] { goto hex; } + */ + +bin: + /*!re2c + * { return -1; } + end { return n; } + [01] { n = (n << 1) + (s[-1] - '0'); goto bin; } + */ + +oct: + /*!re2c + * { return -1; } + end { return n; } + [0-7] { n = (n << 3) + (s[-1] - '0'); goto oct; } + */ + +dec: + /*!re2c + * { return -1; } + end { return n; } + [0-9] { n = (n * 10) + (s[-1] - '0'); goto dec; } + */ + +hex: + /*!re2c + * { return -1; } + end { return n; } + [0-9] { n = (n << 4) + (s[-1] - '0'); goto hex; } + [a-f] { n = (n << 4) + (s[-1] - 'a' + 10); goto hex; } + [A-F] { n = (n << 4) + (s[-1] - 'A' + 10); goto hex; } + */ +} + +int main(int argc, char **argv) +{ + for (int i = 1; i < argc; ++i) { + const int n = lex(argv[i]); + if (n < 0) { + printf("error :[\n"); + } else { + printf("%d\n", n); + } + } + return 0; +} diff --git a/src/examples/04_parsing_strings.re b/src/examples/04_parsing_strings.re deleted file mode 100644 index 0eb6d6ba..00000000 --- a/src/examples/04_parsing_strings.re +++ /dev/null @@ -1,98 +0,0 @@ -#include -#include -#include - -/*!max:re2c*/ - -struct input_t { - size_t len; - char *str; - - input_t(const char *s) - : len(strlen(s) + 1) - , str(new char[len + YYMAXFILL]) - { - memcpy(str, s, len); - memset(str + len, 'a', 
YYMAXFILL); - } - ~input_t() - { - delete[]str; - } -}; - -static void lex(const input_t & input) -{ - const char *YYCURSOR = input.str; - const char *const YYLIMIT = input.str + input.len + YYMAXFILL; - const char *YYMARKER, *YYCTXMARKER; - wchar_t c = '"'; - /*!re2c - re2c:define:YYCTYPE = char; - re2c:define:YYFILL = "goto err;"; - re2c:define:YYFILL:naked = 1; - - end = "\x00"; - q = "\""; - e = "\\"; - */ - - /*!re2c - * { goto err; } - q { goto str; } - */ - -str: - printf("%lc", c); - /*!re2c - * { goto err; } - . \ (e | q) { c = YYCURSOR[-1]; goto str; } - e "a" { c = '\a'; goto str; } - e "b" { c = '\b'; goto str; } - e "f" { c = '\f'; goto str; } - e "n" { c = '\n'; goto str; } - e "r" { c = '\r'; goto str; } - e "t" { c = '\t'; goto str; } - e "v" { c = '\v'; goto str; } - e e { c = '\\'; goto str; } - e "'" { c = '\''; goto str; } - e q { c = '"'; goto str; } - e "?" { c = '?'; goto str; } - e / [0-7] { c = 0; goto oct; } - e "x" / [0-9a-fA-F] { c = 0; goto hex; } - q end { - printf("\"\n"); - return; - } - */ - -oct: - /*!re2c - "" { goto str; } - [0-7] { c = (c << 3) + (YYCURSOR[-1] - '0'); goto oct; } - */ - -hex: - /*!re2c - "" { goto str; } - [0-9] { c = (c << 4) + (YYCURSOR[-1] - '0'); goto hex; } - [a-f] { c = (c << 4) + (YYCURSOR[-1] - 'a' + 10); goto hex; } - [A-F] { c = (c << 4) + (YYCURSOR[-1] - 'A' + 10); goto hex; } - */ - -err: - printf("... 
error :[\n"); -} - -int main(int argc, char **argv) -{ - if (!setlocale(LC_CTYPE, "en_US.utf8")) { - printf("setlocale failed: en_US.utf8\n"); - return 1; - } - for (int i = 1; i < argc; ++i) { - input_t arg(argv[i]); - lex(arg); - } - return 0; -} diff --git a/src/examples/05_parsing_integers_conditions.re b/src/examples/05_parsing_integers_conditions.re new file mode 100644 index 00000000..eecfb30c --- /dev/null +++ b/src/examples/05_parsing_integers_conditions.re @@ -0,0 +1,49 @@ +#include + +/*!types:re2c*/ + +static int lex(const char *s) +{ + const char *YYMARKER; + const char *YYCTXMARKER; + int n = 0; + int c = yycinit; + + /*!re2c + re2c:yyfill:enable = 0; + re2c:define:YYCTYPE = char; + re2c:define:YYCURSOR = s; + re2c:define:YYGETCONDITION = "c"; + re2c:define:YYGETCONDITION:naked = 1; + re2c:define:YYSETCONDITION = "c = @@;"; + re2c:define:YYSETCONDITION:naked = 1; + + <*> * { return -1; } + + '0b' / [01] :=> bin + "0" :=> oct + "" / [1-9] :=> dec + '0x' / [0-9a-fA-F] :=> hex + + "\x00" { return n; } + [01] { n = (n << 1) + (s[-1] - '0'); goto yyc_bin; } + [0-7] { n = (n << 3) + (s[-1] - '0'); goto yyc_oct; } + [0-9] { n = (n * 10) + (s[-1] - '0'); goto yyc_dec; } + [0-9] { n = (n << 4) + (s[-1] - '0'); goto yyc_hex; } + [a-f] { n = (n << 4) + (s[-1] - 'a' + 10); goto yyc_hex; } + [A-F] { n = (n << 4) + (s[-1] - 'A' + 10); goto yyc_hex; } + */ +} + +int main(int argc, char **argv) +{ + for (int i = 1; i < argc; ++i) { + const int n = lex(argv[i]); + if (n < 0) { + printf("error :[\n"); + } else { + printf("%d\n", n); + } + } + return 0; +}