Added example of parsing strings with multiple blocks.

author Ulya Trofimovich <skvadrik@gmail.com>

Tue, 27 Oct 2015 17:18:43 +0000 (17:18 +0000)

committer Ulya Trofimovich <skvadrik@gmail.com>

Tue, 27 Oct 2015 17:18:43 +0000 (17:18 +0000)
author Ulya Trofimovich <skvadrik@gmail.com>
Tue, 27 Oct 2015 17:18:43 +0000 (17:18 +0000)
committer Ulya Trofimovich <skvadrik@gmail.com>
Tue, 27 Oct 2015 17:18:43 +0000 (17:18 +0000)
diff --git a/src/examples.rst b/src/examples.rst

index 5924d9c0d6c03668d6541301da2b0abf3e5b2d12..717648143f4d9d087b83a4f92baf63f588d5302d 100644 (file)
--- a/src/examples.rst
+++ b/src/examples.rst
@@ -63,14 +63,17 @@ Generate, compile and run:
  Recognizing strings: the need for YYMAXFILL
  -------------------------------------------
  
-This example is about recognizing strings.
-Strings may contain *any* characters in the range ``[0 - 0xFF]`` except quotes ``"`` and escape ``\`` (they must be escaped).
+This example is about recognizing simple strings without escapes
+(strings with escapes are lexed in `Parsing strings: multiple re2c blocks`_ example).
+
+Our strings are single-quoted and may contain any characters in the range ``[0 - 0xFF]``, except single quotes ``'``.
  It means that (unlike the previous example, `Recognizing integers: the sentinel method`_)
  we cannot use ``NULL`` or any other character as a sentinel:
-strings like ``"aha\0ha"\0`` are perfectly valid, but ``"aha\0`` is also possible and shouldn't crash lexer.
+input strings like ``'aha\0ha'\0`` are perfectly valid,
+but incorrect input like ``'aha\0`` is also possible and shouldn't crash the lexer.
  
-By default re2c-generated lexers contain explicit checks for the end of input
-(these checks can be suppressed with ``re2c:yyfill:enable = 0;`` configuration).
+By default re2c generates explicit checks for the end of input,
+so we must simply omit ``re2c:yyfill:enable = 0;`` configuration.
  A naive approach is to check on each character (before advancing input position), but it's very slow.
  Instead, re2c inserts checks only at certain points in the generated program.
  Each check ensures that there is enough input to proceed until the next checkpoint.
@@ -124,12 +127,13 @@ Generate, compile and run:
  
      $ re2c -o example.cc 02_recognizing_strings.re
      $ g++ -o example example.cc
-    $ ./example \"a\ momentary\" \"\" \"lap\"se\" \"of \"rea\\\"son\" ""
-    str: "a momentary"
-    str: ""
-    err: "lap"se"
-    err: "of
-    str: "rea\"son"
+    $ ./example "'a momentary'" "''" "'lap'se'" "'of" "'" "'rea''son'" ""
+    str: 'a momentary'
+    str: ''
+    err: 'lap'se'
+    err: 'of
+    err: '
+    err: 'rea''son'
      err: 
  
  .. _Arbitrary large input and YYFILL:
@@ -251,3 +255,46 @@ Generate, compile and run:
      $ ./example input.txt
      glorious 7 strings!
  
+.. _Parsing strings: multiple re2c blocks:
+
+Parsing strings: multiple re2c blocks
+-------------------------------------
+
+This example is based on `Recognizing strings: the need for YYMAXFILL`_ example,
+only now we will fully parse double-quoted C-like strings
+rather than simply recognize single-quoted Shell strings.
+
+Our strings can contain:
+
+* any unescaped ASCII character except double quote ``"``, escape ``\`` and newline
+* simple escapes: ``\'``, ``\"``, ``\?``, ``\\``, ``\a``, ``\b``, ``\f``, ``\n``, ``\r``, ``\t``, ``\v``
+* octal escapes: ``\`` followed by one or more characters in range ``[0 - 7]``
+* hexadecimal escapes: ``\x`` followed by one or more characters in range ``[0 - 9]``, ``[a - f]`` or ``[A - F]``
+
+Octal and hexadecimal escapes are greedy: escape covers as many characters as possible (without causing a lexical error).
+
+`[04_parsing_strings.re] <examples/04_parsing_strings.re>`_
+
+.. include:: examples/04_parsing_strings.re
+    :code: cpp
+    :number-lines:
+
+Notes:
+
+* Configurations and definitions (lines 30 - 38) are not scoped to a single re2c block -- they are global.
+  Each block may override configurations, but this affects global scope.
+* Blocks don't have to be in the same function: they can be in separate functions or elsewhere
+  as long as the exposed interface fits into lexical scope.
+* Overflows in octal and hexadecimal escapes are not handled.
+
+Generate, compile and run:
+
+.. code-block:: bash
+
+    $ re2c -o example.cc 04_parsing_strings.re
+    $ g++ -o example example.cc
+    $ ./example '"\23005 re2c, flex \x438 quex\n\t  \x27f7\n\t\x431\x440\x430\x442\x44c\x44f \x43d\x430\x432\x435\x43a! \x2605"'
+    "★ re2c, flex и quex
+              ⟷
+            братья навек! ★"
+
diff --git a/src/examples/02_recognizing_strings.re b/src/examples/02_recognizing_strings.re

index 6346c9ce85c009716d79e27c242f0be75441de51..83157319cb200cfdb5aafdbb81c0d4461da83b7f 100644 (file)
--- a/src/examples/02_recognizing_strings.re
+++ b/src/examples/02_recognizing_strings.re
@@ -31,7 +31,7 @@ static const char *lex(const input_t & input)
          re2c:define:YYFILL:naked = 1;
  
          end = "\x00";
-        str = "\"" ([^"\\] | "\\" ["\\])* "\"";
+        str = "'" [^']* "'";
  
          *       { return "err"; }
          str end { return "str"; }
diff --git a/src/examples/04_parsing_strings.re b/src/examples/04_parsing_strings.re

new file mode 100644 (file)

index 0000000..0eb6d6b
--- /dev/null
+++ b/src/examples/04_parsing_strings.re
@@ -0,0 +1,98 @@
+#include <locale.h>
+#include <stdio.h>
+#include <string.h>
+
+/*!max:re2c*/
+
+struct input_t {
+    size_t len;
+    char *str;
+
+    input_t(const char *s)
+        : len(strlen(s) + 1)
+        , str(new char[len + YYMAXFILL])
+    {
+        memcpy(str, s, len);
+        memset(str + len, 'a', YYMAXFILL);
+    }
+    ~input_t()
+    {
+        delete[]str;
+    }
+};
+
+static void lex(const input_t & input)
+{
+    const char *YYCURSOR = input.str;
+    const char *const YYLIMIT = input.str + input.len + YYMAXFILL;
+    const char *YYMARKER, *YYCTXMARKER;
+    wchar_t c = '"';
+    /*!re2c
+        re2c:define:YYCTYPE = char;
+        re2c:define:YYFILL = "goto err;";
+        re2c:define:YYFILL:naked = 1;
+
+        end = "\x00";
+        q = "\"";
+        e = "\\";
+    */
+
+    /*!re2c
+        * { goto err; }
+        q { goto str; }
+    */
+
+str:
+    printf("%lc", c);
+    /*!re2c
+        *                   { goto err; }
+        . \ (e | q)         { c = YYCURSOR[-1]; goto str; }
+        e "a"               { c = '\a'; goto str; }
+        e "b"               { c = '\b'; goto str; }
+        e "f"               { c = '\f'; goto str; }
+        e "n"               { c = '\n'; goto str; }
+        e "r"               { c = '\r'; goto str; }
+        e "t"               { c = '\t'; goto str; }
+        e "v"               { c = '\v'; goto str; }
+        e e                 { c = '\\'; goto str; }
+        e "'"               { c = '\''; goto str; }
+        e q                 { c = '"';  goto str; }
+        e "?"               { c = '?';  goto str; }
+        e / [0-7]           { c = 0; goto oct; }
+        e "x" / [0-9a-fA-F] { c = 0; goto hex; }
+        q end               {
+            printf("\"\n");
+            return;
+        }
+    */
+
+oct:
+    /*!re2c
+        ""    { goto str; }
+        [0-7] { c = (c << 3) + (YYCURSOR[-1] - '0'); goto oct; }
+    */
+
+hex:
+    /*!re2c
+        ""    { goto str; }
+        [0-9] { c = (c << 4) + (YYCURSOR[-1] - '0');      goto hex; }
+        [a-f] { c = (c << 4) + (YYCURSOR[-1] - 'a' + 10); goto hex; }
+        [A-F] { c = (c << 4) + (YYCURSOR[-1] - 'A' + 10); goto hex; }
+    */
+
+err:
+    printf("... error :[\n");
+}
+
+int main(int argc, char **argv)
+{
+    if (!setlocale(LC_CTYPE, "en_US.utf8")) {
+        printf("setlocale failed: en_US.utf8\n");
+        return 1;
+    }
+    for (int i = 1; i < argc; ++i) {
+        input_t arg(argv[i]);
+        lex(arg);
+    }
+    return 0;
+}
author	Ulya Trofimovich <skvadrik@gmail.com>
	Tue, 27 Oct 2015 17:18:43 +0000 (17:18 +0000)
committer	Ulya Trofimovich <skvadrik@gmail.com>
	Tue, 27 Oct 2015 17:18:43 +0000 (17:18 +0000)
src/examples.rst		patch \| blob \| history
src/examples/02_recognizing_strings.re		patch \| blob \| history
src/examples/04_parsing_strings.re	[new file with mode: 0644]	patch \| blob