Overflow checks in "parsing integers" examples.

author Ulya Trofimovich <skvadrik@gmail.com>

Mon, 2 Nov 2015 15:09:38 +0000 (15:09 +0000)

committer Ulya Trofimovich <skvadrik@gmail.com>

Mon, 2 Nov 2015 15:09:38 +0000 (15:09 +0000)
author Ulya Trofimovich <skvadrik@gmail.com>
Mon, 2 Nov 2015 15:09:38 +0000 (15:09 +0000)
committer Ulya Trofimovich <skvadrik@gmail.com>
Mon, 2 Nov 2015 15:09:38 +0000 (15:09 +0000)
diff --git a/src/examples.rst b/src/examples.rst

index 636043cee1decf700e13472529f0dbe6ac34710f..a5b7fd7d11bbe7d82df26f74a005ce800cb59a85 100644 (file)
--- a/src/examples.rst
+++ b/src/examples.rst
@@ -259,8 +259,9 @@ Parsing integers (multiple re2c blocks)
  
  This example is based on `Recognizing integers: the sentinel method`_ example,
  only now integer literals are parsed rather than simply recognized.
-The aim of this example is to show how to use multiple re2c blocks,
-not how to parse integers (overflows are not handled). ``:)``
+Parsing integers is simple: one can easily do it by hand.
+However, re2c-generated code *does* look like a simple handwritten parser:
+a couple of dereferences and conditional jumps. No overhead. ``:)``
  
  `[04_parsing_integers_blocks.re] <examples/04_parsing_integers_blocks.re>`_
  
@@ -270,7 +271,7 @@ not how to parse integers (overflows are not handled). ``:)``
  
  Notes:
  
-* Configurations and definitions (lines 9 - 15) are not scoped to a single re2c block --- they are global.
+* Configurations and definitions (lines 20 - 26) are not scoped to a single re2c block --- they are global.
    Each block may override configurations, but this affects global scope.
  * Blocks don't have to be in the same function: they can be in separate functions or elsewhere
    as long as the exposed interface fits into lexical scope.
@@ -281,15 +282,17 @@ Generate, compile and run:
  
      $ re2c -o example.cc 04_parsing_integers_blocks.re
      $ g++ -o example example.cc
-    $ ./example "" 0 0b11100001 012345 67890 0xffE 0x 0b
-    error :[
+    $ ./example 0 12345678901234567890 0xFFFFffffFFFFffff 0x1FFFFffffFFFFffff 0xAbcDEf 0x00 007 0B0 0b110101010 ""
      0
-    225
-    5349
-    67890
-    4094
-    error :[
-    error :[
+    12345678901234567890
+    18446744073709551615
+    error
+    11259375
+    0
+    7
+    0
+    426
+    error
  
  
  .. _Parsing integers (conditions):
@@ -310,24 +313,28 @@ Conditions allow to encode multiple interconnected lexers within a single re2c b
  Notes:
  
  * Conditions are enabled with ``-c`` option.
+
  * Conditions are only syntactic sugar, they can be translated into multiple blocks.
+
  * Each condition is a standalone lexer (DFA).
-* Conditions are interconnected: transitions are allowed between final states of one DFA
-  and start state of another DFA (but no transitions between inner states of different DFAs).
-  The generated code starts with dispatch on conditions.
-* Each condition has a unique identifier: ``/*!types:re2c*/`` directive (line 3)
-  tells re2c to generate enumeration of them (names are prefixed with ``yyc`` by default).
-  These identifiers are used in the initial dispatch on conditions:
-  lexer uses ``YYGETCONDITION`` to get current condition (line 16)
-  and ``YYSETCONDITION`` to set it (line 18).
+
+* Each condition has a unique identifier: ``/*!types:re2c*/`` tells re2c to generate
+  an enumeration of all identifiers (names are prefixed with ``yyc`` by default).
+  The lexer uses ``YYGETCONDITION`` to get the identifier of the current condition
+  and ``YYSETCONDITION`` to set it.
+
  * Each condition has a unique label (prefixed with ``yyc_`` by default).
-  Actions can use these labels to jump between conditions.
-  Alternatively the whole block may be enclosed in a loop:
-  then lexer will go through the initial dispatch on each iteration (but this might be slow).
-* Star rule ``<*>`` (line 21) is merged to all conditions (low priority).
-* Rule with multiple conditions (line 28) is merged to each listed condition (normal priority).
-* ``:=>`` (lines 23, 24, 25, 26) implies immediate transition
-  (bypassing initial dispatch).
+
+* Conditions are connected: transitions are allowed between final states of one condition
+  and the start state of another condition (but not between inner states of different conditions).
+  The generated code starts with an initial dispatch on conditions.
+  Actions can either jump to the initial dispatch or jump directly to any condition.
+
+* Rule ``<*>`` is merged to all conditions (low priority).
+
+* Rules with multiple conditions are merged to each listed condition (normal priority).
+
+* ``:=>`` jumps directly to the next condition (bypassing the initial dispatch).
  
  Generate, compile and run:
  
@@ -335,15 +342,17 @@ Generate, compile and run:
  
      $ re2c -c -o example.cc 05_parsing_integers_conditions.re
      $ g++ -o example example.cc
-    $ ./example "" 0 0b11100001 012345 67890 0xffE 0x 0b
-    error :[
+    $ ./example 0 12345678901234567890 0xFFFFffffFFFFffff 0x1FFFFffffFFFFffff 0xAbcDEf 0x00 007 0B0 0b110101010 ""
+    0
+    12345678901234567890
+    18446744073709551615
+    error
+    11259375
+    0
+    7
      0
-    225
-    5349
-    67890
-    4094
-    error :[
-    error :[
+    426
+    error
  
  
  .. Braille patterns (encodings):
@@ -363,7 +372,7 @@ Here is a message out of the void:
  .. include:: examples/06_braille.utf8.txt
  
  It appears to be UTF-8 encoded `[06_braille.utf8.txt] <examples/06_braille.utf8.txt.html>`_.
-Now translate it into UTF-16, UTF-32 or UCS-2:
+Convert it into UTF-16, UTF-32 or UCS-2:
  
  .. code-block:: bash
  
diff --git a/src/examples/04_parsing_integers_blocks.re b/src/examples/04_parsing_integers_blocks.re

index 0a5916394f40c5f6892a175f9ce343eb9d434758..20bf9de9d407820b5955169e8a7a99a180abba22 100644 (file)
--- a/src/examples/04_parsing_integers_blocks.re
+++ b/src/examples/04_parsing_integers_blocks.re
@@ -1,10 +1,21 @@
+#include <limits.h>
  #include <stdio.h>
  
-static int lex(const char *s)
+template<int base>
+static bool adddgt(unsigned long &u, unsigned int d)
+{
+    if (u > (ULONG_MAX - d) / base) {
+        return false;
+    }
+    u = u * base + d;
+    return true;
+}
+
+static bool lex(const char *s, unsigned long &u)
  {
      const char *YYMARKER;
      const char *YYCTXMARKER;
-    int n = 0;
+    u = 0;
  
      /*!re2c
          re2c:yyfill:enable = 0;
@@ -15,7 +26,7 @@ static int lex(const char *s)
      */
  
      /*!re2c
-        *                  { return -1; }
+        *                  { return false; }
          '0b' / [01]        { goto bin; }
          "0"                { goto oct; }
          "" / [1-9]         { goto dec; }
@@ -24,43 +35,43 @@ static int lex(const char *s)
  
  bin:
      /*!re2c
-        *     { return -1; }
-        end   { return n; }
-        [01]  { n = (n << 1) + (s[-1] - '0'); goto bin; }
+        *     { return false; }
+        end   { return true; }
+        [01]  { if (!adddgt<2>(u, s[-1] - '0')) return false; goto bin; }
      */
  
  oct:
      /*!re2c
-        *     { return -1; }
-        end   { return n; }
-        [0-7] { n = (n << 3) + (s[-1] - '0'); goto oct; }
+        *     { return false; }
+        end   { return true; }
+        [0-7] { if (!adddgt<8>(u, s[-1] - '0')) return false; goto oct; }
      */
  
  dec:
      /*!re2c
-        *     { return -1; }
-        end   { return n; }
-        [0-9] { n = (n * 10) + (s[-1] - '0'); goto dec; }
+        *     { return false; }
+        end   { return true; }
+        [0-9] { if (!adddgt<10>(u, s[-1] - '0')) return false; goto dec; }
      */
  
  hex:
      /*!re2c
-        *     { return -1; }
-        end   { return n; }
-        [0-9] { n = (n << 4) + (s[-1] - '0');      goto hex; }
-        [a-f] { n = (n << 4) + (s[-1] - 'a' + 10); goto hex; }
-        [A-F] { n = (n << 4) + (s[-1] - 'A' + 10); goto hex; }
+        *     { return false; }
+        end   { return true; }
+        [0-9] { if (!adddgt<16>(u, s[-1] - '0'))      return false; goto hex; }
+        [a-f] { if (!adddgt<16>(u, s[-1] - 'a' + 10)) return false; goto hex; }
+        [A-F] { if (!adddgt<16>(u, s[-1] - 'A' + 10)) return false; goto hex; }
      */
  }
  
  int main(int argc, char **argv)
  {
      for (int i = 1; i < argc; ++i) {
-        const int n = lex(argv[i]);
-        if (n < 0) {
-            printf("error :[\n");
+        unsigned long u;
+        if (lex(argv[i], u)) {
+            printf("%lu\n", u);
          } else {
-            printf("%d\n", n);
+            printf("error\n");
          }
      }
      return 0;
diff --git a/src/examples/05_parsing_integers_conditions.re b/src/examples/05_parsing_integers_conditions.re

index eecfb30cf1160b9a4bcc5364dcc2cfac300fafd7..9fbc0fd632ae2c21bccdbe58d65527bd1cf67322 100644 (file)
--- a/src/examples/05_parsing_integers_conditions.re
+++ b/src/examples/05_parsing_integers_conditions.re
@@ -1,14 +1,24 @@
+#include <limits.h>
  #include <stdio.h>
  
+template<int base>
+static bool adddgt(unsigned long &u, unsigned int d)
+{
+    if (u > (ULONG_MAX - d) / base) {
+        return false;
+    }
+    u = u * base + d;
+    return true;
+}
+
  /*!types:re2c*/
  
-static int lex(const char *s)
+static bool lex(const char *s, unsigned long &u)
  {
      const char *YYMARKER;
      const char *YYCTXMARKER;
-    int n = 0;
      int c = yycinit;
-
+    u = 0;
      /*!re2c
          re2c:yyfill:enable = 0;
          re2c:define:YYCTYPE = char;
@@ -18,31 +28,31 @@ static int lex(const char *s)
          re2c:define:YYSETCONDITION = "c = @@;";
          re2c:define:YYSETCONDITION:naked = 1;
  
-        <*> * { return -1; }
+        <*> * { return false; }
  
          <init> '0b' / [01]        :=> bin
          <init> "0"                :=> oct
          <init> "" / [1-9]         :=> dec
          <init> '0x' / [0-9a-fA-F] :=> hex
  
-        <bin, oct, dec, hex> "\x00" { return n; }
-        <bin> [01]  { n = (n << 1) + (s[-1] - '0'); goto yyc_bin; }
-        <oct> [0-7] { n = (n << 3) + (s[-1] - '0'); goto yyc_oct; }
-        <dec> [0-9] { n = (n * 10) + (s[-1] - '0'); goto yyc_dec; }
-        <hex> [0-9] { n = (n << 4) + (s[-1] - '0');      goto yyc_hex; }
-        <hex> [a-f] { n = (n << 4) + (s[-1] - 'a' + 10); goto yyc_hex; }
-        <hex> [A-F] { n = (n << 4) + (s[-1] - 'A' + 10); goto yyc_hex; }
+        <bin, oct, dec, hex> "\x00" { return true; }
+        <bin> [01]  { if (!adddgt<2>(u, s[-1] - '0')) return false; goto yyc_bin; }
+        <oct> [0-7] { if (!adddgt<8>(u, s[-1] - '0')) return false; goto yyc_oct; }
+        <dec> [0-9] { if (!adddgt<10>(u, s[-1] - '0')) return false; goto yyc_dec; }
+        <hex> [0-9] { if (!adddgt<16>(u, s[-1] - '0'))      return false; goto yyc_hex; }
+        <hex> [a-f] { if (!adddgt<16>(u, s[-1] - 'a' + 10)) return false; goto yyc_hex; }
+        <hex> [A-F] { if (!adddgt<16>(u, s[-1] - 'A' + 10)) return false; goto yyc_hex; }
      */
  }
  
  int main(int argc, char **argv)
  {
      for (int i = 1; i < argc; ++i) {
-        const int n = lex(argv[i]);
-        if (n < 0) {
-            printf("error :[\n");
+        unsigned long u;
+        if (lex(argv[i], u)) {
+            printf("%lu\n", u);
          } else {
-            printf("%d\n", n);
+            printf("error\n");
          }
      }
      return 0;
author	Ulya Trofimovich <skvadrik@gmail.com>
	Mon, 2 Nov 2015 15:09:38 +0000 (15:09 +0000)
committer	Ulya Trofimovich <skvadrik@gmail.com>
	Mon, 2 Nov 2015 15:09:38 +0000 (15:09 +0000)
src/examples.rst		patch \| blob \| history
src/examples/04_parsing_integers_blocks.re		patch \| blob \| history
src/examples/05_parsing_integers_conditions.re		patch \| blob \| history