]> granicus.if.org Git - re2c/commitdiff
Added [-Wunreachable-rules] description.
authorUlya Trofimovich <skvadrik@gmail.com>
Sat, 7 Nov 2015 12:59:09 +0000 (12:59 +0000)
committerUlya Trofimovich <skvadrik@gmail.com>
Sat, 7 Nov 2015 12:59:09 +0000 (12:59 +0000)
src/manual/warnings/undefined_control_flow/default_vs_any.rst
src/manual/warnings/undefined_control_flow/real_world.rst
src/manual/warnings/undefined_control_flow/simple_example.rst
src/manual/warnings/undefined_control_flow/wundefined_control_flow.rst
src/manual/warnings/unreachable_rules/how_it_works.rst [new file with mode: 0644]
src/manual/warnings/unreachable_rules/infinite_rules.rst [new file with mode: 0644]
src/manual/warnings/unreachable_rules/real_world.rst [new file with mode: 0644]
src/manual/warnings/unreachable_rules/simple_example.rst [new file with mode: 0644]
src/manual/warnings/unreachable_rules/wunreachable_rules.rst [new file with mode: 0644]
src/manual/warnings/warnings.rst
src/manual/warnings/wunreachable_rules.rst [deleted file]

index 413e144fbfd2b1395db1910d0c04f14cfc756543..d2ad81378f4c393193b8706a2ec9609609adcf90 100644 (file)
@@ -5,6 +5,7 @@ When the the world was young and re2c didn't have default rule ``*`` (that is, b
 everyone used ``[^]`` as default rule:
 
 .. code-block:: cpp
+    :number-lines:
 
     /*!re2c
         // ... normal rules ...
@@ -30,7 +31,7 @@ However, with UTF-8 encoding ```re2c -i8 -Wundefined-control-flow``` says:
 
 .. code-block::
 
-    re2c: warning: line 3: control flow is undefined for strings that match 
+    re2c: warning: line 4: control flow is undefined for strings that match 
             '[\x80-\xC1\xF5-\xFF]'
             '\xF0 [\x0-\x8F\xC0-\xFF]'
             '[\xE1-\xEF] [\x0-\x7F\xC0-\xFF]'
@@ -49,7 +50,7 @@ If we tell re2c to exclude surrogates, ```re2c -ix --encoding-policy fail -Wunde
 
 .. code-block::
 
-    re2c: warning: line 3: control flow is undefined for strings that match 
+    re2c: warning: line 4: control flow is undefined for strings that match 
             '[\xDC00-\xDFFF]'
             '[\xD800-\xDBFF] [\x0-\xDBFF\xE000-\xFFFF]'
     , use default rule '*' [-Wundefined-control-flow]
index 5baa9872c956b1e97da3c16ec61edde56f6434bb..0aee17ba4b408dd75c8a63ed6125ef0f55a606d5 100644 (file)
@@ -10,8 +10,7 @@ Even it you are absolutely sure that default case is impossible, do handle it.
 No additional checks and transitions.
 It simply binds code to default label.
 
-I found ``[-Wundefined-control-flow]`` warnings in many real-world programs
-(including some of the PHP lexers and re2c own lexer).
+I found ``[-Wundefined-control-flow]`` warnings in many real-world programs (including re2c's own lexer).
 Mostly these are minor issues like forgetting to handle newlines or zeroes in already preprocessed input,
 but it's curious how they crept into the code.
 I bet they were just forgotten and not omitted for a good reason. ``:)``
index cbc822eb941bd9829501a72c44091452e08ed536..fc4479848e2ee5609c6d6ba956d91e61095b0d13 100644 (file)
@@ -4,6 +4,7 @@ A simple example
 Say, we want to match ``'a'``:
 
 .. code-block:: cpp
+    :number-lines:
 
     /*!re2c
         "a" { return 'a'; }
@@ -12,6 +13,7 @@ Say, we want to match ``'a'``:
 ```re2c -i -Wundefined-control-flow```:
 
 .. code-block:: cpp
+    :number-lines:
 
     /* Generated by re2c 0.14.1.dev on Thu Nov  5 14:35:46 2015*/
     
@@ -39,6 +41,7 @@ re2c grumbles something about undefined control flow and says that default ``*``
 Let's add it:
 
 .. code-block:: cpp
+    :number-lines:
 
     /*!re2c
         *   { return '*'; }
@@ -48,6 +51,7 @@ Let's add it:
 Now that's better:
 
 .. code-block:: cpp
+    :number-lines:
 
     /* Generated by re2c 0.14.1.dev on Thu Nov  5 14:35:08 2015*/
     
index f184c4b2477e35816e397145b14041a7119ea2d1..3f2600e7de9356d67ce1983f9fe506c0fca8bfdf 100644 (file)
@@ -5,7 +5,7 @@
 .. include:: ../../../contents.rst
 
 .. include:: simple_example.rst
-.. include:: how_it_works.rst
 .. include:: default_vs_any.rst
+.. include:: how_it_works.rst
 .. include:: real_world.rst
 
diff --git a/src/manual/warnings/unreachable_rules/how_it_works.rst b/src/manual/warnings/unreachable_rules/how_it_works.rst
new file mode 100644 (file)
index 0000000..186baa4
--- /dev/null
@@ -0,0 +1,16 @@
+How it works
+~~~~~~~~~~~~
+
+For each state of the generated DFA re2c calculates the set of all rules reachable from this state
+(including default rule and no rule at all).
+The algorithm starts with initial state and recurses to child states.
+Recursion stops if either current state is final (then its set is trivial)
+or the set for current state has already been calculated.
+Thus each state is processed only once and the algorithm has ``O(n)`` complexity
+(where ``n`` is the number of states in DFA).
+When all the sets have been calculated, re2c checks that the set of rules
+reachable from each accepting state includes either accepted rule or no rule at all.
+
+Analysis is done regardless of the ``-Wunreachable-rules`` option;
+the option only controls whether the warning is reported or not.
+
diff --git a/src/manual/warnings/unreachable_rules/infinite_rules.rst b/src/manual/warnings/unreachable_rules/infinite_rules.rst
new file mode 100644 (file)
index 0000000..e6e7793
--- /dev/null
@@ -0,0 +1,42 @@
+Infinite rules
+~~~~~~~~~~~~~~
+
+A rule may be unreachable all by itself, without being shadowed by other rules:
+
+.. code-block:: cpp
+    :number-lines:
+
+    /*!re2c
+        [^]* { return "greeedy"; }
+    */
+
+This rule is so greedy that it never stops.
+It will continue eating input until ``YYFILL`` finally fails (```re2c -i -Wunreachable-rules```):
+
+.. code-block:: cpp
+    :number-lines:
+
+    /* Generated by re2c 0.14.1.dev on Fri Nov  6 21:36:56 2015*/
+    
+    {
+            YYCTYPE yych;
+            goto yy0;
+    yy1:
+            ++YYCURSOR;
+    yy0:
+            if (YYLIMIT <= YYCURSOR) YYFILL(1);
+            yych = *YYCURSOR;
+            goto yy1;
+            { return "greeedy"; }
+    }
+
+And if we suppress ``YYFILL`` generation with ``re2c:yyfill:enable = 0;``,
+lexer will loop until segfault or another disaster forces its untimely shutdown
+(unless you use ``--input-api custom`` and do some magic to stop it).
+And re2c warns us:
+
+.. code-block::
+
+    re2c: warning: line 2: unreachable rule  [-Wunreachable-rules]
+
+All rules that include ``[^]*`` (or any component isomorphic to ``[^]*``) are also infinite.
diff --git a/src/manual/warnings/unreachable_rules/real_world.rst b/src/manual/warnings/unreachable_rules/real_world.rst
new file mode 100644 (file)
index 0000000..8dd0932
--- /dev/null
@@ -0,0 +1,97 @@
+Real-world examples
+~~~~~~~~~~~~~~~~~~~
+
+In many real-world examples re2c reports unreachable ``[^]`` rule,
+which is quite understandable: ``[^]`` served as default rule long before
+the true default rule ``*`` was added to re2c.
+However, in some cases the warning is quite interesting.
+Here is an example of a real-world lexer (all the non-re2c code has been removed):
+
+.. code-block:: cpp
+    :number-lines:
+
+    /*!re2c
+        re2c:yyfill:check = 0;
+    
+        LNUM                [0-9]+
+        DNUM                ([0-9]*[\.][0-9]+)|([0-9]+[\.][0-9]*)
+        NUMBER              [-]?{LNUM}|{DNUM}
+        ANY_CHAR            (.|[\n\t])
+        NEWLINE             ("\r"|"\n"|"\r\n")
+        TABS_AND_SPACES     [ \t]
+        WHITESPACE          [ \t]+
+        CONSTANT            [a-zA-Z_][a-zA-Z0-9_]*
+        LABEL               [^=\n\r\t;&|^$~(){}!"\[]+
+        TOKENS              [:,.\[\]"'()&|^+-/*=%$!~<>?@{}]
+        OPERATORS           [&|^~()!]
+        DOLLAR_CURLY        "${"
+        SECTION_RAW_CHARS   [^\]\n\r]
+        SINGLE_QUOTED_CHARS [^']
+        RAW_VALUE_CHARS     [^\n\r;\000]
+        LITERAL_DOLLAR      ("$"([^{\000]|("\\"{ANY_CHAR})))
+        VALUE_CHARS         ([^$= \t\n\r;&|^~()!"'\000]|{LITERAL_DOLLAR})
+        SECTION_VALUE_CHARS ([^$\n\r;"'\]\\]|("\\"{ANY_CHAR})|{LITERAL_DOLLAR})
+
+        <INITIAL>"[" {}
+        <ST_VALUE,ST_SECTION_VALUE,ST_OFFSET>"'"{SINGLE_QUOTED_CHARS}+"'" {}
+        <ST_SECTION_RAW,ST_SECTION_VALUE>"]"{TABS_AND_SPACES}*{NEWLINE}? {}
+        <INITIAL>{LABEL}"["{TABS_AND_SPACES}* {}
+        <ST_OFFSET>{TABS_AND_SPACES}*"]" {}
+        <ST_DOUBLE_QUOTES,ST_SECTION_VALUE,ST_VALUE,ST_OFFSET>{DOLLAR_CURLY} {}
+        <ST_VARNAME>{LABEL} {}
+        <ST_VARNAME>"}" {}
+        <INITIAL,ST_VALUE>("true"|"on"|"yes"){TABS_AND_SPACES}* {}
+        <INITIAL,ST_VALUE>("false"|"off"|"no"|"none"){TABS_AND_SPACES}* {}
+        <INITIAL,ST_VALUE>("null"){TABS_AND_SPACES}* {}
+        <INITIAL>{LABEL} {}
+        <INITIAL>{TABS_AND_SPACES}*[=]{TABS_AND_SPACES}* {}
+        <ST_RAW>{RAW_VALUE_CHARS} {}
+        <ST_SECTION_RAW>{SECTION_RAW_CHARS}+ {}
+        <ST_VALUE,ST_RAW>{TABS_AND_SPACES}*{NEWLINE} {}
+        <ST_SECTION_VALUE,ST_VALUE,ST_OFFSET>{CONSTANT} {}
+        <ST_SECTION_VALUE,ST_VALUE,ST_OFFSET>{NUMBER} {}
+        <INITIAL>{TOKENS} {}
+        <ST_VALUE>{OPERATORS}{TABS_AND_SPACES}* {}
+        <ST_VALUE>[=] {}
+        <ST_VALUE>{VALUE_CHARS}+ {}
+        <ST_SECTION_VALUE,ST_OFFSET>{SECTION_VALUE_CHARS}+ {}
+        <ST_SECTION_VALUE,ST_VALUE,ST_OFFSET>{TABS_AND_SPACES}*["] {}
+        <ST_DOUBLE_QUOTES>["]{TABS_AND_SPACES}* {}
+        <ST_DOUBLE_QUOTES>[^] {}
+        <ST_SECTION_VALUE,ST_VALUE,ST_OFFSET>{WHITESPACE} {}
+        <INITIAL,ST_RAW>{TABS_AND_SPACES}+ {}
+        <INITIAL>{TABS_AND_SPACES}*{NEWLINE} {}
+        <INITIAL,ST_VALUE,ST_RAW>{TABS_AND_SPACES}*[;][^\r\n]*{NEWLINE} {}
+        <ST_VALUE,ST_RAW>[^] {}
+        <*>[^] {}
+    */
+
+```re2c -cF -Wunreachable-rules``` says:
+
+.. code-block::
+
+    re2c: warning: line 54: unreachable rule in condition 'ST_DOUBLE_QUOTES' (shadowed by rules at lines 47, 48) [-Wunreachable-rules]
+    re2c: warning: line 49: unreachable rule in condition 'ST_OFFSET' (shadowed by rule at line 45) [-Wunreachable-rules]
+    re2c: warning: line 54: unreachable rule in condition 'ST_RAW' (shadowed by rules at lines 36, 38, 53) [-Wunreachable-rules]
+    re2c: warning: line 49: unreachable rule in condition 'ST_SECTION_VALUE' (shadowed by rule at line 45) [-Wunreachable-rules]
+    re2c: warning: line 54: unreachable rule in condition 'ST_VALUE' (shadowed by rules at lines 38, 39, 40, 42, 43, 44, 46, 49, 53) [-Wunreachable-rules]
+
+The interesting part is the unreachable rule on line 49 in conditions ``ST_OFFSET`` and ``ST_SECTION_VALUE``.
+The rule is ``{WHITESPACE}``:
+
+.. code-block::
+
+    WHITESPACE [ \t]+
+
+re2c claims that it is shadowed by the rule on line 45, which is ``{SECTION_VALUE_CHARS}+``:
+
+.. code-block::
+
+    ANY_CHAR            (.|[\n\t])
+    LITERAL_DOLLAR      ("$"([^{\000]|("\\"{ANY_CHAR})))
+    SECTION_VALUE_CHARS ([^$\n\r;"'\]\\]|("\\"{ANY_CHAR})|{LITERAL_DOLLAR})
+
+Indeed, ``{SECTION_VALUE_CHARS}+`` allows all the patterns accepted by ``{WHITESPACE}``.
+In the original program these rules return different types of tokens:
+perhaps this is not critical, but clearly unintended.
+
diff --git a/src/manual/warnings/unreachable_rules/simple_example.rst b/src/manual/warnings/unreachable_rules/simple_example.rst
new file mode 100644 (file)
index 0000000..2ac4866
--- /dev/null
@@ -0,0 +1,57 @@
+A simple example
+~~~~~~~~~~~~~~~~
+
+.. code-block:: cpp
+    :number-lines:
+
+    /*!re2c
+        ""          { return ""; }
+        *           { return "*"; }
+        "a" | "b"   { return "a | b"; }
+        "a"         { return "a"; }
+        [\x00-\xFF] { return "[0 - 0xFF]"; }
+        [^]         { return "[^]"; }
+    */
+
+Given this strange code, ```re2c -i -Wunreachable-rules``` says:
+
+.. code-block::
+
+    re2c: warning: line 2: unreachable rule (shadowed by rules at lines 4, 6) [-Wunreachable-rules]
+    re2c: warning: line 5: unreachable rule (shadowed by rule at line 4) [-Wunreachable-rules]
+    re2c: warning: line 7: unreachable rule (shadowed by rule at line 6) [-Wunreachable-rules]
+
+A look at the generated code suggests that re2c was right:
+
+.. code-block:: cpp
+    :number-lines:
+
+    /* Generated by re2c 0.14.1.dev on Fri Nov  6 15:21:36 2015*/
+    
+    {
+            YYCTYPE yych;
+            if (YYLIMIT <= YYCURSOR) YYFILL(1);
+            yych = *YYCURSOR;
+            switch (yych) {
+            case 'a':       goto yy5;
+            case 'b':       goto yy7;
+            default:        goto yy3;
+            }
+            { return ""; }
+    yy3:
+            ++YYCURSOR;
+            { return "[0 - 0xFF]"; }
+    yy5:
+            ++YYCURSOR;
+    yy6:
+            { return "a | b"; }
+    yy7:
+            ++YYCURSOR;
+            yych = *YYCURSOR;
+            goto yy6;
+    }
+
+Clearly, all the reported rules are unreachable (some of them are not even present in the generated code).
+Default rule ``*`` at line 3 is also unreachable, but re2c appreciates paranoid attempts
+to handle default case and never reports unreachable default rule.
+
diff --git a/src/manual/warnings/unreachable_rules/wunreachable_rules.rst b/src/manual/warnings/unreachable_rules/wunreachable_rules.rst
new file mode 100644 (file)
index 0000000..e88082b
--- /dev/null
@@ -0,0 +1,11 @@
+[-Wunreachable-rules]
+--------------------------
+
+.. include:: ../home.rst
+.. include:: ../../../contents.rst
+
+.. include:: simple_example.rst
+.. include:: infinite_rules.rst
+.. include:: how_it_works.rst
+.. include:: real_world.rst
+
index 832d56333acf4d16c44c4837acef22ac151fdaf0..e59fc5db178a364cb033ba676087e5c1dfdcc163 100644 (file)
@@ -7,7 +7,7 @@ Warnings
 ★
 
 * `[-Wundefined-control-flow] <undefined_control_flow/wundefined_control_flow.html>`_
-* `[-Wunreachable-rules]      <wunreachable_rules.html>`_
+* `[-Wunreachable-rules]      <unreachable_rules/wunreachable_rules.html>`_
 * `[-Wcondition-order]        <wcondition_order.html>`_
 * `[-Wuseless-escape]         <wuseless_escape.html>`_
 * `[-Wswapped-range]          <wswapped_range.html>`_
diff --git a/src/manual/warnings/wunreachable_rules.rst b/src/manual/warnings/wunreachable_rules.rst
deleted file mode 100644 (file)
index 4e4c8a1..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
-[-Wunreachable-rules]
---------------------------
-
-.. include:: home.rst
-
-.. code-block:: cpp
-
-    /*!re2c
-        [^]* "a" {}
-    */
-