From eebb142941d191533d0ffc14e4c8374cd07ec8fc Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Sat, 7 Nov 2015 12:59:09 +0000 Subject: [PATCH] Added [-Wunreachable-rules] description. --- .../undefined_control_flow/default_vs_any.rst | 5 +- .../undefined_control_flow/real_world.rst | 3 +- .../undefined_control_flow/simple_example.rst | 4 + .../wundefined_control_flow.rst | 2 +- .../unreachable_rules/how_it_works.rst | 16 +++ .../unreachable_rules/infinite_rules.rst | 42 ++++++++ .../warnings/unreachable_rules/real_world.rst | 97 +++++++++++++++++++ .../unreachable_rules/simple_example.rst | 57 +++++++++++ .../unreachable_rules/wunreachable_rules.rst | 11 +++ src/manual/warnings/warnings.rst | 2 +- src/manual/warnings/wunreachable_rules.rst | 11 --- 11 files changed, 233 insertions(+), 17 deletions(-) create mode 100644 src/manual/warnings/unreachable_rules/how_it_works.rst create mode 100644 src/manual/warnings/unreachable_rules/infinite_rules.rst create mode 100644 src/manual/warnings/unreachable_rules/real_world.rst create mode 100644 src/manual/warnings/unreachable_rules/simple_example.rst create mode 100644 src/manual/warnings/unreachable_rules/wunreachable_rules.rst delete mode 100644 src/manual/warnings/wunreachable_rules.rst diff --git a/src/manual/warnings/undefined_control_flow/default_vs_any.rst b/src/manual/warnings/undefined_control_flow/default_vs_any.rst index 413e144f..d2ad8137 100644 --- a/src/manual/warnings/undefined_control_flow/default_vs_any.rst +++ b/src/manual/warnings/undefined_control_flow/default_vs_any.rst @@ -5,6 +5,7 @@ When the the world was young and re2c didn't have default rule ``*`` (that is, b everyone used ``[^]`` as default rule: .. code-block:: cpp + :number-lines: /*!re2c // ... normal rules ... @@ -30,7 +31,7 @@ However, with UTF-8 encoding ```re2c -i8 -Wundefined-control-flow``` says: .. code-block:: - re2c: warning: line 3: control flow is undefined for strings that match + re2c: warning: line 4: control flow is undefined for strings that match '[\x80-\xC1\xF5-\xFF]' '\xF0 [\x0-\x8F\xC0-\xFF]' '[\xE1-\xEF] [\x0-\x7F\xC0-\xFF]' @@ -49,7 +50,7 @@ If we tell re2c to exclude surrogates, ```re2c -ix --encoding-policy fail -Wunde .. code-block:: - re2c: warning: line 3: control flow is undefined for strings that match + re2c: warning: line 4: control flow is undefined for strings that match '[\xDC00-\xDFFF]' '[\xD800-\xDBFF] [\x0-\xDBFF\xE000-\xFFFF]' , use default rule '*' [-Wundefined-control-flow] diff --git a/src/manual/warnings/undefined_control_flow/real_world.rst b/src/manual/warnings/undefined_control_flow/real_world.rst index 5baa9872..0aee17ba 100644 --- a/src/manual/warnings/undefined_control_flow/real_world.rst +++ b/src/manual/warnings/undefined_control_flow/real_world.rst @@ -10,8 +10,7 @@ Even it you are absolutely sure that default case is impossible, do handle it. No additional checks and transitions. It simply binds code to default label. -I found ``[-Wundefined-control-flow]`` warnings in many real-world programs -(including some of the PHP lexers and re2c own lexer). +I found ``[-Wundefined-control-flow]`` warnings in many real-world programs (including re2c own lexer). Mostly these are minor issues like forgetting to handle newlines or zeroes in already preprocessed input, but it's curious how they creeped into the code. I bet they were just forgotten and not omitted for a good reason. ``:)`` diff --git a/src/manual/warnings/undefined_control_flow/simple_example.rst b/src/manual/warnings/undefined_control_flow/simple_example.rst index cbc822eb..fc447984 100644 --- a/src/manual/warnings/undefined_control_flow/simple_example.rst +++ b/src/manual/warnings/undefined_control_flow/simple_example.rst @@ -4,6 +4,7 @@ A simple example Say, we want to match ``'a'``: .. code-block:: cpp + :number-lines: /*!re2c "a" { return 'a'; } @@ -12,6 +13,7 @@ Say, we want to match ``'a'``: ```re2c -i -Wundefined-control-flow```: .. code-block:: cpp + :number-lines: /* Generated by re2c 0.14.1.dev on Thu Nov 5 14:35:46 2015*/ @@ -39,6 +41,7 @@ re2c grumbles something about undefined control flow and says that default ``*`` Let's add it: .. code-block:: cpp + :number-lines: /*!re2c * { return '*'; } @@ -48,6 +51,7 @@ Let's add it: Now that's better: .. code-block:: cpp + :number-lines: /* Generated by re2c 0.14.1.dev on Thu Nov 5 14:35:08 2015*/ diff --git a/src/manual/warnings/undefined_control_flow/wundefined_control_flow.rst b/src/manual/warnings/undefined_control_flow/wundefined_control_flow.rst index f184c4b2..3f2600e7 100644 --- a/src/manual/warnings/undefined_control_flow/wundefined_control_flow.rst +++ b/src/manual/warnings/undefined_control_flow/wundefined_control_flow.rst @@ -5,7 +5,7 @@ .. include:: ../../../contents.rst .. include:: simple_example.rst -.. include:: how_it_works.rst .. include:: default_vs_any.rst +.. include:: how_it_works.rst .. include:: real_world.rst diff --git a/src/manual/warnings/unreachable_rules/how_it_works.rst b/src/manual/warnings/unreachable_rules/how_it_works.rst new file mode 100644 index 00000000..186baa49 --- /dev/null +++ b/src/manual/warnings/unreachable_rules/how_it_works.rst @@ -0,0 +1,16 @@ +How it works +~~~~~~~~~~~~ + +For each state of the generated DFA re2c calculates the set of all rules reachable from this state +(including default rule and no rule at all). +The algorithm starts with initial state and recurses to child states. +Recursion stops if either current state is final (then its set is trivial) +or the set for current state has already been calculated. +Thus each state is processed only once and the algorithm has ``O(n)`` complexity +(where ``n`` is the number of states in DFA). +When all the sets have been calculated, re2c checks that the set of rules +reachable from each accepting state includes either accepted rule or no rule at all. + +Analyses is done regardless of the ``-Wunreachable-rules`` option, +the option only controls if the warning is reported or not. + diff --git a/src/manual/warnings/unreachable_rules/infinite_rules.rst b/src/manual/warnings/unreachable_rules/infinite_rules.rst new file mode 100644 index 00000000..e6e7793b --- /dev/null +++ b/src/manual/warnings/unreachable_rules/infinite_rules.rst @@ -0,0 +1,42 @@ +Infinite rules +~~~~~~~~~~~~~~ + +A rule may be unreachable all by itself, without being shadowed by other rules: + +.. code-block:: cpp + :number-lines: + + /*!re2c + [^]* { return "greeedy"; } + */ + +This rule is so greedy that it never stops. +It will continue eating input until ``YYFILL`` finally fails (```re2c -i -Wunreachable-rules```): + +.. code-block:: cpp + :number-lines: + + /* Generated by re2c 0.14.1.dev on Fri Nov 6 21:36:56 2015*/ + + { + YYCTYPE yych; + goto yy0; + yy1: + ++YYCURSOR; + yy0: + if (YYLIMIT <= YYCURSOR) YYFILL(1); + yych = *YYCURSOR; + goto yy1; + { return "greeedy"; } + } + +And if we suppress ``YYFILL`` generation with ``re2c:yyfill:enable = 0;``, +lexer will loop until segfault or another disaster forces its untimely shutdown +(unless you use ``--input-api custom`` and do some magic to stop it). +And re2c warns us: + +.. code-block:: + + re2c: warning: line 2: unreachable rule [-Wunreachable-rules] + +All rules that include ``[^]*`` (or any component isomorphic to ``[^]*``) are also infinite. diff --git a/src/manual/warnings/unreachable_rules/real_world.rst b/src/manual/warnings/unreachable_rules/real_world.rst new file mode 100644 index 00000000..8dd0932a --- /dev/null +++ b/src/manual/warnings/unreachable_rules/real_world.rst @@ -0,0 +1,97 @@ +Real-world examples +~~~~~~~~~~~~~~~~~~~ + +In many real-world examples re2c reports unreachable ``[^]`` rule, +which is quite understandable: ``[^]`` served as default rule long before +the true default rule ``*`` was added to re2c. +However, in some cases the warning is quite interesting. +Here is an example of a real-world lexer (all the non-re2c code has beed removed): + +.. code-block:: cpp + :number-lines: + + /*!re2c + re2c:yyfill:check = 0; + + LNUM [0-9]+ + DNUM ([0-9]*[\.][0-9]+)|([0-9]+[\.][0-9]*) + NUMBER [-]?{LNUM}|{DNUM} + ANY_CHAR (.|[\n\t]) + NEWLINE ("\r"|"\n"|"\r\n") + TABS_AND_SPACES [ \t] + WHITESPACE [ \t]+ + CONSTANT [a-zA-Z_][a-zA-Z0-9_]* + LABEL [^=\n\r\t;&|^$~(){}!"\[]+ + TOKENS [:,.\[\]"'()&|^+-/*=%$!~<>?@{}] + OPERATORS [&|^~()!] + DOLLAR_CURLY "${" + SECTION_RAW_CHARS [^\]\n\r] + SINGLE_QUOTED_CHARS [^'] + RAW_VALUE_CHARS [^\n\r;\000] + LITERAL_DOLLAR ("$"([^{\000]|("\\"{ANY_CHAR}))) + VALUE_CHARS ([^$= \t\n\r;&|^~()!"'\000]|{LITERAL_DOLLAR}) + SECTION_VALUE_CHARS ([^$\n\r;"'\]\\]|("\\"{ANY_CHAR})|{LITERAL_DOLLAR}) + + "[" {} + "'"{SINGLE_QUOTED_CHARS}+"'" {} + "]"{TABS_AND_SPACES}*{NEWLINE}? {} + {LABEL}"["{TABS_AND_SPACES}* {} + {TABS_AND_SPACES}*"]" {} + {DOLLAR_CURLY} {} + {LABEL} {} + "}" {} + ("true"|"on"|"yes"){TABS_AND_SPACES}* {} + ("false"|"off"|"no"|"none"){TABS_AND_SPACES}* {} + ("null"){TABS_AND_SPACES}* {} + {LABEL} {} + {TABS_AND_SPACES}*[=]{TABS_AND_SPACES}* {} + {RAW_VALUE_CHARS} {} + {SECTION_RAW_CHARS}+ {} + {TABS_AND_SPACES}*{NEWLINE} {} + {CONSTANT} {} + {NUMBER} {} + {TOKENS} {} + {OPERATORS}{TABS_AND_SPACES}* {} + [=] {} + {VALUE_CHARS}+ {} + {SECTION_VALUE_CHARS}+ {} + {TABS_AND_SPACES}*["] {} + ["]{TABS_AND_SPACES}* {} + [^] {} + {WHITESPACE} {} + {TABS_AND_SPACES}+ {} + {TABS_AND_SPACES}*{NEWLINE} {} + {TABS_AND_SPACES}*[;][^\r\n]*{NEWLINE} {} + [^] {} + <*>[^] {} + */ + +```re2c -cF -Wunreachable-rules``` says: + +.. code-block:: + + re2c: warning: line 54: unreachable rule in condition 'ST_DOUBLE_QUOTES' (shadowed by rules at lines 47, 48) [-Wunreachable-rules] + re2c: warning: line 49: unreachable rule in condition 'ST_OFFSET' (shadowed by rule at line 45) [-Wunreachable-rules] + re2c: warning: line 54: unreachable rule in condition 'ST_RAW' (shadowed by rules at lines 36, 38, 53) [-Wunreachable-rules] + re2c: warning: line 49: unreachable rule in condition 'ST_SECTION_VALUE' (shadowed by rule at line 45) [-Wunreachable-rules] + re2c: warning: line 54: unreachable rule in condition 'ST_VALUE' (shadowed by rules at lines 38, 39, 40, 42, 43, 44, 46, 49, 53) [-Wunreachable-rules] + +The interesting part is the unreachable rule on line 49 in conditions ``ST_OFFSET`` and ``ST_SECTION_VALUE``. +The rule is ``{WHITESPACE}``: + +.. code-block:: + + WHITESPACE [ \t]+ + +re2c claims that it is shadowed by the rule on line 45, which is ``{SECTION_VALUE_CHARS}+``: + +.. code-block:: + + ANY_CHAR (.|[\n\t]) + LITERAL_DOLLAR ("$"([^{\000]|("\\"{ANY_CHAR}))) + SECTION_VALUE_CHARS ([^$\n\r;"'\]\\]|("\\"{ANY_CHAR})|{LITERAL_DOLLAR}) + +Indeed, ``{SECTION_VALUE_CHARS}+`` allows all the patterns accepted by ``{WHITESPACE}``. +In the original program these rules return different types of tokens: +perhaps this is not critical, but clearly unintended. + diff --git a/src/manual/warnings/unreachable_rules/simple_example.rst b/src/manual/warnings/unreachable_rules/simple_example.rst new file mode 100644 index 00000000..2ac4866e --- /dev/null +++ b/src/manual/warnings/unreachable_rules/simple_example.rst @@ -0,0 +1,57 @@ +A simple example +~~~~~~~~~~~~~~~~ + +.. code-block:: cpp + :number-lines: + + /*!re2c + "" { return ""; } + * { return "*"; } + "a" | "b" { return "a | b"; } + "a" { return "a"; } + [\x00-\xFF] { return "[0 - 0xFF]"; } + [^] { return "[^]"; } + */ + +Given this strange code, ```re2c -i -Wunreachable-rules``` says: + +.. code-block:: + + re2c: warning: line 2: unreachable rule (shadowed by rules at lines 4, 6) [-Wunreachable-rules] + re2c: warning: line 5: unreachable rule (shadowed by rule at line 4) [-Wunreachable-rules] + re2c: warning: line 7: unreachable rule (shadowed by rule at line 6) [-Wunreachable-rules] + +A look at the generated code suggests that re2c was right: + +.. code-block:: cpp + :number-lines: + + /* Generated by re2c 0.14.1.dev on Fri Nov 6 15:21:36 2015*/ + + { + YYCTYPE yych; + if (YYLIMIT <= YYCURSOR) YYFILL(1); + yych = *YYCURSOR; + switch (yych) { + case 'a': goto yy5; + case 'b': goto yy7; + default: goto yy3; + } + { return ""; } + yy3: + ++YYCURSOR; + { return "[0 - 0xFF]"; } + yy5: + ++YYCURSOR; + yy6: + { return "a | b"; } + yy7: + ++YYCURSOR; + yych = *YYCURSOR; + goto yy6; + } + +Clearly, all the reported rules are unreachable (some of them are not even present in the generated code). +Default rule ``*`` at line 3 is also unreachable, but re2c appreciates paranoid attempts +to handle default case and never reports unreachable default rule. + diff --git a/src/manual/warnings/unreachable_rules/wunreachable_rules.rst b/src/manual/warnings/unreachable_rules/wunreachable_rules.rst new file mode 100644 index 00000000..e88082bb --- /dev/null +++ b/src/manual/warnings/unreachable_rules/wunreachable_rules.rst @@ -0,0 +1,11 @@ +[-Wunreachable-rules] +-------------------------- + +.. include:: ../home.rst +.. include:: ../../../contents.rst + +.. include:: simple_example.rst +.. include:: infinite_rules.rst +.. include:: how_it_works.rst +.. include:: real_world.rst + diff --git a/src/manual/warnings/warnings.rst b/src/manual/warnings/warnings.rst index 832d5633..e59fc5db 100644 --- a/src/manual/warnings/warnings.rst +++ b/src/manual/warnings/warnings.rst @@ -7,7 +7,7 @@ Warnings ★ * `[-Wundefined-control-flow] `_ -* `[-Wunreachable-rules] `_ +* `[-Wunreachable-rules] `_ * `[-Wcondition-order] `_ * `[-Wuseless-escape] `_ * `[-Wswapped-range] `_ diff --git a/src/manual/warnings/wunreachable_rules.rst b/src/manual/warnings/wunreachable_rules.rst deleted file mode 100644 index 4e4c8a1d..00000000 --- a/src/manual/warnings/wunreachable_rules.rst +++ /dev/null @@ -1,11 +0,0 @@ -[-Wunreachable-rules] --------------------------- - -.. include:: home.rst - -.. code-block:: cpp - - /*!re2c - [^]* "a" {} - */ - -- 2.40.0