]> granicus.if.org Git - re2c/commitdiff
Added [-Wmatch-empty-string] description.
authorUlya Trofimovich <skvadrik@gmail.com>
Wed, 11 Nov 2015 15:29:23 +0000 (15:29 +0000)
committerUlya Trofimovich <skvadrik@gmail.com>
Wed, 11 Nov 2015 15:29:23 +0000 (15:29 +0000)
src/manual/warnings/condition_order/simple_example.rst
src/manual/warnings/match_empty_string/false_alarms.rst [new file with mode: 0644]
src/manual/warnings/match_empty_string/real_world.rst [new file with mode: 0644]
src/manual/warnings/match_empty_string/simple_example.rst [new file with mode: 0644]
src/manual/warnings/match_empty_string/wmatch_empty_string.re [new file with mode: 0644]
src/manual/warnings/match_empty_string/wmatch_empty_string.rst [new file with mode: 0644]
src/manual/warnings/warnings.rst
src/manual/warnings/wmatch_empty_string.rst [deleted file]

index 10da472b90e093124e94ebaec3a2184df198f96c..1707185c8bb427719b4d0f79ca2fda3dff52b648 100644 (file)
@@ -21,11 +21,11 @@ Let's compile and run it:
 
     $ re2c -c -o example.c -Wcondition-order wcondition_order.re
     $
-    $ gcc -o example example.c
+    $ g++ -o example example.c
     $ ./example
     aaaa,bbb!
     $
-    $ gcc -o example -DREVERSED_CONDITION_ORDER example.c
+    $ g++ -o example -DREVERSED_CONDITION_ORDER example.c
     $ ./example
     aaaa,bbb!
 
@@ -38,11 +38,11 @@ However, if we use ``-s`` re2c option, lexer becomes sensitive to condition orde
     re2c: warning: line 31: looks like you use hardcoded numbers instead of autogenerated condition names: better add
     '/*!types:re2c*/' directive or '-t, --type-header' option and don't rely on fixed condition order. [-Wcondition-order]
     $
-    $ gcc -o example example.c
+    $ g++ -o example example.c
     $ ./example
     aaaa,bbb!
     $
-    $ gcc -o example -DREVERSED_CONDITION_ORDER example.c
+    $ g++ -o example -DREVERSED_CONDITION_ORDER example.c
     $ ./example
     error
 
diff --git a/src/manual/warnings/match_empty_string/false_alarms.rst b/src/manual/warnings/match_empty_string/false_alarms.rst
new file mode 100644 (file)
index 0000000..e6c4e9c
--- /dev/null
@@ -0,0 +1,16 @@
+False alarms
+~~~~~~~~~~~~
+
+In many cases matching empty string makes perfect sense:
+
+* It might be used as a non-consuming default rule.
+
+* It might be used to lex an optional lexeme: if lexeme rules didn't match,
+  lexer must jump to another block and resume lexing at the same input position.
+
+Or any other useful examples you can invent.
+All these cases are perfectly sane.
+If ``[-Wmatch-empty-string]`` becomes annoying, turn it off with ``-Wno-match-empty-string``.
+Maybe re2c should move this warning to some paranoid level.
+
+
diff --git a/src/manual/warnings/match_empty_string/real_world.rst b/src/manual/warnings/match_empty_string/real_world.rst
new file mode 100644 (file)
index 0000000..beebf83
--- /dev/null
@@ -0,0 +1,198 @@
+Real-world examples
+~~~~~~~~~~~~~~~~~~~
+
+In general, it is a common mistake to use ``*`` instead of ``+`` in repetitions.
+That is, to accept zero or more repetitions instead of one or more.
+
+Typos in definitions
+....................
+
+Here is the skeleton of REXX lexer (the very lexer which motivated Peter to write re2c ``:)``).
+
+.. code-block:: cpp
+    :number-lines:
+
+    /*!re2c
+        all    = [\000-\377];
+        eof    = [\000];
+        any    = all\eof;
+        letter = [a-z]|[A-Z];
+        digit  = [0-9];
+        symchr = letter|digit|[.!?_];
+        const  = (digit|[.])symchr*([eE][+-]?digit+)?;
+        simple = (symchr\(digit|[.]))(symchr\[.])*;
+        stem   = simple [.];
+        symbol = symchr*;
+        sqstr  = ['] ((any\['\n])|(['][']))* ['];
+        dqstr  = ["] ((any\["\n])|(["]["]))* ["];
+        str    = sqstr|dqstr;
+        ob     = [ \t]*;
+        not    = [\\~];
+        A      = [aA];
+        B      = [bB];
+        C      = [cC];
+        D      = [dD];
+        E      = [eE];
+        F      = [fF];
+        G      = [gG];
+        H      = [hH];
+        I      = [iI];
+        J      = [jJ];
+        K      = [kK];
+        L      = [lL];
+        M      = [mM];
+        N      = [nN];
+        O      = [oO];
+        P      = [pP];
+        Q      = [qQ];
+        R      = [rR];
+        S      = [sS];
+        T      = [tT];
+        U      = [uU];
+        V      = [vV];
+        W      = [wW];
+        X      = [xX];
+        Y      = [yY];
+        Z      = [zZ];
+    
+        "\n"                                  {}
+        "|" ob "|"                            {}
+        "+"                                   {}
+        "-"                                   {}
+        "*"                                   {}
+        "/"                                   {}
+        "%"                                   {}
+        "/" ob "/"                            {}
+        "*" ob "*"                            {}
+        "="                                   {}
+        not ob "=" | "<" ob ">" | ">" ob "<"  {}
+        ">"                                   {}
+        "<"                                   {}
+        ">" ob "=" | not ob "<"               {}
+        "<" ob "=" | not ob ">"               {}
+        "=" ob "="                            {}
+        not ob "=" ob "="                     {}
+        ">" ob ">"                            {}
+        "<" ob "<"                            {}
+        ">" ob ">" ob "=" | not ob "<" ob "<" {}
+        "<" ob "<" ob "=" | not ob ">" ob ">" {}
+        "&"                                   {}
+        "|"                                   {}
+        "&" ob "&"                            {}
+        not                                   {}
+        ":"                                   {}
+        ","                                   {}
+        "("                                   {}
+        ")"                                   {}
+        ";"                                   {}
+        A D D R E S S                         {}
+        A R G                                 {}
+        C A L L                               {}
+        D O                                   {}
+        D R O P                               {}
+        E L S E                               {}
+        E N D                                 {}
+        E X I T                               {}
+        I F                                   {}
+        I N T E R P R E T                     {}
+        I T E R A T E                         {}
+        L E A V E                             {}
+        N O P                                 {}
+        N U M E R I C                         {}
+        O P T I O N S                         {}
+        O T H E R W I S E                     {}
+        P A R S E                             {}
+        P R O C E D U R E                     {}
+        P U L L                               {}
+        P U S H                               {}
+        Q U E U E                             {}
+        R E T U R N                           {}
+        S A Y                                 {}
+        S E L E C T                           {}
+        S I G N A L                           {}
+        T H E N                               {}
+        T R A C E                             {}
+        W H E N                               {}
+        O F F                                 {}
+        O N                                   {}
+        B Y                                   {}
+        D I G I T S                           {}
+        E N G I N E E R I N G                 {}
+        E R R O R                             {}
+        E X P O S E                           {}
+        F A I L U R E                         {}
+        F O R                                 {}
+        F O R E V E R                         {}
+        F O R M                               {}
+        F U Z Z                               {}
+        H A L T                               {}
+        L I N E I N                           {}
+        N A M E                               {}
+        N O T R E A D Y                       {}
+        N O V A L U E                         {}
+        S C I E N T I F I C                   {}
+        S O U R C E                           {}
+        S Y N T A X                           {}
+        T O                                   {}
+        U N T I L                             {}
+        U P P E R                             {}
+        V A L U E                             {}
+        V A R                                 {}
+        V E R S I O N                         {}
+        W H I L E                             {}
+        W I T H                               {}
+        const                                 {}
+        simple                                {}
+        stem                                  {}
+        symbol                                {}
+        str                                   {}
+        str [bB] / (all\symchr)               {}
+        str [xX] / (all\symchr)               {}
+        eof                                   {}
+        any                                   {}
+    */
+
+``re2c -Wmatch-empty-string`` warns:
+
+.. code-block::
+
+    re2c: warning: line 133: rule matches empty string [-Wmatch-empty-string]
+
+The faulty rule is ``symbol``.
+It is defined as ``symchr*`` and clearly is nullable.
+In this particular example (assuming ASCII encoding) empty match is shadowed by other rules:
+together ``eof`` and ``any`` cover all possible code units.
+So in this case there is no chance of hitting an eternal loop.
+
+However, by no means should ``symbol`` be nullable: it makes no sense.
+Sure, it's just a typo and the author meant ``symchr+``.
+
+
+Skipping uninteresting stuff
+............................
+
+Like spaces.
+One often needs to skip variable number of them:
+
+.. code-block:: cpp
+    :number-lines:
+
+    /*!re2c
+        TABS_AND_SPACES = [ \t]*;
+    */
+
+This definition is ok when used inside of another (non-nullable) rule:
+
+.. code-block:: cpp
+    :number-lines:
+
+    /*!re2c
+        TABS_AND_SPACES = [ \t]*;
+        "(" TABS_AND_SPACES ("int" | "integer") TABS_AND_SPACES ")" {}
+    */
+
+However, as a standalone rule it may cause an eternal loop on ill-formed input.
+And it's very common to reuse one rule for multiple purposes.
+
+
+
diff --git a/src/manual/warnings/match_empty_string/simple_example.rst b/src/manual/warnings/match_empty_string/simple_example.rst
new file mode 100644 (file)
index 0000000..6b5f558
--- /dev/null
@@ -0,0 +1,45 @@
+A simple example
+~~~~~~~~~~~~~~~~
+
+``[-Wmatch-empty-string]`` warns when a rule is nullable (matches empty string).
+It is intended to prevent hitting an eternal loop in cases like this:
+
+`[wmatch_empty_string.re] <wmatch_empty_string.re>`_
+
+.. include:: wmatch_empty_string.re
+    :code: cpp
+    :number-lines:
+
+The program loops over its arguments (the outer ``for`` loop)
+and tries to lex each argument (the inner ``for`` loop).
+Lexer stops when all input has been consumed and it sees the terminating ``NUL`` character.
+Arguments must consist of lowercase letters only.
+Generate, compile and run:
+
+.. code-block::
+
+    $ re2c -o example.c -Wmatch-empty-string wmatch_empty_string.re
+    re2c: warning: line 11: rule matches empty string [-Wmatch-empty-string]
+    $ g++ -o example example.c
+    $
+    $ ./example only lowercase letters are allowed
+    argv[1]: only
+    argv[2]: lowercase
+    argv[3]: letters
+    argv[4]: are
+    argv[5]: allowed
+    $
+    $ ./example oh really?
+    argv[1]: oh
+    ^C
+
+The program hangs forever if one of the arguments is ill-formed.
+
+Note that `[-Wundefined-control-flow] <../undefined_control_flow/wundefined_control_flow.html>`_
+has no complaints about this particular case: all input patterns are covered by rules.
+Yet if we add default rule ``*``, lexer won't hang anymore: it will match default rule
+instead of nullable rule.
+
+The fix is easy: make the rule non-nullable (say, ``[a-z]+``) and add default rule ``*``.
+
+
diff --git a/src/manual/warnings/match_empty_string/wmatch_empty_string.re b/src/manual/warnings/match_empty_string/wmatch_empty_string.re
new file mode 100644 (file)
index 0000000..f4926d9
--- /dev/null
@@ -0,0 +1,17 @@
+#include <stdio.h>
+
+int main(int argc, char **argv)
+{
+    for (int i = 1; i < argc; ++i) { /* lex every argument after argv[0] */
+        for (char *YYCURSOR = argv[i];;) { /* no loop condition: only a rule action can leave this loop */
+        /*!re2c
+            re2c:define:YYCTYPE = char;
+            re2c:yyfill:enable = 0;
+            "\x00" { break; }
+            [a-z]* { continue; }
+        */
+        }
+        printf("argv[%d]: %s\n", i, argv[i]); /* printed once the lexer has left the inner loop */
+    }
+    return 0;
+}
diff --git a/src/manual/warnings/match_empty_string/wmatch_empty_string.rst b/src/manual/warnings/match_empty_string/wmatch_empty_string.rst
new file mode 100644 (file)
index 0000000..dd4a9f6
--- /dev/null
@@ -0,0 +1,10 @@
+[-Wmatch-empty-string]
+--------------------------
+
+.. include:: ../home.rst
+.. include:: ../../../contents.rst
+
+.. include:: simple_example.rst
+.. include:: false_alarms.rst
+.. include:: real_world.rst
+
index 0e77c0f74cde8851824a19886ef0fc6ae0786ab8..d450b997c8e160ba9ce6f01479ec5c0684621ffa 100644 (file)
@@ -12,7 +12,7 @@ Warnings
 * `[-Wuseless-escape]         <useless_escape/wuseless_escape.html>`_
 * `[-Wswapped-range]          <swapped_range/wswapped_range.html>`_
 * `[-Wempty-character-class]  <empty_character_class/wempty_character_class.html>`_
-* `[-Wmatch-empty-string]     <wmatch_empty_string.html>`_
+* `[-Wmatch-empty-string]     <match_empty_string/wmatch_empty_string.html>`_
 
 .. include:: warnings_general.rst
 
diff --git a/src/manual/warnings/wmatch_empty_string.rst b/src/manual/warnings/wmatch_empty_string.rst
deleted file mode 100644 (file)
index 7d89643..0000000
+++ /dev/null
@@ -1,5 +0,0 @@
-[-Wmatch-empty-string]
---------------------------
-
-.. include:: home.rst
-