From b264702120aabd4402ddfb09c697a90a54aca068 Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Wed, 11 Nov 2015 15:29:23 +0000 Subject: [PATCH] Added [-Wmatch-empty-string] description. --- .../condition_order/simple_example.rst | 8 +- .../match_empty_string/false_alarms.rst | 16 ++ .../match_empty_string/real_world.rst | 198 ++++++++++++++++++ .../match_empty_string/simple_example.rst | 45 ++++ .../match_empty_string/wmatch_empty_string.re | 17 ++ .../wmatch_empty_string.rst | 10 + src/manual/warnings/warnings.rst | 2 +- src/manual/warnings/wmatch_empty_string.rst | 5 - 8 files changed, 291 insertions(+), 10 deletions(-) create mode 100644 src/manual/warnings/match_empty_string/false_alarms.rst create mode 100644 src/manual/warnings/match_empty_string/real_world.rst create mode 100644 src/manual/warnings/match_empty_string/simple_example.rst create mode 100644 src/manual/warnings/match_empty_string/wmatch_empty_string.re create mode 100644 src/manual/warnings/match_empty_string/wmatch_empty_string.rst delete mode 100644 src/manual/warnings/wmatch_empty_string.rst diff --git a/src/manual/warnings/condition_order/simple_example.rst b/src/manual/warnings/condition_order/simple_example.rst index 10da472b..1707185c 100644 --- a/src/manual/warnings/condition_order/simple_example.rst +++ b/src/manual/warnings/condition_order/simple_example.rst @@ -21,11 +21,11 @@ Let's compile and run it: $ re2c -c -o example.c -Wcondition-order wcondition_order.re $ - $ gcc -o example example.c + $ g++ -o example example.c $ ./example aaaa,bbb! $ - $ gcc -o example -DREVERSED_CONDITION_ORDER example.c + $ g++ -o example -DREVERSED_CONDITION_ORDER example.c $ ./example aaaa,bbb! 
@@ -38,11 +38,11 @@ However, if we use ``-s`` re2c option, lexer becomes sensitive to condition orde re2c: warning: line 31: looks like you use hardcoded numbers instead of autogenerated condition names: better add '/*!types:re2c*/' directive or '-t, --type-header' option and don't rely on fixed condition order. [-Wcondition-order] $ - $ gcc -o example example.c + $ g++ -o example example.c $ ./example aaaa,bbb! $ - $ gcc -o example -DREVERSED_CONDITION_ORDER example.c + $ g++ -o example -DREVERSED_CONDITION_ORDER example.c $ ./example error diff --git a/src/manual/warnings/match_empty_string/false_alarms.rst b/src/manual/warnings/match_empty_string/false_alarms.rst new file mode 100644 index 00000000..e6c4e9cc --- /dev/null +++ b/src/manual/warnings/match_empty_string/false_alarms.rst @@ -0,0 +1,16 @@ +False alarms +~~~~~~~~~~~~ + +In many cases matching empty string makes perfect sense: + +* It might be used as a non-consuming default rule. + +* It might be used to lex an optional lexeme: if lexeme rules didn't match, + lexer must jump to another block and resume lexing at the same input position. + +Or any other useful examples you can invent. +All these cases are perfectly sane. +If ``[-Wmatch-empty-string]`` becomes annoying, use ``[-Wno-match-empty-string]``. +Maybe re2c should move this warning to some paranoid level. + + diff --git a/src/manual/warnings/match_empty_string/real_world.rst b/src/manual/warnings/match_empty_string/real_world.rst new file mode 100644 index 00000000..beebf834 --- /dev/null +++ b/src/manual/warnings/match_empty_string/real_world.rst @@ -0,0 +1,198 @@ +Real-world examples +~~~~~~~~~~~~~~~~~~~ + +In general, it is a common mistake to use ``*`` instead of ``+`` in repetitions. +That is, to accept zero or more repetitions instead of one or more. + +Typos in definitions +.................... + +Here is the skeleton of REXX lexer (the very lexer which motivated Peter to write re2c ``:)``). + +.. 
code-block:: cpp + :number-lines: + + /*!re2c + all = [\000-\377]; + eof = [\000]; + any = all\eof; + letter = [a-z]|[A-Z]; + digit = [0-9]; + symchr = letter|digit|[.!?_]; + const = (digit|[.])symchr*([eE][+-]?digit+)?; + simple = (symchr\(digit|[.]))(symchr\[.])*; + stem = simple [.]; + symbol = symchr*; + sqstr = ['] ((any\['\n])|(['][']))* [']; + dqstr = ["] ((any\["\n])|(["]["]))* ["]; + str = sqstr|dqstr; + ob = [ \t]*; + not = [\\~]; + A = [aA]; + B = [bB]; + C = [cC]; + D = [dD]; + E = [eE]; + F = [fF]; + G = [gG]; + H = [hH]; + I = [iI]; + J = [jJ]; + K = [kK]; + L = [lL]; + M = [mM]; + N = [nN]; + O = [oO]; + P = [pP]; + Q = [qQ]; + R = [rR]; + S = [sS]; + T = [tT]; + U = [uU]; + V = [vV]; + W = [wW]; + X = [xX]; + Y = [yY]; + Z = [zZ]; + + "\n" {} + "|" ob "|" {} + "+" {} + "-" {} + "*" {} + "/" {} + "%" {} + "/" ob "/" {} + "*" ob "*" {} + "=" {} + not ob "=" | "<" ob ">" | ">" ob "<" {} + ">" {} + "<" {} + ">" ob "=" | not ob "<" {} + "<" ob "=" | not ob ">" {} + "=" ob "=" {} + not ob "=" ob "=" {} + ">" ob ">" {} + "<" ob "<" {} + ">" ob ">" ob "=" | not ob "<" ob "<" {} + "<" ob "<" ob "=" | not ob ">" ob ">" {} + "&" {} + "|" {} + "&" ob "&" {} + not {} + ":" {} + "," {} + "(" {} + ")" {} + ";" {} + A D D R E S S {} + A R G {} + C A L L {} + D O {} + D R O P {} + E L S E {} + E N D {} + E X I T {} + I F {} + I N T E R P R E T {} + I T E R A T E {} + L E A V E {} + N O P {} + N U M E R I C {} + O P T I O N S {} + O T H E R W I S E {} + P A R S E {} + P R O C E D U R E {} + P U L L {} + P U S H {} + Q U E U E {} + R E T U R N {} + S A Y {} + S E L E C T {} + S I G N A L {} + T H E N {} + T R A C E {} + W H E N {} + O F F {} + O N {} + B Y {} + D I G I T S {} + E N G I N E E R I N G {} + E R R O R {} + E X P O S E {} + F A I L U R E {} + F O R {} + F O R E V E R {} + F O R M {} + F U Z Z {} + H A L T {} + L I N E I N {} + N A M E {} + N O T R E A D Y {} + N O V A L U E {} + S C I E N T I F I C {} + S O U R C E {} + S Y N T A X {} + T O {} + U N T I L 
{} + U P P E R {} + V A L U E {} + V A R {} + V E R S I O N {} + W H I L E {} + W I T H {} + const {} + simple {} + stem {} + symbol {} + str {} + str [bB] / (all\symchr) {} + str [xX] / (all\symchr) {} + eof {} + any {} + */ + +``re2c -Wmatch-empty-string`` warns: + +.. code-block:: + + re2c: warning: line 133: rule matches empty string [-Wmatch-empty-string] + +The faulty rule is ``symbol``. +It is defined as ``symchr*`` and clearly is nullable. +In this particular example (assuming ASCII encoding) the empty match is shadowed by other rules: +together ``eof`` and ``any`` cover all possible code units. +So in this case there is no chance of hitting an infinite loop. + +However, by no means should ``symbol`` be nullable: it makes no sense. +Sure, it's just a typo and the author meant ``symchr+``. + + +Skipping uninteresting stuff +............................ + +Like spaces. +One often needs to skip a variable number of them: + +.. code-block:: cpp + :number-lines: + + /*!re2c + TABS_AND_SPACES = [ \t]*; + */ + +This definition is ok when used inside of another (non-nullable) rule: + +.. code-block:: cpp + :number-lines: + + /*!re2c + TABS_AND_SPACES = [ \t]*; + "(" TABS_AND_SPACES ("int" | "integer") TABS_AND_SPACES ")" {} + */ + +However, as a standalone rule it may cause an infinite loop on ill-formed input. +And it's very common to reuse one rule for multiple purposes. + + + diff --git a/src/manual/warnings/match_empty_string/simple_example.rst b/src/manual/warnings/match_empty_string/simple_example.rst new file mode 100644 index 00000000..6b5f5588 --- /dev/null +++ b/src/manual/warnings/match_empty_string/simple_example.rst @@ -0,0 +1,45 @@ +A simple example +~~~~~~~~~~~~~~~~ + +``[-Wmatch-empty-string]`` warns when a rule is nullable (matches the empty string). +It was intended to prevent hitting an infinite loop in cases like this: + +`[wmatch_empty_string.re] <wmatch_empty_string.re>`_ + +.. 
include:: wmatch_empty_string.re + :code: cpp + :number-lines: + +The program loops over its arguments (the outer ``for`` loop) +and tries to lex each argument (the inner ``for`` loop). +Lexer stops when all input has been consumed and it sees the terminating ``NUL``. +Arguments must consist of lowercase letters only. +Generate, compile and run: + +.. code-block:: + + $ re2c -o example.c -Wmatch-empty-string wmatch_empty_string.re + re2c: warning: line 11: rule matches empty string [-Wmatch-empty-string] + $ g++ -o example example.c + $ + $ ./example only lowercase letters are allowed + argv[1]: only + argv[2]: lowercase + argv[3]: letters + argv[4]: are + argv[5]: allowed + $ + $ ./example oh really? + argv[1]: oh + ^C + +The program hangs forever if one of the arguments is ill-formed. + +Note that `[-Wundefined-control-flow] <../undefined_control_flow/wundefined_control_flow.html>`_ +has no complaints about this particular case: all input patterns are covered by rules. +Yet if we add a default rule ``*``, the lexer won't hang anymore: it will match the default rule +instead of the nullable rule. + +The fix is easy: make the rule non-nullable (say, ``[a-z]+``) and add a default rule ``*``. 
+ + diff --git a/src/manual/warnings/match_empty_string/wmatch_empty_string.re b/src/manual/warnings/match_empty_string/wmatch_empty_string.re new file mode 100644 index 00000000..f4926d90 --- /dev/null +++ b/src/manual/warnings/match_empty_string/wmatch_empty_string.re @@ -0,0 +1,17 @@ +#include + +int main(int argc, char **argv) +{ + for (int i = 1; i < argc; ++i) { + for (char *YYCURSOR = argv[i];;) { + /*!re2c + re2c:define:YYCTYPE = char; + re2c:yyfill:enable = 0; + "\x00" { break; } + [a-z]* { continue; } + */ + } + printf("argv[%d]: %s\n", i, argv[i]); + } + return 0; +} diff --git a/src/manual/warnings/match_empty_string/wmatch_empty_string.rst b/src/manual/warnings/match_empty_string/wmatch_empty_string.rst new file mode 100644 index 00000000..dd4a9f6b --- /dev/null +++ b/src/manual/warnings/match_empty_string/wmatch_empty_string.rst @@ -0,0 +1,10 @@ +[-Wmatch-empty-string] +-------------------------- + +.. include:: ../home.rst +.. include:: ../../../contents.rst + +.. include:: simple_example.rst +.. include:: false_alarms.rst +.. include:: real_world.rst + diff --git a/src/manual/warnings/warnings.rst b/src/manual/warnings/warnings.rst index 0e77c0f7..d450b997 100644 --- a/src/manual/warnings/warnings.rst +++ b/src/manual/warnings/warnings.rst @@ -12,7 +12,7 @@ Warnings * `[-Wuseless-escape] `_ * `[-Wswapped-range] `_ * `[-Wempty-character-class] `_ -* `[-Wmatch-empty-string] `_ +* `[-Wmatch-empty-string] `_ .. include:: warnings_general.rst diff --git a/src/manual/warnings/wmatch_empty_string.rst b/src/manual/warnings/wmatch_empty_string.rst deleted file mode 100644 index 7d89643f..00000000 --- a/src/manual/warnings/wmatch_empty_string.rst +++ /dev/null @@ -1,5 +0,0 @@ -[-Wmatch-empty-string] --------------------------- - -.. include:: home.rst - -- 2.50.1