Base '*' (zero or more repetitions) on '+' (one or more repetitions).

author Ulya Trofimovich <skvadrik@gmail.com>

Mon, 26 Sep 2016 11:29:27 +0000 (12:29 +0100)

committer Ulya Trofimovich <skvadrik@gmail.com>

Mon, 26 Sep 2016 11:29:27 +0000 (12:29 +0100)
author Ulya Trofimovich <skvadrik@gmail.com>
Mon, 26 Sep 2016 11:29:27 +0000 (12:29 +0100)
committer Ulya Trofimovich <skvadrik@gmail.com>
Mon, 26 Sep 2016 11:29:27 +0000 (12:29 +0100)
diff --git a/re2c/bootstrap/src/parse/lex.cc b/re2c/bootstrap/src/parse/lex.cc

index a6c45443410d7665aa62c50fcd1a8144be695a4a..dc23c0c7e2651a1c54d5fd831b957c4af60658d7 100644 (file)
--- a/re2c/bootstrap/src/parse/lex.cc
+++ b/re2c/bootstrap/src/parse/lex.cc
@@ -1,4 +1,4 @@
-/* Generated by re2c 0.16 on Wed May 11 15:38:08 2016 */
+/* Generated by re2c 0.16 on Mon Sep 26 12:27:15 2016 */
  #line 1 "../src/parse/lex.re"
  #include "src/util/c99_stdint.h"
  #include <stddef.h>
diff --git a/re2c/bootstrap/src/parse/parser.cc b/re2c/bootstrap/src/parse/parser.cc

index fb5f17dd20a748f5a3f28d7d20d734055828e993..40f1de550c35c5409e3060001ea33d34cee698a2 100644 (file)
--- a/re2c/bootstrap/src/parse/parser.cc
+++ b/re2c/bootstrap/src/parse/parser.cc
@@ -559,7 +559,7 @@ static const yytype_uint16 yyrline[] =
       185,   185,   188,   197,   208,   212,   218,   224,   231,   240,
       248,   258,   269,   275,   281,   284,   291,   297,   307,   310,
       317,   321,   327,   331,   338,   342,   349,   353,   360,   364,
-     379,   398,   402,   406,   410,   417,   427,   431
+     381,   400,   404,   408,   412,   419,   429,   433
  };
  #endif
  
@@ -1790,13 +1790,15 @@ yyreduce:
    case 39:
  
      {
+                       // see note [Kleene star is expressed in terms of plus]
                         switch((yyvsp[(2) - (2)].op))
                         {
                         case '*':
-                               (yyval.regexp) = RegExp::make_iter((yyvsp[(1) - (2)].regexp));
+                               (yyval.regexp) = RegExp::make_alt(RegExp::make_nil(),
+                                       RegExp::make_iter((yyvsp[(1) - (2)].regexp)));
                                 break;
                         case '+':
-                               (yyval.regexp) = RegExp::make_cat(RegExp::make_iter((yyvsp[(1) - (2)].regexp)), (yyvsp[(1) - (2)].regexp));
+                               (yyval.regexp) = RegExp::make_iter((yyvsp[(1) - (2)].regexp));
                                 break;
                         case '?':
                                 (yyval.regexp) = mkAlt((yyvsp[(1) - (2)].regexp), RegExp::make_nil());
diff --git a/re2c/src/ir/nfa/regexps2nfa.cc b/re2c/src/ir/nfa/regexps2nfa.cc

index dcb911cdff905c9e0af9a34d3138128d2e5e5dbd..d07e4cfad42b568a2d2a2ccaf8d5867e453463a4 100644 (file)
--- a/re2c/src/ir/nfa/regexps2nfa.cc
+++ b/re2c/src/ir/nfa/regexps2nfa.cc
@@ -24,10 +24,13 @@ static nfa_state_t *regexp2nfa(nfa_t &nfa, size_t nrule,
                         s = regexp2nfa(nfa, nrule, tagidx, re->cat.re2, t);
                         s = regexp2nfa(nfa, nrule, tagidx, re->cat.re1, s);
                         break;
-               case RegExp::ITER:
-                       s = &nfa.states[nfa.size++];
-                       s->make_alt(nrule, t, regexp2nfa(nfa, nrule, tagidx, re->iter, s));
+               case RegExp::ITER: {
+                       // see note [Kleene star is expressed in terms of plus]
+                       nfa_state_t *q = &nfa.states[nfa.size++];
+                       s = regexp2nfa(nfa, nrule, tagidx, re->iter, q);
+                       q->make_alt(nrule, t, s);
                         break;
+               }
                 case RegExp::TAG:
                         if ((*nfa.tags)[tagidx].type == Tag::VAR) {
                                 s = &nfa.states[nfa.size++];
diff --git a/re2c/src/ir/regexp/nullable.cc b/re2c/src/ir/regexp/nullable.cc

index 3abef0544e05ac05b5a23cce04913e65773c7021..286652e3ddd8597049b4f1a6bdca8e14e56a392d 100644 (file)
--- a/re2c/src/ir/regexp/nullable.cc
+++ b/re2c/src/ir/regexp/nullable.cc
@@ -10,24 +10,21 @@ static bool nullable(const RegExp *re, bool &trail)
                 return true;
         }
         switch (re->type) {
-               case RegExp::NIL:
-               case RegExp::ITER:
-                       return true;
+               default: assert(false);
+               case RegExp::NIL: return true;
+               case RegExp::SYM:
+               case RegExp::ITER: return false;
                 case RegExp::TAG:
                         if (re->tag == NULL) {
                                 trail = true;
                         }
                         return true;
-               case RegExp::SYM:
-                       return false;
                 case RegExp::ALT:
                         return nullable(re->alt.re1, trail)
                                 || nullable(re->alt.re2, trail);
                 case RegExp::CAT:
                         return nullable(re->cat.re1, trail)
                                 && nullable(re->cat.re2, trail);
-               default:
-                       assert(false);
         }
  }
  
diff --git a/re2c/src/ir/regexp/regexp.cc b/re2c/src/ir/regexp/regexp.cc

index cab9fc3c5a64a01e6cb323296843fbeec6d55073..ca13ec3be1757e47299579eb5195cdc68ef809f1 100644 (file)
--- a/re2c/src/ir/regexp/regexp.cc
+++ b/re2c/src/ir/regexp/regexp.cc
@@ -199,8 +199,9 @@ const RegExp *repeat_from_to(const RegExp *re, uint32_t n, uint32_t m)
  // see note [counted repetition expansion]
  const RegExp *repeat_from(const RegExp *re, uint32_t n)
  {
+       // see note [Kleene star is expressed in terms of plus]
         return doCat(repeat(re, n),
-               RegExp::make_iter(re));
+               RegExp::make_alt(RegExp::make_nil(), RegExp::make_iter(re)));
  }
  
  } // namespace re2c
diff --git a/re2c/src/ir/regexp/regexp.h b/re2c/src/ir/regexp/regexp.h

index 2d216753ad2bb0b81cb33a92c66b3ecd3d7d1965..9f083eca85800e31ebfee400167eec9ad6612fb8 100644 (file)
--- a/re2c/src/ir/regexp/regexp.h
+++ b/re2c/src/ir/regexp/regexp.h
@@ -18,6 +18,16 @@ struct nfa_t;
  
  typedef std::vector<uint32_t> charset_t;
  
+/* note [Kleene star is expressed in terms of plus]
+ *
+ * In the literature, Kleene star 'r*' (zero or more repetitions of 'r')
+ * is the basic operation. In practice it is more convenient to base
+ * iteration on 'r+' (one or more repetitions of 'r'): the expansion
+ * 'r+ ::= r r*' duplicates 'r', while the expansion 'r* ::= r+ | <empty>'
+ * avoids duplication. This is more efficient in general and crucial
+ * in cases when duplication of 'r' is forbidden (e.g. if 'r' has tags).
+ */
+
  struct RegExp
  {
         static free_list<RegExp*> flist;
diff --git a/re2c/src/parse/parser.ypp b/re2c/src/parse/parser.ypp

index 2d278a07f8ddeb5b1c34bbcf896f9c05da081d6c..d02b777ff2a476e52eb8154fed75da2f977fd812 100644 (file)
--- a/re2c/src/parse/parser.ypp
+++ b/re2c/src/parse/parser.ypp
@@ -363,13 +363,15 @@ factor:
                 }
         |       primary close
                 {
+                       // see note [Kleene star is expressed in terms of plus]
                         switch($2)
                         {
                         case '*':
-                               $$ = RegExp::make_iter($1);
+                               $$ = RegExp::make_alt(RegExp::make_nil(),
+                                       RegExp::make_iter($1));
                                 break;
                         case '+':
-                               $$ = RegExp::make_cat(RegExp::make_iter($1), $1);
+                               $$ = RegExp::make_iter($1);
                                 break;
                         case '?':
                                 $$ = mkAlt($1, RegExp::make_nil());
diff --git a/re2c/test/tags/iter_plus.i--tags.c b/re2c/test/tags/iter_plus.i--tags.c

new file mode 100644 (file)

index 0000000..94c1cfb
--- /dev/null
+++ b/re2c/test/tags/iter_plus.i--tags.c
@@ -0,0 +1,34 @@
+/* Generated by re2c */
+// ensure 'r+' (one or more repetitions) expansion does not duplicate 'r'
+// this is crucial if 'r' contains tags (tag duplication is forbidden)
+
+
+{
+       YYCTYPE yych;
+       long yytag0p;
+       YYCTXMARKER = YYCURSOR;
+       if (YYLIMIT <= YYCURSOR) YYFILL(1);
+       yych = *YYCURSOR;
+       switch (yych) {
+       case 'a':
+               yytag0p = (YYCURSOR - YYCTXMARKER);
+               goto yy4;
+       default:        goto yy2;
+       }
+yy2:
+       ++YYCURSOR;
+       { d }
+yy4:
+       ++YYCURSOR;
+       if (YYLIMIT <= YYCURSOR) YYFILL(1);
+       yych = *YYCURSOR;
+       switch (yych) {
+       case 'a':
+               yytag0p = (YYCURSOR - YYCTXMARKER);
+               goto yy4;
+       default:        goto yy6;
+       }
+yy6:
+       { (YYCTXMARKER + yytag0p) }
+}
+
diff --git a/re2c/test/tags/iter_plus.i--tags.re b/re2c/test/tags/iter_plus.i--tags.re

new file mode 100644 (file)

index 0000000..d7d25e7
--- /dev/null
+++ b/re2c/test/tags/iter_plus.i--tags.re
@@ -0,0 +1,7 @@
+// ensure 'r+' (one or more repetitions) expansion does not duplicate 'r'
+// this is crucial if 'r' contains tags (tag duplication is forbidden)
+
+/*!re2c
+    (@p "a")+ { @p }
+    *         { d }
+*/
author	Ulya Trofimovich <skvadrik@gmail.com>
	Mon, 26 Sep 2016 11:29:27 +0000 (12:29 +0100)
committer	Ulya Trofimovich <skvadrik@gmail.com>
	Mon, 26 Sep 2016 11:29:27 +0000 (12:29 +0100)
re2c/bootstrap/src/parse/lex.cc		patch \| blob \| history
re2c/bootstrap/src/parse/parser.cc		patch \| blob \| history
re2c/src/ir/nfa/regexps2nfa.cc		patch \| blob \| history
re2c/src/ir/regexp/nullable.cc		patch \| blob \| history
re2c/src/ir/regexp/regexp.cc		patch \| blob \| history
re2c/src/ir/regexp/regexp.h		patch \| blob \| history
re2c/src/parse/parser.ypp		patch \| blob \| history
re2c/test/tags/iter_plus.i--tags.c	[new file with mode: 0644]	patch \| blob
re2c/test/tags/iter_plus.i--tags.re	[new file with mode: 0644]	patch \| blob