From 30cebcbb3ecbead10dd9f2a379429bf6061559ce Mon Sep 17 00:00:00 2001
From: Ulya Trofimovich <skvadrik@gmail.com>
Date: Mon, 26 Sep 2016 12:29:27 +0100
Subject: [PATCH] Base '*' (zero or more repetitions) on '+' (one or more
 repetitions).

This commit effectively reverts commit 5cf441dc936c16e2264e49038ebe9a108dc750b9
"Base '+' (one or more repetitions) on '*' (zero or more repetitions)."

It turn out, there is a good reason for using '+' as base operation:
'*' can be expressed in terms of '+' as 'r* ::= r+ | <empty>', while
'+' expands as 'r+ ::= r r*' and 'r' is duplicated.

Duplication becomes crucial in presence of tags: if the duplicated
subexpression has tags, then duplication causes an error.
---
 re2c/bootstrap/src/parse/lex.cc     |  2 +-
 re2c/bootstrap/src/parse/parser.cc  |  8 ++++---
 re2c/src/ir/nfa/regexps2nfa.cc      |  9 +++++---
 re2c/src/ir/regexp/nullable.cc      | 11 ++++------
 re2c/src/ir/regexp/regexp.cc        |  3 ++-
 re2c/src/ir/regexp/regexp.h         | 10 +++++++++
 re2c/src/parse/parser.ypp           |  6 +++--
 re2c/test/tags/iter_plus.i--tags.c  | 34 +++++++++++++++++++++++++++++
 re2c/test/tags/iter_plus.i--tags.re |  7 ++++++
 9 files changed, 73 insertions(+), 17 deletions(-)
 create mode 100644 re2c/test/tags/iter_plus.i--tags.c
 create mode 100644 re2c/test/tags/iter_plus.i--tags.re

diff --git a/re2c/bootstrap/src/parse/lex.cc b/re2c/bootstrap/src/parse/lex.cc
index a6c45443..dc23c0c7 100644
--- a/re2c/bootstrap/src/parse/lex.cc
+++ b/re2c/bootstrap/src/parse/lex.cc
@@ -1,4 +1,4 @@
-/* Generated by re2c 0.16 on Wed May 11 15:38:08 2016 */
+/* Generated by re2c 0.16 on Mon Sep 26 12:27:15 2016 */
 #line 1 "../src/parse/lex.re"
 #include "src/util/c99_stdint.h"
 #include <stddef.h>
diff --git a/re2c/bootstrap/src/parse/parser.cc b/re2c/bootstrap/src/parse/parser.cc
index fb5f17dd..40f1de55 100644
--- a/re2c/bootstrap/src/parse/parser.cc
+++ b/re2c/bootstrap/src/parse/parser.cc
@@ -559,7 +559,7 @@ static const yytype_uint16 yyrline[] =
      185,   185,   188,   197,   208,   212,   218,   224,   231,   240,
      248,   258,   269,   275,   281,   284,   291,   297,   307,   310,
      317,   321,   327,   331,   338,   342,   349,   353,   360,   364,
-     379,   398,   402,   406,   410,   417,   427,   431
+     381,   400,   404,   408,   412,   419,   429,   433
 };
 #endif
 
@@ -1790,13 +1790,15 @@ yyreduce:
   case 39:
 
     {
+			// see note [Kleene star is expressed in terms of plus]
 			switch((yyvsp[(2) - (2)].op))
 			{
 			case '*':
-				(yyval.regexp) = RegExp::make_iter((yyvsp[(1) - (2)].regexp));
+				(yyval.regexp) = RegExp::make_alt(RegExp::make_nil(),
+					RegExp::make_iter((yyvsp[(1) - (2)].regexp)));
 				break;
 			case '+':
-				(yyval.regexp) = RegExp::make_cat(RegExp::make_iter((yyvsp[(1) - (2)].regexp)), (yyvsp[(1) - (2)].regexp));
+				(yyval.regexp) = RegExp::make_iter((yyvsp[(1) - (2)].regexp));
 				break;
 			case '?':
 				(yyval.regexp) = mkAlt((yyvsp[(1) - (2)].regexp), RegExp::make_nil());
diff --git a/re2c/src/ir/nfa/regexps2nfa.cc b/re2c/src/ir/nfa/regexps2nfa.cc
index dcb911cd..d07e4cfa 100644
--- a/re2c/src/ir/nfa/regexps2nfa.cc
+++ b/re2c/src/ir/nfa/regexps2nfa.cc
@@ -24,10 +24,13 @@ static nfa_state_t *regexp2nfa(nfa_t &nfa, size_t nrule,
 			s = regexp2nfa(nfa, nrule, tagidx, re->cat.re2, t);
 			s = regexp2nfa(nfa, nrule, tagidx, re->cat.re1, s);
 			break;
-		case RegExp::ITER:
-			s = &nfa.states[nfa.size++];
-			s->make_alt(nrule, t, regexp2nfa(nfa, nrule, tagidx, re->iter, s));
+		case RegExp::ITER: {
+			// see note [Kleene star is expressed in terms of plus]
+			nfa_state_t *q = &nfa.states[nfa.size++];
+			s = regexp2nfa(nfa, nrule, tagidx, re->iter, q);
+			q->make_alt(nrule, t, s);
 			break;
+		}
 		case RegExp::TAG:
 			if ((*nfa.tags)[tagidx].type == Tag::VAR) {
 				s = &nfa.states[nfa.size++];
diff --git a/re2c/src/ir/regexp/nullable.cc b/re2c/src/ir/regexp/nullable.cc
index 3abef054..286652e3 100644
--- a/re2c/src/ir/regexp/nullable.cc
+++ b/re2c/src/ir/regexp/nullable.cc
@@ -10,24 +10,21 @@ static bool nullable(const RegExp *re, bool &trail)
 		return true;
 	}
 	switch (re->type) {
-		case RegExp::NIL:
-		case RegExp::ITER:
-			return true;
+		default: assert(false);
+		case RegExp::NIL: return true;
+		case RegExp::SYM:
+		case RegExp::ITER: return false;
 		case RegExp::TAG:
 			if (re->tag == NULL) {
 				trail = true;
 			}
 			return true;
-		case RegExp::SYM:
-			return false;
 		case RegExp::ALT:
 			return nullable(re->alt.re1, trail)
 				|| nullable(re->alt.re2, trail);
 		case RegExp::CAT:
 			return nullable(re->cat.re1, trail)
 				&& nullable(re->cat.re2, trail);
-		default:
-			assert(false);
 	}
 }
 
diff --git a/re2c/src/ir/regexp/regexp.cc b/re2c/src/ir/regexp/regexp.cc
index cab9fc3c..ca13ec3b 100644
--- a/re2c/src/ir/regexp/regexp.cc
+++ b/re2c/src/ir/regexp/regexp.cc
@@ -199,8 +199,9 @@ const RegExp *repeat_from_to(const RegExp *re, uint32_t n, uint32_t m)
 // see note [counted repetition expansion]
 const RegExp *repeat_from(const RegExp *re, uint32_t n)
 {
+	// see note [Kleene star is expressed in terms of plus]
 	return doCat(repeat(re, n),
-		RegExp::make_iter(re));
+		RegExp::make_alt(RegExp::make_nil(), RegExp::make_iter(re)));
 }
 
 } // namespace re2c
diff --git a/re2c/src/ir/regexp/regexp.h b/re2c/src/ir/regexp/regexp.h
index 2d216753..9f083eca 100644
--- a/re2c/src/ir/regexp/regexp.h
+++ b/re2c/src/ir/regexp/regexp.h
@@ -18,6 +18,16 @@ struct nfa_t;
 
 typedef std::vector<uint32_t> charset_t;
 
+/* note [Kleene star is expressed in terms of plus]
+ *
+ * In literature Kleene star 'r*' (zero or more repetitions of 'r')
+ * is the basic operation. In practice it is more convenient to use
+ * 'r+' (one or more repetitions of 'r'), because expansion 'r+ ::= r r*'
+ * duplicates 'r', while expansion 'r* = r+ | <empty>' allows to
+ * avoid duplication. This is more efficient in general and crucial
+ * in cases when duplication of 'r' is forbidden (e.g. if 'r' has tags).
+ */
+
 struct RegExp
 {
 	static free_list<RegExp*> flist;
diff --git a/re2c/src/parse/parser.ypp b/re2c/src/parse/parser.ypp
index 2d278a07..d02b777f 100644
--- a/re2c/src/parse/parser.ypp
+++ b/re2c/src/parse/parser.ypp
@@ -363,13 +363,15 @@ factor:
 		}
 	|	primary close
 		{
+			// see note [Kleene star is expressed in terms of plus]
 			switch($2)
 			{
 			case '*':
-				$$ = RegExp::make_iter($1);
+				$$ = RegExp::make_alt(RegExp::make_nil(),
+					RegExp::make_iter($1));
 				break;
 			case '+':
-				$$ = RegExp::make_cat(RegExp::make_iter($1), $1);
+				$$ = RegExp::make_iter($1);
 				break;
 			case '?':
 				$$ = mkAlt($1, RegExp::make_nil());
diff --git a/re2c/test/tags/iter_plus.i--tags.c b/re2c/test/tags/iter_plus.i--tags.c
new file mode 100644
index 00000000..94c1cfbe
--- /dev/null
+++ b/re2c/test/tags/iter_plus.i--tags.c
@@ -0,0 +1,34 @@
+/* Generated by re2c */
+// ensure 'r+' (one or more repetitions) expansion does not duplicate 'r'
+// this is crucial if 'r' contains tags (tag duplication is forbidden)
+
+
+{
+	YYCTYPE yych;
+	long yytag0p;
+	YYCTXMARKER = YYCURSOR;
+	if (YYLIMIT <= YYCURSOR) YYFILL(1);
+	yych = *YYCURSOR;
+	switch (yych) {
+	case 'a':
+		yytag0p = (YYCURSOR - YYCTXMARKER);
+		goto yy4;
+	default:	goto yy2;
+	}
+yy2:
+	++YYCURSOR;
+	{ d }
+yy4:
+	++YYCURSOR;
+	if (YYLIMIT <= YYCURSOR) YYFILL(1);
+	yych = *YYCURSOR;
+	switch (yych) {
+	case 'a':
+		yytag0p = (YYCURSOR - YYCTXMARKER);
+		goto yy4;
+	default:	goto yy6;
+	}
+yy6:
+	{ (YYCTXMARKER + yytag0p) }
+}
+
diff --git a/re2c/test/tags/iter_plus.i--tags.re b/re2c/test/tags/iter_plus.i--tags.re
new file mode 100644
index 00000000..d7d25e73
--- /dev/null
+++ b/re2c/test/tags/iter_plus.i--tags.re
@@ -0,0 +1,7 @@
+// ensure 'r+' (one or more repetitions) expansion does not duplicate 'r'
+// this is crucial if 'r' contains tags (tag duplication is forbidden)
+
+/*!re2c
+    (@p "a")+ { @p }
+    *         { d }
+*/
-- 
2.40.0