From 7f337a2451c4df12af41527fdfb693ccace0dcbd Mon Sep 17 00:00:00 2001
From: Ulya Trofimovich <skvadrik@gmail.com>
Date: Fri, 3 Mar 2017 01:15:05 +0000
Subject: [PATCH] Delay expansion of counted repetition until NFA construction
 time.

This is necessary to avoid creating default tags for tags that
are duplicated by expansion of counted repetition. Default tags
should only be created if zero repetitions are acceptable; this case
is still expanded early.
---
 re2c/src/ir/nfa/estimate_size.cc | 15 ++++++----
 re2c/src/ir/nfa/re_to_nfa.cc     | 48 +++++++++++++++++++++++++-------
 re2c/src/ir/re/ast_to_re.cc      | 41 ++++-----------------------
 re2c/src/ir/re/default_tags.cc   |  5 +---
 re2c/src/ir/re/fixed_tags.cc     |  6 +---
 re2c/src/ir/re/nullable.cc       |  4 +--
 re2c/src/ir/re/re.h              | 29 ++++++++++---------
 re2c/src/ir/re/split_charset.cc  |  5 +---
 8 files changed, 71 insertions(+), 82 deletions(-)
diff --git a/re2c/src/ir/nfa/estimate_size.cc b/re2c/src/ir/nfa/estimate_size.cc
index 90c2d6b3..f7ed3000 100644
--- a/re2c/src/ir/nfa/estimate_size.cc
+++ b/re2c/src/ir/nfa/estimate_size.cc
@@ -16,12 +16,15 @@ static size_t estimate(const RE *re)
 		case RE::CAT:
 			return estimate(re->cat.re1)
 				+ estimate(re->cat.re2);
-		case RE::ITER:
-			return estimate(re->iter)
-				+ 1;
-		case RE::REPEAT:
-			return estimate(re->repeat.re) * (re->repeat.upto + 1)
-				+ re->repeat.upto;
+		case RE::ITER: {
+			const size_t
+				iter = estimate(re->iter.re),
+				min = re->iter.min,
+				max = re->iter.max;
+			return max == RegExp::MANY
+				? iter * min + 1
+				: iter * max + (max - min);
+		}
 	}
 }
 
diff --git a/re2c/src/ir/nfa/re_to_nfa.cc b/re2c/src/ir/nfa/re_to_nfa.cc
index 31e4c02f..8ed5b0dd 100644
--- a/re2c/src/ir/nfa/re_to_nfa.cc
+++ b/re2c/src/ir/nfa/re_to_nfa.cc
@@ -2,6 +2,26 @@
 
 namespace re2c {
 
+/*
+ * note [counted repetition and iteration expansion]
+ *
+ * It is more convenient to express zero-or-more iterations in terms of
+ * one-or-more iterations than vice versa, because the expansion 'r+ ::= r r*'
+ * duplicates 'r', while 'r* = r+ | <empty>' allows to avoid duplication.
+ *
+ * Sometimes duplcation is unavoidable, like 'r{n}' for 'n' > 1 and 'r{n,m}'
+ * for 'n' < 'm'. In such cases we duplicate 'r' together with all tags;
+ * this may cause multiple (non-bottom) occurences of the same tag in the NFA.
+ * Determinization must be careful to track multiple occurences of the same
+ * tag while building epsilon-closure (this matters for POSIX disambiguation
+ * strategy).
+ *
+ * We allow tags to apper only once in the original regular expression.
+ * This is not strictly necessary (putting the same tag in non-overlapping
+ * alternative branches may be handy), but it would allow to create very
+ * confusing regexps and the disambiguation strategy would behave strangely.
+ */
+
 static nfa_state_t *re_to_nfa(nfa_t &nfa, size_t nrule, const RE *re, nfa_state_t *t)
 {
 	nfa_state_t *s = NULL;
@@ -26,20 +46,28 @@ static nfa_state_t *re_to_nfa(nfa_t &nfa, size_t nrule, const RE *re, nfa_state_
 			s = re_to_nfa(nfa, nrule, re->cat.re1, s);
 			break;
 		case RE::ITER: {
-			// see note [Kleene star is expressed in terms of plus]
-			nfa_state_t *q = &nfa.states[nfa.size++];
-			s = re_to_nfa(nfa, nrule, re->iter, q);
-			q->make_alt(nrule, s, t);
-			break;
-		}
-		case RE::REPEAT:
-			s = re_to_nfa(nfa, nrule, re->repeat.re, t);
-			for (uint32_t i = 0; i < re->repeat.upto; ++i) {
+			const uint32_t
+				min = re->iter.min,
+				max = re->iter.max;
+			const RE *iter = re->iter.re;
+			// see note [counted repetition and iteration expansion]
+			if (max == RegExp::MANY) {
 				nfa_state_t *q = &nfa.states[nfa.size++];
+				s = re_to_nfa(nfa, nrule, iter, q);
 				q->make_alt(nrule, s, t);
-				s = re_to_nfa(nfa, nrule, re->repeat.re, q);
+			} else {
+				s = re_to_nfa(nfa, nrule, iter, t);
+				for (uint32_t i = min; i < max; ++i) {
+					nfa_state_t *q = &nfa.states[nfa.size++];
+					q->make_alt(nrule, s, t);
+					s = re_to_nfa(nfa, nrule, iter, q);
+				}
+			}
+			for (uint32_t i = 1; i < min; ++i) {
+				s = re_to_nfa(nfa, nrule, iter, s);
 			}
 			break;
+		}
 		case RE::TAG:
 			if (fixed(nfa.tags[re->tag.idx])) {
 				s = t;
diff --git a/re2c/src/ir/re/ast_to_re.cc b/re2c/src/ir/re/ast_to_re.cc
index 73276225..e5f5efd9 100644
--- a/re2c/src/ir/re/ast_to_re.cc
+++ b/re2c/src/ir/re/ast_to_re.cc
@@ -3,25 +3,6 @@
 
 namespace re2c {
 
-/* note [Kleene star is expressed in terms of plus]
- *
- * In literature Kleene star 'r*' (zero or more repetitions of 'r')
- * is the basic operation. In practice it is more convenient to use
- * 'r+' (one or more repetitions of 'r'), because expansion 'r+ ::= r r*'
- * duplicates 'r', while expansion 'r* = r+ | <empty>' allows to
- * avoid duplication. This is more efficient in general and crucial
- * in cases when duplication of 'r' is forbidden (e.g. if 'r' has tags).
- */
-
-/*
- * note [counted repetition expansion]
- *
- * r{0} ;;= <empty regexp>
- * r{n} ::= r{n-1} r
- * r{n,m} ::= r{n} (r{0} | ... | r{m-n})
- * r{n,} ::= r{n} r*
- */
-
 static RE *ast_to_re(RESpec &spec, const RegExp *ast)
 {
 	RE::alc_t &alc = spec.alc;
@@ -50,27 +31,17 @@ static RE *ast_to_re(RESpec &spec, const RegExp *ast)
 		case RegExp::ITER: {
 			const uint32_t
 				n = ast->iter.min,
-				m = ast->iter.max;
+				n1 = std::max(n, 1u),
+				m = std::max(n, ast->iter.max);
 			const RegExp *x = ast->iter.re;
 
-			if (n == 0 && m == 0) {
-				return re_nil(alc);
-			}
-
-			RE *y = ast_to_re(spec, x);
-			if (m == RegExp::MANY) {
-				y = re_iter(alc, y);
-			} else if (n == 0) {
-				y = re_repeat(alc, y, m - 1);
-			} else if (m > n) {
-				y = re_repeat(alc, y, m - n);
+			RE *y = NULL;
+			if (m > 0) {
+				y = ast_to_re(spec, x);
+				y = re_iter(alc, y, n1, m);
 			}
 			if (n == 0) {
 				y = re_alt(alc, y, re_nil(alc));
-			} else {
-				for (uint32_t i = 1; i < n; ++i) {
-					y = re_cat(alc, ast_to_re(spec, x), y);
-				}
 			}
 			return y;
 		}
diff --git a/re2c/src/ir/re/default_tags.cc b/re2c/src/ir/re/default_tags.cc
index e943500b..2253f030 100644
--- a/re2c/src/ir/re/default_tags.cc
+++ b/re2c/src/ir/re/default_tags.cc
@@ -28,10 +28,7 @@ static void insert_default_tags(RESpec &spec, RE *re, size_t &tidx)
 			insert_default_tags(spec, re->cat.re2, tidx);
 			break;
 		case RE::ITER:
-			insert_default_tags(spec, re->iter, tidx);
-			break;
-		case RE::REPEAT:
-			insert_default_tags(spec, re->repeat.re, tidx);
+			insert_default_tags(spec, re->iter.re, tidx);
 			break;
 		case RE::TAG:
 			assert(re->tag.idx == tidx);
diff --git a/re2c/src/ir/re/fixed_tags.cc b/re2c/src/ir/re/fixed_tags.cc
index 2918bd39..5f055919 100644
--- a/re2c/src/ir/re/fixed_tags.cc
+++ b/re2c/src/ir/re/fixed_tags.cc
@@ -36,11 +36,7 @@ static void find_fixed_tags(RE *re, std::vector<Tag> &tags,
 			find_fixed_tags(re->cat.re1, tags, dist, base, toplevel);
 			break;
 		case RE::ITER:
-			find_fixed_tags(re->iter, tags, dist, base, false);
-			dist = Tag::VARDIST;
-			break;
-		case RE::REPEAT:
-			find_fixed_tags(re->repeat.re, tags, dist, base, false);
+			find_fixed_tags(re->iter.re, tags, dist, base, false);
 			dist = Tag::VARDIST;
 			break;
 		case RE::TAG:
diff --git a/re2c/src/ir/re/nullable.cc b/re2c/src/ir/re/nullable.cc
index 159865dd..783e931a 100644
--- a/re2c/src/ir/re/nullable.cc
+++ b/re2c/src/ir/re/nullable.cc
@@ -11,9 +11,7 @@ static bool nullable(const RESpec &spec, const RE *re, bool &trail)
 		case RE::NIL: return true;
 		case RE::SYM: return false;
 		case RE::ITER:
-			return nullable(spec, re->iter, trail);
-		case RE::REPEAT:
-			return nullable(spec, re->repeat.re, trail);
+			return nullable(spec, re->iter.re, trail);
 		case RE::TAG:
 			trail |= spec.tags[re->tag.idx].name == NULL;
 			return true;
diff --git a/re2c/src/ir/re/re.h b/re2c/src/ir/re/re.h
index d30d06fd..e0b30c80 100644
--- a/re2c/src/ir/re/re.h
+++ b/re2c/src/ir/re/re.h
@@ -15,7 +15,7 @@ namespace re2c
 struct RE
 {
 	typedef slab_allocator_t<~0u, 4096> alc_t;
-	enum type_t {NIL, SYM, ALT, CAT, ITER, REPEAT, TAG} type;
+	enum type_t {NIL, SYM, ALT, CAT, ITER, TAG} type;
 	union {
 		const Range *sym;
 		struct {
@@ -26,11 +26,11 @@ struct RE
 			RE *re1;
 			RE *re2;
 		} cat;
-		RE *iter;
 		struct {
 			RE *re;
-			uint32_t upto;
-		} repeat;
+			uint32_t min;
+			uint32_t max;
+		} iter;
 		struct {
 			size_t idx;
 			bool bottom;
@@ -71,6 +71,9 @@ inline RE *re_sym(RE::alc_t &alc, const Range *r)
 
 inline RE *re_alt(RE::alc_t &alc, RE *x, RE *y)
 {
+	if (!x) return y;
+	if (!y) return x;
+
 	RE *z = alc.alloct<RE>(1);
 	z->type = RE::ALT;
 	z->alt.re1 = x;
@@ -80,6 +83,9 @@ inline RE *re_alt(RE::alc_t &alc, RE *x, RE *y)
 
 inline RE *re_cat(RE::alc_t &alc, RE *x, RE *y)
 {
+	if (!x) return y;
+	if (!y) return x;
+
 	RE *z = alc.alloct<RE>(1);
 	z->type = RE::CAT;
 	z->cat.re1 = x;
@@ -87,20 +93,13 @@ inline RE *re_cat(RE::alc_t &alc, RE *x, RE *y)
 	return z;
 }
 
-inline RE *re_iter(RE::alc_t &alc, RE *x)
+inline RE *re_iter(RE::alc_t &alc, RE *x, uint32_t n, uint32_t m)
 {
 	RE *y = alc.alloct<RE>(1);
 	y->type = RE::ITER;
-	y->iter = x;
-	return y;
-}
-
-inline RE *re_repeat(RE::alc_t &alc, RE *x, uint32_t n)
-{
-	RE *y = alc.alloct<RE>(1);
-	y->type = RE::REPEAT;
-	y->repeat.re = x;
-	y->repeat.upto = n;
+	y->iter.re = x;
+	y->iter.min = n;
+	y->iter.max = m;
 	return y;
 }
 
diff --git a/re2c/src/ir/re/split_charset.cc b/re2c/src/ir/re/split_charset.cc
index c64df9b9..66cd58da 100644
--- a/re2c/src/ir/re/split_charset.cc
+++ b/re2c/src/ir/re/split_charset.cc
@@ -26,10 +26,7 @@ static void split(const RE* re, std::set<uint32_t> &cs)
 			split(re->cat.re2, cs);
 			break;
 		case RE::ITER:
-			split(re->iter, cs);
-			break;
-		case RE::REPEAT:
-			split(re->repeat.re, cs);
+			split(re->iter.re, cs);
 			break;
 	}
 }
-- 
2.40.0