From d5fa75187c6c0417362db70ef0b9939fb9721b78 Mon Sep 17 00:00:00 2001
From: Ulya Trofimovich <skvadrik@gmail.com>
Date: Mon, 12 Dec 2016 13:16:56 +0000
Subject: [PATCH] Search for the best candidate in case of non-bijective
 mapping.

For bijective mappings there is always at most one mapping candidate,
for non-bijective mappings there may be many candidates.
---
 re2c/src/ir/dfa/find_state.cc                 | 109 +++++++++++++++---
 re2c/src/ir/dfa/find_state.h                  |  17 +--
 re2c/src/util/lookup.h                        |  31 +++--
 ...mapping1.i--tags--dfa-mapping(injective).c |   4 +-
 4 files changed, 129 insertions(+), 32 deletions(-)

diff --git a/re2c/src/ir/dfa/find_state.cc b/re2c/src/ir/dfa/find_state.cc
index 221b9598..238def28 100644
--- a/re2c/src/ir/dfa/find_state.cc
+++ b/re2c/src/ir/dfa/find_state.cc
@@ -44,14 +44,17 @@ struct kernel_eq_t
 };
 
 mapping_t::mapping_t(Tagpool &pool)
-	: type(opts->dfa_mapping)
+	: tagpool(pool)
 	, cap(0)
 	, mem(NULL)
-	, tagpool(pool)
-	, max(0)
 	, x2t(NULL)
 	, x2y(NULL)
 	, y2x(NULL)
+
+	, type(opts->dfa_mapping)
+	, max(0)
+	, x2t_backup(NULL)
+	, x2y_backup(NULL)
 	, indeg(NULL)
 {}
 
@@ -69,25 +72,49 @@ void mapping_t::init(tagver_t v)
 		cap = max * 2; // in advance
 
 		const size_t
-			sz = static_cast<size_t>(cap),
-			sz_x2t = (2 * sz + 1) * sizeof(size_t),
-			sz_x2y = (2 * sz + 1) * sizeof(tagver_t),
-			sz_indeg = sz * sizeof(uint32_t);
+			n = static_cast<size_t>(cap),
+			n2 = 2 * n + 1,
+			sz_x2t = 2 * n2 * sizeof(size_t),
+			sz_x2y = 3 * n2 * sizeof(tagver_t),
+			sz_indeg = n * sizeof(uint32_t);
 		delete[] mem;
-		mem = new char[sz_x2t + 2 * sz_x2y + sz_indeg];
+		mem = new char[sz_x2t + sz_x2y + sz_indeg];
 
 		// point to the center (zero index) of each buffer
 		// indexes in range [-N .. N] must be valid, where N is capacity
-		x2t = cap + reinterpret_cast<size_t*>(mem);
-		x2y = cap + reinterpret_cast<tagver_t*>(mem + sz_x2t);
-		y2x = cap + reinterpret_cast<tagver_t*>(mem + sz_x2t + sz_x2y);
-		indeg = reinterpret_cast<uint32_t*>(mem + sz_x2t + 2 * sz_x2y);
+		x2t        = reinterpret_cast<size_t*>(mem) + cap;
+		x2t_backup = x2t + n2;
+		x2y        = reinterpret_cast<tagver_t*>(mem + sz_x2t) + cap;
+		x2y_backup = x2y + n2;
+		y2x        = x2y + n2 * 2;
+		indeg      = reinterpret_cast<uint32_t*>(mem + sz_x2t + sz_x2y);
 
 		// see note [topological ordering of copy commands]
 		memset(indeg, 0, sz_indeg);
 	}
 }
 
+bool mapping_t::better() const
+{
+	size_t n1 = 0, n2 = 0;
+
+	for (tagver_t x = -max; x < max; ++x) {
+		const tagver_t
+			y1 = x2y[x],
+			y2 = x2y_backup[x];
+		if (y1 != TAGVER_ZERO && y1 != x) ++n1;
+		if (y2 != TAGVER_ZERO && y2 != x) ++n2;
+	}
+
+	return n1 < n2;
+}
+
+void mapping_t::backup()
+{
+	std::swap(x2t, x2t_backup);
+	std::swap(x2y, x2y_backup);
+}
+
 /* note [mapping ignores items with lookahead tags]
  *
  * Consider two items X and Y being mapped.
@@ -196,6 +223,44 @@ const kernel_t *kernels_t::operator[](size_t idx) const
 	return lookup[idx];
 }
 
+/* note [bijective vs surjective mappings]
+ *
+ * Suppose we have just constructed a new DFA state Y and want to map it to
+ * an existing DFA state X. States must have identical sets of NFA substates
+ * and identical sets of lookahead tags for each substate. The difference is
+ * in tag versions: we need to map version of each tag in each substate of Y
+ * to the corresponging version of this tag in this substate of X.
+ *
+ * This imposes a binary relation between the set of versions {x1, ..., xN}
+ * used in X and the set of versions {y1, ... yM} used in Y, subject to the
+ * following constraints:
+ *
+ * (1) Different versions in Y must correspond to different versions in X,
+ *     otherwise information will be lost: if (x1, y1) and (x2, y2) belong
+ *     to the relation and y1 != y2, then x1 != x2. The inverse is not true:
+ *     different versions in X may correspond to the same version in Y (in
+ *     which case mapping adds redundant information to Y). Therefore this
+ *     relation must be a surjection from {x1, ..., xN} to {y1, ... yM} and
+ *     may, but doesn't have to be a bijection.
+ *
+ * (2) The order of versions with respect to priorities must be preserved:
+ *     if y1 < y2, then x1 < x2 for all (x1, y1) and (x2, y2) that belong to
+ *     relation. The inverse is less strict: if x1 < x2, then y1 <= y2, since
+ *     different X versions are allowed to map to the same Y version.
+ *
+ * Bijective mappings have a nice property: there is only one possible state
+ * X to which Y can be mapped. Indeed, if there was another state Z that
+ * can be bijectively mapped to Y preserving priorities, then Z itself can
+ * be mapped to X: both (1) and (2) are symmetrical in case of bijection
+ * and the relation is transitive. So the existence of Z is a contradiction.
+ *
+ * Non-bijective mappings do not have this property: for example, any state
+ * can be mapped to a state with only one tag version in all substates, but
+ * that does not mean that any state can be mapped to any other state.
+ * Therefore for non-bijective mappings it makes sense to search for the
+ * best mapping candidate.
+ */
+
 kernels_t::result_t kernels_t::insert(const closure_t &clos, tagver_t maxver)
 {
 	const size_t nkern = clos.size();
@@ -233,7 +298,21 @@ kernels_t::result_t kernels_t::insert(const closure_t &clos, tagver_t maxver)
 	// else try to find mappable kernel
 	mapping.init(maxver);
 	x = lookup.find_with(hash, buffer, mapping);
-	if (x != index_t::NIL) return result_t(x, &mapping, false);
+	if (x != index_t::NIL) {
+		// see note [bijective vs surjective mappings]
+		mapping.backup();
+		if (mapping.type == mapping_t::INJECTIVE) {
+			for (size_t y = x;;) {
+				y = lookup.find_next_with(y, buffer, mapping);
+				if (y == index_t::NIL) break;
+				if (mapping.better()) {
+					mapping.backup();
+					x = y;
+				}
+			}
+		}
+		return result_t(x, &mapping, false);
+	}
 
 	// otherwise add new kernel
 	x = lookup.push(hash, kernel_t::copy(*buffer));
@@ -290,10 +369,10 @@ static tcmd_t commands(const closure_t &closure, const Tagpool &tagpool,
 	} else {
 		// mapping: see note [save(X), copy(Y,X) optimization]
 		for (tagver_t x = -map->max; x < map->max; ++x) {
-			const tagver_t y = map->x2y[x];
+			const tagver_t y = map->x2y_backup[x];
 			if (y == TAGVER_ZERO || y == x) continue;
 
-			const size_t t = map->x2t[x];
+			const size_t t = map->x2t_backup[x];
 			if (cur[t] == y) {
 				save = tcpool.make_save(save, abs(x), false);
 			} else if (bot[t] == y) {
diff --git a/re2c/src/ir/dfa/find_state.h b/re2c/src/ir/dfa/find_state.h
index 68c9bbc9..ba0c949b 100644
--- a/re2c/src/ir/dfa/find_state.h
+++ b/re2c/src/ir/dfa/find_state.h
@@ -24,24 +24,27 @@ struct kernel_t
 
 struct mapping_t
 {
-	enum type_t {BIJECTIVE, INJECTIVE};
-
 private:
-	const type_t type;
+	Tagpool &tagpool;
 	tagver_t cap; // capacity (greater or equal to max)
 	char *mem;
-	Tagpool &tagpool;
-
-public:
-	tagver_t max; // maximal tag version
 	size_t *x2t;
 	tagver_t *x2y;
 	tagver_t *y2x;
+
+public:
+	enum type_t {BIJECTIVE, INJECTIVE};
+	const type_t type;
+	tagver_t max; // maximal tag version
+	size_t *x2t_backup;
+	tagver_t *x2y_backup;
 	uint32_t *indeg;
 
 	explicit mapping_t(Tagpool &pool);
 	~mapping_t();
 	void init(tagver_t v);
+	bool better() const;
+	void backup();
 	bool operator()(const kernel_t *k1, const kernel_t *k2);
 	FORBID_COPY(mapping_t);
 };
diff --git a/re2c/src/util/lookup.h b/re2c/src/util/lookup.h
index 10676b97..e649b9c3 100644
--- a/re2c/src/util/lookup.h
+++ b/re2c/src/util/lookup.h
@@ -37,12 +37,13 @@ public:
 	size_t size() const;
 	data_t& operator[](size_t idx);
 	const data_t& operator[](size_t idx) const;
-	size_t push(hash_t h, const data_t &data);
-	template<typename pred_t>
-		size_t find_with(hash_t h, const data_t &data, pred_t &pred) const;
+	size_t push(hash_t hash, const data_t &data);
+	template<typename pred_t> size_t find_with(hash_t hash, const data_t &data, pred_t &pred) const;
+	template<typename pred_t> size_t find_next_with(size_t prev, const data_t &data, pred_t &pred) const;
 
 private:
 	size_t head(hash_t) const;
+	template<typename pred_t> size_t find(size_t next, const data_t &data, pred_t &pred) const;
 };
 
 template<typename data_t, typename hash_t>
@@ -80,19 +81,19 @@ size_t lookup_t<data_t, hash_t>::head(hash_t h) const
 }
 
 template<typename data_t, typename hash_t>
-size_t lookup_t<data_t, hash_t>::push(hash_t h, const data_t &data)
+size_t lookup_t<data_t, hash_t>::push(hash_t hash, const data_t &data)
 {
 	const size_t idx = elems.size();
-	elems.push_back(elem_t(head(h), data));
-	lookup[h] = idx;
+	elems.push_back(elem_t(head(hash), data));
+	lookup[hash] = idx;
 	return idx;
 }
 
 template<typename data_t, typename hash_t>
 template<typename pred_t>
-size_t lookup_t<data_t, hash_t>::find_with(hash_t h, const data_t &data, pred_t &pred) const
+size_t lookup_t<data_t, hash_t>::find(size_t next, const data_t &data, pred_t &pred) const
 {
-	for (size_t i = head(h); i != NIL;) {
+	for (size_t i = next; i != NIL;) {
 		const elem_t &e = elems[i];
 		if (pred(e.data, data)) {
 			return i;
@@ -102,6 +103,20 @@ size_t lookup_t<data_t, hash_t>::find_with(hash_t h, const data_t &data, pred_t
 	return NIL;
 }
 
+template<typename data_t, typename hash_t>
+template<typename pred_t>
+size_t lookup_t<data_t, hash_t>::find_with(hash_t hash, const data_t &data, pred_t &pred) const
+{
+	return find(head(hash), data, pred);
+}
+
+template<typename data_t, typename hash_t>
+template<typename pred_t>
+size_t lookup_t<data_t, hash_t>::find_next_with(size_t prev, const data_t &data, pred_t &pred) const
+{
+	return find(elems[prev].next, data, pred);
+}
+
 } // namespace re2c
 
 #endif // _RE2C_UTIL_LOOKUP_
diff --git a/re2c/test/tags/mapping1.i--tags--dfa-mapping(injective).c b/re2c/test/tags/mapping1.i--tags--dfa-mapping(injective).c
index c06a105d..d11efafc 100644
--- a/re2c/test/tags/mapping1.i--tags--dfa-mapping(injective).c
+++ b/re2c/test/tags/mapping1.i--tags--dfa-mapping(injective).c
@@ -42,8 +42,8 @@ yy7:
 	yych = *++YYCURSOR;
 	switch (yych) {
 	case 'a':
-		yyt1 = yyt2 = NULL;
-		goto yy9;
+		yyt2 = NULL;
+		goto yy5;
 	case 'b':	goto yy4;
 	default:
 		yyt2 = NULL;
-- 
2.40.0