From: Ulya Trofimovich <skvadrik@gmail.com>
Date: Mon, 26 Sep 2016 16:09:18 +0000 (+0100)
Subject: Restructured determinization algorithm DFA construction in a more canonical way.
X-Git-Tag: 1.0~39^2~280
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=101bad34f8a31a21a2a919f1109e1e9857e346b0;p=re2c

Restructured determinization algorithm DFA construction in a more canonical way.

All textbooks describe determinization (powerset construction)
like this:

    S0 <- epsilon-closure of start NFA state
    add S0 to DFA
    while there is an unmarked state S:
        mark S
        for each symbol X in the alphabet:
            R <- the set of NFA states reachable from S on X
            T <- epsilon-closure of R
            if T is not in DFA:
                add T to DFA

In re2c, the inner loop over alphabet symbols was split into two parts
and somewhat obfuscated; this commit makes re2c follow the textbook
version more closely.
---

diff --git a/re2c/src/ir/dfa/determinization.cc b/re2c/src/ir/dfa/determinization.cc
index b48b07bd..c030e190 100644
--- a/re2c/src/ir/dfa/determinization.cc
+++ b/re2c/src/ir/dfa/determinization.cc
@@ -14,8 +14,38 @@
 namespace re2c
 {
 
+static nfa_state_t *transition(nfa_state_t *state, uint32_t symbol);
+static void reach(const kitem_t *kernel, size_t kcount, kitem_t *&r, uint32_t symbol);
+
 const size_t dfa_t::NIL = std::numeric_limits<size_t>::max();
 
+nfa_state_t *transition(nfa_state_t *state, uint32_t symbol) // returns the state reached from 'state' on 'symbol', or NULL if there is none
+{
+	if (state->type != nfa_state_t::RAN) { // states that are not of type RAN yield no transition on a symbol
+		return NULL;
+	}
+	for (const Range *r = state->ran.ran; r; r = r->next()) { // walk the chain of ranges attached to this state
+		if ((r->lower() <= symbol) && (symbol < r->upper())) { // half-open test: lower() is inclusive, upper() is exclusive
+			return state->ran.out; // every range of the state leads to the same target
+		}
+	}
+	return NULL; // 'symbol' is not covered by any of the state's ranges
+}
+
+void reach(const kitem_t *kernel, size_t kcount, kitem_t *&r, uint32_t symbol) // writes through 'r' one item per kernel item that has a transition on 'symbol'
+{
+	for (size_t i = 0; i < kcount; ++i) { // at most kcount items are written; no bounds check is done on 'r' here
+		nfa_state_t
+			*s = kernel[i].state,
+			*t = transition(s, symbol); // NULL when 's' has no transition on 'symbol'
+		if (t) {
+			r->state = t;
+			r->tagidx = kernel[i].tagidx; // tag index is copied unchanged from the originating kernel item
+			++r; // 'r' is a reference to the caller's pointer, so the caller observes the advanced end
+		}
+	}
+}
+
 static void merge_tags_with_mask(bool *oldtags, const bool *newtags,
 	bool *oldmask, const bool *newmask,
 	bool *badtags, size_t ntags)
@@ -38,73 +68,56 @@ dfa_t::dfa_t(const nfa_t &nfa,
 {
 	const size_t ntags = tags.size();
 	const size_t nrules = rules.size();
-	const size_t mask_size = (nchars + 1) * ntags;
 
 	ord_hash_set_t kernels;
+	kitem_t *rstart = new kitem_t[nfa.size], *rend = rstart;
 	kitem_t *kstart = new kitem_t[nfa.size], *kend = kstart;
 	bool *ktags = new bool[ntags]();
 	bool *badtags = new bool[ntags]();
-	bool *arctags = new bool[mask_size];
-	bool *mask = new bool[mask_size];
+	bool *arctags = new bool[ntags];
+	bool *mask = new bool[ntags];
 	bool *fin = new bool[nrules];
-	std::vector<nfa_state_t*> *arcs = new std::vector<nfa_state_t*>[nchars];
 
 	closure(kstart, kend, nfa.root, ktags, badtags, ntags);
 	find_state(kstart, kend, kernels, tagpool);
 	for (size_t i = 0; i < kernels.size(); ++i) {
-		memset(fin, 0, nrules * sizeof(bool));
-		memset(arctags, 0, mask_size * sizeof(bool));
-		memset(mask, 0, mask_size * sizeof(bool));
-		for(size_t c = 0; c < nchars; ++c) {
-			arcs[c].clear();
-		}
-
 		dfa_state_t *s = new dfa_state_t(nchars);
 		states.push_back(s);
 
 		const kitem_t *kernel;
 		const size_t kcount = kernels.deref<const kitem_t>(i, kernel);
-		for (size_t j = 0; j < kcount; ++j) {
-			nfa_state_t *n = kernel[j].state;
-			const bool *newtags = tagpool[kernel[j].tagidx];
-			switch (n->type) {
-				case nfa_state_t::RAN: {
-					nfa_state_t *m = n->ran.out;
-					size_t c = 0;
-					for (const Range *r = n->ran.ran; r; r = r->next ()) {
-						for (; charset[c] != r->lower(); ++c);
-						for (; charset[c] != r->upper(); ++c) {
-							merge_tags_with_mask(&arctags[c * ntags], newtags,
-								&mask[c * ntags], tagpool[rules[m->rule].tags],
-								badtags, ntags);
-							arcs[c].push_back(m);
-						}
-					}
-					break;
-				}
-				case nfa_state_t::FIN:
-					merge_tags_with_mask(&arctags[nchars * ntags], newtags,
-						&mask[nchars * ntags], tagpool[rules[n->rule].tags],
-						badtags, ntags);
-					fin[n->rule] = true;
-					break;
-				default:
-					assert(false);
-					break;
-			}
-		}
 
 		for (size_t c = 0; c < nchars; ++c) {
+			rend = rstart;
+			reach(kernel, kcount, rend, charset[c]);
+
 			kend = kstart;
-			const std::vector<nfa_state_t*> &a = arcs[c];
-			for (size_t j = 0; j < a.size(); ++j) {
-				closure(kstart, kend, a[j], ktags, badtags, ntags);
+			for (const kitem_t *r = rstart; r != rend; ++r) {
+				closure(kstart, kend, r->state, ktags, badtags, ntags);
 			}
 			s->arcs[c] = find_state(kstart, kend, kernels, tagpool);
-			s->tags[c] = tagpool.insert(&arctags[c * ntags]);
+
+			memset(arctags, 0, ntags * sizeof(bool));
+			memset(mask, 0, ntags * sizeof(bool));
+			for (const kitem_t *r = rstart; r != rend; ++r) {
+				merge_tags_with_mask(arctags, tagpool[r->tagidx], mask,
+					tagpool[rules[r->state->rule].tags], badtags, ntags);
+			}
+			s->tags[c] = tagpool.insert(arctags);
 		}
-		s->rule_tags = tagpool.insert(&arctags[nchars * ntags]);
 
+		memset(fin, 0, nrules * sizeof(bool));
+		memset(arctags, 0, ntags * sizeof(bool));
+		memset(mask, 0, ntags * sizeof(bool));
+		for (size_t j = 0; j < kcount; ++j) {
+			nfa_state_t *n = kernel[j].state;
+			if (n->type == nfa_state_t::FIN) {
+				merge_tags_with_mask(arctags, tagpool[kernel[j].tagidx], mask,
+					tagpool[rules[n->rule].tags], badtags, ntags);
+				fin[n->rule] = true;
+			}
+		}
+		s->rule_tags = tagpool.insert(arctags);
 		// choose the first rule (the one with smallest rank)
 		size_t r;
 		for (r = 0; r < nrules; ++r) {
@@ -128,13 +141,13 @@ dfa_t::dfa_t(const nfa_t &nfa,
 		}
 	}
 
+	delete[] rstart;
 	delete[] kstart;
 	delete[] ktags;
 	delete[] badtags;
 	delete[] arctags;
 	delete[] mask;
 	delete[] fin;
-	delete[] arcs;
 }
 
 dfa_t::~dfa_t()