From 756b938077eb4e572691f8935462ecc621d4d065 Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Tue, 27 Sep 2016 13:00:23 +0100 Subject: [PATCH] Use 'std::vector' to store closure items during determinization. This makes code somewhat clearer compared to using plain C arrays: all function accept one parameter (vector) instead of two (array start, array end/array length). However, vectors have nontrivial structure and 'ord_hash_set_t' is not suitable for them. That's why instead of 'ord_hash_set_t' we now use 'lookup_t': just like 'ord_hash_set_t' it has constant random access ang logarithmic insertion complexity, but unike 'ord_hash_set_t' it is not meant to be a container (rather an index). --- re2c/Makefile.am | 2 + re2c/src/ir/dfa/closure.cc | 35 ++++++---- re2c/src/ir/dfa/closure.h | 42 ++++++++--- re2c/src/ir/dfa/determinization.cc | 60 +++++++--------- re2c/src/ir/dfa/find_state.cc | 84 ++++++++++++++++++---- re2c/src/ir/dfa/find_state.h | 21 +++++- re2c/src/util/hash32.h | 20 ++++++ re2c/src/util/lookup.h | 107 +++++++++++++++++++++++++++++ 8 files changed, 300 insertions(+), 71 deletions(-) create mode 100644 re2c/src/util/hash32.h create mode 100644 re2c/src/util/lookup.h diff --git a/re2c/Makefile.am b/re2c/Makefile.am index 625f2dfa..f8561dc5 100644 --- a/re2c/Makefile.am +++ b/re2c/Makefile.am @@ -59,8 +59,10 @@ SRC_HDR = \ src/util/counter.h \ src/util/forbid_copy.h \ src/util/free_list.h \ + src/util/hash32.h \ src/util/intersect_sorted.h \ src/util/local_increment.h \ + src/util/lookup.h \ src/util/ord_hash_set.h \ src/util/range.h \ src/util/s_to_n32_unsafe.h \ diff --git a/re2c/src/ir/dfa/closure.cc b/re2c/src/ir/dfa/closure.cc index d06e691f..540246e2 100644 --- a/re2c/src/ir/dfa/closure.cc +++ b/re2c/src/ir/dfa/closure.cc @@ -6,8 +6,17 @@ namespace re2c { +static void closure_one(closure_t &clos, nfa_state_t *n, bool *tags, bool *badtags, size_t ntags); static void merge_tags(bool *oldtags, const bool *newtags, bool *badtags, size_t ntags); +void closure(const closure_t &clos1, closure_t &clos2, bool *tags, bool *badtags, size_t ntags) +{ + clos2.clear(); + for (cclositer_t c = clos1.begin(); c != clos1.end(); ++c) { + closure_one(clos2, c->state, tags, badtags, ntags); + } +} + void merge_tags(bool *oldtags, const bool *newtags, bool *badtags, size_t ntags) { @@ -30,8 +39,7 @@ void merge_tags(bool *oldtags, const bool *newtags, * resulting tag sets: if they differ, then NFA is tagwise * ambiguous. All tags are merged together; ambiguity is reported. */ -void closure(kitem_t *const kernel, kitem_t *&kend, - nfa_state_t *n, bool *tags, bool *badtags, size_t ntags) +void closure_one(closure_t &clos, nfa_state_t *n, bool *tags, bool *badtags, size_t ntags) { // trace the first iteration of each loop: // epsilon-loops may add ney tags and reveal conflicts @@ -42,30 +50,31 @@ void closure(kitem_t *const kernel, kitem_t *&kend, ++n->loop; switch (n->type) { case nfa_state_t::ALT: - closure(kernel, kend, n->alt.out1, tags, badtags, ntags); - closure(kernel, kend, n->alt.out2, tags, badtags, ntags); + closure_one(clos, n->alt.out1, tags, badtags, ntags); + closure_one(clos, n->alt.out2, tags, badtags, ntags); break; case nfa_state_t::TAG: { const size_t t = n->tag.info; const bool old = tags[t]; tags[t] = true; - closure(kernel, kend, n->tag.out, tags, badtags, ntags); + closure_one(clos, n->tag.out, tags, badtags, ntags); tags[t] = old; break; } case nfa_state_t::RAN: case nfa_state_t::FIN: { - kitem_t *k = kernel; - while (k != kend && k->state != n) ++k; - if (k == kend) { - kend->state = n; - kend->tagptr = new bool[ntags]; - memcpy(kend->tagptr, tags, ntags * sizeof(bool)); - ++kend; + clositer_t + c = clos.begin(), + e = clos.end(); + for (; c != e && c->state != n; ++c); + if (c == e) { + bool *tagptr = new bool[ntags]; + memcpy(tagptr, tags, ntags * sizeof(bool)); + clos.push_back(clos_t(n, tagptr)); } else { // it is impossible to reach the same NFA state from // different rules, so no need to mess with masks here - merge_tags(k->tagptr, tags, badtags, ntags); + merge_tags(c->tagptr, tags, badtags, ntags); } break; } diff --git a/re2c/src/ir/dfa/closure.h b/re2c/src/ir/dfa/closure.h index d45e27ff..2d09d439 100644 --- a/re2c/src/ir/dfa/closure.h +++ b/re2c/src/ir/dfa/closure.h @@ -9,7 +9,7 @@ namespace re2c { -struct kitem_t +struct clos_t { nfa_state_t *state; union @@ -18,15 +18,41 @@ struct kitem_t size_t tagidx; }; - static bool compare(const kitem_t &k1, const kitem_t &k2) - { - return k1.state < k2.state - || (k1.state == k2.state && k1.tagidx < k2.tagidx); - } + inline clos_t(); + inline clos_t(nfa_state_t *s, size_t i); + inline clos_t(nfa_state_t *s, bool *p); + static inline bool compare(const clos_t &c1, const clos_t &c2); }; -void closure(kitem_t *const kernel, kitem_t *&kend, nfa_state_t *n, - bool *tags, bool *badtags, size_t ntags); +typedef std::vector closure_t; +typedef closure_t::iterator clositer_t; +typedef closure_t::const_iterator cclositer_t; + +void closure(const closure_t &clos1, closure_t &clos2, bool *tags, bool *badtags, size_t ntags); + +clos_t::clos_t() + : state(NULL) + , tagidx(0) +{} + +clos_t::clos_t(nfa_state_t *s, size_t i) + : state(s) + , tagidx(i) +{} + +clos_t::clos_t(nfa_state_t *s, bool *t) + : state(s) + , tagptr(t) +{} + +bool clos_t::compare(const clos_t &c1, const clos_t &c2) +{ + const nfa_state_t + *s1 = c1.state, + *s2 = c2.state; + return s1 < s2 + || (s1 == s2 && c1.tagidx < c2.tagidx); +} } // namespace re2c diff --git a/re2c/src/ir/dfa/determinization.cc b/re2c/src/ir/dfa/determinization.cc index c030e190..a3c7f8d5 100644 --- a/re2c/src/ir/dfa/determinization.cc +++ b/re2c/src/ir/dfa/determinization.cc @@ -15,7 +15,7 @@ namespace re2c { static nfa_state_t *transition(nfa_state_t *state, uint32_t symbol); -static void reach(const kitem_t *kernel, size_t kcount, kitem_t *&r, uint32_t symbol); +static void reach(const closure_t &clos1, closure_t &clos2, uint32_t symbol); const size_t dfa_t::NIL = std::numeric_limits::max(); @@ -32,16 +32,15 @@ nfa_state_t *transition(nfa_state_t *state, uint32_t symbol) return NULL; } -void reach(const kitem_t *kernel, size_t kcount, kitem_t *&r, uint32_t symbol) +void reach(const closure_t &clos1, closure_t &clos2, uint32_t symbol) { - for (size_t i = 0; i < kcount; ++i) { + clos2.clear(); + for (cclositer_t c = clos1.begin(); c != clos1.end(); ++c) { nfa_state_t - *s = kernel[i].state, - *t = transition(s, symbol); - if (t) { - r->state = t; - r->tagidx = kernel[i].tagidx; - ++r; + *s1 = c->state, + *s2 = transition(s1, symbol); + if (s2) { + clos2.push_back(clos_t(s2, c->tagidx)); } } } @@ -66,42 +65,35 @@ dfa_t::dfa_t(const nfa_t &nfa, , tags(*nfa.tags) , tagpool(*nfa.tagpool) { + clospool_t clospool; + closure_t clos1, clos2; const size_t ntags = tags.size(); const size_t nrules = rules.size(); - - ord_hash_set_t kernels; - kitem_t *rstart = new kitem_t[nfa.size], *rend = rstart; - kitem_t *kstart = new kitem_t[nfa.size], *kend = kstart; bool *ktags = new bool[ntags](); bool *badtags = new bool[ntags](); bool *arctags = new bool[ntags]; bool *mask = new bool[ntags]; bool *fin = new bool[nrules]; - closure(kstart, kend, nfa.root, ktags, badtags, ntags); - find_state(kstart, kend, kernels, tagpool); - for (size_t i = 0; i < kernels.size(); ++i) { + clos1.push_back(clos_t(nfa.root, static_cast(0))); + closure(clos1, clos2, ktags, badtags, ntags); + find_state(clos2, clospool, tagpool); + + for (size_t i = 0; i < clospool.size(); ++i) { + const closure_t &clos0 = clospool[i]; dfa_state_t *s = new dfa_state_t(nchars); states.push_back(s); - const kitem_t *kernel; - const size_t kcount = kernels.deref(i, kernel); - for (size_t c = 0; c < nchars; ++c) { - rend = rstart; - reach(kernel, kcount, rend, charset[c]); - - kend = kstart; - for (const kitem_t *r = rstart; r != rend; ++r) { - closure(kstart, kend, r->state, ktags, badtags, ntags); - } - s->arcs[c] = find_state(kstart, kend, kernels, tagpool); + reach(clos0, clos1, charset[c]); + closure(clos1, clos2, ktags, badtags, ntags); + s->arcs[c] = find_state(clos2, clospool, tagpool); memset(arctags, 0, ntags * sizeof(bool)); memset(mask, 0, ntags * sizeof(bool)); - for (const kitem_t *r = rstart; r != rend; ++r) { - merge_tags_with_mask(arctags, tagpool[r->tagidx], mask, - tagpool[rules[r->state->rule].tags], badtags, ntags); + for (cclositer_t p = clos1.begin(); p != clos1.end(); ++p) { + merge_tags_with_mask(arctags, tagpool[p->tagidx], mask, + tagpool[rules[p->state->rule].tags], badtags, ntags); } s->tags[c] = tagpool.insert(arctags); } @@ -109,10 +101,10 @@ dfa_t::dfa_t(const nfa_t &nfa, memset(fin, 0, nrules * sizeof(bool)); memset(arctags, 0, ntags * sizeof(bool)); memset(mask, 0, ntags * sizeof(bool)); - for (size_t j = 0; j < kcount; ++j) { - nfa_state_t *n = kernel[j].state; + for (cclositer_t p = clos0.begin(); p != clos0.end(); ++p) { + nfa_state_t *n = p->state; if (n->type == nfa_state_t::FIN) { - merge_tags_with_mask(arctags, tagpool[kernel[j].tagidx], mask, + merge_tags_with_mask(arctags, tagpool[p->tagidx], mask, tagpool[rules[n->rule].tags], badtags, ntags); fin[n->rule] = true; } @@ -141,8 +133,6 @@ dfa_t::dfa_t(const nfa_t &nfa, } } - delete[] rstart; - delete[] kstart; delete[] ktags; delete[] badtags; delete[] arctags; diff --git a/re2c/src/ir/dfa/find_state.cc b/re2c/src/ir/dfa/find_state.cc index c0b06c5d..da392a38 100644 --- a/re2c/src/ir/dfa/find_state.cc +++ b/re2c/src/ir/dfa/find_state.cc @@ -1,32 +1,92 @@ #include #include "src/ir/dfa/find_state.h" +#include "src/util/hash32.h" namespace re2c { -size_t find_state(kitem_t *kernel, kitem_t *kend, - ord_hash_set_t &kernels, Tagpool &tagpool) +static uint32_t hashclos(const closure_t &clos); +static bool eqclos(const closure_t *clos1, const closure_t *clos2); + +uint32_t hashclos(const closure_t &clos) +{ + uint32_t h = static_cast(clos.size()); // seed + for (cclositer_t c = clos.begin(); c != clos.end(); ++c) { + h = hash32(h, &c->state, sizeof(c->state)); + h = hash32(h, &c->tagidx, sizeof(c->tagidx)); + } + return h; +} + +bool eqclos(const closure_t *clos1, const closure_t *clos2) +{ + if (clos1->size() != clos2->size()) { + return false; + } + for (cclositer_t c1 = clos1->begin(), c2 = clos2->begin(); + c1 != clos1->end(); ++c1, ++c2) { + if (c1->state != c2->state + || c1->tagidx != c2->tagidx) { + return false; + } + } + return true; +} + +clospool_t::clospool_t(): lookup() {} + +clospool_t::~clospool_t() +{ + const size_t n = lookup.size(); + for (size_t i = 0; i < n; ++i) { + delete lookup[i]; + } +} + +size_t clospool_t::size() const { - const size_t kcount = static_cast(kend - kernel); + return lookup.size(); +} - // zero-sized kernel corresponds to default state - if (kcount == 0) { +const closure_t& clospool_t::operator[](size_t idx) const +{ + return *lookup[idx]; +} + +size_t clospool_t::insert(const closure_t &clos) +{ + const uint32_t hash = hashclos(clos); + + // try to find an identical DFA state + size_t idx = lookup.find_with(hash, &clos, eqclos); + if (idx != closlookup_t::NIL) { + return idx; + } + + // otherwise add a new state + return lookup.push(hash, new closure_t(clos)); +} + +size_t find_state(closure_t &clos, clospool_t &clospool, Tagpool &tagpool) +{ + // empty closure corresponds to default state + if (clos.empty()) { return dfa_t::NIL; } // dump tagsets to tagpool and address them by index: - // this simpifies storing and comparing kernels - for (kitem_t *k = kernel; k != kend; ++k) { - bool *tags = k->tagptr; - k->tagidx = tagpool.insert(tags); + // this simpifies storing and comparing closures + for (clositer_t c = clos.begin(); c != clos.end(); ++c) { + bool *tags = c->tagptr; + c->tagidx = tagpool.insert(tags); delete[] tags; } - // sort kernel items to allow comparison by hash and 'memcmp' - std::sort(kernel, kend, kitem_t::compare); + // sort closure to allow comparison by hash and 'memcmp' + std::sort(clos.begin(), clos.end(), clos_t::compare); - return kernels.insert(kernel, kcount * sizeof(kitem_t)); + return clospool.insert(clos); } } // namespace re2c diff --git a/re2c/src/ir/dfa/find_state.h b/re2c/src/ir/dfa/find_state.h index 18860769..6939deb7 100644 --- a/re2c/src/ir/dfa/find_state.h +++ b/re2c/src/ir/dfa/find_state.h @@ -2,13 +2,28 @@ #define _RE2C_IR_DFA_FIND_STATE_ #include "src/ir/dfa/closure.h" -#include "src/ir/tagpool.h" -#include "src/util/ord_hash_set.h" +#include "src/util/lookup.h" namespace re2c { -size_t find_state(kitem_t *kernel, kitem_t *kend, ord_hash_set_t &kernels, Tagpool &tagpool); +struct Tagpool; + +struct clospool_t +{ +private: + typedef lookup_t closlookup_t; + closlookup_t lookup; + +public: + clospool_t(); + ~clospool_t(); + size_t size() const; + const closure_t& operator[](size_t idx) const; + size_t insert(const closure_t &clos); +}; + +size_t find_state(closure_t &clos, clospool_t &clospool, Tagpool &tagpool); } // namespace re2c diff --git a/re2c/src/util/hash32.h b/re2c/src/util/hash32.h new file mode 100644 index 00000000..3ee612af --- /dev/null +++ b/re2c/src/util/hash32.h @@ -0,0 +1,20 @@ +#ifndef _RE2C_UTIL_HASH32_ +#define _RE2C_UTIL_HASH32_ + +#include "src/util/c99_stdint.h" + +namespace re2c +{ + +inline uint32_t hash32(uint32_t h, const void *data, size_t size) +{ + const uint8_t *bytes = static_cast(data); + for (size_t i = 0; i < size; ++i) { + h = h ^ ((h << 5) + (h >> 2) + bytes[i]); + } + return h; +} + +} // namespace re2c + +#endif // _RE2C_UTIL_HASH32_ diff --git a/re2c/src/util/lookup.h b/re2c/src/util/lookup.h new file mode 100644 index 00000000..10676b97 --- /dev/null +++ b/re2c/src/util/lookup.h @@ -0,0 +1,107 @@ +#ifndef _RE2C_UTIL_LOOKUP_ +#define _RE2C_UTIL_LOOKUP_ + +#include "src/util/c99_stdint.h" +#include +#include +#include +#include + +namespace re2c +{ + +/* + * O(1) random access + * O(log(n)) insertion + */ +template +struct lookup_t +{ + static const size_t NIL; + +private: + struct elem_t + { + size_t next; + data_t data; + + elem_t(size_t n, const data_t &d) + : next(n), data(d) {} + }; + + std::vector elems; + std::map lookup; + +public: + lookup_t(); + size_t size() const; + data_t& operator[](size_t idx); + const data_t& operator[](size_t idx) const; + size_t push(hash_t h, const data_t &data); + template + size_t find_with(hash_t h, const data_t &data, pred_t &pred) const; + +private: + size_t head(hash_t) const; +}; + +template +const size_t lookup_t::NIL = std::numeric_limits::max(); + +template +lookup_t::lookup_t() + : elems() + , lookup() +{} + +template +size_t lookup_t::size() const +{ + return elems.size(); +} + +template +data_t& lookup_t::operator[](size_t idx) +{ + return elems[idx].data; +} + +template +const data_t& lookup_t::operator[](size_t idx) const +{ + return elems[idx].data; +} + +template +size_t lookup_t::head(hash_t h) const +{ + typename std::map::const_iterator x = lookup.find(h); + return x == lookup.end() ? NIL : x->second; +} + +template +size_t lookup_t::push(hash_t h, const data_t &data) +{ + const size_t idx = elems.size(); + elems.push_back(elem_t(head(h), data)); + lookup[h] = idx; + return idx; +} + +template +template +size_t lookup_t::find_with(hash_t h, const data_t &data, pred_t &pred) const +{ + for (size_t i = head(h); i != NIL;) { + const elem_t &e = elems[i]; + if (pred(e.data, data)) { + return i; + } + i = e.next; + } + return NIL; +} + +} // namespace re2c + +#endif // _RE2C_UTIL_LOOKUP_ -- 2.40.0