]> granicus.if.org Git - re2c/commitdiff
Keep data relevant to DFA determinization outside of DFA states.
authorUlya Trofimovich <skvadrik@gmail.com>
Wed, 13 Jan 2016 08:14:38 +0000 (08:14 +0000)
committerUlya Trofimovich <skvadrik@gmail.com>
Wed, 13 Jan 2016 08:14:38 +0000 (08:14 +0000)
re2c/Makefile.am
re2c/src/ir/dfa/determinization.cc
re2c/src/ir/dfa/dfa.h
re2c/src/util/ord_hash_set.h [new file with mode: 0644]

index 353372c2de16920954a4758c6a21986a69a34ffe..c575762b1a6d19a0ad202855d8e552db8634c6de 100644 (file)
@@ -63,6 +63,7 @@ SRC_HDR = \
        src/util/forbid_copy.h \
        src/util/free_list.h \
        src/util/local_increment.h \
+       src/util/ord_hash_set.h \
        src/util/range.h \
        src/util/s_to_n32_unsafe.h \
        src/util/smart_ptr.h \
index de4d60e10b4386de16fd92a51866f0613c896c3e..8886d36313a537e016881def7a376b426f03cdd9 100644 (file)
@@ -1,8 +1,6 @@
 #include <algorithm>
 #include <assert.h>
 #include <limits>
-#include <list>
-#include <set>
 #include <string.h>
 #include <queue>
 
@@ -10,6 +8,7 @@
 #include "src/ir/nfa/nfa.h"
 #include "src/ir/regexp/regexp_rule.h"
 #include "src/ir/rule_rank.h"
+#include "src/util/ord_hash_set.h"
 #include "src/util/range.h"
 
 namespace re2c
@@ -65,94 +64,62 @@ static nfa_state_t **closure(nfa_state_t **cP, nfa_state_t *n)
        return cP;
 }
 
-static size_t findState
+// Looks up the kernel (the NFA states in [kernel, end)) in 'kernels',
+// adding it if it is not there yet, and returns its index in 'kernels'
+// (the caller stores this index as a DFA arc target).
+// An empty kernel is not stored: dfa_t::NIL is returned instead.
+// Side effects: clears 'mark' on every state in the range and sorts
+// the range [kernel, end) in place.
+static size_t find_state
        ( nfa_state_t **kernel
-       , nfa_state_t **kernel_end
-       , std::vector<dfa_state_t*> &states
-       , std::map<uintptr_t, std::list<size_t> > &kernels
+       , nfa_state_t **end
+       , ord_hash_set_t &kernels
        )
 {
-       const size_t kCount = static_cast<size_t>(kernel_end - kernel);
+       // zero-sized kernel corresponds to default state
+       if (kernel == end)
+       {
+               return dfa_t::NIL;
+       }
 
        // see note [marking DFA states]
-       for (size_t i = 0; i < kCount; ++i)
+       for (nfa_state_t **p = kernel; p != end; ++p)
        {
-               kernel[i]->mark = false;
+               (*p)->mark = false;
        }
 
        // sort kernel states: we need this to get stable hash
        // and to compare states with simple 'memcmp'
-       std::sort(kernel, kernel_end);
-
-       // get hash of the new DFA state
-       uintptr_t hash = kCount; // seed
-       for (size_t i = 0; i < kCount; ++i)
-       {
-               hash = hash ^ ((hash << 5) + (hash >> 2) + (uintptr_t)kernel[i]);
-       }
-
-       // try to find an existing DFA state identical to the new state
-       std::map<uintptr_t, std::list<size_t> >::const_iterator i = kernels.find(hash);
-       if (i != kernels.end())
-       {
-               std::list<size_t>::const_iterator
-                       j = i->second.begin(),
-                       e = i->second.end();
-               for (; j != e; ++j)
-               {
-                       const size_t k = *j;
-                       if (states[k]->kCount == kCount
-                               && memcmp(states[k]->kernel, kernel, kCount * sizeof(nfa_state_t*)) == 0)
-                       {
-                               return k;
-                       }
-               }
-       }
-
-       // no identical DFA state was found; add new state
-       dfa_state_t *s = new dfa_state_t;
-       s->kCount = kCount;
-       s->kernel = new nfa_state_t* [kCount];
-       memcpy(s->kernel, kernel, kCount * sizeof(nfa_state_t*));
-       const size_t k = states.size();
-       states.push_back(s);
-       kernels[hash].push_back(k);
-       return k;
+       std::sort(kernel, end);
+       // the set stores opaque byte strings, so the size is given in bytes
+       const size_t size = static_cast<size_t>(end - kernel) * sizeof(nfa_state_t*);
+       return kernels.insert(kernel, size);
 }
 
 dfa_t::dfa_t(const nfa_t &nfa, const charset_t &charset, rules_t &rules)
        : states()
        , nchars(charset.size() - 1) // (n + 1) bounds for n ranges
 {
-       std::map<uintptr_t, std::list<size_t> > kernels;
-       nfa_state_t **kernel = new nfa_state_t*[nfa.size];
-       std::vector<nfa_state_t*> *arcs = new std::vector<nfa_state_t*>[nchars];
+       ord_hash_set_t kernels;
+       nfa_state_t **const buffer = new nfa_state_t*[nfa.size];
+       std::vector<std::vector<nfa_state_t*> > arcs(nchars);
 
-       findState(kernel, closure(kernel, nfa.root), states, kernels);
-       for (size_t n = 0; n < states.size(); ++n)
+       find_state(buffer, closure(buffer, nfa.root), kernels);
+       for (size_t i = 0; i < kernels.size(); ++i)
        {
-               dfa_state_t *s = states[n];
-
-               for(size_t i = 0; i < nchars; ++i)
-               {
-                       arcs[i].clear();
-               }
+               dfa_state_t *s = new dfa_state_t;
+               states.push_back(s);
 
-               for (size_t k = 0; k < s->kCount; ++k)
+               nfa_state_t **kernel;
+               const size_t kernel_size = kernels.deref<nfa_state_t*>(i, kernel);
+               for (size_t j = 0; j < kernel_size; ++j)
                {
-                       nfa_state_t *n = s->kernel[k];
+                       nfa_state_t *n = kernel[j];
                        switch (n->type)
                        {
                                case nfa_state_t::RAN:
                                {
-                                       nfa_state_t *n2 = n->value.ran.out;
-                                       size_t j = 0;
+                                       nfa_state_t *m = n->value.ran.out;
+                                       size_t c = 0;
                                        for (Range *r = n->value.ran.ran; r; r = r->next ())
                                        {
-                                               for (; charset[j] != r->lower(); ++j);
-                                               for (; charset[j] != r->upper(); ++j)
+                                               for (; charset[c] != r->lower(); ++c);
+                                               for (; charset[c] != r->upper(); ++c)
                                                {
-                                                       arcs[j].push_back(n2);
+                                                       arcs[c].push_back(m);
                                                }
                                        }
                                        break;
@@ -189,25 +156,22 @@ dfa_t::dfa_t(const nfa_t &nfa, const charset_t &charset, rules_t &rules)
                }
 
                s->arcs = new size_t[nchars];
-               for(size_t i = 0; i < nchars; ++i)
+               for(size_t c = 0; c < nchars; ++c)
                {
-                       if(arcs[i].empty())
+                       nfa_state_t **end = buffer;
+                       for (std::vector<nfa_state_t*>::const_iterator j = arcs[c].begin(); j != arcs[c].end(); ++j)
                        {
-                               s->arcs[i] = NIL;
-                       }
-                       else
-                       {
-                               nfa_state_t **end = kernel;
-                               for (std::vector<nfa_state_t*>::const_iterator j = arcs[i].begin(); j != arcs[i].end(); ++j)
-                               {
-                                       end = closure(end, *j);
-                               }
-                               s->arcs[i] = findState(kernel, end, states, kernels);
+                               end = closure(end, *j);
                        }
+                       s->arcs[c] = find_state(buffer, end, kernels);
+               }
+
+               for(size_t c = 0; c < nchars; ++c)
+               {
+                       arcs[c].clear();
                }
        }
-       delete[] kernel;
-       delete[] arcs;
+       delete[] buffer;
 }
 
 dfa_t::~dfa_t()
index b148b469a160885ad52d8ee79cbaa037e91cdeb4..0f0806c9bfb9c8f7fc21b9808f54940360bd0ad2 100644 (file)
 namespace re2c
 {
 
-struct nfa_state_t;
 struct nfa_t;
 class RuleOp;
 
 struct dfa_state_t
 {
-       size_t kCount;
-       nfa_state_t **kernel;
        size_t *arcs;
        RuleOp *rule;
        bool ctx;
 
        dfa_state_t()
-               : kCount(0)
-               , kernel(NULL)
-               , arcs(NULL)
+               : arcs(NULL)
                , rule(NULL)
                , ctx(false)
        {}
        ~dfa_state_t()
        {
-               delete[] kernel;
                delete[] arcs;
        }
 
diff --git a/re2c/src/util/ord_hash_set.h b/re2c/src/util/ord_hash_set.h
new file mode 100644 (file)
index 0000000..0275e6e
--- /dev/null
@@ -0,0 +1,113 @@
+#ifndef _RE2C_UTIL_ORD_HASH_SET_
+#define _RE2C_UTIL_ORD_HASH_SET_
+
+#include "src/util/c99_stdint.h"
+#include <stddef.h> // offsetof, size_t
+#include <stdlib.h> // malloc, free
+#include <string.h> // memcmp, memcpy
+#include <algorithm> // std::for_each
+#include <map>
+#include <new> // std::bad_alloc
+#include <vector>
+
+namespace re2c
+{
+
+/*
+ * ordered hash set: a set of byte strings that remembers insertion order
+ *   - access element by index: O(1)
+ *   - insert element (find existing or add new): O(log(n)) map lookup,
+ *     plus hashing the data and walking the chain of elements that
+ *     share the same hash
+ *
+ * Elements are allocated with malloc and released in the destructor,
+ * so the set must not be copied (a copy would free the same elements
+ * twice): copy operations are declared private and left undefined.
+ */
+class ord_hash_set_t
+{
+       struct elem_t
+       {
+               elem_t *next; // next element with the same hash (or NULL)
+               size_t index; // position of this element in 'elems'
+               size_t size; // length of 'data' in bytes
+               char data[1]; // inlined array of variable length
+       };
+       typedef size_t hash_t;
+
+       std::vector<elem_t*> elems; // all elements in insertion order
+       std::map<hash_t, elem_t*> lookup; // hash -> head of the chain of elements
+
+       static hash_t hash(const void *data, size_t size);
+       elem_t *make_elem(elem_t *next, size_t index, size_t size, const void *data);
+
+       // not copyable: the set owns the raw pointers stored in 'elems'
+       ord_hash_set_t(const ord_hash_set_t&);
+       ord_hash_set_t& operator=(const ord_hash_set_t&);
+
+public:
+       ord_hash_set_t();
+       ~ord_hash_set_t();
+       size_t size() const;
+       size_t insert(const void *data, size_t size);
+       template<typename data_t> size_t deref(size_t i, data_t *&data);
+};
+
+// Hashes 'size' bytes starting at 'data'; the byte count is the seed.
+// 'inline' because the definition lives in a header: without it every
+// translation unit that includes the header emits its own definition.
+inline ord_hash_set_t::hash_t ord_hash_set_t::hash(const void *data, size_t size)
+{
+       const uint8_t *bytes = static_cast<const uint8_t*>(data);
+       hash_t h = size; // seed
+       for (size_t i = 0; i < size; ++i)
+       {
+               h = h ^ ((h << 5) + (h >> 2) + bytes[i]);
+       }
+       return h;
+}
+
+// Allocates an element holding a copy of 'size' bytes of 'data' and
+// fills in its chain link and index. The element is malloc'ed and must
+// be released with free. Throws std::bad_alloc if allocation fails.
+// 'inline' because the definition lives in a header.
+inline ord_hash_set_t::elem_t* ord_hash_set_t::make_elem(
+       elem_t *next,
+       size_t index,
+       size_t size,
+       const void *data)
+{
+       // 'data' is declared with one byte; the real payload is allocated
+       // right behind the fixed part of the struct
+       elem_t *e = static_cast<elem_t*>(malloc(offsetof(elem_t, data) + size));
+       if (!e)
+       {
+               throw std::bad_alloc();
+       }
+       e->next = next;
+       e->index = index;
+       e->size = size;
+       memcpy(e->data, data, size);
+       return e;
+}
+
+// Creates an empty set.
+// 'inline' because the definition lives in a header.
+inline ord_hash_set_t::ord_hash_set_t()
+       : elems()
+       , lookup()
+{}
+
+// Releases every element; elements are allocated with malloc in
+// make_elem, hence 'free' rather than 'delete'.
+// 'inline' because the definition lives in a header.
+inline ord_hash_set_t::~ord_hash_set_t()
+{
+       std::for_each(elems.begin(), elems.end(), free);
+}
+
+// Returns the number of distinct elements inserted so far.
+// 'inline' because the definition lives in a header.
+inline size_t ord_hash_set_t::size() const
+{
+       return elems.size();
+}
+
+// Finds the element equal to the 'size' bytes at 'data' or, if there is
+// none, appends a copy of them as a new element. Returns the element's
+// index (its position in insertion order).
+// 'inline' because the definition lives in a header.
+inline size_t ord_hash_set_t::insert(const void *data, size_t size)
+{
+       // single map lookup: an unseen hash gets an empty (NULL) chain,
+       // which the code below fills in
+       elem_t *&head = lookup[hash(data, size)];
+
+       // elements with equal hashes are chained: compare the actual bytes
+       for (elem_t *e = head; e; e = e->next)
+       {
+               if (e->size == size
+                       && memcmp(e->data, data, size) == 0)
+               {
+                       return e->index;
+               }
+       }
+
+       // not found: the new element becomes the head of its chain
+       const size_t index = elems.size();
+       head = make_elem(head, index, size, data);
+       elems.push_back(head);
+       return index;
+}
+
+// Points 'data' at the bytes of the i-th element, viewed as an array of
+// data_t, and returns how many whole data_t items they hold.
+// The index is not range-checked.
+template<typename data_t> size_t ord_hash_set_t::deref(size_t i, data_t *&data)
+{
+       elem_t *const elem = elems[i];
+       const size_t count = elem->size / sizeof(data_t);
+       data = reinterpret_cast<data_t*>(elem->data);
+       return count;
+}
+
+} // namespace re2c
+
+#endif // _RE2C_UTIL_ORD_HASH_SET_