#include <algorithm>
#include <assert.h>
#include <limits>
-#include <list>
-#include <set>
#include <string.h>
#include <queue>
#include "src/ir/nfa/nfa.h"
#include "src/ir/regexp/regexp_rule.h"
#include "src/ir/rule_rank.h"
+#include "src/util/ord_hash_set.h"
#include "src/util/range.h"
namespace re2c
return cP;
}
-static size_t findState
+static size_t find_state // returns the index that 'kernels' assigns to this kernel, or dfa_t::NIL if the kernel is empty
( nfa_state_t **kernel
- , nfa_state_t **kernel_end
- , std::vector<dfa_state_t*> &states
- , std::map<uintptr_t, std::list<size_t> > &kernels
+ , nfa_state_t **end // one past the last item of the kernel
+ , ord_hash_set_t &kernels // set of kernels seen so far; may get a new element
)
{
- const size_t kCount = static_cast<size_t>(kernel_end - kernel);
+ // zero-sized kernel corresponds to default state: it is reported as dfa_t::NIL and never stored in 'kernels'
+ if (kernel == end)
+ {
+ return dfa_t::NIL;
+ }
// see note [marking DFA states]
- for (size_t i = 0; i < kCount; ++i)
+ for (nfa_state_t **p = kernel; p != end; ++p)
{
- kernel[i]->mark = false;
+ (*p)->mark = false;
}
// sort kernel states: we need this to get stable hash
// and to compare states with simple 'memcmp'
- std::sort(kernel, kernel_end);
-
- // get hash of the new DFA state
- uintptr_t hash = kCount; // seed
- for (size_t i = 0; i < kCount; ++i)
- {
- hash = hash ^ ((hash << 5) + (hash >> 2) + (uintptr_t)kernel[i]);
- }
-
- // try to find an existing DFA state identical to the new state
- std::map<uintptr_t, std::list<size_t> >::const_iterator i = kernels.find(hash);
- if (i != kernels.end())
- {
- std::list<size_t>::const_iterator
- j = i->second.begin(),
- e = i->second.end();
- for (; j != e; ++j)
- {
- const size_t k = *j;
- if (states[k]->kCount == kCount
- && memcmp(states[k]->kernel, kernel, kCount * sizeof(nfa_state_t*)) == 0)
- {
- return k;
- }
- }
- }
-
- // no identical DFA state was found; add new state
- dfa_state_t *s = new dfa_state_t;
- s->kCount = kCount;
- s->kernel = new nfa_state_t* [kCount];
- memcpy(s->kernel, kernel, kCount * sizeof(nfa_state_t*));
- const size_t k = states.size();
- states.push_back(s);
- kernels[hash].push_back(k);
- return k;
+ std::sort(kernel, end);
+ const size_t size = static_cast<size_t>(end - kernel) * sizeof(nfa_state_t*); // kernel length in bytes, as insert() expects
+ return kernels.insert(kernel, size); // index of an identical stored kernel, or of the copy added just now
}
dfa_t::dfa_t(const nfa_t &nfa, const charset_t &charset, rules_t &rules)
: states()
, nchars(charset.size() - 1) // (n + 1) bounds for n ranges
{
- std::map<uintptr_t, std::list<size_t> > kernels;
- nfa_state_t **kernel = new nfa_state_t*[nfa.size];
- std::vector<nfa_state_t*> *arcs = new std::vector<nfa_state_t*>[nchars];
+ ord_hash_set_t kernels; // kernels found so far; kernel i belongs to states[i]
+ nfa_state_t **const buffer = new nfa_state_t*[nfa.size]; // scratch space in which one kernel at a time is assembled
+ std::vector<std::vector<nfa_state_t*> > arcs(nchars); // arcs[c]: NFA states collected for symbol c from the current kernel
- findState(kernel, closure(kernel, nfa.root), states, kernels);
- for (size_t n = 0; n < states.size(); ++n)
+ find_state(buffer, closure(buffer, nfa.root), kernels); // seed 'kernels' with the kernel built from the NFA root
+ for (size_t i = 0; i < kernels.size(); ++i) // 'kernels' can grow inside the loop (find_state below), so size() is re-read
{
- dfa_state_t *s = states[n];
-
- for(size_t i = 0; i < nchars; ++i)
- {
- arcs[i].clear();
- }
+ dfa_state_t *s = new dfa_state_t;
+ states.push_back(s); // one DFA state is created per kernel, in kernel index order
- for (size_t k = 0; k < s->kCount; ++k)
+ nfa_state_t **kernel;
+ const size_t kernel_size = kernels.deref<nfa_state_t*>(i, kernel); // number of nfa_state_t* items, not bytes
+ for (size_t j = 0; j < kernel_size; ++j)
{
- nfa_state_t *n = s->kernel[k];
+ nfa_state_t *n = kernel[j];
switch (n->type)
{
case nfa_state_t::RAN:
{
- nfa_state_t *n2 = n->value.ran.out;
- size_t j = 0;
+ nfa_state_t *m = n->value.ran.out;
+ size_t c = 0;
for (Range *r = n->value.ran.ran; r; r = r->next ())
{
- for (; charset[j] != r->lower(); ++j);
- for (; charset[j] != r->upper(); ++j)
+ for (; charset[c] != r->lower(); ++c); // advance to the charset bound equal to the range's lower bound
+ for (; charset[c] != r->upper(); ++c) // every symbol up to the range's upper bound leads to m
{
- arcs[j].push_back(n2);
+ arcs[c].push_back(m);
}
}
break;
}
s->arcs = new size_t[nchars];
- for(size_t i = 0; i < nchars; ++i)
+ for(size_t c = 0; c < nchars; ++c)
{
- if(arcs[i].empty())
+ nfa_state_t **end = buffer; // start an empty kernel in the scratch buffer
+ for (std::vector<nfa_state_t*>::const_iterator j = arcs[c].begin(); j != arcs[c].end(); ++j)
{
- s->arcs[i] = NIL;
- }
- else
- {
- nfa_state_t **end = kernel;
- for (std::vector<nfa_state_t*>::const_iterator j = arcs[i].begin(); j != arcs[i].end(); ++j)
- {
- end = closure(end, *j);
- }
- s->arcs[i] = findState(kernel, end, states, kernels);
+ end = closure(end, *j);
}
+ s->arcs[c] = find_state(buffer, end, kernels); // dfa_t::NIL when arcs[c] is empty (buffer == end)
+ }
+
+ for(size_t c = 0; c < nchars; ++c) // reset the per-symbol lists before the next kernel is processed
+ {
+ arcs[c].clear();
}
}
- delete[] kernel;
- delete[] arcs;
+ delete[] buffer;
}
dfa_t::~dfa_t()
--- /dev/null
+#ifndef _RE2C_UTIL_ORD_HASH_SET_
+#define _RE2C_UTIL_ORD_HASH_SET_
+
+#include "src/util/c99_stdint.h"
+#include <algorithm>
+#include <map>
+#include <new>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <vector>
+
+namespace re2c
+{
+
+/*
+ * ordered hash set:
+ *   - stores opaque byte strings; a newly added string gets an index
+ *     equal to the number of strings stored before it
+ *   - access element by index: O(1)
+ *   - insert element (find existing or add new): O(log(n)) map lookup,
+ *     plus hashing the data and comparing it with the stored elements
+ *     that have the same hash
+ *
+ * Elements are allocated with malloc() and released by the destructor,
+ * so a copy of the set would free the same elements twice. Copy
+ * construction and assignment are therefore declared private and are
+ * intentionally left undefined.
+ */
+class ord_hash_set_t
+{
+    struct elem_t
+    {
+        elem_t *next; // next element with the same hash, NULL at the end of the chain
+        size_t index; // position of this element in 'elems'
+        size_t size;  // length of 'data' in bytes
+        char data[1]; // inlined array of variable length
+    };
+    typedef size_t hash_t;
+
+    std::vector<elem_t*> elems;       // all elements, in insertion order
+    std::map<hash_t, elem_t*> lookup; // hash -> head of the chain of elements with that hash
+
+    static hash_t hash(const void *data, size_t size);
+    elem_t *make_elem(elem_t *next, size_t index, size_t size, const void *data);
+
+    // copying is forbidden (see the note on ownership above)
+    ord_hash_set_t(const ord_hash_set_t&);
+    ord_hash_set_t& operator=(const ord_hash_set_t&);
+
+public:
+    ord_hash_set_t();
+    ~ord_hash_set_t();
+
+    // number of distinct elements stored
+    size_t size() const;
+
+    // find an element equal to the 'size' bytes at 'data' or add a copy
+    // of them; returns the element's index
+    size_t insert(const void *data, size_t size);
+
+    // point 'data' at the stored bytes of element 'i' and return how many
+    // whole 'data_t' items they hold; 'i' must be less than size()
+    template<typename data_t> size_t deref(size_t i, data_t *&data);
+};
+
+/*
+ * Hash of the 'size' bytes at 'data': the hash is seeded with the size
+ * and then each byte is mixed in with shifts, addition and xor.
+ * Equal byte strings always produce equal hashes.
+ *
+ * Defined 'inline' because this definition lives in a header: without it
+ * every translation unit including the header would emit its own
+ * definition and linking would fail.
+ */
+inline ord_hash_set_t::hash_t ord_hash_set_t::hash(const void *data, size_t size)
+{
+    const uint8_t *bytes = static_cast<const uint8_t*>(data);
+    hash_t h = size; // seed
+    for (size_t i = 0; i < size; ++i)
+    {
+        h = h ^ ((h << 5) + (h >> 2) + bytes[i]);
+    }
+    return h;
+}
+
+/*
+ * Allocate a new element that holds a private copy of the 'size' bytes
+ * at 'data', with the given chain successor 'next' and position 'index'.
+ * The element is a single malloc() block: the fixed fields followed
+ * directly by the data bytes, so it must be released with free().
+ *
+ * Throws std::bad_alloc if the allocation fails.
+ * Defined 'inline' because this definition lives in a header.
+ */
+inline ord_hash_set_t::elem_t* ord_hash_set_t::make_elem(
+    elem_t *next,
+    size_t index,
+    size_t size,
+    const void *data)
+{
+    // header fields up to 'data', then 'size' bytes of payload
+    void *mem = malloc(offsetof(elem_t, data) + size);
+    if (!mem)
+    {
+        throw std::bad_alloc();
+    }
+
+    elem_t *e = static_cast<elem_t*>(mem);
+    e->next = next;
+    e->index = index;
+    e->size = size;
+    memcpy(e->data, data, size);
+    return e;
+}
+
+/*
+ * Construct an empty set: no elements and no hash chains.
+ * Defined 'inline' because this definition lives in a header.
+ */
+inline ord_hash_set_t::ord_hash_set_t()
+    : elems()
+    , lookup()
+{}
+
+/*
+ * Release every stored element. Elements are malloc() blocks, so they
+ * are returned with free(); 'elems' lists each element exactly once.
+ * Defined 'inline' because this definition lives in a header.
+ */
+inline ord_hash_set_t::~ord_hash_set_t()
+{
+    std::for_each(elems.begin(), elems.end(), free);
+}
+
+/*
+ * Number of distinct elements stored in the set.
+ * Defined 'inline' because this definition lives in a header.
+ */
+inline size_t ord_hash_set_t::size() const
+{
+    return elems.size();
+}
+
+/*
+ * Find the element equal to the 'size' bytes at 'data', or add a copy of
+ * them as a new element if there is none.
+ *
+ * Returns the element's index: for a new element this is the number of
+ * elements stored before the call.
+ *
+ * Defined 'inline' because this definition lives in a header.
+ */
+inline size_t ord_hash_set_t::insert(const void *data, size_t size)
+{
+    // single map lookup: operator[] creates an empty (NULL) chain for a
+    // hash that has not been seen yet, and map references stay valid
+    elem_t *&head = lookup[hash(data, size)];
+
+    // different byte strings may share a hash, so compare the contents
+    for (elem_t *e = head; e; e = e->next)
+    {
+        if (e->size == size
+            && memcmp(e->data, data, size) == 0)
+        {
+            return e->index;
+        }
+    }
+
+    // not found: the new element becomes the head of its chain
+    const size_t index = elems.size();
+    elem_t *e = make_elem(head, index, size, data);
+    elems.push_back(e);
+    head = e;
+    return index;
+}
+
+/*
+ * Give access to the bytes stored for element 'i', viewed as an array
+ * of 'data_t'. On return 'data' points at the stored bytes; the result
+ * is the number of whole 'data_t' items they contain.
+ * No range check is done on 'i'.
+ */
+template<typename data_t>
+size_t ord_hash_set_t::deref(size_t i, data_t *&data)
+{
+    elem_t &stored = *elems[i];
+    data = reinterpret_cast<data_t*>(stored.data);
+    const size_t count = stored.size / sizeof(data_t);
+    return count;
+}
+
+} // namespace re2c
+
+#endif // _RE2C_UTIL_ORD_HASH_SET_