From 150c84722952f4163ba6e37650f69c362683c85e Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Mon, 26 Sep 2016 15:35:17 +0100 Subject: [PATCH] Moved important DFA construction subroutines into separate files. --- re2c/Makefile.am | 4 ++ re2c/src/ir/dfa/closure.cc | 76 ++++++++++++++++++++ re2c/src/ir/dfa/closure.h | 33 +++++++++ re2c/src/ir/dfa/determinization.cc | 107 +---------------------------- re2c/src/ir/dfa/find_state.cc | 32 +++++++++ re2c/src/ir/dfa/find_state.h | 15 ++++ 6 files changed, 162 insertions(+), 105 deletions(-) create mode 100644 re2c/src/ir/dfa/closure.cc create mode 100644 re2c/src/ir/dfa/closure.h create mode 100644 re2c/src/ir/dfa/find_state.cc create mode 100644 re2c/src/ir/dfa/find_state.h diff --git a/re2c/Makefile.am b/re2c/Makefile.am index a09208f5..625f2dfa 100644 --- a/re2c/Makefile.am +++ b/re2c/Makefile.am @@ -22,7 +22,9 @@ SRC_HDR = \ src/conf/warn.h \ src/ir/adfa/action.h \ src/ir/adfa/adfa.h \ + src/ir/dfa/closure.h \ src/ir/dfa/dfa.h \ + src/ir/dfa/find_state.h \ src/ir/nfa/nfa.h \ src/ir/regexp/encoding/case.h \ src/ir/regexp/encoding/enc.h \ @@ -89,9 +91,11 @@ SRC = \ src/ir/nfa/regexps2nfa.cc \ src/ir/adfa/adfa.cc \ src/ir/adfa/prepare.cc \ + src/ir/dfa/closure.cc \ src/ir/dfa/dead_rules.cc \ src/ir/dfa/determinization.cc \ src/ir/dfa/fillpoints.cc \ + src/ir/dfa/find_state.cc \ src/ir/dfa/minimization.cc \ src/ir/dfa/tag_deduplication.cc \ src/ir/regexp/encoding/enc.cc \ diff --git a/re2c/src/ir/dfa/closure.cc b/re2c/src/ir/dfa/closure.cc new file mode 100644 index 00000000..d06e691f --- /dev/null +++ b/re2c/src/ir/dfa/closure.cc @@ -0,0 +1,76 @@ +#include + +#include "src/ir/dfa/closure.h" +#include "src/ir/nfa/nfa.h" + +namespace re2c +{ + +static void merge_tags(bool *oldtags, const bool *newtags, bool *badtags, size_t ntags); + +void merge_tags(bool *oldtags, const bool *newtags, + bool *badtags, size_t ntags) +{ + for (size_t i = 0; i < ntags; ++i) { + badtags[i] |= oldtags[i] ^ newtags[i]; + oldtags[i] |= newtags[i]; + } +} + +/* note [epsilon-closures in tagged NFA] + * + * DFA state is a set of NFA states. + * However, DFA state includes not all NFA states that are in + * epsilon-closure (NFA states that have only epsilon-transitions + * and are not final states are omitted). + * The included states are called 'kernel' states. + * + * For tagged NFA we have to trace all epsilon-paths to each + * kernel state, accumulate tags along the way and compare + * resulting tag sets: if they differ, then NFA is tagwise + * ambiguous. All tags are merged together; ambiguity is reported. + */ +void closure(kitem_t *const kernel, kitem_t *&kend, + nfa_state_t *n, bool *tags, bool *badtags, size_t ntags) +{ + // trace the first iteration of each loop: + // epsilon-loops may add ney tags and reveal conflicts + if (n->loop > 1) { + return; + } + + ++n->loop; + switch (n->type) { + case nfa_state_t::ALT: + closure(kernel, kend, n->alt.out1, tags, badtags, ntags); + closure(kernel, kend, n->alt.out2, tags, badtags, ntags); + break; + case nfa_state_t::TAG: { + const size_t t = n->tag.info; + const bool old = tags[t]; + tags[t] = true; + closure(kernel, kend, n->tag.out, tags, badtags, ntags); + tags[t] = old; + break; + } + case nfa_state_t::RAN: + case nfa_state_t::FIN: { + kitem_t *k = kernel; + while (k != kend && k->state != n) ++k; + if (k == kend) { + kend->state = n; + kend->tagptr = new bool[ntags]; + memcpy(kend->tagptr, tags, ntags * sizeof(bool)); + ++kend; + } else { + // it is impossible to reach the same NFA state from + // different rules, so no need to mess with masks here + merge_tags(k->tagptr, tags, badtags, ntags); + } + break; + } + } + --n->loop; +} + +} // namespace re2c diff --git a/re2c/src/ir/dfa/closure.h b/re2c/src/ir/dfa/closure.h new file mode 100644 index 00000000..d45e27ff --- /dev/null +++ b/re2c/src/ir/dfa/closure.h @@ -0,0 +1,33 @@ +#ifndef _RE2C_IR_DFA_CLOSURE_ +#define _RE2C_IR_DFA_CLOSURE_ + +#include "src/util/c99_stdint.h" + +#include "src/ir/dfa/dfa.h" +#include "src/ir/nfa/nfa.h" + +namespace re2c +{ + +struct kitem_t +{ + nfa_state_t *state; + union + { + bool *tagptr; + size_t tagidx; + }; + + static bool compare(const kitem_t &k1, const kitem_t &k2) + { + return k1.state < k2.state + || (k1.state == k2.state && k1.tagidx < k2.tagidx); + } +}; + +void closure(kitem_t *const kernel, kitem_t *&kend, nfa_state_t *n, + bool *tags, bool *badtags, size_t ntags); + +} // namespace re2c + +#endif // _RE2C_IR_DFA_CLOSURE_ diff --git a/re2c/src/ir/dfa/determinization.cc b/re2c/src/ir/dfa/determinization.cc index 6a9666eb..b48b07bd 100644 --- a/re2c/src/ir/dfa/determinization.cc +++ b/re2c/src/ir/dfa/determinization.cc @@ -4,6 +4,8 @@ #include "src/conf/warn.h" #include "src/ir/dfa/dfa.h" +#include "src/ir/dfa/closure.h" +#include "src/ir/dfa/find_state.h" #include "src/ir/nfa/nfa.h" #include "src/ir/regexp/regexp.h" #include "src/util/range.h" @@ -14,15 +16,6 @@ namespace re2c const size_t dfa_t::NIL = std::numeric_limits::max(); -static void merge_tags(bool *oldtags, const bool *newtags, - bool *badtags, size_t ntags) -{ - for (size_t i = 0; i < ntags; ++i) { - badtags[i] |= oldtags[i] ^ newtags[i]; - oldtags[i] |= newtags[i]; - } -} - static void merge_tags_with_mask(bool *oldtags, const bool *newtags, bool *oldmask, const bool *newmask, bool *badtags, size_t ntags) @@ -34,102 +27,6 @@ static void merge_tags_with_mask(bool *oldtags, const bool *newtags, } } -struct kitem_t -{ - nfa_state_t *state; - union - { - bool *tagptr; - size_t tagidx; - }; - - static bool compare(const kitem_t &k1, const kitem_t &k2) - { - return k1.state < k2.state - || (k1.state == k2.state && k1.tagidx < k2.tagidx); - } -}; - -/* note [epsilon-closures in tagged NFA] - * - * DFA state is a set of NFA states. - * However, DFA state includes not all NFA states that are in - * epsilon-closure (NFA states that have only epsilon-transitions - * and are not final states are omitted). - * The included states are called 'kernel' states. - * - * For tagged NFA we have to trace all epsilon-paths to each - * kernel state, accumulate tags along the way and compare - * resulting tag sets: if they differ, then NFA is tagwise - * ambiguous. All tags are merged together; ambiguity is reported. - */ -static void closure(kitem_t *const kernel, kitem_t *&kend, - nfa_state_t *n, bool *tags, bool *badtags, size_t ntags) -{ - // trace the first iteration of each loop: - // epsilon-loops may add ney tags and reveal conflicts - if (n->loop > 1) { - return; - } - - ++n->loop; - switch (n->type) { - case nfa_state_t::ALT: - closure(kernel, kend, n->alt.out1, tags, badtags, ntags); - closure(kernel, kend, n->alt.out2, tags, badtags, ntags); - break; - case nfa_state_t::TAG: { - const size_t t = n->tag.info; - const bool old = tags[t]; - tags[t] = true; - closure(kernel, kend, n->tag.out, tags, badtags, ntags); - tags[t] = old; - break; - } - case nfa_state_t::RAN: - case nfa_state_t::FIN: { - kitem_t *k = kernel; - while (k != kend && k->state != n) ++k; - if (k == kend) { - kend->state = n; - kend->tagptr = new bool[ntags]; - memcpy(kend->tagptr, tags, ntags * sizeof(bool)); - ++kend; - } else { - // it is impossible to reach the same NFA state from - // different rules, so no need to mess with masks here - merge_tags(k->tagptr, tags, badtags, ntags); - } - break; - } - } - --n->loop; -} - -static size_t find_state(kitem_t *kernel, kitem_t *kend, - ord_hash_set_t &kernels, Tagpool &tagpool) -{ - const size_t kcount = static_cast(kend - kernel); - - // zero-sized kernel corresponds to default state - if (kcount == 0) { - return dfa_t::NIL; - } - - // dump tagsets to tagpool and address them by index: - // this simpifies storing and comparing kernels - for (kitem_t *k = kernel; k != kend; ++k) { - bool *tags = k->tagptr; - k->tagidx = tagpool.insert(tags); - delete[] tags; - } - - // sort kernel items to allow comparison by hash and 'memcmp' - std::sort(kernel, kend, kitem_t::compare); - - return kernels.insert(kernel, kcount * sizeof(kitem_t)); -} - dfa_t::dfa_t(const nfa_t &nfa, const charset_t &charset, const std::string &cond) diff --git a/re2c/src/ir/dfa/find_state.cc b/re2c/src/ir/dfa/find_state.cc new file mode 100644 index 00000000..c0b06c5d --- /dev/null +++ b/re2c/src/ir/dfa/find_state.cc @@ -0,0 +1,32 @@ +#include + +#include "src/ir/dfa/find_state.h" + +namespace re2c +{ + +size_t find_state(kitem_t *kernel, kitem_t *kend, + ord_hash_set_t &kernels, Tagpool &tagpool) +{ + const size_t kcount = static_cast(kend - kernel); + + // zero-sized kernel corresponds to default state + if (kcount == 0) { + return dfa_t::NIL; + } + + // dump tagsets to tagpool and address them by index: + // this simpifies storing and comparing kernels + for (kitem_t *k = kernel; k != kend; ++k) { + bool *tags = k->tagptr; + k->tagidx = tagpool.insert(tags); + delete[] tags; + } + + // sort kernel items to allow comparison by hash and 'memcmp' + std::sort(kernel, kend, kitem_t::compare); + + return kernels.insert(kernel, kcount * sizeof(kitem_t)); +} + +} // namespace re2c diff --git a/re2c/src/ir/dfa/find_state.h b/re2c/src/ir/dfa/find_state.h new file mode 100644 index 00000000..18860769 --- /dev/null +++ b/re2c/src/ir/dfa/find_state.h @@ -0,0 +1,15 @@ +#ifndef _RE2C_IR_DFA_FIND_STATE_ +#define _RE2C_IR_DFA_FIND_STATE_ + +#include "src/ir/dfa/closure.h" +#include "src/ir/tagpool.h" +#include "src/util/ord_hash_set.h" + +namespace re2c +{ + +size_t find_state(kitem_t *kernel, kitem_t *kend, ord_hash_set_t &kernels, Tagpool &tagpool); + +} // namespace re2c + +#endif // _RE2C_IR_DFA_FIND_STATE_ -- 2.40.0