From: Ulya Trofimovich Date: Fri, 18 Dec 2015 12:52:17 +0000 (+0000) Subject: Changed bytecode intermediate representation to a simpler NFA representation. X-Git-Tag: 0.16~1^2~27 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=45066b3a3ab2524cfdecb03d32b4cf39993fec40;p=re2c Changed bytecode intermediate representation to a simpler NFA representation. --- diff --git a/re2c/Makefile.am b/re2c/Makefile.am index 4f0805eb..00f866a6 100644 --- a/re2c/Makefile.am +++ b/re2c/Makefile.am @@ -24,8 +24,7 @@ SRC_HDR = \ src/conf/msg.h \ src/conf/opt.h \ src/conf/warn.h \ - src/ir/bytecode/bytecode.h \ - src/ir/bytecode/ins.h \ + src/ir/bytecode/nfa.h \ src/ir/dfa/state.h \ src/ir/dfa/dfa.h \ src/ir/dfa/action.h \ @@ -46,6 +45,7 @@ SRC_HDR = \ src/ir/regexp/regexp_null.h \ src/ir/regexp/regexp.h \ src/ir/regexp/regexp_close.h \ + src/ir/compile.h \ src/ir/rule_rank.h \ src/globals.h \ src/parse/code.h \ @@ -95,11 +95,9 @@ SRC = \ src/conf/msg.cc \ src/conf/opt.cc \ src/conf/warn.cc \ - src/ir/bytecode/bytecode.cc \ - src/ir/bytecode/ins.cc \ - src/ir/bytecode/split.cc \ - src/ir/bytecode/compile.cc \ - src/ir/bytecode/calc_size.cc \ + src/ir/nfa/calc_size.cc \ + src/ir/nfa/nfa.cc \ + src/ir/nfa/split.cc \ src/ir/dfa/dfa.cc \ src/ir/regexp/display.cc \ src/ir/regexp/encoding/enc.cc \ @@ -112,6 +110,7 @@ SRC = \ src/ir/regexp/encoding/utf16/utf16_range.cc \ src/ir/regexp/fixed_length.cc \ src/ir/regexp/regexp.cc \ + src/ir/compile.cc \ src/ir/rule_rank.cc \ src/main.cc \ src/parse/code.cc \ diff --git a/re2c/bootstrap/src/parse/lex.cc b/re2c/bootstrap/src/parse/lex.cc index de3c8f65..f2d09142 100644 --- a/re2c/bootstrap/src/parse/lex.cc +++ b/re2c/bootstrap/src/parse/lex.cc @@ -1,4 +1,4 @@ -/* Generated by re2c 0.15.3 on Tue Dec 15 12:16:26 2015 */ +/* Generated by re2c 0.15.3 on Tue Dec 15 14:38:22 2015 */ #line 1 "../src/parse/lex.re" #include "src/util/c99_stdint.h" #include diff --git a/re2c/bootstrap/src/parse/parser.cc b/re2c/bootstrap/src/parse/parser.cc index 7071b3d8..f8e7c4b5 100644 --- a/re2c/bootstrap/src/parse/parser.cc +++ b/re2c/bootstrap/src/parse/parser.cc @@ -84,7 +84,7 @@ #include "src/codegen/skeleton/skeleton.h" #include "src/conf/opt.h" #include "src/globals.h" -#include "src/ir/bytecode/bytecode.h" +#include "src/ir/compile.h" #include "src/ir/dfa/dfa.h" #include "src/ir/regexp/encoding/enc.h" #include "src/ir/regexp/encoding/range_suffix.h" @@ -2406,7 +2406,7 @@ void parse(Scanner& i, Output & o) } } - dfa_map[it->first] = genCode(it->second, o, it->first, opts->encoding.nCodeUnits ()); + dfa_map[it->first] = compile(it->second, o, it->first, opts->encoding.nCodeUnits ()); } if (parseMode != Scanner::Rules && dfa_map.find(it->first) != dfa_map.end()) { @@ -2420,7 +2420,7 @@ void parse(Scanner& i, Output & o) { if (parseMode != Scanner::Reuse) { - dfa_map[""] = genCode(spec, o, "", opts->encoding.nCodeUnits ()); + dfa_map[""] = compile(spec, o, "", opts->encoding.nCodeUnits ()); } if (parseMode != Scanner::Rules && dfa_map.find("") != dfa_map.end()) { diff --git a/re2c/src/ir/bytecode/bytecode.cc b/re2c/src/ir/bytecode/bytecode.cc deleted file mode 100644 index d878a34a..00000000 --- a/re2c/src/ir/bytecode/bytecode.cc +++ /dev/null @@ -1,131 +0,0 @@ -#include -#include -#include - -#include "src/codegen/output.h" -#include "src/ir/bytecode/bytecode.h" -#include "src/ir/bytecode/ins.h" -#include "src/ir/dfa/dfa.h" -#include "src/ir/regexp/regexp.h" -#include "src/parse/spec.h" - -namespace re2c { - -static void optimize (Ins * i); - -smart_ptr genCode (Spec & spec, Output & output, const std::string & cond, uint32_t cunits) -{ - RegExp * re = spec.re; - - // The original set of code units (charset) might be very large. - // A common trick it is to split charset into disjoint character ranges - // and choose a representative of each range (we choose lower bound). - // The set of all representatives is the new (compacted) charset. - // Don't forget to include zero and upper bound, even if they - // do not explicitely apper in ranges. - std::set bounds; - re->split(bounds); - bounds.insert(0); - bounds.insert(cunits); - charset_t cs; - for (std::set::const_iterator i = bounds.begin(); i != bounds.end(); ++i) - { - cs.push_back(*i); - } - - re->calcSize(cs); - - Ins *ins = new Ins[re->size + 1]; - memset(ins, 0, (re->size + 1)*sizeof(Ins)); - const uint32_t size = re->compile(cs, ins); - Ins *eoi = &ins[size]; - eoi->i.tag = GOTO; - eoi->i.link = eoi; - - optimize(ins); - - /* - for (const Ins *inst = &ins[0]; inst < &ins[size]; ) - { - inst = showIns(std::cout, *inst, ins[0]); - } - */ - - for (uint32_t j = 0; j < size;) - { - unmark(&ins[j]); - - if (ins[j].i.tag == CHAR) - { - j = static_cast ((Ins*) ins[j].i.link - ins); - } - else - { - j++; - } - } - - smart_ptr dfa = make_smart_ptr (new DFA - ( cond - , output.source.get_block_line () - , ins - , size - , 0 - , cunits - , cs - , spec.rules - )); - - // accumulate global statistics from this particular DFA - output.max_fill = std::max (output.max_fill, dfa->max_fill); - if (dfa->need_accept) - { - output.source.set_used_yyaccept (); - } - - return dfa; -} - -void optimize (Ins * i) -{ - while (!isMarked (i)) - { - mark (i); - if (i->i.tag == CHAR) - { - i = (Ins *) i->i.link; - } - else if (i->i.tag == GOTO || i->i.tag == FORK) - { - Ins * target = (Ins *) i->i.link; - optimize (target); - if (target->i.tag == GOTO) - { - i->i.link = target->i.link == target - ? i - : target; - } - if (i->i.tag == FORK) - { - Ins * follow = (Ins *) & i[1]; - optimize (follow); - if (follow->i.tag == GOTO && follow->i.link == follow) - { - i->i.tag = GOTO; - } - else if (i->i.link == i) - { - i->i.tag = GOTO; - i->i.link = follow; - } - } - return; - } - else - { - ++i; - } - } -} - -} // namespace re2c diff --git a/re2c/src/ir/bytecode/calc_size.cc b/re2c/src/ir/bytecode/calc_size.cc deleted file mode 100644 index 5feca8cb..00000000 --- a/re2c/src/ir/bytecode/calc_size.cc +++ /dev/null @@ -1,61 +0,0 @@ -#include "src/util/c99_stdint.h" - -#include "src/ir/regexp/regexp.h" -#include "src/ir/regexp/regexp_alt.h" -#include "src/ir/regexp/regexp_cat.h" -#include "src/ir/regexp/regexp_close.h" -#include "src/ir/regexp/regexp_match.h" -#include "src/ir/regexp/regexp_null.h" -#include "src/ir/regexp/regexp_rule.h" -#include "src/util/range.h" - -namespace re2c -{ - -void AltOp::calcSize (const charset_t & cs) -{ - exp1->calcSize (cs); - exp2->calcSize (cs); - size = exp1->size + exp2->size + 2; -} - -void CatOp::calcSize (const charset_t & cs) -{ - exp1->calcSize (cs); - exp2->calcSize (cs); - size = exp1->size + exp2->size; -} - -void CloseOp::calcSize (const charset_t & cs) -{ - exp->calcSize (cs); - size = exp->size + 2; -} - -void MatchOp::calcSize (const charset_t & cs) -{ - size = 1; - uint32_t k = 0; - for (Range * r = match; r; r = r->next ()) - { - for (; cs[k] != r->lower(); ++k); - for (; cs[k] != r->upper(); ++k) - { - ++size; - } - } -} - -void NullOp::calcSize (const charset_t &) -{ - size = 0; -} - -void RuleOp::calcSize (const charset_t & cs) -{ - exp->calcSize (cs); - ctx->calcSize (cs); - size = exp->size + (ctx->size ? ctx->size + 2 : 1); -} - -} // end namespace re2c diff --git a/re2c/src/ir/bytecode/compile.cc b/re2c/src/ir/bytecode/compile.cc deleted file mode 100644 index 67f7c36e..00000000 --- a/re2c/src/ir/bytecode/compile.cc +++ /dev/null @@ -1,220 +0,0 @@ -#include "src/util/c99_stdint.h" - -#include "src/ir/bytecode/ins.h" -#include "src/ir/regexp/regexp.h" -#include "src/ir/regexp/regexp_alt.h" -#include "src/ir/regexp/regexp_cat.h" -#include "src/ir/regexp/regexp_close.h" -#include "src/ir/regexp/regexp_match.h" -#include "src/ir/regexp/regexp_null.h" -#include "src/ir/regexp/regexp_rule.h" -#include "src/util/range.h" - -namespace re2c -{ - -static uint32_t compile_goto (Ins * ins, Ins * i); - -uint32_t AltOp::compile (const charset_t & cs, Ins * i) -{ - if (ins_cache) - { - return compile_goto (ins_cache, i); - } - else - { - ins_cache = i; - - i->i.tag = FORK; - const uint32_t sz1 = exp1->compile (cs, &i[1]); - Ins * const j = &i[sz1 + 1]; - i->i.link = &j[1]; - j->i.tag = GOTO; - const uint32_t sz2 = exp2->compile (cs, &j[1]); - j->i.link = &j[sz2 + 1]; - - if (ins_access == PRIVATE) - { - decompile (); - } - - return sz1 + sz2 + 2; - } -} - -void AltOp::decompile () -{ - if (ins_cache) - { - exp1->decompile (); - exp2->decompile (); - ins_cache = NULL; - } -} - -uint32_t CatOp::compile (const charset_t & cs, Ins * i) -{ - if (ins_cache) - { - return compile_goto (ins_cache, i); - } - else - { - ins_cache = i; - - const uint32_t sz1 = exp1->compile (cs, &i[0]); - const uint32_t sz2 = exp2->compile (cs, &i[sz1]); - - if (ins_access == PRIVATE) - { - decompile (); - } - - return sz1 + sz2; - } -} - -void CatOp::decompile () -{ - if (ins_cache) - { - exp1->decompile (); - exp2->decompile (); - ins_cache = NULL; - } -} - -uint32_t CloseOp::compile (const charset_t & cs, Ins * i) -{ - if (ins_cache) - { - return compile_goto (ins_cache, i); - } - else - { - ins_cache = i; - - i->i.tag = FORK; - ++i; - i += exp->compile (cs, i); - i->i.tag = GOTO; - i->i.link = ins_cache; - ++i; - ins_cache->i.link = i; - - const uint32_t sz = static_cast (i - ins_cache); - if (ins_access == PRIVATE) - { - decompile (); - } - - return sz; - } -} - -void CloseOp::decompile () -{ - if (ins_cache) - { - exp->decompile (); - ins_cache = NULL; - } -} - -uint32_t MatchOp::compile (const charset_t & cs, Ins * i) -{ - if (ins_cache) - { - return compile_goto (ins_cache, i); - } - else - { - ins_cache = i; - - i->i.tag = CHAR; - i->i.link = &i[size]; - Ins *j = &i[1]; - uint32_t bump = size; - uint32_t k = 0; - for (Range *r = match; r; r = r->next ()) - { - for (; cs[k] != r->lower(); ++k); - for (; cs[k] != r->upper(); ++k) - { - j->c.value = k; - j->c.bump = --bump; - j++; - } - } - - if (ins_access == PRIVATE) - { - decompile (); - } - - return size; - } -} - -void MatchOp::decompile () -{ - ins_cache = NULL; -} - -uint32_t NullOp::compile (const charset_t &, Ins *) -{ - return 0; -} - -void NullOp::decompile () {} - -uint32_t RuleOp::compile (const charset_t & cs, Ins * i) -{ - if (ins_cache) - { - return compile_goto (ins_cache, i); - } - else - { - ins_cache = i; - - i += exp->compile (cs, &i[0]); - if (ctx->size) - { - i->i.tag = CTXT; - i->i.link = &i[1]; - ++i; - i += ctx->compile (cs, &i[0]); - } - i->i.tag = TERM; - i->i.link = this; - ++i; - const uint32_t sz = static_cast (i - ins_cache); - - if (ins_access == PRIVATE) - { - decompile (); - } - - return sz; - } -} - -void RuleOp::decompile () -{ - if (ins_cache) - { - exp->decompile (); - ctx->decompile (); - ins_cache = NULL; - } -} - -uint32_t compile_goto (Ins * ins, Ins * i) -{ - i->i.tag = GOTO; - i->i.link = ins; - return 1; -} - -} // end namespace re2c diff --git a/re2c/src/ir/bytecode/ins.cc b/re2c/src/ir/bytecode/ins.cc deleted file mode 100644 index d153520c..00000000 --- a/re2c/src/ir/bytecode/ins.cc +++ /dev/null @@ -1,42 +0,0 @@ -#include - -#include "src/ir/bytecode/ins.h" -#include "src/ir/regexp/regexp_rule.h" -#include "src/ir/rule_rank.h" - -namespace re2c { - -const Ins * showIns (std::ostream & o, const Ins & i, const Ins & base) -{ - o.width (3); - o << &i - &base << ": "; - const Ins * ret = &(&i)[1]; - switch (i.i.tag) - { - case CHAR: - { - o << "match "; - for (; ret < (Ins *) i.i.link; ++ret) - { - o << "\\x" << std::hex << ret->c.value; - } - break; - } - case GOTO: - o << "goto " << ((Ins *) i.i.link - &base); - break; - case FORK: - o << "fork " << ((Ins *) i.i.link - &base); - break; - case CTXT: - o << "ctxt"; - break; - case TERM: - o << "term " << ((RuleOp *) i.i.link)->rank; - break; - } - o << "\n"; - return ret; -} - -} // namespace re2c diff --git a/re2c/src/ir/bytecode/ins.h b/re2c/src/ir/bytecode/ins.h deleted file mode 100644 index 4ec14fd1..00000000 --- a/re2c/src/ir/bytecode/ins.h +++ /dev/null @@ -1,51 +0,0 @@ -#ifndef _RE2C_IR_BYTECODE_INS_ -#define _RE2C_IR_BYTECODE_INS_ - -#include "src/util/c99_stdint.h" -#include - -namespace re2c -{ - -static const uint32_t CHAR = 0; -static const uint32_t GOTO = 1; -static const uint32_t FORK = 2; -static const uint32_t TERM = 3; -static const uint32_t CTXT = 4; - -union Ins -{ - struct - { - uint8_t tag; - uint8_t marked; - void * link; - } i; - struct - { - uint32_t value; - uint32_t bump; - void * link; - } c; -}; - -inline bool isMarked (Ins * i) -{ - return i->i.marked != 0; -} - -inline void mark (Ins * i) -{ - i->i.marked = true; -} - -inline void unmark (Ins * i) -{ - i->i.marked = false; -} - -const Ins * showIns (std::ostream & o, const Ins & i, const Ins & base); - -} // namespace re2c - -#endif // _RE2C_IR_BYTECODE_INS_ diff --git a/re2c/src/ir/compile.cc b/re2c/src/ir/compile.cc new file mode 100644 index 00000000..914f1583 --- /dev/null +++ b/re2c/src/ir/compile.cc @@ -0,0 +1,54 @@ +#include + +#include "src/codegen/output.h" +#include "src/ir/compile.h" +#include "src/ir/dfa/dfa.h" +#include "src/ir/nfa/nfa.h" +#include "src/ir/regexp/regexp.h" +#include "src/parse/spec.h" + +namespace re2c { + +smart_ptr compile (Spec & spec, Output & output, const std::string & cond, uint32_t cunits) +{ + RegExp * re = spec.re; + + // The original set of code units (charset) might be very large. + // A common trick it is to split charset into disjoint character ranges + // and choose a representative of each range (we choose lower bound). + // The set of all representatives is the new (compacted) charset. + // Don't forget to include zero and upper bound, even if they + // do not explicitely apper in ranges. + std::set bounds; + re->split(bounds); + bounds.insert(0); + bounds.insert(cunits); + charset_t cs; + for (std::set::const_iterator i = bounds.begin(); i != bounds.end(); ++i) + { + cs.push_back(*i); + } + + nfa_t nfa(re); + + smart_ptr dfa = make_smart_ptr (new DFA + ( cond + , output.source.get_block_line () + , 0 + , cunits + , cs + , spec.rules + , nfa + )); + + // accumulate global statistics from this particular DFA + output.max_fill = std::max (output.max_fill, dfa->max_fill); + if (dfa->need_accept) + { + output.source.set_used_yyaccept (); + } + + return dfa; +} + +} // namespace re2c diff --git a/re2c/src/ir/bytecode/bytecode.h b/re2c/src/ir/compile.h similarity index 53% rename from re2c/src/ir/bytecode/bytecode.h rename to re2c/src/ir/compile.h index 67f7c731..6883c1c3 100644 --- a/re2c/src/ir/bytecode/bytecode.h +++ b/re2c/src/ir/compile.h @@ -1,5 +1,5 @@ -#ifndef _RE2C_IR_BYTECODE_BYTECODE_ -#define _RE2C_IR_BYTECODE_BYTECODE_ +#ifndef _RE2C_IR_COMPILE_ +#define _RE2C_IR_COMPILE_ #include "src/util/c99_stdint.h" #include @@ -13,8 +13,8 @@ class DFA; struct Output; struct Spec; -smart_ptr genCode (Spec & spec, Output & output, const std::string & cond, uint32_t cunits); +smart_ptr compile (Spec & spec, Output & output, const std::string & cond, uint32_t cunits); } // namespace re2c -#endif // _RE2C_IR_BYTECODE_BYTECODE_ +#endif // _RE2C_IR_COMPILE_ diff --git a/re2c/src/ir/dfa/dfa.cc b/re2c/src/ir/dfa/dfa.cc index edf588ca..085ed6a5 100644 --- a/re2c/src/ir/dfa/dfa.cc +++ b/re2c/src/ir/dfa/dfa.cc @@ -1,40 +1,40 @@ #include -#include -#include +#include #include +#include #include -#include #include "src/codegen/go.h" #include "src/codegen/skeleton/skeleton.h" #include "src/ir/dfa/dfa.h" -#include "src/ir/bytecode/ins.h" +#include "src/ir/nfa/nfa.h" #include "src/ir/dfa/state.h" #include "src/ir/regexp/regexp_rule.h" #include "src/ir/rule_rank.h" #include "src/util/allocate.h" +#include "src/util/range.h" namespace re2c { -static Ins **closure(Ins **cP, Ins *i) +static nfa_state_t **closure(nfa_state_t **cP, nfa_state_t *n) { - while (!isMarked(i)) + if (!n->mark) { - mark(i); - *(cP++) = i; - - if (i->i.tag == FORK) - { - cP = closure(cP, i + 1); - i = (Ins*) i->i.link; - } - else if (i->i.tag == GOTO || i->i.tag == CTXT) + n->mark = true; + *(cP++) = n; + switch (n->type) { - i = (Ins*) i->i.link; + case nfa_state_t::ALT: + cP = closure(cP, n->value.alt.out2); + cP = closure(cP, n->value.alt.out1); + break; + case nfa_state_t::CTX: + cP = closure(cP, n->value.ctx.out); + break; + default: + break; } - else - break; } return cP; @@ -43,12 +43,11 @@ static Ins **closure(Ins **cP, Ins *i) DFA::DFA ( const std::string & c , uint32_t l - , Ins * ins - , uint32_t ni , uint32_t lb , uint32_t ub , const charset_t & cs - , rules_t rules + , rules_t & rules + , nfa_t & nfa ) : accepts () , skeleton (NULL) @@ -61,7 +60,6 @@ DFA::DFA , head(NULL) , tail(&head) , toDo(NULL) - , free_ins(ins) // statistics , max_fill (0) @@ -78,11 +76,12 @@ DFA::DFA name += cond; } - Ins **work = new Ins * [ni + 1]; - findState(work, closure(work, &ins[0])); - const size_t nc = cs.size() - 1; // (n + 1) bounds for n ranges - void **goTo = new void*[nc]; + + nfa_state_t **work = new nfa_state_t* [nfa.size]; + findState(work, closure(work, nfa.root)); + + std::vector *go = new std::vector[nc]; Span *span = allocate (nc); while (toDo) @@ -90,66 +89,84 @@ DFA::DFA State *s = toDo; toDo = s->link; - memset(goTo, 0, nc * sizeof(void*)); + for(uint32_t i = 0; i < nc; ++i) + { + go[i].clear(); + } + memset(span, 0, sizeof(Span)*nc); s->rule = NULL; for (uint32_t k = 0; k < s->kCount; ++k) { - Ins * i = s->kernel[k]; - if (i->i.tag == CHAR) + nfa_state_t *n = s->kernel[k]; + switch (n->type) { - for (Ins *j = i + 1; j < (Ins*) i->i.link; ++j) +// case nfa_state_t::CHR: +// go[n->value.chr.chr].push_back(n->value.chr.out); +// break; + case nfa_state_t::RAN: { - j->c.link = goTo[j->c.value]; - goTo[j->c.value] = j; - } - } - else if (i->i.tag == TERM) - { - RuleOp * rule = static_cast (i->i.link); - if (!s->rule) - { - s->rule = rule; + nfa_state_t *n2 = n->value.ran.out; + uint32_t j = 0; + for (Range *r = n->value.ran.ran; r; r = r->next ()) + { + for (; cs[j] != r->lower(); ++j); + for (; cs[j] != r->upper(); ++j) + { + go[j].push_back(n2); + } + } + break; } - else + case nfa_state_t::CTX: + s->isPreCtxt = true; + break; + case nfa_state_t::FIN: { - const rule_rank_t r1 = s->rule->rank; - const rule_rank_t r2 = rule->rank; - if (r2 < r1) + RuleOp *rule = n->value.fin.rule; + if (!s->rule) { - rules[r1].shadow.insert (r2); s->rule = rule; } - else if (r1 < r2) + else { - rules[r2].shadow.insert (r1); + const rule_rank_t r1 = s->rule->rank; + const rule_rank_t r2 = rule->rank; + if (r2 < r1) + { + rules[r1].shadow.insert (r2); + s->rule = rule; + } + else if (r1 < r2) + { + rules[r2].shadow.insert (r1); + } } + break; } - } - else if (i->i.tag == CTXT) - { - s->isPreCtxt = true; + default: + break; } } - for (uint32_t j = 0; j < nc; ++j) + for(uint32_t i = 0; i < nc; ++i) { - if (goTo[j]) + if(!go[i].empty()) { - Ins **cP = work; - for (Ins *i = (Ins*)goTo[j]; i; i = (Ins*) i->c.link) + nfa_state_t **cP = work; + for (std::vector::const_iterator j = go[i].begin(); j != go[i].end(); ++j) { - cP = closure(cP, i + i->c.bump); + cP = closure(cP, *j); } - goTo[j] = findState(work, cP); + span[i].to = findState(work, cP); } } s->go.nSpans = 0; for (uint32_t j = 0; j < nc;) { - State *to = (State*) goTo[j]; - while (++j < nc && goTo[j] == to) ; + State *to = span[j].to; + while (++j < nc && span[j].to == to) ; span[s->go.nSpans].ub = cs[j]; span[s->go.nSpans].to = to; s->go.nSpans++; @@ -159,7 +176,7 @@ DFA::DFA } delete [] work; - delete [] goTo; + delete [] go; operator delete (span); /* @@ -238,7 +255,6 @@ DFA::~DFA() head = s->next; delete s; } - delete [] free_ins; delete skeleton; } @@ -253,19 +269,23 @@ void DFA::addState(State **a, State *s) tail = &s->next; } -State *DFA::findState(Ins **kernel, Ins ** kernel_end) +State *DFA::findState(nfa_state_t **kernel, nfa_state_t ** kernel_end) { uint32_t kCount = 0; - for (Ins ** i = kernel; i < kernel_end; ++i) + for (nfa_state_t ** pn = kernel; pn < kernel_end; ++pn) { - Ins * ins = *i; - if (ins->i.tag == CHAR || ins->i.tag == TERM || ins->i.tag == CTXT) - { - kernel[kCount++] = ins; - } - else + nfa_state_t *n = *pn; + switch (n->type) { - unmark (ins); +// case nfa_state_t::CHR: + case nfa_state_t::RAN: + case nfa_state_t::CTX: + case nfa_state_t::FIN: + kernel[kCount++] = n; + break; + default: + n->mark = false; + break; } } @@ -277,7 +297,7 @@ State *DFA::findState(Ins **kernel, Ins ** kernel_end) bool marked = true; for (uint32_t i = 0; marked && i < s->kCount; ++i) { - marked = isMarked (s->kernel[i]); + marked = s->kernel[i]->mark; } if (marked) { @@ -291,15 +311,15 @@ State *DFA::findState(Ins **kernel, Ins ** kernel_end) s = new State; addState(tail, s); s->kCount = kCount; - s->kernel = new Ins * [kCount]; - memcpy(s->kernel, kernel, kCount * sizeof (Ins *)); + s->kernel = new nfa_state_t* [kCount]; + memcpy(s->kernel, kernel, kCount * sizeof(nfa_state_t*)); s->link = toDo; toDo = s; } for (uint32_t i = 0; i < kCount; ++i) { - unmark (kernel[i]); + kernel[i]->mark = false; } return s; diff --git a/re2c/src/ir/dfa/dfa.h b/re2c/src/ir/dfa/dfa.h index 9c0b886b..e32a5f3a 100644 --- a/re2c/src/ir/dfa/dfa.h +++ b/re2c/src/ir/dfa/dfa.h @@ -19,6 +19,8 @@ class label_t; struct Output; struct OutputFile; union Ins; +struct nfa_t; +struct nfa_state_t; class DFA { @@ -36,7 +38,6 @@ public: State * head; State ** tail; State * toDo; - const Ins * free_ins; // statistics uint32_t max_fill; @@ -48,19 +49,18 @@ public: DFA ( const std::string & , uint32_t - , Ins * - , uint32_t , uint32_t , uint32_t , const charset_t & - , rules_t + , rules_t & + , nfa_t & ); ~DFA (); void emit (Output &, uint32_t &, bool, bool &); private: void addState (State **, State *); - State * findState (Ins **, Ins **); + State * findState (nfa_state_t **, nfa_state_t **); void reorder(); void split (State *); void findSCCs (); diff --git a/re2c/src/ir/dfa/state.h b/re2c/src/ir/dfa/state.h index 41af7dfd..4bb445af 100644 --- a/re2c/src/ir/dfa/state.h +++ b/re2c/src/ir/dfa/state.h @@ -9,6 +9,8 @@ namespace re2c { +struct nfa_state_t; + class State { public: @@ -18,7 +20,7 @@ public: State * link; uint32_t depth; // for finding SCCs uint32_t kCount; - Ins ** kernel; + nfa_state_t ** kernel; bool isPreCtxt; bool isBase; diff --git a/re2c/src/ir/nfa/calc_size.cc b/re2c/src/ir/nfa/calc_size.cc new file mode 100644 index 00000000..27c31188 --- /dev/null +++ b/re2c/src/ir/nfa/calc_size.cc @@ -0,0 +1,51 @@ +#include "src/util/c99_stdint.h" + +#include "src/ir/regexp/regexp.h" +#include "src/ir/regexp/regexp_alt.h" +#include "src/ir/regexp/regexp_cat.h" +#include "src/ir/regexp/regexp_close.h" +#include "src/ir/regexp/regexp_match.h" +#include "src/ir/regexp/regexp_null.h" +#include "src/ir/regexp/regexp_rule.h" +#include "src/util/range.h" + +namespace re2c +{ + +uint32_t AltOp::calc_size() const +{ + return exp1->calc_size() + + exp2->calc_size() + + 1; +} + +uint32_t CatOp::calc_size() const +{ + return exp1->calc_size() + + exp2->calc_size(); +} + +uint32_t CloseOp::calc_size() const +{ + return exp->calc_size() + 1; +} + +uint32_t MatchOp::calc_size() const +{ + return 1; +} + +uint32_t NullOp::calc_size() const +{ + return 0; +} + +uint32_t RuleOp::calc_size() const +{ + const uint32_t n = ctx->calc_size(); + return exp->calc_size() + + (n > 0 ? n + 1 : 0) + + 1; +} + +} // end namespace re2c diff --git a/re2c/src/ir/nfa/nfa.cc b/re2c/src/ir/nfa/nfa.cc new file mode 100644 index 00000000..c7b4ea9e --- /dev/null +++ b/re2c/src/ir/nfa/nfa.cc @@ -0,0 +1,188 @@ +#include "src/ir/nfa/nfa.h" +#include "src/ir/regexp/regexp.h" +#include "src/ir/regexp/regexp_alt.h" +#include "src/ir/regexp/regexp_cat.h" +#include "src/ir/regexp/regexp_close.h" +#include "src/ir/regexp/regexp_match.h" +#include "src/ir/regexp/regexp_null.h" +#include "src/ir/regexp/regexp_rule.h" + +namespace re2c { + +nfa_t::nfa_t(RegExp *re) + : max_size(re->calc_size()) + , size(0) + , states(new nfa_state_t[max_size]) + , root(re->compile(*this, NULL)) +{} + +nfa_t::~nfa_t() +{ + delete[] states; +} + +nfa_state_t *AltOp::compile(nfa_t &nfa, nfa_state_t *t) +{ + if (ins_cache) + { + return ins_cache; + } + else + { + nfa_state_t *s = &nfa.states[nfa.size++]; + s->alt(exp1->compile(nfa, t) + , exp2->compile(nfa, t)); + + ins_cache = s; + if (ins_access == PRIVATE) + { + decompile(); + } + + return s; + } +} + +void AltOp::decompile () +{ + if (ins_cache) + { + exp1->decompile (); + exp2->decompile (); + ins_cache = NULL; + } +} + +nfa_state_t *CatOp::compile(nfa_t &nfa, nfa_state_t *t) +{ + if (ins_cache) + { + return ins_cache; + } + else + { + nfa_state_t *s2 = exp2->compile(nfa, t); + nfa_state_t *s1 = exp1->compile(nfa, s2); + + ins_cache = s1; + if (ins_access == PRIVATE) + { + decompile(); + } + + return s1; + } +} + +void CatOp::decompile () +{ + if (ins_cache) + { + exp1->decompile (); + exp2->decompile (); + ins_cache = NULL; + } +} + +nfa_state_t *CloseOp::compile(nfa_t &nfa, nfa_state_t *t) +{ + if (ins_cache) + { + return ins_cache; + } + else + { + nfa_state_t *s = &nfa.states[nfa.size++]; + s->alt(t, exp->compile(nfa, s)); + + ins_cache = s; + if (ins_access == PRIVATE) + { + decompile(); + } + + return s; + } +} + +void CloseOp::decompile () +{ + if (ins_cache) + { + exp->decompile (); + ins_cache = NULL; + } +} + +nfa_state_t *MatchOp::compile(nfa_t &nfa, nfa_state_t *t) +{ + if (ins_cache) + { + return ins_cache; + } + else + { + nfa_state_t *s = &nfa.states[nfa.size++]; + s->ran(t, match); + + ins_cache = s; + if (ins_access == PRIVATE) + { + decompile(); + } + + return s; + } +} + +void MatchOp::decompile () +{ + ins_cache = NULL; +} + +nfa_state_t *NullOp::compile(nfa_t &, nfa_state_t *t) +{ + return t; +} + +void NullOp::decompile () {} + +nfa_state_t *RuleOp::compile(nfa_t &nfa, nfa_state_t *) +{ + if (ins_cache) + { + return ins_cache; + } + else + { + nfa_state_t *s3 = &nfa.states[nfa.size++]; + s3->fin(this); + if (ctx->calc_size() > 0) + { + nfa_state_t *s2 = &nfa.states[nfa.size++]; + s2->ctx(ctx->compile(nfa, s3)); + s3 = s2; + } + nfa_state_t *s1 = exp->compile(nfa, s3); + + ins_cache = s1; + if (ins_access == PRIVATE) + { + decompile(); + } + + return s1; + } +} + +void RuleOp::decompile () +{ + if (ins_cache) + { + exp->decompile (); + ctx->decompile (); + ins_cache = NULL; + } +} + +} // namespace re2c diff --git a/re2c/src/ir/nfa/nfa.h b/re2c/src/ir/nfa/nfa.h new file mode 100644 index 00000000..969012ff --- /dev/null +++ b/re2c/src/ir/nfa/nfa.h @@ -0,0 +1,105 @@ +#ifndef _RE2C_IR_NFA_NFA_ +#define _RE2C_IR_NFA_NFA_ + +#include "src/util/c99_stdint.h" +#include +#include + +#include "src/util/forbid_copy.h" + +namespace re2c +{ + +struct Range; +struct RegExp; +struct RuleOp; + +struct nfa_state_t +{ + enum type_t + { + ALT, +// CHR, + RAN, + CTX, + FIN + } type; + union + { + struct + { + nfa_state_t *out1; + nfa_state_t *out2; + } alt; +// struct +// { +// nfa_state_t *out; +// uint32_t chr; +// } chr; + struct + { + nfa_state_t *out; + Range *ran; + } ran; + struct + { + nfa_state_t *out; + } ctx; + struct + { + RuleOp *rule; + } fin; + } value; + bool mark; + + void alt(nfa_state_t *s1, nfa_state_t *s2) + { + type = ALT; + value.alt.out1 = s1; + value.alt.out2 = s2; + mark = false; + } +// void chr(nfa_state_t *s, uint32_t c) +// { +// type = CHR; +// value.chr.out = s; +// value.chr.chr = c; +// mark = false; +// } + void ran(nfa_state_t *s, Range *r) + { + type = RAN; + value.ran.out = s; + value.ran.ran = r; + mark = false; + } + void ctx(nfa_state_t *s) + { + type = CTX; + value.ctx.out = s; + mark = false; + } + void fin(RuleOp *r) + { + type = FIN; + value.fin.rule = r; + mark = false; + } +}; + +struct nfa_t +{ + const uint32_t max_size; + uint32_t size; + nfa_state_t *states; + nfa_state_t *root; + + nfa_t(RegExp *re); + ~nfa_t(); + + FORBID_COPY(nfa_t); +}; + +} // namespace re2c + +#endif // _RE2C_IR_NFA_NFA_ diff --git a/re2c/src/ir/bytecode/split.cc b/re2c/src/ir/nfa/split.cc similarity index 100% rename from re2c/src/ir/bytecode/split.cc rename to re2c/src/ir/nfa/split.cc diff --git a/re2c/src/ir/regexp/regexp.h b/re2c/src/ir/regexp/regexp.h index 05828fea..a37ca95b 100644 --- a/re2c/src/ir/regexp/regexp.h +++ b/re2c/src/ir/regexp/regexp.h @@ -12,7 +12,8 @@ namespace re2c { -union Ins; +struct nfa_state_t; +struct nfa_t; typedef std::vector charset_t; @@ -21,7 +22,6 @@ class RegExp public: static free_list vFreeList; - uint32_t size; /* * There're several different cases when the same regexp * can be used multiple times: @@ -40,7 +40,7 @@ public: * [^]{3} in UTF-8 mode, each of sub-regexps [^] will have common suffix * [\x80-\xBF] factored out, but they won't share instructions. */ - Ins * ins_cache; /* if non-NULL, points to compiled instructions */ + nfa_state_t *ins_cache; /* if non-NULL, points to compiled instructions */ enum InsAccess { SHARED, @@ -48,8 +48,7 @@ public: } ins_access; inline RegExp () - : size (0) - , ins_cache (NULL) + : ins_cache (NULL) , ins_access (SHARED) { vFreeList.insert (this); @@ -59,9 +58,9 @@ public: vFreeList.erase (this); } virtual void split (std::set &) = 0; - virtual void calcSize (const charset_t &) = 0; + virtual uint32_t calc_size() const = 0; virtual uint32_t fixedLength (); - virtual uint32_t compile (const charset_t &, Ins *) = 0; + virtual nfa_state_t *compile(nfa_t &nfa, nfa_state_t *n) = 0; virtual void decompile () = 0; virtual void display (std::ostream &) const = 0; friend std::ostream & operator << (std::ostream & o, const RegExp & re); diff --git a/re2c/src/ir/regexp/regexp_alt.h b/re2c/src/ir/regexp/regexp_alt.h index 90e2ecc6..5dc1605a 100644 --- a/re2c/src/ir/regexp/regexp_alt.h +++ b/re2c/src/ir/regexp/regexp_alt.h @@ -17,9 +17,9 @@ public: , exp2 (e2) {} void split (std::set &); - void calcSize (const charset_t &); + uint32_t calc_size() const; uint32_t fixedLength (); - uint32_t compile (const charset_t &, Ins *); + nfa_state_t *compile(nfa_t &nfa, nfa_state_t *n); void decompile (); void display (std::ostream & o) const; friend RegExp * mkAlt (RegExp *, RegExp *); diff --git a/re2c/src/ir/regexp/regexp_cat.h b/re2c/src/ir/regexp/regexp_cat.h index d72f8ece..aa872c67 100644 --- a/re2c/src/ir/regexp/regexp_cat.h +++ b/re2c/src/ir/regexp/regexp_cat.h @@ -17,9 +17,9 @@ public: , exp2 (e2) {} void split (std::set &); - void calcSize (const charset_t &); + uint32_t calc_size() const; uint32_t fixedLength (); - uint32_t compile (const charset_t &, Ins *); + nfa_state_t *compile(nfa_t &nfa, nfa_state_t *n); void decompile (); void display (std::ostream & o) const; diff --git a/re2c/src/ir/regexp/regexp_close.h b/re2c/src/ir/regexp/regexp_close.h index aa323c65..40496e3d 100644 --- a/re2c/src/ir/regexp/regexp_close.h +++ b/re2c/src/ir/regexp/regexp_close.h @@ -15,8 +15,8 @@ public: : exp (e) {} void split (std::set &); - void calcSize (const charset_t &); - uint32_t compile (const charset_t &, Ins *); + uint32_t calc_size() const; + nfa_state_t *compile(nfa_t &nfa, nfa_state_t *n); void decompile (); void display (std::ostream & o) const; diff --git a/re2c/src/ir/regexp/regexp_match.h b/re2c/src/ir/regexp/regexp_match.h index fab57dc6..cd8cc124 100644 --- a/re2c/src/ir/regexp/regexp_match.h +++ b/re2c/src/ir/regexp/regexp_match.h @@ -16,9 +16,9 @@ public: : match (m) {} void split (std::set &); - void calcSize (const charset_t &); + uint32_t calc_size() const; uint32_t fixedLength (); - uint32_t compile (const charset_t &, Ins *); + nfa_state_t *compile(nfa_t &nfa, nfa_state_t *n); void decompile (); void display (std::ostream & o) const; diff --git a/re2c/src/ir/regexp/regexp_null.h b/re2c/src/ir/regexp/regexp_null.h index f9a97a61..ed079f3a 100644 --- a/re2c/src/ir/regexp/regexp_null.h +++ b/re2c/src/ir/regexp/regexp_null.h @@ -10,9 +10,9 @@ class NullOp: public RegExp { public: void split (std::set &); - void calcSize (const charset_t &); + uint32_t calc_size() const; uint32_t fixedLength (); - uint32_t compile (const charset_t &, Ins *); + nfa_state_t *compile(nfa_t &nfa, nfa_state_t *n); void decompile (); void display (std::ostream & o) const; }; diff --git a/re2c/src/ir/regexp/regexp_rule.h b/re2c/src/ir/regexp/regexp_rule.h index 1bb4b51f..91c88e48 100644 --- a/re2c/src/ir/regexp/regexp_rule.h +++ b/re2c/src/ir/regexp/regexp_rule.h @@ -20,7 +20,6 @@ private: public: RegExp * ctx; - Ins * ins; rule_rank_t rank; const Code * code; const std::string newcond; @@ -37,7 +36,6 @@ public: : loc (l) , exp (r1) , ctx (r2) - , ins (NULL) , rank (r) , code (c) , newcond (cond ? *cond : "") @@ -46,8 +44,8 @@ public: } void display (std::ostream & o) const; void split (std::set &); - void calcSize (const charset_t &); - uint32_t compile (const charset_t &, Ins *); + uint32_t calc_size() const; + nfa_state_t *compile(nfa_t &nfa, nfa_state_t *n); void decompile (); FORBID_COPY (RuleOp); diff --git a/re2c/src/parse/parser.ypp b/re2c/src/parse/parser.ypp index 96a37a95..cf009ea7 100644 --- a/re2c/src/parse/parser.ypp +++ b/re2c/src/parse/parser.ypp @@ -16,7 +16,7 @@ #include "src/codegen/skeleton/skeleton.h" #include "src/conf/opt.h" #include "src/globals.h" -#include "src/ir/bytecode/bytecode.h" +#include "src/ir/compile.h" #include "src/ir/dfa/dfa.h" #include "src/ir/regexp/encoding/enc.h" #include "src/ir/regexp/encoding/range_suffix.h" @@ -718,7 +718,7 @@ void parse(Scanner& i, Output & o) } } - dfa_map[it->first] = genCode(it->second, o, it->first, opts->encoding.nCodeUnits ()); + dfa_map[it->first] = compile(it->second, o, it->first, opts->encoding.nCodeUnits ()); } if (parseMode != Scanner::Rules && dfa_map.find(it->first) != dfa_map.end()) { @@ -732,7 +732,7 @@ void parse(Scanner& i, Output & o) { if (parseMode != Scanner::Reuse) { - dfa_map[""] = genCode(spec, o, "", opts->encoding.nCodeUnits ()); + dfa_map[""] = compile(spec, o, "", opts->encoding.nCodeUnits ()); } if (parseMode != Scanner::Rules && dfa_map.find("") != dfa_map.end()) {