src/conf/msg.h \
src/conf/opt.h \
src/conf/warn.h \
- src/ir/bytecode/bytecode.h \
- src/ir/bytecode/ins.h \
+ src/ir/bytecode/nfa.h \
src/ir/dfa/state.h \
src/ir/dfa/dfa.h \
src/ir/dfa/action.h \
src/ir/regexp/regexp_null.h \
src/ir/regexp/regexp.h \
src/ir/regexp/regexp_close.h \
+ src/ir/compile.h \
src/ir/rule_rank.h \
src/globals.h \
src/parse/code.h \
src/conf/msg.cc \
src/conf/opt.cc \
src/conf/warn.cc \
- src/ir/bytecode/bytecode.cc \
- src/ir/bytecode/ins.cc \
- src/ir/bytecode/split.cc \
- src/ir/bytecode/compile.cc \
- src/ir/bytecode/calc_size.cc \
+ src/ir/nfa/calc_size.cc \
+ src/ir/nfa/nfa.cc \
+ src/ir/nfa/split.cc \
src/ir/dfa/dfa.cc \
src/ir/regexp/display.cc \
src/ir/regexp/encoding/enc.cc \
src/ir/regexp/encoding/utf16/utf16_range.cc \
src/ir/regexp/fixed_length.cc \
src/ir/regexp/regexp.cc \
+ src/ir/compile.cc \
src/ir/rule_rank.cc \
src/main.cc \
src/parse/code.cc \
-/* Generated by re2c 0.15.3 on Tue Dec 15 12:16:26 2015 */
+/* Generated by re2c 0.15.3 on Tue Dec 15 14:38:22 2015 */
#line 1 "../src/parse/lex.re"
#include "src/util/c99_stdint.h"
#include <stddef.h>
#include "src/codegen/skeleton/skeleton.h"
#include "src/conf/opt.h"
#include "src/globals.h"
-#include "src/ir/bytecode/bytecode.h"
+#include "src/ir/compile.h"
#include "src/ir/dfa/dfa.h"
#include "src/ir/regexp/encoding/enc.h"
#include "src/ir/regexp/encoding/range_suffix.h"
}
}
- dfa_map[it->first] = genCode(it->second, o, it->first, opts->encoding.nCodeUnits ());
+ dfa_map[it->first] = compile(it->second, o, it->first, opts->encoding.nCodeUnits ());
}
if (parseMode != Scanner::Rules && dfa_map.find(it->first) != dfa_map.end())
{
{
if (parseMode != Scanner::Reuse)
{
- dfa_map[""] = genCode(spec, o, "", opts->encoding.nCodeUnits ());
+ dfa_map[""] = compile(spec, o, "", opts->encoding.nCodeUnits ());
}
if (parseMode != Scanner::Rules && dfa_map.find("") != dfa_map.end())
{
+++ /dev/null
-#include <assert.h>
-#include <string.h>
-#include <algorithm>
-
-#include "src/codegen/output.h"
-#include "src/ir/bytecode/bytecode.h"
-#include "src/ir/bytecode/ins.h"
-#include "src/ir/dfa/dfa.h"
-#include "src/ir/regexp/regexp.h"
-#include "src/parse/spec.h"
-
-namespace re2c {
-
-static void optimize (Ins * i);
-
-smart_ptr<DFA> genCode (Spec & spec, Output & output, const std::string & cond, uint32_t cunits)
-{
- RegExp * re = spec.re;
-
- // The original set of code units (charset) might be very large.
- // A common trick it is to split charset into disjoint character ranges
- // and choose a representative of each range (we choose lower bound).
- // The set of all representatives is the new (compacted) charset.
- // Don't forget to include zero and upper bound, even if they
- // do not explicitely apper in ranges.
- std::set<uint32_t> bounds;
- re->split(bounds);
- bounds.insert(0);
- bounds.insert(cunits);
- charset_t cs;
- for (std::set<uint32_t>::const_iterator i = bounds.begin(); i != bounds.end(); ++i)
- {
- cs.push_back(*i);
- }
-
- re->calcSize(cs);
-
- Ins *ins = new Ins[re->size + 1];
- memset(ins, 0, (re->size + 1)*sizeof(Ins));
- const uint32_t size = re->compile(cs, ins);
- Ins *eoi = &ins[size];
- eoi->i.tag = GOTO;
- eoi->i.link = eoi;
-
- optimize(ins);
-
- /*
- for (const Ins *inst = &ins[0]; inst < &ins[size]; )
- {
- inst = showIns(std::cout, *inst, ins[0]);
- }
- */
-
- for (uint32_t j = 0; j < size;)
- {
- unmark(&ins[j]);
-
- if (ins[j].i.tag == CHAR)
- {
- j = static_cast<uint32_t> ((Ins*) ins[j].i.link - ins);
- }
- else
- {
- j++;
- }
- }
-
- smart_ptr<DFA> dfa = make_smart_ptr (new DFA
- ( cond
- , output.source.get_block_line ()
- , ins
- , size
- , 0
- , cunits
- , cs
- , spec.rules
- ));
-
- // accumulate global statistics from this particular DFA
- output.max_fill = std::max (output.max_fill, dfa->max_fill);
- if (dfa->need_accept)
- {
- output.source.set_used_yyaccept ();
- }
-
- return dfa;
-}
-
-void optimize (Ins * i)
-{
- while (!isMarked (i))
- {
- mark (i);
- if (i->i.tag == CHAR)
- {
- i = (Ins *) i->i.link;
- }
- else if (i->i.tag == GOTO || i->i.tag == FORK)
- {
- Ins * target = (Ins *) i->i.link;
- optimize (target);
- if (target->i.tag == GOTO)
- {
- i->i.link = target->i.link == target
- ? i
- : target;
- }
- if (i->i.tag == FORK)
- {
- Ins * follow = (Ins *) & i[1];
- optimize (follow);
- if (follow->i.tag == GOTO && follow->i.link == follow)
- {
- i->i.tag = GOTO;
- }
- else if (i->i.link == i)
- {
- i->i.tag = GOTO;
- i->i.link = follow;
- }
- }
- return;
- }
- else
- {
- ++i;
- }
- }
-}
-
-} // namespace re2c
+++ /dev/null
-#include "src/util/c99_stdint.h"
-
-#include "src/ir/regexp/regexp.h"
-#include "src/ir/regexp/regexp_alt.h"
-#include "src/ir/regexp/regexp_cat.h"
-#include "src/ir/regexp/regexp_close.h"
-#include "src/ir/regexp/regexp_match.h"
-#include "src/ir/regexp/regexp_null.h"
-#include "src/ir/regexp/regexp_rule.h"
-#include "src/util/range.h"
-
-namespace re2c
-{
-
-void AltOp::calcSize (const charset_t & cs)
-{
- exp1->calcSize (cs);
- exp2->calcSize (cs);
- size = exp1->size + exp2->size + 2;
-}
-
-void CatOp::calcSize (const charset_t & cs)
-{
- exp1->calcSize (cs);
- exp2->calcSize (cs);
- size = exp1->size + exp2->size;
-}
-
-void CloseOp::calcSize (const charset_t & cs)
-{
- exp->calcSize (cs);
- size = exp->size + 2;
-}
-
-void MatchOp::calcSize (const charset_t & cs)
-{
- size = 1;
- uint32_t k = 0;
- for (Range * r = match; r; r = r->next ())
- {
- for (; cs[k] != r->lower(); ++k);
- for (; cs[k] != r->upper(); ++k)
- {
- ++size;
- }
- }
-}
-
-void NullOp::calcSize (const charset_t &)
-{
- size = 0;
-}
-
-void RuleOp::calcSize (const charset_t & cs)
-{
- exp->calcSize (cs);
- ctx->calcSize (cs);
- size = exp->size + (ctx->size ? ctx->size + 2 : 1);
-}
-
-} // end namespace re2c
+++ /dev/null
-#include "src/util/c99_stdint.h"
-
-#include "src/ir/bytecode/ins.h"
-#include "src/ir/regexp/regexp.h"
-#include "src/ir/regexp/regexp_alt.h"
-#include "src/ir/regexp/regexp_cat.h"
-#include "src/ir/regexp/regexp_close.h"
-#include "src/ir/regexp/regexp_match.h"
-#include "src/ir/regexp/regexp_null.h"
-#include "src/ir/regexp/regexp_rule.h"
-#include "src/util/range.h"
-
-namespace re2c
-{
-
-static uint32_t compile_goto (Ins * ins, Ins * i);
-
-uint32_t AltOp::compile (const charset_t & cs, Ins * i)
-{
- if (ins_cache)
- {
- return compile_goto (ins_cache, i);
- }
- else
- {
- ins_cache = i;
-
- i->i.tag = FORK;
- const uint32_t sz1 = exp1->compile (cs, &i[1]);
- Ins * const j = &i[sz1 + 1];
- i->i.link = &j[1];
- j->i.tag = GOTO;
- const uint32_t sz2 = exp2->compile (cs, &j[1]);
- j->i.link = &j[sz2 + 1];
-
- if (ins_access == PRIVATE)
- {
- decompile ();
- }
-
- return sz1 + sz2 + 2;
- }
-}
-
-void AltOp::decompile ()
-{
- if (ins_cache)
- {
- exp1->decompile ();
- exp2->decompile ();
- ins_cache = NULL;
- }
-}
-
-uint32_t CatOp::compile (const charset_t & cs, Ins * i)
-{
- if (ins_cache)
- {
- return compile_goto (ins_cache, i);
- }
- else
- {
- ins_cache = i;
-
- const uint32_t sz1 = exp1->compile (cs, &i[0]);
- const uint32_t sz2 = exp2->compile (cs, &i[sz1]);
-
- if (ins_access == PRIVATE)
- {
- decompile ();
- }
-
- return sz1 + sz2;
- }
-}
-
-void CatOp::decompile ()
-{
- if (ins_cache)
- {
- exp1->decompile ();
- exp2->decompile ();
- ins_cache = NULL;
- }
-}
-
-uint32_t CloseOp::compile (const charset_t & cs, Ins * i)
-{
- if (ins_cache)
- {
- return compile_goto (ins_cache, i);
- }
- else
- {
- ins_cache = i;
-
- i->i.tag = FORK;
- ++i;
- i += exp->compile (cs, i);
- i->i.tag = GOTO;
- i->i.link = ins_cache;
- ++i;
- ins_cache->i.link = i;
-
- const uint32_t sz = static_cast<uint32_t> (i - ins_cache);
- if (ins_access == PRIVATE)
- {
- decompile ();
- }
-
- return sz;
- }
-}
-
-void CloseOp::decompile ()
-{
- if (ins_cache)
- {
- exp->decompile ();
- ins_cache = NULL;
- }
-}
-
-uint32_t MatchOp::compile (const charset_t & cs, Ins * i)
-{
- if (ins_cache)
- {
- return compile_goto (ins_cache, i);
- }
- else
- {
- ins_cache = i;
-
- i->i.tag = CHAR;
- i->i.link = &i[size];
- Ins *j = &i[1];
- uint32_t bump = size;
- uint32_t k = 0;
- for (Range *r = match; r; r = r->next ())
- {
- for (; cs[k] != r->lower(); ++k);
- for (; cs[k] != r->upper(); ++k)
- {
- j->c.value = k;
- j->c.bump = --bump;
- j++;
- }
- }
-
- if (ins_access == PRIVATE)
- {
- decompile ();
- }
-
- return size;
- }
-}
-
-void MatchOp::decompile ()
-{
- ins_cache = NULL;
-}
-
-uint32_t NullOp::compile (const charset_t &, Ins *)
-{
- return 0;
-}
-
-void NullOp::decompile () {}
-
-uint32_t RuleOp::compile (const charset_t & cs, Ins * i)
-{
- if (ins_cache)
- {
- return compile_goto (ins_cache, i);
- }
- else
- {
- ins_cache = i;
-
- i += exp->compile (cs, &i[0]);
- if (ctx->size)
- {
- i->i.tag = CTXT;
- i->i.link = &i[1];
- ++i;
- i += ctx->compile (cs, &i[0]);
- }
- i->i.tag = TERM;
- i->i.link = this;
- ++i;
- const uint32_t sz = static_cast<uint32_t> (i - ins_cache);
-
- if (ins_access == PRIVATE)
- {
- decompile ();
- }
-
- return sz;
- }
-}
-
-void RuleOp::decompile ()
-{
- if (ins_cache)
- {
- exp->decompile ();
- ctx->decompile ();
- ins_cache = NULL;
- }
-}
-
-uint32_t compile_goto (Ins * ins, Ins * i)
-{
- i->i.tag = GOTO;
- i->i.link = ins;
- return 1;
-}
-
-} // end namespace re2c
+++ /dev/null
-#include <iostream>
-
-#include "src/ir/bytecode/ins.h"
-#include "src/ir/regexp/regexp_rule.h"
-#include "src/ir/rule_rank.h"
-
-namespace re2c {
-
-const Ins * showIns (std::ostream & o, const Ins & i, const Ins & base)
-{
- o.width (3);
- o << &i - &base << ": ";
- const Ins * ret = &(&i)[1];
- switch (i.i.tag)
- {
- case CHAR:
- {
- o << "match ";
- for (; ret < (Ins *) i.i.link; ++ret)
- {
- o << "\\x" << std::hex << ret->c.value;
- }
- break;
- }
- case GOTO:
- o << "goto " << ((Ins *) i.i.link - &base);
- break;
- case FORK:
- o << "fork " << ((Ins *) i.i.link - &base);
- break;
- case CTXT:
- o << "ctxt";
- break;
- case TERM:
- o << "term " << ((RuleOp *) i.i.link)->rank;
- break;
- }
- o << "\n";
- return ret;
-}
-
-} // namespace re2c
+++ /dev/null
-#ifndef _RE2C_IR_BYTECODE_INS_
-#define _RE2C_IR_BYTECODE_INS_
-
-#include "src/util/c99_stdint.h"
-#include <iosfwd>
-
-namespace re2c
-{
-
-static const uint32_t CHAR = 0;
-static const uint32_t GOTO = 1;
-static const uint32_t FORK = 2;
-static const uint32_t TERM = 3;
-static const uint32_t CTXT = 4;
-
-union Ins
-{
- struct
- {
- uint8_t tag;
- uint8_t marked;
- void * link;
- } i;
- struct
- {
- uint32_t value;
- uint32_t bump;
- void * link;
- } c;
-};
-
-inline bool isMarked (Ins * i)
-{
- return i->i.marked != 0;
-}
-
-inline void mark (Ins * i)
-{
- i->i.marked = true;
-}
-
-inline void unmark (Ins * i)
-{
- i->i.marked = false;
-}
-
-const Ins * showIns (std::ostream & o, const Ins & i, const Ins & base);
-
-} // namespace re2c
-
-#endif // _RE2C_IR_BYTECODE_INS_
--- /dev/null
+#include <algorithm>
+
+#include "src/codegen/output.h"
+#include "src/ir/compile.h"
+#include "src/ir/dfa/dfa.h"
+#include "src/ir/nfa/nfa.h"
+#include "src/ir/regexp/regexp.h"
+#include "src/parse/spec.h"
+
+namespace re2c {
+
+// Builds the DFA for one block of rules (condition `cond`):
+// computes the compacted charset from the bounds of spec.re, builds an NFA
+// from spec.re, constructs the DFA from that NFA and finally folds the
+// statistics of this DFA into `output`. `cunits` is inserted as the upper
+// bound of the charset.
+smart_ptr<DFA> compile (Spec & spec, Output & output, const std::string & cond, uint32_t cunits)
+{
+ RegExp * re = spec.re;
+
+ // The original set of code units (charset) might be very large.
+ // A common trick is to split the charset into disjoint character ranges
+ // and choose a representative of each range (we choose the lower bound).
+ // The set of all representatives is the new (compacted) charset.
+ // Don't forget to include zero and the upper bound, even if they
+ // do not explicitly appear in ranges.
+ std::set<uint32_t> bounds;
+ re->split(bounds);
+ bounds.insert(0);
+ bounds.insert(cunits);
+ charset_t cs;
+ for (std::set<uint32_t>::const_iterator i = bounds.begin(); i != bounds.end(); ++i)
+ {
+ cs.push_back(*i);
+ }
+
+ // NOTE(review): nfa is a local and is destroyed when this function
+ // returns, while DFA::findState copies pointers to its states into
+ // State::kernel -- confirm nothing reads State::kernel after the DFA
+ // constructor has finished.
+ nfa_t nfa(re);
+
+ smart_ptr<DFA> dfa = make_smart_ptr (new DFA
+ ( cond
+ , output.source.get_block_line ()
+ , 0
+ , cunits
+ , cs
+ , spec.rules
+ , nfa
+ ));
+
+ // accumulate global statistics from this particular DFA
+ output.max_fill = std::max (output.max_fill, dfa->max_fill);
+ if (dfa->need_accept)
+ {
+ output.source.set_used_yyaccept ();
+ }
+
+ return dfa;
+}
+
+} // namespace re2c
-#ifndef _RE2C_IR_BYTECODE_BYTECODE_
-#define _RE2C_IR_BYTECODE_BYTECODE_
+#ifndef _RE2C_IR_COMPILE_
+#define _RE2C_IR_COMPILE_
#include "src/util/c99_stdint.h"
#include <string>
struct Output;
struct Spec;
-smart_ptr<DFA> genCode (Spec & spec, Output & output, const std::string & cond, uint32_t cunits);
+smart_ptr<DFA> compile (Spec & spec, Output & output, const std::string & cond, uint32_t cunits);
} // namespace re2c
-#endif // _RE2C_IR_BYTECODE_BYTECODE_
+#endif // _RE2C_IR_COMPILE_
#include <assert.h>
-#include <string.h>
-#include <map>
+#include <ostream>
#include <set>
+#include <string.h>
#include <queue>
-#include <ostream>
#include "src/codegen/go.h"
#include "src/codegen/skeleton/skeleton.h"
#include "src/ir/dfa/dfa.h"
-#include "src/ir/bytecode/ins.h"
+#include "src/ir/nfa/nfa.h"
#include "src/ir/dfa/state.h"
#include "src/ir/regexp/regexp_rule.h"
#include "src/ir/rule_rank.h"
#include "src/util/allocate.h"
+#include "src/util/range.h"
namespace re2c
{
-static Ins **closure(Ins **cP, Ins *i)
+static nfa_state_t **closure(nfa_state_t **cP, nfa_state_t *n)
{
- while (!isMarked(i))
+ if (!n->mark)
{
- mark(i);
- *(cP++) = i;
-
- if (i->i.tag == FORK)
- {
- cP = closure(cP, i + 1);
- i = (Ins*) i->i.link;
- }
- else if (i->i.tag == GOTO || i->i.tag == CTXT)
+ n->mark = true;
+ *(cP++) = n;
+ switch (n->type)
{
- i = (Ins*) i->i.link;
+ case nfa_state_t::ALT:
+ cP = closure(cP, n->value.alt.out2);
+ cP = closure(cP, n->value.alt.out1);
+ break;
+ case nfa_state_t::CTX:
+ cP = closure(cP, n->value.ctx.out);
+ break;
+ default:
+ break;
}
- else
- break;
}
return cP;
DFA::DFA
( const std::string & c
, uint32_t l
- , Ins * ins
- , uint32_t ni
, uint32_t lb
, uint32_t ub
, const charset_t & cs
- , rules_t rules
+ , rules_t & rules
+ , nfa_t & nfa
)
: accepts ()
, skeleton (NULL)
, head(NULL)
, tail(&head)
, toDo(NULL)
- , free_ins(ins)
// statistics
, max_fill (0)
name += cond;
}
- Ins **work = new Ins * [ni + 1];
- findState(work, closure(work, &ins[0]));
-
const size_t nc = cs.size() - 1; // (n + 1) bounds for n ranges
- void **goTo = new void*[nc];
+
+ nfa_state_t **work = new nfa_state_t* [nfa.size];
+ findState(work, closure(work, nfa.root));
+
+ std::vector<nfa_state_t*> *go = new std::vector<nfa_state_t*>[nc];
Span *span = allocate<Span> (nc);
while (toDo)
State *s = toDo;
toDo = s->link;
- memset(goTo, 0, nc * sizeof(void*));
+ for(uint32_t i = 0; i < nc; ++i)
+ {
+ go[i].clear();
+ }
+ memset(span, 0, sizeof(Span)*nc);
s->rule = NULL;
for (uint32_t k = 0; k < s->kCount; ++k)
{
- Ins * i = s->kernel[k];
- if (i->i.tag == CHAR)
+ nfa_state_t *n = s->kernel[k];
+ switch (n->type)
{
- for (Ins *j = i + 1; j < (Ins*) i->i.link; ++j)
+// case nfa_state_t::CHR:
+// go[n->value.chr.chr].push_back(n->value.chr.out);
+// break;
+ case nfa_state_t::RAN:
{
- j->c.link = goTo[j->c.value];
- goTo[j->c.value] = j;
- }
- }
- else if (i->i.tag == TERM)
- {
- RuleOp * rule = static_cast<RuleOp *> (i->i.link);
- if (!s->rule)
- {
- s->rule = rule;
+ nfa_state_t *n2 = n->value.ran.out;
+ uint32_t j = 0;
+ for (Range *r = n->value.ran.ran; r; r = r->next ())
+ {
+ for (; cs[j] != r->lower(); ++j);
+ for (; cs[j] != r->upper(); ++j)
+ {
+ go[j].push_back(n2);
+ }
+ }
+ break;
}
- else
+ case nfa_state_t::CTX:
+ s->isPreCtxt = true;
+ break;
+ case nfa_state_t::FIN:
{
- const rule_rank_t r1 = s->rule->rank;
- const rule_rank_t r2 = rule->rank;
- if (r2 < r1)
+ RuleOp *rule = n->value.fin.rule;
+ if (!s->rule)
{
- rules[r1].shadow.insert (r2);
s->rule = rule;
}
- else if (r1 < r2)
+ else
{
- rules[r2].shadow.insert (r1);
+ const rule_rank_t r1 = s->rule->rank;
+ const rule_rank_t r2 = rule->rank;
+ if (r2 < r1)
+ {
+ rules[r1].shadow.insert (r2);
+ s->rule = rule;
+ }
+ else if (r1 < r2)
+ {
+ rules[r2].shadow.insert (r1);
+ }
}
+ break;
}
- }
- else if (i->i.tag == CTXT)
- {
- s->isPreCtxt = true;
+ default:
+ break;
}
}
- for (uint32_t j = 0; j < nc; ++j)
+ for(uint32_t i = 0; i < nc; ++i)
{
- if (goTo[j])
+ if(!go[i].empty())
{
- Ins **cP = work;
- for (Ins *i = (Ins*)goTo[j]; i; i = (Ins*) i->c.link)
+ nfa_state_t **cP = work;
+ for (std::vector<nfa_state_t*>::const_iterator j = go[i].begin(); j != go[i].end(); ++j)
{
- cP = closure(cP, i + i->c.bump);
+ cP = closure(cP, *j);
}
- goTo[j] = findState(work, cP);
+ span[i].to = findState(work, cP);
}
}
s->go.nSpans = 0;
for (uint32_t j = 0; j < nc;)
{
- State *to = (State*) goTo[j];
- while (++j < nc && goTo[j] == to) ;
+ State *to = span[j].to;
+ while (++j < nc && span[j].to == to) ;
span[s->go.nSpans].ub = cs[j];
span[s->go.nSpans].to = to;
s->go.nSpans++;
}
delete [] work;
- delete [] goTo;
+ delete [] go;
operator delete (span);
/*
head = s->next;
delete s;
}
- delete [] free_ins;
delete skeleton;
}
tail = &s->next;
}
-State *DFA::findState(Ins **kernel, Ins ** kernel_end)
+State *DFA::findState(nfa_state_t **kernel, nfa_state_t ** kernel_end)
{
uint32_t kCount = 0;
- for (Ins ** i = kernel; i < kernel_end; ++i)
+ for (nfa_state_t ** pn = kernel; pn < kernel_end; ++pn)
{
- Ins * ins = *i;
- if (ins->i.tag == CHAR || ins->i.tag == TERM || ins->i.tag == CTXT)
- {
- kernel[kCount++] = ins;
- }
- else
+ nfa_state_t *n = *pn;
+ switch (n->type)
{
- unmark (ins);
+// case nfa_state_t::CHR:
+ case nfa_state_t::RAN:
+ case nfa_state_t::CTX:
+ case nfa_state_t::FIN:
+ kernel[kCount++] = n;
+ break;
+ default:
+ n->mark = false;
+ break;
}
}
bool marked = true;
for (uint32_t i = 0; marked && i < s->kCount; ++i)
{
- marked = isMarked (s->kernel[i]);
+ marked = s->kernel[i]->mark;
}
if (marked)
{
s = new State;
addState(tail, s);
s->kCount = kCount;
- s->kernel = new Ins * [kCount];
- memcpy(s->kernel, kernel, kCount * sizeof (Ins *));
+ s->kernel = new nfa_state_t* [kCount];
+ memcpy(s->kernel, kernel, kCount * sizeof(nfa_state_t*));
s->link = toDo;
toDo = s;
}
for (uint32_t i = 0; i < kCount; ++i)
{
- unmark (kernel[i]);
+ kernel[i]->mark = false;
}
return s;
struct Output;
struct OutputFile;
union Ins;
+struct nfa_t;
+struct nfa_state_t;
class DFA
{
State * head;
State ** tail;
State * toDo;
- const Ins * free_ins;
// statistics
uint32_t max_fill;
DFA
( const std::string &
, uint32_t
- , Ins *
- , uint32_t
, uint32_t
, uint32_t
, const charset_t &
- , rules_t
+ , rules_t &
+ , nfa_t &
);
~DFA ();
void emit (Output &, uint32_t &, bool, bool &);
private:
void addState (State **, State *);
- State * findState (Ins **, Ins **);
+ State * findState (nfa_state_t **, nfa_state_t **);
void reorder();
void split (State *);
void findSCCs ();
namespace re2c
{
+struct nfa_state_t;
+
class State
{
public:
State * link;
uint32_t depth; // for finding SCCs
uint32_t kCount;
- Ins ** kernel;
+ nfa_state_t ** kernel;
bool isPreCtxt;
bool isBase;
--- /dev/null
+#include "src/util/c99_stdint.h"
+
+#include "src/ir/regexp/regexp.h"
+#include "src/ir/regexp/regexp_alt.h"
+#include "src/ir/regexp/regexp_cat.h"
+#include "src/ir/regexp/regexp_close.h"
+#include "src/ir/regexp/regexp_match.h"
+#include "src/ir/regexp/regexp_null.h"
+#include "src/ir/regexp/regexp_rule.h"
+#include "src/util/range.h"
+
+namespace re2c
+{
+
+// Upper bound on the number of NFA states needed for an alternation:
+// both operands plus one state for the ALT node that AltOp::compile allocates.
+uint32_t AltOp::calc_size() const
+{
+ return exp1->calc_size()
+ + exp2->calc_size()
+ + 1;
+}
+
+// Upper bound on the number of NFA states needed for a concatenation:
+// the sum of both operands (CatOp::compile allocates no state of its own).
+uint32_t CatOp::calc_size() const
+{
+ return exp1->calc_size()
+ + exp2->calc_size();
+}
+
+// Upper bound on the number of NFA states needed for a closure:
+// the operand plus the one ALT state that CloseOp::compile allocates.
+uint32_t CloseOp::calc_size() const
+{
+ return exp->calc_size() + 1;
+}
+
+// A match is compiled into exactly one (RAN) state, see MatchOp::compile.
+uint32_t MatchOp::calc_size() const
+{
+ return 1;
+}
+
+// The empty regexp needs no states: NullOp::compile just returns the
+// continuation it was given.
+uint32_t NullOp::calc_size() const
+{
+ return 0;
+}
+
+// Upper bound on the number of NFA states needed for a rule:
+// the rule expression, plus (only if ctx is non-empty) the ctx part and
+// one CTX state, plus one FIN state. Mirrors the allocations made by
+// RuleOp::compile.
+uint32_t RuleOp::calc_size() const
+{
+ const uint32_t n = ctx->calc_size();
+ return exp->calc_size()
+ + (n > 0 ? n + 1 : 0)
+ + 1;
+}
+
+} // end namespace re2c
--- /dev/null
+#include "src/ir/nfa/nfa.h"
+#include "src/ir/regexp/regexp.h"
+#include "src/ir/regexp/regexp_alt.h"
+#include "src/ir/regexp/regexp_cat.h"
+#include "src/ir/regexp/regexp_close.h"
+#include "src/ir/regexp/regexp_match.h"
+#include "src/ir/regexp/regexp_null.h"
+#include "src/ir/regexp/regexp_rule.h"
+
+namespace re2c {
+
+// Allocates room for re->calc_size() states and compiles `re` into them.
+// The initializers depend on each other: `states` uses `max_size`, and
+// `root` is produced by re->compile(), which takes states from `states`
+// and increments `size`. Members are initialized in their declaration
+// order in nfa_t, so that order must stay max_size, size, states, root.
+nfa_t::nfa_t(RegExp *re)
+ : max_size(re->calc_size())
+ , size(0)
+ , states(new nfa_state_t[max_size])
+ , root(re->compile(*this, NULL))
+{}
+
+// Releases the state array allocated in the constructor; every
+// nfa_state_t pointer handed out by compile() points into this array.
+nfa_t::~nfa_t()
+{
+ delete[] states;
+}
+
+// Compiles an alternation into the NFA: one ALT state whose two outgoing
+// edges lead to the compiled operands, both of which continue to `t`.
+// Returns the ALT state, or the previously compiled entry state if this
+// node is still cached in ins_cache.
+nfa_state_t *AltOp::compile(nfa_t &nfa, nfa_state_t *t)
+{
+    if (ins_cache)
+    {
+        return ins_cache;
+    }
+
+    nfa_state_t *s = &nfa.states[nfa.size++];
+
+    // Compile the operands in separate statements, first operand first.
+    // The order in which function arguments are evaluated is unspecified
+    // in C++, and both calls take states from nfa and set ins_cache on the
+    // nodes they visit, so compiling them inside one argument list would
+    // make the resulting state layout depend on the compiler.
+    nfa_state_t *s1 = exp1->compile(nfa, t);
+    nfa_state_t *s2 = exp2->compile(nfa, t);
+    s->alt(s1, s2);
+
+    ins_cache = s;
+    if (ins_access == PRIVATE)
+    {
+        // a private node must not be reused: drop the cache right away
+        decompile();
+    }
+
+    return s;
+}
+
+// Forgets the cached NFA state of this node and, recursively, of both
+// operands. Does nothing if this node is not currently cached.
+void AltOp::decompile ()
+{
+ if (ins_cache)
+ {
+ exp1->decompile ();
+ exp2->decompile ();
+ ins_cache = NULL;
+ }
+}
+
+// Compiles a concatenation. No state is allocated for the node itself:
+// the second operand is compiled first (continuing to `t`) and its entry
+// state becomes the continuation of the first operand. Returns the entry
+// state of the first operand, or the cached entry state if there is one.
+nfa_state_t *CatOp::compile(nfa_t &nfa, nfa_state_t *t)
+{
+ if (ins_cache)
+ {
+ return ins_cache;
+ }
+ else
+ {
+ nfa_state_t *s2 = exp2->compile(nfa, t);
+ nfa_state_t *s1 = exp1->compile(nfa, s2);
+
+ ins_cache = s1;
+ if (ins_access == PRIVATE)
+ {
+ // private nodes are not reused: clear the cache again
+ decompile();
+ }
+
+ return s1;
+ }
+}
+
+// Forgets the cached NFA state of this node and, recursively, of both
+// operands. Does nothing if this node is not currently cached.
+void CatOp::decompile ()
+{
+ if (ins_cache)
+ {
+ exp1->decompile ();
+ exp2->decompile ();
+ ins_cache = NULL;
+ }
+}
+
+// Compiles a closure into one ALT state `s`: its first edge leaves the
+// loop to the continuation `t`, its second edge enters the operand, which
+// is compiled with `s` itself as continuation (the loop back edge).
+// Returns `s`, or the cached entry state if there is one.
+nfa_state_t *CloseOp::compile(nfa_t &nfa, nfa_state_t *t)
+{
+ if (ins_cache)
+ {
+ return ins_cache;
+ }
+ else
+ {
+ nfa_state_t *s = &nfa.states[nfa.size++];
+ s->alt(t, exp->compile(nfa, s));
+
+ ins_cache = s;
+ if (ins_access == PRIVATE)
+ {
+ // private nodes are not reused: clear the cache again
+ decompile();
+ }
+
+ return s;
+ }
+}
+
+// Forgets the cached NFA state of this node and, recursively, of its
+// operand. Does nothing if this node is not currently cached.
+void CloseOp::decompile ()
+{
+ if (ins_cache)
+ {
+ exp->decompile ();
+ ins_cache = NULL;
+ }
+}
+
+// Compiles a match into a single RAN state that carries the range list
+// `match` and continues to `t`. Returns that state, or the cached state if
+// there is one.
+nfa_state_t *MatchOp::compile(nfa_t &nfa, nfa_state_t *t)
+{
+ if (ins_cache)
+ {
+ return ins_cache;
+ }
+ else
+ {
+ nfa_state_t *s = &nfa.states[nfa.size++];
+ s->ran(t, match);
+
+ ins_cache = s;
+ if (ins_access == PRIVATE)
+ {
+ // private nodes are not reused: clear the cache again
+ decompile();
+ }
+
+ return s;
+ }
+}
+
+// Forgets the cached NFA state; a match has no sub-expressions to visit.
+void MatchOp::decompile ()
+{
+ ins_cache = NULL;
+}
+
+// The empty regexp allocates nothing: its entry state is the continuation.
+nfa_state_t *NullOp::compile(nfa_t &, nfa_state_t *t)
+{
+ return t;
+}
+
+// Nothing to forget: NullOp::compile never sets a cache.
+void NullOp::decompile () {}
+
+// Compiles a rule. The incoming continuation is ignored (the parameter is
+// unnamed): a rule always ends in its own FIN state, which records `this`.
+// If the ctx part is non-empty, a CTX state is placed between the rule
+// expression and the compiled ctx part. Returns the entry state of the
+// rule expression, or the cached entry state if there is one.
+nfa_state_t *RuleOp::compile(nfa_t &nfa, nfa_state_t *)
+{
+ if (ins_cache)
+ {
+ return ins_cache;
+ }
+ else
+ {
+ nfa_state_t *s3 = &nfa.states[nfa.size++];
+ s3->fin(this);
+ if (ctx->calc_size() > 0)
+ {
+ nfa_state_t *s2 = &nfa.states[nfa.size++];
+ s2->ctx(ctx->compile(nfa, s3));
+ // from here on s3 is the continuation of exp: the CTX state
+ s3 = s2;
+ }
+ nfa_state_t *s1 = exp->compile(nfa, s3);
+
+ ins_cache = s1;
+ if (ins_access == PRIVATE)
+ {
+ // private nodes are not reused: clear the cache again
+ decompile();
+ }
+
+ return s1;
+ }
+}
+
+// Forgets the cached NFA state of this rule and, recursively, of its
+// expression and ctx part. Does nothing if the rule is not cached.
+void RuleOp::decompile ()
+{
+ if (ins_cache)
+ {
+ exp->decompile ();
+ ctx->decompile ();
+ ins_cache = NULL;
+ }
+}
+
+} // namespace re2c
--- /dev/null
+#ifndef _RE2C_IR_NFA_NFA_
+#define _RE2C_IR_NFA_NFA_
+
+#include "src/util/c99_stdint.h"
+#include <vector>
+#include <set>
+
+#include "src/util/forbid_copy.h"
+
+namespace re2c
+{
+
+struct Range;
+struct RegExp;
+struct RuleOp;
+
+// One state of the NFA. `type` selects which member of `value` is valid;
+// states are set up through alt()/ran()/ctx()/fin(), each of which also
+// clears `mark`.
+struct nfa_state_t
+{
+ enum type_t
+ {
+ ALT, // two outgoing edges (value.alt)
+// CHR,
+ RAN, // edge to value.ran.out, labelled with the range list value.ran.ran
+ CTX, // single edge to value.ctx.out; created by RuleOp::compile for a non-empty ctx
+ FIN // final state, records the rule it belongs to (value.fin)
+ } type;
+ union
+ {
+ struct
+ {
+ nfa_state_t *out1;
+ nfa_state_t *out2;
+ } alt;
+// struct
+// {
+// nfa_state_t *out;
+// uint32_t chr;
+// } chr;
+ struct
+ {
+ nfa_state_t *out;
+ Range *ran;
+ } ran;
+ struct
+ {
+ nfa_state_t *out;
+ } ctx;
+ struct
+ {
+ RuleOp *rule;
+ } fin;
+ } value;
+ // visited flag; set and cleared by closure() and DFA::findState()
+ bool mark;
+
+ // Makes this an ALT state with outgoing edges to s1 and s2.
+ void alt(nfa_state_t *s1, nfa_state_t *s2)
+ {
+ type = ALT;
+ value.alt.out1 = s1;
+ value.alt.out2 = s2;
+ mark = false;
+ }
+// void chr(nfa_state_t *s, uint32_t c)
+// {
+// type = CHR;
+// value.chr.out = s;
+// value.chr.chr = c;
+// mark = false;
+// }
+ // Makes this a RAN state: range list r, then continue to s.
+ void ran(nfa_state_t *s, Range *r)
+ {
+ type = RAN;
+ value.ran.out = s;
+ value.ran.ran = r;
+ mark = false;
+ }
+ // Makes this a CTX state with a single outgoing edge to s.
+ void ctx(nfa_state_t *s)
+ {
+ type = CTX;
+ value.ctx.out = s;
+ mark = false;
+ }
+ // Makes this a FIN state for rule r.
+ void fin(RuleOp *r)
+ {
+ type = FIN;
+ value.fin.rule = r;
+ mark = false;
+ }
+};
+
+// An NFA compiled from a regexp: an array of states plus the entry state.
+// The declaration order of the members matters, because the constructor's
+// initializers depend on each other (states needs max_size, root needs
+// states and size).
+struct nfa_t
+{
+ const uint32_t max_size; // capacity of `states`, taken from RegExp::calc_size()
+ uint32_t size; // number of states handed out so far by compile()
+ nfa_state_t *states; // owned array of max_size states, freed in the destructor
+ nfa_state_t *root; // entry state returned by re->compile()
+
+ nfa_t(RegExp *re);
+ ~nfa_t();
+
+ FORBID_COPY(nfa_t);
+};
+
+} // namespace re2c
+
+#endif // _RE2C_IR_NFA_NFA_
namespace re2c
{
-union Ins;
+struct nfa_state_t;
+struct nfa_t;
typedef std::vector<uint32_t> charset_t;
public:
static free_list <RegExp *> vFreeList;
- uint32_t size;
/*
* There're several different cases when the same regexp
* can be used multiple times:
* [^]{3} in UTF-8 mode, each of sub-regexps [^] will have common suffix
* [\x80-\xBF] factored out, but they won't share instructions.
*/
- Ins * ins_cache; /* if non-NULL, points to compiled instructions */
+ nfa_state_t *ins_cache; /* if non-NULL, points to compiled instructions */
enum InsAccess
{
SHARED,
} ins_access;
inline RegExp ()
- : size (0)
- , ins_cache (NULL)
+ : ins_cache (NULL)
, ins_access (SHARED)
{
vFreeList.insert (this);
vFreeList.erase (this);
}
virtual void split (std::set<uint32_t> &) = 0;
- virtual void calcSize (const charset_t &) = 0;
+ virtual uint32_t calc_size() const = 0;
virtual uint32_t fixedLength ();
- virtual uint32_t compile (const charset_t &, Ins *) = 0;
+ virtual nfa_state_t *compile(nfa_t &nfa, nfa_state_t *n) = 0;
virtual void decompile () = 0;
virtual void display (std::ostream &) const = 0;
friend std::ostream & operator << (std::ostream & o, const RegExp & re);
, exp2 (e2)
{}
void split (std::set<uint32_t> &);
- void calcSize (const charset_t &);
+ uint32_t calc_size() const;
uint32_t fixedLength ();
- uint32_t compile (const charset_t &, Ins *);
+ nfa_state_t *compile(nfa_t &nfa, nfa_state_t *n);
void decompile ();
void display (std::ostream & o) const;
friend RegExp * mkAlt (RegExp *, RegExp *);
, exp2 (e2)
{}
void split (std::set<uint32_t> &);
- void calcSize (const charset_t &);
+ uint32_t calc_size() const;
uint32_t fixedLength ();
- uint32_t compile (const charset_t &, Ins *);
+ nfa_state_t *compile(nfa_t &nfa, nfa_state_t *n);
void decompile ();
void display (std::ostream & o) const;
: exp (e)
{}
void split (std::set<uint32_t> &);
- void calcSize (const charset_t &);
- uint32_t compile (const charset_t &, Ins *);
+ uint32_t calc_size() const;
+ nfa_state_t *compile(nfa_t &nfa, nfa_state_t *n);
void decompile ();
void display (std::ostream & o) const;
: match (m)
{}
void split (std::set<uint32_t> &);
- void calcSize (const charset_t &);
+ uint32_t calc_size() const;
uint32_t fixedLength ();
- uint32_t compile (const charset_t &, Ins *);
+ nfa_state_t *compile(nfa_t &nfa, nfa_state_t *n);
void decompile ();
void display (std::ostream & o) const;
{
public:
void split (std::set<uint32_t> &);
- void calcSize (const charset_t &);
+ uint32_t calc_size() const;
uint32_t fixedLength ();
- uint32_t compile (const charset_t &, Ins *);
+ nfa_state_t *compile(nfa_t &nfa, nfa_state_t *n);
void decompile ();
void display (std::ostream & o) const;
};
public:
RegExp * ctx;
- Ins * ins;
rule_rank_t rank;
const Code * code;
const std::string newcond;
: loc (l)
, exp (r1)
, ctx (r2)
- , ins (NULL)
, rank (r)
, code (c)
, newcond (cond ? *cond : "")
}
void display (std::ostream & o) const;
void split (std::set<uint32_t> &);
- void calcSize (const charset_t &);
- uint32_t compile (const charset_t &, Ins *);
+ uint32_t calc_size() const;
+ nfa_state_t *compile(nfa_t &nfa, nfa_state_t *n);
void decompile ();
FORBID_COPY (RuleOp);
#include "src/codegen/skeleton/skeleton.h"
#include "src/conf/opt.h"
#include "src/globals.h"
-#include "src/ir/bytecode/bytecode.h"
+#include "src/ir/compile.h"
#include "src/ir/dfa/dfa.h"
#include "src/ir/regexp/encoding/enc.h"
#include "src/ir/regexp/encoding/range_suffix.h"
}
}
- dfa_map[it->first] = genCode(it->second, o, it->first, opts->encoding.nCodeUnits ());
+ dfa_map[it->first] = compile(it->second, o, it->first, opts->encoding.nCodeUnits ());
}
if (parseMode != Scanner::Rules && dfa_map.find(it->first) != dfa_map.end())
{
{
if (parseMode != Scanner::Reuse)
{
- dfa_map[""] = genCode(spec, o, "", opts->encoding.nCodeUnits ());
+ dfa_map[""] = compile(spec, o, "", opts->encoding.nCodeUnits ());
}
if (parseMode != Scanner::Rules && dfa_map.find("") != dfa_map.end())
{