granicus.if.org Git - re2c/commitdiff
Changed bytecode intermediate representation to a simpler NFA representation.
author Ulya Trofimovich <skvadrik@gmail.com>
Fri, 18 Dec 2015 12:52:17 +0000 (12:52 +0000)
committer Ulya Trofimovich <skvadrik@gmail.com>
Fri, 18 Dec 2015 12:52:17 +0000 (12:52 +0000)
25 files changed:
re2c/Makefile.am
re2c/bootstrap/src/parse/lex.cc
re2c/bootstrap/src/parse/parser.cc
re2c/src/ir/bytecode/bytecode.cc [deleted file]
re2c/src/ir/bytecode/calc_size.cc [deleted file]
re2c/src/ir/bytecode/compile.cc [deleted file]
re2c/src/ir/bytecode/ins.cc [deleted file]
re2c/src/ir/bytecode/ins.h [deleted file]
re2c/src/ir/compile.cc [new file with mode: 0644]
re2c/src/ir/compile.h [moved from re2c/src/ir/bytecode/bytecode.h with 53% similarity]
re2c/src/ir/dfa/dfa.cc
re2c/src/ir/dfa/dfa.h
re2c/src/ir/dfa/state.h
re2c/src/ir/nfa/calc_size.cc [new file with mode: 0644]
re2c/src/ir/nfa/nfa.cc [new file with mode: 0644]
re2c/src/ir/nfa/nfa.h [new file with mode: 0644]
re2c/src/ir/nfa/split.cc [moved from re2c/src/ir/bytecode/split.cc with 100% similarity]
re2c/src/ir/regexp/regexp.h
re2c/src/ir/regexp/regexp_alt.h
re2c/src/ir/regexp/regexp_cat.h
re2c/src/ir/regexp/regexp_close.h
re2c/src/ir/regexp/regexp_match.h
re2c/src/ir/regexp/regexp_null.h
re2c/src/ir/regexp/regexp_rule.h
re2c/src/parse/parser.ypp

diff --git a/re2c/Makefile.am b/re2c/Makefile.am
index 4f0805ebb6939f908a6b68ae4887860251f35db3..00f866a64647e16479976d2e8a198ef4230696c6 100644 (file)
@@ -24,8 +24,7 @@ SRC_HDR = \
        src/conf/msg.h \
        src/conf/opt.h \
        src/conf/warn.h \
-       src/ir/bytecode/bytecode.h \
-       src/ir/bytecode/ins.h \
+       src/ir/bytecode/nfa.h \
        src/ir/dfa/state.h \
        src/ir/dfa/dfa.h \
        src/ir/dfa/action.h \
@@ -46,6 +45,7 @@ SRC_HDR = \
        src/ir/regexp/regexp_null.h \
        src/ir/regexp/regexp.h \
        src/ir/regexp/regexp_close.h \
+       src/ir/compile.h \
        src/ir/rule_rank.h \
        src/globals.h \
        src/parse/code.h \
@@ -95,11 +95,9 @@ SRC = \
        src/conf/msg.cc \
        src/conf/opt.cc \
        src/conf/warn.cc \
-       src/ir/bytecode/bytecode.cc \
-       src/ir/bytecode/ins.cc \
-       src/ir/bytecode/split.cc \
-       src/ir/bytecode/compile.cc \
-       src/ir/bytecode/calc_size.cc \
+       src/ir/nfa/calc_size.cc \
+       src/ir/nfa/nfa.cc \
+       src/ir/nfa/split.cc \
        src/ir/dfa/dfa.cc \
        src/ir/regexp/display.cc \
        src/ir/regexp/encoding/enc.cc \
@@ -112,6 +110,7 @@ SRC = \
        src/ir/regexp/encoding/utf16/utf16_range.cc \
        src/ir/regexp/fixed_length.cc \
        src/ir/regexp/regexp.cc \
+       src/ir/compile.cc \
        src/ir/rule_rank.cc \
        src/main.cc \
        src/parse/code.cc \
diff --git a/re2c/bootstrap/src/parse/lex.cc b/re2c/bootstrap/src/parse/lex.cc
index de3c8f658a1f0b506f3e7c99bd9303dd01f04e00..f2d091429520b46f4b30045d1b45d655b7aff647 100644 (file)
@@ -1,4 +1,4 @@
-/* Generated by re2c 0.15.3 on Tue Dec 15 12:16:26 2015 */
+/* Generated by re2c 0.15.3 on Tue Dec 15 14:38:22 2015 */
 #line 1 "../src/parse/lex.re"
 #include "src/util/c99_stdint.h"
 #include <stddef.h>
diff --git a/re2c/bootstrap/src/parse/parser.cc b/re2c/bootstrap/src/parse/parser.cc
index 7071b3d8ec3a91d5b0f13d6cd000537657148084..f8e7c4b5620a3bc930fd5c9c87ac3e4313597334 100644 (file)
@@ -84,7 +84,7 @@
 #include "src/codegen/skeleton/skeleton.h"
 #include "src/conf/opt.h"
 #include "src/globals.h"
-#include "src/ir/bytecode/bytecode.h"
+#include "src/ir/compile.h"
 #include "src/ir/dfa/dfa.h"
 #include "src/ir/regexp/encoding/enc.h"
 #include "src/ir/regexp/encoding/range_suffix.h"
@@ -2406,7 +2406,7 @@ void parse(Scanner& i, Output & o)
                                                }
                                        }
 
-                                       dfa_map[it->first] = genCode(it->second, o, it->first, opts->encoding.nCodeUnits ());
+                                       dfa_map[it->first] = compile(it->second, o, it->first, opts->encoding.nCodeUnits ());
                                }
                                if (parseMode != Scanner::Rules && dfa_map.find(it->first) != dfa_map.end())
                                {
@@ -2420,7 +2420,7 @@ void parse(Scanner& i, Output & o)
                        {
                                if (parseMode != Scanner::Reuse)
                                {
-                                       dfa_map[""] = genCode(spec, o, "", opts->encoding.nCodeUnits ());
+                                       dfa_map[""] = compile(spec, o, "", opts->encoding.nCodeUnits ());
                                }
                                if (parseMode != Scanner::Rules && dfa_map.find("") != dfa_map.end())
                                {
diff --git a/re2c/src/ir/bytecode/bytecode.cc b/re2c/src/ir/bytecode/bytecode.cc
deleted file mode 100644 (file)
index d878a34..0000000
+++ /dev/null
@@ -1,131 +0,0 @@
-#include <assert.h>
-#include <string.h>
-#include <algorithm>
-
-#include "src/codegen/output.h"
-#include "src/ir/bytecode/bytecode.h"
-#include "src/ir/bytecode/ins.h"
-#include "src/ir/dfa/dfa.h"
-#include "src/ir/regexp/regexp.h"
-#include "src/parse/spec.h"
-
-namespace re2c {
-
-static void optimize (Ins * i);
-
-smart_ptr<DFA> genCode (Spec & spec, Output & output, const std::string & cond, uint32_t cunits)
-{
-       RegExp * re = spec.re;
-
-       // The original set of code units (charset) might be very large.
-       // A common trick it is to split charset into disjoint character ranges
-       // and choose a representative of each range (we choose lower bound).
-       // The set of all representatives is the new (compacted) charset.
-       // Don't forget to include zero and upper bound, even if they
-       // do not explicitely apper in ranges.
-       std::set<uint32_t> bounds;
-       re->split(bounds);
-       bounds.insert(0);
-       bounds.insert(cunits);
-       charset_t cs;
-       for (std::set<uint32_t>::const_iterator i = bounds.begin(); i != bounds.end(); ++i)
-       {
-               cs.push_back(*i);
-       }
-
-       re->calcSize(cs);
-
-       Ins *ins = new Ins[re->size + 1];
-       memset(ins, 0, (re->size + 1)*sizeof(Ins));
-       const uint32_t size = re->compile(cs, ins);
-       Ins *eoi = &ins[size];
-       eoi->i.tag = GOTO;
-       eoi->i.link = eoi;
-
-       optimize(ins);
-
-       /*
-       for (const Ins *inst = &ins[0]; inst < &ins[size]; )
-       {
-               inst = showIns(std::cout, *inst, ins[0]);
-       }
-       */
-
-       for (uint32_t j = 0; j < size;)
-       {
-               unmark(&ins[j]);
-
-               if (ins[j].i.tag == CHAR)
-               {
-                       j = static_cast<uint32_t> ((Ins*) ins[j].i.link - ins);
-               }
-               else
-               {
-                       j++;
-               }
-       }
-
-       smart_ptr<DFA> dfa = make_smart_ptr (new DFA
-               ( cond
-               , output.source.get_block_line ()
-               , ins
-               , size
-               , 0
-               , cunits
-               , cs
-               , spec.rules
-               ));
-
-       // accumulate global statistics from this particular DFA
-       output.max_fill = std::max (output.max_fill, dfa->max_fill);
-       if (dfa->need_accept)
-       {
-               output.source.set_used_yyaccept ();
-       }
-
-       return dfa;
-}
-
-void optimize (Ins * i)
-{
-       while (!isMarked (i))
-       {
-               mark (i);
-               if (i->i.tag == CHAR)
-               {
-                       i = (Ins *) i->i.link;
-               }
-               else if (i->i.tag == GOTO || i->i.tag == FORK)
-               {
-                       Ins * target = (Ins *) i->i.link;
-                       optimize (target);
-                       if (target->i.tag == GOTO)
-                       {
-                               i->i.link = target->i.link == target
-                                       ? i
-                                       : target;
-                       }
-                       if (i->i.tag == FORK)
-                       {
-                               Ins * follow = (Ins *) & i[1];
-                               optimize (follow);
-                               if (follow->i.tag == GOTO && follow->i.link == follow)
-                               {
-                                       i->i.tag = GOTO;
-                               }
-                               else if (i->i.link == i)
-                               {
-                                       i->i.tag = GOTO;
-                                       i->i.link = follow;
-                               }
-                       }
-                       return;
-               }
-               else
-               {
-                       ++i;
-               }
-       }
-}
-
-} // namespace re2c
diff --git a/re2c/src/ir/bytecode/calc_size.cc b/re2c/src/ir/bytecode/calc_size.cc
deleted file mode 100644 (file)
index 5feca8c..0000000
+++ /dev/null
@@ -1,61 +0,0 @@
-#include "src/util/c99_stdint.h"
-
-#include "src/ir/regexp/regexp.h"
-#include "src/ir/regexp/regexp_alt.h"
-#include "src/ir/regexp/regexp_cat.h"
-#include "src/ir/regexp/regexp_close.h"
-#include "src/ir/regexp/regexp_match.h"
-#include "src/ir/regexp/regexp_null.h"
-#include "src/ir/regexp/regexp_rule.h"
-#include "src/util/range.h"
-
-namespace re2c
-{
-
-void AltOp::calcSize (const charset_t & cs)
-{
-       exp1->calcSize (cs);
-       exp2->calcSize (cs);
-       size = exp1->size + exp2->size + 2;
-}
-
-void CatOp::calcSize (const charset_t & cs)
-{
-       exp1->calcSize (cs);
-       exp2->calcSize (cs);
-       size = exp1->size + exp2->size;
-}
-
-void CloseOp::calcSize (const charset_t & cs)
-{
-       exp->calcSize (cs);
-       size = exp->size + 2;
-}
-
-void MatchOp::calcSize (const charset_t & cs)
-{
-       size = 1;
-       uint32_t k = 0;
-       for (Range * r = match; r; r = r->next ())
-       {
-               for (; cs[k] != r->lower(); ++k);
-               for (; cs[k] != r->upper(); ++k)
-               {
-                       ++size;
-               }
-       }
-}
-
-void NullOp::calcSize (const charset_t &)
-{
-       size = 0;
-}
-
-void RuleOp::calcSize (const charset_t & cs)
-{
-       exp->calcSize (cs);
-       ctx->calcSize (cs);
-       size = exp->size + (ctx->size ? ctx->size + 2 : 1);
-}
-
-} // end namespace re2c
diff --git a/re2c/src/ir/bytecode/compile.cc b/re2c/src/ir/bytecode/compile.cc
deleted file mode 100644 (file)
index 67f7c36..0000000
+++ /dev/null
@@ -1,220 +0,0 @@
-#include "src/util/c99_stdint.h"
-
-#include "src/ir/bytecode/ins.h"
-#include "src/ir/regexp/regexp.h"
-#include "src/ir/regexp/regexp_alt.h"
-#include "src/ir/regexp/regexp_cat.h"
-#include "src/ir/regexp/regexp_close.h"
-#include "src/ir/regexp/regexp_match.h"
-#include "src/ir/regexp/regexp_null.h"
-#include "src/ir/regexp/regexp_rule.h"
-#include "src/util/range.h"
-
-namespace re2c
-{
-
-static uint32_t compile_goto (Ins * ins, Ins * i);
-
-uint32_t AltOp::compile (const charset_t & cs, Ins * i)
-{
-       if (ins_cache)
-       {
-               return compile_goto (ins_cache, i);
-       }
-       else
-       {
-               ins_cache = i;
-
-               i->i.tag = FORK;
-               const uint32_t sz1 = exp1->compile (cs, &i[1]);
-               Ins * const j = &i[sz1 + 1];
-               i->i.link = &j[1];
-               j->i.tag = GOTO;
-               const uint32_t sz2 = exp2->compile (cs, &j[1]);
-               j->i.link = &j[sz2 + 1];
-
-               if (ins_access == PRIVATE)
-               {
-                       decompile ();
-               }
-
-               return sz1 + sz2 + 2;
-       }
-}
-
-void AltOp::decompile ()
-{
-       if (ins_cache)
-       {
-               exp1->decompile ();
-               exp2->decompile ();
-               ins_cache = NULL;
-       }
-}
-
-uint32_t CatOp::compile (const charset_t & cs, Ins * i)
-{
-       if (ins_cache)
-       {
-               return compile_goto (ins_cache, i);
-       }
-       else
-       {
-               ins_cache = i;
-
-               const uint32_t sz1 = exp1->compile (cs, &i[0]);
-               const uint32_t sz2 = exp2->compile (cs, &i[sz1]);
-
-               if (ins_access == PRIVATE)
-               {
-                       decompile ();
-               }
-
-               return sz1 + sz2;
-       }
-}
-
-void CatOp::decompile ()
-{
-       if (ins_cache)
-       {
-               exp1->decompile ();
-               exp2->decompile ();
-               ins_cache = NULL;
-       }
-}
-
-uint32_t CloseOp::compile (const charset_t & cs, Ins * i)
-{
-       if (ins_cache)
-       {
-               return compile_goto (ins_cache, i);
-       }
-       else
-       {
-               ins_cache = i;
-
-               i->i.tag = FORK;
-               ++i;
-               i += exp->compile (cs, i);
-               i->i.tag = GOTO;
-               i->i.link = ins_cache;
-               ++i;
-               ins_cache->i.link = i;
-
-               const uint32_t sz = static_cast<uint32_t> (i - ins_cache);
-               if (ins_access == PRIVATE)
-               {
-                       decompile ();
-               }
-
-               return sz;
-       }
-}
-
-void CloseOp::decompile ()
-{
-       if (ins_cache)
-       {
-               exp->decompile ();
-               ins_cache = NULL;
-       }
-}
-
-uint32_t MatchOp::compile (const charset_t & cs, Ins * i)
-{
-       if (ins_cache)
-       {
-               return compile_goto (ins_cache, i);
-       }
-       else
-       {
-               ins_cache = i;
-
-               i->i.tag = CHAR;
-               i->i.link = &i[size];
-               Ins *j = &i[1];
-               uint32_t bump = size;
-               uint32_t k = 0;
-               for (Range *r = match; r; r = r->next ())
-               {
-                       for (; cs[k] != r->lower(); ++k);
-                       for (; cs[k] != r->upper(); ++k)
-                       {
-                               j->c.value = k;
-                               j->c.bump = --bump;
-                               j++;
-                       }
-               }
-
-               if (ins_access == PRIVATE)
-               {
-                       decompile ();
-               }
-
-               return size;
-       }
-}
-
-void MatchOp::decompile ()
-{
-       ins_cache = NULL;
-}
-
-uint32_t NullOp::compile (const charset_t &, Ins *)
-{
-       return 0;
-}
-
-void NullOp::decompile () {}
-
-uint32_t RuleOp::compile (const charset_t & cs, Ins * i)
-{
-       if (ins_cache)
-       {
-               return compile_goto (ins_cache, i);
-       }
-       else
-       {
-               ins_cache = i;
-
-               i += exp->compile (cs, &i[0]);
-               if (ctx->size)
-               {
-                       i->i.tag = CTXT;
-                       i->i.link = &i[1];
-                       ++i;
-                       i += ctx->compile (cs, &i[0]);
-               }
-               i->i.tag = TERM;
-               i->i.link = this;
-               ++i;
-               const uint32_t sz = static_cast<uint32_t> (i - ins_cache);
-
-               if (ins_access == PRIVATE)
-               {
-                       decompile ();
-               }
-
-               return sz;
-       }
-}
-
-void RuleOp::decompile ()
-{
-       if (ins_cache)
-       {
-               exp->decompile ();
-               ctx->decompile ();
-               ins_cache = NULL;
-       }
-}
-
-uint32_t compile_goto (Ins * ins, Ins * i)
-{
-       i->i.tag = GOTO;
-       i->i.link = ins;
-       return 1;
-}
-
-} // end namespace re2c
diff --git a/re2c/src/ir/bytecode/ins.cc b/re2c/src/ir/bytecode/ins.cc
deleted file mode 100644 (file)
index d153520..0000000
+++ /dev/null
@@ -1,42 +0,0 @@
-#include <iostream>
-
-#include "src/ir/bytecode/ins.h"
-#include "src/ir/regexp/regexp_rule.h"
-#include "src/ir/rule_rank.h"
-
-namespace re2c {
-
-const Ins * showIns (std::ostream & o, const Ins & i, const Ins & base)
-{
-       o.width (3);
-       o << &i - &base << ": ";
-       const Ins * ret = &(&i)[1];
-       switch (i.i.tag)
-       {
-               case CHAR:
-               {
-                       o << "match ";
-                       for (; ret < (Ins *) i.i.link; ++ret)
-                       {
-                               o << "\\x" << std::hex << ret->c.value;
-                       }
-                       break;
-               }
-               case GOTO:
-                       o << "goto " << ((Ins *) i.i.link - &base);
-                       break;
-               case FORK:
-                       o << "fork " << ((Ins *) i.i.link - &base);
-                       break;
-               case CTXT:
-                       o << "ctxt";
-                       break;
-               case TERM:
-                       o << "term " << ((RuleOp *) i.i.link)->rank;
-                       break;
-       }
-       o << "\n";
-       return ret;
-}
-
-} // namespace re2c
diff --git a/re2c/src/ir/bytecode/ins.h b/re2c/src/ir/bytecode/ins.h
deleted file mode 100644 (file)
index 4ec14fd..0000000
+++ /dev/null
@@ -1,51 +0,0 @@
-#ifndef _RE2C_IR_BYTECODE_INS_
-#define _RE2C_IR_BYTECODE_INS_
-
-#include "src/util/c99_stdint.h"
-#include <iosfwd>
-
-namespace re2c
-{
-
-static const uint32_t CHAR = 0;
-static const uint32_t GOTO = 1;
-static const uint32_t FORK = 2;
-static const uint32_t TERM = 3;
-static const uint32_t CTXT = 4;
-
-union Ins
-{
-       struct
-       {
-               uint8_t tag;
-               uint8_t marked;
-               void * link;
-       } i;
-       struct
-       {
-               uint32_t value;
-               uint32_t bump;
-               void * link;
-       } c;
-};
-
-inline bool isMarked (Ins * i)
-{
-       return i->i.marked != 0;
-}
-
-inline void mark (Ins * i)
-{
-       i->i.marked = true;
-}
-
-inline void unmark (Ins * i)
-{
-       i->i.marked = false;
-}
-
-const Ins * showIns (std::ostream & o, const Ins & i, const Ins & base);
-
-} // namespace re2c
-
-#endif // _RE2C_IR_BYTECODE_INS_
diff --git a/re2c/src/ir/compile.cc b/re2c/src/ir/compile.cc
new file mode 100644 (file)
index 0000000..914f158
--- /dev/null
@@ -0,0 +1,54 @@
+#include <algorithm>
+
+#include "src/codegen/output.h"
+#include "src/ir/compile.h"
+#include "src/ir/dfa/dfa.h"
+#include "src/ir/nfa/nfa.h"
+#include "src/ir/regexp/regexp.h"
+#include "src/parse/spec.h"
+
+namespace re2c {
+
+smart_ptr<DFA> compile (Spec & spec, Output & output, const std::string & cond, uint32_t cunits)
+{
+       RegExp * re = spec.re;
+
+       // The original set of code units (charset) might be very large.
+       // A common trick is to split the charset into disjoint character ranges
+       // and choose a representative of each range (we choose lower bound).
+       // The set of all representatives is the new (compacted) charset.
+       // Don't forget to include zero and upper bound, even if they
+       // do not explicitly appear in ranges.
+       std::set<uint32_t> bounds;
+       re->split(bounds);
+       bounds.insert(0);
+       bounds.insert(cunits);
+       charset_t cs;
+       for (std::set<uint32_t>::const_iterator i = bounds.begin(); i != bounds.end(); ++i)
+       {
+               cs.push_back(*i);
+       }
+
+       nfa_t nfa(re);
+
+       smart_ptr<DFA> dfa = make_smart_ptr (new DFA
+               ( cond
+               , output.source.get_block_line ()
+               , 0
+               , cunits
+               , cs
+               , spec.rules
+               , nfa
+               ));
+
+       // accumulate global statistics from this particular DFA
+       output.max_fill = std::max (output.max_fill, dfa->max_fill);
+       if (dfa->need_accept)
+       {
+               output.source.set_used_yyaccept ();
+       }
+
+       return dfa;
+}
+
+} // namespace re2c
similarity index 53%
rename from re2c/src/ir/bytecode/bytecode.h
rename to re2c/src/ir/compile.h
index 67f7c73130583048b75910617b97614c5ff02f54..6883c1c3f5bcb7bd6ac51e4eaafcdb929786c40f 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef _RE2C_IR_BYTECODE_BYTECODE_
-#define _RE2C_IR_BYTECODE_BYTECODE_
+#ifndef _RE2C_IR_COMPILE_
+#define _RE2C_IR_COMPILE_
 
 #include "src/util/c99_stdint.h"
 #include <string>
@@ -13,8 +13,8 @@ class DFA;
 struct Output;
 struct Spec;
 
-smart_ptr<DFA> genCode (Spec & spec, Output & output, const std::string & cond, uint32_t cunits);
+smart_ptr<DFA> compile (Spec & spec, Output & output, const std::string & cond, uint32_t cunits);
 
 } // namespace re2c
 
-#endif // _RE2C_IR_BYTECODE_BYTECODE_
+#endif // _RE2C_IR_COMPILE_
diff --git a/re2c/src/ir/dfa/dfa.cc b/re2c/src/ir/dfa/dfa.cc
index edf588cab452848a8b8c0e654c75853cf70360fc..085ed6a5c56cb9aee5ebb8bdaf32a97146472034 100644 (file)
@@ -1,40 +1,40 @@
 #include <assert.h>
-#include <string.h>
-#include <map>
+#include <ostream>
 #include <set>
+#include <string.h>
 #include <queue>
-#include <ostream>
 
 #include "src/codegen/go.h"
 #include "src/codegen/skeleton/skeleton.h"
 #include "src/ir/dfa/dfa.h"
-#include "src/ir/bytecode/ins.h"
+#include "src/ir/nfa/nfa.h"
 #include "src/ir/dfa/state.h"
 #include "src/ir/regexp/regexp_rule.h"
 #include "src/ir/rule_rank.h"
 #include "src/util/allocate.h"
+#include "src/util/range.h"
 
 namespace re2c
 {
 
-static Ins **closure(Ins **cP, Ins *i)
+static nfa_state_t **closure(nfa_state_t **cP, nfa_state_t *n)
 {
-       while (!isMarked(i))
+       if (!n->mark)
        {
-               mark(i);
-               *(cP++) = i;
-
-               if (i->i.tag == FORK)
-               {
-                       cP = closure(cP, i + 1);
-                       i = (Ins*) i->i.link;
-               }
-               else if (i->i.tag == GOTO || i->i.tag == CTXT)
+               n->mark = true;
+               *(cP++) = n;
+               switch (n->type)
                {
-                       i = (Ins*) i->i.link;
+                       case nfa_state_t::ALT:
+                               cP = closure(cP, n->value.alt.out2);
+                               cP = closure(cP, n->value.alt.out1);
+                               break;
+                       case nfa_state_t::CTX:
+                               cP = closure(cP, n->value.ctx.out);
+                               break;
+                       default:
+                               break;
                }
-               else
-                       break;
        }
 
        return cP;
@@ -43,12 +43,11 @@ static Ins **closure(Ins **cP, Ins *i)
 DFA::DFA
        ( const std::string & c
        , uint32_t l
-       , Ins * ins
-       , uint32_t ni
        , uint32_t lb
        , uint32_t ub
        , const charset_t & cs
-       , rules_t rules
+       , rules_t & rules
+       , nfa_t & nfa
        )
        : accepts ()
        , skeleton (NULL)
@@ -61,7 +60,6 @@ DFA::DFA
        , head(NULL)
        , tail(&head)
        , toDo(NULL)
-       , free_ins(ins)
 
        // statistics
        , max_fill (0)
@@ -78,11 +76,12 @@ DFA::DFA
                name += cond;
        }
 
-       Ins **work = new Ins * [ni + 1];
-       findState(work, closure(work, &ins[0]));
-
        const size_t nc = cs.size() - 1; // (n + 1) bounds for n ranges
-       void **goTo = new void*[nc];
+
+       nfa_state_t **work = new nfa_state_t* [nfa.size];
+       findState(work, closure(work, nfa.root));
+
+       std::vector<nfa_state_t*> *go = new std::vector<nfa_state_t*>[nc];
        Span *span = allocate<Span> (nc);
 
        while (toDo)
@@ -90,66 +89,84 @@ DFA::DFA
                State *s = toDo;
                toDo = s->link;
 
-               memset(goTo, 0, nc * sizeof(void*));
+               for(uint32_t i = 0; i < nc; ++i)
+               {
+                       go[i].clear();
+               }
+               memset(span, 0, sizeof(Span)*nc);
 
                s->rule = NULL;
                for (uint32_t k = 0; k < s->kCount; ++k)
                {
-                       Ins * i = s->kernel[k];
-                       if (i->i.tag == CHAR)
+                       nfa_state_t *n = s->kernel[k];
+                       switch (n->type)
                        {
-                               for (Ins *j = i + 1; j < (Ins*) i->i.link; ++j)
+//                             case nfa_state_t::CHR:
+//                                     go[n->value.chr.chr].push_back(n->value.chr.out);
+//                                     break;
+                               case nfa_state_t::RAN:
                                {
-                                       j->c.link = goTo[j->c.value];
-                                       goTo[j->c.value] = j;
-                               }
-                       }
-                       else if (i->i.tag == TERM)
-                       {
-                               RuleOp * rule = static_cast<RuleOp *> (i->i.link);
-                               if (!s->rule)
-                               {
-                                       s->rule = rule;
+                                       nfa_state_t *n2 = n->value.ran.out;
+                                       uint32_t j = 0;
+                                       for (Range *r = n->value.ran.ran; r; r = r->next ())
+                                       {
+                                               for (; cs[j] != r->lower(); ++j);
+                                               for (; cs[j] != r->upper(); ++j)
+                                               {
+                                                       go[j].push_back(n2);
+                                               }
+                                       }
+                                       break;
                                }
-                               else
+                               case nfa_state_t::CTX:
+                                       s->isPreCtxt = true;
+                                       break;
+                               case nfa_state_t::FIN:
                                {
-                                       const rule_rank_t r1 = s->rule->rank;
-                                       const rule_rank_t r2 = rule->rank;
-                                       if (r2 < r1)
+                                       RuleOp *rule = n->value.fin.rule;
+                                       if (!s->rule)
                                        {
-                                               rules[r1].shadow.insert (r2);
                                                s->rule = rule;
                                        }
-                                       else if (r1 < r2)
+                                       else
                                        {
-                                               rules[r2].shadow.insert (r1);
+                                               const rule_rank_t r1 = s->rule->rank;
+                                               const rule_rank_t r2 = rule->rank;
+                                               if (r2 < r1)
+                                               {
+                                                       rules[r1].shadow.insert (r2);
+                                                       s->rule = rule;
+                                               }
+                                               else if (r1 < r2)
+                                               {
+                                                       rules[r2].shadow.insert (r1);
+                                               }
                                        }
+                                       break;
                                }
-                       }
-                       else if (i->i.tag == CTXT)
-                       {
-                               s->isPreCtxt = true;
+                               default:
+                                       break;
                        }
                }
 
-               for (uint32_t j = 0; j < nc; ++j)
+               for(uint32_t i = 0; i < nc; ++i)
                {
-                       if (goTo[j])
+                       if(!go[i].empty())
                        {
-                               Ins **cP = work;
-                               for (Ins *i = (Ins*)goTo[j]; i; i = (Ins*) i->c.link)
+                               nfa_state_t **cP = work;
+                               for (std::vector<nfa_state_t*>::const_iterator j = go[i].begin(); j != go[i].end(); ++j)
                                {
-                                       cP = closure(cP, i + i->c.bump);
+                                       cP = closure(cP, *j);
                                }
-                               goTo[j] = findState(work, cP);
+                               span[i].to = findState(work, cP);
                        }
                }
 
                s->go.nSpans = 0;
                for (uint32_t j = 0; j < nc;)
                {
-                       State *to = (State*) goTo[j];
-                       while (++j < nc && goTo[j] == to) ;
+                       State *to = span[j].to;
+                       while (++j < nc && span[j].to == to) ;
                        span[s->go.nSpans].ub = cs[j];
                        span[s->go.nSpans].to = to;
                        s->go.nSpans++;
@@ -159,7 +176,7 @@ DFA::DFA
        }
 
        delete [] work;
-       delete [] goTo;
+       delete [] go;
        operator delete (span);
 
        /*
@@ -238,7 +255,6 @@ DFA::~DFA()
                head = s->next;
                delete s;
        }
-       delete [] free_ins;
 
        delete skeleton;
 }
@@ -253,19 +269,23 @@ void DFA::addState(State **a, State *s)
                tail = &s->next;
 }
 
-State *DFA::findState(Ins **kernel, Ins ** kernel_end)
+State *DFA::findState(nfa_state_t **kernel, nfa_state_t ** kernel_end)
 {
        uint32_t kCount = 0;
-       for (Ins ** i = kernel; i < kernel_end; ++i)
+       for (nfa_state_t ** pn = kernel; pn < kernel_end; ++pn)
        {
-               Ins * ins = *i;
-               if (ins->i.tag == CHAR || ins->i.tag == TERM || ins->i.tag == CTXT)
-               {
-                       kernel[kCount++] = ins;
-               }
-               else
+               nfa_state_t *n = *pn;
+               switch (n->type)
                {
-                       unmark (ins);
+//                     case nfa_state_t::CHR:
+                       case nfa_state_t::RAN:
+                       case nfa_state_t::CTX:
+                       case nfa_state_t::FIN:
+                               kernel[kCount++] = n;
+                               break;
+                       default:
+                               n->mark = false;
+                               break;
                }
        }
 
@@ -277,7 +297,7 @@ State *DFA::findState(Ins **kernel, Ins ** kernel_end)
                        bool marked = true;
                        for (uint32_t i = 0; marked && i < s->kCount; ++i)
                        {
-                               marked = isMarked (s->kernel[i]);
+                               marked = s->kernel[i]->mark;
                        }
                        if (marked)
                        {
@@ -291,15 +311,15 @@ State *DFA::findState(Ins **kernel, Ins ** kernel_end)
                s = new State;
                addState(tail, s);
                s->kCount = kCount;
-               s->kernel = new Ins * [kCount];
-               memcpy(s->kernel, kernel, kCount * sizeof (Ins *));
+               s->kernel = new nfa_state_t* [kCount];
+               memcpy(s->kernel, kernel, kCount * sizeof(nfa_state_t*));
                s->link = toDo;
                toDo = s;
        }
 
        for (uint32_t i = 0; i < kCount; ++i)
        {
-               unmark (kernel[i]);
+               kernel[i]->mark = false;
        }
 
        return s;
diff --git a/re2c/src/ir/dfa/dfa.h b/re2c/src/ir/dfa/dfa.h
index 9c0b886b9e01d03487615e4302aff3df55e01e9c..e32a5f3a6de978cfbe225bc86ed686f18daee106 100644 (file)
@@ -19,6 +19,8 @@ class label_t;
 struct Output;
 struct OutputFile;
 union Ins;
+struct nfa_t;
+struct nfa_state_t;
 
 class DFA
 {
@@ -36,7 +38,6 @@ public:
        State * head;
        State ** tail;
        State * toDo;
-       const Ins * free_ins;
 
        // statistics
        uint32_t max_fill;
@@ -48,19 +49,18 @@ public:
        DFA
                ( const std::string &
                , uint32_t
-               , Ins *
-               , uint32_t
                , uint32_t
                , uint32_t
                , const charset_t &
-               , rules_t
+               , rules_t &
+               , nfa_t &
                );
        ~DFA ();
        void emit (Output &, uint32_t &, bool, bool &);
 
 private:
        void addState (State **, State *);
-       State * findState (Ins **, Ins **);
+       State * findState (nfa_state_t **, nfa_state_t **);
        void reorder();
        void split (State *);
        void findSCCs ();
index 41af7dfdb12805a18dfad00c4efa95cb237b5268..4bb445af7a5a46a594da31c69dd4fd74f75da940 100644 (file)
@@ -9,6 +9,8 @@
 namespace re2c
 {
 
+struct nfa_state_t;
+
 class State
 {
 public:
@@ -18,7 +20,7 @@ public:
        State * link;
        uint32_t depth; // for finding SCCs
        uint32_t kCount;
-       Ins ** kernel;
+       nfa_state_t ** kernel;
 
        bool isPreCtxt;
        bool isBase;
diff --git a/re2c/src/ir/nfa/calc_size.cc b/re2c/src/ir/nfa/calc_size.cc
new file mode 100644 (file)
index 0000000..27c3118
--- /dev/null
@@ -0,0 +1,51 @@
+#include "src/util/c99_stdint.h"
+
+#include "src/ir/regexp/regexp.h"
+#include "src/ir/regexp/regexp_alt.h"
+#include "src/ir/regexp/regexp_cat.h"
+#include "src/ir/regexp/regexp_close.h"
+#include "src/ir/regexp/regexp_match.h"
+#include "src/ir/regexp/regexp_null.h"
+#include "src/ir/regexp/regexp_rule.h"
+#include "src/util/range.h"
+
+namespace re2c
+{
+
+uint32_t AltOp::calc_size() const
+{
+       return exp1->calc_size()
+               + exp2->calc_size()
+               + 1;
+}
+
+uint32_t CatOp::calc_size() const
+{
+       return exp1->calc_size()
+               + exp2->calc_size();
+}
+
+uint32_t CloseOp::calc_size() const
+{
+       return exp->calc_size() + 1;
+}
+
+uint32_t MatchOp::calc_size() const
+{
+       return 1;
+}
+
+uint32_t NullOp::calc_size() const
+{
+       return 0;
+}
+
+uint32_t RuleOp::calc_size() const
+{
+       const uint32_t n = ctx->calc_size();
+       return exp->calc_size()
+               + (n > 0 ? n + 1 : 0)
+               + 1;
+}
+
+} // end namespace re2c
diff --git a/re2c/src/ir/nfa/nfa.cc b/re2c/src/ir/nfa/nfa.cc
new file mode 100644 (file)
index 0000000..c7b4ea9
--- /dev/null
@@ -0,0 +1,188 @@
+#include "src/ir/nfa/nfa.h"
+#include "src/ir/regexp/regexp.h"
+#include "src/ir/regexp/regexp_alt.h"
+#include "src/ir/regexp/regexp_cat.h"
+#include "src/ir/regexp/regexp_close.h"
+#include "src/ir/regexp/regexp_match.h"
+#include "src/ir/regexp/regexp_null.h"
+#include "src/ir/regexp/regexp_rule.h"
+
+namespace re2c {
+
+nfa_t::nfa_t(RegExp *re)
+       : max_size(re->calc_size())
+       , size(0)
+       , states(new nfa_state_t[max_size])
+       , root(re->compile(*this, NULL))
+{}
+
+nfa_t::~nfa_t()
+{
+       delete[] states;
+}
+
+nfa_state_t *AltOp::compile(nfa_t &nfa, nfa_state_t *t)
+{
+       if (ins_cache)
+       {
+               return ins_cache;
+       }
+       else
+       {
+               nfa_state_t *s = &nfa.states[nfa.size++];
+               s->alt(exp1->compile(nfa, t)
+                       , exp2->compile(nfa, t));
+
+               ins_cache = s;
+               if (ins_access == PRIVATE)
+               {
+                       decompile();
+               }
+
+               return s;
+       }
+}
+
+void AltOp::decompile ()
+{
+       if (ins_cache)
+       {
+               exp1->decompile ();
+               exp2->decompile ();
+               ins_cache = NULL;
+       }
+}
+
+nfa_state_t *CatOp::compile(nfa_t &nfa, nfa_state_t *t)
+{
+       if (ins_cache)
+       {
+               return ins_cache;
+       }
+       else
+       {
+               nfa_state_t *s2 = exp2->compile(nfa, t);
+               nfa_state_t *s1 = exp1->compile(nfa, s2);
+
+               ins_cache = s1;
+               if (ins_access == PRIVATE)
+               {
+                       decompile();
+               }
+
+               return s1;
+       }
+}
+
+void CatOp::decompile ()
+{
+       if (ins_cache)
+       {
+               exp1->decompile ();
+               exp2->decompile ();
+               ins_cache = NULL;
+       }
+}
+
+nfa_state_t *CloseOp::compile(nfa_t &nfa, nfa_state_t *t)
+{
+       if (ins_cache)
+       {
+               return ins_cache;
+       }
+       else
+       {
+               nfa_state_t *s = &nfa.states[nfa.size++];
+               s->alt(t, exp->compile(nfa, s));
+
+               ins_cache = s;
+               if (ins_access == PRIVATE)
+               {
+                       decompile();
+               }
+
+               return s;
+       }
+}
+
+void CloseOp::decompile ()
+{
+       if (ins_cache)
+       {
+               exp->decompile ();
+               ins_cache = NULL;
+       }
+}
+
+nfa_state_t *MatchOp::compile(nfa_t &nfa, nfa_state_t *t)
+{
+       if (ins_cache)
+       {
+               return ins_cache;
+       }
+       else
+       {
+               nfa_state_t *s = &nfa.states[nfa.size++];
+               s->ran(t, match);
+
+               ins_cache = s;
+               if (ins_access == PRIVATE)
+               {
+                       decompile();
+               }
+
+               return s;
+       }
+}
+
+void MatchOp::decompile ()
+{
+       ins_cache = NULL;
+}
+
+nfa_state_t *NullOp::compile(nfa_t &, nfa_state_t *t)
+{
+       return t;
+}
+
+void NullOp::decompile () {}
+
+nfa_state_t *RuleOp::compile(nfa_t &nfa, nfa_state_t *)
+{
+       if (ins_cache)
+       {
+               return ins_cache;
+       }
+       else
+       {
+               nfa_state_t *s3 = &nfa.states[nfa.size++];
+               s3->fin(this);
+               if (ctx->calc_size() > 0)
+               {
+                       nfa_state_t *s2 = &nfa.states[nfa.size++];
+                       s2->ctx(ctx->compile(nfa, s3));
+                       s3 = s2;
+               }
+               nfa_state_t *s1 = exp->compile(nfa, s3);
+
+               ins_cache = s1;
+               if (ins_access == PRIVATE)
+               {
+                       decompile();
+               }
+
+               return s1;
+       }
+}
+
+void RuleOp::decompile ()
+{
+       if (ins_cache)
+       {
+               exp->decompile ();
+               ctx->decompile ();
+               ins_cache = NULL;
+       }
+}
+
+} // namespace re2c
diff --git a/re2c/src/ir/nfa/nfa.h b/re2c/src/ir/nfa/nfa.h
new file mode 100644 (file)
index 0000000..969012f
--- /dev/null
@@ -0,0 +1,105 @@
+#ifndef _RE2C_IR_NFA_NFA_
+#define _RE2C_IR_NFA_NFA_
+
+#include "src/util/c99_stdint.h"
+#include <vector>
+#include <set>
+
+#include "src/util/forbid_copy.h"
+
+namespace re2c
+{
+
+struct Range;
+struct RegExp;
+struct RuleOp;
+
+struct nfa_state_t
+{
+       enum type_t
+       {
+               ALT,
+//             CHR,
+               RAN,
+               CTX,
+               FIN
+       } type;
+       union
+       {
+               struct
+               {
+                       nfa_state_t *out1;
+                       nfa_state_t *out2;
+               } alt;
+//             struct
+//             {
+//                     nfa_state_t *out;
+//                     uint32_t chr;
+//             } chr;
+               struct
+               {
+                       nfa_state_t *out;
+                       Range *ran;
+               } ran;
+               struct
+               {
+                       nfa_state_t *out;
+               } ctx;
+               struct
+               {
+                       RuleOp *rule;
+               } fin;
+       } value;
+       bool mark;
+
+       void alt(nfa_state_t *s1, nfa_state_t *s2)
+       {
+               type = ALT;
+               value.alt.out1 = s1;
+               value.alt.out2 = s2;
+               mark = false;
+       }
+//     void chr(nfa_state_t *s, uint32_t c)
+//     {
+//             type = CHR;
+//             value.chr.out = s;
+//             value.chr.chr = c;
+//             mark = false;
+//     }
+       void ran(nfa_state_t *s, Range *r)
+       {
+               type = RAN;
+               value.ran.out = s;
+               value.ran.ran = r;
+               mark = false;
+       }
+       void ctx(nfa_state_t *s)
+       {
+               type = CTX;
+               value.ctx.out = s;
+               mark = false;
+       }
+       void fin(RuleOp *r)
+       {
+               type = FIN;
+               value.fin.rule = r;
+               mark = false;
+       }
+};
+
+struct nfa_t
+{
+       const uint32_t max_size;
+       uint32_t size;
+       nfa_state_t *states;
+       nfa_state_t *root;
+
+       nfa_t(RegExp *re);
+       ~nfa_t();
+
+       FORBID_COPY(nfa_t);
+};
+
+} // namespace re2c
+
+#endif // _RE2C_IR_NFA_NFA_
index 05828feabb37835734f2abb0d6a39f0193912a52..a37ca95b1e344e169a09550d0658d030c5bafd9b 100644 (file)
@@ -12,7 +12,8 @@
 namespace re2c
 {
 
-union Ins;
+struct nfa_state_t;
+struct nfa_t;
 
 typedef std::vector<uint32_t> charset_t;
 
@@ -21,7 +22,6 @@ class RegExp
 public:
        static free_list <RegExp *> vFreeList;
 
-       uint32_t size;
        /*
         * There're several different cases when the same regexp
         * can be used multiple times:
@@ -40,7 +40,7 @@ public:
         * [^]{3} in UTF-8 mode, each of sub-regexps [^] will have common suffix
         * [\x80-\xBF] factored out, but they won't share instructions.
         */
-       Ins * ins_cache; /* if non-NULL, points to compiled instructions */
+       nfa_state_t *ins_cache; /* if non-NULL, points to compiled instructions */
        enum InsAccess
        {
                SHARED,
@@ -48,8 +48,7 @@ public:
        } ins_access;
 
        inline RegExp ()
-               : size (0)
-               , ins_cache (NULL)
+               : ins_cache (NULL)
                , ins_access (SHARED)
        {
                vFreeList.insert (this);
@@ -59,9 +58,9 @@ public:
                vFreeList.erase (this);
        }
        virtual void split (std::set<uint32_t> &) = 0;
-       virtual void calcSize (const charset_t &) = 0;
+       virtual uint32_t calc_size() const = 0;
        virtual uint32_t fixedLength ();
-       virtual uint32_t compile (const charset_t &, Ins *) = 0;
+       virtual nfa_state_t *compile(nfa_t &nfa, nfa_state_t *n) = 0;
        virtual void decompile () = 0;
        virtual void display (std::ostream &) const = 0;
        friend std::ostream & operator << (std::ostream & o, const RegExp & re);
index 90e2ecc6f0e91085665844b5748a5000dd85a857..5dc1605a27aab2e3e310f629e88e482814851579 100644 (file)
@@ -17,9 +17,9 @@ public:
                , exp2 (e2)
        {}
        void split (std::set<uint32_t> &);
-       void calcSize (const charset_t &);
+       uint32_t calc_size() const;
        uint32_t fixedLength ();
-       uint32_t compile (const charset_t &, Ins *);
+       nfa_state_t *compile(nfa_t &nfa, nfa_state_t *n);
        void decompile ();
        void display (std::ostream & o) const;
        friend RegExp * mkAlt (RegExp *, RegExp *);
index d72f8ece7b9a8e77b4e5331382a0a5afe8910f06..aa872c6732647ae237d819489287bd57f2665a75 100644 (file)
@@ -17,9 +17,9 @@ public:
                , exp2 (e2)
        {}
        void split (std::set<uint32_t> &);
-       void calcSize (const charset_t &);
+       uint32_t calc_size() const;
        uint32_t fixedLength ();
-       uint32_t compile (const charset_t &, Ins *);
+       nfa_state_t *compile(nfa_t &nfa, nfa_state_t *n);
        void decompile ();
        void display (std::ostream & o) const;
 
index aa323c6527d70d9f6e12737b1d67cfa52ee54424..40496e3d6a47ef41b84152635a16bf550a624624 100644 (file)
@@ -15,8 +15,8 @@ public:
                : exp (e)
        {}
        void split (std::set<uint32_t> &);
-       void calcSize (const charset_t &);
-       uint32_t compile (const charset_t &, Ins *);
+       uint32_t calc_size() const;
+       nfa_state_t *compile(nfa_t &nfa, nfa_state_t *n);
        void decompile ();
        void display (std::ostream & o) const;
 
index fab57dc6bed69969e00e9a2ef56ef4a41e253cc5..cd8cc1247b6875aad96d398ebe601e6004b75733 100644 (file)
@@ -16,9 +16,9 @@ public:
                : match (m)
        {}
        void split (std::set<uint32_t> &);
-       void calcSize (const charset_t &);
+       uint32_t calc_size() const;
        uint32_t fixedLength ();
-       uint32_t compile (const charset_t &, Ins *);
+       nfa_state_t *compile(nfa_t &nfa, nfa_state_t *n);
        void decompile ();
        void display (std::ostream & o) const;
 
index f9a97a615ff8d5b0d02a7cecceb4d77215e918e9..ed079f3a355d96b90de1f51e61e204514babab96 100644 (file)
@@ -10,9 +10,9 @@ class NullOp: public RegExp
 {
 public:
        void split (std::set<uint32_t> &);
-       void calcSize (const charset_t &);
+       uint32_t calc_size() const;
        uint32_t fixedLength ();
-       uint32_t compile (const charset_t &, Ins *);
+       nfa_state_t *compile(nfa_t &nfa, nfa_state_t *n);
        void decompile ();
        void display (std::ostream & o) const;
 };
index 1bb4b51fe50a1c4ad9eda06099c13698c392afd2..91c88e482420e9fbfcb3adfd613ae746fe2d64d0 100644 (file)
@@ -20,7 +20,6 @@ private:
 
 public:
        RegExp * ctx;
-       Ins * ins;
        rule_rank_t rank;
        const Code * code;
        const std::string newcond;
@@ -37,7 +36,6 @@ public:
                : loc (l)
                , exp (r1)
                , ctx (r2)
-               , ins (NULL)
                , rank (r)
                , code (c)
                , newcond (cond ? *cond : "")
@@ -46,8 +44,8 @@ public:
        }
        void display (std::ostream & o) const;
        void split (std::set<uint32_t> &);
-       void calcSize (const charset_t &);
-       uint32_t compile (const charset_t &, Ins *);
+       uint32_t calc_size() const;
+       nfa_state_t *compile(nfa_t &nfa, nfa_state_t *n);
        void decompile ();
 
        FORBID_COPY (RuleOp);
index 96a37a9571c3a23b7640d917767afc7fb737a587..cf009ea7a4b5f7cad73cdfa0fa46864638b052e5 100644 (file)
@@ -16,7 +16,7 @@
 #include "src/codegen/skeleton/skeleton.h"
 #include "src/conf/opt.h"
 #include "src/globals.h"
-#include "src/ir/bytecode/bytecode.h"
+#include "src/ir/compile.h"
 #include "src/ir/dfa/dfa.h"
 #include "src/ir/regexp/encoding/enc.h"
 #include "src/ir/regexp/encoding/range_suffix.h"
@@ -718,7 +718,7 @@ void parse(Scanner& i, Output & o)
                                                }
                                        }
 
-                                       dfa_map[it->first] = genCode(it->second, o, it->first, opts->encoding.nCodeUnits ());
+                                       dfa_map[it->first] = compile(it->second, o, it->first, opts->encoding.nCodeUnits ());
                                }
                                if (parseMode != Scanner::Rules && dfa_map.find(it->first) != dfa_map.end())
                                {
@@ -732,7 +732,7 @@ void parse(Scanner& i, Output & o)
                        {
                                if (parseMode != Scanner::Reuse)
                                {
-                                       dfa_map[""] = genCode(spec, o, "", opts->encoding.nCodeUnits ());
+                                       dfa_map[""] = compile(spec, o, "", opts->encoding.nCodeUnits ());
                                }
                                if (parseMode != Scanner::Rules && dfa_map.find("") != dfa_map.end())
                                {