Restructured sources layout, improved formatting.

author Ulya Trofimovich <skvadrik@gmail.com>

Wed, 10 Jun 2015 16:30:15 +0000 (17:30 +0100)

committer Ulya Trofimovich <skvadrik@gmail.com>

Wed, 10 Jun 2015 16:30:15 +0000 (17:30 +0100)
author Ulya Trofimovich <skvadrik@gmail.com>
Wed, 10 Jun 2015 16:30:15 +0000 (17:30 +0100)
committer Ulya Trofimovich <skvadrik@gmail.com>
Wed, 10 Jun 2015 16:30:15 +0000 (17:30 +0100)
diff --git a/re2c/Makefile.am b/re2c/Makefile.am

index 0f76783526e7dd280902a33d0f43003bfb965a9a..59122534daa1adc456cd9b6a159797b21927d7a5 100644 (file)
--- a/re2c/Makefile.am
+++ b/re2c/Makefile.am
@@ -22,22 +22,32 @@ SRC_HDR = \
         src/codegen/scc.h \
         src/codegen/skeleton/path.h \
         src/codegen/skeleton/skeleton.h \
-       src/dfa/action.h \
-       src/dfa/encoding/enc.h \
-       src/dfa/encoding/range_suffix.h \
-       src/dfa/encoding/utf16/utf16.h \
-       src/dfa/encoding/utf16/utf16_range.h \
-       src/dfa/encoding/utf16/utf16_regexp.h \
-       src/dfa/encoding/utf8/utf8.h \
-       src/dfa/encoding/utf8/utf8_range.h \
-       src/dfa/encoding/utf8/utf8_regexp.h \
-       src/dfa/dfa.h \
-       src/dfa/ins.h \
-       src/dfa/re.h \
-       src/dfa/rule_rank.h \
-       src/dfa/state.h \
+       src/ir/bytecode/charset.h \
+       src/ir/bytecode/bytecode.h \
+       src/ir/bytecode/ins.h \
+       src/ir/dfa/state.h \
+       src/ir/dfa/dfa.h \
+       src/ir/dfa/action.h \
+       src/ir/regexp/regexp_alt.h \
+       src/ir/regexp/regexp_match.h \
+       src/ir/regexp/encoding/enc.h \
+       src/ir/regexp/encoding/range_suffix.h \
+       src/ir/regexp/encoding/utf8/utf8.h \
+       src/ir/regexp/encoding/utf8/utf8_regexp.h \
+       src/ir/regexp/encoding/utf8/utf8_range.h \
+       src/ir/regexp/encoding/utf16/utf16_range.h \
+       src/ir/regexp/encoding/utf16/utf16_regexp.h \
+       src/ir/regexp/encoding/utf16/utf16.h \
+       src/ir/regexp/regexp_rule.h \
+       src/ir/regexp/regexp_cat.h \
+       src/ir/regexp/regexp_closev.h \
+       src/ir/regexp/regexp_null.h \
+       src/ir/regexp/regexp.h \
+       src/ir/regexp/regexp_close.h \
+       src/ir/rule_rank.h \
         src/globals.h \
         src/mbo_getopt.h \
+       src/parse/extop.h \
         src/parse/input.h \
         src/parse/parser.h \
         src/parse/scanner.h \
@@ -70,21 +80,30 @@ SRC = \
         src/codegen/scc.cc \
         src/codegen/skeleton/path.cc \
         src/codegen/skeleton/skeleton.cc \
-       src/dfa/actions.cc \
-       src/dfa/encoding/enc.cc \
-       src/dfa/encoding/range_suffix.cc \
-       src/dfa/encoding/utf16/utf16.cc \
-       src/dfa/encoding/utf16/utf16_range.cc \
-       src/dfa/encoding/utf16/utf16_regexp.cc \
-       src/dfa/encoding/utf8/utf8.cc \
-       src/dfa/encoding/utf8/utf8_range.cc \
-       src/dfa/encoding/utf8/utf8_regexp.cc \
-       src/dfa/dfa.cc \
-       src/dfa/rule_rank.cc \
+       src/ir/bytecode/bytecode.cc \
+       src/ir/bytecode/ins.cc \
+       src/ir/bytecode/charset.cc \
+       src/ir/bytecode/split.cc \
+       src/ir/bytecode/compile.cc \
+       src/ir/bytecode/calc_size.cc \
+       src/ir/dfa/dfa.cc \
+       src/ir/regexp/display.cc \
+       src/ir/regexp/encoding/enc.cc \
+       src/ir/regexp/encoding/range_suffix.cc \
+       src/ir/regexp/encoding/utf8/utf8_regexp.cc \
+       src/ir/regexp/encoding/utf8/utf8_range.cc \
+       src/ir/regexp/encoding/utf8/utf8.cc \
+       src/ir/regexp/encoding/utf16/utf16_regexp.cc \
+       src/ir/regexp/encoding/utf16/utf16.cc \
+       src/ir/regexp/encoding/utf16/utf16_range.cc \
+       src/ir/regexp/fixed_length.cc \
+       src/ir/regexp/regexp.cc \
+       src/ir/rule_rank.cc \
         src/main.cc \
         src/mbo_getopt.cc \
         src/parse/input.cc \
         src/parse/scanner.cc \
+       src/parse/unescape.cc \
         src/util/range.cc
  re2c_SOURCES = \
         $(SRC_HDR) \
diff --git a/re2c/bootstrap/scanner_lex.cc b/re2c/bootstrap/scanner_lex.cc

index 469393ad1dff343b6e9a0a46ceaa7a965335c950..48160f727628e909b46ec649cd21737fea39fddc 100644 (file)
--- a/re2c/bootstrap/scanner_lex.cc
+++ b/re2c/bootstrap/scanner_lex.cc
@@ -1,11 +1,12 @@
-/* Generated by re2c 0.14.1.dev on Thu May 14 17:19:41 2015*/
+/* Generated by re2c 0.14.2 on Wed Jun 10 17:27:57 2015 */
  #include <stdlib.h>
  #include <string.h>
  #include <iostream>
  #include <sstream>
  
-#include "src/dfa/dfa.h"
+#include "src/ir/dfa/dfa.h"
  #include "src/globals.h"
+#include "src/parse/extop.h"
  #include "src/parse/parser.h"
  #include "src/parse/scanner.h"
  #include "y.tab.h"
diff --git a/re2c/src/codegen/emit.h b/re2c/src/codegen/emit.h

index c616175eed4efcd32f25ba7ad1f8a040f81e6afc..5c78ef18056dec8b0461ae4108c3667d12f09d4b 100644 (file)
--- a/re2c/src/codegen/emit.h
+++ b/re2c/src/codegen/emit.h
@@ -2,11 +2,10 @@
  #define _RE2C_CODEGEN_EMIT_
  
  #include "src/codegen/output.h"
-#include "src/dfa/dfa.h"
+#include "src/ir/dfa/dfa.h"
  
  namespace re2c {
  
-class DFA;
  typedef std::vector<std::string> RegExpIndices;
  
  void emit_action
diff --git a/re2c/src/codegen/emit_action.cc b/re2c/src/codegen/emit_action.cc

index 87f33ad82e51561c34fc3b0d25942c644d1afbf9..e5d75afd1324480efea78c2ae8f43559e8f05292 100644 (file)
--- a/re2c/src/codegen/emit_action.cc
+++ b/re2c/src/codegen/emit_action.cc
@@ -1,8 +1,8 @@
  #include "src/codegen/emit.h"
  #include "src/codegen/indent.h"
  #include "src/codegen/input_api.h"
-#include "src/codegen/skeleton/skeleton.h"
-#include "src/dfa/action.h"
+#include "src/ir/dfa/action.h"
+#include "src/ir/regexp/regexp_rule.h"
  
  namespace re2c
  {
diff --git a/re2c/src/codegen/emit_dfa.cc b/re2c/src/codegen/emit_dfa.cc

index 147ce6dad2fef7bb11b96849a19791e225e64a61..11653a34dc81eebb65eb2210a5d129e1d99916b8 100644 (file)
--- a/re2c/src/codegen/emit_dfa.cc
+++ b/re2c/src/codegen/emit_dfa.cc
@@ -6,7 +6,6 @@
  #include "src/codegen/indent.h"
  #include "src/codegen/input_api.h"
  #include "src/codegen/skeleton/skeleton.h"
-#include "src/dfa/dfa.h"
  
  namespace re2c
  {
diff --git a/re2c/src/codegen/go_construct.cc b/re2c/src/codegen/go_construct.cc

index 321f334494dd9bae40dd0f4d16c4c09bfd8c7c34..398f4a80f216233f8509640b7095072327e604f1 100644 (file)
--- a/re2c/src/codegen/go_construct.cc
+++ b/re2c/src/codegen/go_construct.cc
@@ -1,6 +1,6 @@
  #include "src/codegen/bitmap.h"
  #include "src/codegen/go.h"
-#include "src/dfa/dfa.h"
+#include "src/ir/dfa/dfa.h"
  #include "src/util/allocate.h"
  
  namespace re2c
diff --git a/re2c/src/codegen/go_destruct.cc b/re2c/src/codegen/go_destruct.cc

index e53cedfb59486865d4b042af4e9bb3f654f0597a..041945d80ce63c300e2cd96cbe60b2c76f48c23e 100644 (file)
--- a/re2c/src/codegen/go_destruct.cc
+++ b/re2c/src/codegen/go_destruct.cc
@@ -1,5 +1,5 @@
  #include "src/codegen/go.h"
-#include "src/dfa/dfa.h"
+#include "src/ir/dfa/dfa.h"
  
  namespace re2c
  {
diff --git a/re2c/src/codegen/go_emit.cc b/re2c/src/codegen/go_emit.cc

index 5da68ad6a322507c4ea90780f818729d725992c8..9c5d33c64ed5b4ad9280ac8d25b20636bc34dd2c 100644 (file)
--- a/re2c/src/codegen/go_emit.cc
+++ b/re2c/src/codegen/go_emit.cc
@@ -2,7 +2,7 @@
  #include "src/codegen/go.h"
  #include "src/codegen/indent.h"
  #include "src/codegen/print.h"
-#include "src/dfa/dfa.h"
+#include "src/ir/dfa/dfa.h"
  
  namespace re2c
  {
diff --git a/re2c/src/codegen/go_used_labels.cc b/re2c/src/codegen/go_used_labels.cc

index d17ab92ea45011c2d08b28c6d7b85ddde9c4c8a7..850f270af7d1bb306ce7723699103b2ade2a497f 100644 (file)
--- a/re2c/src/codegen/go_used_labels.cc
+++ b/re2c/src/codegen/go_used_labels.cc
@@ -1,5 +1,5 @@
  #include "src/codegen/go.h"
-#include "src/dfa/dfa.h"
+#include "src/ir/dfa/dfa.h"
  
  namespace re2c
  {
diff --git a/re2c/src/codegen/output.h b/re2c/src/codegen/output.h

index 4daacf2503776151917329fc503a264c8a6dd717..4890a7f4649ccc95b0055b71e8375c95a89af8fd 100644 (file)
--- a/re2c/src/codegen/output.h
+++ b/re2c/src/codegen/output.h
@@ -7,7 +7,7 @@
  #include <vector>
  
  #include "src/codegen/label.h"
-#include "src/dfa/rule_rank.h"
+#include "src/ir/rule_rank.h"
  #include "src/util/c99_stdint.h"
  #include "src/util/forbid_copy.h"
  
diff --git a/re2c/src/codegen/prepare_dfa.cc b/re2c/src/codegen/prepare_dfa.cc

index c5ef6255d13e0998d02f26c1bfbfe55d76643d2a..5764043082fd49c006294a69743f4b945aa906f1 100644 (file)
--- a/re2c/src/codegen/prepare_dfa.cc
+++ b/re2c/src/codegen/prepare_dfa.cc
@@ -2,7 +2,8 @@
  
  #include "src/codegen/bitmap.h"
  #include "src/codegen/scc.h"
-#include "src/dfa/dfa.h"
+#include "src/ir/dfa/dfa.h"
+#include "src/ir/regexp/regexp_rule.h"
  #include "src/util/allocate.h"
  
  namespace re2c {
diff --git a/re2c/src/codegen/scc.cc b/re2c/src/codegen/scc.cc

index ba79dbb02c43f3a5b4b9e1195806f8820ac5f2e8..243497a8ebbf7e2beb8173b225314b0c00435657 100644 (file)
--- a/re2c/src/codegen/scc.cc
+++ b/re2c/src/codegen/scc.cc
@@ -1,5 +1,5 @@
  #include "src/codegen/scc.h"
-#include "src/dfa/dfa.h"
+#include "src/ir/dfa/dfa.h"
  
  namespace re2c {
  
diff --git a/re2c/src/codegen/skeleton/path.h b/re2c/src/codegen/skeleton/path.h

index 097e2c9f282468ce9fa76c623ea6d3cd7fb80aae..b24e20a048ccafc4c81c9abd11aeeefb92e3e491 100644 (file)
--- a/re2c/src/codegen/skeleton/path.h
+++ b/re2c/src/codegen/skeleton/path.h
@@ -3,7 +3,7 @@
  
  #include <vector>
  
-#include "src/dfa/rule_rank.h"
+#include "src/ir/rule_rank.h"
  #include "src/util/c99_stdint.h"
  
  namespace re2c
diff --git a/re2c/src/codegen/skeleton/skeleton.cc b/re2c/src/codegen/skeleton/skeleton.cc

index ca243c86d4ad1f3ee2894d4d2ca191f673bfd06f..172754989f6bb45b2d6596f10fcbb0ed1b9b8716 100644 (file)
--- a/re2c/src/codegen/skeleton/skeleton.cc
+++ b/re2c/src/codegen/skeleton/skeleton.cc
@@ -1,6 +1,7 @@
  #include "src/codegen/indent.h"
  #include "src/codegen/print.h"
  #include "src/codegen/skeleton/skeleton.h"
+#include "src/ir/regexp/regexp_rule.h"
  #include "src/util/allocate.h"
  
  namespace re2c
diff --git a/re2c/src/codegen/skeleton/skeleton.h b/re2c/src/codegen/skeleton/skeleton.h

index 41af35d36e0e777c20b90b67f2d810953c57d745..aebaee10ad5c45ae5516b199fa4b3e9d6ff928f8 100644 (file)
--- a/re2c/src/codegen/skeleton/skeleton.h
+++ b/re2c/src/codegen/skeleton/skeleton.h
@@ -4,7 +4,7 @@
  #include <map>
  
  #include "src/codegen/skeleton/path.h"
-#include "src/dfa/dfa.h"
+#include "src/ir/dfa/dfa.h"
  #include "src/util/c99_stdint.h"
  #include "src/util/forbid_copy.h"
  #include "src/util/local_increment.h"
diff --git a/re2c/src/dfa/actions.cc b/re2c/src/dfa/actions.cc

deleted file mode 100644 (file)

index 01a7ee0..0000000
--- a/re2c/src/dfa/actions.cc
+++ /dev/null
@@ -1,1073 +0,0 @@
-#include <time.h>
-#include <string.h>
-#include <iostream>
-#include <iomanip>
-#include <cctype>
-
-#include "src/codegen/skeleton/skeleton.h"
-#include "src/dfa/dfa.h"
-#include "src/dfa/encoding/utf16/utf16_regexp.h"
-#include "src/dfa/encoding/utf8/utf8_regexp.h"
-#include "src/globals.h"
-#include "src/parse/scanner.h"
-#include "src/util/allocate.h"
-
-namespace re2c
-{
-
-const Ins* showIns(std::ostream &o, const Ins &i, const Ins &base)
-{
-       o.width(3);
-       o << &i - &base << ": ";
-
-       const Ins *ret = &(&i)[1];
-
-       switch (i.i.tag)
-       {
-
-               case CHAR:
-               {
-                       o << "match ";
-
-                       for (; ret < (Ins*) i.i.link; ++ret)
-                       {
-                               o << "\\x" << std::hex << ret->c.value;
-                       }
-
-                       break;
-               }
-
-               case GOTO:
-               o << "goto " << ((Ins*) i.i.link - &base);
-               break;
-
-               case FORK:
-               o << "fork " << ((Ins*) i.i.link - &base);
-               break;
-
-               case CTXT:
-               o << "ctxt";
-               break;
-
-               case TERM:
-               o << "term " << ((RuleOp*) i.i.link)->rank;
-               break;
-       }
-
-       o << "\n";
-       return ret;
-}
-
-uint32_t RegExp::fixedLength()
-{
-       return ~0u;
-}
-
-uint32_t compile_goto(Ins *ins, Ins *i)
-{
-       i->i.tag = GOTO;
-       i->i.link = ins;
-       return 1;
-}
-
-void NullOp::calcSize(Char*)
-{
-       size = 0;
-}
-
-uint32_t NullOp::fixedLength()
-{
-       return 0;
-}
-
-uint32_t NullOp::compile(Char*, Ins*)
-{
-       return 0;
-}
-
-void NullOp::decompile()
-{
-       ;
-}
-
-void NullOp::split(CharSet&)
-{
-       ;
-}
-
-MatchOp *merge(MatchOp *m1, MatchOp *m2)
-{
-       if (!m1)
-               return m2;
-
-       if (!m2)
-               return m1;
-
-       MatchOp* m = new MatchOp(doUnion(m1->match, m2->match));
-       if (m1->ins_access == RegExp::PRIVATE
-               || m2->ins_access == RegExp::PRIVATE)
-               m->ins_access = RegExp::PRIVATE;
-
-       return m;
-}
-
-void MatchOp::display(std::ostream &o) const
-{
-       o << match;
-}
-
-void MatchOp::calcSize(Char *rep)
-{
-       size = 1;
-
-       for (Range *r = match; r; r = r->next)
-               for (uint32_t c = r->lb; c < r->ub; ++c)
-                       if (rep[c] == c)
-                               ++size;
-}
-
-uint32_t MatchOp::fixedLength()
-{
-       return 1;
-}
-
-uint32_t MatchOp::compile(Char *rep, Ins *i)
-{
-       if (ins_cache)
-               return compile_goto(ins_cache, i);
-       else
-       {
-               ins_cache = i;
-
-               i->i.tag = CHAR;
-               i->i.link = &i[size];
-               Ins *j = &i[1];
-               uint32_t bump = size;
-
-               for (Range *r = match; r; r = r->next)
-               {
-                       for (uint32_t c = r->lb; c < r->ub; ++c)
-                       {
-                               if (rep[c] == c)
-                               {
-                                       j->c.value = c;
-                                       j->c.bump = --bump;
-                                       j++;
-                               }
-                       }
-               }
-
-               if (ins_access == PRIVATE)
-                       decompile();
-
-               return size;
-       }
-}
-
-void MatchOp::decompile()
-{
-       ins_cache = NULL;
-}
-
-void MatchOp::split(CharSet &s)
-{
-       for (Range *r = match; r; r = r->next)
-       {
-               for (uint32_t c = r->lb; c < r->ub; ++c)
-               {
-                       CharPtn *x = s.rep[c], *a = x->nxt;
-
-                       if (!a)
-                       {
-                               if (x->card == 1)
-                                       continue;
-
-                               x->nxt = a = s.freeHead;
-
-                               if (!(s.freeHead = s.freeHead->nxt))
-                                       s.freeTail = &s.freeHead;
-
-                               a->nxt = NULL;
-
-                               x->fix = s.fix;
-
-                               s.fix = x;
-                       }
-
-                       if (--(x->card) == 0)
-                       {
-                               *s.freeTail = x;
-                               *(s.freeTail = &x->nxt) = NULL;
-                       }
-
-                       s.rep[c] = a;
-                       ++(a->card);
-               }
-       }
-
-       for (; s.fix; s.fix = s.fix->fix)
-               if (s.fix->card)
-                       s.fix->nxt = NULL;
-}
-
-RegExp * mkDiff(RegExp *e1, RegExp *e2)
-{
-       MatchOp * m1 = dynamic_cast<MatchOp*>(e1);
-       MatchOp * m2 = dynamic_cast<MatchOp*>(e2);
-       if (m1 == NULL || m2 == NULL)
-               return NULL;
-
-       Range *r = doDiff(m1->match, m2->match);
-
-       return r ? (RegExp*) new MatchOp(r) : (RegExp*) new NullOp;
-}
-
-RegExp *doAlt(RegExp *e1, RegExp *e2)
-{
-       if (!e1)
-               return e2;
-
-       if (!e2)
-               return e1;
-
-       return new AltOp(e1, e2);
-}
-
-RegExp *doCat(RegExp *e1, RegExp *e2)
-{
-       if (!e1)
-               return e2;
-
-       if (!e2)
-               return e1;
-
-       return new CatOp(e1, e2);
-}
-
-RegExp *mkAlt(RegExp *e1, RegExp *e2)
-{
-       AltOp *a;
-       MatchOp *m1, *m2;
-
-       a = dynamic_cast<AltOp*>(e1);
-       if (a != NULL)
-       {
-               m1 = dynamic_cast<MatchOp*>(a->exp1);
-               if (m1 != NULL)
-               {
-                       m1->ins_access = e1->ins_access;
-                       a->exp2->ins_access = e1->ins_access;
-                       e1 = a->exp2;
-               }
-       }
-       else
-       {
-               m1 = dynamic_cast<MatchOp*>(e1);
-               if (m1 != NULL)
-                       e1 = NULL;
-       }
-
-       a = dynamic_cast<AltOp*>(e2);
-       if (a != NULL)
-       {
-               m2 = dynamic_cast<MatchOp*>(a->exp1);
-               if (m2 != NULL)
-               {
-                       m2->ins_access = e2->ins_access;
-                       a->exp2->ins_access = e2->ins_access;
-                       e2 = a->exp2;
-               }
-       }
-       else
-       {
-               m2 = dynamic_cast<MatchOp*>(e2);
-               if (m2 != NULL)
-                       e2 = NULL;
-       }
-
-       return doAlt(merge(m1, m2), doAlt(e1, e2));
-}
-
-void AltOp::calcSize(Char *rep)
-{
-       exp1->calcSize(rep);
-       exp2->calcSize(rep);
-       size = exp1->size + exp2->size + 2;
-}
-
-uint32_t AltOp::fixedLength()
-{
-       uint32_t l1 = exp1->fixedLength();
-       uint32_t l2 = exp1->fixedLength();
-
-       if (l1 != l2 || l1 == ~0u)
-               return ~0u;
-
-       return l1;
-}
-
-uint32_t AltOp::compile(Char *rep, Ins *i)
-{
-       if (ins_cache)
-               return compile_goto(ins_cache, i);
-       else
-       {
-               ins_cache = i;
-
-               i->i.tag = FORK;
-               const uint32_t sz1 = exp1->compile(rep, &i[1]);
-               Ins * const j = &i[sz1 + 1];
-               i->i.link = &j[1];
-               j->i.tag = GOTO;
-               const uint32_t sz2 = exp2->compile(rep, &j[1]);
-               j->i.link = &j[sz2 + 1];
-
-               if (ins_access == PRIVATE)
-                       decompile();
-
-               return sz1 + sz2 + 2;
-       }
-}
-
-void AltOp::decompile()
-{
-       if (ins_cache)
-       {
-               exp1->decompile();
-               exp2->decompile();
-               ins_cache = NULL;
-       }
-}
-
-void AltOp::split(CharSet &s)
-{
-       exp1->split(s);
-       exp2->split(s);
-}
-
-void CatOp::calcSize(Char *rep)
-{
-       exp1->calcSize(rep);
-       exp2->calcSize(rep);
-       size = exp1->size + exp2->size;
-}
-
-uint32_t CatOp::fixedLength()
-{
-       uint32_t l1, l2;
-
-       if ((l1 = exp1->fixedLength()) != ~0u )
-               if ((l2 = exp2->fixedLength()) != ~0u)
-                       return l1 + l2;
-
-       return ~0u;
-}
-
-uint32_t CatOp::compile(Char *rep, Ins *i)
-{
-       if (ins_cache)
-               return compile_goto(ins_cache, i);
-       else
-       {
-               ins_cache = i;
-
-               const uint32_t sz1 = exp1->compile(rep, &i[0]);
-               const uint32_t sz2 = exp2->compile(rep, &i[sz1]);
-
-               if (ins_access == PRIVATE)
-                       decompile();
-
-               return sz1 + sz2;
-       }
-}
-
-void CatOp::decompile()
-{
-       if (ins_cache)
-       {
-               exp1->decompile();
-               exp2->decompile();
-               ins_cache = NULL;
-       }
-}
-
-void CatOp::split(CharSet &s)
-{
-       exp1->split(s);
-       exp2->split(s);
-}
-
-void CloseOp::calcSize(Char *rep)
-{
-       exp->calcSize(rep);
-       size = exp->size + 1;
-}
-
-uint32_t CloseOp::compile(Char *rep, Ins *i)
-{
-       if (ins_cache)
-               return compile_goto(ins_cache, i);
-       else
-       {
-               ins_cache = i;
-
-               i += exp->compile(rep, &i[0]);
-               i->i.tag = FORK;
-               i->i.link = ins_cache;
-               ++i;
-
-               const uint32_t sz = i - ins_cache;
-               if (ins_access == PRIVATE)
-                       decompile();
-
-               return sz;
-       }
-}
-
-void CloseOp::decompile()
-{
-       if (ins_cache)
-       {
-               exp->decompile();
-               ins_cache = NULL;
-       }
-}
-
-void CloseOp::split(CharSet &s)
-{
-       exp->split(s);
-}
-
-void CloseVOp::calcSize(Char *rep)
-{
-       exp->calcSize(rep);
-
-       if (max >= 0)
-       {
-               size = (exp->size * min) + ((1 + exp->size) * (max - min));
-       }
-       else
-       {
-               size = (exp->size * min) + 1;
-       }
-}
-
-uint32_t CloseVOp::compile(Char *rep, Ins *i)
-{
-       if (ins_cache)
-               return compile_goto(ins_cache, i);
-       else
-       {
-               ins_cache = i;
-
-               for (int st = min; st < max; st++)
-               {
-                       const uint32_t sz = exp->compile(rep, &i[1]);
-                       i->i.tag = FORK;
-                       i->i.link = ins_cache + (1 + sz) * (max - min);
-                       i += sz + 1;
-               }
-
-               for (int st = 0; st < min; st++)
-               {
-                       const uint32_t sz = exp->compile(rep, &i[0]);
-                       i += sz;
-                       if (max < 0 && st == 0)
-                       {
-                               i->i.tag = FORK;
-                               i->i.link = i - sz;
-                               i++;
-                       }
-               }
-
-               const uint32_t sz = i - ins_cache;
-               if (ins_access == PRIVATE)
-                       decompile();
-
-               return sz;
-       }
-}
-
-void CloseVOp::decompile()
-{
-       if (ins_cache)
-       {
-               exp->decompile();
-               ins_cache = NULL;
-       }
-}
-
-void CloseVOp::split(CharSet &s)
-{
-       exp->split(s);
-}
-
-RegExp *expr(Scanner &);
-
-uint32_t Scanner::unescape(SubStr &s) const
-{
-       static const char * hex = "0123456789abcdef";
-       static const char * oct = "01234567";
-
-       s.len--;
-       uint32_t c, ucb = 0;
-
-       if ((c = *s.str++) != '\\' || s.len == 0)
-       {
-               return c;
-       }
-
-       s.len--;
-
-       switch (c = *s.str++)
-       {
-               case 'n': return '\n';
-               case 't': return '\t';
-               case 'v': return '\v';
-               case 'b': return '\b';
-               case 'r': return '\r';
-               case 'f': return '\f';
-               case 'a': return '\a';
-
-               case 'x':
-               {
-                       if (s.len < 2)
-                       {
-                               fatal(s.ofs()+s.len, "Illegal hexadecimal character code, two hexadecimal digits are required");
-                               return ~0u;
-                       }
-                       
-                       const char *p1 = strchr(hex, tolower(s.str[0]));
-                       const char *p2 = strchr(hex, tolower(s.str[1]));
-
-                       if (!p1 || !p2)
-                       {
-                               fatal(s.ofs()+(p1?1:0), "Illegal hexadecimal character code");
-                               return ~0u;
-                       }
-                       else
-                       {
-                               s.len -= 2;
-                               s.str += 2;
-                               
-                               uint32_t v = (uint32_t)((p1 - hex) << 4) 
-                                      + (uint32_t)((p2 - hex));
-       
-                               return v;
-                       }
-               }
-
-               case 'U':
-               {
-                       if (s.len < 8)
-                       {
-                               fatal(s.ofs()+s.len, "Illegal unicode character, eight hexadecimal digits are required");
-                               return ~0u;
-                       }
-
-                       uint32_t l = 0;
-                       if (s.str[0] == '0')
-                       {
-                               l++;
-                               if (s.str[1] == '0')
-                               {
-                                       l++;
-                                       if (s.str[2] == '0' || (s.str[2] == '1' && encoding.szCodePoint() == 4))
-                                       {
-                                               l++;
-                                               if (encoding.szCodePoint() == 4)
-                                               {
-                                                       const char *u3 = strchr(hex, tolower(s.str[2]));
-                                                       const char *u4 = strchr(hex, tolower(s.str[3]));
-                                                       if (u3 && u4)
-                                                       {
-                                                               ucb = (uint32_t)((u3 - hex) << 20)
-                                                               + (uint32_t)((u4 - hex) << 16);
-                                                               l++;
-                                                       }
-                                               }
-                                               else if (s.str[3] == '0')
-                                               {
-                                                       l++;
-                                               }
-                                       }
-                               }
-                       }
-
-                       if (l != 4)
-                       {
-                               fatal(s.ofs()+l, "Illegal unicode character, eight hexadecimal digits are required");
-                       }
-
-                       s.len -= 4;
-                       s.str += 4;
-                       
-                       // no break;
-               }
-               case 'X':
-               case 'u':
-               {
-                       if (s.len < 4)
-                       {
-                               fatal(s.ofs()+s.len, 
-                                       c == 'X'
-                                       ? "Illegal hexadecimal character code, four hexadecimal digits are required"
-                                       : "Illegal unicode character, four hexadecimal digits are required");
-                               return ~0u;
-                       }
-                       
-                       const char *p1 = strchr(hex, tolower(s.str[0]));
-                       const char *p2 = strchr(hex, tolower(s.str[1]));
-                       const char *p3 = strchr(hex, tolower(s.str[2]));
-                       const char *p4 = strchr(hex, tolower(s.str[3]));
-
-                       if (!p1 || !p2 || !p3 || !p4)
-                       {
-                               fatal(s.ofs()+(p1?1:0)+(p2?1:0)+(p3?1:0), 
-                                       c == 'X'
-                                       ? "Illegal hexadecimal character code, non hexxdecimal digit found"
-                                       : "Illegal unicode character, non hexadecimal digit found");
-                               return ~0u;
-                       }
-                       else
-                       {
-                               s.len -= 4;
-                               s.str += 4;
-                               
-                               uint32_t v = (uint32_t)((p1 - hex) << 12) 
-                                      + (uint32_t)((p2 - hex) <<  8)
-                                      + (uint32_t)((p3 - hex) <<  4)
-                                      + (uint32_t)((p4 - hex))
-                                      + ucb;
-       
-                               if (v >= encoding.nCodePoints())
-                               {
-                                       fatal(s.ofs(),
-                                               c == 'X'
-                                               ? "Illegal hexadecimal character code, out of range"
-                                               : "Illegal unicode character, out of range");
-                               }
-       
-                               return v;
-                       }
-               }
-
-               case '4':
-               case '5':
-               case '6':
-               case '7':
-               {
-                       fatal(s.ofs()-1, "Illegal octal character code, first digit must be 0 thru 3");
-                       return ~0u;
-               }
-
-               case '0':
-               case '1':
-               case '2':
-               case '3':
-               {
-                       if (s.len < 2)
-                       {
-                               fatal(s.ofs()+s.len, "Illegal octal character code, three octal digits are required");
-                               return ~0u;
-                       }
-
-                       const char *p0 = strchr(oct, c);
-                       const char *p1 = strchr(oct, s.str[0]);
-                       const char *p2 = strchr(oct, s.str[1]);
-
-                       if (!p0 || !p1 || !p2)
-                       {
-                               fatal(s.ofs()+(p1?1:0), "Illegal octal character code, non octal digit found");
-                               return ~0u;
-                       }
-                       else
-                       {
-                               s.len -= 2;
-                               s.str += 2;
-                               
-                               uint32_t v = (uint32_t)((p0 - oct) << 6) + (uint32_t)((p1 - oct) << 3) + (uint32_t)(p2 - oct);
-       
-                               return v;
-                       }
-               }
-
-               default:
-               return c;
-       }
-}
-
-std::string& Scanner::unescape(SubStr& str_in, std::string& str_out) const
-{
-       str_out.clear();
-
-       while(str_in.len)
-       {
-               uint32_t c = unescape(str_in);
-
-               if (c > 0xFF)
-               {
-                       fatal(str_in.ofs(), "Illegal character");
-               }
-
-               str_out += static_cast<char>(c);
-       }
-
-       return str_out;
-}
-
-Range * Scanner::getRange(SubStr &s) const
-{
-       uint32_t lb = unescape(s), ub;
-
-       if (s.len < 2 || *s.str != '-')
-       {
-               ub = lb;
-       }
-       else
-       {
-               s.len--;
-               s.str++;
-               ub = unescape(s);
-               if (ub < lb)
-               {
-                       uint32_t tmp = lb;
-                       lb = ub;
-                       ub = tmp;
-               }
-       }
-
-       Range * r = encoding.encodeRange(lb, ub);
-       if (r == NULL)
-               fatalf("Bad code point range: '0x%X - 0x%X'", lb, ub);
-       return r;
-}
-
-RegExp * Scanner::matchSymbol(uint32_t c) const
-{
-       if (!encoding.encode(c))
-               fatalf("Bad code point: '0x%X'", c);
-
-       if (encoding.is(Enc::UTF16))
-               return UTF16Symbol(c);
-       else if (encoding.is(Enc::UTF8))
-               return UTF8Symbol(c);
-       else
-               return new MatchOp(new Range(c, c + 1));
-}
-
-RegExp * Scanner::strToRE (SubStr & s) const
-{
-       if (s.len == 0)
-               return new NullOp;
-
-       RegExp *re = matchSymbol(unescape(s));
-
-       while (s.len > 0)
-               re = new CatOp(re, matchSymbol(unescape(s)));
-
-       return re;
-}
-
-RegExp * Scanner::strToCaseInsensitiveRE (SubStr & s) const
-{
-       if (s.len == 0)
-               return new NullOp;
-
-       uint32_t c = unescape(s);
-
-       RegExp *re, *reL, *reU;
-
-       if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
-       {
-               reL = matchSymbol(tolower(c));
-               reU = matchSymbol(toupper(c));
-               re = mkAlt(reL, reU);
-       }
-       else
-       {
-               re = matchSymbol(c);
-       }
-
-       while (s.len > 0)
-       {
-               c = unescape(s);
-
-               if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
-               {
-                       reL = matchSymbol(tolower(c));
-                       reU = matchSymbol(toupper(c));
-                       re = new CatOp(re, mkAlt(reL, reU));
-               }
-               else
-               {
-                       re = new CatOp(re, matchSymbol(c));
-               }
-       }
-
-       return re;
-}
-
-Range * Scanner::mkRange(SubStr &s) const
-{
-       Range *r = getRange(s);
-       while (s.len > 0)
-               r = doUnion(r, getRange(s));
-
-       return r;
-}
-
-RegExp * Scanner::matchSymbolRange(Range * r) const
-{
-       if (encoding.is(Enc::UTF16))
-               return UTF16Range(r);
-       else if (encoding.is(Enc::UTF8))
-               return UTF8Range(r);
-       else
-               return new MatchOp(r);
-}
-
-RegExp * Scanner::ranToRE (SubStr & s) const
-{
-       s.len -= 2;
-       s.str += 1;
-
-       if (s.len == 0)
-               return new NullOp;
-
-       return matchSymbolRange(mkRange(s));
-}
-
-RegExp * Scanner::invToRE (SubStr & s) const
-{
-       s.len -= 3;
-       s.str += 2;
-
-       Range * full = encoding.fullRange();
-
-       Range * r = s.len == 0
-               ? full
-               : doDiff(full, mkRange (s));
-
-       return matchSymbolRange(r);
-}
-
-RegExp * Scanner::mkDot() const
-{
-       Range * full = encoding.fullRange();
-       uint32_t c = '\n';
-       if (!encoding.encode(c))
-               fatalf("Bad code point: '0x%X'", c);
-       Range * ran = new Range(c, c + 1);
-       Range * inv = doDiff(full, ran);
-
-       return matchSymbolRange(inv);
-}
-
-/*
- * Create a byte range that includes all possible input characters.
- * This may include characters, which do not map to any valid symbol
- * in current encoding. For encodings, which directly map symbols to
- * input characters (ASCII, EBCDIC, UTF-32), it equals [^]. For other
- * encodings (UTF-16, UTF-8), [^] and this range are different.
- *
- * Also note that default range doesn't respect encoding policy
- * (the way invalid code points are treated).
- */
-RegExp * Scanner::mkDefault() const
-{
-       Range * def = new Range(0, encoding.nCodeUnits());
-       return new MatchOp(def);
-}
-
-RuleOp::RuleOp(RegExp *e, RegExp *c, Token *t, rule_rank_t r, InsAccess access)
-       : exp(e)
-       , ctx(c)
-       , ins(NULL)
-       , rank(r)
-       , code(t)
-       , line(0)
-{
-       ins_access = access;
-}
-
-void RuleOp::calcSize(Char *rep)
-{
-       exp->calcSize(rep);
-       ctx->calcSize(rep);
-       size = exp->size + (ctx->size ? ctx->size + 2 : 1);
-}
-
-uint32_t RuleOp::compile(Char *rep, Ins *i)
-{
-       if (ins_cache)
-               return compile_goto(ins_cache, i);
-       else
-       {
-               ins_cache = i;
-
-               i += exp->compile(rep, &i[0]);
-               if (ctx->size)
-               {
-                       i->i.tag = CTXT;
-                       i->i.link = &i[1];
-                       ++i;
-                       i += ctx->compile(rep, &i[0]);
-               }
-               i->i.tag = TERM;
-               i->i.link = this;
-               ++i;
-
-               const uint32_t sz = i - ins_cache;
-               if (ins_access == PRIVATE)
-                       decompile();
-
-               return sz;
-       }
-}
-
-void RuleOp::decompile()
-{
-       if (ins_cache)
-       {
-               exp->decompile();
-               ctx->decompile();
-               ins_cache = NULL;
-       }
-}
-
-void RuleOp::split(CharSet &s)
-{
-       exp->split(s);
-       ctx->split(s);
-}
-
-void optimize(Ins *i)
-{
-       while (!isMarked(i))
-       {
-               mark(i);
-
-               if (i->i.tag == CHAR)
-               {
-                       i = (Ins*) i->i.link;
-               }
-               else if (i->i.tag == GOTO || i->i.tag == FORK)
-               {
-                       Ins *target = (Ins*) i->i.link;
-                       optimize(target);
-
-                       if (target->i.tag == GOTO)
-                               i->i.link = target->i.link == target ? i : target;
-
-                       if (i->i.tag == FORK)
-                       {
-                               Ins *follow = (Ins*) & i[1];
-                               optimize(follow);
-
-                               if (follow->i.tag == GOTO && follow->i.link == follow)
-                               {
-                                       i->i.tag = GOTO;
-                               }
-                               else if (i->i.link == i)
-                               {
-                                       i->i.tag = GOTO;
-                                       i->i.link = follow;
-                               }
-                       }
-
-                       return ;
-               }
-               else
-               {
-                       ++i;
-               }
-       }
-}
-
-CharSet::CharSet()
-       : fix(0)
-       , freeHead(0)
-       , freeTail(0)
-       , rep(allocate<CharPtr> (encoding.nCodeUnits()))
-       , ptn(allocate<CharPtn> (encoding.nCodeUnits()))
-{
-       for (uint32_t j = 0; j < encoding.nCodeUnits(); ++j)
-       {
-               rep[j] = &ptn[0];
-               ptn[j].nxt = &ptn[j + 1]; /* wrong for j=encoding.nCodeUnits() - 1 but will be corrected below */
-               ptn[j].card = 0;
-       }
-
-       freeHead = &ptn[1];
-       *(freeTail = &ptn[encoding.nCodeUnits() - 1].nxt) = NULL;
-       ptn[0].card = encoding.nCodeUnits();
-       ptn[0].nxt = NULL;
-}
-       
-CharSet::~CharSet()
-{
-       operator delete (rep);
-       operator delete (ptn);
-}
-
-smart_ptr<DFA> genCode(RegExp *re, Output & output, uint32_t ind)
-{
-       CharSet cs;
-       re->split(cs);
-
-       Char *rep = new Char[encoding.nCodeUnits()];
-
-       for (uint32_t j = 0; j < encoding.nCodeUnits(); ++j)
-       {
-               if (!cs.rep[j]->nxt)
-                       cs.rep[j]->nxt = &cs.ptn[j];
-
-               rep[j] = cs.rep[j]->nxt - &cs.ptn[0];
-       }
-
-       re->calcSize(rep);
-       Ins *ins = new Ins[re->size + 1];
-       memset(ins, 0, (re->size + 1)*sizeof(Ins));
-       const uint32_t size = re->compile(rep, ins);
-       Ins *eoi = &ins[size];
-       eoi->i.tag = GOTO;
-       eoi->i.link = eoi;
-
-       optimize(ins);
-
-       /*
-       for (const Ins *inst = &ins[0]; inst < &ins[size]; )
-       {
-               inst = showIns(std::cout, *inst, ins[0]);
-       }
-       */
-
-       for (uint32_t j = 0; j < size;)
-       {
-               unmark(&ins[j]);
-
-               if (ins[j].i.tag == CHAR)
-               {
-                       j = (Ins*) ins[j].i.link - ins;
-               }
-               else
-               {
-                       j++;
-               }
-       }
-
-       smart_ptr<DFA> dfa = make_smart_ptr(new DFA(ins, size, 0, encoding.nCodeUnits(), rep));
-       if (flag_skeleton)
-       {
-               skeleton::Skeleton skeleton (*dfa);
-               skeleton.emit_data (output.data);
-               skeleton::emit_prolog (output.source, ind, output.data.file_name.c_str ());
-       }
-       dfa->prepare (output.source, output.max_fill);
-
-       return dfa;
-}
-
-} // end namespace re2c
-
diff --git a/re2c/src/dfa/encoding/utf16/utf16_regexp.h b/re2c/src/dfa/encoding/utf16/utf16_regexp.h

deleted file mode 100644 (file)

index 14e4fde..0000000
--- a/re2c/src/dfa/encoding/utf16/utf16_regexp.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef _RE2C_DFA_ENCODING_UTF16_REGEXP_
-#define _RE2C_DFA_ENCODING_UTF16_REGEXP_
-
-#include "src/dfa/encoding/utf16/utf16.h"
-#include "src/util/range.h"
-
-namespace re2c {
-
-class RegExp; // forward
-
-RegExp * UTF16Symbol(utf16::rune r);
-RegExp * UTF16Range(const Range * r);
-
-} // namespace re2c
-
-#endif // _RE2C_DFA_ENCODING_UTF16_REGEXP_
diff --git a/re2c/src/dfa/encoding/utf8/utf8_regexp.h b/re2c/src/dfa/encoding/utf8/utf8_regexp.h

deleted file mode 100644 (file)

index ccfaab0..0000000
--- a/re2c/src/dfa/encoding/utf8/utf8_regexp.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef _RE2C_DFA_ENCODING_UTF8_REGEXP_
-#define _RE2C_DFA_ENCODING_UTF8_REGEXP_
-
-#include "src/dfa/encoding/utf8/utf8.h"
-#include "src/util/range.h"
-
-namespace re2c {
-
-class RegExp; // forward
-
-RegExp * UTF8Symbol(utf8::rune r);
-RegExp * UTF8Range(const Range * r);
-
-} // namespace re2c
-
-#endif // _RE2C_DFA_ENCODING_UTF8_REGEXP_
diff --git a/re2c/src/dfa/ins.h b/re2c/src/dfa/ins.h

deleted file mode 100644 (file)

index 1e1170c..0000000
--- a/re2c/src/dfa/ins.h
+++ /dev/null
@@ -1,55 +0,0 @@
-#ifndef _RE2C_DFA_INS_
-#define _RE2C_DFA_INS_
-
-#include "src/util/c99_stdint.h"
-
-namespace re2c
-{
-
-typedef uint32_t Char;
-
-const uint32_t CHAR = 0;
-const uint32_t GOTO = 1;
-const uint32_t FORK = 2;
-const uint32_t TERM = 3;
-const uint32_t CTXT = 4;
-
-union Ins {
-
-       struct
-       {
-               uint8_t tag;
-               uint8_t marked;
-               void    *link;
-       }
-
-       i;
-
-       struct
-       {
-               uint32_t        value;
-               uint32_t        bump;
-               void            *link;
-       }
-
-       c;
-};
-
-inline bool isMarked(Ins *i)
-{
-       return i->i.marked != 0;
-}
-
-inline void mark(Ins *i)
-{
-       i->i.marked = true;
-}
-
-inline void unmark(Ins *i)
-{
-       i->i.marked = false;
-}
-
-} // end namespace re2c
-
-#endif // _RE2C_DFA_INS_
diff --git a/re2c/src/dfa/re.h b/re2c/src/dfa/re.h

deleted file mode 100644 (file)

index efb3c70..0000000
--- a/re2c/src/dfa/re.h
+++ /dev/null
@@ -1,310 +0,0 @@
-#ifndef _RE2C_DFA_RE_
-#define _RE2C_DFA_RE_
-
-#include <iostream>
-#include <set>
-#include <map>
-#include <list>
-#include <vector>
-#include <string>
-
-#include "src/dfa/ins.h"
-#include "src/dfa/rule_rank.h"
-#include "src/globals.h"
-#include "src/parse/token.h"
-#include "src/util/range.h"
-#include "src/util/forbid_copy.h"
-#include "src/util/free_list.h"
-
-namespace re2c
-{
-
-typedef struct extop
-{
-       char op;
-       int     minsize;
-       int     maxsize;
-}
-
-ExtOp;
-
-struct CharPtn
-{
-       uint32_t        card;
-       CharPtn *fix;
-       CharPtn *nxt;
-
-       FORBID_COPY (CharPtn);
-};
-
-typedef CharPtn *CharPtr;
-
-struct CharSet
-{
-       CharSet();
-       ~CharSet();
-
-       CharPtn *fix;
-       CharPtn *freeHead, **freeTail;
-       CharPtr *rep;
-       CharPtn *ptn;
-
-       FORBID_COPY (CharSet);
-};
-
-class RegExp
-{
-
-public:
-       uint32_t        size;
-
-       /*
-        * There're several different cases when the same regexp
-        * can be used multiple times:
-        *      1) named definitions, e.g. digit = [0-9];
-        *      2) counted repetition, e.g. "a"{3}, "a"{3,}, "a"{3,5}
-        *      3) multiple DFA's sharing the same regexp , e.g. <c1, c2> "abc" { }
-        *      4) common suffixes, e.g. suffix [\x80-\xBF] in UTF-8 ranges
-        * In cases 1-3, regexp must be recompiled each time it's reused.
-        * In case 4, regexp should be compiled only once, and instructions
-        * should be shared in order to reduce space.
-        *
-        * Note: if regexp must always be recompiled, it doesn't imply that
-        * parts of this regexp must always be recompiled. It only forces
-        * the compilation function to traverse the regexp after compilation
-        * and reset compilation cache for each sub-regexp. E.g., for a regexp
-        * [^]{3} in UTF-8 mode, each of sub-regexps [^] will have common suffix
-        * [\x80-\xBF] factored out, but they won't share instructions.
-        */
-       Ins*    ins_cache; /* if non-NULL, points to compiled instructions */
-       enum InsAccess
-               { SHARED
-               , PRIVATE
-               } ins_access;
-
-       static free_list<RegExp*> vFreeList;
-
-public:
-       RegExp()
-               : size(0)
-               , ins_cache(NULL)
-               , ins_access(SHARED)
-       {
-               vFreeList.insert(this);
-       }
-
-       virtual ~RegExp()
-       {
-               vFreeList.erase(this);
-       }
-
-       virtual void split(CharSet&) = 0;
-       virtual void calcSize(Char*) = 0;
-       virtual uint32_t fixedLength();
-       virtual uint32_t compile(Char*, Ins*) = 0;
-       virtual void decompile() = 0;
-       virtual void display(std::ostream&) const = 0;
-       friend std::ostream& operator<<(std::ostream&, const RegExp&);
-       friend std::ostream& operator<<(std::ostream&, const RegExp*);
-
-       FORBID_COPY (RegExp);
-};
-
-inline std::ostream& operator<<(std::ostream &o, const RegExp &re)
-{
-       re.display(o);
-       return o;
-}
-
-inline std::ostream& operator<<(std::ostream &o, const RegExp *re)
-{
-       return o << *re;
-}
-
-class NullOp: public RegExp
-{
-
-public:
-       void split(CharSet&);
-       void calcSize(Char*);
-       uint32_t fixedLength();
-       uint32_t compile(Char*, Ins*);
-       void decompile();
-       void display(std::ostream &o) const
-       {
-               o << "_";
-       }
-};
-
-class MatchOp: public RegExp
-{
-
-public:
-       Range   *match;
-
-public:
-       MatchOp(Range *m) : match(m)
-       {
-       }
-
-       void split(CharSet&);
-       void calcSize(Char*);
-       uint32_t fixedLength();
-       uint32_t compile(Char*, Ins*);
-       void decompile();
-       void display(std::ostream&) const;
-
-       FORBID_COPY (MatchOp);
-};
-
-class RuleOp: public RegExp
-{
-private:
-       RegExp   *exp;
-
-public:
-       RegExp   *ctx;
-       Ins      *ins;
-       rule_rank_t rank;
-       Token    *code;
-       uint32_t line;
-
-public:
-       RuleOp(RegExp*, RegExp*, Token*, rule_rank_t, InsAccess);
-
-       ~RuleOp()
-       {
-               delete code;
-       }
-
-       void split(CharSet&);
-       void calcSize(Char*);
-       uint32_t compile(Char*, Ins*);
-       void decompile();
-       void display(std::ostream &o) const
-       {
-               o << exp << "/" << ctx << ";";
-       }
-
-       FORBID_COPY (RuleOp);
-};
-
-RegExp *doAlt(RegExp*, RegExp*);
-RegExp *mkAlt(RegExp*, RegExp*);
-
-class AltOp: public RegExp
-{
-
-private:
-       RegExp  *exp1, *exp2;
-
-public:
-       AltOp(RegExp *e1, RegExp *e2)
-               : exp1(e1)
-               , exp2(e2)
-       {
-       }
-
-       void split(CharSet&);
-       void calcSize(Char*);
-       uint32_t fixedLength();
-       uint32_t compile(Char*, Ins*);
-       void decompile();
-       void display(std::ostream &o) const
-       {
-               o << exp1 << "|" << exp2;
-       }
-
-       friend RegExp *mkAlt(RegExp*, RegExp*);
-
-       FORBID_COPY (AltOp);
-};
-
-RegExp *doCat(RegExp*, RegExp*);
-RegExp *mkCat(RegExp*, RegExp*);
-
-class CatOp: public RegExp
-{
-
-private:
-       RegExp  *exp1, *exp2;
-
-public:
-       CatOp(RegExp *e1, RegExp *e2)
-               : exp1(e1)
-               , exp2(e2)
-       {
-       }
-
-       void split(CharSet&);
-       void calcSize(Char*);
-       uint32_t fixedLength();
-       uint32_t compile(Char*, Ins*);
-       void decompile();
-       void display(std::ostream &o) const
-       {
-               o << exp1 << exp2;
-       }
-
-       FORBID_COPY (CatOp);
-};
-
-class CloseOp: public RegExp
-{
-
-private:
-       RegExp  *exp;
-
-public:
-       CloseOp(RegExp *e)
-               : exp(e)
-       {
-       }
-
-       void split(CharSet&);
-       void calcSize(Char*);
-       uint32_t compile(Char*, Ins*);
-       void decompile();
-       void display(std::ostream &o) const
-       {
-               o << exp << "+";
-       }
-
-       FORBID_COPY (CloseOp);
-};
-
-class CloseVOp: public RegExp
-{
-
-private:
-       RegExp  *exp;
-       int     min;
-       int     max;
-
-public:
-       CloseVOp(RegExp *e, int lb, int ub)
-               : exp(e)
-               , min(lb)
-               , max(ub)
-       {
-               exp->ins_access = PRIVATE;
-       }
-
-       void split(CharSet&);
-       void calcSize(Char*);
-       uint32_t compile(Char*, Ins*);
-       void decompile();
-       void display(std::ostream &o) const
-       {
-               o << exp << "+";
-       }
-
-       FORBID_COPY (CloseVOp);
-};
-
-extern RegExp *mkDiff(RegExp*, RegExp*);
-extern RegExp *mkAlt(RegExp*, RegExp*);
-
-} // end namespace re2c
-
-#endif // _RE2C_DFA_RE_
diff --git a/re2c/src/globals.h b/re2c/src/globals.h

index e77bcf8ddb4e2ceb76b459750fbeea39d9079812..9f2549ad28c136d96314ef26cd2517cbbaca7945 100644 (file)
--- a/re2c/src/globals.h
+++ b/re2c/src/globals.h
@@ -5,7 +5,7 @@
  
  #include "src/codegen/code_names.h"
  #include "src/codegen/input_api.h"
-#include "src/dfa/encoding/enc.h"
+#include "src/ir/regexp/encoding/enc.h"
  #include "src/util/c99_stdint.h"
  
  namespace re2c
diff --git a/re2c/src/ir/bytecode/bytecode.cc b/re2c/src/ir/bytecode/bytecode.cc

new file mode 100644 (file)

index 0000000..0fda50b
--- /dev/null
+++ b/re2c/src/ir/bytecode/bytecode.cc
@@ -0,0 +1,111 @@
+#include <string.h>
+
+#include "src/codegen/skeleton/skeleton.h"
+#include "src/ir/bytecode/bytecode.h"
+#include "src/globals.h"
+
+namespace re2c {
+
+static void optimize (Ins * i);
+
+smart_ptr<DFA> genCode (RegExp *re, Output & output, uint32_t ind)
+{
+       CharSet cs;
+       re->split(cs);
+
+       Char *rep = new Char[encoding.nCodeUnits()];
+
+       for (uint32_t j = 0; j < encoding.nCodeUnits(); ++j)
+       {
+               if (!cs.rep[j]->nxt)
+                       cs.rep[j]->nxt = &cs.ptn[j];
+
+               rep[j] = cs.rep[j]->nxt - &cs.ptn[0];
+       }
+
+       re->calcSize(rep);
+       Ins *ins = new Ins[re->size + 1];
+       memset(ins, 0, (re->size + 1)*sizeof(Ins));
+       const uint32_t size = re->compile(rep, ins);
+       Ins *eoi = &ins[size];
+       eoi->i.tag = GOTO;
+       eoi->i.link = eoi;
+
+       optimize(ins);
+
+       /*
+       for (const Ins *inst = &ins[0]; inst < &ins[size]; )
+       {
+               inst = showIns(std::cout, *inst, ins[0]);
+       }
+       */
+
+       for (uint32_t j = 0; j < size;)
+       {
+               unmark(&ins[j]);
+
+               if (ins[j].i.tag == CHAR)
+               {
+                       j = (Ins*) ins[j].i.link - ins;
+               }
+               else
+               {
+                       j++;
+               }
+       }
+
+       smart_ptr<DFA> dfa = make_smart_ptr(new DFA(ins, size, 0, encoding.nCodeUnits(), rep));
+       if (flag_skeleton)
+       {
+               skeleton::Skeleton skeleton (*dfa);
+               skeleton.emit_data (output.data);
+               skeleton::emit_prolog (output.source, ind, output.data.file_name.c_str ());
+       }
+       dfa->prepare (output.source, output.max_fill);
+
+       return dfa;
+}
+
+void optimize (Ins * i)
+{
+       while (!isMarked (i))
+       {
+               mark (i);
+               if (i->i.tag == CHAR)
+               {
+                       i = (Ins *) i->i.link;
+               }
+               else if (i->i.tag == GOTO || i->i.tag == FORK)
+               {
+                       Ins * target = (Ins *) i->i.link;
+                       optimize (target);
+                       if (target->i.tag == GOTO)
+                       {
+                               i->i.link = target->i.link == target
+                                       ? i
+                                       : target;
+                       }
+                       if (i->i.tag == FORK)
+                       {
+                               Ins * follow = (Ins *) & i[1];
+                               optimize (follow);
+                               if (follow->i.tag == GOTO && follow->i.link == follow)
+                               {
+                                       i->i.tag = GOTO;
+                               }
+                               else if (i->i.link == i)
+                               {
+                                       i->i.tag = GOTO;
+                                       i->i.link = follow;
+                               }
+                       }
+                       return;
+               }
+               else
+               {
+                       ++i;
+               }
+       }
+}
+
+} // namespace re2c
diff --git a/re2c/src/ir/bytecode/bytecode.h b/re2c/src/ir/bytecode/bytecode.h

new file mode 100644 (file)

index 0000000..50757e1
--- /dev/null
+++ b/re2c/src/ir/bytecode/bytecode.h
@@ -0,0 +1,16 @@
+#ifndef _RE2C_IR_BYTECODE_BYTECODE_
+#define _RE2C_IR_BYTECODE_BYTECODE_
+
+#include "src/codegen/output.h"
+#include "src/ir/dfa/dfa.h"
+#include "src/ir/regexp/regexp.h"
+#include "src/util/smart_ptr.h"
+
+namespace re2c
+{
+
+smart_ptr<DFA> genCode (RegExp * re, Output & output, uint32_t ind);
+
+} // namespace re2c
+
+#endif // _RE2C_IR_BYTECODE_BYTECODE_
diff --git a/re2c/src/ir/bytecode/calc_size.cc b/re2c/src/ir/bytecode/calc_size.cc

new file mode 100644 (file)

index 0000000..fc90e7e
--- /dev/null
+++ b/re2c/src/ir/bytecode/calc_size.cc
@@ -0,0 +1,72 @@
+#include "src/ir/regexp/regexp_alt.h"
+#include "src/ir/regexp/regexp_cat.h"
+#include "src/ir/regexp/regexp_close.h"
+#include "src/ir/regexp/regexp_closev.h"
+#include "src/ir/regexp/regexp_match.h"
+#include "src/ir/regexp/regexp_null.h"
+#include "src/ir/regexp/regexp_rule.h"
+
+namespace re2c
+{
+
+void AltOp::calcSize (Char * rep)
+{
+       exp1->calcSize (rep);
+       exp2->calcSize (rep);
+       size = exp1->size + exp2->size + 2;
+}
+
+void CatOp::calcSize (Char * rep)
+{
+       exp1->calcSize (rep);
+       exp2->calcSize (rep);
+       size = exp1->size + exp2->size;
+}
+
+void CloseOp::calcSize (Char * rep)
+{
+       exp->calcSize (rep);
+       size = exp->size + 1;
+}
+
+void CloseVOp::calcSize (Char * rep)
+{
+       exp->calcSize (rep);
+       if (max >= 0)
+       {
+               size = (exp->size * min) + ((1 + exp->size) * (max - min));
+       }
+       else
+       {
+               size = (exp->size * min) + 1;
+       }
+}
+
+void MatchOp::calcSize (Char * rep)
+{
+       size = 1;
+       for (Range * r = match; r; r = r->next)
+       {
+               for (uint32_t c = r->lb; c < r->ub; ++c)
+               {
+                       if (rep[c] == c)
+                       {
+                               ++size;
+                       }
+               }
+       }
+}
+
+void NullOp::calcSize (Char *)
+{
+       size = 0;
+}
+
+void RuleOp::calcSize (Char * rep)
+{
+       exp->calcSize (rep);
+       ctx->calcSize (rep);
+       size = exp->size + (ctx->size ? ctx->size + 2 : 1);
+}
+
+} // end namespace re2c
diff --git a/re2c/src/ir/bytecode/charset.cc b/re2c/src/ir/bytecode/charset.cc

new file mode 100644 (file)

index 0000000..1543abe
--- /dev/null
+++ b/re2c/src/ir/bytecode/charset.cc
@@ -0,0 +1,32 @@
+#include "src/ir/bytecode/charset.h"
+#include "src/globals.h"
+#include "src/util/allocate.h"
+
+namespace re2c {
+
+CharSet::CharSet ()
+       : fix (0)
+       , freeHead (0)
+       , freeTail (0)
+       , rep (allocate<CharPtr> (encoding.nCodeUnits ()))
+       , ptn (allocate<CharPtn> (encoding.nCodeUnits ()))
+{
+       for (uint32_t j = 0; j < encoding.nCodeUnits (); ++j)
+       {
+               rep[j] = &ptn[0];
+               ptn[j].nxt = &ptn[j + 1]; /* wrong for j=encoding.nCodeUnits() - 1 but will be corrected below */
+               ptn[j].card = 0;
+       }
+       freeHead = &ptn[1];
+       * (freeTail = &ptn[encoding.nCodeUnits() - 1].nxt) = NULL;
+       ptn[0].card = encoding.nCodeUnits ();
+       ptn[0].nxt = NULL;
+}
+
+CharSet::~CharSet ()
+{
+       operator delete (rep);
+       operator delete (ptn);
+}
+
+} // namespace re2c
diff --git a/re2c/src/ir/bytecode/charset.h b/re2c/src/ir/bytecode/charset.h

new file mode 100644 (file)

index 0000000..af36b4f
--- /dev/null
+++ b/re2c/src/ir/bytecode/charset.h
@@ -0,0 +1,37 @@
+#ifndef _RE2C_IR_BYTECODE_CHARSET_
+#define _RE2C_IR_BYTECODE_CHARSET_
+
+#include "src/util/c99_stdint.h"
+#include "src/util/forbid_copy.h"
+
+namespace re2c
+{
+
+struct CharPtn
+{
+       uint32_t card;
+       CharPtn * fix;
+       CharPtn * nxt;
+
+       FORBID_COPY (CharPtn);
+};
+
+typedef CharPtn * CharPtr;
+
+struct CharSet
+{
+       CharPtn * fix;
+       CharPtn * freeHead;
+       CharPtn ** freeTail;
+       CharPtr * rep;
+       CharPtn * ptn;
+
+       CharSet ();
+       ~CharSet ();
+
+       FORBID_COPY (CharSet);
+};
+
+} // namespace re2c
+
+#endif // _RE2C_IR_BYTECODE_CHARSET_
diff --git a/re2c/src/ir/bytecode/compile.cc b/re2c/src/ir/bytecode/compile.cc

new file mode 100644 (file)

index 0000000..2f4609b
--- /dev/null
+++ b/re2c/src/ir/bytecode/compile.cc
@@ -0,0 +1,262 @@
+#include "src/ir/regexp/regexp_alt.h"
+#include "src/ir/regexp/regexp_cat.h"
+#include "src/ir/regexp/regexp_close.h"
+#include "src/ir/regexp/regexp_closev.h"
+#include "src/ir/regexp/regexp_match.h"
+#include "src/ir/regexp/regexp_null.h"
+#include "src/ir/regexp/regexp_rule.h"
+
+namespace re2c
+{
+
+static uint32_t compile_goto (Ins * ins, Ins * i);
+
+uint32_t AltOp::compile (Char * rep, Ins * i)
+{
+       if (ins_cache)
+       {
+               return compile_goto (ins_cache, i);
+       }
+       else
+       {
+               ins_cache = i;
+
+               i->i.tag = FORK;
+               const uint32_t sz1 = exp1->compile (rep, &i[1]);
+               Ins * const j = &i[sz1 + 1];
+               i->i.link = &j[1];
+               j->i.tag = GOTO;
+               const uint32_t sz2 = exp2->compile (rep, &j[1]);
+               j->i.link = &j[sz2 + 1];
+
+               if (ins_access == PRIVATE)
+               {
+                       decompile ();
+               }
+
+               return sz1 + sz2 + 2;
+       }
+}
+
+void AltOp::decompile ()
+{
+       if (ins_cache)
+       {
+               exp1->decompile ();
+               exp2->decompile ();
+               ins_cache = NULL;
+       }
+}
+
+uint32_t CatOp::compile (Char * rep, Ins * i)
+{
+       if (ins_cache)
+       {
+               return compile_goto (ins_cache, i);
+       }
+       else
+       {
+               ins_cache = i;
+
+               const uint32_t sz1 = exp1->compile (rep, &i[0]);
+               const uint32_t sz2 = exp2->compile (rep, &i[sz1]);
+
+               if (ins_access == PRIVATE)
+               {
+                       decompile ();
+               }
+
+               return sz1 + sz2;
+       }
+}
+
+void CatOp::decompile ()
+{
+       if (ins_cache)
+       {
+               exp1->decompile ();
+               exp2->decompile ();
+               ins_cache = NULL;
+       }
+}
+
+uint32_t CloseOp::compile (Char * rep, Ins * i)
+{
+       if (ins_cache)
+       {
+               return compile_goto (ins_cache, i);
+       }
+       else
+       {
+               ins_cache = i;
+
+               i += exp->compile (rep, &i[0]);
+               i->i.tag = FORK;
+               i->i.link = ins_cache;
+               ++i;
+
+               const uint32_t sz = i - ins_cache;
+               if (ins_access == PRIVATE)
+               {
+                       decompile ();
+               }
+
+               return sz;
+       }
+}
+
+void CloseOp::decompile ()
+{
+       if (ins_cache)
+       {
+               exp->decompile ();
+               ins_cache = NULL;
+       }
+}
+
+uint32_t CloseVOp::compile (Char * rep, Ins * i)
+{
+       if (ins_cache)
+       {
+               return compile_goto (ins_cache, i);
+       }
+       else
+       {
+               ins_cache = i;
+
+               for (int st = min; st < max; st++)
+               {
+                       const uint32_t sz = exp->compile (rep, &i[1]);
+                       i->i.tag = FORK;
+                       i->i.link = ins_cache + (1 + sz) * (max - min);
+                       i += sz + 1;
+               }
+               for (int st = 0; st < min; st++)
+               {
+                       const uint32_t sz = exp->compile (rep, &i[0]);
+                       i += sz;
+                       if (max < 0 && st == 0)
+                       {
+                               i->i.tag = FORK;
+                               i->i.link = i - sz;
+                               i++;
+                       }
+               }
+               const uint32_t sz = i - ins_cache;
+
+               if (ins_access == PRIVATE)
+               {
+                       decompile ();
+               }
+
+               return sz;
+       }
+}
+
+void CloseVOp::decompile ()
+{
+       if (ins_cache)
+       {
+               exp->decompile ();
+               ins_cache = NULL;
+       }
+}
+
+uint32_t MatchOp::compile (Char * rep, Ins * i)
+{
+       if (ins_cache)
+       {
+               return compile_goto (ins_cache, i);
+       }
+       else
+       {
+               ins_cache = i;
+
+               i->i.tag = CHAR;
+               i->i.link = &i[size];
+               Ins *j = &i[1];
+               uint32_t bump = size;
+               for (Range *r = match; r; r = r->next)
+               {
+                       for (uint32_t c = r->lb; c < r->ub; ++c)
+                       {
+                               if (rep[c] == c)
+                               {
+                                       j->c.value = c;
+                                       j->c.bump = --bump;
+                                       j++;
+                               }
+                       }
+               }
+
+               if (ins_access == PRIVATE)
+               {
+                       decompile ();
+               }
+
+               return size;
+       }
+}
+
+void MatchOp::decompile ()
+{
+       ins_cache = NULL;
+}
+
+uint32_t NullOp::compile (Char *, Ins *)
+{
+       return 0;
+}
+
+void NullOp::decompile () {}
+
+uint32_t RuleOp::compile (Char * rep, Ins * i)
+{
+       if (ins_cache)
+       {
+               return compile_goto (ins_cache, i);
+       }
+       else
+       {
+               ins_cache = i;
+
+               i += exp->compile (rep, &i[0]);
+               if (ctx->size)
+               {
+                       i->i.tag = CTXT;
+                       i->i.link = &i[1];
+                       ++i;
+                       i += ctx->compile (rep, &i[0]);
+               }
+               i->i.tag = TERM;
+               i->i.link = this;
+               ++i;
+               const uint32_t sz = i - ins_cache;
+
+               if (ins_access == PRIVATE)
+               {
+                       decompile ();
+               }
+
+               return sz;
+       }
+}
+
+void RuleOp::decompile ()
+{
+       if (ins_cache)
+       {
+               exp->decompile ();
+               ctx->decompile ();
+               ins_cache = NULL;
+       }
+}
+
+uint32_t compile_goto (Ins * ins, Ins * i)
+{
+       i->i.tag = GOTO;
+       i->i.link = ins;
+       return 1;
+}
+
+} // end namespace re2c
diff --git a/re2c/src/ir/bytecode/ins.cc b/re2c/src/ir/bytecode/ins.cc

new file mode 100644 (file)

index 0000000..f0041e5
--- /dev/null
+++ b/re2c/src/ir/bytecode/ins.cc
@@ -0,0 +1,41 @@
+#include <iostream>
+
+#include "src/ir/bytecode/ins.h"
+#include "src/ir/regexp/regexp_rule.h"
+
+namespace re2c {
+
+const Ins * showIns (std::ostream & o, const Ins & i, const Ins & base)
+{
+       o.width (3);
+       o << &i - &base << ": ";
+       const Ins * ret = &(&i)[1];
+       switch (i.i.tag)
+       {
+               case CHAR:
+               {
+                       o << "match ";
+                       for (; ret < (Ins *) i.i.link; ++ret)
+                       {
+                               o << "\\x" << std::hex << ret->c.value;
+                       }
+                       break;
+               }
+               case GOTO:
+                       o << "goto " << ((Ins *) i.i.link - &base);
+                       break;
+               case FORK:
+                       o << "fork " << ((Ins *) i.i.link - &base);
+                       break;
+               case CTXT:
+                       o << "ctxt";
+                       break;
+               case TERM:
+                       o << "term " << ((RuleOp *) i.i.link)->rank;
+                       break;
+       }
+       o << "\n";
+       return ret;
+}
+
+} // namespace re2c
diff --git a/re2c/src/ir/bytecode/ins.h b/re2c/src/ir/bytecode/ins.h

new file mode 100644 (file)

index 0000000..6cd14d5
--- /dev/null
+++ b/re2c/src/ir/bytecode/ins.h
@@ -0,0 +1,52 @@
+#ifndef _RE2C_IR_BYTECODE_INS_
+#define _RE2C_IR_BYTECODE_INS_
+
+#include <iosfwd>
+
+#include "src/util/c99_stdint.h"
+
+namespace re2c
+{
+
+static const uint32_t CHAR = 0;
+static const uint32_t GOTO = 1;
+static const uint32_t FORK = 2;
+static const uint32_t TERM = 3;
+static const uint32_t CTXT = 4;
+
+union Ins
+{
+       struct
+       {
+               uint8_t tag;
+               uint8_t marked;
+               void * link;
+       } i;
+       struct
+       {
+               uint32_t value;
+               uint32_t bump;
+               void * link;
+       } c;
+};
+
+inline bool isMarked (Ins * i)
+{
+       return i->i.marked != 0;
+}
+
+inline void mark (Ins * i)
+{
+       i->i.marked = true;
+}
+
+inline void unmark (Ins * i)
+{
+       i->i.marked = false;
+}
+
+const Ins * showIns (std::ostream & o, const Ins & i, const Ins & base);
+
+} // namespace re2c
+
+#endif // _RE2C_IR_BYTECODE_INS_
diff --git a/re2c/src/ir/bytecode/split.cc b/re2c/src/ir/bytecode/split.cc

new file mode 100644 (file)

index 0000000..7e0bc6d
--- /dev/null
+++ b/re2c/src/ir/bytecode/split.cc
@@ -0,0 +1,82 @@
+#include "src/ir/regexp/regexp_alt.h"
+#include "src/ir/regexp/regexp_cat.h"
+#include "src/ir/regexp/regexp_close.h"
+#include "src/ir/regexp/regexp_closev.h"
+#include "src/ir/regexp/regexp_match.h"
+#include "src/ir/regexp/regexp_null.h"
+#include "src/ir/regexp/regexp_rule.h"
+
+namespace re2c {
+
+void AltOp::split (CharSet & s)
+{
+       exp1->split (s);
+       exp2->split (s);
+}
+
+void CatOp::split (CharSet & s)
+{
+       exp1->split (s);
+       exp2->split (s);
+}
+
+void CloseOp::split (CharSet & s)
+{
+       exp->split (s);
+}
+
+void CloseVOp::split (CharSet & s)
+{
+       exp->split (s);
+}
+
+void MatchOp::split (CharSet & s)
+{
+       for (Range *r = match; r; r = r->next)
+       {
+               for (uint32_t c = r->lb; c < r->ub; ++c)
+               {
+                       CharPtn * x = s.rep[c];
+                       CharPtn * a = x->nxt;
+                       if (!a)
+                       {
+                               if (x->card == 1)
+                               {
+                                       continue;
+                               }
+                               x->nxt = a = s.freeHead;
+                               if (!(s.freeHead = s.freeHead->nxt))
+                               {
+                                       s.freeTail = &s.freeHead;
+                               }
+                               a->nxt = NULL;
+                               x->fix = s.fix;
+                               s.fix = x;
+                       }
+                       if (--(x->card) == 0)
+                       {
+                               *s.freeTail = x;
+                               *(s.freeTail = &x->nxt) = NULL;
+                       }
+                       s.rep[c] = a;
+                       ++(a->card);
+               }
+       }
+       for (; s.fix; s.fix = s.fix->fix)
+       {
+               if (s.fix->card)
+               {
+                       s.fix->nxt = NULL;
+               }
+       }
+}
+
+void NullOp::split (CharSet &) {}
+
+void RuleOp::split (CharSet & s)
+{
+       exp->split (s);
+       ctx->split (s);
+}
+
+} // namespace re2c
diff --git a/re2c/src/dfa/action.h b/re2c/src/ir/dfa/action.h

similarity index 93%

rename from re2c/src/dfa/action.h

rename to re2c/src/ir/dfa/action.h

index a188913d814df60dee32d9d962708b50ea84de1f..582d1abf26218871543145881759ee7aff3fb9c3 100644 (file)
--- a/re2c/src/dfa/action.h
+++ b/re2c/src/ir/dfa/action.h
@@ -1,5 +1,5 @@
-#ifndef _RE2C_DFA_ACTION_
-#define _RE2C_DFA_ACTION_
+#ifndef _RE2C_IR_DFA_ACTION_
+#define _RE2C_IR_DFA_ACTION_
  
  #include <vector>
  
@@ -106,4 +106,4 @@ private:
  
  } // namespace re2c
  
-#endif // _RE2C_DFA_ACTION_
+#endif // _RE2C_IR_DFA_ACTION_
diff --git a/re2c/src/dfa/dfa.cc b/re2c/src/ir/dfa/dfa.cc

similarity index 98%

rename from re2c/src/dfa/dfa.cc

rename to re2c/src/ir/dfa/dfa.cc

index 246e21210a488981f3622e8f1305f658deda5fee..2721ea67f091e96964689495d5909dcfc73e63b8 100644 (file)
--- a/re2c/src/dfa/dfa.cc
+++ b/re2c/src/ir/dfa/dfa.cc
@@ -1,6 +1,7 @@
  #include <string.h>
  
-#include "src/dfa/dfa.h"
+#include "src/ir/dfa/dfa.h"
+#include "src/ir/regexp/regexp_rule.h"
  #include "src/util/allocate.h"
  
  namespace re2c
diff --git a/re2c/src/dfa/dfa.h b/re2c/src/ir/dfa/dfa.h

similarity index 80%

rename from re2c/src/dfa/dfa.h

rename to re2c/src/ir/dfa/dfa.h

index fd00cecf04020bad5fbda2c616e5d9d8e63a5a5e..98229f670071c9552f2b7273a16677704644b24d 100644 (file)
--- a/re2c/src/dfa/dfa.h
+++ b/re2c/src/ir/dfa/dfa.h
@@ -1,10 +1,9 @@
-#ifndef _RE2C_DFA_DFA_
-#define _RE2C_DFA_DFA_
+#ifndef _RE2C_IR_DFA_DFA_
+#define _RE2C_IR_DFA_DFA_
  
-#include "src/dfa/action.h"
-#include "src/dfa/state.h"
+#include "src/ir/dfa/action.h"
+#include "src/ir/dfa/state.h"
  #include "src/util/forbid_copy.h"
-#include "src/util/smart_ptr.h"
  
  namespace re2c
  {
@@ -44,8 +43,6 @@ public:
         FORBID_COPY (DFA);
  };
  
-smart_ptr<DFA> genCode (RegExp *, Output &, uint32_t);
-
  } // namespace re2c
  
-#endif // _RE2C_DFA_DFA_
+#endif // _RE2C_IR_DFA_DFA_
diff --git a/re2c/src/dfa/state.h b/re2c/src/ir/dfa/state.h

similarity index 80%

rename from re2c/src/dfa/state.h

rename to re2c/src/ir/dfa/state.h

index 720135017d0c566c79a6b52a115ed9d5c3642e0d..41af7dfdb12805a18dfad00c4efa95cb237b5268 100644 (file)
--- a/re2c/src/dfa/state.h
+++ b/re2c/src/ir/dfa/state.h
@@ -1,9 +1,9 @@
-#ifndef _RE2C_DFA_STATE_
-#define _RE2C_DFA_STATE_
+#ifndef _RE2C_IR_DFA_STATE_
+#define _RE2C_IR_DFA_STATE_
  
  #include "src/codegen/go.h"
-#include "src/dfa/action.h"
-#include "src/dfa/re.h"
+#include "src/ir/dfa/action.h"
+#include "src/ir/regexp/regexp.h"
  #include "src/util/forbid_copy.h"
  
  namespace re2c
@@ -49,4 +49,4 @@ public:
  
  } // namespace re2c
  
-#endif // _RE2C_DFA_STATE_
+#endif // _RE2C_IR_DFA_STATE_
diff --git a/re2c/src/ir/regexp/display.cc b/re2c/src/ir/regexp/display.cc

new file mode 100644 (file)

index 0000000..1e12d35
--- /dev/null
+++ b/re2c/src/ir/regexp/display.cc
@@ -0,0 +1,57 @@
+#include <iostream>
+
+#include "src/ir/regexp/regexp.h"
+#include "src/ir/regexp/regexp_alt.h"
+#include "src/ir/regexp/regexp_cat.h"
+#include "src/ir/regexp/regexp_close.h"
+#include "src/ir/regexp/regexp_closev.h"
+#include "src/ir/regexp/regexp_match.h"
+#include "src/ir/regexp/regexp_null.h"
+#include "src/ir/regexp/regexp_rule.h"
+
+namespace re2c
+{
+
+std::ostream & operator << (std::ostream & o, const RegExp & re)
+{
+       re.display (o);
+       return o;
+}
+
+void AltOp::display (std::ostream & o) const
+{
+       o << exp1 << "|" << exp2;
+}
+
+void CatOp::display (std::ostream & o) const
+{
+       o << exp1 << exp2;
+}
+
+void CloseOp::display (std::ostream & o) const
+{
+       o << exp << "+";
+}
+
+void CloseVOp::display (std::ostream & o) const
+{
+       o << exp << "+";
+}
+
+void MatchOp::display (std::ostream & o) const
+{
+       o << match;
+}
+
+void NullOp::display (std::ostream & o) const
+{
+       o << "_";
+}
+
+void RuleOp::display (std::ostream & o) const
+{
+       o << exp << "/" << ctx << ";";
+}
+
+} // end namespace re2c
+
diff --git a/re2c/src/dfa/encoding/enc.cc b/re2c/src/ir/regexp/encoding/enc.cc

similarity index 99%

rename from re2c/src/dfa/encoding/enc.cc

rename to re2c/src/ir/regexp/encoding/enc.cc

index 2d77597260a052c55398aff2ab4c5a37c79c0c04..9cd8e3b11592346f24b15b1b4fe60d2f80227b79 100644 (file)
--- a/re2c/src/dfa/encoding/enc.cc
+++ b/re2c/src/ir/regexp/encoding/enc.cc
@@ -1,4 +1,4 @@
-#include "src/dfa/encoding/enc.h"
+#include "src/ir/regexp/encoding/enc.h"
  
  namespace re2c {
  
diff --git a/re2c/src/dfa/encoding/enc.h b/re2c/src/ir/regexp/encoding/enc.h

similarity index 97%

rename from re2c/src/dfa/encoding/enc.h

rename to re2c/src/ir/regexp/encoding/enc.h

index f54997f3d8fb9782cb824734066b3ab053e7c43c..ca5399b43a75b1c27016a514fab5c1b9b1f698c4 100644 (file)
--- a/re2c/src/dfa/encoding/enc.h
+++ b/re2c/src/ir/regexp/encoding/enc.h
@@ -1,5 +1,5 @@
-#ifndef _RE2C_DFA_ENCODING_ENC_
-#define _RE2C_DFA_ENCODING_ENC_
+#ifndef _RE2C_IR_REGEXP_ENCODING_ENC_
+#define _RE2C_IR_REGEXP_ENCODING_ENC_
  
  #include "src/util/c99_stdint.h"
  #include "src/util/range.h"
@@ -172,4 +172,4 @@ inline void Enc::setPolicy(policy_t t)
  
  } // namespace re2c
  
-#endif // _RE2C_DFA_ENCODING_ENC_
+#endif // _RE2C_IR_REGEXP_ENCODING_ENC_
diff --git a/re2c/src/dfa/encoding/range_suffix.cc b/re2c/src/ir/regexp/encoding/range_suffix.cc

similarity index 77%

rename from re2c/src/dfa/encoding/range_suffix.cc

rename to re2c/src/ir/regexp/encoding/range_suffix.cc

index 3c8ccb50d4aaaa447ae0a43a092c0c0abc4b9698..96615927697114c56964420dcbe0eb9fbbb7c0c6 100644 (file)
--- a/re2c/src/dfa/encoding/range_suffix.cc
+++ b/re2c/src/ir/regexp/encoding/range_suffix.cc
@@ -1,5 +1,6 @@
-#include "src/dfa/encoding/range_suffix.h"
-#include "src/dfa/re.h"
+#include "src/ir/regexp/encoding/range_suffix.h"
+#include "src/ir/regexp/regexp.h"
+#include "src/ir/regexp/regexp_match.h"
  
  namespace re2c {
  
diff --git a/re2c/src/dfa/encoding/range_suffix.h b/re2c/src/ir/regexp/encoding/range_suffix.h

similarity index 79%

rename from re2c/src/dfa/encoding/range_suffix.h

rename to re2c/src/ir/regexp/encoding/range_suffix.h

index 770c0bc66812cbf423556f25d58a74bf3a878c20..9300897788e6d791da3ce9cdd2efa5866143c561 100644 (file)
--- a/re2c/src/dfa/encoding/range_suffix.h
+++ b/re2c/src/ir/regexp/encoding/range_suffix.h
@@ -1,5 +1,5 @@
-#ifndef _RE2C_DFA_ENCODING_RANGE_SUFFIX_
-#define _RE2C_DFA_ENCODING_RANGE_SUFFIX_
+#ifndef _RE2C_IR_REGEXP_ENCODING_RANGE_SUFFIX_
+#define _RE2C_IR_REGEXP_ENCODING_RANGE_SUFFIX_
  
  #include <stddef.h> // NULL
  
@@ -37,4 +37,4 @@ RegExp * emit(RangeSuffix * p, RegExp * re);
  
  } // namespace re2c
  
-#endif // _RE2C_DFA_ENCODING_RANGE_SUFFIX_
+#endif // _RE2C_IR_REGEXP_ENCODING_RANGE_SUFFIX_
diff --git a/re2c/src/dfa/encoding/utf16/utf16.cc b/re2c/src/ir/regexp/encoding/utf16/utf16.cc

similarity index 82%

rename from re2c/src/dfa/encoding/utf16/utf16.cc

rename to re2c/src/ir/regexp/encoding/utf16/utf16.cc

index 5b2d8448d0383b8714076afa554ce87c45528d2a..47743edf18d0967c001cdb70345ae118ec7c0752 100644 (file)
--- a/re2c/src/dfa/encoding/utf16/utf16.cc
+++ b/re2c/src/ir/regexp/encoding/utf16/utf16.cc
@@ -1,4 +1,4 @@
-#include "src/dfa/encoding/utf16/utf16.h"
+#include "src/ir/regexp/encoding/utf16/utf16.h"
  
  namespace re2c {
  
diff --git a/re2c/src/dfa/encoding/utf16/utf16.h b/re2c/src/ir/regexp/encoding/utf16/utf16.h

similarity index 82%

rename from re2c/src/dfa/encoding/utf16/utf16.h

rename to re2c/src/ir/regexp/encoding/utf16/utf16.h

index c6389d263c0522e97063bce4703b79c1af65cae3..d64de3494d849783edbaebf3b4aa1d7405f68cad 100644 (file)
--- a/re2c/src/dfa/encoding/utf16/utf16.h
+++ b/re2c/src/ir/regexp/encoding/utf16/utf16.h
@@ -1,5 +1,5 @@
-#ifndef _RE2C_DFA_ENCODING_UTF16_UTF16_
-#define _RE2C_DFA_ENCODING_UTF16_UTF16_
+#ifndef _RE2C_IR_REGEXP_ENCODING_UTF16_UTF16_
+#define _RE2C_IR_REGEXP_ENCODING_UTF16_UTF16_
  
  #include "src/util/c99_stdint.h"
  
@@ -34,4 +34,4 @@ inline uint16_t utf16::trail_surr(rune r)
  
  }  // namespace re2c
  
-#endif // _RE2C_DFA_ENCODING_UTF16_UTF16_
+#endif // _RE2C_IR_REGEXP_ENCODING_UTF16_UTF16_
diff --git a/re2c/src/dfa/encoding/utf16/utf16_range.cc b/re2c/src/ir/regexp/encoding/utf16/utf16_range.cc

similarity index 97%

rename from re2c/src/dfa/encoding/utf16/utf16_range.cc

rename to re2c/src/ir/regexp/encoding/utf16/utf16_range.cc

index 187f42ffa92929a4a0aa53864e19f3ca129a1d03..c4f27f6cd0cd570facd38212c637af3419935942 100644 (file)
--- a/re2c/src/dfa/encoding/utf16/utf16_range.cc
+++ b/re2c/src/ir/regexp/encoding/utf16/utf16_range.cc
@@ -1,5 +1,5 @@
-#include "src/dfa/encoding/utf16/utf16_range.h"
-#include "src/dfa/re.h"
+#include "src/ir/regexp/encoding/utf16/utf16_range.h"
+#include "src/ir/regexp/regexp.h"
  
  namespace re2c {
  
diff --git a/re2c/src/dfa/encoding/utf16/utf16_range.h b/re2c/src/ir/regexp/encoding/utf16/utf16_range.h

similarity index 63%

rename from re2c/src/dfa/encoding/utf16/utf16_range.h

rename to re2c/src/ir/regexp/encoding/utf16/utf16_range.h

index 66cf3b9139aa57e98205d88bf3f042777cfef730..4e75560d88b95d785c30b928b292220c059e07e7 100644 (file)
--- a/re2c/src/dfa/encoding/utf16/utf16_range.h
+++ b/re2c/src/ir/regexp/encoding/utf16/utf16_range.h
@@ -1,8 +1,8 @@
-#ifndef _RE2C_DFA_ENCODING_UTF16_RANGE_
-#define _RE2C_DFA_ENCODING_UTF16_RANGE_
+#ifndef _RE2C_IR_REGEXP_ENCODING_UTF16_RANGE_
+#define _RE2C_IR_REGEXP_ENCODING_UTF16_RANGE_
  
-#include "src/dfa/encoding/range_suffix.h"
-#include "src/dfa/encoding/utf16/utf16.h"
+#include "src/ir/regexp/encoding/range_suffix.h"
+#include "src/ir/regexp/encoding/utf16/utf16.h"
  
  namespace re2c {
  
@@ -13,4 +13,4 @@ void UTF16splitByRuneLength(RangeSuffix * & root, utf16::rune l, utf16::rune h);
  
  } // namespace re2c
  
-#endif // _RE2C_DFA_ENCODING_UTF16_RANGE_
+#endif // _RE2C_IR_REGEXP_ENCODING_UTF16_RANGE_
diff --git a/re2c/src/dfa/encoding/utf16/utf16_regexp.cc b/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc

similarity index 77%

rename from re2c/src/dfa/encoding/utf16/utf16_regexp.cc

rename to re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc

index ee26ce0e10c9de1e5b8306f78234a42ffbb0682b..2e7f69df8bcadb53b2350196ec15c50d26b26c17 100644 (file)
--- a/re2c/src/dfa/encoding/utf16/utf16_regexp.cc
+++ b/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc
@@ -1,6 +1,8 @@
-#include "src/dfa/encoding/utf16/utf16_range.h"
-#include "src/dfa/encoding/utf16/utf16_regexp.h"
-#include "src/dfa/re.h"
+#include "src/ir/regexp/encoding/utf16/utf16_range.h"
+#include "src/ir/regexp/encoding/utf16/utf16_regexp.h"
+#include "src/ir/regexp/regexp.h"
+#include "src/ir/regexp/regexp_cat.h"
+#include "src/ir/regexp/regexp_match.h"
  
  namespace re2c {
  
diff --git a/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.h b/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.h

new file mode 100644 (file)

index 0000000..af1b7e7
--- /dev/null
+++ b/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.h
@@ -0,0 +1,16 @@
+#ifndef _RE2C_IR_REGEXP_ENCODING_UTF16_REGEXP_
+#define _RE2C_IR_REGEXP_ENCODING_UTF16_REGEXP_
+
+#include "src/ir/regexp/encoding/utf16/utf16.h"
+#include "src/util/range.h"
+
+namespace re2c {
+
+class RegExp; // forward
+
+RegExp * UTF16Symbol(utf16::rune r);
+RegExp * UTF16Range(const Range * r);
+
+} // namespace re2c
+
+#endif // _RE2C_IR_REGEXP_ENCODING_UTF16_REGEXP_
diff --git a/re2c/src/dfa/encoding/utf8/utf8.cc b/re2c/src/ir/regexp/encoding/utf8/utf8.cc

similarity index 97%

rename from re2c/src/dfa/encoding/utf8/utf8.cc

rename to re2c/src/ir/regexp/encoding/utf8/utf8.cc

index 702569332efcb02eceb1c9dc2b93d8997e5e10da..8332e1311c29d0d59464575b2dcf21232c85c153 100644 (file)
--- a/re2c/src/dfa/encoding/utf8/utf8.cc
+++ b/re2c/src/ir/regexp/encoding/utf8/utf8.cc
@@ -1,4 +1,4 @@
-#include "src/dfa/encoding/utf8/utf8.h"
+#include "src/ir/regexp/encoding/utf8/utf8.h"
  
  namespace re2c {
  
diff --git a/re2c/src/dfa/encoding/utf8/utf8.h b/re2c/src/ir/regexp/encoding/utf8/utf8.h

similarity index 87%

rename from re2c/src/dfa/encoding/utf8/utf8.h

rename to re2c/src/ir/regexp/encoding/utf8/utf8.h

index 9c049a500202a410548ab4f6e6e77092cfab3508..72d14ccac3fee1042d7ed35fdfe01c9c8c4eab4b 100644 (file)
--- a/re2c/src/dfa/encoding/utf8/utf8.h
+++ b/re2c/src/ir/regexp/encoding/utf8/utf8.h
@@ -1,5 +1,5 @@
-#ifndef _RE2C_DFA_ENCODING_UTF8_UTF8_
-#define _RE2C_DFA_ENCODING_UTF8_UTF8_
+#ifndef _RE2C_IR_REGEXP_ENCODING_UTF8_UTF8_
+#define _RE2C_IR_REGEXP_ENCODING_UTF8_UTF8_
  
  #include "src/util/c99_stdint.h"
  
@@ -43,4 +43,4 @@ public:
  
  }  // namespace re2c
  
-#endif // _RE2C_DFA_ENCODING_UTF8_UTF8_
+#endif // _RE2C_IR_REGEXP_ENCODING_UTF8_UTF8_
diff --git a/re2c/src/dfa/encoding/utf8/utf8_range.cc b/re2c/src/ir/regexp/encoding/utf8/utf8_range.cc

similarity index 97%

rename from re2c/src/dfa/encoding/utf8/utf8_range.cc

rename to re2c/src/ir/regexp/encoding/utf8/utf8_range.cc

index c8fd9cb9f33bd193146d454907dac660239b4ab4..466526af555932ad375c42d1a4b2b4cc1c7a8e99 100644 (file)
--- a/re2c/src/dfa/encoding/utf8/utf8_range.cc
+++ b/re2c/src/ir/regexp/encoding/utf8/utf8_range.cc
@@ -1,5 +1,5 @@
-#include "src/dfa/encoding/utf8/utf8_range.h"
-#include "src/dfa/re.h"
+#include "src/ir/regexp/encoding/utf8/utf8_range.h"
+#include "src/ir/regexp/regexp.h"
  
  namespace re2c {
  
diff --git a/re2c/src/dfa/encoding/utf8/utf8_range.h b/re2c/src/ir/regexp/encoding/utf8/utf8_range.h

similarity index 55%

rename from re2c/src/dfa/encoding/utf8/utf8_range.h

rename to re2c/src/ir/regexp/encoding/utf8/utf8_range.h

index e4f91da34557cd22fd4e29dc8da6316d97921189..490e76bc44ecc59809083eca7ea85254f51b7add 100644 (file)
--- a/re2c/src/dfa/encoding/utf8/utf8_range.h
+++ b/re2c/src/ir/regexp/encoding/utf8/utf8_range.h
@@ -1,8 +1,8 @@
-#ifndef _RE2C_DFA_ENCODING_UTF8_RANGE_
-#define _RE2C_DFA_ENCODING_UTF8_RANGE_
+#ifndef _RE2C_IR_REGEXP_ENCODING_UTF8_RANGE_
+#define _RE2C_IR_REGEXP_ENCODING_UTF8_RANGE_
  
-#include "src/dfa/encoding/range_suffix.h"
-#include "src/dfa/encoding/utf8/utf8.h"
+#include "src/ir/regexp/encoding/range_suffix.h"
+#include "src/ir/regexp/encoding/utf8/utf8.h"
  
  namespace re2c {
  
@@ -12,4 +12,4 @@ void UTF8splitByRuneLength(RangeSuffix * & p, utf8::rune l, utf8::rune h);
  
  } // namespace re2c
  
-#endif // _RE2C_DFA_ENCODING_UTF8_RANGE_
+#endif // _RE2C_IR_REGEXP_ENCODING_UTF8_RANGE_
diff --git a/re2c/src/dfa/encoding/utf8/utf8_regexp.cc b/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc

similarity index 78%

rename from re2c/src/dfa/encoding/utf8/utf8_regexp.cc

rename to re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc

index 1b9fb06c95f8187a4fc7f9e010c63c26ece4cf53..0e6001d3787046a94d10d8e36adb1820e2429856 100644 (file)
--- a/re2c/src/dfa/encoding/utf8/utf8_regexp.cc
+++ b/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc
@@ -1,6 +1,8 @@
-#include "src/dfa/encoding/utf8/utf8_range.h"
-#include "src/dfa/encoding/utf8/utf8_regexp.h"
-#include "src/dfa/re.h"
+#include "src/ir/regexp/encoding/utf8/utf8_range.h"
+#include "src/ir/regexp/encoding/utf8/utf8_regexp.h"
+#include "src/ir/regexp/regexp.h"
+#include "src/ir/regexp/regexp_cat.h"
+#include "src/ir/regexp/regexp_match.h"
  
  namespace re2c {
  
diff --git a/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.h b/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.h

new file mode 100644 (file)

index 0000000..434fc32
--- /dev/null
+++ b/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.h
@@ -0,0 +1,16 @@
+#ifndef _RE2C_IR_REGEXP_ENCODING_UTF8_REGEXP_
+#define _RE2C_IR_REGEXP_ENCODING_UTF8_REGEXP_
+
+#include "src/ir/regexp/encoding/utf8/utf8.h"
+#include "src/util/range.h"
+
+namespace re2c {
+
+class RegExp; // forward
+
+RegExp * UTF8Symbol(utf8::rune r);
+RegExp * UTF8Range(const Range * r);
+
+} // namespace re2c
+
+#endif // _RE2C_IR_REGEXP_ENCODING_UTF8_REGEXP_
diff --git a/re2c/src/ir/regexp/fixed_length.cc b/re2c/src/ir/regexp/fixed_length.cc

new file mode 100644 (file)

index 0000000..7319be8
--- /dev/null
+++ b/re2c/src/ir/regexp/fixed_length.cc
@@ -0,0 +1,53 @@
+#include "src/ir/regexp/regexp.h"
+#include "src/ir/regexp/regexp_alt.h"
+#include "src/ir/regexp/regexp_cat.h"
+#include "src/ir/regexp/regexp_match.h"
+#include "src/ir/regexp/regexp_null.h"
+
+namespace re2c
+{
+
+uint32_t RegExp::fixedLength ()
+{
+       return ~0u;
+}
+
+uint32_t AltOp::fixedLength ()
+{
+       uint32_t l1 = exp1->fixedLength ();
+       uint32_t l2 = exp1->fixedLength ();
+
+       if (l1 != l2 || l1 == ~0u)
+       {
+               return ~0u;
+       }
+
+       return l1;
+}
+
+uint32_t CatOp::fixedLength ()
+{
+       const uint32_t l1 = exp1->fixedLength ();
+       if (l1 != ~0u)
+       {
+               const uint32_t l2 = exp2->fixedLength ();
+               if (l2 != ~0u)
+               {
+                       return l1 + l2;
+               }
+       }
+       return ~0u;
+}
+
+uint32_t MatchOp::fixedLength ()
+{
+       return 1;
+}
+
+uint32_t NullOp::fixedLength ()
+{
+       return 0;
+}
+
+} // end namespace re2c
+
diff --git a/re2c/src/ir/regexp/regexp.cc b/re2c/src/ir/regexp/regexp.cc

new file mode 100644 (file)

index 0000000..67515cb
--- /dev/null
+++ b/re2c/src/ir/regexp/regexp.cc
@@ -0,0 +1,287 @@
+#include "src/ir/regexp/encoding/utf16/utf16_regexp.h"
+#include "src/ir/regexp/encoding/utf8/utf8_regexp.h"
+#include "src/ir/regexp/regexp.h"
+#include "src/ir/regexp/regexp_alt.h"
+#include "src/ir/regexp/regexp_cat.h"
+#include "src/ir/regexp/regexp_match.h"
+#include "src/ir/regexp/regexp_null.h"
+#include "src/parse/scanner.h"
+#include "src/util/range.h"
+
+namespace re2c
+{
+
+static MatchOp * merge (MatchOp * m1, MatchOp * m2);
+
+RegExp * doAlt (RegExp * e1, RegExp * e2)
+{
+       if (!e1)
+       {
+               return e2;
+       }
+       if (!e2)
+       {
+               return e1;
+       }
+       return new AltOp (e1, e2);
+}
+
+RegExp * mkAlt (RegExp * e1, RegExp * e2)
+{
+       AltOp * a;
+       MatchOp * m1;
+       MatchOp * m2;
+
+       a = dynamic_cast<AltOp*> (e1);
+       if (a != NULL)
+       {
+               m1 = dynamic_cast<MatchOp*> (a->exp1);
+               if (m1 != NULL)
+               {
+                       m1->ins_access = e1->ins_access;
+                       a->exp2->ins_access = e1->ins_access;
+                       e1 = a->exp2;
+               }
+       }
+       else
+       {
+               m1 = dynamic_cast<MatchOp*> (e1);
+               if (m1 != NULL)
+               {
+                       e1 = NULL;
+               }
+       }
+       a = dynamic_cast<AltOp*> (e2);
+       if (a != NULL)
+       {
+               m2 = dynamic_cast<MatchOp*> (a->exp1);
+               if (m2 != NULL)
+               {
+                       m2->ins_access = e2->ins_access;
+                       a->exp2->ins_access = e2->ins_access;
+                       e2 = a->exp2;
+               }
+       }
+       else
+       {
+               m2 = dynamic_cast<MatchOp*> (e2);
+               if (m2 != NULL)
+               {
+                       e2 = NULL;
+               }
+       }
+
+       return doAlt (merge (m1, m2), doAlt (e1, e2));
+}
+
+MatchOp * merge (MatchOp * m1, MatchOp * m2)
+{
+       if (!m1)
+       {
+               return m2;
+       }
+       if (!m2)
+       {
+               return m1;
+       }
+       MatchOp * m = new MatchOp (doUnion (m1->match, m2->match));
+       if (m1->ins_access == RegExp::PRIVATE
+               || m2->ins_access == RegExp::PRIVATE)
+       {
+               m->ins_access = RegExp::PRIVATE;
+       }
+       return m;
+}
+
+RegExp * doCat (RegExp * e1, RegExp * e2)
+{
+       if (!e1)
+       {
+               return e2;
+       }
+       if (!e2)
+       {
+               return e1;
+       }
+       return new CatOp (e1, e2);
+}
+
+RegExp * mkDiff (RegExp * e1, RegExp * e2)
+{
+       MatchOp * m1 = dynamic_cast<MatchOp *> (e1);
+       MatchOp * m2 = dynamic_cast<MatchOp *> (e2);
+       if (m1 == NULL || m2 == NULL)
+       {
+               return NULL;
+       }
+       Range * r = doDiff (m1->match, m2->match);
+       return r
+               ? (RegExp *) new MatchOp(r)
+               : (RegExp *) new NullOp;
+}
+
+Range * Scanner::getRange(SubStr &s) const
+{
+       uint32_t lb = unescape(s), ub;
+
+       if (s.len < 2 || *s.str != '-')
+       {
+               ub = lb;
+       }
+       else
+       {
+               s.len--;
+               s.str++;
+               ub = unescape(s);
+               if (ub < lb)
+               {
+                       uint32_t tmp = lb;
+                       lb = ub;
+                       ub = tmp;
+               }
+       }
+
+       Range * r = encoding.encodeRange(lb, ub);
+       if (r == NULL)
+               fatalf("Bad code point range: '0x%X - 0x%X'", lb, ub);
+       return r;
+}
+
+RegExp * Scanner::matchSymbol(uint32_t c) const
+{
+       if (!encoding.encode(c))
+               fatalf("Bad code point: '0x%X'", c);
+
+       if (encoding.is(Enc::UTF16))
+               return UTF16Symbol(c);
+       else if (encoding.is(Enc::UTF8))
+               return UTF8Symbol(c);
+       else
+               return new MatchOp(new Range(c, c + 1));
+}
+
+RegExp * Scanner::strToRE (SubStr & s) const
+{
+       if (s.len == 0)
+               return new NullOp;
+
+       RegExp *re = matchSymbol(unescape(s));
+
+       while (s.len > 0)
+               re = new CatOp(re, matchSymbol(unescape(s)));
+
+       return re;
+}
+
+RegExp * Scanner::strToCaseInsensitiveRE (SubStr & s) const
+{
+       if (s.len == 0)
+               return new NullOp;
+
+       uint32_t c = unescape(s);
+
+       RegExp *re, *reL, *reU;
+
+       if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
+       {
+               reL = matchSymbol(tolower(c));
+               reU = matchSymbol(toupper(c));
+               re = mkAlt(reL, reU);
+       }
+       else
+       {
+               re = matchSymbol(c);
+       }
+
+       while (s.len > 0)
+       {
+               c = unescape(s);
+
+               if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
+               {
+                       reL = matchSymbol(tolower(c));
+                       reU = matchSymbol(toupper(c));
+                       re = new CatOp(re, mkAlt(reL, reU));
+               }
+               else
+               {
+                       re = new CatOp(re, matchSymbol(c));
+               }
+       }
+
+       return re;
+}
+
+Range * Scanner::mkRange(SubStr &s) const
+{
+       Range *r = getRange(s);
+       while (s.len > 0)
+               r = doUnion(r, getRange(s));
+
+       return r;
+}
+
+RegExp * Scanner::matchSymbolRange(Range * r) const
+{
+       if (encoding.is(Enc::UTF16))
+               return UTF16Range(r);
+       else if (encoding.is(Enc::UTF8))
+               return UTF8Range(r);
+       else
+               return new MatchOp(r);
+}
+
+RegExp * Scanner::ranToRE (SubStr & s) const
+{
+       s.len -= 2;
+       s.str += 1;
+
+       if (s.len == 0)
+               return new NullOp;
+
+       return matchSymbolRange(mkRange(s));
+}
+
+RegExp * Scanner::invToRE (SubStr & s) const
+{
+       s.len -= 3;
+       s.str += 2;
+
+       Range * full = encoding.fullRange();
+
+       Range * r = s.len == 0
+               ? full
+               : doDiff(full, mkRange (s));
+
+       return matchSymbolRange(r);
+}
+
+RegExp * Scanner::mkDot() const
+{
+       Range * full = encoding.fullRange();
+       uint32_t c = '\n';
+       if (!encoding.encode(c))
+               fatalf("Bad code point: '0x%X'", c);
+       Range * ran = new Range(c, c + 1);
+       Range * inv = doDiff(full, ran);
+
+       return matchSymbolRange(inv);
+}
+
+/*
+ * Create a byte range that includes all possible input characters.
+ * This may include characters, which do not map to any valid symbol
+ * in current encoding. For encodings, which directly map symbols to
+ * input characters (ASCII, EBCDIC, UTF-32), it equals [^]. For other
+ * encodings (UTF-16, UTF-8), [^] and this range are different.
+ *
+ * Also note that default range doesn't respect encoding policy
+ * (the way invalid code points are treated).
+ */
+RegExp * Scanner::mkDefault() const
+{
+       Range * def = new Range(0, encoding.nCodeUnits());
+       return new MatchOp(def);
+}
+
+} // namespace re2c
diff --git a/re2c/src/ir/regexp/regexp.h b/re2c/src/ir/regexp/regexp.h

new file mode 100644 (file)

index 0000000..758d3b4
--- /dev/null
+++ b/re2c/src/ir/regexp/regexp.h
@@ -0,0 +1,76 @@
+#ifndef _RE2C_IR_REGEXP_REGEXP_
+#define _RE2C_IR_REGEXP_REGEXP_
+
+#include <iosfwd>
+
+#include "src/ir/bytecode/charset.h"
+#include "src/ir/bytecode/ins.h"
+#include "src/globals.h"
+#include "src/util/forbid_copy.h"
+
+namespace re2c
+{
+
+typedef uint32_t Char;
+
+class RegExp
+{
+public:
+       static free_list <RegExp *> vFreeList;
+
+       uint32_t size;
+       /*
+        * There're several different cases when the same regexp
+        * can be used multiple times:
+        *      1) named definitions, e.g. digit = [0-9];
+        *      2) counted repetition, e.g. "a"{3}, "a"{3,}, "a"{3,5}
+        *      3) multiple DFA's sharing the same regexp , e.g. <c1, c2> "abc" { }
+        *      4) common suffixes, e.g. suffix [\x80-\xBF] in UTF-8 ranges
+        * In cases 1-3, regexp must be recompiled each time it's reused.
+        * In case 4, regexp should be compiled only once, and instructions
+        * should be shared in order to reduce space.
+        *
+        * Note: if regexp must always be recompiled, it doesn't imply that
+        * parts of this regexp must always be recompiled. It only forces
+        * the compilation function to traverse the regexp after compilation
+        * and reset compilation cache for each sub-regexp. E.g., for a regexp
+        * [^]{3} in UTF-8 mode, each of sub-regexps [^] will have common suffix
+        * [\x80-\xBF] factored out, but they won't share instructions.
+        */
+       Ins * ins_cache; /* if non-NULL, points to compiled instructions */
+       enum InsAccess
+       {
+               SHARED,
+               PRIVATE
+       } ins_access;
+
+       inline RegExp ()
+               : size (0)
+               , ins_cache (NULL)
+               , ins_access (SHARED)
+       {
+               vFreeList.insert (this);
+       }
+       inline virtual ~RegExp ()
+       {
+               vFreeList.erase (this);
+       }
+       virtual void split (CharSet &) = 0;
+       virtual void calcSize (Char *) = 0;
+       virtual uint32_t fixedLength ();
+       virtual uint32_t compile (Char *, Ins *) = 0;
+       virtual void decompile () = 0;
+       virtual void display (std::ostream &) const = 0;
+       friend std::ostream & operator << (std::ostream & o, const RegExp & re);
+
+       FORBID_COPY (RegExp);
+};
+
+RegExp * doAlt (RegExp * e1, RegExp * e2);
+RegExp * mkAlt (RegExp * e1, RegExp * e2);
+RegExp * doCat (RegExp * e1, RegExp * e2);
+RegExp * mkDiff (RegExp * e1, RegExp * e2);
+
+} // end namespace re2c
+
+#endif // _RE2C_IR_REGEXP_REGEXP_
diff --git a/re2c/src/ir/regexp/regexp_alt.h b/re2c/src/ir/regexp/regexp_alt.h

new file mode 100644 (file)

index 0000000..fb84be5
--- /dev/null
+++ b/re2c/src/ir/regexp/regexp_alt.h
@@ -0,0 +1,32 @@
+#ifndef _RE2C_IR_REGEXP_REGEXP_ALT_
+#define _RE2C_IR_REGEXP_REGEXP_ALT_
+
+#include "src/ir/regexp/regexp.h"
+
+namespace re2c
+{
+
+class AltOp: public RegExp
+{
+       RegExp * exp1;
+       RegExp * exp2;
+
+public:
+       inline AltOp (RegExp * e1, RegExp * e2)
+               : exp1 (e1)
+               , exp2 (e2)
+       {}
+       void split (CharSet &);
+       void calcSize (Char *);
+       uint32_t fixedLength ();
+       uint32_t compile (Char *, Ins *);
+       void decompile ();
+       void display (std::ostream & o) const;
+       friend RegExp * mkAlt (RegExp *, RegExp *);
+
+       FORBID_COPY (AltOp);
+};
+
+} // end namespace re2c
+
+#endif // _RE2C_IR_REGEXP_REGEXP_ALT_
diff --git a/re2c/src/ir/regexp/regexp_cat.h b/re2c/src/ir/regexp/regexp_cat.h

new file mode 100644 (file)

index 0000000..08ad5f0
--- /dev/null
+++ b/re2c/src/ir/regexp/regexp_cat.h
@@ -0,0 +1,31 @@
+#ifndef _RE2C_IR_REGEXP_REGEXP_CAT_
+#define _RE2C_IR_REGEXP_REGEXP_CAT_
+
+#include "src/ir/regexp/regexp.h"
+
+namespace re2c
+{
+
+class CatOp: public RegExp
+{
+       RegExp * exp1;
+       RegExp * exp2;
+
+public:
+       inline CatOp (RegExp * e1, RegExp * e2)
+               : exp1 (e1)
+               , exp2 (e2)
+       {}
+       void split (CharSet &);
+       void calcSize (Char *);
+       uint32_t fixedLength ();
+       uint32_t compile (Char *, Ins *);
+       void decompile ();
+       void display (std::ostream & o) const;
+
+       FORBID_COPY (CatOp);
+};
+
+} // end namespace re2c
+
+#endif // _RE2C_IR_REGEXP_REGEXP_CAT_
diff --git a/re2c/src/ir/regexp/regexp_close.h b/re2c/src/ir/regexp/regexp_close.h

new file mode 100644 (file)

index 0000000..e147d8c
--- /dev/null
+++ b/re2c/src/ir/regexp/regexp_close.h
@@ -0,0 +1,28 @@
+#ifndef _RE2C_IR_REGEXP_REGEXP_CLOSE_
+#define _RE2C_IR_REGEXP_REGEXP_CLOSE_
+
+#include "src/ir/regexp/regexp.h"
+
+namespace re2c
+{
+
+class CloseOp: public RegExp
+{
+       RegExp * exp;
+
+public:
+       inline CloseOp (RegExp * e)
+               : exp (e)
+       {}
+       void split (CharSet &);
+       void calcSize (Char *);
+       uint32_t compile (Char *, Ins *);
+       void decompile ();
+       void display (std::ostream & o) const;
+
+       FORBID_COPY (CloseOp);
+};
+
+} // end namespace re2c
+
+#endif // _RE2C_IR_REGEXP_REGEXP_CLOSE_
diff --git a/re2c/src/ir/regexp/regexp_closev.h b/re2c/src/ir/regexp/regexp_closev.h

new file mode 100644 (file)

index 0000000..8cd054e
--- /dev/null
+++ b/re2c/src/ir/regexp/regexp_closev.h
@@ -0,0 +1,34 @@
+#ifndef _RE2C_IR_REGEXP_REGEXP_CLOSEV_
+#define _RE2C_IR_REGEXP_REGEXP_CLOSEV_
+
+#include "src/ir/regexp/regexp.h"
+
+namespace re2c
+{
+
+class CloseVOp: public RegExp
+{
+       RegExp * exp;
+       int32_t min;
+       int32_t max;
+
+public:
+       inline CloseVOp (RegExp * e, int lb, int ub)
+               : exp (e)
+               , min (lb)
+               , max (ub)
+       {
+               exp->ins_access = PRIVATE;
+       }
+       void split (CharSet &);
+       void calcSize (Char *);
+       uint32_t compile (Char *, Ins *);
+       void decompile ();
+       void display (std::ostream & o) const;
+
+       FORBID_COPY (CloseVOp);
+};
+
+} // end namespace re2c
+
+#endif // _RE2C_IR_REGEXP_REGEXP_CLOSEV_
diff --git a/re2c/src/ir/regexp/regexp_match.h b/re2c/src/ir/regexp/regexp_match.h

new file mode 100644 (file)

index 0000000..4905994
--- /dev/null
+++ b/re2c/src/ir/regexp/regexp_match.h
@@ -0,0 +1,30 @@
+#ifndef _RE2C_IR_REGEXP_REGEXP_MATCH_
+#define _RE2C_IR_REGEXP_REGEXP_MATCH_
+
+#include "src/ir/regexp/regexp.h"
+#include "src/util/range.h"
+
+namespace re2c
+{
+
+class MatchOp: public RegExp
+{
+public:
+       Range * match;
+
+       inline MatchOp (Range * m)
+               : match (m)
+       {}
+       void split (CharSet &);
+       void calcSize (Char *);
+       uint32_t fixedLength ();
+       uint32_t compile (Char *, Ins *);
+       void decompile ();
+       void display (std::ostream & o) const;
+
+       FORBID_COPY (MatchOp);
+};
+
+} // end namespace re2c
+
+#endif // _RE2C_IR_REGEXP_REGEXP_MATCH_
diff --git a/re2c/src/ir/regexp/regexp_null.h b/re2c/src/ir/regexp/regexp_null.h

new file mode 100644 (file)

index 0000000..d2227b9
--- /dev/null
+++ b/re2c/src/ir/regexp/regexp_null.h
@@ -0,0 +1,22 @@
+#ifndef _RE2C_IR_REGEXP_REGEXP_NULL_
+#define _RE2C_IR_REGEXP_REGEXP_NULL_
+
+#include "src/ir/regexp/regexp.h"
+
+namespace re2c
+{
+
+class NullOp: public RegExp
+{
+public:
+       void split (CharSet &);
+       void calcSize (Char *);
+       uint32_t fixedLength ();
+       uint32_t compile (Char *, Ins *);
+       void decompile ();
+       void display (std::ostream & o) const;
+};
+
+} // end namespace re2c
+
+#endif // _RE2C_IR_REGEXP_REGEXP_NULL_
diff --git a/re2c/src/ir/regexp/regexp_rule.h b/re2c/src/ir/regexp/regexp_rule.h

new file mode 100644 (file)

index 0000000..3e6718f
--- /dev/null
+++ b/re2c/src/ir/regexp/regexp_rule.h
@@ -0,0 +1,47 @@
+#ifndef _RE2C_IR_REGEXP_REGEXP_RULE_
+#define _RE2C_IR_REGEXP_REGEXP_RULE_
+
+#include "src/ir/regexp/regexp.h"
+#include "src/ir/rule_rank.h"
+#include "src/parse/token.h"
+
+namespace re2c
+{
+
+class RuleOp: public RegExp
+{
+       RegExp * exp;
+
+public:
+       RegExp * ctx;
+       Ins * ins;
+       rule_rank_t rank;
+       Token * code;
+       uint32_t line;
+
+       inline RuleOp (RegExp * e, RegExp * c, Token * t, rule_rank_t r, InsAccess access)
+               : exp (e)
+               , ctx (c)
+               , ins (NULL)
+               , rank (r)
+               , code (t)
+               , line (0)
+       {
+               ins_access = access;
+       }
+       inline ~RuleOp ()
+       {
+               delete code;
+       }
+       void display (std::ostream & o) const;
+       void split (CharSet &);
+       void calcSize (Char *);
+       uint32_t compile (Char *, Ins *);
+       void decompile ();
+
+       FORBID_COPY (RuleOp);
+};
+
+} // end namespace re2c
+
+#endif // _RE2C_IR_REGEXP_REGEXP_RULE_
diff --git a/re2c/src/dfa/rule_rank.cc b/re2c/src/ir/rule_rank.cc

similarity index 94%

rename from re2c/src/dfa/rule_rank.cc

rename to re2c/src/ir/rule_rank.cc

index 4120ff28bf628d245c31b3440ee75dcc221054a3..ca19f689268c543731740dd8dc97b13b7cd308bb 100644 (file)
--- a/re2c/src/dfa/rule_rank.cc
+++ b/re2c/src/ir/rule_rank.cc
@@ -1,6 +1,6 @@
  #include <ostream>
  
-#include "src/dfa/rule_rank.h"
+#include "src/ir/rule_rank.h"
  
  namespace re2c
  {
diff --git a/re2c/src/dfa/rule_rank.h b/re2c/src/ir/rule_rank.h

similarity index 89%

rename from re2c/src/dfa/rule_rank.h

rename to re2c/src/ir/rule_rank.h

index f3e49ebd8e65ba87d57dec9351c14e44370d0ccf..39473b8835ebd6ca6bd8b708b9e3a995912f6ce3 100644 (file)
--- a/re2c/src/dfa/rule_rank.h
+++ b/re2c/src/ir/rule_rank.h
@@ -1,5 +1,5 @@
-#ifndef _RE2C_DFA_RULE_RANK_
-#define _RE2C_DFA_RULE_RANK_
+#ifndef _RE2C_IR_RULE_RANK_
+#define _RE2C_IR_RULE_RANK_
  
  #include <iosfwd>
  
@@ -36,4 +36,4 @@ public:
  
  } // namespace re2c
  
-#endif // _RE2C_DFA_RULE_RANK_
+#endif // _RE2C_IR_RULE_RANK_
diff --git a/re2c/src/main.cc b/re2c/src/main.cc

index 7d005971e642da043319a09502cb9674c5f5b0f1..d97f7283acb567b864c7d0d9c95e0fd9ab34467e 100644 (file)
--- a/re2c/src/main.cc
+++ b/re2c/src/main.cc
@@ -4,8 +4,6 @@
  #include <string.h>
  
  #include "config.h"
-#include "src/dfa/dfa.h"
-#include "src/dfa/encoding/enc.h"
  #include "src/globals.h"
  #include "src/mbo_getopt.h"
  #include "src/parse/parser.h"
diff --git a/re2c/src/parse/extop.h b/re2c/src/parse/extop.h

new file mode 100644 (file)

index 0000000..44b33ff
--- /dev/null
+++ b/re2c/src/parse/extop.h
@@ -0,0 +1,18 @@
+#ifndef _RE2C_PARSE_EXTOP_
+#define _RE2C_PARSE_EXTOP_
+
+#include "src/util/c99_stdint.h"
+
+namespace re2c
+{
+
+struct ExtOp
+{
+       char op;
+       int32_t minsize;
+       int32_t maxsize;
+};
+
+} // end namespace re2c
+
+#endif // _RE2C_PARSE_EXTOP_
diff --git a/re2c/src/parse/parser.h b/re2c/src/parse/parser.h

index f4b3f295901923b26b0e1735b2cbd9c5b2a00d37..2c973279b379e5d426e0dd2e70693d3279e0c731 100644 (file)
--- a/re2c/src/parse/parser.h
+++ b/re2c/src/parse/parser.h
@@ -1,10 +1,12 @@
  #ifndef _RE2C_PARSE_PARSER_
  #define _RE2C_PARSE_PARSER_
  
+#include <list>
  #include <string>
  
  #include "src/codegen/output.h"
-#include "src/dfa/re.h"
+#include "src/ir/regexp/regexp.h"
+#include "src/ir/regexp/regexp_rule.h"
  #include "src/parse/scanner.h"
  
  namespace re2c
diff --git a/re2c/src/parse/parser.ypp b/re2c/src/parse/parser.ypp

index 164f7db5cabe79239d7391884a65bb27450b0ac8..114ad9402e692e5364c1395e4ac1effe7e8f78bd 100644 (file)
--- a/re2c/src/parse/parser.ypp
+++ b/re2c/src/parse/parser.ypp
@@ -8,9 +8,14 @@
  #include <set>
  
  #include "config.h"
-#include "src/dfa/dfa.h"
+#include "src/ir/bytecode/bytecode.h"
+#include "src/ir/regexp/regexp_cat.h"
+#include "src/ir/regexp/regexp_close.h"
+#include "src/ir/regexp/regexp_closev.h"
+#include "src/ir/regexp/regexp_null.h"
  #include "src/codegen/emit.h" // genTypes
  #include "src/globals.h"
+#include "src/parse/extop.h"
  #include "src/parse/parser.h"
  #include "src/util/c99_stdint.h"
  #include "src/util/smart_ptr.h"
diff --git a/re2c/src/parse/scanner.h b/re2c/src/parse/scanner.h

index 1475dd702eea2e8906244db0a28ca6ca68419751..51236b5a112d8176527a2f7da9dbafc268770f55 100644 (file)
--- a/re2c/src/parse/scanner.h
+++ b/re2c/src/parse/scanner.h
@@ -4,7 +4,7 @@
  #include <string>
  
  #include "src/codegen/output.h"
-#include "src/dfa/re.h"
+#include "src/ir/regexp/regexp.h"
  #include "src/globals.h"
  #include "src/parse/input.h"
  #include "src/parse/token.h"
diff --git a/re2c/src/parse/scanner_lex.re b/re2c/src/parse/scanner_lex.re

index 6ddc50825a1cf93a5b552a1c5f86dc061abb9651..1eca76033bcfeb86f6e7d95ca075aa2e89ddcc72 100644 (file)
--- a/re2c/src/parse/scanner_lex.re
+++ b/re2c/src/parse/scanner_lex.re
@@ -3,8 +3,9 @@
  #include <iostream>
  #include <sstream>
  
-#include "src/dfa/dfa.h"
+#include "src/ir/dfa/dfa.h"
  #include "src/globals.h"
+#include "src/parse/extop.h"
  #include "src/parse/parser.h"
  #include "src/parse/scanner.h"
  #include "y.tab.h"
diff --git a/re2c/src/parse/unescape.cc b/re2c/src/parse/unescape.cc

new file mode 100644 (file)

index 0000000..5064d5e
--- /dev/null
+++ b/re2c/src/parse/unescape.cc
@@ -0,0 +1,220 @@
+#include <string.h>
+
+#include "src/parse/scanner.h"
+#include "src/util/substr.h"
+
+namespace re2c {
+
+uint32_t Scanner::unescape(SubStr &s) const
+{
+       static const char * hex = "0123456789abcdef";
+       static const char * oct = "01234567";
+
+       s.len--;
+       uint32_t c, ucb = 0;
+
+       if ((c = *s.str++) != '\\' || s.len == 0)
+       {
+               return c;
+       }
+
+       s.len--;
+
+       switch (c = *s.str++)
+       {
+               case 'n': return '\n';
+               case 't': return '\t';
+               case 'v': return '\v';
+               case 'b': return '\b';
+               case 'r': return '\r';
+               case 'f': return '\f';
+               case 'a': return '\a';
+
+               case 'x':
+               {
+                       if (s.len < 2)
+                       {
+                               fatal(s.ofs()+s.len, "Illegal hexadecimal character code, two hexadecimal digits are required");
+                               return ~0u;
+                       }
+                       
+                       const char *p1 = strchr(hex, tolower(s.str[0]));
+                       const char *p2 = strchr(hex, tolower(s.str[1]));
+
+                       if (!p1 || !p2)
+                       {
+                               fatal(s.ofs()+(p1?1:0), "Illegal hexadecimal character code");
+                               return ~0u;
+                       }
+                       else
+                       {
+                               s.len -= 2;
+                               s.str += 2;
+                               
+                               uint32_t v = (uint32_t)((p1 - hex) << 4) 
+                                      + (uint32_t)((p2 - hex));
+       
+                               return v;
+                       }
+               }
+
+               case 'U':
+               {
+                       if (s.len < 8)
+                       {
+                               fatal(s.ofs()+s.len, "Illegal unicode character, eight hexadecimal digits are required");
+                               return ~0u;
+                       }
+
+                       uint32_t l = 0;
+                       if (s.str[0] == '0')
+                       {
+                               l++;
+                               if (s.str[1] == '0')
+                               {
+                                       l++;
+                                       if (s.str[2] == '0' || (s.str[2] == '1' && encoding.szCodePoint() == 4))
+                                       {
+                                               l++;
+                                               if (encoding.szCodePoint() == 4)
+                                               {
+                                                       const char *u3 = strchr(hex, tolower(s.str[2]));
+                                                       const char *u4 = strchr(hex, tolower(s.str[3]));
+                                                       if (u3 && u4)
+                                                       {
+                                                               ucb = (uint32_t)((u3 - hex) << 20)
+                                                               + (uint32_t)((u4 - hex) << 16);
+                                                               l++;
+                                                       }
+                                               }
+                                               else if (s.str[3] == '0')
+                                               {
+                                                       l++;
+                                               }
+                                       }
+                               }
+                       }
+
+                       if (l != 4)
+                       {
+                               fatal(s.ofs()+l, "Illegal unicode character, eight hexadecimal digits are required");
+                       }
+
+                       s.len -= 4;
+                       s.str += 4;
+                       
+                       // no break;
+               }
+               case 'X':
+               case 'u':
+               {
+                       if (s.len < 4)
+                       {
+                               fatal(s.ofs()+s.len, 
+                                       c == 'X'
+                                       ? "Illegal hexadecimal character code, four hexadecimal digits are required"
+                                       : "Illegal unicode character, four hexadecimal digits are required");
+                               return ~0u;
+                       }
+                       
+                       const char *p1 = strchr(hex, tolower(s.str[0]));
+                       const char *p2 = strchr(hex, tolower(s.str[1]));
+                       const char *p3 = strchr(hex, tolower(s.str[2]));
+                       const char *p4 = strchr(hex, tolower(s.str[3]));
+
+                       if (!p1 || !p2 || !p3 || !p4)
+                       {
+                               fatal(s.ofs()+(p1?1:0)+(p2?1:0)+(p3?1:0), 
+                                       c == 'X'
+                                       ? "Illegal hexadecimal character code, non hexxdecimal digit found"
+                                       : "Illegal unicode character, non hexadecimal digit found");
+                               return ~0u;
+                       }
+                       else
+                       {
+                               s.len -= 4;
+                               s.str += 4;
+                               
+                               uint32_t v = (uint32_t)((p1 - hex) << 12) 
+                                      + (uint32_t)((p2 - hex) <<  8)
+                                      + (uint32_t)((p3 - hex) <<  4)
+                                      + (uint32_t)((p4 - hex))
+                                      + ucb;
+       
+                               if (v >= encoding.nCodePoints())
+                               {
+                                       fatal(s.ofs(),
+                                               c == 'X'
+                                               ? "Illegal hexadecimal character code, out of range"
+                                               : "Illegal unicode character, out of range");
+                               }
+       
+                               return v;
+                       }
+               }
+
+               case '4':
+               case '5':
+               case '6':
+               case '7':
+               {
+                       fatal(s.ofs()-1, "Illegal octal character code, first digit must be 0 thru 3");
+                       return ~0u;
+               }
+
+               case '0':
+               case '1':
+               case '2':
+               case '3':
+               {
+                       if (s.len < 2)
+                       {
+                               fatal(s.ofs()+s.len, "Illegal octal character code, three octal digits are required");
+                               return ~0u;
+                       }
+
+                       const char *p0 = strchr(oct, c);
+                       const char *p1 = strchr(oct, s.str[0]);
+                       const char *p2 = strchr(oct, s.str[1]);
+
+                       if (!p0 || !p1 || !p2)
+                       {
+                               fatal(s.ofs()+(p1?1:0), "Illegal octal character code, non octal digit found");
+                               return ~0u;
+                       }
+                       else
+                       {
+                               s.len -= 2;
+                               s.str += 2;
+                               
+                               uint32_t v = (uint32_t)((p0 - oct) << 6) + (uint32_t)((p1 - oct) << 3) + (uint32_t)(p2 - oct);
+       
+                               return v;
+                       }
+               }
+
+               default:
+               return c;
+       }
+}
+
+std::string& Scanner::unescape(SubStr& str_in, std::string& str_out) const
+{
+       str_out.clear();
+
+       while(str_in.len)
+       {
+               uint32_t c = unescape(str_in);
+
+               if (c > 0xFF)
+               {
+                       fatal(str_in.ofs(), "Illegal character");
+               }
+
+               str_out += static_cast<char>(c);
+       }
+
+       return str_out;
+}
+
+} // namespace re2c
author	Ulya Trofimovich <skvadrik@gmail.com>
	Wed, 10 Jun 2015 16:30:15 +0000 (17:30 +0100)
committer	Ulya Trofimovich <skvadrik@gmail.com>
	Wed, 10 Jun 2015 16:30:15 +0000 (17:30 +0100)
re2c/Makefile.am		patch \| blob \| history
re2c/bootstrap/scanner_lex.cc		patch \| blob \| history
re2c/src/codegen/emit.h		patch \| blob \| history
re2c/src/codegen/emit_action.cc		patch \| blob \| history
re2c/src/codegen/emit_dfa.cc		patch \| blob \| history
re2c/src/codegen/go_construct.cc		patch \| blob \| history
re2c/src/codegen/go_destruct.cc		patch \| blob \| history
re2c/src/codegen/go_emit.cc		patch \| blob \| history
re2c/src/codegen/go_used_labels.cc		patch \| blob \| history
re2c/src/codegen/output.h		patch \| blob \| history
re2c/src/codegen/prepare_dfa.cc		patch \| blob \| history
re2c/src/codegen/scc.cc		patch \| blob \| history
re2c/src/codegen/skeleton/path.h		patch \| blob \| history
re2c/src/codegen/skeleton/skeleton.cc		patch \| blob \| history
re2c/src/codegen/skeleton/skeleton.h		patch \| blob \| history
re2c/src/dfa/actions.cc	[deleted file]	patch \| blob \| history
re2c/src/dfa/encoding/utf16/utf16_regexp.h	[deleted file]	patch \| blob \| history
re2c/src/dfa/encoding/utf8/utf8_regexp.h	[deleted file]	patch \| blob \| history
re2c/src/dfa/ins.h	[deleted file]	patch \| blob \| history
re2c/src/dfa/re.h	[deleted file]	patch \| blob \| history
re2c/src/globals.h		patch \| blob \| history
re2c/src/ir/bytecode/bytecode.cc	[new file with mode: 0644]	patch \| blob
re2c/src/ir/bytecode/bytecode.h	[new file with mode: 0644]	patch \| blob
re2c/src/ir/bytecode/calc_size.cc	[new file with mode: 0644]	patch \| blob
re2c/src/ir/bytecode/charset.cc	[new file with mode: 0644]	patch \| blob
re2c/src/ir/bytecode/charset.h	[new file with mode: 0644]	patch \| blob
re2c/src/ir/bytecode/compile.cc	[new file with mode: 0644]	patch \| blob
re2c/src/ir/bytecode/ins.cc	[new file with mode: 0644]	patch \| blob
re2c/src/ir/bytecode/ins.h	[new file with mode: 0644]	patch \| blob
re2c/src/ir/bytecode/split.cc	[new file with mode: 0644]	patch \| blob
re2c/src/ir/dfa/action.h	[moved from re2c/src/dfa/action.h with 93% similarity]	patch \| blob \| history
re2c/src/ir/dfa/dfa.cc	[moved from re2c/src/dfa/dfa.cc with 98% similarity]	patch \| blob \| history
re2c/src/ir/dfa/dfa.h	[moved from re2c/src/dfa/dfa.h with 80% similarity]	patch \| blob \| history
re2c/src/ir/dfa/state.h	[moved from re2c/src/dfa/state.h with 80% similarity]	patch \| blob \| history
re2c/src/ir/regexp/display.cc	[new file with mode: 0644]	patch \| blob
re2c/src/ir/regexp/encoding/enc.cc	[moved from re2c/src/dfa/encoding/enc.cc with 99% similarity]	patch \| blob \| history
re2c/src/ir/regexp/encoding/enc.h	[moved from re2c/src/dfa/encoding/enc.h with 97% similarity]	patch \| blob \| history
re2c/src/ir/regexp/encoding/range_suffix.cc	[moved from re2c/src/dfa/encoding/range_suffix.cc with 77% similarity]	patch \| blob \| history
re2c/src/ir/regexp/encoding/range_suffix.h	[moved from re2c/src/dfa/encoding/range_suffix.h with 79% similarity]	patch \| blob \| history
re2c/src/ir/regexp/encoding/utf16/utf16.cc	[moved from re2c/src/dfa/encoding/utf16/utf16.cc with 82% similarity]	patch \| blob \| history
re2c/src/ir/regexp/encoding/utf16/utf16.h	[moved from re2c/src/dfa/encoding/utf16/utf16.h with 82% similarity]	patch \| blob \| history
re2c/src/ir/regexp/encoding/utf16/utf16_range.cc	[moved from re2c/src/dfa/encoding/utf16/utf16_range.cc with 97% similarity]	patch \| blob \| history
re2c/src/ir/regexp/encoding/utf16/utf16_range.h	[moved from re2c/src/dfa/encoding/utf16/utf16_range.h with 63% similarity]	patch \| blob \| history
re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc	[moved from re2c/src/dfa/encoding/utf16/utf16_regexp.cc with 77% similarity]	patch \| blob \| history
re2c/src/ir/regexp/encoding/utf16/utf16_regexp.h	[new file with mode: 0644]	patch \| blob
re2c/src/ir/regexp/encoding/utf8/utf8.cc	[moved from re2c/src/dfa/encoding/utf8/utf8.cc with 97% similarity]	patch \| blob \| history
re2c/src/ir/regexp/encoding/utf8/utf8.h	[moved from re2c/src/dfa/encoding/utf8/utf8.h with 87% similarity]	patch \| blob \| history
re2c/src/ir/regexp/encoding/utf8/utf8_range.cc	[moved from re2c/src/dfa/encoding/utf8/utf8_range.cc with 97% similarity]	patch \| blob \| history
re2c/src/ir/regexp/encoding/utf8/utf8_range.h	[moved from re2c/src/dfa/encoding/utf8/utf8_range.h with 55% similarity]	patch \| blob \| history
re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc	[moved from re2c/src/dfa/encoding/utf8/utf8_regexp.cc with 78% similarity]	patch \| blob \| history
re2c/src/ir/regexp/encoding/utf8/utf8_regexp.h	[new file with mode: 0644]	patch \| blob
re2c/src/ir/regexp/fixed_length.cc	[new file with mode: 0644]	patch \| blob
re2c/src/ir/regexp/regexp.cc	[new file with mode: 0644]	patch \| blob
re2c/src/ir/regexp/regexp.h	[new file with mode: 0644]	patch \| blob
re2c/src/ir/regexp/regexp_alt.h	[new file with mode: 0644]	patch \| blob
re2c/src/ir/regexp/regexp_cat.h	[new file with mode: 0644]	patch \| blob
re2c/src/ir/regexp/regexp_close.h	[new file with mode: 0644]	patch \| blob
re2c/src/ir/regexp/regexp_closev.h	[new file with mode: 0644]	patch \| blob
re2c/src/ir/regexp/regexp_match.h	[new file with mode: 0644]	patch \| blob
re2c/src/ir/regexp/regexp_null.h	[new file with mode: 0644]	patch \| blob
re2c/src/ir/regexp/regexp_rule.h	[new file with mode: 0644]	patch \| blob
re2c/src/ir/rule_rank.cc	[moved from re2c/src/dfa/rule_rank.cc with 94% similarity]	patch \| blob \| history
re2c/src/ir/rule_rank.h	[moved from re2c/src/dfa/rule_rank.h with 89% similarity]	patch \| blob \| history
re2c/src/main.cc		patch \| blob \| history
re2c/src/parse/extop.h	[new file with mode: 0644]	patch \| blob
re2c/src/parse/parser.h		patch \| blob \| history
re2c/src/parse/parser.ypp		patch \| blob \| history
re2c/src/parse/scanner.h		patch \| blob \| history
re2c/src/parse/scanner_lex.re		patch \| blob \| history
re2c/src/parse/unescape.cc	[new file with mode: 0644]	patch \| blob