src/codegen/scc.h \
src/codegen/skeleton/path.h \
src/codegen/skeleton/skeleton.h \
- src/dfa/action.h \
- src/dfa/encoding/enc.h \
- src/dfa/encoding/range_suffix.h \
- src/dfa/encoding/utf16/utf16.h \
- src/dfa/encoding/utf16/utf16_range.h \
- src/dfa/encoding/utf16/utf16_regexp.h \
- src/dfa/encoding/utf8/utf8.h \
- src/dfa/encoding/utf8/utf8_range.h \
- src/dfa/encoding/utf8/utf8_regexp.h \
- src/dfa/dfa.h \
- src/dfa/ins.h \
- src/dfa/re.h \
- src/dfa/rule_rank.h \
- src/dfa/state.h \
+ src/ir/bytecode/charset.h \
+ src/ir/bytecode/bytecode.h \
+ src/ir/bytecode/ins.h \
+ src/ir/dfa/state.h \
+ src/ir/dfa/dfa.h \
+ src/ir/dfa/action.h \
+ src/ir/regexp/regexp_alt.h \
+ src/ir/regexp/regexp_match.h \
+ src/ir/regexp/encoding/enc.h \
+ src/ir/regexp/encoding/range_suffix.h \
+ src/ir/regexp/encoding/utf8/utf8.h \
+ src/ir/regexp/encoding/utf8/utf8_regexp.h \
+ src/ir/regexp/encoding/utf8/utf8_range.h \
+ src/ir/regexp/encoding/utf16/utf16_range.h \
+ src/ir/regexp/encoding/utf16/utf16_regexp.h \
+ src/ir/regexp/encoding/utf16/utf16.h \
+ src/ir/regexp/regexp_rule.h \
+ src/ir/regexp/regexp_cat.h \
+ src/ir/regexp/regexp_closev.h \
+ src/ir/regexp/regexp_null.h \
+ src/ir/regexp/regexp.h \
+ src/ir/regexp/regexp_close.h \
+ src/ir/rule_rank.h \
src/globals.h \
src/mbo_getopt.h \
+ src/parse/extop.h \
src/parse/input.h \
src/parse/parser.h \
src/parse/scanner.h \
src/codegen/scc.cc \
src/codegen/skeleton/path.cc \
src/codegen/skeleton/skeleton.cc \
- src/dfa/actions.cc \
- src/dfa/encoding/enc.cc \
- src/dfa/encoding/range_suffix.cc \
- src/dfa/encoding/utf16/utf16.cc \
- src/dfa/encoding/utf16/utf16_range.cc \
- src/dfa/encoding/utf16/utf16_regexp.cc \
- src/dfa/encoding/utf8/utf8.cc \
- src/dfa/encoding/utf8/utf8_range.cc \
- src/dfa/encoding/utf8/utf8_regexp.cc \
- src/dfa/dfa.cc \
- src/dfa/rule_rank.cc \
+ src/ir/bytecode/bytecode.cc \
+ src/ir/bytecode/ins.cc \
+ src/ir/bytecode/charset.cc \
+ src/ir/bytecode/split.cc \
+ src/ir/bytecode/compile.cc \
+ src/ir/bytecode/calc_size.cc \
+ src/ir/dfa/dfa.cc \
+ src/ir/regexp/display.cc \
+ src/ir/regexp/encoding/enc.cc \
+ src/ir/regexp/encoding/range_suffix.cc \
+ src/ir/regexp/encoding/utf8/utf8_regexp.cc \
+ src/ir/regexp/encoding/utf8/utf8_range.cc \
+ src/ir/regexp/encoding/utf8/utf8.cc \
+ src/ir/regexp/encoding/utf16/utf16_regexp.cc \
+ src/ir/regexp/encoding/utf16/utf16.cc \
+ src/ir/regexp/encoding/utf16/utf16_range.cc \
+ src/ir/regexp/fixed_length.cc \
+ src/ir/regexp/regexp.cc \
+ src/ir/rule_rank.cc \
src/main.cc \
src/mbo_getopt.cc \
src/parse/input.cc \
src/parse/scanner.cc \
+ src/parse/unescape.cc \
src/util/range.cc
re2c_SOURCES = \
$(SRC_HDR) \
-/* Generated by re2c 0.14.1.dev on Thu May 14 17:19:41 2015*/
+/* Generated by re2c 0.14.2 on Wed Jun 10 17:27:57 2015 */
#include <stdlib.h>
#include <string.h>
#include <iostream>
#include <sstream>
-#include "src/dfa/dfa.h"
+#include "src/ir/dfa/dfa.h"
#include "src/globals.h"
+#include "src/parse/extop.h"
#include "src/parse/parser.h"
#include "src/parse/scanner.h"
#include "y.tab.h"
#define _RE2C_CODEGEN_EMIT_
#include "src/codegen/output.h"
-#include "src/dfa/dfa.h"
+#include "src/ir/dfa/dfa.h"
namespace re2c {
-class DFA;
typedef std::vector<std::string> RegExpIndices;
void emit_action
#include "src/codegen/emit.h"
#include "src/codegen/indent.h"
#include "src/codegen/input_api.h"
-#include "src/codegen/skeleton/skeleton.h"
-#include "src/dfa/action.h"
+#include "src/ir/dfa/action.h"
+#include "src/ir/regexp/regexp_rule.h"
namespace re2c
{
#include "src/codegen/indent.h"
#include "src/codegen/input_api.h"
#include "src/codegen/skeleton/skeleton.h"
-#include "src/dfa/dfa.h"
namespace re2c
{
#include "src/codegen/bitmap.h"
#include "src/codegen/go.h"
-#include "src/dfa/dfa.h"
+#include "src/ir/dfa/dfa.h"
#include "src/util/allocate.h"
namespace re2c
#include "src/codegen/go.h"
-#include "src/dfa/dfa.h"
+#include "src/ir/dfa/dfa.h"
namespace re2c
{
#include "src/codegen/go.h"
#include "src/codegen/indent.h"
#include "src/codegen/print.h"
-#include "src/dfa/dfa.h"
+#include "src/ir/dfa/dfa.h"
namespace re2c
{
#include "src/codegen/go.h"
-#include "src/dfa/dfa.h"
+#include "src/ir/dfa/dfa.h"
namespace re2c
{
#include <vector>
#include "src/codegen/label.h"
-#include "src/dfa/rule_rank.h"
+#include "src/ir/rule_rank.h"
#include "src/util/c99_stdint.h"
#include "src/util/forbid_copy.h"
#include "src/codegen/bitmap.h"
#include "src/codegen/scc.h"
-#include "src/dfa/dfa.h"
+#include "src/ir/dfa/dfa.h"
+#include "src/ir/regexp/regexp_rule.h"
#include "src/util/allocate.h"
namespace re2c {
#include "src/codegen/scc.h"
-#include "src/dfa/dfa.h"
+#include "src/ir/dfa/dfa.h"
namespace re2c {
#include <vector>
-#include "src/dfa/rule_rank.h"
+#include "src/ir/rule_rank.h"
#include "src/util/c99_stdint.h"
namespace re2c
#include "src/codegen/indent.h"
#include "src/codegen/print.h"
#include "src/codegen/skeleton/skeleton.h"
+#include "src/ir/regexp/regexp_rule.h"
#include "src/util/allocate.h"
namespace re2c
#include <map>
#include "src/codegen/skeleton/path.h"
-#include "src/dfa/dfa.h"
+#include "src/ir/dfa/dfa.h"
#include "src/util/c99_stdint.h"
#include "src/util/forbid_copy.h"
#include "src/util/local_increment.h"
+++ /dev/null
-#include <time.h>
-#include <string.h>
-#include <iostream>
-#include <iomanip>
-#include <cctype>
-
-#include "src/codegen/skeleton/skeleton.h"
-#include "src/dfa/dfa.h"
-#include "src/dfa/encoding/utf16/utf16_regexp.h"
-#include "src/dfa/encoding/utf8/utf8_regexp.h"
-#include "src/globals.h"
-#include "src/parse/scanner.h"
-#include "src/util/allocate.h"
-
-namespace re2c
-{
-
-const Ins* showIns(std::ostream &o, const Ins &i, const Ins &base)
-{
- o.width(3);
- o << &i - &base << ": ";
-
- const Ins *ret = &(&i)[1];
-
- switch (i.i.tag)
- {
-
- case CHAR:
- {
- o << "match ";
-
- for (; ret < (Ins*) i.i.link; ++ret)
- {
- o << "\\x" << std::hex << ret->c.value;
- }
-
- break;
- }
-
- case GOTO:
- o << "goto " << ((Ins*) i.i.link - &base);
- break;
-
- case FORK:
- o << "fork " << ((Ins*) i.i.link - &base);
- break;
-
- case CTXT:
- o << "ctxt";
- break;
-
- case TERM:
- o << "term " << ((RuleOp*) i.i.link)->rank;
- break;
- }
-
- o << "\n";
- return ret;
-}
-
-uint32_t RegExp::fixedLength()
-{
- return ~0u;
-}
-
-uint32_t compile_goto(Ins *ins, Ins *i)
-{
- i->i.tag = GOTO;
- i->i.link = ins;
- return 1;
-}
-
-void NullOp::calcSize(Char*)
-{
- size = 0;
-}
-
-uint32_t NullOp::fixedLength()
-{
- return 0;
-}
-
-uint32_t NullOp::compile(Char*, Ins*)
-{
- return 0;
-}
-
-void NullOp::decompile()
-{
- ;
-}
-
-void NullOp::split(CharSet&)
-{
- ;
-}
-
-MatchOp *merge(MatchOp *m1, MatchOp *m2)
-{
- if (!m1)
- return m2;
-
- if (!m2)
- return m1;
-
- MatchOp* m = new MatchOp(doUnion(m1->match, m2->match));
- if (m1->ins_access == RegExp::PRIVATE
- || m2->ins_access == RegExp::PRIVATE)
- m->ins_access = RegExp::PRIVATE;
-
- return m;
-}
-
-void MatchOp::display(std::ostream &o) const
-{
- o << match;
-}
-
-void MatchOp::calcSize(Char *rep)
-{
- size = 1;
-
- for (Range *r = match; r; r = r->next)
- for (uint32_t c = r->lb; c < r->ub; ++c)
- if (rep[c] == c)
- ++size;
-}
-
-uint32_t MatchOp::fixedLength()
-{
- return 1;
-}
-
-uint32_t MatchOp::compile(Char *rep, Ins *i)
-{
- if (ins_cache)
- return compile_goto(ins_cache, i);
- else
- {
- ins_cache = i;
-
- i->i.tag = CHAR;
- i->i.link = &i[size];
- Ins *j = &i[1];
- uint32_t bump = size;
-
- for (Range *r = match; r; r = r->next)
- {
- for (uint32_t c = r->lb; c < r->ub; ++c)
- {
- if (rep[c] == c)
- {
- j->c.value = c;
- j->c.bump = --bump;
- j++;
- }
- }
- }
-
- if (ins_access == PRIVATE)
- decompile();
-
- return size;
- }
-}
-
-void MatchOp::decompile()
-{
- ins_cache = NULL;
-}
-
-void MatchOp::split(CharSet &s)
-{
- for (Range *r = match; r; r = r->next)
- {
- for (uint32_t c = r->lb; c < r->ub; ++c)
- {
- CharPtn *x = s.rep[c], *a = x->nxt;
-
- if (!a)
- {
- if (x->card == 1)
- continue;
-
- x->nxt = a = s.freeHead;
-
- if (!(s.freeHead = s.freeHead->nxt))
- s.freeTail = &s.freeHead;
-
- a->nxt = NULL;
-
- x->fix = s.fix;
-
- s.fix = x;
- }
-
- if (--(x->card) == 0)
- {
- *s.freeTail = x;
- *(s.freeTail = &x->nxt) = NULL;
- }
-
- s.rep[c] = a;
- ++(a->card);
- }
- }
-
- for (; s.fix; s.fix = s.fix->fix)
- if (s.fix->card)
- s.fix->nxt = NULL;
-}
-
-RegExp * mkDiff(RegExp *e1, RegExp *e2)
-{
- MatchOp * m1 = dynamic_cast<MatchOp*>(e1);
- MatchOp * m2 = dynamic_cast<MatchOp*>(e2);
- if (m1 == NULL || m2 == NULL)
- return NULL;
-
- Range *r = doDiff(m1->match, m2->match);
-
- return r ? (RegExp*) new MatchOp(r) : (RegExp*) new NullOp;
-}
-
-RegExp *doAlt(RegExp *e1, RegExp *e2)
-{
- if (!e1)
- return e2;
-
- if (!e2)
- return e1;
-
- return new AltOp(e1, e2);
-}
-
-RegExp *doCat(RegExp *e1, RegExp *e2)
-{
- if (!e1)
- return e2;
-
- if (!e2)
- return e1;
-
- return new CatOp(e1, e2);
-}
-
-RegExp *mkAlt(RegExp *e1, RegExp *e2)
-{
- AltOp *a;
- MatchOp *m1, *m2;
-
- a = dynamic_cast<AltOp*>(e1);
- if (a != NULL)
- {
- m1 = dynamic_cast<MatchOp*>(a->exp1);
- if (m1 != NULL)
- {
- m1->ins_access = e1->ins_access;
- a->exp2->ins_access = e1->ins_access;
- e1 = a->exp2;
- }
- }
- else
- {
- m1 = dynamic_cast<MatchOp*>(e1);
- if (m1 != NULL)
- e1 = NULL;
- }
-
- a = dynamic_cast<AltOp*>(e2);
- if (a != NULL)
- {
- m2 = dynamic_cast<MatchOp*>(a->exp1);
- if (m2 != NULL)
- {
- m2->ins_access = e2->ins_access;
- a->exp2->ins_access = e2->ins_access;
- e2 = a->exp2;
- }
- }
- else
- {
- m2 = dynamic_cast<MatchOp*>(e2);
- if (m2 != NULL)
- e2 = NULL;
- }
-
- return doAlt(merge(m1, m2), doAlt(e1, e2));
-}
-
-void AltOp::calcSize(Char *rep)
-{
- exp1->calcSize(rep);
- exp2->calcSize(rep);
- size = exp1->size + exp2->size + 2;
-}
-
-uint32_t AltOp::fixedLength()
-{
- uint32_t l1 = exp1->fixedLength();
- uint32_t l2 = exp1->fixedLength();
-
- if (l1 != l2 || l1 == ~0u)
- return ~0u;
-
- return l1;
-}
-
-uint32_t AltOp::compile(Char *rep, Ins *i)
-{
- if (ins_cache)
- return compile_goto(ins_cache, i);
- else
- {
- ins_cache = i;
-
- i->i.tag = FORK;
- const uint32_t sz1 = exp1->compile(rep, &i[1]);
- Ins * const j = &i[sz1 + 1];
- i->i.link = &j[1];
- j->i.tag = GOTO;
- const uint32_t sz2 = exp2->compile(rep, &j[1]);
- j->i.link = &j[sz2 + 1];
-
- if (ins_access == PRIVATE)
- decompile();
-
- return sz1 + sz2 + 2;
- }
-}
-
-void AltOp::decompile()
-{
- if (ins_cache)
- {
- exp1->decompile();
- exp2->decompile();
- ins_cache = NULL;
- }
-}
-
-void AltOp::split(CharSet &s)
-{
- exp1->split(s);
- exp2->split(s);
-}
-
-void CatOp::calcSize(Char *rep)
-{
- exp1->calcSize(rep);
- exp2->calcSize(rep);
- size = exp1->size + exp2->size;
-}
-
-uint32_t CatOp::fixedLength()
-{
- uint32_t l1, l2;
-
- if ((l1 = exp1->fixedLength()) != ~0u )
- if ((l2 = exp2->fixedLength()) != ~0u)
- return l1 + l2;
-
- return ~0u;
-}
-
-uint32_t CatOp::compile(Char *rep, Ins *i)
-{
- if (ins_cache)
- return compile_goto(ins_cache, i);
- else
- {
- ins_cache = i;
-
- const uint32_t sz1 = exp1->compile(rep, &i[0]);
- const uint32_t sz2 = exp2->compile(rep, &i[sz1]);
-
- if (ins_access == PRIVATE)
- decompile();
-
- return sz1 + sz2;
- }
-}
-
-void CatOp::decompile()
-{
- if (ins_cache)
- {
- exp1->decompile();
- exp2->decompile();
- ins_cache = NULL;
- }
-}
-
-void CatOp::split(CharSet &s)
-{
- exp1->split(s);
- exp2->split(s);
-}
-
-void CloseOp::calcSize(Char *rep)
-{
- exp->calcSize(rep);
- size = exp->size + 1;
-}
-
-uint32_t CloseOp::compile(Char *rep, Ins *i)
-{
- if (ins_cache)
- return compile_goto(ins_cache, i);
- else
- {
- ins_cache = i;
-
- i += exp->compile(rep, &i[0]);
- i->i.tag = FORK;
- i->i.link = ins_cache;
- ++i;
-
- const uint32_t sz = i - ins_cache;
- if (ins_access == PRIVATE)
- decompile();
-
- return sz;
- }
-}
-
-void CloseOp::decompile()
-{
- if (ins_cache)
- {
- exp->decompile();
- ins_cache = NULL;
- }
-}
-
-void CloseOp::split(CharSet &s)
-{
- exp->split(s);
-}
-
-void CloseVOp::calcSize(Char *rep)
-{
- exp->calcSize(rep);
-
- if (max >= 0)
- {
- size = (exp->size * min) + ((1 + exp->size) * (max - min));
- }
- else
- {
- size = (exp->size * min) + 1;
- }
-}
-
-uint32_t CloseVOp::compile(Char *rep, Ins *i)
-{
- if (ins_cache)
- return compile_goto(ins_cache, i);
- else
- {
- ins_cache = i;
-
- for (int st = min; st < max; st++)
- {
- const uint32_t sz = exp->compile(rep, &i[1]);
- i->i.tag = FORK;
- i->i.link = ins_cache + (1 + sz) * (max - min);
- i += sz + 1;
- }
-
- for (int st = 0; st < min; st++)
- {
- const uint32_t sz = exp->compile(rep, &i[0]);
- i += sz;
- if (max < 0 && st == 0)
- {
- i->i.tag = FORK;
- i->i.link = i - sz;
- i++;
- }
- }
-
- const uint32_t sz = i - ins_cache;
- if (ins_access == PRIVATE)
- decompile();
-
- return sz;
- }
-}
-
-void CloseVOp::decompile()
-{
- if (ins_cache)
- {
- exp->decompile();
- ins_cache = NULL;
- }
-}
-
-void CloseVOp::split(CharSet &s)
-{
- exp->split(s);
-}
-
-RegExp *expr(Scanner &);
-
-uint32_t Scanner::unescape(SubStr &s) const
-{
- static const char * hex = "0123456789abcdef";
- static const char * oct = "01234567";
-
- s.len--;
- uint32_t c, ucb = 0;
-
- if ((c = *s.str++) != '\\' || s.len == 0)
- {
- return c;
- }
-
- s.len--;
-
- switch (c = *s.str++)
- {
- case 'n': return '\n';
- case 't': return '\t';
- case 'v': return '\v';
- case 'b': return '\b';
- case 'r': return '\r';
- case 'f': return '\f';
- case 'a': return '\a';
-
- case 'x':
- {
- if (s.len < 2)
- {
- fatal(s.ofs()+s.len, "Illegal hexadecimal character code, two hexadecimal digits are required");
- return ~0u;
- }
-
- const char *p1 = strchr(hex, tolower(s.str[0]));
- const char *p2 = strchr(hex, tolower(s.str[1]));
-
- if (!p1 || !p2)
- {
- fatal(s.ofs()+(p1?1:0), "Illegal hexadecimal character code");
- return ~0u;
- }
- else
- {
- s.len -= 2;
- s.str += 2;
-
- uint32_t v = (uint32_t)((p1 - hex) << 4)
- + (uint32_t)((p2 - hex));
-
- return v;
- }
- }
-
- case 'U':
- {
- if (s.len < 8)
- {
- fatal(s.ofs()+s.len, "Illegal unicode character, eight hexadecimal digits are required");
- return ~0u;
- }
-
- uint32_t l = 0;
- if (s.str[0] == '0')
- {
- l++;
- if (s.str[1] == '0')
- {
- l++;
- if (s.str[2] == '0' || (s.str[2] == '1' && encoding.szCodePoint() == 4))
- {
- l++;
- if (encoding.szCodePoint() == 4)
- {
- const char *u3 = strchr(hex, tolower(s.str[2]));
- const char *u4 = strchr(hex, tolower(s.str[3]));
- if (u3 && u4)
- {
- ucb = (uint32_t)((u3 - hex) << 20)
- + (uint32_t)((u4 - hex) << 16);
- l++;
- }
- }
- else if (s.str[3] == '0')
- {
- l++;
- }
- }
- }
- }
-
- if (l != 4)
- {
- fatal(s.ofs()+l, "Illegal unicode character, eight hexadecimal digits are required");
- }
-
- s.len -= 4;
- s.str += 4;
-
- // no break;
- }
- case 'X':
- case 'u':
- {
- if (s.len < 4)
- {
- fatal(s.ofs()+s.len,
- c == 'X'
- ? "Illegal hexadecimal character code, four hexadecimal digits are required"
- : "Illegal unicode character, four hexadecimal digits are required");
- return ~0u;
- }
-
- const char *p1 = strchr(hex, tolower(s.str[0]));
- const char *p2 = strchr(hex, tolower(s.str[1]));
- const char *p3 = strchr(hex, tolower(s.str[2]));
- const char *p4 = strchr(hex, tolower(s.str[3]));
-
- if (!p1 || !p2 || !p3 || !p4)
- {
- fatal(s.ofs()+(p1?1:0)+(p2?1:0)+(p3?1:0),
- c == 'X'
- ? "Illegal hexadecimal character code, non hexxdecimal digit found"
- : "Illegal unicode character, non hexadecimal digit found");
- return ~0u;
- }
- else
- {
- s.len -= 4;
- s.str += 4;
-
- uint32_t v = (uint32_t)((p1 - hex) << 12)
- + (uint32_t)((p2 - hex) << 8)
- + (uint32_t)((p3 - hex) << 4)
- + (uint32_t)((p4 - hex))
- + ucb;
-
- if (v >= encoding.nCodePoints())
- {
- fatal(s.ofs(),
- c == 'X'
- ? "Illegal hexadecimal character code, out of range"
- : "Illegal unicode character, out of range");
- }
-
- return v;
- }
- }
-
- case '4':
- case '5':
- case '6':
- case '7':
- {
- fatal(s.ofs()-1, "Illegal octal character code, first digit must be 0 thru 3");
- return ~0u;
- }
-
- case '0':
- case '1':
- case '2':
- case '3':
- {
- if (s.len < 2)
- {
- fatal(s.ofs()+s.len, "Illegal octal character code, three octal digits are required");
- return ~0u;
- }
-
- const char *p0 = strchr(oct, c);
- const char *p1 = strchr(oct, s.str[0]);
- const char *p2 = strchr(oct, s.str[1]);
-
- if (!p0 || !p1 || !p2)
- {
- fatal(s.ofs()+(p1?1:0), "Illegal octal character code, non octal digit found");
- return ~0u;
- }
- else
- {
- s.len -= 2;
- s.str += 2;
-
- uint32_t v = (uint32_t)((p0 - oct) << 6) + (uint32_t)((p1 - oct) << 3) + (uint32_t)(p2 - oct);
-
- return v;
- }
- }
-
- default:
- return c;
- }
-}
-
-std::string& Scanner::unescape(SubStr& str_in, std::string& str_out) const
-{
- str_out.clear();
-
- while(str_in.len)
- {
- uint32_t c = unescape(str_in);
-
- if (c > 0xFF)
- {
- fatal(str_in.ofs(), "Illegal character");
- }
-
- str_out += static_cast<char>(c);
- }
-
- return str_out;
-}
-
-Range * Scanner::getRange(SubStr &s) const
-{
- uint32_t lb = unescape(s), ub;
-
- if (s.len < 2 || *s.str != '-')
- {
- ub = lb;
- }
- else
- {
- s.len--;
- s.str++;
- ub = unescape(s);
- if (ub < lb)
- {
- uint32_t tmp = lb;
- lb = ub;
- ub = tmp;
- }
- }
-
- Range * r = encoding.encodeRange(lb, ub);
- if (r == NULL)
- fatalf("Bad code point range: '0x%X - 0x%X'", lb, ub);
- return r;
-}
-
-RegExp * Scanner::matchSymbol(uint32_t c) const
-{
- if (!encoding.encode(c))
- fatalf("Bad code point: '0x%X'", c);
-
- if (encoding.is(Enc::UTF16))
- return UTF16Symbol(c);
- else if (encoding.is(Enc::UTF8))
- return UTF8Symbol(c);
- else
- return new MatchOp(new Range(c, c + 1));
-}
-
-RegExp * Scanner::strToRE (SubStr & s) const
-{
- if (s.len == 0)
- return new NullOp;
-
- RegExp *re = matchSymbol(unescape(s));
-
- while (s.len > 0)
- re = new CatOp(re, matchSymbol(unescape(s)));
-
- return re;
-}
-
-RegExp * Scanner::strToCaseInsensitiveRE (SubStr & s) const
-{
- if (s.len == 0)
- return new NullOp;
-
- uint32_t c = unescape(s);
-
- RegExp *re, *reL, *reU;
-
- if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
- {
- reL = matchSymbol(tolower(c));
- reU = matchSymbol(toupper(c));
- re = mkAlt(reL, reU);
- }
- else
- {
- re = matchSymbol(c);
- }
-
- while (s.len > 0)
- {
- c = unescape(s);
-
- if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
- {
- reL = matchSymbol(tolower(c));
- reU = matchSymbol(toupper(c));
- re = new CatOp(re, mkAlt(reL, reU));
- }
- else
- {
- re = new CatOp(re, matchSymbol(c));
- }
- }
-
- return re;
-}
-
-Range * Scanner::mkRange(SubStr &s) const
-{
- Range *r = getRange(s);
- while (s.len > 0)
- r = doUnion(r, getRange(s));
-
- return r;
-}
-
-RegExp * Scanner::matchSymbolRange(Range * r) const
-{
- if (encoding.is(Enc::UTF16))
- return UTF16Range(r);
- else if (encoding.is(Enc::UTF8))
- return UTF8Range(r);
- else
- return new MatchOp(r);
-}
-
-RegExp * Scanner::ranToRE (SubStr & s) const
-{
- s.len -= 2;
- s.str += 1;
-
- if (s.len == 0)
- return new NullOp;
-
- return matchSymbolRange(mkRange(s));
-}
-
-RegExp * Scanner::invToRE (SubStr & s) const
-{
- s.len -= 3;
- s.str += 2;
-
- Range * full = encoding.fullRange();
-
- Range * r = s.len == 0
- ? full
- : doDiff(full, mkRange (s));
-
- return matchSymbolRange(r);
-}
-
-RegExp * Scanner::mkDot() const
-{
- Range * full = encoding.fullRange();
- uint32_t c = '\n';
- if (!encoding.encode(c))
- fatalf("Bad code point: '0x%X'", c);
- Range * ran = new Range(c, c + 1);
- Range * inv = doDiff(full, ran);
-
- return matchSymbolRange(inv);
-}
-
-/*
- * Create a byte range that includes all possible input characters.
- * This may include characters, which do not map to any valid symbol
- * in current encoding. For encodings, which directly map symbols to
- * input characters (ASCII, EBCDIC, UTF-32), it equals [^]. For other
- * encodings (UTF-16, UTF-8), [^] and this range are different.
- *
- * Also note that default range doesn't respect encoding policy
- * (the way invalid code points are treated).
- */
-RegExp * Scanner::mkDefault() const
-{
- Range * def = new Range(0, encoding.nCodeUnits());
- return new MatchOp(def);
-}
-
-RuleOp::RuleOp(RegExp *e, RegExp *c, Token *t, rule_rank_t r, InsAccess access)
- : exp(e)
- , ctx(c)
- , ins(NULL)
- , rank(r)
- , code(t)
- , line(0)
-{
- ins_access = access;
-}
-
-void RuleOp::calcSize(Char *rep)
-{
- exp->calcSize(rep);
- ctx->calcSize(rep);
- size = exp->size + (ctx->size ? ctx->size + 2 : 1);
-}
-
-uint32_t RuleOp::compile(Char *rep, Ins *i)
-{
- if (ins_cache)
- return compile_goto(ins_cache, i);
- else
- {
- ins_cache = i;
-
- i += exp->compile(rep, &i[0]);
- if (ctx->size)
- {
- i->i.tag = CTXT;
- i->i.link = &i[1];
- ++i;
- i += ctx->compile(rep, &i[0]);
- }
- i->i.tag = TERM;
- i->i.link = this;
- ++i;
-
- const uint32_t sz = i - ins_cache;
- if (ins_access == PRIVATE)
- decompile();
-
- return sz;
- }
-}
-
-void RuleOp::decompile()
-{
- if (ins_cache)
- {
- exp->decompile();
- ctx->decompile();
- ins_cache = NULL;
- }
-}
-
-void RuleOp::split(CharSet &s)
-{
- exp->split(s);
- ctx->split(s);
-}
-
-void optimize(Ins *i)
-{
- while (!isMarked(i))
- {
- mark(i);
-
- if (i->i.tag == CHAR)
- {
- i = (Ins*) i->i.link;
- }
- else if (i->i.tag == GOTO || i->i.tag == FORK)
- {
- Ins *target = (Ins*) i->i.link;
- optimize(target);
-
- if (target->i.tag == GOTO)
- i->i.link = target->i.link == target ? i : target;
-
- if (i->i.tag == FORK)
- {
- Ins *follow = (Ins*) & i[1];
- optimize(follow);
-
- if (follow->i.tag == GOTO && follow->i.link == follow)
- {
- i->i.tag = GOTO;
- }
- else if (i->i.link == i)
- {
- i->i.tag = GOTO;
- i->i.link = follow;
- }
- }
-
- return ;
- }
- else
- {
- ++i;
- }
- }
-}
-
-CharSet::CharSet()
- : fix(0)
- , freeHead(0)
- , freeTail(0)
- , rep(allocate<CharPtr> (encoding.nCodeUnits()))
- , ptn(allocate<CharPtn> (encoding.nCodeUnits()))
-{
- for (uint32_t j = 0; j < encoding.nCodeUnits(); ++j)
- {
- rep[j] = &ptn[0];
- ptn[j].nxt = &ptn[j + 1]; /* wrong for j=encoding.nCodeUnits() - 1 but will be corrected below */
- ptn[j].card = 0;
- }
-
- freeHead = &ptn[1];
- *(freeTail = &ptn[encoding.nCodeUnits() - 1].nxt) = NULL;
- ptn[0].card = encoding.nCodeUnits();
- ptn[0].nxt = NULL;
-}
-
-CharSet::~CharSet()
-{
- operator delete (rep);
- operator delete (ptn);
-}
-
-smart_ptr<DFA> genCode(RegExp *re, Output & output, uint32_t ind)
-{
- CharSet cs;
- re->split(cs);
-
- Char *rep = new Char[encoding.nCodeUnits()];
-
- for (uint32_t j = 0; j < encoding.nCodeUnits(); ++j)
- {
- if (!cs.rep[j]->nxt)
- cs.rep[j]->nxt = &cs.ptn[j];
-
- rep[j] = cs.rep[j]->nxt - &cs.ptn[0];
- }
-
- re->calcSize(rep);
- Ins *ins = new Ins[re->size + 1];
- memset(ins, 0, (re->size + 1)*sizeof(Ins));
- const uint32_t size = re->compile(rep, ins);
- Ins *eoi = &ins[size];
- eoi->i.tag = GOTO;
- eoi->i.link = eoi;
-
- optimize(ins);
-
- /*
- for (const Ins *inst = &ins[0]; inst < &ins[size]; )
- {
- inst = showIns(std::cout, *inst, ins[0]);
- }
- */
-
- for (uint32_t j = 0; j < size;)
- {
- unmark(&ins[j]);
-
- if (ins[j].i.tag == CHAR)
- {
- j = (Ins*) ins[j].i.link - ins;
- }
- else
- {
- j++;
- }
- }
-
- smart_ptr<DFA> dfa = make_smart_ptr(new DFA(ins, size, 0, encoding.nCodeUnits(), rep));
- if (flag_skeleton)
- {
- skeleton::Skeleton skeleton (*dfa);
- skeleton.emit_data (output.data);
- skeleton::emit_prolog (output.source, ind, output.data.file_name.c_str ());
- }
- dfa->prepare (output.source, output.max_fill);
-
- return dfa;
-}
-
-} // end namespace re2c
-
+++ /dev/null
-#ifndef _RE2C_DFA_ENCODING_UTF16_REGEXP_
-#define _RE2C_DFA_ENCODING_UTF16_REGEXP_
-
-#include "src/dfa/encoding/utf16/utf16.h"
-#include "src/util/range.h"
-
-namespace re2c {
-
-class RegExp; // forward
-
-RegExp * UTF16Symbol(utf16::rune r);
-RegExp * UTF16Range(const Range * r);
-
-} // namespace re2c
-
-#endif // _RE2C_DFA_ENCODING_UTF16_REGEXP_
+++ /dev/null
-#ifndef _RE2C_DFA_ENCODING_UTF8_REGEXP_
-#define _RE2C_DFA_ENCODING_UTF8_REGEXP_
-
-#include "src/dfa/encoding/utf8/utf8.h"
-#include "src/util/range.h"
-
-namespace re2c {
-
-class RegExp; // forward
-
-RegExp * UTF8Symbol(utf8::rune r);
-RegExp * UTF8Range(const Range * r);
-
-} // namespace re2c
-
-#endif // _RE2C_DFA_ENCODING_UTF8_REGEXP_
+++ /dev/null
-#ifndef _RE2C_DFA_INS_
-#define _RE2C_DFA_INS_
-
-#include "src/util/c99_stdint.h"
-
-namespace re2c
-{
-
-typedef uint32_t Char;
-
-const uint32_t CHAR = 0;
-const uint32_t GOTO = 1;
-const uint32_t FORK = 2;
-const uint32_t TERM = 3;
-const uint32_t CTXT = 4;
-
-union Ins {
-
- struct
- {
- uint8_t tag;
- uint8_t marked;
- void *link;
- }
-
- i;
-
- struct
- {
- uint32_t value;
- uint32_t bump;
- void *link;
- }
-
- c;
-};
-
-inline bool isMarked(Ins *i)
-{
- return i->i.marked != 0;
-}
-
-inline void mark(Ins *i)
-{
- i->i.marked = true;
-}
-
-inline void unmark(Ins *i)
-{
- i->i.marked = false;
-}
-
-} // end namespace re2c
-
-#endif // _RE2C_DFA_INS_
+++ /dev/null
-#ifndef _RE2C_DFA_RE_
-#define _RE2C_DFA_RE_
-
-#include <iostream>
-#include <set>
-#include <map>
-#include <list>
-#include <vector>
-#include <string>
-
-#include "src/dfa/ins.h"
-#include "src/dfa/rule_rank.h"
-#include "src/globals.h"
-#include "src/parse/token.h"
-#include "src/util/range.h"
-#include "src/util/forbid_copy.h"
-#include "src/util/free_list.h"
-
-namespace re2c
-{
-
-typedef struct extop
-{
- char op;
- int minsize;
- int maxsize;
-}
-
-ExtOp;
-
-struct CharPtn
-{
- uint32_t card;
- CharPtn *fix;
- CharPtn *nxt;
-
- FORBID_COPY (CharPtn);
-};
-
-typedef CharPtn *CharPtr;
-
-struct CharSet
-{
- CharSet();
- ~CharSet();
-
- CharPtn *fix;
- CharPtn *freeHead, **freeTail;
- CharPtr *rep;
- CharPtn *ptn;
-
- FORBID_COPY (CharSet);
-};
-
-class RegExp
-{
-
-public:
- uint32_t size;
-
- /*
- * There're several different cases when the same regexp
- * can be used multiple times:
- * 1) named definitions, e.g. digit = [0-9];
- * 2) counted repetition, e.g. "a"{3}, "a"{3,}, "a"{3,5}
- * 3) multiple DFA's sharing the same regexp , e.g. <c1, c2> "abc" { }
- * 4) common suffixes, e.g. suffix [\x80-\xBF] in UTF-8 ranges
- * In cases 1-3, regexp must be recompiled each time it's reused.
- * In case 4, regexp should be compiled only once, and instructions
- * should be shared in order to reduce space.
- *
- * Note: if regexp must always be recompiled, it doesn't imply that
- * parts of this regexp must always be recompiled. It only forces
- * the compilation function to traverse the regexp after compilation
- * and reset compilation cache for each sub-regexp. E.g., for a regexp
- * [^]{3} in UTF-8 mode, each of sub-regexps [^] will have common suffix
- * [\x80-\xBF] factored out, but they won't share instructions.
- */
- Ins* ins_cache; /* if non-NULL, points to compiled instructions */
- enum InsAccess
- { SHARED
- , PRIVATE
- } ins_access;
-
- static free_list<RegExp*> vFreeList;
-
-public:
- RegExp()
- : size(0)
- , ins_cache(NULL)
- , ins_access(SHARED)
- {
- vFreeList.insert(this);
- }
-
- virtual ~RegExp()
- {
- vFreeList.erase(this);
- }
-
- virtual void split(CharSet&) = 0;
- virtual void calcSize(Char*) = 0;
- virtual uint32_t fixedLength();
- virtual uint32_t compile(Char*, Ins*) = 0;
- virtual void decompile() = 0;
- virtual void display(std::ostream&) const = 0;
- friend std::ostream& operator<<(std::ostream&, const RegExp&);
- friend std::ostream& operator<<(std::ostream&, const RegExp*);
-
- FORBID_COPY (RegExp);
-};
-
-inline std::ostream& operator<<(std::ostream &o, const RegExp &re)
-{
- re.display(o);
- return o;
-}
-
-inline std::ostream& operator<<(std::ostream &o, const RegExp *re)
-{
- return o << *re;
-}
-
-class NullOp: public RegExp
-{
-
-public:
- void split(CharSet&);
- void calcSize(Char*);
- uint32_t fixedLength();
- uint32_t compile(Char*, Ins*);
- void decompile();
- void display(std::ostream &o) const
- {
- o << "_";
- }
-};
-
-class MatchOp: public RegExp
-{
-
-public:
- Range *match;
-
-public:
- MatchOp(Range *m) : match(m)
- {
- }
-
- void split(CharSet&);
- void calcSize(Char*);
- uint32_t fixedLength();
- uint32_t compile(Char*, Ins*);
- void decompile();
- void display(std::ostream&) const;
-
- FORBID_COPY (MatchOp);
-};
-
-class RuleOp: public RegExp
-{
-private:
- RegExp *exp;
-
-public:
- RegExp *ctx;
- Ins *ins;
- rule_rank_t rank;
- Token *code;
- uint32_t line;
-
-public:
- RuleOp(RegExp*, RegExp*, Token*, rule_rank_t, InsAccess);
-
- ~RuleOp()
- {
- delete code;
- }
-
- void split(CharSet&);
- void calcSize(Char*);
- uint32_t compile(Char*, Ins*);
- void decompile();
- void display(std::ostream &o) const
- {
- o << exp << "/" << ctx << ";";
- }
-
- FORBID_COPY (RuleOp);
-};
-
-RegExp *doAlt(RegExp*, RegExp*);
-RegExp *mkAlt(RegExp*, RegExp*);
-
-class AltOp: public RegExp
-{
-
-private:
- RegExp *exp1, *exp2;
-
-public:
- AltOp(RegExp *e1, RegExp *e2)
- : exp1(e1)
- , exp2(e2)
- {
- }
-
- void split(CharSet&);
- void calcSize(Char*);
- uint32_t fixedLength();
- uint32_t compile(Char*, Ins*);
- void decompile();
- void display(std::ostream &o) const
- {
- o << exp1 << "|" << exp2;
- }
-
- friend RegExp *mkAlt(RegExp*, RegExp*);
-
- FORBID_COPY (AltOp);
-};
-
-RegExp *doCat(RegExp*, RegExp*);
-RegExp *mkCat(RegExp*, RegExp*);
-
-class CatOp: public RegExp
-{
-
-private:
- RegExp *exp1, *exp2;
-
-public:
- CatOp(RegExp *e1, RegExp *e2)
- : exp1(e1)
- , exp2(e2)
- {
- }
-
- void split(CharSet&);
- void calcSize(Char*);
- uint32_t fixedLength();
- uint32_t compile(Char*, Ins*);
- void decompile();
- void display(std::ostream &o) const
- {
- o << exp1 << exp2;
- }
-
- FORBID_COPY (CatOp);
-};
-
-class CloseOp: public RegExp
-{
-
-private:
- RegExp *exp;
-
-public:
- CloseOp(RegExp *e)
- : exp(e)
- {
- }
-
- void split(CharSet&);
- void calcSize(Char*);
- uint32_t compile(Char*, Ins*);
- void decompile();
- void display(std::ostream &o) const
- {
- o << exp << "+";
- }
-
- FORBID_COPY (CloseOp);
-};
-
-class CloseVOp: public RegExp
-{
-
-private:
- RegExp *exp;
- int min;
- int max;
-
-public:
- CloseVOp(RegExp *e, int lb, int ub)
- : exp(e)
- , min(lb)
- , max(ub)
- {
- exp->ins_access = PRIVATE;
- }
-
- void split(CharSet&);
- void calcSize(Char*);
- uint32_t compile(Char*, Ins*);
- void decompile();
- void display(std::ostream &o) const
- {
- o << exp << "+";
- }
-
- FORBID_COPY (CloseVOp);
-};
-
-extern RegExp *mkDiff(RegExp*, RegExp*);
-extern RegExp *mkAlt(RegExp*, RegExp*);
-
-} // end namespace re2c
-
-#endif // _RE2C_DFA_RE_
#include "src/codegen/code_names.h"
#include "src/codegen/input_api.h"
-#include "src/dfa/encoding/enc.h"
+#include "src/ir/regexp/encoding/enc.h"
#include "src/util/c99_stdint.h"
namespace re2c
--- /dev/null
+#include <string.h>
+
+#include "src/codegen/skeleton/skeleton.h"
+#include "src/ir/bytecode/bytecode.h"
+#include "src/globals.h"
+
+namespace re2c {
+
+static void optimize (Ins * i);
+
+smart_ptr<DFA> genCode (RegExp *re, Output & output, uint32_t ind)
+{
+ CharSet cs;
+ re->split(cs);
+
+ Char *rep = new Char[encoding.nCodeUnits()];
+
+ for (uint32_t j = 0; j < encoding.nCodeUnits(); ++j)
+ {
+ if (!cs.rep[j]->nxt)
+ cs.rep[j]->nxt = &cs.ptn[j];
+
+ rep[j] = cs.rep[j]->nxt - &cs.ptn[0];
+ }
+
+ re->calcSize(rep);
+ Ins *ins = new Ins[re->size + 1];
+ memset(ins, 0, (re->size + 1)*sizeof(Ins));
+ const uint32_t size = re->compile(rep, ins);
+ Ins *eoi = &ins[size];
+ eoi->i.tag = GOTO;
+ eoi->i.link = eoi;
+
+ optimize(ins);
+
+ /*
+ for (const Ins *inst = &ins[0]; inst < &ins[size]; )
+ {
+ inst = showIns(std::cout, *inst, ins[0]);
+ }
+ */
+
+ for (uint32_t j = 0; j < size;)
+ {
+ unmark(&ins[j]);
+
+ if (ins[j].i.tag == CHAR)
+ {
+ j = (Ins*) ins[j].i.link - ins;
+ }
+ else
+ {
+ j++;
+ }
+ }
+
+ smart_ptr<DFA> dfa = make_smart_ptr(new DFA(ins, size, 0, encoding.nCodeUnits(), rep));
+ if (flag_skeleton)
+ {
+ skeleton::Skeleton skeleton (*dfa);
+ skeleton.emit_data (output.data);
+ skeleton::emit_prolog (output.source, ind, output.data.file_name.c_str ());
+ }
+ dfa->prepare (output.source, output.max_fill);
+
+ return dfa;
+}
+
+void optimize (Ins * i)
+{
+ while (!isMarked (i))
+ {
+ mark (i);
+ if (i->i.tag == CHAR)
+ {
+ i = (Ins *) i->i.link;
+ }
+ else if (i->i.tag == GOTO || i->i.tag == FORK)
+ {
+ Ins * target = (Ins *) i->i.link;
+ optimize (target);
+ if (target->i.tag == GOTO)
+ {
+ i->i.link = target->i.link == target
+ ? i
+ : target;
+ }
+ if (i->i.tag == FORK)
+ {
+ Ins * follow = (Ins *) & i[1];
+ optimize (follow);
+ if (follow->i.tag == GOTO && follow->i.link == follow)
+ {
+ i->i.tag = GOTO;
+ }
+ else if (i->i.link == i)
+ {
+ i->i.tag = GOTO;
+ i->i.link = follow;
+ }
+ }
+ return;
+ }
+ else
+ {
+ ++i;
+ }
+ }
+}
+
+} // namespace re2c
--- /dev/null
+#ifndef _RE2C_IR_BYTECODE_BYTECODE_
+#define _RE2C_IR_BYTECODE_BYTECODE_
+
+#include "src/codegen/output.h"
+#include "src/ir/dfa/dfa.h"
+#include "src/ir/regexp/regexp.h"
+#include "src/util/smart_ptr.h"
+
+namespace re2c
+{
+
+smart_ptr<DFA> genCode (RegExp * re, Output & output, uint32_t ind);
+
+} // namespace re2c
+
+#endif // _RE2C_IR_BYTECODE_BYTECODE_
--- /dev/null
+#include "src/ir/regexp/regexp_alt.h"
+#include "src/ir/regexp/regexp_cat.h"
+#include "src/ir/regexp/regexp_close.h"
+#include "src/ir/regexp/regexp_closev.h"
+#include "src/ir/regexp/regexp_match.h"
+#include "src/ir/regexp/regexp_null.h"
+#include "src/ir/regexp/regexp_rule.h"
+
+namespace re2c
+{
+
+void AltOp::calcSize (Char * rep)
+{
+ exp1->calcSize (rep);
+ exp2->calcSize (rep);
+ size = exp1->size + exp2->size + 2;
+}
+
+void CatOp::calcSize (Char * rep)
+{
+ exp1->calcSize (rep);
+ exp2->calcSize (rep);
+ size = exp1->size + exp2->size;
+}
+
+void CloseOp::calcSize (Char * rep)
+{
+ exp->calcSize (rep);
+ size = exp->size + 1;
+}
+
+void CloseVOp::calcSize (Char * rep)
+{
+ exp->calcSize (rep);
+ if (max >= 0)
+ {
+ size = (exp->size * min) + ((1 + exp->size) * (max - min));
+ }
+ else
+ {
+ size = (exp->size * min) + 1;
+ }
+}
+
+void MatchOp::calcSize (Char * rep)
+{
+ size = 1;
+ for (Range * r = match; r; r = r->next)
+ {
+ for (uint32_t c = r->lb; c < r->ub; ++c)
+ {
+ if (rep[c] == c)
+ {
+ ++size;
+ }
+ }
+ }
+}
+
+void NullOp::calcSize (Char *)
+{
+ size = 0;
+}
+
+void RuleOp::calcSize (Char * rep)
+{
+ exp->calcSize (rep);
+ ctx->calcSize (rep);
+ size = exp->size + (ctx->size ? ctx->size + 2 : 1);
+}
+
+} // end namespace re2c
--- /dev/null
+#include "src/ir/bytecode/charset.h"
+#include "src/globals.h"
+#include "src/util/allocate.h"
+
+namespace re2c {
+
+CharSet::CharSet ()
+ : fix (0)
+ , freeHead (0)
+ , freeTail (0)
+ , rep (allocate<CharPtr> (encoding.nCodeUnits ()))
+ , ptn (allocate<CharPtn> (encoding.nCodeUnits ()))
+{
+ for (uint32_t j = 0; j < encoding.nCodeUnits (); ++j)
+ {
+ rep[j] = &ptn[0];
+ ptn[j].nxt = &ptn[j + 1]; /* wrong for j=encoding.nCodeUnits() - 1 but will be corrected below */
+ ptn[j].card = 0;
+ }
+ freeHead = &ptn[1];
+ * (freeTail = &ptn[encoding.nCodeUnits() - 1].nxt) = NULL;
+ ptn[0].card = encoding.nCodeUnits ();
+ ptn[0].nxt = NULL;
+}
+
+CharSet::~CharSet ()
+{
+ operator delete (rep);
+ operator delete (ptn);
+}
+
+} // namespace re2c
--- /dev/null
+#ifndef _RE2C_IR_BYTECODE_CHARSET_
+#define _RE2C_IR_BYTECODE_CHARSET_
+
+#include "src/util/c99_stdint.h"
+#include "src/util/forbid_copy.h"
+
+namespace re2c
+{
+
+struct CharPtn
+{
+ uint32_t card;
+ CharPtn * fix;
+ CharPtn * nxt;
+
+ FORBID_COPY (CharPtn);
+};
+
+typedef CharPtn * CharPtr;
+
+struct CharSet
+{
+ CharPtn * fix;
+ CharPtn * freeHead;
+ CharPtn ** freeTail;
+ CharPtr * rep;
+ CharPtn * ptn;
+
+ CharSet ();
+ ~CharSet ();
+
+ FORBID_COPY (CharSet);
+};
+
+} // namespace re2c
+
+#endif // _RE2C_IR_BYTECODE_CHARSET_
--- /dev/null
+#include "src/ir/regexp/regexp_alt.h"
+#include "src/ir/regexp/regexp_cat.h"
+#include "src/ir/regexp/regexp_close.h"
+#include "src/ir/regexp/regexp_closev.h"
+#include "src/ir/regexp/regexp_match.h"
+#include "src/ir/regexp/regexp_null.h"
+#include "src/ir/regexp/regexp_rule.h"
+
+namespace re2c
+{
+
+static uint32_t compile_goto (Ins * ins, Ins * i);
+
+uint32_t AltOp::compile (Char * rep, Ins * i)
+{
+ if (ins_cache)
+ {
+ return compile_goto (ins_cache, i);
+ }
+ else
+ {
+ ins_cache = i;
+
+ i->i.tag = FORK;
+ const uint32_t sz1 = exp1->compile (rep, &i[1]);
+ Ins * const j = &i[sz1 + 1];
+ i->i.link = &j[1];
+ j->i.tag = GOTO;
+ const uint32_t sz2 = exp2->compile (rep, &j[1]);
+ j->i.link = &j[sz2 + 1];
+
+ if (ins_access == PRIVATE)
+ {
+ decompile ();
+ }
+
+ return sz1 + sz2 + 2;
+ }
+}
+
+void AltOp::decompile ()
+{
+ if (ins_cache)
+ {
+ exp1->decompile ();
+ exp2->decompile ();
+ ins_cache = NULL;
+ }
+}
+
+uint32_t CatOp::compile (Char * rep, Ins * i)
+{
+ if (ins_cache)
+ {
+ return compile_goto (ins_cache, i);
+ }
+ else
+ {
+ ins_cache = i;
+
+ const uint32_t sz1 = exp1->compile (rep, &i[0]);
+ const uint32_t sz2 = exp2->compile (rep, &i[sz1]);
+
+ if (ins_access == PRIVATE)
+ {
+ decompile ();
+ }
+
+ return sz1 + sz2;
+ }
+}
+
+void CatOp::decompile ()
+{
+ if (ins_cache)
+ {
+ exp1->decompile ();
+ exp2->decompile ();
+ ins_cache = NULL;
+ }
+}
+
+uint32_t CloseOp::compile (Char * rep, Ins * i)
+{
+ if (ins_cache)
+ {
+ return compile_goto (ins_cache, i);
+ }
+ else
+ {
+ ins_cache = i;
+
+ i += exp->compile (rep, &i[0]);
+ i->i.tag = FORK;
+ i->i.link = ins_cache;
+ ++i;
+
+ const uint32_t sz = i - ins_cache;
+ if (ins_access == PRIVATE)
+ {
+ decompile ();
+ }
+
+ return sz;
+ }
+}
+
+void CloseOp::decompile ()
+{
+ if (ins_cache)
+ {
+ exp->decompile ();
+ ins_cache = NULL;
+ }
+}
+
+uint32_t CloseVOp::compile (Char * rep, Ins * i)
+{
+ if (ins_cache)
+ {
+ return compile_goto (ins_cache, i);
+ }
+ else
+ {
+ ins_cache = i;
+
+ for (int st = min; st < max; st++)
+ {
+ const uint32_t sz = exp->compile (rep, &i[1]);
+ i->i.tag = FORK;
+ i->i.link = ins_cache + (1 + sz) * (max - min);
+ i += sz + 1;
+ }
+ for (int st = 0; st < min; st++)
+ {
+ const uint32_t sz = exp->compile (rep, &i[0]);
+ i += sz;
+ if (max < 0 && st == 0)
+ {
+ i->i.tag = FORK;
+ i->i.link = i - sz;
+ i++;
+ }
+ }
+ const uint32_t sz = i - ins_cache;
+
+ if (ins_access == PRIVATE)
+ {
+ decompile ();
+ }
+
+ return sz;
+ }
+}
+
+void CloseVOp::decompile ()
+{
+ if (ins_cache)
+ {
+ exp->decompile ();
+ ins_cache = NULL;
+ }
+}
+
+uint32_t MatchOp::compile (Char * rep, Ins * i)
+{
+ if (ins_cache)
+ {
+ return compile_goto (ins_cache, i);
+ }
+ else
+ {
+ ins_cache = i;
+
+ i->i.tag = CHAR;
+ i->i.link = &i[size];
+ Ins *j = &i[1];
+ uint32_t bump = size;
+ for (Range *r = match; r; r = r->next)
+ {
+ for (uint32_t c = r->lb; c < r->ub; ++c)
+ {
+ if (rep[c] == c)
+ {
+ j->c.value = c;
+ j->c.bump = --bump;
+ j++;
+ }
+ }
+ }
+
+ if (ins_access == PRIVATE)
+ {
+ decompile ();
+ }
+
+ return size;
+ }
+}
+
+void MatchOp::decompile ()
+{
+ ins_cache = NULL;
+}
+
+uint32_t NullOp::compile (Char *, Ins *)
+{
+ return 0;
+}
+
+void NullOp::decompile () {}
+
+uint32_t RuleOp::compile (Char * rep, Ins * i)
+{
+ if (ins_cache)
+ {
+ return compile_goto (ins_cache, i);
+ }
+ else
+ {
+ ins_cache = i;
+
+ i += exp->compile (rep, &i[0]);
+ if (ctx->size)
+ {
+ i->i.tag = CTXT;
+ i->i.link = &i[1];
+ ++i;
+ i += ctx->compile (rep, &i[0]);
+ }
+ i->i.tag = TERM;
+ i->i.link = this;
+ ++i;
+ const uint32_t sz = i - ins_cache;
+
+ if (ins_access == PRIVATE)
+ {
+ decompile ();
+ }
+
+ return sz;
+ }
+}
+
+void RuleOp::decompile ()
+{
+ if (ins_cache)
+ {
+ exp->decompile ();
+ ctx->decompile ();
+ ins_cache = NULL;
+ }
+}
+
+uint32_t compile_goto (Ins * ins, Ins * i)
+{
+ i->i.tag = GOTO;
+ i->i.link = ins;
+ return 1;
+}
+
+} // end namespace re2c
--- /dev/null
+#include <iostream>
+
+#include "src/ir/bytecode/ins.h"
+#include "src/ir/regexp/regexp_rule.h"
+
+namespace re2c {
+
+const Ins * showIns (std::ostream & o, const Ins & i, const Ins & base)
+{
+ o.width (3);
+ o << &i - &base << ": ";
+ const Ins * ret = &(&i)[1];
+ switch (i.i.tag)
+ {
+ case CHAR:
+ {
+ o << "match ";
+ for (; ret < (Ins *) i.i.link; ++ret)
+ {
+ o << "\\x" << std::hex << ret->c.value;
+ }
+ break;
+ }
+ case GOTO:
+ o << "goto " << ((Ins *) i.i.link - &base);
+ break;
+ case FORK:
+ o << "fork " << ((Ins *) i.i.link - &base);
+ break;
+ case CTXT:
+ o << "ctxt";
+ break;
+ case TERM:
+ o << "term " << ((RuleOp *) i.i.link)->rank;
+ break;
+ }
+ o << "\n";
+ return ret;
+}
+
+} // namespace re2c
--- /dev/null
+#ifndef _RE2C_IR_BYTECODE_INS_
+#define _RE2C_IR_BYTECODE_INS_
+
+#include <iosfwd>
+
+#include "src/util/c99_stdint.h"
+
+namespace re2c
+{
+
+static const uint32_t CHAR = 0;
+static const uint32_t GOTO = 1;
+static const uint32_t FORK = 2;
+static const uint32_t TERM = 3;
+static const uint32_t CTXT = 4;
+
+union Ins
+{
+ struct
+ {
+ uint8_t tag;
+ uint8_t marked;
+ void * link;
+ } i;
+ struct
+ {
+ uint32_t value;
+ uint32_t bump;
+ void * link;
+ } c;
+};
+
+inline bool isMarked (Ins * i)
+{
+ return i->i.marked != 0;
+}
+
+inline void mark (Ins * i)
+{
+ i->i.marked = true;
+}
+
+inline void unmark (Ins * i)
+{
+ i->i.marked = false;
+}
+
+const Ins * showIns (std::ostream & o, const Ins & i, const Ins & base);
+
+} // namespace re2c
+
+#endif // _RE2C_IR_BYTECODE_INS_
--- /dev/null
+#include "src/ir/regexp/regexp_alt.h"
+#include "src/ir/regexp/regexp_cat.h"
+#include "src/ir/regexp/regexp_close.h"
+#include "src/ir/regexp/regexp_closev.h"
+#include "src/ir/regexp/regexp_match.h"
+#include "src/ir/regexp/regexp_null.h"
+#include "src/ir/regexp/regexp_rule.h"
+
+namespace re2c {
+
+void AltOp::split (CharSet & s)
+{
+ exp1->split (s);
+ exp2->split (s);
+}
+
+void CatOp::split (CharSet & s)
+{
+ exp1->split (s);
+ exp2->split (s);
+}
+
+void CloseOp::split (CharSet & s)
+{
+ exp->split (s);
+}
+
+void CloseVOp::split (CharSet & s)
+{
+ exp->split (s);
+}
+
+void MatchOp::split (CharSet & s)
+{
+ for (Range *r = match; r; r = r->next)
+ {
+ for (uint32_t c = r->lb; c < r->ub; ++c)
+ {
+ CharPtn * x = s.rep[c];
+ CharPtn * a = x->nxt;
+ if (!a)
+ {
+ if (x->card == 1)
+ {
+ continue;
+ }
+ x->nxt = a = s.freeHead;
+ if (!(s.freeHead = s.freeHead->nxt))
+ {
+ s.freeTail = &s.freeHead;
+ }
+ a->nxt = NULL;
+ x->fix = s.fix;
+ s.fix = x;
+ }
+ if (--(x->card) == 0)
+ {
+ *s.freeTail = x;
+ *(s.freeTail = &x->nxt) = NULL;
+ }
+ s.rep[c] = a;
+ ++(a->card);
+ }
+ }
+ for (; s.fix; s.fix = s.fix->fix)
+ {
+ if (s.fix->card)
+ {
+ s.fix->nxt = NULL;
+ }
+ }
+}
+
+void NullOp::split (CharSet &) {}
+
+void RuleOp::split (CharSet & s)
+{
+ exp->split (s);
+ ctx->split (s);
+}
+
+} // namespace re2c
-#ifndef _RE2C_DFA_ACTION_
-#define _RE2C_DFA_ACTION_
+#ifndef _RE2C_IR_DFA_ACTION_
+#define _RE2C_IR_DFA_ACTION_
#include <vector>
} // namespace re2c
-#endif // _RE2C_DFA_ACTION_
+#endif // _RE2C_IR_DFA_ACTION_
#include <string.h>
-#include "src/dfa/dfa.h"
+#include "src/ir/dfa/dfa.h"
+#include "src/ir/regexp/regexp_rule.h"
#include "src/util/allocate.h"
namespace re2c
-#ifndef _RE2C_DFA_DFA_
-#define _RE2C_DFA_DFA_
+#ifndef _RE2C_IR_DFA_DFA_
+#define _RE2C_IR_DFA_DFA_
-#include "src/dfa/action.h"
-#include "src/dfa/state.h"
+#include "src/ir/dfa/action.h"
+#include "src/ir/dfa/state.h"
#include "src/util/forbid_copy.h"
-#include "src/util/smart_ptr.h"
namespace re2c
{
FORBID_COPY (DFA);
};
-smart_ptr<DFA> genCode (RegExp *, Output &, uint32_t);
-
} // namespace re2c
-#endif // _RE2C_DFA_DFA_
+#endif // _RE2C_IR_DFA_DFA_
-#ifndef _RE2C_DFA_STATE_
-#define _RE2C_DFA_STATE_
+#ifndef _RE2C_IR_DFA_STATE_
+#define _RE2C_IR_DFA_STATE_
#include "src/codegen/go.h"
-#include "src/dfa/action.h"
-#include "src/dfa/re.h"
+#include "src/ir/dfa/action.h"
+#include "src/ir/regexp/regexp.h"
#include "src/util/forbid_copy.h"
namespace re2c
} // namespace re2c
-#endif // _RE2C_DFA_STATE_
+#endif // _RE2C_IR_DFA_STATE_
--- /dev/null
+#include <iostream>
+
+#include "src/ir/regexp/regexp.h"
+#include "src/ir/regexp/regexp_alt.h"
+#include "src/ir/regexp/regexp_cat.h"
+#include "src/ir/regexp/regexp_close.h"
+#include "src/ir/regexp/regexp_closev.h"
+#include "src/ir/regexp/regexp_match.h"
+#include "src/ir/regexp/regexp_null.h"
+#include "src/ir/regexp/regexp_rule.h"
+
+namespace re2c
+{
+
+std::ostream & operator << (std::ostream & o, const RegExp & re)
+{
+ re.display (o);
+ return o;
+}
+
+void AltOp::display (std::ostream & o) const
+{
+ o << exp1 << "|" << exp2;
+}
+
+void CatOp::display (std::ostream & o) const
+{
+ o << exp1 << exp2;
+}
+
+void CloseOp::display (std::ostream & o) const
+{
+ o << exp << "+";
+}
+
+void CloseVOp::display (std::ostream & o) const
+{
+ o << exp << "+";
+}
+
+void MatchOp::display (std::ostream & o) const
+{
+ o << match;
+}
+
+void NullOp::display (std::ostream & o) const
+{
+ o << "_";
+}
+
+void RuleOp::display (std::ostream & o) const
+{
+ o << exp << "/" << ctx << ";";
+}
+
+} // end namespace re2c
+
-#include "src/dfa/encoding/enc.h"
+#include "src/ir/regexp/encoding/enc.h"
namespace re2c {
-#ifndef _RE2C_DFA_ENCODING_ENC_
-#define _RE2C_DFA_ENCODING_ENC_
+#ifndef _RE2C_IR_REGEXP_ENCODING_ENC_
+#define _RE2C_IR_REGEXP_ENCODING_ENC_
#include "src/util/c99_stdint.h"
#include "src/util/range.h"
} // namespace re2c
-#endif // _RE2C_DFA_ENCODING_ENC_
+#endif // _RE2C_IR_REGEXP_ENCODING_ENC_
-#include "src/dfa/encoding/range_suffix.h"
-#include "src/dfa/re.h"
+#include "src/ir/regexp/encoding/range_suffix.h"
+#include "src/ir/regexp/regexp.h"
+#include "src/ir/regexp/regexp_match.h"
namespace re2c {
-#ifndef _RE2C_DFA_ENCODING_RANGE_SUFFIX_
-#define _RE2C_DFA_ENCODING_RANGE_SUFFIX_
+#ifndef _RE2C_IR_REGEXP_ENCODING_RANGE_SUFFIX_
+#define _RE2C_IR_REGEXP_ENCODING_RANGE_SUFFIX_
#include <stddef.h> // NULL
} // namespace re2c
-#endif // _RE2C_DFA_ENCODING_RANGE_SUFFIX_
+#endif // _RE2C_IR_REGEXP_ENCODING_RANGE_SUFFIX_
-#include "src/dfa/encoding/utf16/utf16.h"
+#include "src/ir/regexp/encoding/utf16/utf16.h"
namespace re2c {
-#ifndef _RE2C_DFA_ENCODING_UTF16_UTF16_
-#define _RE2C_DFA_ENCODING_UTF16_UTF16_
+#ifndef _RE2C_IR_REGEXP_ENCODING_UTF16_UTF16_
+#define _RE2C_IR_REGEXP_ENCODING_UTF16_UTF16_
#include "src/util/c99_stdint.h"
} // namespace re2c
-#endif // _RE2C_DFA_ENCODING_UTF16_UTF16_
+#endif // _RE2C_IR_REGEXP_ENCODING_UTF16_UTF16_
-#include "src/dfa/encoding/utf16/utf16_range.h"
-#include "src/dfa/re.h"
+#include "src/ir/regexp/encoding/utf16/utf16_range.h"
+#include "src/ir/regexp/regexp.h"
namespace re2c {
-#ifndef _RE2C_DFA_ENCODING_UTF16_RANGE_
-#define _RE2C_DFA_ENCODING_UTF16_RANGE_
+#ifndef _RE2C_IR_REGEXP_ENCODING_UTF16_RANGE_
+#define _RE2C_IR_REGEXP_ENCODING_UTF16_RANGE_
-#include "src/dfa/encoding/range_suffix.h"
-#include "src/dfa/encoding/utf16/utf16.h"
+#include "src/ir/regexp/encoding/range_suffix.h"
+#include "src/ir/regexp/encoding/utf16/utf16.h"
namespace re2c {
} // namespace re2c
-#endif // _RE2C_DFA_ENCODING_UTF16_RANGE_
+#endif // _RE2C_IR_REGEXP_ENCODING_UTF16_RANGE_
-#include "src/dfa/encoding/utf16/utf16_range.h"
-#include "src/dfa/encoding/utf16/utf16_regexp.h"
-#include "src/dfa/re.h"
+#include "src/ir/regexp/encoding/utf16/utf16_range.h"
+#include "src/ir/regexp/encoding/utf16/utf16_regexp.h"
+#include "src/ir/regexp/regexp.h"
+#include "src/ir/regexp/regexp_cat.h"
+#include "src/ir/regexp/regexp_match.h"
namespace re2c {
--- /dev/null
+#ifndef _RE2C_IR_REGEXP_ENCODING_UTF16_REGEXP_
+#define _RE2C_IR_REGEXP_ENCODING_UTF16_REGEXP_
+
+#include "src/ir/regexp/encoding/utf16/utf16.h"
+#include "src/util/range.h"
+
+namespace re2c {
+
+class RegExp; // forward
+
+RegExp * UTF16Symbol(utf16::rune r);
+RegExp * UTF16Range(const Range * r);
+
+} // namespace re2c
+
+#endif // _RE2C_IR_REGEXP_ENCODING_UTF16_REGEXP_
-#include "src/dfa/encoding/utf8/utf8.h"
+#include "src/ir/regexp/encoding/utf8/utf8.h"
namespace re2c {
-#ifndef _RE2C_DFA_ENCODING_UTF8_UTF8_
-#define _RE2C_DFA_ENCODING_UTF8_UTF8_
+#ifndef _RE2C_IR_REGEXP_ENCODING_UTF8_UTF8_
+#define _RE2C_IR_REGEXP_ENCODING_UTF8_UTF8_
#include "src/util/c99_stdint.h"
} // namespace re2c
-#endif // _RE2C_DFA_ENCODING_UTF8_UTF8_
+#endif // _RE2C_IR_REGEXP_ENCODING_UTF8_UTF8_
-#include "src/dfa/encoding/utf8/utf8_range.h"
-#include "src/dfa/re.h"
+#include "src/ir/regexp/encoding/utf8/utf8_range.h"
+#include "src/ir/regexp/regexp.h"
namespace re2c {
-#ifndef _RE2C_DFA_ENCODING_UTF8_RANGE_
-#define _RE2C_DFA_ENCODING_UTF8_RANGE_
+#ifndef _RE2C_IR_REGEXP_ENCODING_UTF8_RANGE_
+#define _RE2C_IR_REGEXP_ENCODING_UTF8_RANGE_
-#include "src/dfa/encoding/range_suffix.h"
-#include "src/dfa/encoding/utf8/utf8.h"
+#include "src/ir/regexp/encoding/range_suffix.h"
+#include "src/ir/regexp/encoding/utf8/utf8.h"
namespace re2c {
} // namespace re2c
-#endif // _RE2C_DFA_ENCODING_UTF8_RANGE_
+#endif // _RE2C_IR_REGEXP_ENCODING_UTF8_RANGE_
-#include "src/dfa/encoding/utf8/utf8_range.h"
-#include "src/dfa/encoding/utf8/utf8_regexp.h"
-#include "src/dfa/re.h"
+#include "src/ir/regexp/encoding/utf8/utf8_range.h"
+#include "src/ir/regexp/encoding/utf8/utf8_regexp.h"
+#include "src/ir/regexp/regexp.h"
+#include "src/ir/regexp/regexp_cat.h"
+#include "src/ir/regexp/regexp_match.h"
namespace re2c {
--- /dev/null
+#ifndef _RE2C_IR_REGEXP_ENCODING_UTF8_REGEXP_
+#define _RE2C_IR_REGEXP_ENCODING_UTF8_REGEXP_
+
+#include "src/ir/regexp/encoding/utf8/utf8.h"
+#include "src/util/range.h"
+
+namespace re2c {
+
+class RegExp; // forward
+
+RegExp * UTF8Symbol(utf8::rune r);
+RegExp * UTF8Range(const Range * r);
+
+} // namespace re2c
+
+#endif // _RE2C_IR_REGEXP_ENCODING_UTF8_REGEXP_
--- /dev/null
+#include "src/ir/regexp/regexp.h"
+#include "src/ir/regexp/regexp_alt.h"
+#include "src/ir/regexp/regexp_cat.h"
+#include "src/ir/regexp/regexp_match.h"
+#include "src/ir/regexp/regexp_null.h"
+
+namespace re2c
+{
+
+uint32_t RegExp::fixedLength ()
+{
+ return ~0u;
+}
+
+uint32_t AltOp::fixedLength ()
+{
+ uint32_t l1 = exp1->fixedLength ();
+ uint32_t l2 = exp1->fixedLength ();
+
+ if (l1 != l2 || l1 == ~0u)
+ {
+ return ~0u;
+ }
+
+ return l1;
+}
+
+uint32_t CatOp::fixedLength ()
+{
+ const uint32_t l1 = exp1->fixedLength ();
+ if (l1 != ~0u)
+ {
+ const uint32_t l2 = exp2->fixedLength ();
+ if (l2 != ~0u)
+ {
+ return l1 + l2;
+ }
+ }
+ return ~0u;
+}
+
+uint32_t MatchOp::fixedLength ()
+{
+ return 1;
+}
+
+uint32_t NullOp::fixedLength ()
+{
+ return 0;
+}
+
+} // end namespace re2c
+
--- /dev/null
+#include "src/ir/regexp/encoding/utf16/utf16_regexp.h"
+#include "src/ir/regexp/encoding/utf8/utf8_regexp.h"
+#include "src/ir/regexp/regexp.h"
+#include "src/ir/regexp/regexp_alt.h"
+#include "src/ir/regexp/regexp_cat.h"
+#include "src/ir/regexp/regexp_match.h"
+#include "src/ir/regexp/regexp_null.h"
+#include "src/parse/scanner.h"
+#include "src/util/range.h"
+
+namespace re2c
+{
+
+static MatchOp * merge (MatchOp * m1, MatchOp * m2);
+
+RegExp * doAlt (RegExp * e1, RegExp * e2)
+{
+ if (!e1)
+ {
+ return e2;
+ }
+ if (!e2)
+ {
+ return e1;
+ }
+ return new AltOp (e1, e2);
+}
+
+RegExp * mkAlt (RegExp * e1, RegExp * e2)
+{
+ AltOp * a;
+ MatchOp * m1;
+ MatchOp * m2;
+
+ a = dynamic_cast<AltOp*> (e1);
+ if (a != NULL)
+ {
+ m1 = dynamic_cast<MatchOp*> (a->exp1);
+ if (m1 != NULL)
+ {
+ m1->ins_access = e1->ins_access;
+ a->exp2->ins_access = e1->ins_access;
+ e1 = a->exp2;
+ }
+ }
+ else
+ {
+ m1 = dynamic_cast<MatchOp*> (e1);
+ if (m1 != NULL)
+ {
+ e1 = NULL;
+ }
+ }
+ a = dynamic_cast<AltOp*> (e2);
+ if (a != NULL)
+ {
+ m2 = dynamic_cast<MatchOp*> (a->exp1);
+ if (m2 != NULL)
+ {
+ m2->ins_access = e2->ins_access;
+ a->exp2->ins_access = e2->ins_access;
+ e2 = a->exp2;
+ }
+ }
+ else
+ {
+ m2 = dynamic_cast<MatchOp*> (e2);
+ if (m2 != NULL)
+ {
+ e2 = NULL;
+ }
+ }
+
+ return doAlt (merge (m1, m2), doAlt (e1, e2));
+}
+
+MatchOp * merge (MatchOp * m1, MatchOp * m2)
+{
+ if (!m1)
+ {
+ return m2;
+ }
+ if (!m2)
+ {
+ return m1;
+ }
+ MatchOp * m = new MatchOp (doUnion (m1->match, m2->match));
+ if (m1->ins_access == RegExp::PRIVATE
+ || m2->ins_access == RegExp::PRIVATE)
+ {
+ m->ins_access = RegExp::PRIVATE;
+ }
+ return m;
+}
+
+RegExp * doCat (RegExp * e1, RegExp * e2)
+{
+ if (!e1)
+ {
+ return e2;
+ }
+ if (!e2)
+ {
+ return e1;
+ }
+ return new CatOp (e1, e2);
+}
+
+RegExp * mkDiff (RegExp * e1, RegExp * e2)
+{
+ MatchOp * m1 = dynamic_cast<MatchOp *> (e1);
+ MatchOp * m2 = dynamic_cast<MatchOp *> (e2);
+ if (m1 == NULL || m2 == NULL)
+ {
+ return NULL;
+ }
+ Range * r = doDiff (m1->match, m2->match);
+ return r
+ ? (RegExp *) new MatchOp(r)
+ : (RegExp *) new NullOp;
+}
+
+Range * Scanner::getRange(SubStr &s) const
+{
+ uint32_t lb = unescape(s), ub;
+
+ if (s.len < 2 || *s.str != '-')
+ {
+ ub = lb;
+ }
+ else
+ {
+ s.len--;
+ s.str++;
+ ub = unescape(s);
+ if (ub < lb)
+ {
+ uint32_t tmp = lb;
+ lb = ub;
+ ub = tmp;
+ }
+ }
+
+ Range * r = encoding.encodeRange(lb, ub);
+ if (r == NULL)
+ fatalf("Bad code point range: '0x%X - 0x%X'", lb, ub);
+ return r;
+}
+
+RegExp * Scanner::matchSymbol(uint32_t c) const
+{
+ if (!encoding.encode(c))
+ fatalf("Bad code point: '0x%X'", c);
+
+ if (encoding.is(Enc::UTF16))
+ return UTF16Symbol(c);
+ else if (encoding.is(Enc::UTF8))
+ return UTF8Symbol(c);
+ else
+ return new MatchOp(new Range(c, c + 1));
+}
+
+RegExp * Scanner::strToRE (SubStr & s) const
+{
+ if (s.len == 0)
+ return new NullOp;
+
+ RegExp *re = matchSymbol(unescape(s));
+
+ while (s.len > 0)
+ re = new CatOp(re, matchSymbol(unescape(s)));
+
+ return re;
+}
+
+RegExp * Scanner::strToCaseInsensitiveRE (SubStr & s) const
+{
+ if (s.len == 0)
+ return new NullOp;
+
+ uint32_t c = unescape(s);
+
+ RegExp *re, *reL, *reU;
+
+ if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
+ {
+ reL = matchSymbol(tolower(c));
+ reU = matchSymbol(toupper(c));
+ re = mkAlt(reL, reU);
+ }
+ else
+ {
+ re = matchSymbol(c);
+ }
+
+ while (s.len > 0)
+ {
+ c = unescape(s);
+
+ if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
+ {
+ reL = matchSymbol(tolower(c));
+ reU = matchSymbol(toupper(c));
+ re = new CatOp(re, mkAlt(reL, reU));
+ }
+ else
+ {
+ re = new CatOp(re, matchSymbol(c));
+ }
+ }
+
+ return re;
+}
+
+Range * Scanner::mkRange(SubStr &s) const
+{
+ Range *r = getRange(s);
+ while (s.len > 0)
+ r = doUnion(r, getRange(s));
+
+ return r;
+}
+
+RegExp * Scanner::matchSymbolRange(Range * r) const
+{
+ if (encoding.is(Enc::UTF16))
+ return UTF16Range(r);
+ else if (encoding.is(Enc::UTF8))
+ return UTF8Range(r);
+ else
+ return new MatchOp(r);
+}
+
+RegExp * Scanner::ranToRE (SubStr & s) const
+{
+ s.len -= 2;
+ s.str += 1;
+
+ if (s.len == 0)
+ return new NullOp;
+
+ return matchSymbolRange(mkRange(s));
+}
+
+RegExp * Scanner::invToRE (SubStr & s) const
+{
+ s.len -= 3;
+ s.str += 2;
+
+ Range * full = encoding.fullRange();
+
+ Range * r = s.len == 0
+ ? full
+ : doDiff(full, mkRange (s));
+
+ return matchSymbolRange(r);
+}
+
+RegExp * Scanner::mkDot() const
+{
+ Range * full = encoding.fullRange();
+ uint32_t c = '\n';
+ if (!encoding.encode(c))
+ fatalf("Bad code point: '0x%X'", c);
+ Range * ran = new Range(c, c + 1);
+ Range * inv = doDiff(full, ran);
+
+ return matchSymbolRange(inv);
+}
+
+/*
+ * Create a byte range that includes all possible input characters.
+ * This may include characters, which do not map to any valid symbol
+ * in current encoding. For encodings, which directly map symbols to
+ * input characters (ASCII, EBCDIC, UTF-32), it equals [^]. For other
+ * encodings (UTF-16, UTF-8), [^] and this range are different.
+ *
+ * Also note that default range doesn't respect encoding policy
+ * (the way invalid code points are treated).
+ */
+RegExp * Scanner::mkDefault() const
+{
+ Range * def = new Range(0, encoding.nCodeUnits());
+ return new MatchOp(def);
+}
+
+} // namespace re2c
--- /dev/null
+#ifndef _RE2C_IR_REGEXP_REGEXP_
+#define _RE2C_IR_REGEXP_REGEXP_
+
+#include <iosfwd>
+
+#include "src/ir/bytecode/charset.h"
+#include "src/ir/bytecode/ins.h"
+#include "src/globals.h"
+#include "src/util/forbid_copy.h"
+
+namespace re2c
+{
+
+typedef uint32_t Char;
+
+class RegExp
+{
+public:
+ static free_list <RegExp *> vFreeList;
+
+ uint32_t size;
+ /*
+ * There're several different cases when the same regexp
+ * can be used multiple times:
+ * 1) named definitions, e.g. digit = [0-9];
+ * 2) counted repetition, e.g. "a"{3}, "a"{3,}, "a"{3,5}
+ * 3) multiple DFA's sharing the same regexp , e.g. <c1, c2> "abc" { }
+ * 4) common suffixes, e.g. suffix [\x80-\xBF] in UTF-8 ranges
+ * In cases 1-3, regexp must be recompiled each time it's reused.
+ * In case 4, regexp should be compiled only once, and instructions
+ * should be shared in order to reduce space.
+ *
+ * Note: if regexp must always be recompiled, it doesn't imply that
+ * parts of this regexp must always be recompiled. It only forces
+ * the compilation function to traverse the regexp after compilation
+ * and reset compilation cache for each sub-regexp. E.g., for a regexp
+ * [^]{3} in UTF-8 mode, each of sub-regexps [^] will have common suffix
+ * [\x80-\xBF] factored out, but they won't share instructions.
+ */
+ Ins * ins_cache; /* if non-NULL, points to compiled instructions */
+ enum InsAccess
+ {
+ SHARED,
+ PRIVATE
+ } ins_access;
+
+ inline RegExp ()
+ : size (0)
+ , ins_cache (NULL)
+ , ins_access (SHARED)
+ {
+ vFreeList.insert (this);
+ }
+ inline virtual ~RegExp ()
+ {
+ vFreeList.erase (this);
+ }
+ virtual void split (CharSet &) = 0;
+ virtual void calcSize (Char *) = 0;
+ virtual uint32_t fixedLength ();
+ virtual uint32_t compile (Char *, Ins *) = 0;
+ virtual void decompile () = 0;
+ virtual void display (std::ostream &) const = 0;
+ friend std::ostream & operator << (std::ostream & o, const RegExp & re);
+
+ FORBID_COPY (RegExp);
+};
+
+RegExp * doAlt (RegExp * e1, RegExp * e2);
+RegExp * mkAlt (RegExp * e1, RegExp * e2);
+RegExp * doCat (RegExp * e1, RegExp * e2);
+RegExp * mkDiff (RegExp * e1, RegExp * e2);
+
+} // end namespace re2c
+
+#endif // _RE2C_IR_REGEXP_REGEXP_
--- /dev/null
+#ifndef _RE2C_IR_REGEXP_REGEXP_ALT_
+#define _RE2C_IR_REGEXP_REGEXP_ALT_
+
+#include "src/ir/regexp/regexp.h"
+
+namespace re2c
+{
+
+class AltOp: public RegExp
+{
+ RegExp * exp1;
+ RegExp * exp2;
+
+public:
+ inline AltOp (RegExp * e1, RegExp * e2)
+ : exp1 (e1)
+ , exp2 (e2)
+ {}
+ void split (CharSet &);
+ void calcSize (Char *);
+ uint32_t fixedLength ();
+ uint32_t compile (Char *, Ins *);
+ void decompile ();
+ void display (std::ostream & o) const;
+ friend RegExp * mkAlt (RegExp *, RegExp *);
+
+ FORBID_COPY (AltOp);
+};
+
+} // end namespace re2c
+
+#endif // _RE2C_IR_REGEXP_REGEXP_ALT_
--- /dev/null
+#ifndef _RE2C_IR_REGEXP_REGEXP_CAT_
+#define _RE2C_IR_REGEXP_REGEXP_CAT_
+
+#include "src/ir/regexp/regexp.h"
+
+namespace re2c
+{
+
+class CatOp: public RegExp
+{
+ RegExp * exp1;
+ RegExp * exp2;
+
+public:
+ inline CatOp (RegExp * e1, RegExp * e2)
+ : exp1 (e1)
+ , exp2 (e2)
+ {}
+ void split (CharSet &);
+ void calcSize (Char *);
+ uint32_t fixedLength ();
+ uint32_t compile (Char *, Ins *);
+ void decompile ();
+ void display (std::ostream & o) const;
+
+ FORBID_COPY (CatOp);
+};
+
+} // end namespace re2c
+
+#endif // _RE2C_IR_REGEXP_REGEXP_CAT_
--- /dev/null
+#ifndef _RE2C_IR_REGEXP_REGEXP_CLOSE_
+#define _RE2C_IR_REGEXP_REGEXP_CLOSE_
+
+#include "src/ir/regexp/regexp.h"
+
+namespace re2c
+{
+
+class CloseOp: public RegExp
+{
+ RegExp * exp;
+
+public:
+ inline CloseOp (RegExp * e)
+ : exp (e)
+ {}
+ void split (CharSet &);
+ void calcSize (Char *);
+ uint32_t compile (Char *, Ins *);
+ void decompile ();
+ void display (std::ostream & o) const;
+
+ FORBID_COPY (CloseOp);
+};
+
+} // end namespace re2c
+
+#endif // _RE2C_IR_REGEXP_REGEXP_CLOSE_
--- /dev/null
+#ifndef _RE2C_IR_REGEXP_REGEXP_CLOSEV_
+#define _RE2C_IR_REGEXP_REGEXP_CLOSEV_
+
+#include "src/ir/regexp/regexp.h"
+
+namespace re2c
+{
+
+class CloseVOp: public RegExp
+{
+ RegExp * exp;
+ int32_t min;
+ int32_t max;
+
+public:
+ inline CloseVOp (RegExp * e, int lb, int ub)
+ : exp (e)
+ , min (lb)
+ , max (ub)
+ {
+ exp->ins_access = PRIVATE;
+ }
+ void split (CharSet &);
+ void calcSize (Char *);
+ uint32_t compile (Char *, Ins *);
+ void decompile ();
+ void display (std::ostream & o) const;
+
+ FORBID_COPY (CloseVOp);
+};
+
+} // end namespace re2c
+
+#endif // _RE2C_IR_REGEXP_REGEXP_CLOSEV_
--- /dev/null
+#ifndef _RE2C_IR_REGEXP_REGEXP_MATCH_
+#define _RE2C_IR_REGEXP_REGEXP_MATCH_
+
+#include "src/ir/regexp/regexp.h"
+#include "src/util/range.h"
+
+namespace re2c
+{
+
+class MatchOp: public RegExp
+{
+public:
+ Range * match;
+
+ inline MatchOp (Range * m)
+ : match (m)
+ {}
+ void split (CharSet &);
+ void calcSize (Char *);
+ uint32_t fixedLength ();
+ uint32_t compile (Char *, Ins *);
+ void decompile ();
+ void display (std::ostream & o) const;
+
+ FORBID_COPY (MatchOp);
+};
+
+} // end namespace re2c
+
+#endif // _RE2C_IR_REGEXP_REGEXP_MATCH_
--- /dev/null
+#ifndef _RE2C_IR_REGEXP_REGEXP_NULL_
+#define _RE2C_IR_REGEXP_REGEXP_NULL_
+
+#include "src/ir/regexp/regexp.h"
+
+namespace re2c
+{
+
+class NullOp: public RegExp
+{
+public:
+ void split (CharSet &);
+ void calcSize (Char *);
+ uint32_t fixedLength ();
+ uint32_t compile (Char *, Ins *);
+ void decompile ();
+ void display (std::ostream & o) const;
+};
+
+} // end namespace re2c
+
+#endif // _RE2C_IR_REGEXP_REGEXP_NULL_
--- /dev/null
+#ifndef _RE2C_IR_REGEXP_REGEXP_RULE_
+#define _RE2C_IR_REGEXP_REGEXP_RULE_
+
+#include "src/ir/regexp/regexp.h"
+#include "src/ir/rule_rank.h"
+#include "src/parse/token.h"
+
+namespace re2c
+{
+
+class RuleOp: public RegExp
+{
+ RegExp * exp;
+
+public:
+ RegExp * ctx;
+ Ins * ins;
+ rule_rank_t rank;
+ Token * code;
+ uint32_t line;
+
+ inline RuleOp (RegExp * e, RegExp * c, Token * t, rule_rank_t r, InsAccess access)
+ : exp (e)
+ , ctx (c)
+ , ins (NULL)
+ , rank (r)
+ , code (t)
+ , line (0)
+ {
+ ins_access = access;
+ }
+ inline ~RuleOp ()
+ {
+ delete code;
+ }
+ void display (std::ostream & o) const;
+ void split (CharSet &);
+ void calcSize (Char *);
+ uint32_t compile (Char *, Ins *);
+ void decompile ();
+
+ FORBID_COPY (RuleOp);
+};
+
+} // end namespace re2c
+
+#endif // _RE2C_IR_REGEXP_REGEXP_RULE_
#include <ostream>
-#include "src/dfa/rule_rank.h"
+#include "src/ir/rule_rank.h"
namespace re2c
{
-#ifndef _RE2C_DFA_RULE_RANK_
-#define _RE2C_DFA_RULE_RANK_
+#ifndef _RE2C_IR_RULE_RANK_
+#define _RE2C_IR_RULE_RANK_
#include <iosfwd>
} // namespace re2c
-#endif // _RE2C_DFA_RULE_RANK_
+#endif // _RE2C_IR_RULE_RANK_
#include <string.h>
#include "config.h"
-#include "src/dfa/dfa.h"
-#include "src/dfa/encoding/enc.h"
#include "src/globals.h"
#include "src/mbo_getopt.h"
#include "src/parse/parser.h"
--- /dev/null
+#ifndef _RE2C_PARSE_EXTOP_
+#define _RE2C_PARSE_EXTOP_
+
+#include "src/util/c99_stdint.h"
+
+namespace re2c
+{
+
+struct ExtOp
+{
+ char op;
+ int32_t minsize;
+ int32_t maxsize;
+};
+
+} // end namespace re2c
+
+#endif // _RE2C_PARSE_EXTOP_
#ifndef _RE2C_PARSE_PARSER_
#define _RE2C_PARSE_PARSER_
+#include <list>
#include <string>
#include "src/codegen/output.h"
-#include "src/dfa/re.h"
+#include "src/ir/regexp/regexp.h"
+#include "src/ir/regexp/regexp_rule.h"
#include "src/parse/scanner.h"
namespace re2c
#include <set>
#include "config.h"
-#include "src/dfa/dfa.h"
+#include "src/ir/bytecode/bytecode.h"
+#include "src/ir/regexp/regexp_cat.h"
+#include "src/ir/regexp/regexp_close.h"
+#include "src/ir/regexp/regexp_closev.h"
+#include "src/ir/regexp/regexp_null.h"
#include "src/codegen/emit.h" // genTypes
#include "src/globals.h"
+#include "src/parse/extop.h"
#include "src/parse/parser.h"
#include "src/util/c99_stdint.h"
#include "src/util/smart_ptr.h"
#include <string>
#include "src/codegen/output.h"
-#include "src/dfa/re.h"
+#include "src/ir/regexp/regexp.h"
#include "src/globals.h"
#include "src/parse/input.h"
#include "src/parse/token.h"
#include <iostream>
#include <sstream>
-#include "src/dfa/dfa.h"
+#include "src/ir/dfa/dfa.h"
#include "src/globals.h"
+#include "src/parse/extop.h"
#include "src/parse/parser.h"
#include "src/parse/scanner.h"
#include "y.tab.h"
--- /dev/null
+#include <string.h>
+
+#include "src/parse/scanner.h"
+#include "src/util/substr.h"
+
+namespace re2c {
+
+uint32_t Scanner::unescape(SubStr &s) const
+{
+ static const char * hex = "0123456789abcdef";
+ static const char * oct = "01234567";
+
+ s.len--;
+ uint32_t c, ucb = 0;
+
+ if ((c = *s.str++) != '\\' || s.len == 0)
+ {
+ return c;
+ }
+
+ s.len--;
+
+ switch (c = *s.str++)
+ {
+ case 'n': return '\n';
+ case 't': return '\t';
+ case 'v': return '\v';
+ case 'b': return '\b';
+ case 'r': return '\r';
+ case 'f': return '\f';
+ case 'a': return '\a';
+
+ case 'x':
+ {
+ if (s.len < 2)
+ {
+ fatal(s.ofs()+s.len, "Illegal hexadecimal character code, two hexadecimal digits are required");
+ return ~0u;
+ }
+
+ const char *p1 = strchr(hex, tolower(s.str[0]));
+ const char *p2 = strchr(hex, tolower(s.str[1]));
+
+ if (!p1 || !p2)
+ {
+ fatal(s.ofs()+(p1?1:0), "Illegal hexadecimal character code");
+ return ~0u;
+ }
+ else
+ {
+ s.len -= 2;
+ s.str += 2;
+
+ uint32_t v = (uint32_t)((p1 - hex) << 4)
+ + (uint32_t)((p2 - hex));
+
+ return v;
+ }
+ }
+
+ case 'U':
+ {
+ if (s.len < 8)
+ {
+ fatal(s.ofs()+s.len, "Illegal unicode character, eight hexadecimal digits are required");
+ return ~0u;
+ }
+
+ uint32_t l = 0;
+ if (s.str[0] == '0')
+ {
+ l++;
+ if (s.str[1] == '0')
+ {
+ l++;
+ if (s.str[2] == '0' || (s.str[2] == '1' && encoding.szCodePoint() == 4))
+ {
+ l++;
+ if (encoding.szCodePoint() == 4)
+ {
+ const char *u3 = strchr(hex, tolower(s.str[2]));
+ const char *u4 = strchr(hex, tolower(s.str[3]));
+ if (u3 && u4)
+ {
+ ucb = (uint32_t)((u3 - hex) << 20)
+ + (uint32_t)((u4 - hex) << 16);
+ l++;
+ }
+ }
+ else if (s.str[3] == '0')
+ {
+ l++;
+ }
+ }
+ }
+ }
+
+ if (l != 4)
+ {
+ fatal(s.ofs()+l, "Illegal unicode character, eight hexadecimal digits are required");
+ }
+
+ s.len -= 4;
+ s.str += 4;
+
+ // no break;
+ }
+ case 'X':
+ case 'u':
+ {
+ if (s.len < 4)
+ {
+ fatal(s.ofs()+s.len,
+ c == 'X'
+ ? "Illegal hexadecimal character code, four hexadecimal digits are required"
+ : "Illegal unicode character, four hexadecimal digits are required");
+ return ~0u;
+ }
+
+ const char *p1 = strchr(hex, tolower(s.str[0]));
+ const char *p2 = strchr(hex, tolower(s.str[1]));
+ const char *p3 = strchr(hex, tolower(s.str[2]));
+ const char *p4 = strchr(hex, tolower(s.str[3]));
+
+ if (!p1 || !p2 || !p3 || !p4)
+ {
+ fatal(s.ofs()+(p1?1:0)+(p2?1:0)+(p3?1:0),
+ c == 'X'
+ ? "Illegal hexadecimal character code, non hexxdecimal digit found"
+ : "Illegal unicode character, non hexadecimal digit found");
+ return ~0u;
+ }
+ else
+ {
+ s.len -= 4;
+ s.str += 4;
+
+ uint32_t v = (uint32_t)((p1 - hex) << 12)
+ + (uint32_t)((p2 - hex) << 8)
+ + (uint32_t)((p3 - hex) << 4)
+ + (uint32_t)((p4 - hex))
+ + ucb;
+
+ if (v >= encoding.nCodePoints())
+ {
+ fatal(s.ofs(),
+ c == 'X'
+ ? "Illegal hexadecimal character code, out of range"
+ : "Illegal unicode character, out of range");
+ }
+
+ return v;
+ }
+ }
+
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ {
+ fatal(s.ofs()-1, "Illegal octal character code, first digit must be 0 thru 3");
+ return ~0u;
+ }
+
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ {
+ if (s.len < 2)
+ {
+ fatal(s.ofs()+s.len, "Illegal octal character code, three octal digits are required");
+ return ~0u;
+ }
+
+ const char *p0 = strchr(oct, c);
+ const char *p1 = strchr(oct, s.str[0]);
+ const char *p2 = strchr(oct, s.str[1]);
+
+ if (!p0 || !p1 || !p2)
+ {
+ fatal(s.ofs()+(p1?1:0), "Illegal octal character code, non octal digit found");
+ return ~0u;
+ }
+ else
+ {
+ s.len -= 2;
+ s.str += 2;
+
+ uint32_t v = (uint32_t)((p0 - oct) << 6) + (uint32_t)((p1 - oct) << 3) + (uint32_t)(p2 - oct);
+
+ return v;
+ }
+ }
+
+ default:
+ return c;
+ }
+}
+
+std::string& Scanner::unescape(SubStr& str_in, std::string& str_out) const
+{
+ str_out.clear();
+
+ while(str_in.len)
+ {
+ uint32_t c = unescape(str_in);
+
+ if (c > 0xFF)
+ {
+ fatal(str_in.ofs(), "Illegal character");
+ }
+
+ str_out += static_cast<char>(c);
+ }
+
+ return str_out;
+}
+
+} // namespace re2c