From 66babb646b50bfcfa98f2873c90398b44936a2ca Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Wed, 10 Jun 2015 17:30:15 +0100 Subject: [PATCH] Restructured sources layout, improved formatting. --- re2c/Makefile.am | 69 +- re2c/bootstrap/scanner_lex.cc | 5 +- re2c/src/codegen/emit.h | 3 +- re2c/src/codegen/emit_action.cc | 4 +- re2c/src/codegen/emit_dfa.cc | 1 - re2c/src/codegen/go_construct.cc | 2 +- re2c/src/codegen/go_destruct.cc | 2 +- re2c/src/codegen/go_emit.cc | 2 +- re2c/src/codegen/go_used_labels.cc | 2 +- re2c/src/codegen/output.h | 2 +- re2c/src/codegen/prepare_dfa.cc | 3 +- re2c/src/codegen/scc.cc | 2 +- re2c/src/codegen/skeleton/path.h | 2 +- re2c/src/codegen/skeleton/skeleton.cc | 1 + re2c/src/codegen/skeleton/skeleton.h | 2 +- re2c/src/dfa/actions.cc | 1073 ----------------- re2c/src/dfa/encoding/utf16/utf16_regexp.h | 16 - re2c/src/dfa/encoding/utf8/utf8_regexp.h | 16 - re2c/src/dfa/ins.h | 55 - re2c/src/dfa/re.h | 310 ----- re2c/src/globals.h | 2 +- re2c/src/ir/bytecode/bytecode.cc | 111 ++ re2c/src/ir/bytecode/bytecode.h | 16 + re2c/src/ir/bytecode/calc_size.cc | 72 ++ re2c/src/ir/bytecode/charset.cc | 32 + re2c/src/ir/bytecode/charset.h | 37 + re2c/src/ir/bytecode/compile.cc | 262 ++++ re2c/src/ir/bytecode/ins.cc | 41 + re2c/src/ir/bytecode/ins.h | 52 + re2c/src/ir/bytecode/split.cc | 82 ++ re2c/src/{ => ir}/dfa/action.h | 6 +- re2c/src/{ => ir}/dfa/dfa.cc | 3 +- re2c/src/{ => ir}/dfa/dfa.h | 13 +- re2c/src/{ => ir}/dfa/state.h | 10 +- re2c/src/ir/regexp/display.cc | 57 + re2c/src/{dfa => ir/regexp}/encoding/enc.cc | 2 +- re2c/src/{dfa => ir/regexp}/encoding/enc.h | 6 +- .../regexp}/encoding/range_suffix.cc | 5 +- .../regexp}/encoding/range_suffix.h | 6 +- .../regexp}/encoding/utf16/utf16.cc | 2 +- .../{dfa => ir/regexp}/encoding/utf16/utf16.h | 6 +- .../regexp}/encoding/utf16/utf16_range.cc | 4 +- .../regexp}/encoding/utf16/utf16_range.h | 10 +- .../regexp}/encoding/utf16/utf16_regexp.cc | 8 +- .../ir/regexp/encoding/utf16/utf16_regexp.h | 16 + .../{dfa => ir/regexp}/encoding/utf8/utf8.cc | 2 +- .../{dfa => ir/regexp}/encoding/utf8/utf8.h | 6 +- .../regexp}/encoding/utf8/utf8_range.cc | 4 +- .../regexp}/encoding/utf8/utf8_range.h | 10 +- .../regexp}/encoding/utf8/utf8_regexp.cc | 8 +- .../src/ir/regexp/encoding/utf8/utf8_regexp.h | 16 + re2c/src/ir/regexp/fixed_length.cc | 53 + re2c/src/ir/regexp/regexp.cc | 287 +++++ re2c/src/ir/regexp/regexp.h | 76 ++ re2c/src/ir/regexp/regexp_alt.h | 32 + re2c/src/ir/regexp/regexp_cat.h | 31 + re2c/src/ir/regexp/regexp_close.h | 28 + re2c/src/ir/regexp/regexp_closev.h | 34 + re2c/src/ir/regexp/regexp_match.h | 30 + re2c/src/ir/regexp/regexp_null.h | 22 + re2c/src/ir/regexp/regexp_rule.h | 47 + re2c/src/{dfa => ir}/rule_rank.cc | 2 +- re2c/src/{dfa => ir}/rule_rank.h | 6 +- re2c/src/main.cc | 2 - re2c/src/parse/extop.h | 18 + re2c/src/parse/parser.h | 4 +- re2c/src/parse/parser.ypp | 7 +- re2c/src/parse/scanner.h | 2 +- re2c/src/parse/scanner_lex.re | 3 +- re2c/src/parse/unescape.cc | 220 ++++ 70 files changed, 1807 insertions(+), 1576 deletions(-) delete mode 100644 re2c/src/dfa/actions.cc delete mode 100644 re2c/src/dfa/encoding/utf16/utf16_regexp.h delete mode 100644 re2c/src/dfa/encoding/utf8/utf8_regexp.h delete mode 100644 re2c/src/dfa/ins.h delete mode 100644 re2c/src/dfa/re.h create mode 100644 re2c/src/ir/bytecode/bytecode.cc create mode 100644 re2c/src/ir/bytecode/bytecode.h create mode 100644 re2c/src/ir/bytecode/calc_size.cc create mode 100644 re2c/src/ir/bytecode/charset.cc create mode 100644 re2c/src/ir/bytecode/charset.h create mode 100644 re2c/src/ir/bytecode/compile.cc create mode 100644 re2c/src/ir/bytecode/ins.cc create mode 100644 re2c/src/ir/bytecode/ins.h create mode 100644 re2c/src/ir/bytecode/split.cc rename re2c/src/{ => ir}/dfa/action.h (93%) rename re2c/src/{ => ir}/dfa/dfa.cc (98%) rename re2c/src/{ => ir}/dfa/dfa.h (80%) rename re2c/src/{ => ir}/dfa/state.h (80%) create mode 100644 re2c/src/ir/regexp/display.cc rename re2c/src/{dfa => ir/regexp}/encoding/enc.cc (99%) rename re2c/src/{dfa => ir/regexp}/encoding/enc.h (97%) rename re2c/src/{dfa => ir/regexp}/encoding/range_suffix.cc (77%) rename re2c/src/{dfa => ir/regexp}/encoding/range_suffix.h (79%) rename re2c/src/{dfa => ir/regexp}/encoding/utf16/utf16.cc (82%) rename re2c/src/{dfa => ir/regexp}/encoding/utf16/utf16.h (82%) rename re2c/src/{dfa => ir/regexp}/encoding/utf16/utf16_range.cc (97%) rename re2c/src/{dfa => ir/regexp}/encoding/utf16/utf16_range.h (63%) rename re2c/src/{dfa => ir/regexp}/encoding/utf16/utf16_regexp.cc (77%) create mode 100644 re2c/src/ir/regexp/encoding/utf16/utf16_regexp.h rename re2c/src/{dfa => ir/regexp}/encoding/utf8/utf8.cc (97%) rename re2c/src/{dfa => ir/regexp}/encoding/utf8/utf8.h (87%) rename re2c/src/{dfa => ir/regexp}/encoding/utf8/utf8_range.cc (97%) rename re2c/src/{dfa => ir/regexp}/encoding/utf8/utf8_range.h (55%) rename re2c/src/{dfa => ir/regexp}/encoding/utf8/utf8_regexp.cc (78%) create mode 100644 re2c/src/ir/regexp/encoding/utf8/utf8_regexp.h create mode 100644 re2c/src/ir/regexp/fixed_length.cc create mode 100644 re2c/src/ir/regexp/regexp.cc create mode 100644 re2c/src/ir/regexp/regexp.h create mode 100644 re2c/src/ir/regexp/regexp_alt.h create mode 100644 re2c/src/ir/regexp/regexp_cat.h create mode 100644 re2c/src/ir/regexp/regexp_close.h create mode 100644 re2c/src/ir/regexp/regexp_closev.h create mode 100644 re2c/src/ir/regexp/regexp_match.h create mode 100644 re2c/src/ir/regexp/regexp_null.h create mode 100644 re2c/src/ir/regexp/regexp_rule.h rename re2c/src/{dfa => ir}/rule_rank.cc (94%) rename re2c/src/{dfa => ir}/rule_rank.h (89%) create mode 100644 re2c/src/parse/extop.h create mode 100644 re2c/src/parse/unescape.cc diff --git a/re2c/Makefile.am b/re2c/Makefile.am index 0f767835..59122534 100644 --- a/re2c/Makefile.am +++ b/re2c/Makefile.am @@ -22,22 +22,32 @@ SRC_HDR = \ src/codegen/scc.h \ src/codegen/skeleton/path.h \ src/codegen/skeleton/skeleton.h \ - src/dfa/action.h \ - src/dfa/encoding/enc.h \ - src/dfa/encoding/range_suffix.h \ - src/dfa/encoding/utf16/utf16.h \ - src/dfa/encoding/utf16/utf16_range.h \ - src/dfa/encoding/utf16/utf16_regexp.h \ - src/dfa/encoding/utf8/utf8.h \ - src/dfa/encoding/utf8/utf8_range.h \ - src/dfa/encoding/utf8/utf8_regexp.h \ - src/dfa/dfa.h \ - src/dfa/ins.h \ - src/dfa/re.h \ - src/dfa/rule_rank.h \ - src/dfa/state.h \ + src/ir/bytecode/charset.h \ + src/ir/bytecode/bytecode.h \ + src/ir/bytecode/ins.h \ + src/ir/dfa/state.h \ + src/ir/dfa/dfa.h \ + src/ir/dfa/action.h \ + src/ir/regexp/regexp_alt.h \ + src/ir/regexp/regexp_match.h \ + src/ir/regexp/encoding/enc.h \ + src/ir/regexp/encoding/range_suffix.h \ + src/ir/regexp/encoding/utf8/utf8.h \ + src/ir/regexp/encoding/utf8/utf8_regexp.h \ + src/ir/regexp/encoding/utf8/utf8_range.h \ + src/ir/regexp/encoding/utf16/utf16_range.h \ + src/ir/regexp/encoding/utf16/utf16_regexp.h \ + src/ir/regexp/encoding/utf16/utf16.h \ + src/ir/regexp/regexp_rule.h \ + src/ir/regexp/regexp_cat.h \ + src/ir/regexp/regexp_closev.h \ + src/ir/regexp/regexp_null.h \ + src/ir/regexp/regexp.h \ + src/ir/regexp/regexp_close.h \ + src/ir/rule_rank.h \ src/globals.h \ src/mbo_getopt.h \ + src/parse/extop.h \ src/parse/input.h \ src/parse/parser.h \ src/parse/scanner.h \ @@ -70,21 +80,30 @@ SRC = \ src/codegen/scc.cc \ src/codegen/skeleton/path.cc \ src/codegen/skeleton/skeleton.cc \ - src/dfa/actions.cc \ - src/dfa/encoding/enc.cc \ - src/dfa/encoding/range_suffix.cc \ - src/dfa/encoding/utf16/utf16.cc \ - src/dfa/encoding/utf16/utf16_range.cc \ - src/dfa/encoding/utf16/utf16_regexp.cc \ - src/dfa/encoding/utf8/utf8.cc \ - src/dfa/encoding/utf8/utf8_range.cc \ - src/dfa/encoding/utf8/utf8_regexp.cc \ - src/dfa/dfa.cc \ - src/dfa/rule_rank.cc \ + src/ir/bytecode/bytecode.cc \ + src/ir/bytecode/ins.cc \ + src/ir/bytecode/charset.cc \ + src/ir/bytecode/split.cc \ + src/ir/bytecode/compile.cc \ + src/ir/bytecode/calc_size.cc \ + src/ir/dfa/dfa.cc \ + src/ir/regexp/display.cc \ + src/ir/regexp/encoding/enc.cc \ + src/ir/regexp/encoding/range_suffix.cc \ + src/ir/regexp/encoding/utf8/utf8_regexp.cc \ + src/ir/regexp/encoding/utf8/utf8_range.cc \ + src/ir/regexp/encoding/utf8/utf8.cc \ + src/ir/regexp/encoding/utf16/utf16_regexp.cc \ + src/ir/regexp/encoding/utf16/utf16.cc \ + src/ir/regexp/encoding/utf16/utf16_range.cc \ + src/ir/regexp/fixed_length.cc \ + src/ir/regexp/regexp.cc \ + src/ir/rule_rank.cc \ src/main.cc \ src/mbo_getopt.cc \ src/parse/input.cc \ src/parse/scanner.cc \ + src/parse/unescape.cc \ src/util/range.cc re2c_SOURCES = \ $(SRC_HDR) \ diff --git a/re2c/bootstrap/scanner_lex.cc b/re2c/bootstrap/scanner_lex.cc index 469393ad..48160f72 100644 --- a/re2c/bootstrap/scanner_lex.cc +++ b/re2c/bootstrap/scanner_lex.cc @@ -1,11 +1,12 @@ -/* Generated by re2c 0.14.1.dev on Thu May 14 17:19:41 2015*/ +/* Generated by re2c 0.14.2 on Wed Jun 10 17:27:57 2015 */ #include #include #include #include -#include "src/dfa/dfa.h" +#include "src/ir/dfa/dfa.h" #include "src/globals.h" +#include "src/parse/extop.h" #include "src/parse/parser.h" #include "src/parse/scanner.h" #include "y.tab.h" diff --git a/re2c/src/codegen/emit.h b/re2c/src/codegen/emit.h index c616175e..5c78ef18 100644 --- a/re2c/src/codegen/emit.h +++ b/re2c/src/codegen/emit.h @@ -2,11 +2,10 @@ #define _RE2C_CODEGEN_EMIT_ #include "src/codegen/output.h" -#include "src/dfa/dfa.h" +#include "src/ir/dfa/dfa.h" namespace re2c { -class DFA; typedef std::vector RegExpIndices; void emit_action diff --git a/re2c/src/codegen/emit_action.cc b/re2c/src/codegen/emit_action.cc index 87f33ad8..e5d75afd 100644 --- a/re2c/src/codegen/emit_action.cc +++ b/re2c/src/codegen/emit_action.cc @@ -1,8 +1,8 @@ #include "src/codegen/emit.h" #include "src/codegen/indent.h" #include "src/codegen/input_api.h" -#include "src/codegen/skeleton/skeleton.h" -#include "src/dfa/action.h" +#include "src/ir/dfa/action.h" +#include "src/ir/regexp/regexp_rule.h" namespace re2c { diff --git a/re2c/src/codegen/emit_dfa.cc b/re2c/src/codegen/emit_dfa.cc index 147ce6da..11653a34 100644 --- a/re2c/src/codegen/emit_dfa.cc +++ b/re2c/src/codegen/emit_dfa.cc @@ -6,7 +6,6 @@ #include "src/codegen/indent.h" #include "src/codegen/input_api.h" #include "src/codegen/skeleton/skeleton.h" -#include "src/dfa/dfa.h" namespace re2c { diff --git a/re2c/src/codegen/go_construct.cc b/re2c/src/codegen/go_construct.cc index 321f3344..398f4a80 100644 --- a/re2c/src/codegen/go_construct.cc +++ b/re2c/src/codegen/go_construct.cc @@ -1,6 +1,6 @@ #include "src/codegen/bitmap.h" #include "src/codegen/go.h" -#include "src/dfa/dfa.h" +#include "src/ir/dfa/dfa.h" #include "src/util/allocate.h" namespace re2c diff --git a/re2c/src/codegen/go_destruct.cc b/re2c/src/codegen/go_destruct.cc index e53cedfb..041945d8 100644 --- a/re2c/src/codegen/go_destruct.cc +++ b/re2c/src/codegen/go_destruct.cc @@ -1,5 +1,5 @@ #include "src/codegen/go.h" -#include "src/dfa/dfa.h" +#include "src/ir/dfa/dfa.h" namespace re2c { diff --git a/re2c/src/codegen/go_emit.cc b/re2c/src/codegen/go_emit.cc index 5da68ad6..9c5d33c6 100644 --- a/re2c/src/codegen/go_emit.cc +++ b/re2c/src/codegen/go_emit.cc @@ -2,7 +2,7 @@ #include "src/codegen/go.h" #include "src/codegen/indent.h" #include "src/codegen/print.h" -#include "src/dfa/dfa.h" +#include "src/ir/dfa/dfa.h" namespace re2c { diff --git a/re2c/src/codegen/go_used_labels.cc b/re2c/src/codegen/go_used_labels.cc index d17ab92e..850f270a 100644 --- a/re2c/src/codegen/go_used_labels.cc +++ b/re2c/src/codegen/go_used_labels.cc @@ -1,5 +1,5 @@ #include "src/codegen/go.h" -#include "src/dfa/dfa.h" +#include "src/ir/dfa/dfa.h" namespace re2c { diff --git a/re2c/src/codegen/output.h b/re2c/src/codegen/output.h index 4daacf25..4890a7f4 100644 --- a/re2c/src/codegen/output.h +++ b/re2c/src/codegen/output.h @@ -7,7 +7,7 @@ #include #include "src/codegen/label.h" -#include "src/dfa/rule_rank.h" +#include "src/ir/rule_rank.h" #include "src/util/c99_stdint.h" #include "src/util/forbid_copy.h" diff --git a/re2c/src/codegen/prepare_dfa.cc b/re2c/src/codegen/prepare_dfa.cc index c5ef6255..57640430 100644 --- a/re2c/src/codegen/prepare_dfa.cc +++ b/re2c/src/codegen/prepare_dfa.cc @@ -2,7 +2,8 @@ #include "src/codegen/bitmap.h" #include "src/codegen/scc.h" -#include "src/dfa/dfa.h" +#include "src/ir/dfa/dfa.h" +#include "src/ir/regexp/regexp_rule.h" #include "src/util/allocate.h" namespace re2c { diff --git a/re2c/src/codegen/scc.cc b/re2c/src/codegen/scc.cc index ba79dbb0..243497a8 100644 --- a/re2c/src/codegen/scc.cc +++ b/re2c/src/codegen/scc.cc @@ -1,5 +1,5 @@ #include "src/codegen/scc.h" -#include "src/dfa/dfa.h" +#include "src/ir/dfa/dfa.h" namespace re2c { diff --git a/re2c/src/codegen/skeleton/path.h b/re2c/src/codegen/skeleton/path.h index 097e2c9f..b24e20a0 100644 --- a/re2c/src/codegen/skeleton/path.h +++ b/re2c/src/codegen/skeleton/path.h @@ -3,7 +3,7 @@ #include -#include "src/dfa/rule_rank.h" +#include "src/ir/rule_rank.h" #include "src/util/c99_stdint.h" namespace re2c diff --git a/re2c/src/codegen/skeleton/skeleton.cc b/re2c/src/codegen/skeleton/skeleton.cc index ca243c86..17275498 100644 --- a/re2c/src/codegen/skeleton/skeleton.cc +++ b/re2c/src/codegen/skeleton/skeleton.cc @@ -1,6 +1,7 @@ #include "src/codegen/indent.h" #include "src/codegen/print.h" #include "src/codegen/skeleton/skeleton.h" +#include "src/ir/regexp/regexp_rule.h" #include "src/util/allocate.h" namespace re2c diff --git a/re2c/src/codegen/skeleton/skeleton.h b/re2c/src/codegen/skeleton/skeleton.h index 41af35d3..aebaee10 100644 --- a/re2c/src/codegen/skeleton/skeleton.h +++ b/re2c/src/codegen/skeleton/skeleton.h @@ -4,7 +4,7 @@ #include #include "src/codegen/skeleton/path.h" -#include "src/dfa/dfa.h" +#include "src/ir/dfa/dfa.h" #include "src/util/c99_stdint.h" #include "src/util/forbid_copy.h" #include "src/util/local_increment.h" diff --git a/re2c/src/dfa/actions.cc b/re2c/src/dfa/actions.cc deleted file mode 100644 index 01a7ee08..00000000 --- a/re2c/src/dfa/actions.cc +++ /dev/null @@ -1,1073 +0,0 @@ -#include -#include -#include -#include -#include - -#include "src/codegen/skeleton/skeleton.h" -#include "src/dfa/dfa.h" -#include "src/dfa/encoding/utf16/utf16_regexp.h" -#include "src/dfa/encoding/utf8/utf8_regexp.h" -#include "src/globals.h" -#include "src/parse/scanner.h" -#include "src/util/allocate.h" - -namespace re2c -{ - -const Ins* showIns(std::ostream &o, const Ins &i, const Ins &base) -{ - o.width(3); - o << &i - &base << ": "; - - const Ins *ret = &(&i)[1]; - - switch (i.i.tag) - { - - case CHAR: - { - o << "match "; - - for (; ret < (Ins*) i.i.link; ++ret) - { - o << "\\x" << std::hex << ret->c.value; - } - - break; - } - - case GOTO: - o << "goto " << ((Ins*) i.i.link - &base); - break; - - case FORK: - o << "fork " << ((Ins*) i.i.link - &base); - break; - - case CTXT: - o << "ctxt"; - break; - - case TERM: - o << "term " << ((RuleOp*) i.i.link)->rank; - break; - } - - o << "\n"; - return ret; -} - -uint32_t RegExp::fixedLength() -{ - return ~0u; -} - -uint32_t compile_goto(Ins *ins, Ins *i) -{ - i->i.tag = GOTO; - i->i.link = ins; - return 1; -} - -void NullOp::calcSize(Char*) -{ - size = 0; -} - -uint32_t NullOp::fixedLength() -{ - return 0; -} - -uint32_t NullOp::compile(Char*, Ins*) -{ - return 0; -} - -void NullOp::decompile() -{ - ; -} - -void NullOp::split(CharSet&) -{ - ; -} - -MatchOp *merge(MatchOp *m1, MatchOp *m2) -{ - if (!m1) - return m2; - - if (!m2) - return m1; - - MatchOp* m = new MatchOp(doUnion(m1->match, m2->match)); - if (m1->ins_access == RegExp::PRIVATE - || m2->ins_access == RegExp::PRIVATE) - m->ins_access = RegExp::PRIVATE; - - return m; -} - -void MatchOp::display(std::ostream &o) const -{ - o << match; -} - -void MatchOp::calcSize(Char *rep) -{ - size = 1; - - for (Range *r = match; r; r = r->next) - for (uint32_t c = r->lb; c < r->ub; ++c) - if (rep[c] == c) - ++size; -} - -uint32_t MatchOp::fixedLength() -{ - return 1; -} - -uint32_t MatchOp::compile(Char *rep, Ins *i) -{ - if (ins_cache) - return compile_goto(ins_cache, i); - else - { - ins_cache = i; - - i->i.tag = CHAR; - i->i.link = &i[size]; - Ins *j = &i[1]; - uint32_t bump = size; - - for (Range *r = match; r; r = r->next) - { - for (uint32_t c = r->lb; c < r->ub; ++c) - { - if (rep[c] == c) - { - j->c.value = c; - j->c.bump = --bump; - j++; - } - } - } - - if (ins_access == PRIVATE) - decompile(); - - return size; - } -} - -void MatchOp::decompile() -{ - ins_cache = NULL; -} - -void MatchOp::split(CharSet &s) -{ - for (Range *r = match; r; r = r->next) - { - for (uint32_t c = r->lb; c < r->ub; ++c) - { - CharPtn *x = s.rep[c], *a = x->nxt; - - if (!a) - { - if (x->card == 1) - continue; - - x->nxt = a = s.freeHead; - - if (!(s.freeHead = s.freeHead->nxt)) - s.freeTail = &s.freeHead; - - a->nxt = NULL; - - x->fix = s.fix; - - s.fix = x; - } - - if (--(x->card) == 0) - { - *s.freeTail = x; - *(s.freeTail = &x->nxt) = NULL; - } - - s.rep[c] = a; - ++(a->card); - } - } - - for (; s.fix; s.fix = s.fix->fix) - if (s.fix->card) - s.fix->nxt = NULL; -} - -RegExp * mkDiff(RegExp *e1, RegExp *e2) -{ - MatchOp * m1 = dynamic_cast(e1); - MatchOp * m2 = dynamic_cast(e2); - if (m1 == NULL || m2 == NULL) - return NULL; - - Range *r = doDiff(m1->match, m2->match); - - return r ? (RegExp*) new MatchOp(r) : (RegExp*) new NullOp; -} - -RegExp *doAlt(RegExp *e1, RegExp *e2) -{ - if (!e1) - return e2; - - if (!e2) - return e1; - - return new AltOp(e1, e2); -} - -RegExp *doCat(RegExp *e1, RegExp *e2) -{ - if (!e1) - return e2; - - if (!e2) - return e1; - - return new CatOp(e1, e2); -} - -RegExp *mkAlt(RegExp *e1, RegExp *e2) -{ - AltOp *a; - MatchOp *m1, *m2; - - a = dynamic_cast(e1); - if (a != NULL) - { - m1 = dynamic_cast(a->exp1); - if (m1 != NULL) - { - m1->ins_access = e1->ins_access; - a->exp2->ins_access = e1->ins_access; - e1 = a->exp2; - } - } - else - { - m1 = dynamic_cast(e1); - if (m1 != NULL) - e1 = NULL; - } - - a = dynamic_cast(e2); - if (a != NULL) - { - m2 = dynamic_cast(a->exp1); - if (m2 != NULL) - { - m2->ins_access = e2->ins_access; - a->exp2->ins_access = e2->ins_access; - e2 = a->exp2; - } - } - else - { - m2 = dynamic_cast(e2); - if (m2 != NULL) - e2 = NULL; - } - - return doAlt(merge(m1, m2), doAlt(e1, e2)); -} - -void AltOp::calcSize(Char *rep) -{ - exp1->calcSize(rep); - exp2->calcSize(rep); - size = exp1->size + exp2->size + 2; -} - -uint32_t AltOp::fixedLength() -{ - uint32_t l1 = exp1->fixedLength(); - uint32_t l2 = exp1->fixedLength(); - - if (l1 != l2 || l1 == ~0u) - return ~0u; - - return l1; -} - -uint32_t AltOp::compile(Char *rep, Ins *i) -{ - if (ins_cache) - return compile_goto(ins_cache, i); - else - { - ins_cache = i; - - i->i.tag = FORK; - const uint32_t sz1 = exp1->compile(rep, &i[1]); - Ins * const j = &i[sz1 + 1]; - i->i.link = &j[1]; - j->i.tag = GOTO; - const uint32_t sz2 = exp2->compile(rep, &j[1]); - j->i.link = &j[sz2 + 1]; - - if (ins_access == PRIVATE) - decompile(); - - return sz1 + sz2 + 2; - } -} - -void AltOp::decompile() -{ - if (ins_cache) - { - exp1->decompile(); - exp2->decompile(); - ins_cache = NULL; - } -} - -void AltOp::split(CharSet &s) -{ - exp1->split(s); - exp2->split(s); -} - -void CatOp::calcSize(Char *rep) -{ - exp1->calcSize(rep); - exp2->calcSize(rep); - size = exp1->size + exp2->size; -} - -uint32_t CatOp::fixedLength() -{ - uint32_t l1, l2; - - if ((l1 = exp1->fixedLength()) != ~0u ) - if ((l2 = exp2->fixedLength()) != ~0u) - return l1 + l2; - - return ~0u; -} - -uint32_t CatOp::compile(Char *rep, Ins *i) -{ - if (ins_cache) - return compile_goto(ins_cache, i); - else - { - ins_cache = i; - - const uint32_t sz1 = exp1->compile(rep, &i[0]); - const uint32_t sz2 = exp2->compile(rep, &i[sz1]); - - if (ins_access == PRIVATE) - decompile(); - - return sz1 + sz2; - } -} - -void CatOp::decompile() -{ - if (ins_cache) - { - exp1->decompile(); - exp2->decompile(); - ins_cache = NULL; - } -} - -void CatOp::split(CharSet &s) -{ - exp1->split(s); - exp2->split(s); -} - -void CloseOp::calcSize(Char *rep) -{ - exp->calcSize(rep); - size = exp->size + 1; -} - -uint32_t CloseOp::compile(Char *rep, Ins *i) -{ - if (ins_cache) - return compile_goto(ins_cache, i); - else - { - ins_cache = i; - - i += exp->compile(rep, &i[0]); - i->i.tag = FORK; - i->i.link = ins_cache; - ++i; - - const uint32_t sz = i - ins_cache; - if (ins_access == PRIVATE) - decompile(); - - return sz; - } -} - -void CloseOp::decompile() -{ - if (ins_cache) - { - exp->decompile(); - ins_cache = NULL; - } -} - -void CloseOp::split(CharSet &s) -{ - exp->split(s); -} - -void CloseVOp::calcSize(Char *rep) -{ - exp->calcSize(rep); - - if (max >= 0) - { - size = (exp->size * min) + ((1 + exp->size) * (max - min)); - } - else - { - size = (exp->size * min) + 1; - } -} - -uint32_t CloseVOp::compile(Char *rep, Ins *i) -{ - if (ins_cache) - return compile_goto(ins_cache, i); - else - { - ins_cache = i; - - for (int st = min; st < max; st++) - { - const uint32_t sz = exp->compile(rep, &i[1]); - i->i.tag = FORK; - i->i.link = ins_cache + (1 + sz) * (max - min); - i += sz + 1; - } - - for (int st = 0; st < min; st++) - { - const uint32_t sz = exp->compile(rep, &i[0]); - i += sz; - if (max < 0 && st == 0) - { - i->i.tag = FORK; - i->i.link = i - sz; - i++; - } - } - - const uint32_t sz = i - ins_cache; - if (ins_access == PRIVATE) - decompile(); - - return sz; - } -} - -void CloseVOp::decompile() -{ - if (ins_cache) - { - exp->decompile(); - ins_cache = NULL; - } -} - -void CloseVOp::split(CharSet &s) -{ - exp->split(s); -} - -RegExp *expr(Scanner &); - -uint32_t Scanner::unescape(SubStr &s) const -{ - static const char * hex = "0123456789abcdef"; - static const char * oct = "01234567"; - - s.len--; - uint32_t c, ucb = 0; - - if ((c = *s.str++) != '\\' || s.len == 0) - { - return c; - } - - s.len--; - - switch (c = *s.str++) - { - case 'n': return '\n'; - case 't': return '\t'; - case 'v': return '\v'; - case 'b': return '\b'; - case 'r': return '\r'; - case 'f': return '\f'; - case 'a': return '\a'; - - case 'x': - { - if (s.len < 2) - { - fatal(s.ofs()+s.len, "Illegal hexadecimal character code, two hexadecimal digits are required"); - return ~0u; - } - - const char *p1 = strchr(hex, tolower(s.str[0])); - const char *p2 = strchr(hex, tolower(s.str[1])); - - if (!p1 || !p2) - { - fatal(s.ofs()+(p1?1:0), "Illegal hexadecimal character code"); - return ~0u; - } - else - { - s.len -= 2; - s.str += 2; - - uint32_t v = (uint32_t)((p1 - hex) << 4) - + (uint32_t)((p2 - hex)); - - return v; - } - } - - case 'U': - { - if (s.len < 8) - { - fatal(s.ofs()+s.len, "Illegal unicode character, eight hexadecimal digits are required"); - return ~0u; - } - - uint32_t l = 0; - if (s.str[0] == '0') - { - l++; - if (s.str[1] == '0') - { - l++; - if (s.str[2] == '0' || (s.str[2] == '1' && encoding.szCodePoint() == 4)) - { - l++; - if (encoding.szCodePoint() == 4) - { - const char *u3 = strchr(hex, tolower(s.str[2])); - const char *u4 = strchr(hex, tolower(s.str[3])); - if (u3 && u4) - { - ucb = (uint32_t)((u3 - hex) << 20) - + (uint32_t)((u4 - hex) << 16); - l++; - } - } - else if (s.str[3] == '0') - { - l++; - } - } - } - } - - if (l != 4) - { - fatal(s.ofs()+l, "Illegal unicode character, eight hexadecimal digits are required"); - } - - s.len -= 4; - s.str += 4; - - // no break; - } - case 'X': - case 'u': - { - if (s.len < 4) - { - fatal(s.ofs()+s.len, - c == 'X' - ? "Illegal hexadecimal character code, four hexadecimal digits are required" - : "Illegal unicode character, four hexadecimal digits are required"); - return ~0u; - } - - const char *p1 = strchr(hex, tolower(s.str[0])); - const char *p2 = strchr(hex, tolower(s.str[1])); - const char *p3 = strchr(hex, tolower(s.str[2])); - const char *p4 = strchr(hex, tolower(s.str[3])); - - if (!p1 || !p2 || !p3 || !p4) - { - fatal(s.ofs()+(p1?1:0)+(p2?1:0)+(p3?1:0), - c == 'X' - ? "Illegal hexadecimal character code, non hexxdecimal digit found" - : "Illegal unicode character, non hexadecimal digit found"); - return ~0u; - } - else - { - s.len -= 4; - s.str += 4; - - uint32_t v = (uint32_t)((p1 - hex) << 12) - + (uint32_t)((p2 - hex) << 8) - + (uint32_t)((p3 - hex) << 4) - + (uint32_t)((p4 - hex)) - + ucb; - - if (v >= encoding.nCodePoints()) - { - fatal(s.ofs(), - c == 'X' - ? "Illegal hexadecimal character code, out of range" - : "Illegal unicode character, out of range"); - } - - return v; - } - } - - case '4': - case '5': - case '6': - case '7': - { - fatal(s.ofs()-1, "Illegal octal character code, first digit must be 0 thru 3"); - return ~0u; - } - - case '0': - case '1': - case '2': - case '3': - { - if (s.len < 2) - { - fatal(s.ofs()+s.len, "Illegal octal character code, three octal digits are required"); - return ~0u; - } - - const char *p0 = strchr(oct, c); - const char *p1 = strchr(oct, s.str[0]); - const char *p2 = strchr(oct, s.str[1]); - - if (!p0 || !p1 || !p2) - { - fatal(s.ofs()+(p1?1:0), "Illegal octal character code, non octal digit found"); - return ~0u; - } - else - { - s.len -= 2; - s.str += 2; - - uint32_t v = (uint32_t)((p0 - oct) << 6) + (uint32_t)((p1 - oct) << 3) + (uint32_t)(p2 - oct); - - return v; - } - } - - default: - return c; - } -} - -std::string& Scanner::unescape(SubStr& str_in, std::string& str_out) const -{ - str_out.clear(); - - while(str_in.len) - { - uint32_t c = unescape(str_in); - - if (c > 0xFF) - { - fatal(str_in.ofs(), "Illegal character"); - } - - str_out += static_cast(c); - } - - return str_out; -} - -Range * Scanner::getRange(SubStr &s) const -{ - uint32_t lb = unescape(s), ub; - - if (s.len < 2 || *s.str != '-') - { - ub = lb; - } - else - { - s.len--; - s.str++; - ub = unescape(s); - if (ub < lb) - { - uint32_t tmp = lb; - lb = ub; - ub = tmp; - } - } - - Range * r = encoding.encodeRange(lb, ub); - if (r == NULL) - fatalf("Bad code point range: '0x%X - 0x%X'", lb, ub); - return r; -} - -RegExp * Scanner::matchSymbol(uint32_t c) const -{ - if (!encoding.encode(c)) - fatalf("Bad code point: '0x%X'", c); - - if (encoding.is(Enc::UTF16)) - return UTF16Symbol(c); - else if (encoding.is(Enc::UTF8)) - return UTF8Symbol(c); - else - return new MatchOp(new Range(c, c + 1)); -} - -RegExp * Scanner::strToRE (SubStr & s) const -{ - if (s.len == 0) - return new NullOp; - - RegExp *re = matchSymbol(unescape(s)); - - while (s.len > 0) - re = new CatOp(re, matchSymbol(unescape(s))); - - return re; -} - -RegExp * Scanner::strToCaseInsensitiveRE (SubStr & s) const -{ - if (s.len == 0) - return new NullOp; - - uint32_t c = unescape(s); - - RegExp *re, *reL, *reU; - - if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) - { - reL = matchSymbol(tolower(c)); - reU = matchSymbol(toupper(c)); - re = mkAlt(reL, reU); - } - else - { - re = matchSymbol(c); - } - - while (s.len > 0) - { - c = unescape(s); - - if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) - { - reL = matchSymbol(tolower(c)); - reU = matchSymbol(toupper(c)); - re = new CatOp(re, mkAlt(reL, reU)); - } - else - { - re = new CatOp(re, matchSymbol(c)); - } - } - - return re; -} - -Range * Scanner::mkRange(SubStr &s) const -{ - Range *r = getRange(s); - while (s.len > 0) - r = doUnion(r, getRange(s)); - - return r; -} - -RegExp * Scanner::matchSymbolRange(Range * r) const -{ - if (encoding.is(Enc::UTF16)) - return UTF16Range(r); - else if (encoding.is(Enc::UTF8)) - return UTF8Range(r); - else - return new MatchOp(r); -} - -RegExp * Scanner::ranToRE (SubStr & s) const -{ - s.len -= 2; - s.str += 1; - - if (s.len == 0) - return new NullOp; - - return matchSymbolRange(mkRange(s)); -} - -RegExp * Scanner::invToRE (SubStr & s) const -{ - s.len -= 3; - s.str += 2; - - Range * full = encoding.fullRange(); - - Range * r = s.len == 0 - ? full - : doDiff(full, mkRange (s)); - - return matchSymbolRange(r); -} - -RegExp * Scanner::mkDot() const -{ - Range * full = encoding.fullRange(); - uint32_t c = '\n'; - if (!encoding.encode(c)) - fatalf("Bad code point: '0x%X'", c); - Range * ran = new Range(c, c + 1); - Range * inv = doDiff(full, ran); - - return matchSymbolRange(inv); -} - -/* - * Create a byte range that includes all possible input characters. - * This may include characters, which do not map to any valid symbol - * in current encoding. For encodings, which directly map symbols to - * input characters (ASCII, EBCDIC, UTF-32), it equals [^]. For other - * encodings (UTF-16, UTF-8), [^] and this range are different. - * - * Also note that default range doesn't respect encoding policy - * (the way invalid code points are treated). - */ -RegExp * Scanner::mkDefault() const -{ - Range * def = new Range(0, encoding.nCodeUnits()); - return new MatchOp(def); -} - -RuleOp::RuleOp(RegExp *e, RegExp *c, Token *t, rule_rank_t r, InsAccess access) - : exp(e) - , ctx(c) - , ins(NULL) - , rank(r) - , code(t) - , line(0) -{ - ins_access = access; -} - -void RuleOp::calcSize(Char *rep) -{ - exp->calcSize(rep); - ctx->calcSize(rep); - size = exp->size + (ctx->size ? ctx->size + 2 : 1); -} - -uint32_t RuleOp::compile(Char *rep, Ins *i) -{ - if (ins_cache) - return compile_goto(ins_cache, i); - else - { - ins_cache = i; - - i += exp->compile(rep, &i[0]); - if (ctx->size) - { - i->i.tag = CTXT; - i->i.link = &i[1]; - ++i; - i += ctx->compile(rep, &i[0]); - } - i->i.tag = TERM; - i->i.link = this; - ++i; - - const uint32_t sz = i - ins_cache; - if (ins_access == PRIVATE) - decompile(); - - return sz; - } -} - -void RuleOp::decompile() -{ - if (ins_cache) - { - exp->decompile(); - ctx->decompile(); - ins_cache = NULL; - } -} - -void RuleOp::split(CharSet &s) -{ - exp->split(s); - ctx->split(s); -} - -void optimize(Ins *i) -{ - while (!isMarked(i)) - { - mark(i); - - if (i->i.tag == CHAR) - { - i = (Ins*) i->i.link; - } - else if (i->i.tag == GOTO || i->i.tag == FORK) - { - Ins *target = (Ins*) i->i.link; - optimize(target); - - if (target->i.tag == GOTO) - i->i.link = target->i.link == target ? i : target; - - if (i->i.tag == FORK) - { - Ins *follow = (Ins*) & i[1]; - optimize(follow); - - if (follow->i.tag == GOTO && follow->i.link == follow) - { - i->i.tag = GOTO; - } - else if (i->i.link == i) - { - i->i.tag = GOTO; - i->i.link = follow; - } - } - - return ; - } - else - { - ++i; - } - } -} - -CharSet::CharSet() - : fix(0) - , freeHead(0) - , freeTail(0) - , rep(allocate (encoding.nCodeUnits())) - , ptn(allocate (encoding.nCodeUnits())) -{ - for (uint32_t j = 0; j < encoding.nCodeUnits(); ++j) - { - rep[j] = &ptn[0]; - ptn[j].nxt = &ptn[j + 1]; /* wrong for j=encoding.nCodeUnits() - 1 but will be corrected below */ - ptn[j].card = 0; - } - - freeHead = &ptn[1]; - *(freeTail = &ptn[encoding.nCodeUnits() - 1].nxt) = NULL; - ptn[0].card = encoding.nCodeUnits(); - ptn[0].nxt = NULL; -} - -CharSet::~CharSet() -{ - operator delete (rep); - operator delete (ptn); -} - -smart_ptr genCode(RegExp *re, Output & output, uint32_t ind) -{ - CharSet cs; - re->split(cs); - - Char *rep = new Char[encoding.nCodeUnits()]; - - for (uint32_t j = 0; j < encoding.nCodeUnits(); ++j) - { - if (!cs.rep[j]->nxt) - cs.rep[j]->nxt = &cs.ptn[j]; - - rep[j] = cs.rep[j]->nxt - &cs.ptn[0]; - } - - re->calcSize(rep); - Ins *ins = new Ins[re->size + 1]; - memset(ins, 0, (re->size + 1)*sizeof(Ins)); - const uint32_t size = re->compile(rep, ins); - Ins *eoi = &ins[size]; - eoi->i.tag = GOTO; - eoi->i.link = eoi; - - optimize(ins); - - /* - for (const Ins *inst = &ins[0]; inst < &ins[size]; ) - { - inst = showIns(std::cout, *inst, ins[0]); - } - */ - - for (uint32_t j = 0; j < size;) - { - unmark(&ins[j]); - - if (ins[j].i.tag == CHAR) - { - j = (Ins*) ins[j].i.link - ins; - } - else - { - j++; - } - } - - smart_ptr dfa = make_smart_ptr(new DFA(ins, size, 0, encoding.nCodeUnits(), rep)); - if (flag_skeleton) - { - skeleton::Skeleton skeleton (*dfa); - skeleton.emit_data (output.data); - skeleton::emit_prolog (output.source, ind, output.data.file_name.c_str ()); - } - dfa->prepare (output.source, output.max_fill); - - return dfa; -} - -} // end namespace re2c - diff --git a/re2c/src/dfa/encoding/utf16/utf16_regexp.h b/re2c/src/dfa/encoding/utf16/utf16_regexp.h deleted file mode 100644 index 14e4fde0..00000000 --- a/re2c/src/dfa/encoding/utf16/utf16_regexp.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef _RE2C_DFA_ENCODING_UTF16_REGEXP_ -#define _RE2C_DFA_ENCODING_UTF16_REGEXP_ - -#include "src/dfa/encoding/utf16/utf16.h" -#include "src/util/range.h" - -namespace re2c { - -class RegExp; // forward - -RegExp * UTF16Symbol(utf16::rune r); -RegExp * UTF16Range(const Range * r); - -} // namespace re2c - -#endif // _RE2C_DFA_ENCODING_UTF16_REGEXP_ diff --git a/re2c/src/dfa/encoding/utf8/utf8_regexp.h b/re2c/src/dfa/encoding/utf8/utf8_regexp.h deleted file mode 100644 index ccfaab0e..00000000 --- a/re2c/src/dfa/encoding/utf8/utf8_regexp.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef _RE2C_DFA_ENCODING_UTF8_REGEXP_ -#define _RE2C_DFA_ENCODING_UTF8_REGEXP_ - -#include "src/dfa/encoding/utf8/utf8.h" -#include "src/util/range.h" - -namespace re2c { - -class RegExp; // forward - -RegExp * UTF8Symbol(utf8::rune r); -RegExp * UTF8Range(const Range * r); - -} // namespace re2c - -#endif // _RE2C_DFA_ENCODING_UTF8_REGEXP_ diff --git a/re2c/src/dfa/ins.h b/re2c/src/dfa/ins.h deleted file mode 100644 index 1e1170c7..00000000 --- a/re2c/src/dfa/ins.h +++ /dev/null @@ -1,55 +0,0 @@ -#ifndef _RE2C_DFA_INS_ -#define _RE2C_DFA_INS_ - -#include "src/util/c99_stdint.h" - -namespace re2c -{ - -typedef uint32_t Char; - -const uint32_t CHAR = 0; -const uint32_t GOTO = 1; -const uint32_t FORK = 2; -const uint32_t TERM = 3; -const uint32_t CTXT = 4; - -union Ins { - - struct - { - uint8_t tag; - uint8_t marked; - void *link; - } - - i; - - struct - { - uint32_t value; - uint32_t bump; - void *link; - } - - c; -}; - -inline bool isMarked(Ins *i) -{ - return i->i.marked != 0; -} - -inline void mark(Ins *i) -{ - i->i.marked = true; -} - -inline void unmark(Ins *i) -{ - i->i.marked = false; -} - -} // end namespace re2c - -#endif // _RE2C_DFA_INS_ diff --git a/re2c/src/dfa/re.h b/re2c/src/dfa/re.h deleted file mode 100644 index efb3c70f..00000000 --- a/re2c/src/dfa/re.h +++ /dev/null @@ -1,310 +0,0 @@ -#ifndef _RE2C_DFA_RE_ -#define _RE2C_DFA_RE_ - -#include -#include -#include -#include -#include -#include - -#include "src/dfa/ins.h" -#include "src/dfa/rule_rank.h" -#include "src/globals.h" -#include "src/parse/token.h" -#include "src/util/range.h" -#include "src/util/forbid_copy.h" -#include "src/util/free_list.h" - -namespace re2c -{ - -typedef struct extop -{ - char op; - int minsize; - int maxsize; -} - -ExtOp; - -struct CharPtn -{ - uint32_t card; - CharPtn *fix; - CharPtn *nxt; - - FORBID_COPY (CharPtn); -}; - -typedef CharPtn *CharPtr; - -struct CharSet -{ - CharSet(); - ~CharSet(); - - CharPtn *fix; - CharPtn *freeHead, **freeTail; - CharPtr *rep; - CharPtn *ptn; - - FORBID_COPY (CharSet); -}; - -class RegExp -{ - -public: - uint32_t size; - - /* - * There're several different cases when the same regexp - * can be used multiple times: - * 1) named definitions, e.g. digit = [0-9]; - * 2) counted repetition, e.g. "a"{3}, "a"{3,}, "a"{3,5} - * 3) multiple DFA's sharing the same regexp , e.g. "abc" { } - * 4) common suffixes, e.g. suffix [\x80-\xBF] in UTF-8 ranges - * In cases 1-3, regexp must be recompiled each time it's reused. - * In case 4, regexp should be compiled only once, and instructions - * should be shared in order to reduce space. - * - * Note: if regexp must always be recompiled, it doesn't imply that - * parts of this regexp must always be recompiled. It only forces - * the compilation function to traverse the regexp after compilation - * and reset compilation cache for each sub-regexp. E.g., for a regexp - * [^]{3} in UTF-8 mode, each of sub-regexps [^] will have common suffix - * [\x80-\xBF] factored out, but they won't share instructions. - */ - Ins* ins_cache; /* if non-NULL, points to compiled instructions */ - enum InsAccess - { SHARED - , PRIVATE - } ins_access; - - static free_list vFreeList; - -public: - RegExp() - : size(0) - , ins_cache(NULL) - , ins_access(SHARED) - { - vFreeList.insert(this); - } - - virtual ~RegExp() - { - vFreeList.erase(this); - } - - virtual void split(CharSet&) = 0; - virtual void calcSize(Char*) = 0; - virtual uint32_t fixedLength(); - virtual uint32_t compile(Char*, Ins*) = 0; - virtual void decompile() = 0; - virtual void display(std::ostream&) const = 0; - friend std::ostream& operator<<(std::ostream&, const RegExp&); - friend std::ostream& operator<<(std::ostream&, const RegExp*); - - FORBID_COPY (RegExp); -}; - -inline std::ostream& operator<<(std::ostream &o, const RegExp &re) -{ - re.display(o); - return o; -} - -inline std::ostream& operator<<(std::ostream &o, const RegExp *re) -{ - return o << *re; -} - -class NullOp: public RegExp -{ - -public: - void split(CharSet&); - void calcSize(Char*); - uint32_t fixedLength(); - uint32_t compile(Char*, Ins*); - void decompile(); - void display(std::ostream &o) const - { - o << "_"; - } -}; - -class MatchOp: public RegExp -{ - -public: - Range *match; - -public: - MatchOp(Range *m) : match(m) - { - } - - void split(CharSet&); - void calcSize(Char*); - uint32_t fixedLength(); - uint32_t compile(Char*, Ins*); - void decompile(); - void display(std::ostream&) const; - - FORBID_COPY (MatchOp); -}; - -class RuleOp: public RegExp -{ -private: - RegExp *exp; - -public: - RegExp *ctx; - Ins *ins; - rule_rank_t rank; - Token *code; - uint32_t line; - -public: - RuleOp(RegExp*, RegExp*, Token*, rule_rank_t, InsAccess); - - ~RuleOp() - { - delete code; - } - - void split(CharSet&); - void calcSize(Char*); - uint32_t compile(Char*, Ins*); - void decompile(); - void display(std::ostream &o) const - { - o << exp << "/" << ctx << ";"; - } - - FORBID_COPY (RuleOp); -}; - -RegExp *doAlt(RegExp*, RegExp*); -RegExp *mkAlt(RegExp*, RegExp*); - -class AltOp: public RegExp -{ - -private: - RegExp *exp1, *exp2; - -public: - AltOp(RegExp *e1, RegExp *e2) - : exp1(e1) - , exp2(e2) - { - } - - void split(CharSet&); - void calcSize(Char*); - uint32_t fixedLength(); - uint32_t compile(Char*, Ins*); - void decompile(); - void display(std::ostream &o) const - { - o << exp1 << "|" << exp2; - } - - friend RegExp *mkAlt(RegExp*, RegExp*); - - FORBID_COPY (AltOp); -}; - -RegExp *doCat(RegExp*, RegExp*); -RegExp *mkCat(RegExp*, RegExp*); - -class CatOp: public RegExp -{ - -private: - RegExp *exp1, *exp2; - -public: - CatOp(RegExp *e1, RegExp *e2) - : exp1(e1) - , exp2(e2) - { - } - - void split(CharSet&); - void calcSize(Char*); - uint32_t fixedLength(); - uint32_t compile(Char*, Ins*); - void decompile(); - void display(std::ostream &o) const - { - o << exp1 << exp2; - } - - FORBID_COPY (CatOp); -}; - -class CloseOp: public RegExp -{ - -private: - RegExp *exp; - -public: - CloseOp(RegExp *e) - : exp(e) - { - } - - void split(CharSet&); - void calcSize(Char*); - uint32_t compile(Char*, Ins*); - void decompile(); - void display(std::ostream &o) const - { - o << exp << "+"; - } - - FORBID_COPY (CloseOp); -}; - -class CloseVOp: public RegExp -{ - -private: - RegExp *exp; - int min; - int max; - -public: - CloseVOp(RegExp *e, int lb, int ub) - : exp(e) - , min(lb) - , max(ub) - { - exp->ins_access = PRIVATE; - } - - void split(CharSet&); - void calcSize(Char*); - uint32_t compile(Char*, Ins*); - void decompile(); - void display(std::ostream &o) const - { - o << exp << "+"; - } - - FORBID_COPY (CloseVOp); -}; - -extern RegExp *mkDiff(RegExp*, RegExp*); -extern RegExp *mkAlt(RegExp*, RegExp*); - -} // end namespace re2c - -#endif // _RE2C_DFA_RE_ diff --git a/re2c/src/globals.h b/re2c/src/globals.h index e77bcf8d..9f2549ad 100644 --- a/re2c/src/globals.h +++ b/re2c/src/globals.h @@ -5,7 +5,7 @@ #include "src/codegen/code_names.h" #include "src/codegen/input_api.h" -#include "src/dfa/encoding/enc.h" +#include "src/ir/regexp/encoding/enc.h" #include "src/util/c99_stdint.h" namespace re2c diff --git a/re2c/src/ir/bytecode/bytecode.cc b/re2c/src/ir/bytecode/bytecode.cc new file mode 100644 index 00000000..0fda50b2 --- /dev/null +++ b/re2c/src/ir/bytecode/bytecode.cc @@ -0,0 +1,111 @@ +#include + +#include "src/codegen/skeleton/skeleton.h" +#include "src/ir/bytecode/bytecode.h" +#include "src/globals.h" + +namespace re2c { + +static void optimize (Ins * i); + +smart_ptr genCode (RegExp *re, Output & output, uint32_t ind) +{ + CharSet cs; + re->split(cs); + + Char *rep = new Char[encoding.nCodeUnits()]; + + for (uint32_t j = 0; j < encoding.nCodeUnits(); ++j) + { + if (!cs.rep[j]->nxt) + cs.rep[j]->nxt = &cs.ptn[j]; + + rep[j] = cs.rep[j]->nxt - &cs.ptn[0]; + } + + re->calcSize(rep); + Ins *ins = new Ins[re->size + 1]; + memset(ins, 0, (re->size + 1)*sizeof(Ins)); + const uint32_t size = re->compile(rep, ins); + Ins *eoi = &ins[size]; + eoi->i.tag = GOTO; + eoi->i.link = eoi; + + optimize(ins); + + /* + for (const Ins *inst = &ins[0]; inst < &ins[size]; ) + { + inst = showIns(std::cout, *inst, ins[0]); + } + */ + + for (uint32_t j = 0; j < size;) + { + unmark(&ins[j]); + + if (ins[j].i.tag == CHAR) + { + j = (Ins*) ins[j].i.link - ins; + } + else + { + j++; + } + } + + smart_ptr dfa = make_smart_ptr(new DFA(ins, size, 0, encoding.nCodeUnits(), rep)); + if (flag_skeleton) + { + skeleton::Skeleton skeleton (*dfa); + skeleton.emit_data (output.data); + skeleton::emit_prolog (output.source, ind, output.data.file_name.c_str ()); + } + dfa->prepare (output.source, output.max_fill); + + return dfa; +} + +void optimize (Ins * i) +{ + while (!isMarked (i)) + { + mark (i); + if (i->i.tag == CHAR) + { + i = (Ins *) i->i.link; + } + else if (i->i.tag == GOTO || i->i.tag == FORK) + { + Ins * target = (Ins *) i->i.link; + optimize (target); + if (target->i.tag == GOTO) + { + i->i.link = target->i.link == target + ? i + : target; + } + if (i->i.tag == FORK) + { + Ins * follow = (Ins *) & i[1]; + optimize (follow); + if (follow->i.tag == GOTO && follow->i.link == follow) + { + i->i.tag = GOTO; + } + else if (i->i.link == i) + { + i->i.tag = GOTO; + i->i.link = follow; + } + } + return; + } + else + { + ++i; + } + } +} + +} // namespace re2c diff --git a/re2c/src/ir/bytecode/bytecode.h b/re2c/src/ir/bytecode/bytecode.h new file mode 100644 index 00000000..50757e12 --- /dev/null +++ b/re2c/src/ir/bytecode/bytecode.h @@ -0,0 +1,16 @@ +#ifndef _RE2C_IR_BYTECODE_BYTECODE_ +#define _RE2C_IR_BYTECODE_BYTECODE_ + +#include "src/codegen/output.h" +#include "src/ir/dfa/dfa.h" +#include "src/ir/regexp/regexp.h" +#include "src/util/smart_ptr.h" + +namespace re2c +{ + +smart_ptr genCode (RegExp * re, Output & output, uint32_t ind); + +} // namespace re2c + +#endif // _RE2C_IR_BYTECODE_BYTECODE_ diff --git a/re2c/src/ir/bytecode/calc_size.cc b/re2c/src/ir/bytecode/calc_size.cc new file mode 100644 index 00000000..fc90e7e8 --- /dev/null +++ b/re2c/src/ir/bytecode/calc_size.cc @@ -0,0 +1,72 @@ +#include "src/ir/regexp/regexp_alt.h" +#include "src/ir/regexp/regexp_cat.h" +#include "src/ir/regexp/regexp_close.h" +#include "src/ir/regexp/regexp_closev.h" +#include "src/ir/regexp/regexp_match.h" +#include "src/ir/regexp/regexp_null.h" +#include "src/ir/regexp/regexp_rule.h" + +namespace re2c +{ + +void AltOp::calcSize (Char * rep) +{ + exp1->calcSize (rep); + exp2->calcSize (rep); + size = exp1->size + exp2->size + 2; +} + +void CatOp::calcSize (Char * rep) +{ + exp1->calcSize (rep); + exp2->calcSize (rep); + size = exp1->size + exp2->size; +} + +void CloseOp::calcSize (Char * rep) +{ + exp->calcSize (rep); + size = exp->size + 1; +} + +void CloseVOp::calcSize (Char * rep) +{ + exp->calcSize (rep); + if (max >= 0) + { + size = (exp->size * min) + ((1 + exp->size) * (max - min)); + } + else + { + size = (exp->size * min) + 1; + } +} + +void MatchOp::calcSize (Char * rep) +{ + size = 1; + for (Range * r = match; r; r = r->next) + { + for (uint32_t c = r->lb; c < r->ub; ++c) + { + if (rep[c] == c) + { + ++size; + } + } + } +} + +void NullOp::calcSize (Char *) +{ + size = 0; +} + +void RuleOp::calcSize (Char * rep) +{ + exp->calcSize (rep); + ctx->calcSize (rep); + size = exp->size + (ctx->size ? ctx->size + 2 : 1); +} + +} // end namespace re2c diff --git a/re2c/src/ir/bytecode/charset.cc b/re2c/src/ir/bytecode/charset.cc new file mode 100644 index 00000000..1543abea --- /dev/null +++ b/re2c/src/ir/bytecode/charset.cc @@ -0,0 +1,32 @@ +#include "src/ir/bytecode/charset.h" +#include "src/globals.h" +#include "src/util/allocate.h" + +namespace re2c { + +CharSet::CharSet () + : fix (0) + , freeHead (0) + , freeTail (0) + , rep (allocate (encoding.nCodeUnits ())) + , ptn (allocate (encoding.nCodeUnits ())) +{ + for (uint32_t j = 0; j < encoding.nCodeUnits (); ++j) + { + rep[j] = &ptn[0]; + ptn[j].nxt = &ptn[j + 1]; /* wrong for j=encoding.nCodeUnits() - 1 but will be corrected below */ + ptn[j].card = 0; + } + freeHead = &ptn[1]; + * (freeTail = &ptn[encoding.nCodeUnits() - 1].nxt) = NULL; + ptn[0].card = encoding.nCodeUnits (); + ptn[0].nxt = NULL; +} + +CharSet::~CharSet () +{ + operator delete (rep); + operator delete (ptn); +} + +} // namespace re2c diff --git a/re2c/src/ir/bytecode/charset.h b/re2c/src/ir/bytecode/charset.h new file mode 100644 index 00000000..af36b4f5 --- /dev/null +++ b/re2c/src/ir/bytecode/charset.h @@ -0,0 +1,37 @@ +#ifndef _RE2C_IR_BYTECODE_CHARSET_ +#define _RE2C_IR_BYTECODE_CHARSET_ + +#include "src/util/c99_stdint.h" +#include "src/util/forbid_copy.h" + +namespace re2c +{ + +struct CharPtn +{ + uint32_t card; + CharPtn * fix; + CharPtn * nxt; + + FORBID_COPY (CharPtn); +}; + +typedef CharPtn * CharPtr; + +struct CharSet +{ + CharPtn * fix; + CharPtn * freeHead; + CharPtn ** freeTail; + CharPtr * rep; + CharPtn * ptn; + + CharSet (); + ~CharSet (); + + FORBID_COPY (CharSet); +}; + +} // namespace re2c + +#endif // _RE2C_IR_BYTECODE_CHARSET_ diff --git a/re2c/src/ir/bytecode/compile.cc b/re2c/src/ir/bytecode/compile.cc new file mode 100644 index 00000000..2f4609bf --- /dev/null +++ b/re2c/src/ir/bytecode/compile.cc @@ -0,0 +1,262 @@ +#include "src/ir/regexp/regexp_alt.h" +#include "src/ir/regexp/regexp_cat.h" +#include "src/ir/regexp/regexp_close.h" +#include "src/ir/regexp/regexp_closev.h" +#include "src/ir/regexp/regexp_match.h" +#include "src/ir/regexp/regexp_null.h" +#include "src/ir/regexp/regexp_rule.h" + +namespace re2c +{ + +static uint32_t compile_goto (Ins * ins, Ins * i); + +uint32_t AltOp::compile (Char * rep, Ins * i) +{ + if (ins_cache) + { + return compile_goto (ins_cache, i); + } + else + { + ins_cache = i; + + i->i.tag = FORK; + const uint32_t sz1 = exp1->compile (rep, &i[1]); + Ins * const j = &i[sz1 + 1]; + i->i.link = &j[1]; + j->i.tag = GOTO; + const uint32_t sz2 = exp2->compile (rep, &j[1]); + j->i.link = &j[sz2 + 1]; + + if (ins_access == PRIVATE) + { + decompile (); + } + + return sz1 + sz2 + 2; + } +} + +void AltOp::decompile () +{ + if (ins_cache) + { + exp1->decompile (); + exp2->decompile (); + ins_cache = NULL; + } +} + +uint32_t CatOp::compile (Char * rep, Ins * i) +{ + if (ins_cache) + { + return compile_goto (ins_cache, i); + } + else + { + ins_cache = i; + + const uint32_t sz1 = exp1->compile (rep, &i[0]); + const uint32_t sz2 = exp2->compile (rep, &i[sz1]); + + if (ins_access == PRIVATE) + { + decompile (); + } + + return sz1 + sz2; + } +} + +void CatOp::decompile () +{ + if (ins_cache) + { + exp1->decompile (); + exp2->decompile (); + ins_cache = NULL; + } +} + +uint32_t CloseOp::compile (Char * rep, Ins * i) +{ + if (ins_cache) + { + return compile_goto (ins_cache, i); + } + else + { + ins_cache = i; + + i += exp->compile (rep, &i[0]); + i->i.tag = FORK; + i->i.link = ins_cache; + ++i; + + const uint32_t sz = i - ins_cache; + if (ins_access == PRIVATE) + { + decompile (); + } + + return sz; + } +} + +void CloseOp::decompile () +{ + if (ins_cache) + { + exp->decompile (); + ins_cache = NULL; + } +} + +uint32_t CloseVOp::compile (Char * rep, Ins * i) +{ + if (ins_cache) + { + return compile_goto (ins_cache, i); + } + else + { + ins_cache = i; + + for (int st = min; st < max; st++) + { + const uint32_t sz = exp->compile (rep, &i[1]); + i->i.tag = FORK; + i->i.link = ins_cache + (1 + sz) * (max - min); + i += sz + 1; + } + for (int st = 0; st < min; st++) + { + const uint32_t sz = exp->compile (rep, &i[0]); + i += sz; + if (max < 0 && st == 0) + { + i->i.tag = FORK; + i->i.link = i - sz; + i++; + } + } + const uint32_t sz = i - ins_cache; + + if (ins_access == PRIVATE) + { + decompile (); + } + + return sz; + } +} + +void CloseVOp::decompile () +{ + if (ins_cache) + { + exp->decompile (); + ins_cache = NULL; + } +} + +uint32_t MatchOp::compile (Char * rep, Ins * i) +{ + if (ins_cache) + { + return compile_goto (ins_cache, i); + } + else + { + ins_cache = i; + + i->i.tag = CHAR; + i->i.link = &i[size]; + Ins *j = &i[1]; + uint32_t bump = size; + for (Range *r = match; r; r = r->next) + { + for (uint32_t c = r->lb; c < r->ub; ++c) + { + if (rep[c] == c) + { + j->c.value = c; + j->c.bump = --bump; + j++; + } + } + } + + if (ins_access == PRIVATE) + { + decompile (); + } + + return size; + } +} + +void MatchOp::decompile () +{ + ins_cache = NULL; +} + +uint32_t NullOp::compile (Char *, Ins *) +{ + return 0; +} + +void NullOp::decompile () {} + +uint32_t RuleOp::compile (Char * rep, Ins * i) +{ + if (ins_cache) + { + return compile_goto (ins_cache, i); + } + else + { + ins_cache = i; + + i += exp->compile (rep, &i[0]); + if (ctx->size) + { + i->i.tag = CTXT; + i->i.link = &i[1]; + ++i; + i += ctx->compile (rep, &i[0]); + } + i->i.tag = TERM; + i->i.link = this; + ++i; + const uint32_t sz = i - ins_cache; + + if (ins_access == PRIVATE) + { + decompile (); + } + + return sz; + } +} + +void RuleOp::decompile () +{ + if (ins_cache) + { + exp->decompile (); + ctx->decompile (); + ins_cache = NULL; + } +} + +uint32_t compile_goto (Ins * ins, Ins * i) +{ + i->i.tag = GOTO; + i->i.link = ins; + return 1; +} + +} // end namespace re2c diff --git a/re2c/src/ir/bytecode/ins.cc b/re2c/src/ir/bytecode/ins.cc new file mode 100644 index 00000000..f0041e51 --- /dev/null +++ b/re2c/src/ir/bytecode/ins.cc @@ -0,0 +1,41 @@ +#include + +#include "src/ir/bytecode/ins.h" +#include "src/ir/regexp/regexp_rule.h" + +namespace re2c { + +const Ins * showIns (std::ostream & o, const Ins & i, const Ins & base) +{ + o.width (3); + o << &i - &base << ": "; + const Ins * ret = &(&i)[1]; + switch (i.i.tag) + { + case CHAR: + { + o << "match "; + for (; ret < (Ins *) i.i.link; ++ret) + { + o << "\\x" << std::hex << ret->c.value; + } + break; + } + case GOTO: + o << "goto " << ((Ins *) i.i.link - &base); + break; + case FORK: + o << "fork " << ((Ins *) i.i.link - &base); + break; + case CTXT: + o << "ctxt"; + break; + case TERM: + o << "term " << ((RuleOp *) i.i.link)->rank; + break; + } + o << "\n"; + return ret; +} + +} // namespace re2c diff --git a/re2c/src/ir/bytecode/ins.h b/re2c/src/ir/bytecode/ins.h new file mode 100644 index 00000000..6cd14d54 --- /dev/null +++ b/re2c/src/ir/bytecode/ins.h @@ -0,0 +1,52 @@ +#ifndef _RE2C_IR_BYTECODE_INS_ +#define _RE2C_IR_BYTECODE_INS_ + +#include + +#include "src/util/c99_stdint.h" + +namespace re2c +{ + +static const uint32_t CHAR = 0; +static const uint32_t GOTO = 1; +static const uint32_t FORK = 2; +static const uint32_t TERM = 3; +static const uint32_t CTXT = 4; + +union Ins +{ + struct + { + uint8_t tag; + uint8_t marked; + void * link; + } i; + struct + { + uint32_t value; + uint32_t bump; + void * link; + } c; +}; + +inline bool isMarked (Ins * i) +{ + return i->i.marked != 0; +} + +inline void mark (Ins * i) +{ + i->i.marked = true; +} + +inline void unmark (Ins * i) +{ + i->i.marked = false; +} + +const Ins * showIns (std::ostream & o, const Ins & i, const Ins & base); + +} // namespace re2c + +#endif // _RE2C_IR_BYTECODE_INS_ diff --git a/re2c/src/ir/bytecode/split.cc b/re2c/src/ir/bytecode/split.cc new file mode 100644 index 00000000..7e0bc6d1 --- /dev/null +++ b/re2c/src/ir/bytecode/split.cc @@ -0,0 +1,82 @@ +#include "src/ir/regexp/regexp_alt.h" +#include "src/ir/regexp/regexp_cat.h" +#include "src/ir/regexp/regexp_close.h" +#include "src/ir/regexp/regexp_closev.h" +#include "src/ir/regexp/regexp_match.h" +#include "src/ir/regexp/regexp_null.h" +#include "src/ir/regexp/regexp_rule.h" + +namespace re2c { + +void AltOp::split (CharSet & s) +{ + exp1->split (s); + exp2->split (s); +} + +void CatOp::split (CharSet & s) +{ + exp1->split (s); + exp2->split (s); +} + +void CloseOp::split (CharSet & s) +{ + exp->split (s); +} + +void CloseVOp::split (CharSet & s) +{ + exp->split (s); +} + +void MatchOp::split (CharSet & s) +{ + for (Range *r = match; r; r = r->next) + { + for (uint32_t c = r->lb; c < r->ub; ++c) + { + CharPtn * x = s.rep[c]; + CharPtn * a = x->nxt; + if (!a) + { + if (x->card == 1) + { + continue; + } + x->nxt = a = s.freeHead; + if (!(s.freeHead = s.freeHead->nxt)) + { + s.freeTail = &s.freeHead; + } + a->nxt = NULL; + x->fix = s.fix; + s.fix = x; + } + if (--(x->card) == 0) + { + *s.freeTail = x; + *(s.freeTail = &x->nxt) = NULL; + } + s.rep[c] = a; + ++(a->card); + } + } + for (; s.fix; s.fix = s.fix->fix) + { + if (s.fix->card) + { + s.fix->nxt = NULL; + } + } +} + +void NullOp::split (CharSet &) {} + +void RuleOp::split (CharSet & s) +{ + exp->split (s); + ctx->split (s); +} + +} // namespace re2c diff --git a/re2c/src/dfa/action.h b/re2c/src/ir/dfa/action.h similarity index 93% rename from re2c/src/dfa/action.h rename to re2c/src/ir/dfa/action.h index a188913d..582d1abf 100644 --- a/re2c/src/dfa/action.h +++ b/re2c/src/ir/dfa/action.h @@ -1,5 +1,5 @@ -#ifndef _RE2C_DFA_ACTION_ -#define _RE2C_DFA_ACTION_ +#ifndef _RE2C_IR_DFA_ACTION_ +#define _RE2C_IR_DFA_ACTION_ #include @@ -106,4 +106,4 @@ private: } // namespace re2c -#endif // _RE2C_DFA_ACTION_ +#endif // _RE2C_IR_DFA_ACTION_ diff --git a/re2c/src/dfa/dfa.cc b/re2c/src/ir/dfa/dfa.cc similarity index 98% rename from re2c/src/dfa/dfa.cc rename to re2c/src/ir/dfa/dfa.cc index 246e2121..2721ea67 100644 --- a/re2c/src/dfa/dfa.cc +++ b/re2c/src/ir/dfa/dfa.cc @@ -1,6 +1,7 @@ #include -#include "src/dfa/dfa.h" +#include "src/ir/dfa/dfa.h" +#include "src/ir/regexp/regexp_rule.h" #include "src/util/allocate.h" namespace re2c diff --git a/re2c/src/dfa/dfa.h b/re2c/src/ir/dfa/dfa.h similarity index 80% rename from re2c/src/dfa/dfa.h rename to re2c/src/ir/dfa/dfa.h index fd00cecf..98229f67 100644 --- a/re2c/src/dfa/dfa.h +++ b/re2c/src/ir/dfa/dfa.h @@ -1,10 +1,9 @@ -#ifndef _RE2C_DFA_DFA_ -#define _RE2C_DFA_DFA_ +#ifndef _RE2C_IR_DFA_DFA_ +#define _RE2C_IR_DFA_DFA_ -#include "src/dfa/action.h" -#include "src/dfa/state.h" +#include "src/ir/dfa/action.h" +#include "src/ir/dfa/state.h" #include "src/util/forbid_copy.h" -#include "src/util/smart_ptr.h" namespace re2c { @@ -44,8 +43,6 @@ public: FORBID_COPY (DFA); }; -smart_ptr genCode (RegExp *, Output &, uint32_t); - } // namespace re2c -#endif // _RE2C_DFA_DFA_ +#endif // _RE2C_IR_DFA_DFA_ diff --git a/re2c/src/dfa/state.h b/re2c/src/ir/dfa/state.h similarity index 80% rename from re2c/src/dfa/state.h rename to re2c/src/ir/dfa/state.h index 72013501..41af7dfd 100644 --- a/re2c/src/dfa/state.h +++ b/re2c/src/ir/dfa/state.h @@ -1,9 +1,9 @@ -#ifndef _RE2C_DFA_STATE_ -#define _RE2C_DFA_STATE_ +#ifndef _RE2C_IR_DFA_STATE_ +#define _RE2C_IR_DFA_STATE_ #include "src/codegen/go.h" -#include "src/dfa/action.h" -#include "src/dfa/re.h" +#include "src/ir/dfa/action.h" +#include "src/ir/regexp/regexp.h" #include "src/util/forbid_copy.h" namespace re2c @@ -49,4 +49,4 @@ public: } // namespace re2c -#endif // _RE2C_DFA_STATE_ +#endif // _RE2C_IR_DFA_STATE_ diff --git a/re2c/src/ir/regexp/display.cc b/re2c/src/ir/regexp/display.cc new file mode 100644 index 00000000..1e12d350 --- /dev/null +++ b/re2c/src/ir/regexp/display.cc @@ -0,0 +1,57 @@ +#include + +#include "src/ir/regexp/regexp.h" +#include "src/ir/regexp/regexp_alt.h" +#include "src/ir/regexp/regexp_cat.h" +#include "src/ir/regexp/regexp_close.h" +#include "src/ir/regexp/regexp_closev.h" +#include "src/ir/regexp/regexp_match.h" +#include "src/ir/regexp/regexp_null.h" +#include "src/ir/regexp/regexp_rule.h" + +namespace re2c +{ + +std::ostream & operator << (std::ostream & o, const RegExp & re) +{ + re.display (o); + return o; +} + +void AltOp::display (std::ostream & o) const +{ + o << exp1 << "|" << exp2; +} + +void CatOp::display (std::ostream & o) const +{ + o << exp1 << exp2; +} + +void CloseOp::display (std::ostream & o) const +{ + o << exp << "+"; +} + +void CloseVOp::display (std::ostream & o) const +{ + o << exp << "+"; +} + +void MatchOp::display (std::ostream & o) const +{ + o << match; +} + +void NullOp::display (std::ostream & o) const +{ + o << "_"; +} + +void RuleOp::display (std::ostream & o) const +{ + o << exp << "/" << ctx << ";"; +} + +} // end namespace re2c + diff --git a/re2c/src/dfa/encoding/enc.cc b/re2c/src/ir/regexp/encoding/enc.cc similarity index 99% rename from re2c/src/dfa/encoding/enc.cc rename to re2c/src/ir/regexp/encoding/enc.cc index 2d775972..9cd8e3b1 100644 --- a/re2c/src/dfa/encoding/enc.cc +++ b/re2c/src/ir/regexp/encoding/enc.cc @@ -1,4 +1,4 @@ -#include "src/dfa/encoding/enc.h" +#include "src/ir/regexp/encoding/enc.h" namespace re2c { diff --git a/re2c/src/dfa/encoding/enc.h b/re2c/src/ir/regexp/encoding/enc.h similarity index 97% rename from re2c/src/dfa/encoding/enc.h rename to re2c/src/ir/regexp/encoding/enc.h index f54997f3..ca5399b4 100644 --- a/re2c/src/dfa/encoding/enc.h +++ b/re2c/src/ir/regexp/encoding/enc.h @@ -1,5 +1,5 @@ -#ifndef _RE2C_DFA_ENCODING_ENC_ -#define _RE2C_DFA_ENCODING_ENC_ +#ifndef _RE2C_IR_REGEXP_ENCODING_ENC_ +#define _RE2C_IR_REGEXP_ENCODING_ENC_ #include "src/util/c99_stdint.h" #include "src/util/range.h" @@ -172,4 +172,4 @@ inline void Enc::setPolicy(policy_t t) } // namespace re2c -#endif // _RE2C_DFA_ENCODING_ENC_ +#endif // _RE2C_IR_REGEXP_ENCODING_ENC_ diff --git a/re2c/src/dfa/encoding/range_suffix.cc b/re2c/src/ir/regexp/encoding/range_suffix.cc similarity index 77% rename from re2c/src/dfa/encoding/range_suffix.cc rename to re2c/src/ir/regexp/encoding/range_suffix.cc index 3c8ccb50..96615927 100644 --- a/re2c/src/dfa/encoding/range_suffix.cc +++ b/re2c/src/ir/regexp/encoding/range_suffix.cc @@ -1,5 +1,6 @@ -#include "src/dfa/encoding/range_suffix.h" -#include "src/dfa/re.h" +#include "src/ir/regexp/encoding/range_suffix.h" +#include "src/ir/regexp/regexp.h" +#include "src/ir/regexp/regexp_match.h" namespace re2c { diff --git a/re2c/src/dfa/encoding/range_suffix.h b/re2c/src/ir/regexp/encoding/range_suffix.h similarity index 79% rename from re2c/src/dfa/encoding/range_suffix.h rename to re2c/src/ir/regexp/encoding/range_suffix.h index 770c0bc6..93008977 100644 --- a/re2c/src/dfa/encoding/range_suffix.h +++ b/re2c/src/ir/regexp/encoding/range_suffix.h @@ -1,5 +1,5 @@ -#ifndef _RE2C_DFA_ENCODING_RANGE_SUFFIX_ -#define _RE2C_DFA_ENCODING_RANGE_SUFFIX_ +#ifndef _RE2C_IR_REGEXP_ENCODING_RANGE_SUFFIX_ +#define _RE2C_IR_REGEXP_ENCODING_RANGE_SUFFIX_ #include // NULL @@ -37,4 +37,4 @@ RegExp * emit(RangeSuffix * p, RegExp * re); } // namespace re2c -#endif // _RE2C_DFA_ENCODING_RANGE_SUFFIX_ +#endif // _RE2C_IR_REGEXP_ENCODING_RANGE_SUFFIX_ diff --git a/re2c/src/dfa/encoding/utf16/utf16.cc b/re2c/src/ir/regexp/encoding/utf16/utf16.cc similarity index 82% rename from re2c/src/dfa/encoding/utf16/utf16.cc rename to re2c/src/ir/regexp/encoding/utf16/utf16.cc index 5b2d8448..47743edf 100644 --- a/re2c/src/dfa/encoding/utf16/utf16.cc +++ b/re2c/src/ir/regexp/encoding/utf16/utf16.cc @@ -1,4 +1,4 @@ -#include "src/dfa/encoding/utf16/utf16.h" +#include "src/ir/regexp/encoding/utf16/utf16.h" namespace re2c { diff --git a/re2c/src/dfa/encoding/utf16/utf16.h b/re2c/src/ir/regexp/encoding/utf16/utf16.h similarity index 82% rename from re2c/src/dfa/encoding/utf16/utf16.h rename to re2c/src/ir/regexp/encoding/utf16/utf16.h index c6389d26..d64de349 100644 --- a/re2c/src/dfa/encoding/utf16/utf16.h +++ b/re2c/src/ir/regexp/encoding/utf16/utf16.h @@ -1,5 +1,5 @@ -#ifndef _RE2C_DFA_ENCODING_UTF16_UTF16_ -#define _RE2C_DFA_ENCODING_UTF16_UTF16_ +#ifndef _RE2C_IR_REGEXP_ENCODING_UTF16_UTF16_ +#define _RE2C_IR_REGEXP_ENCODING_UTF16_UTF16_ #include "src/util/c99_stdint.h" @@ -34,4 +34,4 @@ inline uint16_t utf16::trail_surr(rune r) } // namespace re2c -#endif // _RE2C_DFA_ENCODING_UTF16_UTF16_ +#endif // _RE2C_IR_REGEXP_ENCODING_UTF16_UTF16_ diff --git a/re2c/src/dfa/encoding/utf16/utf16_range.cc b/re2c/src/ir/regexp/encoding/utf16/utf16_range.cc similarity index 97% rename from re2c/src/dfa/encoding/utf16/utf16_range.cc rename to re2c/src/ir/regexp/encoding/utf16/utf16_range.cc index 187f42ff..c4f27f6c 100644 --- a/re2c/src/dfa/encoding/utf16/utf16_range.cc +++ b/re2c/src/ir/regexp/encoding/utf16/utf16_range.cc @@ -1,5 +1,5 @@ -#include "src/dfa/encoding/utf16/utf16_range.h" -#include "src/dfa/re.h" +#include "src/ir/regexp/encoding/utf16/utf16_range.h" +#include "src/ir/regexp/regexp.h" namespace re2c { diff --git a/re2c/src/dfa/encoding/utf16/utf16_range.h b/re2c/src/ir/regexp/encoding/utf16/utf16_range.h similarity index 63% rename from re2c/src/dfa/encoding/utf16/utf16_range.h rename to re2c/src/ir/regexp/encoding/utf16/utf16_range.h index 66cf3b91..4e75560d 100644 --- a/re2c/src/dfa/encoding/utf16/utf16_range.h +++ b/re2c/src/ir/regexp/encoding/utf16/utf16_range.h @@ -1,8 +1,8 @@ -#ifndef _RE2C_DFA_ENCODING_UTF16_RANGE_ -#define _RE2C_DFA_ENCODING_UTF16_RANGE_ +#ifndef _RE2C_IR_REGEXP_ENCODING_UTF16_RANGE_ +#define _RE2C_IR_REGEXP_ENCODING_UTF16_RANGE_ -#include "src/dfa/encoding/range_suffix.h" -#include "src/dfa/encoding/utf16/utf16.h" +#include "src/ir/regexp/encoding/range_suffix.h" +#include "src/ir/regexp/encoding/utf16/utf16.h" namespace re2c { @@ -13,4 +13,4 @@ void UTF16splitByRuneLength(RangeSuffix * & root, utf16::rune l, utf16::rune h); } // namespace re2c -#endif // _RE2C_DFA_ENCODING_UTF16_RANGE_ +#endif // _RE2C_IR_REGEXP_ENCODING_UTF16_RANGE_ diff --git a/re2c/src/dfa/encoding/utf16/utf16_regexp.cc b/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc similarity index 77% rename from re2c/src/dfa/encoding/utf16/utf16_regexp.cc rename to re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc index ee26ce0e..2e7f69df 100644 --- a/re2c/src/dfa/encoding/utf16/utf16_regexp.cc +++ b/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc @@ -1,6 +1,8 @@ -#include "src/dfa/encoding/utf16/utf16_range.h" -#include "src/dfa/encoding/utf16/utf16_regexp.h" -#include "src/dfa/re.h" +#include "src/ir/regexp/encoding/utf16/utf16_range.h" +#include "src/ir/regexp/encoding/utf16/utf16_regexp.h" +#include "src/ir/regexp/regexp.h" +#include "src/ir/regexp/regexp_cat.h" +#include "src/ir/regexp/regexp_match.h" namespace re2c { diff --git a/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.h b/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.h new file mode 100644 index 00000000..af1b7e79 --- /dev/null +++ b/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.h @@ -0,0 +1,16 @@ +#ifndef _RE2C_IR_REGEXP_ENCODING_UTF16_REGEXP_ +#define _RE2C_IR_REGEXP_ENCODING_UTF16_REGEXP_ + +#include "src/ir/regexp/encoding/utf16/utf16.h" +#include "src/util/range.h" + +namespace re2c { + +class RegExp; // forward + +RegExp * UTF16Symbol(utf16::rune r); +RegExp * UTF16Range(const Range * r); + +} // namespace re2c + +#endif // _RE2C_IR_REGEXP_ENCODING_UTF16_REGEXP_ diff --git a/re2c/src/dfa/encoding/utf8/utf8.cc b/re2c/src/ir/regexp/encoding/utf8/utf8.cc similarity index 97% rename from re2c/src/dfa/encoding/utf8/utf8.cc rename to re2c/src/ir/regexp/encoding/utf8/utf8.cc index 70256933..8332e131 100644 --- a/re2c/src/dfa/encoding/utf8/utf8.cc +++ b/re2c/src/ir/regexp/encoding/utf8/utf8.cc @@ -1,4 +1,4 @@ -#include "src/dfa/encoding/utf8/utf8.h" +#include "src/ir/regexp/encoding/utf8/utf8.h" namespace re2c { diff --git a/re2c/src/dfa/encoding/utf8/utf8.h b/re2c/src/ir/regexp/encoding/utf8/utf8.h similarity index 87% rename from re2c/src/dfa/encoding/utf8/utf8.h rename to re2c/src/ir/regexp/encoding/utf8/utf8.h index 9c049a50..72d14cca 100644 --- a/re2c/src/dfa/encoding/utf8/utf8.h +++ b/re2c/src/ir/regexp/encoding/utf8/utf8.h @@ -1,5 +1,5 @@ -#ifndef _RE2C_DFA_ENCODING_UTF8_UTF8_ -#define _RE2C_DFA_ENCODING_UTF8_UTF8_ +#ifndef _RE2C_IR_REGEXP_ENCODING_UTF8_UTF8_ +#define _RE2C_IR_REGEXP_ENCODING_UTF8_UTF8_ #include "src/util/c99_stdint.h" @@ -43,4 +43,4 @@ public: } // namespace re2c -#endif // _RE2C_DFA_ENCODING_UTF8_UTF8_ +#endif // _RE2C_IR_REGEXP_ENCODING_UTF8_UTF8_ diff --git a/re2c/src/dfa/encoding/utf8/utf8_range.cc b/re2c/src/ir/regexp/encoding/utf8/utf8_range.cc similarity index 97% rename from re2c/src/dfa/encoding/utf8/utf8_range.cc rename to re2c/src/ir/regexp/encoding/utf8/utf8_range.cc index c8fd9cb9..466526af 100644 --- a/re2c/src/dfa/encoding/utf8/utf8_range.cc +++ b/re2c/src/ir/regexp/encoding/utf8/utf8_range.cc @@ -1,5 +1,5 @@ -#include "src/dfa/encoding/utf8/utf8_range.h" -#include "src/dfa/re.h" +#include "src/ir/regexp/encoding/utf8/utf8_range.h" +#include "src/ir/regexp/regexp.h" namespace re2c { diff --git a/re2c/src/dfa/encoding/utf8/utf8_range.h b/re2c/src/ir/regexp/encoding/utf8/utf8_range.h similarity index 55% rename from re2c/src/dfa/encoding/utf8/utf8_range.h rename to re2c/src/ir/regexp/encoding/utf8/utf8_range.h index e4f91da3..490e76bc 100644 --- a/re2c/src/dfa/encoding/utf8/utf8_range.h +++ b/re2c/src/ir/regexp/encoding/utf8/utf8_range.h @@ -1,8 +1,8 @@ -#ifndef _RE2C_DFA_ENCODING_UTF8_RANGE_ -#define _RE2C_DFA_ENCODING_UTF8_RANGE_ +#ifndef _RE2C_IR_REGEXP_ENCODING_UTF8_RANGE_ +#define _RE2C_IR_REGEXP_ENCODING_UTF8_RANGE_ -#include "src/dfa/encoding/range_suffix.h" -#include "src/dfa/encoding/utf8/utf8.h" +#include "src/ir/regexp/encoding/range_suffix.h" +#include "src/ir/regexp/encoding/utf8/utf8.h" namespace re2c { @@ -12,4 +12,4 @@ void UTF8splitByRuneLength(RangeSuffix * & p, utf8::rune l, utf8::rune h); } // namespace re2c -#endif // _RE2C_DFA_ENCODING_UTF8_RANGE_ +#endif // _RE2C_IR_REGEXP_ENCODING_UTF8_RANGE_ diff --git a/re2c/src/dfa/encoding/utf8/utf8_regexp.cc b/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc similarity index 78% rename from re2c/src/dfa/encoding/utf8/utf8_regexp.cc rename to re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc index 1b9fb06c..0e6001d3 100644 --- a/re2c/src/dfa/encoding/utf8/utf8_regexp.cc +++ b/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc @@ -1,6 +1,8 @@ -#include "src/dfa/encoding/utf8/utf8_range.h" -#include "src/dfa/encoding/utf8/utf8_regexp.h" -#include "src/dfa/re.h" +#include "src/ir/regexp/encoding/utf8/utf8_range.h" +#include "src/ir/regexp/encoding/utf8/utf8_regexp.h" +#include "src/ir/regexp/regexp.h" +#include "src/ir/regexp/regexp_cat.h" +#include "src/ir/regexp/regexp_match.h" namespace re2c { diff --git a/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.h b/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.h new file mode 100644 index 00000000..434fc320 --- /dev/null +++ b/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.h @@ -0,0 +1,16 @@ +#ifndef _RE2C_IR_REGEXP_ENCODING_UTF8_REGEXP_ +#define _RE2C_IR_REGEXP_ENCODING_UTF8_REGEXP_ + +#include "src/ir/regexp/encoding/utf8/utf8.h" +#include "src/util/range.h" + +namespace re2c { + +class RegExp; // forward + +RegExp * UTF8Symbol(utf8::rune r); +RegExp * UTF8Range(const Range * r); + +} // namespace re2c + +#endif // _RE2C_IR_REGEXP_ENCODING_UTF8_REGEXP_ diff --git a/re2c/src/ir/regexp/fixed_length.cc b/re2c/src/ir/regexp/fixed_length.cc new file mode 100644 index 00000000..7319be8d --- /dev/null +++ b/re2c/src/ir/regexp/fixed_length.cc @@ -0,0 +1,53 @@ +#include "src/ir/regexp/regexp.h" +#include "src/ir/regexp/regexp_alt.h" +#include "src/ir/regexp/regexp_cat.h" +#include "src/ir/regexp/regexp_match.h" +#include "src/ir/regexp/regexp_null.h" + +namespace re2c +{ + +uint32_t RegExp::fixedLength () +{ + return ~0u; +} + +uint32_t AltOp::fixedLength () +{ + uint32_t l1 = exp1->fixedLength (); + uint32_t l2 = exp1->fixedLength (); + + if (l1 != l2 || l1 == ~0u) + { + return ~0u; + } + + return l1; +} + +uint32_t CatOp::fixedLength () +{ + const uint32_t l1 = exp1->fixedLength (); + if (l1 != ~0u) + { + const uint32_t l2 = exp2->fixedLength (); + if (l2 != ~0u) + { + return l1 + l2; + } + } + return ~0u; +} + +uint32_t MatchOp::fixedLength () +{ + return 1; +} + +uint32_t NullOp::fixedLength () +{ + return 0; +} + +} // end namespace re2c + diff --git a/re2c/src/ir/regexp/regexp.cc b/re2c/src/ir/regexp/regexp.cc new file mode 100644 index 00000000..67515cb2 --- /dev/null +++ b/re2c/src/ir/regexp/regexp.cc @@ -0,0 +1,287 @@ +#include "src/ir/regexp/encoding/utf16/utf16_regexp.h" +#include "src/ir/regexp/encoding/utf8/utf8_regexp.h" +#include "src/ir/regexp/regexp.h" +#include "src/ir/regexp/regexp_alt.h" +#include "src/ir/regexp/regexp_cat.h" +#include "src/ir/regexp/regexp_match.h" +#include "src/ir/regexp/regexp_null.h" +#include "src/parse/scanner.h" +#include "src/util/range.h" + +namespace re2c +{ + +static MatchOp * merge (MatchOp * m1, MatchOp * m2); + +RegExp * doAlt (RegExp * e1, RegExp * e2) +{ + if (!e1) + { + return e2; + } + if (!e2) + { + return e1; + } + return new AltOp (e1, e2); +} + +RegExp * mkAlt (RegExp * e1, RegExp * e2) +{ + AltOp * a; + MatchOp * m1; + MatchOp * m2; + + a = dynamic_cast (e1); + if (a != NULL) + { + m1 = dynamic_cast (a->exp1); + if (m1 != NULL) + { + m1->ins_access = e1->ins_access; + a->exp2->ins_access = e1->ins_access; + e1 = a->exp2; + } + } + else + { + m1 = dynamic_cast (e1); + if (m1 != NULL) + { + e1 = NULL; + } + } + a = dynamic_cast (e2); + if (a != NULL) + { + m2 = dynamic_cast (a->exp1); + if (m2 != NULL) + { + m2->ins_access = e2->ins_access; + a->exp2->ins_access = e2->ins_access; + e2 = a->exp2; + } + } + else + { + m2 = dynamic_cast (e2); + if (m2 != NULL) + { + e2 = NULL; + } + } + + return doAlt (merge (m1, m2), doAlt (e1, e2)); +} + +MatchOp * merge (MatchOp * m1, MatchOp * m2) +{ + if (!m1) + { + return m2; + } + if (!m2) + { + return m1; + } + MatchOp * m = new MatchOp (doUnion (m1->match, m2->match)); + if (m1->ins_access == RegExp::PRIVATE + || m2->ins_access == RegExp::PRIVATE) + { + m->ins_access = RegExp::PRIVATE; + } + return m; +} + +RegExp * doCat (RegExp * e1, RegExp * e2) +{ + if (!e1) + { + return e2; + } + if (!e2) + { + return e1; + } + return new CatOp (e1, e2); +} + +RegExp * mkDiff (RegExp * e1, RegExp * e2) +{ + MatchOp * m1 = dynamic_cast (e1); + MatchOp * m2 = dynamic_cast (e2); + if (m1 == NULL || m2 == NULL) + { + return NULL; + } + Range * r = doDiff (m1->match, m2->match); + return r + ? (RegExp *) new MatchOp(r) + : (RegExp *) new NullOp; +} + +Range * Scanner::getRange(SubStr &s) const +{ + uint32_t lb = unescape(s), ub; + + if (s.len < 2 || *s.str != '-') + { + ub = lb; + } + else + { + s.len--; + s.str++; + ub = unescape(s); + if (ub < lb) + { + uint32_t tmp = lb; + lb = ub; + ub = tmp; + } + } + + Range * r = encoding.encodeRange(lb, ub); + if (r == NULL) + fatalf("Bad code point range: '0x%X - 0x%X'", lb, ub); + return r; +} + +RegExp * Scanner::matchSymbol(uint32_t c) const +{ + if (!encoding.encode(c)) + fatalf("Bad code point: '0x%X'", c); + + if (encoding.is(Enc::UTF16)) + return UTF16Symbol(c); + else if (encoding.is(Enc::UTF8)) + return UTF8Symbol(c); + else + return new MatchOp(new Range(c, c + 1)); +} + +RegExp * Scanner::strToRE (SubStr & s) const +{ + if (s.len == 0) + return new NullOp; + + RegExp *re = matchSymbol(unescape(s)); + + while (s.len > 0) + re = new CatOp(re, matchSymbol(unescape(s))); + + return re; +} + +RegExp * Scanner::strToCaseInsensitiveRE (SubStr & s) const +{ + if (s.len == 0) + return new NullOp; + + uint32_t c = unescape(s); + + RegExp *re, *reL, *reU; + + if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) + { + reL = matchSymbol(tolower(c)); + reU = matchSymbol(toupper(c)); + re = mkAlt(reL, reU); + } + else + { + re = matchSymbol(c); + } + + while (s.len > 0) + { + c = unescape(s); + + if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) + { + reL = matchSymbol(tolower(c)); + reU = matchSymbol(toupper(c)); + re = new CatOp(re, mkAlt(reL, reU)); + } + else + { + re = new CatOp(re, matchSymbol(c)); + } + } + + return re; +} + +Range * Scanner::mkRange(SubStr &s) const +{ + Range *r = getRange(s); + while (s.len > 0) + r = doUnion(r, getRange(s)); + + return r; +} + +RegExp * Scanner::matchSymbolRange(Range * r) const +{ + if (encoding.is(Enc::UTF16)) + return UTF16Range(r); + else if (encoding.is(Enc::UTF8)) + return UTF8Range(r); + else + return new MatchOp(r); +} + +RegExp * Scanner::ranToRE (SubStr & s) const +{ + s.len -= 2; + s.str += 1; + + if (s.len == 0) + return new NullOp; + + return matchSymbolRange(mkRange(s)); +} + +RegExp * Scanner::invToRE (SubStr & s) const +{ + s.len -= 3; + s.str += 2; + + Range * full = encoding.fullRange(); + + Range * r = s.len == 0 + ? full + : doDiff(full, mkRange (s)); + + return matchSymbolRange(r); +} + +RegExp * Scanner::mkDot() const +{ + Range * full = encoding.fullRange(); + uint32_t c = '\n'; + if (!encoding.encode(c)) + fatalf("Bad code point: '0x%X'", c); + Range * ran = new Range(c, c + 1); + Range * inv = doDiff(full, ran); + + return matchSymbolRange(inv); +} + +/* + * Create a byte range that includes all possible input characters. + * This may include characters, which do not map to any valid symbol + * in current encoding. For encodings, which directly map symbols to + * input characters (ASCII, EBCDIC, UTF-32), it equals [^]. For other + * encodings (UTF-16, UTF-8), [^] and this range are different. + * + * Also note that default range doesn't respect encoding policy + * (the way invalid code points are treated). + */ +RegExp * Scanner::mkDefault() const +{ + Range * def = new Range(0, encoding.nCodeUnits()); + return new MatchOp(def); +} + +} // namespace re2c diff --git a/re2c/src/ir/regexp/regexp.h b/re2c/src/ir/regexp/regexp.h new file mode 100644 index 00000000..758d3b4f --- /dev/null +++ b/re2c/src/ir/regexp/regexp.h @@ -0,0 +1,76 @@ +#ifndef _RE2C_IR_REGEXP_REGEXP_ +#define _RE2C_IR_REGEXP_REGEXP_ + +#include + +#include "src/ir/bytecode/charset.h" +#include "src/ir/bytecode/ins.h" +#include "src/globals.h" +#include "src/util/forbid_copy.h" + +namespace re2c +{ + +typedef uint32_t Char; + +class RegExp +{ +public: + static free_list vFreeList; + + uint32_t size; + /* + * There're several different cases when the same regexp + * can be used multiple times: + * 1) named definitions, e.g. digit = [0-9]; + * 2) counted repetition, e.g. "a"{3}, "a"{3,}, "a"{3,5} + * 3) multiple DFA's sharing the same regexp , e.g. "abc" { } + * 4) common suffixes, e.g. suffix [\x80-\xBF] in UTF-8 ranges + * In cases 1-3, regexp must be recompiled each time it's reused. + * In case 4, regexp should be compiled only once, and instructions + * should be shared in order to reduce space. + * + * Note: if regexp must always be recompiled, it doesn't imply that + * parts of this regexp must always be recompiled. It only forces + * the compilation function to traverse the regexp after compilation + * and reset compilation cache for each sub-regexp. E.g., for a regexp + * [^]{3} in UTF-8 mode, each of sub-regexps [^] will have common suffix + * [\x80-\xBF] factored out, but they won't share instructions. + */ + Ins * ins_cache; /* if non-NULL, points to compiled instructions */ + enum InsAccess + { + SHARED, + PRIVATE + } ins_access; + + inline RegExp () + : size (0) + , ins_cache (NULL) + , ins_access (SHARED) + { + vFreeList.insert (this); + } + inline virtual ~RegExp () + { + vFreeList.erase (this); + } + virtual void split (CharSet &) = 0; + virtual void calcSize (Char *) = 0; + virtual uint32_t fixedLength (); + virtual uint32_t compile (Char *, Ins *) = 0; + virtual void decompile () = 0; + virtual void display (std::ostream &) const = 0; + friend std::ostream & operator << (std::ostream & o, const RegExp & re); + + FORBID_COPY (RegExp); +}; + +RegExp * doAlt (RegExp * e1, RegExp * e2); +RegExp * mkAlt (RegExp * e1, RegExp * e2); +RegExp * doCat (RegExp * e1, RegExp * e2); +RegExp * mkDiff (RegExp * e1, RegExp * e2); + +} // end namespace re2c + +#endif // _RE2C_IR_REGEXP_REGEXP_ diff --git a/re2c/src/ir/regexp/regexp_alt.h b/re2c/src/ir/regexp/regexp_alt.h new file mode 100644 index 00000000..fb84be5e --- /dev/null +++ b/re2c/src/ir/regexp/regexp_alt.h @@ -0,0 +1,32 @@ +#ifndef _RE2C_IR_REGEXP_REGEXP_ALT_ +#define _RE2C_IR_REGEXP_REGEXP_ALT_ + +#include "src/ir/regexp/regexp.h" + +namespace re2c +{ + +class AltOp: public RegExp +{ + RegExp * exp1; + RegExp * exp2; + +public: + inline AltOp (RegExp * e1, RegExp * e2) + : exp1 (e1) + , exp2 (e2) + {} + void split (CharSet &); + void calcSize (Char *); + uint32_t fixedLength (); + uint32_t compile (Char *, Ins *); + void decompile (); + void display (std::ostream & o) const; + friend RegExp * mkAlt (RegExp *, RegExp *); + + FORBID_COPY (AltOp); +}; + +} // end namespace re2c + +#endif // _RE2C_IR_REGEXP_REGEXP_ALT_ diff --git a/re2c/src/ir/regexp/regexp_cat.h b/re2c/src/ir/regexp/regexp_cat.h new file mode 100644 index 00000000..08ad5f07 --- /dev/null +++ b/re2c/src/ir/regexp/regexp_cat.h @@ -0,0 +1,31 @@ +#ifndef _RE2C_IR_REGEXP_REGEXP_CAT_ +#define _RE2C_IR_REGEXP_REGEXP_CAT_ + +#include "src/ir/regexp/regexp.h" + +namespace re2c +{ + +class CatOp: public RegExp +{ + RegExp * exp1; + RegExp * exp2; + +public: + inline CatOp (RegExp * e1, RegExp * e2) + : exp1 (e1) + , exp2 (e2) + {} + void split (CharSet &); + void calcSize (Char *); + uint32_t fixedLength (); + uint32_t compile (Char *, Ins *); + void decompile (); + void display (std::ostream & o) const; + + FORBID_COPY (CatOp); +}; + +} // end namespace re2c + +#endif // _RE2C_IR_REGEXP_REGEXP_CAT_ diff --git a/re2c/src/ir/regexp/regexp_close.h b/re2c/src/ir/regexp/regexp_close.h new file mode 100644 index 00000000..e147d8c6 --- /dev/null +++ b/re2c/src/ir/regexp/regexp_close.h @@ -0,0 +1,28 @@ +#ifndef _RE2C_IR_REGEXP_REGEXP_CLOSE_ +#define _RE2C_IR_REGEXP_REGEXP_CLOSE_ + +#include "src/ir/regexp/regexp.h" + +namespace re2c +{ + +class CloseOp: public RegExp +{ + RegExp * exp; + +public: + inline CloseOp (RegExp * e) + : exp (e) + {} + void split (CharSet &); + void calcSize (Char *); + uint32_t compile (Char *, Ins *); + void decompile (); + void display (std::ostream & o) const; + + FORBID_COPY (CloseOp); +}; + +} // end namespace re2c + +#endif // _RE2C_IR_REGEXP_REGEXP_CLOSE_ diff --git a/re2c/src/ir/regexp/regexp_closev.h b/re2c/src/ir/regexp/regexp_closev.h new file mode 100644 index 00000000..8cd054ee --- /dev/null +++ b/re2c/src/ir/regexp/regexp_closev.h @@ -0,0 +1,34 @@ +#ifndef _RE2C_IR_REGEXP_REGEXP_CLOSEV_ +#define _RE2C_IR_REGEXP_REGEXP_CLOSEV_ + +#include "src/ir/regexp/regexp.h" + +namespace re2c +{ + +class CloseVOp: public RegExp +{ + RegExp * exp; + int32_t min; + int32_t max; + +public: + inline CloseVOp (RegExp * e, int lb, int ub) + : exp (e) + , min (lb) + , max (ub) + { + exp->ins_access = PRIVATE; + } + void split (CharSet &); + void calcSize (Char *); + uint32_t compile (Char *, Ins *); + void decompile (); + void display (std::ostream & o) const; + + FORBID_COPY (CloseVOp); +}; + +} // end namespace re2c + +#endif // _RE2C_IR_REGEXP_REGEXP_CLOSEV_ diff --git a/re2c/src/ir/regexp/regexp_match.h b/re2c/src/ir/regexp/regexp_match.h new file mode 100644 index 00000000..49059941 --- /dev/null +++ b/re2c/src/ir/regexp/regexp_match.h @@ -0,0 +1,30 @@ +#ifndef _RE2C_IR_REGEXP_REGEXP_MATCH_ +#define _RE2C_IR_REGEXP_REGEXP_MATCH_ + +#include "src/ir/regexp/regexp.h" +#include "src/util/range.h" + +namespace re2c +{ + +class MatchOp: public RegExp +{ +public: + Range * match; + + inline MatchOp (Range * m) + : match (m) + {} + void split (CharSet &); + void calcSize (Char *); + uint32_t fixedLength (); + uint32_t compile (Char *, Ins *); + void decompile (); + void display (std::ostream & o) const; + + FORBID_COPY (MatchOp); +}; + +} // end namespace re2c + +#endif // _RE2C_IR_REGEXP_REGEXP_MATCH_ diff --git a/re2c/src/ir/regexp/regexp_null.h b/re2c/src/ir/regexp/regexp_null.h new file mode 100644 index 00000000..d2227b9d --- /dev/null +++ b/re2c/src/ir/regexp/regexp_null.h @@ -0,0 +1,22 @@ +#ifndef _RE2C_IR_REGEXP_REGEXP_NULL_ +#define _RE2C_IR_REGEXP_REGEXP_NULL_ + +#include "src/ir/regexp/regexp.h" + +namespace re2c +{ + +class NullOp: public RegExp +{ +public: + void split (CharSet &); + void calcSize (Char *); + uint32_t fixedLength (); + uint32_t compile (Char *, Ins *); + void decompile (); + void display (std::ostream & o) const; +}; + +} // end namespace re2c + +#endif // _RE2C_IR_REGEXP_REGEXP_NULL_ diff --git a/re2c/src/ir/regexp/regexp_rule.h b/re2c/src/ir/regexp/regexp_rule.h new file mode 100644 index 00000000..3e6718f3 --- /dev/null +++ b/re2c/src/ir/regexp/regexp_rule.h @@ -0,0 +1,47 @@ +#ifndef _RE2C_IR_REGEXP_REGEXP_RULE_ +#define _RE2C_IR_REGEXP_REGEXP_RULE_ + +#include "src/ir/regexp/regexp.h" +#include "src/ir/rule_rank.h" +#include "src/parse/token.h" + +namespace re2c +{ + +class RuleOp: public RegExp +{ + RegExp * exp; + +public: + RegExp * ctx; + Ins * ins; + rule_rank_t rank; + Token * code; + uint32_t line; + + inline RuleOp (RegExp * e, RegExp * c, Token * t, rule_rank_t r, InsAccess access) + : exp (e) + , ctx (c) + , ins (NULL) + , rank (r) + , code (t) + , line (0) + { + ins_access = access; + } + inline ~RuleOp () + { + delete code; + } + void display (std::ostream & o) const; + void split (CharSet &); + void calcSize (Char *); + uint32_t compile (Char *, Ins *); + void decompile (); + + FORBID_COPY (RuleOp); +}; + +} // end namespace re2c + +#endif // _RE2C_IR_REGEXP_REGEXP_RULE_ diff --git a/re2c/src/dfa/rule_rank.cc b/re2c/src/ir/rule_rank.cc similarity index 94% rename from re2c/src/dfa/rule_rank.cc rename to re2c/src/ir/rule_rank.cc index 4120ff28..ca19f689 100644 --- a/re2c/src/dfa/rule_rank.cc +++ b/re2c/src/ir/rule_rank.cc @@ -1,6 +1,6 @@ #include -#include "src/dfa/rule_rank.h" +#include "src/ir/rule_rank.h" namespace re2c { diff --git a/re2c/src/dfa/rule_rank.h b/re2c/src/ir/rule_rank.h similarity index 89% rename from re2c/src/dfa/rule_rank.h rename to re2c/src/ir/rule_rank.h index f3e49ebd..39473b88 100644 --- a/re2c/src/dfa/rule_rank.h +++ b/re2c/src/ir/rule_rank.h @@ -1,5 +1,5 @@ -#ifndef _RE2C_DFA_RULE_RANK_ -#define _RE2C_DFA_RULE_RANK_ +#ifndef _RE2C_IR_RULE_RANK_ +#define _RE2C_IR_RULE_RANK_ #include @@ -36,4 +36,4 @@ public: } // namespace re2c -#endif // _RE2C_DFA_RULE_RANK_ +#endif // _RE2C_IR_RULE_RANK_ diff --git a/re2c/src/main.cc b/re2c/src/main.cc index 7d005971..d97f7283 100644 --- a/re2c/src/main.cc +++ b/re2c/src/main.cc @@ -4,8 +4,6 @@ #include #include "config.h" -#include "src/dfa/dfa.h" -#include "src/dfa/encoding/enc.h" #include "src/globals.h" #include "src/mbo_getopt.h" #include "src/parse/parser.h" diff --git a/re2c/src/parse/extop.h b/re2c/src/parse/extop.h new file mode 100644 index 00000000..44b33ff4 --- /dev/null +++ b/re2c/src/parse/extop.h @@ -0,0 +1,18 @@ +#ifndef _RE2C_PARSE_EXTOP_ +#define _RE2C_PARSE_EXTOP_ + +#include "src/util/c99_stdint.h" + +namespace re2c +{ + +struct ExtOp +{ + char op; + int32_t minsize; + int32_t maxsize; +}; + +} // end namespace re2c + +#endif // _RE2C_PARSE_EXTOP_ diff --git a/re2c/src/parse/parser.h b/re2c/src/parse/parser.h index f4b3f295..2c973279 100644 --- a/re2c/src/parse/parser.h +++ b/re2c/src/parse/parser.h @@ -1,10 +1,12 @@ #ifndef _RE2C_PARSE_PARSER_ #define _RE2C_PARSE_PARSER_ +#include #include #include "src/codegen/output.h" -#include "src/dfa/re.h" +#include "src/ir/regexp/regexp.h" +#include "src/ir/regexp/regexp_rule.h" #include "src/parse/scanner.h" namespace re2c diff --git a/re2c/src/parse/parser.ypp b/re2c/src/parse/parser.ypp index 164f7db5..114ad940 100644 --- a/re2c/src/parse/parser.ypp +++ b/re2c/src/parse/parser.ypp @@ -8,9 +8,14 @@ #include #include "config.h" -#include "src/dfa/dfa.h" +#include "src/ir/bytecode/bytecode.h" +#include "src/ir/regexp/regexp_cat.h" +#include "src/ir/regexp/regexp_close.h" +#include "src/ir/regexp/regexp_closev.h" +#include "src/ir/regexp/regexp_null.h" #include "src/codegen/emit.h" // genTypes #include "src/globals.h" +#include "src/parse/extop.h" #include "src/parse/parser.h" #include "src/util/c99_stdint.h" #include "src/util/smart_ptr.h" diff --git a/re2c/src/parse/scanner.h b/re2c/src/parse/scanner.h index 1475dd70..51236b5a 100644 --- a/re2c/src/parse/scanner.h +++ b/re2c/src/parse/scanner.h @@ -4,7 +4,7 @@ #include #include "src/codegen/output.h" -#include "src/dfa/re.h" +#include "src/ir/regexp/regexp.h" #include "src/globals.h" #include "src/parse/input.h" #include "src/parse/token.h" diff --git a/re2c/src/parse/scanner_lex.re b/re2c/src/parse/scanner_lex.re index 6ddc5082..1eca7603 100644 --- a/re2c/src/parse/scanner_lex.re +++ b/re2c/src/parse/scanner_lex.re @@ -3,8 +3,9 @@ #include #include -#include "src/dfa/dfa.h" +#include "src/ir/dfa/dfa.h" #include "src/globals.h" +#include "src/parse/extop.h" #include "src/parse/parser.h" #include "src/parse/scanner.h" #include "y.tab.h" diff --git a/re2c/src/parse/unescape.cc b/re2c/src/parse/unescape.cc new file mode 100644 index 00000000..5064d5e8 --- /dev/null +++ b/re2c/src/parse/unescape.cc @@ -0,0 +1,220 @@ +#include + +#include "src/parse/scanner.h" +#include "src/util/substr.h" + +namespace re2c { + +uint32_t Scanner::unescape(SubStr &s) const +{ + static const char * hex = "0123456789abcdef"; + static const char * oct = "01234567"; + + s.len--; + uint32_t c, ucb = 0; + + if ((c = *s.str++) != '\\' || s.len == 0) + { + return c; + } + + s.len--; + + switch (c = *s.str++) + { + case 'n': return '\n'; + case 't': return '\t'; + case 'v': return '\v'; + case 'b': return '\b'; + case 'r': return '\r'; + case 'f': return '\f'; + case 'a': return '\a'; + + case 'x': + { + if (s.len < 2) + { + fatal(s.ofs()+s.len, "Illegal hexadecimal character code, two hexadecimal digits are required"); + return ~0u; + } + + const char *p1 = strchr(hex, tolower(s.str[0])); + const char *p2 = strchr(hex, tolower(s.str[1])); + + if (!p1 || !p2) + { + fatal(s.ofs()+(p1?1:0), "Illegal hexadecimal character code"); + return ~0u; + } + else + { + s.len -= 2; + s.str += 2; + + uint32_t v = (uint32_t)((p1 - hex) << 4) + + (uint32_t)((p2 - hex)); + + return v; + } + } + + case 'U': + { + if (s.len < 8) + { + fatal(s.ofs()+s.len, "Illegal unicode character, eight hexadecimal digits are required"); + return ~0u; + } + + uint32_t l = 0; + if (s.str[0] == '0') + { + l++; + if (s.str[1] == '0') + { + l++; + if (s.str[2] == '0' || (s.str[2] == '1' && encoding.szCodePoint() == 4)) + { + l++; + if (encoding.szCodePoint() == 4) + { + const char *u3 = strchr(hex, tolower(s.str[2])); + const char *u4 = strchr(hex, tolower(s.str[3])); + if (u3 && u4) + { + ucb = (uint32_t)((u3 - hex) << 20) + + (uint32_t)((u4 - hex) << 16); + l++; + } + } + else if (s.str[3] == '0') + { + l++; + } + } + } + } + + if (l != 4) + { + fatal(s.ofs()+l, "Illegal unicode character, eight hexadecimal digits are required"); + } + + s.len -= 4; + s.str += 4; + + // no break; + } + case 'X': + case 'u': + { + if (s.len < 4) + { + fatal(s.ofs()+s.len, + c == 'X' + ? "Illegal hexadecimal character code, four hexadecimal digits are required" + : "Illegal unicode character, four hexadecimal digits are required"); + return ~0u; + } + + const char *p1 = strchr(hex, tolower(s.str[0])); + const char *p2 = strchr(hex, tolower(s.str[1])); + const char *p3 = strchr(hex, tolower(s.str[2])); + const char *p4 = strchr(hex, tolower(s.str[3])); + + if (!p1 || !p2 || !p3 || !p4) + { + fatal(s.ofs()+(p1?1:0)+(p2?1:0)+(p3?1:0), + c == 'X' + ? "Illegal hexadecimal character code, non hexxdecimal digit found" + : "Illegal unicode character, non hexadecimal digit found"); + return ~0u; + } + else + { + s.len -= 4; + s.str += 4; + + uint32_t v = (uint32_t)((p1 - hex) << 12) + + (uint32_t)((p2 - hex) << 8) + + (uint32_t)((p3 - hex) << 4) + + (uint32_t)((p4 - hex)) + + ucb; + + if (v >= encoding.nCodePoints()) + { + fatal(s.ofs(), + c == 'X' + ? "Illegal hexadecimal character code, out of range" + : "Illegal unicode character, out of range"); + } + + return v; + } + } + + case '4': + case '5': + case '6': + case '7': + { + fatal(s.ofs()-1, "Illegal octal character code, first digit must be 0 thru 3"); + return ~0u; + } + + case '0': + case '1': + case '2': + case '3': + { + if (s.len < 2) + { + fatal(s.ofs()+s.len, "Illegal octal character code, three octal digits are required"); + return ~0u; + } + + const char *p0 = strchr(oct, c); + const char *p1 = strchr(oct, s.str[0]); + const char *p2 = strchr(oct, s.str[1]); + + if (!p0 || !p1 || !p2) + { + fatal(s.ofs()+(p1?1:0), "Illegal octal character code, non octal digit found"); + return ~0u; + } + else + { + s.len -= 2; + s.str += 2; + + uint32_t v = (uint32_t)((p0 - oct) << 6) + (uint32_t)((p1 - oct) << 3) + (uint32_t)(p2 - oct); + + return v; + } + } + + default: + return c; + } +} + +std::string& Scanner::unescape(SubStr& str_in, std::string& str_out) const +{ + str_out.clear(); + + while(str_in.len) + { + uint32_t c = unescape(str_in); + + if (c > 0xFF) + { + fatal(str_in.ofs(), "Illegal character"); + } + + str_out += static_cast(c); + } + + return str_out; +} + +} // namespace re2c -- 2.40.0