]> granicus.if.org Git - re2c/commitdiff
Delay encoding expansion until AST is converted to intermediate representation.
authorUlya Trofimovich <skvadrik@gmail.com>
Sat, 4 Mar 2017 19:19:04 +0000 (19:19 +0000)
committerUlya Trofimovich <skvadrik@gmail.com>
Sat, 4 Mar 2017 22:54:22 +0000 (22:54 +0000)
50 files changed:
re2c/Makefile.am
re2c/bootstrap/src/conf/parse_opts.cc
re2c/bootstrap/src/parse/lex.cc
re2c/bootstrap/src/parse/lex_conf.cc
re2c/bootstrap/src/parse/parser.cc
re2c/src/codegen/emit_action.cc
re2c/src/codegen/go_emit.cc
re2c/src/conf/opt.h
re2c/src/conf/parse_opts.re
re2c/src/ir/adfa/adfa.h
re2c/src/ir/compile.cc
re2c/src/ir/dfa/determinization.cc
re2c/src/ir/dfa/dfa.h
re2c/src/ir/nfa/nfa.h
re2c/src/ir/re/ast_to_re.cc
re2c/src/ir/re/empty_class_policy.h [moved from re2c/src/ir/regexp/empty_class_policy.h with 61% similarity]
re2c/src/ir/re/encoding/case.h [moved from re2c/src/ir/regexp/encoding/case.h with 78% similarity]
re2c/src/ir/re/encoding/enc.cc [moved from re2c/src/ir/regexp/encoding/enc.cc with 99% similarity]
re2c/src/ir/re/encoding/enc.h [moved from re2c/src/ir/regexp/encoding/enc.h with 97% similarity]
re2c/src/ir/re/encoding/range_suffix.cc [new file with mode: 0644]
re2c/src/ir/re/encoding/range_suffix.h [moved from re2c/src/ir/regexp/encoding/range_suffix.h with 67% similarity]
re2c/src/ir/re/encoding/utf16/utf16.cc [moved from re2c/src/ir/regexp/encoding/utf16/utf16.cc with 82% similarity]
re2c/src/ir/re/encoding/utf16/utf16.h [moved from re2c/src/ir/regexp/encoding/utf16/utf16.h with 82% similarity]
re2c/src/ir/re/encoding/utf16/utf16_range.cc [moved from re2c/src/ir/regexp/encoding/utf16/utf16_range.cc with 97% similarity]
re2c/src/ir/re/encoding/utf16/utf16_range.h [moved from re2c/src/ir/regexp/encoding/utf16/utf16_range.h with 71% similarity]
re2c/src/ir/re/encoding/utf16/utf16_regexp.cc [moved from re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc with 54% similarity]
re2c/src/ir/re/encoding/utf16/utf16_regexp.h [new file with mode: 0644]
re2c/src/ir/re/encoding/utf8/utf8.cc [moved from re2c/src/ir/regexp/encoding/utf8/utf8.cc with 97% similarity]
re2c/src/ir/re/encoding/utf8/utf8.h [moved from re2c/src/ir/regexp/encoding/utf8/utf8.h with 88% similarity]
re2c/src/ir/re/encoding/utf8/utf8_range.cc [moved from re2c/src/ir/regexp/encoding/utf8/utf8_range.cc with 96% similarity]
re2c/src/ir/re/encoding/utf8/utf8_range.h [moved from re2c/src/ir/regexp/encoding/utf8/utf8_range.h with 65% similarity]
re2c/src/ir/re/encoding/utf8/utf8_regexp.cc [moved from re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc with 56% similarity]
re2c/src/ir/re/encoding/utf8/utf8_regexp.h [new file with mode: 0644]
re2c/src/ir/re/fixed_tags.cc
re2c/src/ir/re/nullable.cc
re2c/src/ir/re/re.h
re2c/src/ir/re/split_charset.cc
re2c/src/ir/regexp/encoding/range_suffix.cc [deleted file]
re2c/src/ir/regexp/encoding/utf16/utf16_regexp.h [deleted file]
re2c/src/ir/regexp/encoding/utf8/utf8_regexp.h [deleted file]
re2c/src/ir/regexp/regexp.cc [deleted file]
re2c/src/ir/skeleton/generate_code.cc
re2c/src/ir/skeleton/generate_data.cc
re2c/src/ir/skeleton/skeleton.h
re2c/src/parse/lex.re
re2c/src/parse/lex_conf.re
re2c/src/parse/parser.h
re2c/src/parse/parser.ypp
re2c/src/parse/regexp.cc [new file with mode: 0644]
re2c/src/parse/regexp.h [moved from re2c/src/ir/regexp/regexp.h with 68% similarity]

index f0c83a0d793de2b9543dfed226198294875c2bc3..0d1ca32744c16541ab00482ac7e9120764234487 100644 (file)
@@ -30,18 +30,17 @@ SRC_HDR = \
        src/ir/dfa/tagpool.h \
        src/ir/dfa/tagtree.h \
        src/ir/nfa/nfa.h \
+       src/ir/re/encoding/case.h \
+       src/ir/re/encoding/enc.h \
+       src/ir/re/encoding/range_suffix.h \
+       src/ir/re/encoding/utf8/utf8.h \
+       src/ir/re/encoding/utf8/utf8_regexp.h \
+       src/ir/re/encoding/utf8/utf8_range.h \
+       src/ir/re/encoding/utf16/utf16_range.h \
+       src/ir/re/encoding/utf16/utf16_regexp.h \
+       src/ir/re/encoding/utf16/utf16.h \
+       src/ir/re/empty_class_policy.h \
        src/ir/re/re.h \
-       src/ir/regexp/encoding/case.h \
-       src/ir/regexp/encoding/enc.h \
-       src/ir/regexp/encoding/range_suffix.h \
-       src/ir/regexp/encoding/utf8/utf8.h \
-       src/ir/regexp/encoding/utf8/utf8_regexp.h \
-       src/ir/regexp/encoding/utf8/utf8_range.h \
-       src/ir/regexp/encoding/utf16/utf16_range.h \
-       src/ir/regexp/encoding/utf16/utf16_regexp.h \
-       src/ir/regexp/encoding/utf16/utf16.h \
-       src/ir/regexp/empty_class_policy.h \
-       src/ir/regexp/regexp.h \
        src/ir/compile.h \
        src/ir/rule.h \
        src/ir/tag.h \
@@ -51,6 +50,7 @@ SRC_HDR = \
        src/parse/extop.h \
        src/parse/input.h \
        src/parse/parser.h \
+       src/parse/regexp.h \
        src/parse/rules.h \
        src/parse/scanner.h \
        src/parse/unescape.h \
@@ -90,11 +90,6 @@ SRC = \
        src/ir/nfa/dump.cc \
        src/ir/nfa/estimate_size.cc \
        src/ir/nfa/re_to_nfa.cc \
-       src/ir/re/ast_to_re.cc \
-       src/ir/re/default_tags.cc \
-       src/ir/re/fixed_tags.cc \
-       src/ir/re/nullable.cc \
-       src/ir/re/split_charset.cc \
        src/ir/adfa/adfa.cc \
        src/ir/adfa/dump.cc \
        src/ir/adfa/prepare.cc \
@@ -117,15 +112,19 @@ SRC = \
        src/ir/dfa/minimization.cc \
        src/ir/dfa/tagpool.cc \
        src/ir/dfa/tagtree.cc \
-       src/ir/regexp/encoding/enc.cc \
-       src/ir/regexp/encoding/range_suffix.cc \
-       src/ir/regexp/encoding/utf8/utf8_regexp.cc \
-       src/ir/regexp/encoding/utf8/utf8_range.cc \
-       src/ir/regexp/encoding/utf8/utf8.cc \
-       src/ir/regexp/encoding/utf16/utf16_regexp.cc \
-       src/ir/regexp/encoding/utf16/utf16.cc \
-       src/ir/regexp/encoding/utf16/utf16_range.cc \
-       src/ir/regexp/regexp.cc \
+       src/ir/re/encoding/enc.cc \
+       src/ir/re/encoding/range_suffix.cc \
+       src/ir/re/encoding/utf8/utf8_regexp.cc \
+       src/ir/re/encoding/utf8/utf8_range.cc \
+       src/ir/re/encoding/utf8/utf8.cc \
+       src/ir/re/encoding/utf16/utf16_regexp.cc \
+       src/ir/re/encoding/utf16/utf16.cc \
+       src/ir/re/encoding/utf16/utf16_range.cc \
+       src/ir/re/ast_to_re.cc \
+       src/ir/re/default_tags.cc \
+       src/ir/re/fixed_tags.cc \
+       src/ir/re/nullable.cc \
+       src/ir/re/split_charset.cc \
        src/ir/compile.cc \
        src/ir/rule.cc \
        src/ir/skeleton/control_flow.cc \
@@ -137,6 +136,7 @@ SRC = \
        src/ir/tcmd.cc \
        src/main.cc \
        src/parse/input.cc \
+       src/parse/regexp.cc \
        src/parse/scanner.cc \
        src/parse/unescape.cc \
        src/util/s_to_n32_unsafe.cc \
index c47f528f182c448979af05e1c2b1cad41696805b..7064808bf9fde3b32b13d76546b2ea7feb54f0a2 100644 (file)
@@ -1,10 +1,10 @@
-/* Generated by re2c 0.16 on Fri Mar  3 02:03:51 2017 */
+/* Generated by re2c 0.16 on Sat Mar  4 18:55:56 2017 */
 #line 1 "../src/conf/parse_opts.re"
 #include "src/codegen/input_api.h"
 #include "src/conf/msg.h"
 #include "src/conf/opt.h"
-#include "src/ir/regexp/empty_class_policy.h"
-#include "src/ir/regexp/encoding/enc.h"
+#include "src/ir/re/empty_class_policy.h"
+#include "src/ir/re/encoding/enc.h"
 
 namespace re2c
 {
index a368cc93d03b0680598ebd8c687c7430d3d0164d..65d7e4dbc25ec3e6eee0d8f1ca01b55502582f57 100644 (file)
@@ -1,4 +1,4 @@
-/* Generated by re2c 0.16 on Sat Mar  4 16:56:34 2017 */
+/* Generated by re2c 0.16 on Sat Mar  4 19:02:13 2017 */
 #line 1 "../src/parse/lex.re"
 #include "src/util/c99_stdint.h"
 #include <stddef.h>
@@ -8,10 +8,10 @@
 #include <string>
 
 #include "src/codegen/output.h"
-#include "src/ir/regexp/encoding/enc.h"
-#include "src/ir/regexp/regexp.h"
+#include "src/ir/re/encoding/enc.h"
 #include "src/parse/extop.h"
 #include "src/parse/input.h"
+#include "src/parse/regexp.h"
 #include "src/parse/scanner.h"
 #include "src/parse/parser.h" // needed by "y.tab.h"
 #include "src/parse/unescape.h"
@@ -1051,7 +1051,7 @@ yy175:
        ++YYCURSOR;
 #line 359 "../src/parse/lex.re"
        {
-                                       yylval.regexp = RegExp::make_dot(cline, get_column(), opts, warn);
+                                       yylval.regexp = RegExp::make_dot(cline, get_column());
                                        return TOKEN_REGEXP;
                                }
 #line 1058 "src/parse/lex.cc"
@@ -1218,8 +1218,8 @@ yy198:
                                                                c = static_cast<uint8_t>(*s),
                                                                column = static_cast<uint32_t>(s - pos);
                                                        r = RegExp::make_cat(r, casing
-                                                               ? RegExp::make_ichar(cline, column, c, opts)
-                                                               : RegExp::make_schar(cline, column, c, opts));
+                                                               ? RegExp::make_ichar(cline, column, c)
+                                                               : RegExp::make_schar(cline, column, c));
                                                }
                                                yylval.regexp = r ? r : RegExp::make_nil(cline, get_column());
                                                return TOKEN_REGEXP;
@@ -2276,7 +2276,7 @@ end:
        if (neg) {
                r = Range::sub(opts->encoding.fullRange(), r);
        }
-       return RegExp::make_class(cline, column, r, opts, warn);
+       return RegExp::make_class(cline, column, r);
 }
 
 uint32_t Scanner::lex_cls_chr()
@@ -2863,8 +2863,8 @@ const RegExp *Scanner::lex_str(char quote, bool casing)
                        return r ? r : RegExp::make_nil(cline, get_column());
                }
                r = RegExp::make_cat(r, casing
-                       ? RegExp::make_ichar(cline, get_column(), c, opts)
-                       : RegExp::make_schar(cline, get_column(), c, opts));
+                       ? RegExp::make_ichar(cline, get_column(), c)
+                       : RegExp::make_schar(cline, get_column(), c));
        }
 }
 
index f29dfc9ffe639ce31b510c68d4a564d3c20566a3..ebcb5503df3e070ae88b1b9de55ea0f7cb9fc9c9 100644 (file)
@@ -1,10 +1,10 @@
-/* Generated by re2c 0.16 on Sat Mar  4 15:12:07 2017 */
+/* Generated by re2c 0.16 on Sat Mar  4 18:55:19 2017 */
 #line 1 "../src/parse/lex_conf.re"
 #include "src/util/c99_stdint.h"
 #include <string>
 
 #include "src/codegen/output.h"
-#include "src/ir/regexp/encoding/enc.h"
+#include "src/ir/re/encoding/enc.h"
 #include "src/parse/scanner.h"
 #include "src/util/s_to_n32_unsafe.h"
 
index 720024821d9db13ea28af56bd82b20d13a7cfe4e..c1fdfb61dbe8f242a0886aaf221057c61a71a402 100644 (file)
 #include "src/codegen/output.h"
 #include "src/ir/compile.h"
 #include "src/ir/adfa/adfa.h"
-#include "src/ir/regexp/encoding/enc.h"
-#include "src/ir/regexp/encoding/range_suffix.h"
-#include "src/ir/regexp/regexp.h"
+#include "src/ir/re/encoding/enc.h"
+#include "src/ir/re/encoding/range_suffix.h"
 #include "src/ir/skeleton/skeleton.h"
 #include "src/parse/extop.h"
 #include "src/parse/parser.h"
+#include "src/parse/regexp.h"
 #include "src/parse/scanner.h"
 #include "src/util/free_list.h"
 #include "src/util/range.h"
@@ -181,7 +181,7 @@ static void check(const specs_t &specs, bool cflag)
        }
 }
 
-static void prepare(specs_t &specs, const Scanner &in)
+static void prepare(specs_t &specs)
 {
        specs_t::iterator i, b = specs.begin(), e = specs.end();
 
@@ -207,7 +207,7 @@ static void prepare(specs_t &specs, const Scanner &in)
        for (i = b; i != e; ++i) {
                if (!i->defs.empty()) {
                        const Code *c = i->defs[0];
-                       const RegExp *r = RegExp::make_default(c->fline, 0, in.opts);
+                       const RegExp *r = RegExp::make_default(c->fline, 0);
                        i->rules.push_back(RegExpRule(r, c));
                }
        }
@@ -1613,7 +1613,7 @@ yyreduce:
   case 31:
 
     {
-                       (yyval.regexp) = RegExp::make_diff((yyvsp[-2].regexp), (yyvsp[0].regexp), context.input.opts, context.input.warn);
+                       (yyval.regexp) = RegExp::make_diff((yyvsp[-2].regexp), (yyvsp[0].regexp));
                }
 
     break;
@@ -2009,7 +2009,7 @@ void parse(Scanner &input, Output & o)
                // compile regular expressions to automata
                if (mode != Scanner::Reuse) {
                        check(specs, opts->cFlag);
-                       prepare(specs, input);
+                       prepare(specs);
                        o.source.block().line = input.get_cline();
                        for (specs_t::const_iterator i = specs.begin(); i != specs.end(); ++i) {
                                dfas.push_back(compile(*i, o));
index 153eb0efdd393726cbd533632b0213af7b192459..b6050099a57ff4b03744e6b0f1d3a43eef0d68b1 100644 (file)
@@ -8,8 +8,8 @@
 #include "src/codegen/output.h"
 #include "src/ir/adfa/action.h"
 #include "src/ir/adfa/adfa.h"
-#include "src/ir/regexp/regexp.h"
 #include "src/ir/skeleton/skeleton.h"
+#include "src/parse/regexp.h"
 #include "src/util/string_utils.h"
 
 namespace re2c
index f9aa0246b29d89453388385a3cfa6aedd1cba650..59d5166d587274fd4e2706497eaf543d943ffaa1 100644 (file)
@@ -12,7 +12,7 @@
 #include "src/codegen/output.h"
 #include "src/codegen/print.h"
 #include "src/ir/adfa/adfa.h"
-#include "src/ir/regexp/encoding/enc.h"
+#include "src/ir/re/encoding/enc.h"
 
 namespace re2c
 {
index c51b07946cb9a55bf3a929fc5455066d31940a3f..b83fb33eea84c1c48ef28e4c95d1a0618abedf97 100644 (file)
@@ -8,8 +8,8 @@
 #include "src/codegen/input_api.h"
 #include "src/conf/warn.h"
 #include "src/ir/dfa/dfa.h"
-#include "src/ir/regexp/empty_class_policy.h"
-#include "src/ir/regexp/encoding/enc.h"
+#include "src/ir/re/empty_class_policy.h"
+#include "src/ir/re/encoding/enc.h"
 #include "src/util/forbid_copy.h"
 
 namespace re2c
index c952615ec4ec5b4e82d5248ed3f675fc4de28522..d7ad30e731fcefd55697ce9a37d25f195d906764 100644 (file)
@@ -1,8 +1,8 @@
 #include "src/codegen/input_api.h"
 #include "src/conf/msg.h"
 #include "src/conf/opt.h"
-#include "src/ir/regexp/empty_class_policy.h"
-#include "src/ir/regexp/encoding/enc.h"
+#include "src/ir/re/empty_class_policy.h"
+#include "src/ir/re/encoding/enc.h"
 
 namespace re2c
 {
index 5108248742caecd7409712ce83810a53a1872901..c4020b3d72240815fdcfcfad2621ee108a7b3840 100644 (file)
@@ -11,7 +11,6 @@
 #include "src/codegen/go.h"
 #include "src/codegen/label.h"
 #include "src/ir/adfa/action.h"
-#include "src/ir/regexp/regexp.h"
 #include "src/ir/rule.h"
 #include "src/ir/tag.h"
 #include "src/util/forbid_copy.h"
index c27fd62eb3db6ec29ed46cfe1fecb570f15d521d..58a210be17c9763ad9aa47dc91989d4940952abe 100644 (file)
@@ -9,8 +9,8 @@
 #include "src/ir/dfa/dfa.h"
 #include "src/ir/dfa/dump.h"
 #include "src/ir/nfa/nfa.h"
-#include "src/ir/regexp/regexp.h"
 #include "src/ir/skeleton/skeleton.h"
+#include "src/parse/regexp.h"
 
 namespace re2c {
 
@@ -41,11 +41,11 @@ smart_ptr<DFA> compile(const spec_t &spec, Output &output)
                name = make_name(cond, line),
                &setup = spec.setup.empty() ? "" : spec.setup[0]->text;
 
-       RESpec re(rules);
-       split_charset(re, opts);
-       find_fixed_tags(re, opts);
+       RESpec re(rules, opts, warn);
+       split_charset(re);
+       find_fixed_tags(re);
        insert_default_tags(re);
-       warn_nullable(re, cond, warn);
+       warn_nullable(re, cond);
 
        nfa_t nfa(re);
        if (opts->dump_nfa) dump_nfa(nfa);
index a39b550bc182ed9d069e014a14dadf096c7e21a0..ea9a94cb3118ff4a2422073f029b697120d40278 100644 (file)
@@ -8,7 +8,7 @@
 #include "src/ir/dfa/dump.h"
 #include "src/ir/dfa/find_state.h"
 #include "src/ir/nfa/nfa.h"
-#include "src/ir/regexp/regexp.h"
+#include "src/parse/regexp.h"
 #include "src/util/range.h"
 
 namespace re2c
index 935fd72f970539984cafbe25245271770d65820b..b1bfbea580bddcf88c7a7955cb3ee444f3dbcad4 100644 (file)
@@ -7,11 +7,11 @@
 #include <set>
 
 #include "src/conf/warn.h"
-#include "src/ir/regexp/regexp.h"
 #include "src/ir/rule.h"
 #include "src/ir/tag.h"
 #include "src/ir/tcmd.h"
 #include "src/ir/dfa/tagpool.h"
+#include "src/parse/regexp.h"
 #include "src/util/forbid_copy.h"
 
 namespace re2c
index 7112f08c632cfcd4b95f4e48875efe20077dacb5..adb55b65ad9862c6ec827d8d549fa8df840271a2 100644 (file)
@@ -8,7 +8,6 @@
 
 #include "src/codegen/input_api.h"
 #include "src/ir/re/re.h"
-#include "src/ir/regexp/regexp.h"
 #include "src/ir/rule.h"
 #include "src/ir/tag.h"
 #include "src/util/forbid_copy.h"
index a6f2b74146f8651c336f15880fcdc3ad500fde69..40ed209345c6cc886dacfca1076393d94d207fe5 100644 (file)
@@ -1,19 +1,36 @@
 #include "src/conf/msg.h"
 #include "src/ir/re/re.h"
+#include "src/ir/re/empty_class_policy.h"
+#include "src/ir/re/encoding/case.h"
+#include "src/ir/re/encoding/enc.h"
+#include "src/ir/re/encoding/utf16/utf16_regexp.h"
+#include "src/ir/re/encoding/utf8/utf8_regexp.h"
 
 namespace re2c {
 
+/* note [default regexp]
+ *
+ * Create a range that includes all possible input code units.
+ * This may include code units that do not map to any valid symbol
+ * in the current encoding. For encodings that map symbols directly to
+ * input code units (ASCII, EBCDIC, UTF-32), it equals [^]. For other
+ * encodings (UTF-16, UTF-8), [^] and this range are different.
+ *
+ * Also note that the default range doesn't respect the encoding policy
+ * (the way invalid code points are treated).
+ */
+
 static RE *ast_to_re(RESpec &spec, const RegExp *ast, size_t &ncap)
 {
        RE::alc_t &alc = spec.alc;
        std::vector<Tag> &tags = spec.tags;
+       const opt_t *opts = spec.opts;
+       Warn &warn = spec.warn;
 
        switch (ast->type) {
                default: assert(false);
                case RegExp::NIL:
                        return re_nil(alc);
-               case RegExp::SYM:
-                       return re_sym(alc, ast->sym);
                case RegExp::ALT: {
                        RE *x = ast_to_re(spec, ast->alt.re1, ncap);
                        RE *y = ast_to_re(spec, ast->alt.re2, ncap);
@@ -85,6 +102,76 @@ static RE *ast_to_re(RESpec &spec, const RegExp *ast, size_t &ncap)
                        }
                        return y;
                }
+               case RegExp::SCHAR:
+                       return re_schar(alc, ast->line, ast->column, ast->schar, opts);
+               case RegExp::ICHAR:
+                       return re_ichar(alc, ast->line, ast->column, ast->ichar, opts);
+               case RegExp::CLASS:
+                       return re_class(alc, ast->line, ast->column, ast->cls, opts, warn);
+               case RegExp::DIFF: {
+                       RE *x = ast_to_re(spec, ast->diff.re1, ncap);
+                       RE *y = ast_to_re(spec, ast->diff.re2, ncap);
+                       if (x->type != RE::SYM || y->type != RE::SYM) {
+                               fatal_error(ast->line, ast->column, "can only difference char sets");
+                       }
+                       return re_class(alc, ast->line, ast->column, Range::sub(x->sym, y->sym), opts, warn);
+               }
+               case RegExp::DOT: {
+                       uint32_t c = '\n';
+                       if (!opts->encoding.encode(c)) {
+                               fatal_error(ast->line, ast->column, "bad code point: '0x%X'", c);
+                       }
+                       return re_class(alc, ast->line, ast->column,
+                               Range::sub(opts->encoding.fullRange(), Range::sym(c)), opts, warn);
+               }
+               case RegExp::DEFAULT:
+                       // see note [default regexp]
+                       return re_sym(alc, Range::ran(0, opts->encoding.nCodeUnits()));
+       }
+}
+
+RE *re_schar(RE::alc_t &alc, uint32_t line, uint32_t column, uint32_t c, const opt_t *opts)
+{
+       if (!opts->encoding.encode(c)) {
+               fatal_error(line, column, "bad code point: '0x%X'", c);
+       }
+       switch (opts->encoding.type ()) {
+               case Enc::UTF16: return UTF16Symbol(alc, c);
+               case Enc::UTF8: return UTF8Symbol(alc, c);
+               default: return re_sym(alc, Range::sym(c));
+       }
+}
+
+RE *re_ichar(RE::alc_t &alc, uint32_t line, uint32_t column, uint32_t c, const opt_t *opts)
+{
+       if (is_alpha(c)) {
+               return re_alt(alc,
+                       re_schar(alc, line, column, to_lower_unsafe(c), opts),
+                       re_schar(alc, line, column, to_upper_unsafe(c), opts));
+       } else {
+               return re_schar(alc, line, column, c, opts);
+       }
+}
+
+RE *re_class(RE::alc_t &alc, uint32_t line, uint32_t column, const Range *r, const opt_t *opts, Warn &warn)
+{
+       if (!r) {
+               switch (opts->empty_class_policy) {
+                       case EMPTY_CLASS_MATCH_EMPTY:
+                               warn.empty_class(line);
+                               return re_nil(alc);
+                       case EMPTY_CLASS_MATCH_NONE:
+                               warn.empty_class(line);
+                               break;
+                       case EMPTY_CLASS_ERROR:
+                               fatal_error(line, column, "empty character class");
+                               break;
+               }
+       }
+       switch (opts->encoding.type()) {
+               case Enc::UTF16: return UTF16Range(alc, r);
+               case Enc::UTF8: return UTF8Range(alc, r);
+               default: return re_sym(alc, r);
        }
 }
 
@@ -114,12 +201,14 @@ static void init_rule(Rule &rule, const Code *code, const std::vector<Tag> &tags
        assert_tags_used_once(rule, tags);
 }
 
-RESpec::RESpec(const std::vector<RegExpRule> &ast)
+RESpec::RESpec(const std::vector<RegExpRule> &ast, const opt_t *o, Warn &w)
        : alc()
        , res()
        , charset(*new std::vector<uint32_t>)
        , tags(*new std::vector<Tag>)
        , rules(*new std::valarray<Rule>(ast.size()))
+       , opts(o)
+       , warn(w)
 {
        for (size_t i = 0; i < ast.size(); ++i) {
                size_t ltag = tags.size(), ncap = 0;
similarity index 61%
rename from re2c/src/ir/regexp/empty_class_policy.h
rename to re2c/src/ir/re/empty_class_policy.h
index bb062de034d1b877b19f6af732013c792e405e5d..a3fcbd02cfdaba52e63bf3e0ce40d6f456c4acc3 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef _RE2C_IR_REGEXP_EMPTY_CLASS_POLICY_
-#define _RE2C_IR_REGEXP_EMPTY_CLASS_POLICY_
+#ifndef _RE2C_IR_RE_EMPTY_CLASS_POLICY_
+#define _RE2C_IR_RE_EMPTY_CLASS_POLICY_
 
 namespace re2c {
 
@@ -12,4 +12,4 @@ enum empty_class_policy_t
 
 } // namespace re2c
 
-#endif // _RE2C_IR_REGEXP_EMPTY_CLASS_POLICY_
+#endif // _RE2C_IR_RE_EMPTY_CLASS_POLICY_
similarity index 78%
rename from re2c/src/ir/regexp/encoding/case.h
rename to re2c/src/ir/re/encoding/case.h
index 38efa0e19f02b2797e46906847599b2c644e0044..582d3373646cbb91218859ed1808517e492f6612 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef _RE2C_IR_REGEXP_ENCODING_CASE_
-#define _RE2C_IR_REGEXP_ENCODING_CASE_
+#ifndef _RE2C_IR_RE_ENCODING_CASE_
+#define _RE2C_IR_RE_ENCODING_CASE_
 
 #include "src/util/c99_stdint.h"
 
@@ -28,4 +28,4 @@ inline uint32_t to_upper_unsafe (uint32_t c)
 
 }
 
-#endif // _RE2C_IR_REGEXP_ENCODING_CASE_
+#endif // _RE2C_IR_RE_ENCODING_CASE_
similarity index 99%
rename from re2c/src/ir/regexp/encoding/enc.cc
rename to re2c/src/ir/re/encoding/enc.cc
index d8c5e9836c3ab2b70daa2f1f4b4e584a4edd79b5..e514300517fb20609fb938c5c13c11f029b0b372 100644 (file)
@@ -1,4 +1,4 @@
-#include "src/ir/regexp/encoding/enc.h"
+#include "src/ir/re/encoding/enc.h"
 #include "src/util/range.h"
 
 namespace re2c {
similarity index 97%
rename from re2c/src/ir/regexp/encoding/enc.h
rename to re2c/src/ir/re/encoding/enc.h
index b85ae0bec9bea2812ee61ce45515bdf32e08a3f6..ed4ca0267eec4cbc8630d7ba2c2f83a9e2c19b7b 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef _RE2C_IR_REGEXP_ENCODING_ENC_
-#define _RE2C_IR_REGEXP_ENCODING_ENC_
+#ifndef _RE2C_IR_RE_ENCODING_ENC_
+#define _RE2C_IR_RE_ENCODING_ENC_
 
 #include "src/util/c99_stdint.h"
 
@@ -194,4 +194,4 @@ inline void Enc::setPolicy(policy_t t)
 
 } // namespace re2c
 
-#endif // _RE2C_IR_REGEXP_ENCODING_ENC_
+#endif // _RE2C_IR_RE_ENCODING_ENC_
diff --git a/re2c/src/ir/re/encoding/range_suffix.cc b/re2c/src/ir/re/encoding/range_suffix.cc
new file mode 100644 (file)
index 0000000..b11bb5d
--- /dev/null
@@ -0,0 +1,32 @@
+#include "src/ir/re/encoding/range_suffix.h"
+#include "src/util/range.h"
+
+namespace re2c {
+
+static RE *emit(RE::alc_t &alc, RangeSuffix *p, RE *re);
+
+free_list<RangeSuffix *> RangeSuffix::freeList;
+
+RE *to_regexp(RE::alc_t &alc, RangeSuffix *p)
+{
+       return p ? emit(alc, p, NULL) : re_sym(alc, NULL);
+}
+
+/*
+ * Build regexp from suffix tree: alternate over sibling nodes, prepending each node's range to 're' and recursing into its child.
+ */
+RE *emit(RE::alc_t &alc, RangeSuffix *p, RE *re)
+{
+       if (p == NULL) {
+               return re;
+       } else {
+               RE *regexp = NULL;
+               for (; p != NULL; p = p->next) {
+                       RE *re1 = re_cat(alc, re_sym(alc, Range::ran(p->l, p->h + 1)), re);
+                       regexp = re_alt(alc, regexp, emit(alc, p->child, re1));
+               }
+               return regexp;
+       }
+}
+
+} // namespace re2c
similarity index 67%
rename from re2c/src/ir/regexp/encoding/range_suffix.h
rename to re2c/src/ir/re/encoding/range_suffix.h
index 3ad1fac1629ef3adda5f8e9ee7a34855e704a3e7..20f379674c09f39fac4a5d47eb26c15c72a72d27 100644 (file)
@@ -1,16 +1,15 @@
-#ifndef _RE2C_IR_REGEXP_ENCODING_RANGE_SUFFIX_
-#define _RE2C_IR_REGEXP_ENCODING_RANGE_SUFFIX_
+#ifndef _RE2C_IR_RE_ENCODING_RANGE_SUFFIX_
+#define _RE2C_IR_RE_ENCODING_RANGE_SUFFIX_
 
 #include "src/util/c99_stdint.h"
 #include <stddef.h> // NULL
 
+#include "src/ir/re/re.h"
 #include "src/util/forbid_copy.h"
 #include "src/util/free_list.h"
 
 namespace re2c {
 
-struct RegExp;
-
 struct RangeSuffix
 {
        static free_list<RangeSuffix *> freeList;
@@ -32,8 +31,8 @@ struct RangeSuffix
        FORBID_COPY (RangeSuffix);
 };
 
-const RegExp *to_regexp(uint32_t l, uint32_t c, RangeSuffix * p);
+RE *to_regexp(RE::alc_t &alc, RangeSuffix *p);
 
 } // namespace re2c
 
-#endif // _RE2C_IR_REGEXP_ENCODING_RANGE_SUFFIX_
+#endif // _RE2C_IR_RE_ENCODING_RANGE_SUFFIX_
similarity index 82%
rename from re2c/src/ir/regexp/encoding/utf16/utf16.cc
rename to re2c/src/ir/re/encoding/utf16/utf16.cc
index 4b0a13bbbaa5c7291f0bde80af8b6dbd91d9d348..fc79176ed5fcc37fe2946111dd6317ab437357d8 100644 (file)
@@ -1,4 +1,4 @@
-#include "src/ir/regexp/encoding/utf16/utf16.h"
+#include "src/ir/re/encoding/utf16/utf16.h"
 
 namespace re2c {
 
similarity index 82%
rename from re2c/src/ir/regexp/encoding/utf16/utf16.h
rename to re2c/src/ir/re/encoding/utf16/utf16.h
index 89cdbdbdd9b13a4b572fdcb4e4d56e85c6924c2a..69d8216c6bbd78aa2f67c7863ceea81798a7c26a 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef _RE2C_IR_REGEXP_ENCODING_UTF16_UTF16_
-#define _RE2C_IR_REGEXP_ENCODING_UTF16_UTF16_
+#ifndef _RE2C_IR_RE_ENCODING_UTF16_UTF16_
+#define _RE2C_IR_RE_ENCODING_UTF16_UTF16_
 
 #include "src/util/c99_stdint.h"
 
@@ -34,4 +34,4 @@ inline uint32_t utf16::trail_surr(rune r)
 
 }  // namespace re2c
 
-#endif // _RE2C_IR_REGEXP_ENCODING_UTF16_UTF16_
+#endif // _RE2C_IR_RE_ENCODING_UTF16_UTF16_
similarity index 97%
rename from re2c/src/ir/regexp/encoding/utf16/utf16_range.cc
rename to re2c/src/ir/re/encoding/utf16/utf16_range.cc
index 51f966bac71ca5f37329b285a3e4653fb9a76994..9c91a2766766195c31f1a332051c915ebd62559b 100644 (file)
@@ -1,5 +1,5 @@
-#include "src/ir/regexp/encoding/utf16/utf16_range.h"
-#include "src/ir/regexp/encoding/range_suffix.h"
+#include "src/ir/re/encoding/utf16/utf16_range.h"
+#include "src/ir/re/encoding/range_suffix.h"
 
 namespace re2c {
 
similarity index 71%
rename from re2c/src/ir/regexp/encoding/utf16/utf16_range.h
rename to re2c/src/ir/re/encoding/utf16/utf16_range.h
index 8a74e8f341056427e8566fb93a1d51c1c9522de9..0a50a95ecaaff6e56f8442ea2e9f212da0f64170 100644 (file)
@@ -1,9 +1,9 @@
-#ifndef _RE2C_IR_REGEXP_ENCODING_UTF16_RANGE_
-#define _RE2C_IR_REGEXP_ENCODING_UTF16_RANGE_
+#ifndef _RE2C_IR_RE_ENCODING_UTF16_RANGE_
+#define _RE2C_IR_RE_ENCODING_UTF16_RANGE_
 
 #include "src/util/c99_stdint.h"
 
-#include "src/ir/regexp/encoding/utf16/utf16.h"
+#include "src/ir/re/encoding/utf16/utf16.h"
 
 namespace re2c {
 
@@ -16,4 +16,4 @@ void UTF16splitByRuneLength(RangeSuffix * & root, utf16::rune l, utf16::rune h);
 
 } // namespace re2c
 
-#endif // _RE2C_IR_REGEXP_ENCODING_UTF16_RANGE_
+#endif // _RE2C_IR_RE_ENCODING_UTF16_RANGE_
similarity index 54%
rename from re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc
rename to re2c/src/ir/re/encoding/utf16/utf16_regexp.cc
index fe80c5ab7805443f715f405970a45f4846b4b08a..e6b0b66a0e0fc1b22e331f5d5658e67347794feb 100644 (file)
@@ -1,22 +1,22 @@
 #include "src/util/c99_stdint.h"
 
-#include "src/ir/regexp/encoding/utf16/utf16_regexp.h"
-#include "src/ir/regexp/encoding/range_suffix.h"
-#include "src/ir/regexp/encoding/utf16/utf16_range.h"
-#include "src/ir/regexp/regexp.h"
+#include "src/ir/re/encoding/utf16/utf16_regexp.h"
+#include "src/ir/re/encoding/range_suffix.h"
+#include "src/ir/re/encoding/utf16/utf16_range.h"
 #include "src/util/range.h"
 
 namespace re2c {
 
-const RegExp *UTF16Symbol(uint32_t l, uint32_t c, utf16::rune r)
+RE *UTF16Symbol(RE::alc_t &alc, utf16::rune r)
 {
        if (r <= utf16::MAX_1WORD_RUNE) {
-               return RegExp::make_sym(l, c, Range::sym(r));
+               return re_sym(alc, Range::sym(r));
        } else {
                const uint32_t ld = utf16::lead_surr(r);
                const uint32_t tr = utf16::trail_surr(r);
-               return RegExp::make_cat(RegExp::make_sym(l, c, Range::sym(ld)),
-                       RegExp::make_sym(l, c, Range::sym(tr)));
+               return re_cat(alc,
+                       re_sym(alc, Range::sym(ld)),
+                       re_sym(alc, Range::sym(tr)));
        }
 }
 
@@ -26,12 +26,12 @@ const RegExp *UTF16Symbol(uint32_t l, uint32_t c, utf16::rune r)
  * them. We store partially built range in suffix tree, which
  * allows to eliminate common suffixes while building.
  */
-const RegExp *UTF16Range(uint32_t l, uint32_t c, const Range * r)
+RE *UTF16Range(RE::alc_t &alc, const Range *r)
 {
        RangeSuffix * root = NULL;
        for (; r != NULL; r = r->next ())
                UTF16splitByRuneLength(root, r->lower (), r->upper () - 1);
-       return to_regexp(l, c, root);
+       return to_regexp(alc, root);
 }
 
 } // namespace re2c
diff --git a/re2c/src/ir/re/encoding/utf16/utf16_regexp.h b/re2c/src/ir/re/encoding/utf16/utf16_regexp.h
new file mode 100644 (file)
index 0000000..27e1446
--- /dev/null
@@ -0,0 +1,16 @@
+#ifndef _RE2C_IR_RE_ENCODING_UTF16_REGEXP_
+#define _RE2C_IR_RE_ENCODING_UTF16_REGEXP_
+
+#include "src/ir/re/re.h"
+#include "src/ir/re/encoding/utf16/utf16.h"
+
+namespace re2c {
+
+class Range;
+
+RE *UTF16Symbol(RE::alc_t &alc, utf16::rune r);
+RE *UTF16Range(RE::alc_t &alc, const Range *r);
+
+} // namespace re2c
+
+#endif // _RE2C_IR_RE_ENCODING_UTF16_REGEXP_
similarity index 97%
rename from re2c/src/ir/regexp/encoding/utf8/utf8.cc
rename to re2c/src/ir/re/encoding/utf8/utf8.cc
index dd4b59ef2c7d30d793f803f5cc6ac1ed7c85c652..2e8c74e43f732fca3051d83428b264ac5d0019c3 100644 (file)
@@ -1,4 +1,4 @@
-#include "src/ir/regexp/encoding/utf8/utf8.h"
+#include "src/ir/re/encoding/utf8/utf8.h"
 
 namespace re2c {
 
similarity index 88%
rename from re2c/src/ir/regexp/encoding/utf8/utf8.h
rename to re2c/src/ir/re/encoding/utf8/utf8.h
index 0ca3142289b66e803d8528bf177d40552838b300..f1e0d8f1eb77b0130d691731f65a4acdd68883eb 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef _RE2C_IR_REGEXP_ENCODING_UTF8_UTF8_
-#define _RE2C_IR_REGEXP_ENCODING_UTF8_UTF8_
+#ifndef _RE2C_IR_RE_ENCODING_UTF8_UTF8_
+#define _RE2C_IR_RE_ENCODING_UTF8_UTF8_
 
 #include "src/util/c99_stdint.h"
 
@@ -45,4 +45,4 @@ public:
 
 }  // namespace re2c
 
-#endif // _RE2C_IR_REGEXP_ENCODING_UTF8_UTF8_
+#endif // _RE2C_IR_RE_ENCODING_UTF8_UTF8_
similarity index 96%
rename from re2c/src/ir/regexp/encoding/utf8/utf8_range.cc
rename to re2c/src/ir/re/encoding/utf8/utf8_range.cc
index d3d256cf887efb70f38deb76a6ad2cf75eeec893..075c585c76673acee9cccf666f8170d4492f3a06 100644 (file)
@@ -1,5 +1,5 @@
-#include "src/ir/regexp/encoding/utf8/utf8_range.h"
-#include "src/ir/regexp/encoding/range_suffix.h"
+#include "src/ir/re/encoding/utf8/utf8_range.h"
+#include "src/ir/re/encoding/range_suffix.h"
 
 namespace re2c {
 
similarity index 65%
rename from re2c/src/ir/regexp/encoding/utf8/utf8_range.h
rename to re2c/src/ir/re/encoding/utf8/utf8_range.h
index 1ce46132fb7a6ede8ebc6edc5c2eb660a776ab5b..4c53491b7759928462984c137a3ba93712c62625 100644 (file)
@@ -1,9 +1,9 @@
-#ifndef _RE2C_IR_REGEXP_ENCODING_UTF8_RANGE_
-#define _RE2C_IR_REGEXP_ENCODING_UTF8_RANGE_
+#ifndef _RE2C_IR_RE_ENCODING_UTF8_RANGE_
+#define _RE2C_IR_RE_ENCODING_UTF8_RANGE_
 
 #include "src/util/c99_stdint.h"
 
-#include "src/ir/regexp/encoding/utf8/utf8.h"
+#include "src/ir/re/encoding/utf8/utf8.h"
 
 namespace re2c {
 
@@ -15,4 +15,4 @@ void UTF8splitByRuneLength(RangeSuffix * & p, utf8::rune l, utf8::rune h);
 
 } // namespace re2c
 
-#endif // _RE2C_IR_REGEXP_ENCODING_UTF8_RANGE_
+#endif // _RE2C_IR_RE_ENCODING_UTF8_RANGE_
similarity index 56%
rename from re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc
rename to re2c/src/ir/re/encoding/utf8/utf8_regexp.cc
index 7ac062b2207fc4b6b8034f0d129c383a929d1230..64583410b02a3c3f03365aae239ba3446be2afae 100644 (file)
@@ -1,20 +1,19 @@
 #include "src/util/c99_stdint.h"
 
-#include "src/ir/regexp/encoding/utf8/utf8_regexp.h"
-#include "src/ir/regexp/encoding/range_suffix.h"
-#include "src/ir/regexp/encoding/utf8/utf8_range.h"
-#include "src/ir/regexp/regexp.h"
+#include "src/ir/re/encoding/utf8/utf8_regexp.h"
+#include "src/ir/re/encoding/range_suffix.h"
+#include "src/ir/re/encoding/utf8/utf8_range.h"
 #include "src/util/range.h"
 
 namespace re2c {
 
-const RegExp * UTF8Symbol(uint32_t l, uint32_t c, utf8::rune r)
+RE *UTF8Symbol(RE::alc_t &alc, utf8::rune r)
 {
        uint32_t chars[utf8::MAX_RUNE_LENGTH];
        const uint32_t chars_count = utf8::rune_to_bytes(chars, r);
-       const RegExp *re = RegExp::make_sym(l, c, Range::sym(chars[0]));
+       RE *re = re_sym(alc, Range::sym(chars[0]));
        for (uint32_t i = 1; i < chars_count; ++i) {
-               re = RegExp::make_cat(re, RegExp::make_sym(l, c, Range::sym(chars[i])));
+               re = re_cat(alc, re, re_sym(alc, Range::sym(chars[i])));
        }
        return re;
 }
@@ -25,12 +24,12 @@ const RegExp * UTF8Symbol(uint32_t l, uint32_t c, utf8::rune r)
  * them. We store partially built range in suffix tree, which
  * allows to eliminate common suffixes while building.
  */
-const RegExp * UTF8Range(uint32_t l, uint32_t c, const Range * r)
+RE *UTF8Range(RE::alc_t &alc, const Range *r)
 {
        RangeSuffix * root = NULL;
        for (; r != NULL; r = r->next ())
                UTF8splitByRuneLength(root, r->lower (), r->upper () - 1);
-       return to_regexp (l, c, root);
+       return to_regexp(alc, root);
 }
 
 } // namespace re2c
diff --git a/re2c/src/ir/re/encoding/utf8/utf8_regexp.h b/re2c/src/ir/re/encoding/utf8/utf8_regexp.h
new file mode 100644 (file)
index 0000000..4b7e07f
--- /dev/null
@@ -0,0 +1,16 @@
+#ifndef _RE2C_IR_RE_ENCODING_UTF8_REGEXP_
+#define _RE2C_IR_RE_ENCODING_UTF8_REGEXP_
+
+#include "src/ir/re/re.h"
+#include "src/ir/re/encoding/utf8/utf8.h"
+
+namespace re2c {
+
+class Range;
+
+// UTF-8 builders for RE nodes; 'alc' is forwarded to the re_sym(),
+// re_cat() and to_regexp() helpers that create the nodes.
+
+// Concatenation of single-byte symbols, one for each byte that
+// utf8::rune_to_bytes() produces for 'r'.
+RE *UTF8Symbol(RE::alc_t &alc, utf8::rune r);
+// Expression for the range list 'r': every interval of the list is passed
+// to UTF8splitByRuneLength() to grow a RangeSuffix tree, and that tree is
+// then converted with to_regexp().
+RE *UTF8Range(RE::alc_t &alc, const Range *r);
+
+} // namespace re2c
+
+#endif // _RE2C_IR_RE_ENCODING_UTF8_REGEXP_
index 7cd336b28ef50f93a8fd24b55bffbc8559771083..cd6d8acbdcfd0bc23b2af2584d9484aacae9f0c0 100644 (file)
@@ -68,9 +68,9 @@ static void find_fixed_tags(RE *re, std::vector<Tag> &tags,
        }
 }
 
-void find_fixed_tags(RESpec &spec, const opt_t *opts)
+void find_fixed_tags(RESpec &spec)
 {
-       const bool generic = opts->input_api == INPUT_CUSTOM;
+       const bool generic = spec.opts->input_api == INPUT_CUSTOM;
        std::vector<RE*>::iterator
                i = spec.res.begin(),
                e = spec.res.end();
index b29ba17b8135cc00f6f98af957e5ec2e5b50d03b..61197f26c444a85aeae4c630f9e6d62914acb167 100644 (file)
@@ -29,13 +29,13 @@ static bool nullable(const RESpec &spec, const RE *re, bool &trail)
  * (including rules with nonempty trailing context)
  * false positives on partially self-shadowed rules like [^]?
  */
-void warn_nullable(const RESpec &spec, const std::string &cond, Warn &warn)
+void warn_nullable(const RESpec &spec, const std::string &cond)
 {
        const size_t nre = spec.res.size();
        for (size_t i = 0; i < nre; ++i) {
                bool trail = false;
                if (nullable(spec, spec.res[i], trail)) {
-                       warn.match_empty_string(spec.rules[i].code->fline, cond);
+                       spec.warn.match_empty_string(spec.rules[i].code->fline, cond);
                }
        }
 }
index e0b30c80f03283242e3755788e46fd47271053fe..03ea81323a985d33f8a1f996d19dd5f84c5fa8ea 100644 (file)
@@ -5,7 +5,8 @@
 
 #include "src/conf/opt.h"
 #include "src/ir/rule.h"
-#include "src/ir/regexp/regexp.h"
+#include "src/parse/regexp.h"
+#include "src/util/forbid_copy.h"
 #include "src/util/range.h"
 #include "src/util/slab_allocator.h"
 
@@ -45,14 +46,17 @@ struct RESpec
        std::vector<uint32_t> &charset;
        std::vector<Tag> &tags;
        std::valarray<Rule> &rules;
+       const opt_t *opts;
+       Warn &warn;
 
-       explicit RESpec(const std::vector<RegExpRule> &ast);
+       explicit RESpec(const std::vector<RegExpRule> &ast, const opt_t *o, Warn &w);
+       FORBID_COPY(RESpec);
 };
 
-void split_charset(RESpec &spec, const opt_t *opts);
-void find_fixed_tags(RESpec &spec, const opt_t *opts);
+void split_charset(RESpec &spec);
+void find_fixed_tags(RESpec &spec);
 void insert_default_tags(RESpec &spec);
-void warn_nullable(const RESpec &spec, const std::string &cond, Warn &warn);
+void warn_nullable(const RESpec &spec, const std::string &cond);
 
 inline RE *re_nil(RE::alc_t &alc)
 {
@@ -73,6 +77,9 @@ inline RE *re_alt(RE::alc_t &alc, RE *x, RE *y)
 {
        if (!x) return y;
        if (!y) return x;
+       if (x->type == RE::SYM && y->type == RE::SYM) {
+               return re_sym(alc, Range::add(x->sym, y->sym));
+       }
 
        RE *z = alc.alloct<RE>(1);
        z->type = RE::ALT;
@@ -112,6 +119,10 @@ inline RE *re_tag(RE::alc_t &alc, size_t i, bool b)
        return x;
 }
 
+RE *re_schar(RE::alc_t &alc, uint32_t line, uint32_t column, uint32_t c, const opt_t *opts);
+RE *re_ichar(RE::alc_t &alc, uint32_t line, uint32_t column, uint32_t c, const opt_t *opts);
+RE *re_class(RE::alc_t &alc, uint32_t line, uint32_t column, const Range *r, const opt_t *opts, Warn &warn);
+
 } // namespace re2c
 
 #endif // _RE2C_IR_RE_RE_
index 66cd58da2485e5ad2f0cbbe2b5a071bbf88de945..9f50d447c26b03669f37bad04ac3dd2b5f3671f5 100644 (file)
@@ -38,7 +38,7 @@ static void split(const RE* re, std::set<uint32_t> &cs)
  * Don't forget to include zero and upper bound, even if they
  * do not explicitely apper in ranges.
  */
-void split_charset(RESpec &spec, const opt_t *opts)
+void split_charset(RESpec &spec)
 {
        std::set<uint32_t> cs;
 
@@ -49,7 +49,7 @@ void split_charset(RESpec &spec, const opt_t *opts)
                split(*i, cs);
        }
        cs.insert(0);
-       cs.insert(opts->encoding.nCodeUnits());
+       cs.insert(spec.opts->encoding.nCodeUnits());
 
        spec.charset.insert(spec.charset.end(), cs.begin(), cs.end());
 }
diff --git a/re2c/src/ir/regexp/encoding/range_suffix.cc b/re2c/src/ir/regexp/encoding/range_suffix.cc
deleted file mode 100644 (file)
index 6ce03e8..0000000
+++ /dev/null
@@ -1,35 +0,0 @@
-#include "src/ir/regexp/encoding/range_suffix.h"
-#include "src/ir/regexp/regexp.h"
-#include "src/util/range.h"
-
-namespace re2c {
-
-static const RegExp *emit(uint32_t l, uint32_t c, RangeSuffix * p, const RegExp * re);
-
-free_list<RangeSuffix *> RangeSuffix::freeList;
-
-const RegExp *to_regexp(uint32_t l, uint32_t c, RangeSuffix * p)
-{
-       return p ? emit(l, c, p, NULL)
-               : RegExp::make_sym(l, c, NULL);
-}
-
-/*
- * Build regexp from suffix tree.
- */
-const RegExp *emit(uint32_t l, uint32_t c, RangeSuffix * p, const RegExp * re)
-{
-       if (p == NULL) {
-               return re;
-       } else {
-               const RegExp *regexp = NULL;
-               for (; p != NULL; p = p->next) {
-                       const RegExp *re1 = RegExp::make_cat(
-                               RegExp::make_sym(l, c, Range::ran(p->l, p->h + 1)), re);
-                       regexp = RegExp::make_alt(regexp, emit(l, c, p->child, re1));
-               }
-               return regexp;
-       }
-}
-
-} // namespace re2c
diff --git a/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.h b/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.h
deleted file mode 100644 (file)
index 87dbf8d..0000000
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef _RE2C_IR_REGEXP_ENCODING_UTF16_REGEXP_
-#define _RE2C_IR_REGEXP_ENCODING_UTF16_REGEXP_
-
-#include "src/ir/regexp/encoding/utf16/utf16.h"
-
-namespace re2c {
-
-class Range;
-struct RegExp;
-
-const RegExp * UTF16Symbol(uint32_t l, uint32_t c, utf16::rune r);
-const RegExp * UTF16Range(uint32_t l, uint32_t c, const Range * r);
-
-} // namespace re2c
-
-#endif // _RE2C_IR_REGEXP_ENCODING_UTF16_REGEXP_
diff --git a/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.h b/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.h
deleted file mode 100644 (file)
index ae638d4..0000000
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef _RE2C_IR_REGEXP_ENCODING_UTF8_REGEXP_
-#define _RE2C_IR_REGEXP_ENCODING_UTF8_REGEXP_
-
-#include "src/ir/regexp/encoding/utf8/utf8.h"
-
-namespace re2c {
-
-class Range;
-struct RegExp;
-
-const RegExp * UTF8Symbol(uint32_t l, uint32_t c, utf8::rune r);
-const RegExp * UTF8Range(uint32_t l, uint32_t c, const Range * r);
-
-} // namespace re2c
-
-#endif // _RE2C_IR_REGEXP_ENCODING_UTF8_REGEXP_
diff --git a/re2c/src/ir/regexp/regexp.cc b/re2c/src/ir/regexp/regexp.cc
deleted file mode 100644 (file)
index 95b4b23..0000000
+++ /dev/null
@@ -1,123 +0,0 @@
-#include <limits>
-#include <stddef.h>
-
-#include "src/conf/msg.h"
-#include "src/conf/opt.h"
-#include "src/ir/regexp/empty_class_policy.h"
-#include "src/ir/regexp/encoding/case.h"
-#include "src/ir/regexp/encoding/enc.h"
-#include "src/ir/regexp/encoding/utf16/utf16_regexp.h"
-#include "src/ir/regexp/encoding/utf8/utf8_regexp.h"
-#include "src/ir/regexp/regexp.h"
-#include "src/parse/scanner.h"
-#include "src/util/range.h"
-
-namespace re2c
-{
-
-free_list<RegExp*> RegExp::flist;
-
-const uint32_t RegExp::MANY = std::numeric_limits<uint32_t>::max();
-
-const RegExp *RegExp::make_schar(uint32_t line, uint32_t column, uint32_t c, Opt &opts)
-{
-       if (!opts->encoding.encode(c)) {
-               fatal_error(line, column, "bad code point: '0x%X'", c);
-       }
-       switch (opts->encoding.type ()) {
-               case Enc::UTF16: return UTF16Symbol(line, column, c);
-               case Enc::UTF8:  return UTF8Symbol(line, column, c);
-               default:         return RegExp::make_sym(line, column, Range::sym(c));
-       }
-}
-
-const RegExp *RegExp::make_ichar(uint32_t line, uint32_t column, uint32_t c, Opt &opts)
-{
-       if (is_alpha(c)) {
-               const RegExp *l = RegExp::make_schar(line, column, to_lower_unsafe(c), opts);
-               const RegExp *u = RegExp::make_schar(line, column, to_upper_unsafe(c), opts);
-               return RegExp::make_alt(l, u);
-       } else {
-               return RegExp::make_schar(line, column, c, opts);
-       }
-}
-
-const RegExp *RegExp::make_class(uint32_t line, uint32_t column, const Range *r, Opt &opts, Warn &warn)
-{
-       if (!r) {
-               switch (opts->empty_class_policy) {
-                       case EMPTY_CLASS_MATCH_EMPTY:
-                               warn.empty_class(line);
-                               return RegExp::make_nil(line, column);
-                       case EMPTY_CLASS_MATCH_NONE:
-                               warn.empty_class(line);
-                               break;
-                       case EMPTY_CLASS_ERROR:
-                               fatal_error(line, column, "empty character class");
-                               break;
-               }
-       }
-
-       switch (opts->encoding.type()) {
-               case Enc::UTF16: return UTF16Range(line, column, r);
-               case Enc::UTF8:  return UTF8Range(line, column, r);
-               default:         return RegExp::make_sym(line, column, r);
-       }
-}
-
-const RegExp *RegExp::make_diff(const RegExp *re1, const RegExp *re2, Opt &opts, Warn &warn)
-{
-       if (re1 && re2
-               && re1->type == RegExp::SYM
-               && re2->type == RegExp::SYM) {
-               return RegExp::make_class(re1->line, re1->column,
-                       Range::sub(re1->sym, re2->sym), opts, warn);
-       }
-       fatal_error(re1->line, re1->column, "can only difference char sets");
-       return NULL;
-}
-
-const RegExp *RegExp::make_dot(uint32_t line, uint32_t column, Opt &opts, Warn &warn)
-{
-       uint32_t c = '\n';
-       if (!opts->encoding.encode(c)) {
-               fatal_error(line, column, "bad code point: '0x%X'", c);
-       }
-       return RegExp::make_class(line, column,
-               Range::sub(opts->encoding.fullRange(), Range::sym(c)), opts, warn);
-}
-
-/*
- * Create a byte range that includes all possible input characters.
- * This may include characters, which do not map to any valid symbol
- * in current encoding. For encodings, which directly map symbols to
- * input characters (ASCII, EBCDIC, UTF-32), it equals [^]. For other
- * encodings (UTF-16, UTF-8), [^] and this range are different.
- *
- * Also note that default range doesn't respect encoding policy
- * (the way invalid code points are treated).
- */
-const RegExp *RegExp::make_default(uint32_t line, uint32_t column, Opt &opts)
-{
-       return RegExp::make_sym(line, column, Range::ran(0,
-               opts->encoding.nCodeUnits()));
-}
-
-bool RegExp::need_wrap(const RegExp *re)
-{
-       switch (re->type) {
-               case RegExp::ITER:
-               case RegExp::NIL:
-               case RegExp::SYM:
-               case RegExp::TAG:
-               case RegExp::CAP:
-                       return false;
-               case RegExp::ALT:
-               case RegExp::CAT:
-               case RegExp::REF:
-               default:
-                       return true;
-       }
-}
-
-} // namespace re2c
index b235555eed3ed4bed8c996954338ff1ee8205d6c..a43f75fd0e81b038ad1b5b8a289ca4ecca697e5f 100644 (file)
@@ -7,7 +7,7 @@
 #include "src/codegen/bitmap.h"
 #include "src/codegen/emit.h"
 #include "src/codegen/output.h"
-#include "src/ir/regexp/encoding/enc.h"
+#include "src/ir/re/encoding/enc.h"
 #include "src/ir/adfa/adfa.h"
 #include "src/ir/skeleton/skeleton.h"
 
index 7846538a512c672f52b378b3515e6230b73f8b24..8e84659d3edf41820b3f1b3ebb81c871692d8502 100644 (file)
@@ -10,7 +10,7 @@
 #include <vector>
 
 #include "src/conf/msg.h"
-#include "src/ir/regexp/encoding/enc.h"
+#include "src/ir/re/encoding/enc.h"
 #include "src/ir/skeleton/path.h"
 #include "src/ir/skeleton/skeleton.h"
 #include "src/util/u32lim.h"
index d85aa98bbf4137e08a6d308b306c120063bd688f..1298515fc3897f6a4d899da9a263512494682619 100644 (file)
@@ -13,7 +13,6 @@
 
 #include "src/codegen/bitmap.h"
 #include "src/conf/opt.h"
-#include "src/ir/regexp/regexp.h"
 #include "src/ir/rule.h"
 #include "src/ir/tcmd.h"
 #include "src/util/local_increment.h"
index 0c33a3b4f3884a51e6636d923476ed406b5f08ed..464eeda831a0f2d9a247d6de3f0dce9b65d92d2c 100644 (file)
@@ -6,10 +6,10 @@
 #include <string>
 
 #include "src/codegen/output.h"
-#include "src/ir/regexp/encoding/enc.h"
-#include "src/ir/regexp/regexp.h"
+#include "src/ir/re/encoding/enc.h"
 #include "src/parse/extop.h"
 #include "src/parse/input.h"
+#include "src/parse/regexp.h"
 #include "src/parse/scanner.h"
 #include "src/parse/parser.h" // needed by "y.tab.h"
 #include "src/parse/unescape.h"
@@ -348,8 +348,8 @@ start:
                                                                c = static_cast<uint8_t>(*s),
                                                                column = static_cast<uint32_t>(s - pos);
                                                        r = RegExp::make_cat(r, casing
-                                                               ? RegExp::make_ichar(cline, column, c, opts)
-                                                               : RegExp::make_schar(cline, column, c, opts));
+                                                               ? RegExp::make_ichar(cline, column, c)
+                                                               : RegExp::make_schar(cline, column, c));
                                                }
                                                yylval.regexp = r ? r : RegExp::make_nil(cline, get_column());
                                                return TOKEN_REGEXP;
@@ -357,7 +357,7 @@ start:
                                }
 
        "."                     {
-                                       yylval.regexp = RegExp::make_dot(cline, get_column(), opts, warn);
+                                       yylval.regexp = RegExp::make_dot(cline, get_column());
                                        return TOKEN_REGEXP;
                                }
 
@@ -580,7 +580,7 @@ end:
        if (neg) {
                r = Range::sub(opts->encoding.fullRange(), r);
        }
-       return RegExp::make_class(cline, column, r, opts, warn);
+       return RegExp::make_class(cline, column, r);
 }
 
 uint32_t Scanner::lex_cls_chr()
@@ -654,8 +654,8 @@ const RegExp *Scanner::lex_str(char quote, bool casing)
                        return r ? r : RegExp::make_nil(cline, get_column());
                }
                r = RegExp::make_cat(r, casing
-                       ? RegExp::make_ichar(cline, get_column(), c, opts)
-                       : RegExp::make_schar(cline, get_column(), c, opts));
+                       ? RegExp::make_ichar(cline, get_column(), c)
+                       : RegExp::make_schar(cline, get_column(), c));
        }
 }
 
index 2e25a5c596070789d92928db49a44930d1839935..6af3c466582f2897c22cc77ffc34ebe4ce9dbf5f 100644 (file)
@@ -2,7 +2,7 @@
 #include <string>
 
 #include "src/codegen/output.h"
-#include "src/ir/regexp/encoding/enc.h"
+#include "src/ir/re/encoding/enc.h"
 #include "src/parse/scanner.h"
 #include "src/util/s_to_n32_unsafe.h"
 
index ec23cf4cf76b03fa388e66ea1004fcb291f9433b..7153fb55ff34ed87e08e1584987c0f4f2fb9adc0 100644 (file)
@@ -5,7 +5,7 @@
 #include <string>
 
 #include "src/codegen/output.h"
-#include "src/ir/regexp/regexp.h"
+#include "src/parse/regexp.h"
 #include "src/parse/scanner.h"
 #include "src/util/smart_ptr.h"
 
index 4900343f96f37e4e35675cd57cb107c9fbc1317a..c85b20aba0b3f0d13897dba1add41c1c0e2118d9 100644 (file)
 #include "src/codegen/output.h"
 #include "src/ir/compile.h"
 #include "src/ir/adfa/adfa.h"
-#include "src/ir/regexp/encoding/enc.h"
-#include "src/ir/regexp/encoding/range_suffix.h"
-#include "src/ir/regexp/regexp.h"
+#include "src/ir/re/encoding/enc.h"
+#include "src/ir/re/encoding/range_suffix.h"
 #include "src/ir/skeleton/skeleton.h"
 #include "src/parse/extop.h"
 #include "src/parse/parser.h"
+#include "src/parse/regexp.h"
 #include "src/parse/scanner.h"
 #include "src/util/free_list.h"
 #include "src/util/range.h"
@@ -116,7 +116,7 @@ static void check(const specs_t &specs, bool cflag)
        }
 }
 
-static void prepare(specs_t &specs, const Scanner &in)
+static void prepare(specs_t &specs)
 {
        specs_t::iterator i, b = specs.begin(), e = specs.end();
 
@@ -142,7 +142,7 @@ static void prepare(specs_t &specs, const Scanner &in)
        for (i = b; i != e; ++i) {
                if (!i->defs.empty()) {
                        const Code *c = i->defs[0];
-                       const RegExp *r = RegExp::make_default(c->fline, 0, in.opts);
+                       const RegExp *r = RegExp::make_default(c->fline, 0);
                        i->rules.push_back(RegExpRule(r, c));
                }
        }
@@ -329,7 +329,7 @@ diff:
                }
        |       diff '\\' term
                {
-                       $$ = RegExp::make_diff($1, $3, context.input.opts, context.input.warn);
+                       $$ = RegExp::make_diff($1, $3);
                }
 ;
 
@@ -460,7 +460,7 @@ void parse(Scanner &input, Output & o)
                // compile regular expressions to automata
                if (mode != Scanner::Reuse) {
                        check(specs, opts->cFlag);
-                       prepare(specs, input);
+                       prepare(specs);
                        o.source.block().line = input.get_cline();
                        for (specs_t::const_iterator i = specs.begin(); i != specs.end(); ++i) {
                                dfas.push_back(compile(*i, o));
diff --git a/re2c/src/parse/regexp.cc b/re2c/src/parse/regexp.cc
new file mode 100644 (file)
index 0000000..3aea6c3
--- /dev/null
@@ -0,0 +1,34 @@
+#include <assert.h>
+#include <limits>
+
+#include "src/parse/regexp.h"
+
+namespace re2c
+{
+
+free_list<RegExp*> RegExp::flist;
+
+const uint32_t RegExp::MANY = std::numeric_limits<uint32_t>::max();
+
+/*
+ * Classifies an AST node by its type: ITER, NIL, SCHAR, ICHAR, CLASS,
+ * DOT, DEFAULT, TAG and CAP yield false; ALT, CAT, DIFF and REF yield
+ * true. 're' is dereferenced and must not be NULL.
+ * NOTE(review): presumably "wrap" means parenthesizing the node when it
+ * is printed as a subexpression - confirm at the call sites.
+ */
+bool RegExp::need_wrap(const RegExp *re)
+{
+       // No 'default' label: every enumerator is listed explicitly
+       // (presumably so that the compiler can report a forgotten one).
+       switch (re->type) {
+               case RegExp::ITER:
+               case RegExp::NIL:
+               case RegExp::SCHAR:
+               case RegExp::ICHAR:
+               case RegExp::CLASS:
+               case RegExp::DOT:
+               case RegExp::DEFAULT:
+               case RegExp::TAG:
+               case RegExp::CAP:
+                       return false;
+               case RegExp::ALT:
+               case RegExp::CAT:
+               case RegExp::DIFF:
+               case RegExp::REF:
+                       return true;
+       }
+       // Reached only if 'type' holds a value outside of the enumeration.
+       assert(false);
+       // With NDEBUG the assert above expands to nothing; return explicitly
+       // so that control never falls off the end of a non-void function.
+       // 'true' is the conservative answer for an unknown node.
+       return true;
+}
+
+} // namespace re2c
similarity index 68%
rename from re2c/src/ir/regexp/regexp.h
rename to re2c/src/parse/regexp.h
index bd08bc0315c142693f3075ebcc812a6966f388a2..f0fa408ef9e6a81c701382db16821e6f054d8175 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef _RE2C_IR_REGEXP_REGEXP_
-#define _RE2C_IR_REGEXP_REGEXP_
+#ifndef _RE2C_PARSE_REGEXP_
+#define _RE2C_PARSE_REGEXP_
 
 #include "src/util/c99_stdint.h"
 #include <set>
@@ -21,10 +21,12 @@ struct RegExp
        static free_list<RegExp*> flist;
        static const uint32_t MANY;
 
-       enum type_t {NIL, SYM, ALT, CAT, ITER, TAG, CAP, REF} type;
+       enum type_t {NIL, SCHAR, ICHAR, CLASS, DOT, DEFAULT, ALT, CAT, ITER, DIFF, TAG, CAP, REF} type;
        union
        {
-               const Range *sym;
+               uint32_t schar;
+               uint32_t ichar;
+               const Range *cls;
                struct
                {
                        const RegExp *re1;
@@ -41,6 +43,11 @@ struct RegExp
                        uint32_t min;
                        uint32_t max;
                } iter;
+               struct
+               {
+                       const RegExp *re1;
+                       const RegExp *re2;
+               } diff;
                const std::string *tag;
                const RegExp *cap;
                struct
@@ -56,20 +63,36 @@ struct RegExp
        {
                return new RegExp(l, c, NIL);
        }
-       static const RegExp *make_sym(uint32_t l, uint32_t c, const Range *r)
+       // Leaf node of type SCHAR: 'x' is stored unchanged in 'schar',
+       // 'l' and 'c' are handed to the RegExp constructor. The lexer
+       // creates it for string characters when 'casing' is not set.
+       static const RegExp *make_schar(uint32_t l, uint32_t c, uint32_t x)
+       {
+               RegExp *const node = new RegExp(l, c, SCHAR);
+               node->schar = x;
+               return node;
+       }
+       // Leaf node of type ICHAR: 'x' is stored unchanged in 'ichar',
+       // 'l' and 'c' are handed to the RegExp constructor. The lexer
+       // creates it for string characters when 'casing' is set.
+       static const RegExp *make_ichar(uint32_t l, uint32_t c, uint32_t x)
+       {
+               RegExp *const node = new RegExp(l, c, ICHAR);
+               node->ichar = x;
+               return node;
+       }
+       static const RegExp *make_class(uint32_t l, uint32_t c, const Range *r)
        {
-               RegExp *re = new RegExp(l, c, SYM);
-               re->sym = r;
+               RegExp *re = new RegExp(l, c, CLASS);
+               re->cls = r;
                return re;
        }
+       // Leaf node of type DOT; it carries no payload, only the
+       // position 'l', 'c' given to the RegExp constructor.
+       static const RegExp *make_dot(uint32_t l, uint32_t c)
+       {
+               RegExp *const node = new RegExp(l, c, DOT);
+               return node;
+       }
+       // Leaf node of type DEFAULT; it carries no payload, only the
+       // position 'l', 'c' given to the RegExp constructor.
+       static const RegExp *make_default(uint32_t l, uint32_t c)
+       {
+               RegExp *const node = new RegExp(l, c, DEFAULT);
+               return node;
+       }
        static const RegExp *make_alt(const RegExp *r1, const RegExp *r2)
        {
                if (!r1) return r2;
                if (!r2) return r1;
-               if (r1->type == RegExp::SYM && r2->type == RegExp::SYM) {
-                       return RegExp::make_sym(r1->line, r1->column,
-                               Range::add(r1->sym, r2->sym));
-               }
                RegExp *re = new RegExp(r1->line, r1->column, ALT);
                re->alt.re1 = r1;
                re->alt.re2 = r2;
@@ -92,6 +115,13 @@ struct RegExp
                re->iter.max = m;
                return re;
        }
+       // Node of type DIFF with operands 'r1' (left) and 'r2' (right).
+       // Line and column are copied from 'r1', which therefore must not
+       // be NULL. The operands are stored as given; nothing about their
+       // types is checked here.
+       static const RegExp *make_diff(const RegExp *r1, const RegExp *r2)
+       {
+               RegExp *re = new RegExp(r1->line, r1->column, DIFF);
+               // write through the union member that belongs to DIFF
+               // rather than through the identically shaped 'cat'
+               re->diff.re1 = r1;
+               re->diff.re2 = r2;
+               return re;
+       }
        static const RegExp *make_tag(uint32_t l, uint32_t c, const std::string *t)
        {
                RegExp *re = new RegExp(l, c, TAG);
@@ -120,12 +150,6 @@ struct RegExp
                        delete ref.name;
                }
        }
-       static const RegExp *make_schar(uint32_t line, uint32_t column, uint32_t c, Opt &opts);
-       static const RegExp *make_ichar(uint32_t line, uint32_t column, uint32_t c, Opt &opts);
-       static const RegExp *make_class(uint32_t line, uint32_t column, const Range *r, Opt &opts, Warn &warn);
-       static const RegExp *make_diff(const RegExp * e1, const RegExp * e2, Opt &opts, Warn &warn);
-       static const RegExp *make_dot(uint32_t line, uint32_t column, Opt &opts, Warn &warn);
-       static const RegExp *make_default(uint32_t line, uint32_t column, Opt &opts);
        static bool need_wrap(const RegExp *re);
 
 private:
@@ -149,4 +173,4 @@ struct RegExpRule
 
 } // end namespace re2c
 
-#endif // _RE2C_IR_REGEXP_REGEXP_
+#endif // _RE2C_PARSE_REGEXP_