src/ir/dfa/tagpool.h \
src/ir/dfa/tagtree.h \
src/ir/nfa/nfa.h \
+ src/ir/re/encoding/case.h \
+ src/ir/re/encoding/enc.h \
+ src/ir/re/encoding/range_suffix.h \
+ src/ir/re/encoding/utf8/utf8.h \
+ src/ir/re/encoding/utf8/utf8_regexp.h \
+ src/ir/re/encoding/utf8/utf8_range.h \
+ src/ir/re/encoding/utf16/utf16_range.h \
+ src/ir/re/encoding/utf16/utf16_regexp.h \
+ src/ir/re/encoding/utf16/utf16.h \
+ src/ir/re/empty_class_policy.h \
src/ir/re/re.h \
- src/ir/regexp/encoding/case.h \
- src/ir/regexp/encoding/enc.h \
- src/ir/regexp/encoding/range_suffix.h \
- src/ir/regexp/encoding/utf8/utf8.h \
- src/ir/regexp/encoding/utf8/utf8_regexp.h \
- src/ir/regexp/encoding/utf8/utf8_range.h \
- src/ir/regexp/encoding/utf16/utf16_range.h \
- src/ir/regexp/encoding/utf16/utf16_regexp.h \
- src/ir/regexp/encoding/utf16/utf16.h \
- src/ir/regexp/empty_class_policy.h \
- src/ir/regexp/regexp.h \
src/ir/compile.h \
src/ir/rule.h \
src/ir/tag.h \
src/parse/extop.h \
src/parse/input.h \
src/parse/parser.h \
+ src/parse/regexp.h \
src/parse/rules.h \
src/parse/scanner.h \
src/parse/unescape.h \
src/ir/nfa/dump.cc \
src/ir/nfa/estimate_size.cc \
src/ir/nfa/re_to_nfa.cc \
- src/ir/re/ast_to_re.cc \
- src/ir/re/default_tags.cc \
- src/ir/re/fixed_tags.cc \
- src/ir/re/nullable.cc \
- src/ir/re/split_charset.cc \
src/ir/adfa/adfa.cc \
src/ir/adfa/dump.cc \
src/ir/adfa/prepare.cc \
src/ir/dfa/minimization.cc \
src/ir/dfa/tagpool.cc \
src/ir/dfa/tagtree.cc \
- src/ir/regexp/encoding/enc.cc \
- src/ir/regexp/encoding/range_suffix.cc \
- src/ir/regexp/encoding/utf8/utf8_regexp.cc \
- src/ir/regexp/encoding/utf8/utf8_range.cc \
- src/ir/regexp/encoding/utf8/utf8.cc \
- src/ir/regexp/encoding/utf16/utf16_regexp.cc \
- src/ir/regexp/encoding/utf16/utf16.cc \
- src/ir/regexp/encoding/utf16/utf16_range.cc \
- src/ir/regexp/regexp.cc \
+ src/ir/re/encoding/enc.cc \
+ src/ir/re/encoding/range_suffix.cc \
+ src/ir/re/encoding/utf8/utf8_regexp.cc \
+ src/ir/re/encoding/utf8/utf8_range.cc \
+ src/ir/re/encoding/utf8/utf8.cc \
+ src/ir/re/encoding/utf16/utf16_regexp.cc \
+ src/ir/re/encoding/utf16/utf16.cc \
+ src/ir/re/encoding/utf16/utf16_range.cc \
+ src/ir/re/ast_to_re.cc \
+ src/ir/re/default_tags.cc \
+ src/ir/re/fixed_tags.cc \
+ src/ir/re/nullable.cc \
+ src/ir/re/split_charset.cc \
src/ir/compile.cc \
src/ir/rule.cc \
src/ir/skeleton/control_flow.cc \
src/ir/tcmd.cc \
src/main.cc \
src/parse/input.cc \
+ src/parse/regexp.cc \
src/parse/scanner.cc \
src/parse/unescape.cc \
src/util/s_to_n32_unsafe.cc \
-/* Generated by re2c 0.16 on Fri Mar 3 02:03:51 2017 */
+/* Generated by re2c 0.16 on Sat Mar 4 18:55:56 2017 */
#line 1 "../src/conf/parse_opts.re"
#include "src/codegen/input_api.h"
#include "src/conf/msg.h"
#include "src/conf/opt.h"
-#include "src/ir/regexp/empty_class_policy.h"
-#include "src/ir/regexp/encoding/enc.h"
+#include "src/ir/re/empty_class_policy.h"
+#include "src/ir/re/encoding/enc.h"
namespace re2c
{
-/* Generated by re2c 0.16 on Sat Mar 4 16:56:34 2017 */
+/* Generated by re2c 0.16 on Sat Mar 4 19:02:13 2017 */
#line 1 "../src/parse/lex.re"
#include "src/util/c99_stdint.h"
#include <stddef.h>
#include <string>
#include "src/codegen/output.h"
-#include "src/ir/regexp/encoding/enc.h"
-#include "src/ir/regexp/regexp.h"
+#include "src/ir/re/encoding/enc.h"
#include "src/parse/extop.h"
#include "src/parse/input.h"
+#include "src/parse/regexp.h"
#include "src/parse/scanner.h"
#include "src/parse/parser.h" // needed by "y.tab.h"
#include "src/parse/unescape.h"
++YYCURSOR;
#line 359 "../src/parse/lex.re"
{
- yylval.regexp = RegExp::make_dot(cline, get_column(), opts, warn);
+ yylval.regexp = RegExp::make_dot(cline, get_column());
return TOKEN_REGEXP;
}
#line 1058 "src/parse/lex.cc"
c = static_cast<uint8_t>(*s),
column = static_cast<uint32_t>(s - pos);
r = RegExp::make_cat(r, casing
- ? RegExp::make_ichar(cline, column, c, opts)
- : RegExp::make_schar(cline, column, c, opts));
+ ? RegExp::make_ichar(cline, column, c)
+ : RegExp::make_schar(cline, column, c));
}
yylval.regexp = r ? r : RegExp::make_nil(cline, get_column());
return TOKEN_REGEXP;
if (neg) {
r = Range::sub(opts->encoding.fullRange(), r);
}
- return RegExp::make_class(cline, column, r, opts, warn);
+ return RegExp::make_class(cline, column, r);
}
uint32_t Scanner::lex_cls_chr()
return r ? r : RegExp::make_nil(cline, get_column());
}
r = RegExp::make_cat(r, casing
- ? RegExp::make_ichar(cline, get_column(), c, opts)
- : RegExp::make_schar(cline, get_column(), c, opts));
+ ? RegExp::make_ichar(cline, get_column(), c)
+ : RegExp::make_schar(cline, get_column(), c));
}
}
-/* Generated by re2c 0.16 on Sat Mar 4 15:12:07 2017 */
+/* Generated by re2c 0.16 on Sat Mar 4 18:55:19 2017 */
#line 1 "../src/parse/lex_conf.re"
#include "src/util/c99_stdint.h"
#include <string>
#include "src/codegen/output.h"
-#include "src/ir/regexp/encoding/enc.h"
+#include "src/ir/re/encoding/enc.h"
#include "src/parse/scanner.h"
#include "src/util/s_to_n32_unsafe.h"
#include "src/codegen/output.h"
#include "src/ir/compile.h"
#include "src/ir/adfa/adfa.h"
-#include "src/ir/regexp/encoding/enc.h"
-#include "src/ir/regexp/encoding/range_suffix.h"
-#include "src/ir/regexp/regexp.h"
+#include "src/ir/re/encoding/enc.h"
+#include "src/ir/re/encoding/range_suffix.h"
#include "src/ir/skeleton/skeleton.h"
#include "src/parse/extop.h"
#include "src/parse/parser.h"
+#include "src/parse/regexp.h"
#include "src/parse/scanner.h"
#include "src/util/free_list.h"
#include "src/util/range.h"
}
}
-static void prepare(specs_t &specs, const Scanner &in)
+static void prepare(specs_t &specs)
{
specs_t::iterator i, b = specs.begin(), e = specs.end();
for (i = b; i != e; ++i) {
if (!i->defs.empty()) {
const Code *c = i->defs[0];
- const RegExp *r = RegExp::make_default(c->fline, 0, in.opts);
+ const RegExp *r = RegExp::make_default(c->fline, 0);
i->rules.push_back(RegExpRule(r, c));
}
}
case 31:
{
- (yyval.regexp) = RegExp::make_diff((yyvsp[-2].regexp), (yyvsp[0].regexp), context.input.opts, context.input.warn);
+ (yyval.regexp) = RegExp::make_diff((yyvsp[-2].regexp), (yyvsp[0].regexp));
}
break;
// compile regular expressions to automata
if (mode != Scanner::Reuse) {
check(specs, opts->cFlag);
- prepare(specs, input);
+ prepare(specs);
o.source.block().line = input.get_cline();
for (specs_t::const_iterator i = specs.begin(); i != specs.end(); ++i) {
dfas.push_back(compile(*i, o));
#include "src/codegen/output.h"
#include "src/ir/adfa/action.h"
#include "src/ir/adfa/adfa.h"
-#include "src/ir/regexp/regexp.h"
#include "src/ir/skeleton/skeleton.h"
+#include "src/parse/regexp.h"
#include "src/util/string_utils.h"
namespace re2c
#include "src/codegen/output.h"
#include "src/codegen/print.h"
#include "src/ir/adfa/adfa.h"
-#include "src/ir/regexp/encoding/enc.h"
+#include "src/ir/re/encoding/enc.h"
namespace re2c
{
#include "src/codegen/input_api.h"
#include "src/conf/warn.h"
#include "src/ir/dfa/dfa.h"
-#include "src/ir/regexp/empty_class_policy.h"
-#include "src/ir/regexp/encoding/enc.h"
+#include "src/ir/re/empty_class_policy.h"
+#include "src/ir/re/encoding/enc.h"
#include "src/util/forbid_copy.h"
namespace re2c
#include "src/codegen/input_api.h"
#include "src/conf/msg.h"
#include "src/conf/opt.h"
-#include "src/ir/regexp/empty_class_policy.h"
-#include "src/ir/regexp/encoding/enc.h"
+#include "src/ir/re/empty_class_policy.h"
+#include "src/ir/re/encoding/enc.h"
namespace re2c
{
#include "src/codegen/go.h"
#include "src/codegen/label.h"
#include "src/ir/adfa/action.h"
-#include "src/ir/regexp/regexp.h"
#include "src/ir/rule.h"
#include "src/ir/tag.h"
#include "src/util/forbid_copy.h"
#include "src/ir/dfa/dfa.h"
#include "src/ir/dfa/dump.h"
#include "src/ir/nfa/nfa.h"
-#include "src/ir/regexp/regexp.h"
#include "src/ir/skeleton/skeleton.h"
+#include "src/parse/regexp.h"
namespace re2c {
name = make_name(cond, line),
&setup = spec.setup.empty() ? "" : spec.setup[0]->text;
- RESpec re(rules);
- split_charset(re, opts);
- find_fixed_tags(re, opts);
+ RESpec re(rules, opts, warn);
+ split_charset(re);
+ find_fixed_tags(re);
insert_default_tags(re);
- warn_nullable(re, cond, warn);
+ warn_nullable(re, cond);
nfa_t nfa(re);
if (opts->dump_nfa) dump_nfa(nfa);
#include "src/ir/dfa/dump.h"
#include "src/ir/dfa/find_state.h"
#include "src/ir/nfa/nfa.h"
-#include "src/ir/regexp/regexp.h"
+#include "src/parse/regexp.h"
#include "src/util/range.h"
namespace re2c
#include <set>
#include "src/conf/warn.h"
-#include "src/ir/regexp/regexp.h"
#include "src/ir/rule.h"
#include "src/ir/tag.h"
#include "src/ir/tcmd.h"
#include "src/ir/dfa/tagpool.h"
+#include "src/parse/regexp.h"
#include "src/util/forbid_copy.h"
namespace re2c
#include "src/codegen/input_api.h"
#include "src/ir/re/re.h"
-#include "src/ir/regexp/regexp.h"
#include "src/ir/rule.h"
#include "src/ir/tag.h"
#include "src/util/forbid_copy.h"
#include "src/conf/msg.h"
#include "src/ir/re/re.h"
+#include "src/ir/re/empty_class_policy.h"
+#include "src/ir/re/encoding/case.h"
+#include "src/ir/re/encoding/enc.h"
+#include "src/ir/re/encoding/utf16/utf16_regexp.h"
+#include "src/ir/re/encoding/utf8/utf8_regexp.h"
namespace re2c {
+/* note [default regexp]
+ *
+ * Create a byte range that includes all possible input characters.
+ * This may include characters that do not map to any valid symbol
+ * in the current encoding. For encodings that map symbols directly to
+ * input characters (ASCII, EBCDIC, UTF-32), it equals [^]. For other
+ * encodings (UTF-16, UTF-8), [^] and this range are different.
+ *
+ * Also note that the default range doesn't respect the encoding policy
+ * (the way invalid code points are treated).
+ */
+
static RE *ast_to_re(RESpec &spec, const RegExp *ast, size_t &ncap)
{
RE::alc_t &alc = spec.alc;
std::vector<Tag> &tags = spec.tags;
+ const opt_t *opts = spec.opts;
+ Warn &warn = spec.warn;
switch (ast->type) {
default: assert(false);
case RegExp::NIL:
return re_nil(alc);
- case RegExp::SYM:
- return re_sym(alc, ast->sym);
case RegExp::ALT: {
RE *x = ast_to_re(spec, ast->alt.re1, ncap);
RE *y = ast_to_re(spec, ast->alt.re2, ncap);
}
return y;
}
+ case RegExp::SCHAR:
+ return re_schar(alc, ast->line, ast->column, ast->schar, opts);
+ case RegExp::ICHAR:
+ return re_ichar(alc, ast->line, ast->column, ast->ichar, opts);
+ case RegExp::CLASS:
+ return re_class(alc, ast->line, ast->column, ast->cls, opts, warn);
+ case RegExp::DIFF: {
+ RE *x = ast_to_re(spec, ast->diff.re1, ncap);
+ RE *y = ast_to_re(spec, ast->diff.re2, ncap);
+ if (x->type != RE::SYM || y->type != RE::SYM) {
+ fatal_error(ast->line, ast->column, "can only difference char sets");
+ }
+ return re_class(alc, ast->line, ast->column, Range::sub(x->sym, y->sym), opts, warn);
+ }
+ case RegExp::DOT: {
+ uint32_t c = '\n';
+ if (!opts->encoding.encode(c)) {
+ fatal_error(ast->line, ast->column, "bad code point: '0x%X'", c);
+ }
+ return re_class(alc, ast->line, ast->column,
+ Range::sub(opts->encoding.fullRange(), Range::sym(c)), opts, warn);
+ }
+ case RegExp::DEFAULT:
+ // see note [default regexp]
+ return re_sym(alc, Range::ran(0, opts->encoding.nCodeUnits()));
+ }
+}
+
+/*
+ * Build the RE for a single case-sensitive character.
+ *
+ * 'c' is first handed to opts->encoding.encode(); if that call reports
+ * failure, fatal_error() is invoked with the source position (line, column).
+ * NOTE(review): encode() presumably takes 'c' by reference and may remap
+ * it to the target encoding -- confirm against Enc.
+ *
+ * The resulting RE depends on the encoding type: UTF16Symbol() for UTF-16,
+ * UTF8Symbol() for UTF-8, and a plain one-symbol range for anything else.
+ */
+RE *re_schar(RE::alc_t &alc, uint32_t line, uint32_t column, uint32_t c, const opt_t *opts)
+{
+    const bool encoded = opts->encoding.encode(c);
+    if (!encoded) {
+        fatal_error(line, column, "bad code point: '0x%X'", c);
+    }
+
+    if (opts->encoding.type() == Enc::UTF16) {
+        return UTF16Symbol(alc, c);
+    }
+    if (opts->encoding.type() == Enc::UTF8) {
+        return UTF8Symbol(alc, c);
+    }
+    return re_sym(alc, Range::sym(c));
+}
+
+/*
+ * Build the RE for a single case-insensitive character.
+ *
+ * If is_alpha(c) holds, the result is the alternative of the lower-case
+ * and the upper-case variants of 'c', each built with re_schar().
+ * Otherwise the character has no case and is built exactly like a
+ * case-sensitive one.
+ */
+RE *re_ichar(RE::alc_t &alc, uint32_t line, uint32_t column, uint32_t c, const opt_t *opts)
+{
+    if (!is_alpha(c)) {
+        return re_schar(alc, line, column, c, opts);
+    }
+
+    RE *lower = re_schar(alc, line, column, to_lower_unsafe(c), opts);
+    RE *upper = re_schar(alc, line, column, to_upper_unsafe(c), opts);
+    return re_alt(alc, lower, upper);
+}
+
+/*
+ * Build the RE for a character class given as range 'r'.
+ *
+ * A null 'r' denotes an empty class and is handled according to
+ * opts->empty_class_policy:
+ *   - EMPTY_CLASS_MATCH_EMPTY: warn, then return the nil RE;
+ *   - EMPTY_CLASS_MATCH_NONE:  warn, then continue with the null range;
+ *   - EMPTY_CLASS_ERROR:       report a fatal error at (line, column).
+ *
+ * Otherwise (and after the non-returning cases above) the range is
+ * converted according to the encoding type: UTF16Range() for UTF-16,
+ * UTF8Range() for UTF-8, and a plain symbol range for anything else.
+ */
+RE *re_class(RE::alc_t &alc, uint32_t line, uint32_t column, const Range *r, const opt_t *opts, Warn &warn)
+{
+    const bool empty = !r;
+    if (empty) {
+        if (opts->empty_class_policy == EMPTY_CLASS_MATCH_EMPTY) {
+            warn.empty_class(line);
+            return re_nil(alc);
+        } else if (opts->empty_class_policy == EMPTY_CLASS_MATCH_NONE) {
+            warn.empty_class(line);
+        } else if (opts->empty_class_policy == EMPTY_CLASS_ERROR) {
+            fatal_error(line, column, "empty character class");
+        }
+    }
+
+    switch (opts->encoding.type()) {
+    case Enc::UTF8:
+        return UTF8Range(alc, r);
+    case Enc::UTF16:
+        return UTF16Range(alc, r);
+    default:
+        return re_sym(alc, r);
}
}
assert_tags_used_once(rule, tags);
}
-RESpec::RESpec(const std::vector<RegExpRule> &ast)
+RESpec::RESpec(const std::vector<RegExpRule> &ast, const opt_t *o, Warn &w)
: alc()
, res()
, charset(*new std::vector<uint32_t>)
, tags(*new std::vector<Tag>)
, rules(*new std::valarray<Rule>(ast.size()))
+ , opts(o)
+ , warn(w)
{
for (size_t i = 0; i < ast.size(); ++i) {
size_t ltag = tags.size(), ncap = 0;
-#ifndef _RE2C_IR_REGEXP_EMPTY_CLASS_POLICY_
-#define _RE2C_IR_REGEXP_EMPTY_CLASS_POLICY_
+#ifndef _RE2C_IR_RE_EMPTY_CLASS_POLICY_
+#define _RE2C_IR_RE_EMPTY_CLASS_POLICY_
namespace re2c {
} // namespace re2c
-#endif // _RE2C_IR_REGEXP_EMPTY_CLASS_POLICY_
+#endif // _RE2C_IR_RE_EMPTY_CLASS_POLICY_
-#ifndef _RE2C_IR_REGEXP_ENCODING_CASE_
-#define _RE2C_IR_REGEXP_ENCODING_CASE_
+#ifndef _RE2C_IR_RE_ENCODING_CASE_
+#define _RE2C_IR_RE_ENCODING_CASE_
#include "src/util/c99_stdint.h"
}
-#endif // _RE2C_IR_REGEXP_ENCODING_CASE_
+#endif // _RE2C_IR_RE_ENCODING_CASE_
-#include "src/ir/regexp/encoding/enc.h"
+#include "src/ir/re/encoding/enc.h"
#include "src/util/range.h"
namespace re2c {
-#ifndef _RE2C_IR_REGEXP_ENCODING_ENC_
-#define _RE2C_IR_REGEXP_ENCODING_ENC_
+#ifndef _RE2C_IR_RE_ENCODING_ENC_
+#define _RE2C_IR_RE_ENCODING_ENC_
#include "src/util/c99_stdint.h"
} // namespace re2c
-#endif // _RE2C_IR_REGEXP_ENCODING_ENC_
+#endif // _RE2C_IR_RE_ENCODING_ENC_
--- /dev/null
+#include "src/ir/re/encoding/range_suffix.h"
+#include "src/util/range.h"
+
+namespace re2c {
+
+static RE *emit(RE::alc_t &alc, RangeSuffix *p, RE *re);
+
+free_list<RangeSuffix *> RangeSuffix::freeList;
+
+/*
+ * Convert a range suffix tree into an RE.
+ * A null tree yields a symbol RE built from a null range; any other tree
+ * is expanded by emit() starting with no accumulated regexp.
+ */
+RE *to_regexp(RE::alc_t &alc, RangeSuffix *p)
+{
+    if (!p) {
+        return re_sym(alc, NULL);
+    }
+    return emit(alc, p, NULL);
+}
+
+/*
+ * Recursively build an RE from a suffix tree.
+ *
+ * 'p' is the head of a sibling list (linked through 'next'); 're' is the
+ * regexp accumulated so far on the way down. For every sibling, a symbol
+ * for the range Range::ran(l, h + 1) is concatenated in front of 're', the
+ * result is pushed down into the sibling's 'child' subtree, and the
+ * per-sibling results are joined with re_alt().
+ * NOTE(review): 'h + 1' suggests Range::ran() takes an exclusive upper
+ * bound -- confirm against Range.
+ *
+ * When 'p' is null there is nothing left to prepend and 're' is returned
+ * unchanged.
+ */
+RE *emit(RE::alc_t &alc, RangeSuffix *p, RE *re)
+{
+    if (!p) {
+        return re;
+    }
+
+    RE *alternatives = NULL;
+    for (RangeSuffix *node = p; node; node = node->next) {
+        RE *unit = re_sym(alc, Range::ran(node->l, node->h + 1));
+        RE *chain = re_cat(alc, unit, re);
+        RE *branch = emit(alc, node->child, chain);
+        alternatives = re_alt(alc, alternatives, branch);
+    }
+    return alternatives;
+}
+
+} // namespace re2c
-#ifndef _RE2C_IR_REGEXP_ENCODING_RANGE_SUFFIX_
-#define _RE2C_IR_REGEXP_ENCODING_RANGE_SUFFIX_
+#ifndef _RE2C_IR_RE_ENCODING_RANGE_SUFFIX_
+#define _RE2C_IR_RE_ENCODING_RANGE_SUFFIX_
#include "src/util/c99_stdint.h"
#include <stddef.h> // NULL
+#include "src/ir/re/re.h"
#include "src/util/forbid_copy.h"
#include "src/util/free_list.h"
namespace re2c {
-struct RegExp;
-
struct RangeSuffix
{
static free_list<RangeSuffix *> freeList;
FORBID_COPY (RangeSuffix);
};
-const RegExp *to_regexp(uint32_t l, uint32_t c, RangeSuffix * p);
+RE *to_regexp(RE::alc_t &alc, RangeSuffix *p);
} // namespace re2c
-#endif // _RE2C_IR_REGEXP_ENCODING_RANGE_SUFFIX_
+#endif // _RE2C_IR_RE_ENCODING_RANGE_SUFFIX_
-#include "src/ir/regexp/encoding/utf16/utf16.h"
+#include "src/ir/re/encoding/utf16/utf16.h"
namespace re2c {
-#ifndef _RE2C_IR_REGEXP_ENCODING_UTF16_UTF16_
-#define _RE2C_IR_REGEXP_ENCODING_UTF16_UTF16_
+#ifndef _RE2C_IR_RE_ENCODING_UTF16_UTF16_
+#define _RE2C_IR_RE_ENCODING_UTF16_UTF16_
#include "src/util/c99_stdint.h"
} // namespace re2c
-#endif // _RE2C_IR_REGEXP_ENCODING_UTF16_UTF16_
+#endif // _RE2C_IR_RE_ENCODING_UTF16_UTF16_
-#include "src/ir/regexp/encoding/utf16/utf16_range.h"
-#include "src/ir/regexp/encoding/range_suffix.h"
+#include "src/ir/re/encoding/utf16/utf16_range.h"
+#include "src/ir/re/encoding/range_suffix.h"
namespace re2c {
-#ifndef _RE2C_IR_REGEXP_ENCODING_UTF16_RANGE_
-#define _RE2C_IR_REGEXP_ENCODING_UTF16_RANGE_
+#ifndef _RE2C_IR_RE_ENCODING_UTF16_RANGE_
+#define _RE2C_IR_RE_ENCODING_UTF16_RANGE_
#include "src/util/c99_stdint.h"
-#include "src/ir/regexp/encoding/utf16/utf16.h"
+#include "src/ir/re/encoding/utf16/utf16.h"
namespace re2c {
} // namespace re2c
-#endif // _RE2C_IR_REGEXP_ENCODING_UTF16_RANGE_
+#endif // _RE2C_IR_RE_ENCODING_UTF16_RANGE_
#include "src/util/c99_stdint.h"
-#include "src/ir/regexp/encoding/utf16/utf16_regexp.h"
-#include "src/ir/regexp/encoding/range_suffix.h"
-#include "src/ir/regexp/encoding/utf16/utf16_range.h"
-#include "src/ir/regexp/regexp.h"
+#include "src/ir/re/encoding/utf16/utf16_regexp.h"
+#include "src/ir/re/encoding/range_suffix.h"
+#include "src/ir/re/encoding/utf16/utf16_range.h"
#include "src/util/range.h"
namespace re2c {
-const RegExp *UTF16Symbol(uint32_t l, uint32_t c, utf16::rune r)
+RE *UTF16Symbol(RE::alc_t &alc, utf16::rune r)
{
if (r <= utf16::MAX_1WORD_RUNE) {
- return RegExp::make_sym(l, c, Range::sym(r));
+ return re_sym(alc, Range::sym(r));
} else {
const uint32_t ld = utf16::lead_surr(r);
const uint32_t tr = utf16::trail_surr(r);
- return RegExp::make_cat(RegExp::make_sym(l, c, Range::sym(ld)),
- RegExp::make_sym(l, c, Range::sym(tr)));
+ return re_cat(alc,
+ re_sym(alc, Range::sym(ld)),
+ re_sym(alc, Range::sym(tr)));
}
}
* them. We store partially built range in suffix tree, which
* allows to eliminate common suffixes while building.
*/
-const RegExp *UTF16Range(uint32_t l, uint32_t c, const Range * r)
+RE *UTF16Range(RE::alc_t &alc, const Range *r)
{
RangeSuffix * root = NULL;
for (; r != NULL; r = r->next ())
UTF16splitByRuneLength(root, r->lower (), r->upper () - 1);
- return to_regexp(l, c, root);
+ return to_regexp(alc, root);
}
} // namespace re2c
--- /dev/null
+#ifndef _RE2C_IR_RE_ENCODING_UTF16_REGEXP_
+#define _RE2C_IR_RE_ENCODING_UTF16_REGEXP_
+
+#include "src/ir/re/re.h"
+#include "src/ir/re/encoding/utf16/utf16.h"
+
+namespace re2c {
+
+class Range;
+
+RE *UTF16Symbol(RE::alc_t &alc, utf16::rune r);
+RE *UTF16Range(RE::alc_t &alc, const Range *r);
+
+} // namespace re2c
+
+#endif // _RE2C_IR_RE_ENCODING_UTF16_REGEXP_
-#include "src/ir/regexp/encoding/utf8/utf8.h"
+#include "src/ir/re/encoding/utf8/utf8.h"
namespace re2c {
-#ifndef _RE2C_IR_REGEXP_ENCODING_UTF8_UTF8_
-#define _RE2C_IR_REGEXP_ENCODING_UTF8_UTF8_
+#ifndef _RE2C_IR_RE_ENCODING_UTF8_UTF8_
+#define _RE2C_IR_RE_ENCODING_UTF8_UTF8_
#include "src/util/c99_stdint.h"
} // namespace re2c
-#endif // _RE2C_IR_REGEXP_ENCODING_UTF8_UTF8_
+#endif // _RE2C_IR_RE_ENCODING_UTF8_UTF8_
-#include "src/ir/regexp/encoding/utf8/utf8_range.h"
-#include "src/ir/regexp/encoding/range_suffix.h"
+#include "src/ir/re/encoding/utf8/utf8_range.h"
+#include "src/ir/re/encoding/range_suffix.h"
namespace re2c {
-#ifndef _RE2C_IR_REGEXP_ENCODING_UTF8_RANGE_
-#define _RE2C_IR_REGEXP_ENCODING_UTF8_RANGE_
+#ifndef _RE2C_IR_RE_ENCODING_UTF8_RANGE_
+#define _RE2C_IR_RE_ENCODING_UTF8_RANGE_
#include "src/util/c99_stdint.h"
-#include "src/ir/regexp/encoding/utf8/utf8.h"
+#include "src/ir/re/encoding/utf8/utf8.h"
namespace re2c {
} // namespace re2c
-#endif // _RE2C_IR_REGEXP_ENCODING_UTF8_RANGE_
+#endif // _RE2C_IR_RE_ENCODING_UTF8_RANGE_
#include "src/util/c99_stdint.h"
-#include "src/ir/regexp/encoding/utf8/utf8_regexp.h"
-#include "src/ir/regexp/encoding/range_suffix.h"
-#include "src/ir/regexp/encoding/utf8/utf8_range.h"
-#include "src/ir/regexp/regexp.h"
+#include "src/ir/re/encoding/utf8/utf8_regexp.h"
+#include "src/ir/re/encoding/range_suffix.h"
+#include "src/ir/re/encoding/utf8/utf8_range.h"
#include "src/util/range.h"
namespace re2c {
-const RegExp * UTF8Symbol(uint32_t l, uint32_t c, utf8::rune r)
+RE *UTF8Symbol(RE::alc_t &alc, utf8::rune r)
{
uint32_t chars[utf8::MAX_RUNE_LENGTH];
const uint32_t chars_count = utf8::rune_to_bytes(chars, r);
- const RegExp *re = RegExp::make_sym(l, c, Range::sym(chars[0]));
+ RE *re = re_sym(alc, Range::sym(chars[0]));
for (uint32_t i = 1; i < chars_count; ++i) {
- re = RegExp::make_cat(re, RegExp::make_sym(l, c, Range::sym(chars[i])));
+ re = re_cat(alc, re, re_sym(alc, Range::sym(chars[i])));
}
return re;
}
* them. We store partially built range in suffix tree, which
* allows to eliminate common suffixes while building.
*/
-const RegExp * UTF8Range(uint32_t l, uint32_t c, const Range * r)
+RE *UTF8Range(RE::alc_t &alc, const Range *r)
{
RangeSuffix * root = NULL;
for (; r != NULL; r = r->next ())
UTF8splitByRuneLength(root, r->lower (), r->upper () - 1);
- return to_regexp (l, c, root);
+ return to_regexp(alc, root);
}
} // namespace re2c
--- /dev/null
+#ifndef _RE2C_IR_RE_ENCODING_UTF8_REGEXP_
+#define _RE2C_IR_RE_ENCODING_UTF8_REGEXP_
+
+#include "src/ir/re/re.h"
+#include "src/ir/re/encoding/utf8/utf8.h"
+
+namespace re2c {
+
+class Range;
+
+RE *UTF8Symbol(RE::alc_t &alc, utf8::rune r);
+RE *UTF8Range(RE::alc_t &alc, const Range *r);
+
+} // namespace re2c
+
+#endif // _RE2C_IR_RE_ENCODING_UTF8_REGEXP_
}
}
-void find_fixed_tags(RESpec &spec, const opt_t *opts)
+void find_fixed_tags(RESpec &spec)
{
- const bool generic = opts->input_api == INPUT_CUSTOM;
+ const bool generic = spec.opts->input_api == INPUT_CUSTOM;
std::vector<RE*>::iterator
i = spec.res.begin(),
e = spec.res.end();
* (including rules with nonempty trailing context)
* false positives on partially self-shadowed rules like [^]?
*/
-void warn_nullable(const RESpec &spec, const std::string &cond, Warn &warn)
+void warn_nullable(const RESpec &spec, const std::string &cond)
{
const size_t nre = spec.res.size();
for (size_t i = 0; i < nre; ++i) {
bool trail = false;
if (nullable(spec, spec.res[i], trail)) {
- warn.match_empty_string(spec.rules[i].code->fline, cond);
+ spec.warn.match_empty_string(spec.rules[i].code->fline, cond);
}
}
}
#include "src/conf/opt.h"
#include "src/ir/rule.h"
-#include "src/ir/regexp/regexp.h"
+#include "src/parse/regexp.h"
+#include "src/util/forbid_copy.h"
#include "src/util/range.h"
#include "src/util/slab_allocator.h"
std::vector<uint32_t> &charset;
std::vector<Tag> &tags;
std::valarray<Rule> &rules;
+ const opt_t *opts;
+ Warn &warn;
- explicit RESpec(const std::vector<RegExpRule> &ast);
+ explicit RESpec(const std::vector<RegExpRule> &ast, const opt_t *o, Warn &w);
+ FORBID_COPY(RESpec);
};
-void split_charset(RESpec &spec, const opt_t *opts);
-void find_fixed_tags(RESpec &spec, const opt_t *opts);
+void split_charset(RESpec &spec);
+void find_fixed_tags(RESpec &spec);
void insert_default_tags(RESpec &spec);
-void warn_nullable(const RESpec &spec, const std::string &cond, Warn &warn);
+void warn_nullable(const RESpec &spec, const std::string &cond);
inline RE *re_nil(RE::alc_t &alc)
{
{
if (!x) return y;
if (!y) return x;
+ if (x->type == RE::SYM && y->type == RE::SYM) {
+ return re_sym(alc, Range::add(x->sym, y->sym));
+ }
RE *z = alc.alloct<RE>(1);
z->type = RE::ALT;
return x;
}
+RE *re_schar(RE::alc_t &alc, uint32_t line, uint32_t column, uint32_t c, const opt_t *opts);
+RE *re_ichar(RE::alc_t &alc, uint32_t line, uint32_t column, uint32_t c, const opt_t *opts);
+RE *re_class(RE::alc_t &alc, uint32_t line, uint32_t column, const Range *r, const opt_t *opts, Warn &warn);
+
} // namespace re2c
#endif // _RE2C_IR_RE_RE_
* Don't forget to include zero and upper bound, even if they
* do not explicitly appear in ranges.
*/
-void split_charset(RESpec &spec, const opt_t *opts)
+void split_charset(RESpec &spec)
{
std::set<uint32_t> cs;
split(*i, cs);
}
cs.insert(0);
- cs.insert(opts->encoding.nCodeUnits());
+ cs.insert(spec.opts->encoding.nCodeUnits());
spec.charset.insert(spec.charset.end(), cs.begin(), cs.end());
}
+++ /dev/null
-#include "src/ir/regexp/encoding/range_suffix.h"
-#include "src/ir/regexp/regexp.h"
-#include "src/util/range.h"
-
-namespace re2c {
-
-static const RegExp *emit(uint32_t l, uint32_t c, RangeSuffix * p, const RegExp * re);
-
-free_list<RangeSuffix *> RangeSuffix::freeList;
-
-const RegExp *to_regexp(uint32_t l, uint32_t c, RangeSuffix * p)
-{
- return p ? emit(l, c, p, NULL)
- : RegExp::make_sym(l, c, NULL);
-}
-
-/*
- * Build regexp from suffix tree.
- */
-const RegExp *emit(uint32_t l, uint32_t c, RangeSuffix * p, const RegExp * re)
-{
- if (p == NULL) {
- return re;
- } else {
- const RegExp *regexp = NULL;
- for (; p != NULL; p = p->next) {
- const RegExp *re1 = RegExp::make_cat(
- RegExp::make_sym(l, c, Range::ran(p->l, p->h + 1)), re);
- regexp = RegExp::make_alt(regexp, emit(l, c, p->child, re1));
- }
- return regexp;
- }
-}
-
-} // namespace re2c
+++ /dev/null
-#ifndef _RE2C_IR_REGEXP_ENCODING_UTF16_REGEXP_
-#define _RE2C_IR_REGEXP_ENCODING_UTF16_REGEXP_
-
-#include "src/ir/regexp/encoding/utf16/utf16.h"
-
-namespace re2c {
-
-class Range;
-struct RegExp;
-
-const RegExp * UTF16Symbol(uint32_t l, uint32_t c, utf16::rune r);
-const RegExp * UTF16Range(uint32_t l, uint32_t c, const Range * r);
-
-} // namespace re2c
-
-#endif // _RE2C_IR_REGEXP_ENCODING_UTF16_REGEXP_
+++ /dev/null
-#ifndef _RE2C_IR_REGEXP_ENCODING_UTF8_REGEXP_
-#define _RE2C_IR_REGEXP_ENCODING_UTF8_REGEXP_
-
-#include "src/ir/regexp/encoding/utf8/utf8.h"
-
-namespace re2c {
-
-class Range;
-struct RegExp;
-
-const RegExp * UTF8Symbol(uint32_t l, uint32_t c, utf8::rune r);
-const RegExp * UTF8Range(uint32_t l, uint32_t c, const Range * r);
-
-} // namespace re2c
-
-#endif // _RE2C_IR_REGEXP_ENCODING_UTF8_REGEXP_
+++ /dev/null
-#include <limits>
-#include <stddef.h>
-
-#include "src/conf/msg.h"
-#include "src/conf/opt.h"
-#include "src/ir/regexp/empty_class_policy.h"
-#include "src/ir/regexp/encoding/case.h"
-#include "src/ir/regexp/encoding/enc.h"
-#include "src/ir/regexp/encoding/utf16/utf16_regexp.h"
-#include "src/ir/regexp/encoding/utf8/utf8_regexp.h"
-#include "src/ir/regexp/regexp.h"
-#include "src/parse/scanner.h"
-#include "src/util/range.h"
-
-namespace re2c
-{
-
-free_list<RegExp*> RegExp::flist;
-
-const uint32_t RegExp::MANY = std::numeric_limits<uint32_t>::max();
-
-const RegExp *RegExp::make_schar(uint32_t line, uint32_t column, uint32_t c, Opt &opts)
-{
- if (!opts->encoding.encode(c)) {
- fatal_error(line, column, "bad code point: '0x%X'", c);
- }
- switch (opts->encoding.type ()) {
- case Enc::UTF16: return UTF16Symbol(line, column, c);
- case Enc::UTF8: return UTF8Symbol(line, column, c);
- default: return RegExp::make_sym(line, column, Range::sym(c));
- }
-}
-
-const RegExp *RegExp::make_ichar(uint32_t line, uint32_t column, uint32_t c, Opt &opts)
-{
- if (is_alpha(c)) {
- const RegExp *l = RegExp::make_schar(line, column, to_lower_unsafe(c), opts);
- const RegExp *u = RegExp::make_schar(line, column, to_upper_unsafe(c), opts);
- return RegExp::make_alt(l, u);
- } else {
- return RegExp::make_schar(line, column, c, opts);
- }
-}
-
-const RegExp *RegExp::make_class(uint32_t line, uint32_t column, const Range *r, Opt &opts, Warn &warn)
-{
- if (!r) {
- switch (opts->empty_class_policy) {
- case EMPTY_CLASS_MATCH_EMPTY:
- warn.empty_class(line);
- return RegExp::make_nil(line, column);
- case EMPTY_CLASS_MATCH_NONE:
- warn.empty_class(line);
- break;
- case EMPTY_CLASS_ERROR:
- fatal_error(line, column, "empty character class");
- break;
- }
- }
-
- switch (opts->encoding.type()) {
- case Enc::UTF16: return UTF16Range(line, column, r);
- case Enc::UTF8: return UTF8Range(line, column, r);
- default: return RegExp::make_sym(line, column, r);
- }
-}
-
-const RegExp *RegExp::make_diff(const RegExp *re1, const RegExp *re2, Opt &opts, Warn &warn)
-{
- if (re1 && re2
- && re1->type == RegExp::SYM
- && re2->type == RegExp::SYM) {
- return RegExp::make_class(re1->line, re1->column,
- Range::sub(re1->sym, re2->sym), opts, warn);
- }
- fatal_error(re1->line, re1->column, "can only difference char sets");
- return NULL;
-}
-
-const RegExp *RegExp::make_dot(uint32_t line, uint32_t column, Opt &opts, Warn &warn)
-{
- uint32_t c = '\n';
- if (!opts->encoding.encode(c)) {
- fatal_error(line, column, "bad code point: '0x%X'", c);
- }
- return RegExp::make_class(line, column,
- Range::sub(opts->encoding.fullRange(), Range::sym(c)), opts, warn);
-}
-
-/*
- * Create a byte range that includes all possible input characters.
- * This may include characters, which do not map to any valid symbol
- * in current encoding. For encodings, which directly map symbols to
- * input characters (ASCII, EBCDIC, UTF-32), it equals [^]. For other
- * encodings (UTF-16, UTF-8), [^] and this range are different.
- *
- * Also note that default range doesn't respect encoding policy
- * (the way invalid code points are treated).
- */
-const RegExp *RegExp::make_default(uint32_t line, uint32_t column, Opt &opts)
-{
- return RegExp::make_sym(line, column, Range::ran(0,
- opts->encoding.nCodeUnits()));
-}
-
-bool RegExp::need_wrap(const RegExp *re)
-{
- switch (re->type) {
- case RegExp::ITER:
- case RegExp::NIL:
- case RegExp::SYM:
- case RegExp::TAG:
- case RegExp::CAP:
- return false;
- case RegExp::ALT:
- case RegExp::CAT:
- case RegExp::REF:
- default:
- return true;
- }
-}
-
-} // namespace re2c
#include "src/codegen/bitmap.h"
#include "src/codegen/emit.h"
#include "src/codegen/output.h"
-#include "src/ir/regexp/encoding/enc.h"
+#include "src/ir/re/encoding/enc.h"
#include "src/ir/adfa/adfa.h"
#include "src/ir/skeleton/skeleton.h"
#include <vector>
#include "src/conf/msg.h"
-#include "src/ir/regexp/encoding/enc.h"
+#include "src/ir/re/encoding/enc.h"
#include "src/ir/skeleton/path.h"
#include "src/ir/skeleton/skeleton.h"
#include "src/util/u32lim.h"
#include "src/codegen/bitmap.h"
#include "src/conf/opt.h"
-#include "src/ir/regexp/regexp.h"
#include "src/ir/rule.h"
#include "src/ir/tcmd.h"
#include "src/util/local_increment.h"
#include <string>
#include "src/codegen/output.h"
-#include "src/ir/regexp/encoding/enc.h"
-#include "src/ir/regexp/regexp.h"
+#include "src/ir/re/encoding/enc.h"
#include "src/parse/extop.h"
#include "src/parse/input.h"
+#include "src/parse/regexp.h"
#include "src/parse/scanner.h"
#include "src/parse/parser.h" // needed by "y.tab.h"
#include "src/parse/unescape.h"
c = static_cast<uint8_t>(*s),
column = static_cast<uint32_t>(s - pos);
r = RegExp::make_cat(r, casing
- ? RegExp::make_ichar(cline, column, c, opts)
- : RegExp::make_schar(cline, column, c, opts));
+ ? RegExp::make_ichar(cline, column, c)
+ : RegExp::make_schar(cline, column, c));
}
yylval.regexp = r ? r : RegExp::make_nil(cline, get_column());
return TOKEN_REGEXP;
}
"." {
- yylval.regexp = RegExp::make_dot(cline, get_column(), opts, warn);
+ yylval.regexp = RegExp::make_dot(cline, get_column());
return TOKEN_REGEXP;
}
if (neg) {
r = Range::sub(opts->encoding.fullRange(), r);
}
- return RegExp::make_class(cline, column, r, opts, warn);
+ return RegExp::make_class(cline, column, r);
}
uint32_t Scanner::lex_cls_chr()
return r ? r : RegExp::make_nil(cline, get_column());
}
r = RegExp::make_cat(r, casing
- ? RegExp::make_ichar(cline, get_column(), c, opts)
- : RegExp::make_schar(cline, get_column(), c, opts));
+ ? RegExp::make_ichar(cline, get_column(), c)
+ : RegExp::make_schar(cline, get_column(), c));
}
}
#include <string>
#include "src/codegen/output.h"
-#include "src/ir/regexp/encoding/enc.h"
+#include "src/ir/re/encoding/enc.h"
#include "src/parse/scanner.h"
#include "src/util/s_to_n32_unsafe.h"
#include <string>
#include "src/codegen/output.h"
-#include "src/ir/regexp/regexp.h"
+#include "src/parse/regexp.h"
#include "src/parse/scanner.h"
#include "src/util/smart_ptr.h"
#include "src/codegen/output.h"
#include "src/ir/compile.h"
#include "src/ir/adfa/adfa.h"
-#include "src/ir/regexp/encoding/enc.h"
-#include "src/ir/regexp/encoding/range_suffix.h"
-#include "src/ir/regexp/regexp.h"
+#include "src/ir/re/encoding/enc.h"
+#include "src/ir/re/encoding/range_suffix.h"
#include "src/ir/skeleton/skeleton.h"
#include "src/parse/extop.h"
#include "src/parse/parser.h"
+#include "src/parse/regexp.h"
#include "src/parse/scanner.h"
#include "src/util/free_list.h"
#include "src/util/range.h"
}
}
-static void prepare(specs_t &specs, const Scanner &in)
+static void prepare(specs_t &specs)
{
specs_t::iterator i, b = specs.begin(), e = specs.end();
for (i = b; i != e; ++i) {
if (!i->defs.empty()) {
const Code *c = i->defs[0];
- const RegExp *r = RegExp::make_default(c->fline, 0, in.opts);
+ const RegExp *r = RegExp::make_default(c->fline, 0);
i->rules.push_back(RegExpRule(r, c));
}
}
}
| diff '\\' term
{
- $$ = RegExp::make_diff($1, $3, context.input.opts, context.input.warn);
+ $$ = RegExp::make_diff($1, $3);
}
;
// compile regular expressions to automata
if (mode != Scanner::Reuse) {
check(specs, opts->cFlag);
- prepare(specs, input);
+ prepare(specs);
o.source.block().line = input.get_cline();
for (specs_t::const_iterator i = specs.begin(); i != specs.end(); ++i) {
dfas.push_back(compile(*i, o));
--- /dev/null
+#include <limits>
+
+#include "src/parse/regexp.h"
+
+namespace re2c
+{
+
+free_list<RegExp*> RegExp::flist;
+
+const uint32_t RegExp::MANY = std::numeric_limits<uint32_t>::max();
+
+// Reports whether the node `re` needs wrapping, judged only by its type:
+// ALT, CAT, DIFF and REF nodes yield true, every other node type yields false.
+// NOTE(review): "wrapping" presumably means parenthesising the
+// sub-expression when it is printed -- confirm against the callers.
+bool RegExp::need_wrap(const RegExp *re)
+{
+    // Deliberately no `default:` label, so the compiler can still warn when
+    // a new enumerator is added to type_t and not handled here.
+    switch (re->type) {
+        case RegExp::ITER:
+        case RegExp::NIL:
+        case RegExp::SCHAR:
+        case RegExp::ICHAR:
+        case RegExp::CLASS:
+        case RegExp::DOT:
+        case RegExp::DEFAULT:
+        case RegExp::TAG:
+        case RegExp::CAP:
+            return false;
+        case RegExp::ALT:
+        case RegExp::CAT:
+        case RegExp::DIFF:
+        case RegExp::REF:
+            return true;
+    }
+    // All enumerators return above; this point is reached only if `type`
+    // holds a value outside of type_t.
+    assert(false);
+    // With NDEBUG the assert expands to nothing. Return explicitly so that
+    // control never flows off the end of a value-returning function.
+    return false;
+}
+
+} // namespace re2c
-#ifndef _RE2C_IR_REGEXP_REGEXP_
-#define _RE2C_IR_REGEXP_REGEXP_
+#ifndef _RE2C_PARSE_REGEXP_
+#define _RE2C_PARSE_REGEXP_
#include "src/util/c99_stdint.h"
#include <set>
static free_list<RegExp*> flist;
static const uint32_t MANY;
- enum type_t {NIL, SYM, ALT, CAT, ITER, TAG, CAP, REF} type;
+ enum type_t {NIL, SCHAR, ICHAR, CLASS, DOT, DEFAULT, ALT, CAT, ITER, DIFF, TAG, CAP, REF} type;
union
{
- const Range *sym;
+ uint32_t schar;
+ uint32_t ichar;
+ const Range *cls;
struct
{
const RegExp *re1;
uint32_t min;
uint32_t max;
} iter;
+ struct
+ {
+ const RegExp *re1;
+ const RegExp *re2;
+ } diff;
const std::string *tag;
const RegExp *cap;
struct
{
return new RegExp(l, c, NIL);
}
- static const RegExp *make_sym(uint32_t l, uint32_t c, const Range *r)
+ // Creates an SCHAR node located at line `l`, column `c`, whose payload is
+ // the character `x`.
+ static const RegExp *make_schar(uint32_t l, uint32_t c, uint32_t x)
+ {
+     RegExp *const node = new RegExp(l, c, SCHAR);
+     node->schar = x;
+     return node;
+ }
+ // Creates an ICHAR node located at line `l`, column `c`, whose payload is
+ // the character `x`.
+ static const RegExp *make_ichar(uint32_t l, uint32_t c, uint32_t x)
+ {
+     RegExp *const node = new RegExp(l, c, ICHAR);
+     node->ichar = x;
+     return node;
+ }
+ static const RegExp *make_class(uint32_t l, uint32_t c, const Range *r)
{
- RegExp *re = new RegExp(l, c, SYM);
- re->sym = r;
+ RegExp *re = new RegExp(l, c, CLASS);
+ re->cls = r;
return re;
}
+ // Creates a DOT node located at line `l`, column `c`. No payload is set
+ // here: only the position and the type are recorded.
+ static const RegExp *make_dot(uint32_t l, uint32_t c)
+ {
+     const RegExp *const node = new RegExp(l, c, DOT);
+     return node;
+ }
+ // Creates a DEFAULT node located at line `l`, column `c`. No payload is
+ // set here: only the position and the type are recorded.
+ static const RegExp *make_default(uint32_t l, uint32_t c)
+ {
+     const RegExp *const node = new RegExp(l, c, DEFAULT);
+     return node;
+ }
static const RegExp *make_alt(const RegExp *r1, const RegExp *r2)
{
if (!r1) return r2;
if (!r2) return r1;
- if (r1->type == RegExp::SYM && r2->type == RegExp::SYM) {
- return RegExp::make_sym(r1->line, r1->column,
- Range::add(r1->sym, r2->sym));
- }
RegExp *re = new RegExp(r1->line, r1->column, ALT);
re->alt.re1 = r1;
re->alt.re2 = r2;
re->iter.max = m;
return re;
}
+ // Creates a DIFF node with operands `r1` and `r2`. The node takes its
+ // line and column from `r1`, so `r1` must not be null.
+ static const RegExp *make_diff(const RegExp *r1, const RegExp *r2)
+ {
+     RegExp *re = new RegExp(r1->line, r1->column, DIFF);
+     // Store the operands through the `diff` union member, which is the one
+     // declared for DIFF nodes, rather than through `cat`.
+     re->diff.re1 = r1;
+     re->diff.re2 = r2;
+     return re;
+ }
static const RegExp *make_tag(uint32_t l, uint32_t c, const std::string *t)
{
RegExp *re = new RegExp(l, c, TAG);
delete ref.name;
}
}
- static const RegExp *make_schar(uint32_t line, uint32_t column, uint32_t c, Opt &opts);
- static const RegExp *make_ichar(uint32_t line, uint32_t column, uint32_t c, Opt &opts);
- static const RegExp *make_class(uint32_t line, uint32_t column, const Range *r, Opt &opts, Warn &warn);
- static const RegExp *make_diff(const RegExp * e1, const RegExp * e2, Opt &opts, Warn &warn);
- static const RegExp *make_dot(uint32_t line, uint32_t column, Opt &opts, Warn &warn);
- static const RegExp *make_default(uint32_t line, uint32_t column, Opt &opts);
static bool need_wrap(const RegExp *re);
private:
} // end namespace re2c
-#endif // _RE2C_IR_REGEXP_REGEXP_
+#endif // _RE2C_PARSE_REGEXP_