src/ir/re/ast_to_re.cc \
src/ir/re/default_tags.cc \
src/ir/re/fixed_tags.cc \
+ src/ir/re/nullable.cc \
+ src/ir/re/split_charset.cc \
src/ir/adfa/adfa.cc \
src/ir/adfa/dump.cc \
src/ir/adfa/prepare.cc \
src/ir/regexp/encoding/utf16/utf16_regexp.cc \
src/ir/regexp/encoding/utf16/utf16.cc \
src/ir/regexp/encoding/utf16/utf16_range.cc \
- src/ir/regexp/nullable.cc \
src/ir/regexp/regexp.cc \
- src/ir/regexp/split_charset.cc \
src/ir/compile.cc \
src/ir/rule.cc \
src/ir/skeleton/control_flow.cc \
, const std::vector<size_t> &fill
, size_t def
, size_t key
- , const charset_t &charset
, const std::string &n
, const std::string &c
, uint32_t l
, cond (c)
, line (l)
, lbChar(0)
- , ubChar(charset.back())
+ , ubChar(dfa.charset.back())
, nStates(0)
, head(NULL)
, tags0(*dfa.tcid0)
+ , charset(dfa.charset)
, rules(dfa.rules)
, tags(dfa.tags)
, finvers(dfa.finvers)
delete s;
}
+ delete &charset;
delete &rules;
delete &tags;
delete[] finvers;
uint32_t nStates;
State * head;
const tcid_t tags0;
+ std::vector<uint32_t> &charset;
std::valarray<Rule> &rules;
std::vector<Tag> &tags;
const tagver_t *finvers;
, const std::vector<size_t> &fill
, size_t def
, size_t key
- , const charset_t &charset
, const std::string &n
, const std::string &c
, uint32_t l
const size_t defrule = spec.defs.empty()
? Rule::NONE
: rules.size() - 1;
- const uint32_t
- line = output.source.block().line,
- cunits = opts->encoding.nCodeUnits();
+ const uint32_t line = output.source.block().line;
const std::string
&cond = spec.name,
name = make_name(cond, line),
&setup = spec.setup.empty() ? "" : spec.setup[0]->text;
- warn_nullable(rules, cond, warn);
-
- // The original set of code units (charset) might be very large.
- // A common trick it is to split charset into disjoint character ranges
- // and choose a representative of each range (we choose lower bound).
- // The set of all representatives is the new (compacted) charset.
- // Don't forget to include zero and upper bound, even if they
- // do not explicitely apper in ranges.
- std::set<uint32_t> bounds;
- split(rules, bounds);
- bounds.insert(0);
- bounds.insert(cunits);
- charset_t cs;
- for (std::set<uint32_t>::const_iterator i = bounds.begin(); i != bounds.end(); ++i)
- {
- cs.push_back(*i);
- }
-
RESpec re(rules);
+ split_charset(re, opts);
find_fixed_tags(re, opts);
insert_default_tags(re);
+ warn_nullable(re, cond, warn);
nfa_t nfa(re);
if (opts->dump_nfa) dump_nfa(nfa);
- dfa_t dfa(nfa, cs, opts, cond, warn);
+ dfa_t dfa(nfa, opts, cond, warn);
if (opts->dump_dfa_det) dump_dfa(dfa);
// skeleton must be constructed after DFA construction
// but prior to any other DFA transformations
- Skeleton skeleton(dfa, cs, opts, defrule, name, cond, line);
+ Skeleton skeleton(dfa, opts, defrule, name, cond, line);
warn_undefined_control_flow(skeleton, warn);
if (opts->target == opt_t::SKELETON) {
emit_data(skeleton);
// ADFA stands for 'DFA with actions'
DFA *adfa = new DFA(dfa, fill, defrule, skeleton.sizeof_key,
- cs, name, cond, line, setup);
+ name, cond, line, setup);
// see note [reordering DFA states]
adfa->reorder();
}
}
-dfa_t::dfa_t(const nfa_t &nfa, const charset_t &charset, const opt_t *opts,
+dfa_t::dfa_t(const nfa_t &nfa, const opt_t *opts,
const std::string &cond, Warn &warn)
: states()
- , nchars(charset.size() - 1) // (n + 1) bounds for n ranges
+ , nchars(nfa.charset.size() - 1) // (n + 1) bounds for n ranges
+ , charset(nfa.charset)
, rules(nfa.rules)
, tags(nfa.tags)
, finvers(NULL)
std::vector<dfa_state_t*> states;
const size_t nchars;
+ std::vector<uint32_t> &charset;
std::valarray<Rule> &rules;
std::vector<Tag> &tags;
tagver_t *finvers;
tcmd_t *tcmd0;
tcid_t *tcid0;
- dfa_t(const nfa_t &nfa, const charset_t &charset, const opt_t *opts,
+ dfa_t(const nfa_t &nfa, const opt_t *opts,
const std::string &cond, Warn &warn);
~dfa_t();
size_t max_size;
size_t size;
nfa_state_t *states;
+ std::vector<uint32_t> &charset;
std::valarray<Rule> &rules;
std::vector<Tag> &tags;
nfa_state_t *root;
: max_size(estimate_size(spec.res))
, size(0)
, states(new nfa_state_t[max_size])
+ , charset(spec.charset)
, rules(spec.rules)
, tags(spec.tags)
, root(NULL)
RESpec::RESpec(const std::vector<RegExpRule> &ast)
: alc()
, res()
+ , charset(*new std::vector<uint32_t>)
, tags(*new std::vector<Tag>)
, rules(*new std::valarray<Rule>(ast.size()))
{
--- /dev/null
+#include "src/ir/re/re.h"
+
+namespace re2c {
+
+static bool nullable(const RESpec &spec, const RE *re, bool &trail) // true if 're' is considered able to match the empty string
+{
+ if (trail) return true; // once 'trail' has been set by a TAG node, every later call answers true
+
+ switch (re->type) {
+ default: assert(false); // unexpected node type; NOTE(review): if assert() is compiled out this falls through to the NIL case
+ case RE::NIL: return true;
+ case RE::SYM: return false;
+ case RE::ITER:
+ return nullable(spec, re->iter, trail); // decided solely by the iterated sub-expression
+ case RE::REPEAT:
+ return nullable(spec, re->repeat.re, trail); // only the repeated sub-expression is examined here
+ case RE::TAG:
+ trail |= spec.tags[re->tag.idx].name == NULL; // a tag without a name sets 'trail' (presumably the trailing-context marker -- confirm)
+ return true; // a TAG node itself always counts as nullable
+ case RE::ALT:
+ return nullable(spec, re->alt.re1, trail) // '||' short-circuits: re2 is not visited when re1 is nullable
+ || nullable(spec, re->alt.re2, trail);
+ case RE::CAT:
+ return nullable(spec, re->cat.re1, trail) // if re1 set 'trail', the call on re2 returns true at once
+ && nullable(spec, re->cat.re2, trail);
+ }
+}
+
+/*
+ * Warn about rules that match the empty string
+ * (including rules with nonempty trailing context).
+ * Known limitation: false positives on partially self-shadowed
+ * rules like [^]?
+ *
+ * For every regexp in spec.res that nullable() accepts, calls
+ * warn.match_empty_string() with the 'fline' of the corresponding
+ * rule's code and the condition name 'cond'.
+ */
+void warn_nullable(const RESpec &spec, const std::string &cond, Warn &warn)
+{
+ const size_t nre = spec.res.size();
+ for (size_t i = 0; i < nre; ++i) {
+ bool trail = false; // fresh flag for each rule; nullable() may set it
+ if (nullable(spec, spec.res[i], trail)) {
+ warn.match_empty_string(spec.rules[i].code->fline, cond); // assumes spec.rules[i] belongs to spec.res[i] -- TODO confirm
+ }
+ }
+}
+
+} // namespace re2c
{
RE::alc_t alc;
std::vector<RE*> res;
+ std::vector<uint32_t> &charset;
std::vector<Tag> &tags;
std::valarray<Rule> &rules;
explicit RESpec(const std::vector<RegExpRule> &ast);
};
+void split_charset(RESpec &spec, const opt_t *opts);
void find_fixed_tags(RESpec &spec, const opt_t *opts);
void insert_default_tags(RESpec &spec);
+void warn_nullable(const RESpec &spec, const std::string &cond, Warn &warn);
inline RE *re_nil(RE::alc_t &alc)
{
--- /dev/null
+#include "src/util/c99_stdint.h"
+#include <set>
+
+#include "src/ir/re/re.h"
+
+namespace re2c {
+
+static void split(const RE* re, std::set<uint32_t> &cs) // recursively collects the range bounds used by 're' into 'cs'
+{
+ switch (re->type) {
+ default: assert(false); // unexpected node type; NOTE(review): if assert() is compiled out this falls through to the NIL case (a no-op)
+ case RE::NIL: break; // contributes no bounds
+ case RE::TAG: break; // contributes no bounds
+ case RE::SYM:
+ for (const Range *r = re->sym; r; r = r->next()) { // walk the chain of ranges until next() yields null
+ cs.insert(r->lower()); // both ends of every range are recorded
+ cs.insert(r->upper());
+ }
+ break;
+ case RE::ALT:
+ split(re->alt.re1, cs);
+ split(re->alt.re2, cs);
+ break;
+ case RE::CAT:
+ split(re->cat.re1, cs);
+ split(re->cat.re2, cs);
+ break;
+ case RE::ITER:
+ split(re->iter, cs);
+ break;
+ case RE::REPEAT:
+ split(re->repeat.re, cs);
+ break;
+ }
+}
+
+/* The original set of code units (charset) might be very large.
+ * A common trick is to split the charset into disjoint character ranges
+ * and choose a representative of each range (we choose the lower bound).
+ * The set of all representatives is the new (compacted) charset.
+ * Don't forget to include zero and the upper bound, even if they
+ * do not explicitly appear in ranges.
+ *
+ * The bounds gathered from every regexp in spec.res, together with 0
+ * and opts->encoding.nCodeUnits(), are appended to spec.charset in
+ * ascending order and without duplicates.
+ */
+void split_charset(RESpec &spec, const opt_t *opts)
+{
+ std::set<uint32_t> cs; // ordered set: keeps the bounds sorted and unique
+
+ std::vector<RE*>::const_iterator
+ i = spec.res.begin(),
+ e = spec.res.end();
+ for (; i != e; ++i) {
+ split(*i, cs);
+ }
+ cs.insert(0); // always present, even if no range mentions it
+ cs.insert(opts->encoding.nCodeUnits()); // upper bound taken from the encoding
+
+ spec.charset.insert(spec.charset.end(), cs.begin(), cs.end()); // append after whatever spec.charset already holds
+}
+
+} // namespace re2c
+++ /dev/null
-#include "src/ir/regexp/regexp.h"
-
-namespace re2c {
-
-static bool nullable(const RegExp *re, bool &trail)
-{
- if (trail) {
- return true;
- }
- switch (re->type) {
- default: assert(false);
- case RegExp::NIL: return true;
- case RegExp::SYM: return false;
- case RegExp::ITER:
- return re->iter.min == 0
- || nullable(re->iter.re, trail);
- case RegExp::TAG:
- if (re->tag == NULL) {
- trail = true;
- }
- return true;
- case RegExp::ALT:
- return nullable(re->alt.re1, trail)
- || nullable(re->alt.re2, trail);
- case RegExp::CAT:
- return nullable(re->cat.re1, trail)
- && nullable(re->cat.re2, trail);
- }
-}
-
-/*
- * warn about rules that match empty string
- * (including rules with nonempty trailing context)
- * false positives on partially self-shadowed rules like [^]?
- */
-void warn_nullable(const std::vector<RegExpRule> ®exps,
- const std::string &cond, Warn &warn)
-{
- const size_t nregexps = regexps.size();
- for (size_t i = 0; i < nregexps; ++i) {
- const RegExpRule &r = regexps[i];
- bool trail = false;
- if (nullable(r.re, trail)) {
- warn.match_empty_string(r.code->fline, cond);
- }
- }
-}
-
-} // namespace re2c
namespace re2c
{
-struct nfa_state_t;
-struct nfa_t;
-
-typedef std::vector<uint32_t> charset_t;
-
-/* note [Kleene star is expressed in terms of plus]
- *
- * In literature Kleene star 'r*' (zero or more repetitions of 'r')
- * is the basic operation. In practice it is more convenient to use
- * 'r+' (one or more repetitions of 'r'), because expansion 'r+ ::= r r*'
- * duplicates 'r', while expansion 'r* = r+ | <empty>' allows to
- * avoid duplication. This is more efficient in general and crucial
- * in cases when duplication of 'r' is forbidden (e.g. if 'r' has tags).
- */
-
struct RegExp
{
static free_list<RegExp*> flist;
{}
};
-void split(const std::vector<RegExpRule> &rs, std::set<uint32_t> &cs);
const RegExp *mkAlt(const RegExp *re1, const RegExp *re2);
const RegExp *doAlt(const RegExp *re1, const RegExp *re2);
const RegExp *doCat(const RegExp *re1, const RegExp *re2);
-void warn_nullable(const std::vector<RegExpRule> ®exps,
- const std::string &cond, Warn &warn);
} // end namespace re2c
+++ /dev/null
-#include "src/util/c99_stdint.h"
-#include <set>
-
-#include "src/ir/regexp/regexp.h"
-#include "src/util/range.h"
-
-namespace re2c {
-
-static void split(const RegExp* re, std::set<uint32_t> &cs)
-{
- switch (re->type) {
- case RegExp::NIL:
- case RegExp::TAG:
- break;
- case RegExp::SYM:
- for (const Range *r = re->sym; r; r = r->next()) {
- cs.insert(r->lower());
- cs.insert(r->upper());
- }
- break;
- case RegExp::ALT:
- split(re->alt.re1, cs);
- split(re->alt.re2, cs);
- break;
- case RegExp::CAT:
- split(re->cat.re1, cs);
- split(re->cat.re2, cs);
- break;
- case RegExp::ITER:
- split(re->iter.re, cs);
- break;
- }
-}
-
-void split(const std::vector<RegExpRule> &rs, std::set<uint32_t> &cs)
-{
- const size_t nrs = rs.size();
- for (size_t i = 0; i < nrs; ++i) {
- split(rs[i].re, cs);
- }
-}
-
-} // namespace re2c
return x.save == y.save && x.copy == y.copy;
}
-void Node::init(const dfa_state_t *s, const charset_t &cs, size_t nil)
+void Node::init(const dfa_state_t *s,
+ const std::vector<uint32_t> &charset, size_t nil)
{
- const size_t nc = cs.size() - 1;
+ const size_t nc = charset.size() - 1;
for (uint32_t c = 0, l = 0; c < nc;) {
size_t j = s->arcs[c];
// all arcs go to default node => this node is final
if (l == 0 && c == nc && j == nil) break;
- const uint32_t u = cs[c];
+ const uint32_t u = charset[c];
arcs[j].push_back(Node::range_t(l, u - 1, &t));
l = u;
Skeleton::Skeleton(
const dfa_t &dfa,
- const charset_t &cs,
const opt_t *op,
size_t def,
const std::string &dfa_name,
, sizeof_key(8)
, defrule(def)
, ntagver(static_cast<size_t>(dfa.maxtagver) + 1)
+ , charset(dfa.charset)
, rules(dfa.rules)
, tags(dfa.tags)
, finvers(dfa.finvers)
// initialize nodes
const size_t nil = nodes_count - 1;
for (size_t i = 0; i < nil; ++i) {
- nodes[i].init(dfa.states[i], cs, nil);
+ nodes[i].init(dfa.states[i], charset, nil);
}
// initialize size of key
const tcmd_t *cmd;
Node();
- void init(const dfa_state_t *s, const charset_t &cs, size_t nil);
+ void init(const dfa_state_t *s,
+ const std::vector<uint32_t> &charset, size_t nil);
bool end() const;
FORBID_COPY(Node);
size_t sizeof_key;
size_t defrule;
size_t ntagver;
+ const std::vector<uint32_t> &charset;
const std::valarray<Rule> &rules;
const std::vector<Tag> &tags;
const tagver_t *finvers;
- Skeleton(const dfa_t &dfa, const charset_t &cs, const opt_t *opts,
- size_t def, const std::string &dfa_name, const std::string &dfa_cond,
+ Skeleton(const dfa_t &dfa, const opt_t *opts, size_t def,
+ const std::string &dfa_name, const std::string &dfa_cond,
uint32_t dfa_line);
~Skeleton ();
FORBID_COPY(Skeleton);