src/conf/msg.h \
src/conf/opt.h \
src/conf/warn.h \
- src/ir/bytecode/charset.h \
src/ir/bytecode/bytecode.h \
src/ir/bytecode/ins.h \
src/ir/dfa/state.h \
src/conf/warn.cc \
src/ir/bytecode/bytecode.cc \
src/ir/bytecode/ins.cc \
- src/ir/bytecode/charset.cc \
src/ir/bytecode/split.cc \
src/ir/bytecode/compile.cc \
src/ir/bytecode/calc_size.cc \
+#include <assert.h>
#include <string.h>
#include <algorithm>
#include "src/codegen/output.h"
#include "src/ir/bytecode/bytecode.h"
-#include "src/ir/bytecode/charset.h"
#include "src/ir/bytecode/ins.h"
#include "src/ir/dfa/dfa.h"
#include "src/ir/regexp/regexp.h"
{
RegExp * re = spec.re;
- CharSet cs (cunits);
+ // The original set of code units (charset) might be very large.
+ // A common trick is to split charset into disjoint character ranges
+ // and choose a representative of each range (we choose lower bound).
+ // The set of all representatives is the new (compacted) charset.
+ // (Don't forget to include zero and exclude upper bound.)
+ charset_t cs;
re->split(cs);
+ cs.insert(0);
+ cs.erase(cunits);
- Char *rep = new Char[cunits];
+ re->calcSize(cs);
- for (uint32_t j = 0; j < cunits; ++j)
- {
- if (!cs.rep[j]->nxt)
- cs.rep[j]->nxt = &cs.ptn[j];
-
- rep[j] = static_cast<Char> (cs.rep[j]->nxt - &cs.ptn[0]);
- }
-
- re->calcSize(rep);
Ins *ins = new Ins[re->size + 1];
memset(ins, 0, (re->size + 1)*sizeof(Ins));
- const uint32_t size = re->compile(rep, ins);
+ const uint32_t size = re->compile(cs, ins);
Ins *eoi = &ins[size];
eoi->i.tag = GOTO;
eoi->i.link = eoi;
, size
, 0
, cunits
- , rep
+ , cs
, spec.rules
));
namespace re2c
{
-void AltOp::calcSize (Char * rep)
+void AltOp::calcSize (const charset_t & cs) // size = both alternatives + 2 extra slots (AltOp::compile emits a FORK and a GOTO)
{
- exp1->calcSize (rep);
- exp2->calcSize (rep);
+ exp1->calcSize (cs);
+ exp2->calcSize (cs);
size = exp1->size + exp2->size + 2;
}
-void CatOp::calcSize (Char * rep)
+void CatOp::calcSize (const charset_t & cs) // concatenation adds no slots of its own: operand sizes simply add up
{
- exp1->calcSize (rep);
- exp2->calcSize (rep);
+ exp1->calcSize (cs);
+ exp2->calcSize (cs);
size = exp1->size + exp2->size;
}
-void CloseOp::calcSize (Char * rep)
+void CloseOp::calcSize (const charset_t & cs) // size = sub-expression + 1 slot (CloseOp::compile appends a FORK)
{
- exp->calcSize (rep);
+ exp->calcSize (cs);
size = exp->size + 1;
}
-void MatchOp::calcSize (Char * rep)
+void MatchOp::calcSize (const charset_t & cs) // size = 1 + number of charset representatives covered by the ranges of `match`
{
size = 1;
for (Range * r = match; r; r = r->next ())
{
- for (uint32_t c = r->lower (); c < r->upper (); ++c)
- {
- if (rep[c] == c)
- {
- ++size;
- }
- }
+ size += static_cast<uint32_t> (std::distance( // count representatives in [lower, upper)
+ cs.find(r->lower()), // NOTE(review): assumes split() already inserted this lower bound; find() yields end() otherwise
+ cs.find(r->upper()))); // an upper bound absent from cs (e.g. the erased cunits) yields end(), closing the interval
}
}
-void NullOp::calcSize (Char *)
+void NullOp::calcSize (const charset_t &) // charset is unused: the empty expression occupies no slots
{
size = 0;
}
-void RuleOp::calcSize (Char * rep)
+void RuleOp::calcSize (const charset_t & cs) // size = exp + (CTXT + ctx + TERM when ctx is non-empty, else just TERM)
{
- exp->calcSize (rep);
- ctx->calcSize (rep);
+ exp->calcSize (cs);
+ ctx->calcSize (cs);
size = exp->size + (ctx->size ? ctx->size + 2 : 1);
}
+++ /dev/null
-#include "src/ir/bytecode/charset.h"
-#include "src/util/allocate.h"
-
-namespace re2c {
-
-CharSet::CharSet (uint32_t cunits)
- : fix (0)
- , freeHead (0)
- , freeTail (0)
- , rep (allocate<CharPtr> (cunits))
- , ptn (allocate<CharPtn> (cunits))
-{
- for (uint32_t j = 0; j < cunits; ++j)
- {
- rep[j] = &ptn[0];
- ptn[j].nxt = &ptn[j + 1]; /* wrong for j=encoding.nCodeUnits() - 1 but will be corrected below */
- ptn[j].card = 0;
- }
- freeHead = &ptn[1];
- * (freeTail = &ptn[cunits - 1].nxt) = NULL;
- ptn[0].card = cunits;
- ptn[0].nxt = NULL;
-}
-
-CharSet::~CharSet ()
-{
- operator delete (rep);
- operator delete (ptn);
-}
-
-} // namespace re2c
+++ /dev/null
-#ifndef _RE2C_IR_BYTECODE_CHARSET_
-#define _RE2C_IR_BYTECODE_CHARSET_
-
-#include "src/util/c99_stdint.h"
-#include "src/util/forbid_copy.h"
-
-namespace re2c
-{
-
-struct CharPtn
-{
- uint32_t card;
- CharPtn * fix;
- CharPtn * nxt;
-
- FORBID_COPY (CharPtn);
-};
-
-typedef CharPtn * CharPtr;
-
-struct CharSet
-{
- CharPtn * fix;
- CharPtn * freeHead;
- CharPtn ** freeTail;
- CharPtr * rep;
- CharPtn * ptn;
-
- CharSet (uint32_t cunits);
- ~CharSet ();
-
- FORBID_COPY (CharSet);
-};
-
-} // namespace re2c
-
-#endif // _RE2C_IR_BYTECODE_CHARSET_
static uint32_t compile_goto (Ins * ins, Ins * i);
-uint32_t AltOp::compile (Char * rep, Ins * i)
+uint32_t AltOp::compile (const charset_t & cs, Ins * i)
{
if (ins_cache)
{
ins_cache = i;
i->i.tag = FORK;
- const uint32_t sz1 = exp1->compile (rep, &i[1]);
+ const uint32_t sz1 = exp1->compile (cs, &i[1]);
Ins * const j = &i[sz1 + 1];
i->i.link = &j[1];
j->i.tag = GOTO;
- const uint32_t sz2 = exp2->compile (rep, &j[1]);
+ const uint32_t sz2 = exp2->compile (cs, &j[1]);
j->i.link = &j[sz2 + 1];
if (ins_access == PRIVATE)
}
}
-uint32_t CatOp::compile (Char * rep, Ins * i)
+uint32_t CatOp::compile (const charset_t & cs, Ins * i)
{
if (ins_cache)
{
{
ins_cache = i;
- const uint32_t sz1 = exp1->compile (rep, &i[0]);
- const uint32_t sz2 = exp2->compile (rep, &i[sz1]);
+ const uint32_t sz1 = exp1->compile (cs, &i[0]);
+ const uint32_t sz2 = exp2->compile (cs, &i[sz1]);
if (ins_access == PRIVATE)
{
}
}
-uint32_t CloseOp::compile (Char * rep, Ins * i)
+uint32_t CloseOp::compile (const charset_t & cs, Ins * i)
{
if (ins_cache)
{
{
ins_cache = i;
- i += exp->compile (rep, &i[0]);
+ i += exp->compile (cs, &i[0]);
i->i.tag = FORK;
i->i.link = ins_cache;
++i;
}
}
-uint32_t MatchOp::compile (Char * rep, Ins * i)
+uint32_t MatchOp::compile (const charset_t & cs, Ins * i)
{
if (ins_cache)
{
uint32_t bump = size;
for (Range *r = match; r; r = r->next ())
{
- for (uint32_t c = r->lower (); c < r->upper (); ++c)
+ charset_t::const_iterator l = cs.find(r->lower());
+ charset_t::const_iterator u = cs.find(r->upper());
+ for (; l != u; ++l)
{
- if (rep[c] == c)
- {
- j->c.value = c;
- j->c.bump = --bump;
- j++;
- }
+ j->c.value = *l;
+ j->c.bump = --bump;
+ j++;
}
}
ins_cache = NULL;
}
-uint32_t NullOp::compile (Char *, Ins *)
+uint32_t NullOp::compile (const charset_t &, Ins *)
{
return 0;
}
void NullOp::decompile () {}
-uint32_t RuleOp::compile (Char * rep, Ins * i)
+uint32_t RuleOp::compile (const charset_t & cs, Ins * i)
{
if (ins_cache)
{
{
ins_cache = i;
- i += exp->compile (rep, &i[0]);
+ i += exp->compile (cs, &i[0]);
if (ctx->size)
{
i->i.tag = CTXT;
i->i.link = &i[1];
++i;
- i += ctx->compile (rep, &i[0]);
+ i += ctx->compile (cs, &i[0]);
}
i->i.tag = TERM;
i->i.link = this;
#include "src/util/c99_stdint.h"
-#include "src/ir/bytecode/charset.h"
#include "src/ir/regexp/regexp.h"
#include "src/ir/regexp/regexp_alt.h"
#include "src/ir/regexp/regexp_cat.h"
namespace re2c {
-void AltOp::split (CharSet & s)
+void AltOp::split (charset_t & cs) // forwards to both alternatives so each can add to cs
{
- exp1->split (s);
- exp2->split (s);
+ exp1->split (cs);
+ exp2->split (cs);
}
-void CatOp::split (CharSet & s)
+void CatOp::split (charset_t & cs) // forwards to both concatenated operands so each can add to cs
{
- exp1->split (s);
- exp2->split (s);
+ exp1->split (cs);
+ exp2->split (cs);
}
-void CloseOp::split (CharSet & s)
+void CloseOp::split (charset_t & cs) // forwards to the repeated sub-expression
{
- exp->split (s);
+ exp->split (cs);
}
-void MatchOp::split (CharSet & s)
+void MatchOp::split (charset_t & cs) // records both bounds of every range in `match` as split points in cs
{
for (Range *r = match; r; r = r->next ())
{
- for (uint32_t c = r->lower (); c < r->upper (); ++c)
- {
- CharPtn * x = s.rep[c];
- CharPtn * a = x->nxt;
- if (!a)
- {
- if (x->card == 1)
- {
- continue;
- }
- x->nxt = a = s.freeHead;
- if (!(s.freeHead = s.freeHead->nxt))
- {
- s.freeTail = &s.freeHead;
- }
- a->nxt = NULL;
- x->fix = s.fix;
- s.fix = x;
- }
- if (--(x->card) == 0)
- {
- *s.freeTail = x;
- *(s.freeTail = &x->nxt) = NULL;
- }
- s.rep[c] = a;
- ++(a->card);
- }
- }
- for (; s.fix; s.fix = s.fix->fix)
- {
- if (s.fix->card)
- {
- s.fix->nxt = NULL;
- }
+ cs.insert (r->lower ()); // lower bound of the range
+ cs.insert (r->upper ()); // upper bound, iterated as exclusive by calcSize/compile
}
}
-void NullOp::split (CharSet &) {}
+void NullOp::split (charset_t &) {} // the empty expression contributes no range bounds
-void RuleOp::split (CharSet & s)
+void RuleOp::split (charset_t & cs) // forwards to the rule body (exp) and then its ctx part
{
- exp->split (s);
- ctx->split (s);
+ exp->split (cs);
+ ctx->split (cs);
}
} // namespace re2c
struct GoTo
{
- Char ch;
+ uint32_t ch; // spelled out as uint32_t because this patch removes the `typedef uint32_t Char`
void *to;
};
, uint32_t ni
, uint32_t lb
, uint32_t ub
- , const Char * rep
+ , const charset_t & cs
, rules_t rules
)
: accepts ()
, tail(&head)
, toDo(NULL)
, free_ins(ins)
- , free_rep(rep)
// statistics
, max_fill (0)
s->go.nSpans = 0;
- for (uint32_t j = 0; j < nc;)
+ for (charset_t::const_iterator j = cs.begin(); j != cs.end();)
{
- State *to = (State*) goTo[rep[j]].to;
-
- while (++j < nc && goTo[rep[j]].to == to) ;
-
- span[s->go.nSpans].ub = lb + j;
-
+ State *to = (State*) goTo[*j].to;
+ while (++j != cs.end() && goTo[*j].to == to) ;
+ span[s->go.nSpans].ub = lb + (j == cs.end() ? nc : *j);
span[s->go.nSpans].to = to;
-
s->go.nSpans++;
}
delete s;
}
delete [] free_ins;
- delete [] free_rep;
delete skeleton;
}
State ** tail;
State * toDo;
const Ins * free_ins;
- const Char * free_rep;
// statistics
uint32_t max_fill;
, uint32_t
, uint32_t
, uint32_t
- , const Char *
+ , const charset_t &
, rules_t
);
~DFA ();
#include "src/util/c99_stdint.h"
#include <iosfwd>
+#include <set>
#include "src/util/free_list.h"
#include "src/util/forbid_copy.h"
namespace re2c
{
-struct CharSet;
union Ins;
-typedef uint32_t Char;
+typedef std::set<uint32_t> charset_t;
class RegExp
{
{
vFreeList.erase (this);
}
- virtual void split (CharSet &) = 0;
- virtual void calcSize (Char *) = 0;
+ virtual void split (charset_t &) = 0;
+ virtual void calcSize (const charset_t &) = 0;
virtual uint32_t fixedLength ();
- virtual uint32_t compile (Char *, Ins *) = 0;
+ virtual uint32_t compile (const charset_t &, Ins *) = 0;
virtual void decompile () = 0;
virtual void display (std::ostream &) const = 0;
friend std::ostream & operator << (std::ostream & o, const RegExp & re);
: exp1 (e1)
, exp2 (e2)
{}
- void split (CharSet &);
- void calcSize (Char *);
+ void split (charset_t &);
+ void calcSize (const charset_t &);
uint32_t fixedLength ();
- uint32_t compile (Char *, Ins *);
+ uint32_t compile (const charset_t &, Ins *);
void decompile ();
void display (std::ostream & o) const;
friend RegExp * mkAlt (RegExp *, RegExp *);
: exp1 (e1)
, exp2 (e2)
{}
- void split (CharSet &);
- void calcSize (Char *);
+ void split (charset_t &);
+ void calcSize (const charset_t &);
uint32_t fixedLength ();
- uint32_t compile (Char *, Ins *);
+ uint32_t compile (const charset_t &, Ins *);
void decompile ();
void display (std::ostream & o) const;
inline CloseOp (RegExp * e)
: exp (e)
{}
- void split (CharSet &);
- void calcSize (Char *);
- uint32_t compile (Char *, Ins *);
+ void split (charset_t &);
+ void calcSize (const charset_t &);
+ uint32_t compile (const charset_t &, Ins *);
void decompile ();
void display (std::ostream & o) const;
inline MatchOp (Range * m)
: match (m)
{}
- void split (CharSet &);
- void calcSize (Char *);
+ void split (charset_t &);
+ void calcSize (const charset_t &);
uint32_t fixedLength ();
- uint32_t compile (Char *, Ins *);
+ uint32_t compile (const charset_t &, Ins *);
void decompile ();
void display (std::ostream & o) const;
class NullOp: public RegExp
{
public:
- void split (CharSet &);
- void calcSize (Char *);
+ void split (charset_t &);
+ void calcSize (const charset_t &);
uint32_t fixedLength ();
- uint32_t compile (Char *, Ins *);
+ uint32_t compile (const charset_t &, Ins *);
void decompile ();
void display (std::ostream & o) const;
};
ins_access = access;
}
void display (std::ostream & o) const;
- void split (CharSet &);
- void calcSize (Char *);
- uint32_t compile (Char *, Ins *);
+ void split (charset_t &);
+ void calcSize (const charset_t &);
+ uint32_t compile (const charset_t &, Ins *);
void decompile ();
FORBID_COPY (RuleOp);