From 3e939ded3ff7788dfd1247f315cc3d70d2bb46b6 Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Mon, 22 Jun 2015 13:50:17 +0100 Subject: [PATCH] Operations on character classes: fixed subtraction, simplified addition. Subtraction was broken by commit f56196d29f6c29b37e3e95a6777714c237e1c71c: "Simplified implementation of range union and difference." --- re2c/src/ir/regexp/encoding/enc.cc | 22 ++-- re2c/src/ir/regexp/encoding/range_suffix.cc | 2 +- .../ir/regexp/encoding/utf16/utf16_regexp.cc | 4 +- .../ir/regexp/encoding/utf8/utf8_regexp.cc | 4 +- re2c/src/ir/regexp/regexp.cc | 16 +-- re2c/src/main.cc | 1 - re2c/src/util/range.cc | 123 +++++++++++------- re2c/src/util/range.h | 30 +++-- 8 files changed, 119 insertions(+), 83 deletions(-) diff --git a/re2c/src/ir/regexp/encoding/enc.cc b/re2c/src/ir/regexp/encoding/enc.cc index fcc68302..b0d48e8b 100644 --- a/re2c/src/ir/regexp/encoding/enc.cc +++ b/re2c/src/ir/regexp/encoding/enc.cc @@ -132,16 +132,16 @@ Range * Enc::encodeRange(uint32_t l, uint32_t h) const case ASCII: l &= 0xFF; h &= 0xFF; - r = new Range(l, h + 1); + r = Range::ran (l, h + 1); break; case EBCDIC: { const uint32_t el = asc2ebc[l & 0xFF]; - r = new Range(el, el + 1); + r = Range::sym (el); for (uint32_t c = l + 1; c <= h; ++c) { const uint32_t ec = asc2ebc[c & 0xFF]; - r = range_union (r, new Range(ec, ec + 1)); + r = Range::add (r, Range::sym (ec)); } break; } @@ -149,7 +149,7 @@ Range * Enc::encodeRange(uint32_t l, uint32_t h) const case UTF16: case UTF32: case UTF8: - r = new Range(l, h + 1); + r = Range::ran (l, h + 1); if (l <= SURR_MAX && h >= SURR_MIN) { switch (policy) @@ -159,10 +159,10 @@ Range * Enc::encodeRange(uint32_t l, uint32_t h) const break; case POLICY_SUBSTITUTE: { - Range * surrs = new Range(SURR_MIN, SURR_MAX + 1); - Range * error = new Range(UNICODE_ERROR, UNICODE_ERROR + 1); - r = range_diff (r, surrs); - r = range_union (r, error); + Range * surrs = Range::ran (SURR_MIN, SURR_MAX + 1); + Range * error = Range::sym (UNICODE_ERROR); + r = Range::sub (r, surrs); + r = Range::add (r, error); break; } case POLICY_IGNORE: @@ -186,11 +186,11 @@ Range * Enc::encodeRange(uint32_t l, uint32_t h) const */ Range * Enc::fullRange() const { - Range * r = new Range(0, nCodePoints()); + Range * r = Range::ran (0, nCodePoints()); if (policy != POLICY_IGNORE) { - Range * surrs = new Range(SURR_MIN, SURR_MAX + 1); - r = range_diff (r, surrs); + Range * surrs = Range::ran (SURR_MIN, SURR_MAX + 1); + r = Range::sub (r, surrs); } return r; } diff --git a/re2c/src/ir/regexp/encoding/range_suffix.cc b/re2c/src/ir/regexp/encoding/range_suffix.cc index 87898491..09617379 100644 --- a/re2c/src/ir/regexp/encoding/range_suffix.cc +++ b/re2c/src/ir/regexp/encoding/range_suffix.cc @@ -27,7 +27,7 @@ RegExp * emit(RangeSuffix * p, RegExp * re) RegExp * regexp = NULL; for (; p != NULL; p = p->next) { - RegExp * re1 = doCat(new MatchOp(new Range(p->l, p->h + 1)), re); + RegExp * re1 = doCat(new MatchOp(Range::ran (p->l, p->h + 1)), re); regexp = doAlt(regexp, emit(p->child, re1)); } return regexp; diff --git a/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc b/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc index 49522657..b8fef48a 100644 --- a/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc +++ b/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc @@ -9,12 +9,12 @@ namespace re2c { RegExp * UTF16Symbol(utf16::rune r) { if (r <= utf16::MAX_1WORD_RUNE) - return new MatchOp(new Range(r, r + 1)); + return new MatchOp(Range::sym (r)); else { const uint16_t ld = utf16::lead_surr(r); const uint16_t tr = utf16::trail_surr(r); - return new CatOp(new MatchOp(new Range(ld, ld + 1)), new MatchOp(new Range(tr, tr + 1))); + return new CatOp(new MatchOp(Range::sym (ld)), new MatchOp(Range::sym (tr))); } } diff --git a/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc b/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc index 084755fa..e6bd1a20 100644 --- a/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc +++ b/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc @@ -10,9 +10,9 @@ RegExp * UTF8Symbol(utf8::rune r) { uint8_t chars[utf8::MAX_RUNE_LENGTH]; const int chars_count = utf8::rune_to_bytes(chars, r); - RegExp * re = new MatchOp(new Range(chars[0], chars[0] + 1)); + RegExp * re = new MatchOp(Range::sym (chars[0])); for (int i = 1; i < chars_count; ++i) - re = new CatOp(re, new MatchOp(new Range(chars[i], chars[i] + 1))); + re = new CatOp(re, new MatchOp(Range::sym (chars[i]))); return re; } diff --git a/re2c/src/ir/regexp/regexp.cc b/re2c/src/ir/regexp/regexp.cc index 6e1cdc99..55981b30 100644 --- a/re2c/src/ir/regexp/regexp.cc +++ b/re2c/src/ir/regexp/regexp.cc @@ -84,7 +84,7 @@ MatchOp * merge (MatchOp * m1, MatchOp * m2) { return m1; } - MatchOp * m = new MatchOp (range_union (m1->match, m2->match)); + MatchOp * m = new MatchOp (Range::add (m1->match, m2->match)); if (m1->ins_access == RegExp::PRIVATE || m2->ins_access == RegExp::PRIVATE) { @@ -143,7 +143,7 @@ RegExp * Scanner::matchSymbol(uint32_t c) const else if (encoding.is(Enc::UTF8)) return UTF8Symbol(c); else - return new MatchOp(new Range(c, c + 1)); + return new MatchOp (Range::sym (c)); } RegExp * Scanner::strToRE (SubStr & s) const @@ -202,7 +202,7 @@ Range * Scanner::mkRange(SubStr &s) const { Range *r = getRange(s); while (s.len > 0) - r = range_union (r, getRange(s)); + r = Range::add (r, getRange(s)); return r; } @@ -252,7 +252,7 @@ RegExp * Scanner::invToRE (SubStr & s) const Range * r = s.len == 0 ? full - : range_diff (full, mkRange (s)); + : Range::sub (full, mkRange (s)); return matchSymbolRange(r); } @@ -265,7 +265,7 @@ RegExp * Scanner::mkDiff (RegExp * e1, RegExp * e2) const { fatal("can only difference char sets"); } - Range * r = range_diff (m1->match, m2->match); + Range * r = Range::sub (m1->match, m2->match); return matchSymbolRange (r); } @@ -276,8 +276,8 @@ RegExp * Scanner::mkDot() const uint32_t c = '\n'; if (!encoding.encode(c)) fatalf("Bad code point: '0x%X'", c); - Range * ran = new Range(c, c + 1); - Range * inv = range_diff (full, ran); + Range * ran = Range::sym (c); + Range * inv = Range::sub (full, ran); return matchSymbolRange(inv); } @@ -294,7 +294,7 @@ RegExp * Scanner::mkDot() const */ RegExp * Scanner::mkDefault() const { - Range * def = new Range(0, encoding.nCodeUnits()); + Range * def = Range::ran (0, encoding.nCodeUnits()); return new MatchOp(def); } diff --git a/re2c/src/main.cc b/re2c/src/main.cc index f4a22634..37c44acc 100644 --- a/re2c/src/main.cc +++ b/re2c/src/main.cc @@ -70,7 +70,6 @@ uint32_t last_fill_index = 0; CodeNames mapCodeName; free_list RegExp::vFreeList; -free_list Range::vFreeList; using namespace std; diff --git a/re2c/src/util/range.cc b/re2c/src/util/range.cc index 92a5bead..fa46ab33 100644 --- a/re2c/src/util/range.cc +++ b/re2c/src/util/range.cc @@ -3,70 +3,95 @@ namespace re2c { -Range * range_union (Range * r1, Range * r2) +free_list Range::vFreeList; + +void Range::append_overlapping (Range * & head, Range * & tail, const Range * r) { - Range * r = NULL; - Range ** p = &r; - for (;;) + if (!head) { - if (!r1) - { - * p = r2; - break; - } - if (!r2) - { - * p = r1; - break; - } - if (r2->lb < r1->lb) // swap + head = Range::ran (r->lb, r->ub); + tail = head; + } + else if (tail->ub < r->lb) + { + tail->nx = Range::ran (r->lb, r->ub); + tail = tail->nx; + } + else if (tail->ub < r->ub) + { + tail->ub = r->ub; + } +} + +Range * Range::add (const Range * r1, const Range * r2) +{ + Range * head = NULL; + Range * tail = NULL; + for (; r1 && r2;) + { + if (r1->lb < r2->lb) { - Range * r1_old = r1; - r1 = r2; - r2 = r1_old; + append_overlapping (head, tail, r1); + r1 = r1->nx; } - uint32_t ub = r1->ub; - if (r2->lb < r1->ub) + else { - for (; r2 && r2->lb < r1->ub; r2 = r2->nx) - { - if (r1->ub < r2->ub) - { - ub = r2->ub; - } - } + append_overlapping (head, tail, r2); + r2 = r2->nx; } - * p = new Range (r1->lb, ub); - p = &(* p)->nx; - r1 = r1->nx; } - return r; + for (; r1; r1 = r1->nx) + { + append_overlapping (head, tail, r1); + } + for (; r2; r2 = r2->nx) + { + append_overlapping (head, tail, r2); + } + return head; } -Range * range_diff (Range * r1, Range * r2) +void Range::append (Range ** & ptail, uint32_t l, uint32_t u) { - Range * r = NULL; - Range ** p = &r; - for (; r1; r1 = r1->nx) + Range * & tail = * ptail; + tail = Range::ran (l, u); + ptail = &tail->nx; +} + +Range * Range::sub (const Range * r1, const Range * r2) +{ + Range * head = NULL; + Range ** ptail = &head; + while (r1) { - for (; r2 && r2->ub <= r1->lb; r2 = r2->nx); - uint32_t lb = r1->lb; - for (; r2 && r2->lb < r1->ub; r2 = r2->nx) + if (!r2 || r2->lb >= r1->ub) { - if (lb < r2->lb) - { - * p = new Range(lb, r2->lb); - p = &(* p)->nx; - } - lb = r2->ub; + append (ptail, r1->lb, r1->ub); + r1 = r1->nx; + } + else if (r2->ub <= r1->lb) + { + r2 = r2->nx; } - if (lb < r1->ub) + else { - * p = new Range(lb, r1->ub); - p = &(* p)->nx; + if (r1->lb < r2->lb) + { + append (ptail, r1->lb, r2->lb); + } + while (r2 && r2->ub < r1->ub) + { + const uint32_t lb = r2->ub; + r2 = r2->nx; + const uint32_t ub = r2 && r2->lb < r1->ub + ? r2->lb + : r1->ub; + append (ptail, lb, ub); + } + r1 = r1->nx; } } - return r; + return head; } -} // end namespace re2c +} // namespace re2c diff --git a/re2c/src/util/range.h b/re2c/src/util/range.h index 48d79f38..68d5d28c 100644 --- a/re2c/src/util/range.h +++ b/re2c/src/util/range.h @@ -23,13 +23,13 @@ private: uint32_t ub; public: - Range (uint32_t l, uint32_t u) - : nx (NULL) - , lb (l) - , ub (u) + static Range * sym (uint32_t c) { - assert (lb < ub); - vFreeList.insert (this); + return new Range (NULL, c, c + 1); + } + static Range * ran (uint32_t l, uint32_t u) + { + return new Range (NULL, l, u); } ~Range () { @@ -38,12 +38,24 @@ public: Range * next () const { return nx; } uint32_t lower () const { return lb; } uint32_t upper () const { return ub; } - friend Range * range_union (Range * r1, Range * r2); - friend Range * range_diff (Range * r1, Range * r2); + static Range * add (const Range * r1, const Range * r2); + static Range * sub (const Range * r1, const Range * r2); + +private: + Range (Range * n, uint32_t l, uint32_t u) + : nx (n) + , lb (l) + , ub (u) + { + assert (lb < ub); + vFreeList.insert (this); + } + static void append_overlapping (Range * & head, Range * & tail, const Range * r); + static void append (Range ** & ptail, uint32_t l, uint32_t u); FORBID_COPY (Range); }; -} // end namespace re2c +} // namespace re2c #endif // _RE2C_UTIL_RANGE_ -- 2.40.0