From: Ulya Trofimovich Date: Sun, 6 Jan 2019 08:40:16 +0000 (+0000) Subject: Handle single chars and 1-char ranges in the same way. X-Git-Tag: 1.2~234 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=126940e33fa2fd419ebd4feb25408fc32625d50e;p=re2c Handle single chars and 1-char ranges in the same way. --- diff --git a/re2c/Makefile.am b/re2c/Makefile.am index 41aa05bc..2a094e6b 100644 --- a/re2c/Makefile.am +++ b/re2c/Makefile.am @@ -39,8 +39,6 @@ SRC_HDR = \ src/encoding/range_suffix.h \ src/encoding/utf8/utf8.h \ src/encoding/utf8/utf8_regexp.h \ - src/encoding/utf8/utf8_range.h \ - src/encoding/utf16/utf16_range.h \ src/encoding/utf16/utf16_regexp.h \ src/encoding/utf16/utf16.h \ src/regexp/empty_class_policy.h \ @@ -126,11 +124,9 @@ SRC = \ src/encoding/enc.cc \ src/encoding/range_suffix.cc \ src/encoding/utf8/utf8_regexp.cc \ - src/encoding/utf8/utf8_range.cc \ src/encoding/utf8/utf8.cc \ src/encoding/utf16/utf16_regexp.cc \ src/encoding/utf16/utf16.cc \ - src/encoding/utf16/utf16_range.cc \ src/regexp/ast_to_re.cc \ src/regexp/default_tags.cc \ src/regexp/fixed_tags.cc \ diff --git a/re2c/src/encoding/ebcdic/ebcdic_regexp.cc b/re2c/src/encoding/ebcdic/ebcdic_regexp.cc index 58484a95..d77f661c 100644 --- a/re2c/src/encoding/ebcdic/ebcdic_regexp.cc +++ b/re2c/src/encoding/ebcdic/ebcdic_regexp.cc @@ -5,11 +5,6 @@ namespace re2c { -RE *EBCDICSymbol(RE::alc_t &alc, uint32_t c) -{ - return re_sym(alc, Range::sym(asc2ebc[c])); -} - RE *EBCDICRange(RE::alc_t &alc, const Range *r) { Range *s = NULL; diff --git a/re2c/src/encoding/ebcdic/ebcdic_regexp.h b/re2c/src/encoding/ebcdic/ebcdic_regexp.h index cfb93064..ba88b21b 100644 --- a/re2c/src/encoding/ebcdic/ebcdic_regexp.h +++ b/re2c/src/encoding/ebcdic/ebcdic_regexp.h @@ -8,7 +8,6 @@ namespace re2c { class Range; -RE *EBCDICSymbol(RE::alc_t &alc, uint32_t c); RE *EBCDICRange(RE::alc_t &alc, const Range *r); } // namespace re2c diff --git a/re2c/src/encoding/utf16/utf16_range.cc b/re2c/src/encoding/utf16/utf16_range.cc deleted file mode 100644 index 98e90697..00000000 --- a/re2c/src/encoding/utf16/utf16_range.cc +++ /dev/null @@ -1,149 +0,0 @@ -#include "src/encoding/utf16/utf16_range.h" - -#include - -#include "src/encoding/range_suffix.h" - -namespace re2c { - -/* - * Add word range [w1-w2]. - */ -void UTF16addContinuous1(RangeSuffix * & root, uint32_t l, uint32_t h) -{ - RangeSuffix ** p = &root; - for (;;) - { - if (*p == NULL) - { - *p = new RangeSuffix(l, h); - break; - } - else if ((*p)->l == l && (*p)->h == h) - { - break; - } - else - p = &(*p)->next; - } -} - -/* - * Now that we have catenation of word ranges [l1-h1],[l2-h2], - * we want to add it to existing range, merging suffixes on the fly. - */ -void UTF16addContinuous2(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr) -{ - RangeSuffix ** p = &root; - for (;;) - { - if (*p == NULL) - { - *p = new RangeSuffix(l_tr, h_tr); - p = &(*p)->child; - break; - } - else if ((*p)->l == l_tr && (*p)->h == h_tr) - { - p = &(*p)->child; - break; - } - else - p = &(*p)->next; - } - for (;;) - { - if (*p == NULL) - { - *p = new RangeSuffix(l_ld, h_ld); - break; - } - else if ((*p)->l == l_ld && (*p)->h == h_ld) - { - break; - } - else - p = &(*p)->next; - } -} - -/* - * Split range into sub-ranges that agree on leading surrogates. - * - * We have two Unicode runes, L and H, both map to UTF-16 - * surrogate pairs 'L1 L2' and 'H1 H2'. - * We want to represent Unicode range [L - H] as a catenation - * of word ranges [L1 - H1],[L2 - H2]. - * - * This is only possible if the following condition holds: - * if L1 /= H1, then L2 == 0xdc00 and H2 == 0xdfff. - * This condition ensures that: - * 1) all possible UTF-16 sequences between L and H are allowed - * 2) no word ranges [w1 - w2] appear, such that w1 > w2 - * - * E.g.: - * [\U00010001-\U00010400] => [d800-d801],[dc01-dc00]. - * The last word range, [dc01-dc00], is incorrect: its lower bound - * is greater than its upper bound. To fix this, we must split - * the original range into two sub-ranges: - * [\U00010001-\U000103ff] => [d800-d800],[dc01-dfff] - * [\U00010400-\U00010400] => [d801-d801],[dc00-dc00] - * - * This function finds all such 'points of discontinuity' - * and represents original range as alternation of continuous - * sub-ranges. - */ -void UTF16splitByContinuity(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr) -{ - if (l_ld != h_ld) - { - if (l_tr > utf16::MIN_TRAIL_SURR) - { - UTF16splitByContinuity(root, l_ld, l_ld, l_tr, utf16::MAX_TRAIL_SURR); - UTF16splitByContinuity(root, l_ld + 1, h_ld, utf16::MIN_TRAIL_SURR, h_tr); - return; - } - if (h_tr < utf16::MAX_TRAIL_SURR) - { - UTF16splitByContinuity(root, l_ld, h_ld - 1, l_tr, utf16::MAX_TRAIL_SURR); - UTF16splitByContinuity(root, h_ld, h_ld, utf16::MIN_TRAIL_SURR, h_tr); - return; - } - } - UTF16addContinuous2(root, l_ld, h_ld, l_tr, h_tr); -} - -/* - * Split range into sub-ranges, so that all runes in the same - * sub-range have equal length of UTF-16 sequence. E.g., full - * Unicode range [0-0x10FFFF] gets split into sub-ranges: - * [0 - 0xFFFF] (2-byte UTF-16 sequences) - * [0x10000 - 0x10FFFF] (4-byte UTF-16 sequences) - */ -void UTF16splitByRuneLength(RangeSuffix * & root, utf16::rune l, utf16::rune h) -{ - if (l <= utf16::MAX_1WORD_RUNE) - { - if (h <= utf16::MAX_1WORD_RUNE) - { - UTF16addContinuous1(root, l, h); - } - else - { - UTF16addContinuous1(root, l, utf16::MAX_1WORD_RUNE); - const uint32_t h_ld = utf16::lead_surr(h); - const uint32_t h_tr = utf16::trail_surr(h); - UTF16splitByContinuity(root, utf16::MIN_LEAD_SURR, h_ld, utf16::MIN_TRAIL_SURR, h_tr); - } - } - else - { - const uint32_t l_ld = utf16::lead_surr(l); - const uint32_t l_tr = utf16::trail_surr(l); - const uint32_t h_ld = utf16::lead_surr(h); - const uint32_t h_tr = utf16::trail_surr(h); - UTF16splitByContinuity(root, l_ld, h_ld, l_tr, h_tr); - } -} - -} // namespace re2c diff --git a/re2c/src/encoding/utf16/utf16_range.h b/re2c/src/encoding/utf16/utf16_range.h deleted file mode 100644 index af0fc4e8..00000000 --- a/re2c/src/encoding/utf16/utf16_range.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef _RE2C_RE_ENCODING_UTF16_RANGE_ -#define _RE2C_RE_ENCODING_UTF16_RANGE_ - -#include "src/util/c99_stdint.h" - -#include "src/encoding/utf16/utf16.h" - -namespace re2c { - -struct RangeSuffix; - -void UTF16addContinuous1(RangeSuffix * & root, uint32_t l, uint32_t h); -void UTF16addContinuous2(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr); -void UTF16splitByContinuity(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr); -void UTF16splitByRuneLength(RangeSuffix * & root, utf16::rune l, utf16::rune h); - -} // namespace re2c - -#endif // _RE2C_RE_ENCODING_UTF16_RANGE_ diff --git a/re2c/src/encoding/utf16/utf16_regexp.cc b/re2c/src/encoding/utf16/utf16_regexp.cc index 72c0b05a..d83eb0ec 100644 --- a/re2c/src/encoding/utf16/utf16_regexp.cc +++ b/re2c/src/encoding/utf16/utf16_regexp.cc @@ -2,12 +2,40 @@ #include "src/util/c99_stdint.h" #include "src/encoding/range_suffix.h" -#include "src/encoding/utf16/utf16_range.h" #include "src/encoding/utf16/utf16_regexp.h" #include "src/util/range.h" + namespace re2c { +static RE *UTF16Symbol(RE::alc_t &, utf16::rune); +static void UTF16addContinuous1(RangeSuffix *&, uint32_t, uint32_t); +static void UTF16addContinuous2(RangeSuffix *&, uint32_t, uint32_t, uint32_t, uint32_t); +static void UTF16splitByContinuity(RangeSuffix *&, uint32_t, uint32_t, uint32_t, uint32_t); +static void UTF16splitByRuneLength(RangeSuffix *&, utf16::rune, utf16::rune); + +/* + * Split Unicode character class {[l1, h1), ..., [lN, hN)} into + * ranges [l1, h1-1], ..., [lN, hN-1] and return alternation of + * them. We store partially built range in suffix tree, which + * allows to eliminate common suffixes while building. + */ +RE *UTF16Range(RE::alc_t &alc, const Range *r) +{ + // empty range + if (!r) return NULL; + + // one-symbol range + if (!r->next() && r->lower() == r->upper() - 1) { + return UTF16Symbol(alc, r->lower()); + } + + RangeSuffix * root = NULL; + for (; r != NULL; r = r->next ()) + UTF16splitByRuneLength(root, r->lower (), r->upper () - 1); + return to_regexp(alc, root); +} + RE *UTF16Symbol(RE::alc_t &alc, utf16::rune r) { if (r <= utf16::MAX_1WORD_RUNE) { @@ -22,17 +50,143 @@ RE *UTF16Symbol(RE::alc_t &alc, utf16::rune r) } /* - * Split Unicode character class {[l1, h1), ..., [lN, hN)} into - * ranges [l1, h1-1], ..., [lN, hN-1] and return alternation of - * them. We store partially built range in suffix tree, which - * allows to eliminate common suffixes while building. + * Split range into sub-ranges, so that all runes in the same + * sub-range have equal length of UTF-16 sequence. E.g., full + * Unicode range [0-0x10FFFF] gets split into sub-ranges: + * [0 - 0xFFFF] (2-byte UTF-16 sequences) + * [0x10000 - 0x10FFFF] (4-byte UTF-16 sequences) */ -RE *UTF16Range(RE::alc_t &alc, const Range *r) +void UTF16splitByRuneLength(RangeSuffix * & root, utf16::rune l, utf16::rune h) { - RangeSuffix * root = NULL; - for (; r != NULL; r = r->next ()) - UTF16splitByRuneLength(root, r->lower (), r->upper () - 1); - return to_regexp(alc, root); + if (l <= utf16::MAX_1WORD_RUNE) + { + if (h <= utf16::MAX_1WORD_RUNE) + { + UTF16addContinuous1(root, l, h); + } + else + { + UTF16addContinuous1(root, l, utf16::MAX_1WORD_RUNE); + const uint32_t h_ld = utf16::lead_surr(h); + const uint32_t h_tr = utf16::trail_surr(h); + UTF16splitByContinuity(root, utf16::MIN_LEAD_SURR, h_ld, utf16::MIN_TRAIL_SURR, h_tr); + } + } + else + { + const uint32_t l_ld = utf16::lead_surr(l); + const uint32_t l_tr = utf16::trail_surr(l); + const uint32_t h_ld = utf16::lead_surr(h); + const uint32_t h_tr = utf16::trail_surr(h); + UTF16splitByContinuity(root, l_ld, h_ld, l_tr, h_tr); + } +} + +/* + * Split range into sub-ranges that agree on leading surrogates. + * + * We have two Unicode runes, L and H, both map to UTF-16 + * surrogate pairs 'L1 L2' and 'H1 H2'. + * We want to represent Unicode range [L - H] as a catenation + * of word ranges [L1 - H1],[L2 - H2]. + * + * This is only possible if the following condition holds: + * if L1 /= H1, then L2 == 0xdc00 and H2 == 0xdfff. + * This condition ensures that: + * 1) all possible UTF-16 sequences between L and H are allowed + * 2) no word ranges [w1 - w2] appear, such that w1 > w2 + * + * E.g.: + * [\U00010001-\U00010400] => [d800-d801],[dc01-dc00]. + * The last word range, [dc01-dc00], is incorrect: its lower bound + * is greater than its upper bound. To fix this, we must split + * the original range into two sub-ranges: + * [\U00010001-\U000103ff] => [d800-d800],[dc01-dfff] + * [\U00010400-\U00010400] => [d801-d801],[dc00-dc00] + * + * This function finds all such 'points of discontinuity' + * and represents original range as alternation of continuous + * sub-ranges. + */ +void UTF16splitByContinuity(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr) +{ + if (l_ld != h_ld) + { + if (l_tr > utf16::MIN_TRAIL_SURR) + { + UTF16splitByContinuity(root, l_ld, l_ld, l_tr, utf16::MAX_TRAIL_SURR); + UTF16splitByContinuity(root, l_ld + 1, h_ld, utf16::MIN_TRAIL_SURR, h_tr); + return; + } + if (h_tr < utf16::MAX_TRAIL_SURR) + { + UTF16splitByContinuity(root, l_ld, h_ld - 1, l_tr, utf16::MAX_TRAIL_SURR); + UTF16splitByContinuity(root, h_ld, h_ld, utf16::MIN_TRAIL_SURR, h_tr); + return; + } + } + UTF16addContinuous2(root, l_ld, h_ld, l_tr, h_tr); +} + +/* + * Add word range [w1-w2]. + */ +void UTF16addContinuous1(RangeSuffix * & root, uint32_t l, uint32_t h) +{ + RangeSuffix ** p = &root; + for (;;) + { + if (*p == NULL) + { + *p = new RangeSuffix(l, h); + break; + } + else if ((*p)->l == l && (*p)->h == h) + { + break; + } + else + p = &(*p)->next; + } +} + +/* + * Now that we have catenation of word ranges [l1-h1],[l2-h2], + * we want to add it to existing range, merging suffixes on the fly. + */ +void UTF16addContinuous2(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr) +{ + RangeSuffix ** p = &root; + for (;;) + { + if (*p == NULL) + { + *p = new RangeSuffix(l_tr, h_tr); + p = &(*p)->child; + break; + } + else if ((*p)->l == l_tr && (*p)->h == h_tr) + { + p = &(*p)->child; + break; + } + else + p = &(*p)->next; + } + for (;;) + { + if (*p == NULL) + { + *p = new RangeSuffix(l_ld, h_ld); + break; + } + else if ((*p)->l == l_ld && (*p)->h == h_ld) + { + break; + } + else + p = &(*p)->next; + } } } // namespace re2c diff --git a/re2c/src/encoding/utf16/utf16_regexp.h b/re2c/src/encoding/utf16/utf16_regexp.h index 30ac5ee3..880b70a8 100644 --- a/re2c/src/encoding/utf16/utf16_regexp.h +++ b/re2c/src/encoding/utf16/utf16_regexp.h @@ -4,11 +4,11 @@ #include "src/regexp/re.h" #include "src/encoding/utf16/utf16.h" + namespace re2c { class Range; -RE *UTF16Symbol(RE::alc_t &alc, utf16::rune r); RE *UTF16Range(RE::alc_t &alc, const Range *r); } // namespace re2c diff --git a/re2c/src/encoding/utf8/utf8_range.cc b/re2c/src/encoding/utf8/utf8_range.cc deleted file mode 100644 index f4359d32..00000000 --- a/re2c/src/encoding/utf8/utf8_range.cc +++ /dev/null @@ -1,115 +0,0 @@ -#include "src/encoding/utf8/utf8_range.h" - -#include - -#include "src/encoding/range_suffix.h" - -namespace re2c { - -/* - * Now that we have catenation of byte ranges [l1-h1]...[lN-hN], - * we want to add it to existing range, merging suffixes on the fly. - */ -void UTF8addContinuous(RangeSuffix * & root, utf8::rune l, utf8::rune h, uint32_t n) -{ - uint32_t lcs[utf8::MAX_RUNE_LENGTH]; - uint32_t hcs[utf8::MAX_RUNE_LENGTH]; - utf8::rune_to_bytes(lcs, l); - utf8::rune_to_bytes(hcs, h); - - RangeSuffix ** p = &root; - for (uint32_t i = 1; i <= n; ++i) - { - const uint32_t lc = lcs[n - i]; - const uint32_t hc = hcs[n - i]; - for (;;) - { - if (*p == NULL) - { - *p = new RangeSuffix(lc, hc); - p = &(*p)->child; - break; - } - else if ((*p)->l == lc && (*p)->h == hc) - { - p = &(*p)->child; - break; - } - else - p = &(*p)->next; - } - } -} - -/* - * Split range into sub-ranges that agree on leading bytes. - * - * We have two Unicode runes of equal length, L and H, which - * map to UTF-8 sequences 'L_1 ... L_n' and 'H_1 ... H_n'. - * We want to represent Unicode range [L - H] as a catenation - * of byte ranges [L_1 - H_1], ..., [L_n - H_n]. - * - * This is only possible if for all i > 1: - * if L_i /= H_i, then L_(i+1) == 0x80 and H_(i+1) == 0xbf. - * This condition ensures that: - * 1) all possible UTF-8 sequences between L and H are allowed - * 2) no byte ranges [b1 - b2] appear, such that b1 > b2 - * - * E.g.: - * [\U000e0031-\U000e0043] => [f3-f3],[a0-a0],[80-81],[b1-83]. - * The last byte range, [b1-83], is incorrect: its lower bound - * is greater than its upper bound. To fix this, we must split - * the original range into two sub-ranges: - * [\U000e0031-\U000e003f] => [f3-f3],[a0-a0],[80-80],[b1-bf] - * [\U000e0040-\U000e0043] => [f3-f3],[a0-a0],[81-81],[80-83] - * - * This function finds all such 'points of discontinuity' - * and represents original range as alternation of continuous - * sub-ranges. - */ -void UTF8splitByContinuity(RangeSuffix * & root, utf8::rune l, utf8::rune h, uint32_t n) -{ - for (uint32_t i = 1; i < n; ++i) - { - uint32_t m = (1u << (6u * i)) - 1u; // last i bytes of a UTF-8 sequence - if ((l & ~m) != (h & ~m)) - { - if ((l & m) != 0) - { - UTF8splitByContinuity(root, l, l | m, n); - UTF8splitByContinuity(root, (l | m) + 1, h, n); - return; - } - if ((h & m) != m) - { - UTF8splitByContinuity(root, l, (h & ~m) - 1, n); - UTF8splitByContinuity(root, h & ~m, h, n); - return; - } - } - } - UTF8addContinuous(root, l, h, n); -} - -/* - * Split range into sub-ranges, so that all runes in the same - * sub-range have equal length of UTF-8 sequence. E.g., full - * Unicode range [0-0x10FFFF] gets split into sub-ranges: - * [0 - 0x7F] (1-byte UTF-8 sequences) - * [0x80 - 0x7FF] (2-byte UTF-8 sequences) - * [0x800 - 0xFFFF] (3-byte UTF-8 sequences) - * [0x10000 - 0x10FFFF] (4-byte UTF-8 sequences) - */ -void UTF8splitByRuneLength(RangeSuffix * & root, utf8::rune l, utf8::rune h) -{ - const uint32_t nh = utf8::rune_length(h); - for (uint32_t nl = utf8::rune_length(l); nl < nh; ++nl) - { - utf8::rune r = utf8::max_rune(nl); - UTF8splitByContinuity(root, l, r, nl); - l = r + 1; - } - UTF8splitByContinuity(root, l, h, nh); -} - -} // namespace re2c diff --git a/re2c/src/encoding/utf8/utf8_range.h b/re2c/src/encoding/utf8/utf8_range.h deleted file mode 100644 index 7694e6b4..00000000 --- a/re2c/src/encoding/utf8/utf8_range.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef _RE2C_RE_ENCODING_UTF8_RANGE_ -#define _RE2C_RE_ENCODING_UTF8_RANGE_ - -#include "src/util/c99_stdint.h" - -#include "src/encoding/utf8/utf8.h" - -namespace re2c { - -struct RangeSuffix; - -void UTF8addContinuous(RangeSuffix * & root, utf8::rune l, utf8::rune h, uint32_t n); -void UTF8splitByContinuity(RangeSuffix * & root, utf8::rune l, utf8::rune h, uint32_t n); -void UTF8splitByRuneLength(RangeSuffix * & root, utf8::rune l, utf8::rune h); - -} // namespace re2c - -#endif // _RE2C_RE_ENCODING_UTF8_RANGE_ diff --git a/re2c/src/encoding/utf8/utf8_regexp.cc b/re2c/src/encoding/utf8/utf8_regexp.cc index 1bb92fa1..994d7eec 100644 --- a/re2c/src/encoding/utf8/utf8_regexp.cc +++ b/re2c/src/encoding/utf8/utf8_regexp.cc @@ -2,12 +2,40 @@ #include "src/util/c99_stdint.h" #include "src/encoding/range_suffix.h" -#include "src/encoding/utf8/utf8_range.h" #include "src/encoding/utf8/utf8_regexp.h" #include "src/util/range.h" + namespace re2c { +static RE *UTF8Symbol(RE::alc_t &, utf8::rune); +static void UTF8addContinuous(RangeSuffix *&, utf8::rune, utf8::rune, uint32_t); +static void UTF8splitByContinuity(RangeSuffix *&, utf8::rune, utf8::rune, uint32_t); +static void UTF8splitByRuneLength(RangeSuffix *&, utf8::rune, utf8::rune); + +/* + * Split Unicode character class {[l1, h1), ..., [lN, hN)} into + * ranges [l1, h1-1], ..., [lN, hN-1] and return alternation of + * them. We store partially built range in suffix tree, which + * allows to eliminate common suffixes while building. + */ +RE *UTF8Range(RE::alc_t &alc, const Range *r) +{ + // empty range + if (!r) return NULL; + + // one-symbol range + if (!r->next() && r->lower() == r->upper() - 1) { + return UTF8Symbol(alc, r->lower()); + } + + RangeSuffix *root = NULL; + for (; r != NULL; r = r->next()) { + UTF8splitByRuneLength(root, r->lower(), r->upper() - 1); + } + return to_regexp(alc, root); +} + RE *UTF8Symbol(RE::alc_t &alc, utf8::rune r) { uint32_t chars[utf8::MAX_RUNE_LENGTH]; @@ -20,17 +48,109 @@ RE *UTF8Symbol(RE::alc_t &alc, utf8::rune r) } /* - * Split Unicode character class {[l1, h1), ..., [lN, hN)} into - * ranges [l1, h1-1], ..., [lN, hN-1] and return alternation of - * them. We store partially built range in suffix tree, which - * allows to eliminate common suffixes while building. + * Split range into sub-ranges, so that all runes in the same + * sub-range have equal length of UTF-8 sequence. E.g., full + * Unicode range [0-0x10FFFF] gets split into sub-ranges: + * [0 - 0x7F] (1-byte UTF-8 sequences) + * [0x80 - 0x7FF] (2-byte UTF-8 sequences) + * [0x800 - 0xFFFF] (3-byte UTF-8 sequences) + * [0x10000 - 0x10FFFF] (4-byte UTF-8 sequences) */ -RE *UTF8Range(RE::alc_t &alc, const Range *r) +void UTF8splitByRuneLength(RangeSuffix * & root, utf8::rune l, utf8::rune h) { - RangeSuffix * root = NULL; - for (; r != NULL; r = r->next ()) - UTF8splitByRuneLength(root, r->lower (), r->upper () - 1); - return to_regexp(alc, root); + const uint32_t nh = utf8::rune_length(h); + for (uint32_t nl = utf8::rune_length(l); nl < nh; ++nl) + { + utf8::rune r = utf8::max_rune(nl); + UTF8splitByContinuity(root, l, r, nl); + l = r + 1; + } + UTF8splitByContinuity(root, l, h, nh); +} + +/* + * Split range into sub-ranges that agree on leading bytes. + * + * We have two Unicode runes of equal length, L and H, which + * map to UTF-8 sequences 'L_1 ... L_n' and 'H_1 ... H_n'. + * We want to represent Unicode range [L - H] as a catenation + * of byte ranges [L_1 - H_1], ..., [L_n - H_n]. + * + * This is only possible if for all i > 1: + * if L_i /= H_i, then L_(i+1) == 0x80 and H_(i+1) == 0xbf. + * This condition ensures that: + * 1) all possible UTF-8 sequences between L and H are allowed + * 2) no byte ranges [b1 - b2] appear, such that b1 > b2 + * + * E.g.: + * [\U000e0031-\U000e0043] => [f3-f3],[a0-a0],[80-81],[b1-83]. + * The last byte range, [b1-83], is incorrect: its lower bound + * is greater than its upper bound. To fix this, we must split + * the original range into two sub-ranges: + * [\U000e0031-\U000e003f] => [f3-f3],[a0-a0],[80-80],[b1-bf] + * [\U000e0040-\U000e0043] => [f3-f3],[a0-a0],[81-81],[80-83] + * + * This function finds all such 'points of discontinuity' + * and represents original range as alternation of continuous + * sub-ranges. + */ +void UTF8splitByContinuity(RangeSuffix * & root, utf8::rune l, utf8::rune h, uint32_t n) +{ + for (uint32_t i = 1; i < n; ++i) + { + uint32_t m = (1u << (6u * i)) - 1u; // last i bytes of a UTF-8 sequence + if ((l & ~m) != (h & ~m)) + { + if ((l & m) != 0) + { + UTF8splitByContinuity(root, l, l | m, n); + UTF8splitByContinuity(root, (l | m) + 1, h, n); + return; + } + if ((h & m) != m) + { + UTF8splitByContinuity(root, l, (h & ~m) - 1, n); + UTF8splitByContinuity(root, h & ~m, h, n); + return; + } + } + } + UTF8addContinuous(root, l, h, n); +} + +/* + * Now that we have catenation of byte ranges [l1-h1]...[lN-hN], + * we want to add it to existing range, merging suffixes on the fly. + */ +void UTF8addContinuous(RangeSuffix * & root, utf8::rune l, utf8::rune h, uint32_t n) +{ + uint32_t lcs[utf8::MAX_RUNE_LENGTH]; + uint32_t hcs[utf8::MAX_RUNE_LENGTH]; + utf8::rune_to_bytes(lcs, l); + utf8::rune_to_bytes(hcs, h); + + RangeSuffix ** p = &root; + for (uint32_t i = 1; i <= n; ++i) + { + const uint32_t lc = lcs[n - i]; + const uint32_t hc = hcs[n - i]; + for (;;) + { + if (*p == NULL) + { + *p = new RangeSuffix(lc, hc); + p = &(*p)->child; + break; + } + else if ((*p)->l == lc && (*p)->h == hc) + { + p = &(*p)->child; + break; + } + else + p = &(*p)->next; + } + } } } // namespace re2c diff --git a/re2c/src/encoding/utf8/utf8_regexp.h b/re2c/src/encoding/utf8/utf8_regexp.h index c02284b2..7b4153ba 100644 --- a/re2c/src/encoding/utf8/utf8_regexp.h +++ b/re2c/src/encoding/utf8/utf8_regexp.h @@ -4,11 +4,11 @@ #include "src/regexp/re.h" #include "src/encoding/utf8/utf8.h" + namespace re2c { class Range; -RE *UTF8Symbol(RE::alc_t &alc, utf8::rune r); RE *UTF8Range(RE::alc_t &alc, const Range *r); } // namespace re2c diff --git a/re2c/src/regexp/ast_to_re.cc b/re2c/src/regexp/ast_to_re.cc index a4d98bed..a31569fd 100644 --- a/re2c/src/regexp/ast_to_re.cc +++ b/re2c/src/regexp/ast_to_re.cc @@ -49,13 +49,13 @@ namespace re2c { static bool has_tags(const AST *); static RE *ast_to_re(RESpec &, const AST *, size_t &, int32_t); -static RE *re_schar(RE::alc_t &, uint32_t, uint32_t, uint32_t, const opt_t *); -static RE *re_ichar(RE::alc_t &, uint32_t, uint32_t, uint32_t, const opt_t *); +static RE *re_string(RE::alc_t &, const AST *, const opt_t *, Warn &); static RE *re_class(RE::alc_t &, uint32_t, uint32_t, const Range *, const opt_t *, Warn &); -static Range *ast_to_range(const AST *ast, const opt_t *opts); -static Range *diff_to_range(const AST *ast, const opt_t *opts); -static Range *dot_to_range(const AST *ast, const opt_t *opts); -static Range *cls_to_range(const AST *ast, const opt_t *opts); +static Range *ast_to_range(const AST *, const opt_t *); +static Range *char_to_range(uint32_t, const ASTChar &, const opt_t *, bool); +static Range *diff_to_range(const AST *, const opt_t *); +static Range *dot_to_range(const AST *, const opt_t *); +static Range *cls_to_range(const AST *, const opt_t *); static bool misuse_of_named_def(const AST *, const opt_t *); static void assert_tags_used_once(const Rule &, const std::vector &); static void init_rule(Rule &, const Code *, const std::vector &, size_t, size_t); @@ -109,18 +109,8 @@ RE *ast_to_re(RESpec &spec, const AST *ast, size_t &ncap, int32_t height) switch (ast->type) { case AST::NIL: return re_nil(alc); - case AST::STR: { - RE *x = NULL; - std::vector::const_iterator - i = ast->str.chars->begin(), - e = ast->str.chars->end(); - for (; i != e; ++i) { - x = re_cat(alc, x, is_icase(opts, ast->str.icase) - ? re_ichar(alc, ast->line, i->column, i->chr, opts) - : re_schar(alc, ast->line, i->column, i->chr, opts)); - } - return x ? x : re_nil(alc); - } + case AST::STR: + return re_string(alc, ast, opts, warn); case AST::CLS: { Range *r = cls_to_range(ast, opts); return re_class(alc, ast->line, ast->column, r, opts, warn); @@ -256,6 +246,20 @@ RE *ast_to_re(RESpec &spec, const AST *ast, size_t &ncap, int32_t height) return NULL; /* unreachable */ } +Range *char_to_range(uint32_t line, const ASTChar &chr, const opt_t *opts + , bool icase) +{ + uint32_t c = chr.chr; + + if (!opts->encoding.validateChar(c)) { + fatal_lc(line, chr.column, "bad code point: '0x%X'", c); + } + + return icase && is_alpha(c) + ? Range::add(Range::sym(to_lower_unsafe(c)), Range::sym(to_upper_unsafe(c))) + : Range::sym(c); +} + Range *cls_to_range(const AST *ast, const opt_t *opts) { DASSERT(ast->type == AST::CLS); @@ -264,6 +268,7 @@ Range *cls_to_range(const AST *ast, const opt_t *opts) std::vector::const_iterator i = ast->cls.ranges->begin(), e = ast->cls.ranges->end(); + for (; i != e; ++i) { Range *s = opts->encoding.validateRange(i->lower, i->upper); if (!s) { @@ -272,9 +277,11 @@ Range *cls_to_range(const AST *ast, const opt_t *opts) } r = Range::add(r, s); } + if (ast->cls.negated) { r = Range::sub(opts->encoding.fullRange(), r); } + return r; } @@ -316,17 +323,10 @@ Range *ast_to_range(const AST *ast, const opt_t *opts) return cls_to_range(ast, opts); case AST::DOT: return dot_to_range(ast, opts); - case AST::STR: { + case AST::STR: if (ast->str.chars->size() != 1) break; - const ASTChar &i = ast->str.chars->front(); - uint32_t c = i.chr; - if (!opts->encoding.validateChar(c)) { - fatal_lc(ast->line, i.column, "bad code point: '0x%X'", c); - } - return is_icase(opts, ast->str.icase) && is_alpha(c) - ? Range::add(Range::sym(to_lower_unsafe(c)), Range::sym(to_upper_unsafe(c))) - : Range::sym(c); - } + return char_to_range(ast->line, ast->str.chars->front(), opts + , is_icase(opts, ast->str.icase)); case AST::DIFF: return diff_to_range(ast, opts); case AST::ALT: { @@ -339,38 +339,27 @@ Range *ast_to_range(const AST *ast, const opt_t *opts) return NULL; } -RE *re_schar(RE::alc_t &alc, uint32_t line, uint32_t column, uint32_t c, const opt_t *opts) +RE *re_string(RE::alc_t &alc, const AST *ast, const opt_t *opts, Warn &warn) { - if (!opts->encoding.validateChar(c)) { - fatal_lc(line, column, "bad code point: '0x%X'", c); - } - switch (opts->encoding.type()) { - case Enc::UTF16: - return UTF16Symbol(alc, c); - case Enc::UTF8: - return UTF8Symbol(alc, c); - case Enc::EBCDIC: - return EBCDICSymbol(alc, c); - case Enc::ASCII: - case Enc::UTF32: - case Enc::UCS2: - return re_sym(alc, Range::sym(c)); - } - return NULL; /* unreachable */ -} + DASSERT(ast->type == AST::STR); -RE *re_ichar(RE::alc_t &alc, uint32_t line, uint32_t column, uint32_t c, const opt_t *opts) -{ - if (is_alpha(c)) { - return re_alt(alc, - re_schar(alc, line, column, to_lower_unsafe(c), opts), - re_schar(alc, line, column, to_upper_unsafe(c), opts)); - } else { - return re_schar(alc, line, column, c, opts); + RE *x = NULL; + std::vector::const_iterator + i = ast->str.chars->begin(), + e = ast->str.chars->end(); + + bool icase = is_icase(opts, ast->str.icase); + for (; i != e; ++i) { + Range *r = char_to_range(ast->line, *i, opts, icase); + RE *y = re_class(alc, ast->line, i->column, r, opts, warn); + x = re_cat(alc, x, y); } + + return x ? x : re_nil(alc); } -RE *re_class(RE::alc_t &alc, uint32_t line, uint32_t column, const Range *r, const opt_t *opts, Warn &warn) +RE *re_class(RE::alc_t &alc, uint32_t line, uint32_t column, const Range *r + , const opt_t *opts, Warn &warn) { if (!r) { switch (opts->empty_class_policy) { @@ -384,6 +373,7 @@ RE *re_class(RE::alc_t &alc, uint32_t line, uint32_t column, const Range *r, con fatal_lc(line, column, "empty character class"); } } + switch (opts->encoding.type()) { case Enc::UTF16: return UTF16Range(alc, r); @@ -396,6 +386,7 @@ RE *re_class(RE::alc_t &alc, uint32_t line, uint32_t column, const Range *r, con case Enc::UCS2: return re_sym(alc, r); } + return NULL; /* unreachable */ }