src/encoding/range_suffix.h \
src/encoding/utf8/utf8.h \
src/encoding/utf8/utf8_regexp.h \
- src/encoding/utf8/utf8_range.h \
- src/encoding/utf16/utf16_range.h \
src/encoding/utf16/utf16_regexp.h \
src/encoding/utf16/utf16.h \
src/regexp/empty_class_policy.h \
src/encoding/enc.cc \
src/encoding/range_suffix.cc \
src/encoding/utf8/utf8_regexp.cc \
- src/encoding/utf8/utf8_range.cc \
src/encoding/utf8/utf8.cc \
src/encoding/utf16/utf16_regexp.cc \
src/encoding/utf16/utf16.cc \
- src/encoding/utf16/utf16_range.cc \
src/regexp/ast_to_re.cc \
src/regexp/default_tags.cc \
src/regexp/fixed_tags.cc \
namespace re2c {
-RE *EBCDICSymbol(RE::alc_t &alc, uint32_t c)
-{
- return re_sym(alc, Range::sym(asc2ebc[c]));
-}
-
RE *EBCDICRange(RE::alc_t &alc, const Range *r)
{
Range *s = NULL;
class Range;
-RE *EBCDICSymbol(RE::alc_t &alc, uint32_t c);
RE *EBCDICRange(RE::alc_t &alc, const Range *r);
} // namespace re2c
+++ /dev/null
-#include "src/encoding/utf16/utf16_range.h"
-
-#include <stddef.h>
-
-#include "src/encoding/range_suffix.h"
-
-namespace re2c {
-
-/*
- * Add word range [w1-w2].
- */
-void UTF16addContinuous1(RangeSuffix * & root, uint32_t l, uint32_t h)
-{
- RangeSuffix ** p = &root;
- for (;;)
- {
- if (*p == NULL)
- {
- *p = new RangeSuffix(l, h);
- break;
- }
- else if ((*p)->l == l && (*p)->h == h)
- {
- break;
- }
- else
- p = &(*p)->next;
- }
-}
-
-/*
- * Now that we have catenation of word ranges [l1-h1],[l2-h2],
- * we want to add it to existing range, merging suffixes on the fly.
- */
-void UTF16addContinuous2(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr)
-{
- RangeSuffix ** p = &root;
- for (;;)
- {
- if (*p == NULL)
- {
- *p = new RangeSuffix(l_tr, h_tr);
- p = &(*p)->child;
- break;
- }
- else if ((*p)->l == l_tr && (*p)->h == h_tr)
- {
- p = &(*p)->child;
- break;
- }
- else
- p = &(*p)->next;
- }
- for (;;)
- {
- if (*p == NULL)
- {
- *p = new RangeSuffix(l_ld, h_ld);
- break;
- }
- else if ((*p)->l == l_ld && (*p)->h == h_ld)
- {
- break;
- }
- else
- p = &(*p)->next;
- }
-}
-
-/*
- * Split range into sub-ranges that agree on leading surrogates.
- *
- * We have two Unicode runes, L and H, both map to UTF-16
- * surrogate pairs 'L1 L2' and 'H1 H2'.
- * We want to represent Unicode range [L - H] as a catenation
- * of word ranges [L1 - H1],[L2 - H2].
- *
- * This is only possible if the following condition holds:
- * if L1 /= H1, then L2 == 0xdc00 and H2 == 0xdfff.
- * This condition ensures that:
- * 1) all possible UTF-16 sequences between L and H are allowed
- * 2) no word ranges [w1 - w2] appear, such that w1 > w2
- *
- * E.g.:
- * [\U00010001-\U00010400] => [d800-d801],[dc01-dc00].
- * The last word range, [dc01-dc00], is incorrect: its lower bound
- * is greater than its upper bound. To fix this, we must split
- * the original range into two sub-ranges:
- * [\U00010001-\U000103ff] => [d800-d800],[dc01-dfff]
- * [\U00010400-\U00010400] => [d801-d801],[dc00-dc00]
- *
- * This function finds all such 'points of discontinuity'
- * and represents original range as alternation of continuous
- * sub-ranges.
- */
-void UTF16splitByContinuity(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr)
-{
- if (l_ld != h_ld)
- {
- if (l_tr > utf16::MIN_TRAIL_SURR)
- {
- UTF16splitByContinuity(root, l_ld, l_ld, l_tr, utf16::MAX_TRAIL_SURR);
- UTF16splitByContinuity(root, l_ld + 1, h_ld, utf16::MIN_TRAIL_SURR, h_tr);
- return;
- }
- if (h_tr < utf16::MAX_TRAIL_SURR)
- {
- UTF16splitByContinuity(root, l_ld, h_ld - 1, l_tr, utf16::MAX_TRAIL_SURR);
- UTF16splitByContinuity(root, h_ld, h_ld, utf16::MIN_TRAIL_SURR, h_tr);
- return;
- }
- }
- UTF16addContinuous2(root, l_ld, h_ld, l_tr, h_tr);
-}
-
-/*
- * Split range into sub-ranges, so that all runes in the same
- * sub-range have equal length of UTF-16 sequence. E.g., full
- * Unicode range [0-0x10FFFF] gets split into sub-ranges:
- * [0 - 0xFFFF] (2-byte UTF-16 sequences)
- * [0x10000 - 0x10FFFF] (4-byte UTF-16 sequences)
- */
-void UTF16splitByRuneLength(RangeSuffix * & root, utf16::rune l, utf16::rune h)
-{
- if (l <= utf16::MAX_1WORD_RUNE)
- {
- if (h <= utf16::MAX_1WORD_RUNE)
- {
- UTF16addContinuous1(root, l, h);
- }
- else
- {
- UTF16addContinuous1(root, l, utf16::MAX_1WORD_RUNE);
- const uint32_t h_ld = utf16::lead_surr(h);
- const uint32_t h_tr = utf16::trail_surr(h);
- UTF16splitByContinuity(root, utf16::MIN_LEAD_SURR, h_ld, utf16::MIN_TRAIL_SURR, h_tr);
- }
- }
- else
- {
- const uint32_t l_ld = utf16::lead_surr(l);
- const uint32_t l_tr = utf16::trail_surr(l);
- const uint32_t h_ld = utf16::lead_surr(h);
- const uint32_t h_tr = utf16::trail_surr(h);
- UTF16splitByContinuity(root, l_ld, h_ld, l_tr, h_tr);
- }
-}
-
-} // namespace re2c
+++ /dev/null
-#ifndef _RE2C_RE_ENCODING_UTF16_RANGE_
-#define _RE2C_RE_ENCODING_UTF16_RANGE_
-
-#include "src/util/c99_stdint.h"
-
-#include "src/encoding/utf16/utf16.h"
-
-namespace re2c {
-
-struct RangeSuffix;
-
-void UTF16addContinuous1(RangeSuffix * & root, uint32_t l, uint32_t h);
-void UTF16addContinuous2(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr);
-void UTF16splitByContinuity(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr);
-void UTF16splitByRuneLength(RangeSuffix * & root, utf16::rune l, utf16::rune h);
-
-} // namespace re2c
-
-#endif // _RE2C_RE_ENCODING_UTF16_RANGE_
#include "src/util/c99_stdint.h"
#include "src/encoding/range_suffix.h"
-#include "src/encoding/utf16/utf16_range.h"
#include "src/encoding/utf16/utf16_regexp.h"
#include "src/util/range.h"
+
namespace re2c {
+static RE *UTF16Symbol(RE::alc_t &, utf16::rune);
+static void UTF16addContinuous1(RangeSuffix *&, uint32_t, uint32_t);
+static void UTF16addContinuous2(RangeSuffix *&, uint32_t, uint32_t, uint32_t, uint32_t);
+static void UTF16splitByContinuity(RangeSuffix *&, uint32_t, uint32_t, uint32_t, uint32_t);
+static void UTF16splitByRuneLength(RangeSuffix *&, utf16::rune, utf16::rune);
+
+/*
+ * Split Unicode character class {[l1, h1), ..., [lN, hN)} into
+ * ranges [l1, h1-1], ..., [lN, hN-1] and return alternation of
+ * them. We store the partially built range in a suffix tree, which
+ * allows common suffixes to be eliminated while building.
+ *
+ * Returns NULL for an empty class; a class consisting of exactly one
+ * code point is delegated to UTF16Symbol.
+ */
+RE *UTF16Range(RE::alc_t &alc, const Range *r)
+{
+ // empty range: nothing to encode
+ if (!r) return NULL;
+
+ // one-symbol range: a single interval [c, c+1) holds exactly one code point
+ if (!r->next() && r->lower() == r->upper() - 1) {
+ return UTF16Symbol(alc, r->lower());
+ }
+
+ RangeSuffix * root = NULL;
+ for (; r != NULL; r = r->next ())
+ UTF16splitByRuneLength(root, r->lower (), r->upper () - 1); // upper() is exclusive, the callee takes an inclusive bound
+ return to_regexp(alc, root); // build the resulting RE from the suffix tree
+}
+
RE *UTF16Symbol(RE::alc_t &alc, utf16::rune r)
{
if (r <= utf16::MAX_1WORD_RUNE) {
}
/*
- * Split Unicode character class {[l1, h1), ..., [lN, hN)} into
- * ranges [l1, h1-1], ..., [lN, hN-1] and return alternation of
- * them. We store partially built range in suffix tree, which
- * allows to eliminate common suffixes while building.
+ * Split range into sub-ranges, so that all runes in the same
+ * sub-range have equal length of UTF-16 sequence. E.g., full
+ * Unicode range [0-0x10FFFF] gets split into sub-ranges:
+ * [0 - 0xFFFF] (2-byte UTF-16 sequences)
+ * [0x10000 - 0x10FFFF] (4-byte UTF-16 sequences)
*/
-RE *UTF16Range(RE::alc_t &alc, const Range *r)
+void UTF16splitByRuneLength(RangeSuffix * & root, utf16::rune l, utf16::rune h)
{
- RangeSuffix * root = NULL;
- for (; r != NULL; r = r->next ())
- UTF16splitByRuneLength(root, r->lower (), r->upper () - 1);
- return to_regexp(alc, root);
+ if (l <= utf16::MAX_1WORD_RUNE) // lower bound is a one-word rune
+ {
+ if (h <= utf16::MAX_1WORD_RUNE) // whole range [l, h] consists of one-word runes
+ {
+ UTF16addContinuous1(root, l, h);
+ }
+ else // range straddles the one-word / surrogate-pair boundary: split it there
+ {
+ UTF16addContinuous1(root, l, utf16::MAX_1WORD_RUNE); // one-word part: [l, MAX_1WORD_RUNE]
+ const uint32_t h_ld = utf16::lead_surr(h);
+ const uint32_t h_tr = utf16::trail_surr(h);
+ UTF16splitByContinuity(root, utf16::MIN_LEAD_SURR, h_ld, utf16::MIN_TRAIL_SURR, h_tr); // two-word part, from the smallest lead/trail surrogates up to h
+ }
+ }
+ else // both bounds lie above MAX_1WORD_RUNE and are handled as surrogate pairs
+ {
+ const uint32_t l_ld = utf16::lead_surr(l);
+ const uint32_t l_tr = utf16::trail_surr(l);
+ const uint32_t h_ld = utf16::lead_surr(h);
+ const uint32_t h_tr = utf16::trail_surr(h);
+ UTF16splitByContinuity(root, l_ld, h_ld, l_tr, h_tr);
+ }
+}
+
+/*
+ * Split range into sub-ranges that agree on leading surrogates.
+ *
+ * We have two Unicode runes, L and H, both map to UTF-16
+ * surrogate pairs 'L1 L2' and 'H1 H2'.
+ * We want to represent Unicode range [L - H] as a catenation
+ * of word ranges [L1 - H1],[L2 - H2].
+ *
+ * This is only possible if the following condition holds:
+ * if L1 != H1, then L2 == 0xdc00 and H2 == 0xdfff.
+ * This condition ensures that:
+ * 1) all possible UTF-16 sequences between L and H are allowed
+ * 2) no word ranges [w1 - w2] appear, such that w1 > w2
+ *
+ * E.g.:
+ * [\U00010001-\U00010400] => [d800-d801],[dc01-dc00].
+ * The last word range, [dc01-dc00], is incorrect: its lower bound
+ * is greater than its upper bound. To fix this, we must split
+ * the original range into two sub-ranges:
+ * [\U00010001-\U000103ff] => [d800-d800],[dc01-dfff]
+ * [\U00010400-\U00010400] => [d801-d801],[dc00-dc00]
+ *
+ * This function finds all such 'points of discontinuity'
+ * and represents original range as alternation of continuous
+ * sub-ranges.
+ *
+ * l_ld/h_ld are the leading surrogates and l_tr/h_tr the trailing
+ * surrogates of the lower and upper bound respectively.
+ */
+void UTF16splitByContinuity(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr)
+{
+ if (l_ld != h_ld) // bounds have different leading surrogates: check the continuity condition
+ {
+ if (l_tr > utf16::MIN_TRAIL_SURR) // lower bound does not start at the first trailing surrogate
+ {
+ UTF16splitByContinuity(root, l_ld, l_ld, l_tr, utf16::MAX_TRAIL_SURR); // remainder under the first leading surrogate
+ UTF16splitByContinuity(root, l_ld + 1, h_ld, utf16::MIN_TRAIL_SURR, h_tr); // everything from the next leading surrogate on
+ return;
+ }
+ if (h_tr < utf16::MAX_TRAIL_SURR) // upper bound does not end at the last trailing surrogate
+ {
+ UTF16splitByContinuity(root, l_ld, h_ld - 1, l_tr, utf16::MAX_TRAIL_SURR); // everything up to the previous leading surrogate
+ UTF16splitByContinuity(root, h_ld, h_ld, utf16::MIN_TRAIL_SURR, h_tr); // beginning under the last leading surrogate
+ return;
+ }
+ }
+ UTF16addContinuous2(root, l_ld, h_ld, l_tr, h_tr); // no split was needed: add the pair of word ranges as is
+}
+
+/*
+ * Add one-word range [l-h] to the sibling list that starts at root,
+ * unless a node with exactly the same bounds is already there.
+ */
+void UTF16addContinuous1(RangeSuffix * & root, uint32_t l, uint32_t h)
+{
+ RangeSuffix ** p = &root; // points at the link to update, so an empty list needs no special case
+ for (;;)
+ {
+ if (*p == NULL) // end of the list reached: append a new node
+ {
+ *p = new RangeSuffix(l, h);
+ break;
+ }
+ else if ((*p)->l == l && (*p)->h == h) // identical range already present: nothing to add
+ {
+ break;
+ }
+ else
+ p = &(*p)->next; // keep scanning the siblings
+ }
+}
+
+/*
+ * Now that we have catenation of word ranges [l1-h1],[l2-h2],
+ * we want to add it to existing range, merging suffixes on the fly.
+ *
+ * The trailing range [l_tr-h_tr] is looked up (or created) among the
+ * nodes linked from root, and the leading range [l_ld-h_ld] among the
+ * children of that node, so pairs with equal trailing ranges share
+ * one node.
+ */
+void UTF16addContinuous2(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr)
+{
+ RangeSuffix ** p = &root;
+ for (;;) // first pass: find or create the node for the trailing range
+ {
+ if (*p == NULL)
+ {
+ *p = new RangeSuffix(l_tr, h_tr);
+ p = &(*p)->child; // continue in the (empty) child list of the new node
+ break;
+ }
+ else if ((*p)->l == l_tr && (*p)->h == h_tr)
+ {
+ p = &(*p)->child; // reuse the existing node and descend into its children
+ break;
+ }
+ else
+ p = &(*p)->next;
+ }
+ for (;;) // second pass: find or create the node for the leading range
+ {
+ if (*p == NULL)
+ {
+ *p = new RangeSuffix(l_ld, h_ld);
+ break;
+ }
+ else if ((*p)->l == l_ld && (*p)->h == h_ld) // this pair is already present
+ {
+ break;
+ }
+ else
+ p = &(*p)->next;
+ }
}
} // namespace re2c
#include "src/regexp/re.h"
#include "src/encoding/utf16/utf16.h"
+
namespace re2c {
class Range;
-RE *UTF16Symbol(RE::alc_t &alc, utf16::rune r);
RE *UTF16Range(RE::alc_t &alc, const Range *r);
} // namespace re2c
+++ /dev/null
-#include "src/encoding/utf8/utf8_range.h"
-
-#include <stddef.h>
-
-#include "src/encoding/range_suffix.h"
-
-namespace re2c {
-
-/*
- * Now that we have catenation of byte ranges [l1-h1]...[lN-hN],
- * we want to add it to existing range, merging suffixes on the fly.
- */
-void UTF8addContinuous(RangeSuffix * & root, utf8::rune l, utf8::rune h, uint32_t n)
-{
- uint32_t lcs[utf8::MAX_RUNE_LENGTH];
- uint32_t hcs[utf8::MAX_RUNE_LENGTH];
- utf8::rune_to_bytes(lcs, l);
- utf8::rune_to_bytes(hcs, h);
-
- RangeSuffix ** p = &root;
- for (uint32_t i = 1; i <= n; ++i)
- {
- const uint32_t lc = lcs[n - i];
- const uint32_t hc = hcs[n - i];
- for (;;)
- {
- if (*p == NULL)
- {
- *p = new RangeSuffix(lc, hc);
- p = &(*p)->child;
- break;
- }
- else if ((*p)->l == lc && (*p)->h == hc)
- {
- p = &(*p)->child;
- break;
- }
- else
- p = &(*p)->next;
- }
- }
-}
-
-/*
- * Split range into sub-ranges that agree on leading bytes.
- *
- * We have two Unicode runes of equal length, L and H, which
- * map to UTF-8 sequences 'L_1 ... L_n' and 'H_1 ... H_n'.
- * We want to represent Unicode range [L - H] as a catenation
- * of byte ranges [L_1 - H_1], ..., [L_n - H_n].
- *
- * This is only possible if for all i > 1:
- * if L_i /= H_i, then L_(i+1) == 0x80 and H_(i+1) == 0xbf.
- * This condition ensures that:
- * 1) all possible UTF-8 sequences between L and H are allowed
- * 2) no byte ranges [b1 - b2] appear, such that b1 > b2
- *
- * E.g.:
- * [\U000e0031-\U000e0043] => [f3-f3],[a0-a0],[80-81],[b1-83].
- * The last byte range, [b1-83], is incorrect: its lower bound
- * is greater than its upper bound. To fix this, we must split
- * the original range into two sub-ranges:
- * [\U000e0031-\U000e003f] => [f3-f3],[a0-a0],[80-80],[b1-bf]
- * [\U000e0040-\U000e0043] => [f3-f3],[a0-a0],[81-81],[80-83]
- *
- * This function finds all such 'points of discontinuity'
- * and represents original range as alternation of continuous
- * sub-ranges.
- */
-void UTF8splitByContinuity(RangeSuffix * & root, utf8::rune l, utf8::rune h, uint32_t n)
-{
- for (uint32_t i = 1; i < n; ++i)
- {
- uint32_t m = (1u << (6u * i)) - 1u; // last i bytes of a UTF-8 sequence
- if ((l & ~m) != (h & ~m))
- {
- if ((l & m) != 0)
- {
- UTF8splitByContinuity(root, l, l | m, n);
- UTF8splitByContinuity(root, (l | m) + 1, h, n);
- return;
- }
- if ((h & m) != m)
- {
- UTF8splitByContinuity(root, l, (h & ~m) - 1, n);
- UTF8splitByContinuity(root, h & ~m, h, n);
- return;
- }
- }
- }
- UTF8addContinuous(root, l, h, n);
-}
-
-/*
- * Split range into sub-ranges, so that all runes in the same
- * sub-range have equal length of UTF-8 sequence. E.g., full
- * Unicode range [0-0x10FFFF] gets split into sub-ranges:
- * [0 - 0x7F] (1-byte UTF-8 sequences)
- * [0x80 - 0x7FF] (2-byte UTF-8 sequences)
- * [0x800 - 0xFFFF] (3-byte UTF-8 sequences)
- * [0x10000 - 0x10FFFF] (4-byte UTF-8 sequences)
- */
-void UTF8splitByRuneLength(RangeSuffix * & root, utf8::rune l, utf8::rune h)
-{
- const uint32_t nh = utf8::rune_length(h);
- for (uint32_t nl = utf8::rune_length(l); nl < nh; ++nl)
- {
- utf8::rune r = utf8::max_rune(nl);
- UTF8splitByContinuity(root, l, r, nl);
- l = r + 1;
- }
- UTF8splitByContinuity(root, l, h, nh);
-}
-
-} // namespace re2c
+++ /dev/null
-#ifndef _RE2C_RE_ENCODING_UTF8_RANGE_
-#define _RE2C_RE_ENCODING_UTF8_RANGE_
-
-#include "src/util/c99_stdint.h"
-
-#include "src/encoding/utf8/utf8.h"
-
-namespace re2c {
-
-struct RangeSuffix;
-
-void UTF8addContinuous(RangeSuffix * & root, utf8::rune l, utf8::rune h, uint32_t n);
-void UTF8splitByContinuity(RangeSuffix * & root, utf8::rune l, utf8::rune h, uint32_t n);
-void UTF8splitByRuneLength(RangeSuffix * & root, utf8::rune l, utf8::rune h);
-
-} // namespace re2c
-
-#endif // _RE2C_RE_ENCODING_UTF8_RANGE_
#include "src/util/c99_stdint.h"
#include "src/encoding/range_suffix.h"
-#include "src/encoding/utf8/utf8_range.h"
#include "src/encoding/utf8/utf8_regexp.h"
#include "src/util/range.h"
+
namespace re2c {
+static RE *UTF8Symbol(RE::alc_t &, utf8::rune);
+static void UTF8addContinuous(RangeSuffix *&, utf8::rune, utf8::rune, uint32_t);
+static void UTF8splitByContinuity(RangeSuffix *&, utf8::rune, utf8::rune, uint32_t);
+static void UTF8splitByRuneLength(RangeSuffix *&, utf8::rune, utf8::rune);
+
+/*
+ * Split Unicode character class {[l1, h1), ..., [lN, hN)} into
+ * ranges [l1, h1-1], ..., [lN, hN-1] and return alternation of
+ * them. We store the partially built range in a suffix tree, which
+ * allows common suffixes to be eliminated while building.
+ *
+ * Returns NULL for an empty class; a class consisting of exactly one
+ * code point is delegated to UTF8Symbol.
+ */
+RE *UTF8Range(RE::alc_t &alc, const Range *r)
+{
+ // empty range: nothing to encode
+ if (!r) return NULL;
+
+ // one-symbol range: a single interval [c, c+1) holds exactly one code point
+ if (!r->next() && r->lower() == r->upper() - 1) {
+ return UTF8Symbol(alc, r->lower());
+ }
+
+ RangeSuffix *root = NULL;
+ for (; r != NULL; r = r->next()) {
+ UTF8splitByRuneLength(root, r->lower(), r->upper() - 1); // upper() is exclusive, the callee takes an inclusive bound
+ }
+ return to_regexp(alc, root); // build the resulting RE from the suffix tree
+}
+
RE *UTF8Symbol(RE::alc_t &alc, utf8::rune r)
{
uint32_t chars[utf8::MAX_RUNE_LENGTH];
}
/*
- * Split Unicode character class {[l1, h1), ..., [lN, hN)} into
- * ranges [l1, h1-1], ..., [lN, hN-1] and return alternation of
- * them. We store partially built range in suffix tree, which
- * allows to eliminate common suffixes while building.
+ * Split range into sub-ranges, so that all runes in the same
+ * sub-range have equal length of UTF-8 sequence. E.g., full
+ * Unicode range [0-0x10FFFF] gets split into sub-ranges:
+ * [0 - 0x7F] (1-byte UTF-8 sequences)
+ * [0x80 - 0x7FF] (2-byte UTF-8 sequences)
+ * [0x800 - 0xFFFF] (3-byte UTF-8 sequences)
+ * [0x10000 - 0x10FFFF] (4-byte UTF-8 sequences)
*/
-RE *UTF8Range(RE::alc_t &alc, const Range *r)
+void UTF8splitByRuneLength(RangeSuffix * & root, utf8::rune l, utf8::rune h)
{
- RangeSuffix * root = NULL;
- for (; r != NULL; r = r->next ())
- UTF8splitByRuneLength(root, r->lower (), r->upper () - 1);
- return to_regexp(alc, root);
+ const uint32_t nh = utf8::rune_length(h); // sequence length of the upper bound
+ for (uint32_t nl = utf8::rune_length(l); nl < nh; ++nl) // one iteration per sequence length shorter than nh
+ {
+ utf8::rune r = utf8::max_rune(nl); // upper bound of the sub-range with nl-byte sequences
+ UTF8splitByContinuity(root, l, r, nl);
+ l = r + 1; // the next sub-range starts right after r
+ }
+ UTF8splitByContinuity(root, l, h, nh); // remaining sub-range, all of length nh
+}
+
+/*
+ * Split range into sub-ranges that agree on leading bytes.
+ *
+ * We have two Unicode runes of equal length, L and H, which
+ * map to UTF-8 sequences 'L_1 ... L_n' and 'H_1 ... H_n'.
+ * We want to represent Unicode range [L - H] as a catenation
+ * of byte ranges [L_1 - H_1], ..., [L_n - H_n].
+ *
+ * This is only possible if for all i > 1:
+ * if L_i != H_i, then L_(i+1) == 0x80 and H_(i+1) == 0xbf.
+ * This condition ensures that:
+ * 1) all possible UTF-8 sequences between L and H are allowed
+ * 2) no byte ranges [b1 - b2] appear, such that b1 > b2
+ *
+ * E.g.:
+ * [\U000e0031-\U000e0043] => [f3-f3],[a0-a0],[80-81],[b1-83].
+ * The last byte range, [b1-83], is incorrect: its lower bound
+ * is greater than its upper bound. To fix this, we must split
+ * the original range into two sub-ranges:
+ * [\U000e0031-\U000e003f] => [f3-f3],[a0-a0],[80-80],[b1-bf]
+ * [\U000e0040-\U000e0043] => [f3-f3],[a0-a0],[81-81],[80-83]
+ *
+ * This function finds all such 'points of discontinuity'
+ * and represents original range as alternation of continuous
+ * sub-ranges.
+ *
+ * n is the common length, in bytes, of the UTF-8 sequences of l and h.
+ */
+void UTF8splitByContinuity(RangeSuffix * & root, utf8::rune l, utf8::rune h, uint32_t n)
+{
+ for (uint32_t i = 1; i < n; ++i)
+ {
+ uint32_t m = (1u << (6u * i)) - 1u; // last i bytes of a UTF-8 sequence
+ if ((l & ~m) != (h & ~m)) // bounds differ in the bits above the mask
+ {
+ if ((l & m) != 0) // masked bits of l are not all zero: split off [l, l | m]
+ {
+ UTF8splitByContinuity(root, l, l | m, n);
+ UTF8splitByContinuity(root, (l | m) + 1, h, n);
+ return;
+ }
+ if ((h & m) != m) // masked bits of h are not all ones: split off [h & ~m, h]
+ {
+ UTF8splitByContinuity(root, l, (h & ~m) - 1, n);
+ UTF8splitByContinuity(root, h & ~m, h, n);
+ return;
+ }
+ }
+ }
+ UTF8addContinuous(root, l, h, n); // no split was needed: add the byte ranges as is
+}
+
+/*
+ * Now that we have catenation of byte ranges [l1-h1]...[lN-hN],
+ * we want to add it to existing range, merging suffixes on the fly.
+ *
+ * Byte ranges are inserted starting from the last byte: each level of
+ * the tree holds one byte position, and an existing node with the same
+ * bounds is reused instead of creating a duplicate.
+ */
+void UTF8addContinuous(RangeSuffix * & root, utf8::rune l, utf8::rune h, uint32_t n)
+{
+ uint32_t lcs[utf8::MAX_RUNE_LENGTH];
+ uint32_t hcs[utf8::MAX_RUNE_LENGTH];
+ utf8::rune_to_bytes(lcs, l); // bytes of the lower bound
+ utf8::rune_to_bytes(hcs, h); // bytes of the upper bound
+
+ RangeSuffix ** p = &root;
+ for (uint32_t i = 1; i <= n; ++i) // walk byte positions n-1 down to 0
+ {
+ const uint32_t lc = lcs[n - i];
+ const uint32_t hc = hcs[n - i];
+ for (;;) // find or create the node for [lc-hc] among the siblings at this level
+ {
+ if (*p == NULL)
+ {
+ *p = new RangeSuffix(lc, hc);
+ p = &(*p)->child; // the next byte position goes below the new node
+ break;
+ }
+ else if ((*p)->l == lc && (*p)->h == hc)
+ {
+ p = &(*p)->child; // shared suffix: reuse the node and descend
+ break;
+ }
+ else
+ p = &(*p)->next;
+ }
+ }
}
} // namespace re2c
#include "src/regexp/re.h"
#include "src/encoding/utf8/utf8.h"
+
namespace re2c {
class Range;
-RE *UTF8Symbol(RE::alc_t &alc, utf8::rune r);
RE *UTF8Range(RE::alc_t &alc, const Range *r);
} // namespace re2c
static bool has_tags(const AST *);
static RE *ast_to_re(RESpec &, const AST *, size_t &, int32_t);
-static RE *re_schar(RE::alc_t &, uint32_t, uint32_t, uint32_t, const opt_t *);
-static RE *re_ichar(RE::alc_t &, uint32_t, uint32_t, uint32_t, const opt_t *);
+static RE *re_string(RE::alc_t &, const AST *, const opt_t *, Warn &);
static RE *re_class(RE::alc_t &, uint32_t, uint32_t, const Range *, const opt_t *, Warn &);
-static Range *ast_to_range(const AST *ast, const opt_t *opts);
-static Range *diff_to_range(const AST *ast, const opt_t *opts);
-static Range *dot_to_range(const AST *ast, const opt_t *opts);
-static Range *cls_to_range(const AST *ast, const opt_t *opts);
+static Range *ast_to_range(const AST *, const opt_t *);
+static Range *char_to_range(uint32_t, const ASTChar &, const opt_t *, bool);
+static Range *diff_to_range(const AST *, const opt_t *);
+static Range *dot_to_range(const AST *, const opt_t *);
+static Range *cls_to_range(const AST *, const opt_t *);
static bool misuse_of_named_def(const AST *, const opt_t *);
static void assert_tags_used_once(const Rule &, const std::vector<Tag> &);
static void init_rule(Rule &, const Code *, const std::vector<Tag> &, size_t, size_t);
switch (ast->type) {
case AST::NIL:
return re_nil(alc);
- case AST::STR: {
- RE *x = NULL;
- std::vector<ASTChar>::const_iterator
- i = ast->str.chars->begin(),
- e = ast->str.chars->end();
- for (; i != e; ++i) {
- x = re_cat(alc, x, is_icase(opts, ast->str.icase)
- ? re_ichar(alc, ast->line, i->column, i->chr, opts)
- : re_schar(alc, ast->line, i->column, i->chr, opts));
- }
- return x ? x : re_nil(alc);
- }
+ case AST::STR:
+ return re_string(alc, ast, opts, warn);
case AST::CLS: {
Range *r = cls_to_range(ast, opts);
return re_class(alc, ast->line, ast->column, r, opts, warn);
return NULL; /* unreachable */
}
+Range *char_to_range(uint32_t line, const ASTChar &chr, const opt_t *opts
+ , bool icase)
+{ // Build the range for one character; line and chr.column are used only for the error position.
+ uint32_t c = chr.chr;
+
+ if (!opts->encoding.validateChar(c)) { // code point rejected by the current encoding: report a fatal error
+ fatal_lc(line, chr.column, "bad code point: '0x%X'", c);
+ }
+
+ return icase && is_alpha(c) // case-insensitive letter: range contains both the lower-case and the upper-case form
+ ? Range::add(Range::sym(to_lower_unsafe(c)), Range::sym(to_upper_unsafe(c)))
+ : Range::sym(c); // otherwise a range holding just c
+}
+
Range *cls_to_range(const AST *ast, const opt_t *opts)
{
DASSERT(ast->type == AST::CLS);
std::vector<ASTRange>::const_iterator
i = ast->cls.ranges->begin(),
e = ast->cls.ranges->end();
+
for (; i != e; ++i) {
Range *s = opts->encoding.validateRange(i->lower, i->upper);
if (!s) {
}
r = Range::add(r, s);
}
+
if (ast->cls.negated) {
r = Range::sub(opts->encoding.fullRange(), r);
}
+
return r;
}
return cls_to_range(ast, opts);
case AST::DOT:
return dot_to_range(ast, opts);
- case AST::STR: {
+ case AST::STR:
if (ast->str.chars->size() != 1) break;
- const ASTChar &i = ast->str.chars->front();
- uint32_t c = i.chr;
- if (!opts->encoding.validateChar(c)) {
- fatal_lc(ast->line, i.column, "bad code point: '0x%X'", c);
- }
- return is_icase(opts, ast->str.icase) && is_alpha(c)
- ? Range::add(Range::sym(to_lower_unsafe(c)), Range::sym(to_upper_unsafe(c)))
- : Range::sym(c);
- }
+ return char_to_range(ast->line, ast->str.chars->front(), opts
+ , is_icase(opts, ast->str.icase));
case AST::DIFF:
return diff_to_range(ast, opts);
case AST::ALT: {
return NULL;
}
-RE *re_schar(RE::alc_t &alc, uint32_t line, uint32_t column, uint32_t c, const opt_t *opts)
+RE *re_string(RE::alc_t &alc, const AST *ast, const opt_t *opts, Warn &warn)
{
- if (!opts->encoding.validateChar(c)) {
- fatal_lc(line, column, "bad code point: '0x%X'", c);
- }
- switch (opts->encoding.type()) {
- case Enc::UTF16:
- return UTF16Symbol(alc, c);
- case Enc::UTF8:
- return UTF8Symbol(alc, c);
- case Enc::EBCDIC:
- return EBCDICSymbol(alc, c);
- case Enc::ASCII:
- case Enc::UTF32:
- case Enc::UCS2:
- return re_sym(alc, Range::sym(c));
- }
- return NULL; /* unreachable */
-}
+ DASSERT(ast->type == AST::STR); // only string nodes are expected here
-RE *re_ichar(RE::alc_t &alc, uint32_t line, uint32_t column, uint32_t c, const opt_t *opts)
-{
- if (is_alpha(c)) {
- return re_alt(alc,
- re_schar(alc, line, column, to_lower_unsafe(c), opts),
- re_schar(alc, line, column, to_upper_unsafe(c), opts));
- } else {
- return re_schar(alc, line, column, c, opts);
+ RE *x = NULL; // catenation built so far; stays NULL for a string without characters
+ std::vector<ASTChar>::const_iterator
+ i = ast->str.chars->begin(),
+ e = ast->str.chars->end();
+
+ bool icase = is_icase(opts, ast->str.icase); // computed once for the whole string
+ for (; i != e; ++i) {
+ Range *r = char_to_range(ast->line, *i, opts, icase); // range for this character
+ RE *y = re_class(alc, ast->line, i->column, r, opts, warn); // each character goes through the same path as a character class
+ x = re_cat(alc, x, y);
}
+
+ return x ? x : re_nil(alc); // a string without characters yields re_nil
}
-RE *re_class(RE::alc_t &alc, uint32_t line, uint32_t column, const Range *r, const opt_t *opts, Warn &warn)
+RE *re_class(RE::alc_t &alc, uint32_t line, uint32_t column, const Range *r
+ , const opt_t *opts, Warn &warn)
{
if (!r) {
switch (opts->empty_class_policy) {
fatal_lc(line, column, "empty character class");
}
}
+
switch (opts->encoding.type()) {
case Enc::UTF16:
return UTF16Range(alc, r);
case Enc::UCS2:
return re_sym(alc, r);
}
+
return NULL; /* unreachable */
}