Handle single chars and 1-char ranges in the same way.

author Ulya Trofimovich <skvadrik@gmail.com>

Sun, 6 Jan 2019 08:40:16 +0000 (08:40 +0000)

committer Ulya Trofimovich <skvadrik@gmail.com>

Sun, 6 Jan 2019 08:40:16 +0000 (08:40 +0000)
author Ulya Trofimovich <skvadrik@gmail.com>
Sun, 6 Jan 2019 08:40:16 +0000 (08:40 +0000)
committer Ulya Trofimovich <skvadrik@gmail.com>
Sun, 6 Jan 2019 08:40:16 +0000 (08:40 +0000)
diff --git a/re2c/Makefile.am b/re2c/Makefile.am

index 41aa05bc0c2129ee4ef1510a7d97f820e470f97c..2a094e6b66c2ff83f4d9d52b6a6fd06781403623 100644 (file)
--- a/re2c/Makefile.am
+++ b/re2c/Makefile.am
@@ -39,8 +39,6 @@ SRC_HDR = \
         src/encoding/range_suffix.h \
         src/encoding/utf8/utf8.h \
         src/encoding/utf8/utf8_regexp.h \
-       src/encoding/utf8/utf8_range.h \
-       src/encoding/utf16/utf16_range.h \
         src/encoding/utf16/utf16_regexp.h \
         src/encoding/utf16/utf16.h \
         src/regexp/empty_class_policy.h \
@@ -126,11 +124,9 @@ SRC = \
         src/encoding/enc.cc \
         src/encoding/range_suffix.cc \
         src/encoding/utf8/utf8_regexp.cc \
-       src/encoding/utf8/utf8_range.cc \
         src/encoding/utf8/utf8.cc \
         src/encoding/utf16/utf16_regexp.cc \
         src/encoding/utf16/utf16.cc \
-       src/encoding/utf16/utf16_range.cc \
         src/regexp/ast_to_re.cc \
         src/regexp/default_tags.cc \
         src/regexp/fixed_tags.cc \
diff --git a/re2c/src/encoding/ebcdic/ebcdic_regexp.cc b/re2c/src/encoding/ebcdic/ebcdic_regexp.cc

index 58484a9516ce3bffd7a2fc6c54831d3de55c3290..d77f661c91820ba07f846a6554bef1b3e8a6b158 100644 (file)
--- a/re2c/src/encoding/ebcdic/ebcdic_regexp.cc
+++ b/re2c/src/encoding/ebcdic/ebcdic_regexp.cc
@@ -5,11 +5,6 @@
  
  namespace re2c {
  
-RE *EBCDICSymbol(RE::alc_t &alc, uint32_t c)
-{
-    return re_sym(alc, Range::sym(asc2ebc[c]));
-}
-
  RE *EBCDICRange(RE::alc_t &alc, const Range *r)
  {
      Range *s = NULL;
diff --git a/re2c/src/encoding/ebcdic/ebcdic_regexp.h b/re2c/src/encoding/ebcdic/ebcdic_regexp.h

index cfb930643e3d11a58a772cf667028403998f49bb..ba88b21b5fb8f3d64ba6b26be4d20fd1e42c099f 100644 (file)
--- a/re2c/src/encoding/ebcdic/ebcdic_regexp.h
+++ b/re2c/src/encoding/ebcdic/ebcdic_regexp.h
@@ -8,7 +8,6 @@ namespace re2c {
  
  class Range;
  
-RE *EBCDICSymbol(RE::alc_t &alc, uint32_t c);
  RE *EBCDICRange(RE::alc_t &alc, const Range *r);
  
  } // namespace re2c
diff --git a/re2c/src/encoding/utf16/utf16_range.cc b/re2c/src/encoding/utf16/utf16_range.cc

deleted file mode 100644 (file)

index 98e9069..0000000
--- a/re2c/src/encoding/utf16/utf16_range.cc
+++ /dev/null
@@ -1,149 +0,0 @@
-#include "src/encoding/utf16/utf16_range.h"
-
-#include <stddef.h>
-
-#include "src/encoding/range_suffix.h"
-
-namespace re2c {
-
-/*
- * Add word range [w1-w2].
- */
-void UTF16addContinuous1(RangeSuffix * & root, uint32_t l, uint32_t h)
-{
-    RangeSuffix ** p = &root;
-    for (;;)
-    {
-        if (*p == NULL)
-        {
-            *p = new RangeSuffix(l, h);
-            break;
-        }
-        else if ((*p)->l == l && (*p)->h == h)
-        {
-            break;
-        }
-        else
-            p = &(*p)->next;
-    }
-}
-
-/*
- * Now that we have catenation of word ranges [l1-h1],[l2-h2],
- * we want to add it to existing range, merging suffixes on the fly.
- */
-void UTF16addContinuous2(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr)
-{
-    RangeSuffix ** p = &root;
-    for (;;)
-    {
-        if (*p == NULL)
-        {
-            *p = new RangeSuffix(l_tr, h_tr);
-            p = &(*p)->child;
-            break;
-        }
-        else if ((*p)->l == l_tr && (*p)->h == h_tr)
-        {
-            p = &(*p)->child;
-            break;
-        }
-        else
-            p = &(*p)->next;
-    }
-    for (;;)
-    {
-        if (*p == NULL)
-        {
-            *p = new RangeSuffix(l_ld, h_ld);
-            break;
-        }
-        else if ((*p)->l == l_ld && (*p)->h == h_ld)
-        {
-            break;
-        }
-        else
-            p = &(*p)->next;
-    }
-}
-
-/*
- * Split range into sub-ranges that agree on leading surrogates.
- *
- * We have two Unicode runes, L and H, both map to UTF-16
- * surrogate pairs 'L1 L2' and 'H1 H2'.
- * We want to represent Unicode range [L - H] as a catenation
- * of word ranges [L1 - H1],[L2 - H2].
- *
- * This is only possible if the following condition holds:
- * if L1 /= H1, then L2 == 0xdc00 and H2 == 0xdfff.
- * This condition ensures that:
- *     1) all possible UTF-16 sequences between L and H are allowed
- *     2) no word ranges [w1 - w2] appear, such that w1 > w2
- *
- * E.g.:
- * [\U00010001-\U00010400] => [d800-d801],[dc01-dc00].
- * The last word range, [dc01-dc00], is incorrect: its lower bound
- * is greater than its upper bound. To fix this, we must split
- * the original range into two sub-ranges:
- * [\U00010001-\U000103ff] => [d800-d800],[dc01-dfff]
- * [\U00010400-\U00010400] => [d801-d801],[dc00-dc00]
- *
- * This function finds all such 'points of discontinuity'
- * and represents original range as alternation of continuous
- * sub-ranges.
- */
-void UTF16splitByContinuity(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr)
-{
-    if (l_ld != h_ld)
-    {
-        if (l_tr > utf16::MIN_TRAIL_SURR)
-        {
-            UTF16splitByContinuity(root, l_ld, l_ld, l_tr, utf16::MAX_TRAIL_SURR);
-            UTF16splitByContinuity(root, l_ld + 1, h_ld, utf16::MIN_TRAIL_SURR, h_tr);
-            return;
-        }
-        if (h_tr < utf16::MAX_TRAIL_SURR)
-        {
-            UTF16splitByContinuity(root, l_ld, h_ld - 1, l_tr, utf16::MAX_TRAIL_SURR);
-            UTF16splitByContinuity(root, h_ld, h_ld, utf16::MIN_TRAIL_SURR, h_tr);
-            return;
-        }
-    }
-    UTF16addContinuous2(root, l_ld, h_ld, l_tr, h_tr);
-}
-
-/*
- * Split range into sub-ranges, so that all runes in the same
- * sub-range have equal length of UTF-16 sequence. E.g., full
- * Unicode range [0-0x10FFFF] gets split into sub-ranges:
- * [0 - 0xFFFF]         (2-byte UTF-16 sequences)
- * [0x10000 - 0x10FFFF] (4-byte UTF-16 sequences)
- */
-void UTF16splitByRuneLength(RangeSuffix * & root, utf16::rune l, utf16::rune h)
-{
-    if (l <= utf16::MAX_1WORD_RUNE)
-    {
-        if (h <= utf16::MAX_1WORD_RUNE)
-        {
-            UTF16addContinuous1(root, l, h);
-        }
-        else
-        {
-            UTF16addContinuous1(root, l, utf16::MAX_1WORD_RUNE);
-            const uint32_t h_ld = utf16::lead_surr(h);
-            const uint32_t h_tr = utf16::trail_surr(h);
-            UTF16splitByContinuity(root, utf16::MIN_LEAD_SURR, h_ld, utf16::MIN_TRAIL_SURR, h_tr);
-        }
-    }
-    else
-    {
-            const uint32_t l_ld = utf16::lead_surr(l);
-            const uint32_t l_tr = utf16::trail_surr(l);
-            const uint32_t h_ld = utf16::lead_surr(h);
-            const uint32_t h_tr = utf16::trail_surr(h);
-            UTF16splitByContinuity(root, l_ld, h_ld, l_tr, h_tr);
-    }
-}
-
-} // namespace re2c
diff --git a/re2c/src/encoding/utf16/utf16_range.h b/re2c/src/encoding/utf16/utf16_range.h

deleted file mode 100644 (file)

index af0fc4e..0000000
--- a/re2c/src/encoding/utf16/utf16_range.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef _RE2C_RE_ENCODING_UTF16_RANGE_
-#define _RE2C_RE_ENCODING_UTF16_RANGE_
-
-#include "src/util/c99_stdint.h"
-
-#include "src/encoding/utf16/utf16.h"
-
-namespace re2c {
-
-struct RangeSuffix;
-
-void UTF16addContinuous1(RangeSuffix * & root, uint32_t l, uint32_t h);
-void UTF16addContinuous2(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr);
-void UTF16splitByContinuity(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr);
-void UTF16splitByRuneLength(RangeSuffix * & root, utf16::rune l, utf16::rune h);
-
-} // namespace re2c
-
-#endif // _RE2C_RE_ENCODING_UTF16_RANGE_
diff --git a/re2c/src/encoding/utf16/utf16_regexp.cc b/re2c/src/encoding/utf16/utf16_regexp.cc

index 72c0b05a33885461ebf4010cd16c6c4e8d8b222f..d83eb0ecc0613b076744bd59511c202a2af37247 100644 (file)
--- a/re2c/src/encoding/utf16/utf16_regexp.cc
+++ b/re2c/src/encoding/utf16/utf16_regexp.cc
@@ -2,12 +2,40 @@
  #include "src/util/c99_stdint.h"
  
  #include "src/encoding/range_suffix.h"
-#include "src/encoding/utf16/utf16_range.h"
  #include "src/encoding/utf16/utf16_regexp.h"
  #include "src/util/range.h"
  
+
  namespace re2c {
  
+static RE *UTF16Symbol(RE::alc_t &, utf16::rune);
+static void UTF16addContinuous1(RangeSuffix *&, uint32_t, uint32_t);
+static void UTF16addContinuous2(RangeSuffix *&, uint32_t, uint32_t, uint32_t, uint32_t);
+static void UTF16splitByContinuity(RangeSuffix *&, uint32_t, uint32_t, uint32_t, uint32_t);
+static void UTF16splitByRuneLength(RangeSuffix *&, utf16::rune, utf16::rune);
+
+/*
+ * Split Unicode character class {[l1, h1), ..., [lN, hN)} into
+ * ranges [l1, h1-1], ..., [lN, hN-1] and return alternation of
+ * them. We store partially built range in suffix tree, which
+ * allows to eliminate common suffixes while building.
+ */
+RE *UTF16Range(RE::alc_t &alc, const Range *r)
+{
+    // empty range
+    if (!r) return NULL;
+
+    // one-symbol range
+    if (!r->next() && r->lower() == r->upper() - 1) {
+        return UTF16Symbol(alc, r->lower());
+    }
+
+    RangeSuffix * root = NULL;
+    for (; r != NULL; r = r->next ())
+        UTF16splitByRuneLength(root, r->lower (), r->upper () - 1);
+    return to_regexp(alc, root);
+}
+
  RE *UTF16Symbol(RE::alc_t &alc, utf16::rune r)
  {
      if (r <= utf16::MAX_1WORD_RUNE) {
@@ -22,17 +50,143 @@ RE *UTF16Symbol(RE::alc_t &alc, utf16::rune r)
  }
  
  /*
- * Split Unicode character class {[l1, h1), ..., [lN, hN)} into
- * ranges [l1, h1-1], ..., [lN, hN-1] and return alternation of
- * them. We store partially built range in suffix tree, which
- * allows to eliminate common suffixes while building.
+ * Split range into sub-ranges, so that all runes in the same
+ * sub-range have equal length of UTF-16 sequence. E.g., full
+ * Unicode range [0-0x10FFFF] gets split into sub-ranges:
+ * [0 - 0xFFFF]         (2-byte UTF-16 sequences)
+ * [0x10000 - 0x10FFFF] (4-byte UTF-16 sequences)
   */
-RE *UTF16Range(RE::alc_t &alc, const Range *r)
+void UTF16splitByRuneLength(RangeSuffix * & root, utf16::rune l, utf16::rune h)
  {
-    RangeSuffix * root = NULL;
-    for (; r != NULL; r = r->next ())
-        UTF16splitByRuneLength(root, r->lower (), r->upper () - 1);
-    return to_regexp(alc, root);
+    if (l <= utf16::MAX_1WORD_RUNE)
+    {
+        if (h <= utf16::MAX_1WORD_RUNE)
+        {
+            UTF16addContinuous1(root, l, h);
+        }
+        else
+        {
+            UTF16addContinuous1(root, l, utf16::MAX_1WORD_RUNE);
+            const uint32_t h_ld = utf16::lead_surr(h);
+            const uint32_t h_tr = utf16::trail_surr(h);
+            UTF16splitByContinuity(root, utf16::MIN_LEAD_SURR, h_ld, utf16::MIN_TRAIL_SURR, h_tr);
+        }
+    }
+    else
+    {
+            const uint32_t l_ld = utf16::lead_surr(l);
+            const uint32_t l_tr = utf16::trail_surr(l);
+            const uint32_t h_ld = utf16::lead_surr(h);
+            const uint32_t h_tr = utf16::trail_surr(h);
+            UTF16splitByContinuity(root, l_ld, h_ld, l_tr, h_tr);
+    }
+}
+
+/*
+ * Split range into sub-ranges that agree on leading surrogates.
+ *
+ * We have two Unicode runes, L and H, both map to UTF-16
+ * surrogate pairs 'L1 L2' and 'H1 H2'.
+ * We want to represent Unicode range [L - H] as a catenation
+ * of word ranges [L1 - H1],[L2 - H2].
+ *
+ * This is only possible if the following condition holds:
+ * if L1 /= H1, then L2 == 0xdc00 and H2 == 0xdfff.
+ * This condition ensures that:
+ *     1) all possible UTF-16 sequences between L and H are allowed
+ *     2) no word ranges [w1 - w2] appear, such that w1 > w2
+ *
+ * E.g.:
+ * [\U00010001-\U00010400] => [d800-d801],[dc01-dc00].
+ * The last word range, [dc01-dc00], is incorrect: its lower bound
+ * is greater than its upper bound. To fix this, we must split
+ * the original range into two sub-ranges:
+ * [\U00010001-\U000103ff] => [d800-d800],[dc01-dfff]
+ * [\U00010400-\U00010400] => [d801-d801],[dc00-dc00]
+ *
+ * This function finds all such 'points of discontinuity'
+ * and represents original range as alternation of continuous
+ * sub-ranges.
+ */
+void UTF16splitByContinuity(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr)
+{
+    if (l_ld != h_ld)
+    {
+        if (l_tr > utf16::MIN_TRAIL_SURR)
+        {
+            UTF16splitByContinuity(root, l_ld, l_ld, l_tr, utf16::MAX_TRAIL_SURR);
+            UTF16splitByContinuity(root, l_ld + 1, h_ld, utf16::MIN_TRAIL_SURR, h_tr);
+            return;
+        }
+        if (h_tr < utf16::MAX_TRAIL_SURR)
+        {
+            UTF16splitByContinuity(root, l_ld, h_ld - 1, l_tr, utf16::MAX_TRAIL_SURR);
+            UTF16splitByContinuity(root, h_ld, h_ld, utf16::MIN_TRAIL_SURR, h_tr);
+            return;
+        }
+    }
+    UTF16addContinuous2(root, l_ld, h_ld, l_tr, h_tr);
+}
+
+/*
+ * Add word range [w1-w2].
+ */
+void UTF16addContinuous1(RangeSuffix * & root, uint32_t l, uint32_t h)
+{
+    RangeSuffix ** p = &root;
+    for (;;)
+    {
+        if (*p == NULL)
+        {
+            *p = new RangeSuffix(l, h);
+            break;
+        }
+        else if ((*p)->l == l && (*p)->h == h)
+        {
+            break;
+        }
+        else
+            p = &(*p)->next;
+    }
+}
+
+/*
+ * Now that we have catenation of word ranges [l1-h1],[l2-h2],
+ * we want to add it to existing range, merging suffixes on the fly.
+ */
+void UTF16addContinuous2(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr)
+{
+    RangeSuffix ** p = &root;
+    for (;;)
+    {
+        if (*p == NULL)
+        {
+            *p = new RangeSuffix(l_tr, h_tr);
+            p = &(*p)->child;
+            break;
+        }
+        else if ((*p)->l == l_tr && (*p)->h == h_tr)
+        {
+            p = &(*p)->child;
+            break;
+        }
+        else
+            p = &(*p)->next;
+    }
+    for (;;)
+    {
+        if (*p == NULL)
+        {
+            *p = new RangeSuffix(l_ld, h_ld);
+            break;
+        }
+        else if ((*p)->l == l_ld && (*p)->h == h_ld)
+        {
+            break;
+        }
+        else
+            p = &(*p)->next;
+    }
  }
  
  } // namespace re2c
diff --git a/re2c/src/encoding/utf16/utf16_regexp.h b/re2c/src/encoding/utf16/utf16_regexp.h

index 30ac5ee3822f557dc8b9acef2fd45ea7ad926667..880b70a8829a6a06df1f9e472c063555f5675dea 100644 (file)
--- a/re2c/src/encoding/utf16/utf16_regexp.h
+++ b/re2c/src/encoding/utf16/utf16_regexp.h
@@ -4,11 +4,11 @@
  #include "src/regexp/re.h"
  #include "src/encoding/utf16/utf16.h"
  
+
  namespace re2c {
  
  class Range;
  
-RE *UTF16Symbol(RE::alc_t &alc, utf16::rune r);
  RE *UTF16Range(RE::alc_t &alc, const Range *r);
  
  } // namespace re2c
diff --git a/re2c/src/encoding/utf8/utf8_range.cc b/re2c/src/encoding/utf8/utf8_range.cc

deleted file mode 100644 (file)

index f4359d3..0000000
--- a/re2c/src/encoding/utf8/utf8_range.cc
+++ /dev/null
@@ -1,115 +0,0 @@
-#include "src/encoding/utf8/utf8_range.h"
-
-#include <stddef.h>
-
-#include "src/encoding/range_suffix.h"
-
-namespace re2c {
-
-/*
- * Now that we have catenation of byte ranges [l1-h1]...[lN-hN],
- * we want to add it to existing range, merging suffixes on the fly.
- */
-void UTF8addContinuous(RangeSuffix * & root, utf8::rune l, utf8::rune h, uint32_t n)
-{
-    uint32_t lcs[utf8::MAX_RUNE_LENGTH];
-    uint32_t hcs[utf8::MAX_RUNE_LENGTH];
-    utf8::rune_to_bytes(lcs, l);
-    utf8::rune_to_bytes(hcs, h);
-
-    RangeSuffix ** p = &root;
-    for (uint32_t i = 1; i <= n; ++i)
-    {
-        const uint32_t lc = lcs[n - i];
-        const uint32_t hc = hcs[n - i];
-        for (;;)
-        {
-            if (*p == NULL)
-            {
-                *p = new RangeSuffix(lc, hc);
-                p = &(*p)->child;
-                break;
-            }
-            else if ((*p)->l == lc && (*p)->h == hc)
-            {
-                p = &(*p)->child;
-                break;
-            }
-            else
-                p = &(*p)->next;
-        }
-    }
-}
-
-/*
- * Split range into sub-ranges that agree on leading bytes.
- *
- * We have two Unicode runes of equal length, L and H, which
- * map to UTF-8 sequences 'L_1 ... L_n' and 'H_1 ... H_n'.
- * We want to represent Unicode range [L - H] as a catenation
- * of byte ranges [L_1 - H_1], ..., [L_n - H_n].
- *
- * This is only possible if for all i > 1:
- * if L_i /= H_i, then L_(i+1) == 0x80 and H_(i+1) == 0xbf.
- * This condition ensures that:
- *     1) all possible UTF-8 sequences between L and H are allowed
- *     2) no byte ranges [b1 - b2] appear, such that b1 > b2
- *
- * E.g.:
- * [\U000e0031-\U000e0043] => [f3-f3],[a0-a0],[80-81],[b1-83].
- * The last byte range, [b1-83], is incorrect: its lower bound
- * is greater than its upper bound. To fix this, we must split
- * the original range into two sub-ranges:
- * [\U000e0031-\U000e003f] => [f3-f3],[a0-a0],[80-80],[b1-bf]
- * [\U000e0040-\U000e0043] => [f3-f3],[a0-a0],[81-81],[80-83]
- *
- * This function finds all such 'points of discontinuity'
- * and represents original range as alternation of continuous
- * sub-ranges.
- */
-void UTF8splitByContinuity(RangeSuffix * & root, utf8::rune l, utf8::rune h, uint32_t n)
-{
-    for (uint32_t i = 1; i < n; ++i)
-    {
-        uint32_t m = (1u << (6u * i)) - 1u; // last i bytes of a UTF-8 sequence
-        if ((l & ~m) != (h & ~m))
-        {
-            if ((l & m) != 0)
-            {
-                UTF8splitByContinuity(root, l, l | m, n);
-                UTF8splitByContinuity(root, (l | m) + 1, h, n);
-                return;
-            }
-            if ((h & m) != m)
-            {
-                UTF8splitByContinuity(root, l, (h & ~m) - 1, n);
-                UTF8splitByContinuity(root, h & ~m, h, n);
-                return;
-            }
-        }
-    }
-    UTF8addContinuous(root, l, h, n);
-}
-
-/*
- * Split range into sub-ranges, so that all runes in the same
- * sub-range have equal length of UTF-8 sequence. E.g., full
- * Unicode range [0-0x10FFFF] gets split into sub-ranges:
- * [0 - 0x7F]           (1-byte UTF-8 sequences)
- * [0x80 - 0x7FF]       (2-byte UTF-8 sequences)
- * [0x800 - 0xFFFF]     (3-byte UTF-8 sequences)
- * [0x10000 - 0x10FFFF] (4-byte UTF-8 sequences)
- */
-void UTF8splitByRuneLength(RangeSuffix * & root, utf8::rune l, utf8::rune h)
-{
-    const uint32_t nh = utf8::rune_length(h);
-    for (uint32_t nl = utf8::rune_length(l); nl < nh; ++nl)
-    {
-        utf8::rune r = utf8::max_rune(nl);
-        UTF8splitByContinuity(root, l, r, nl);
-        l = r + 1;
-    }
-    UTF8splitByContinuity(root, l, h, nh);
-}
-
-} // namespace re2c
diff --git a/re2c/src/encoding/utf8/utf8_range.h b/re2c/src/encoding/utf8/utf8_range.h

deleted file mode 100644 (file)

index 7694e6b..0000000
--- a/re2c/src/encoding/utf8/utf8_range.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef _RE2C_RE_ENCODING_UTF8_RANGE_
-#define _RE2C_RE_ENCODING_UTF8_RANGE_
-
-#include "src/util/c99_stdint.h"
-
-#include "src/encoding/utf8/utf8.h"
-
-namespace re2c {
-
-struct RangeSuffix;
-
-void UTF8addContinuous(RangeSuffix * & root, utf8::rune l, utf8::rune h, uint32_t n);
-void UTF8splitByContinuity(RangeSuffix * & root, utf8::rune l, utf8::rune h, uint32_t n);
-void UTF8splitByRuneLength(RangeSuffix * & root, utf8::rune l, utf8::rune h);
-
-} // namespace re2c
-
-#endif // _RE2C_RE_ENCODING_UTF8_RANGE_
diff --git a/re2c/src/encoding/utf8/utf8_regexp.cc b/re2c/src/encoding/utf8/utf8_regexp.cc

index 1bb92fa1cb7189c373c9cb6cec9343ee40838a42..994d7eecdaf1960ea771bbb0a1b6a92451196753 100644 (file)
--- a/re2c/src/encoding/utf8/utf8_regexp.cc
+++ b/re2c/src/encoding/utf8/utf8_regexp.cc
@@ -2,12 +2,40 @@
  #include "src/util/c99_stdint.h"
  
  #include "src/encoding/range_suffix.h"
-#include "src/encoding/utf8/utf8_range.h"
  #include "src/encoding/utf8/utf8_regexp.h"
  #include "src/util/range.h"
  
+
  namespace re2c {
  
+static RE *UTF8Symbol(RE::alc_t &, utf8::rune);
+static void UTF8addContinuous(RangeSuffix *&, utf8::rune, utf8::rune, uint32_t);
+static void UTF8splitByContinuity(RangeSuffix *&, utf8::rune, utf8::rune, uint32_t);
+static void UTF8splitByRuneLength(RangeSuffix *&, utf8::rune, utf8::rune);
+
+/*
+ * Split Unicode character class {[l1, h1), ..., [lN, hN)} into
+ * ranges [l1, h1-1], ..., [lN, hN-1] and return alternation of
+ * them. We store partially built range in suffix tree, which
+ * allows to eliminate common suffixes while building.
+ */
+RE *UTF8Range(RE::alc_t &alc, const Range *r)
+{
+    // empty range
+    if (!r) return NULL;
+
+    // one-symbol range
+    if (!r->next() && r->lower() == r->upper() - 1) {
+        return UTF8Symbol(alc, r->lower());
+    }
+
+    RangeSuffix *root = NULL;
+    for (; r != NULL; r = r->next()) {
+        UTF8splitByRuneLength(root, r->lower(), r->upper() - 1);
+    }
+    return to_regexp(alc, root);
+}
+
  RE *UTF8Symbol(RE::alc_t &alc, utf8::rune r)
  {
      uint32_t chars[utf8::MAX_RUNE_LENGTH];
@@ -20,17 +48,109 @@ RE *UTF8Symbol(RE::alc_t &alc, utf8::rune r)
  }
  
  /*
- * Split Unicode character class {[l1, h1), ..., [lN, hN)} into
- * ranges [l1, h1-1], ..., [lN, hN-1] and return alternation of
- * them. We store partially built range in suffix tree, which
- * allows to eliminate common suffixes while building.
+ * Split range into sub-ranges, so that all runes in the same
+ * sub-range have equal length of UTF-8 sequence. E.g., full
+ * Unicode range [0-0x10FFFF] gets split into sub-ranges:
+ * [0 - 0x7F]           (1-byte UTF-8 sequences)
+ * [0x80 - 0x7FF]       (2-byte UTF-8 sequences)
+ * [0x800 - 0xFFFF]     (3-byte UTF-8 sequences)
+ * [0x10000 - 0x10FFFF] (4-byte UTF-8 sequences)
   */
-RE *UTF8Range(RE::alc_t &alc, const Range *r)
+void UTF8splitByRuneLength(RangeSuffix * & root, utf8::rune l, utf8::rune h)
  {
-    RangeSuffix * root = NULL;
-    for (; r != NULL; r = r->next ())
-        UTF8splitByRuneLength(root, r->lower (), r->upper () - 1);
-    return to_regexp(alc, root);
+    const uint32_t nh = utf8::rune_length(h);
+    for (uint32_t nl = utf8::rune_length(l); nl < nh; ++nl)
+    {
+        utf8::rune r = utf8::max_rune(nl);
+        UTF8splitByContinuity(root, l, r, nl);
+        l = r + 1;
+    }
+    UTF8splitByContinuity(root, l, h, nh);
+}
+
+/*
+ * Split range into sub-ranges that agree on leading bytes.
+ *
+ * We have two Unicode runes of equal length, L and H, which
+ * map to UTF-8 sequences 'L_1 ... L_n' and 'H_1 ... H_n'.
+ * We want to represent Unicode range [L - H] as a catenation
+ * of byte ranges [L_1 - H_1], ..., [L_n - H_n].
+ *
+ * This is only possible if for all i > 1:
+ * if L_i /= H_i, then L_(i+1) == 0x80 and H_(i+1) == 0xbf.
+ * This condition ensures that:
+ *     1) all possible UTF-8 sequences between L and H are allowed
+ *     2) no byte ranges [b1 - b2] appear, such that b1 > b2
+ *
+ * E.g.:
+ * [\U000e0031-\U000e0043] => [f3-f3],[a0-a0],[80-81],[b1-83].
+ * The last byte range, [b1-83], is incorrect: its lower bound
+ * is greater than its upper bound. To fix this, we must split
+ * the original range into two sub-ranges:
+ * [\U000e0031-\U000e003f] => [f3-f3],[a0-a0],[80-80],[b1-bf]
+ * [\U000e0040-\U000e0043] => [f3-f3],[a0-a0],[81-81],[80-83]
+ *
+ * This function finds all such 'points of discontinuity'
+ * and represents original range as alternation of continuous
+ * sub-ranges.
+ */
+void UTF8splitByContinuity(RangeSuffix * & root, utf8::rune l, utf8::rune h, uint32_t n)
+{
+    for (uint32_t i = 1; i < n; ++i)
+    {
+        uint32_t m = (1u << (6u * i)) - 1u; // last i bytes of a UTF-8 sequence
+        if ((l & ~m) != (h & ~m))
+        {
+            if ((l & m) != 0)
+            {
+                UTF8splitByContinuity(root, l, l | m, n);
+                UTF8splitByContinuity(root, (l | m) + 1, h, n);
+                return;
+            }
+            if ((h & m) != m)
+            {
+                UTF8splitByContinuity(root, l, (h & ~m) - 1, n);
+                UTF8splitByContinuity(root, h & ~m, h, n);
+                return;
+            }
+        }
+    }
+    UTF8addContinuous(root, l, h, n);
+}
+
+/*
+ * Now that we have catenation of byte ranges [l1-h1]...[lN-hN],
+ * we want to add it to existing range, merging suffixes on the fly.
+ */
+void UTF8addContinuous(RangeSuffix * & root, utf8::rune l, utf8::rune h, uint32_t n)
+{
+    uint32_t lcs[utf8::MAX_RUNE_LENGTH];
+    uint32_t hcs[utf8::MAX_RUNE_LENGTH];
+    utf8::rune_to_bytes(lcs, l);
+    utf8::rune_to_bytes(hcs, h);
+
+    RangeSuffix ** p = &root;
+    for (uint32_t i = 1; i <= n; ++i)
+    {
+        const uint32_t lc = lcs[n - i];
+        const uint32_t hc = hcs[n - i];
+        for (;;)
+        {
+            if (*p == NULL)
+            {
+                *p = new RangeSuffix(lc, hc);
+                p = &(*p)->child;
+                break;
+            }
+            else if ((*p)->l == lc && (*p)->h == hc)
+            {
+                p = &(*p)->child;
+                break;
+            }
+            else
+                p = &(*p)->next;
+        }
+    }
  }
  
  } // namespace re2c
diff --git a/re2c/src/encoding/utf8/utf8_regexp.h b/re2c/src/encoding/utf8/utf8_regexp.h

index c02284b224edd7c44220d84f2d773d171880df30..7b4153bad58b6f9758aaed308a3f69b4de7c7b6c 100644 (file)
--- a/re2c/src/encoding/utf8/utf8_regexp.h
+++ b/re2c/src/encoding/utf8/utf8_regexp.h
@@ -4,11 +4,11 @@
  #include "src/regexp/re.h"
  #include "src/encoding/utf8/utf8.h"
  
+
  namespace re2c {
  
  class Range;
  
-RE *UTF8Symbol(RE::alc_t &alc, utf8::rune r);
  RE *UTF8Range(RE::alc_t &alc, const Range *r);
  
  } // namespace re2c
diff --git a/re2c/src/regexp/ast_to_re.cc b/re2c/src/regexp/ast_to_re.cc

index a4d98bede355ed2bf885a78867910d04c8c0d95a..a31569fdc4eefbab9c76752af25be8c851642c2b 100644 (file)
--- a/re2c/src/regexp/ast_to_re.cc
+++ b/re2c/src/regexp/ast_to_re.cc
@@ -49,13 +49,13 @@ namespace re2c {
  
  static bool has_tags(const AST *);
  static RE *ast_to_re(RESpec &, const AST *, size_t &, int32_t);
-static RE *re_schar(RE::alc_t &, uint32_t, uint32_t, uint32_t, const opt_t *);
-static RE *re_ichar(RE::alc_t &, uint32_t, uint32_t, uint32_t, const opt_t *);
+static RE *re_string(RE::alc_t &, const AST *, const opt_t *, Warn &);
  static RE *re_class(RE::alc_t &, uint32_t, uint32_t, const Range *, const opt_t *, Warn &);
-static Range *ast_to_range(const AST *ast, const opt_t *opts);
-static Range *diff_to_range(const AST *ast, const opt_t *opts);
-static Range *dot_to_range(const AST *ast, const opt_t *opts);
-static Range *cls_to_range(const AST *ast, const opt_t *opts);
+static Range *ast_to_range(const AST *, const opt_t *);
+static Range *char_to_range(uint32_t, const ASTChar &, const opt_t *, bool);
+static Range *diff_to_range(const AST *, const opt_t *);
+static Range *dot_to_range(const AST *, const opt_t *);
+static Range *cls_to_range(const AST *, const opt_t *);
  static bool misuse_of_named_def(const AST *, const opt_t *);
  static void assert_tags_used_once(const Rule &, const std::vector<Tag> &);
  static void init_rule(Rule &, const Code *, const std::vector<Tag> &, size_t, size_t);
@@ -109,18 +109,8 @@ RE *ast_to_re(RESpec &spec, const AST *ast, size_t &ncap, int32_t height)
      switch (ast->type) {
          case AST::NIL:
              return re_nil(alc);
-        case AST::STR: {
-            RE *x = NULL;
-            std::vector<ASTChar>::const_iterator
-                i = ast->str.chars->begin(),
-                e = ast->str.chars->end();
-            for (; i != e; ++i) {
-                x = re_cat(alc, x, is_icase(opts, ast->str.icase)
-                    ? re_ichar(alc, ast->line, i->column, i->chr, opts)
-                    : re_schar(alc, ast->line, i->column, i->chr, opts));
-            }
-            return x ? x : re_nil(alc);
-        }
+        case AST::STR:
+            return re_string(alc, ast, opts, warn);
          case AST::CLS: {
              Range *r = cls_to_range(ast, opts);
              return re_class(alc, ast->line, ast->column, r, opts, warn);
@@ -256,6 +246,20 @@ RE *ast_to_re(RESpec &spec, const AST *ast, size_t &ncap, int32_t height)
      return NULL; /* unreachable */
  }
  
+Range *char_to_range(uint32_t line, const ASTChar &chr, const opt_t *opts
+    , bool icase)
+{
+    uint32_t c = chr.chr;
+
+    if (!opts->encoding.validateChar(c)) {
+        fatal_lc(line, chr.column, "bad code point: '0x%X'", c);
+    }
+
+    return icase && is_alpha(c)
+        ? Range::add(Range::sym(to_lower_unsafe(c)), Range::sym(to_upper_unsafe(c)))
+        : Range::sym(c);
+}
+
  Range *cls_to_range(const AST *ast, const opt_t *opts)
  {
      DASSERT(ast->type == AST::CLS);
@@ -264,6 +268,7 @@ Range *cls_to_range(const AST *ast, const opt_t *opts)
      std::vector<ASTRange>::const_iterator
          i = ast->cls.ranges->begin(),
          e = ast->cls.ranges->end();
+
      for (; i != e; ++i) {
          Range *s = opts->encoding.validateRange(i->lower, i->upper);
          if (!s) {
@@ -272,9 +277,11 @@ Range *cls_to_range(const AST *ast, const opt_t *opts)
          }
          r = Range::add(r, s);
      }
+
      if (ast->cls.negated) {
          r = Range::sub(opts->encoding.fullRange(), r);
      }
+
      return r;
  }
  
@@ -316,17 +323,10 @@ Range *ast_to_range(const AST *ast, const opt_t *opts)
              return cls_to_range(ast, opts);
          case AST::DOT:
              return dot_to_range(ast, opts);
-        case AST::STR: {
+        case AST::STR:
              if (ast->str.chars->size() != 1) break;
-            const ASTChar &i = ast->str.chars->front();
-            uint32_t c = i.chr;
-            if (!opts->encoding.validateChar(c)) {
-                fatal_lc(ast->line, i.column, "bad code point: '0x%X'", c);
-            }
-            return is_icase(opts, ast->str.icase) && is_alpha(c)
-                ? Range::add(Range::sym(to_lower_unsafe(c)), Range::sym(to_upper_unsafe(c)))
-                : Range::sym(c);
-        }
+            return char_to_range(ast->line, ast->str.chars->front(), opts
+                , is_icase(opts, ast->str.icase));
          case AST::DIFF:
              return diff_to_range(ast, opts);
          case AST::ALT: {
@@ -339,38 +339,27 @@ Range *ast_to_range(const AST *ast, const opt_t *opts)
      return NULL;
  }
  
-RE *re_schar(RE::alc_t &alc, uint32_t line, uint32_t column, uint32_t c, const opt_t *opts)
+RE *re_string(RE::alc_t &alc, const AST *ast, const opt_t *opts, Warn &warn)
  {
-    if (!opts->encoding.validateChar(c)) {
-        fatal_lc(line, column, "bad code point: '0x%X'", c);
-    }
-    switch (opts->encoding.type()) {
-        case Enc::UTF16:
-            return UTF16Symbol(alc, c);
-        case Enc::UTF8:
-            return UTF8Symbol(alc, c);
-        case Enc::EBCDIC:
-            return EBCDICSymbol(alc, c);
-        case Enc::ASCII:
-        case Enc::UTF32:
-        case Enc::UCS2:
-            return re_sym(alc, Range::sym(c));
-    }
-    return NULL; /* unreachable */
-}
+    DASSERT(ast->type == AST::STR);
  
-RE *re_ichar(RE::alc_t &alc, uint32_t line, uint32_t column, uint32_t c, const opt_t *opts)
-{
-    if (is_alpha(c)) {
-        return re_alt(alc,
-            re_schar(alc, line, column, to_lower_unsafe(c), opts),
-            re_schar(alc, line, column, to_upper_unsafe(c), opts));
-    } else {
-        return re_schar(alc, line, column, c, opts);
+    RE *x = NULL;
+    std::vector<ASTChar>::const_iterator
+        i = ast->str.chars->begin(),
+        e = ast->str.chars->end();
+
+    bool icase = is_icase(opts, ast->str.icase);
+    for (; i != e; ++i) {
+        Range *r = char_to_range(ast->line, *i, opts, icase);
+        RE *y = re_class(alc, ast->line, i->column, r, opts, warn);
+        x = re_cat(alc, x, y);
      }
+
+    return x ? x : re_nil(alc);
  }
  
-RE *re_class(RE::alc_t &alc, uint32_t line, uint32_t column, const Range *r, const opt_t *opts, Warn &warn)
+RE *re_class(RE::alc_t &alc, uint32_t line, uint32_t column, const Range *r
+    , const opt_t *opts, Warn &warn)
  {
      if (!r) {
          switch (opts->empty_class_policy) {
@@ -384,6 +373,7 @@ RE *re_class(RE::alc_t &alc, uint32_t line, uint32_t column, const Range *r, con
                  fatal_lc(line, column, "empty character class");
          }
      }
+
      switch (opts->encoding.type()) {
          case Enc::UTF16:
              return UTF16Range(alc, r);
@@ -396,6 +386,7 @@ RE *re_class(RE::alc_t &alc, uint32_t line, uint32_t column, const Range *r, con
          case Enc::UCS2:
              return re_sym(alc, r);
      }
+
      return NULL; /* unreachable */
  }
author	Ulya Trofimovich <skvadrik@gmail.com>
	Sun, 6 Jan 2019 08:40:16 +0000 (08:40 +0000)
committer	Ulya Trofimovich <skvadrik@gmail.com>
	Sun, 6 Jan 2019 08:40:16 +0000 (08:40 +0000)
re2c/Makefile.am		patch | blob | history
re2c/src/encoding/ebcdic/ebcdic_regexp.cc		patch | blob | history
re2c/src/encoding/ebcdic/ebcdic_regexp.h		patch | blob | history
re2c/src/encoding/utf16/utf16_range.cc	[deleted file]	patch | blob | history
re2c/src/encoding/utf16/utf16_range.h	[deleted file]	patch | blob | history
re2c/src/encoding/utf16/utf16_regexp.cc		patch | blob | history
re2c/src/encoding/utf16/utf16_regexp.h		patch | blob | history
re2c/src/encoding/utf8/utf8_range.cc	[deleted file]	patch | blob | history
re2c/src/encoding/utf8/utf8_range.h	[deleted file]	patch | blob | history
re2c/src/encoding/utf8/utf8_regexp.cc		patch | blob | history
re2c/src/encoding/utf8/utf8_regexp.h		patch | blob | history
re2c/src/regexp/ast_to_re.cc		patch | blob | history