Encodings: use 32-bit unsigned arithmetic instead of 8-bit and 16-bit.

author Ulya Trofimovich <skvadrik@gmail.com>

Sun, 9 Aug 2015 18:07:10 +0000 (19:07 +0100)

committer Ulya Trofimovich <skvadrik@gmail.com>

Sun, 9 Aug 2015 18:07:10 +0000 (19:07 +0100)
author Ulya Trofimovich <skvadrik@gmail.com>
Sun, 9 Aug 2015 18:07:10 +0000 (19:07 +0100)
committer Ulya Trofimovich <skvadrik@gmail.com>
Sun, 9 Aug 2015 18:07:10 +0000 (19:07 +0100)
diff --git a/re2c/src/ir/regexp/encoding/utf16/utf16.cc b/re2c/src/ir/regexp/encoding/utf16/utf16.cc

index 47743edf18d0967c001cdb70345ae118ec7c0752..4b0a13bbbaa5c7291f0bde80af8b6dbd91d9d348 100644 (file)
--- a/re2c/src/ir/regexp/encoding/utf16/utf16.cc
+++ b/re2c/src/ir/regexp/encoding/utf16/utf16.cc
@@ -2,9 +2,9 @@
  
  namespace re2c {
  
-const uint32_t utf16::MAX_1WORD_RUNE   = 0xFFFF;
-const uint32_t utf16::MIN_LEAD_SURR    = 0xD800;
-const uint32_t utf16::MIN_TRAIL_SURR   = 0xDC00;
-const uint32_t utf16::MAX_TRAIL_SURR   = 0xDFFF;
+const uint32_t utf16::MAX_1WORD_RUNE   = 0xFFFFu;
+const uint32_t utf16::MIN_LEAD_SURR    = 0xD800u;
+const uint32_t utf16::MIN_TRAIL_SURR   = 0xDC00u;
+const uint32_t utf16::MAX_TRAIL_SURR   = 0xDFFFu;
  
  } // namespace re2c
diff --git a/re2c/src/ir/regexp/encoding/utf16/utf16.h b/re2c/src/ir/regexp/encoding/utf16/utf16.h

index d64de3494d849783edbaebf3b4aa1d7405f68cad..89cdbdbdd9b13a4b572fdcb4e4d56e85c6924c2a 100644 (file)
--- a/re2c/src/ir/regexp/encoding/utf16/utf16.h
+++ b/re2c/src/ir/regexp/encoding/utf16/utf16.h
@@ -16,20 +16,20 @@ public:
         static const uint32_t MAX_TRAIL_SURR;
  
         /* leading surrogate of UTF-16 symbol */
-       static inline uint16_t lead_surr(rune r);
+       static inline uint32_t lead_surr(rune r);
  
         /* trailing surrogate of UTF-16 symbol */
-       static inline uint16_t trail_surr(rune r);
+       static inline uint32_t trail_surr(rune r);
  };
  
-inline uint16_t utf16::lead_surr(rune r)
+inline uint32_t utf16::lead_surr(rune r)
  {
-       return ((r - 0x10000) / 0x400) + MIN_LEAD_SURR;
+       return ((r - 0x10000u) / 0x400u) + MIN_LEAD_SURR;
  }
  
-inline uint16_t utf16::trail_surr(rune r)
+inline uint32_t utf16::trail_surr(rune r)
  {
-       return ((r - 0x10000) % 0x400) + MIN_TRAIL_SURR;
+       return ((r - 0x10000u) % 0x400u) + MIN_TRAIL_SURR;
  }
  
  }  // namespace re2c
diff --git a/re2c/src/ir/regexp/encoding/utf16/utf16_range.cc b/re2c/src/ir/regexp/encoding/utf16/utf16_range.cc

index c4f27f6cd0cd570facd38212c637af3419935942..1cc8c2a6e0b7972843e05ae9fbb00352e46635fd 100644 (file)
--- a/re2c/src/ir/regexp/encoding/utf16/utf16_range.cc
+++ b/re2c/src/ir/regexp/encoding/utf16/utf16_range.cc
@@ -6,7 +6,7 @@ namespace re2c {
  /*
   * Add word range [w1-w2].
   */
-void UTF16addContinuous1(RangeSuffix * & root, uint16_t l, uint16_t h)
+void UTF16addContinuous1(RangeSuffix * & root, uint32_t l, uint32_t h)
  {
         RangeSuffix ** p = &root;
         for (;;)
@@ -29,7 +29,7 @@ void UTF16addContinuous1(RangeSuffix * & root, uint16_t l, uint16_t h)
   * Now that we have catenation of word ranges [l1-h1],[l2-h2],
   * we want to add it to existing range, merging suffixes on the fly.
   */
-void UTF16addContinuous2(RangeSuffix * & root, uint16_t l_ld, uint16_t h_ld, uint16_t l_tr, uint16_t h_tr)
+void UTF16addContinuous2(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr)
  {
         RangeSuffix ** p = &root;
         for (;;)
@@ -90,7 +90,7 @@ void UTF16addContinuous2(RangeSuffix * & root, uint16_t l_ld, uint16_t h_ld, uin
   * and represents original range as alternation of continuous
   * sub-ranges.
   */
-void UTF16splitByContinuity(RangeSuffix * & root, uint16_t l_ld, uint16_t h_ld, uint16_t l_tr, uint16_t h_tr)
+void UTF16splitByContinuity(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr)
  {
         if (l_ld != h_ld)
         {
@@ -120,21 +120,25 @@ void UTF16splitByContinuity(RangeSuffix * & root, uint16_t l_ld, uint16_t h_ld,
  void UTF16splitByRuneLength(RangeSuffix * & root, utf16::rune l, utf16::rune h)
  {
         if (l <= utf16::MAX_1WORD_RUNE)
+       {
                 if (h <= utf16::MAX_1WORD_RUNE)
+               {
                         UTF16addContinuous1(root, l, h);
+               }
                 else
                 {
                         UTF16addContinuous1(root, l, utf16::MAX_1WORD_RUNE);
-                       const uint16_t h_ld = utf16::lead_surr(h);
-                       const uint16_t h_tr = utf16::trail_surr(h);
+                       const uint32_t h_ld = utf16::lead_surr(h);
+                       const uint32_t h_tr = utf16::trail_surr(h);
                         UTF16splitByContinuity(root, utf16::MIN_LEAD_SURR, h_ld, utf16::MIN_TRAIL_SURR, h_tr);
                 }
+       }
         else
         {
-                       const uint16_t l_ld = utf16::lead_surr(l);
-                       const uint16_t l_tr = utf16::trail_surr(l);
-                       const uint16_t h_ld = utf16::lead_surr(h);
-                       const uint16_t h_tr = utf16::trail_surr(h);
+                       const uint32_t l_ld = utf16::lead_surr(l);
+                       const uint32_t l_tr = utf16::trail_surr(l);
+                       const uint32_t h_ld = utf16::lead_surr(h);
+                       const uint32_t h_tr = utf16::trail_surr(h);
                         UTF16splitByContinuity(root, l_ld, h_ld, l_tr, h_tr);
         }
  }
diff --git a/re2c/src/ir/regexp/encoding/utf16/utf16_range.h b/re2c/src/ir/regexp/encoding/utf16/utf16_range.h

index 4e75560d88b95d785c30b928b292220c059e07e7..43b3869e5cc98f8df77c8049f3367f249a766fb5 100644 (file)
--- a/re2c/src/ir/regexp/encoding/utf16/utf16_range.h
+++ b/re2c/src/ir/regexp/encoding/utf16/utf16_range.h
@@ -6,9 +6,9 @@
  
  namespace re2c {
  
-void UTF16addContinuous1(RangeSuffix * & root, uint16_t l, uint16_t h);
-void UTF16addContinuous2(RangeSuffix * & root, uint16_t l_ld, uint16_t h_ld, uint16_t l_tr, uint16_t h_tr);
-void UTF16splitByContinuity(RangeSuffix * & root, uint16_t l_ld, uint16_t h_ld, uint16_t l_tr, uint16_t h_tr);
+void UTF16addContinuous1(RangeSuffix * & root, uint32_t l, uint32_t h);
+void UTF16addContinuous2(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr);
+void UTF16splitByContinuity(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr);
  void UTF16splitByRuneLength(RangeSuffix * & root, utf16::rune l, utf16::rune h);
  
  } // namespace re2c
diff --git a/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc b/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc

index b8fef48a3dde3f8d28475d164a08bde4a127d0fb..68e05a898739d78196c9772168500e853644dbaa 100644 (file)
--- a/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc
+++ b/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc
@@ -12,8 +12,8 @@ RegExp * UTF16Symbol(utf16::rune r)
                 return new MatchOp(Range::sym (r));
         else
         {
-               const uint16_t ld = utf16::lead_surr(r);
-               const uint16_t tr = utf16::trail_surr(r);
+               const uint32_t ld = utf16::lead_surr(r);
+               const uint32_t tr = utf16::trail_surr(r);
                 return new CatOp(new MatchOp(Range::sym (ld)), new MatchOp(Range::sym (tr)));
         }
  }
diff --git a/re2c/src/ir/regexp/encoding/utf8/utf8.cc b/re2c/src/ir/regexp/encoding/utf8/utf8.cc

index 8332e1311c29d0d59464575b2dcf21232c85c153..dd4b59ef2c7d30d793f803f5cc6ac1ed7c85c652 100644 (file)
--- a/re2c/src/ir/regexp/encoding/utf8/utf8.cc
+++ b/re2c/src/ir/regexp/encoding/utf8/utf8.cc
@@ -2,22 +2,33 @@
  
  namespace re2c {
  
-uint32_t utf8::rune_to_bytes(uint8_t *str, rune c)
+const uint32_t utf8::ERROR = 0xFFFDu;
+
+const utf8::rune utf8::MAX_1BYTE_RUNE = 0x7Fu;
+const utf8::rune utf8::MAX_2BYTE_RUNE = 0x7FFu;
+const utf8::rune utf8::MAX_3BYTE_RUNE = 0xFFFFu;
+const utf8::rune utf8::MAX_4BYTE_RUNE = 0x10FFFFu;
+const utf8::rune utf8::MAX_RUNE       = utf8::MAX_4BYTE_RUNE;
+
+const uint32_t utf8::PREFIX_1BYTE = 0u;    // 0000 0000
+const uint32_t utf8::INFIX        = 0x80u; // 1000 0000
+const uint32_t utf8::PREFIX_2BYTE = 0xC0u; // 1100 0000
+const uint32_t utf8::PREFIX_3BYTE = 0xE0u; // 1110 0000
+const uint32_t utf8::PREFIX_4BYTE = 0xF0u; // 1111 0000
+
+const uint32_t utf8::SHIFT = 6u;
+const uint32_t utf8::MASK = 0x3Fu; // 0011 1111
+
+uint32_t utf8::rune_to_bytes(uint32_t *str, rune c)
  {
-       /*
-        * one byte sequence
-        *      0-0x7F => 0xxxxxxx
-        */
+       // one byte sequence: 0-0x7F => 0xxxxxxx
         if (c <= MAX_1BYTE_RUNE)
         {
                 str[0] = PREFIX_1BYTE | c;
                 return 1;
         }
  
-       /*
-        * two byte sequence
-        *      0x80-0x7FF => 110xxxxx 10xxxxxx
-        */
+       // two byte sequence: 0x80-0x7FF => 110xxxxx 10xxxxxx
         if (c <= MAX_2BYTE_RUNE)
         {
                 str[0] = PREFIX_2BYTE | (c >> 1*SHIFT);
@@ -25,19 +36,14 @@ uint32_t utf8::rune_to_bytes(uint8_t *str, rune c)
                 return 2;
         }
  
-       /*
-        * If the Rune is out of range, convert it to the error rune.
-        * Do this test here because the error rune encodes to three bytes.
-        * Doing it earlier would duplicate work, since an out of range
-        * Rune wouldn't have fit in one or two bytes.
-        */
+       // If the Rune is out of range, convert it to the error rune.
+       // Do this test here because the error rune encodes to three bytes.
+       // Doing it earlier would duplicate work, since an out of range
+       // Rune wouldn't have fit in one or two bytes.
         if (c > MAX_RUNE)
                 c = ERROR;
  
-       /*
-        * three byte sequence
-        *      0x800 - 0xFFFF => 1110xxxx 10xxxxxx 10xxxxxx
-        */
+       // three byte sequence: 0x800 - 0xFFFF => 1110xxxx 10xxxxxx 10xxxxxx
         if (c <= MAX_3BYTE_RUNE)
         {
                 str[0] = PREFIX_3BYTE | (c >> 2*SHIFT);
@@ -46,10 +52,8 @@ uint32_t utf8::rune_to_bytes(uint8_t *str, rune c)
                 return 3;
         }
  
-       /*
-        * four byte sequence (21-bit value)
-        *     0x10000 - 0x1FFFFF => 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-        */
+       // four byte sequence (21-bit value):
+       // 0x10000 - 0x1FFFFF => 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
         str[0] = PREFIX_4BYTE | (c >> 3*SHIFT);
         str[1] = INFIX        | ((c >> 2*SHIFT) & MASK);
         str[2] = INFIX        | ((c >> 1*SHIFT) & MASK);
diff --git a/re2c/src/ir/regexp/encoding/utf8/utf8.h b/re2c/src/ir/regexp/encoding/utf8/utf8.h

index 72d14ccac3fee1042d7ed35fdfe01c9c8c4eab4b..0ca3142289b66e803d8528bf177d40552838b300 100644 (file)
--- a/re2c/src/ir/regexp/encoding/utf8/utf8.h
+++ b/re2c/src/ir/regexp/encoding/utf8/utf8.h
@@ -10,34 +10,36 @@ class utf8
  public:
         typedef uint32_t rune;
  
-       enum    { MAX_RUNE_LENGTH = 4 /* maximum characters per rune */
+       // maximum characters per rune
+       // enum instead of static const member because of [-Wvla]
+       enum { MAX_RUNE_LENGTH = 4u };
  
-               , ERROR = 0xFFFD /* decoding error */
+       // decoding error
+       static const uint32_t ERROR;
  
-               /* maximal runes for each rune length */
-               , MAX_1BYTE_RUNE = 0x7F
-               , MAX_2BYTE_RUNE = 0x7FF
-               , MAX_3BYTE_RUNE = 0xFFFF
-               , MAX_4BYTE_RUNE = 0x10FFFF
-               , MAX_RUNE       = MAX_4BYTE_RUNE
+       // maximal runes for each rune length
+       static const rune MAX_1BYTE_RUNE;
+       static const rune MAX_2BYTE_RUNE;
+       static const rune MAX_3BYTE_RUNE;
+       static const rune MAX_4BYTE_RUNE;
+       static const rune MAX_RUNE;
  
-               , PREFIX_1BYTE = 0    /* 0000 0000 */
-               , INFIX        = 0x80 /* 1000 0000 */
-               , PREFIX_2BYTE = 0xC0 /* 1100 0000 */
-               , PREFIX_3BYTE = 0xE0 /* 1110 0000 */
-               , PREFIX_4BYTE = 0xF0 /* 1111 0000 */
+       static const uint32_t PREFIX_1BYTE;
+       static const uint32_t INFIX;
+       static const uint32_t PREFIX_2BYTE;
+       static const uint32_t PREFIX_3BYTE;
+       static const uint32_t PREFIX_4BYTE;
  
-               , SHIFT = 6
-               , MASK = 0x3F /* 0011 1111 */
-               };
+       static const uint32_t SHIFT;
+       static const uint32_t MASK;
  
-       /* UTF-8 bytestring for given Unicode rune */
-       static uint32_t rune_to_bytes(uint8_t * s, rune r);
+       // UTF-8 bytestring for given Unicode rune
+       static uint32_t rune_to_bytes(uint32_t * s, rune r);
  
-       /* length of UTF-8 bytestring for given Unicode rune */
+       // length of UTF-8 bytestring for given Unicode rune
         static uint32_t rune_length(rune r);
  
-       /* maximal Unicode rune with given length of UTF-8 bytestring */
+       // maximal Unicode rune with given length of UTF-8 bytestring
         static rune max_rune(uint32_t i);
  };
  
diff --git a/re2c/src/ir/regexp/encoding/utf8/utf8_range.cc b/re2c/src/ir/regexp/encoding/utf8/utf8_range.cc

index 466526af555932ad375c42d1a4b2b4cc1c7a8e99..147a55738f7bb1e17c85eb94c92607fbe260251a 100644 (file)
--- a/re2c/src/ir/regexp/encoding/utf8/utf8_range.cc
+++ b/re2c/src/ir/regexp/encoding/utf8/utf8_range.cc
@@ -9,8 +9,8 @@ namespace re2c {
   */
  void UTF8addContinuous(RangeSuffix * & root, utf8::rune l, utf8::rune h, uint32_t n)
  {
-       uint8_t cl[utf8::MAX_RUNE_LENGTH];
-       uint8_t ch[utf8::MAX_RUNE_LENGTH];
+       uint32_t cl[utf8::MAX_RUNE_LENGTH];
+       uint32_t ch[utf8::MAX_RUNE_LENGTH];
         utf8::rune_to_bytes(cl, l);
         utf8::rune_to_bytes(ch, h);
  
diff --git a/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc b/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc

index e6bd1a20a87d6399182cc2d4d6eb5bf3a90374b4..8b74636662d87ff8fa5d6168720b5bd920e3583b 100644 (file)
--- a/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc
+++ b/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc
@@ -8,7 +8,7 @@ namespace re2c {
  
  RegExp * UTF8Symbol(utf8::rune r)
  {
-       uint8_t chars[utf8::MAX_RUNE_LENGTH];
+       uint32_t chars[utf8::MAX_RUNE_LENGTH];
         const int chars_count = utf8::rune_to_bytes(chars, r);
         RegExp * re = new MatchOp(Range::sym (chars[0]));
         for (int i = 1; i < chars_count; ++i)
author	Ulya Trofimovich <skvadrik@gmail.com>
	Sun, 9 Aug 2015 18:07:10 +0000 (19:07 +0100)
committer	Ulya Trofimovich <skvadrik@gmail.com>
	Sun, 9 Aug 2015 18:07:10 +0000 (19:07 +0100)
re2c/src/ir/regexp/encoding/utf16/utf16.cc		patch \| blob \| history
re2c/src/ir/regexp/encoding/utf16/utf16.h		patch \| blob \| history
re2c/src/ir/regexp/encoding/utf16/utf16_range.cc		patch \| blob \| history
re2c/src/ir/regexp/encoding/utf16/utf16_range.h		patch \| blob \| history
re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc		patch \| blob \| history
re2c/src/ir/regexp/encoding/utf8/utf8.cc		patch \| blob \| history
re2c/src/ir/regexp/encoding/utf8/utf8.h		patch \| blob \| history
re2c/src/ir/regexp/encoding/utf8/utf8_range.cc		patch \| blob \| history
re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc		patch \| blob \| history