From: Ulya Trofimovich Date: Sun, 9 Aug 2015 18:07:10 +0000 (+0100) Subject: Encodings: use 32-bit unsigned arithmetics instead of 8-bit and 16-bit. X-Git-Tag: 0.15~156 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=e71aedbee438ef608b5e2b89e7411e9e112af4b0;p=re2c Encodings: use 32-bit unsigned arithmetics instead of 8-bit and 16-bit. 8-bit and 16-bit unsigned integers used in arithmetic operations are promoted to 32 bits before the operation and then truncated back. Theoretically, the truncation may change their value. This fixes a lot of [-Wconversion] warnings. --- diff --git a/re2c/src/ir/regexp/encoding/utf16/utf16.cc b/re2c/src/ir/regexp/encoding/utf16/utf16.cc index 47743edf..4b0a13bb 100644 --- a/re2c/src/ir/regexp/encoding/utf16/utf16.cc +++ b/re2c/src/ir/regexp/encoding/utf16/utf16.cc @@ -2,9 +2,9 @@ namespace re2c { -const uint32_t utf16::MAX_1WORD_RUNE = 0xFFFF; -const uint32_t utf16::MIN_LEAD_SURR = 0xD800; -const uint32_t utf16::MIN_TRAIL_SURR = 0xDC00; -const uint32_t utf16::MAX_TRAIL_SURR = 0xDFFF; +const uint32_t utf16::MAX_1WORD_RUNE = 0xFFFFu; +const uint32_t utf16::MIN_LEAD_SURR = 0xD800u; +const uint32_t utf16::MIN_TRAIL_SURR = 0xDC00u; +const uint32_t utf16::MAX_TRAIL_SURR = 0xDFFFu; } // namespace re2c diff --git a/re2c/src/ir/regexp/encoding/utf16/utf16.h b/re2c/src/ir/regexp/encoding/utf16/utf16.h index d64de349..89cdbdbd 100644 --- a/re2c/src/ir/regexp/encoding/utf16/utf16.h +++ b/re2c/src/ir/regexp/encoding/utf16/utf16.h @@ -16,20 +16,20 @@ public: static const uint32_t MAX_TRAIL_SURR; /* leading surrogate of UTF-16 symbol */ - static inline uint16_t lead_surr(rune r); + static inline uint32_t lead_surr(rune r); /* trailing surrogate of UTF-16 symbol */ - static inline uint16_t trail_surr(rune r); + static inline uint32_t trail_surr(rune r); }; -inline uint16_t utf16::lead_surr(rune r) +inline uint32_t utf16::lead_surr(rune r) { - return ((r - 0x10000) / 0x400) + MIN_LEAD_SURR; + return ((r - 0x10000u) / 0x400u) + MIN_LEAD_SURR; } 
-inline uint16_t utf16::trail_surr(rune r) +inline uint32_t utf16::trail_surr(rune r) { - return ((r - 0x10000) % 0x400) + MIN_TRAIL_SURR; + return ((r - 0x10000u) % 0x400u) + MIN_TRAIL_SURR; } } // namespace re2c diff --git a/re2c/src/ir/regexp/encoding/utf16/utf16_range.cc b/re2c/src/ir/regexp/encoding/utf16/utf16_range.cc index c4f27f6c..1cc8c2a6 100644 --- a/re2c/src/ir/regexp/encoding/utf16/utf16_range.cc +++ b/re2c/src/ir/regexp/encoding/utf16/utf16_range.cc @@ -6,7 +6,7 @@ namespace re2c { /* * Add word range [w1-w2]. */ -void UTF16addContinuous1(RangeSuffix * & root, uint16_t l, uint16_t h) +void UTF16addContinuous1(RangeSuffix * & root, uint32_t l, uint32_t h) { RangeSuffix ** p = &root; for (;;) @@ -29,7 +29,7 @@ void UTF16addContinuous1(RangeSuffix * & root, uint16_t l, uint16_t h) * Now that we have catenation of word ranges [l1-h1],[l2-h2], * we want to add it to existing range, merging suffixes on the fly. */ -void UTF16addContinuous2(RangeSuffix * & root, uint16_t l_ld, uint16_t h_ld, uint16_t l_tr, uint16_t h_tr) +void UTF16addContinuous2(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr) { RangeSuffix ** p = &root; for (;;) @@ -90,7 +90,7 @@ void UTF16addContinuous2(RangeSuffix * & root, uint16_t l_ld, uint16_t h_ld, uin * and represents original range as alternation of continuous * sub-ranges. 
*/ -void UTF16splitByContinuity(RangeSuffix * & root, uint16_t l_ld, uint16_t h_ld, uint16_t l_tr, uint16_t h_tr) +void UTF16splitByContinuity(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr) { if (l_ld != h_ld) { @@ -120,21 +120,25 @@ void UTF16splitByContinuity(RangeSuffix * & root, uint16_t l_ld, uint16_t h_ld, void UTF16splitByRuneLength(RangeSuffix * & root, utf16::rune l, utf16::rune h) { if (l <= utf16::MAX_1WORD_RUNE) + { if (h <= utf16::MAX_1WORD_RUNE) + { UTF16addContinuous1(root, l, h); + } else { UTF16addContinuous1(root, l, utf16::MAX_1WORD_RUNE); - const uint16_t h_ld = utf16::lead_surr(h); - const uint16_t h_tr = utf16::trail_surr(h); + const uint32_t h_ld = utf16::lead_surr(h); + const uint32_t h_tr = utf16::trail_surr(h); UTF16splitByContinuity(root, utf16::MIN_LEAD_SURR, h_ld, utf16::MIN_TRAIL_SURR, h_tr); } + } else { - const uint16_t l_ld = utf16::lead_surr(l); - const uint16_t l_tr = utf16::trail_surr(l); - const uint16_t h_ld = utf16::lead_surr(h); - const uint16_t h_tr = utf16::trail_surr(h); + const uint32_t l_ld = utf16::lead_surr(l); + const uint32_t l_tr = utf16::trail_surr(l); + const uint32_t h_ld = utf16::lead_surr(h); + const uint32_t h_tr = utf16::trail_surr(h); UTF16splitByContinuity(root, l_ld, h_ld, l_tr, h_tr); } } diff --git a/re2c/src/ir/regexp/encoding/utf16/utf16_range.h b/re2c/src/ir/regexp/encoding/utf16/utf16_range.h index 4e75560d..43b3869e 100644 --- a/re2c/src/ir/regexp/encoding/utf16/utf16_range.h +++ b/re2c/src/ir/regexp/encoding/utf16/utf16_range.h @@ -6,9 +6,9 @@ namespace re2c { -void UTF16addContinuous1(RangeSuffix * & root, uint16_t l, uint16_t h); -void UTF16addContinuous2(RangeSuffix * & root, uint16_t l_ld, uint16_t h_ld, uint16_t l_tr, uint16_t h_tr); -void UTF16splitByContinuity(RangeSuffix * & root, uint16_t l_ld, uint16_t h_ld, uint16_t l_tr, uint16_t h_tr); +void UTF16addContinuous1(RangeSuffix * & root, uint32_t l, uint32_t h); +void UTF16addContinuous2(RangeSuffix * & 
root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr); +void UTF16splitByContinuity(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr); void UTF16splitByRuneLength(RangeSuffix * & root, utf16::rune l, utf16::rune h); } // namespace re2c diff --git a/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc b/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc index b8fef48a..68e05a89 100644 --- a/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc +++ b/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc @@ -12,8 +12,8 @@ RegExp * UTF16Symbol(utf16::rune r) return new MatchOp(Range::sym (r)); else { - const uint16_t ld = utf16::lead_surr(r); - const uint16_t tr = utf16::trail_surr(r); + const uint32_t ld = utf16::lead_surr(r); + const uint32_t tr = utf16::trail_surr(r); return new CatOp(new MatchOp(Range::sym (ld)), new MatchOp(Range::sym (tr))); } } diff --git a/re2c/src/ir/regexp/encoding/utf8/utf8.cc b/re2c/src/ir/regexp/encoding/utf8/utf8.cc index 8332e131..dd4b59ef 100644 --- a/re2c/src/ir/regexp/encoding/utf8/utf8.cc +++ b/re2c/src/ir/regexp/encoding/utf8/utf8.cc @@ -2,22 +2,33 @@ namespace re2c { -uint32_t utf8::rune_to_bytes(uint8_t *str, rune c) +const uint32_t utf8::ERROR = 0xFFFDu; + +const utf8::rune utf8::MAX_1BYTE_RUNE = 0x7Fu; +const utf8::rune utf8::MAX_2BYTE_RUNE = 0x7FFu; +const utf8::rune utf8::MAX_3BYTE_RUNE = 0xFFFFu; +const utf8::rune utf8::MAX_4BYTE_RUNE = 0x10FFFFu; +const utf8::rune utf8::MAX_RUNE = utf8::MAX_4BYTE_RUNE; + +const uint32_t utf8::PREFIX_1BYTE = 0u; // 0000 0000 +const uint32_t utf8::INFIX = 0x80u; // 1000 0000 +const uint32_t utf8::PREFIX_2BYTE = 0xC0u; // 1100 0000 +const uint32_t utf8::PREFIX_3BYTE = 0xE0u; // 1110 0000 +const uint32_t utf8::PREFIX_4BYTE = 0xF0u; // 1111 0000 + +const uint32_t utf8::SHIFT = 6u; +const uint32_t utf8::MASK = 0x3Fu; // 0011 1111 + +uint32_t utf8::rune_to_bytes(uint32_t *str, rune c) { - /* - * one byte sequence - * 0-0x7F => 0xxxxxxx - */ + // one byte sequence: 
0-0x7F => 0xxxxxxx if (c <= MAX_1BYTE_RUNE) { str[0] = PREFIX_1BYTE | c; return 1; } - /* - * two byte sequence - * 0x80-0x7FF => 110xxxxx 10xxxxxx - */ + // two byte sequence: 0x80-0x7FF => 110xxxxx 10xxxxxx if (c <= MAX_2BYTE_RUNE) { str[0] = PREFIX_2BYTE | (c >> 1*SHIFT); @@ -25,19 +36,14 @@ uint32_t utf8::rune_to_bytes(uint8_t *str, rune c) return 2; } - /* - * If the Rune is out of range, convert it to the error rune. - * Do this test here because the error rune encodes to three bytes. - * Doing it earlier would duplicate work, since an out of range - * Rune wouldn't have fit in one or two bytes. - */ + // If the Rune is out of range, convert it to the error rune. + // Do this test here because the error rune encodes to three bytes. + // Doing it earlier would duplicate work, since an out of range + // Rune wouldn't have fit in one or two bytes. if (c > MAX_RUNE) c = ERROR; - /* - * three byte sequence - * 0x800 - 0xFFFF => 1110xxxx 10xxxxxx 10xxxxxx - */ + // three byte sequence: 0x800 - 0xFFFF => 1110xxxx 10xxxxxx 10xxxxxx if (c <= MAX_3BYTE_RUNE) { str[0] = PREFIX_3BYTE | (c >> 2*SHIFT); @@ -46,10 +52,8 @@ uint32_t utf8::rune_to_bytes(uint8_t *str, rune c) return 3; } - /* - * four byte sequence (21-bit value) - * 0x10000 - 0x1FFFFF => 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - */ + // four byte sequence (21-bit value): + // 0x10000 - 0x1FFFFF => 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx str[0] = PREFIX_4BYTE | (c >> 3*SHIFT); str[1] = INFIX | ((c >> 2*SHIFT) & MASK); str[2] = INFIX | ((c >> 1*SHIFT) & MASK); diff --git a/re2c/src/ir/regexp/encoding/utf8/utf8.h b/re2c/src/ir/regexp/encoding/utf8/utf8.h index 72d14cca..0ca31422 100644 --- a/re2c/src/ir/regexp/encoding/utf8/utf8.h +++ b/re2c/src/ir/regexp/encoding/utf8/utf8.h @@ -10,34 +10,36 @@ class utf8 public: typedef uint32_t rune; - enum { MAX_RUNE_LENGTH = 4 /* maximum characters per rune */ + // maximum characters per rune + // enum instead of static const member because of [-Wvla] + enum { MAX_RUNE_LENGTH = 4u 
}; - , ERROR = 0xFFFD /* decoding error */ + // decoding error + static const uint32_t ERROR; - /* maximal runes for each rune length */ - , MAX_1BYTE_RUNE = 0x7F - , MAX_2BYTE_RUNE = 0x7FF - , MAX_3BYTE_RUNE = 0xFFFF - , MAX_4BYTE_RUNE = 0x10FFFF - , MAX_RUNE = MAX_4BYTE_RUNE + // maximal runes for each rune length + static const rune MAX_1BYTE_RUNE; + static const rune MAX_2BYTE_RUNE; + static const rune MAX_3BYTE_RUNE; + static const rune MAX_4BYTE_RUNE; + static const rune MAX_RUNE; - , PREFIX_1BYTE = 0 /* 0000 0000 */ - , INFIX = 0x80 /* 1000 0000 */ - , PREFIX_2BYTE = 0xC0 /* 1100 0000 */ - , PREFIX_3BYTE = 0xE0 /* 1110 0000 */ - , PREFIX_4BYTE = 0xF0 /* 1111 0000 */ + static const uint32_t PREFIX_1BYTE; + static const uint32_t INFIX; + static const uint32_t PREFIX_2BYTE; + static const uint32_t PREFIX_3BYTE; + static const uint32_t PREFIX_4BYTE; - , SHIFT = 6 - , MASK = 0x3F /* 0011 1111 */ - }; + static const uint32_t SHIFT; + static const uint32_t MASK; - /* UTF-8 bytestring for given Unicode rune */ - static uint32_t rune_to_bytes(uint8_t * s, rune r); + // UTF-8 bytestring for given Unicode rune + static uint32_t rune_to_bytes(uint32_t * s, rune r); - /* length of UTF-8 bytestring for given Unicode rune */ + // length of UTF-8 bytestring for given Unicode rune static uint32_t rune_length(rune r); - /* maximal Unicode rune with given length of UTF-8 bytestring */ + // maximal Unicode rune with given length of UTF-8 bytestring static rune max_rune(uint32_t i); }; diff --git a/re2c/src/ir/regexp/encoding/utf8/utf8_range.cc b/re2c/src/ir/regexp/encoding/utf8/utf8_range.cc index 466526af..147a5573 100644 --- a/re2c/src/ir/regexp/encoding/utf8/utf8_range.cc +++ b/re2c/src/ir/regexp/encoding/utf8/utf8_range.cc @@ -9,8 +9,8 @@ namespace re2c { */ void UTF8addContinuous(RangeSuffix * & root, utf8::rune l, utf8::rune h, uint32_t n) { - uint8_t cl[utf8::MAX_RUNE_LENGTH]; - uint8_t ch[utf8::MAX_RUNE_LENGTH]; + uint32_t cl[utf8::MAX_RUNE_LENGTH]; + uint32_t 
ch[utf8::MAX_RUNE_LENGTH]; utf8::rune_to_bytes(cl, l); utf8::rune_to_bytes(ch, h); diff --git a/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc b/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc index e6bd1a20..8b746366 100644 --- a/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc +++ b/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc @@ -8,7 +8,7 @@ namespace re2c { RegExp * UTF8Symbol(utf8::rune r) { - uint8_t chars[utf8::MAX_RUNE_LENGTH]; + uint32_t chars[utf8::MAX_RUNE_LENGTH]; const int chars_count = utf8::rune_to_bytes(chars, r); RegExp * re = new MatchOp(Range::sym (chars[0])); for (int i = 1; i < chars_count; ++i)