From: Ulya Trofimovich <skvadrik@gmail.com>
Date: Sun, 9 Aug 2015 18:07:10 +0000 (+0100)
Subject: Encodings: use 32-bit unsigned arithmetic instead of 8-bit and 16-bit.
X-Git-Tag: 0.15~156
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=e71aedbee438ef608b5e2b89e7411e9e112af4b0;p=re2c

Encodings: use 32-bit unsigned arithmetic instead of 8-bit and 16-bit.

8-bit and 16-bit unsigned integers used in arithmetic operations
are promoted to 32 bits before the operation, and the result is then
truncated back. In theory, this truncation may change the value.

This fixes a lot of [-Wconversion] warnings.
---

diff --git a/re2c/src/ir/regexp/encoding/utf16/utf16.cc b/re2c/src/ir/regexp/encoding/utf16/utf16.cc
index 47743edf..4b0a13bb 100644
--- a/re2c/src/ir/regexp/encoding/utf16/utf16.cc
+++ b/re2c/src/ir/regexp/encoding/utf16/utf16.cc
@@ -2,9 +2,9 @@
 
 namespace re2c {
 
-const uint32_t utf16::MAX_1WORD_RUNE	= 0xFFFF;
-const uint32_t utf16::MIN_LEAD_SURR	= 0xD800;
-const uint32_t utf16::MIN_TRAIL_SURR	= 0xDC00;
-const uint32_t utf16::MAX_TRAIL_SURR	= 0xDFFF;
+const uint32_t utf16::MAX_1WORD_RUNE	= 0xFFFFu;
+const uint32_t utf16::MIN_LEAD_SURR	= 0xD800u;
+const uint32_t utf16::MIN_TRAIL_SURR	= 0xDC00u;
+const uint32_t utf16::MAX_TRAIL_SURR	= 0xDFFFu;
 
 } // namespace re2c
diff --git a/re2c/src/ir/regexp/encoding/utf16/utf16.h b/re2c/src/ir/regexp/encoding/utf16/utf16.h
index d64de349..89cdbdbd 100644
--- a/re2c/src/ir/regexp/encoding/utf16/utf16.h
+++ b/re2c/src/ir/regexp/encoding/utf16/utf16.h
@@ -16,20 +16,20 @@ public:
 	static const uint32_t MAX_TRAIL_SURR;
 
 	/* leading surrogate of UTF-16 symbol */
-	static inline uint16_t lead_surr(rune r);
+	static inline uint32_t lead_surr(rune r);
 
 	/* trailing surrogate of UTF-16 symbol */
-	static inline uint16_t trail_surr(rune r);
+	static inline uint32_t trail_surr(rune r);
 };
 
-inline uint16_t utf16::lead_surr(rune r)
+inline uint32_t utf16::lead_surr(rune r)
 {
-	return ((r - 0x10000) / 0x400) + MIN_LEAD_SURR;
+	return ((r - 0x10000u) / 0x400u) + MIN_LEAD_SURR;
 }
 
-inline uint16_t utf16::trail_surr(rune r)
+inline uint32_t utf16::trail_surr(rune r)
 {
-	return ((r - 0x10000) % 0x400) + MIN_TRAIL_SURR;
+	return ((r - 0x10000u) % 0x400u) + MIN_TRAIL_SURR;
 }
 
 }  // namespace re2c
diff --git a/re2c/src/ir/regexp/encoding/utf16/utf16_range.cc b/re2c/src/ir/regexp/encoding/utf16/utf16_range.cc
index c4f27f6c..1cc8c2a6 100644
--- a/re2c/src/ir/regexp/encoding/utf16/utf16_range.cc
+++ b/re2c/src/ir/regexp/encoding/utf16/utf16_range.cc
@@ -6,7 +6,7 @@ namespace re2c {
 /*
  * Add word range [w1-w2].
  */
-void UTF16addContinuous1(RangeSuffix * & root, uint16_t l, uint16_t h)
+void UTF16addContinuous1(RangeSuffix * & root, uint32_t l, uint32_t h)
 {
 	RangeSuffix ** p = &root;
 	for (;;)
@@ -29,7 +29,7 @@ void UTF16addContinuous1(RangeSuffix * & root, uint16_t l, uint16_t h)
  * Now that we have catenation of word ranges [l1-h1],[l2-h2],
  * we want to add it to existing range, merging suffixes on the fly.
  */
-void UTF16addContinuous2(RangeSuffix * & root, uint16_t l_ld, uint16_t h_ld, uint16_t l_tr, uint16_t h_tr)
+void UTF16addContinuous2(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr)
 {
 	RangeSuffix ** p = &root;
 	for (;;)
@@ -90,7 +90,7 @@ void UTF16addContinuous2(RangeSuffix * & root, uint16_t l_ld, uint16_t h_ld, uin
  * and represents original range as alternation of continuous
  * sub-ranges.
  */
-void UTF16splitByContinuity(RangeSuffix * & root, uint16_t l_ld, uint16_t h_ld, uint16_t l_tr, uint16_t h_tr)
+void UTF16splitByContinuity(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr)
 {
 	if (l_ld != h_ld)
 	{
@@ -120,21 +120,25 @@ void UTF16splitByContinuity(RangeSuffix * & root, uint16_t l_ld, uint16_t h_ld,
 void UTF16splitByRuneLength(RangeSuffix * & root, utf16::rune l, utf16::rune h)
 {
 	if (l <= utf16::MAX_1WORD_RUNE)
+	{
 		if (h <= utf16::MAX_1WORD_RUNE)
+		{
 			UTF16addContinuous1(root, l, h);
+		}
 		else
 		{
 			UTF16addContinuous1(root, l, utf16::MAX_1WORD_RUNE);
-			const uint16_t h_ld = utf16::lead_surr(h);
-			const uint16_t h_tr = utf16::trail_surr(h);
+			const uint32_t h_ld = utf16::lead_surr(h);
+			const uint32_t h_tr = utf16::trail_surr(h);
 			UTF16splitByContinuity(root, utf16::MIN_LEAD_SURR, h_ld, utf16::MIN_TRAIL_SURR, h_tr);
 		}
+	}
 	else
 	{
-			const uint16_t l_ld = utf16::lead_surr(l);
-			const uint16_t l_tr = utf16::trail_surr(l);
-			const uint16_t h_ld = utf16::lead_surr(h);
-			const uint16_t h_tr = utf16::trail_surr(h);
+			const uint32_t l_ld = utf16::lead_surr(l);
+			const uint32_t l_tr = utf16::trail_surr(l);
+			const uint32_t h_ld = utf16::lead_surr(h);
+			const uint32_t h_tr = utf16::trail_surr(h);
 			UTF16splitByContinuity(root, l_ld, h_ld, l_tr, h_tr);
 	}
 }
diff --git a/re2c/src/ir/regexp/encoding/utf16/utf16_range.h b/re2c/src/ir/regexp/encoding/utf16/utf16_range.h
index 4e75560d..43b3869e 100644
--- a/re2c/src/ir/regexp/encoding/utf16/utf16_range.h
+++ b/re2c/src/ir/regexp/encoding/utf16/utf16_range.h
@@ -6,9 +6,9 @@
 
 namespace re2c {
 
-void UTF16addContinuous1(RangeSuffix * & root, uint16_t l, uint16_t h);
-void UTF16addContinuous2(RangeSuffix * & root, uint16_t l_ld, uint16_t h_ld, uint16_t l_tr, uint16_t h_tr);
-void UTF16splitByContinuity(RangeSuffix * & root, uint16_t l_ld, uint16_t h_ld, uint16_t l_tr, uint16_t h_tr);
+void UTF16addContinuous1(RangeSuffix * & root, uint32_t l, uint32_t h);
+void UTF16addContinuous2(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr);
+void UTF16splitByContinuity(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr);
 void UTF16splitByRuneLength(RangeSuffix * & root, utf16::rune l, utf16::rune h);
 
 } // namespace re2c
diff --git a/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc b/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc
index b8fef48a..68e05a89 100644
--- a/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc
+++ b/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc
@@ -12,8 +12,8 @@ RegExp * UTF16Symbol(utf16::rune r)
 		return new MatchOp(Range::sym (r));
 	else
 	{
-		const uint16_t ld = utf16::lead_surr(r);
-		const uint16_t tr = utf16::trail_surr(r);
+		const uint32_t ld = utf16::lead_surr(r);
+		const uint32_t tr = utf16::trail_surr(r);
 		return new CatOp(new MatchOp(Range::sym (ld)), new MatchOp(Range::sym (tr)));
 	}
 }
diff --git a/re2c/src/ir/regexp/encoding/utf8/utf8.cc b/re2c/src/ir/regexp/encoding/utf8/utf8.cc
index 8332e131..dd4b59ef 100644
--- a/re2c/src/ir/regexp/encoding/utf8/utf8.cc
+++ b/re2c/src/ir/regexp/encoding/utf8/utf8.cc
@@ -2,22 +2,33 @@
 
 namespace re2c {
 
-uint32_t utf8::rune_to_bytes(uint8_t *str, rune c)
+const uint32_t utf8::ERROR = 0xFFFDu;
+
+const utf8::rune utf8::MAX_1BYTE_RUNE = 0x7Fu;
+const utf8::rune utf8::MAX_2BYTE_RUNE = 0x7FFu;
+const utf8::rune utf8::MAX_3BYTE_RUNE = 0xFFFFu;
+const utf8::rune utf8::MAX_4BYTE_RUNE = 0x10FFFFu;
+const utf8::rune utf8::MAX_RUNE       = utf8::MAX_4BYTE_RUNE;
+
+const uint32_t utf8::PREFIX_1BYTE = 0u;    // 0000 0000
+const uint32_t utf8::INFIX        = 0x80u; // 1000 0000
+const uint32_t utf8::PREFIX_2BYTE = 0xC0u; // 1100 0000
+const uint32_t utf8::PREFIX_3BYTE = 0xE0u; // 1110 0000
+const uint32_t utf8::PREFIX_4BYTE = 0xF0u; // 1111 0000
+
+const uint32_t utf8::SHIFT = 6u;
+const uint32_t utf8::MASK = 0x3Fu; // 0011 1111
+
+uint32_t utf8::rune_to_bytes(uint32_t *str, rune c)
 {
-	/*
-	 * one byte sequence
-	 *	0-0x7F => 0xxxxxxx
-	 */
+	// one byte sequence: 0-0x7F => 0xxxxxxx
 	if (c <= MAX_1BYTE_RUNE)
 	{
 		str[0] = PREFIX_1BYTE | c;
 		return 1;
 	}
 
-	/*
-	 * two byte sequence
-	 *	0x80-0x7FF => 110xxxxx 10xxxxxx
-	 */
+	// two byte sequence: 0x80-0x7FF => 110xxxxx 10xxxxxx
 	if (c <= MAX_2BYTE_RUNE)
 	{
 		str[0] = PREFIX_2BYTE | (c >> 1*SHIFT);
@@ -25,19 +36,14 @@ uint32_t utf8::rune_to_bytes(uint8_t *str, rune c)
 		return 2;
 	}
 
-	/*
-	 * If the Rune is out of range, convert it to the error rune.
-	 * Do this test here because the error rune encodes to three bytes.
-	 * Doing it earlier would duplicate work, since an out of range
-	 * Rune wouldn't have fit in one or two bytes.
-	 */
+	// If the Rune is out of range, convert it to the error rune.
+	// Do this test here because the error rune encodes to three bytes.
+	// Doing it earlier would duplicate work, since an out of range
+	// Rune wouldn't have fit in one or two bytes.
 	if (c > MAX_RUNE)
 		c = ERROR;
 
-	/*
-	 * three byte sequence
-	 *	0x800 - 0xFFFF => 1110xxxx 10xxxxxx 10xxxxxx
-	 */
+	// three byte sequence: 0x800 - 0xFFFF => 1110xxxx 10xxxxxx 10xxxxxx
 	if (c <= MAX_3BYTE_RUNE)
 	{
 		str[0] = PREFIX_3BYTE | (c >> 2*SHIFT);
@@ -46,10 +52,8 @@ uint32_t utf8::rune_to_bytes(uint8_t *str, rune c)
 		return 3;
 	}
 
-	/*
-	 * four byte sequence (21-bit value)
-	 *     0x10000 - 0x1FFFFF => 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-	 */
+	// four byte sequence (21-bit value):
+	// 0x10000 - 0x1FFFFF => 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 	str[0] = PREFIX_4BYTE | (c >> 3*SHIFT);
 	str[1] = INFIX        | ((c >> 2*SHIFT) & MASK);
 	str[2] = INFIX        | ((c >> 1*SHIFT) & MASK);
diff --git a/re2c/src/ir/regexp/encoding/utf8/utf8.h b/re2c/src/ir/regexp/encoding/utf8/utf8.h
index 72d14cca..0ca31422 100644
--- a/re2c/src/ir/regexp/encoding/utf8/utf8.h
+++ b/re2c/src/ir/regexp/encoding/utf8/utf8.h
@@ -10,34 +10,36 @@ class utf8
 public:
 	typedef uint32_t rune;
 
-	enum	{ MAX_RUNE_LENGTH = 4 /* maximum characters per rune */
+	// maximum characters per rune
+	// enum instead of static const member because of [-Wvla]
+	enum { MAX_RUNE_LENGTH = 4u };
 
-		, ERROR = 0xFFFD /* decoding error */
+	// decoding error
+	static const uint32_t ERROR;
 
-		/* maximal runes for each rune length */
-		, MAX_1BYTE_RUNE = 0x7F
-		, MAX_2BYTE_RUNE = 0x7FF
-		, MAX_3BYTE_RUNE = 0xFFFF
-		, MAX_4BYTE_RUNE = 0x10FFFF
-		, MAX_RUNE       = MAX_4BYTE_RUNE
+	// maximal runes for each rune length
+	static const rune MAX_1BYTE_RUNE;
+	static const rune MAX_2BYTE_RUNE;
+	static const rune MAX_3BYTE_RUNE;
+	static const rune MAX_4BYTE_RUNE;
+	static const rune MAX_RUNE;
 
-		, PREFIX_1BYTE = 0    /* 0000 0000 */
-		, INFIX        = 0x80 /* 1000 0000 */
-		, PREFIX_2BYTE = 0xC0 /* 1100 0000 */
-		, PREFIX_3BYTE = 0xE0 /* 1110 0000 */
-		, PREFIX_4BYTE = 0xF0 /* 1111 0000 */
+	static const uint32_t PREFIX_1BYTE;
+	static const uint32_t INFIX;
+	static const uint32_t PREFIX_2BYTE;
+	static const uint32_t PREFIX_3BYTE;
+	static const uint32_t PREFIX_4BYTE;
 
-		, SHIFT = 6
-		, MASK = 0x3F /* 0011 1111 */
-		};
+	static const uint32_t SHIFT;
+	static const uint32_t MASK;
 
-	/* UTF-8 bytestring for given Unicode rune */
-	static uint32_t rune_to_bytes(uint8_t * s, rune r);
+	// UTF-8 bytestring for given Unicode rune
+	static uint32_t rune_to_bytes(uint32_t * s, rune r);
 
-	/* length of UTF-8 bytestring for given Unicode rune */
+	// length of UTF-8 bytestring for given Unicode rune
 	static uint32_t rune_length(rune r);
 
-	/* maximal Unicode rune with given length of UTF-8 bytestring */
+	// maximal Unicode rune with given length of UTF-8 bytestring
 	static rune max_rune(uint32_t i);
 };
 
diff --git a/re2c/src/ir/regexp/encoding/utf8/utf8_range.cc b/re2c/src/ir/regexp/encoding/utf8/utf8_range.cc
index 466526af..147a5573 100644
--- a/re2c/src/ir/regexp/encoding/utf8/utf8_range.cc
+++ b/re2c/src/ir/regexp/encoding/utf8/utf8_range.cc
@@ -9,8 +9,8 @@ namespace re2c {
  */
 void UTF8addContinuous(RangeSuffix * & root, utf8::rune l, utf8::rune h, uint32_t n)
 {
-	uint8_t cl[utf8::MAX_RUNE_LENGTH];
-	uint8_t ch[utf8::MAX_RUNE_LENGTH];
+	uint32_t cl[utf8::MAX_RUNE_LENGTH];
+	uint32_t ch[utf8::MAX_RUNE_LENGTH];
 	utf8::rune_to_bytes(cl, l);
 	utf8::rune_to_bytes(ch, h);
 
diff --git a/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc b/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc
index e6bd1a20..8b746366 100644
--- a/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc
+++ b/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc
@@ -8,7 +8,7 @@ namespace re2c {
 
 RegExp * UTF8Symbol(utf8::rune r)
 {
-	uint8_t chars[utf8::MAX_RUNE_LENGTH];
+	uint32_t chars[utf8::MAX_RUNE_LENGTH];
 	const int chars_count = utf8::rune_to_bytes(chars, r);
 	RegExp * re = new MatchOp(Range::sym (chars[0]));
 	for (int i = 1; i < chars_count; ++i)