From: Ulya Trofimovich <skvadrik@gmail.com>
Date: Thu, 20 Aug 2015 11:09:52 +0000 (+0100)
Subject: Check if code point exceeds maximum. Correctly cast 'char' to 'uint32_t'.
X-Git-Tag: 0.15~107
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=0346d1666f58da5dbe35d156e0487d95354153ff;p=re2c

Check if code point exceeds maximum. Correctly cast 'char' to 'uint32_t'.

First fix:
re2c used to check whether a code point exceeds the maximum value for
the current encoding when parsing it. When I moved code point parsing
into the lexer, I forgot to carry over that check.

Second fix:
I assumed that 'static_cast<uint32_t>' on 'signed char' zero-extends.
But I was wrong: it sign-extends. Need to cast to 'unsigned char'
instead.
---

diff --git a/re2c/bootstrap/src/parse/scanner_lex.cc b/re2c/bootstrap/src/parse/scanner_lex.cc
index 88f14d5f..bb77bd47 100644
--- a/re2c/bootstrap/src/parse/scanner_lex.cc
+++ b/re2c/bootstrap/src/parse/scanner_lex.cc
@@ -1,4 +1,4 @@
-/* Generated by re2c 0.14.3 on Wed Aug 19 18:44:26 2015 */
+/* Generated by re2c 0.14.3 on Thu Aug 20 12:08:21 2015 */
 #line 1 "../src/parse/scanner_lex.re"
 #include <stdlib.h>
 #include <string.h>
@@ -1512,7 +1512,7 @@ yy237:
 		}
 		else
 		{
-			cpoints.push_back (static_cast<uint32_t> (c));
+			cpoints.push_back (static_cast<uint8_t> (c));
 			goto cpoints;
 		}
 	}
@@ -1526,7 +1526,7 @@ yy239:
 		{
 			warn.useless_escape (tline, tok - pos, c);
 		}
-		cpoints.push_back (static_cast<uint32_t> (c));
+		cpoints.push_back (static_cast<uint8_t> (c));
 		goto cpoints;
 	}
 #line 1533 "src/parse/scanner_lex.cc"
diff --git a/re2c/src/ir/regexp/encoding/enc.cc b/re2c/src/ir/regexp/encoding/enc.cc
index 65857cab..029d525c 100644
--- a/re2c/src/ir/regexp/encoding/enc.cc
+++ b/re2c/src/ir/regexp/encoding/enc.cc
@@ -54,19 +54,23 @@ const uint32_t Enc::ebc2asc[256] =
  * it is assumed that user considers it to be valid.
  * We must check it.
  *
- * Returns false if this code point is forbidden
- * by current policy, otherwise returns true.
- * Overwrites code point.
+ * Returns false if this code point exceeds maximum
+ * or is forbidden by current policy, otherwise
+ * returns true. Overwrites code point.
  */
 bool Enc::encode(uint32_t & c) const
 {
+	if (c >= nCodePoints ())
+	{
+		return false;
+	}
+
 	switch (type)
 	{
 		case ASCII:
-			c &= 0xFF;
 			return true;
 		case EBCDIC:
-			c = asc2ebc[c & 0xFF];
+			c = asc2ebc[c];
 			return true;
 		case UCS2:
 		case UTF16:
@@ -120,29 +124,30 @@ uint32_t Enc::decodeUnsafe(uint32_t c) const
  * it is assumed that user considers that all code
  * points from this range are valid. re2c must check it.
  *
- * Returns NULL if range contains code points forbidden
- * by current policy, otherwise returns pointer to newly
- * constructed Range.
+ * Returns NULL if range contains code points that
+ * exceed maximum or are forbidden by current policy,
+ * otherwise returns pointer to newly constructed range.
  */
 Range * Enc::encodeRange(uint32_t l, uint32_t h) const
 {
+	if (l >= nCodePoints () || h >= nCodePoints ())
+	{
+		return NULL;
+	}
+
 	Range * r = NULL;
 	switch (type)
 	{
 		case ASCII:
-			if (l > 0xFF || h > 0xFF)
-			{
-				return NULL;
-			}
 			r = Range::ran (l, h + 1);
 			break;
 		case EBCDIC:
 		{
-			const uint32_t el = asc2ebc[l & 0xFF];
+			const uint32_t el = asc2ebc[l];
 			r = Range::sym (el);
 			for (uint32_t c = l + 1; c <= h; ++c)
 			{
-				const uint32_t ec = asc2ebc[c & 0xFF];
+				const uint32_t ec = asc2ebc[c];
 				r = Range::add (r, Range::sym (ec));
 			}
 			break;
@@ -177,14 +182,14 @@ Range * Enc::encodeRange(uint32_t l, uint32_t h) const
 }
 
 /*
- * Returns [0 - CPOINT_MAX] (full range) representation
- * for current encoding with regard to current policy.
+ * Returns full range representation for current encoding
+ * with regard to current policy.
  *
  * Since range is defined declaratively, re2c does
  * all the necessary corrections 'for free'.
  *
  * Always succeeds, returns pointer to newly constructed
- * Range.
+ * range.
  */
 Range * Enc::fullRange() const
 {
diff --git a/re2c/src/ir/regexp/encoding/enc.h b/re2c/src/ir/regexp/encoding/enc.h
index 699b406f..daa76dbd 100644
--- a/re2c/src/ir/regexp/encoding/enc.h
+++ b/re2c/src/ir/regexp/encoding/enc.h
@@ -6,28 +6,33 @@
 
 namespace re2c {
 
-// Each encoding defines two concepts:
-//
-// 1) Code point -- abstract number, which represents single encoding symbol.
-//	E.g., Unicode defines 0x10FFFF code points, so each Unicode encoding
-//	must be capable of representing 0x10FFFF code points.
-//
-// 2) Code unit -- the smallest unit of memory, which is used in the encoded
-//	text. One or more code units can be needed to represent a single code
-//	point, depending on the encoding. For each encoding, all code points
-//	either are represented with equal number of code units (fixed-length
-//	encodings), or with variable number of code units (variable-length
-//	encodings).
-//
-// encoding | code point number | code point size       | code unit number | code unit size
-// ---------|-------------------|-----------------------|------------------|----------------
-// ASCII    | 0xFF              | fixed,        1 byte  | 0xFF             | 1 byte
-// EBCDIC   | 0xFF              | fixed,        1 byte  | 0xFF             | 1 byte
-// UCS2     | 0xFFFF            | fixed,        2 bytes | 0xFFFF           | 2 bytes
-// UTF16    | 0x10FFFF          | variable, 2 - 4 bytes | 0xFFFF           | 2 bytes
-// UTF32    | 0x10FFFF          | fixed,        4 bytes | 0x10FFFF         | 4 bytes
-// UTF8     | 0x10FFFF          | variable, 1 - 4 bytes | 0xFF             | 1 byte
-// -----------------------------------------------------------------------------------------
+/*
+ * note [encodings]
+ *
+ * Each encoding defines two concepts:
+ *
+ * 1) Code point -- abstract number, which represents single encoding symbol.
+ *    E.g., Unicode defines code points in the range [0 - 0x10FFFF], so each
+ *    Unicode encoding must be capable of representing 0x110000 code points.
+ *
+ * 2) Code unit -- the smallest unit of memory, which is used in the encoded
+ *    text. One or more code units can be needed to represent a single code
+ *    point, depending on the encoding. For each encoding, all code points
+ *    either are represented with equal number of code units (fixed-length
+ *    encodings), or with variable number of code units (variable-length
+ *    encodings).
+ *
+ * +----------+------------------+-----------------------+-----------------+----------------+
+ * | encoding | code point range | code point size       | code unit range | code unit size |
+ * +----------+------------------+-----------------------+-----------------+----------------+
+ * | ASCII    | 0 - 0xFF         | fixed,        1 byte  | 0 - 0xFF        | 1 byte         |
+ * | EBCDIC   | 0 - 0xFF         | fixed,        1 byte  | 0 - 0xFF        | 1 byte         |
+ * | UCS2     | 0 - 0xFFFF       | fixed,        2 bytes | 0 - 0xFFFF      | 2 bytes        |
+ * | UTF16    | 0 - 0x10FFFF     | variable, 2 - 4 bytes | 0 - 0xFFFF      | 2 bytes        |
+ * | UTF32    | 0 - 0x10FFFF     | fixed,        4 bytes | 0 - 0x10FFFF    | 4 bytes        |
+ * | UTF8     | 0 - 0x10FFFF     | variable, 1 - 4 bytes | 0 - 0xFF        | 1 byte         |
+ * +----------+------------------+-----------------------+-----------------+----------------+
+ */
 
 class Enc
 {
diff --git a/re2c/src/parse/scanner_lex.re b/re2c/src/parse/scanner_lex.re
index e152ebc0..595225af 100644
--- a/re2c/src/parse/scanner_lex.re
+++ b/re2c/src/parse/scanner_lex.re
@@ -427,7 +427,7 @@ cpoints:
 		{
 			warn.useless_escape (tline, tok - pos, c);
 		}
-		cpoints.push_back (static_cast<uint32_t> (c));
+		cpoints.push_back (static_cast<uint8_t> (c));
 		goto cpoints;
 	}
 	[^] \ esc
@@ -450,7 +450,7 @@ cpoints:
 		}
 		else
 		{
-			cpoints.push_back (static_cast<uint32_t> (c));
+			cpoints.push_back (static_cast<uint8_t> (c));
 			goto cpoints;
 		}
 	}