From: Ulya Fokanova <skvadrik@gmail.com>
Date: Wed, 9 Apr 2014 22:56:16 +0000 (+0300)
Subject: Moved encoding-specific character handling to 'Enc' class.
X-Git-Tag: 0.13.7.1~8
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=8de8bfbd2828c127e162604496bf094e98153352;p=re2c

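Moved encoding-specific character handling to 'Enc' class.

Enc::xlat() and Enc::talx() are replaced by Enc::encode() and
Enc::decode(); range construction moves out of Scanner into the new
Enc::encodeRange() and Enc::fullRange(). Scanner::matchChar() is
removed, and the ASCII/EBCDIC translation tables become private
members of Enc.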
---

diff --git a/re2c/actions.cc b/re2c/actions.cc
index d3d02478..d4a8d2b1 100644
--- a/re2c/actions.cc
+++ b/re2c/actions.cc
@@ -735,7 +735,7 @@ std::string& Scanner::unescape(SubStr& str_in, std::string& str_out) const
 
 Range * Scanner::getRange(SubStr &s) const
 {
-	uint lb = unescape(s), ub, xlb, xub;
+	uint lb = unescape(s), ub;
 
 	if (s.len < 2 || *s.str != '-')
 	{
@@ -754,39 +754,23 @@ Range * Scanner::getRange(SubStr &s) const
 		}
 	}
 
-	xlb = encoding.xlat(lb);
-	xub = encoding.xlat(ub);
-
-	if (encoding.is(Enc::EBCDIC))
-	{
-		Range * r = new Range(xlb, xlb + 1);
-		for (uint c = lb + 1; c <= ub; ++c)
-		{
-			uint xc = encoding.xlat(c);
-			r = doUnion(r, new Range(xc, xc + 1));
-		}
-		return r;
-	}
-	else
-	{
-		return new Range(xlb, xub + 1);
-	}
-}
-
-RegExp * Scanner::matchChar(uint c) const
-{
-	uint xc = encoding.xlat(c);
-	return new MatchOp(new Range(xc, xc + 1));
+	Range * r = encoding.encodeRange(lb, ub);
+	if (r == NULL)
+		fatalf("Bad code point range: '0x%X - 0x%X'", lb, ub);
+	return r;
 }
 
 RegExp * Scanner::matchSymbol(uint c) const
 {
+	if (!encoding.encode(c)) // rewrites c in place for ASCII/EBCDIC; Enc::encode as added by this patch returns true for every listed encoding
+		fatalf("Bad code point: '0x%X'", c);
+
 	if (encoding.is(Enc::UTF16))
 		return UTF16Symbol(c);
 	else if (encoding.is(Enc::UTF8))
 		return UTF8Symbol(c);
 	else
-		return matchChar(c);
+		return new MatchOp(new Range(c, c + 1)); // c is already encoded above, so the one-value range is built directly (matchChar used to translate here)
 }
 
 RegExp * Scanner::strToRE(SubStr s) const
@@ -882,21 +866,23 @@ RegExp * Scanner::invToRE(SubStr s) const
 	s.len -= 3;
 	s.str += 2;
 
-	Range * any = new Range(0, encoding.nCodePoints());
+	Range * full = encoding.fullRange();
 
 	Range * r = s.len == 0
-		? any
-		: doDiff(any, mkRange (s));
+		? full
+		: doDiff(full, mkRange (s));
 
 	return matchSymbolRange(r);
 }
 
 RegExp * Scanner::mkDot() const
 {
-	Range * any = new Range(0, encoding.nCodePoints());
-	const uint c = encoding.xlat('\n');
+	Range * full = encoding.fullRange(); // range starting at 0 that spans every code point of the active encoding
+	uint c = '\n'; // not const: encode() takes it by reference and rewrites it in place
+	if (!encoding.encode(c))
+		fatalf("Bad code point: '0x%X'", c);
 	Range * ran = new Range(c, c + 1);
-	Range * inv = doDiff(any, ran);
+	Range * inv = doDiff(full, ran); // presumably the full range minus the encoded newline
 
 	return matchSymbolRange(inv);
 }
diff --git a/re2c/code.cc b/re2c/code.cc
index 40bf7b6f..65e64f56 100644
--- a/re2c/code.cc
+++ b/re2c/code.cc
@@ -908,9 +908,11 @@ static bool genCases(std::ostream &o, uint ind, uint lb, Span *s, bool &newLine,
 					o << indent(ind) << "case ";
 					prtChOrHex(o, lb);
 					o << ":";
-					if (dFlag && encoding.is(Enc::EBCDIC) && lb < 256u && isprint(encoding.talx(lb)))
+					if (dFlag && encoding.is(Enc::EBCDIC))
 					{
-						o << " /* " << std::string(1, encoding.talx(lb)) << " */";
+						const uint c = encoding.decode(lb);
+						if (isprint(c))
+							o << " /* " << std::string(1, c) << " */";
 					}
 				}
 				newLine = false;
diff --git a/re2c/enc.cc b/re2c/enc.cc
index 6d85b412..67f7e11a 100644
--- a/re2c/enc.cc
+++ b/re2c/enc.cc
@@ -2,6 +2,10 @@
 
 namespace re2c {
 
+const uint Enc::SURR_MIN = 0xD800; // presumably the first Unicode surrogate code point; not referenced elsewhere in this patch
+const uint Enc::SURR_MAX = 0xDFFF; // presumably the last Unicode surrogate code point; not referenced elsewhere in this patch
+const uint Enc::UNICODE_ERROR = 0xFFFD; // presumably U+FFFD REPLACEMENT CHARACTER; not referenced elsewhere in this patch
+
 const uint Enc::asc2ebc[256] =
     { /* Based on ISO 8859/1 and Code Page 37 */
         0x00, 0x01, 0x02, 0x03, 0x37, 0x2d, 0x2e, 0x2f, 0x16, 0x05, 0x25, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
@@ -42,4 +46,124 @@ const uint Enc::ebc2asc[256] =
         0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0xb3, 0xdb, 0xdc, 0xd9, 0xda, 0x9f
     };
 
+/*
+ * Translates code point 'c' in place into the current encoding:
+ * ASCII keeps only the low byte, EBCDIC maps the low byte through
+ * 'asc2ebc', and UCS2/UTF16/UTF32/UTF8 leave 'c' untouched.
+ * Returns true for every listed encoding type; false is reached only
+ * if 'type' matches none of the switch cases.
+ */
+bool Enc::encode(uint & c) const
+{
+	switch (type)
+	{
+		case ASCII:
+			c &= 0xFF;
+			return true;
+		case EBCDIC:
+			c = asc2ebc[c & 0xFF];
+			return true;
+		case UCS2:
+		case UTF16:
+		case UTF32:
+		case UTF8:
+			return true;
+	}
+	return false; // to silence gcc warning
+}
+
+/*
+ * Returns 'c' translated back from the current encoding: for EBCDIC
+ * the low byte of 'c' is looked up in 'ebc2asc'; every other encoding
+ * type returns 'c' unchanged.
+ */
+uint Enc::decode(uint c) const
+{
+	switch (type)
+	{
+		case EBCDIC:
+			c = ebc2asc[c & 0xFF];
+			break;
+		case ASCII:
+		case UCS2:
+		case UTF16:
+		case UTF32:
+		case UTF8:
+			break;
+	}
+	return c;
+}
+
+/*
+ * Builds the range of encoded values for the code points l..h (both
+ * inclusive). ASCII masks both bounds to their low byte; EBCDIC maps
+ * each code point through 'asc2ebc' and unions the resulting
+ * single-value ranges, since consecutive code points do not map to
+ * consecutive table values; the remaining encodings use the bounds
+ * as given. Returns NULL only if 'type' matches no switch case.
+ */
+Range * Enc::encodeRange(uint l, uint h) const
+{
+	Range * r = NULL;
+	switch (type)
+	{
+		case ASCII:
+			l &= 0xFF;
+			h &= 0xFF;
+			r = new Range(l, h + 1);
+			break;
+		case EBCDIC:
+		{
+			const uint el = asc2ebc[l & 0xFF];
+			r = new Range(el, el + 1);
+			// Only the low byte of a code point selects a table entry,
+			// so at most 255 further steps can add a new value. Counting
+			// steps (instead of testing 'c <= h') also keeps the loop
+			// finite when h is the largest uint value.
+			const uint span = h > l ? h - l : 0;
+			const uint steps = span < 0xFF ? span : 0xFF;
+			for (uint i = 1; i <= steps; ++i)
+			{
+				const uint ec = asc2ebc[(l + i) & 0xFF];
+				r = doUnion(r, new Range(ec, ec + 1));
+			}
+			break;
+		}
+		case UCS2:
+		case UTF16:
+		case UTF32:
+		case UTF8:
+			r = new Range(l, h + 1);
+			break;
+	}
+	return r;
+}
+
+/*
+ * Returns a new range [0, N) spanning all code points of the current
+ * encoding: N is 0x100 for ASCII and EBCDIC, 0x10000 for UCS2 and
+ * 0x110000 for UTF16, UTF32 and UTF8. Returns NULL only if 'type'
+ * matches no switch case.
+ */
+Range * Enc::fullRange() const
+{
+	Range * r = NULL;
+	switch (type)
+	{
+		case ASCII:
+		case EBCDIC:
+			r = new Range(0, 0x100);
+			break;
+		case UCS2:
+			r = new Range(0, 0x10000);
+			break;
+		case UTF16:
+		case UTF32:
+		case UTF8:
+			r = new Range(0, 0x110000);
+			break;
+	}
+	return r;
+}
+
 } // namespace re2c
diff --git a/re2c/enc.h b/re2c/enc.h
index 40372447..b8df64c4 100644
--- a/re2c/enc.h
+++ b/re2c/enc.h
@@ -2,6 +2,7 @@
 #define _enc_h
 
 #include "basics.h"
+#include "range.h"
 
 namespace re2c {
 
@@ -41,10 +42,13 @@ public:
 		, UTF8
 		};
 
+private:
 	static const uint asc2ebc[256];
 	static const uint ebc2asc[256];
+	static const uint SURR_MIN;
+	static const uint SURR_MAX;
+	static const uint UNICODE_ERROR;
 
-private:
 	type_t type;
 
 public:
@@ -63,8 +67,10 @@ public:
 	inline void unset(type_t);
 	inline bool is(type_t) const;
 
-	inline uint xlat(uint c) const;
-	inline uint talx(uint c) const;
+	bool encode(uint & c) const;
+	uint decode(uint c) const;
+	Range * encodeRange(uint l, uint h) const;
+	Range * fullRange() const;
 };
 
 inline uint Enc::nCodePoints() const
@@ -148,34 +154,6 @@ inline bool Enc::is(type_t t) const
 	return type == t;
 }
 
-inline uint Enc::xlat(uint c) const
-{
-	switch (type)
-	{
-		case ASCII:	return c & 0xFF;
-		case EBCDIC:	return asc2ebc[c & 0xFF];
-		case UCS2:
-		case UTF16:
-		case UTF32:
-		case UTF8:	return c;
-	}
-	return ~0; // to silence gcc warning
-}
-
-inline uint Enc::talx(uint c) const
-{
-	switch (type)
-	{
-		case ASCII:	return c & 0xFF;
-		case EBCDIC:	return ebc2asc[c & 0xFF];
-		case UCS2:
-		case UTF16:
-		case UTF32:
-		case UTF8:	return c;
-	}
-	return ~0; // to silence gcc warning
-}
-
 } // namespace re2c
 
 #endif // _enc_h
diff --git a/re2c/scanner.h b/re2c/scanner.h
index e3ee3322..8df3e5a6 100644
--- a/re2c/scanner.h
+++ b/re2c/scanner.h
@@ -77,7 +77,6 @@ public:
 	Range * getRange(SubStr &s) const;
 	RegExp * matchSymbol(uint c) const;
 	RegExp * matchSymbolRange(Range * r) const;
-	RegExp * matchChar(uint c) const;
 	RegExp * strToName(SubStr s) const;
 	RegExp * strToRE(SubStr s) const;
 	RegExp * strToCaseInsensitiveRE(SubStr s) const;