From: Ulya Fokanova <skvadrik@gmail.com>
Date: Sun, 13 Apr 2014 10:25:01 +0000 (+0300)
Subject: Added option to control how invalid code points are treated.
X-Git-Tag: 0.13.7.1~7
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=fdf9f5d96f8555d6d76cb1d0ef9672f2f118b6b6;p=re2c

Added option to control how invalid code points are treated.

Usage: 're2c --encoding-policy <fail | substitute | ignore> ...'

When re2c encounters an invalid code point (e.g., a surrogate in Unicode),
it acts with regard to current encoding policy:
    'fail' - fail with error;
    'substitute' - silently substitute offending code point with
        error code point;
    'ignore' - ignore offending code point, consider it valid.
---

diff --git a/re2c/actions.cc b/re2c/actions.cc
index d4a8d2b1..d54b3ddf 100644
--- a/re2c/actions.cc
+++ b/re2c/actions.cc
@@ -893,6 +893,9 @@ RegExp * Scanner::mkDot() const
  * in current encoding. For encodings, which directly map symbols to
  * input characters (ASCII, EBCDIC, UTF-32), it equals [^]. For other
  * encodings (UTF-16, UTF-8), [^] and this range are different.
+ *
+ * Also note that default range doesn't respect encoding policy
+ * (the way invalid code points are treated).
  */
 RegExp * Scanner::mkDefault() const
 {
diff --git a/re2c/code.cc b/re2c/code.cc
index 65e64f56..8e258360 100644
--- a/re2c/code.cc
+++ b/re2c/code.cc
@@ -910,7 +910,7 @@ static bool genCases(std::ostream &o, uint ind, uint lb, Span *s, bool &newLine,
 					o << ":";
 					if (dFlag && encoding.is(Enc::EBCDIC))
 					{
-						const uint c = encoding.decode(lb);
+						const uint c = encoding.decodeUnsafe(lb);
 						if (isprint(c))
 							o << " /* " << std::string(1, c) << " */";
 					}
diff --git a/re2c/enc.cc b/re2c/enc.cc
index 67f7e11a..ddf389aa 100644
--- a/re2c/enc.cc
+++ b/re2c/enc.cc
@@ -46,6 +46,18 @@ const uint Enc::ebc2asc[256] =
         0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0xb3, 0xdb, 0xdc, 0xd9, 0xda, 0x9f
     };
 
+/*
+ * Returns code point representation for current
+ * encoding with regard to current policy.
+ *
+ * Since code point is exactly specified by user,
+ * it is assumed that user considers it to be valid.
+ * We must check it.
+ *
+ * Returns false if this code point is forbidden
+ * by current policy, otherwise returns true.
+ * Overwrites code point.
+ */
 bool Enc::encode(uint & c) const
 {
 	switch (type)
@@ -60,12 +72,30 @@ bool Enc::encode(uint & c) const
 		case UTF16:
 		case UTF32:
 		case UTF8:
-			return true;
+			if (c < SURR_MIN || c > SURR_MAX)
+				return true;
+			else
+			{
+				switch (policy)
+				{
+					case POLICY_FAIL:
+						return false;
+					case POLICY_SUBSTITUTE:
+						c = UNICODE_ERROR;
+						return true;
+					case POLICY_IGNORE:
+						return true;
+				}
+			}
 	}
 	return false; // to silence gcc warning
 }
 
-uint Enc::decode(uint c) const
+/*
+ * Returns original representation of code point.
+ * Assumes code point is valid (hence 'unsafe').
+ */
+uint Enc::decodeUnsafe(uint c) const
 {
 	switch (type)
 	{
@@ -82,6 +112,18 @@ uint Enc::decode(uint c) const
 	return c;
 }
 
+/*
+ * Returns [l - h] range representation for current
+ * encoding with regard to current policy.
+ *
+ * Since range borders are exactly specified by user,
+ * it is assumed that user considers that all code
+ * points from this range are valid. re2c must check it.
+ *
+ * Returns NULL if range contains code points forbidden
+ * by current policy, otherwise returns pointer to newly
+ * constructed Range.
+ */
 Range * Enc::encodeRange(uint l, uint h) const
 {
 	Range * r = NULL;
@@ -108,28 +150,47 @@ Range * Enc::encodeRange(uint l, uint h) const
 		case UTF32:
 		case UTF8:
 			r = new Range(l, h + 1);
+			if (l <= SURR_MAX && h >= SURR_MIN)
+			{
+				switch (policy)
+				{
+					case POLICY_FAIL:
+						r = NULL;
+						break;
+					case POLICY_SUBSTITUTE:
+					{
+						Range * surrs = new Range(SURR_MIN, SURR_MAX + 1);
+						Range * error = new Range(UNICODE_ERROR, UNICODE_ERROR + 1);
+						r = doDiff(r, surrs);
+						r = doUnion(r, error);
+						break;
+					}
+					case POLICY_IGNORE:
+						break;
+				}
+			}
 			break;
 	}
 	return r;
 }
 
+/*
+ * Returns [0 - CPOINT_MAX] (full range) representation
+ * for current encoding with regard to current policy.
+ *
+ * Since range is defined declaratively, re2c does
+ * all the necessary corrections 'for free'.
+ *
+ * Always succeeds, returns pointer to newly constructed
+ * Range.
+ */
 Range * Enc::fullRange() const
 {
-	Range * r = NULL;
-	switch (type)
+	Range * r = new Range(0, nCodePoints());
+	if (policy != POLICY_IGNORE)
 	{
-		case ASCII:
-		case EBCDIC:
-			r = new Range(0, 0x100);
-			break;
-		case UCS2:
-			r = new Range(0, 0x10000);
-			break;
-		case UTF16:
-		case UTF32:
-		case UTF8:
-			r = new Range(0, 0x110000);
-			break;
+		Range * surrs = new Range(SURR_MIN, SURR_MAX + 1);
+		r = doDiff(r, surrs);
 	}
 	return r;
 }
diff --git a/re2c/enc.h b/re2c/enc.h
index b8df64c4..bc7d0edf 100644
--- a/re2c/enc.h
+++ b/re2c/enc.h
@@ -42,6 +42,13 @@ public:
 		, UTF8
 		};
 
+	// What to do with invalid code points
+	enum policy_t
+		{ POLICY_FAIL
+		, POLICY_SUBSTITUTE
+		, POLICY_IGNORE
+		};
+
 private:
 	static const uint asc2ebc[256];
 	static const uint ebc2asc[256];
@@ -50,10 +57,12 @@ private:
 	static const uint UNICODE_ERROR;
 
 	type_t type;
+	policy_t policy;
 
 public:
 	Enc()
 		: type (ASCII)
+		, policy (POLICY_IGNORE)
 	{ }
 
 	bool operator != (const Enc & e) const { return type != e.type; }
@@ -67,8 +76,10 @@ public:
 	inline void unset(type_t);
 	inline bool is(type_t) const;
 
+	inline void setPolicy(policy_t t);
+
 	bool encode(uint & c) const;
-	uint decode(uint c) const;
+	uint decodeUnsafe(uint c) const;
 	Range * encodeRange(uint l, uint h) const;
 	Range * fullRange() const;
 };
@@ -154,6 +165,11 @@ inline bool Enc::is(type_t t) const
 	return type == t;
 }
 
+inline void Enc::setPolicy(policy_t t)
+{
+	policy = t;
+}
+
 } // namespace re2c
 
 #endif // _enc_h
diff --git a/re2c/main.cc b/re2c/main.cc
index 4af5c0f1..e297770a 100644
--- a/re2c/main.cc
+++ b/re2c/main.cc
@@ -129,6 +129,7 @@ static const mbo_opt_struct OPTIONS[] =
 	mbo_opt_struct(10,  0, "no-generation-date"),
 	mbo_opt_struct(11,  0, "case-insensitive"),
 	mbo_opt_struct(12,  0, "case-inverted"),
+	mbo_opt_struct(13,  1, "encoding-policy"),
 	mbo_opt_struct('-', 0, NULL) /* end of args */
 };
 
@@ -211,6 +212,9 @@ static void usage()
 	"--case-inverted         Invert the meaning of single and double quoted strings.\n"
 	"                        With this switch single quotes are case sensitive and\n"
 	"                        double quotes are case insensitive.\n"
+	"\n"
+	"--encoding-policy ep    Specify what re2c should do when given bad code unit.\n"
+	"                        ep can be one of the following: fail, substitute, ignore.\n"
 	;
 }
 
@@ -326,7 +330,7 @@ int main(int argc, char *argv[])
 				cout << vernum << endl;
 				return 2;
 			}
-
+			
 			case 'w':
 			sFlag = true;
 			if (!encoding.set(Enc::UCS2))
@@ -361,7 +365,7 @@ int main(int argc, char *argv[])
 				return 2;
 			}
 			break;
-
+	  
 			default:
 			case 'h':
 			case '?':
@@ -379,6 +383,20 @@ int main(int argc, char *argv[])
 			case 12:
 			bCaseInverted = true;
 			break;
+
+			case 13:
+			if (strcmp(opt_arg, "fail") == 0)
+				encoding.setPolicy(Enc::POLICY_FAIL);
+			else if (strcmp(opt_arg, "substitute") == 0)
+				encoding.setPolicy(Enc::POLICY_SUBSTITUTE);
+			else if (strcmp(opt_arg, "ignore") == 0)
+				encoding.setPolicy(Enc::POLICY_IGNORE);
+			else
+			{
+				std::cerr << "re2c: error: Invalid encoding policy: \"" << opt_arg << "\"\n";
+				return 1;
+			}
+			break;
 		}
 	}