* in current encoding. For encodings, which directly map symbols to
* input characters (ASCII, EBCDIC, UTF-32), it equals [^]. For other
* encodings (UTF-16, UTF-8), [^] and this range are different.
+ *
+ * Also note that default range doesn't respect encoding policy
+ * (the way invalid code points are treated).
*/
RegExp * Scanner::mkDefault() const
{
o << ":";
if (dFlag && encoding.is(Enc::EBCDIC))
{
- const uint c = encoding.decode(lb);
+ const uint c = encoding.decodeUnsafe(lb);
if (isprint(c))
o << " /* " << std::string(1, c) << " */";
}
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0xb3, 0xdb, 0xdc, 0xd9, 0xda, 0x9f
};
+/*
+ * Returns code point representation for current
+ * encoding with regard to current policy.
+ *
+ * Since code point is exactly specified by user,
+ * it is assumed that user considers it to be valid.
+ * We must check it.
+ *
+ * In the UTF-16 / UTF-32 / UTF-8 branch, code points
+ * outside of [SURR_MIN - SURR_MAX] are accepted as is.
+ * Code points inside of it are treated according
+ * to policy:
+ *   POLICY_FAIL        returns false
+ *   POLICY_SUBSTITUTE  overwrites c with UNICODE_ERROR,
+ *                      returns true
+ *   POLICY_IGNORE      leaves c as is, returns true
+ *
+ * Returns false if this code point is forbidden
+ * by current policy, otherwise returns true.
+ * Overwrites code point (c is passed by reference).
+ */
bool Enc::encode(uint & c) const
{
switch (type)
case UTF16:
case UTF32:
case UTF8:
- return true;
+ if (c < SURR_MIN || c > SURR_MAX)
+ return true;
+ else
+ {
+ switch (policy)
+ {
+ case POLICY_FAIL:
+ return false;
+ case POLICY_SUBSTITUTE:
+ c = UNICODE_ERROR;
+ return true;
+ case POLICY_IGNORE:
+ return true;
+ }
+ }
}
return false; // to silence gcc warning
}
-uint Enc::decode(uint c) const
+/*
+ * Returns original representation of code point.
+ * Assumes code point is valid (hence 'unsafe').
+ */
+uint Enc::decodeUnsafe(uint c) const
{
switch (type)
{
return c;
}
+/*
+ * Returns [l - h] range representation for current
+ * encoding with regard to current policy.
+ *
+ * Since range borders are exactly specified by user,
+ * it is assumed that user considers that all code
+ * points from this range are valid. re2c must check it.
+ *
+ * In the UTF-32 / UTF-8 branch, if [l - h] overlaps
+ * [SURR_MIN - SURR_MAX], the result depends on policy:
+ *   POLICY_FAIL        NULL is returned
+ *   POLICY_SUBSTITUTE  range is passed through doDiff() with
+ *                      the surrogate range and then doUnion()
+ *                      with UNICODE_ERROR (intended effect:
+ *                      surrogates are replaced by UNICODE_ERROR)
+ *   POLICY_IGNORE      range is returned as is
+ *
+ * Returns NULL if range contains code points forbidden
+ * by current policy, otherwise returns pointer to newly
+ * constructed Range.
+ */
Range * Enc::encodeRange(uint l, uint h) const
{
Range * r = NULL;
case UTF32:
case UTF8:
r = new Range(l, h + 1);
+ if (l <= SURR_MAX && h >= SURR_MIN) // [l - h] overlaps surrogate range
+ {
+ switch (policy)
+ {
+ case POLICY_FAIL:
+ r = NULL;
+ break;
+ case POLICY_SUBSTITUTE:
+ {
+ Range * surrs = new Range(SURR_MIN, SURR_MAX + 1);
+ Range * error = new Range(UNICODE_ERROR, UNICODE_ERROR + 1);
+ r = doDiff(r, surrs);
+ r = doUnion(r, error);
+ break;
+ }
+ case POLICY_IGNORE:
+ break;
+ }
+ }
break;
}
return r;
}
+/*
+ * Returns [0 - CPOINT_MAX] (full range) representation
+ * for current encoding with regard to current policy.
+ *
+ * Since range is defined declaratively, re2c does
+ * all the necessary corrections 'for free'.
+ *
+ * Range is constructed as [0 - nCodePoints()). For any
+ * policy other than POLICY_IGNORE it is then passed
+ * through doDiff() with [SURR_MIN - SURR_MAX]; this is
+ * done without looking at encoding type, and
+ * POLICY_FAIL and POLICY_SUBSTITUTE are handled
+ * identically here.
+ *
+ * Always succeeds, returns pointer to newly constructed
+ * Range.
+ */
Range * Enc::fullRange() const
{
- Range * r = NULL;
- switch (type)
+ Range * r = new Range(0, nCodePoints());
+ if (policy != POLICY_IGNORE)
{
- case ASCII:
- case EBCDIC:
- r = new Range(0, 0x100);
- break;
- case UCS2:
- r = new Range(0, 0x10000);
- break;
- case UTF16:
- case UTF32:
- case UTF8:
- r = new Range(0, 0x110000);
- break;
+ Range * surrs = new Range(SURR_MIN, SURR_MAX + 1);
+ r = doDiff(r, surrs);
}
return r;
}
, UTF8
};
+ // What to do with invalid code points
+ enum policy_t
+ { POLICY_FAIL // "fail": encode() returns false, encodeRange() returns NULL
+ , POLICY_SUBSTITUTE // "substitute": invalid code points are replaced with UNICODE_ERROR
+ , POLICY_IGNORE // "ignore": invalid code points are left as is
+ };
+
private:
static const uint asc2ebc[256];
static const uint ebc2asc[256];
static const uint UNICODE_ERROR;
type_t type;
+ policy_t policy;
public:
Enc()
: type (ASCII)
+ , policy (POLICY_IGNORE) // default: invalid code points are left as is
{ }
bool operator != (const Enc & e) const { return type != e.type; }
inline void unset(type_t);
inline bool is(type_t) const;
+ inline void setPolicy(policy_t t);
+
bool encode(uint & c) const;
- uint decode(uint c) const;
+ uint decodeUnsafe(uint c) const;
Range * encodeRange(uint l, uint h) const;
Range * fullRange() const;
};
return type == t;
}
+inline void Enc::setPolicy(policy_t t)
+{
+ policy = t; // consulted later by encode(), encodeRange() and fullRange()
+}
+
} // namespace re2c
#endif // _enc_h
mbo_opt_struct(10, 0, "no-generation-date"),
mbo_opt_struct(11, 0, "case-insensitive"),
mbo_opt_struct(12, 0, "case-inverted"),
+ mbo_opt_struct(13, 1, "encoding-policy"),
mbo_opt_struct('-', 0, NULL) /* end of args */
};
"--case-inverted Invert the meaning of single and double quoted strings.\n"
" With this switch single quotes are case sensitive and\n"
" double quotes are case insensitive.\n"
+ "\n"
+ "--encoding-policy ep Specify what re2c should do when given bad code unit.\n"
+ " ep can be one of the following: fail, substitute, ignore.\n"
;
}
cout << vernum << endl;
return 2;
}
-
+
case 'w':
sFlag = true;
if (!encoding.set(Enc::UCS2))
return 2;
}
break;
-
+
default:
case 'h':
case '?':
case 12:
bCaseInverted = true;
break;
+
+ case 13:
+ if (strcmp(opt_arg, "fail") == 0)
+ encoding.setPolicy(Enc::POLICY_FAIL);
+ else if (strcmp(opt_arg, "substitute") == 0)
+ encoding.setPolicy(Enc::POLICY_SUBSTITUTE);
+ else if (strcmp(opt_arg, "ignore") == 0)
+ encoding.setPolicy(Enc::POLICY_IGNORE);
+ else
+ {
+ std::cerr << "re2c: error: Invalid encoding policy: \"" << opt_arg << "\"\n";
+ return 1;
+ }
+ break;
}
}