From: Ulya Fokanova Date: Sun, 13 Apr 2014 10:25:01 +0000 (+0300) Subject: Added option to control how invalid code points are treated. X-Git-Tag: 0.13.7.1~7 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=fdf9f5d96f8555d6d76cb1d0ef9672f2f118b6b6;p=re2c Added option to control how invalid code points are treated. Usage: 're2c --encoding-policy ...' When re2c encounters invalid code point (e.g., surrogate in Unicode), it acts with regard to current encoding policy: 'fail' - fail with error; 'substitute' - silently substitute offending code point with error code point; 'ignore' - ignore offending code point, consider it valid. --- diff --git a/re2c/actions.cc b/re2c/actions.cc index d4a8d2b1..d54b3ddf 100644 --- a/re2c/actions.cc +++ b/re2c/actions.cc @@ -893,6 +893,9 @@ RegExp * Scanner::mkDot() const * in current encoding. For encodings, which directly map symbols to * input characters (ASCII, EBCDIC, UTF-32), it equals [^]. For other * encodings (UTF-16, UTF-8), [^] and this range are different. + * + * Also note that default range doesn't respect encoding policy + * (the way invalid code points are treated). */ RegExp * Scanner::mkDefault() const { diff --git a/re2c/code.cc b/re2c/code.cc index 65e64f56..8e258360 100644 --- a/re2c/code.cc +++ b/re2c/code.cc @@ -910,7 +910,7 @@ static bool genCases(std::ostream &o, uint ind, uint lb, Span *s, bool &newLine, o << ":"; if (dFlag && encoding.is(Enc::EBCDIC)) { - const uint c = encoding.decode(lb); + const uint c = encoding.decodeUnsafe(lb); if (isprint(c)) o << " /* " << std::string(1, c) << " */"; } diff --git a/re2c/enc.cc b/re2c/enc.cc index 67f7e11a..ddf389aa 100644 --- a/re2c/enc.cc +++ b/re2c/enc.cc @@ -46,6 +46,18 @@ const uint Enc::ebc2asc[256] = 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0xb3, 0xdb, 0xdc, 0xd9, 0xda, 0x9f }; +/* + * Returns code point representation for current + * encoding with regard to current policy. 
+ * + * Since code point is exactly specified by user, + * it is assumed that user considers it to be valid. + * We must check it. + * + * Returns false if this code point is forbidden + * by current policy, otherwise returns true. + * Overwrites code point. + */ bool Enc::encode(uint & c) const { switch (type) @@ -60,12 +72,30 @@ bool Enc::encode(uint & c) const case UTF16: case UTF32: case UTF8: - return true; + if (c < SURR_MIN || c > SURR_MAX) + return true; + else + { + switch (policy) + { + case POLICY_FAIL: + return false; + case POLICY_SUBSTITUTE: + c = UNICODE_ERROR; + return true; + case POLICY_IGNORE: + return true; + } + } } return false; // to silence gcc warning } -uint Enc::decode(uint c) const +/* + * Returns original representation of code point. + * Assumes code point is valid (hence 'unsafe'). + */ +uint Enc::decodeUnsafe(uint c) const { switch (type) { @@ -82,6 +112,18 @@ uint Enc::decode(uint c) const return c; } +/* + * Returns [l - h] range representation for current + * encoding with regard to current policy. + * + * Since range borders are exactly specified by user, + * it is assumed that user considers that all code + * points from this range are valid. re2c must check it. + * + * Returns NULL if range contains code points forbidden + * by current policy, otherwise returns pointer to newly + * constructed Range. 
+ */ Range * Enc::encodeRange(uint l, uint h) const { Range * r = NULL; @@ -108,28 +150,47 @@ Range * Enc::encodeRange(uint l, uint h) const case UTF32: case UTF8: r = new Range(l, h + 1); + if (l <= SURR_MAX && h >= SURR_MIN) + { + switch (policy) + { + case POLICY_FAIL: + r = NULL; + break; + case POLICY_SUBSTITUTE: + { + Range * surrs = new Range(SURR_MIN, SURR_MAX + 1); + Range * error = new Range(UNICODE_ERROR, UNICODE_ERROR + 1); + r = doDiff(r, surrs); + r = doUnion(r, error); + break; + } + case POLICY_IGNORE: + break; + } + } break; } return r; } +/* + * Returns [0 - CPOINT_MAX] (full range) representation + * for current encoding with regard to current policy. + * + * Since range is defined declaratively, re2c does + * all the necessary corrections 'for free'. + * + * Always succeeds, returns pointer to newly constructed + * Range. + */ Range * Enc::fullRange() const { - Range * r = NULL; - switch (type) + Range * r = new Range(0, nCodePoints()); + if (policy != POLICY_IGNORE) { - case ASCII: - case EBCDIC: - r = new Range(0, 0x100); - break; - case UCS2: - r = new Range(0, 0x10000); - break; - case UTF16: - case UTF32: - case UTF8: - r = new Range(0, 0x110000); - break; + Range * surrs = new Range(SURR_MIN, SURR_MAX + 1); + r = doDiff(r, surrs); } return r; } diff --git a/re2c/enc.h b/re2c/enc.h index b8df64c4..bc7d0edf 100644 --- a/re2c/enc.h +++ b/re2c/enc.h @@ -42,6 +42,13 @@ public: , UTF8 }; + // What to do with invalid code points + enum policy_t + { POLICY_FAIL + , POLICY_SUBSTITUTE + , POLICY_IGNORE + }; + private: static const uint asc2ebc[256]; static const uint ebc2asc[256]; @@ -50,10 +57,12 @@ private: static const uint UNICODE_ERROR; type_t type; + policy_t policy; public: Enc() : type (ASCII) + , policy (POLICY_IGNORE) { } bool operator != (const Enc & e) const { return type != e.type; } @@ -67,8 +76,10 @@ public: inline void unset(type_t); inline bool is(type_t) const; + inline void setPolicy(policy_t t); + bool encode(uint & c) const; - 
uint decode(uint c) const; + uint decodeUnsafe(uint c) const; Range * encodeRange(uint l, uint h) const; Range * fullRange() const; }; @@ -154,6 +165,11 @@ inline bool Enc::is(type_t t) const return type == t; } +inline void Enc::setPolicy(policy_t t) +{ + policy = t; +} + } // namespace re2c #endif // _enc_h diff --git a/re2c/main.cc b/re2c/main.cc index 4af5c0f1..e297770a 100644 --- a/re2c/main.cc +++ b/re2c/main.cc @@ -129,6 +129,7 @@ static const mbo_opt_struct OPTIONS[] = mbo_opt_struct(10, 0, "no-generation-date"), mbo_opt_struct(11, 0, "case-insensitive"), mbo_opt_struct(12, 0, "case-inverted"), + mbo_opt_struct(13, 1, "encoding-policy"), mbo_opt_struct('-', 0, NULL) /* end of args */ }; @@ -211,6 +212,9 @@ static void usage() "--case-inverted Invert the meaning of single and double quoted strings.\n" " With this switch single quotes are case sensitive and\n" " double quotes are case insensitive.\n" + "\n" + "--encoding-policy ep Specify what re2c should do when given bad code unit.\n" + " ep can be one of the following: fail, substitute, ignore.\n" ; } @@ -326,7 +330,7 @@ int main(int argc, char *argv[]) cout << vernum << endl; return 2; } - + case 'w': sFlag = true; if (!encoding.set(Enc::UCS2)) @@ -361,7 +365,7 @@ int main(int argc, char *argv[]) return 2; } break; - + default: case 'h': case '?': @@ -379,6 +383,20 @@ int main(int argc, char *argv[]) case 12: bCaseInverted = true; break; + + case 13: + if (strcmp(opt_arg, "fail") == 0) + encoding.setPolicy(Enc::POLICY_FAIL); + else if (strcmp(opt_arg, "substitute") == 0) + encoding.setPolicy(Enc::POLICY_SUBSTITUTE); + else if (strcmp(opt_arg, "ignore") == 0) + encoding.setPolicy(Enc::POLICY_IGNORE); + else + { + std::cerr << "re2c: error: Invalid encoding policy: \"" << opt_arg << "\"\n"; + return 1; + } + break; } }