Added option to control how invalid code points are treated.
author Ulya Fokanova <skvadrik@gmail.com>
Sun, 13 Apr 2014 10:25:01 +0000 (13:25 +0300)
committer Ulya Fokanova <skvadrik@gmail.com>
Sun, 13 Apr 2014 10:25:01 +0000 (13:25 +0300)
Usage: 're2c --encoding-policy <fail | substitute | ignore> ...'

When re2c encounters an invalid code point (e.g., a surrogate in Unicode),
it acts according to the current encoding policy:
    'fail' - fail with error;
    'substitute' - silently substitute offending code point with
        error code point;
    'ignore' - ignore offending code point, consider it valid.

re2c/actions.cc
re2c/code.cc
re2c/enc.cc
re2c/enc.h
re2c/main.cc

index d4a8d2b1605b2dba7fd11201ccdcd9ae1bc5cc51..d54b3ddf466474b8a438bbc8dc5789cbfaad3228 100644 (file)
@@ -893,6 +893,9 @@ RegExp * Scanner::mkDot() const
  * in current encoding. For encodings, which directly map symbols to
  * input characters (ASCII, EBCDIC, UTF-32), it equals [^]. For other
  * encodings (UTF-16, UTF-8), [^] and this range are different.
+ *
+ * Also note that default range doesn't respect encoding policy
+ * (the way invalid code points are treated).
  */
 RegExp * Scanner::mkDefault() const
 {
index 65e64f56fc03bb46ef8238a1cee18d4d72ffb544..8e2583603baa682fbb428c5855c3581754dc90f8 100644 (file)
@@ -910,7 +910,7 @@ static bool genCases(std::ostream &o, uint ind, uint lb, Span *s, bool &newLine,
                                        o << ":";
                                        if (dFlag && encoding.is(Enc::EBCDIC))
                                        {
-                                               const uint c = encoding.decode(lb);
+                                               const uint c = encoding.decodeUnsafe(lb);
                                                if (isprint(c))
                                                        o << " /* " << std::string(1, c) << " */";
                                        }
index 67f7e11a0064613b400ff595f7266094a1f30da2..ddf389aa779c2af23ff6ee22ff61da5d1c9b403e 100644 (file)
@@ -46,6 +46,18 @@ const uint Enc::ebc2asc[256] =
         0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0xb3, 0xdb, 0xdc, 0xd9, 0xda, 0x9f
     };
 
+/*
+ * Returns code point representation for current
+ * encoding with regard to current policy.
+ *
+ * Since the code point is exactly specified by the user,
+ * it is assumed that the user considers it to be valid.
+ * We must check it.
+ *
+ * Returns false if this code point is forbidden
+ * by current policy, otherwise returns true.
+ * Overwrites code point.
+ */
 bool Enc::encode(uint & c) const
 {
        switch (type)
@@ -60,12 +72,30 @@ bool Enc::encode(uint & c) const
                case UTF16:
                case UTF32:
                case UTF8:
-                       return true;
+                       if (c < SURR_MIN || c > SURR_MAX)
+                               return true;
+                       else
+                       {
+                               switch (policy)
+                               {
+                                       case POLICY_FAIL:
+                                               return false;
+                                       case POLICY_SUBSTITUTE:
+                                               c = UNICODE_ERROR;
+                                               return true;
+                                       case POLICY_IGNORE:
+                                               return true;
+                               }
+                       }
        }
        return false; // to silence gcc warning
 }
 
-uint Enc::decode(uint c) const
+/*
+ * Returns original representation of code point.
+ * Assumes code point is valid (hence 'unsafe').
+ */
+uint Enc::decodeUnsafe(uint c) const
 {
        switch (type)
        {
@@ -82,6 +112,18 @@ uint Enc::decode(uint c) const
        return c;
 }
 
+/*
+ * Returns [l - h] range representation for current
+ * encoding with regard to current policy.
+ *
+ * Since range borders are exactly specified by the user,
+ * it is assumed that the user considers all code
+ * points from this range to be valid. re2c must check it.
+ *
+ * Returns NULL if range contains code points forbidden
+ * by current policy, otherwise returns pointer to newly
+ * constructed Range.
+ */
 Range * Enc::encodeRange(uint l, uint h) const
 {
        Range * r = NULL;
@@ -108,28 +150,47 @@ Range * Enc::encodeRange(uint l, uint h) const
                case UTF32:
                case UTF8:
                        r = new Range(l, h + 1);
+                       if (l <= SURR_MAX && h >= SURR_MIN)
+                       {
+                               switch (policy)
+                               {
+                                       case POLICY_FAIL:
+                                               r = NULL;
+                                               break;
+                                       case POLICY_SUBSTITUTE:
+                                       {
+                                               Range * surrs = new Range(SURR_MIN, SURR_MAX + 1);
+                                               Range * error = new Range(UNICODE_ERROR, UNICODE_ERROR + 1);
+                                               r = doDiff(r, surrs);
+                                               r = doUnion(r, error);
+                                               break;
+                                       }
+                                       case POLICY_IGNORE:
+                                               break;
+                               }
+                       }
                        break;
        }
        return r;
 }
 
+/*
+ * Returns [0 - CPOINT_MAX] (full range) representation
+ * for current encoding with regard to current policy.
+ *
+ * Since range is defined declaratively, re2c does
+ * all the necessary corrections 'for free'.
+ *
+ * Always succeeds, returns pointer to newly constructed
+ * Range.
+ */
 Range * Enc::fullRange() const
 {
-       Range * r = NULL;
-       switch (type)
+       Range * r = new Range(0, nCodePoints());
+       if (policy != POLICY_IGNORE)
        {
-               case ASCII:
-               case EBCDIC:
-                       r = new Range(0, 0x100);
-                       break;
-               case UCS2:
-                       r = new Range(0, 0x10000);
-                       break;
-               case UTF16:
-               case UTF32:
-               case UTF8:
-                       r = new Range(0, 0x110000);
-                       break;
+               Range * surrs = new Range(SURR_MIN, SURR_MAX + 1);
+               r = doDiff(r, surrs);
        }
        return r;
 }
index b8df64c45035a5546ab4e8d20aaa4a4aadd74327..bc7d0edf731a4f3a2fdc224d22b37dc385bbd59d 100644 (file)
@@ -42,6 +42,13 @@ public:
                , UTF8
                };
 
+       // What to do with invalid code points
+       enum policy_t
+               { POLICY_FAIL
+               , POLICY_SUBSTITUTE
+               , POLICY_IGNORE
+               };
+
 private:
        static const uint asc2ebc[256];
        static const uint ebc2asc[256];
@@ -50,10 +57,12 @@ private:
        static const uint UNICODE_ERROR;
 
        type_t type;
+       policy_t policy;
 
 public:
        Enc()
                : type (ASCII)
+               , policy (POLICY_IGNORE)
        { }
 
        bool operator != (const Enc & e) const { return type != e.type; }
@@ -67,8 +76,10 @@ public:
        inline void unset(type_t);
        inline bool is(type_t) const;
 
+       inline void setPolicy(policy_t t);
+
        bool encode(uint & c) const;
-       uint decode(uint c) const;
+       uint decodeUnsafe(uint c) const;
        Range * encodeRange(uint l, uint h) const;
        Range * fullRange() const;
 };
@@ -154,6 +165,11 @@ inline bool Enc::is(type_t t) const
        return type == t;
 }
 
+inline void Enc::setPolicy(policy_t t)
+{
+       policy = t;
+}
+
 } // namespace re2c
 
 #endif // _enc_h
index 4af5c0f12eb2c1dbbcfa22226286d700e17c93c6..e297770a763666ae4cc4278e8c9a0c6d6fcd1c88 100644 (file)
@@ -129,6 +129,7 @@ static const mbo_opt_struct OPTIONS[] =
        mbo_opt_struct(10,  0, "no-generation-date"),
        mbo_opt_struct(11,  0, "case-insensitive"),
        mbo_opt_struct(12,  0, "case-inverted"),
+       mbo_opt_struct(13,  1, "encoding-policy"),
        mbo_opt_struct('-', 0, NULL) /* end of args */
 };
 
@@ -211,6 +212,9 @@ static void usage()
        "--case-inverted         Invert the meaning of single and double quoted strings.\n"
        "                        With this switch single quotes are case sensitive and\n"
        "                        double quotes are case insensitive.\n"
+       "\n"
+       "--encoding-policy ep    Specify what re2c should do when given bad code unit.\n"
+       "                        ep can be one of the following: fail, substitute, ignore.\n"
        ;
 }
 
@@ -326,7 +330,7 @@ int main(int argc, char *argv[])
                                cout << vernum << endl;
                                return 2;
                        }
-
+                       
                        case 'w':
                        sFlag = true;
                        if (!encoding.set(Enc::UCS2))
@@ -361,7 +365,7 @@ int main(int argc, char *argv[])
                                return 2;
                        }
                        break;
-
+         
                        default:
                        case 'h':
                        case '?':
@@ -379,6 +383,20 @@ int main(int argc, char *argv[])
                        case 12:
                        bCaseInverted = true;
                        break;
+
+                       case 13:
+                       if (strcmp(opt_arg, "fail") == 0)
+                               encoding.setPolicy(Enc::POLICY_FAIL);
+                       else if (strcmp(opt_arg, "substitute") == 0)
+                               encoding.setPolicy(Enc::POLICY_SUBSTITUTE);
+                       else if (strcmp(opt_arg, "ignore") == 0)
+                               encoding.setPolicy(Enc::POLICY_IGNORE);
+                       else
+                       {
+                               std::cerr << "re2c: error: Invalid encoding policy: \"" << opt_arg << "\"\n";
+                               return 1;
+                       }
+                       break;
                }
        }