Added option to control how invalid code points are treated.

author Ulya Fokanova <skvadrik@gmail.com>

Sun, 13 Apr 2014 10:25:01 +0000 (13:25 +0300)

committer Ulya Fokanova <skvadrik@gmail.com>

Sun, 13 Apr 2014 10:25:01 +0000 (13:25 +0300)
author Ulya Fokanova <skvadrik@gmail.com>
Sun, 13 Apr 2014 10:25:01 +0000 (13:25 +0300)
committer Ulya Fokanova <skvadrik@gmail.com>
Sun, 13 Apr 2014 10:25:01 +0000 (13:25 +0300)
diff --git a/re2c/actions.cc b/re2c/actions.cc

index d4a8d2b1605b2dba7fd11201ccdcd9ae1bc5cc51..d54b3ddf466474b8a438bbc8dc5789cbfaad3228 100644 (file)
--- a/re2c/actions.cc
+++ b/re2c/actions.cc
@@ -893,6 +893,9 @@ RegExp * Scanner::mkDot() const
   * in current encoding. For encodings, which directly map symbols to
   * input characters (ASCII, EBCDIC, UTF-32), it equals [^]. For other
   * encodings (UTF-16, UTF-8), [^] and this range are different.
+ *
+ * Also note that default range doesn't respect encoding policy
+ * (the way invalid code points are treated).
   */
  RegExp * Scanner::mkDefault() const
  {
diff --git a/re2c/code.cc b/re2c/code.cc

index 65e64f56fc03bb46ef8238a1cee18d4d72ffb544..8e2583603baa682fbb428c5855c3581754dc90f8 100644 (file)
--- a/re2c/code.cc
+++ b/re2c/code.cc
@@ -910,7 +910,7 @@ static bool genCases(std::ostream &o, uint ind, uint lb, Span *s, bool &newLine,
                                         o << ":";
                                         if (dFlag && encoding.is(Enc::EBCDIC))
                                         {
-                                               const uint c = encoding.decode(lb);
+                                               const uint c = encoding.decodeUnsafe(lb);
                                                 if (isprint(c))
                                                         o << " /* " << std::string(1, c) << " */";
                                         }
diff --git a/re2c/enc.cc b/re2c/enc.cc

index 67f7e11a0064613b400ff595f7266094a1f30da2..ddf389aa779c2af23ff6ee22ff61da5d1c9b403e 100644 (file)
--- a/re2c/enc.cc
+++ b/re2c/enc.cc
@@ -46,6 +46,18 @@ const uint Enc::ebc2asc[256] =
          0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0xb3, 0xdb, 0xdc, 0xd9, 0xda, 0x9f
      };
  
+/*
+ * Returns code point representation for current
+ * encoding with regard to current policy.
+ *
+ * Since code point is exactly specified by user,
+ * it is assumed that user considers it to be valid.
+ * We must check it.
+ *
+ * Returns false if this code point is forbidden
+ * by current policy, otherwise returns true.
+ * Overwrites code point.
+ */
  bool Enc::encode(uint & c) const
  {
         switch (type)
@@ -60,12 +72,30 @@ bool Enc::encode(uint & c) const
                 case UTF16:
                 case UTF32:
                 case UTF8:
-                       return true;
+                       if (c < SURR_MIN || c > SURR_MAX)
+                               return true;
+                       else
+                       {
+                               switch (policy)
+                               {
+                                       case POLICY_FAIL:
+                                               return false;
+                                       case POLICY_SUBSTITUTE:
+                                               c = UNICODE_ERROR;
+                                               return true;
+                                       case POLICY_IGNORE:
+                                               return true;
+                               }
+                       }
         }
         return false; // to silence gcc warning
  }
  
-uint Enc::decode(uint c) const
+/*
+ * Returns original representation of code point.
+ * Assumes code point is valid (hence 'unsafe').
+ */
+uint Enc::decodeUnsafe(uint c) const
  {
         switch (type)
         {
@@ -82,6 +112,18 @@ uint Enc::decode(uint c) const
         return c;
  }
  
+/*
+ * Returns [l - h] range representation for current
+ * encoding with regard to current policy.
+ *
+ * Since range borders are exactly specified by user,
+ * it is assumed that user considers that all code
+ * points from this range are valid. re2c must check it.
+ *
+ * Returns NULL if range contains code points forbidden
+ * by current policy, otherwise returns pointer to newly
+ * constructed Range.
+ */
  Range * Enc::encodeRange(uint l, uint h) const
  {
         Range * r = NULL;
@@ -108,28 +150,47 @@ Range * Enc::encodeRange(uint l, uint h) const
                 case UTF32:
                 case UTF8:
                         r = new Range(l, h + 1);
+                       if (l <= SURR_MAX && h >= SURR_MIN)
+                       {
+                               switch (policy)
+                               {
+                                       case POLICY_FAIL:
+                                               r = NULL;
+                                               break;
+                                       case POLICY_SUBSTITUTE:
+                                       {
+                                               Range * surrs = new Range(SURR_MIN, SURR_MAX + 1);
+                                               Range * error = new Range(UNICODE_ERROR, UNICODE_ERROR + 1);
+                                               r = doDiff(r, surrs);
+                                               r = doUnion(r, error);
+                                               break;
+                                       }
+                                       case POLICY_IGNORE:
+                                               break;
+                               }
+                       }
                         break;
         }
         return r;
  }
  
+/*
+ * Returns [0 - CPOINT_MAX] (full range) representation
+ * for current encoding with regard to current policy.
+ *
+ * Since range is defined declaratively, re2c does
+ * all the necessary corrections 'for free'.
+ *
+ * Always succeeds, returns pointer to newly constructed
+ * Range.
+ */
  Range * Enc::fullRange() const
  {
-       Range * r = NULL;
-       switch (type)
+       Range * r = new Range(0, nCodePoints());
+       if (policy != POLICY_IGNORE)
         {
-               case ASCII:
-               case EBCDIC:
-                       r = new Range(0, 0x100);
-                       break;
-               case UCS2:
-                       r = new Range(0, 0x10000);
-                       break;
-               case UTF16:
-               case UTF32:
-               case UTF8:
-                       r = new Range(0, 0x110000);
-                       break;
+               Range * surrs = new Range(SURR_MIN, SURR_MAX + 1);
+               r = doDiff(r, surrs);
         }
         return r;
  }
diff --git a/re2c/enc.h b/re2c/enc.h

index b8df64c45035a5546ab4e8d20aaa4a4aadd74327..bc7d0edf731a4f3a2fdc224d22b37dc385bbd59d 100644 (file)
--- a/re2c/enc.h
+++ b/re2c/enc.h
@@ -42,6 +42,13 @@ public:
                 , UTF8
                 };
  
+       // What to do with invalid code points
+       enum policy_t
+               { POLICY_FAIL
+               , POLICY_SUBSTITUTE
+               , POLICY_IGNORE
+               };
+
  private:
         static const uint asc2ebc[256];
         static const uint ebc2asc[256];
@@ -50,10 +57,12 @@ private:
         static const uint UNICODE_ERROR;
  
         type_t type;
+       policy_t policy;
  
  public:
         Enc()
                 : type (ASCII)
+               , policy (POLICY_IGNORE)
         { }
  
         bool operator != (const Enc & e) const { return type != e.type; }
@@ -67,8 +76,10 @@ public:
         inline void unset(type_t);
         inline bool is(type_t) const;
  
+       inline void setPolicy(policy_t t);
+
         bool encode(uint & c) const;
-       uint decode(uint c) const;
+       uint decodeUnsafe(uint c) const;
         Range * encodeRange(uint l, uint h) const;
         Range * fullRange() const;
  };
@@ -154,6 +165,11 @@ inline bool Enc::is(type_t t) const
         return type == t;
  }
  
+inline void Enc::setPolicy(policy_t t)
+{
+       policy = t;
+}
+
  } // namespace re2c
  
  #endif // _enc_h
diff --git a/re2c/main.cc b/re2c/main.cc

index 4af5c0f12eb2c1dbbcfa22226286d700e17c93c6..e297770a763666ae4cc4278e8c9a0c6d6fcd1c88 100644 (file)
--- a/re2c/main.cc
+++ b/re2c/main.cc
@@ -129,6 +129,7 @@ static const mbo_opt_struct OPTIONS[] =
         mbo_opt_struct(10,  0, "no-generation-date"),
         mbo_opt_struct(11,  0, "case-insensitive"),
         mbo_opt_struct(12,  0, "case-inverted"),
+       mbo_opt_struct(13,  1, "encoding-policy"),
         mbo_opt_struct('-', 0, NULL) /* end of args */
  };
  
@@ -211,6 +212,9 @@ static void usage()
         "--case-inverted         Invert the meaning of single and double quoted strings.\n"
         "                        With this switch single quotes are case sensitive and\n"
         "                        double quotes are case insensitive.\n"
+       "\n"
+       "--encoding-policy ep    Specify what re2c should do when given bad code unit.\n"
+       "                        ep can be one of the following: fail, substitute, ignore.\n"
         ;
  }
  
@@ -326,7 +330,7 @@ int main(int argc, char *argv[])
                                 cout << vernum << endl;
                                 return 2;
                         }
-
+                       
                         case 'w':
                         sFlag = true;
                         if (!encoding.set(Enc::UCS2))
@@ -361,7 +365,7 @@ int main(int argc, char *argv[])
                                 return 2;
                         }
                         break;
-
+         
                         default:
                         case 'h':
                         case '?':
@@ -379,6 +383,20 @@ int main(int argc, char *argv[])
                         case 12:
                         bCaseInverted = true;
                         break;
+
+                       case 13:
+                       if (strcmp(opt_arg, "fail") == 0)
+                               encoding.setPolicy(Enc::POLICY_FAIL);
+                       else if (strcmp(opt_arg, "substitute") == 0)
+                               encoding.setPolicy(Enc::POLICY_SUBSTITUTE);
+                       else if (strcmp(opt_arg, "ignore") == 0)
+                               encoding.setPolicy(Enc::POLICY_IGNORE);
+                       else
+                       {
+                               std::cerr << "re2c: error: Invalid encoding policy: \"" << opt_arg << "\"\n";
+                               return 1;
+                       }
+                       break;
                 }
         }
author	Ulya Fokanova <skvadrik@gmail.com>
	Sun, 13 Apr 2014 10:25:01 +0000 (13:25 +0300)
committer	Ulya Fokanova <skvadrik@gmail.com>
	Sun, 13 Apr 2014 10:25:01 +0000 (13:25 +0300)
re2c/actions.cc		patch \| blob \| history
re2c/code.cc		patch \| blob \| history
re2c/enc.cc		patch \| blob \| history
re2c/enc.h		patch \| blob \| history
re2c/main.cc		patch \| blob \| history