Check if code point exceeds maximum. Correctly cast 'char' to 'uint32_t' (via 'uint8_t', so that a negative 'char' is not sign-extended).

author Ulya Trofimovich <skvadrik@gmail.com>

Thu, 20 Aug 2015 11:09:52 +0000 (12:09 +0100)

committer Ulya Trofimovich <skvadrik@gmail.com>

Thu, 20 Aug 2015 11:09:52 +0000 (12:09 +0100)
author Ulya Trofimovich <skvadrik@gmail.com>
Thu, 20 Aug 2015 11:09:52 +0000 (12:09 +0100)
committer Ulya Trofimovich <skvadrik@gmail.com>
Thu, 20 Aug 2015 11:09:52 +0000 (12:09 +0100)
diff --git a/re2c/bootstrap/src/parse/scanner_lex.cc b/re2c/bootstrap/src/parse/scanner_lex.cc

index 88f14d5f622c5d66b52da99cb8bdefbd51d8c29b..bb77bd471f3d7d6c215898e0928707984bc95093 100644 (file)
--- a/re2c/bootstrap/src/parse/scanner_lex.cc
+++ b/re2c/bootstrap/src/parse/scanner_lex.cc
@@ -1,4 +1,4 @@
-/* Generated by re2c 0.14.3 on Wed Aug 19 18:44:26 2015 */
+/* Generated by re2c 0.14.3 on Thu Aug 20 12:08:21 2015 */
  #line 1 "../src/parse/scanner_lex.re"
  #include <stdlib.h>
  #include <string.h>
@@ -1512,7 +1512,7 @@ yy237:
                 }
                 else
                 {
-                       cpoints.push_back (static_cast<uint32_t> (c));
+                       cpoints.push_back (static_cast<uint8_t> (c));
                         goto cpoints;
                 }
         }
@@ -1526,7 +1526,7 @@ yy239:
                 {
                         warn.useless_escape (tline, tok - pos, c);
                 }
-               cpoints.push_back (static_cast<uint32_t> (c));
+               cpoints.push_back (static_cast<uint8_t> (c));
                 goto cpoints;
         }
  #line 1533 "src/parse/scanner_lex.cc"
diff --git a/re2c/src/ir/regexp/encoding/enc.cc b/re2c/src/ir/regexp/encoding/enc.cc

index 65857cab375e20e0cfcacc1a6676fe582fe81be6..029d525cd831d86c3acc44c87b590739aef3fdf7 100644 (file)
--- a/re2c/src/ir/regexp/encoding/enc.cc
+++ b/re2c/src/ir/regexp/encoding/enc.cc
@@ -54,19 +54,23 @@ const uint32_t Enc::ebc2asc[256] =
   * it is assumed that user considers it to be valid.
   * We must check it.
   *
- * Returns false if this code point is forbidden
- * by current policy, otherwise returns true.
- * Overwrites code point.
+ * Returns false if this code point exceeds maximum
+ * or is forbidden by current policy, otherwise
+ * returns true. Overwrites code point.
   */
  bool Enc::encode(uint32_t & c) const
  {
+       if (c >= nCodePoints ())
+       {
+               return false;
+       }
+
         switch (type)
         {
                 case ASCII:
-                       c &= 0xFF;
                         return true;
                 case EBCDIC:
-                       c = asc2ebc[c & 0xFF];
+                       c = asc2ebc[c];
                         return true;
                 case UCS2:
                 case UTF16:
@@ -120,29 +124,30 @@ uint32_t Enc::decodeUnsafe(uint32_t c) const
   * it is assumed that user considers that all code
   * points from this range are valid. re2c must check it.
   *
- * Returns NULL if range contains code points forbidden
- * by current policy, otherwise returns pointer to newly
- * constructed Range.
+ * Returns NULL if range contains code points that
+ * exceed maximum or are forbidden by current policy,
+ * otherwise returns pointer to newly constructed range.
   */
  Range * Enc::encodeRange(uint32_t l, uint32_t h) const
  {
+       if (l >= nCodePoints () || h >= nCodePoints ())
+       {
+               return NULL;
+       }
+
         Range * r = NULL;
         switch (type)
         {
                 case ASCII:
-                       if (l > 0xFF || h > 0xFF)
-                       {
-                               return NULL;
-                       }
                         r = Range::ran (l, h + 1);
                         break;
                 case EBCDIC:
                 {
-                       const uint32_t el = asc2ebc[l & 0xFF];
+                       const uint32_t el = asc2ebc[l];
                         r = Range::sym (el);
                         for (uint32_t c = l + 1; c <= h; ++c)
                         {
-                               const uint32_t ec = asc2ebc[c & 0xFF];
+                               const uint32_t ec = asc2ebc[c];
                                 r = Range::add (r, Range::sym (ec));
                         }
                         break;
@@ -177,14 +182,14 @@ Range * Enc::encodeRange(uint32_t l, uint32_t h) const
  }
  
  /*
- * Returns [0 - CPOINT_MAX] (full range) representation
- * for current encoding with regard to current policy.
+ * Returns full range representation for current encoding
+ * with regard to current policy.
   *
   * Since range is defined declaratively, re2c does
   * all the necessary corrections 'for free'.
   *
   * Always succeeds, returns pointer to newly constructed
- * Range.
+ * range.
   */
  Range * Enc::fullRange() const
  {
diff --git a/re2c/src/ir/regexp/encoding/enc.h b/re2c/src/ir/regexp/encoding/enc.h

index 699b406f9a153412e970c859bed5e1be51ff611b..daa76dbd0727f681e7ad3422145cd5d75940700d 100644 (file)
--- a/re2c/src/ir/regexp/encoding/enc.h
+++ b/re2c/src/ir/regexp/encoding/enc.h
@@ -6,28 +6,33 @@
  
  namespace re2c {
  
-// Each encoding defines two concepts:
-//
-// 1) Code point -- abstract number, which represents single encoding symbol.
-//     E.g., Unicode defines 0x10FFFF code points, so each Unicode encoding
-//     must be capable of representing 0x10FFFF code points.
-//
-// 2) Code unit -- the smallest unit of memory, which is used in the encoded
-//     text. One or more code units can be needed to represent a single code
-//     point, depending on the encoding. For each encoding, all code points
-//     either are represented with equal number of code units (fixed-length
-//     encodings), or with variable number of code units (variable-length
-//     encodings).
-//
-// encoding | code point number | code point size       | code unit number | code unit size
-// ---------|-------------------|-----------------------|------------------|----------------
-// ASCII    | 0xFF              | fixed,        1 byte  | 0xFF             | 1 byte
-// EBCDIC   | 0xFF              | fixed,        1 byte  | 0xFF             | 1 byte
-// UCS2     | 0xFFFF            | fixed,        2 bytes | 0xFFFF           | 2 bytes
-// UTF16    | 0x10FFFF          | variable, 2 - 4 bytes | 0xFFFF           | 2 bytes
-// UTF32    | 0x10FFFF          | fixed,        4 bytes | 0x10FFFF         | 4 bytes
-// UTF8     | 0x10FFFF          | variable, 1 - 4 bytes | 0xFF             | 1 byte
-// -----------------------------------------------------------------------------------------
+/*
+ * note [encodings]
+ *
+ * Each encoding defines two concepts:
+ *
+ * 1) Code point -- abstract number, which represents single encoding symbol.
+ *    E.g., Unicode defines code points in the range [0 - 0x10FFFF], so each
+ *    Unicode encoding must be capable of representing 0x110000 code points.
+ *
+ * 2) Code unit -- the smallest unit of memory, which is used in the encoded
+ *    text. One or more code units can be needed to represent a single code
+ *    point, depending on the encoding. For each encoding, all code points
+ *    either are represented with equal number of code units (fixed-length
+ *    encodings), or with variable number of code units (variable-length
+ *    encodings).
+ *
+ * +----------+------------------+-----------------------+-----------------+----------------+
+ * | encoding | code point range | code point size       | code unit range | code unit size |
+ * +----------+------------------+-----------------------+-----------------+----------------+
+ * | ASCII    | 0 - 0xFF         | fixed,        1 byte  | 0 - 0xFF        | 1 byte         |
+ * | EBCDIC   | 0 - 0xFF         | fixed,        1 byte  | 0 - 0xFF        | 1 byte         |
+ * | UCS2     | 0 - 0xFFFF       | fixed,        2 bytes | 0 - 0xFFFF      | 2 bytes        |
+ * | UTF16    | 0 - 0x10FFFF     | variable, 2 - 4 bytes | 0 - 0xFFFF      | 2 bytes        |
+ * | UTF32    | 0 - 0x10FFFF     | fixed,        4 bytes | 0 - 0x10FFFF    | 4 bytes        |
+ * | UTF8     | 0 - 0x10FFFF     | variable, 1 - 4 bytes | 0 - 0xFF        | 1 byte         |
+ * +----------+------------------+-----------------------+-----------------+----------------+
+ */
  
  class Enc
  {
diff --git a/re2c/src/parse/scanner_lex.re b/re2c/src/parse/scanner_lex.re

index e152ebc0a58638163552a9b03b2ae2f1adb944a3..595225af525904ec28df26ed4eeb46086898bc3f 100644 (file)
--- a/re2c/src/parse/scanner_lex.re
+++ b/re2c/src/parse/scanner_lex.re
@@ -427,7 +427,7 @@ cpoints:
                 {
                         warn.useless_escape (tline, tok - pos, c);
                 }
-               cpoints.push_back (static_cast<uint32_t> (c));
+               cpoints.push_back (static_cast<uint8_t> (c));
                 goto cpoints;
         }
         [^] \ esc
@@ -450,7 +450,7 @@ cpoints:
                 }
                 else
                 {
-                       cpoints.push_back (static_cast<uint32_t> (c));
+                       cpoints.push_back (static_cast<uint8_t> (c));
                         goto cpoints;
                 }
         }
author	Ulya Trofimovich <skvadrik@gmail.com>
	Thu, 20 Aug 2015 11:09:52 +0000 (12:09 +0100)
committer	Ulya Trofimovich <skvadrik@gmail.com>
	Thu, 20 Aug 2015 11:09:52 +0000 (12:09 +0100)
re2c/bootstrap/src/parse/scanner_lex.cc		patch \| blob \| history
re2c/src/ir/regexp/encoding/enc.cc		patch \| blob \| history
re2c/src/ir/regexp/encoding/enc.h		patch \| blob \| history
re2c/src/parse/scanner_lex.re		patch \| blob \| history