From: Ulya Trofimovich Date: Thu, 20 Aug 2015 11:09:52 +0000 (+0100) Subject: Check if code point exceeds maximum. Correctly cast 'char' to 'uint32_t'. X-Git-Tag: 0.15~107 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=0346d1666f58da5dbe35d156e0487d95354153ff;p=re2c Check if code point exceeds maximum. Correctly cast 'char' to 'uint32_t'. First fix: re2c used to check if code point exceeds maximal value for current encoding when parsing it. When I moved parsing code points to lexer I forgot the check. Second fix: I assumed that 'static_cast' on 'signed char' zero-extends. But I was wrong: it sign-extends. Need to cast to 'unsigned char' instead. --- diff --git a/re2c/bootstrap/src/parse/scanner_lex.cc b/re2c/bootstrap/src/parse/scanner_lex.cc index 88f14d5f..bb77bd47 100644 --- a/re2c/bootstrap/src/parse/scanner_lex.cc +++ b/re2c/bootstrap/src/parse/scanner_lex.cc @@ -1,4 +1,4 @@ -/* Generated by re2c 0.14.3 on Wed Aug 19 18:44:26 2015 */ +/* Generated by re2c 0.14.3 on Thu Aug 20 12:08:21 2015 */ #line 1 "../src/parse/scanner_lex.re" #include #include @@ -1512,7 +1512,7 @@ yy237: } else { - cpoints.push_back (static_cast<uint32_t> (c)); + cpoints.push_back (static_cast<uint8_t> (c)); goto cpoints; } } @@ -1526,7 +1526,7 @@ yy239: { warn.useless_escape (tline, tok - pos, c); } - cpoints.push_back (static_cast<uint32_t> (c)); + cpoints.push_back (static_cast<uint8_t> (c)); goto cpoints; } #line 1533 "src/parse/scanner_lex.cc" diff --git a/re2c/src/ir/regexp/encoding/enc.cc b/re2c/src/ir/regexp/encoding/enc.cc index 65857cab..029d525c 100644 --- a/re2c/src/ir/regexp/encoding/enc.cc +++ b/re2c/src/ir/regexp/encoding/enc.cc @@ -54,19 +54,23 @@ const uint32_t Enc::ebc2asc[256] = * it is assumed that user considers it to be valid. * We must check it. * - * Returns false if this code point is forbidden - * by current policy, otherwise returns true. - * Overwrites code point.
+ * Returns false if this code point exceeds maximum + * or is forbidden by current policy, otherwise + * returns true. Overwrites code point. */ bool Enc::encode(uint32_t & c) const { + if (c >= nCodePoints ()) + { + return false; + } + switch (type) { case ASCII: - c &= 0xFF; return true; case EBCDIC: - c = asc2ebc[c & 0xFF]; + c = asc2ebc[c]; return true; case UCS2: case UTF16: @@ -120,29 +124,30 @@ uint32_t Enc::decodeUnsafe(uint32_t c) const * it is assumed that user considers that all code * points from this range are valid. re2c must check it. * - * Returns NULL if range contains code points forbidden - * by current policy, otherwise returns pointer to newly - * constructed Range. + * Returns NULL if range contains code points that + * exceed maximum or are forbidden by current policy, + * otherwise returns pointer to newly constructed range. */ Range * Enc::encodeRange(uint32_t l, uint32_t h) const { + if (l >= nCodePoints () || h >= nCodePoints ()) + { + return NULL; + } + Range * r = NULL; switch (type) { case ASCII: - if (l > 0xFF || h > 0xFF) - { - return NULL; - } r = Range::ran (l, h + 1); break; case EBCDIC: { - const uint32_t el = asc2ebc[l & 0xFF]; + const uint32_t el = asc2ebc[l]; r = Range::sym (el); for (uint32_t c = l + 1; c <= h; ++c) { - const uint32_t ec = asc2ebc[c & 0xFF]; + const uint32_t ec = asc2ebc[c]; r = Range::add (r, Range::sym (ec)); } break; @@ -177,14 +182,14 @@ Range * Enc::encodeRange(uint32_t l, uint32_t h) const } /* - * Returns [0 - CPOINT_MAX] (full range) representation - * for current encoding with regard to current policy. + * Returns full range representation for current encoding + * with regard to current policy. * * Since range is defined declaratively, re2c does * all the necessary corrections 'for free'. * * Always succeeds, returns pointer to newly constructed - * Range. + * range. 
*/ Range * Enc::fullRange() const { diff --git a/re2c/src/ir/regexp/encoding/enc.h b/re2c/src/ir/regexp/encoding/enc.h index 699b406f..daa76dbd 100644 --- a/re2c/src/ir/regexp/encoding/enc.h +++ b/re2c/src/ir/regexp/encoding/enc.h @@ -6,28 +6,33 @@ namespace re2c { -// Each encoding defines two concepts: -// -// 1) Code point -- abstract number, which represents single encoding symbol. -// E.g., Unicode defines 0x10FFFF code points, so each Unicode encoding -// must be capable of representing 0x10FFFF code points. -// -// 2) Code unit -- the smallest unit of memory, which is used in the encoded -// text. One or more code units can be needed to represent a single code -// point, depending on the encoding. For each encoding, all code points -// either are represented with equal number of code units (fixed-length -// encodings), or with variable number of code units (variable-length -// encodings). -// -// encoding | code point number | code point size | code unit number | code unit size -// ---------|-------------------|-----------------------|------------------|---------------- -// ASCII | 0xFF | fixed, 1 byte | 0xFF | 1 byte -// EBCDIC | 0xFF | fixed, 1 byte | 0xFF | 1 byte -// UCS2 | 0xFFFF | fixed, 2 bytes | 0xFFFF | 2 bytes -// UTF16 | 0x10FFFF | variable, 2 - 4 bytes | 0xFFFF | 2 bytes -// UTF32 | 0x10FFFF | fixed, 4 bytes | 0x10FFFF | 4 bytes -// UTF8 | 0x10FFFF | variable, 1 - 4 bytes | 0xFF | 1 byte -// ----------------------------------------------------------------------------------------- +/* + * note [encodings] + * + * Each encoding defines two concepts: + * + * 1) Code point -- abstract number, which represents single encoding symbol. + * E.g., Unicode defines code points in the range [0 - 0x10FFFF] , so each + * Unicode encoding must be capable of representing 0x110000 code points. + * + * 2) Code unit -- the smallest unit of memory, which is used in the encoded + * text. 
One or more code units can be needed to represent a single code + * point, depending on the encoding. For each encoding, all code points + * either are represented with equal number of code units (fixed-length + * encodings), or with variable number of code units (variable-length + * encodings). + * + * +----------+------------------+-----------------------+-----------------+----------------+ + * | encoding | code point range | code point size | code unit range | code unit size | + * +----------+------------------+-----------------------+-----------------+----------------+ + * | ASCII | 0 - 0xFF | fixed, 1 byte | 0 - 0xFF | 1 byte | + * | EBCDIC | 0 - 0xFF | fixed, 1 byte | 0 - 0xFF | 1 byte | + * | UCS2 | 0 - 0xFFFF | fixed, 2 bytes | 0 - 0xFFFF | 2 bytes | + * | UTF16 | 0 - 0x10FFFF | variable, 2 - 4 bytes | 0 - 0xFFFF | 2 bytes | + * | UTF32 | 0 - 0x10FFFF | fixed, 4 bytes | 0 - 0x10FFFF | 4 bytes | + * | UTF8 | 0 - 0x10FFFF | variable, 1 - 4 bytes | 0 - 0xFF | 1 byte | + * +----------+------------------+-----------------------+-----------------+----------------+ + */ class Enc { diff --git a/re2c/src/parse/scanner_lex.re b/re2c/src/parse/scanner_lex.re index e152ebc0..595225af 100644 --- a/re2c/src/parse/scanner_lex.re +++ b/re2c/src/parse/scanner_lex.re @@ -427,7 +427,7 @@ cpoints: { warn.useless_escape (tline, tok - pos, c); } - cpoints.push_back (static_cast<uint32_t> (c)); + cpoints.push_back (static_cast<uint8_t> (c)); goto cpoints; } [^] \ esc @@ -450,7 +450,7 @@ cpoints: } else { - cpoints.push_back (static_cast<uint32_t> (c)); + cpoints.push_back (static_cast<uint8_t> (c)); goto cpoints; } }