* it is assumed that user considers it to be valid.
* We must check it.
*
- * Returns false if this code point is forbidden
- * by current policy, otherwise returns true.
- * Overwrites code point.
+ * Returns false if this code point exceeds the maximum
+ * or is forbidden by the current policy; otherwise
+ * returns true. Overwrites the code point.
*/
bool Enc::encode(uint32_t & c) const
{
+ if (c >= nCodePoints ())
+ {
+ return false;
+ }
+
switch (type)
{
case ASCII:
- c &= 0xFF;
return true;
case EBCDIC:
- c = asc2ebc[c & 0xFF];
+ c = asc2ebc[c];
return true;
case UCS2:
case UTF16:
* it is assumed that user considers that all code
* points from this range are valid. re2c must check it.
*
- * Returns NULL if range contains code points forbidden
- * by current policy, otherwise returns pointer to newly
- * constructed Range.
+ * Returns NULL if the range contains code points that
+ * exceed the maximum or are forbidden by the current policy;
+ * otherwise returns a pointer to a newly constructed range.
*/
Range * Enc::encodeRange(uint32_t l, uint32_t h) const
{
+ if (l >= nCodePoints () || h >= nCodePoints ())
+ {
+ return NULL;
+ }
+
Range * r = NULL;
switch (type)
{
case ASCII:
- if (l > 0xFF || h > 0xFF)
- {
- return NULL;
- }
r = Range::ran (l, h + 1);
break;
case EBCDIC:
{
- const uint32_t el = asc2ebc[l & 0xFF];
+ const uint32_t el = asc2ebc[l];
r = Range::sym (el);
for (uint32_t c = l + 1; c <= h; ++c)
{
- const uint32_t ec = asc2ebc[c & 0xFF];
+ const uint32_t ec = asc2ebc[c];
r = Range::add (r, Range::sym (ec));
}
break;
}
/*
- * Returns [0 - CPOINT_MAX] (full range) representation
- * for current encoding with regard to current policy.
+ * Returns the full range representation for the current
+ * encoding, with regard to the current policy.
*
* Since range is defined declaratively, re2c does
* all the necessary corrections 'for free'.
*
* Always succeeds, returns pointer to newly constructed
- * Range.
+ * range.
*/
Range * Enc::fullRange() const
{
namespace re2c {
-// Each encoding defines two concepts:
-//
-// 1) Code point -- abstract number, which represents single encoding symbol.
-// E.g., Unicode defines 0x10FFFF code points, so each Unicode encoding
-// must be capable of representing 0x10FFFF code points.
-//
-// 2) Code unit -- the smallest unit of memory, which is used in the encoded
-// text. One or more code units can be needed to represent a single code
-// point, depending on the encoding. For each encoding, all code points
-// either are represented with equal number of code units (fixed-length
-// encodings), or with variable number of code units (variable-length
-// encodings).
-//
-// encoding | code point number | code point size | code unit number | code unit size
-// ---------|-------------------|-----------------------|------------------|----------------
-// ASCII | 0xFF | fixed, 1 byte | 0xFF | 1 byte
-// EBCDIC | 0xFF | fixed, 1 byte | 0xFF | 1 byte
-// UCS2 | 0xFFFF | fixed, 2 bytes | 0xFFFF | 2 bytes
-// UTF16 | 0x10FFFF | variable, 2 - 4 bytes | 0xFFFF | 2 bytes
-// UTF32 | 0x10FFFF | fixed, 4 bytes | 0x10FFFF | 4 bytes
-// UTF8 | 0x10FFFF | variable, 1 - 4 bytes | 0xFF | 1 byte
-// -----------------------------------------------------------------------------------------
+/*
+ * note [encodings]
+ *
+ * Each encoding defines two concepts:
+ *
+ * 1) Code point -- an abstract number that represents a single encoding symbol.
+ * E.g., Unicode defines code points in the range [0 - 0x10FFFF], so each
+ * Unicode encoding must be capable of representing 0x110000 code points.
+ *
+ * 2) Code unit -- the smallest unit of memory used in the encoded
+ * text. One or more code units may be needed to represent a single code
+ * point, depending on the encoding. Within a given encoding, code points
+ * are represented either with an equal number of code units (fixed-length
+ * encodings) or with a variable number of code units (variable-length
+ * encodings).
+ *
+ * +----------+------------------+-----------------------+-----------------+----------------+
+ * | encoding | code point range | code point size | code unit range | code unit size |
+ * +----------+------------------+-----------------------+-----------------+----------------+
+ * | ASCII | 0 - 0xFF | fixed, 1 byte | 0 - 0xFF | 1 byte |
+ * | EBCDIC | 0 - 0xFF | fixed, 1 byte | 0 - 0xFF | 1 byte |
+ * | UCS2 | 0 - 0xFFFF | fixed, 2 bytes | 0 - 0xFFFF | 2 bytes |
+ * | UTF16 | 0 - 0x10FFFF | variable, 2 - 4 bytes | 0 - 0xFFFF | 2 bytes |
+ * | UTF32 | 0 - 0x10FFFF | fixed, 4 bytes | 0 - 0x10FFFF | 4 bytes |
+ * | UTF8 | 0 - 0x10FFFF | variable, 1 - 4 bytes | 0 - 0xFF | 1 byte |
+ * +----------+------------------+-----------------------+-----------------+----------------+
+ */
class Enc
{