namespace re2c {
-const uint32_t utf16::MAX_1WORD_RUNE = 0xFFFF;
-const uint32_t utf16::MIN_LEAD_SURR = 0xD800;
-const uint32_t utf16::MIN_TRAIL_SURR = 0xDC00;
-const uint32_t utf16::MAX_TRAIL_SURR = 0xDFFF;
+const uint32_t utf16::MAX_1WORD_RUNE = 0xFFFFu; // largest rune encoded as one UTF-16 word (no surrogate pair)
+const uint32_t utf16::MIN_LEAD_SURR = 0xD800u; // base value that utf16::lead_surr adds to its result
+const uint32_t utf16::MIN_TRAIL_SURR = 0xDC00u; // base value that utf16::trail_surr adds to its result
+const uint32_t utf16::MAX_TRAIL_SURR = 0xDFFFu; // presumably the last trailing-surrogate code unit; not referenced in the visible code
} // namespace re2c
static const uint32_t MAX_TRAIL_SURR;
/* leading surrogate of UTF-16 symbol */
- static inline uint16_t lead_surr(rune r);
+ static inline uint32_t lead_surr(rune r); // r must be >= 0x10000: the definition subtracts 0x10000 without a range check
/* trailing surrogate of UTF-16 symbol */
- static inline uint16_t trail_surr(rune r);
+ static inline uint32_t trail_surr(rune r); // r must be >= 0x10000: the definition subtracts 0x10000 without a range check
};
-inline uint16_t utf16::lead_surr(rune r)
+inline uint32_t utf16::lead_surr(rune r) // leading (first) word of the surrogate pair that encodes r
{
- return ((r - 0x10000) / 0x400) + MIN_LEAD_SURR;
+ return ((r - 0x10000u) / 0x400u) + MIN_LEAD_SURR; // quotient of the 0x10000-based offset by 0x400, rebased onto MIN_LEAD_SURR; no check that r >= 0x10000 (assumes rune is an unsigned type, so a smaller r would wrap - TODO confirm)
}
-inline uint16_t utf16::trail_surr(rune r)
+inline uint32_t utf16::trail_surr(rune r) // trailing (second) word of the surrogate pair that encodes r
{
- return ((r - 0x10000) % 0x400) + MIN_TRAIL_SURR;
+ return ((r - 0x10000u) % 0x400u) + MIN_TRAIL_SURR; // remainder of the 0x10000-based offset modulo 0x400, rebased onto MIN_TRAIL_SURR; no check that r >= 0x10000 (assumes rune is an unsigned type, so a smaller r would wrap - TODO confirm)
}
} // namespace re2c
/*
* Add word range [w1-w2].
*/
-void UTF16addContinuous1(RangeSuffix * & root, uint16_t l, uint16_t h)
+void UTF16addContinuous1(RangeSuffix * & root, uint32_t l, uint32_t h)
{
RangeSuffix ** p = &root;
for (;;)
* Now that we have catenation of word ranges [l1-h1],[l2-h2],
* we want to add it to existing range, merging suffixes on the fly.
*/
-void UTF16addContinuous2(RangeSuffix * & root, uint16_t l_ld, uint16_t h_ld, uint16_t l_tr, uint16_t h_tr)
+void UTF16addContinuous2(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr)
{
RangeSuffix ** p = &root;
for (;;)
* and represents original range as alternation of continuous
* sub-ranges.
*/
-void UTF16splitByContinuity(RangeSuffix * & root, uint16_t l_ld, uint16_t h_ld, uint16_t l_tr, uint16_t h_tr)
+void UTF16splitByContinuity(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr)
{
if (l_ld != h_ld)
{
void UTF16splitByRuneLength(RangeSuffix * & root, utf16::rune l, utf16::rune h)
{
if (l <= utf16::MAX_1WORD_RUNE)
+ { // lower bound l is a one-word rune
if (h <= utf16::MAX_1WORD_RUNE)
+ { // upper bound h is one-word too: add [l, h] as a single word range
UTF16addContinuous1(root, l, h);
+ }
else
{
UTF16addContinuous1(root, l, utf16::MAX_1WORD_RUNE);
- const uint16_t h_ld = utf16::lead_surr(h);
- const uint16_t h_tr = utf16::trail_surr(h);
+ const uint32_t h_ld = utf16::lead_surr(h); // leading surrogate of the upper bound (h > MAX_1WORD_RUNE in this branch)
+ const uint32_t h_tr = utf16::trail_surr(h); // trailing surrogate of the upper bound
UTF16splitByContinuity(root, utf16::MIN_LEAD_SURR, h_ld, utf16::MIN_TRAIL_SURR, h_tr);
}
+ } // end of the l <= MAX_1WORD_RUNE case
else
{
- const uint16_t l_ld = utf16::lead_surr(l);
- const uint16_t l_tr = utf16::trail_surr(l);
- const uint16_t h_ld = utf16::lead_surr(h);
- const uint16_t h_tr = utf16::trail_surr(h);
+ const uint32_t l_ld = utf16::lead_surr(l); // leading surrogate of the lower bound (l > MAX_1WORD_RUNE in this branch)
+ const uint32_t l_tr = utf16::trail_surr(l); // trailing surrogate of the lower bound
+ const uint32_t h_ld = utf16::lead_surr(h); // leading surrogate of the upper bound; assumes h >= l so h is also a two-word rune - TODO confirm with callers
+ const uint32_t h_tr = utf16::trail_surr(h); // trailing surrogate of the upper bound
UTF16splitByContinuity(root, l_ld, h_ld, l_tr, h_tr);
}
}
namespace re2c {
-void UTF16addContinuous1(RangeSuffix * & root, uint16_t l, uint16_t h);
-void UTF16addContinuous2(RangeSuffix * & root, uint16_t l_ld, uint16_t h_ld, uint16_t l_tr, uint16_t h_tr);
-void UTF16splitByContinuity(RangeSuffix * & root, uint16_t l_ld, uint16_t h_ld, uint16_t l_tr, uint16_t h_tr);
+void UTF16addContinuous1(RangeSuffix * & root, uint32_t l, uint32_t h); // adds the single-word range [l-h] to root
+void UTF16addContinuous2(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr); // adds the catenation of word ranges [l_ld-h_ld],[l_tr-h_tr] to root, merging suffixes
+void UTF16splitByContinuity(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr); // represents a surrogate-pair range as an alternation of continuous sub-ranges
void UTF16splitByRuneLength(RangeSuffix * & root, utf16::rune l, utf16::rune h);
} // namespace re2c
return new MatchOp(Range::sym (r));
else
{
- const uint16_t ld = utf16::lead_surr(r);
- const uint16_t tr = utf16::trail_surr(r);
+ const uint32_t ld = utf16::lead_surr(r);
+ const uint32_t tr = utf16::trail_surr(r);
return new CatOp(new MatchOp(Range::sym (ld)), new MatchOp(Range::sym (tr)));
}
}
namespace re2c {
-uint32_t utf8::rune_to_bytes(uint8_t *str, rune c)
+const uint32_t utf8::ERROR = 0xFFFDu; // rune that rune_to_bytes substitutes for any input above MAX_RUNE
+
+const utf8::rune utf8::MAX_1BYTE_RUNE = 0x7Fu; // largest rune encoded as a 1-byte sequence
+const utf8::rune utf8::MAX_2BYTE_RUNE = 0x7FFu; // largest rune encoded as a 2-byte sequence
+const utf8::rune utf8::MAX_3BYTE_RUNE = 0xFFFFu; // largest rune encoded as a 3-byte sequence
+const utf8::rune utf8::MAX_4BYTE_RUNE = 0x10FFFFu; // largest rune encoded as a 4-byte sequence
+const utf8::rune utf8::MAX_RUNE = utf8::MAX_4BYTE_RUNE; // largest rune rune_to_bytes encodes as itself
+
+const uint32_t utf8::PREFIX_1BYTE = 0u; // 0000 0000 (OR-ed into the only byte of a 1-byte sequence)
+const uint32_t utf8::INFIX = 0x80u; // 1000 0000 (OR-ed into every non-leading byte)
+const uint32_t utf8::PREFIX_2BYTE = 0xC0u; // 1100 0000 (OR-ed into the first byte of a 2-byte sequence)
+const uint32_t utf8::PREFIX_3BYTE = 0xE0u; // 1110 0000 (OR-ed into the first byte of a 3-byte sequence)
+const uint32_t utf8::PREFIX_4BYTE = 0xF0u; // 1111 0000 (OR-ed into the first byte of a 4-byte sequence)
+
+const uint32_t utf8::SHIFT = 6u; // number of rune bits carried by each non-leading byte
+const uint32_t utf8::MASK = 0x3Fu; // 0011 1111 (selects the low SHIFT bits)
+
+uint32_t utf8::rune_to_bytes(uint32_t *str, rune c)
{
- /*
- * one byte sequence
- * 0-0x7F => 0xxxxxxx
- */
+ // one byte sequence: 0-0x7F => 0xxxxxxx
if (c <= MAX_1BYTE_RUNE)
{
str[0] = PREFIX_1BYTE | c;
return 1;
}
- /*
- * two byte sequence
- * 0x80-0x7FF => 110xxxxx 10xxxxxx
- */
+ // two byte sequence: 0x80-0x7FF => 110xxxxx 10xxxxxx
if (c <= MAX_2BYTE_RUNE)
{
str[0] = PREFIX_2BYTE | (c >> 1*SHIFT);
return 2;
}
- /*
- * If the Rune is out of range, convert it to the error rune.
- * Do this test here because the error rune encodes to three bytes.
- * Doing it earlier would duplicate work, since an out of range
- * Rune wouldn't have fit in one or two bytes.
- */
+ // If the Rune is out of range, convert it to the error rune.
+ // Do this test here because the error rune encodes to three bytes.
+ // Doing it earlier would duplicate work, since an out of range
+ // Rune wouldn't have fit in one or two bytes.
if (c > MAX_RUNE)
c = ERROR;
- /*
- * three byte sequence
- * 0x800 - 0xFFFF => 1110xxxx 10xxxxxx 10xxxxxx
- */
+ // three byte sequence: 0x800 - 0xFFFF => 1110xxxx 10xxxxxx 10xxxxxx
if (c <= MAX_3BYTE_RUNE)
{
str[0] = PREFIX_3BYTE | (c >> 2*SHIFT);
return 3;
}
- /*
- * four byte sequence (21-bit value)
- * 0x10000 - 0x1FFFFF => 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
- */
+ // four byte sequence (21-bit value):
+ // 0x10000 - 0x1FFFFF => 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
str[0] = PREFIX_4BYTE | (c >> 3*SHIFT);
str[1] = INFIX | ((c >> 2*SHIFT) & MASK);
str[2] = INFIX | ((c >> 1*SHIFT) & MASK);
public:
typedef uint32_t rune;
- enum { MAX_RUNE_LENGTH = 4 /* maximum characters per rune */
+ // maximum number of encoded characters (array elements) per rune
+ // kept as an enum instead of a static const member because it is used as an array bound, which would otherwise trigger [-Wvla]
+ enum { MAX_RUNE_LENGTH = 4u };
- , ERROR = 0xFFFD /* decoding error */
+ // decoding error: rune substituted for input that is out of range
+ static const uint32_t ERROR;
- /* maximal runes for each rune length */
- , MAX_1BYTE_RUNE = 0x7F
- , MAX_2BYTE_RUNE = 0x7FF
- , MAX_3BYTE_RUNE = 0xFFFF
- , MAX_4BYTE_RUNE = 0x10FFFF
- , MAX_RUNE = MAX_4BYTE_RUNE
+ // maximal rune for each encoded length (values are defined out of class)
+ static const rune MAX_1BYTE_RUNE;
+ static const rune MAX_2BYTE_RUNE;
+ static const rune MAX_3BYTE_RUNE;
+ static const rune MAX_4BYTE_RUNE;
+ static const rune MAX_RUNE;
- , PREFIX_1BYTE = 0 /* 0000 0000 */
- , INFIX = 0x80 /* 1000 0000 */
- , PREFIX_2BYTE = 0xC0 /* 1100 0000 */
- , PREFIX_3BYTE = 0xE0 /* 1110 0000 */
- , PREFIX_4BYTE = 0xF0 /* 1111 0000 */
+ static const uint32_t PREFIX_1BYTE; // 0000 0000
+ static const uint32_t INFIX; // 1000 0000
+ static const uint32_t PREFIX_2BYTE; // 1100 0000
+ static const uint32_t PREFIX_3BYTE; // 1110 0000
+ static const uint32_t PREFIX_4BYTE; // 1111 0000
- , SHIFT = 6
- , MASK = 0x3F /* 0011 1111 */
- };
+ static const uint32_t SHIFT; // rune bits carried by each non-leading byte
+ static const uint32_t MASK; // 0011 1111
- /* UTF-8 bytestring for given Unicode rune */
- static uint32_t rune_to_bytes(uint8_t * s, rune r);
+ // writes the UTF-8 bytestring for Unicode rune r into s, one byte value per element, and returns the number of elements written; runes above MAX_RUNE are encoded as ERROR
+ static uint32_t rune_to_bytes(uint32_t * s, rune r);
- /* length of UTF-8 bytestring for given Unicode rune */
+ // returns the length of the UTF-8 bytestring that encodes Unicode rune r
static uint32_t rune_length(rune r);
- /* maximal Unicode rune with given length of UTF-8 bytestring */
+ // returns the maximal Unicode rune whose UTF-8 bytestring has length i (presumably 1..MAX_RUNE_LENGTH - confirm against the definition)
static rune max_rune(uint32_t i);
};
*/
void UTF8addContinuous(RangeSuffix * & root, utf8::rune l, utf8::rune h, uint32_t n)
{
- uint8_t cl[utf8::MAX_RUNE_LENGTH];
- uint8_t ch[utf8::MAX_RUNE_LENGTH];
+ uint32_t cl[utf8::MAX_RUNE_LENGTH];
+ uint32_t ch[utf8::MAX_RUNE_LENGTH];
utf8::rune_to_bytes(cl, l);
utf8::rune_to_bytes(ch, h);
RegExp * UTF8Symbol(utf8::rune r)
{
- uint8_t chars[utf8::MAX_RUNE_LENGTH];
+ uint32_t chars[utf8::MAX_RUNE_LENGTH];
const int chars_count = utf8::rune_to_bytes(chars, r);
RegExp * re = new MatchOp(Range::sym (chars[0]));
for (int i = 1; i < chars_count; ++i)