From: Ulya Fokanova Date: Wed, 9 Apr 2014 22:56:16 +0000 (+0300) Subject: Moved encoding-specific character handling to 'Enc' class. X-Git-Tag: 0.13.7.1~8 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=8de8bfbd2828c127e162604496bf094e98153352;p=re2c Moved encoding-specific character handling to 'Enc' class. --- diff --git a/re2c/actions.cc b/re2c/actions.cc index d3d02478..d4a8d2b1 100644 --- a/re2c/actions.cc +++ b/re2c/actions.cc @@ -735,7 +735,7 @@ std::string& Scanner::unescape(SubStr& str_in, std::string& str_out) const Range * Scanner::getRange(SubStr &s) const { - uint lb = unescape(s), ub, xlb, xub; + uint lb = unescape(s), ub; if (s.len < 2 || *s.str != '-') { @@ -754,39 +754,23 @@ Range * Scanner::getRange(SubStr &s) const } } - xlb = encoding.xlat(lb); - xub = encoding.xlat(ub); - - if (encoding.is(Enc::EBCDIC)) - { - Range * r = new Range(xlb, xlb + 1); - for (uint c = lb + 1; c <= ub; ++c) - { - uint xc = encoding.xlat(c); - r = doUnion(r, new Range(xc, xc + 1)); - } - return r; - } - else - { - return new Range(xlb, xub + 1); - } -} - -RegExp * Scanner::matchChar(uint c) const -{ - uint xc = encoding.xlat(c); - return new MatchOp(new Range(xc, xc + 1)); + Range * r = encoding.encodeRange(lb, ub); + if (r == NULL) + fatalf("Bad code point range: '0x%X - 0x%X'", lb, ub); + return r; } RegExp * Scanner::matchSymbol(uint c) const { + if (!encoding.encode(c)) + fatalf("Bad code point: '0x%X'", c); + if (encoding.is(Enc::UTF16)) return UTF16Symbol(c); else if (encoding.is(Enc::UTF8)) return UTF8Symbol(c); else - return matchChar(c); + return new MatchOp(new Range(c, c + 1)); } RegExp * Scanner::strToRE(SubStr s) const @@ -882,21 +866,23 @@ RegExp * Scanner::invToRE(SubStr s) const s.len -= 3; s.str += 2; - Range * any = new Range(0, encoding.nCodePoints()); + Range * full = encoding.fullRange(); Range * r = s.len == 0 - ? any - : doDiff(any, mkRange (s)); + ? full + : doDiff(full, mkRange (s)); return matchSymbolRange(r); } RegExp * Scanner::mkDot() const { - Range * any = new Range(0, encoding.nCodePoints()); - const uint c = encoding.xlat('\n'); + Range * full = encoding.fullRange(); + uint c = '\n'; + if (!encoding.encode(c)) + fatalf("Bad code point: '0x%X'", c); Range * ran = new Range(c, c + 1); - Range * inv = doDiff(any, ran); + Range * inv = doDiff(full, ran); return matchSymbolRange(inv); } diff --git a/re2c/code.cc b/re2c/code.cc index 40bf7b6f..65e64f56 100644 --- a/re2c/code.cc +++ b/re2c/code.cc @@ -908,9 +908,11 @@ static bool genCases(std::ostream &o, uint ind, uint lb, Span *s, bool &newLine, o << indent(ind) << "case "; prtChOrHex(o, lb); o << ":"; - if (dFlag && encoding.is(Enc::EBCDIC) && lb < 256u && isprint(encoding.talx(lb))) + if (dFlag && encoding.is(Enc::EBCDIC)) { - o << " /* " << std::string(1, encoding.talx(lb)) << " */"; + const uint c = encoding.decode(lb); + if (isprint(c)) + o << " /* " << std::string(1, c) << " */"; } } newLine = false; diff --git a/re2c/enc.cc b/re2c/enc.cc index 6d85b412..67f7e11a 100644 --- a/re2c/enc.cc +++ b/re2c/enc.cc @@ -2,6 +2,10 @@ namespace re2c { +const uint Enc::SURR_MIN = 0xD800; +const uint Enc::SURR_MAX = 0xDFFF; +const uint Enc::UNICODE_ERROR = 0xFFFD; + const uint Enc::asc2ebc[256] = { /* Based on ISO 8859/1 and Code Page 37 */ 0x00, 0x01, 0x02, 0x03, 0x37, 0x2d, 0x2e, 0x2f, 0x16, 0x05, 0x25, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, @@ -42,4 +46,92 @@ const uint Enc::ebc2asc[256] = 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0xb3, 0xdb, 0xdc, 0xd9, 0xda, 0x9f }; +bool Enc::encode(uint & c) const +{ + switch (type) + { + case ASCII: + c &= 0xFF; + return true; + case EBCDIC: + c = asc2ebc[c & 0xFF]; + return true; + case UCS2: + case UTF16: + case UTF32: + case UTF8: + return true; + } + return false; // to silence gcc warning +} + +uint Enc::decode(uint c) const +{ + switch (type) + { + case EBCDIC: + c = ebc2asc[c & 0xFF]; + break; + case ASCII: + case UCS2: + case UTF16: + case UTF32: + case UTF8: + break; + } + return c; +} + +Range * Enc::encodeRange(uint l, uint h) const +{ + Range * r = NULL; + switch (type) + { + case ASCII: + l &= 0xFF; + h &= 0xFF; + r = new Range(l, h + 1); + break; + case EBCDIC: + { + const uint el = asc2ebc[l & 0xFF]; + r = new Range(el, el + 1); + for (uint c = l + 1; c <= h; ++c) + { + const uint ec = asc2ebc[c & 0xFF]; + r = doUnion(r, new Range(ec, ec + 1)); + } + break; + } + case UCS2: + case UTF16: + case UTF32: + case UTF8: + r = new Range(l, h + 1); + break; + } + return r; +} + +Range * Enc::fullRange() const +{ + Range * r = NULL; + switch (type) + { + case ASCII: + case EBCDIC: + r = new Range(0, 0x100); + break; + case UCS2: + r = new Range(0, 0x10000); + break; + case UTF16: + case UTF32: + case UTF8: + r = new Range(0, 0x110000); + break; + } + return r; +} + } // namespace re2c diff --git a/re2c/enc.h b/re2c/enc.h index 40372447..b8df64c4 100644 --- a/re2c/enc.h +++ b/re2c/enc.h @@ -2,6 +2,7 @@ #define _enc_h #include "basics.h" +#include "range.h" namespace re2c { @@ -41,10 +42,13 @@ public: , UTF8 }; +private: static const uint asc2ebc[256]; static const uint ebc2asc[256]; + static const uint SURR_MIN; + static const uint SURR_MAX; + static const uint UNICODE_ERROR; -private: type_t type; public: @@ -63,8 +67,10 @@ public: inline void unset(type_t); inline bool is(type_t) const; - inline uint xlat(uint c) const; - inline uint talx(uint c) const; + bool encode(uint & c) const; + uint decode(uint c) const; + Range * encodeRange(uint l, uint h) const; + Range * fullRange() const; }; inline uint Enc::nCodePoints() const @@ -148,34 +154,6 @@ inline bool Enc::is(type_t t) const return type == t; } -inline uint Enc::xlat(uint c) const -{ - switch (type) - { - case ASCII: return c & 0xFF; - case EBCDIC: return asc2ebc[c & 0xFF]; - case UCS2: - case UTF16: - case UTF32: - case UTF8: return c; - } - return ~0; // to silence gcc warning -} - -inline uint Enc::talx(uint c) const -{ - switch (type) - { - case ASCII: return c & 0xFF; - case EBCDIC: return ebc2asc[c & 0xFF]; - case UCS2: - case UTF16: - case UTF32: - case UTF8: return c; - } - return ~0; // to silence gcc warning -} - } // namespace re2c #endif // _enc_h diff --git a/re2c/scanner.h b/re2c/scanner.h index e3ee3322..8df3e5a6 100644 --- a/re2c/scanner.h +++ b/re2c/scanner.h @@ -77,7 +77,6 @@ public: Range * getRange(SubStr &s) const; RegExp * matchSymbol(uint c) const; RegExp * matchSymbolRange(Range * r) const; - RegExp * matchChar(uint c) const; RegExp * strToName(SubStr s) const; RegExp * strToRE(SubStr s) const; RegExp * strToCaseInsensitiveRE(SubStr s) const;