granicus.if.org Git - re2c/commitdiff
Moved encoding-specific character handling to 'Enc' class.
author: Ulya Fokanova <skvadrik@gmail.com>
Wed, 9 Apr 2014 22:56:16 +0000 (01:56 +0300)
committer: Ulya Fokanova <skvadrik@gmail.com>
Wed, 9 Apr 2014 22:56:16 +0000 (01:56 +0300)
re2c/actions.cc
re2c/code.cc
re2c/enc.cc
re2c/enc.h
re2c/scanner.h

index d3d02478303a921e877366cf5b4b7aa4f425c4b5..d4a8d2b1605b2dba7fd11201ccdcd9ae1bc5cc51 100644 (file)
@@ -735,7 +735,7 @@ std::string& Scanner::unescape(SubStr& str_in, std::string& str_out) const
 
 Range * Scanner::getRange(SubStr &s) const
 {
-       uint lb = unescape(s), ub, xlb, xub;
+       uint lb = unescape(s), ub;
 
        if (s.len < 2 || *s.str != '-')
        {
@@ -754,39 +754,23 @@ Range * Scanner::getRange(SubStr &s) const
                }
        }
 
-       xlb = encoding.xlat(lb);
-       xub = encoding.xlat(ub);
-
-       if (encoding.is(Enc::EBCDIC))
-       {
-               Range * r = new Range(xlb, xlb + 1);
-               for (uint c = lb + 1; c <= ub; ++c)
-               {
-                       uint xc = encoding.xlat(c);
-                       r = doUnion(r, new Range(xc, xc + 1));
-               }
-               return r;
-       }
-       else
-       {
-               return new Range(xlb, xub + 1);
-       }
-}
-
-RegExp * Scanner::matchChar(uint c) const
-{
-       uint xc = encoding.xlat(c);
-       return new MatchOp(new Range(xc, xc + 1));
+       Range * r = encoding.encodeRange(lb, ub);
+       if (r == NULL)
+               fatalf("Bad code point range: '0x%X - 0x%X'", lb, ub);
+       return r;
 }
 
 RegExp * Scanner::matchSymbol(uint c) const
 {
+       if (!encoding.encode(c))
+               fatalf("Bad code point: '0x%X'", c);
+
        if (encoding.is(Enc::UTF16))
                return UTF16Symbol(c);
        else if (encoding.is(Enc::UTF8))
                return UTF8Symbol(c);
        else
-               return matchChar(c);
+               return new MatchOp(new Range(c, c + 1));
 }
 
 RegExp * Scanner::strToRE(SubStr s) const
@@ -882,21 +866,23 @@ RegExp * Scanner::invToRE(SubStr s) const
        s.len -= 3;
        s.str += 2;
 
-       Range * any = new Range(0, encoding.nCodePoints());
+       Range * full = encoding.fullRange();
 
        Range * r = s.len == 0
-               ? any
-               : doDiff(any, mkRange (s));
+               ? full
+               : doDiff(full, mkRange (s));
 
        return matchSymbolRange(r);
 }
 
 RegExp * Scanner::mkDot() const
 {
-       Range * any = new Range(0, encoding.nCodePoints());
-       const uint c = encoding.xlat('\n');
+       Range * full = encoding.fullRange();
+       uint c = '\n';
+       if (!encoding.encode(c))
+               fatalf("Bad code point: '0x%X'", c);
        Range * ran = new Range(c, c + 1);
-       Range * inv = doDiff(any, ran);
+       Range * inv = doDiff(full, ran);
 
        return matchSymbolRange(inv);
 }
index 40bf7b6f60fb753aed53565e8de294eb013fe5e6..65e64f56fc03bb46ef8238a1cee18d4d72ffb544 100644 (file)
@@ -908,9 +908,11 @@ static bool genCases(std::ostream &o, uint ind, uint lb, Span *s, bool &newLine,
                                        o << indent(ind) << "case ";
                                        prtChOrHex(o, lb);
                                        o << ":";
-                                       if (dFlag && encoding.is(Enc::EBCDIC) && lb < 256u && isprint(encoding.talx(lb)))
+                                       if (dFlag && encoding.is(Enc::EBCDIC))
                                        {
-                                               o << " /* " << std::string(1, encoding.talx(lb)) << " */";
+                                               const uint c = encoding.decode(lb);
+                                               if (isprint(c))
+                                                       o << " /* " << std::string(1, c) << " */";
                                        }
                                }
                                newLine = false;
index 6d85b412da5c40cb8d82e5ae7ff48071a172792b..67f7e11a0064613b400ff595f7266094a1f30da2 100644 (file)
@@ -2,6 +2,10 @@
 
 namespace re2c {
 
+const uint Enc::SURR_MIN = 0xD800;
+const uint Enc::SURR_MAX = 0xDFFF;
+const uint Enc::UNICODE_ERROR = 0xFFFD;
+
 const uint Enc::asc2ebc[256] =
     { /* Based on ISO 8859/1 and Code Page 37 */
         0x00, 0x01, 0x02, 0x03, 0x37, 0x2d, 0x2e, 0x2f, 0x16, 0x05, 0x25, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
@@ -42,4 +46,92 @@ const uint Enc::ebc2asc[256] =
         0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0xb3, 0xdb, 0xdc, 0xd9, 0xda, 0x9f
     };
 
+bool Enc::encode(uint & c) const // Rewrites code point c in place according to the active encoding type; every listed case returns true.
+{
+       switch (type)
+       {
+               case ASCII:
+                       c &= 0xFF; // keep only the low 8 bits
+                       return true;
+               case EBCDIC:
+                       c = asc2ebc[c & 0xFF]; // low 8 bits index the 256-entry asc2ebc table
+                       return true;
+               case UCS2:
+               case UTF16:
+               case UTF32:
+               case UTF8:
+                       return true; // c is left unchanged for these types
+       }
+       return false; // to silence gcc warning
+}
+
+uint Enc::decode(uint c) const // Returns c mapped through ebc2asc for EBCDIC; returns c unchanged for every other listed type.
+{
+       switch (type)
+       {
+               case EBCDIC:
+                       c = ebc2asc[c & 0xFF]; // only the low 8 bits of c select the table entry
+                       break;
+               case ASCII:
+               case UCS2:
+               case UTF16:
+               case UTF32:
+               case UTF8:
+                       break; // no translation for these types
+       }
+       return c;
+}
+
+Range * Enc::encodeRange(uint l, uint h) const // Builds a heap-allocated Range for code points l through h; h + 1 is what gets passed as the second Range argument.
+{
+       Range * r = NULL;
+       switch (type)
+       {
+               case ASCII:
+                       l &= 0xFF; // both bounds are truncated to their low 8 bits
+                       h &= 0xFF; // NOTE(review): after masking l may exceed h (e.g. l=0xFF, h=0x100); no check here - confirm Range tolerates it
+                       r = new Range(l, h + 1);
+                       break;
+               case EBCDIC:
+               { // translated values are not ordered like their inputs (see asc2ebc), so the result is a union of one-element ranges
+                       const uint el = asc2ebc[l & 0xFF];
+                       r = new Range(el, el + 1); // start with the translated lower bound alone
+                       for (uint c = l + 1; c <= h; ++c) // NOTE(review): c is compared against the unmasked h; only the table index is masked
+                       {
+                               const uint ec = asc2ebc[c & 0xFF];
+                               r = doUnion(r, new Range(ec, ec + 1)); // add each translated code point individually
+                       }
+                       break;
+               }
+               case UCS2:
+               case UTF16:
+               case UTF32:
+               case UTF8:
+                       r = new Range(l, h + 1); // bounds are used as given, without translation or masking
+                       break;
+       }
+       return r; // stays NULL only if type matches none of the cases above
+}
+
+Range * Enc::fullRange() const // Returns a heap-allocated Range starting at 0 whose second bound depends on the active encoding type.
+{
+       Range * r = NULL;
+       switch (type)
+       {
+               case ASCII:
+               case EBCDIC:
+                       r = new Range(0, 0x100); // 0x100 = 256
+                       break;
+               case UCS2:
+                       r = new Range(0, 0x10000); // 0x10000 = 65536
+                       break;
+               case UTF16:
+               case UTF32:
+               case UTF8:
+                       r = new Range(0, 0x110000); // 0x110000 = 0x10FFFF + 1
+                       break;
+       }
+       return r; // stays NULL only if type matches none of the cases above
+}
+
 } // namespace re2c
index 40372447e3f9bbc20d9c3cd80122eed20c8739ee..b8df64c45035a5546ab4e8d20aaa4a4aadd74327 100644 (file)
@@ -2,6 +2,7 @@
 #define _enc_h
 
 #include "basics.h"
+#include "range.h"
 
 namespace re2c {
 
@@ -41,10 +42,13 @@ public:
                , UTF8
                };
 
+private:
        static const uint asc2ebc[256];
        static const uint ebc2asc[256];
+       static const uint SURR_MIN;
+       static const uint SURR_MAX;
+       static const uint UNICODE_ERROR;
 
-private:
        type_t type;
 
 public:
@@ -63,8 +67,10 @@ public:
        inline void unset(type_t);
        inline bool is(type_t) const;
 
-       inline uint xlat(uint c) const;
-       inline uint talx(uint c) const;
+       bool encode(uint & c) const;
+       uint decode(uint c) const;
+       Range * encodeRange(uint l, uint h) const;
+       Range * fullRange() const;
 };
 
 inline uint Enc::nCodePoints() const
@@ -148,34 +154,6 @@ inline bool Enc::is(type_t t) const
        return type == t;
 }
 
-inline uint Enc::xlat(uint c) const
-{
-       switch (type)
-       {
-               case ASCII:     return c & 0xFF;
-               case EBCDIC:    return asc2ebc[c & 0xFF];
-               case UCS2:
-               case UTF16:
-               case UTF32:
-               case UTF8:      return c;
-       }
-       return ~0; // to silence gcc warning
-}
-
-inline uint Enc::talx(uint c) const
-{
-       switch (type)
-       {
-               case ASCII:     return c & 0xFF;
-               case EBCDIC:    return ebc2asc[c & 0xFF];
-               case UCS2:
-               case UTF16:
-               case UTF32:
-               case UTF8:      return c;
-       }
-       return ~0; // to silence gcc warning
-}
-
 } // namespace re2c
 
 #endif // _enc_h
index e3ee3322889699575dead8a121572b7ae99e7f9c..8df3e5a6e96738dc05ac0504db8f31a5b686dfb0 100644 (file)
@@ -77,7 +77,6 @@ public:
        Range * getRange(SubStr &s) const;
        RegExp * matchSymbol(uint c) const;
        RegExp * matchSymbolRange(Range * r) const;
-       RegExp * matchChar(uint c) const;
        RegExp * strToName(SubStr s) const;
        RegExp * strToRE(SubStr s) const;
        RegExp * strToCaseInsensitiveRE(SubStr s) const;