Range * Scanner::getRange(SubStr &s) const
{
- uint lb = unescape(s), ub, xlb, xub;
+ uint lb = unescape(s), ub;
if (s.len < 2 || *s.str != '-')
{
}
}
- xlb = encoding.xlat(lb);
- xub = encoding.xlat(ub);
-
- if (encoding.is(Enc::EBCDIC))
- {
- Range * r = new Range(xlb, xlb + 1);
- for (uint c = lb + 1; c <= ub; ++c)
- {
- uint xc = encoding.xlat(c);
- r = doUnion(r, new Range(xc, xc + 1));
- }
- return r;
- }
- else
- {
- return new Range(xlb, xub + 1);
- }
-}
-
-RegExp * Scanner::matchChar(uint c) const
-{
- uint xc = encoding.xlat(c);
- return new MatchOp(new Range(xc, xc + 1));
+ Range * r = encoding.encodeRange(lb, ub);
+ if (r == NULL)
+ fatalf("Bad code point range: '0x%X - 0x%X'", lb, ub);
+ return r;
}
RegExp * Scanner::matchSymbol(uint c) const
{
+ if (!encoding.encode(c)) // encode() takes c by reference and may rewrite it for the active encoding; a false result is reported via fatalf
+ fatalf("Bad code point: '0x%X'", c);
+
if (encoding.is(Enc::UTF16))
return UTF16Symbol(c);
else if (encoding.is(Enc::UTF8))
return UTF8Symbol(c);
else
- return matchChar(c);
+ return new MatchOp(new Range(c, c + 1)); // c was already encoded above, so the one-symbol range is built directly
}
RegExp * Scanner::strToRE(SubStr s) const
s.len -= 3;
s.str += 2;
- Range * any = new Range(0, encoding.nCodePoints());
+ Range * full = encoding.fullRange();
Range * r = s.len == 0
- ? any
- : doDiff(any, mkRange (s));
+ ? full
+ : doDiff(full, mkRange (s));
return matchSymbolRange(r);
}
RegExp * Scanner::mkDot() const
{
- Range * any = new Range(0, encoding.nCodePoints());
- const uint c = encoding.xlat('\n');
+ Range * full = encoding.fullRange(); // range of all symbols for the active encoding
+ uint c = '\n'; // not const: encode() below receives it by reference and may rewrite it
+ if (!encoding.encode(c)) // a false result means the newline could not be encoded
+ fatalf("Bad code point: '0x%X'", c);
Range * ran = new Range(c, c + 1);
- Range * inv = doDiff(any, ran);
+ Range * inv = doDiff(full, ran); // everything in the full range except the encoded newline
return matchSymbolRange(inv);
}
o << indent(ind) << "case ";
prtChOrHex(o, lb);
o << ":";
- if (dFlag && encoding.is(Enc::EBCDIC) && lb < 256u && isprint(encoding.talx(lb)))
+ if (dFlag && encoding.is(Enc::EBCDIC))
{
- o << " /* " << std::string(1, encoding.talx(lb)) << " */";
+ const uint c = encoding.decode(lb);
+ if (isprint(c))
+ o << " /* " << std::string(1, c) << " */";
}
}
newLine = false;
namespace re2c {
+const uint Enc::SURR_MIN = 0xD800; // U+D800 is the first code point of the Unicode surrogate block
+const uint Enc::SURR_MAX = 0xDFFF; // U+DFFF is the last code point of the Unicode surrogate block
+const uint Enc::UNICODE_ERROR = 0xFFFD; // U+FFFD is the Unicode REPLACEMENT CHARACTER; NOTE(review): not referenced in the code visible here
+
const uint Enc::asc2ebc[256] =
{ /* Based on ISO 8859/1 and Code Page 37 */
0x00, 0x01, 0x02, 0x03, 0x37, 0x2d, 0x2e, 0x2f, 0x16, 0x05, 0x25, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0xb3, 0xdb, 0xdc, 0xd9, 0xda, 0x9f
};
+// Rewrite code point `c` in place for the active encoding.
+//
+// ASCII keeps only the low 8 bits; EBCDIC replaces `c` with the asc2ebc
+// entry selected by its low 8 bits; UCS2, UTF16, UTF32 and UTF8 leave `c`
+// as it is.
+//
+// Returns true for every known encoding type. Returns false, with `c`
+// untouched, only when `type` holds none of the enumerated values.
+bool Enc::encode(uint & c) const
+{
+	if (type == ASCII)
+	{
+		c &= 0xFF;
+		return true;
+	}
+	if (type == EBCDIC)
+	{
+		c = asc2ebc[c & 0xFF];
+		return true;
+	}
+	// The remaining known encodings accept the code point unchanged.
+	return type == UCS2
+		|| type == UTF16
+		|| type == UTF32
+		|| type == UTF8;
+}
+
+// Map an encoded value back for the active encoding.
+//
+// For EBCDIC the low 8 bits of `c` index the ebc2asc table and the table
+// entry is returned. Every other encoding type returns `c` unchanged.
+uint Enc::decode(uint c) const
+{
+	return type == EBCDIC
+		? ebc2asc[c & 0xFF]
+		: c;
+}
+
+// Build the Range of encoded values for the code points l..h (inclusive).
+//
+//  - ASCII:  both bounds are masked to 8 bits, result is Range(l, h + 1).
+//  - EBCDIC: each code point is translated through asc2ebc (indexed by its
+//            low 8 bits) and the one-symbol ranges are merged with doUnion,
+//            because the translated values need not be contiguous.
+//  - UCS2, UTF16, UTF32, UTF8: Range(l, h + 1) with the bounds unchanged.
+//
+// Returns NULL only when `type` holds none of the enumerated values.
+Range * Enc::encodeRange(uint l, uint h) const
+{
+	switch (type)
+	{
+		case ASCII:
+			return new Range(l & 0xFF, (h & 0xFF) + 1);
+		case EBCDIC:
+		{
+			const uint el = asc2ebc[l & 0xFF];
+			Range * r = new Range(el, el + 1);
+			if (l < h)
+			{
+				// The table index is (c & 0xFF), so after 256 consecutive
+				// code points every slot has been visited. Capping the walk
+				// yields the same set, avoids redundant unions on wide
+				// ranges, and guarantees termination even when h is the
+				// largest uint (a plain `c <= h` test could never fail).
+				uint extra = h - l;
+				if (extra > 0xFF)
+					extra = 0xFF;
+				for (uint i = 1; i <= extra; ++i)
+				{
+					const uint ec = asc2ebc[(l + i) & 0xFF];
+					r = doUnion(r, new Range(ec, ec + 1));
+				}
+			}
+			return r;
+		}
+		case UCS2:
+		case UTF16:
+		case UTF32:
+		case UTF8:
+			return new Range(l, h + 1);
+	}
+	return NULL; // unknown encoding type
+}
+
+// Build the Range that spans every symbol of the active encoding:
+//   ASCII, EBCDIC        -> Range(0, 0x100)
+//   UCS2                 -> Range(0, 0x10000)
+//   UTF16, UTF32, UTF8   -> Range(0, 0x110000)
+// Returns NULL when `type` holds none of the enumerated values.
+Range * Enc::fullRange() const
+{
+	if (type == ASCII || type == EBCDIC)
+		return new Range(0, 0x100);
+	if (type == UCS2)
+		return new Range(0, 0x10000);
+	if (type == UTF16 || type == UTF32 || type == UTF8)
+		return new Range(0, 0x110000);
+	return NULL;
+}
+
} // namespace re2c
#define _enc_h
#include "basics.h"
+#include "range.h"
namespace re2c {
, UTF8
};
+private:
static const uint asc2ebc[256];
static const uint ebc2asc[256];
+ static const uint SURR_MIN;
+ static const uint SURR_MAX;
+ static const uint UNICODE_ERROR;
-private:
type_t type;
public:
inline void unset(type_t);
inline bool is(type_t) const;
- inline uint xlat(uint c) const;
- inline uint talx(uint c) const;
+ bool encode(uint & c) const; // rewrites c in place for the active encoding; false only when type matches no known encoding
+ uint decode(uint c) const; // EBCDIC: ebc2asc entry for the low 8 bits of c; any other encoding: c unchanged
+ Range * encodeRange(uint l, uint h) const; // Range of encoded values for code points l..h inclusive; NULL only for an unknown type
+ Range * fullRange() const; // Range(0, 0x100), Range(0, 0x10000) or Range(0, 0x110000) depending on type; NULL for an unknown type
};
inline uint Enc::nCodePoints() const
return type == t;
}
-inline uint Enc::xlat(uint c) const
-{
- switch (type)
- {
- case ASCII: return c & 0xFF;
- case EBCDIC: return asc2ebc[c & 0xFF];
- case UCS2:
- case UTF16:
- case UTF32:
- case UTF8: return c;
- }
- return ~0; // to silence gcc warning
-}
-
-inline uint Enc::talx(uint c) const
-{
- switch (type)
- {
- case ASCII: return c & 0xFF;
- case EBCDIC: return ebc2asc[c & 0xFF];
- case UCS2:
- case UTF16:
- case UTF32:
- case UTF8: return c;
- }
- return ~0; // to silence gcc warning
-}
-
} // namespace re2c
#endif // _enc_h
Range * getRange(SubStr &s) const;
RegExp * matchSymbol(uint c) const;
RegExp * matchSymbolRange(Range * r) const;
- RegExp * matchChar(uint c) const;
RegExp * strToName(SubStr s) const;
RegExp * strToRE(SubStr s) const;
RegExp * strToCaseInsensitiveRE(SubStr s) const;