src/dfa/tcmd.h \
src/nfa/nfa.h \
src/re/encoding/case.h \
+ src/re/encoding/ebcdic/ebcdic.h \
+ src/re/encoding/ebcdic/ebcdic_regexp.h \
src/re/encoding/enc.h \
src/re/encoding/range_suffix.h \
src/re/encoding/utf8/utf8.h \
src/dfa/tag_history.cc \
src/dfa/tagver_table.cc \
src/dfa/tcmd.cc \
+ src/re/encoding/ebcdic/ebcdic_regexp.cc \
src/re/encoding/enc.cc \
src/re/encoding/range_suffix.cc \
src/re/encoding/utf8/utf8_regexp.cc \
#include "src/re/empty_class_policy.h"
#include "src/re/encoding/case.h"
#include "src/re/encoding/enc.h"
+#include "src/re/encoding/ebcdic/ebcdic_regexp.h"
#include "src/re/encoding/utf16/utf16_regexp.h"
#include "src/re/encoding/utf8/utf8_regexp.h"
#include "src/re/re.h"
i = ast->cls.ranges->begin(),
e = ast->cls.ranges->end();
for (; i != e; ++i) {
- Range *s = opts->encoding.encodeRange(i->lower, i->upper);
- if (!s) fatal_lc(ast->line, i->column,
+ Range *s = opts->encoding.validateRange(i->lower, i->upper);
+ if (!s) {
+ fatal_lc(ast->line, i->column,
"bad code point range: '0x%X - 0x%X'", i->lower, i->upper);
+ }
r = Range::add(r, s);
}
if (ast->cls.negated) {
DASSERT(ast->type == AST::DOT);
uint32_t c = '\n';
- if (!opts->encoding.encode(c)) {
+ if (!opts->encoding.validateChar(c)) {
fatal_lc(ast->line, ast->column, "bad code point: '0x%X'", c);
}
return Range::sub(opts->encoding.fullRange(), Range::sym(c));
if (ast->str.chars->size() != 1) break;
const ASTChar &i = ast->str.chars->front();
uint32_t c = i.chr;
- if (!opts->encoding.encode(c)) {
+ if (!opts->encoding.validateChar(c)) {
fatal_lc(ast->line, i.column, "bad code point: '0x%X'", c);
}
const bool icase = opts->bCaseInsensitive
RE *re_schar(RE::alc_t &alc, uint32_t line, uint32_t column, uint32_t c, const opt_t *opts)
{
- if (!opts->encoding.encode(c)) {
+ if (!opts->encoding.validateChar(c)) {
fatal_lc(line, column, "bad code point: '0x%X'", c);
}
switch (opts->encoding.type()) {
return UTF16Symbol(alc, c);
case Enc::UTF8:
return UTF8Symbol(alc, c);
- case Enc::ASCII:
case Enc::EBCDIC:
+ return EBCDICSymbol(alc, c);
+ case Enc::ASCII:
case Enc::UTF32:
case Enc::UCS2:
return re_sym(alc, Range::sym(c));
return UTF16Range(alc, r);
case Enc::UTF8:
return UTF8Range(alc, r);
- case Enc::ASCII:
case Enc::EBCDIC:
+ return EBCDICRange(alc, r);
+ case Enc::ASCII:
case Enc::UTF32:
case Enc::UCS2:
return re_sym(alc, r);
--- /dev/null
+#ifndef _RE2C_RE_ENCODING_EBCDIC_EBCDIC_
+#define _RE2C_RE_ENCODING_EBCDIC_EBCDIC_
+
+#include "src/util/c99_stdint.h"
+
+
+namespace re2c {
+
+const uint32_t asc2ebc[256] = {
+ /* Based on ISO 8859/1 and Code Page 37.
+  *
+  * Forward translation table: the index is an ISO 8859-1 byte value and
+  * the entry stored there is the matching EBCDIC (Code Page 37) byte.
+  * For example, index 0x41 ('A') holds 0xc1 and index 0x20 (space)
+  * holds 0x40. Each row below covers 16 consecutive indices, so the
+  * row number is the high nibble of the index. */
+ 0x00, 0x01, 0x02, 0x03, 0x37, 0x2d, 0x2e, 0x2f, 0x16, 0x05, 0x25, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x3c, 0x3d, 0x32, 0x26, 0x18, 0x19, 0x3f, 0x27, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x40, 0x5a, 0x7f, 0x7b, 0x5b, 0x6c, 0x50, 0x7d, 0x4d, 0x5d, 0x5c, 0x4e, 0x6b, 0x60, 0x4b, 0x61,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0x7a, 0x5e, 0x4c, 0x7e, 0x6e, 0x6f,
+ 0x7c, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6,
+ 0xd7, 0xd8, 0xd9, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xba, 0xe0, 0xbb, 0xb0, 0x6d,
+ 0x79, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
+ 0x97, 0x98, 0x99, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xc0, 0x4f, 0xd0, 0xa1, 0x07,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x15, 0x06, 0x17, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x09, 0x0a, 0x1b,
+ 0x30, 0x31, 0x1a, 0x33, 0x34, 0x35, 0x36, 0x08, 0x38, 0x39, 0x3a, 0x3b, 0x04, 0x14, 0x3e, 0xff,
+ 0x41, 0xaa, 0x4a, 0xb1, 0x9f, 0xb2, 0x6a, 0xb5, 0xbd, 0xb4, 0x9a, 0x8a, 0x5f, 0xca, 0xaf, 0xbc,
+ 0x90, 0x8f, 0xea, 0xfa, 0xbe, 0xa0, 0xb6, 0xb3, 0x9d, 0xda, 0x9b, 0x8b, 0xb7, 0xb8, 0xb9, 0xab,
+ 0x64, 0x65, 0x62, 0x66, 0x63, 0x67, 0x9e, 0x68, 0x74, 0x71, 0x72, 0x73, 0x78, 0x75, 0x76, 0x77,
+ 0xac, 0x69, 0xed, 0xee, 0xeb, 0xef, 0xec, 0xbf, 0x80, 0xfd, 0xfe, 0xfb, 0xfc, 0xad, 0x8e, 0x59,
+ 0x44, 0x45, 0x42, 0x46, 0x43, 0x47, 0x9c, 0x48, 0x54, 0x51, 0x52, 0x53, 0x58, 0x55, 0x56, 0x57,
+ 0x8c, 0x49, 0xcd, 0xce, 0xcb, 0xcf, 0xcc, 0xe1, 0x70, 0xdd, 0xde, 0xdb, 0xdc, 0x8d, 0xae, 0xdf
+};
+
+const uint32_t ebc2asc[256] = {
+ /* Based on ISO 8859/1 and Code Page 37.
+  *
+  * Reverse translation table: the index is an EBCDIC (Code Page 37)
+  * byte value and the entry stored there is the matching ISO 8859-1
+  * byte. For example, index 0xc1 holds 0x41 ('A') and index 0x40 holds
+  * 0x20 (space). Each row below covers 16 consecutive indices, so the
+  * row number is the high nibble of the index. */
+ 0x00, 0x01, 0x02, 0x03, 0x9c, 0x09, 0x86, 0x7f, 0x97, 0x8d, 0x8e, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x9d, 0x85, 0x08, 0x87, 0x18, 0x19, 0x92, 0x8f, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x0a, 0x17, 0x1b, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x05, 0x06, 0x07,
+ 0x90, 0x91, 0x16, 0x93, 0x94, 0x95, 0x96, 0x04, 0x98, 0x99, 0x9a, 0x9b, 0x14, 0x15, 0x9e, 0x1a,
+ 0x20, 0xa0, 0xe2, 0xe4, 0xe0, 0xe1, 0xe3, 0xe5, 0xe7, 0xf1, 0xa2, 0x2e, 0x3c, 0x28, 0x2b, 0x7c,
+ 0x26, 0xe9, 0xea, 0xeb, 0xe8, 0xed, 0xee, 0xef, 0xec, 0xdf, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0xac,
+ 0x2d, 0x2f, 0xc2, 0xc4, 0xc0, 0xc1, 0xc3, 0xc5, 0xc7, 0xd1, 0xa6, 0x2c, 0x25, 0x5f, 0x3e, 0x3f,
+ 0xf8, 0xc9, 0xca, 0xcb, 0xc8, 0xcd, 0xce, 0xcf, 0xcc, 0x60, 0x3a, 0x23, 0x40, 0x27, 0x3d, 0x22,
+ 0xd8, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xab, 0xbb, 0xf0, 0xfd, 0xde, 0xb1,
+ 0xb0, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0xaa, 0xba, 0xe6, 0xb8, 0xc6, 0xa4,
+ 0xb5, 0x7e, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0xa1, 0xbf, 0xd0, 0xdd, 0xfe, 0xae,
+ 0x5e, 0xa3, 0xa5, 0xb7, 0xa9, 0xa7, 0xb6, 0xbc, 0xbd, 0xbe, 0x5b, 0x5d, 0xaf, 0xa8, 0xb4, 0xd7,
+ 0x7b, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0xad, 0xf4, 0xf6, 0xf2, 0xf3, 0xf5,
+ 0x7d, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0xb9, 0xfb, 0xfc, 0xf9, 0xfa, 0xff,
+ 0x5c, 0xf7, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0xb2, 0xd4, 0xd6, 0xd2, 0xd3, 0xd5,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0xb3, 0xdb, 0xdc, 0xd9, 0xda, 0x9f
+};
+
+} // namespace re2c
+
+#endif // _RE2C_RE_ENCODING_EBCDIC_EBCDIC_
--- /dev/null
+#include "src/re/encoding/ebcdic/ebcdic.h"
+#include "src/re/encoding/ebcdic/ebcdic_regexp.h"
+#include "src/util/range.h"
+
+
+namespace re2c {
+
+/*
+ * Builds a regexp that matches the single code point c after it has been
+ * translated through the asc2ebc table. c is used directly as an index
+ * into that 256-entry table and no bounds check is done here.
+ */
+RE *EBCDICSymbol(RE::alc_t &alc, uint32_t c)
+{
+    const uint32_t ebcdic = asc2ebc[c];
+    Range *single = Range::sym(ebcdic);
+    return re_sym(alc, single);
+}
+
+/*
+ * Builds a regexp for a character class: walks every sub-range of r,
+ * translates each code point it contains through the asc2ebc table and
+ * unions the translated symbols into one range list.
+ *
+ * The inner loop stops before upper(), i.e. the upper bound of each
+ * sub-range is treated as exclusive. If r is NULL no symbol is added and
+ * re_sym receives a NULL range.
+ */
+RE *EBCDICRange(RE::alc_t &alc, const Range *r)
+{
+    Range *translated = NULL;
+    const Range *part = r;
+    while (part != NULL) {
+        uint32_t c = part->lower();
+        const uint32_t end = part->upper();
+        while (c < end) {
+            // one code point at a time: the translated values are not
+            // added as a contiguous run, so each becomes its own symbol
+            Range *one = Range::sym(asc2ebc[c]);
+            translated = Range::add(translated, one);
+            ++c;
+        }
+        part = part->next();
+    }
+    return re_sym(alc, translated);
+}
+
+} // namespace re2c
--- /dev/null
+#ifndef _RE2C_RE_ENCODING_EBCDIC_REGEXP_
+#define _RE2C_RE_ENCODING_EBCDIC_REGEXP_
+
+#include "src/util/c99_stdint.h"
+#include "src/re/re.h"
+
+namespace re2c {
+
+class Range; // forward declaration: only a pointer to Range is used below
+
+RE *EBCDICSymbol(RE::alc_t &alc, uint32_t c); // regexp for one code point c, translated through asc2ebc
+RE *EBCDICRange(RE::alc_t &alc, const Range *r); // regexp for a class: each code point in r translated through asc2ebc
+
+} // namespace re2c
+
+#endif // _RE2C_RE_ENCODING_EBCDIC_REGEXP_
-#include "src/re/encoding/enc.h"
-
#include <stddef.h>
+#include "src/re/encoding/ebcdic/ebcdic.h"
+#include "src/re/encoding/enc.h"
#include "src/util/range.h"
namespace re2c {
const uint32_t Enc::SURR_MAX = 0xDFFF;
const uint32_t Enc::UNICODE_ERROR = 0xFFFD;
-const uint32_t Enc::asc2ebc[256] =
- { /* Based on ISO 8859/1 and Code Page 37 */
- 0x00, 0x01, 0x02, 0x03, 0x37, 0x2d, 0x2e, 0x2f, 0x16, 0x05, 0x25, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
- 0x10, 0x11, 0x12, 0x13, 0x3c, 0x3d, 0x32, 0x26, 0x18, 0x19, 0x3f, 0x27, 0x1c, 0x1d, 0x1e, 0x1f,
- 0x40, 0x5a, 0x7f, 0x7b, 0x5b, 0x6c, 0x50, 0x7d, 0x4d, 0x5d, 0x5c, 0x4e, 0x6b, 0x60, 0x4b, 0x61,
- 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0x7a, 0x5e, 0x4c, 0x7e, 0x6e, 0x6f,
- 0x7c, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6,
- 0xd7, 0xd8, 0xd9, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xba, 0xe0, 0xbb, 0xb0, 0x6d,
- 0x79, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
- 0x97, 0x98, 0x99, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xc0, 0x4f, 0xd0, 0xa1, 0x07,
- 0x20, 0x21, 0x22, 0x23, 0x24, 0x15, 0x06, 0x17, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x09, 0x0a, 0x1b,
- 0x30, 0x31, 0x1a, 0x33, 0x34, 0x35, 0x36, 0x08, 0x38, 0x39, 0x3a, 0x3b, 0x04, 0x14, 0x3e, 0xff,
- 0x41, 0xaa, 0x4a, 0xb1, 0x9f, 0xb2, 0x6a, 0xb5, 0xbd, 0xb4, 0x9a, 0x8a, 0x5f, 0xca, 0xaf, 0xbc,
- 0x90, 0x8f, 0xea, 0xfa, 0xbe, 0xa0, 0xb6, 0xb3, 0x9d, 0xda, 0x9b, 0x8b, 0xb7, 0xb8, 0xb9, 0xab,
- 0x64, 0x65, 0x62, 0x66, 0x63, 0x67, 0x9e, 0x68, 0x74, 0x71, 0x72, 0x73, 0x78, 0x75, 0x76, 0x77,
- 0xac, 0x69, 0xed, 0xee, 0xeb, 0xef, 0xec, 0xbf, 0x80, 0xfd, 0xfe, 0xfb, 0xfc, 0xad, 0x8e, 0x59,
- 0x44, 0x45, 0x42, 0x46, 0x43, 0x47, 0x9c, 0x48, 0x54, 0x51, 0x52, 0x53, 0x58, 0x55, 0x56, 0x57,
- 0x8c, 0x49, 0xcd, 0xce, 0xcb, 0xcf, 0xcc, 0xe1, 0x70, 0xdd, 0xde, 0xdb, 0xdc, 0x8d, 0xae, 0xdf
- };
-
-const uint32_t Enc::ebc2asc[256] =
- { /* Based on ISO 8859/1 and Code Page 37 */
- 0x00, 0x01, 0x02, 0x03, 0x9c, 0x09, 0x86, 0x7f, 0x97, 0x8d, 0x8e, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
- 0x10, 0x11, 0x12, 0x13, 0x9d, 0x85, 0x08, 0x87, 0x18, 0x19, 0x92, 0x8f, 0x1c, 0x1d, 0x1e, 0x1f,
- 0x80, 0x81, 0x82, 0x83, 0x84, 0x0a, 0x17, 0x1b, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x05, 0x06, 0x07,
- 0x90, 0x91, 0x16, 0x93, 0x94, 0x95, 0x96, 0x04, 0x98, 0x99, 0x9a, 0x9b, 0x14, 0x15, 0x9e, 0x1a,
- 0x20, 0xa0, 0xe2, 0xe4, 0xe0, 0xe1, 0xe3, 0xe5, 0xe7, 0xf1, 0xa2, 0x2e, 0x3c, 0x28, 0x2b, 0x7c,
- 0x26, 0xe9, 0xea, 0xeb, 0xe8, 0xed, 0xee, 0xef, 0xec, 0xdf, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0xac,
- 0x2d, 0x2f, 0xc2, 0xc4, 0xc0, 0xc1, 0xc3, 0xc5, 0xc7, 0xd1, 0xa6, 0x2c, 0x25, 0x5f, 0x3e, 0x3f,
- 0xf8, 0xc9, 0xca, 0xcb, 0xc8, 0xcd, 0xce, 0xcf, 0xcc, 0x60, 0x3a, 0x23, 0x40, 0x27, 0x3d, 0x22,
- 0xd8, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xab, 0xbb, 0xf0, 0xfd, 0xde, 0xb1,
- 0xb0, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0xaa, 0xba, 0xe6, 0xb8, 0xc6, 0xa4,
- 0xb5, 0x7e, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0xa1, 0xbf, 0xd0, 0xdd, 0xfe, 0xae,
- 0x5e, 0xa3, 0xa5, 0xb7, 0xa9, 0xa7, 0xb6, 0xbc, 0xbd, 0xbe, 0x5b, 0x5d, 0xaf, 0xa8, 0xb4, 0xd7,
- 0x7b, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0xad, 0xf4, 0xf6, 0xf2, 0xf3, 0xf5,
- 0x7d, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0xb9, 0xfb, 0xfc, 0xf9, 0xfa, 0xff,
- 0x5c, 0xf7, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0xb2, 0xd4, 0xd6, 0xd2, 0xd3, 0xd5,
- 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0xb3, 0xdb, 0xdc, 0xd9, 0xda, 0x9f
- };
-
/*
- * Returns code point representation for current
- * encoding with regard to current policy.
+ * Returns code point representation with regard to current policy.
*
- * Since code point is exacly specified by user,
- * it is assumed that user considers it to be valid.
- * We must check it.
+ * Since code point is specified by user, it is assumed that the user
+ * considers it to be valid. re2c must check it.
*
- * Returns false if this code point exceeds maximum
- * or is forbidden by current policy, otherwise
- * returns true. Overwrites code point.
+ * Returns false if this code point exceeds maximum or is forbidden
+ * by current policy, otherwise returns true. Overwrites code point.
*/
-bool Enc::encode(uint32_t & c) const
+bool Enc::validateChar(uint32_t &c) const
{
- if (c >= nCodePoints ())
- {
- return false;
- }
+ if (c >= nCodePoints()) return false;
- switch (type_)
- {
+ switch (type_) {
case ASCII:
- return true;
case EBCDIC:
- c = asc2ebc[c];
return true;
case UCS2:
case UTF16:
case UTF8:
if (c < SURR_MIN || c > SURR_MAX)
return true;
- else
- {
- switch (policy_)
- {
- case POLICY_FAIL:
- return false;
- case POLICY_SUBSTITUTE:
- c = UNICODE_ERROR;
- return true;
- case POLICY_IGNORE:
- return true;
- }
+ switch (policy_) {
+ case POLICY_FAIL:
+ return false;
+ case POLICY_SUBSTITUTE:
+ c = UNICODE_ERROR;
+ return true;
+ case POLICY_IGNORE:
+ return true;
}
}
+
return false; // to silence gcc warning
}
*/
uint32_t Enc::decodeUnsafe(uint32_t c) const
{
- switch (type_)
- {
+ switch (type_) {
case EBCDIC:
c = ebc2asc[c & 0xFF];
break;
}
/*
- * Returns [l - h] range representation for current
- * encoding with regard to current policy.
+ * Returns [l - h] range representation with regard to current policy.
*
- * Since range borders are exacly specified by user,
- * it is assumed that user considers that all code
- * points from this range are valid. re2c must check it.
+ * Since range borders are specified by user, it is assumed that the user
+ * considers all code points from this range to be valid. re2c must check.
*
- * Returns NULL if range contains code points that
- * exceed maximum or are forbidden by current policy,
- * otherwise returns pointer to newly constructed range.
+ * Returns NULL if range contains code points that exceed maximum or are
+ * forbidden by current policy, otherwise returns newly constructed range.
*/
-Range * Enc::encodeRange(uint32_t l, uint32_t h) const
+Range * Enc::validateRange(uint32_t l, uint32_t h) const
{
- if (l >= nCodePoints () || h >= nCodePoints ())
- {
- return NULL;
- }
+ if (l >= nCodePoints () || h >= nCodePoints ()) return NULL;
Range * r = NULL;
- switch (type_)
- {
+ switch (type_) {
case ASCII:
- r = Range::ran (l, h + 1);
- break;
case EBCDIC:
- {
- const uint32_t el = asc2ebc[l];
- r = Range::sym (el);
- for (uint32_t c = l + 1; c <= h; ++c)
- {
- const uint32_t ec = asc2ebc[c];
- r = Range::add (r, Range::sym (ec));
- }
+ r = Range::ran (l, h + 1);
break;
- }
case UCS2:
case UTF16:
case UTF32:
case UTF8:
r = Range::ran (l, h + 1);
- if (l <= SURR_MAX && h >= SURR_MIN)
- {
- switch (policy_)
- {
+ if (l <= SURR_MAX && h >= SURR_MIN) {
+ switch (policy_) {
case POLICY_FAIL:
r = NULL;
break;
- case POLICY_SUBSTITUTE:
- {
+ case POLICY_SUBSTITUTE: {
Range * surrs = Range::ran (SURR_MIN, SURR_MAX + 1);
Range * error = Range::sym (UNICODE_ERROR);
r = Range::sub (r, surrs);
};
private:
- static const uint32_t asc2ebc[256];
- static const uint32_t ebc2asc[256];
static const uint32_t SURR_MIN;
static const uint32_t SURR_MAX;
static const uint32_t UNICODE_ERROR;
inline void setPolicy(policy_t t);
- bool encode(uint32_t & c) const;
uint32_t decodeUnsafe(uint32_t c) const;
- Range * encodeRange(uint32_t l, uint32_t h) const;
+ bool validateChar(uint32_t & c) const;
+ Range * validateRange(uint32_t l, uint32_t h) const;
Range * fullRange() const;
};