From 7683f5601da7c3086063a5b6924dc14f2e3b6a1d Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Sat, 5 Jan 2019 13:16:36 +0000 Subject: [PATCH] Handle EBCDIC like other encodings instead of coupling it with character validation. --- re2c/Makefile.am | 3 + re2c/src/re/ast_to_re.cc | 19 ++- re2c/src/re/encoding/ebcdic/ebcdic.h | 51 +++++++ re2c/src/re/encoding/ebcdic/ebcdic_regexp.cc | 25 ++++ re2c/src/re/encoding/ebcdic/ebcdic_regexp.h | 16 +++ re2c/src/re/encoding/enc.cc | 137 +++++-------------- re2c/src/re/encoding/enc.h | 6 +- 7 files changed, 141 insertions(+), 116 deletions(-) create mode 100644 re2c/src/re/encoding/ebcdic/ebcdic.h create mode 100644 re2c/src/re/encoding/ebcdic/ebcdic_regexp.cc create mode 100644 re2c/src/re/encoding/ebcdic/ebcdic_regexp.h diff --git a/re2c/Makefile.am b/re2c/Makefile.am index 5afc3855..509be2a8 100644 --- a/re2c/Makefile.am +++ b/re2c/Makefile.am @@ -33,6 +33,8 @@ SRC_HDR = \ src/dfa/tcmd.h \ src/nfa/nfa.h \ src/re/encoding/case.h \ + src/re/encoding/ebcdic/ebcdic.h \ + src/re/encoding/ebcdic/ebcdic_regexp.h \ src/re/encoding/enc.h \ src/re/encoding/range_suffix.h \ src/re/encoding/utf8/utf8.h \ @@ -120,6 +122,7 @@ SRC = \ src/dfa/tag_history.cc \ src/dfa/tagver_table.cc \ src/dfa/tcmd.cc \ + src/re/encoding/ebcdic/ebcdic_regexp.cc \ src/re/encoding/enc.cc \ src/re/encoding/range_suffix.cc \ src/re/encoding/utf8/utf8_regexp.cc \ diff --git a/re2c/src/re/ast_to_re.cc b/re2c/src/re/ast_to_re.cc index db7b86e2..6f03f4e8 100644 --- a/re2c/src/re/ast_to_re.cc +++ b/re2c/src/re/ast_to_re.cc @@ -14,6 +14,7 @@ #include "src/re/empty_class_policy.h" #include "src/re/encoding/case.h" #include "src/re/encoding/enc.h" +#include "src/re/encoding/ebcdic/ebcdic_regexp.h" #include "src/re/encoding/utf16/utf16_regexp.h" #include "src/re/encoding/utf8/utf8_regexp.h" #include "src/re/re.h" @@ -265,9 +266,11 @@ Range *cls_to_range(const AST *ast, const opt_t *opts) i = ast->cls.ranges->begin(), e = ast->cls.ranges->end(); for (; i != e; ++i) { - Range *s = opts->encoding.encodeRange(i->lower, i->upper); - if (!s) fatal_lc(ast->line, i->column, + Range *s = opts->encoding.validateRange(i->lower, i->upper); + if (!s) { + fatal_lc(ast->line, i->column, "bad code point range: '0x%X - 0x%X'", i->lower, i->upper); + } r = Range::add(r, s); } if (ast->cls.negated) { @@ -281,7 +284,7 @@ Range *dot_to_range(const AST *ast, const opt_t *opts) DASSERT(ast->type == AST::DOT); uint32_t c = '\n'; - if (!opts->encoding.encode(c)) { + if (!opts->encoding.validateChar(c)) { fatal_lc(ast->line, ast->column, "bad code point: '0x%X'", c); } return Range::sub(opts->encoding.fullRange(), Range::sym(c)); @@ -318,7 +321,7 @@ Range *ast_to_range(const AST *ast, const opt_t *opts) if (ast->str.chars->size() != 1) break; const ASTChar &i = ast->str.chars->front(); uint32_t c = i.chr; - if (!opts->encoding.encode(c)) { + if (!opts->encoding.validateChar(c)) { fatal_lc(ast->line, i.column, "bad code point: '0x%X'", c); } const bool icase = opts->bCaseInsensitive @@ -341,7 +344,7 @@ Range *ast_to_range(const AST *ast, const opt_t *opts) RE *re_schar(RE::alc_t &alc, uint32_t line, uint32_t column, uint32_t c, const opt_t *opts) { - if (!opts->encoding.encode(c)) { + if (!opts->encoding.validateChar(c)) { fatal_lc(line, column, "bad code point: '0x%X'", c); } switch (opts->encoding.type()) { @@ -349,8 +352,9 @@ RE *re_schar(RE::alc_t &alc, uint32_t line, uint32_t column, uint32_t c, const o return UTF16Symbol(alc, c); case Enc::UTF8: return UTF8Symbol(alc, c); - case Enc::ASCII: case Enc::EBCDIC: + return EBCDICSymbol(alc, c); + case Enc::ASCII: case Enc::UTF32: case Enc::UCS2: return re_sym(alc, Range::sym(c)); @@ -388,8 +392,9 @@ RE *re_class(RE::alc_t &alc, uint32_t line, uint32_t column, const Range *r, con return UTF16Range(alc, r); case Enc::UTF8: return UTF8Range(alc, r); - case Enc::ASCII: case Enc::EBCDIC: + return EBCDICRange(alc, r); + case Enc::ASCII: case Enc::UTF32: case Enc::UCS2: return re_sym(alc, r); diff --git a/re2c/src/re/encoding/ebcdic/ebcdic.h b/re2c/src/re/encoding/ebcdic/ebcdic.h new file mode 100644 index 00000000..7874bc93 --- /dev/null +++ b/re2c/src/re/encoding/ebcdic/ebcdic.h @@ -0,0 +1,51 @@ +#ifndef _RE2C_RE_ENCODING_EBCDIC_EBCDIC_ +#define _RE2C_RE_ENCODING_EBCDIC_EBCDIC_ + +#include "src/util/c99_stdint.h" + + +namespace re2c { + +const uint32_t asc2ebc[256] = { + /* Based on ISO 8859/1 and Code Page 37 */ + 0x00, 0x01, 0x02, 0x03, 0x37, 0x2d, 0x2e, 0x2f, 0x16, 0x05, 0x25, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x3c, 0x3d, 0x32, 0x26, 0x18, 0x19, 0x3f, 0x27, 0x1c, 0x1d, 0x1e, 0x1f, + 0x40, 0x5a, 0x7f, 0x7b, 0x5b, 0x6c, 0x50, 0x7d, 0x4d, 0x5d, 0x5c, 0x4e, 0x6b, 0x60, 0x4b, 0x61, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0x7a, 0x5e, 0x4c, 0x7e, 0x6e, 0x6f, + 0x7c, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, + 0xd7, 0xd8, 0xd9, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xba, 0xe0, 0xbb, 0xb0, 0x6d, + 0x79, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, + 0x97, 0x98, 0x99, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xc0, 0x4f, 0xd0, 0xa1, 0x07, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x15, 0x06, 0x17, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x09, 0x0a, 0x1b, + 0x30, 0x31, 0x1a, 0x33, 0x34, 0x35, 0x36, 0x08, 0x38, 0x39, 0x3a, 0x3b, 0x04, 0x14, 0x3e, 0xff, + 0x41, 0xaa, 0x4a, 0xb1, 0x9f, 0xb2, 0x6a, 0xb5, 0xbd, 0xb4, 0x9a, 0x8a, 0x5f, 0xca, 0xaf, 0xbc, + 0x90, 0x8f, 0xea, 0xfa, 0xbe, 0xa0, 0xb6, 0xb3, 0x9d, 0xda, 0x9b, 0x8b, 0xb7, 0xb8, 0xb9, 0xab, + 0x64, 0x65, 0x62, 0x66, 0x63, 0x67, 0x9e, 0x68, 0x74, 0x71, 0x72, 0x73, 0x78, 0x75, 0x76, 0x77, + 0xac, 0x69, 0xed, 0xee, 0xeb, 0xef, 0xec, 0xbf, 0x80, 0xfd, 0xfe, 0xfb, 0xfc, 0xad, 0x8e, 0x59, + 0x44, 0x45, 0x42, 0x46, 0x43, 0x47, 0x9c, 0x48, 0x54, 0x51, 0x52, 0x53, 0x58, 0x55, 0x56, 0x57, + 0x8c, 0x49, 0xcd, 0xce, 0xcb, 0xcf, 0xcc, 0xe1, 0x70, 0xdd, 0xde, 0xdb, 0xdc, 0x8d, 0xae, 0xdf +}; + +const uint32_t ebc2asc[256] = { + /* Based on ISO 8859/1 and Code Page 37 */ + 0x00, 0x01, 0x02, 0x03, 0x9c, 0x09, 0x86, 0x7f, 0x97, 0x8d, 0x8e, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x9d, 0x85, 0x08, 0x87, 0x18, 0x19, 0x92, 0x8f, 0x1c, 0x1d, 0x1e, 0x1f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x0a, 0x17, 0x1b, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x05, 0x06, 0x07, + 0x90, 0x91, 0x16, 0x93, 0x94, 0x95, 0x96, 0x04, 0x98, 0x99, 0x9a, 0x9b, 0x14, 0x15, 0x9e, 0x1a, + 0x20, 0xa0, 0xe2, 0xe4, 0xe0, 0xe1, 0xe3, 0xe5, 0xe7, 0xf1, 0xa2, 0x2e, 0x3c, 0x28, 0x2b, 0x7c, + 0x26, 0xe9, 0xea, 0xeb, 0xe8, 0xed, 0xee, 0xef, 0xec, 0xdf, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0xac, + 0x2d, 0x2f, 0xc2, 0xc4, 0xc0, 0xc1, 0xc3, 0xc5, 0xc7, 0xd1, 0xa6, 0x2c, 0x25, 0x5f, 0x3e, 0x3f, + 0xf8, 0xc9, 0xca, 0xcb, 0xc8, 0xcd, 0xce, 0xcf, 0xcc, 0x60, 0x3a, 0x23, 0x40, 0x27, 0x3d, 0x22, + 0xd8, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xab, 0xbb, 0xf0, 0xfd, 0xde, 0xb1, + 0xb0, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0xaa, 0xba, 0xe6, 0xb8, 0xc6, 0xa4, + 0xb5, 0x7e, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0xa1, 0xbf, 0xd0, 0xdd, 0xfe, 0xae, + 0x5e, 0xa3, 0xa5, 0xb7, 0xa9, 0xa7, 0xb6, 0xbc, 0xbd, 0xbe, 0x5b, 0x5d, 0xaf, 0xa8, 0xb4, 0xd7, + 0x7b, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0xad, 0xf4, 0xf6, 0xf2, 0xf3, 0xf5, + 0x7d, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0xb9, 0xfb, 0xfc, 0xf9, 0xfa, 0xff, + 0x5c, 0xf7, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0xb2, 0xd4, 0xd6, 0xd2, 0xd3, 0xd5, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0xb3, 0xdb, 0xdc, 0xd9, 0xda, 0x9f +}; + +} // namespace re2c + +#endif // _RE2C_RE_ENCODING_EBCDIC_EBCDIC_ diff --git a/re2c/src/re/encoding/ebcdic/ebcdic_regexp.cc b/re2c/src/re/encoding/ebcdic/ebcdic_regexp.cc new file mode 100644 index 00000000..626c5c62 --- /dev/null +++ b/re2c/src/re/encoding/ebcdic/ebcdic_regexp.cc @@ -0,0 +1,25 @@ +#include "src/re/encoding/ebcdic/ebcdic.h" +#include "src/re/encoding/ebcdic/ebcdic_regexp.h" +#include "src/util/range.h" + + +namespace re2c { + +RE *EBCDICSymbol(RE::alc_t &alc, uint32_t c) +{ + return re_sym(alc, Range::sym(asc2ebc[c])); +} + +RE *EBCDICRange(RE::alc_t &alc, const Range *r) +{ + Range *s = NULL; + for (; r; r = r->next()) { + const uint32_t l = r->lower(), u = r->upper(); + for (uint32_t c = l; c < u; ++c) { + s = Range::add(s, Range::sym(asc2ebc[c])); + } + } + return re_sym(alc, s); +} + +} // namespace re2c diff --git a/re2c/src/re/encoding/ebcdic/ebcdic_regexp.h b/re2c/src/re/encoding/ebcdic/ebcdic_regexp.h new file mode 100644 index 00000000..14f32e08 --- /dev/null +++ b/re2c/src/re/encoding/ebcdic/ebcdic_regexp.h @@ -0,0 +1,16 @@ +#ifndef _RE2C_RE_ENCODING_EBCDIC_REGEXP_ +#define _RE2C_RE_ENCODING_EBCDIC_REGEXP_ + +#include "src/util/c99_stdint.h" +#include "src/re/re.h" + +namespace re2c { + +class Range; + +RE *EBCDICSymbol(RE::alc_t &alc, uint32_t c); +RE *EBCDICRange(RE::alc_t &alc, const Range *r); + +} // namespace re2c + +#endif // _RE2C_RE_ENCODING_EBCDIC_REGEXP_ diff --git a/re2c/src/re/encoding/enc.cc b/re2c/src/re/encoding/enc.cc index 411f0804..d1abe7f9 100644 --- a/re2c/src/re/encoding/enc.cc +++ b/re2c/src/re/encoding/enc.cc @@ -1,7 +1,7 @@ -#include "src/re/encoding/enc.h" - #include +#include "src/re/encoding/ebcdic/ebcdic.h" +#include "src/re/encoding/enc.h" #include "src/util/range.h" namespace re2c { @@ -10,71 +10,22 @@ const uint32_t Enc::SURR_MIN = 0xD800; const uint32_t Enc::SURR_MAX = 0xDFFF; const uint32_t Enc::UNICODE_ERROR = 0xFFFD; -const uint32_t Enc::asc2ebc[256] = - { /* Based on ISO 8859/1 and Code Page 37 */ - 0x00, 0x01, 0x02, 0x03, 0x37, 0x2d, 0x2e, 0x2f, 0x16, 0x05, 0x25, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, - 0x10, 0x11, 0x12, 0x13, 0x3c, 0x3d, 0x32, 0x26, 0x18, 0x19, 0x3f, 0x27, 0x1c, 0x1d, 0x1e, 0x1f, - 0x40, 0x5a, 0x7f, 0x7b, 0x5b, 0x6c, 0x50, 0x7d, 0x4d, 0x5d, 0x5c, 0x4e, 0x6b, 0x60, 0x4b, 0x61, - 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0x7a, 0x5e, 0x4c, 0x7e, 0x6e, 0x6f, - 0x7c, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, - 0xd7, 0xd8, 0xd9, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xba, 0xe0, 0xbb, 0xb0, 0x6d, - 0x79, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, - 0x97, 0x98, 0x99, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xc0, 0x4f, 0xd0, 0xa1, 0x07, - 0x20, 0x21, 0x22, 0x23, 0x24, 0x15, 0x06, 0x17, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x09, 0x0a, 0x1b, - 0x30, 0x31, 0x1a, 0x33, 0x34, 0x35, 0x36, 0x08, 0x38, 0x39, 0x3a, 0x3b, 0x04, 0x14, 0x3e, 0xff, - 0x41, 0xaa, 0x4a, 0xb1, 0x9f, 0xb2, 0x6a, 0xb5, 0xbd, 0xb4, 0x9a, 0x8a, 0x5f, 0xca, 0xaf, 0xbc, - 0x90, 0x8f, 0xea, 0xfa, 0xbe, 0xa0, 0xb6, 0xb3, 0x9d, 0xda, 0x9b, 0x8b, 0xb7, 0xb8, 0xb9, 0xab, - 0x64, 0x65, 0x62, 0x66, 0x63, 0x67, 0x9e, 0x68, 0x74, 0x71, 0x72, 0x73, 0x78, 0x75, 0x76, 0x77, - 0xac, 0x69, 0xed, 0xee, 0xeb, 0xef, 0xec, 0xbf, 0x80, 0xfd, 0xfe, 0xfb, 0xfc, 0xad, 0x8e, 0x59, - 0x44, 0x45, 0x42, 0x46, 0x43, 0x47, 0x9c, 0x48, 0x54, 0x51, 0x52, 0x53, 0x58, 0x55, 0x56, 0x57, - 0x8c, 0x49, 0xcd, 0xce, 0xcb, 0xcf, 0xcc, 0xe1, 0x70, 0xdd, 0xde, 0xdb, 0xdc, 0x8d, 0xae, 0xdf - }; - -const uint32_t Enc::ebc2asc[256] = - { /* Based on ISO 8859/1 and Code Page 37 */ - 0x00, 0x01, 0x02, 0x03, 0x9c, 0x09, 0x86, 0x7f, 0x97, 0x8d, 0x8e, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, - 0x10, 0x11, 0x12, 0x13, 0x9d, 0x85, 0x08, 0x87, 0x18, 0x19, 0x92, 0x8f, 0x1c, 0x1d, 0x1e, 0x1f, - 0x80, 0x81, 0x82, 0x83, 0x84, 0x0a, 0x17, 0x1b, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x05, 0x06, 0x07, - 0x90, 0x91, 0x16, 0x93, 0x94, 0x95, 0x96, 0x04, 0x98, 0x99, 0x9a, 0x9b, 0x14, 0x15, 0x9e, 0x1a, - 0x20, 0xa0, 0xe2, 0xe4, 0xe0, 0xe1, 0xe3, 0xe5, 0xe7, 0xf1, 0xa2, 0x2e, 0x3c, 0x28, 0x2b, 0x7c, - 0x26, 0xe9, 0xea, 0xeb, 0xe8, 0xed, 0xee, 0xef, 0xec, 0xdf, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0xac, - 0x2d, 0x2f, 0xc2, 0xc4, 0xc0, 0xc1, 0xc3, 0xc5, 0xc7, 0xd1, 0xa6, 0x2c, 0x25, 0x5f, 0x3e, 0x3f, - 0xf8, 0xc9, 0xca, 0xcb, 0xc8, 0xcd, 0xce, 0xcf, 0xcc, 0x60, 0x3a, 0x23, 0x40, 0x27, 0x3d, 0x22, - 0xd8, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xab, 0xbb, 0xf0, 0xfd, 0xde, 0xb1, - 0xb0, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0xaa, 0xba, 0xe6, 0xb8, 0xc6, 0xa4, - 0xb5, 0x7e, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0xa1, 0xbf, 0xd0, 0xdd, 0xfe, 0xae, - 0x5e, 0xa3, 0xa5, 0xb7, 0xa9, 0xa7, 0xb6, 0xbc, 0xbd, 0xbe, 0x5b, 0x5d, 0xaf, 0xa8, 0xb4, 0xd7, - 0x7b, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0xad, 0xf4, 0xf6, 0xf2, 0xf3, 0xf5, - 0x7d, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0xb9, 0xfb, 0xfc, 0xf9, 0xfa, 0xff, - 0x5c, 0xf7, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0xb2, 0xd4, 0xd6, 0xd2, 0xd3, 0xd5, - 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0xb3, 0xdb, 0xdc, 0xd9, 0xda, 0x9f - }; - /* - * Returns code point representation for current - * encoding with regard to current policy. + * Returns code point representation with regard to current policy. * - * Since code point is exacly specified by user, - * it is assumed that user considers it to be valid. - * We must check it. + * Since code point is specified by user, it is assumed that the user + * considers it to be valid. re2c must check it. * - * Returns false if this code point exceeds maximum - * or is forbidden by current policy, otherwise - * returns true. Overwrites code point. + * Returns false if this code point exceeds maximum or is forbidden + * by current policy, otherwise returns true. Overwrites code point. */ -bool Enc::encode(uint32_t & c) const +bool Enc::validateChar(uint32_t &c) const { - if (c >= nCodePoints ()) - { - return false; - } + if (c >= nCodePoints()) return false; - switch (type_) - { + switch (type_) { case ASCII: - return true; case EBCDIC: - c = asc2ebc[c]; return true; case UCS2: case UTF16: @@ -82,20 +33,17 @@ bool Enc::encode(uint32_t & c) const case UTF8: if (c < SURR_MIN || c > SURR_MAX) return true; - else - { - switch (policy_) - { - case POLICY_FAIL: - return false; - case POLICY_SUBSTITUTE: - c = UNICODE_ERROR; - return true; - case POLICY_IGNORE: - return true; - } + switch (policy_) { + case POLICY_FAIL: + return false; + case POLICY_SUBSTITUTE: + c = UNICODE_ERROR; + return true; + case POLICY_IGNORE: + return true; } } + return false; // to silence gcc warning } @@ -105,8 +53,7 @@ bool Enc::encode(uint32_t & c) const */ uint32_t Enc::decodeUnsafe(uint32_t c) const { - switch (type_) - { + switch (type_) { case EBCDIC: c = ebc2asc[c & 0xFF]; break; @@ -121,55 +68,35 @@ uint32_t Enc::decodeUnsafe(uint32_t c) const } /* - * Returns [l - h] range representation for current - * encoding with regard to current policy. + * Returns [l - h] range representation with regard to current policy. * - * Since range borders are exacly specified by user, - * it is assumed that user considers that all code - * points from this range are valid. re2c must check it. + * Since range borders are specified by user, it is assumed that the user + * considers all code points from this range to be valid. re2c must check. * - * Returns NULL if range contains code points that - * exceed maximum or are forbidden by current policy, - * otherwise returns pointer to newly constructed range. + * Returns NULL if range contains code points that exceed maximum or are + * forbidden by current policy, otherwise returns newly constructed range. */ -Range * Enc::encodeRange(uint32_t l, uint32_t h) const +Range * Enc::validateRange(uint32_t l, uint32_t h) const { - if (l >= nCodePoints () || h >= nCodePoints ()) - { - return NULL; - } + if (l >= nCodePoints () || h >= nCodePoints ()) return NULL; Range * r = NULL; - switch (type_) - { + switch (type_) { case ASCII: - r = Range::ran (l, h + 1); - break; case EBCDIC: - { - const uint32_t el = asc2ebc[l]; - r = Range::sym (el); - for (uint32_t c = l + 1; c <= h; ++c) - { - const uint32_t ec = asc2ebc[c]; - r = Range::add (r, Range::sym (ec)); - } + r = Range::ran (l, h + 1); break; - } case UCS2: case UTF16: case UTF32: case UTF8: r = Range::ran (l, h + 1); - if (l <= SURR_MAX && h >= SURR_MIN) - { - switch (policy_) - { + if (l <= SURR_MAX && h >= SURR_MIN) { + switch (policy_) { case POLICY_FAIL: r = NULL; break; - case POLICY_SUBSTITUTE: - { + case POLICY_SUBSTITUTE: { Range * surrs = Range::ran (SURR_MIN, SURR_MAX + 1); Range * error = Range::sym (UNICODE_ERROR); r = Range::sub (r, surrs); diff --git a/re2c/src/re/encoding/enc.h b/re2c/src/re/encoding/enc.h index 2dbb1d2d..3268fdeb 100644 --- a/re2c/src/re/encoding/enc.h +++ b/re2c/src/re/encoding/enc.h @@ -56,8 +56,6 @@ public: }; private: - static const uint32_t asc2ebc[256]; - static const uint32_t ebc2asc[256]; static const uint32_t SURR_MIN; static const uint32_t SURR_MAX; static const uint32_t UNICODE_ERROR; @@ -86,9 +84,9 @@ public: inline void setPolicy(policy_t t); - bool encode(uint32_t & c) const; uint32_t decodeUnsafe(uint32_t c) const; - Range * encodeRange(uint32_t l, uint32_t h) const; + bool validateChar(uint32_t & c) const; + Range * validateRange(uint32_t l, uint32_t h) const; Range * fullRange() const; }; -- 2.40.0