From ea2b021571de830226e56efbbd0b6668c45e7ece Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Wed, 8 May 2019 14:31:14 +0100 Subject: [PATCH] libre2c: extended lexer to handle some escape sequences in character classes. --- Makefile.lib.am | 2 + bootstrap/lib/lex.cc | 210 +++++++++++++++++++++++++++++++------------ lib/lex.re | 19 +++- 3 files changed, 173 insertions(+), 58 deletions(-) diff --git a/Makefile.lib.am b/Makefile.lib.am index 5c26d370..3ac4cf1f 100644 --- a/Makefile.lib.am +++ b/Makefile.lib.am @@ -50,6 +50,7 @@ libre2c_la_HDR = \ src/skeleton/path.h \ src/skeleton/skeleton.h \ src/parse/ast.h \ + src/parse/unescape.h \ src/parse/input.h \ src/parse/parser.h \ src/parse/scanner.h \ @@ -88,6 +89,7 @@ libre2c_la_SRC = \ lib/regfree.cc \ lib/stubs.cc \ src/parse/ast.cc \ + src/parse/unescape.cc \ src/options/opt.cc \ src/cfg/cfg.cc \ src/cfg/compact.cc \ diff --git a/bootstrap/lib/lex.cc b/bootstrap/lib/lex.cc index 62e0e78e..eb5fc303 100644 --- a/bootstrap/lib/lex.cc +++ b/bootstrap/lib/lex.cc @@ -1,4 +1,4 @@ -/* Generated by re2c 1.1.1 on Tue Feb 19 07:33:43 2019 */ +/* Generated by re2c 1.1.1 on Wed May 8 14:28:22 2019 */ #line 1 "../lib/lex.re" #include @@ -6,6 +6,7 @@ #include "src/encoding/enc.h" #include "src/parse/ast.h" +#include "src/parse/unescape.h" #include "src/util/range.h" #include "src/util/s_to_n32_unsafe.h" #include "parse.h" @@ -18,7 +19,7 @@ namespace re2c { static int32_t lex_cls_chr(const char *&, uint32_t &); -#line 28 "../lib/lex.re" +#line 29 "../lib/lex.re" int lex(const char *&cur) @@ -30,7 +31,7 @@ int lex(const char *&cur) uint32_t l, u; -#line 34 "lib/lex.cc" +#line 35 "lib/lex.cc" { char yych; static const unsigned char yybm[] = { @@ -96,13 +97,13 @@ int lex(const char *&cur) } yy2: ++cur; -#line 41 "../lib/lex.re" +#line 42 "../lib/lex.re" { return 0; } -#line 102 "lib/lex.cc" +#line 103 "lib/lex.cc" yy4: ++cur; yy5: -#line 76 "../lib/lex.re" +#line 77 "../lib/lex.re" { ASTChar c = {static_cast(cur[-1]), NOWHERE}; 
std::vector *str = new std::vector; @@ -110,34 +111,34 @@ yy5: yylval.regexp = ast_str(NOWHERE, str, false); return REGEXP; } -#line 114 "lib/lex.cc" +#line 115 "lib/lex.cc" yy6: ++cur; -#line 45 "../lib/lex.re" +#line 46 "../lib/lex.re" { error("anchors are not supported"); return ERROR; } -#line 122 "lib/lex.cc" +#line 123 "lib/lex.cc" yy8: ++cur; -#line 43 "../lib/lex.re" +#line 44 "../lib/lex.re" { return cur[-1]; } -#line 127 "lib/lex.cc" +#line 128 "lib/lex.cc" yy10: ++cur; -#line 71 "../lib/lex.re" +#line 72 "../lib/lex.re" { yylval.regexp = ast_dot(NOWHERE); return REGEXP; } -#line 135 "lib/lex.cc" +#line 136 "lib/lex.cc" yy12: yych = *++cur; if (yych == '^') goto yy15; -#line 51 "../lib/lex.re" +#line 52 "../lib/lex.re" { goto cls; } -#line 141 "lib/lex.cc" +#line 142 "lib/lex.cc" yy14: yych = *(mar = ++cur); if (yych <= '/') goto yy5; @@ -148,9 +149,9 @@ yy14: goto yy5; yy15: ++cur; -#line 50 "../lib/lex.re" +#line 51 "../lib/lex.re" { neg = true; goto cls; } -#line 154 "lib/lex.cc" +#line 155 "lib/lex.cc" yy17: yych = *++cur; if (yybm[0+yych] & 128) { @@ -173,13 +174,13 @@ yy20: yy21: ++cur; x = yyt1; -#line 53 "../lib/lex.re" +#line 54 "../lib/lex.re" { if (!s_to_u32_unsafe(x, cur - 1, yylval.bounds.min)) goto err_cnt; yylval.bounds.max = yylval.bounds.min; return COUNT; } -#line 183 "lib/lex.cc" +#line 184 "lib/lex.cc" yy23: yych = *++cur; if (yych <= '/') goto yy19; @@ -189,40 +190,40 @@ yy23: yy25: ++cur; x = yyt1; -#line 65 "../lib/lex.re" +#line 66 "../lib/lex.re" { if (!s_to_u32_unsafe(x, cur - 2, yylval.bounds.min)) goto err_cnt; yylval.bounds.max = AST::MANY; return COUNT; } -#line 199 "lib/lex.cc" +#line 200 "lib/lex.cc" yy27: ++cur; x = yyt1; y = yyt2; -#line 59 "../lib/lex.re" +#line 60 "../lib/lex.re" { if (!s_to_u32_unsafe(x, y - 1, yylval.bounds.min) || !s_to_u32_unsafe(y, cur - 1, yylval.bounds.max)) goto err_cnt; return COUNT; } -#line 210 "lib/lex.cc" +#line 211 "lib/lex.cc" } -#line 83 "../lib/lex.re" +#line 84 "../lib/lex.re" cls: if 
(lex_cls_chr(cur, l) != 0) goto err; -#line 218 "lib/lex.cc" +#line 219 "lib/lex.cc" { char yych; yych = *(mar = cur); if (yych == '-') goto yy32; yy31: -#line 88 "../lib/lex.re" +#line 89 "../lib/lex.re" { u = l; goto add; } -#line 226 "lib/lex.cc" +#line 227 "lib/lex.cc" yy32: yych = *++cur; if (yych != ']') goto yy34; @@ -231,36 +232,36 @@ yy32: yy34: ++cur; cur -= 1; -#line 89 "../lib/lex.re" +#line 90 "../lib/lex.re" { if (lex_cls_chr(cur, u) != 0) goto err; goto add; } -#line 237 "lib/lex.cc" +#line 238 "lib/lex.cc" } -#line 90 "../lib/lex.re" +#line 91 "../lib/lex.re" add: if (l > u) goto err; cls.push_back(ASTRange(l, u, NOWHERE)); -#line 245 "lib/lex.cc" +#line 246 "lib/lex.cc" { char yych; yych = *cur; if (yych == ']') goto yy39; -#line 95 "../lib/lex.re" +#line 96 "../lib/lex.re" { goto cls; } -#line 252 "lib/lex.cc" +#line 253 "lib/lex.cc" yy39: ++cur; -#line 96 "../lib/lex.re" +#line 97 "../lib/lex.re" { std::vector *p = new std::vector; p->swap(cls); yylval.regexp = ast_cls(NOWHERE, p, neg); return REGEXP; } -#line 262 "lib/lex.cc" +#line 263 "lib/lex.cc" } -#line 102 "../lib/lex.re" +#line 103 "../lib/lex.re" err: @@ -274,50 +275,147 @@ err_cnt: int32_t lex_cls_chr(const char *&cur, uint32_t &c) { + const char *mar, *p = cur; -#line 279 "lib/lex.cc" +#line 281 "lib/lex.cc" { char yych; yych = *cur; if (yych <= 0x00) goto yy43; - if (yych == '[') goto yy47; + if (yych <= 'Z') goto yy45; + if (yych <= '[') goto yy47; + if (yych <= '\\') goto yy48; goto yy45; yy43: ++cur; -#line 116 "../lib/lex.re" +#line 118 "../lib/lex.re" { return 1; } -#line 290 "lib/lex.cc" +#line 294 "lib/lex.cc" yy45: ++cur; yy46: -#line 121 "../lib/lex.re" +#line 136 "../lib/lex.re" { c = static_cast(cur[-1]); return 0; } -#line 296 "lib/lex.cc" +#line 300 "lib/lex.cc" yy47: yych = *++cur; if (yych <= '9') { - if (yych != '.') goto yy46; + if (yych == '.') goto yy50; + goto yy46; } else { - if (yych <= ':') goto yy50; - if (yych == '=') goto yy52; + if (yych <= ':') goto yy52; + 
if (yych == '=') goto yy54; goto yy46; } - ++cur; -#line 117 "../lib/lex.re" - { error("collating characters not supported"); return 1; } -#line 309 "lib/lex.cc" +yy48: + yych = *(mar = ++cur); + switch (yych) { + case '\\': goto yy56; + case ']': goto yy58; + case 'a': goto yy60; + case 'b': goto yy62; + case 'f': goto yy64; + case 'n': goto yy66; + case 'r': goto yy68; + case 't': goto yy70; + case 'v': goto yy72; + case 'x': goto yy74; + default: goto yy49; + } +yy49: +#line 125 "../lib/lex.re" + { c = static_cast('\\'); return 0; } +#line 329 "lib/lex.cc" yy50: ++cur; -#line 118 "../lib/lex.re" - { error("character classes not supported"); return 1; } -#line 314 "lib/lex.cc" +#line 119 "../lib/lex.re" + { error("collating characters not supported"); return 1; } +#line 334 "lib/lex.cc" yy52: ++cur; -#line 119 "../lib/lex.re" - { error("equivalence classes not supported"); return 1; } -#line 319 "lib/lex.cc" +#line 120 "../lib/lex.re" + { error("character classes not supported"); return 1; } +#line 339 "lib/lex.cc" +yy54: + ++cur; +#line 121 "../lib/lex.re" + { error("equivalence classes not supported"); return 1; } +#line 344 "lib/lex.cc" +yy56: + ++cur; +#line 133 "../lib/lex.re" + { c = static_cast('\\'); return 0; } +#line 349 "lib/lex.cc" +yy58: + ++cur; +#line 134 "../lib/lex.re" + { c = static_cast(']'); return 0; } +#line 354 "lib/lex.cc" +yy60: + ++cur; +#line 126 "../lib/lex.re" + { c = static_cast('\a'); return 0; } +#line 359 "lib/lex.cc" +yy62: + ++cur; +#line 127 "../lib/lex.re" + { c = static_cast('\b'); return 0; } +#line 364 "lib/lex.cc" +yy64: + ++cur; +#line 128 "../lib/lex.re" + { c = static_cast('\f'); return 0; } +#line 369 "lib/lex.cc" +yy66: + ++cur; +#line 129 "../lib/lex.re" + { c = static_cast('\n'); return 0; } +#line 374 "lib/lex.cc" +yy68: + ++cur; +#line 130 "../lib/lex.re" + { c = static_cast('\r'); return 0; } +#line 379 "lib/lex.cc" +yy70: + ++cur; +#line 131 "../lib/lex.re" + { c = static_cast('\t'); return 0; } +#line 384 
"lib/lex.cc" +yy72: + ++cur; +#line 132 "../lib/lex.re" + { c = static_cast('\v'); return 0; } +#line 389 "lib/lex.cc" +yy74: + yych = *++cur; + if (yych <= '@') { + if (yych <= '/') goto yy75; + if (yych <= '9') goto yy76; + } else { + if (yych <= 'F') goto yy76; + if (yych <= '`') goto yy75; + if (yych <= 'f') goto yy76; + } +yy75: + cur = mar; + goto yy49; +yy76: + yych = *++cur; + if (yych <= '@') { + if (yych <= '/') goto yy75; + if (yych >= ':') goto yy75; + } else { + if (yych <= 'F') goto yy77; + if (yych <= '`') goto yy75; + if (yych >= 'g') goto yy75; + } +yy77: + ++cur; +#line 123 "../lib/lex.re" + { c = unesc_hex(p, cur); return 0; } +#line 417 "lib/lex.cc" } -#line 122 "../lib/lex.re" +#line 137 "../lib/lex.re" } diff --git a/lib/lex.re b/lib/lex.re index 28c927b3..754d4c90 100644 --- a/lib/lex.re +++ b/lib/lex.re @@ -4,6 +4,7 @@ #include "src/encoding/enc.h" #include "src/parse/ast.h" +#include "src/parse/unescape.h" #include "src/util/range.h" #include "src/util/s_to_n32_unsafe.h" #include "parse.h" @@ -112,11 +113,25 @@ err_cnt: int32_t lex_cls_chr(const char *&cur, uint32_t &c) { + const char *mar, *p = cur; /*!re2c * { return 1; } "[." 
{ error("collating characters not supported"); return 1; } - "[:" { error("character classes not supported"); return 1; } - "[=" { error("equivalence classes not supported"); return 1; } + "[:" { error("character classes not supported"); return 1; } + "[=" { error("equivalence classes not supported"); return 1; } + + "\\x"[0-9a-fA-F]{2} { c = unesc_hex(p, cur); return 0; } + + "\\" { c = static_cast('\\'); return 0; } + "\\a" { c = static_cast('\a'); return 0; } + "\\b" { c = static_cast('\b'); return 0; } + "\\f" { c = static_cast('\f'); return 0; } + "\\n" { c = static_cast('\n'); return 0; } + "\\r" { c = static_cast('\r'); return 0; } + "\\t" { c = static_cast('\t'); return 0; } + "\\v" { c = static_cast('\v'); return 0; } + "\\\\" { c = static_cast('\\'); return 0; } + "\\]" { c = static_cast(']'); return 0; } [^] \ nil { c = static_cast(cur[-1]); return 0; } */ -- 2.40.0