From: helly Date: Wed, 28 Dec 2005 18:33:37 +0000 (+0000) Subject: - Added experimental unicode support X-Git-Tag: 0.13.6~569 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=34d774e78d723f68324f88d4be747c0f3890c630;p=re2c - Added experimental unicode support --- diff --git a/CHANGELOG b/CHANGELOG index 85cc98a6..44cf0e9d 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,5 +1,8 @@ Version 0.9.13 (????-??-??) --------------------------- +- Added support for DOS line endings. +- Added experimental unicode support. +- Applied #1307467 Unicode patch for 0.9.7. Version 0.9.12 (2005-12-28) --------------------------- diff --git a/actions.cc b/actions.cc index bc900add..f697e1d7 100644 --- a/actions.cc +++ b/actions.cc @@ -513,40 +513,28 @@ void CloseVOp::split(CharSet &s) RegExp *expr(Scanner &); -uchar Scanner::unescape(SubStr &s) const +uint Scanner::unescape(SubStr &s) const { s.len--; - uchar c; + uint c; if ((c = *s.str++) != '\\' || s.len == 0) - return xlat[c]; + { + return xlat(c); + } s.len--; switch (c = *s.str++) { + case 'n': return xlat('\n'); + case 't': return xlat('\t'); + case 'v': return xlat('\v'); + case 'b': return xlat('\b'); + case 'r': return xlat('\r'); + case 'f': return xlat('\f'); + case 'a': return xlat('\a'); - case 'n': - return xlat['\n']; - - case 't': - return xlat['\t']; - - case 'v': - return xlat['\v']; - - case 'b': - return xlat['\b']; - - case 'r': - return xlat['\r']; - - case 'f': - return xlat['\f']; - - case 'a': - return xlat['\a']; - case 'x': { static const char * hex = "0123456789abcdef"; @@ -560,25 +548,42 @@ uchar Scanner::unescape(SubStr &s) const s.len -= 2; s.str += 2; - uchar v = (uchar)((p1 - hex) << 4) + (uchar)(p2 - hex); + uint v = (uint)((p1 - hex) << 4) + + (uint)((p2 - hex)); return v; } - case '0': + case 'X': + { + static const char * hex = "0123456789abcdef"; + char *p1, *p2, *p3, *p4; - case '1': + if (s.len < 4 || !(p1 = strchr(hex, tolower(s.str[0]))) + || !(p2 = strchr(hex, tolower(s.str[1]))) + || !(p3 = strchr(hex, tolower(s.str[2]))) + || !(p4 = strchr(hex, tolower(s.str[3])))) + { + fatal("Illegal hexadecimal character code"); + } + s.len -= 4; + s.str += 4; + + uint v = (uint)((p1 - hex) << 12) + + (uint)((p2 - hex) << 8) + + (uint)((p3 - hex) << 4) + + (uint)((p4 - hex)); - case '2': + return v; + } + case '0': + case '1': + case '2': case '3': - case '4': - case '5': - case '6': - case '7': { static const char * oct = "01234567"; @@ -593,13 +598,13 @@ uchar Scanner::unescape(SubStr &s) const s.len -= 2; s.str += 2; - uchar v = (uchar)((p0 - oct) << 6) + (uchar)((p1 - oct) << 3) + (uchar)(p2 - oct); + uint v = (uint)((p0 - oct) << 6) + (uint)((p1 - oct) << 3) + (uint)(p2 - oct); return v; } default: - return xlat[c]; + return xlat(c); } } @@ -624,18 +629,18 @@ Range * Scanner::getRange(SubStr &s) const ub = tmp; } - xlb = xlat[lb]; - xub = xlat[ub]; + xlb = xlat(lb); + xub = xlat(ub); for(c = lb; c <= ub; c++) { - if (!(xlb <= xlat[c] && xlat[c] <= ub)) + if (!(xlb <= xlat(c) && xlat(c) <= ub)) { /* range doesn't work */ Range * r = new Range(xlb, xlb + 1); for (c = lb + 1; c <= ub; c++) { - r = doUnion(r, new Range(xlat[c], xlat[c] + 1)); + r = doUnion(r, new Range(xlat(c), xlat(c) + 1)); } return r; } @@ -677,14 +682,14 @@ RegExp * Scanner::strToCaseInsensitiveRE(SubStr s) const if (s.len == 0) return new NullOp; - uchar c = unescape(s); + uint c = unescape(s); RegExp *re, *reL, *reU; if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) { - reL = matchChar(xlat[tolower(c)]); - reU = matchChar(xlat[toupper(c)]); + reL = matchChar(xlat(tolower(c))); + reU = matchChar(xlat(toupper(c))); re = mkAlt(reL, reU); } else @@ -694,12 +699,12 @@ RegExp * Scanner::strToCaseInsensitiveRE(SubStr s) const while (s.len > 0) { - uchar c = unescape(s); + uint c = unescape(s); if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) { - reL = matchChar(xlat[tolower(c)]); - reU = matchChar(xlat[toupper(c)]); + reL = matchChar(xlat(tolower(c))); + reU = matchChar(xlat(toupper(c))); re = new CatOp(re, mkAlt(reL, reU)); } else @@ -751,7 +756,7 @@ RegExp * Scanner::invToRE(SubStr s) const RegExp * Scanner::mkDot() const { RegExp * any = ranToRE(SubStr("[\\000-\\377]")); - RegExp * ran = matchChar(xlat['\n']); + RegExp * ran = matchChar(xlat('\n')); RegExp * inv = mkDiff(any, ran); delete ran; @@ -843,7 +848,7 @@ void genCode(std::ostream& o, RegExp *re) uint j; memset(&cs, 0, sizeof(cs)); - for (j = 0; j < nChars; ++j) + for (j = 0; j < nRealChars; ++j) { cs.rep[j] = &cs.ptn[0]; cs.ptn[j].nxt = &cs.ptn[j + 1]; @@ -863,7 +868,7 @@ void genCode(std::ostream& o, RegExp *re) */ Char rep[nChars]; - for (j = 0; j < nChars; ++j) + for (j = 0; j < nRealChars; ++j) { if (!cs.rep[j]->nxt) cs.rep[j]->nxt = &cs.ptn[j]; @@ -895,7 +900,7 @@ void genCode(std::ostream& o, RegExp *re) } } - DFA *dfa = new DFA(ins, re->size, 0, 256, rep); + DFA *dfa = new DFA(ins, re->size, 0, nRealChars, rep); dfa->emit(o); delete dfa; delete [] ins; diff --git a/bootstrap/scanner.cc b/bootstrap/scanner.cc index 1ef13067..4c6497df 100644 --- a/bootstrap/scanner.cc +++ b/bootstrap/scanner.cc @@ -1,4 +1,4 @@ -/* Generated by re2c 0.9.12.dev on Wed Dec 28 00:53:17 2005 */ +/* Generated by re2c 0.9.13.dev on Wed Dec 28 18:30:39 2005 */ #line 1 "scanner.re" /* $Id$ */ #include @@ -30,42 +30,48 @@ namespace re2c Scanner::Scanner(std::istream& i) : in(i), bot(NULL), tok(NULL), ptr(NULL), cur(NULL), pos(NULL), lim(NULL), - top(NULL), eof(NULL), tchar(0), tline(0), cline(1) { + top(NULL), eof(NULL), tchar(0), tline(0), cline(1) +{ ; } -char *Scanner::fill(char *cursor){ - if(!eof){ - uint cnt = tok - bot; - if(cnt){ - memcpy(bot, tok, lim - tok); - tok = bot; - ptr -= cnt; - cursor -= cnt; - pos -= cnt; - lim -= cnt; - } - if((top - lim) < BSIZE){ - char *buf = new char[(lim - bot) + BSIZE]; - memcpy(buf, tok, lim - tok); - tok = buf; - ptr = &buf[ptr - bot]; - cursor = &buf[cursor - bot]; - pos = &buf[pos - bot]; - lim = &buf[lim - bot]; - top = &lim[BSIZE]; - delete [] bot; - bot = buf; - } - if((cnt = in.rdbuf()->sgetn((char*) lim, BSIZE)) != BSIZE){ - eof = &lim[cnt]; *eof++ = '\0'; +char *Scanner::fill(char *cursor) +{ + if(!eof) + { + uint cnt = tok - bot; + if(cnt) + { + memcpy(bot, tok, lim - tok); + tok = bot; + ptr -= cnt; + cursor -= cnt; + pos -= cnt; + lim -= cnt; + } + if((top - lim) < BSIZE) + { + char *buf = new char[(lim - bot) + BSIZE]; + memcpy(buf, tok, lim - tok); + tok = buf; + ptr = &buf[ptr - bot]; + cursor = &buf[cursor - bot]; + pos = &buf[pos - bot]; + lim = &buf[lim - bot]; + top = &lim[BSIZE]; + delete [] bot; + bot = buf; + } + if((cnt = in.rdbuf()->sgetn((char*) lim, BSIZE)) != BSIZE) + { + eof = &lim[cnt]; *eof++ = '\0'; + } + lim += cnt; } - lim += cnt; - } - return cursor; + return cursor; } -#line 77 "scanner.re" +#line 83 "scanner.re" int Scanner::echo(std::ostream &out){ @@ -81,7 +87,7 @@ int Scanner::echo(std::ostream &out){ tok = cursor; echo: -#line 85 "scanner.cc" +#line 91 "scanner.cc" { YYCTYPE yych; unsigned int yyaccept = 0; @@ -104,41 +110,41 @@ yy2: yyaccept = 0; if(yych == '*') goto yy12; goto yy3; yy3: -#line 123 "scanner.re" +#line 129 "scanner.re" { goto echo; } -#line 112 "scanner.cc" +#line 118 "scanner.cc" yy4: yych = *++YYCURSOR; if(yych == '/') goto yy10; goto yy3; yy5: ++YYCURSOR; goto yy6; yy6: -#line 112 "scanner.re" +#line 118 "scanner.re" { out.write((const char*)(tok), (const char*)(cursor) - (const char*)(tok)); tok = pos = cursor; cline++; oline++; goto echo; } -#line 125 "scanner.cc" +#line 131 "scanner.cc" yy7: ++YYCURSOR; goto yy8; yy8: -#line 117 "scanner.re" +#line 123 "scanner.re" { out.write((const char*)(tok), (const char*)(cursor) - (const char*)(tok) - 1); // -1 so we don't write out the \0 if(cursor == eof) { RETURN(0); } } -#line 136 "scanner.cc" +#line 142 "scanner.cc" yy9: yych = *++YYCURSOR; goto yy3; yy10: ++YYCURSOR; goto yy11; yy11: -#line 103 "scanner.re" +#line 109 "scanner.re" { if (ignore_eoc) { ignore_eoc = false; @@ -148,7 +154,7 @@ yy11: tok = pos = cursor; goto echo; } -#line 152 "scanner.cc" +#line 158 "scanner.cc" yy12: yych = *++YYCURSOR; if(yych == '!') goto yy14; goto yy13; @@ -175,13 +181,13 @@ yy18: yych = *++YYCURSOR; yy19: ++YYCURSOR; goto yy20; yy20: -#line 92 "scanner.re" +#line 98 "scanner.re" { out.write((const char*)(tok), (const char*)(&cursor[-7]) - (const char*)(tok)); tok = cursor; RETURN(1); } -#line 185 "scanner.cc" +#line 191 "scanner.cc" yy21: yych = *++YYCURSOR; if(yych != 'x') goto yy13; goto yy22; @@ -203,16 +209,16 @@ yy26: yych = *++YYCURSOR; yy27: ++YYCURSOR; goto yy28; yy28: -#line 97 "scanner.re" +#line 103 "scanner.re" { out << "#define YYMAXFILL " << maxFill << std::endl; tok = pos = cursor; ignore_eoc = true; goto echo; } -#line 214 "scanner.cc" +#line 220 "scanner.cc" } -#line 126 "scanner.re" +#line 132 "scanner.re" } @@ -226,7 +232,7 @@ scan: tline = cline; tok = cursor; -#line 230 "scanner.cc" +#line 236 "scanner.cc" { YYCTYPE yych; unsigned int yyaccept = 0; @@ -235,544 +241,567 @@ scan: yy29: if((YYLIMIT - YYCURSOR) < 4) YYFILL(4); yych = *YYCURSOR; - if(yych <= '/'){ - if(yych <= '"'){ - if(yych <= 0x0A){ - if(yych <= 0x08) goto yy53; + if(yych <= '.'){ + if(yych <= '!'){ + if(yych <= 0x0C){ + if(yych <= 0x08) goto yy55; if(yych <= 0x09) goto yy49; - goto yy51; + if(yych <= 0x0A) goto yy53; + goto yy55; } else { + if(yych <= 0x0D) goto yy51; if(yych == ' ') goto yy49; - if(yych <= '!') goto yy53; - goto yy37; + goto yy55; } } else { - if(yych <= '*'){ - if(yych <= '&') goto yy53; + if(yych <= ')'){ + if(yych <= '"') goto yy37; + if(yych <= '&') goto yy55; if(yych <= '\'') goto yy39; - if(yych <= ')') goto yy43; - goto yy35; + goto yy43; } else { + if(yych <= '*') goto yy35; if(yych <= '+') goto yy44; - if(yych <= '-') goto yy53; - if(yych <= '.') goto yy47; - goto yy33; + if(yych <= '-') goto yy55; + goto yy47; } } } else { if(yych <= '@'){ if(yych <= '<'){ + if(yych <= '/') goto yy33; if(yych == ';') goto yy43; - goto yy53; + goto yy55; } else { if(yych <= '=') goto yy43; if(yych == '?') goto yy44; - goto yy53; + goto yy55; } } else { if(yych <= '`'){ if(yych <= 'Z') goto yy45; if(yych <= '[') goto yy41; if(yych <= '\\') goto yy43; - goto yy53; + goto yy55; } else { if(yych <= 'z') goto yy45; if(yych <= '{') goto yy31; if(yych <= '|') goto yy43; - goto yy53; + goto yy55; } } } yy31: yyaccept = 0; yych = *(YYMARKER = ++YYCURSOR); if(yych <= '/'){ - if(yych == ',') goto yy87; + if(yych == ',') goto yy90; goto yy32; } else { - if(yych <= '0') goto yy84; - if(yych <= '9') goto yy85; + if(yych <= '0') goto yy87; + if(yych <= '9') goto yy88; goto yy32; } yy32: -#line 139 "scanner.re" +#line 145 "scanner.re" { depth = 1; goto code; } -#line 302 "scanner.cc" +#line 310 "scanner.cc" yy33: ++YYCURSOR; - if((yych = *YYCURSOR) == '*') goto yy82; + if((yych = *YYCURSOR) == '*') goto yy85; goto yy34; yy34: -#line 169 "scanner.re" +#line 175 "scanner.re" { RETURN(*tok); } -#line 309 "scanner.cc" +#line 317 "scanner.cc" yy35: ++YYCURSOR; - if((yych = *YYCURSOR) == '/') goto yy80; + if((yych = *YYCURSOR) == '/') goto yy83; goto yy36; yy36: -#line 171 "scanner.re" +#line 177 "scanner.re" { yylval.op = *tok; RETURN(CLOSE); } -#line 317 "scanner.cc" +#line 325 "scanner.cc" yy37: yyaccept = 1; yych = *(YYMARKER = ++YYCURSOR); - if(yych != 0x0A) goto yy76; + if(yych != 0x0A) goto yy79; goto yy38; yy38: -#line 156 "scanner.re" +#line 162 "scanner.re" { fatal("unterminated string constant (missing \")"); } -#line 325 "scanner.cc" +#line 333 "scanner.cc" yy39: yyaccept = 2; yych = *(YYMARKER = ++YYCURSOR); - if(yych != 0x0A) goto yy71; + if(yych != 0x0A) goto yy74; goto yy40; yy40: -#line 157 "scanner.re" +#line 163 "scanner.re" { fatal("unterminated string constant (missing ')"); } -#line 333 "scanner.cc" +#line 341 "scanner.cc" yy41: yyaccept = 3; yych = *(YYMARKER = ++YYCURSOR); if(yych == 0x0A) goto yy42; - if(yych == '^') goto yy62; - goto yy60; + if(yych == '^') goto yy65; + goto yy63; yy42: -#line 167 "scanner.re" +#line 173 "scanner.re" { fatal("unterminated range (missing ])"); } -#line 342 "scanner.cc" +#line 350 "scanner.cc" yy43: yych = *++YYCURSOR; goto yy34; yy44: yych = *++YYCURSOR; goto yy36; yy45: ++YYCURSOR; yych = *YYCURSOR; - goto yy58; + goto yy61; yy46: -#line 191 "scanner.re" +#line 197 "scanner.re" { cur = cursor; yylval.symbol = Symbol::find(token()); return ID; } -#line 355 "scanner.cc" +#line 363 "scanner.cc" yy47: ++YYCURSOR; goto yy48; yy48: -#line 195 "scanner.re" +#line 201 "scanner.re" { cur = cursor; yylval.regexp = mkDot(); return RANGE; } -#line 364 "scanner.cc" +#line 372 "scanner.cc" yy49: ++YYCURSOR; yych = *YYCURSOR; - goto yy56; + goto yy59; yy50: -#line 200 "scanner.re" +#line 206 "scanner.re" { goto scan; } -#line 371 "scanner.cc" +#line 379 "scanner.cc" yy51: ++YYCURSOR; + if((yych = *YYCURSOR) == 0x0A) goto yy56; goto yy52; yy52: -#line 202 "scanner.re" -{ if(cursor == eof) RETURN(0); - pos = cursor; cline++; +#line 217 "scanner.re" +{ std::cerr << "line " << tline << ", column " << (tchar + 1) + << ": unexpected character: "; + if (isprint(*tok)) + { + std::cerr << *tok << std::endl; + } + else + { + std::cerr << "0x" << hexCh(*tok >> 4) << hexCh(*tok) << std::endl; + } goto scan; - } -#line 380 "scanner.cc" + } +#line 397 "scanner.cc" yy53: ++YYCURSOR; goto yy54; yy54: -#line 207 "scanner.re" -{ std::cerr << "unexpected character: " << *tok << std::endl; +#line 212 "scanner.re" +{ if(cursor == eof) RETURN(0); + pos = cursor; cline++; goto scan; - } -#line 388 "scanner.cc" -yy55: ++YYCURSOR; + } +#line 406 "scanner.cc" +yy55: yych = *++YYCURSOR; + goto yy52; +yy56: ++YYCURSOR; + goto yy57; +yy57: +#line 208 "scanner.re" +{ if(cursor == eof) RETURN(0); + pos = cursor; cline++; + goto scan; + } +#line 417 "scanner.cc" +yy58: ++YYCURSOR; if(YYLIMIT == YYCURSOR) YYFILL(1); yych = *YYCURSOR; - goto yy56; -yy56: if(yych == 0x09) goto yy55; - if(yych == ' ') goto yy55; + goto yy59; +yy59: if(yych == 0x09) goto yy58; + if(yych == ' ') goto yy58; goto yy50; -yy57: ++YYCURSOR; +yy60: ++YYCURSOR; if(YYLIMIT == YYCURSOR) YYFILL(1); yych = *YYCURSOR; - goto yy58; -yy58: if(yych <= '@'){ + goto yy61; +yy61: if(yych <= '@'){ if(yych <= '/') goto yy46; - if(yych <= '9') goto yy57; + if(yych <= '9') goto yy60; goto yy46; } else { - if(yych <= 'Z') goto yy57; + if(yych <= 'Z') goto yy60; if(yych <= '`') goto yy46; - if(yych <= 'z') goto yy57; + if(yych <= 'z') goto yy60; goto yy46; } -yy59: ++YYCURSOR; +yy62: ++YYCURSOR; if(YYLIMIT == YYCURSOR) YYFILL(1); yych = *YYCURSOR; - goto yy60; -yy60: if(yych <= '['){ - if(yych != 0x0A) goto yy59; - goto yy61; + goto yy63; +yy63: if(yych <= '['){ + if(yych != 0x0A) goto yy62; + goto yy64; } else { - if(yych <= '\\') goto yy64; - if(yych <= ']') goto yy65; - goto yy59; + if(yych <= '\\') goto yy67; + if(yych <= ']') goto yy68; + goto yy62; } -yy61: YYCURSOR = YYMARKER; +yy64: YYCURSOR = YYMARKER; switch(yyaccept){ case 0: goto yy32; case 1: goto yy38; case 2: goto yy40; case 3: goto yy42; - case 4: goto yy88; + case 4: goto yy91; } -yy62: ++YYCURSOR; +yy65: ++YYCURSOR; if(YYLIMIT == YYCURSOR) YYFILL(1); yych = *YYCURSOR; - goto yy63; -yy63: if(yych <= '['){ - if(yych == 0x0A) goto yy61; - goto yy62; + goto yy66; +yy66: if(yych <= '['){ + if(yych == 0x0A) goto yy64; + goto yy65; } else { - if(yych <= '\\') goto yy67; - if(yych <= ']') goto yy68; - goto yy62; + if(yych <= '\\') goto yy70; + if(yych <= ']') goto yy71; + goto yy65; } -yy64: ++YYCURSOR; - if(YYLIMIT == YYCURSOR) YYFILL(1); - yych = *YYCURSOR; - if(yych == 0x0A) goto yy61; - goto yy59; -yy65: ++YYCURSOR; - goto yy66; -yy66: -#line 163 "scanner.re" -{ cur = cursor; - yylval.regexp = ranToRE(token()); - return RANGE; } -#line 454 "scanner.cc" yy67: ++YYCURSOR; if(YYLIMIT == YYCURSOR) YYFILL(1); yych = *YYCURSOR; - if(yych == 0x0A) goto yy61; + if(yych == 0x0A) goto yy64; goto yy62; yy68: ++YYCURSOR; goto yy69; yy69: -#line 159 "scanner.re" +#line 169 "scanner.re" { cur = cursor; - yylval.regexp = invToRE(token()); + yylval.regexp = ranToRE(token()); return RANGE; } -#line 467 "scanner.cc" +#line 483 "scanner.cc" yy70: ++YYCURSOR; if(YYLIMIT == YYCURSOR) YYFILL(1); yych = *YYCURSOR; - goto yy71; -yy71: if(yych <= '&'){ - if(yych == 0x0A) goto yy61; - goto yy70; + if(yych == 0x0A) goto yy64; + goto yy65; +yy71: ++YYCURSOR; + goto yy72; +yy72: +#line 165 "scanner.re" +{ cur = cursor; + yylval.regexp = invToRE(token()); + return RANGE; } +#line 496 "scanner.cc" +yy73: ++YYCURSOR; + if(YYLIMIT == YYCURSOR) YYFILL(1); + yych = *YYCURSOR; + goto yy74; +yy74: if(yych <= '&'){ + if(yych == 0x0A) goto yy64; + goto yy73; } else { - if(yych <= '\'') goto yy73; - if(yych != '\\') goto yy70; - goto yy72; + if(yych <= '\'') goto yy76; + if(yych != '\\') goto yy73; + goto yy75; } -yy72: ++YYCURSOR; +yy75: ++YYCURSOR; if(YYLIMIT == YYCURSOR) YYFILL(1); yych = *YYCURSOR; - if(yych == 0x0A) goto yy61; - goto yy70; -yy73: ++YYCURSOR; - goto yy74; -yy74: -#line 152 "scanner.re" + if(yych == 0x0A) goto yy64; + goto yy73; +yy76: ++YYCURSOR; + goto yy77; +yy77: +#line 158 "scanner.re" { cur = cursor; yylval.regexp = strToCaseInsensitiveRE(token()); return STRING; } -#line 492 "scanner.cc" -yy75: ++YYCURSOR; +#line 521 "scanner.cc" +yy78: ++YYCURSOR; if(YYLIMIT == YYCURSOR) YYFILL(1); yych = *YYCURSOR; - goto yy76; -yy76: if(yych <= '!'){ - if(yych == 0x0A) goto yy61; - goto yy75; + goto yy79; +yy79: if(yych <= '!'){ + if(yych == 0x0A) goto yy64; + goto yy78; } else { - if(yych <= '"') goto yy78; - if(yych != '\\') goto yy75; - goto yy77; + if(yych <= '"') goto yy81; + if(yych != '\\') goto yy78; + goto yy80; } -yy77: ++YYCURSOR; +yy80: ++YYCURSOR; if(YYLIMIT == YYCURSOR) YYFILL(1); yych = *YYCURSOR; - if(yych == 0x0A) goto yy61; - goto yy75; -yy78: ++YYCURSOR; - goto yy79; -yy79: -#line 148 "scanner.re" + if(yych == 0x0A) goto yy64; + goto yy78; +yy81: ++YYCURSOR; + goto yy82; +yy82: +#line 154 "scanner.re" { cur = cursor; yylval.regexp = strToRE(token()); return STRING; } -#line 517 "scanner.cc" -yy80: ++YYCURSOR; - goto yy81; -yy81: -#line 145 "scanner.re" +#line 546 "scanner.cc" +yy83: ++YYCURSOR; + goto yy84; +yy84: +#line 151 "scanner.re" { tok = cursor; RETURN(0); } -#line 524 "scanner.cc" -yy82: ++YYCURSOR; - goto yy83; -yy83: -#line 142 "scanner.re" +#line 553 "scanner.cc" +yy85: ++YYCURSOR; + goto yy86; +yy86: +#line 148 "scanner.re" { depth = 1; goto comment; } -#line 531 "scanner.cc" -yy84: yych = *++YYCURSOR; - if(yych == ',') goto yy98; - goto yy86; -yy85: ++YYCURSOR; +#line 560 "scanner.cc" +yy87: yych = *++YYCURSOR; + if(yych == ',') goto yy101; + goto yy89; +yy88: ++YYCURSOR; if((YYLIMIT - YYCURSOR) < 2) YYFILL(2); yych = *YYCURSOR; - goto yy86; -yy86: if(yych <= '/'){ - if(yych == ',') goto yy91; - goto yy61; + goto yy89; +yy89: if(yych <= '/'){ + if(yych == ',') goto yy94; + goto yy64; } else { - if(yych <= '9') goto yy85; - if(yych == '}') goto yy89; - goto yy61; + if(yych <= '9') goto yy88; + if(yych == '}') goto yy92; + goto yy64; } -yy87: ++YYCURSOR; - goto yy88; -yy88: -#line 189 "scanner.re" +yy90: ++YYCURSOR; + goto yy91; +yy91: +#line 195 "scanner.re" { fatal("illegal closure form, use '{n}', '{n,}', '{n,m}' where n and m are numbers"); } -#line 552 "scanner.cc" -yy89: ++YYCURSOR; - goto yy90; -yy90: -#line 177 "scanner.re" -{ yylval.extop.minsize = atoi((char *)tok+1); - yylval.extop.maxsize = atoi((char *)tok+1); - RETURN(CLOSESIZE); } -#line 560 "scanner.cc" -yy91: yyaccept = 4; - yych = *(YYMARKER = ++YYCURSOR); - if(yych <= '/') goto yy88; - if(yych <= '9') goto yy94; - if(yych != '}') goto yy88; - goto yy92; +#line 581 "scanner.cc" yy92: ++YYCURSOR; goto yy93; yy93: -#line 185 "scanner.re" +#line 183 "scanner.re" { yylval.extop.minsize = atoi((char *)tok+1); - yylval.extop.maxsize = -1; + yylval.extop.maxsize = atoi((char *)tok+1); RETURN(CLOSESIZE); } -#line 574 "scanner.cc" -yy94: ++YYCURSOR; - if(YYLIMIT == YYCURSOR) YYFILL(1); - yych = *YYCURSOR; +#line 589 "scanner.cc" +yy94: yyaccept = 4; + yych = *(YYMARKER = ++YYCURSOR); + if(yych <= '/') goto yy91; + if(yych <= '9') goto yy97; + if(yych != '}') goto yy91; goto yy95; -yy95: if(yych <= '/') goto yy61; - if(yych <= '9') goto yy94; - if(yych != '}') goto yy61; +yy95: ++YYCURSOR; goto yy96; -yy96: ++YYCURSOR; - goto yy97; -yy97: -#line 181 "scanner.re" +yy96: +#line 191 "scanner.re" { yylval.extop.minsize = atoi((char *)tok+1); - yylval.extop.maxsize = MAX(yylval.extop.minsize,atoi(strchr((char *)tok, ',')+1)); + yylval.extop.maxsize = -1; RETURN(CLOSESIZE); } -#line 590 "scanner.cc" -yy98: yyaccept = 4; - yych = *(YYMARKER = ++YYCURSOR); - if(yych <= '/') goto yy88; - if(yych <= '9') goto yy94; - if(yych != '}') goto yy88; +#line 603 "scanner.cc" +yy97: ++YYCURSOR; + if(YYLIMIT == YYCURSOR) YYFILL(1); + yych = *YYCURSOR; + goto yy98; +yy98: if(yych <= '/') goto yy64; + if(yych <= '9') goto yy97; + if(yych != '}') goto yy64; goto yy99; yy99: ++YYCURSOR; goto yy100; yy100: -#line 174 "scanner.re" +#line 187 "scanner.re" +{ yylval.extop.minsize = atoi((char *)tok+1); + yylval.extop.maxsize = MAX(yylval.extop.minsize,atoi(strchr((char *)tok, ',')+1)); + RETURN(CLOSESIZE); } +#line 619 "scanner.cc" +yy101: yyaccept = 4; + yych = *(YYMARKER = ++YYCURSOR); + if(yych <= '/') goto yy91; + if(yych <= '9') goto yy97; + if(yych != '}') goto yy91; + goto yy102; +yy102: ++YYCURSOR; + goto yy103; +yy103: +#line 180 "scanner.re" { yylval.op = '*'; RETURN(CLOSE); } -#line 603 "scanner.cc" +#line 632 "scanner.cc" } -#line 210 "scanner.re" +#line 229 "scanner.re" code: -#line 610 "scanner.cc" +#line 639 "scanner.cc" { YYCTYPE yych; unsigned int yyaccept = 0; - goto yy101; + goto yy104; ++YYCURSOR; -yy101: +yy104: if((YYLIMIT - YYCURSOR) < 2) YYFILL(2); yych = *YYCURSOR; if(yych <= '&'){ if(yych <= 0x0A){ - if(yych <= 0x09) goto yy109; - goto yy107; + if(yych <= 0x09) goto yy112; + goto yy110; } else { - if(yych == '"') goto yy111; - goto yy109; + if(yych == '"') goto yy114; + goto yy112; } } else { if(yych <= '{'){ - if(yych <= '\'') goto yy112; - if(yych <= 'z') goto yy109; - goto yy105; + if(yych <= '\'') goto yy115; + if(yych <= 'z') goto yy112; + goto yy108; } else { - if(yych != '}') goto yy109; - goto yy103; + if(yych != '}') goto yy112; + goto yy106; } } -yy103: ++YYCURSOR; - goto yy104; -yy104: -#line 214 "scanner.re" +yy106: ++YYCURSOR; + goto yy107; +yy107: +#line 233 "scanner.re" { if(--depth == 0){ cur = cursor; yylval.token = new Token(token(), tline); return CODE; } goto code; } -#line 647 "scanner.cc" -yy105: ++YYCURSOR; - goto yy106; -yy106: -#line 220 "scanner.re" +#line 676 "scanner.cc" +yy108: ++YYCURSOR; + goto yy109; +yy109: +#line 239 "scanner.re" { ++depth; goto code; } -#line 654 "scanner.cc" -yy107: ++YYCURSOR; - goto yy108; -yy108: -#line 222 "scanner.re" +#line 683 "scanner.cc" +yy110: ++YYCURSOR; + goto yy111; +yy111: +#line 241 "scanner.re" { if(cursor == eof) fatal("missing '}'"); pos = cursor; cline++; goto code; } -#line 663 "scanner.cc" -yy109: ++YYCURSOR; - goto yy110; -yy110: -#line 226 "scanner.re" +#line 692 "scanner.cc" +yy112: ++YYCURSOR; + goto yy113; +yy113: +#line 245 "scanner.re" { goto code; } -#line 669 "scanner.cc" -yy111: yyaccept = 0; +#line 698 "scanner.cc" +yy114: yyaccept = 0; yych = *(YYMARKER = ++YYCURSOR); - if(yych == 0x0A) goto yy110; - goto yy118; -yy112: yyaccept = 0; + if(yych == 0x0A) goto yy113; + goto yy121; +yy115: yyaccept = 0; yych = *(YYMARKER = ++YYCURSOR); - if(yych == 0x0A) goto yy110; - goto yy114; -yy113: ++YYCURSOR; + if(yych == 0x0A) goto yy113; + goto yy117; +yy116: ++YYCURSOR; if(YYLIMIT == YYCURSOR) YYFILL(1); yych = *YYCURSOR; - goto yy114; -yy114: if(yych <= '&'){ - if(yych != 0x0A) goto yy113; - goto yy115; + goto yy117; +yy117: if(yych <= '&'){ + if(yych != 0x0A) goto yy116; + goto yy118; } else { - if(yych <= '\'') goto yy109; - if(yych == '\\') goto yy116; - goto yy113; + if(yych <= '\'') goto yy112; + if(yych == '\\') goto yy119; + goto yy116; } -yy115: YYCURSOR = YYMARKER; +yy118: YYCURSOR = YYMARKER; switch(yyaccept){ - case 0: goto yy110; + case 0: goto yy113; } -yy116: ++YYCURSOR; +yy119: ++YYCURSOR; if(YYLIMIT == YYCURSOR) YYFILL(1); yych = *YYCURSOR; - if(yych == 0x0A) goto yy115; - goto yy113; -yy117: ++YYCURSOR; + if(yych == 0x0A) goto yy118; + goto yy116; +yy120: ++YYCURSOR; if(YYLIMIT == YYCURSOR) YYFILL(1); yych = *YYCURSOR; - goto yy118; -yy118: if(yych <= '!'){ - if(yych == 0x0A) goto yy115; - goto yy117; + goto yy121; +yy121: if(yych <= '!'){ + if(yych == 0x0A) goto yy118; + goto yy120; } else { - if(yych <= '"') goto yy109; - if(yych != '\\') goto yy117; - goto yy119; + if(yych <= '"') goto yy112; + if(yych != '\\') goto yy120; + goto yy122; } -yy119: ++YYCURSOR; +yy122: ++YYCURSOR; if(YYLIMIT == YYCURSOR) YYFILL(1); yych = *YYCURSOR; - if(yych == 0x0A) goto yy115; - goto yy117; + if(yych == 0x0A) goto yy118; + goto yy120; } -#line 227 "scanner.re" +#line 246 "scanner.re" comment: -#line 722 "scanner.cc" +#line 751 "scanner.cc" { YYCTYPE yych; - goto yy120; + goto yy123; ++YYCURSOR; -yy120: +yy123: if((YYLIMIT - YYCURSOR) < 2) YYFILL(2); yych = *YYCURSOR; if(yych <= ')'){ - if(yych == 0x0A) goto yy125; - goto yy127; + if(yych == 0x0A) goto yy128; + goto yy130; } else { - if(yych <= '*') goto yy122; - if(yych == '/') goto yy124; - goto yy127; + if(yych <= '*') goto yy125; + if(yych == '/') goto yy127; + goto yy130; } -yy122: ++YYCURSOR; - if((yych = *YYCURSOR) == '/') goto yy130; - goto yy123; -yy123: -#line 241 "scanner.re" -{ goto comment; } -#line 744 "scanner.cc" -yy124: yych = *++YYCURSOR; - if(yych == '*') goto yy128; - goto yy123; yy125: ++YYCURSOR; + if((yych = *YYCURSOR) == '/') goto yy133; goto yy126; yy126: -#line 237 "scanner.re" -{ if(cursor == eof) RETURN(0); - tok = pos = cursor; cline++; - goto comment; - } -#line 756 "scanner.cc" +#line 260 "scanner.re" +{ goto comment; } +#line 773 "scanner.cc" yy127: yych = *++YYCURSOR; - goto yy123; + if(yych == '*') goto yy131; + goto yy126; yy128: ++YYCURSOR; goto yy129; yy129: -#line 235 "scanner.re" +#line 256 "scanner.re" +{ if(cursor == eof) RETURN(0); + tok = pos = cursor; cline++; + goto comment; + } +#line 785 "scanner.cc" +yy130: yych = *++YYCURSOR; + goto yy126; +yy131: ++YYCURSOR; + goto yy132; +yy132: +#line 254 "scanner.re" { ++depth; goto comment; } -#line 765 "scanner.cc" -yy130: ++YYCURSOR; - goto yy131; -yy131: -#line 231 "scanner.re" +#line 794 "scanner.cc" +yy133: ++YYCURSOR; + goto yy134; +yy134: +#line 250 "scanner.re" { if(--depth == 0) goto scan; else goto comment; } -#line 774 "scanner.cc" +#line 803 "scanner.cc" } -#line 242 "scanner.re" +#line 261 "scanner.re" } @@ -784,4 +813,3 @@ void Scanner::fatal(char *msg) const } } // end namespace re2c - diff --git a/dfa.cc b/dfa.cc index 7ba54f99..833d99c1 100644 --- a/dfa.cc +++ b/dfa.cc @@ -8,41 +8,38 @@ namespace re2c { -inline char octCh(uint c) +void prtChOrHex(std::ostream& o, uint c) { - return '0' + c % 8; -} - -inline char hexCh(uint c) -{ - const char * sHex = "0123456789ABCDEF"; - - return sHex[c & 0x0F]; -} - -void prtChOrHex(std::ostream& o, uchar c) -{ - uchar oc = talx[c]; + int oc = (int)(re2c::wFlag ? c : re2c::talx[c]); - if (isprint(oc)) + if ((oc < 256) && isprint(oc)) { o << '\''; - prtCh(o, c); + prtCh(o, oc); o << '\''; } + else if (re2c::wFlag) + { + o << "0x" + << hexCh(oc >> 12) + << hexCh(oc >> 8) + << hexCh(oc >> 4) + << hexCh(oc); + } else { - o << "0x" << hexCh(c >> 4) << hexCh(c); + o << "0x" + << hexCh(oc >> 4) + << hexCh(oc); } } -void prtCh(std::ostream &o, uchar c) +void prtCh(std::ostream &o, uint c) { - uchar oc = talx[c]; + int oc = (int)(re2c::wFlag ? c : re2c::talx[c]); switch (oc) { - case '\'': o << "\\'"; break; @@ -81,17 +78,31 @@ void prtCh(std::ostream &o, uchar c) default: - if (isprint(oc)) + if ((oc < 256) && isprint(oc)) + { o << (char) oc; + } + else if (re2c::wFlag) + { + o << "0x" + << hexCh(oc >> 12) + << hexCh(oc >> 8) + << hexCh(oc >> 4) + << hexCh(oc); + } else - o << '\\' << octCh(c / 64) << octCh(c / 8) << octCh(c); + { + o << '\\' << octCh(oc / 64) << octCh(oc / 8) << octCh(oc); + } } } void printSpan(std::ostream &o, uint lb, uint ub) { if (lb > ub) + { o << "*"; + } o << "["; @@ -125,7 +136,9 @@ std::ostream& operator<<(std::ostream &o, const State &s) o << "state " << s.label; if (s.rule) + { o << " accepts " << s.rule->accept; + } o << "\n"; @@ -134,7 +147,9 @@ std::ostream& operator<<(std::ostream &o, const State &s) uint lb = 0; for (uint i = 0; i < s.go.nSpans; ++i) + { lb = s.go.span[i].show(o, lb); + } return o; } @@ -252,9 +267,8 @@ DFA::DFA(Ins *ins, uint ni, uint lb, uint ub, Char *rep) { State *to = (State*) goTo[rep[j]].to; - while (++j < nc && goTo[rep[j]].to == to) + while (++j < nc && goTo[rep[j]].to == to) ; - ; span[s->go.nSpans].ub = lb + j; span[s->go.nSpans].to = to; diff --git a/dfa.h b/dfa.h index 032d80c8..0573240d 100644 --- a/dfa.h +++ b/dfa.h @@ -8,8 +8,8 @@ namespace re2c { -extern void prtCh(std::ostream&, uchar); -extern void prtChOrHex(std::ostream&, uchar); +extern void prtCh(std::ostream&, uint); +extern void prtChOrHex(std::ostream&, uint); extern void printSpan(std::ostream&, uint, uint); class DFA; diff --git a/globals.h b/globals.h index 65fc51da..488189c2 100644 --- a/globals.h +++ b/globals.h @@ -31,22 +31,39 @@ namespace re2c extern char *fileName; extern char *outputFileName; -extern bool sFlag; extern bool bFlag; extern bool dFlag; +extern bool eFlag; extern bool iFlag; +extern bool sFlag; +extern bool wFlag; + extern bool bUsedYYAccept; extern unsigned int oline; extern uint maxFill; -extern uchar asc2ebc[256]; -extern uchar ebc2asc[256]; +extern uint asc2ebc[256]; +extern uint ebc2asc[256]; -extern uchar *xlat, *talx; +extern uint *xlat, *talx; extern int vFillIndexes; extern label_list vUsedLabels; +extern uint nRealChars; + +inline char octCh(uint c) +{ + return '0' + c % 8; +} + +inline char hexCh(uint c) +{ + static const char * sHex = "0123456789ABCDEF"; + + return sHex[c & 0x0F]; +} + } // end namespace re2c #endif diff --git a/ins.h b/ins.h index 5700121b..14b11feb 100644 --- a/ins.h +++ b/ins.h @@ -7,8 +7,8 @@ namespace re2c { -const uint nChars = 256; -typedef uchar Char; +const uint nChars = (1<<16); +typedef unsigned short Char; const uint CHAR = 0; const uint GOTO = 1; diff --git a/main.cc b/main.cc index 45c9d60b..1714509d 100644 --- a/main.cc +++ b/main.cc @@ -18,13 +18,16 @@ namespace re2c char *fileName = 0; char *outputFileName = 0; -bool sFlag = false; bool bFlag = false; -bool dFlag = false; +bool dFlag = false; +bool eFlag = false; bool iFlag = false; +bool sFlag = false; +bool wFlag = false; bool bUsedYYAccept = false; unsigned int oline = 1; uint maxFill = 1; +uint nRealChars = 256; int vFillIndexes = -1; label_list vUsedLabels; @@ -47,6 +50,7 @@ static const mbo_opt_struct OPTIONS[] = mbo_opt_struct('s', 0, "nested-ifs"), mbo_opt_struct('v', 0, "version"), mbo_opt_struct('V', 0, "vernum"), + mbo_opt_struct('w', 0, "wide-chars"), mbo_opt_struct('-', 0, NULL) /* end of args */ }; @@ -54,31 +58,34 @@ static void usage() { cerr << "usage: re2c [-esbvhd] file\n" "\n" - "-? -h --help Display this info.\n" + "-? -h --help Display this info.\n" "\n" - "-b --bit-vectors Implies -s. Use bit vectors as well in the attempt to\n" + "-b --bit-vectors Implies -s. Use bit vectors as well in the attempt to\n" " coax better code out of the compiler. Most useful for\n" " specifications with more than a few keywords (e.g. for\n" " most programming languages).\n" "\n" - "-e --ecb Cross-compile from an ASCII platform to\n" + "-d --debug-output Creates a parser that dumps information during\n" + " about the current position and in which state the\n" + " parser is.\n" + "\n" + "-e --ecb Cross-compile from an ASCII platform to\n" " an EBCDIC one.\n" "\n" - "-s --nested-ifs Generate nested ifs for some switches. Many compilers\n" - " need this assist to generate better code.\n" + "-f --storable-state Generate a scanner with support for storable state\n" "\n" - "-f --storable-state Generate a scanner with support for storable state\n" + "-i --no-debug-info Do not generate '#line' info (usefull for versioning).\n" "\n" - "-o --output=output Specify the output file instead of stdout\n" + "-o --output=output Specify the output file instead of stdout\n" + " This cannot be used together with switches -b or -e.\n" "\n" - "-d --debug-output Creates a parser that dumps information during\n" - " about the current position and in which state the\n" - " parser is.\n" + "-s --nested-ifs Generate nested ifs for some switches. Many compilers\n" + " need this assist to generate better code.\n" "\n" - "-i --no-debug-info Do not generate '#line' info (usefull for versioning).\n" + "-v --version Show version information.\n" + "-V --vernum Show version as one number.\n" "\n" - "-v --version Show version information.\n" - "-V --vernum Show version as one number.\n" + "-w --wide-chars Create a parser that supports wide chars (UCS-2).\n" ; } @@ -110,6 +117,7 @@ int main(int argc, char *argv[]) case 'e': xlat = asc2ebc; talx = ebc2asc; + eFlag = true; break; case 's': @@ -135,7 +143,7 @@ int main(int argc, char *argv[]) case 'v': cout << "re2c " << PACKAGE_VERSION << "\n"; return 2; - + case 'V': { int v1, v2, v3; char version[16]; @@ -145,17 +153,25 @@ int main(int argc, char *argv[]) return 2; } + case 'w': + nRealChars = (1<<16); + wFlag = true; + break; + case 'h': - case '?': - default: usage(); return 2; } } - if (argc == opt_ind + 1) + if (wFlag && (bFlag || eFlag)) + { + usage(); + return 2; + } + else if (argc == opt_ind + 1) { fileName = argv[opt_ind]; } diff --git a/re2c.1.in b/re2c.1.in index b83fe992..572cce34 100644 --- a/re2c.1.in +++ b/re2c.1.in @@ -7,6 +7,9 @@ .ds rx regular expression .ds lx \fIl\fP-expression \"$Log$ +\"Revision 1.27 2005/12/28 18:33:37 helly +\"- Added experimental unicode support +\" \"Revision 1.26 2005/12/18 18:47:06 helly \"- Apply #1362806 Addition to man on flag -f \" @@ -185,16 +188,9 @@ will receive. .SH OPTIONS \*(re provides the following options: .TP -\fB-e\fP -Cross-compile from an ASCII platform to an EBCDIC one. -.TP -\fB-f\fP -Generate a scanner with support for storable state. -For details see below at \fBSCANNER WITH STORABLE STATES\fP. -.TP -\fB-s\fP -Generate nested \fCif\fPs for some \fCswitch\fPes. Many compilers need this -assist to generate better code. +\fB-?\fP +\fB-h\fP +Invoke a short help. .TP \fB-b\fP Implies \fB-s\fP. Use bit vectors as well in the attempt to coax better @@ -209,14 +205,23 @@ parser issues and states. If you use this switch you need to define a macro \fIvoid YYDEBUG(int state, char current)\fP. The first parameter receives the state or -1 and the second parameter receives the input at the current cursor. .TP +\fB-e\fP +Cross-compile from an ASCII platform to an EBCDIC one. +.TP +\fB-f\fP +Generate a scanner with support for storable state. +For details see below at \fBSCANNER WITH STORABLE STATES\fP. +.TP \fB-i\fP Do not output #line information. This is usefull when you want use a CMS tool with the re2c output which you might want if you do not require your users to have re2c themselves when building from your source. +\fB-o output\fP +Specify the output file. .TP -\fB-h\fP -\fB-?\fP -Invoke a short help. +\fB-s\fP +Generate nested \fCif\fPs for some \fCswitch\fPes. Many compilers need this +assist to generate better code. .TP \fB-v\fP Show version information. @@ -224,9 +229,9 @@ Show version information. \fB-V\fP Show the version as a number XXYYZZ. .TP -\fB-o output\fP -Specify the output file. - +\fB-w\fP +Create a parser that supports wide chars (UCS-2). This cannot be used together +with switches \fB-b\fP or \fB-e\fP. .SH "INTERFACE CODE" Unlike other scanner generators, \*(re does not generate complete scanners: the user must supply some interface code. diff --git a/scanner.h b/scanner.h index 66a6200c..25e73679 100644 --- a/scanner.h +++ b/scanner.h @@ -5,6 +5,7 @@ #include #include "token.h" #include "re.h" +#include "globals.h" namespace re2c { @@ -27,10 +28,10 @@ public: int echo(std::ostream&); int scan(); void fatal(char*) const; - SubStr token(); - uint line(); - - uchar unescape(SubStr &s) const; + SubStr token() const; + uint line() const; + uint xlat(uint c) const; + uint unescape(SubStr &s) const; Range * getRange(SubStr &s) const; RegExp * matchChar(uint c) const; RegExp * strToRE(SubStr s) const; @@ -40,16 +41,21 @@ public: RegExp * mkDot() const; }; -inline SubStr Scanner::token() +inline SubStr Scanner::token() const { return SubStr(tok, cur - tok); } -inline uint Scanner::line() +inline uint Scanner::line() const { return cline; } +inline uint Scanner::xlat(uint c) const +{ + return re2c::wFlag ? c : re2c::xlat[c]; +} + } // end namespace re2c #endif diff --git a/scanner.re b/scanner.re index 242e1fb7..19ddd451 100644 --- a/scanner.re +++ b/scanner.re @@ -28,39 +28,45 @@ namespace re2c Scanner::Scanner(std::istream& i) : in(i), bot(NULL), tok(NULL), ptr(NULL), cur(NULL), pos(NULL), lim(NULL), - top(NULL), eof(NULL), tchar(0), tline(0), cline(1) { + top(NULL), eof(NULL), tchar(0), tline(0), cline(1) +{ ; } -char *Scanner::fill(char *cursor){ - if(!eof){ - uint cnt = tok - bot; - if(cnt){ - memcpy(bot, tok, lim - tok); - tok = bot; - ptr -= cnt; - cursor -= cnt; - pos -= cnt; - lim -= cnt; - } - if((top - lim) < BSIZE){ - char *buf = new char[(lim - bot) + BSIZE]; - memcpy(buf, tok, lim - tok); - tok = buf; - ptr = &buf[ptr - bot]; - cursor = &buf[cursor - bot]; - pos = &buf[pos - bot]; - lim = &buf[lim - bot]; - top = &lim[BSIZE]; - delete [] bot; - bot = buf; - } - if((cnt = in.rdbuf()->sgetn((char*) lim, BSIZE)) != BSIZE){ - eof = &lim[cnt]; *eof++ = '\0'; +char *Scanner::fill(char *cursor) +{ + if(!eof) + { + uint cnt = tok - bot; + if(cnt) + { + memcpy(bot, tok, lim - tok); + tok = bot; + ptr -= cnt; + cursor -= cnt; + pos -= cnt; + lim -= cnt; + } + if((top - lim) < BSIZE) + { + char *buf = new char[(lim - bot) + BSIZE]; + memcpy(buf, tok, lim - tok); + tok = buf; + ptr = &buf[ptr - bot]; + cursor = &buf[cursor - bot]; + pos = &buf[pos - bot]; + lim = &buf[lim - bot]; + top = &lim[BSIZE]; + delete [] bot; + bot = buf; + } + if((cnt = in.rdbuf()->sgetn((char*) lim, BSIZE)) != BSIZE) + { + eof = &lim[cnt]; *eof++ = '\0'; + } + lim += cnt; } - lim += cnt; - } - return cursor; + return cursor; } /*!re2c @@ -199,12 +205,25 @@ scan: [ \t]+ { goto scan; } + "\r\n" { if(cursor == eof) RETURN(0); + pos = cursor; cline++; + goto scan; + } "\n" { if(cursor == eof) RETURN(0); pos = cursor; cline++; goto scan; } - any { std::cerr << "unexpected character: " << *tok << std::endl; + any { std::cerr << "line " << tline << ", column " << (tchar + 1) + << ": unexpected character: "; + if (isprint(*tok)) + { + std::cerr << *tok << std::endl; + } + else + { + std::cerr << "0x" << hexCh(*tok >> 4) << hexCh(*tok) << std::endl; + } goto scan; } */ @@ -250,4 +269,3 @@ void Scanner::fatal(char *msg) const } } // end namespace re2c - diff --git a/translate.cc b/translate.cc index c8b73a42..a8be85ed 100644 --- a/translate.cc +++ b/translate.cc @@ -4,7 +4,7 @@ namespace re2c { -uchar asc2asc[256] = +uint asc2asc[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, @@ -24,10 +24,10 @@ uchar asc2asc[256] = 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff }; -uchar *xlat = asc2asc; -uchar *talx = asc2asc; +uint *xlat = asc2asc; +uint *talx = asc2asc; -uchar asc2ebc[256] = +uint asc2ebc[256] = { /* Based on ISO 8859/1 and Code Page 37 */ 0x00, 0x01, 0x02, 0x03, 0x37, 0x2d, 0x2e, 0x2f, 0x16, 0x05, 0x25, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x3c, 0x3d, 0x32, 0x26, 0x18, 0x19, 0x3f, 0x27, 0x1c, 0x1d, 0x1e, 0x1f, @@ -47,7 +47,7 @@ uchar asc2ebc[256] = 0x8c, 0x49, 0xcd, 0xce, 0xcb, 0xcf, 0xcc, 0xe1, 0x70, 0xdd, 0xde, 0xdb, 0xdc, 0x8d, 0xae, 0xdf }; -uchar ebc2asc[256] = +uint ebc2asc[256] = { /* Based on ISO 8859/1 and Code Page 37 */ 0x00, 0x01, 0x02, 0x03, 0x9c, 0x09, 0x86, 0x7f, 0x97, 0x8d, 0x8e, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x9d, 0x85, 0x08, 0x87, 0x18, 0x19, 0x92, 0x8f, 0x1c, 0x1d, 0x1e, 0x1f, @@ -68,4 +68,3 @@ uchar ebc2asc[256] = }; } // end namespace re2c -