granicus.if.org Git - re2c/commitdiff
- Added experimental unicode support
author helly <helly@642ea486-5414-0410-9d7f-a0204ed87703>
Wed, 28 Dec 2005 18:33:37 +0000 (18:33 +0000)
committer helly <helly@642ea486-5414-0410-9d7f-a0204ed87703>
Wed, 28 Dec 2005 18:33:37 +0000 (18:33 +0000)
12 files changed:
CHANGELOG
actions.cc
bootstrap/scanner.cc
dfa.cc
dfa.h
globals.h
ins.h
main.cc
re2c.1.in
scanner.h
scanner.re
translate.cc

index 85cc98a6f1b0d4acc04eace44b0f1df70481b984..44cf0e9dd3903a37d19828de829dcbcf50847c91 100644 (file)
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,5 +1,8 @@
 Version 0.9.13 (????-??-??)
 ---------------------------
+- Added support for DOS line endings.
+- Added experimental unicode support.
+- Applied #1307467 Unicode patch for 0.9.7.
 
 Version 0.9.12 (2005-12-28)
 ---------------------------
index bc900add0861ecc6c4fa321ed3e0cf479ae815ab..f697e1d77d0bf1c77fc0c102963a365575bd08a0 100644 (file)
@@ -513,40 +513,28 @@ void CloseVOp::split(CharSet &s)
 
 RegExp *expr(Scanner &);
 
-uchar Scanner::unescape(SubStr &s) const
+uint Scanner::unescape(SubStr &s) const
 {
        s.len--;
-       uchar c;
+       uint c;
 
        if ((c = *s.str++) != '\\' || s.len == 0)
-               return xlat[c];
+       {
+               return xlat(c);
+       }
 
        s.len--;
 
        switch (c = *s.str++)
        {
+               case 'n': return xlat('\n');
+               case 't': return xlat('\t');
+               case 'v': return xlat('\v');
+               case 'b': return xlat('\b');
+               case 'r': return xlat('\r');
+               case 'f': return xlat('\f');
+               case 'a': return xlat('\a');
 
-               case 'n':
-               return xlat['\n'];
-
-               case 't':
-               return xlat['\t'];
-
-               case 'v':
-               return xlat['\v'];
-
-               case 'b':
-               return xlat['\b'];
-
-               case 'r':
-               return xlat['\r'];
-
-               case 'f':
-               return xlat['\f'];
-
-               case 'a':
-               return xlat['\a'];
-               
                case 'x':
                {
                        static const char * hex = "0123456789abcdef";
@@ -560,25 +548,42 @@ uchar Scanner::unescape(SubStr &s) const
                        s.len -= 2;
                        s.str += 2;
                        
-                       uchar v = (uchar)((p1 - hex) << 4) + (uchar)(p2 - hex);
+                       uint v = (uint)((p1 - hex) << 4) 
+                              + (uint)((p2 - hex));
 
                        return v;
                }
 
-               case '0':
+               case 'X':
+               {
+                       static const char * hex = "0123456789abcdef";
+                       char *p1, *p2, *p3, *p4;
 
-               case '1':
+                       if (s.len < 4 || !(p1 = strchr(hex, tolower(s.str[0]))) 
+                                     || !(p2 = strchr(hex, tolower(s.str[1])))
+                                     || !(p3 = strchr(hex, tolower(s.str[2])))
+                                     || !(p4 = strchr(hex, tolower(s.str[3]))))
+                       {
+                               fatal("Illegal hexadecimal character code");
+                       }
+                       s.len -= 4;
+                       s.str += 4;
+                       
+                       uint v = (uint)((p1 - hex) << 12) 
+                              + (uint)((p2 - hex) <<  8)
+                              + (uint)((p3 - hex) <<  4)
+                              + (uint)((p4 - hex));
 
-               case '2':
+                       return v;
+               }
 
+               case '0':
+               case '1':
+               case '2':
                case '3':
-
                case '4':
-
                case '5':
-
                case '6':
-
                case '7':
                {
                        static const char * oct = "01234567";
@@ -593,13 +598,13 @@ uchar Scanner::unescape(SubStr &s) const
                        s.len -= 2;
                        s.str += 2;
                        
-                       uchar v = (uchar)((p0 - oct) << 6) + (uchar)((p1 - oct) << 3) + (uchar)(p2 - oct);
+                       uint v = (uint)((p0 - oct) << 6) + (uint)((p1 - oct) << 3) + (uint)(p2 - oct);
 
                        return v;
                }
 
                default:
-               return xlat[c];
+               return xlat(c);
        }
 }
 
@@ -624,18 +629,18 @@ Range * Scanner::getRange(SubStr &s) const
                        ub = tmp;
                }
                
-               xlb = xlat[lb];
-               xub = xlat[ub];
+               xlb = xlat(lb);
+               xub = xlat(ub);
                
                for(c = lb; c <= ub; c++)
                {
-                       if (!(xlb <= xlat[c] && xlat[c] <= ub))
+                       if (!(xlb <= xlat(c) && xlat(c) <= ub))
                        {
                                /* range doesn't work */
                                Range * r = new Range(xlb, xlb + 1);
                                for (c = lb + 1; c <= ub; c++)
                                {
-                                       r = doUnion(r, new Range(xlat[c], xlat[c] + 1));
+                                       r = doUnion(r, new Range(xlat(c), xlat(c) + 1));
                                }
                                return r;
                        }
@@ -677,14 +682,14 @@ RegExp * Scanner::strToCaseInsensitiveRE(SubStr s) const
        if (s.len == 0)
                return new NullOp;
 
-       uchar c = unescape(s);
+       uint c = unescape(s);
 
        RegExp *re, *reL, *reU;
 
        if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
        {
-               reL = matchChar(xlat[tolower(c)]);
-               reU = matchChar(xlat[toupper(c)]);
+               reL = matchChar(xlat(tolower(c)));
+               reU = matchChar(xlat(toupper(c)));
                re = mkAlt(reL, reU);
        }
        else
@@ -694,12 +699,12 @@ RegExp * Scanner::strToCaseInsensitiveRE(SubStr s) const
 
        while (s.len > 0)
        {
-               uchar c = unescape(s);
+               uint c = unescape(s);
 
                if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
                {
-                       reL = matchChar(xlat[tolower(c)]);
-                       reU = matchChar(xlat[toupper(c)]);
+                       reL = matchChar(xlat(tolower(c)));
+                       reU = matchChar(xlat(toupper(c)));
                        re = new CatOp(re, mkAlt(reL, reU));
                }
                else
@@ -751,7 +756,7 @@ RegExp * Scanner::invToRE(SubStr s) const
 RegExp * Scanner::mkDot() const
 {
        RegExp * any = ranToRE(SubStr("[\\000-\\377]"));
-       RegExp * ran = matchChar(xlat['\n']);
+       RegExp * ran = matchChar(xlat('\n'));
        RegExp * inv = mkDiff(any, ran);
        
        delete ran;
@@ -843,7 +848,7 @@ void genCode(std::ostream& o, RegExp *re)
        uint j;
        memset(&cs, 0, sizeof(cs));
 
-       for (j = 0; j < nChars; ++j)
+       for (j = 0; j < nRealChars; ++j)
        {
                cs.rep[j] = &cs.ptn[0];
                cs.ptn[j].nxt = &cs.ptn[j + 1];
@@ -863,7 +868,7 @@ void genCode(std::ostream& o, RegExp *re)
        */
        Char rep[nChars];
 
-       for (j = 0; j < nChars; ++j)
+       for (j = 0; j < nRealChars; ++j)
        {
                if (!cs.rep[j]->nxt)
                        cs.rep[j]->nxt = &cs.ptn[j];
@@ -895,7 +900,7 @@ void genCode(std::ostream& o, RegExp *re)
                }
        }
 
-       DFA *dfa = new DFA(ins, re->size, 0, 256, rep);
+       DFA *dfa = new DFA(ins, re->size, 0, nRealChars, rep);
        dfa->emit(o);
        delete dfa;
        delete [] ins;
index 1ef13067b20dcbbb29812c679361510633ffcc5e..4c6497df6ea3f052337c1d28f94d5515f6355ebe 100644 (file)
@@ -1,4 +1,4 @@
-/* Generated by re2c 0.9.12.dev on Wed Dec 28 00:53:17 2005 */
+/* Generated by re2c 0.9.13.dev on Wed Dec 28 18:30:39 2005 */
 #line 1 "scanner.re"
 /* $Id$ */
 #include <stdlib.h>
@@ -30,42 +30,48 @@ namespace re2c
 
 Scanner::Scanner(std::istream& i) : in(i),
        bot(NULL), tok(NULL), ptr(NULL), cur(NULL), pos(NULL), lim(NULL),
-       top(NULL), eof(NULL), tchar(0), tline(0), cline(1) {
+       top(NULL), eof(NULL), tchar(0), tline(0), cline(1)
+{
     ;
 }
 
-char *Scanner::fill(char *cursor){
-    if(!eof){
-       uint cnt = tok - bot;
-       if(cnt){
-           memcpy(bot, tok, lim - tok);
-           tok = bot;
-           ptr -= cnt;
-           cursor -= cnt;
-           pos -= cnt;
-           lim -= cnt;
-       }
-       if((top - lim) < BSIZE){
-           char *buf = new char[(lim - bot) + BSIZE];
-           memcpy(buf, tok, lim - tok);
-           tok = buf;
-           ptr = &buf[ptr - bot];
-           cursor = &buf[cursor - bot];
-           pos = &buf[pos - bot];
-           lim = &buf[lim - bot];
-           top = &lim[BSIZE];
-           delete [] bot;
-           bot = buf;
-       }
-       if((cnt = in.rdbuf()->sgetn((char*) lim, BSIZE)) != BSIZE){
-           eof = &lim[cnt]; *eof++ = '\0';
+char *Scanner::fill(char *cursor)
+{
+       if(!eof)
+       {
+               uint cnt = tok - bot;
+               if(cnt)
+               {
+                       memcpy(bot, tok, lim - tok);
+                       tok = bot;
+                       ptr -= cnt;
+                       cursor -= cnt;
+                       pos -= cnt;
+                       lim -= cnt;
+               }
+               if((top - lim) < BSIZE)
+               {
+                       char *buf = new char[(lim - bot) + BSIZE];
+                       memcpy(buf, tok, lim - tok);
+                       tok = buf;
+                       ptr = &buf[ptr - bot];
+                       cursor = &buf[cursor - bot];
+                       pos = &buf[pos - bot];
+                       lim = &buf[lim - bot];
+                       top = &lim[BSIZE];
+                       delete [] bot;
+                       bot = buf;
+               }
+               if((cnt = in.rdbuf()->sgetn((char*) lim, BSIZE)) != BSIZE)
+               {
+                       eof = &lim[cnt]; *eof++ = '\0';
+               }
+               lim += cnt;
        }
-       lim += cnt;
-    }
-    return cursor;
+       return cursor;
 }
 
-#line 77 "scanner.re"
+#line 83 "scanner.re"
 
 
 int Scanner::echo(std::ostream &out){
@@ -81,7 +87,7 @@ int Scanner::echo(std::ostream &out){
     tok = cursor;
 echo:
 
-#line 85 "scanner.cc"
+#line 91 "scanner.cc"
 {
        YYCTYPE yych;
        unsigned int yyaccept = 0;
@@ -104,41 +110,41 @@ yy2:      yyaccept = 0;
        if(yych == '*') goto yy12;
        goto yy3;
 yy3:
-#line 123 "scanner.re"
+#line 129 "scanner.re"
 {
                                        goto echo;
                                }
-#line 112 "scanner.cc"
+#line 118 "scanner.cc"
 yy4:   yych = *++YYCURSOR;
        if(yych == '/') goto yy10;
        goto yy3;
 yy5:   ++YYCURSOR;
        goto yy6;
 yy6:
-#line 112 "scanner.re"
+#line 118 "scanner.re"
 {
                                        out.write((const char*)(tok), (const char*)(cursor) - (const char*)(tok));
                                        tok = pos = cursor; cline++; oline++;
                                        goto echo;
                                }
-#line 125 "scanner.cc"
+#line 131 "scanner.cc"
 yy7:   ++YYCURSOR;
        goto yy8;
 yy8:
-#line 117 "scanner.re"
+#line 123 "scanner.re"
 {
                                        out.write((const char*)(tok), (const char*)(cursor) - (const char*)(tok) - 1); // -1 so we don't write out the \0
                                        if(cursor == eof) {
                                                RETURN(0);
                                        }
                                }
-#line 136 "scanner.cc"
+#line 142 "scanner.cc"
 yy9:   yych = *++YYCURSOR;
        goto yy3;
 yy10:  ++YYCURSOR;
        goto yy11;
 yy11:
-#line 103 "scanner.re"
+#line 109 "scanner.re"
 {
                                        if (ignore_eoc) {
                                                ignore_eoc = false;
@@ -148,7 +154,7 @@ yy11:
                                        tok = pos = cursor;
                                        goto echo;
                                }
-#line 152 "scanner.cc"
+#line 158 "scanner.cc"
 yy12:  yych = *++YYCURSOR;
        if(yych == '!') goto yy14;
        goto yy13;
@@ -175,13 +181,13 @@ yy18:     yych = *++YYCURSOR;
 yy19:  ++YYCURSOR;
        goto yy20;
 yy20:
-#line 92 "scanner.re"
+#line 98 "scanner.re"
 { 
                                        out.write((const char*)(tok), (const char*)(&cursor[-7]) - (const char*)(tok));
                                        tok = cursor;
                                        RETURN(1);
                                }
-#line 185 "scanner.cc"
+#line 191 "scanner.cc"
 yy21:  yych = *++YYCURSOR;
        if(yych != 'x') goto yy13;
        goto yy22;
@@ -203,16 +209,16 @@ yy26:     yych = *++YYCURSOR;
 yy27:  ++YYCURSOR;
        goto yy28;
 yy28:
-#line 97 "scanner.re"
+#line 103 "scanner.re"
 {
                                        out << "#define YYMAXFILL " << maxFill << std::endl;
                                        tok = pos = cursor;
                                        ignore_eoc = true;
                                        goto echo;
                                }
-#line 214 "scanner.cc"
+#line 220 "scanner.cc"
 }
-#line 126 "scanner.re"
+#line 132 "scanner.re"
 
 }
 
@@ -226,7 +232,7 @@ scan:
     tline = cline;
     tok = cursor;
 
-#line 230 "scanner.cc"
+#line 236 "scanner.cc"
 {
        YYCTYPE yych;
        unsigned int yyaccept = 0;
@@ -235,544 +241,567 @@ scan:
 yy29:
        if((YYLIMIT - YYCURSOR) < 4) YYFILL(4);
        yych = *YYCURSOR;
-       if(yych <= '/'){
-               if(yych <= '"'){
-                       if(yych <= 0x0A){
-                               if(yych <= 0x08)        goto yy53;
+       if(yych <= '.'){
+               if(yych <= '!'){
+                       if(yych <= 0x0C){
+                               if(yych <= 0x08)        goto yy55;
                                if(yych <= 0x09)        goto yy49;
-                               goto yy51;
+                               if(yych <= 0x0A)        goto yy53;
+                               goto yy55;
                        } else {
+                               if(yych <= 0x0D)        goto yy51;
                                if(yych == ' ') goto yy49;
-                               if(yych <= '!') goto yy53;
-                               goto yy37;
+                               goto yy55;
                        }
                } else {
-                       if(yych <= '*'){
-                               if(yych <= '&') goto yy53;
+                       if(yych <= ')'){
+                               if(yych <= '"') goto yy37;
+                               if(yych <= '&') goto yy55;
                                if(yych <= '\'')        goto yy39;
-                               if(yych <= ')') goto yy43;
-                               goto yy35;
+                               goto yy43;
                        } else {
+                               if(yych <= '*') goto yy35;
                                if(yych <= '+') goto yy44;
-                               if(yych <= '-') goto yy53;
-                               if(yych <= '.') goto yy47;
-                               goto yy33;
+                               if(yych <= '-') goto yy55;
+                               goto yy47;
                        }
                }
        } else {
                if(yych <= '@'){
                        if(yych <= '<'){
+                               if(yych <= '/') goto yy33;
                                if(yych == ';') goto yy43;
-                               goto yy53;
+                               goto yy55;
                        } else {
                                if(yych <= '=') goto yy43;
                                if(yych == '?') goto yy44;
-                               goto yy53;
+                               goto yy55;
                        }
                } else {
                        if(yych <= '`'){
                                if(yych <= 'Z') goto yy45;
                                if(yych <= '[') goto yy41;
                                if(yych <= '\\')        goto yy43;
-                               goto yy53;
+                               goto yy55;
                        } else {
                                if(yych <= 'z') goto yy45;
                                if(yych <= '{') goto yy31;
                                if(yych <= '|') goto yy43;
-                               goto yy53;
+                               goto yy55;
                        }
                }
        }
 yy31:  yyaccept = 0;
        yych = *(YYMARKER = ++YYCURSOR);
        if(yych <= '/'){
-               if(yych == ',') goto yy87;
+               if(yych == ',') goto yy90;
                goto yy32;
        } else {
-               if(yych <= '0') goto yy84;
-               if(yych <= '9') goto yy85;
+               if(yych <= '0') goto yy87;
+               if(yych <= '9') goto yy88;
                goto yy32;
        }
 yy32:
-#line 139 "scanner.re"
+#line 145 "scanner.re"
 { depth = 1;
                                  goto code;
                                }
-#line 302 "scanner.cc"
+#line 310 "scanner.cc"
 yy33:  ++YYCURSOR;
-       if((yych = *YYCURSOR) == '*')   goto yy82;
+       if((yych = *YYCURSOR) == '*')   goto yy85;
        goto yy34;
 yy34:
-#line 169 "scanner.re"
+#line 175 "scanner.re"
 { RETURN(*tok); }
-#line 309 "scanner.cc"
+#line 317 "scanner.cc"
 yy35:  ++YYCURSOR;
-       if((yych = *YYCURSOR) == '/')   goto yy80;
+       if((yych = *YYCURSOR) == '/')   goto yy83;
        goto yy36;
 yy36:
-#line 171 "scanner.re"
+#line 177 "scanner.re"
 { yylval.op = *tok;
                                  RETURN(CLOSE); }
-#line 317 "scanner.cc"
+#line 325 "scanner.cc"
 yy37:  yyaccept = 1;
        yych = *(YYMARKER = ++YYCURSOR);
-       if(yych != 0x0A)        goto yy76;
+       if(yych != 0x0A)        goto yy79;
        goto yy38;
 yy38:
-#line 156 "scanner.re"
+#line 162 "scanner.re"
 { fatal("unterminated string constant (missing \")"); }
-#line 325 "scanner.cc"
+#line 333 "scanner.cc"
 yy39:  yyaccept = 2;
        yych = *(YYMARKER = ++YYCURSOR);
-       if(yych != 0x0A)        goto yy71;
+       if(yych != 0x0A)        goto yy74;
        goto yy40;
 yy40:
-#line 157 "scanner.re"
+#line 163 "scanner.re"
 { fatal("unterminated string constant (missing ')"); }
-#line 333 "scanner.cc"
+#line 341 "scanner.cc"
 yy41:  yyaccept = 3;
        yych = *(YYMARKER = ++YYCURSOR);
        if(yych == 0x0A)        goto yy42;
-       if(yych == '^') goto yy62;
-       goto yy60;
+       if(yych == '^') goto yy65;
+       goto yy63;
 yy42:
-#line 167 "scanner.re"
+#line 173 "scanner.re"
 { fatal("unterminated range (missing ])"); }
-#line 342 "scanner.cc"
+#line 350 "scanner.cc"
 yy43:  yych = *++YYCURSOR;
        goto yy34;
 yy44:  yych = *++YYCURSOR;
        goto yy36;
 yy45:  ++YYCURSOR;
        yych = *YYCURSOR;
-       goto yy58;
+       goto yy61;
 yy46:
-#line 191 "scanner.re"
+#line 197 "scanner.re"
 { cur = cursor;
                                  yylval.symbol = Symbol::find(token());
                                  return ID; }
-#line 355 "scanner.cc"
+#line 363 "scanner.cc"
 yy47:  ++YYCURSOR;
        goto yy48;
 yy48:
-#line 195 "scanner.re"
+#line 201 "scanner.re"
 { cur = cursor;
                                  yylval.regexp = mkDot();
                                  return RANGE;
                                }
-#line 364 "scanner.cc"
+#line 372 "scanner.cc"
 yy49:  ++YYCURSOR;
        yych = *YYCURSOR;
-       goto yy56;
+       goto yy59;
 yy50:
-#line 200 "scanner.re"
+#line 206 "scanner.re"
 { goto scan; }
-#line 371 "scanner.cc"
+#line 379 "scanner.cc"
 yy51:  ++YYCURSOR;
+       if((yych = *YYCURSOR) == 0x0A)  goto yy56;
        goto yy52;
 yy52:
-#line 202 "scanner.re"
-{ if(cursor == eof) RETURN(0);
-                                 pos = cursor; cline++;
+#line 217 "scanner.re"
+{ std::cerr << "line " << tline << ", column " << (tchar + 1) 
+                                               << ": unexpected character: ";
+                                 if (isprint(*tok))
+                                 {
+                                       std::cerr << *tok << std::endl;
+                                 }
+                                 else
+                                 {
+                                       std::cerr << "0x" << hexCh(*tok >> 4) << hexCh(*tok) << std::endl;
+                                 }
                                  goto scan;
-                               }
-#line 380 "scanner.cc"
+                               }
+#line 397 "scanner.cc"
 yy53:  ++YYCURSOR;
        goto yy54;
 yy54:
-#line 207 "scanner.re"
-{ std::cerr << "unexpected character: " << *tok << std::endl;
+#line 212 "scanner.re"
+{ if(cursor == eof) RETURN(0);
+                                 pos = cursor; cline++;
                                  goto scan;
-                               }
-#line 388 "scanner.cc"
-yy55:  ++YYCURSOR;
+                               }
+#line 406 "scanner.cc"
+yy55:  yych = *++YYCURSOR;
+       goto yy52;
+yy56:  ++YYCURSOR;
+       goto yy57;
+yy57:
+#line 208 "scanner.re"
+{ if(cursor == eof) RETURN(0);
+                                 pos = cursor; cline++;
+                                 goto scan;
+                               }
+#line 417 "scanner.cc"
+yy58:  ++YYCURSOR;
        if(YYLIMIT == YYCURSOR) YYFILL(1);
        yych = *YYCURSOR;
-       goto yy56;
-yy56:  if(yych == 0x09)        goto yy55;
-       if(yych == ' ') goto yy55;
+       goto yy59;
+yy59:  if(yych == 0x09)        goto yy58;
+       if(yych == ' ') goto yy58;
        goto yy50;
-yy57:  ++YYCURSOR;
+yy60:  ++YYCURSOR;
        if(YYLIMIT == YYCURSOR) YYFILL(1);
        yych = *YYCURSOR;
-       goto yy58;
-yy58:  if(yych <= '@'){
+       goto yy61;
+yy61:  if(yych <= '@'){
                if(yych <= '/') goto yy46;
-               if(yych <= '9') goto yy57;
+               if(yych <= '9') goto yy60;
                goto yy46;
        } else {
-               if(yych <= 'Z') goto yy57;
+               if(yych <= 'Z') goto yy60;
                if(yych <= '`') goto yy46;
-               if(yych <= 'z') goto yy57;
+               if(yych <= 'z') goto yy60;
                goto yy46;
        }
-yy59:  ++YYCURSOR;
+yy62:  ++YYCURSOR;
        if(YYLIMIT == YYCURSOR) YYFILL(1);
        yych = *YYCURSOR;
-       goto yy60;
-yy60:  if(yych <= '['){
-               if(yych != 0x0A)        goto yy59;
-               goto yy61;
+       goto yy63;
+yy63:  if(yych <= '['){
+               if(yych != 0x0A)        goto yy62;
+               goto yy64;
        } else {
-               if(yych <= '\\')        goto yy64;
-               if(yych <= ']') goto yy65;
-               goto yy59;
+               if(yych <= '\\')        goto yy67;
+               if(yych <= ']') goto yy68;
+               goto yy62;
        }
-yy61:  YYCURSOR = YYMARKER;
+yy64:  YYCURSOR = YYMARKER;
        switch(yyaccept){
        case 0: goto yy32;
        case 1: goto yy38;
        case 2: goto yy40;
        case 3: goto yy42;
-       case 4: goto yy88;
+       case 4: goto yy91;
        }
-yy62:  ++YYCURSOR;
+yy65:  ++YYCURSOR;
        if(YYLIMIT == YYCURSOR) YYFILL(1);
        yych = *YYCURSOR;
-       goto yy63;
-yy63:  if(yych <= '['){
-               if(yych == 0x0A)        goto yy61;
-               goto yy62;
+       goto yy66;
+yy66:  if(yych <= '['){
+               if(yych == 0x0A)        goto yy64;
+               goto yy65;
        } else {
-               if(yych <= '\\')        goto yy67;
-               if(yych <= ']') goto yy68;
-               goto yy62;
+               if(yych <= '\\')        goto yy70;
+               if(yych <= ']') goto yy71;
+               goto yy65;
        }
-yy64:  ++YYCURSOR;
-       if(YYLIMIT == YYCURSOR) YYFILL(1);
-       yych = *YYCURSOR;
-       if(yych == 0x0A)        goto yy61;
-       goto yy59;
-yy65:  ++YYCURSOR;
-       goto yy66;
-yy66:
-#line 163 "scanner.re"
-{ cur = cursor;
-                                 yylval.regexp = ranToRE(token());
-                                 return RANGE; }
-#line 454 "scanner.cc"
 yy67:  ++YYCURSOR;
        if(YYLIMIT == YYCURSOR) YYFILL(1);
        yych = *YYCURSOR;
-       if(yych == 0x0A)        goto yy61;
+       if(yych == 0x0A)        goto yy64;
        goto yy62;
 yy68:  ++YYCURSOR;
        goto yy69;
 yy69:
-#line 159 "scanner.re"
+#line 169 "scanner.re"
 { cur = cursor;
-                                 yylval.regexp = invToRE(token());
+                                 yylval.regexp = ranToRE(token());
                                  return RANGE; }
-#line 467 "scanner.cc"
+#line 483 "scanner.cc"
 yy70:  ++YYCURSOR;
        if(YYLIMIT == YYCURSOR) YYFILL(1);
        yych = *YYCURSOR;
-       goto yy71;
-yy71:  if(yych <= '&'){
-               if(yych == 0x0A)        goto yy61;
-               goto yy70;
+       if(yych == 0x0A)        goto yy64;
+       goto yy65;
+yy71:  ++YYCURSOR;
+       goto yy72;
+yy72:
+#line 165 "scanner.re"
+{ cur = cursor;
+                                 yylval.regexp = invToRE(token());
+                                 return RANGE; }
+#line 496 "scanner.cc"
+yy73:  ++YYCURSOR;
+       if(YYLIMIT == YYCURSOR) YYFILL(1);
+       yych = *YYCURSOR;
+       goto yy74;
+yy74:  if(yych <= '&'){
+               if(yych == 0x0A)        goto yy64;
+               goto yy73;
        } else {
-               if(yych <= '\'')        goto yy73;
-               if(yych != '\\')        goto yy70;
-               goto yy72;
+               if(yych <= '\'')        goto yy76;
+               if(yych != '\\')        goto yy73;
+               goto yy75;
        }
-yy72:  ++YYCURSOR;
+yy75:  ++YYCURSOR;
        if(YYLIMIT == YYCURSOR) YYFILL(1);
        yych = *YYCURSOR;
-       if(yych == 0x0A)        goto yy61;
-       goto yy70;
-yy73:  ++YYCURSOR;
-       goto yy74;
-yy74:
-#line 152 "scanner.re"
+       if(yych == 0x0A)        goto yy64;
+       goto yy73;
+yy76:  ++YYCURSOR;
+       goto yy77;
+yy77:
+#line 158 "scanner.re"
 { cur = cursor;
                                  yylval.regexp = strToCaseInsensitiveRE(token());
                                  return STRING; }
-#line 492 "scanner.cc"
-yy75:  ++YYCURSOR;
+#line 521 "scanner.cc"
+yy78:  ++YYCURSOR;
        if(YYLIMIT == YYCURSOR) YYFILL(1);
        yych = *YYCURSOR;
-       goto yy76;
-yy76:  if(yych <= '!'){
-               if(yych == 0x0A)        goto yy61;
-               goto yy75;
+       goto yy79;
+yy79:  if(yych <= '!'){
+               if(yych == 0x0A)        goto yy64;
+               goto yy78;
        } else {
-               if(yych <= '"') goto yy78;
-               if(yych != '\\')        goto yy75;
-               goto yy77;
+               if(yych <= '"') goto yy81;
+               if(yych != '\\')        goto yy78;
+               goto yy80;
        }
-yy77:  ++YYCURSOR;
+yy80:  ++YYCURSOR;
        if(YYLIMIT == YYCURSOR) YYFILL(1);
        yych = *YYCURSOR;
-       if(yych == 0x0A)        goto yy61;
-       goto yy75;
-yy78:  ++YYCURSOR;
-       goto yy79;
-yy79:
-#line 148 "scanner.re"
+       if(yych == 0x0A)        goto yy64;
+       goto yy78;
+yy81:  ++YYCURSOR;
+       goto yy82;
+yy82:
+#line 154 "scanner.re"
 { cur = cursor;
                                  yylval.regexp = strToRE(token());
                                  return STRING; }
-#line 517 "scanner.cc"
-yy80:  ++YYCURSOR;
-       goto yy81;
-yy81:
-#line 145 "scanner.re"
+#line 546 "scanner.cc"
+yy83:  ++YYCURSOR;
+       goto yy84;
+yy84:
+#line 151 "scanner.re"
 { tok = cursor;
                                  RETURN(0); }
-#line 524 "scanner.cc"
-yy82:  ++YYCURSOR;
-       goto yy83;
-yy83:
-#line 142 "scanner.re"
+#line 553 "scanner.cc"
+yy85:  ++YYCURSOR;
+       goto yy86;
+yy86:
+#line 148 "scanner.re"
 { depth = 1;
                                  goto comment; }
-#line 531 "scanner.cc"
-yy84:  yych = *++YYCURSOR;
-       if(yych == ',') goto yy98;
-       goto yy86;
-yy85:  ++YYCURSOR;
+#line 560 "scanner.cc"
+yy87:  yych = *++YYCURSOR;
+       if(yych == ',') goto yy101;
+       goto yy89;
+yy88:  ++YYCURSOR;
        if((YYLIMIT - YYCURSOR) < 2) YYFILL(2);
        yych = *YYCURSOR;
-       goto yy86;
-yy86:  if(yych <= '/'){
-               if(yych == ',') goto yy91;
-               goto yy61;
+       goto yy89;
+yy89:  if(yych <= '/'){
+               if(yych == ',') goto yy94;
+               goto yy64;
        } else {
-               if(yych <= '9') goto yy85;
-               if(yych == '}') goto yy89;
-               goto yy61;
+               if(yych <= '9') goto yy88;
+               if(yych == '}') goto yy92;
+               goto yy64;
        }
-yy87:  ++YYCURSOR;
-       goto yy88;
-yy88:
-#line 189 "scanner.re"
+yy90:  ++YYCURSOR;
+       goto yy91;
+yy91:
+#line 195 "scanner.re"
 { fatal("illegal closure form, use '{n}', '{n,}', '{n,m}' where n and m are numbers"); }
-#line 552 "scanner.cc"
-yy89:  ++YYCURSOR;
-       goto yy90;
-yy90:
-#line 177 "scanner.re"
-{ yylval.extop.minsize = atoi((char *)tok+1);
-                                 yylval.extop.maxsize = atoi((char *)tok+1);
-                                 RETURN(CLOSESIZE); }
-#line 560 "scanner.cc"
-yy91:  yyaccept = 4;
-       yych = *(YYMARKER = ++YYCURSOR);
-       if(yych <= '/') goto yy88;
-       if(yych <= '9') goto yy94;
-       if(yych != '}') goto yy88;
-       goto yy92;
+#line 581 "scanner.cc"
 yy92:  ++YYCURSOR;
        goto yy93;
 yy93:
-#line 185 "scanner.re"
+#line 183 "scanner.re"
 { yylval.extop.minsize = atoi((char *)tok+1);
-                                 yylval.extop.maxsize = -1;
+                                 yylval.extop.maxsize = atoi((char *)tok+1);
                                  RETURN(CLOSESIZE); }
-#line 574 "scanner.cc"
-yy94:  ++YYCURSOR;
-       if(YYLIMIT == YYCURSOR) YYFILL(1);
-       yych = *YYCURSOR;
+#line 589 "scanner.cc"
+yy94:  yyaccept = 4;
+       yych = *(YYMARKER = ++YYCURSOR);
+       if(yych <= '/') goto yy91;
+       if(yych <= '9') goto yy97;
+       if(yych != '}') goto yy91;
        goto yy95;
-yy95:  if(yych <= '/') goto yy61;
-       if(yych <= '9') goto yy94;
-       if(yych != '}') goto yy61;
+yy95:  ++YYCURSOR;
        goto yy96;
-yy96:  ++YYCURSOR;
-       goto yy97;
-yy97:
-#line 181 "scanner.re"
+yy96:
+#line 191 "scanner.re"
 { yylval.extop.minsize = atoi((char *)tok+1);
-                                 yylval.extop.maxsize = MAX(yylval.extop.minsize,atoi(strchr((char *)tok, ',')+1));
+                                 yylval.extop.maxsize = -1;
                                  RETURN(CLOSESIZE); }
-#line 590 "scanner.cc"
-yy98:  yyaccept = 4;
-       yych = *(YYMARKER = ++YYCURSOR);
-       if(yych <= '/') goto yy88;
-       if(yych <= '9') goto yy94;
-       if(yych != '}') goto yy88;
+#line 603 "scanner.cc"
+yy97:  ++YYCURSOR;
+       if(YYLIMIT == YYCURSOR) YYFILL(1);
+       yych = *YYCURSOR;
+       goto yy98;
+yy98:  if(yych <= '/') goto yy64;
+       if(yych <= '9') goto yy97;
+       if(yych != '}') goto yy64;
        goto yy99;
 yy99:  ++YYCURSOR;
        goto yy100;
 yy100:
-#line 174 "scanner.re"
+#line 187 "scanner.re"
+{ yylval.extop.minsize = atoi((char *)tok+1);
+                                 yylval.extop.maxsize = MAX(yylval.extop.minsize,atoi(strchr((char *)tok, ',')+1));
+                                 RETURN(CLOSESIZE); }
+#line 619 "scanner.cc"
+yy101: yyaccept = 4;
+       yych = *(YYMARKER = ++YYCURSOR);
+       if(yych <= '/') goto yy91;
+       if(yych <= '9') goto yy97;
+       if(yych != '}') goto yy91;
+       goto yy102;
+yy102: ++YYCURSOR;
+       goto yy103;
+yy103:
+#line 180 "scanner.re"
 { yylval.op = '*';
                                  RETURN(CLOSE); }
-#line 603 "scanner.cc"
+#line 632 "scanner.cc"
 }
-#line 210 "scanner.re"
+#line 229 "scanner.re"
 
 
 code:
 
-#line 610 "scanner.cc"
+#line 639 "scanner.cc"
 {
        YYCTYPE yych;
        unsigned int yyaccept = 0;
-       goto yy101;
+       goto yy104;
        ++YYCURSOR;
-yy101:
+yy104:
        if((YYLIMIT - YYCURSOR) < 2) YYFILL(2);
        yych = *YYCURSOR;
        if(yych <= '&'){
                if(yych <= 0x0A){
-                       if(yych <= 0x09)        goto yy109;
-                       goto yy107;
+                       if(yych <= 0x09)        goto yy112;
+                       goto yy110;
                } else {
-                       if(yych == '"') goto yy111;
-                       goto yy109;
+                       if(yych == '"') goto yy114;
+                       goto yy112;
                }
        } else {
                if(yych <= '{'){
-                       if(yych <= '\'')        goto yy112;
-                       if(yych <= 'z') goto yy109;
-                       goto yy105;
+                       if(yych <= '\'')        goto yy115;
+                       if(yych <= 'z') goto yy112;
+                       goto yy108;
                } else {
-                       if(yych != '}') goto yy109;
-                       goto yy103;
+                       if(yych != '}') goto yy112;
+                       goto yy106;
                }
        }
-yy103: ++YYCURSOR;
-       goto yy104;
-yy104:
-#line 214 "scanner.re"
+yy106: ++YYCURSOR;
+       goto yy107;
+yy107:
+#line 233 "scanner.re"
 { if(--depth == 0){
                                        cur = cursor;
                                        yylval.token = new Token(token(), tline);
                                        return CODE;
                                  }
                                  goto code; }
-#line 647 "scanner.cc"
-yy105: ++YYCURSOR;
-       goto yy106;
-yy106:
-#line 220 "scanner.re"
+#line 676 "scanner.cc"
+yy108: ++YYCURSOR;
+       goto yy109;
+yy109:
+#line 239 "scanner.re"
 { ++depth;
                                  goto code; }
-#line 654 "scanner.cc"
-yy107: ++YYCURSOR;
-       goto yy108;
-yy108:
-#line 222 "scanner.re"
+#line 683 "scanner.cc"
+yy110: ++YYCURSOR;
+       goto yy111;
+yy111:
+#line 241 "scanner.re"
 { if(cursor == eof) fatal("missing '}'");
                                  pos = cursor; cline++;
                                  goto code;
                                }
-#line 663 "scanner.cc"
-yy109: ++YYCURSOR;
-       goto yy110;
-yy110:
-#line 226 "scanner.re"
+#line 692 "scanner.cc"
+yy112: ++YYCURSOR;
+       goto yy113;
+yy113:
+#line 245 "scanner.re"
 { goto code; }
-#line 669 "scanner.cc"
-yy111: yyaccept = 0;
+#line 698 "scanner.cc"
+yy114: yyaccept = 0;
        yych = *(YYMARKER = ++YYCURSOR);
-       if(yych == 0x0A)        goto yy110;
-       goto yy118;
-yy112: yyaccept = 0;
+       if(yych == 0x0A)        goto yy113;
+       goto yy121;
+yy115: yyaccept = 0;
        yych = *(YYMARKER = ++YYCURSOR);
-       if(yych == 0x0A)        goto yy110;
-       goto yy114;
-yy113: ++YYCURSOR;
+       if(yych == 0x0A)        goto yy113;
+       goto yy117;
+yy116: ++YYCURSOR;
        if(YYLIMIT == YYCURSOR) YYFILL(1);
        yych = *YYCURSOR;
-       goto yy114;
-yy114: if(yych <= '&'){
-               if(yych != 0x0A)        goto yy113;
-               goto yy115;
+       goto yy117;
+yy117: if(yych <= '&'){
+               if(yych != 0x0A)        goto yy116;
+               goto yy118;
        } else {
-               if(yych <= '\'')        goto yy109;
-               if(yych == '\\')        goto yy116;
-               goto yy113;
+               if(yych <= '\'')        goto yy112;
+               if(yych == '\\')        goto yy119;
+               goto yy116;
        }
-yy115: YYCURSOR = YYMARKER;
+yy118: YYCURSOR = YYMARKER;
        switch(yyaccept){
-       case 0: goto yy110;
+       case 0: goto yy113;
        }
-yy116: ++YYCURSOR;
+yy119: ++YYCURSOR;
        if(YYLIMIT == YYCURSOR) YYFILL(1);
        yych = *YYCURSOR;
-       if(yych == 0x0A)        goto yy115;
-       goto yy113;
-yy117: ++YYCURSOR;
+       if(yych == 0x0A)        goto yy118;
+       goto yy116;
+yy120: ++YYCURSOR;
        if(YYLIMIT == YYCURSOR) YYFILL(1);
        yych = *YYCURSOR;
-       goto yy118;
-yy118: if(yych <= '!'){
-               if(yych == 0x0A)        goto yy115;
-               goto yy117;
+       goto yy121;
+yy121: if(yych <= '!'){
+               if(yych == 0x0A)        goto yy118;
+               goto yy120;
        } else {
-               if(yych <= '"') goto yy109;
-               if(yych != '\\')        goto yy117;
-               goto yy119;
+               if(yych <= '"') goto yy112;
+               if(yych != '\\')        goto yy120;
+               goto yy122;
        }
-yy119: ++YYCURSOR;
+yy122: ++YYCURSOR;
        if(YYLIMIT == YYCURSOR) YYFILL(1);
        yych = *YYCURSOR;
-       if(yych == 0x0A)        goto yy115;
-       goto yy117;
+       if(yych == 0x0A)        goto yy118;
+       goto yy120;
 }
-#line 227 "scanner.re"
+#line 246 "scanner.re"
 
 
 comment:
 
-#line 722 "scanner.cc"
+#line 751 "scanner.cc"
 {
        YYCTYPE yych;
-       goto yy120;
+       goto yy123;
        ++YYCURSOR;
-yy120:
+yy123:
        if((YYLIMIT - YYCURSOR) < 2) YYFILL(2);
        yych = *YYCURSOR;
        if(yych <= ')'){
-               if(yych == 0x0A)        goto yy125;
-               goto yy127;
+               if(yych == 0x0A)        goto yy128;
+               goto yy130;
        } else {
-               if(yych <= '*') goto yy122;
-               if(yych == '/') goto yy124;
-               goto yy127;
+               if(yych <= '*') goto yy125;
+               if(yych == '/') goto yy127;
+               goto yy130;
        }
-yy122: ++YYCURSOR;
-       if((yych = *YYCURSOR) == '/')   goto yy130;
-       goto yy123;
-yy123:
-#line 241 "scanner.re"
-{ goto comment; }
-#line 744 "scanner.cc"
-yy124: yych = *++YYCURSOR;
-       if(yych == '*') goto yy128;
-       goto yy123;
 yy125: ++YYCURSOR;
+       if((yych = *YYCURSOR) == '/')   goto yy133;
        goto yy126;
 yy126:
-#line 237 "scanner.re"
-{ if(cursor == eof) RETURN(0);
-                                 tok = pos = cursor; cline++;
-                                 goto comment;
-                               }
-#line 756 "scanner.cc"
+#line 260 "scanner.re"
+{ goto comment; }
+#line 773 "scanner.cc"
 yy127: yych = *++YYCURSOR;
-       goto yy123;
+       if(yych == '*') goto yy131;
+       goto yy126;
 yy128: ++YYCURSOR;
        goto yy129;
 yy129:
-#line 235 "scanner.re"
+#line 256 "scanner.re"
+{ if(cursor == eof) RETURN(0);
+                                 tok = pos = cursor; cline++;
+                                 goto comment;
+                               }
+#line 785 "scanner.cc"
+yy130: yych = *++YYCURSOR;
+       goto yy126;
+yy131: ++YYCURSOR;
+       goto yy132;
+yy132:
+#line 254 "scanner.re"
 { ++depth;
                                  goto comment; }
-#line 765 "scanner.cc"
-yy130: ++YYCURSOR;
-       goto yy131;
-yy131:
-#line 231 "scanner.re"
+#line 794 "scanner.cc"
+yy133: ++YYCURSOR;
+       goto yy134;
+yy134:
+#line 250 "scanner.re"
 { if(--depth == 0)
                                        goto scan;
                                    else
                                        goto comment; }
-#line 774 "scanner.cc"
+#line 803 "scanner.cc"
 }
-#line 242 "scanner.re"
+#line 261 "scanner.re"
 
 }
 
@@ -784,4 +813,3 @@ void Scanner::fatal(char *msg) const
 }
 
 } // end namespace re2c
-
diff --git a/dfa.cc b/dfa.cc
index 7ba54f9936c0bbbb169ce68e776af6ddc03cdfd7..833d99c1afd53c8486267ec8a7bf264389e86bb1 100644 (file)
--- a/dfa.cc
+++ b/dfa.cc
@@ -8,41 +8,38 @@
 namespace re2c
 {
 
-inline char octCh(uint c)
+void prtChOrHex(std::ostream& o, uint c)
 {
-       return '0' + c % 8;
-}
-
-inline char hexCh(uint c)
-{
-       const char * sHex = "0123456789ABCDEF";
-       
-       return sHex[c & 0x0F];
-}
-
-void prtChOrHex(std::ostream& o, uchar c)
-{
-       uchar oc = talx[c];
+       int oc = (int)(re2c::wFlag ? c : re2c::talx[c]);
 
-       if (isprint(oc))
+       if ((oc < 256) && isprint(oc))
        {
                o << '\'';
-               prtCh(o, c);
+               prtCh(o, oc);
                o << '\'';
        }
+       else if (re2c::wFlag)
+       {
+               o << "0x"
+                 << hexCh(oc >> 12)
+                 << hexCh(oc >>  8)
+                 << hexCh(oc >>  4)
+                 << hexCh(oc);
+       }
        else
        {
-               o << "0x" << hexCh(c >> 4) << hexCh(c);
+               o << "0x"
+                 << hexCh(oc >>  4) 
+                 << hexCh(oc);
        }
 }
 
-void prtCh(std::ostream &o, uchar c)
+void prtCh(std::ostream &o, uint c)
 {
-       uchar oc = talx[c];
+       int oc = (int)(re2c::wFlag ? c : re2c::talx[c]);
 
        switch (oc)
        {
-
                case '\'':
                o << "\\'";
                break;
@@ -81,17 +78,31 @@ void prtCh(std::ostream &o, uchar c)
 
                default:
 
-               if (isprint(oc))
+               if ((oc < 256) && isprint(oc))
+               {
                        o << (char) oc;
+               }
+               else if (re2c::wFlag)
+               {
+                       o << "0x"
+                         << hexCh(oc >> 12)
+                         << hexCh(oc >>  8)
+                         << hexCh(oc >>  4)
+                         << hexCh(oc);
+               }
                else
-                       o << '\\' << octCh(c / 64) << octCh(c / 8) << octCh(c);
+               {
+                       o << '\\' << octCh(oc / 64) << octCh(oc / 8) << octCh(oc);
+               }
        }
 }
 
 void printSpan(std::ostream &o, uint lb, uint ub)
 {
        if (lb > ub)
+       {
                o << "*";
+       }
 
        o << "[";
 
@@ -125,7 +136,9 @@ std::ostream& operator<<(std::ostream &o, const State &s)
        o << "state " << s.label;
 
        if (s.rule)
+       {
                o << " accepts " << s.rule->accept;
+       }
 
        o << "\n";
 
@@ -134,7 +147,9 @@ std::ostream& operator<<(std::ostream &o, const State &s)
        uint lb = 0;
 
        for (uint i = 0; i < s.go.nSpans; ++i)
+       {
                lb = s.go.span[i].show(o, lb);
+       }
 
        return o;
 }
@@ -252,9 +267,8 @@ DFA::DFA(Ins *ins, uint ni, uint lb, uint ub, Char *rep)
                {
                        State *to = (State*) goTo[rep[j]].to;
 
-                       while (++j < nc && goTo[rep[j]].to == to)
+                       while (++j < nc && goTo[rep[j]].to == to) ;
 
-                               ;
                        span[s->go.nSpans].ub = lb + j;
 
                        span[s->go.nSpans].to = to;
diff --git a/dfa.h b/dfa.h
index 032d80c828c37a81c4a82a5680d245cfc140b109..0573240dd350136c93ff4af6903725a99f33ca74 100644 (file)
--- a/dfa.h
+++ b/dfa.h
@@ -8,8 +8,8 @@
 namespace re2c
 {
 
-extern void prtCh(std::ostream&, uchar);
-extern void prtChOrHex(std::ostream&, uchar);
+extern void prtCh(std::ostream&, uint);
+extern void prtChOrHex(std::ostream&, uint);
 extern void printSpan(std::ostream&, uint, uint);
 
 class DFA;
index 65fc51da3fc5c585a9cff24fc6894989a0defe7f..488189c2c3b9833c82e1c7e2c197d6394208a1d1 100644 (file)
--- a/globals.h
+++ b/globals.h
@@ -31,22 +31,39 @@ namespace re2c
 
 extern char *fileName;
 extern char *outputFileName;
-extern bool sFlag;
 extern bool bFlag;
 extern bool dFlag;
+extern bool eFlag;
 extern bool iFlag;
+extern bool sFlag;
+extern bool wFlag;
+
 extern bool bUsedYYAccept;
 extern unsigned int oline;
 extern uint maxFill;
 
-extern uchar asc2ebc[256];
-extern uchar ebc2asc[256];
+extern uint asc2ebc[256];
+extern uint ebc2asc[256];
 
-extern uchar *xlat, *talx;
+extern uint *xlat, *talx;
 
 extern int vFillIndexes;
 extern label_list<uint> vUsedLabels;
 
+extern uint nRealChars;
+
+inline char octCh(uint c)
+{
+       return '0' + c % 8;
+}
+
+inline char hexCh(uint c)
+{
+       static const char * sHex = "0123456789ABCDEF";
+       
+       return sHex[c & 0x0F];
+}
+
 } // end namespace re2c
 
 #endif
diff --git a/ins.h b/ins.h
index 5700121bf897d255fa876cae6270a722d0e2fa85..14b11feb2e6b20771b3b35bb098523ecd9383dd0 100644 (file)
--- a/ins.h
+++ b/ins.h
@@ -7,8 +7,8 @@
 namespace re2c
 {
 
-const uint nChars = 256;
-typedef uchar Char;
+const uint nChars = (1<<16);
+typedef unsigned short Char;
 
 const uint CHAR = 0;
 const uint GOTO = 1;
diff --git a/main.cc b/main.cc
index 45c9d60b66bd0fe70cde2b83b88e43aaac6789d2..1714509d61f6d42d8fefb82a13a15bae23f678c2 100644 (file)
--- a/main.cc
+++ b/main.cc
@@ -18,13 +18,16 @@ namespace re2c
 
 char *fileName = 0;
 char *outputFileName = 0;
-bool sFlag = false;
 bool bFlag = false;
-bool dFlag = false; 
+bool dFlag = false;
+bool eFlag = false;
 bool iFlag = false;
+bool sFlag = false;
+bool wFlag = false;
 bool bUsedYYAccept = false;
 unsigned int oline = 1;
 uint maxFill = 1;
+uint nRealChars = 256;
 
 int vFillIndexes = -1;
 label_list<uint> vUsedLabels;
@@ -47,6 +50,7 @@ static const mbo_opt_struct OPTIONS[] =
        mbo_opt_struct('s', 0, "nested-ifs"),
        mbo_opt_struct('v', 0, "version"),
        mbo_opt_struct('V', 0, "vernum"),
+       mbo_opt_struct('w', 0, "wide-chars"),      
        mbo_opt_struct('-', 0, NULL) /* end of args */
 };
 
@@ -54,31 +58,34 @@ static void usage()
 {
        cerr << "usage: re2c [-esbvhd] file\n"
        "\n"
-       "-? -h   --help          Display this info.\n"
+       "-? -h  --help           Display this info.\n"
        "\n"
-       "-b      --bit-vectors   Implies -s. Use bit vectors as well in the attempt to\n"
+       "-b     --bit-vectors    Implies -s. Use bit vectors as well in the attempt to\n"
        "                        coax better code out of the compiler. Most useful for\n"
        "                        specifications with more than a few keywords (e.g. for\n"
        "                        most programming languages).\n"
        "\n"
-       "-e      --ecb           Cross-compile from an ASCII platform to\n"
+       "-d     --debug-output   Creates a parser that dumps information during\n"
+       "                        about the current position and in which state the\n"
+       "                        parser is.\n"
+       "\n"
+       "-e     --ecb            Cross-compile from an ASCII platform to\n"
        "                        an EBCDIC one.\n"
        "\n"
-       "-s      --nested-ifs    Generate nested ifs for some switches. Many compilers\n"
-       "                        need this assist to generate better code.\n"
+       "-f     --storable-state Generate a scanner with support for storable state\n"
        "\n"
-       "-f      --storable-state Generate a scanner with support for storable state\n"
+       "-i     --no-debug-info  Do not generate '#line' info (usefull for versioning).\n"
        "\n"
-       "-o      --output=output Specify the output file instead of stdout\n"
+       "-o     --output=output  Specify the output file instead of stdout\n"
+       "                        This cannot be used together with switches -b or -e.\n"
        "\n"
-       "-d      --debug-output  Creates a parser that dumps information during\n"
-       "                        about the current position and in which state the\n"
-       "                        parser is.\n"
+       "-s     --nested-ifs     Generate nested ifs for some switches. Many compilers\n"
+       "                        need this assist to generate better code.\n"
        "\n"
-       "-i      --no-debug-info Do not generate '#line' info (usefull for versioning).\n"
+       "-v     --version        Show version information.\n"
+       "-V     --vernum         Show version as one number.\n"
        "\n"
-       "-v      --version       Show version information.\n"
-       "-V      --vernum        Show version as one number.\n"
+       "-w     --wide-chars     Create a parser that supports wide chars (UCS-2).\n"
        ;
 }
 
@@ -110,6 +117,7 @@ int main(int argc, char *argv[])
                        case 'e':
                        xlat = asc2ebc;
                        talx = ebc2asc;
+                       eFlag = true;
                        break;
 
                        case 's':
@@ -135,7 +143,7 @@ int main(int argc, char *argv[])
                        case 'v':
                        cout << "re2c " << PACKAGE_VERSION << "\n";
                        return 2;
-                       
+
                        case 'V': {
                                int v1, v2, v3;
                                char version[16];
@@ -145,17 +153,25 @@ int main(int argc, char *argv[])
                                return 2;
                        }
                        
+                       case 'w':
+                       nRealChars = (1<<16);
+                       wFlag = true;
+                       break;
+         
                        case 'h':
-
                        case '?':
-
                        default:
                        usage();
                        return 2;
                }
        }
 
-       if (argc == opt_ind + 1)
+       if (wFlag && (bFlag || eFlag))
+       {
+               usage();
+               return 2;
+       }
+       else if (argc == opt_ind + 1)
        {
                fileName = argv[opt_ind];
        }
index b83fe9920779650c885016d53ad04d79a164678a..572cce34c1ba7773582111a1bfff720db5caa5b6 100644 (file)
--- a/re2c.1.in
+++ b/re2c.1.in
@@ -7,6 +7,9 @@
 .ds rx regular expression
 .ds lx \fIl\fP-expression
 \"$Log$
+\"Revision 1.27  2005/12/28 18:33:37  helly
+\"- Added experimental unicode support
+\"
 \"Revision 1.26  2005/12/18 18:47:06  helly
 \"- Apply #1362806 Addition to man on flag -f
 \"
@@ -185,16 +188,9 @@ will receive.
 .SH OPTIONS
 \*(re provides the following options:
 .TP
-\fB-e\fP
-Cross-compile from an ASCII platform to an EBCDIC one. 
-.TP
-\fB-f\fP
-Generate a scanner with support for storable state.
-For details see below at \fBSCANNER WITH STORABLE STATES\fP.
-.TP
-\fB-s\fP
-Generate nested \fCif\fPs for some \fCswitch\fPes.  Many compilers need this
-assist to generate better code.
+\fB-?\fP
+\fB-h\fP
+Invoke a short help.
 .TP
 \fB-b\fP
 Implies \fB-s\fP.  Use bit vectors as well in the attempt to coax better
@@ -209,14 +205,23 @@ parser issues and states. If you use this switch you need to define a macro
 \fIvoid YYDEBUG(int state, char current)\fP. The first parameter receives the 
 state or -1 and the second parameter receives the input at the current cursor.
 .TP
+\fB-e\fP
+Cross-compile from an ASCII platform to an EBCDIC one. 
+.TP
+\fB-f\fP
+Generate a scanner with support for storable state.
+For details see below at \fBSCANNER WITH STORABLE STATES\fP.
+.TP
 \fB-i\fP
 Do not output #line information. This is usefull when you want use a CMS tool
 with the re2c output which you might want if you do not require your users to 
 have re2c themselves when building from your source.
+\fB-o output\fP
+Specify the output file.
 .TP
-\fB-h\fP
-\fB-?\fP
-Invoke a short help.
+\fB-s\fP
+Generate nested \fCif\fPs for some \fCswitch\fPes.  Many compilers need this
+assist to generate better code.
 .TP
 \fB-v\fP
 Show version information.
@@ -224,9 +229,9 @@ Show version information.
 \fB-V\fP
 Show the version as a number XXYYZZ.
 .TP
-\fB-o output\fP
-Specify the output file.
-
+\fB-w\fP
+Create a parser that supports wide chars (UCS-2). This cannot be used together 
+with switches \fB-b\fP or \fB-e\fP.
 .SH "INTERFACE CODE"
 Unlike other scanner generators, \*(re does not generate complete scanners:
 the user must supply some interface code.
index 66a6200ca04bb7231f9050b039e3ad1e9e14192a..25e73679e09f778a6ca7d789c1794e9d4150b48c 100644 (file)
--- a/scanner.h
+++ b/scanner.h
@@ -5,6 +5,7 @@
 #include <iosfwd>
 #include "token.h"
 #include "re.h"
+#include "globals.h"
 
 namespace re2c
 {
@@ -27,10 +28,10 @@ public:
        int echo(std::ostream&);
        int scan();
        void fatal(char*) const;
-       SubStr token();
-       uint line();
-       
-       uchar unescape(SubStr &s) const;
+       SubStr token() const;
+       uint line() const;      
+       uint xlat(uint c) const;
+       uint unescape(SubStr &s) const;
        Range * getRange(SubStr &s) const;
        RegExp * matchChar(uint c) const;
        RegExp * strToRE(SubStr s) const;
@@ -40,16 +41,21 @@ public:
        RegExp * mkDot() const;
 };
 
-inline SubStr Scanner::token()
+inline SubStr Scanner::token() const
 {
        return SubStr(tok, cur - tok);
 }
 
-inline uint Scanner::line()
+inline uint Scanner::line() const
 {
        return cline;
 }
 
+inline uint Scanner::xlat(uint c) const
+{
+       return re2c::wFlag ? c : re2c::xlat[c];
+}
+
 } // end namespace re2c
 
 #endif
index 242e1fb70d370bf45b311b20a1976ef901458d7d..19ddd45166a9fd3b9746c0a99e3d0c4ad0dc9727 100644 (file)
@@ -28,39 +28,45 @@ namespace re2c
 
 Scanner::Scanner(std::istream& i) : in(i),
        bot(NULL), tok(NULL), ptr(NULL), cur(NULL), pos(NULL), lim(NULL),
-       top(NULL), eof(NULL), tchar(0), tline(0), cline(1) {
+       top(NULL), eof(NULL), tchar(0), tline(0), cline(1)
+{
     ;
 }
 
-char *Scanner::fill(char *cursor){
-    if(!eof){
-       uint cnt = tok - bot;
-       if(cnt){
-           memcpy(bot, tok, lim - tok);
-           tok = bot;
-           ptr -= cnt;
-           cursor -= cnt;
-           pos -= cnt;
-           lim -= cnt;
-       }
-       if((top - lim) < BSIZE){
-           char *buf = new char[(lim - bot) + BSIZE];
-           memcpy(buf, tok, lim - tok);
-           tok = buf;
-           ptr = &buf[ptr - bot];
-           cursor = &buf[cursor - bot];
-           pos = &buf[pos - bot];
-           lim = &buf[lim - bot];
-           top = &lim[BSIZE];
-           delete [] bot;
-           bot = buf;
-       }
-       if((cnt = in.rdbuf()->sgetn((char*) lim, BSIZE)) != BSIZE){
-           eof = &lim[cnt]; *eof++ = '\0';
+char *Scanner::fill(char *cursor)
+{
+       if(!eof)
+       {
+               uint cnt = tok - bot;
+               if(cnt)
+               {
+                       memcpy(bot, tok, lim - tok);
+                       tok = bot;
+                       ptr -= cnt;
+                       cursor -= cnt;
+                       pos -= cnt;
+                       lim -= cnt;
+               }
+               if((top - lim) < BSIZE)
+               {
+                       char *buf = new char[(lim - bot) + BSIZE];
+                       memcpy(buf, tok, lim - tok);
+                       tok = buf;
+                       ptr = &buf[ptr - bot];
+                       cursor = &buf[cursor - bot];
+                       pos = &buf[pos - bot];
+                       lim = &buf[lim - bot];
+                       top = &lim[BSIZE];
+                       delete [] bot;
+                       bot = buf;
+               }
+               if((cnt = in.rdbuf()->sgetn((char*) lim, BSIZE)) != BSIZE)
+               {
+                       eof = &lim[cnt]; *eof++ = '\0';
+               }
+               lim += cnt;
        }
-       lim += cnt;
-    }
-    return cursor;
+       return cursor;
 }
 
 /*!re2c
@@ -199,12 +205,25 @@ scan:
 
        [ \t]+                  { goto scan; }
 
+       "\r\n"                  { if(cursor == eof) RETURN(0);
+                                 pos = cursor; cline++;
+                                 goto scan;
+                               }
        "\n"                    { if(cursor == eof) RETURN(0);
                                  pos = cursor; cline++;
                                  goto scan;
                                }
 
-       any                     { std::cerr << "unexpected character: " << *tok << std::endl;
+       any                     { std::cerr << "line " << tline << ", column " << (tchar + 1) 
+                                               << ": unexpected character: ";
+                                 if (isprint(*tok))
+                                 {
+                                       std::cerr << *tok << std::endl;
+                                 }
+                                 else
+                                 {
+                                       std::cerr << "0x" << hexCh(*tok >> 4) << hexCh(*tok) << std::endl;
+                                 }
                                  goto scan;
                                }
 */
@@ -250,4 +269,3 @@ void Scanner::fatal(char *msg) const
 }
 
 } // end namespace re2c
-
index c8b73a423399285e9c5856de19e7cb0fb4686114..a8be85ed34d74adbcadb901cdcb086e78e6e858f 100644 (file)
@@ -4,7 +4,7 @@
 namespace re2c
 {
 
-uchar asc2asc[256] =
+uint asc2asc[256] =
     {
         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
         0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
@@ -24,10 +24,10 @@ uchar asc2asc[256] =
         0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
     };
 
-uchar *xlat = asc2asc;
-uchar *talx = asc2asc;
+uint *xlat = asc2asc;
+uint *talx = asc2asc;
 
-uchar asc2ebc[256] =
+uint asc2ebc[256] =
     { /* Based on ISO 8859/1 and Code Page 37 */
         0x00, 0x01, 0x02, 0x03, 0x37, 0x2d, 0x2e, 0x2f, 0x16, 0x05, 0x25, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
         0x10, 0x11, 0x12, 0x13, 0x3c, 0x3d, 0x32, 0x26, 0x18, 0x19, 0x3f, 0x27, 0x1c, 0x1d, 0x1e, 0x1f,
@@ -47,7 +47,7 @@ uchar asc2ebc[256] =
         0x8c, 0x49, 0xcd, 0xce, 0xcb, 0xcf, 0xcc, 0xe1, 0x70, 0xdd, 0xde, 0xdb, 0xdc, 0x8d, 0xae, 0xdf
     };
 
-uchar ebc2asc[256] =
+uint ebc2asc[256] =
     { /* Based on ISO 8859/1 and Code Page 37 */
         0x00, 0x01, 0x02, 0x03, 0x9c, 0x09, 0x86, 0x7f, 0x97, 0x8d, 0x8e, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
         0x10, 0x11, 0x12, 0x13, 0x9d, 0x85, 0x08, 0x87, 0x18, 0x19, 0x92, 0x8f, 0x1c, 0x1d, 0x1e, 0x1f,
@@ -68,4 +68,3 @@ uchar ebc2asc[256] =
     };
 
 } // end namespace re2c
-