From: Peter Johnson Date: Sun, 7 Apr 2002 22:27:05 +0000 (-0000) Subject: Initial revision X-Git-Tag: v0.2.0~263 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=7cd8fdd1a0b542397090717925dc75cac586fda9;p=yasm Initial revision svn path=/trunk/yasm/; revision=573 --- diff --git a/tools/re2c/CHANGELOG b/tools/re2c/CHANGELOG new file mode 100644 index 00000000..06090aa6 --- /dev/null +++ b/tools/re2c/CHANGELOG @@ -0,0 +1,18 @@ +re2c +---- + +Version 0.9.1 +------------- + +- removed rcs comments in source files + +Version 0.9 +----------- + +- redistribution based on version 0.5 +- added parentheses to assignment expressions in 'if' statements +- rearranged class members to match initialization order +- substr fix +- use array delete [] when necessary +- other minor fixes for subduing compiler warnings + diff --git a/tools/re2c/NO_WARRANTY b/tools/re2c/NO_WARRANTY new file mode 100644 index 00000000..885a13d0 --- /dev/null +++ b/tools/re2c/NO_WARRANTY @@ -0,0 +1,2 @@ +re2c is distributed with no warranty whatever. The author and any other +contributors take no responsibility for the consequences of its use. diff --git a/tools/re2c/README b/tools/re2c/README new file mode 100644 index 00000000..943120fe --- /dev/null +++ b/tools/re2c/README @@ -0,0 +1,153 @@ +re2c +---- + +Version 0.9.1 +Originally written by Peter Bumbulis (peterr@csg.uwaterloo.ca) +Currently maintained by Brian Young (bayoung@acm.org) + +The re2c distribution can be found at: + + http://www.tildeslash.org/re2c/index.html + +The source distribution is available from: + + http://www.tildeslash.org/re2c/re2c-0.9.1.tar.gz + +This distribution is a cleaned up version of the 0.5 release +maintained by me (Brian Young). Several bugs were fixed as well +as code cleanup for warning free compilation. It has been developed +and tested with egcs 1.0.2 and gcc 2.7.2.3 on Linux x86. 
Peter +Bumbulis' original release can be found at: + + ftp://csg.uwaterloo.ca/pub/peterr/re2c.0.5.tar.gz + +re2c is a great tool for writing fast and flexible lexers. It has +served many people well for many years and it deserves to be +maintained more actively. re2c is on the order of 2-3 times faster +than a flex based scanner, and its input model is much more +flexible. + +Patches and requests for features will be entertained. Areas of +particular interest to me are porting (a Solaris and an NT +version will be forthcoming) and wide character support. Note +that the code is already quite portable and should be buildable +on any platform with minor makefile changes. + +Peter's original version 0.5 ANNOUNCE and README follows. + +Brian + +-- + +re2c is a tool for generating C-based recognizers from regular +expressions. re2c-based scanners are efficient: for programming +languages, given similar specifications, an re2c-based scanner is +typically almost twice as fast as a flex-based scanner with little or no +increase in size (possibly a decrease on cisc architectures). Indeed, +re2c-based scanners are quite competitive with hand-crafted ones. + +Unlike flex, re2c does not generate complete scanners: the user must +supply some interface code. While this code is not bulky (about 50-100 +lines for a flex-like scanner; see the man page and examples in the +distribution) careful coding is required for efficiency (and +correctness). One advantage of this arrangement is that the generated +code is not tied to any particular input model. For example, re2c +generated code can be used to scan data from a null-byte terminated +buffer as illustrated below. 
+ +Given the following source + + #define NULL ((char*) 0) + char *scan(char *p){ + char *q; + #define YYCTYPE char + #define YYCURSOR p + #define YYLIMIT p + #define YYMARKER q + #define YYFILL(n) + /*!re2c + [0-9]+ {return YYCURSOR;} + [\000-\377] {return NULL;} + */ + } + +re2c will generate + + /* Generated by re2c on Sat Apr 16 11:40:58 1994 */ + #line 1 "simple.re" + #define NULL ((char*) 0) + char *scan(char *p){ + char *q; + #define YYCTYPE char + #define YYCURSOR p + #define YYLIMIT p + #define YYMARKER q + #define YYFILL(n) + { + YYCTYPE yych; + unsigned int yyaccept; + goto yy0; + yy1: ++YYCURSOR; + yy0: + if((YYLIMIT - YYCURSOR) < 2) YYFILL(2); + yych = *YYCURSOR; + if(yych <= '/') goto yy4; + if(yych >= ':') goto yy4; + yy2: yych = *++YYCURSOR; + goto yy7; + yy3: + #line 10 + {return YYCURSOR;} + yy4: yych = *++YYCURSOR; + yy5: + #line 11 + {return NULL;} + yy6: ++YYCURSOR; + if(YYLIMIT == YYCURSOR) YYFILL(1); + yych = *YYCURSOR; + yy7: if(yych <= '/') goto yy3; + if(yych <= '9') goto yy6; + goto yy3; + } + #line 12 + + } + +Note that most compilers will perform dead-code elimination to remove +all YYCURSOR, YYLIMIT comparisons. + +re2c was developed for a particular project (constructing a fast REXX +scanner of all things!) and so while it has some rough edges, it should +be quite usable. More information about re2c can be found in the +(admittedly skimpy) man page; the algorithms and heuristics used are +described in an upcoming LOPLAS article (included in the distribution). +Probably the best way to find out more about re2c is to try the supplied +examples. re2c is written in C++, and is currently being developed +under Linux using gcc 2.5.8. + +Peter + +-- + +re2c is distributed with no warranty whatever. The code is certain to +contain errors. Neither the author nor any contributor takes +responsibility for any consequences of its use. + +re2c is in the public domain.
The data structures and algorithms used +in re2c are all either taken from documents available to the general +public or are inventions of the author. Programs generated by re2c may +be distributed freely. re2c itself may be distributed freely, in source +or binary, unchanged or modified. Distributors may charge whatever fees +they can obtain for re2c. + +If you do make use of re2c, or incorporate it into a larger project an +acknowledgement somewhere (documentation, research report, etc.) would +be appreciated. + +Please send bug reports and feedback (including suggestions for +improving the distribution) to + + peterr@csg.uwaterloo.ca + +Include a small example and the banner from parser.y with bug reports. + diff --git a/tools/re2c/actions.cc b/tools/re2c/actions.cc new file mode 100644 index 00000000..0260b5fb --- /dev/null +++ b/tools/re2c/actions.cc @@ -0,0 +1,505 @@ +#include +#include +#include +#include + +#include "globals.h" +#include "parser.h" +#include "dfa.h" + +Symbol *Symbol::first = NULL; + +Symbol::Symbol(const SubStr &str) : next(first), name(str), re(NULL) { + first = this; +} + +Symbol *Symbol::find(const SubStr &str){ + for(Symbol *sym = first; sym; sym = sym->next) + if(sym->name == str) return sym; + return new Symbol(str); +} + +void showIns(ostream &o, const Ins &i, const Ins &base){ + o.width(3); + o << &i - &base << ": "; + switch(i.i.tag){ + case CHAR: { + o << "match "; + for(const Ins *j = &(&i)[1]; j < (Ins*) i.i.link; ++j) + prtCh(o, j->c.value); + break; + } case GOTO: + o << "goto " << ((Ins*) i.i.link - &base); + break; + case FORK: + o << "fork " << ((Ins*) i.i.link - &base); + break; + case CTXT: + o << "term " << ((RuleOp*) i.i.link)->accept; + break; + case TERM: + o << "term " << ((RuleOp*) i.i.link)->accept; + break; + } + o << "\n"; +} + +uint RegExp::fixedLength(){ + return ~0; +} + +char *NullOp::type = "NullOp"; + +void NullOp::calcSize(Char*){ + size = 0; +} + +uint NullOp::fixedLength(){ + return 0; +} + +void 
NullOp::compile(Char*, Ins*){ + ; +} + +void NullOp::split(CharSet&){ + ; +} + +ostream& operator<<(ostream &o, const Range &r){ + if((r.ub - r.lb) == 1){ + prtCh(o, r.lb); + } else { + prtCh(o, r.lb); o << "-"; prtCh(o, r.ub-1); + } + return o << r.next; +} + +Range *doUnion(Range *r1, Range *r2){ + Range *r, **rP = &r; + for(;;){ + Range *s; + if(r1->lb <= r2->lb){ + s = new Range(*r1); + } else { + s = new Range(*r2); + } + *rP = s; + rP = &s->next; + for(;;){ + if(r1->lb <= r2->lb){ + if(r1->lb > s->ub) + break; + if(r1->ub > s->ub) + s->ub = r1->ub; + if(!(r1 = r1->next)){ + uint ub = 0; + for(; r2 && r2->lb <= s->ub; r2 = r2->next) + ub = r2->ub; + if(ub > s->ub) + s->ub = ub; + *rP = r2; + return r; + } + } else { + if(r2->lb > s->ub) + break; + if(r2->ub > s->ub) + s->ub = r2->ub; + if(!(r2 = r2->next)){ + uint ub = 0; + for(; r1 && r1->lb <= s->ub; r1 = r1->next) + ub = r1->ub; + if(ub > s->ub) + s->ub = ub; + *rP = r1; + return r; + } + } + } + } + *rP = NULL; + return r; +} + +Range *doDiff(Range *r1, Range *r2){ + Range *r, *s, **rP = &r; + for(; r1; r1 = r1->next){ + uint lb = r1->lb; + for(; r2 && r2->ub <= r1->lb; r2 = r2->next); + for(; r2 && r2->lb < r1->ub; r2 = r2->next){ + if(lb < r2->lb){ + *rP = s = new Range(lb, r2->lb); + rP = &s->next; + } + if((lb = r2->ub) >= r1->ub) + goto noMore; + } + *rP = s = new Range(lb, r1->ub); + rP = &s->next; + noMore:; + } + *rP = NULL; + return r; +} + +MatchOp *merge(MatchOp *m1, MatchOp *m2){ + if(!m1) + return m2; + if(!m2) + return m1; + return new MatchOp(doUnion(m1->match, m2->match)); +} + +char *MatchOp::type = "MatchOp"; + +void MatchOp::display(ostream &o) const{ + o << match; +} + +void MatchOp::calcSize(Char *rep){ + size = 1; + for(Range *r = match; r; r = r->next) + for(uint c = r->lb; c < r->ub; ++c) + if(rep[c] == c) + ++size; +} + +uint MatchOp::fixedLength(){ + return 1; +} + +void MatchOp::compile(Char *rep, Ins *i){ + i->i.tag = CHAR; + i->i.link = &i[size]; + Ins *j = &i[1]; + uint bump = 
size; + for(Range *r = match; r; r = r->next){ + for(uint c = r->lb; c < r->ub; ++c){ + if(rep[c] == c){ + j->c.value = c; + j->c.bump = --bump; + j++; + } + } + } +} + +void MatchOp::split(CharSet &s){ + for(Range *r = match; r; r = r->next){ + for(uint c = r->lb; c < r->ub; ++c){ + CharPtn *x = s.rep[c], *a = x->nxt; + if(!a){ + if(x->card == 1) + continue; + x->nxt = a = s.freeHead; + if(!(s.freeHead = s.freeHead->nxt)) + s.freeTail = &s.freeHead; + a->nxt = NULL; + x->fix = s.fix; + s.fix = x; + } + if(--(x->card) == 0){ + *s.freeTail = x; + *(s.freeTail = &x->nxt) = NULL; + } + s.rep[c] = a; + ++(a->card); + } + } + for(; s.fix; s.fix = s.fix->fix) + if(s.fix->card) + s.fix->nxt = NULL; +} + +RegExp *mkDiff(RegExp *e1, RegExp *e2){ + MatchOp *m1, *m2; + if(!(m1 = (MatchOp*) e1->isA(MatchOp::type))) + return NULL; + if(!(m2 = (MatchOp*) e2->isA(MatchOp::type))) + return NULL; + Range *r = doDiff(m1->match, m2->match); + return r? (RegExp*) new MatchOp(r) : (RegExp*) new NullOp; +} + +RegExp *doAlt(RegExp *e1, RegExp *e2){ + if(!e1) + return e2; + if(!e2) + return e1; + return new AltOp(e1, e2); +} + +RegExp *mkAlt(RegExp *e1, RegExp *e2){ + AltOp *a; + MatchOp *m1, *m2; + if((a = (AltOp*) e1->isA(AltOp::type))){ + if((m1 = (MatchOp*) a->exp1->isA(MatchOp::type))) + e1 = a->exp2; + } else if((m1 = (MatchOp*) e1->isA(MatchOp::type))){ + e1 = NULL; + } + if((a = (AltOp*) e2->isA(AltOp::type))){ + if((m2 = (MatchOp*) a->exp1->isA(MatchOp::type))) + e2 = a->exp2; + } else if((m2 = (MatchOp*) e2->isA(MatchOp::type))){ + e2 = NULL; + } + return doAlt(merge(m1, m2), doAlt(e1, e2)); +} + +char *AltOp::type = "AltOp"; + +void AltOp::calcSize(Char *rep){ + exp1->calcSize(rep); + exp2->calcSize(rep); + size = exp1->size + exp2->size + 2; +} + +uint AltOp::fixedLength(){ /* fixed length only when both alternatives report the same fixed length; ~0u means "not fixed" */ + uint l1 = exp1->fixedLength(); + uint l2 = exp2->fixedLength(); + if(l1 != l2 || l1 == ~0u) + return ~0; + return l1; +} + +void AltOp::compile(Char *rep, Ins *i){ + i->i.tag = FORK; + Ins *j = 
&i[exp1->size + 1]; + i->i.link = &j[1]; + exp1->compile(rep, &i[1]); + j->i.tag = GOTO; + j->i.link = &j[exp2->size + 1]; + exp2->compile(rep, &j[1]); +} + +void AltOp::split(CharSet &s){ + exp1->split(s); + exp2->split(s); +} + +char *CatOp::type = "CatOp"; + +void CatOp::calcSize(Char *rep){ + exp1->calcSize(rep); + exp2->calcSize(rep); + size = exp1->size + exp2->size; +} + +uint CatOp::fixedLength(){ + uint l1, l2; + if((l1 = exp1->fixedLength()) != ~0u ) + if((l2 = exp2->fixedLength()) != ~0u) + return l1+l2; + return ~0; +} + +void CatOp::compile(Char *rep, Ins *i){ + exp1->compile(rep, &i[0]); + exp2->compile(rep, &i[exp1->size]); +} + +void CatOp::split(CharSet &s){ + exp1->split(s); + exp2->split(s); +} + +char *CloseOp::type = "CloseOp"; + +void CloseOp::calcSize(Char *rep){ + exp->calcSize(rep); + size = exp->size + 1; +} + +void CloseOp::compile(Char *rep, Ins *i){ + exp->compile(rep, &i[0]); + i += exp->size; + i->i.tag = FORK; + i->i.link = i - exp->size; +} + +void CloseOp::split(CharSet &s){ + exp->split(s); +} + +RegExp *expr(Scanner &); + +uchar unescape(SubStr &s){ + s.len--; + uchar c; + if((c = *s.str++) != '\\' || s.len == 0) + return xlat[c]; + s.len--; + switch(c = *s.str++){ + case 'n': + return xlat['\n']; + case 't': + return xlat['\t']; + case 'v': + return xlat['\v']; + case 'b': + return xlat['\b']; + case 'r': + return xlat['\r']; + case 'f': + return xlat['\f']; + case 'a': + return xlat['\a']; + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': { + uchar v = c - '0'; + for(; s.len != 0 && '0' <= (c = *s.str) && c <= '7'; s.len--, s.str++) + v = v*8 + (c - '0'); + return v; + } default: + return xlat[c]; + } +} + +Range *getRange(SubStr &s){ + uchar lb = unescape(s), ub; + if(s.len < 2 || *s.str != '-'){ + ub = lb; + } else { + s.len--; s.str++; + ub = unescape(s); + if(ub < lb){ + uchar tmp; + tmp = lb; lb = ub; ub = tmp; + } + } + return new Range(lb, ub+1); +} + +RegExp *matchChar(uint c){ + return 
new MatchOp(new Range(c, c+1)); +} + +RegExp *strToRE(SubStr s){ + s.len -= 2; s.str += 1; + if(s.len == 0) + return new NullOp; + RegExp *re = matchChar(unescape(s)); + while(s.len > 0) + re = new CatOp(re, matchChar(unescape(s))); + return re; +} + +RegExp *ranToRE(SubStr s){ + s.len -= 2; s.str += 1; + if(s.len == 0) + return new NullOp; + Range *r = getRange(s); + while(s.len > 0) + r = doUnion(r, getRange(s)); + return new MatchOp(r); +} + +char *RuleOp::type = "RuleOp"; + +RuleOp::RuleOp(RegExp *e, RegExp *c, Token *t, uint a) + : exp(e), ctx(c), ins(NULL), accept(a), code(t) { + ; +} + +void RuleOp::calcSize(Char *rep){ + exp->calcSize(rep); + ctx->calcSize(rep); + size = exp->size + ctx->size + 1; +} + +void RuleOp::compile(Char *rep, Ins *i){ + ins = i; + exp->compile(rep, &i[0]); + i += exp->size; + ctx->compile(rep, &i[0]); + i += ctx->size; + i->i.tag = TERM; + i->i.link = this; +} + +void RuleOp::split(CharSet &s){ + exp->split(s); + ctx->split(s); +} + +extern void printSpan(ostream&, uint, uint); + +void optimize(Ins *i){ + while(!isMarked(i)){ + mark(i); + if(i->i.tag == CHAR){ + i = (Ins*) i->i.link; + } else if(i->i.tag == GOTO || i->i.tag == FORK){ + Ins *target = (Ins*) i->i.link; + optimize(target); + if(target->i.tag == GOTO) + i->i.link = target->i.link == target? 
i : target; + if(i->i.tag == FORK){ + Ins *follow = (Ins*) &i[1]; + optimize(follow); + if(follow->i.tag == GOTO && follow->i.link == follow){ + i->i.tag = GOTO; + } else if(i->i.link == i){ + i->i.tag = GOTO; + i->i.link = follow; + } + } + return; + } else { + ++i; + } + } +} + +void genCode(ostream& o, RegExp *re){ + CharSet cs; + uint j; + memset(&cs, 0, sizeof(cs)); + for(j = 0; j < nChars; ++j){ + cs.rep[j] = &cs.ptn[0]; + cs.ptn[j].nxt = &cs.ptn[j+1]; + } + cs.freeHead = &cs.ptn[1]; + *(cs.freeTail = &cs.ptn[nChars-1].nxt) = NULL; + cs.ptn[0].card = nChars; + cs.ptn[0].nxt = NULL; + re->split(cs); +/* + for(uint k = 0; k < nChars;){ + for(j = k; ++k < nChars && cs.rep[k] == cs.rep[j];); + printSpan(cerr, j, k); + cerr << "\t" << cs.rep[j] - &cs.ptn[0] << endl; + } +*/ + Char rep[nChars]; + for(j = 0; j < nChars; ++j){ + if(!cs.rep[j]->nxt) + cs.rep[j]->nxt = &cs.ptn[j]; + rep[j] = (Char) (cs.rep[j]->nxt - &cs.ptn[0]); + } + + re->calcSize(rep); + Ins *ins = new Ins[re->size+1]; + memset(ins, 0, (re->size+1)*sizeof(Ins)); + re->compile(rep, ins); + Ins *eoi = &ins[re->size]; + eoi->i.tag = GOTO; + eoi->i.link = eoi; + + optimize(ins); + for(j = 0; j < re->size;){ + unmark(&ins[j]); + if(ins[j].i.tag == CHAR){ + j = (Ins*) ins[j].i.link - ins; + } else { + j++; + } + } + + DFA *dfa = new DFA(ins, re->size, 0, 256, rep); + dfa->emit(o); + delete dfa; + delete [] ins; +} diff --git a/tools/re2c/basics.h b/tools/re2c/basics.h new file mode 100644 index 00000000..2adaeb74 --- /dev/null +++ b/tools/re2c/basics.h @@ -0,0 +1,9 @@ +#ifndef _basics_h +#define _basics_h + +typedef unsigned int uint; +typedef unsigned char uchar, byte; +typedef unsigned short ushort, word; +typedef unsigned long ulong, dword; + +#endif diff --git a/tools/re2c/bootstrap/parser.cc b/tools/re2c/bootstrap/parser.cc new file mode 100644 index 00000000..6d664005 --- /dev/null +++ b/tools/re2c/bootstrap/parser.cc @@ -0,0 +1,531 @@ +#ifndef lint +static char yysccsid[] = "@(#)yaccpar 1.9 
(Berkeley) 02/21/93"; +#endif +#define YYBYACC 1 +#define YYMAJOR 1 +#define YYMINOR 9 +#define yyclearin (yychar=(-1)) +#define yyerrok (yyerrflag=0) +#define YYRECOVERING (yyerrflag!=0) +#define YYPREFIX "yy" +#line 2 "parser.y" + +#include +#include +#include +#include +#include "globals.h" +#include "parser.h" +int yyparse(); +int yylex(); +void yyerror(char*); + +static uint accept; +static RegExp *spec; +static Scanner *in; + +#line 21 "parser.y" +typedef union { + Symbol *symbol; + RegExp *regexp; + Token *token; + char op; +} YYSTYPE; +#line 35 "y.tab.c" +#define CLOSE 257 +#define ID 258 +#define CODE 259 +#define RANGE 260 +#define STRING 261 +#define YYERRCODE 256 +short yylhs[] = { -1, + 0, 0, 0, 9, 2, 3, 3, 4, 4, 5, + 5, 6, 6, 7, 7, 1, 1, 8, 8, 8, + 8, +}; +short yylen[] = { 2, + 0, 2, 2, 4, 3, 0, 2, 1, 3, 1, + 3, 1, 2, 1, 2, 1, 2, 1, 1, 1, + 3, +}; +short yydefred[] = { 1, + 0, 0, 19, 20, 0, 2, 0, 0, 0, 12, + 0, 3, 0, 18, 0, 0, 0, 0, 0, 13, + 16, 0, 0, 21, 0, 0, 5, 0, 17, 4, +}; +short yydgoto[] = { 1, + 22, 6, 18, 7, 8, 9, 10, 11, 12, +}; +short yysindex[] = { 0, + -27, -49, 0, 0, -23, 0, -44, -84, -23, 0, + -243, 0, -23, 0, -39, -23, -23, -244, -23, 0, + 0, -239, -53, 0, -104, -84, 0, -23, 0, 0, +}; +short yyrindex[] = { 0, + 0, -31, 0, 0, 0, 0, -227, -17, -20, 0, + -40, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, -36, 0, 0, -226, -16, 0, -19, 0, 0, +}; +short yygindex[] = { 0, + 0, 0, 0, 21, 18, 17, 1, 0, 0, +}; +#define YYTABLESIZE 243 +short yytable[] = { 14, + 14, 24, 16, 15, 15, 30, 14, 19, 18, 20, + 15, 13, 5, 21, 27, 18, 5, 29, 14, 17, + 10, 11, 15, 8, 9, 15, 10, 11, 20, 8, + 9, 6, 7, 23, 26, 28, 25, 0, 10, 11, + 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, + 0, 14, 0, 0, 0, 15, 0, 0, 0, 0, + 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 17, 10, 11, 0, 0, 0, 0, 0, 0, 17, + 0, 0, 0, 14, 17, 0, 0, 15, 0, 0, + 0, 0, 18, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 10, 11, 0, 8, 9, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 14, 14, 14, + 14, 15, 15, 15, 15, 18, 18, 18, 18, 18, + 2, 0, 3, 4, 14, 0, 3, 4, 10, 11, + 0, 8, 9, +}; +short yycheck[] = { 40, + 41, 41, 47, 40, 41, 59, 47, 92, 40, 9, + 47, 61, 40, 257, 259, 47, 40, 257, 59, 124, + 41, 41, 59, 41, 41, 5, 47, 47, 28, 47, + 47, 259, 259, 13, 17, 19, 16, -1, 59, 59, + -1, 59, 59, -1, -1, -1, -1, -1, -1, -1, + -1, 92, -1, -1, -1, 92, -1, -1, -1, -1, + 92, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 124, 92, 92, -1, -1, -1, -1, -1, -1, 124, + -1, -1, -1, 124, 124, -1, -1, 124, -1, -1, + -1, -1, 124, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, 124, 124, -1, 124, 124, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, 258, 259, 260, + 261, 258, 259, 260, 261, 257, 258, 259, 260, 261, + 258, -1, 260, 261, 258, -1, 260, 261, 259, 259, + -1, 259, 259, +}; +#define YYFINAL 1 +#ifndef YYDEBUG +#define YYDEBUG 0 +#endif +#define YYMAXTOKEN 261 +#if YYDEBUG +char *yyname[] = { +"end-of-file",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,"'('","')'",0,0,0,0,0,"'/'",0,0,0,0,0,0,0,0,0,0,0,"';'",0,"'='",0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"'\\\\'",0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"'|'",0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +"CLOSE","ID","CODE","RANGE","STRING", +}; +char *yyrule[] = { +"$accept : spec", +"spec :", +"spec : spec rule", +"spec : spec decl", +"decl : ID '=' expr ';'", +"rule : expr look CODE", +"look :", +"look : '/' expr", +"expr : diff", +"expr : expr '|' diff", +"diff : term", +"diff : diff '\\\\' term", +"term : factor", +"term : term factor", +"factor : primary", +"factor : primary close", +"close : CLOSE", +"close : close CLOSE", +"primary : ID", +"primary : RANGE", +"primary : STRING", +"primary : '(' expr ')'", +}; +#endif +#ifdef YYSTACKSIZE +#undef YYMAXDEPTH +#define YYMAXDEPTH YYSTACKSIZE +#else +#ifdef YYMAXDEPTH +#define YYSTACKSIZE YYMAXDEPTH +#else +#define YYSTACKSIZE 500 +#define YYMAXDEPTH 500 +#endif +#endif +int yydebug; +int yynerrs; +int yyerrflag; +int yychar; +short *yyssp; +YYSTYPE *yyvsp; +YYSTYPE yyval; +YYSTYPE yylval; +short yyss[YYSTACKSIZE]; +YYSTYPE yyvs[YYSTACKSIZE]; +#define yystacksize YYSTACKSIZE +#line 121 "parser.y" + +void yyerror(char* s){ + in->fatal(s); +} + +int yylex(){ + return in->scan(); +} + +void parse(int i, ostream &o){ /* writes the banner and the #line directive (backslashes in fileName doubled) to o, then parses and emits code for each spec while in->echo(o) succeeds */ + char * fnamebuf; + char * token; + + o << "/* Generated by re2c 0.5 on "; + time_t now = time(&now); + o.write(ctime(&now), 24); + o << " */\n"; + + in = new Scanner(i); + + o << "#line " << in->line() << " \""; + if( fileName != NULL ) { + fnamebuf = strdup( fileName ); + } else { + fnamebuf = strdup( "" ); + } + token = fnamebuf ? strtok( fnamebuf, "\\" ) : NULL; + while( token != NULL ) { /* strtok() returns NULL for an empty name; never stream a null char* */ + o << token; + token = strtok( NULL, "\\" ); + if( token == NULL ) break; + o << "\\\\"; + } + o << "\"\n"; + free( fnamebuf ); + + while(in->echo(o)){ + yyparse(); + if(spec) + genCode(o, spec); + o << "#line " << in->line() << "\n"; + } +} +#line 235 "y.tab.c" +#define YYABORT goto yyabort +#define YYREJECT goto yyabort +#define YYACCEPT goto yyaccept +#define YYERROR goto yyerrlab 
+int +yyparse() +{ + register int yym, yyn, yystate; +#if YYDEBUG + register char *yys; + extern char *getenv(); + + if (yys = getenv("YYDEBUG")) + { + yyn = *yys; + if (yyn >= '0' && yyn <= '9') + yydebug = yyn - '0'; + } +#endif + + yynerrs = 0; + yyerrflag = 0; + yychar = (-1); + + yyssp = yyss; + yyvsp = yyvs; + *yyssp = yystate = 0; + +yyloop: + if (yyn = yydefred[yystate]) goto yyreduce; + if (yychar < 0) + { + if ((yychar = yylex()) < 0) yychar = 0; +#if YYDEBUG + if (yydebug) + { + yys = 0; + if (yychar <= YYMAXTOKEN) yys = yyname[yychar]; + if (!yys) yys = "illegal-symbol"; + printf("%sdebug: state %d, reading %d (%s)\n", + YYPREFIX, yystate, yychar, yys); + } +#endif + } + if ((yyn = yysindex[yystate]) && (yyn += yychar) >= 0 && + yyn <= YYTABLESIZE && yycheck[yyn] == yychar) + { +#if YYDEBUG + if (yydebug) + printf("%sdebug: state %d, shifting to state %d\n", + YYPREFIX, yystate, yytable[yyn]); +#endif + if (yyssp >= yyss + yystacksize - 1) + { + goto yyoverflow; + } + *++yyssp = yystate = yytable[yyn]; + *++yyvsp = yylval; + yychar = (-1); + if (yyerrflag > 0) --yyerrflag; + goto yyloop; + } + if ((yyn = yyrindex[yystate]) && (yyn += yychar) >= 0 && + yyn <= YYTABLESIZE && yycheck[yyn] == yychar) + { + yyn = yytable[yyn]; + goto yyreduce; + } + if (yyerrflag) goto yyinrecovery; +#ifdef lint + goto yynewerror; +#endif +yynewerror: + yyerror("syntax error"); +#ifdef lint + goto yyerrlab; +#endif +yyerrlab: + ++yynerrs; +yyinrecovery: + if (yyerrflag < 3) + { + yyerrflag = 3; + for (;;) + { + if ((yyn = yysindex[*yyssp]) && (yyn += YYERRCODE) >= 0 && + yyn <= YYTABLESIZE && yycheck[yyn] == YYERRCODE) + { +#if YYDEBUG + if (yydebug) + printf("%sdebug: state %d, error recovery shifting\ + to state %d\n", YYPREFIX, *yyssp, yytable[yyn]); +#endif + if (yyssp >= yyss + yystacksize - 1) + { + goto yyoverflow; + } + *++yyssp = yystate = yytable[yyn]; + *++yyvsp = yylval; + goto yyloop; + } + else + { +#if YYDEBUG + if (yydebug) + printf("%sdebug: error recovery 
discarding state %d\n", + YYPREFIX, *yyssp); +#endif + if (yyssp <= yyss) goto yyabort; + --yyssp; + --yyvsp; + } + } + } + else + { + if (yychar == 0) goto yyabort; +#if YYDEBUG + if (yydebug) + { + yys = 0; + if (yychar <= YYMAXTOKEN) yys = yyname[yychar]; + if (!yys) yys = "illegal-symbol"; + printf("%sdebug: state %d, error recovery discards token %d (%s)\n", + YYPREFIX, yystate, yychar, yys); + } +#endif + yychar = (-1); + goto yyloop; + } +yyreduce: +#if YYDEBUG + if (yydebug) + printf("%sdebug: state %d, reducing by rule %d (%s)\n", + YYPREFIX, yystate, yyn, yyrule[yyn]); +#endif + yym = yylen[yyn]; + yyval = yyvsp[1-yym]; + switch (yyn) + { +case 1: +#line 40 "parser.y" +{ accept = 0; + spec = NULL; } +break; +case 2: +#line 43 "parser.y" +{ spec = spec? mkAlt(spec, yyvsp[0].regexp) : yyvsp[0].regexp; } +break; +case 4: +#line 48 "parser.y" +{ if(yyvsp[-3].symbol->re) + in->fatal("sym already defined"); + yyvsp[-3].symbol->re = yyvsp[-1].regexp; } +break; +case 5: +#line 54 "parser.y" +{ yyval.regexp = new RuleOp(yyvsp[-2].regexp, yyvsp[-1].regexp, yyvsp[0].token, accept++); } +break; +case 6: +#line 58 "parser.y" +{ yyval.regexp = new NullOp; } +break; +case 7: +#line 60 "parser.y" +{ yyval.regexp = yyvsp[0].regexp; } +break; +case 8: +#line 64 "parser.y" +{ yyval.regexp = yyvsp[0].regexp; } +break; +case 9: +#line 66 "parser.y" +{ yyval.regexp = mkAlt(yyvsp[-2].regexp, yyvsp[0].regexp); } +break; +case 10: +#line 70 "parser.y" +{ yyval.regexp = yyvsp[0].regexp; } +break; +case 11: +#line 72 "parser.y" +{ yyval.regexp = mkDiff(yyvsp[-2].regexp, yyvsp[0].regexp); + if(!yyval.regexp) + in->fatal("can only difference char sets"); + } +break; +case 12: +#line 79 "parser.y" +{ yyval.regexp = yyvsp[0].regexp; } +break; +case 13: +#line 81 "parser.y" +{ yyval.regexp = new CatOp(yyvsp[-1].regexp, yyvsp[0].regexp); } +break; +case 14: +#line 85 "parser.y" +{ yyval.regexp = yyvsp[0].regexp; } +break; +case 15: +#line 87 "parser.y" +{ + switch(yyvsp[0].op){ + case 
'*': + yyval.regexp = mkAlt(new CloseOp(yyvsp[-1].regexp), new NullOp()); + break; + case '+': + yyval.regexp = new CloseOp(yyvsp[-1].regexp); + break; + case '?': + yyval.regexp = mkAlt(yyvsp[-1].regexp, new NullOp()); + break; + } + } +break; +case 16: +#line 103 "parser.y" +{ yyval.op = yyvsp[0].op; } +break; +case 17: +#line 105 "parser.y" +{ yyval.op = (yyvsp[-1].op == yyvsp[0].op) ? yyvsp[-1].op : '*'; } +break; +case 18: +#line 109 "parser.y" +{ if(!yyvsp[0].symbol->re) + in->fatal("can't find symbol"); + yyval.regexp = yyvsp[0].symbol->re; } +break; +case 19: +#line 113 "parser.y" +{ yyval.regexp = yyvsp[0].regexp; } +break; +case 20: +#line 115 "parser.y" +{ yyval.regexp = yyvsp[0].regexp; } +break; +case 21: +#line 117 "parser.y" +{ yyval.regexp = yyvsp[-1].regexp; } +break; +#line 476 "y.tab.c" + } + yyssp -= yym; + yystate = *yyssp; + yyvsp -= yym; + yym = yylhs[yyn]; + if (yystate == 0 && yym == 0) + { +#if YYDEBUG + if (yydebug) + printf("%sdebug: after reduction, shifting from state 0 to\ + state %d\n", YYPREFIX, YYFINAL); +#endif + yystate = YYFINAL; + *++yyssp = YYFINAL; + *++yyvsp = yyval; + if (yychar < 0) + { + if ((yychar = yylex()) < 0) yychar = 0; +#if YYDEBUG + if (yydebug) + { + yys = 0; + if (yychar <= YYMAXTOKEN) yys = yyname[yychar]; + if (!yys) yys = "illegal-symbol"; + printf("%sdebug: state %d, reading %d (%s)\n", + YYPREFIX, YYFINAL, yychar, yys); + } +#endif + } + if (yychar == 0) goto yyaccept; + goto yyloop; + } + if ((yyn = yygindex[yym]) && (yyn += yystate) >= 0 && + yyn <= YYTABLESIZE && yycheck[yyn] == yystate) + yystate = yytable[yyn]; + else + yystate = yydgoto[yym]; +#if YYDEBUG + if (yydebug) + printf("%sdebug: after reduction, shifting from state %d \ +to state %d\n", YYPREFIX, *yyssp, yystate); +#endif + if (yyssp >= yyss + yystacksize - 1) + { + goto yyoverflow; + } + *++yyssp = yystate; + *++yyvsp = yyval; + goto yyloop; +yyoverflow: + yyerror("yacc stack overflow"); +yyabort: + return (1); +yyaccept: + return (0); +} 
diff --git a/tools/re2c/bootstrap/re2c.man b/tools/re2c/bootstrap/re2c.man new file mode 100644 index 00000000..dc349221 --- /dev/null +++ b/tools/re2c/bootstrap/re2c.man @@ -0,0 +1,660 @@ + + + +RE2C(1) RE2C(1) + + +NNAAMMEE + re2c - convert regular expressions to C/C++ + + +SSYYNNOOPPSSIISS + rree22cc [--eessbb] _n_a_m_e + + +DDEESSCCRRIIPPTTIIOONN + rree22cc is a preprocessor that generates C-based recognizers + from regular expressions. The input to rree22cc consists of + C/C++ source interleaved with comments of the form //**!!rree22cc + ... **// which contain scanner specifications. In the out- + put these comments are replaced with code that, when exe- + cuted, will find the next input token and then execute + some user-supplied token-specific code. + + For example, given the following code + + #define NULL ((char*) 0) + char *scan(char *p){ + char *q; + #define YYCTYPE char + #define YYCURSOR p + #define YYLIMIT p + #define YYMARKER q + #define YYFILL(n) + /*!re2c + [0-9]+ {return YYCURSOR;} + [\000-\377] {return NULL;} + */ + } + + rree22cc will generate + + /* Generated by re2c on Sat Apr 16 11:40:58 1994 */ + #line 1 "simple.re" + #define NULL ((char*) 0) + char *scan(char *p){ + char *q; + #define YYCTYPE char + #define YYCURSOR p + #define YYLIMIT p + #define YYMARKER q + #define YYFILL(n) + { + YYCTYPE yych; + unsigned int yyaccept; + goto yy0; + yy1: ++YYCURSOR; + yy0: + if((YYLIMIT - YYCURSOR) < 2) YYFILL(2); + yych = *YYCURSOR; + if(yych <= '/') goto yy4; + + + +Version 0.5 8 April 1994 1 + + + + + +RE2C(1) RE2C(1) + + + if(yych >= ':') goto yy4; + yy2: yych = *++YYCURSOR; + goto yy7; + yy3: + #line 10 + {return YYCURSOR;} + yy4: yych = *++YYCURSOR; + yy5: + #line 11 + {return NULL;} + yy6: ++YYCURSOR; + if(YYLIMIT == YYCURSOR) YYFILL(1); + yych = *YYCURSOR; + yy7: if(yych <= '/') goto yy3; + if(yych <= '9') goto yy6; + goto yy3; + } + #line 12 + + } + + +OOPPTTIIOONNSS + rree22cc provides the following options: + + --ee Cross-compile from an ASCII 
platform to an EBCDIC + one. + + --ss Generate nested iiffs for some sswwiittcchhes. Many com- + pilers need this assist to generate better code. + + --bb Implies --ss. Use bit vectors as well in the attempt + to coax better code out of the compiler. Most use- + ful for specifications with more than a few key- + words (e.g. for most programming languages). + + +IINNTTEERRFFAACCEE CCOODDEE + Unlike other scanner generators, rree22cc does not generate + complete scanners: the user must supply some interface + code. In particular, the user must define the following + macros: + + YYYYCCTTYYPPEE Type used to hold an input symbol. Usually cchhaarr or + uunnssiiggnneedd cchhaarr. + + YYYYCCUURRSSOORR + _l-expression of type **YYYYCCTTYYPPEE that points to the + current input symbol. The generated code advances + YYYYCCUURRSSOORR as symbols are matched. On entry, YYYYCCUURR-- + SSOORR is assumed to point to the first character of + the current token. On exit, YYYYCCUURRSSOORR will point to + the first character of the following token. + + + + +Version 0.5 8 April 1994 2 + + + + + +RE2C(1) RE2C(1) + + + YYYYLLIIMMIITT Expression of type **YYYYCCTTYYPPEE that marks the end of + the buffer (YYYYLLIIMMIITT[[--11]] is the last character in the + buffer). The generated code repeatedly compares + YYYYCCUURRSSOORR to YYYYLLIIMMIITT to determine when the buffer + needs (re)filling. + + YYYYMMAARRKKEERR + _l-expression of type **YYYYCCTTYYPPEE. The generated code + saves backtracking information in YYYYMMAARRKKEERR. + + YYYYFFIILLLL((_n)) + The generated code "calls" YYYYFFIILLLL when the buffer + needs (re)filling: at least _n additional charac- + ters should be provided. YYYYFFIILLLL should adjust + YYYYCCUURRSSOORR, YYYYLLIIMMIITT and YYYYMMAARRKKEERR as needed. Note + that for typical programming languages _n will be + the length of the longest keyword plus one.
+ + +SCANNER SPECIFICATIONS + Each scanner specification consists of a set of rules and + name definitions. Rules consist of a regular expression + along with a block of C/C++ code that is to be executed + when the associated regular expression is matched. Name + definitions are of the form ``name = regular expres- + sion;''. + + +SUMMARY OF RE2C REGULAR EXPRESSIONS + "foo" the literal string foo. ANSI-C escape sequences + can be used. + + [xyz] a "character class"; in this case, the regular + expression matches either an 'x', a 'y', or a 'z'. + + [abj-oZ] + a "character class" with a range in it; matches an + 'a', a 'b', any letter from 'j' through 'o', or a + 'Z'. + + r\s match any r which isn't an s. r and s must be regu- + lar expressions which can be expressed as character + classes. + + r* zero or more r's, where r is any regular expression + + r+ one or more r's + + r? zero or one r's (that is, "an optional r") + + name the expansion of the "name" definition (see above) + + (r) an r; parentheses are used to override precedence + (see below) + + + +Version 0.5 8 April 1994 3 + + + + + +RE2C(1) RE2C(1) + + + rs an r followed by an s ("concatenation") + + r|s either an r or an s + + r/s an r but only if it is followed by an s. The s is + not part of the matched text. This type of regular + expression is called "trailing context". + + The regular expressions listed above are grouped according + to precedence, from highest precedence at the top to low- + est at the bottom. Those grouped together have equal + precedence.
+ + +AA LLAARRGGEERR EEXXAAMMPPLLEE + #include + #include + #include + #include + + #define ADDEQ 257 + #define ANDAND 258 + #define ANDEQ 259 + #define ARRAY 260 + #define ASM 261 + #define AUTO 262 + #define BREAK 263 + #define CASE 264 + #define CHAR 265 + #define CONST 266 + #define CONTINUE 267 + #define DECR 268 + #define DEFAULT 269 + #define DEREF 270 + #define DIVEQ 271 + #define DO 272 + #define DOUBLE 273 + #define ELLIPSIS 274 + #define ELSE 275 + #define ENUM 276 + #define EQL 277 + #define EXTERN 278 + #define FCON 279 + #define FLOAT 280 + #define FOR 281 + #define FUNCTION 282 + #define GEQ 283 + #define GOTO 284 + #define ICON 285 + #define ID 286 + #define IF 287 + #define INCR 288 + #define INT 289 + #define LEQ 290 + + + +Version 0.5 8 April 1994 4 + + + + + +RE2C(1) RE2C(1) + + + #define LONG 291 + #define LSHIFT 292 + #define LSHIFTEQ 293 + #define MODEQ 294 + #define MULEQ 295 + #define NEQ 296 + #define OREQ 297 + #define OROR 298 + #define POINTER 299 + #define REGISTER 300 + #define RETURN 301 + #define RSHIFT 302 + #define RSHIFTEQ 303 + #define SCON 304 + #define SHORT 305 + #define SIGNED 306 + #define SIZEOF 307 + #define STATIC 308 + #define STRUCT 309 + #define SUBEQ 310 + #define SWITCH 311 + #define TYPEDEF 312 + #define UNION 313 + #define UNSIGNED 314 + #define VOID 315 + #define VOLATILE 316 + #define WHILE 317 + #define XOREQ 318 + #define EOI 319 + + typedef unsigned int uint; + typedef unsigned char uchar; + + #define BSIZE 8192 + + #define YYCTYPE uchar + #define YYCURSOR cursor + #define YYLIMIT s->lim + #define YYMARKER s->ptr + #define YYFILL(n) {cursor = fill(s, cursor);} + + #define RET(i) {s->cur = cursor; return i;} + + typedef struct Scanner { + int fd; + uchar *bot, *tok, *ptr, *cur, *pos, *lim, *top, *eof; + uint line; + } Scanner; + + uchar *fill(Scanner *s, uchar *cursor){ + if(!s->eof){ + uint cnt = s->tok - s->bot; + if(cnt){ + memcpy(s->bot, s->tok, s->lim - s->tok); + + + +Version 0.5 8 April 1994 5 + + + + + 
+RE2C(1) RE2C(1) + + + s->tok = s->bot; + s->ptr -= cnt; + cursor -= cnt; + s->pos -= cnt; + s->lim -= cnt; + } + if((s->top - s->lim) < BSIZE){ + uchar *buf = (uchar*) + malloc(((s->lim - s->bot) + BSIZE)*sizeof(uchar)); + memcpy(buf, s->tok, s->lim - s->tok); + s->tok = buf; + s->ptr = &buf[s->ptr - s->bot]; + cursor = &buf[cursor - s->bot]; + s->pos = &buf[s->pos - s->bot]; + s->lim = &buf[s->lim - s->bot]; + s->top = &s->lim[BSIZE]; + free(s->bot); + s->bot = buf; + } + if((cnt = read(s->fd, (char*) s->lim, BSIZE)) != BSIZE){ + s->eof = &s->lim[cnt]; *(s->eof)++ = '\n'; + } + s->lim += cnt; + } + return cursor; + } + + int scan(Scanner *s){ + uchar *cursor = s->cur; + std: + s->tok = cursor; + /*!re2c + any = [\000-\377]; + O = [0-7]; + D = [0-9]; + L = [a-zA-Z_]; + H = [a-fA-F0-9]; + E = [Ee] [+-]? D+; + FS = [fFlL]; + IS = [uUlL]*; + ESC = [\\] ([abfnrtv?'"\\] | "x" H+ | O+); + */ + + /*!re2c + "/*" { goto comment; } + + "auto" { RET(AUTO); } + "break" { RET(BREAK); } + "case" { RET(CASE); } + "char" { RET(CHAR); } + "const" { RET(CONST); } + "continue" { RET(CONTINUE); } + "default" { RET(DEFAULT); } + "do" { RET(DO); } + + + +Version 0.5 8 April 1994 6 + + + + + +RE2C(1) RE2C(1) + + + "double" { RET(DOUBLE); } + "else" { RET(ELSE); } + "enum" { RET(ENUM); } + "extern" { RET(EXTERN); } + "float" { RET(FLOAT); } + "for" { RET(FOR); } + "goto" { RET(GOTO); } + "if" { RET(IF); } + "int" { RET(INT); } + "long" { RET(LONG); } + "register" { RET(REGISTER); } + "return" { RET(RETURN); } + "short" { RET(SHORT); } + "signed" { RET(SIGNED); } + "sizeof" { RET(SIZEOF); } + "static" { RET(STATIC); } + "struct" { RET(STRUCT); } + "switch" { RET(SWITCH); } + "typedef" { RET(TYPEDEF); } + "union" { RET(UNION); } + "unsigned" { RET(UNSIGNED); } + "void" { RET(VOID); } + "volatile" { RET(VOLATILE); } + "while" { RET(WHILE); } + + L (L|D)* { RET(ID); } + + ("0" [xX] H+ IS?) | ("0" D+ IS?) | (D+ IS?) | + (['] (ESC|any\[\n\\'])* [']) + { RET(ICON); } + + (D+ E FS?) | (D* "." 
D+ E? FS?) | (D+ "." D* E? FS?) + { RET(FCON); } + + (["] (ESC|any\[\n\\"])* ["]) + { RET(SCON); } + + "..." { RET(ELLIPSIS); } + ">>=" { RET(RSHIFTEQ); } + "<<=" { RET(LSHIFTEQ); } + "+=" { RET(ADDEQ); } + "-=" { RET(SUBEQ); } + "*=" { RET(MULEQ); } + "/=" { RET(DIVEQ); } + "%=" { RET(MODEQ); } + "&=" { RET(ANDEQ); } + "^=" { RET(XOREQ); } + "|=" { RET(OREQ); } + ">>" { RET(RSHIFT); } + "<<" { RET(LSHIFT); } + "++" { RET(INCR); } + "--" { RET(DECR); } + "->" { RET(DEREF); } + "&&" { RET(ANDAND); } + + + +Version 0.5 8 April 1994 7 + + + + + +RE2C(1) RE2C(1) + + + "||" { RET(OROR); } + "<=" { RET(LEQ); } + ">=" { RET(GEQ); } + "==" { RET(EQL); } + "!=" { RET(NEQ); } + ";" { RET(';'); } + "{" { RET('{'); } + "}" { RET('}'); } + "," { RET(','); } + ":" { RET(':'); } + "=" { RET('='); } + "(" { RET('('); } + ")" { RET(')'); } + "[" { RET('['); } + "]" { RET(']'); } + "." { RET('.'); } + "&" { RET('&'); } + "!" { RET('!'); } + "~" { RET('~'); } + "-" { RET('-'); } + "+" { RET('+'); } + "*" { RET('*'); } + "/" { RET('/'); } + "%" { RET('%'); } + "<" { RET('<'); } + ">" { RET('>'); } + "^" { RET('^'); } + "|" { RET('|'); } + "?" { RET('?'); } + + + [ \t\v\f]+ { goto std; } + + "\n" + { + if(cursor == s->eof) RET(EOI); + s->pos = cursor; s->line++; + goto std; + } + + any + { + printf("unexpected character: %c\n", *s->tok); + goto std; + } + */ + + comment: + /*!re2c + "*/" { goto std; } + "\n" + { + if(cursor == s->eof) RET(EOI); + s->tok = s->pos = cursor; s->line++; + + + +Version 0.5 8 April 1994 8 + + + + + +RE2C(1) RE2C(1) + + + goto comment; + } + any { goto comment; } + */ + } + + main(){ + Scanner in; + int t; + memset((char*) &in, 0, sizeof(in)); + in.fd = 0; + while((t = scan(&in)) != EOI){ + /* + printf("%d\t%.*s\n", t, in.cur - in.tok, in.tok); + printf("%d\n", t); + */ + } + close(in.fd); + } + + +SSEEEE AALLSSOO + flex(1), lex(1). 
+ + +FEATURES + re2c does not provide a default action: the generated code + assumes that the input will consist of a sequence of + tokens. Typically this can be dealt with by adding a rule + such as the one for unexpected characters in the example + above. + + The user must arrange for a sentinel token to appear at + the end of input (and provide a rule for matching it): + re2c does not provide an <<EOF>> expression. If the + source is from a null-byte terminated string, a rule + matching a null character will suffice. If the source is + from a file then the approach taken in the example can be + used: pad the input with a newline (or some other charac- + ter that can't appear within another token); upon recog- + nizing such a character check to see if it is the sentinel + and act accordingly. + + re2c does not provide start conditions: use a separate + scanner specification for each start condition (as illus- + trated in the above example). + + No [^x]. Use difference instead. + +BUGS + Only fixed length trailing context can be handled. + + The maximum value appearing as a parameter n to YYFILL is + not provided to the generated code (this value is needed + + + +Version 0.5 8 April 1994 9 + + + + + +RE2C(1) RE2C(1) + + + for constructing the interface code). Note that this + value is usually relatively small: for typical programming + languages n will be the length of the longest keyword plus + one. + + Difference only works for character sets. + + The re2c internal algorithms need documentation.
+ + +AAUUTTHHOORR + Please send bug reports, fixes and feedback to: + + Peter Bumbulis + Computer Systems Group + University of Waterloo + Waterloo, Ontario + N2L 3G1 + Internet: peterr@csg.uwaterloo.ca + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Version 0.5 8 April 1994 10 + + diff --git a/tools/re2c/bootstrap/scanner.cc b/tools/re2c/bootstrap/scanner.cc new file mode 100644 index 00000000..19b42597 --- /dev/null +++ b/tools/re2c/bootstrap/scanner.cc @@ -0,0 +1,470 @@ +/* Generated by re2c 0.5 on Sat May 15 11:35:52 1999 */ +#line 1 "scanner.re" +#include +#include +#include +#include +#include "scanner.h" +#include "parser.h" +#include "y.tab.h" + +extern YYSTYPE yylval; + +#define BSIZE 8192 + +#define YYCTYPE uchar +#define YYCURSOR cursor +#define YYLIMIT lim +#define YYMARKER ptr +#define YYFILL(n) {cursor = fill(cursor);} + +#define RETURN(i) {cur = cursor; return i;} + + +Scanner::Scanner(int i) : in(i), + bot(NULL), tok(NULL), ptr(NULL), cur(NULL), pos(NULL), lim(NULL), + top(NULL), eof(NULL), tchar(0), tline(0), cline(1) { + ; +} + +uchar *Scanner::fill(uchar *cursor){ + if(!eof){ + uint cnt = tok - bot; + if(cnt){ + memcpy(bot, tok, lim - tok); + tok = bot; + ptr -= cnt; + cursor -= cnt; + pos -= cnt; + lim -= cnt; + } + if((top - lim) < BSIZE){ + uchar *buf = new uchar[(lim - bot) + BSIZE]; + memcpy(buf, tok, lim - tok); + tok = buf; + ptr = &buf[ptr - bot]; + cursor = &buf[cursor - bot]; + pos = &buf[pos - bot]; + lim = &buf[lim - bot]; + top = &lim[BSIZE]; + delete [] bot; + bot = buf; + } + if((cnt = read(in, (char*) lim, BSIZE)) != BSIZE){ + eof = &lim[cnt]; *eof++ = '\n'; + } + lim += cnt; + } + return cursor; +} + +#line 68 + + +int Scanner::echo(ostream &out){ + uchar *cursor = cur; + tok = cursor; +echo: +{ + YYCTYPE yych; + unsigned int yyaccept; + goto yy0; +yy1: ++YYCURSOR; +yy0: + if((YYLIMIT - YYCURSOR) < 7) YYFILL(7); + yych = *YYCURSOR; + if(yych == '\n') goto yy4; + if(yych != '/') goto yy6; +yy2: 
yyaccept = 0; + yych = *(YYMARKER = ++YYCURSOR); + if(yych == '*') goto yy7; +yy3: +#line 82 + { goto echo; } +yy4: yych = *++YYCURSOR; +yy5: +#line 78 + { if(cursor == eof) RETURN(0); + out.write(tok, cursor - tok); + tok = pos = cursor; cline++; + goto echo; } +yy6: yych = *++YYCURSOR; + goto yy3; +yy7: yych = *++YYCURSOR; + if(yych == '!') goto yy9; +yy8: YYCURSOR = YYMARKER; + switch(yyaccept){ + case 0: goto yy3; + } +yy9: yych = *++YYCURSOR; + if(yych != 'r') goto yy8; +yy10: yych = *++YYCURSOR; + if(yych != 'e') goto yy8; +yy11: yych = *++YYCURSOR; + if(yych != '2') goto yy8; +yy12: yych = *++YYCURSOR; + if(yych != 'c') goto yy8; +yy13: yych = *++YYCURSOR; +yy14: +#line 75 + { out.write(tok, &cursor[-7] - tok); + tok = cursor; + RETURN(1); } +} +#line 83 + +} + + +int Scanner::scan(){ + uchar *cursor = cur; + uint depth; + +scan: + tchar = cursor - pos; + tline = cline; + tok = cursor; +{ + YYCTYPE yych; + unsigned int yyaccept; + goto yy15; +yy16: ++YYCURSOR; +yy15: + if((YYLIMIT - YYCURSOR) < 2) YYFILL(2); + yych = *YYCURSOR; + if(yych <= ':'){ + if(yych <= '"'){ + if(yych <= '\n'){ + if(yych <= '\b') goto yy35; + if(yych <= '\t') goto yy31; + goto yy33; + } else { + if(yych == ' ') goto yy31; + if(yych <= '!') goto yy35; + goto yy23; + } + } else { + if(yych <= '*'){ + if(yych <= '\'') goto yy35; + if(yych <= ')') goto yy27; + goto yy21; + } else { + if(yych <= '+') goto yy28; + if(yych == '/') goto yy19; + goto yy35; + } + } + } else { + if(yych <= 'Z'){ + if(yych <= '='){ + if(yych == '<') goto yy35; + goto yy27; + } else { + if(yych == '?') goto yy28; + if(yych <= '@') goto yy35; + goto yy29; + } + } else { + if(yych <= '`'){ + if(yych <= '[') goto yy25; + if(yych <= '\\') goto yy27; + goto yy35; + } else { + if(yych <= 'z') goto yy29; + if(yych <= '{') goto yy17; + if(yych <= '|') goto yy27; + goto yy35; + } + } + } +yy17: yych = *++YYCURSOR; +yy18: +#line 96 + { depth = 1; + goto code; + } +yy19: yych = *++YYCURSOR; + if(yych == '*') goto yy54; 
+yy20: +#line 115 + { RETURN(*tok); } +yy21: yych = *++YYCURSOR; + if(yych == '/') goto yy52; +yy22: +#line 117 + { yylval.op = *tok; + RETURN(CLOSE); } +yy23: yyaccept = 0; + yych = *(YYMARKER = ++YYCURSOR); + if(yych != '\n') goto yy48; +yy24: +#line 108 + { fatal("bad string"); } +yy25: yyaccept = 1; + yych = *(YYMARKER = ++YYCURSOR); + if(yych != '\n') goto yy42; +yy26: +#line 113 + { fatal("bad character constant"); } +yy27: yych = *++YYCURSOR; + goto yy20; +yy28: yych = *++YYCURSOR; + goto yy22; +yy29: yych = *++YYCURSOR; + goto yy40; +yy30: +#line 120 + { cur = cursor; + yylval.symbol = Symbol::find(token()); + return ID; } +yy31: yych = *++YYCURSOR; + goto yy38; +yy32: +#line 124 + { goto scan; } +yy33: yych = *++YYCURSOR; +yy34: +#line 126 + { if(cursor == eof) RETURN(0); + pos = cursor; cline++; + goto scan; + } +yy35: yych = *++YYCURSOR; +yy36: +#line 131 + { cerr << "unexpected character: " << *tok << endl; + goto scan; + } +yy37: ++YYCURSOR; + if(YYLIMIT == YYCURSOR) YYFILL(1); + yych = *YYCURSOR; +yy38: if(yych == '\t') goto yy37; + if(yych == ' ') goto yy37; + goto yy32; +yy39: ++YYCURSOR; + if(YYLIMIT == YYCURSOR) YYFILL(1); + yych = *YYCURSOR; +yy40: if(yych <= '@'){ + if(yych <= '/') goto yy30; + if(yych <= '9') goto yy39; + goto yy30; + } else { + if(yych <= 'Z') goto yy39; + if(yych <= '`') goto yy30; + if(yych <= 'z') goto yy39; + goto yy30; + } +yy41: ++YYCURSOR; + if(YYLIMIT == YYCURSOR) YYFILL(1); + yych = *YYCURSOR; +yy42: if(yych <= '['){ + if(yych != '\n') goto yy41; + } else { + if(yych <= '\\') goto yy44; + if(yych <= ']') goto yy45; + goto yy41; + } +yy43: YYCURSOR = YYMARKER; + switch(yyaccept){ + case 0: goto yy24; + case 1: goto yy26; + } +yy44: ++YYCURSOR; + if(YYLIMIT == YYCURSOR) YYFILL(1); + yych = *YYCURSOR; + if(yych == '\n') goto yy43; + goto yy41; +yy45: yych = *++YYCURSOR; +yy46: +#line 110 + { cur = cursor; + yylval.regexp = ranToRE(token()); + return RANGE; } +yy47: ++YYCURSOR; + if(YYLIMIT == YYCURSOR) YYFILL(1); + yych 
= *YYCURSOR; +yy48: if(yych <= '!'){ + if(yych == '\n') goto yy43; + goto yy47; + } else { + if(yych <= '"') goto yy50; + if(yych != '\\') goto yy47; + } +yy49: ++YYCURSOR; + if(YYLIMIT == YYCURSOR) YYFILL(1); + yych = *YYCURSOR; + if(yych == '\n') goto yy43; + goto yy47; +yy50: yych = *++YYCURSOR; +yy51: +#line 105 + { cur = cursor; + yylval.regexp = strToRE(token()); + return STRING; } +yy52: yych = *++YYCURSOR; +yy53: +#line 102 + { tok = cursor; + RETURN(0); } +yy54: yych = *++YYCURSOR; +yy55: +#line 99 + { depth = 1; + goto comment; } +} +#line 134 + + +code: +{ + YYCTYPE yych; + unsigned int yyaccept; + goto yy56; +yy57: ++YYCURSOR; +yy56: + if((YYLIMIT - YYCURSOR) < 2) YYFILL(2); + yych = *YYCURSOR; + if(yych <= '&'){ + if(yych <= '\n'){ + if(yych <= '\t') goto yy64; + goto yy62; + } else { + if(yych == '"') goto yy66; + goto yy64; + } + } else { + if(yych <= '{'){ + if(yych <= '\'') goto yy67; + if(yych <= 'z') goto yy64; + goto yy60; + } else { + if(yych != '}') goto yy64; + } + } +yy58: yych = *++YYCURSOR; +yy59: +#line 138 + { if(--depth == 0){ + cur = cursor; + yylval.token = new Token(token(), tline); + return CODE; + } + goto code; } +yy60: yych = *++YYCURSOR; +yy61: +#line 144 + { ++depth; + goto code; } +yy62: yych = *++YYCURSOR; +yy63: +#line 146 + { if(cursor == eof) fatal("missing '}'"); + pos = cursor; cline++; + goto code; + } +yy64: yych = *++YYCURSOR; +yy65: +#line 150 + { goto code; } +yy66: yyaccept = 0; + yych = *(YYMARKER = ++YYCURSOR); + if(yych == '\n') goto yy65; + goto yy73; +yy67: yyaccept = 0; + yych = *(YYMARKER = ++YYCURSOR); + if(yych == '\n') goto yy65; + goto yy69; +yy68: ++YYCURSOR; + if(YYLIMIT == YYCURSOR) YYFILL(1); + yych = *YYCURSOR; +yy69: if(yych <= '&'){ + if(yych != '\n') goto yy68; + } else { + if(yych <= '\'') goto yy64; + if(yych == '\\') goto yy71; + goto yy68; + } +yy70: YYCURSOR = YYMARKER; + switch(yyaccept){ + case 0: goto yy65; + } +yy71: ++YYCURSOR; + if(YYLIMIT == YYCURSOR) YYFILL(1); + yych = *YYCURSOR; + 
if(yych == '\n') goto yy70; + goto yy68; +yy72: ++YYCURSOR; + if(YYLIMIT == YYCURSOR) YYFILL(1); + yych = *YYCURSOR; +yy73: if(yych <= '!'){ + if(yych == '\n') goto yy70; + goto yy72; + } else { + if(yych <= '"') goto yy64; + if(yych != '\\') goto yy72; + } +yy74: ++YYCURSOR; + if(YYLIMIT == YYCURSOR) YYFILL(1); + yych = *YYCURSOR; + if(yych == '\n') goto yy70; + goto yy72; +} +#line 151 + + +comment: +{ + YYCTYPE yych; + unsigned int yyaccept; + goto yy75; +yy76: ++YYCURSOR; +yy75: + if((YYLIMIT - YYCURSOR) < 2) YYFILL(2); + yych = *YYCURSOR; + if(yych <= ')'){ + if(yych == '\n') goto yy80; + goto yy82; + } else { + if(yych <= '*') goto yy77; + if(yych == '/') goto yy79; + goto yy82; + } +yy77: yych = *++YYCURSOR; + if(yych == '/') goto yy85; +yy78: +#line 165 + { goto comment; } +yy79: yych = *++YYCURSOR; + if(yych == '*') goto yy83; + goto yy78; +yy80: yych = *++YYCURSOR; +yy81: +#line 161 + { if(cursor == eof) RETURN(0); + tok = pos = cursor; cline++; + goto comment; + } +yy82: yych = *++YYCURSOR; + goto yy78; +yy83: yych = *++YYCURSOR; +yy84: +#line 159 + { ++depth; + goto comment; } +yy85: yych = *++YYCURSOR; +yy86: +#line 155 + { if(--depth == 0) + goto scan; + else + goto comment; } +} +#line 166 + +} + +void Scanner::fatal(char *msg){ + cerr << "line " << tline << ", column " << (tchar + 1) << ": " + << msg << endl; + exit(1); +} diff --git a/tools/re2c/bootstrap/y.tab.h b/tools/re2c/bootstrap/y.tab.h new file mode 100644 index 00000000..d7b3702d --- /dev/null +++ b/tools/re2c/bootstrap/y.tab.h @@ -0,0 +1,12 @@ +#define CLOSE 257 +#define ID 258 +#define CODE 259 +#define RANGE 260 +#define STRING 261 +typedef union { + Symbol *symbol; + RegExp *regexp; + Token *token; + char op; +} YYSTYPE; +extern YYSTYPE yylval; diff --git a/tools/re2c/code.cc b/tools/re2c/code.cc new file mode 100644 index 00000000..8aaf6a88 --- /dev/null +++ b/tools/re2c/code.cc @@ -0,0 +1,665 @@ +#include +#include +#include +#include +#include "substr.h" +#include "globals.h" 
+#include "dfa.h" + +// there must be at least one span in list; all spans must cover +// same range + +void Go::compact(){ + // arrange so that adjacent spans have different targets + uint i = 0; + for(uint j = 1; j < nSpans; ++j){ + if(span[j].to != span[i].to){ + ++i; span[i].to = span[j].to; + } + span[i].ub = span[j].ub; + } + nSpans = i + 1; +} + +void Go::unmap(Go *base, State *x){ + Span *s = span, *b = base->span, *e = &b[base->nSpans]; + uint lb = 0; + s->ub = 0; + s->to = NULL; + for(; b != e; ++b){ + if(b->to == x){ + if((s->ub - lb) > 1) + s->ub = b->ub; + } else { + if(b->to != s->to){ + if(s->ub){ + lb = s->ub; ++s; + } + s->to = b->to; + } + s->ub = b->ub; + } + } + s->ub = e[-1].ub; ++s; + nSpans = s - span; +} + +void doGen(Go *g, State *s, uchar *bm, uchar m){ +Span *b = g->span, *e = &b[g->nSpans]; +uint lb = 0; +for(; b < e; ++b){ + if(b->to == s) + for(; lb < b->ub; ++lb) bm[lb] |= m; + lb = b->ub; +} +} + +void prt(ostream& o, Go *g, State *s){ +Span *b = g->span, *e = &b[g->nSpans]; +uint lb = 0; +for(; b < e; ++b){ + if(b->to == s) + printSpan(o, lb, b->ub); + lb = b->ub; +} +} + +bool matches(Go *g1, State *s1, Go *g2, State *s2){ +Span *b1 = g1->span, *e1 = &b1[g1->nSpans]; +uint lb1 = 0; +Span *b2 = g2->span, *e2 = &b2[g2->nSpans]; +uint lb2 = 0; +for(;;){ + for(; b1 < e1 && b1->to != s1; ++b1) lb1 = b1->ub; + for(; b2 < e2 && b2->to != s2; ++b2) lb2 = b2->ub; + if(b1 == e1) return b2 == e2; + if(b2 == e2) return false; + if(lb1 != lb2 || b1->ub != b2->ub) return false; + ++b1; ++b2; +} +} + +class BitMap { +public: +static BitMap *first; +Go *go; +State *on; +BitMap *next; +uint i; +uchar m; +public: +static BitMap *find(Go*, State*); +static BitMap *find(State*); +static void gen(ostream&, uint, uint); +static void stats(); +BitMap(Go*, State*); +}; + +BitMap *BitMap::first = NULL; + +BitMap::BitMap(Go *g, State *x) : go(g), on(x), next(first) { +first = this; +} + +BitMap *BitMap::find(Go *g, State *x){ +for(BitMap *b = first; b; b = 
b->next){ + if(matches(b->go, b->on, g, x)) + return b; + } + return new BitMap(g, x); +} + +// Return the first registered bitmap whose `on` state is x, or NULL if there is none. +BitMap *BitMap::find(State *x){ + for(BitMap *b = first; b; b = b->next){ + if(b->on == x) + return b; + } + return NULL; +} + +// Emit the yybm[] table. Bitmaps are packed eight to a row: every row holds +// (ub - lb) bytes and each bitmap in the row owns one bit, 0x80 down to 0x01. +// Each bitmap records its row offset in `i` and its bit in `m`. +void BitMap::gen(ostream &o, uint lb, uint ub){ + BitMap *b = first; + if(b){ + o << "\tstatic unsigned char yybm[] = {"; + uint n = ub - lb; + uchar *bm = new uchar[n]; + for(uint i = 0; b; i += n){ + // clear the scratch row for every group so it does not inherit the bits of the previous rows + memset(bm, 0, n); + for(uchar m = 0x80; b && m; b = b->next, m >>= 1){ + b->i = i; b->m = m; + doGen(b->go, b->on, bm-lb, m); + } + for(uint j = 0; j < n; ++j){ + if(j%8 == 0) o << "\n\t"; + o << setw(3) << (uint) bm[j] << ", "; + } + } + o << "\n\t};\n"; + // the scratch row is only needed while the table is being written + delete [] bm; + } +} + +// Debugging aid: print the spans of every bitmap to cerr followed by the bitmap count, then reset the list head. +void BitMap::stats(){ + uint n = 0; + for(BitMap *b = first; b; b = b->next){ +prt(cerr, b->go, b->on); cerr << endl; + ++n; + } + cerr << n << " bitmaps\n"; + first = NULL; +} + +// Emit "goto yyN;" where N is the label of `to`. +void genGoTo(ostream &o, State *to){ + o << "\tgoto yy" << to->label << ";\n"; +} + +// Emit the head of an if statement comparing yych with character v using operator text cmp; no newline is written. +void genIf(ostream &o, char *cmp, uint v){ + o << "\tif(yych " << cmp << " '"; + prtCh(o, v); + o << "')"; +} + +// Emit i tab characters. +void indent(ostream &o, uint i){ + while(i-- > 0) + o << "\t"; +} + +// Emit the refill check for n characters (YYFILL(n) when fewer remain) followed by the load of yych. +static void need(ostream &o, uint n){ + if(n == 1) + o << "\tif(YYLIMIT == YYCURSOR) YYFILL(1);\n"; + else + o << "\tif((YYLIMIT - YYCURSOR) < " << n << ") YYFILL(" << n << ");\n"; + o << "\tyych = *YYCURSOR;\n"; +} + +// Emit the cursor advance; states with a non-NULL link also get the refill check for state->depth characters. +void Match::emit(ostream &o){ + if(state->link){ + o << "\t++YYCURSOR;\n"; + need(o, state->depth); + } else { + o << "\tyych = *++YYCURSOR;\n"; + } +} + +// Same as Match::emit, but also emits the entry label "yy<label>:" after the cursor advance. +void Enter::emit(ostream &o){ + if(state->link){ + o << "\t++YYCURSOR;\n"; + o << "yy" << label << ":\n"; + need(o, state->depth); + } else { + o << "\tyych = *++YYCURSOR;\n"; + o << "yy" << label << ":\n"; + } +} + +// Emit the yyaccept assignment for this save point, then advance the cursor while recording it in YYMARKER. +void Save::emit(ostream &o){ + o << "\tyyaccept = " << selector << ";\n"; + if(state->link){ + o << "\tYYMARKER = ++YYCURSOR;\n"; + need(o, state->depth); + } else { + o << "\tyych = *(YYMARKER = ++YYCURSOR);\n"; + } +} + +Move::Move(State *s) : Action(s) { + ; +} + +void
Move::emit(ostream &o){ + ; +} + +Accept::Accept(State *x, uint n, uint *s, State **r) + : Action(x), nRules(n), saves(s), rules(r){ + ; +} + +void Accept::emit(ostream &o){ + bool first = true; + for(uint i = 0; i < nRules; ++i) + if(saves[i] != ~0u){ + if(first){ + first = false; + o << "\tYYCURSOR = YYMARKER;\n"; + o << "\tswitch(yyaccept){\n"; + } + o << "\tcase " << saves[i] << ":"; + genGoTo(o, rules[i]); + } + if(!first) + o << "\t}\n"; +} + +Rule::Rule(State *s, RuleOp *r) : Action(s), rule(r) { + ; +} + +void Rule::emit(ostream &o){ + uint back = rule->ctx->fixedLength(); + if(back != ~0u && back > 0u) + o << "\tYYCURSOR -= " << back << ";"; + o << "\n#line " << rule->code->line + << "\n\t" << rule->code->text << "\n"; +} + +void doLinear(ostream &o, uint i, Span *s, uint n, State *next){ + for(;;){ + State *bg = s[0].to; + while(n >= 3 && s[2].to == bg && (s[1].ub - s[0].ub) == 1){ + if(s[1].to == next && n == 3){ + indent(o, i); genIf(o, "!=", s[0].ub); genGoTo(o, bg); + return; + } else { + indent(o, i); genIf(o, "==", s[0].ub); genGoTo(o, s[1].to); + } + n -= 2; s += 2; + } + if(n == 1){ + if(bg != next){ + indent(o, i); genGoTo(o, s[0].to); + } + return; + } else if(n == 2 && bg == next){ + indent(o, i); genIf(o, ">=", s[0].ub); genGoTo(o, s[1].to); + return; + } else { + indent(o, i); genIf(o, "<=", s[0].ub - 1); genGoTo(o, bg); + n -= 1; s += 1; + } + } +} + +void Go::genLinear(ostream &o, State *next){ + doLinear(o, 0, span, nSpans, next); +} + +void genCases(ostream &o, uint lb, Span *s){ + if(lb < s->ub){ + for(;;){ + o << "\tcase '"; prtCh(o, lb); o << "':"; + if(++lb == s->ub) + break; + o << "\n"; + } + } +} + +void Go::genSwitch(ostream &o, State *next){ + if(nSpans <= 2){ + genLinear(o, next); + } else { + State *def = span[nSpans-1].to; + Span **sP = new Span*[nSpans-1], **r, **s, **t; + + t = &sP[0]; + for(uint i = 0; i < nSpans; ++i) + if(span[i].to != def) + *(t++) = &span[i]; + + o << "\tswitch(yych){\n"; + while(t != &sP[0]){ + r = s = 
&sP[0]; + if(*s == &span[0]) + genCases(o, 0, *s); + else + genCases(o, (*s)[-1].ub, *s); + State *to = (*s)->to; + while(++s < t){ + if((*s)->to == to) + genCases(o, (*s)[-1].ub, *s); + else + *(r++) = *s; + } + genGoTo(o, to); + t = r; + } + o << "\tdefault:"; + genGoTo(o, def); + o << "\t}\n"; + + delete [] sP; + } +} + +void doBinary(ostream &o, uint i, Span *s, uint n, State *next){ + if(n <= 4){ + doLinear(o, i, s, n, next); + } else { + uint h = n/2; + indent(o, i); genIf(o, "<=", s[h-1].ub - 1); o << "{\n"; + doBinary(o, i+1, &s[0], h, next); + indent(o, i); o << "\t} else {\n"; + doBinary(o, i+1, &s[h], n - h, next); + indent(o, i); o << "\t}\n"; + } +} + +void Go::genBinary(ostream &o, State *next){ + doBinary(o, 0, span, nSpans, next); +} + +void Go::genBase(ostream &o, State *next){ + if(nSpans == 0) + return; + if(!sFlag){ + genSwitch(o, next); + return; + } + if(nSpans > 8){ + Span *bot = &span[0], *top = &span[nSpans-1]; + uint util; + if(bot[0].to == top[0].to){ + util = (top[-1].ub - bot[0].ub)/(nSpans - 2); + } else { + if(bot[0].ub > (top[0].ub - top[-1].ub)){ + util = (top[0].ub - bot[0].ub)/(nSpans - 1); + } else { + util = top[-1].ub/(nSpans - 1); + } + } + if(util <= 2){ + genSwitch(o, next); + return; + } + } + if(nSpans > 5){ + genBinary(o, next); + } else { + genLinear(o, next); + } +} + +void Go::genGoto(ostream &o, State *next){ + if(bFlag){ + for(uint i = 0; i < nSpans; ++i){ + State *to = span[i].to; + if(to && to->isBase){ + BitMap *b = BitMap::find(to); + if(b && matches(b->go, b->on, this, to)){ + Go go; + go.span = new Span[nSpans]; + go.unmap(this, to); + o << "\tif(yybm[" << b->i << "+yych] & " << (uint) b->m << ")"; + genGoTo(o, to); + go.genBase(o, next); + delete [] go.span; + return; + } + } + } + } + genBase(o, next); +} + +void State::emit(ostream &o){ + o << "yy" << label << ":"; + action->emit(o); +} + +uint merge(Span *x0, State *fg, State *bg){ + Span *x = x0, *f = fg->go.span, *b = bg->go.span; + uint nf = 
fg->go.nSpans, nb = bg->go.nSpans; + State *prev = NULL, *to; + // NB: we assume both spans are for same range + for(;;){ + if(f->ub == b->ub){ + to = f->to == b->to? bg : f->to; + if(to == prev){ + --x; + } else { + x->to = prev = to; + } + x->ub = f->ub; + ++x; ++f; --nf; ++b; --nb; + if(nf == 0 && nb == 0) + return x - x0; + } + while(f->ub < b->ub){ + to = f->to == b->to? bg : f->to; + if(to == prev){ + --x; + } else { + x->to = prev = to; + } + x->ub = f->ub; + ++x; ++f; --nf; + } + while(b->ub < f->ub){ + to = b->to == f->to? bg : f->to; + if(to == prev){ + --x; + } else { + x->to = prev = to; + } + x->ub = b->ub; + ++x; ++b; --nb; + } + } +} + +const uint cInfinity = ~0; + +class SCC { +public: + State **top, **stk; +public: + SCC(uint); + ~SCC(); + void traverse(State*); +}; + +SCC::SCC(uint size){ + top = stk = new State*[size]; +} + +SCC::~SCC(){ + delete [] stk; +} + +void SCC::traverse(State *x){ + *top = x; + uint k = ++top - stk; + x->depth = k; + for(uint i = 0; i < x->go.nSpans; ++i){ + State *y = x->go.span[i].to; + if(y){ + if(y->depth == 0) + traverse(y); + if(y->depth < x->depth) + x->depth = y->depth; + } + } + if(x->depth == k) + do { + (*--top)->depth = cInfinity; + (*top)->link = x; + } while(*top != x); +} + +uint maxDist(State *s){ + uint mm = 0; + for(uint i = 0; i < s->go.nSpans; ++i){ + State *t = s->go.span[i].to; + if(t){ + uint m = 1; + if(!t->link) + m += maxDist(t); + if(m > mm) + mm = m; + } + } + return mm; +} + +void calcDepth(State *head){ + State *t; + for(State *s = head; s; s = s->next){ + if(s->link == s){ + for(uint i = 0; i < s->go.nSpans; ++i){ + t = s->go.span[i].to; + if(t && t->link == s) + goto inSCC; + } + s->link = NULL; + } else { + inSCC: + s->depth = maxDist(s); + } + } +} + +void DFA::findSCCs(){ + SCC scc(nStates); + State *s; + + for(s = head; s; s = s->next){ + s->depth = 0; + s->link = NULL; + } + + for(s = head; s; s = s->next) + if(!s->depth) + scc.traverse(s); + + calcDepth(head); +} + +void 
DFA::split(State *s){ + State *move = new State; + (void) new Move(move); + addState(&s->next, move); + move->link = s->link; + move->rule = s->rule; + move->go = s->go; + s->rule = NULL; + s->go.nSpans = 1; + s->go.span = new Span[1]; + s->go.span[0].ub = ubChar; + s->go.span[0].to = move; +} + +void DFA::emit(ostream &o){ + static uint label = 0; + State *s; + uint i; + + findSCCs(); + head->link = head; + head->depth = maxDist(head); + + uint nRules = 0; + for(s = head; s; s = s->next) + if(s->rule && s->rule->accept >= nRules) + nRules = s->rule->accept + 1; + + uint nSaves = 0; + uint *saves = new uint[nRules]; + memset(saves, ~0, (nRules)*sizeof(*saves)); + + // mark backtracking points + for(s = head; s; s = s->next){ + RuleOp *ignore = NULL; + if(s->rule){ + for(i = 0; i < s->go.nSpans; ++i) + if(s->go.span[i].to && !s->go.span[i].to->rule){ + delete s->action; + if(saves[s->rule->accept] == ~0u) + saves[s->rule->accept] = nSaves++; + (void) new Save(s, saves[s->rule->accept]); + continue; + } + ignore = s->rule; + } + } + + // insert actions + State **rules = new State*[nRules]; + memset(rules, 0, (nRules)*sizeof(*rules)); + State *accept = NULL; + for(s = head; s; s = s->next){ + State *ow; + if(!s->rule){ + ow = accept; + } else { + if(!rules[s->rule->accept]){ + State *n = new State; + (void) new Rule(n, s->rule); + rules[s->rule->accept] = n; + addState(&s->next, n); + } + ow = rules[s->rule->accept]; + } + for(i = 0; i < s->go.nSpans; ++i) + if(!s->go.span[i].to){ + if(!ow){ + ow = accept = new State; + (void) new Accept(accept, nRules, saves, rules); + addState(&s->next, accept); + } + s->go.span[i].to = ow; + } + } + + // split ``base'' states into two parts + for(s = head; s; s = s->next){ + s->isBase = false; + if(s->link){ + for(i = 0; i < s->go.nSpans; ++i){ + if(s->go.span[i].to == s){ + s->isBase = true; + split(s); + if(bFlag) + BitMap::find(&s->next->go, s); + s = s->next; + break; + } + } + } + } + + // find ``base'' state, if possible + 
Span *span = new Span[ubChar - lbChar]; + for(s = head; s; s = s->next){ + if(!s->link){ + for(i = 0; i < s->go.nSpans; ++i){ + State *to = s->go.span[i].to; + if(to && to->isBase){ + to = to->go.span[0].to; + uint nSpans = merge(span, s, to); + if(nSpans < s->go.nSpans){ + delete [] s->go.span; + s->go.nSpans = nSpans; + s->go.span = new Span[nSpans]; + memcpy(s->go.span, span, nSpans*sizeof(Span)); + } + break; + } + } + } + } + delete [] span; + + delete head->action; + + o << "{\n\tYYCTYPE yych;\n\tunsigned int yyaccept;\n"; + + if(bFlag) + BitMap::gen(o, lbChar, ubChar); + + o << "\tgoto yy" << label << ";\n"; + (void) new Enter(head, label++); + + for(s = head; s; s = s->next) + s->label = label++; + + for(s = head; s; s = s->next){ + s->emit(o); + s->go.genGoto(o, s->next); + } + o << "}\n"; + + BitMap::first = NULL; + + delete [] saves; + delete [] rules; +} diff --git a/tools/re2c/dfa.cc b/tools/re2c/dfa.cc new file mode 100644 index 00000000..c1ff0545 --- /dev/null +++ b/tools/re2c/dfa.cc @@ -0,0 +1,222 @@ +#include +#include +#include +#include "globals.h" +#include "substr.h" +#include "dfa.h" + +inline char octCh(uint c){ + return '0' + c%8; +} + +void prtCh(ostream &o, uchar c){ + uchar oc = talx[c]; + switch(oc){ + case '\'': o << "\\'"; break; + case '\n': o << "\\n"; break; + case '\t': o << "\\t"; break; + case '\v': o << "\\v"; break; + case '\b': o << "\\b"; break; + case '\r': o << "\\r"; break; + case '\f': o << "\\f"; break; + case '\a': o << "\\a"; break; + case '\\': o << "\\\\"; break; + default: + if(isprint(oc)) + o << (char) oc; + else + o << '\\' << octCh(c/64) << octCh(c/8) << octCh(c); + } +} + +void printSpan(ostream &o, uint lb, uint ub){ + if(lb > ub) + o << "*"; + o << "["; + if((ub - lb) == 1){ + prtCh(o, lb); + } else { + prtCh(o, lb); + o << "-"; + prtCh(o, ub-1); + } + o << "]"; +} + +uint Span::show(ostream &o, uint lb){ + if(to){ + printSpan(o, lb, ub); + o << " " << to->label << "; "; + } + return ub; +} + +ostream& 
operator<<(ostream &o, const State &s){ + o << "state " << s.label; + if(s.rule) + o << " accepts " << s.rule->accept; + o << "\n"; + uint lb = 0; + for(uint i = 0; i < s.go.nSpans; ++i) + lb = s.go.span[i].show(o, lb); + return o; +} + +ostream& operator<<(ostream &o, const DFA &dfa){ + for(State *s = dfa.head; s; s = s->next) + o << s << "\n\n"; + return o; +} + +State::State() : rule(NULL), link(NULL), kCount(0), kernel(NULL), action(NULL) { + go.nSpans = 0; + go.span = NULL; +} + +State::~State(){ + delete [] kernel; + delete [] go.span; +} + +static Ins **closure(Ins **cP, Ins *i){ + while(!isMarked(i)){ + mark(i); + *(cP++) = i; + if(i->i.tag == FORK){ + cP = closure(cP, i + 1); + i = (Ins*) i->i.link; + } else if(i->i.tag == GOTO){ + i = (Ins*) i->i.link; + } else + break; + } + return cP; +} + +struct GoTo { + Char ch; + void *to; +}; + +DFA::DFA(Ins *ins, uint ni, uint lb, uint ub, Char *rep) + : lbChar(lb), ubChar(ub) { + Ins **work = new Ins*[ni+1]; + uint nc = ub - lb; + GoTo *goTo = new GoTo[nc]; + Span *span = new Span[nc]; + memset((char*) goTo, 0, nc*sizeof(GoTo)); + tail = &head; + head = NULL; + nStates = 0; + toDo = NULL; + findState(work, closure(work, &ins[0]) - work); + while(toDo){ + State *s = toDo; + toDo = s->link; + + Ins **cP, **iP, *i; + uint nGoTos = 0; + uint j; + + s->rule = NULL; + for(iP = s->kernel; (i = *iP); ++iP){ + if(i->i.tag == CHAR){ + for(Ins *j = i + 1; j < (Ins*) i->i.link; ++j){ + if(!(j->c.link = goTo[j->c.value - lb].to)) + goTo[nGoTos++].ch = j->c.value; + goTo[j->c.value - lb].to = j; + } + } else if(i->i.tag == TERM){ + if(!s->rule || ((RuleOp*) i->i.link)->accept < s->rule->accept) + s->rule = (RuleOp*) i->i.link; + } + } + + for(j = 0; j < nGoTos; ++j){ + GoTo *go = &goTo[goTo[j].ch - lb]; + i = (Ins*) go->to; + for(cP = work; i; i = (Ins*) i->c.link) + cP = closure(cP, i + i->c.bump); + go->to = findState(work, cP - work); + } + + s->go.nSpans = 0; + for(j = 0; j < nc;){ + State *to = (State*) goTo[rep[j]].to; + 
while(++j < nc && goTo[rep[j]].to == to); + span[s->go.nSpans].ub = lb + j; + span[s->go.nSpans].to = to; + s->go.nSpans++; + } + + for(j = nGoTos; j-- > 0;) + goTo[goTo[j].ch - lb].to = NULL; + + s->go.span = new Span[s->go.nSpans]; + memcpy((char*) s->go.span, (char*) span, s->go.nSpans*sizeof(Span)); + + (void) new Match(s); + + } + delete [] work; + delete [] goTo; + delete [] span; +} + +DFA::~DFA(){ + State *s; + while((s = head)){ + head = s->next; + delete s; + } +} + +void DFA::addState(State **a, State *s){ + s->label = nStates++; + s->next = *a; + *a = s; + if(a == tail) + tail = &s->next; +} + +State *DFA::findState(Ins **kernel, uint kCount){ + Ins **cP, **iP, *i; + State *s; + + kernel[kCount] = NULL; + + cP = kernel; + for(iP = kernel; (i = *iP); ++iP){ + if(i->i.tag == CHAR || i->i.tag == TERM){ + *cP++ = i; + } else { + unmark(i); + } + } + kCount = cP - kernel; + kernel[kCount] = NULL; + + for(s = head; s; s = s->next){ + if(s->kCount == kCount){ + for(iP = s->kernel; (i = *iP); ++iP) + if(!isMarked(i)) + goto nextState; + goto unmarkAll; + } + nextState:; + } + + s = new State; + addState(tail, s); + s->kCount = kCount; + s->kernel = new Ins*[kCount+1]; + memcpy(s->kernel, kernel, (kCount+1)*sizeof(Ins*)); + s->link = toDo; + toDo = s; + +unmarkAll: + for(iP = kernel; (i = *iP); ++iP) + unmark(i); + + return s; +} diff --git a/tools/re2c/dfa.h b/tools/re2c/dfa.h new file mode 100644 index 00000000..edd018c3 --- /dev/null +++ b/tools/re2c/dfa.h @@ -0,0 +1,149 @@ +#ifndef _dfa_h +#define _dfa_h + +#include +#include "re.h" + +extern void prtCh(ostream&, uchar); +extern void printSpan(ostream&, uint, uint); + +class DFA; +class State; + +class Action { +public: + State *state; +public: + Action(State*); + virtual void emit(ostream&) = 0; +}; + +class Match: public Action { +public: + Match(State*); + void emit(ostream&); +}; + +class Enter: public Action { +public: + uint label; +public: + Enter(State*, uint); + void emit(ostream&); +}; + +class 
Save: public Match { +public: + uint selector; +public: + Save(State*, uint); + void emit(ostream&); +}; + +class Move: public Action { +public: + Move(State*); + void emit(ostream&); +}; + +class Accept: public Action { +public: + uint nRules; + uint *saves; + State **rules; +public: + Accept(State*, uint, uint*, State**); + void emit(ostream&); +}; + +class Rule: public Action { +public: + RuleOp *rule; +public: + Rule(State*, RuleOp*); + void emit(ostream&); +}; + +class Span { +public: + uint ub; + State *to; +public: + uint show(ostream&, uint); +}; + +class Go { +public: + uint nSpans; + Span *span; +public: + void genGoto(ostream&, State*); + void genBase(ostream&, State*); + void genLinear(ostream&, State*); + void genBinary(ostream&, State*); + void genSwitch(ostream&, State*); + void compact(); + void unmap(Go*, State*); +}; + +class State { +public: + uint label; + RuleOp *rule; + State *next; + State *link; + uint depth; // for finding SCCs + uint kCount; + Ins **kernel; + bool isBase:1; + Go go; + Action *action; +public: + State(); + ~State(); + void emit(ostream&); + friend ostream& operator<<(ostream&, const State&); + friend ostream& operator<<(ostream&, const State*); +}; + +class DFA { +public: + uint lbChar; + uint ubChar; + uint nStates; + State *head, **tail; + State *toDo; +public: + DFA(Ins*, uint, uint, uint, Char*); + ~DFA(); + void addState(State**, State*); + State *findState(Ins**, uint); + void split(State*); + + void findSCCs(); + void emit(ostream&); + + friend ostream& operator<<(ostream&, const DFA&); + friend ostream& operator<<(ostream&, const DFA*); +}; + +inline Action::Action(State *s) : state(s) { + s->action = this; +} + +inline Match::Match(State *s) : Action(s) + { } + +inline Enter::Enter(State *s, uint l) : Action(s), label(l) + { } + +inline Save::Save(State *s, uint i) : Match(s), selector(i) + { } + +inline ostream& operator<<(ostream &o, const State *s) + { return o << *s; } + +inline ostream& operator<<(ostream &o, 
const DFA *dfa) + { return o << *dfa; } + +#endif diff --git a/tools/re2c/doc/loplas.ps.gz b/tools/re2c/doc/loplas.ps.gz new file mode 100644 index 00000000..d1a9191d Binary files /dev/null and b/tools/re2c/doc/loplas.ps.gz differ diff --git a/tools/re2c/doc/sample.bib b/tools/re2c/doc/sample.bib new file mode 100644 index 00000000..1f34ab13 --- /dev/null +++ b/tools/re2c/doc/sample.bib @@ -0,0 +1,48 @@ +@Article{Bumbulis94, + author = {Peter Bumbulis and Donald D. Cowan}, + title = {RE2C -- A More Versatile Scanner Generator}, + journal = "ACM Letters on Programming Languages and Systems", + volume = 2, + number = "1--4", + year = 1994, + abstract = { + It is usually claimed that lexical analysis routines are still coded by + hand, despite the widespread availability of scanner generators, for + efficiency reasons. While efficiency is a consideration, there exist + freely available scanner generators such as GLA \cite{Gray88} that can + generate scanners that are faster than most hand-coded ones. However, + most generated scanners are tailored for a particular environment, and + retargetting these scanners to other environments, if possible, is + usually complex enough to make a hand-coded scanner more appealing. In + this paper we describe RE2C, a scanner generator that not only generates + scanners which are faster (and usually smaller) than those produced by + any other scanner generator known to the authors, including GLA, but + also adapt easily to any environment. + } +} +@Article{Gray88, + author = {Robert W. Gray}, + title = {{$\gamma$-GLA} - {A} Generator for Lexical Analyzers That + Programmers Can Use}, + journal = {USENIX Conference Proceedings}, + year = {1988}, + month = {June}, + pages = {147-160}, + abstract = {Writing an efficient lexical analyzer for even a simple + language is not a trivial task, and should not be done by hand. We + describe GLA, a tool that generates very efficient scanners. 
These + scanners do not use the conventional transition matrix, but instead + use a few 128 element vectors. Scanning time is only slightly + greater than the absolute minimum --- the time it takes to look at + each character in a file. The GLA language allows simple, concise + specification of scanners. Augmenting regular expressions with + auxiliary scanners easily handles nasty problems such as C comments + and C literal constants. We formalize the connection between token + scanning and token processing by associating a processor with + appropriate patterns. A library of canned descriptions simplifies the + specification of commonly used language pieces --- such as, + C\_IDENTIFIERS, C\_STRINGS, PASCAL\_COMMENTS, etc. Finally, carefully + tuned lexical analysis support modules are provided for error + handling, input buffering, storing identifiers in hash tables and + manipulating denotations.} +} diff --git a/tools/re2c/examples/basemmap.c b/tools/re2c/examples/basemmap.c new file mode 100644 index 00000000..3e5b037a --- /dev/null +++ b/tools/re2c/examples/basemmap.c @@ -0,0 +1,26 @@ +#include +#include +#include +#include +#include + +#ifndef MAP_NORESERVE +#define MAP_NORESERVE 0 +#endif + +volatile char ch; + +main(){ + struct stat statbuf; + uchar *buf; + fstat(0, &statbuf); + buf = mmap(NULL, statbuf.st_size, PROT_READ, MAP_SHARED|MAP_NORESERVE, + 0, 0); + if(buf != (uchar*)(-1)){ + uchar *cur, *lim = &buf[statbuf.st_size]; + for(cur = buf; buf != lim; ++cur){ + ch = *cur; + } + munmap(buf, statbuf.st_size); + } +} diff --git a/tools/re2c/examples/c.re b/tools/re2c/examples/c.re new file mode 100644 index 00000000..419964fb --- /dev/null +++ b/tools/re2c/examples/c.re @@ -0,0 +1,272 @@ +#include +#include +#include + +#define ADDEQ 257 +#define ANDAND 258 +#define ANDEQ 259 +#define ARRAY 260 +#define ASM 261 +#define AUTO 262 +#define BREAK 263 +#define CASE 264 +#define CHAR 265 +#define CONST 266 +#define CONTINUE 267 +#define DECR 268 +#define DEFAULT 
269 +#define DEREF 270 +#define DIVEQ 271 +#define DO 272 +#define DOUBLE 273 +#define ELLIPSIS 274 +#define ELSE 275 +#define ENUM 276 +#define EQL 277 +#define EXTERN 278 +#define FCON 279 +#define FLOAT 280 +#define FOR 281 +#define FUNCTION 282 +#define GEQ 283 +#define GOTO 284 +#define ICON 285 +#define ID 286 +#define IF 287 +#define INCR 288 +#define INT 289 +#define LEQ 290 +#define LONG 291 +#define LSHIFT 292 +#define LSHIFTEQ 293 +#define MODEQ 294 +#define MULEQ 295 +#define NEQ 296 +#define OREQ 297 +#define OROR 298 +#define POINTER 299 +#define REGISTER 300 +#define RETURN 301 +#define RSHIFT 302 +#define RSHIFTEQ 303 +#define SCON 304 +#define SHORT 305 +#define SIGNED 306 +#define SIZEOF 307 +#define STATIC 308 +#define STRUCT 309 +#define SUBEQ 310 +#define SWITCH 311 +#define TYPEDEF 312 +#define UNION 313 +#define UNSIGNED 314 +#define VOID 315 +#define VOLATILE 316 +#define WHILE 317 +#define XOREQ 318 +#define EOI 319 + +typedef unsigned int uint; +typedef unsigned char uchar; + +#define BSIZE 8192 + +#define YYCTYPE uchar +#define YYCURSOR cursor +#define YYLIMIT s->lim +#define YYMARKER s->ptr +#define YYFILL(n) {cursor = fill(s, cursor);} + +#define RET(i) {s->cur = cursor; return i;} + +typedef struct Scanner { + int fd; + uchar *bot, *tok, *ptr, *cur, *pos, *lim, *top, *eof; + uint line; +} Scanner; + +uchar *fill(Scanner *s, uchar *cursor){ /* Refill the scanner buffer: slide the unread data down to bot, allocate a larger buffer if fewer than BSIZE bytes are free, then read up to BSIZE more bytes; returns the relocated cursor. */ + if(!s->eof){ + uint cnt = s->tok - s->bot; + if(cnt){ + memmove(s->bot, s->tok, s->lim - s->tok); /* source and destination can overlap, so memcpy is not safe here */ + s->tok = s->bot; + s->ptr -= cnt; + cursor -= cnt; + s->pos -= cnt; + s->lim -= cnt; + } + if((s->top - s->lim) < BSIZE){ + uchar *buf = (uchar*) malloc(((s->lim - s->bot) + BSIZE)*sizeof(uchar)); + memcpy(buf, s->tok, s->lim - s->tok); + s->tok = buf; + s->ptr = &buf[s->ptr - s->bot]; + cursor = &buf[cursor - s->bot]; + s->pos = &buf[s->pos - s->bot]; + s->lim = &buf[s->lim - s->bot]; + s->top = &s->lim[BSIZE]; + free(s->bot); + s->bot = buf; + } + if((cnt = read(s->fd, (char*) s->lim, BSIZE)) !=
BSIZE){ + if(cnt == (uint) -1) cnt = 0; /* read() failed: treat it as end of input */ s->eof = &s->lim[cnt]; *(s->eof)++ = '\n'; + } + s->lim += cnt; + } + return cursor; +} + +int scan(Scanner *s){ + uchar *cursor = s->cur; +std: + s->tok = cursor; +/*!re2c +any = [\000-\377]; +O = [0-7]; +D = [0-9]; +L = [a-zA-Z_]; +H = [a-fA-F0-9]; +E = [Ee] [+-]? D+; +FS = [fFlL]; +IS = [uUlL]*; +ESC = [\\] ([abfnrtv?'"\\] | "x" H+ | O+); +*/ + +/*!re2c + "/*" { goto comment; } + + "auto" { RET(AUTO); } + "break" { RET(BREAK); } + "case" { RET(CASE); } + "char" { RET(CHAR); } + "const" { RET(CONST); } + "continue" { RET(CONTINUE); } + "default" { RET(DEFAULT); } + "do" { RET(DO); } + "double" { RET(DOUBLE); } + "else" { RET(ELSE); } + "enum" { RET(ENUM); } + "extern" { RET(EXTERN); } + "float" { RET(FLOAT); } + "for" { RET(FOR); } + "goto" { RET(GOTO); } + "if" { RET(IF); } + "int" { RET(INT); } + "long" { RET(LONG); } + "register" { RET(REGISTER); } + "return" { RET(RETURN); } + "short" { RET(SHORT); } + "signed" { RET(SIGNED); } + "sizeof" { RET(SIZEOF); } + "static" { RET(STATIC); } + "struct" { RET(STRUCT); } + "switch" { RET(SWITCH); } + "typedef" { RET(TYPEDEF); } + "union" { RET(UNION); } + "unsigned" { RET(UNSIGNED); } + "void" { RET(VOID); } + "volatile" { RET(VOLATILE); } + "while" { RET(WHILE); } + + L (L|D)* { RET(ID); } + + ("0" [xX] H+ IS?) | ("0" D+ IS?) | (D+ IS?) | + (['] (ESC|any\[\n\\'])* [']) + { RET(ICON); } + + (D+ E FS?) | (D* "." D+ E? FS?) | (D+ "." D* E? FS?) + { RET(FCON); } + + (["] (ESC|any\[\n\\"])* ["]) + { RET(SCON); } + + "..."
{ RET(ELLIPSIS); } + ">>=" { RET(RSHIFTEQ); } + "<<=" { RET(LSHIFTEQ); } + "+=" { RET(ADDEQ); } + "-=" { RET(SUBEQ); } + "*=" { RET(MULEQ); } + "/=" { RET(DIVEQ); } + "%=" { RET(MODEQ); } + "&=" { RET(ANDEQ); } + "^=" { RET(XOREQ); } + "|=" { RET(OREQ); } + ">>" { RET(RSHIFT); } + "<<" { RET(LSHIFT); } + "++" { RET(INCR); } + "--" { RET(DECR); } + "->" { RET(DEREF); } + "&&" { RET(ANDAND); } + "||" { RET(OROR); } + "<=" { RET(LEQ); } + ">=" { RET(GEQ); } + "==" { RET(EQL); } + "!=" { RET(NEQ); } + ";" { RET(';'); } + "{" { RET('{'); } + "}" { RET('}'); } + "," { RET(','); } + ":" { RET(':'); } + "=" { RET('='); } + "(" { RET('('); } + ")" { RET(')'); } + "[" { RET('['); } + "]" { RET(']'); } + "." { RET('.'); } + "&" { RET('&'); } + "!" { RET('!'); } + "~" { RET('~'); } + "-" { RET('-'); } + "+" { RET('+'); } + "*" { RET('*'); } + "/" { RET('/'); } + "%" { RET('%'); } + "<" { RET('<'); } + ">" { RET('>'); } + "^" { RET('^'); } + "|" { RET('|'); } + "?" { RET('?'); } + + + [ \t\v\f]+ { goto std; } + + "\n" + { + if(cursor == s->eof) RET(EOI); + s->pos = cursor; s->line++; + goto std; + } + + any + { + printf("unexpected character: %c\n", *s->tok); + goto std; + } +*/ + +comment: +/*!re2c + "*/" { goto std; } + "\n" + { + if(cursor == s->eof) RET(EOI); + s->tok = s->pos = cursor; s->line++; + goto comment; + } + any { goto comment; } +*/ +} + +main(){ + Scanner in; + int t; + memset((char*) &in, 0, sizeof(in)); + in.fd = 0; + while((t = scan(&in)) != EOI){ +/* + printf("%d\t%.*s\n", t, in.cur - in.tok, in.tok); + printf("%d\n", t); +*/ + } + close(in.fd); +} diff --git a/tools/re2c/examples/cmmap.re b/tools/re2c/examples/cmmap.re new file mode 100644 index 00000000..bc4d498a --- /dev/null +++ b/tools/re2c/examples/cmmap.re @@ -0,0 +1,267 @@ +#include +#include +#include +#include +#include + +#define ADDEQ 257 +#define ANDAND 258 +#define ANDEQ 259 +#define ARRAY 260 +#define ASM 261 +#define AUTO 262 +#define BREAK 263 +#define CASE 264 +#define CHAR 265 +#define 
CONST 266 +#define CONTINUE 267 +#define DECR 268 +#define DEFAULT 269 +#define DEREF 270 +#define DIVEQ 271 +#define DO 272 +#define DOUBLE 273 +#define ELLIPSIS 274 +#define ELSE 275 +#define ENUM 276 +#define EQL 277 +#define EXTERN 278 +#define FCON 279 +#define FLOAT 280 +#define FOR 281 +#define FUNCTION 282 +#define GEQ 283 +#define GOTO 284 +#define ICON 285 +#define ID 286 +#define IF 287 +#define INCR 288 +#define INT 289 +#define LEQ 290 +#define LONG 291 +#define LSHIFT 292 +#define LSHIFTEQ 293 +#define MODEQ 294 +#define MULEQ 295 +#define NEQ 296 +#define OREQ 297 +#define OROR 298 +#define POINTER 299 +#define REGISTER 300 +#define RETURN 301 +#define RSHIFT 302 +#define RSHIFTEQ 303 +#define SCON 304 +#define SHORT 305 +#define SIGNED 306 +#define SIZEOF 307 +#define STATIC 308 +#define STRUCT 309 +#define SUBEQ 310 +#define SWITCH 311 +#define TYPEDEF 312 +#define UNION 313 +#define UNSIGNED 314 +#define VOID 315 +#define VOLATILE 316 +#define WHILE 317 +#define XOREQ 318 +#define EOI 319 + +typedef unsigned int unint; +typedef unsigned char uchar; + +#define YYCTYPE uchar +#define YYCURSOR cursor +#define YYLIMIT s->lim +#define YYMARKER s->ptr +#define YYFILL(n) {cursor = fill(s, cursor);} + +#define RET(i) {s->cur = cursor; return i;} + +typedef struct Scanner { + uchar *tok, *ptr, *cur, *pos, *lim, *eof; + unint line; +} Scanner; + +uchar *fill(Scanner *s, uchar *cursor){ + if(!s->eof){ + unint cnt = s->lim - s->tok; + uchar *buf = malloc((cnt + 1)*sizeof(uchar)); + memcpy(buf, s->tok, cnt); + cursor = &buf[cursor - s->tok]; + s->pos = &buf[s->pos - s->tok]; + s->ptr = &buf[s->ptr - s->tok]; + s->lim = &buf[cnt]; + s->eof = s->lim; *(s->eof)++ = '\n'; + s->tok = buf; + } + return cursor; +} + +int scan(Scanner *s){ + uchar *cursor = s->cur; +std: + s->tok = cursor; +/*!re2c +any = [\000-\377]; +O = [0-7]; +D = [0-9]; +L = [a-zA-Z_]; +H = [a-fA-F0-9]; +E = [Ee] [+-]? 
D+; +FS = [fFlL]; +IS = [uUlL]*; +ESC = [\\] ([abfnrtv?'"\\] | "x" H+ | O+); +*/ + +/*!re2c + "/*" { goto comment; } + + "auto" { RET(AUTO); } + "break" { RET(BREAK); } + "case" { RET(CASE); } + "char" { RET(CHAR); } + "const" { RET(CONST); } + "continue" { RET(CONTINUE); } + "default" { RET(DEFAULT); } + "do" { RET(DO); } + "double" { RET(DOUBLE); } + "else" { RET(ELSE); } + "enum" { RET(ENUM); } + "extern" { RET(EXTERN); } + "float" { RET(FLOAT); } + "for" { RET(FOR); } + "goto" { RET(GOTO); } + "if" { RET(IF); } + "int" { RET(INT); } + "long" { RET(LONG); } + "register" { RET(REGISTER); } + "return" { RET(RETURN); } + "short" { RET(SHORT); } + "signed" { RET(SIGNED); } + "sizeof" { RET(SIZEOF); } + "static" { RET(STATIC); } + "struct" { RET(STRUCT); } + "switch" { RET(SWITCH); } + "typedef" { RET(TYPEDEF); } + "union" { RET(UNION); } + "unsigned" { RET(UNSIGNED); } + "void" { RET(VOID); } + "volatile" { RET(VOLATILE); } + "while" { RET(WHILE); } + + L (L|D)* { RET(ID); } + + ("0" [xX] H+ IS?) | ("0" D+ IS?) | (D+ IS?) | + (['] (ESC|any\[\n\\'])* [']) + { RET(ICON); } + + (D+ E FS?) | (D* "." D+ E? FS?) | (D+ "." D* E? FS?) + { RET(FCON); } + + (["] (ESC|any\[\n\\"])* ["]) + { RET(SCON); } + + "..." { RET(ELLIPSIS); } + ">>=" { RET(RSHIFTEQ); } + "<<=" { RET(LSHIFTEQ); } + "+=" { RET(ADDEQ); } + "-=" { RET(SUBEQ); } + "*=" { RET(MULEQ); } + "/=" { RET(DIVEQ); } + "%=" { RET(MODEQ); } + "&=" { RET(ANDEQ); } + "^=" { RET(XOREQ); } + "|=" { RET(OREQ); } + ">>" { RET(RSHIFT); } + "<<" { RET(LSHIFT); } + "++" { RET(INCR); } + "--" { RET(DECR); } + "->" { RET(DEREF); } + "&&" { RET(ANDAND); } + "||" { RET(OROR); } + "<=" { RET(LEQ); } + ">=" { RET(GEQ); } + "==" { RET(EQL); } + "!=" { RET(NEQ); } + ";" { RET(';'); } + "{" { RET('{'); } + "}" { RET('}'); } + "," { RET(','); } + ":" { RET(':'); } + "=" { RET('='); } + "(" { RET('('); } + ")" { RET(')'); } + "[" { RET('['); } + "]" { RET(']'); } + "." { RET('.'); } + "&" { RET('&'); } + "!" 
{ RET('!'); } + "~" { RET('~'); } + "-" { RET('-'); } + "+" { RET('+'); } + "*" { RET('*'); } + "/" { RET('/'); } + "%" { RET('%'); } + "<" { RET('<'); } + ">" { RET('>'); } + "^" { RET('^'); } + "|" { RET('|'); } + "?" { RET('?'); } + + + [ \t\v\f]+ { goto std; } + + "\n" + { + if(cursor == s->eof) RET(EOI); + s->pos = cursor; s->line++; + goto std; + } + + any + { + printf("unexpected character: %c\n", *s->tok); + goto std; + } +*/ + +comment: +/*!re2c + "*/" { goto std; } + "\n" + { + if(cursor == s->eof) RET(EOI); + s->tok = s->pos = cursor; s->line++; + goto comment; + } + any { goto comment; } +*/ +} + +#ifndef MAP_NORESERVE +#define MAP_NORESERVE 0 +#endif + +main(){ + Scanner in; + struct stat statbuf; + uchar *buf; + fstat(0, &statbuf); + buf = mmap(NULL, statbuf.st_size, PROT_READ, MAP_SHARED|MAP_NORESERVE, + 0, 0); + if(buf != (uchar*)(-1)){ + int t; + in.lim = &(in.cur = buf)[statbuf.st_size]; + in.pos = NULL; + in.eof = NULL; + while((t = scan(&in)) != EOI){ +/* + printf("%d\t%.*s\n", t, in.cur - in.tok, in.tok); + printf("%d\n", t); +*/ + } + munmap(buf, statbuf.st_size); + } +} diff --git a/tools/re2c/examples/cnokw.re b/tools/re2c/examples/cnokw.re new file mode 100644 index 00000000..bdc12793 --- /dev/null +++ b/tools/re2c/examples/cnokw.re @@ -0,0 +1,239 @@ +#include +#include +#include + +#define ADDEQ 257 +#define ANDAND 258 +#define ANDEQ 259 +#define ARRAY 260 +#define ASM 261 +#define AUTO 262 +#define BREAK 263 +#define CASE 264 +#define CHAR 265 +#define CONST 266 +#define CONTINUE 267 +#define DECR 268 +#define DEFAULT 269 +#define DEREF 270 +#define DIVEQ 271 +#define DO 272 +#define DOUBLE 273 +#define ELLIPSIS 274 +#define ELSE 275 +#define ENUM 276 +#define EQL 277 +#define EXTERN 278 +#define FCON 279 +#define FLOAT 280 +#define FOR 281 +#define FUNCTION 282 +#define GEQ 283 +#define GOTO 284 +#define ICON 285 +#define ID 286 +#define IF 287 +#define INCR 288 +#define INT 289 +#define LEQ 290 +#define LONG 291 +#define LSHIFT 292 
+#define LSHIFTEQ 293 +#define MODEQ 294 +#define MULEQ 295 +#define NEQ 296 +#define OREQ 297 +#define OROR 298 +#define POINTER 299 +#define REGISTER 300 +#define RETURN 301 +#define RSHIFT 302 +#define RSHIFTEQ 303 +#define SCON 304 +#define SHORT 305 +#define SIGNED 306 +#define SIZEOF 307 +#define STATIC 308 +#define STRUCT 309 +#define SUBEQ 310 +#define SWITCH 311 +#define TYPEDEF 312 +#define UNION 313 +#define UNSIGNED 314 +#define VOID 315 +#define VOLATILE 316 +#define WHILE 317 +#define XOREQ 318 +#define EOI 319 + +typedef unsigned int uint; +typedef unsigned char uchar; + +#define BSIZE 8192 + +#define YYCTYPE uchar +#define YYCURSOR cursor +#define YYLIMIT s->lim +#define YYMARKER s->ptr +#define YYFILL(n) {cursor = fill(s, cursor);} + +#define RET(i) {s->cur = cursor; return i;} + +typedef struct Scanner { + int fd; + uchar *bot, *tok, *ptr, *cur, *pos, *lim, *top, *eof; + uint line; +} Scanner; + +uchar *fill(Scanner *s, uchar *cursor){ /* Refill the scanner buffer: slide the unread data down to bot, allocate a larger buffer if fewer than BSIZE bytes are free, then read up to BSIZE more bytes; returns the relocated cursor. */ + if(!s->eof){ + uint cnt = s->tok - s->bot; + if(cnt){ + memmove(s->bot, s->tok, s->lim - s->tok); /* source and destination can overlap, so memcpy is not safe here */ + s->tok = s->bot; + s->ptr -= cnt; + cursor -= cnt; + s->pos -= cnt; + s->lim -= cnt; + } + if((s->top - s->lim) < BSIZE){ + uchar *buf = (uchar*) malloc(((s->lim - s->bot) + BSIZE)*sizeof(uchar)); + memcpy(buf, s->tok, s->lim - s->tok); + s->tok = buf; + s->ptr = &buf[s->ptr - s->bot]; + cursor = &buf[cursor - s->bot]; + s->pos = &buf[s->pos - s->bot]; + s->lim = &buf[s->lim - s->bot]; + s->top = &s->lim[BSIZE]; + free(s->bot); + s->bot = buf; + } + if((cnt = read(s->fd, (char*) s->lim, BSIZE)) != BSIZE){ + if(cnt == (uint) -1) cnt = 0; /* read() failed: treat it as end of input */ s->eof = &s->lim[cnt]; *(s->eof)++ = '\n'; + } + s->lim += cnt; + } + return cursor; +} + +int scan(Scanner *s){ + uchar *cursor = s->cur; +std: + s->tok = cursor; +/*!re2c +any = [\000-\377]; +O = [0-7]; +D = [0-9]; +L = [a-zA-Z_]; +H = [a-fA-F0-9]; +E = [Ee] [+-]?
D+; +FS = [fFlL]; +IS = [uUlL]*; +ESC = [\\] ([abfnrtv?'"\\] | "x" H+ | O+); +*/ + +/*!re2c + "/*" { goto comment; } + + L (L|D)* { RET(ID); } + + ("0" [xX] H+ IS?) | ("0" D+ IS?) | (D+ IS?) | + (['] (ESC|any\[\n\\'])* [']) + { RET(ICON); } + + (D+ E FS?) | (D* "." D+ E? FS?) | (D+ "." D* E? FS?) + { RET(FCON); } + + (["] (ESC|any\[\n\\"])* ["]) + { RET(SCON); } + + "..." { RET(ELLIPSIS); } + ">>=" { RET(RSHIFTEQ); } + "<<=" { RET(LSHIFTEQ); } + "+=" { RET(ADDEQ); } + "-=" { RET(SUBEQ); } + "*=" { RET(MULEQ); } + "/=" { RET(DIVEQ); } + "%=" { RET(MODEQ); } + "&=" { RET(ANDEQ); } + "^=" { RET(XOREQ); } + "|=" { RET(OREQ); } + ">>" { RET(RSHIFT); } + "<<" { RET(LSHIFT); } + "++" { RET(INCR); } + "--" { RET(DECR); } + "->" { RET(DEREF); } + "&&" { RET(ANDAND); } + "||" { RET(OROR); } + "<=" { RET(LEQ); } + ">=" { RET(GEQ); } + "==" { RET(EQL); } + "!=" { RET(NEQ); } + ";" { RET(';'); } + "{" { RET('{'); } + "}" { RET('}'); } + "," { RET(','); } + ":" { RET(':'); } + "=" { RET('='); } + "(" { RET('('); } + ")" { RET(')'); } + "[" { RET('['); } + "]" { RET(']'); } + "." { RET('.'); } + "&" { RET('&'); } + "!" { RET('!'); } + "~" { RET('~'); } + "-" { RET('-'); } + "+" { RET('+'); } + "*" { RET('*'); } + "/" { RET('/'); } + "%" { RET('%'); } + "<" { RET('<'); } + ">" { RET('>'); } + "^" { RET('^'); } + "|" { RET('|'); } + "?" 
{ RET('?'); } + + + [ \t\v\f]+ { goto std; } + + "\n" + { + if(cursor == s->eof) RET(EOI); + s->pos = cursor; s->line++; + goto std; + } + + any + { + printf("unexpected character: %c\n", *s->tok); + goto std; + } +*/ + +comment: +/*!re2c + "*/" { goto std; } + "\n" + { + if(cursor == s->eof) RET(EOI); + s->tok = s->pos = cursor; s->line++; + goto comment; + } + any { goto comment; } +*/ +} + +main(){ + Scanner in; + int t; + memset((char*) &in, 0, sizeof(in)); + in.fd = 0; + while((t = scan(&in)) != EOI){ +/* + printf("%d\t%.*s\n", t, in.cur - in.tok, in.tok); + printf("%d\n", t); +*/ + } + close(in.fd); +} diff --git a/tools/re2c/examples/cunroll.re b/tools/re2c/examples/cunroll.re new file mode 100644 index 00000000..dd9d8054 --- /dev/null +++ b/tools/re2c/examples/cunroll.re @@ -0,0 +1,258 @@ +#include +#include +#include + +#define ADDEQ 257 +#define ANDAND 258 +#define ANDEQ 259 +#define ARRAY 260 +#define ASM 261 +#define AUTO 262 +#define BREAK 263 +#define CASE 264 +#define CHAR 265 +#define CONST 266 +#define CONTINUE 267 +#define DECR 268 +#define DEFAULT 269 +#define DEREF 270 +#define DIVEQ 271 +#define DO 272 +#define DOUBLE 273 +#define ELLIPSIS 274 +#define ELSE 275 +#define ENUM 276 +#define EQL 277 +#define EXTERN 278 +#define FCON 279 +#define FLOAT 280 +#define FOR 281 +#define FUNCTION 282 +#define GEQ 283 +#define GOTO 284 +#define ICON 285 +#define ID 286 +#define IF 287 +#define INCR 288 +#define INT 289 +#define LEQ 290 +#define LONG 291 +#define LSHIFT 292 +#define LSHIFTEQ 293 +#define MODEQ 294 +#define MULEQ 295 +#define NEQ 296 +#define OREQ 297 +#define OROR 298 +#define POINTER 299 +#define REGISTER 300 +#define RETURN 301 +#define RSHIFT 302 +#define RSHIFTEQ 303 +#define SCON 304 +#define SHORT 305 +#define SIGNED 306 +#define SIZEOF 307 +#define STATIC 308 +#define STRUCT 309 +#define SUBEQ 310 +#define SWITCH 311 +#define TYPEDEF 312 +#define UNION 313 +#define UNSIGNED 314 +#define VOID 315 +#define VOLATILE 316 +#define WHILE 
317 +#define XOREQ 318 +#define EOI 319 + +typedef unsigned int uint; +typedef unsigned char uchar; + +#define BSIZE 8192 + +#define YYCTYPE uchar +#define YYCURSOR cursor +#define YYLIMIT s->lim +#define YYMARKER s->ptr +#define YYFILL(n) {cursor = fill(s, cursor);} + +#define RET(i) {s->cur = cursor; return i;} + +typedef struct Scanner { + int fd; + uchar *bot, *tok, *ptr, *cur, *pos, *lim, *top, *eof; + uint line; +} Scanner; + +uchar *fill(Scanner *s, uchar *cursor){ /* Refill the scanner buffer: slide the unread data down to bot, allocate a larger buffer if fewer than BSIZE bytes are free, then read up to BSIZE more bytes; returns the relocated cursor. */ + if(!s->eof){ + uint cnt = s->tok - s->bot; + if(cnt){ + memmove(s->bot, s->tok, s->lim - s->tok); /* source and destination can overlap, so memcpy is not safe here */ + s->tok = s->bot; + s->ptr -= cnt; + cursor -= cnt; + s->pos -= cnt; + s->lim -= cnt; + } + if((s->top - s->lim) < BSIZE){ + uchar *buf = (uchar*) malloc(((s->lim - s->bot) + BSIZE)*sizeof(uchar)); + memcpy(buf, s->tok, s->lim - s->tok); + s->tok = buf; + s->ptr = &buf[s->ptr - s->bot]; + cursor = &buf[cursor - s->bot]; + s->pos = &buf[s->pos - s->bot]; + s->lim = &buf[s->lim - s->bot]; + s->top = &s->lim[BSIZE]; + free(s->bot); + s->bot = buf; + } + if((cnt = read(s->fd, (char*) s->lim, BSIZE)) != BSIZE){ + if(cnt == (uint) -1) cnt = 0; /* read() failed: treat it as end of input */ s->eof = &s->lim[cnt]; *(s->eof)++ = '\n'; + } + s->lim += cnt; + } + return cursor; +} + +int scan(Scanner *s){ + uchar *cursor = s->cur; +std: + s->tok = cursor; +/*!re2c +any = [\000-\377]; +O = [0-7]; +D = [0-9]; +L = [a-zA-Z_]; +I = L|D; +H = [a-fA-F0-9]; +E = [Ee] [+-]? D+; +FS = [fFlL]; +IS = [uUlL]*; +ESC = [\\] ([abfnrtv?'"\\] | "x" H+ | O+); +X = any\[*/]; +*/ + +/*!re2c + "/*" { goto comment; } + + + L { RET(ID); } + L I { RET(ID); } + L I I { RET(ID); } + L I I I { RET(ID); } + L I I I I { RET(ID); } + L I I I I I { RET(ID); } + L I I I I I I { RET(ID); } + L I I I I I I I { RET(ID); } + L I* { RET(ID); } + + ("0" [xX] H+ IS?) | ("0" D+ IS?) | (D+ IS?) | + (['] (ESC|any\[\n\\'])* [']) + { RET(ICON); } + + (D+ E FS?) | (D* "." D+ E? FS?) | (D+ "." D* E? FS?) + { RET(FCON); } + + (["] (ESC|any\[\n\\"])* ["]) + { RET(SCON); } + + "..."
{ RET(ELLIPSIS); } + ">>=" { RET(RSHIFTEQ); } + "<<=" { RET(LSHIFTEQ); } + "+=" { RET(ADDEQ); } + "-=" { RET(SUBEQ); } + "*=" { RET(MULEQ); } + "/=" { RET(DIVEQ); } + "%=" { RET(MODEQ); } + "&=" { RET(ANDEQ); } + "^=" { RET(XOREQ); } + "|=" { RET(OREQ); } + ">>" { RET(RSHIFT); } + "<<" { RET(LSHIFT); } + "++" { RET(INCR); } + "--" { RET(DECR); } + "->" { RET(DEREF); } + "&&" { RET(ANDAND); } + "||" { RET(OROR); } + "<=" { RET(LEQ); } + ">=" { RET(GEQ); } + "==" { RET(EQL); } + "!=" { RET(NEQ); } + ";" { RET(';'); } + "{" { RET('{'); } + "}" { RET('}'); } + "," { RET(','); } + ":" { RET(':'); } + "=" { RET('='); } + "(" { RET('('); } + ")" { RET(')'); } + "[" { RET('['); } + "]" { RET(']'); } + "." { RET('.'); } + "&" { RET('&'); } + "!" { RET('!'); } + "~" { RET('~'); } + "-" { RET('-'); } + "+" { RET('+'); } + "*" { RET('*'); } + "/" { RET('/'); } + "%" { RET('%'); } + "<" { RET('<'); } + ">" { RET('>'); } + "^" { RET('^'); } + "|" { RET('|'); } + "?" { RET('?'); } + + + [ \t\v\f]+ { goto std; } + + "\n" + { + if(cursor == s->eof) RET(EOI); + s->pos = cursor; s->line++; + goto std; + } + + any + { + printf("unexpected character: %c\n", *s->tok); + goto std; + } +*/ + +comment: +/*!re2c + "*/" { goto std; } + "\n" + { + if(cursor == s->eof) RET(EOI); + s->tok = s->pos = cursor; s->line++; + goto comment; + } + X { goto comment; } + X X { goto comment; } + X X X { goto comment; } + X X X X { goto comment; } + X X X X X { goto comment; } + X X X X X X { goto comment; } + X X X X X X X { goto comment; } + X X X X X X X X { goto comment; } + any { goto comment; } +*/ +} + +main(){ + Scanner in; + int t; + memset((char*) &in, 0, sizeof(in)); + in.fd = 0; + while((t = scan(&in)) != EOI){ +/* + printf("%d\t%.*s\n", t, in.cur - in.tok, in.tok); + printf("%d\n", t); +*/ + } + close(in.fd); +} diff --git a/tools/re2c/examples/modula.re b/tools/re2c/examples/modula.re new file mode 100644 index 00000000..0468ba4e --- /dev/null +++ b/tools/re2c/examples/modula.re @@ -0,0 
+1,202 @@ +#include +#include +#include + +typedef unsigned int uint; +typedef unsigned char uchar; + +#define BSIZE 8192 + +#define YYCTYPE uchar +#define YYCURSOR cursor +#define YYLIMIT s->lim +#define YYMARKER s->ptr +#define YYFILL {cursor = fill(s, cursor);} + +#define RETURN(i) {s->cur = cursor; return i;} + +typedef struct Scanner { + int fd; + uchar *bot, *tok, *ptr, *cur, *pos, *lim, *top, *eof; + uint line; +} Scanner; + +uchar *fill(Scanner *s, uchar *cursor){ /* Refill the scanner buffer: slide the unread data down to bot, allocate a larger buffer if fewer than BSIZE bytes are free, then read up to BSIZE more bytes; returns the relocated cursor. */ + if(!s->eof){ + uint cnt = s->tok - s->bot; + if(cnt){ + memmove(s->bot, s->tok, s->lim - s->tok); /* source and destination can overlap, so memcpy is not safe here */ + s->tok = s->bot; + s->ptr -= cnt; + cursor -= cnt; + s->pos -= cnt; + s->lim -= cnt; + } + if((s->top - s->lim) < BSIZE){ + uchar *buf = (uchar*) malloc(((s->lim - s->bot) + BSIZE)*sizeof(uchar)); + memcpy(buf, s->tok, s->lim - s->tok); + s->tok = buf; + s->ptr = &buf[s->ptr - s->bot]; + cursor = &buf[cursor - s->bot]; + s->pos = &buf[s->pos - s->bot]; + s->lim = &buf[s->lim - s->bot]; + s->top = &s->lim[BSIZE]; + free(s->bot); + s->bot = buf; + } + if((cnt = read(s->fd, (char*) s->lim, BSIZE)) != BSIZE){ + if(cnt == (uint) -1) cnt = 0; /* read() failed: treat it as end of input */ s->eof = &s->lim[cnt]; *(s->eof)++ = '\n'; + } + s->lim += cnt; + } + return cursor; +} + +int scan(Scanner *s){ + uchar *cursor = s->cur; + uint depth; +std: + s->tok = cursor; +/*!re2c +any = [\000-\377]; +digit = [0-9]; +letter = [a-zA-Z]; +*/ + +/*!re2c + "(*" { depth = 1; goto comment; } + + digit + {RETURN(1);} + digit + / ".." {RETURN(1);} + [0-7] + "B" {RETURN(2);} + [0-7] + "C" {RETURN(3);} + digit [0-9A-F] * "H" {RETURN(4);} + digit + "." digit * ("E" ([+-]) ? digit +) ? {RETURN(5);} + ['] (any\[\n']) * ['] | ["] (any\[\n"]) * ["] {RETURN(6);} + + "#" {RETURN(7);} + "&" {RETURN(8);} + "(" {RETURN(9);} + ")" {RETURN(10);} + "*" {RETURN(11);} + "+" {RETURN(12);} + "," {RETURN(13);} + "-" {RETURN(14);} + "." {RETURN(15);} + ".."
{RETURN(16);} + "/" {RETURN(17);} + ":" {RETURN(18);} + ":=" {RETURN(19);} + ";" {RETURN(20);} + "<" {RETURN(21);} + "<=" {RETURN(22);} + "<>" {RETURN(23);} + "=" {RETURN(24);} + ">" {RETURN(25);} + ">=" {RETURN(26);} + "[" {RETURN(27);} + "]" {RETURN(28);} + "^" {RETURN(29);} + "{" {RETURN(30);} + "|" {RETURN(31);} + "}" {RETURN(32);} + "~" {RETURN(33);} + + "AND" {RETURN(34);} + "ARRAY" {RETURN(35);} + "BEGIN" {RETURN(36);} + "BY" {RETURN(37);} + "CASE" {RETURN(38);} + "CONST" {RETURN(39);} + "DEFINITION" {RETURN(40);} + "DIV" {RETURN(41);} + "DO" {RETURN(42);} + "ELSE" {RETURN(43);} + "ELSIF" {RETURN(44);} + "END" {RETURN(45);} + "EXIT" {RETURN(46);} + "EXPORT" {RETURN(47);} + "FOR" {RETURN(48);} + "FROM" {RETURN(49);} + "IF" {RETURN(50);} + "IMPLEMENTATION" {RETURN(51);} + "IMPORT" {RETURN(52);} + "IN" {RETURN(53);} + "LOOP" {RETURN(54);} + "MOD" {RETURN(55);} + "MODULE" {RETURN(56);} + "NOT" {RETURN(57);} + "OF" {RETURN(58);} + "OR" {RETURN(59);} + "POINTER" {RETURN(60);} + "PROCEDURE" {RETURN(61);} + "QUALIFIED" {RETURN(62);} + "RECORD" {RETURN(63);} + "REPEAT" {RETURN(64);} + "RETURN" {RETURN(65);} + "SET" {RETURN(66);} + "THEN" {RETURN(67);} + "TO" {RETURN(68);} + "TYPE" {RETURN(69);} + "UNTIL" {RETURN(70);} + "VAR" {RETURN(71);} + "WHILE" {RETURN(72);} + "WITH" {RETURN(73);} + + letter (letter | digit) * {RETURN(74);} + + [ \t]+ { goto std; } + + "\n" + { + if(cursor == s->eof) RETURN(0); + s->pos = cursor; s->line++; + goto std; + } + + any + { + printf("unexpected character: %c\n", *s->tok); + goto std; + } +*/ +comment: +/*!re2c + "*)" + { + if(--depth == 0) + goto std; + else + goto comment; + } + "(*" { ++depth; goto comment; } + "\n" + { + if(cursor == s->eof) RETURN(0); + s->tok = s->pos = cursor; s->line++; + goto comment; + } + any { goto comment; } +*/ +} + +/* +void putStr(FILE *o, char *s, uint l){ + while(l-- > 0) + putc(*s++, o); +} +*/ + +main(){ + Scanner in; + memset((char*) &in, 0, sizeof(in)); + in.fd = 0; + while(scan(&in)){ +/* + 
putc('<', stdout); + putStr(stdout, (char*) in.tok, in.cur - in.tok); + putc('>', stdout); + putc('\n', stdout); +*/ + } +} diff --git a/tools/re2c/examples/rexx/README b/tools/re2c/examples/rexx/README new file mode 100644 index 00000000..2af0178d --- /dev/null +++ b/tools/re2c/examples/rexx/README @@ -0,0 +1 @@ +Replacement modules for an existing REXX interpreter. Not standalone. diff --git a/tools/re2c/examples/rexx/rexx.l b/tools/re2c/examples/rexx/rexx.l new file mode 100644 index 00000000..b74741da --- /dev/null +++ b/tools/re2c/examples/rexx/rexx.l @@ -0,0 +1,319 @@ +#include "scanio.h" +#include "scanner.h" + +#define CURSOR ch +#define LOADCURSOR ch = *cursor; +#define ADVANCE cursor++; +#define BACK(n) cursor -= (n); +#define CHECK(n) if((ScanCB.lim - cursor) < (n)){cursor = ScanFill(cursor);} +#define MARK(n) ScanCB.ptr = cursor; sel = (n); +#define REVERT cursor = ScanCB.ptr; +#define MARKER sel + +#define RETURN(i) {ScanCB.cur = cursor; return i;} + +int ScanToken(){ + uchar *cursor = ScanCB.cur; + unsigned sel; + uchar ch; + ScanCB.tok = cursor; + ScanCB.eot = NULL; +/*!re2c +all = [\000-\377]; +eof = [\000]; +any = all\eof; +letter = [a-z]|[A-Z]; +digit = [0-9]; +symchr = letter|digit|[.!?_]; +const = (digit|[.])symchr*([eE][+-]?digit+)?; +simple = (symchr\(digit|[.]))(symchr\[.])*; +stem = simple [.]; +symbol = symchr*; +sqstr = ['] ((any\['\n])|(['][']))* [']; +dqstr = ["] ((any\["\n])|(["]["]))* ["]; +str = sqstr|dqstr; +ob = [ \t]*; +not = [\\~]; +A = [aA]; +B = [bB]; +C = [cC]; +D = [dD]; +E = [eE]; +F = [fF]; +G = [gG]; +H = [hH]; +I = [iI]; +J = [jJ]; +K = [kK]; +L = [lL]; +M = [mM]; +N = [nN]; +O = [oO]; +P = [pP]; +Q = [qQ]; +R = [rR]; +S = [sS]; +T = [tT]; +U = [uU]; +V = [vV]; +W = [wW]; +X = [xX]; +Y = [yY]; +Z = [zZ]; +*/ + +scan: +/*!re2c +"\n" + { + ++(ScanCB.lineNum); + ScanCB.linePos = ScanCB.pos + (cursor - ScanCB.mrk); + RETURN(SU_EOL); + } +"|" ob "|" + { RETURN(OP_CONCAT); } +"+" + { RETURN(OP_PLUS); } +"-" + { RETURN(OP_MINUS); 
} +"*" + { RETURN(OP_MULT); } +"/" + { RETURN(OP_DIV); } +"%" + { RETURN(OP_IDIV); } +"/" ob "/" + { RETURN(OP_REMAIN); } +"*" ob "*" + { RETURN(OP_POWER); } +"=" + { RETURN(OP_EQUAL); } +not ob "=" | "<" ob ">" | ">" ob "<" + { RETURN(OP_EQUAL_N); } +">" + { RETURN(OP_GT); } +"<" + { RETURN(OP_LT); } +">" ob "=" | not ob "<" + { RETURN(OP_GE); } +"<" ob "=" | not ob ">" + { RETURN(OP_LE); } +"=" ob "=" + { RETURN(OP_EQUAL_EQ); } +not ob "=" ob "=" + { RETURN(OP_EQUAL_EQ_N); } +">" ob ">" + { RETURN(OP_GT_STRICT); } +"<" ob "<" + { RETURN(OP_LT_STRICT); } +">" ob ">" ob "=" | not ob "<" ob "<" + { RETURN(OP_GE_STRICT); } +"<" ob "<" ob "=" | not ob ">" ob ">" + { RETURN(OP_LE_STRICT); } +"&" + { RETURN(OP_AND); } +"|" + { RETURN(OP_OR); } +"&" ob "&" + { RETURN(OP_XOR); } +not + { RETURN(OP_NOT); } + +":" + { RETURN(SU_COLON); } +"," + { RETURN(SU_COMMA); } +"(" + { RETURN(SU_POPEN); } +")" + { RETURN(SU_PCLOSE); } +";" + { RETURN(SU_EOC); } + +A D D R E S S + { RETURN(RX_ADDRESS); } +A R G + { RETURN(RX_ARG); } +C A L L + { RETURN(RX_CALL); } +D O + { RETURN(RX_DO); } +D R O P + { RETURN(RX_DROP); } +E L S E + { RETURN(RX_ELSE); } +E N D + { RETURN(RX_END); } +E X I T + { RETURN(RX_EXIT); } +I F + { RETURN(RX_IF); } +I N T E R P R E T + { RETURN(RX_INTERPRET); } +I T E R A T E + { RETURN(RX_ITERATE); } +L E A V E + { RETURN(RX_LEAVE); } +N O P + { RETURN(RX_NOP); } +N U M E R I C + { RETURN(RX_NUMERIC); } +O P T I O N S + { RETURN(RX_OPTIONS); } +O T H E R W I S E + { RETURN(RX_OTHERWISE); } +P A R S E + { RETURN(RX_PARSE); } +P R O C E D U R E + { RETURN(RX_PROCEDURE); } +P U L L + { RETURN(RX_PULL); } +P U S H + { RETURN(RX_PUSH); } +Q U E U E + { RETURN(RX_QUEUE); } +R E T U R N + { RETURN(RX_RETURN); } +S A Y + { RETURN(RX_SAY); } +S E L E C T + { RETURN(RX_SELECT); } +S I G N A L + { RETURN(RX_SIGNAL); } +T H E N + { RETURN(RX_THEN); } +T R A C E + { RETURN(RX_TRACE); } +W H E N + { RETURN(RX_WHEN); } +O F F + { RETURN(RXS_OFF); } +O N + { RETURN(RXS_ON); } 
+B Y + { RETURN(RXS_BY); } +D I G I T S + { RETURN(RXS_DIGITS); } +E N G I N E E R I N G + { RETURN(RXS_ENGINEERING); } +E R R O R + { RETURN(RXS_ERROR); } +E X P O S E + { RETURN(RXS_EXPOSE); } +F A I L U R E + { RETURN(RXS_FAILURE); } +F O R + { RETURN(RXS_FOR); } +F O R E V E R + { RETURN(RXS_FOREVER); } +F O R M + { RETURN(RXS_FORM); } +F U Z Z + { RETURN(RXS_FUZZ); } +H A L T + { RETURN(RXS_HALT); } +L I N E I N + { RETURN(RXS_LINEIN); } +N A M E + { RETURN(RXS_NAME); } +N O T R E A D Y + { RETURN(RXS_NOTREADY); } +N O V A L U E + { RETURN(RXS_NOVALUE); } +S C I E N T I F I C + { RETURN(RXS_SCIENTIFIC); } +S O U R C E + { RETURN(RXS_SOURCE); } +S Y N T A X + { RETURN(RXS_SYNTAX); } +T O + { RETURN(RXS_TO); } +U N T I L + { RETURN(RXS_UNTIL); } +U P P E R + { RETURN(RXS_UPPER); } +V A L U E + { RETURN(RXS_VALUE); } +V A R + { RETURN(RXS_VAR); } +V E R S I O N + { RETURN(RXS_VERSION); } +W H I L E + { RETURN(RXS_WHILE); } +W I T H + { RETURN(RXS_WITH); } + +const + { RETURN(SU_CONST); } +simple + { RETURN(SU_SYMBOL); } +stem + { RETURN(SU_SYMBOL_STEM); } +symbol + { RETURN(SU_SYMBOL_COMPOUND); } +str + { RETURN(SU_LITERAL); } +str [bB] / (all\symchr) + { RETURN(SU_LITERAL_BIN); } +str [xX] / (all\symchr) + { RETURN(SU_LITERAL_HEX); } + +eof + { RETURN(SU_EOF); } +any + { RETURN(SU_ERROR); } +*/ +} + +bool StripToken(){ + uchar *cursor = ScanCB.cur; + unsigned depth; + uchar ch; + bool blanks = FALSE; + ScanCB.eot = cursor; +strip: +/*!re2c +"/*" + { + depth = 1; + goto comment; + } +"\r" + { goto strip; } +[ \t] + { + blanks = TRUE; + goto strip; + } +[] / all + { RETURN(blanks); } +*/ + +comment: +/*!re2c +"*/" + { + if(--depth == 0) + goto strip; + else + goto comment; + } +"\n" + { + ++(ScanCB.lineNum); + ScanCB.linePos = ScanCB.pos + (cursor - ScanCB.mrk); + goto comment; + } +"/*" + { + ++depth; + goto comment; + } +eof + { RETURN(blanks); } +any + { + goto comment; + } +*/ +} diff --git a/tools/re2c/examples/rexx/scanio.c 
b/tools/re2c/examples/rexx/scanio.c new file mode 100644 index 00000000..de6898df --- /dev/null +++ b/tools/re2c/examples/rexx/scanio.c @@ -0,0 +1,41 @@ +uchar *ScanFill(uchar *cursor){ + unsigned cnt = s->tok - s->bot; + s->pos += cursor - s->mrk; + if(cnt){ + if(s->eot){ + unsigned len = s->eot - s->tok; + memcpy(s->bot, s->tok, len); + s->eot = &s->bot[len]; + if((len = s->lim - cursor) != 0) + memcpy(s->eot, cursor, len); + cursor = s->eot; + s->lim = &cursor[len]; + } else { + memcpy(s->bot, s->tok, s->lim - s->tok); + cursor -= cnt; + s->lim -= cnt; + } + s->tok = s->bot; + s->ptr -= cnt; + } + if((s->top - s->lim) < 512){ + uchar *buf = (uchar*) malloc(((s->lim - s->bot) + 512)*sizeof(uchar)); + memcpy(buf, s->bot, s->lim - s->bot); + s->tok = buf; + s->ptr = &buf[s->ptr - s->bot]; + if(s->eot) + s->eot = &buf[s->eot - s->bot]; + cursor = &buf[cursor - s->bot]; + s->lim = &buf[s->lim - s->bot]; + s->top = &s->lim[512]; + free(s->bot); + s->bot = buf; + } + s->mrk = cursor; + if(ScanCBIO.file){ + if((cnt = read(ScanCBIO.u.f.fd, (char*) s->lim, 512)) != 512) + memset(&s->lim[cnt], 0, 512 - cnt); + s->lim += 512; + } + return cursor; +} diff --git a/tools/re2c/examples/sample.re b/tools/re2c/examples/sample.re new file mode 100644 index 00000000..2f497a3b --- /dev/null +++ b/tools/re2c/examples/sample.re @@ -0,0 +1,7 @@ +/*!re2c + "print" {return PRINT;} + [a-z]+ {return ID;} + [0-9]+ {return DEC;} + "0x" [0-9a-f]+ {return HEX;} + [\000-\377] {return ERR;} +*/ diff --git a/tools/re2c/examples/simple.re b/tools/re2c/examples/simple.re new file mode 100644 index 00000000..5fd8891f --- /dev/null +++ b/tools/re2c/examples/simple.re @@ -0,0 +1,13 @@ +#define NULL ((char*) 0) +char *scan(char *p){ +char *q; +#define YYCTYPE char +#define YYCURSOR p +#define YYLIMIT p +#define YYMARKER q +#define YYFILL(n) +/*!re2c + [0-9]+ {return YYCURSOR;} + [\000-\377] {return NULL;} +*/ +} diff --git a/tools/re2c/globals.h b/tools/re2c/globals.h new file mode 100644 index 
00000000..79edbff9 --- /dev/null +++ b/tools/re2c/globals.h @@ -0,0 +1,15 @@ +#ifndef _globals_h +#define _globals_h + +#include "basics.h" + +extern char *fileName; +extern bool sFlag; +extern bool bFlag; + +extern uchar asc2ebc[256]; +extern uchar ebc2asc[256]; + +extern uchar *xlat, *talx; + +#endif diff --git a/tools/re2c/ins.h b/tools/re2c/ins.h new file mode 100644 index 00000000..5d08cca2 --- /dev/null +++ b/tools/re2c/ins.h @@ -0,0 +1,41 @@ +#ifndef _ins_h +#define _ins_h + +#include +#include "basics.h" + +const uint nChars = 256; +typedef uchar Char; + +const uint CHAR = 0; +const uint GOTO = 1; +const uint FORK = 2; +const uint TERM = 3; +const uint CTXT = 4; + +union Ins { + struct { + byte tag; + byte marked; + void *link; + } i; + struct { + ushort value; + ushort bump; + void *link; + } c; +}; + +inline bool isMarked(Ins *i){ + return i->i.marked != 0; +} + +inline void mark(Ins *i){ + i->i.marked = true; +} + +inline void unmark(Ins *i){ + i->i.marked = false; +} + +#endif diff --git a/tools/re2c/main.cc b/tools/re2c/main.cc new file mode 100644 index 00000000..9e22c23e --- /dev/null +++ b/tools/re2c/main.cc @@ -0,0 +1,54 @@ +#include +#include +#include +#include + +#include "globals.h" +#include "parser.h" +#include "dfa.h" + +char *fileName; +bool sFlag = false; +bool bFlag = false; + +int main(unsigned argc, char *argv[]){ + fileName = NULL; + if(argc == 1) + goto usage; + while(--argc > 1){ + char *p = *++argv; + while(*++p != '\0'){ + switch(*p){ + case 'e': + xlat = asc2ebc; + talx = ebc2asc; + break; + case 's': + sFlag = true; + break; + case 'b': + sFlag = true; + bFlag = true; + break; + default: + goto usage; + } + } + } + fileName = *++argv; + int fd; + if(fileName[0] == '-' && fileName[1] == '\0'){ + fileName = ""; + fd = 0; + } else { + if((fd = open(fileName, O_RDONLY)) < 0){ + cerr << "can't open " << fileName << "\n"; + return 1; + } + } + parse(fd, cout); + return 0; +usage: + cerr << "usage: re2c [-esb] name\n"; + return 2; +} 
diff --git a/tools/re2c/parse.h b/tools/re2c/parse.h new file mode 100644 index 00000000..56178a80 --- /dev/null +++ b/tools/re2c/parse.h @@ -0,0 +1,20 @@ +#ifndef _parser_h +#define _parser_h + +#include "scanner.h" +#include "re.h" + +class Symbol { +public: + static Symbol *first; + Symbol *next; + Str name; + RegExp *re; +public: + Symbol(const SubStr&); + static Symbol *find(const SubStr&); +}; + +void parse(int, ostream&); + +#endif diff --git a/tools/re2c/parser.cc b/tools/re2c/parser.cc new file mode 100644 index 00000000..6d664005 --- /dev/null +++ b/tools/re2c/parser.cc @@ -0,0 +1,531 @@ +#ifndef lint +static char yysccsid[] = "@(#)yaccpar 1.9 (Berkeley) 02/21/93"; +#endif +#define YYBYACC 1 +#define YYMAJOR 1 +#define YYMINOR 9 +#define yyclearin (yychar=(-1)) +#define yyerrok (yyerrflag=0) +#define YYRECOVERING (yyerrflag!=0) +#define YYPREFIX "yy" +#line 2 "parser.y" + +#include +#include +#include +#include +#include "globals.h" +#include "parser.h" +int yyparse(); +int yylex(); +void yyerror(char*); + +static uint accept; +static RegExp *spec; +static Scanner *in; + +#line 21 "parser.y" +typedef union { + Symbol *symbol; + RegExp *regexp; + Token *token; + char op; +} YYSTYPE; +#line 35 "y.tab.c" +#define CLOSE 257 +#define ID 258 +#define CODE 259 +#define RANGE 260 +#define STRING 261 +#define YYERRCODE 256 +short yylhs[] = { -1, + 0, 0, 0, 9, 2, 3, 3, 4, 4, 5, + 5, 6, 6, 7, 7, 1, 1, 8, 8, 8, + 8, +}; +short yylen[] = { 2, + 0, 2, 2, 4, 3, 0, 2, 1, 3, 1, + 3, 1, 2, 1, 2, 1, 2, 1, 1, 1, + 3, +}; +short yydefred[] = { 1, + 0, 0, 19, 20, 0, 2, 0, 0, 0, 12, + 0, 3, 0, 18, 0, 0, 0, 0, 0, 13, + 16, 0, 0, 21, 0, 0, 5, 0, 17, 4, +}; +short yydgoto[] = { 1, + 22, 6, 18, 7, 8, 9, 10, 11, 12, +}; +short yysindex[] = { 0, + -27, -49, 0, 0, -23, 0, -44, -84, -23, 0, + -243, 0, -23, 0, -39, -23, -23, -244, -23, 0, + 0, -239, -53, 0, -104, -84, 0, -23, 0, 0, +}; +short yyrindex[] = { 0, + 0, -31, 0, 0, 0, 0, -227, -17, -20, 0, + -40, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 
0, -36, 0, 0, -226, -16, 0, -19, 0, 0, +}; +short yygindex[] = { 0, + 0, 0, 0, 21, 18, 17, 1, 0, 0, +}; +#define YYTABLESIZE 243 +short yytable[] = { 14, + 14, 24, 16, 15, 15, 30, 14, 19, 18, 20, + 15, 13, 5, 21, 27, 18, 5, 29, 14, 17, + 10, 11, 15, 8, 9, 15, 10, 11, 20, 8, + 9, 6, 7, 23, 26, 28, 25, 0, 10, 11, + 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, + 0, 14, 0, 0, 0, 15, 0, 0, 0, 0, + 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 17, 10, 11, 0, 0, 0, 0, 0, 0, 17, + 0, 0, 0, 14, 17, 0, 0, 15, 0, 0, + 0, 0, 18, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 10, 11, 0, 8, 9, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 14, 14, 14, + 14, 15, 15, 15, 15, 18, 18, 18, 18, 18, + 2, 0, 3, 4, 14, 0, 3, 4, 10, 11, + 0, 8, 9, +}; +short yycheck[] = { 40, + 41, 41, 47, 40, 41, 59, 47, 92, 40, 9, + 47, 61, 40, 257, 259, 47, 40, 257, 59, 124, + 41, 41, 59, 41, 41, 5, 47, 47, 28, 47, + 47, 259, 259, 13, 17, 19, 16, -1, 59, 59, + -1, 59, 59, -1, -1, -1, -1, -1, -1, -1, + -1, 92, -1, -1, -1, 92, -1, -1, -1, -1, + 92, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 124, 92, 92, -1, -1, -1, -1, -1, -1, 124, + -1, -1, -1, 124, 124, -1, -1, 124, -1, -1, + -1, -1, 124, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, 124, 124, -1, 124, 124, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, 258, 259, 260, + 261, 258, 259, 260, 261, 257, 258, 
259, 260, 261, + 258, -1, 260, 261, 258, -1, 260, 261, 259, 259, + -1, 259, 259, +}; +#define YYFINAL 1 +#ifndef YYDEBUG +#define YYDEBUG 0 +#endif +#define YYMAXTOKEN 261 +#if YYDEBUG +char *yyname[] = { +"end-of-file",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,"'('","')'",0,0,0,0,0,"'/'",0,0,0,0,0,0,0,0,0,0,0,"';'",0,"'='",0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"'\\\\'",0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"'|'",0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +"CLOSE","ID","CODE","RANGE","STRING", +}; +char *yyrule[] = { +"$accept : spec", +"spec :", +"spec : spec rule", +"spec : spec decl", +"decl : ID '=' expr ';'", +"rule : expr look CODE", +"look :", +"look : '/' expr", +"expr : diff", +"expr : expr '|' diff", +"diff : term", +"diff : diff '\\\\' term", +"term : factor", +"term : term factor", +"factor : primary", +"factor : primary close", +"close : CLOSE", +"close : close CLOSE", +"primary : ID", +"primary : RANGE", +"primary : STRING", +"primary : '(' expr ')'", +}; +#endif +#ifdef YYSTACKSIZE +#undef YYMAXDEPTH +#define YYMAXDEPTH YYSTACKSIZE +#else +#ifdef YYMAXDEPTH +#define YYSTACKSIZE YYMAXDEPTH +#else +#define YYSTACKSIZE 500 +#define YYMAXDEPTH 500 +#endif +#endif +int yydebug; +int yynerrs; +int yyerrflag; +int yychar; +short *yyssp; +YYSTYPE *yyvsp; +YYSTYPE yyval; +YYSTYPE yylval; +short yyss[YYSTACKSIZE]; +YYSTYPE yyvs[YYSTACKSIZE]; +#define yystacksize YYSTACKSIZE +#line 121 "parser.y" + +void yyerror(char* s){ + in->fatal(s); +} + +int yylex(){ + return in->scan(); +} + +void parse(int i, ostream &o){ + char * fnamebuf; + char * token; + + o << "/* Generated by re2c 0.5 on "; + time_t now = time(&now); + o.write(ctime(&now), 24); + o << " 
*/\n"; + + in = new Scanner(i); + + o << "#line " << in->line() << " \""; + if( fileName != NULL ) { + fnamebuf = strdup( fileName ); + } else { + fnamebuf = strdup( "" ); + } + token = strtok( fnamebuf, "\\" ); + for(;;) { + o << token; + token = strtok( NULL, "\\" ); + if( token == NULL ) break; + o << "\\\\"; + } + o << "\"\n"; + free( fnamebuf ); + + while(in->echo(o)){ + yyparse(); + if(spec) + genCode(o, spec); + o << "#line " << in->line() << "\n"; + } +} +#line 235 "y.tab.c" +#define YYABORT goto yyabort +#define YYREJECT goto yyabort +#define YYACCEPT goto yyaccept +#define YYERROR goto yyerrlab +int +yyparse() +{ + register int yym, yyn, yystate; +#if YYDEBUG + register char *yys; + extern char *getenv(); + + if (yys = getenv("YYDEBUG")) + { + yyn = *yys; + if (yyn >= '0' && yyn <= '9') + yydebug = yyn - '0'; + } +#endif + + yynerrs = 0; + yyerrflag = 0; + yychar = (-1); + + yyssp = yyss; + yyvsp = yyvs; + *yyssp = yystate = 0; + +yyloop: + if (yyn = yydefred[yystate]) goto yyreduce; + if (yychar < 0) + { + if ((yychar = yylex()) < 0) yychar = 0; +#if YYDEBUG + if (yydebug) + { + yys = 0; + if (yychar <= YYMAXTOKEN) yys = yyname[yychar]; + if (!yys) yys = "illegal-symbol"; + printf("%sdebug: state %d, reading %d (%s)\n", + YYPREFIX, yystate, yychar, yys); + } +#endif + } + if ((yyn = yysindex[yystate]) && (yyn += yychar) >= 0 && + yyn <= YYTABLESIZE && yycheck[yyn] == yychar) + { +#if YYDEBUG + if (yydebug) + printf("%sdebug: state %d, shifting to state %d\n", + YYPREFIX, yystate, yytable[yyn]); +#endif + if (yyssp >= yyss + yystacksize - 1) + { + goto yyoverflow; + } + *++yyssp = yystate = yytable[yyn]; + *++yyvsp = yylval; + yychar = (-1); + if (yyerrflag > 0) --yyerrflag; + goto yyloop; + } + if ((yyn = yyrindex[yystate]) && (yyn += yychar) >= 0 && + yyn <= YYTABLESIZE && yycheck[yyn] == yychar) + { + yyn = yytable[yyn]; + goto yyreduce; + } + if (yyerrflag) goto yyinrecovery; +#ifdef lint + goto yynewerror; +#endif +yynewerror: + yyerror("syntax 
error"); +#ifdef lint + goto yyerrlab; +#endif +yyerrlab: + ++yynerrs; +yyinrecovery: + if (yyerrflag < 3) + { + yyerrflag = 3; + for (;;) + { + if ((yyn = yysindex[*yyssp]) && (yyn += YYERRCODE) >= 0 && + yyn <= YYTABLESIZE && yycheck[yyn] == YYERRCODE) + { +#if YYDEBUG + if (yydebug) + printf("%sdebug: state %d, error recovery shifting\ + to state %d\n", YYPREFIX, *yyssp, yytable[yyn]); +#endif + if (yyssp >= yyss + yystacksize - 1) + { + goto yyoverflow; + } + *++yyssp = yystate = yytable[yyn]; + *++yyvsp = yylval; + goto yyloop; + } + else + { +#if YYDEBUG + if (yydebug) + printf("%sdebug: error recovery discarding state %d\n", + YYPREFIX, *yyssp); +#endif + if (yyssp <= yyss) goto yyabort; + --yyssp; + --yyvsp; + } + } + } + else + { + if (yychar == 0) goto yyabort; +#if YYDEBUG + if (yydebug) + { + yys = 0; + if (yychar <= YYMAXTOKEN) yys = yyname[yychar]; + if (!yys) yys = "illegal-symbol"; + printf("%sdebug: state %d, error recovery discards token %d (%s)\n", + YYPREFIX, yystate, yychar, yys); + } +#endif + yychar = (-1); + goto yyloop; + } +yyreduce: +#if YYDEBUG + if (yydebug) + printf("%sdebug: state %d, reducing by rule %d (%s)\n", + YYPREFIX, yystate, yyn, yyrule[yyn]); +#endif + yym = yylen[yyn]; + yyval = yyvsp[1-yym]; + switch (yyn) + { +case 1: +#line 40 "parser.y" +{ accept = 0; + spec = NULL; } +break; +case 2: +#line 43 "parser.y" +{ spec = spec? 
mkAlt(spec, yyvsp[0].regexp) : yyvsp[0].regexp; } +break; +case 4: +#line 48 "parser.y" +{ if(yyvsp[-3].symbol->re) + in->fatal("sym already defined"); + yyvsp[-3].symbol->re = yyvsp[-1].regexp; } +break; +case 5: +#line 54 "parser.y" +{ yyval.regexp = new RuleOp(yyvsp[-2].regexp, yyvsp[-1].regexp, yyvsp[0].token, accept++); } +break; +case 6: +#line 58 "parser.y" +{ yyval.regexp = new NullOp; } +break; +case 7: +#line 60 "parser.y" +{ yyval.regexp = yyvsp[0].regexp; } +break; +case 8: +#line 64 "parser.y" +{ yyval.regexp = yyvsp[0].regexp; } +break; +case 9: +#line 66 "parser.y" +{ yyval.regexp = mkAlt(yyvsp[-2].regexp, yyvsp[0].regexp); } +break; +case 10: +#line 70 "parser.y" +{ yyval.regexp = yyvsp[0].regexp; } +break; +case 11: +#line 72 "parser.y" +{ yyval.regexp = mkDiff(yyvsp[-2].regexp, yyvsp[0].regexp); + if(!yyval.regexp) + in->fatal("can only difference char sets"); + } +break; +case 12: +#line 79 "parser.y" +{ yyval.regexp = yyvsp[0].regexp; } +break; +case 13: +#line 81 "parser.y" +{ yyval.regexp = new CatOp(yyvsp[-1].regexp, yyvsp[0].regexp); } +break; +case 14: +#line 85 "parser.y" +{ yyval.regexp = yyvsp[0].regexp; } +break; +case 15: +#line 87 "parser.y" +{ + switch(yyvsp[0].op){ + case '*': + yyval.regexp = mkAlt(new CloseOp(yyvsp[-1].regexp), new NullOp()); + break; + case '+': + yyval.regexp = new CloseOp(yyvsp[-1].regexp); + break; + case '?': + yyval.regexp = mkAlt(yyvsp[-1].regexp, new NullOp()); + break; + } + } +break; +case 16: +#line 103 "parser.y" +{ yyval.op = yyvsp[0].op; } +break; +case 17: +#line 105 "parser.y" +{ yyval.op = (yyvsp[-1].op == yyvsp[0].op) ? 
yyvsp[-1].op : '*'; } +break; +case 18: +#line 109 "parser.y" +{ if(!yyvsp[0].symbol->re) + in->fatal("can't find symbol"); + yyval.regexp = yyvsp[0].symbol->re; } +break; +case 19: +#line 113 "parser.y" +{ yyval.regexp = yyvsp[0].regexp; } +break; +case 20: +#line 115 "parser.y" +{ yyval.regexp = yyvsp[0].regexp; } +break; +case 21: +#line 117 "parser.y" +{ yyval.regexp = yyvsp[-1].regexp; } +break; +#line 476 "y.tab.c" + } + yyssp -= yym; + yystate = *yyssp; + yyvsp -= yym; + yym = yylhs[yyn]; + if (yystate == 0 && yym == 0) + { +#if YYDEBUG + if (yydebug) + printf("%sdebug: after reduction, shifting from state 0 to\ + state %d\n", YYPREFIX, YYFINAL); +#endif + yystate = YYFINAL; + *++yyssp = YYFINAL; + *++yyvsp = yyval; + if (yychar < 0) + { + if ((yychar = yylex()) < 0) yychar = 0; +#if YYDEBUG + if (yydebug) + { + yys = 0; + if (yychar <= YYMAXTOKEN) yys = yyname[yychar]; + if (!yys) yys = "illegal-symbol"; + printf("%sdebug: state %d, reading %d (%s)\n", + YYPREFIX, YYFINAL, yychar, yys); + } +#endif + } + if (yychar == 0) goto yyaccept; + goto yyloop; + } + if ((yyn = yygindex[yym]) && (yyn += yystate) >= 0 && + yyn <= YYTABLESIZE && yycheck[yyn] == yystate) + yystate = yytable[yyn]; + else + yystate = yydgoto[yym]; +#if YYDEBUG + if (yydebug) + printf("%sdebug: after reduction, shifting from state %d \ +to state %d\n", YYPREFIX, *yyssp, yystate); +#endif + if (yyssp >= yyss + yystacksize - 1) + { + goto yyoverflow; + } + *++yyssp = yystate; + *++yyvsp = yyval; + goto yyloop; +yyoverflow: + yyerror("yacc stack overflow"); +yyabort: + return (1); +yyaccept: + return (0); +} diff --git a/tools/re2c/parser.h b/tools/re2c/parser.h new file mode 100644 index 00000000..56178a80 --- /dev/null +++ b/tools/re2c/parser.h @@ -0,0 +1,20 @@ +#ifndef _parser_h +#define _parser_h + +#include "scanner.h" +#include "re.h" + +class Symbol { +public: + static Symbol *first; + Symbol *next; + Str name; + RegExp *re; +public: + Symbol(const SubStr&); + static Symbol *find(const 
SubStr&); +}; + +void parse(int, ostream&); + +#endif diff --git a/tools/re2c/parser.y b/tools/re2c/parser.y new file mode 100644 index 00000000..8f2a7dce --- /dev/null +++ b/tools/re2c/parser.y @@ -0,0 +1,163 @@ +%{ + +#include +#include +#include +#include +#include "globals.h" +#include "parser.h" +int yyparse(); +int yylex(); +void yyerror(char*); + +static uint accept; +static RegExp *spec; +static Scanner *in; + +%} + +%start spec + +%union { + Symbol *symbol; + RegExp *regexp; + Token *token; + char op; +} + +%token CLOSE ID CODE RANGE STRING + +%type CLOSE +%type close +%type ID +%type CODE +%type RANGE STRING +%type rule look expr diff term factor primary + +%% + +spec : + { accept = 0; + spec = NULL; } + | spec rule + { spec = spec? mkAlt(spec, $2) : $2; } + | spec decl + ; + +decl : ID '=' expr ';' + { if($1->re) + in->fatal("sym already defined"); + $1->re = $3; } + ; + +rule : expr look CODE + { $$ = new RuleOp($1, $2, $3, accept++); } + ; + +look : + { $$ = new NullOp; } + | '/' expr + { $$ = $2; } + ; + +expr : diff + { $$ = $1; } + | expr '|' diff + { $$ = mkAlt($1, $3); } + ; + +diff : term + { $$ = $1; } + | diff '\\' term + { $$ = mkDiff($1, $3); + if(!$$) + in->fatal("can only difference char sets"); + } + ; + +term : factor + { $$ = $1; } + | term factor + { $$ = new CatOp($1, $2); } + ; + +factor : primary + { $$ = $1; } + | primary close + { + switch($2){ + case '*': + $$ = mkAlt(new CloseOp($1), new NullOp()); + break; + case '+': + $$ = new CloseOp($1); + break; + case '?': + $$ = mkAlt($1, new NullOp()); + break; + } + } + ; + +close : CLOSE + { $$ = $1; } + | close CLOSE + { $$ = ($1 == $2) ? 
$1 : '*'; } + ; + +primary : ID + { if(!$1->re) + in->fatal("can't find symbol"); + $$ = $1->re; } + | RANGE + { $$ = $1; } + | STRING + { $$ = $1; } + | '(' expr ')' + { $$ = $2; } + ; + +%% + +void yyerror(char* s){ + in->fatal(s); +} + +int yylex(){ + return in->scan(); +} +/* parse: translate the re2c source read from descriptor i, writing the generated code to o. */ +void parse(int i, ostream &o){ + char * fnamebuf; + char * token; + /* banner comment with the generation time; ctime() gives 24 characters before its newline */ + o << "/* Generated by re2c 0.5 on "; + time_t now = time(&now); + o.write(ctime(&now), 24); + o << " */\n"; + + in = new Scanner(i); + /* #line directive: the file name is split at backslashes and rejoined with escaped backslashes */ + o << "#line " << in->line() << " \""; + if( fileName != NULL ) { + fnamebuf = strdup( fileName ); + } else { + fnamebuf = strdup( "" ); + } + token = strtok( fnamebuf, "\\" ); + while( token != NULL ) { /* strtok returns NULL at once for an empty name; NULL must never reach operator<< */ + o << token; + token = strtok( NULL, "\\" ); + if( token == NULL ) break; + o << "\\\\"; + } + o << "\"\n"; + free( fnamebuf ); + /* for as long as echo() reports more input: parse one specification and emit its code */ + while(in->echo(o)){ + yyparse(); + if(spec) + genCode(o, spec); + o << "#line " << in->line() << "\n"; + } +} diff --git a/tools/re2c/re.h b/tools/re2c/re.h new file mode 100644 index 00000000..2ea6e63b --- /dev/null +++ b/tools/re2c/re.h @@ -0,0 +1,178 @@ +#ifndef _re_h +#define _re_h + +#include +#include "token.h" +#include "ins.h" + +struct CharPtn { + uint card; + CharPtn *fix; + CharPtn *nxt; +}; + +struct CharSet { + CharPtn *fix; + CharPtn *freeHead, **freeTail; + CharPtn *rep[nChars]; + CharPtn ptn[nChars]; +}; + +class Range { +public: + Range *next; + uint lb, ub; // [lb,ub) +public: + Range(uint l, uint u) : next(NULL), lb(l), ub(u) + { } + Range(Range &r) : next(NULL), lb(r.lb), ub(r.ub) + { } + friend ostream& operator<<(ostream&, const Range&); + friend ostream& operator<<(ostream&, const Range*); +}; + +inline ostream& operator<<(ostream &o, const Range *r){ + return r? o << *r : o; +} + +class RegExp { +public: + uint size; +public: + virtual char *typeOf() = 0; + RegExp *isA(char *t) + { return typeOf() == t?
this : NULL; } + virtual void split(CharSet&) = 0; + virtual void calcSize(Char*) = 0; + virtual uint fixedLength(); + virtual void compile(Char*, Ins*) = 0; + virtual void display(ostream&) const = 0; + friend ostream& operator<<(ostream&, const RegExp&); + friend ostream& operator<<(ostream&, const RegExp*); +}; + +inline ostream& operator<<(ostream &o, const RegExp &re){ + re.display(o); + return o; +} + +inline ostream& operator<<(ostream &o, const RegExp *re){ + return o << *re; +} + +class NullOp: public RegExp { +public: + static char *type; +public: + char *typeOf() + { return type; } + void split(CharSet&); + void calcSize(Char*); + uint fixedLength(); + void compile(Char*, Ins*); + void display(ostream &o) const { + o << "_"; + } +}; + +class MatchOp: public RegExp { +public: + static char *type; + Range *match; +public: + MatchOp(Range *m) : match(m) + { } + char *typeOf() + { return type; } + void split(CharSet&); + void calcSize(Char*); + uint fixedLength(); + void compile(Char*, Ins*); + void display(ostream&) const; +}; + +class RuleOp: public RegExp { +private: + RegExp *exp; +public: + RegExp *ctx; + static char *type; + Ins *ins; + uint accept; + Token *code; + uint line; +public: + RuleOp(RegExp*, RegExp*, Token*, uint); + char *typeOf() + { return type; } + void split(CharSet&); + void calcSize(Char*); + void compile(Char*, Ins*); + void display(ostream &o) const { + o << exp << "/" << ctx << ";"; + } +}; + +class AltOp: public RegExp { +private: + RegExp *exp1, *exp2; +public: + static char *type; +public: + AltOp(RegExp *e1, RegExp *e2) + { exp1 = e1; exp2 = e2; } + char *typeOf() + { return type; } + void split(CharSet&); + void calcSize(Char*); + uint fixedLength(); + void compile(Char*, Ins*); + void display(ostream &o) const { + o << exp1 << "|" << exp2; + } + friend RegExp *mkAlt(RegExp*, RegExp*); +}; + +class CatOp: public RegExp { +private: + RegExp *exp1, *exp2; +public: + static char *type; +public: + CatOp(RegExp *e1, RegExp *e2) + 
{ exp1 = e1; exp2 = e2; } + char *typeOf() + { return type; } + void split(CharSet&); + void calcSize(Char*); + uint fixedLength(); + void compile(Char*, Ins*); + void display(ostream &o) const { + o << exp1 << exp2; + } +}; + +class CloseOp: public RegExp { +private: + RegExp *exp; +public: + static char *type; +public: + CloseOp(RegExp *e) + { exp = e; } + char *typeOf() + { return type; } + void split(CharSet&); + void calcSize(Char*); + void compile(Char*, Ins*); + void display(ostream &o) const { + o << exp << "+"; + } +}; + +extern void genCode(ostream&, RegExp*); +extern RegExp *mkDiff(RegExp*, RegExp*); +extern RegExp *strToRE(SubStr); +extern RegExp *ranToRE(SubStr); + +#endif diff --git a/tools/re2c/re2c-parser.y b/tools/re2c/re2c-parser.y new file mode 100644 index 00000000..8f2a7dce --- /dev/null +++ b/tools/re2c/re2c-parser.y @@ -0,0 +1,163 @@ +%{ + +#include +#include +#include +#include +#include "globals.h" +#include "parser.h" +int yyparse(); +int yylex(); +void yyerror(char*); + +static uint accept; +static RegExp *spec; +static Scanner *in; + +%} + +%start spec + +%union { + Symbol *symbol; + RegExp *regexp; + Token *token; + char op; +} + +%token CLOSE ID CODE RANGE STRING + +%type CLOSE +%type close +%type ID +%type CODE +%type RANGE STRING +%type rule look expr diff term factor primary + +%% + +spec : + { accept = 0; + spec = NULL; } + | spec rule + { spec = spec? 
mkAlt(spec, $2) : $2; } + | spec decl + ; + +decl : ID '=' expr ';' + { if($1->re) + in->fatal("sym already defined"); + $1->re = $3; } + ; + +rule : expr look CODE + { $$ = new RuleOp($1, $2, $3, accept++); } + ; + +look : + { $$ = new NullOp; } + | '/' expr + { $$ = $2; } + ; + +expr : diff + { $$ = $1; } + | expr '|' diff + { $$ = mkAlt($1, $3); } + ; + +diff : term + { $$ = $1; } + | diff '\\' term + { $$ = mkDiff($1, $3); + if(!$$) + in->fatal("can only difference char sets"); + } + ; + +term : factor + { $$ = $1; } + | term factor + { $$ = new CatOp($1, $2); } + ; + +factor : primary + { $$ = $1; } + | primary close + { + switch($2){ + case '*': + $$ = mkAlt(new CloseOp($1), new NullOp()); + break; + case '+': + $$ = new CloseOp($1); + break; + case '?': + $$ = mkAlt($1, new NullOp()); + break; + } + } + ; + +close : CLOSE + { $$ = $1; } + | close CLOSE + { $$ = ($1 == $2) ? $1 : '*'; } + ; + +primary : ID + { if(!$1->re) + in->fatal("can't find symbol"); + $$ = $1->re; } + | RANGE + { $$ = $1; } + | STRING + { $$ = $1; } + | '(' expr ')' + { $$ = $2; } + ; + +%% + +void yyerror(char* s){ + in->fatal(s); +} + +int yylex(){ + return in->scan(); +} +/* parse: translate the re2c source read from descriptor i, writing the generated code to o. */ +void parse(int i, ostream &o){ + char * fnamebuf; + char * token; + /* banner comment with the generation time; ctime() gives 24 characters before its newline */ + o << "/* Generated by re2c 0.5 on "; + time_t now = time(&now); + o.write(ctime(&now), 24); + o << " */\n"; + + in = new Scanner(i); + /* #line directive: the file name is split at backslashes and rejoined with escaped backslashes */ + o << "#line " << in->line() << " \""; + if( fileName != NULL ) { + fnamebuf = strdup( fileName ); + } else { + fnamebuf = strdup( "" ); + } + token = strtok( fnamebuf, "\\" ); + while( token != NULL ) { /* strtok returns NULL at once for an empty name; NULL must never reach operator<< */ + o << token; + token = strtok( NULL, "\\" ); + if( token == NULL ) break; + o << "\\\\"; + } + o << "\"\n"; + free( fnamebuf ); + /* for as long as echo() reports more input: parse one specification and emit its code */ + while(in->echo(o)){ + yyparse(); + if(spec) + genCode(o, spec); + o << "#line " << in->line() << "\n"; + } +} diff --git a/tools/re2c/re2c.1 b/tools/re2c/re2c.1 new file mode 100644 index 00000000..d69f94d6 --- /dev/null +++ b/tools/re2c/re2c.1 @@ -0,0 +1,536 @@ +.ds re \fBre2c\fP +.ds le
\fBlex\fP +.ds rx regular expression +.ds lx \fIl\fP-expression +.TH RE2C 1 "8 April 1994" "Version 0.5" +\"$Log: re2c.1,v $ +\"Revision 1.1 2002/04/07 22:27:06 peter +\"Initial revision +\" +\"Revision 1.2 1994/04/16 15:50:32 peterr +\"Fix bug in simple example. +\" +\"Revision 1.1 1994/04/08 15:39:09 peterr +\"Initial revision +\" +.SH NAME +re2c \- convert regular expressions to C/C++ + +.SH SYNOPSIS +\*(re [\fB-esb\fP] \fIname\fP + +.SH DESCRIPTION +\*(re is a preprocessor that generates C-based recognizers from regular +expressions. +The input to \*(re consists of C/C++ source interleaved with +comments of the form \fC/*!re2c\fP ... \fC*/\fP which contain +scanner specifications. +In the output these comments are replaced with code that, when +executed, will find the next input token and then execute +some user-supplied token-specific code. + +For example, given the following code + +.in +3 +.nf +#define NULL ((char*) 0) +char *scan(char *p){ +char *q; +#define YYCTYPE char +#define YYCURSOR p +#define YYLIMIT p +#define YYMARKER q +#define YYFILL(n) +/*!re2c + [0-9]+ {return YYCURSOR;} + [\\000-\\377] {return NULL;} +*/ +} +.fi +.in -3 + +\*(re will generate + +.in +3 +.nf +/* Generated by re2c on Sat Apr 16 11:40:58 1994 */ +#line 1 "simple.re" +#define NULL ((char*) 0) +char *scan(char *p){ +char *q; +#define YYCTYPE char +#define YYCURSOR p +#define YYLIMIT p +#define YYMARKER q +#define YYFILL(n) +{ + YYCTYPE yych; + unsigned int yyaccept; + goto yy0; +yy1: ++YYCURSOR; +yy0: + if((YYLIMIT - YYCURSOR) < 2) YYFILL(2); + yych = *YYCURSOR; + if(yych <= '/') goto yy4; + if(yych >= ':') goto yy4; +yy2: yych = *++YYCURSOR; + goto yy7; +yy3: +#line 10 + {return YYCURSOR;} +yy4: yych = *++YYCURSOR; +yy5: +#line 11 + {return NULL;} +yy6: ++YYCURSOR; + if(YYLIMIT == YYCURSOR) YYFILL(1); + yych = *YYCURSOR; +yy7: if(yych <= '/') goto yy3; + if(yych <= '9') goto yy6; + goto yy3; +} +#line 12 + +} +.fi +.in -3 + +.SH OPTIONS +\*(re provides the following options: +.TP 
+\fB-e\fP +Cross-compile from an ASCII platform to an EBCDIC one. +.TP +\fB-s\fP +Generate nested \fCif\fPs for some \fCswitch\fPes. Many compilers need this +assist to generate better code. +.TP +\fB-b\fP +Implies \fB-s\fP. Use bit vectors as well in the attempt to coax better +code out of the compiler. Most useful for specifications with more than a +few keywords (e.g. for most programming languages). + +.SH "INTERFACE CODE" +Unlike other scanner generators, \*(re does not generate complete scanners: +the user must supply some interface code. +In particular, the user must define the following macros: +.TP +\fCYYCTYPE\fP +Type used to hold an input symbol. +Usually \fCchar\fP or \fCunsigned char\fP. +.TP +\fCYYCURSOR\fP +\*(lx of type \fC*YYCTYPE\fP that points to the current input symbol. +The generated code advances \fCYYCURSOR\fP as symbols are matched. +On entry, \fCYYCURSOR\fP is assumed to point to the first character of the +current token. On exit, \fCYYCURSOR\fP will point to the first character of +the following token. +.TP +\fCYYLIMIT\fP +Expression of type \fC*YYCTYPE\fP that marks the end of the buffer +(\fCYYLIMIT[-1]\fP is the last character in the buffer). +The generated code repeatedly compares \fCYYCURSOR\fP to \fCYYLIMIT\fP +to determine when the buffer needs (re)filling. +.TP +\fCYYMARKER\fP +\*(lx of type \fC*YYCTYPE\fP. +The generated code saves backtracking information in \fCYYMARKER\fP. +.TP +\fCYYFILL(\fP\fIn\fP\fC)\fP +The generated code "calls" \fCYYFILL\fP when the buffer needs +(re)filling: at least \fIn\fP additional characters should +be provided. \fCYYFILL\fP should adjust \fCYYCURSOR\fP, \fCYYLIMIT\fP and +\fCYYMARKER\fP as needed. Note that for typical programming languages +\fIn\fP will be the length of the longest keyword plus one. + +.SH "SCANNER SPECIFICATIONS" +Each scanner specification consists of a set of \fIrules\fP and name +definitions. 
+Rules consist of a regular expression along with a block of C/C++ code that +is to be executed when the associated regular expression is matched. +Name definitions are of the form +``\fIname\fP \fC=\fP \fIregular expression\fP\fC;\fP''. + +.SH "SUMMARY OF RE2C REGULAR EXPRESSIONS" +.TP +\fC"foo"\fP +the literal string \fCfoo\fP. +ANSI-C escape sequences can be used. +.TP +\fC[xyz]\fP +a "character class"; in this case, +the \*(rx matches either an '\fCx\fP', a '\fCy\fP', or a '\fCz\fP'. +.TP +\fC[abj-oZ]\fP +a "character class" with a range in it; +matches an '\fCa\fP', a '\fCb\fP', any letter from '\fCj\fP' through '\fCo\fP', +or a '\fCZ\fP'. +.TP +\fIr\fP\fC\e\fP\fIs\fP +match any \fIr\fP which isn't an \fIs\fP. \fIr\fP and \fIs\fP must be regular expressions +which can be expressed as character classes. +.TP +\fIr\fP\fC*\fP +zero or more \fIr\fP's, where \fIr\fP is any regular expression +.TP +\fC\fIr\fP\fC+\fP +one or more \fIr\fP's +.TP +\fC\fIr\fP\fC?\fP +zero or one \fIr\fP's (that is, "an optional \fIr\fP") +.TP +name +the expansion of the "name" definition (see above) +.TP +\fC(\fP\fIr\fP\fC)\fP +an \fIr\fP; parentheses are used to override precedence +(see below) +.TP +\fIrs\fP +an \fIr\fP followed by an \fIs\fP ("concatenation") +.TP +\fIr\fP\fC|\fP\fIs\fP +either an \fIr\fP or an \fIs\fP +.TP +\fIr\fP\fC/\fP\fIs\fP +an \fIr\fP but only if it is followed by an \fIs\fP. The s is not part of +the matched text. This type of \*(rx is called "trailing context". +.LP +The regular expressions listed above are grouped according to +precedence, from highest precedence at the top to lowest at the bottom. +Those grouped together have equal precedence. 
+ +.SH "A LARGER EXAMPLE" +.LP +.in +3 +.nf +#include +#include +#include +#include + +#define ADDEQ 257 +#define ANDAND 258 +#define ANDEQ 259 +#define ARRAY 260 +#define ASM 261 +#define AUTO 262 +#define BREAK 263 +#define CASE 264 +#define CHAR 265 +#define CONST 266 +#define CONTINUE 267 +#define DECR 268 +#define DEFAULT 269 +#define DEREF 270 +#define DIVEQ 271 +#define DO 272 +#define DOUBLE 273 +#define ELLIPSIS 274 +#define ELSE 275 +#define ENUM 276 +#define EQL 277 +#define EXTERN 278 +#define FCON 279 +#define FLOAT 280 +#define FOR 281 +#define FUNCTION 282 +#define GEQ 283 +#define GOTO 284 +#define ICON 285 +#define ID 286 +#define IF 287 +#define INCR 288 +#define INT 289 +#define LEQ 290 +#define LONG 291 +#define LSHIFT 292 +#define LSHIFTEQ 293 +#define MODEQ 294 +#define MULEQ 295 +#define NEQ 296 +#define OREQ 297 +#define OROR 298 +#define POINTER 299 +#define REGISTER 300 +#define RETURN 301 +#define RSHIFT 302 +#define RSHIFTEQ 303 +#define SCON 304 +#define SHORT 305 +#define SIGNED 306 +#define SIZEOF 307 +#define STATIC 308 +#define STRUCT 309 +#define SUBEQ 310 +#define SWITCH 311 +#define TYPEDEF 312 +#define UNION 313 +#define UNSIGNED 314 +#define VOID 315 +#define VOLATILE 316 +#define WHILE 317 +#define XOREQ 318 +#define EOI 319 + +typedef unsigned int uint; +typedef unsigned char uchar; + +#define BSIZE 8192 + +#define YYCTYPE uchar +#define YYCURSOR cursor +#define YYLIMIT s->lim +#define YYMARKER s->ptr +#define YYFILL(n) {cursor = fill(s, cursor);} + +#define RET(i) {s->cur = cursor; return i;} + +typedef struct Scanner { + int fd; + uchar *bot, *tok, *ptr, *cur, *pos, *lim, *top, *eof; + uint line; +} Scanner; + +uchar *fill(Scanner *s, uchar *cursor){ + if(!s->eof){ + uint cnt = s->tok - s->bot; + if(cnt){ + memcpy(s->bot, s->tok, s->lim - s->tok); + s->tok = s->bot; + s->ptr -= cnt; + cursor -= cnt; + s->pos -= cnt; + s->lim -= cnt; + } + if((s->top - s->lim) < BSIZE){ + uchar *buf = (uchar*) + malloc(((s->lim - s->bot) + 
BSIZE)*sizeof(uchar)); + memcpy(buf, s->tok, s->lim - s->tok); + s->tok = buf; + s->ptr = &buf[s->ptr - s->bot]; + cursor = &buf[cursor - s->bot]; + s->pos = &buf[s->pos - s->bot]; + s->lim = &buf[s->lim - s->bot]; + s->top = &s->lim[BSIZE]; + free(s->bot); + s->bot = buf; + } + if((cnt = read(s->fd, (char*) s->lim, BSIZE)) != BSIZE){ + s->eof = &s->lim[cnt]; *(s->eof)++ = '\\n'; + } + s->lim += cnt; + } + return cursor; +} + +int scan(Scanner *s){ + uchar *cursor = s->cur; +std: + s->tok = cursor; +/*!re2c +any = [\\000-\\377]; +O = [0-7]; +D = [0-9]; +L = [a-zA-Z_]; +H = [a-fA-F0-9]; +E = [Ee] [+-]? D+; +FS = [fFlL]; +IS = [uUlL]*; +ESC = [\\\\] ([abfnrtv?'"\\\\] | "x" H+ | O+); +*/ + +/*!re2c + "/*" { goto comment; } + + "auto" { RET(AUTO); } + "break" { RET(BREAK); } + "case" { RET(CASE); } + "char" { RET(CHAR); } + "const" { RET(CONST); } + "continue" { RET(CONTINUE); } + "default" { RET(DEFAULT); } + "do" { RET(DO); } + "double" { RET(DOUBLE); } + "else" { RET(ELSE); } + "enum" { RET(ENUM); } + "extern" { RET(EXTERN); } + "float" { RET(FLOAT); } + "for" { RET(FOR); } + "goto" { RET(GOTO); } + "if" { RET(IF); } + "int" { RET(INT); } + "long" { RET(LONG); } + "register" { RET(REGISTER); } + "return" { RET(RETURN); } + "short" { RET(SHORT); } + "signed" { RET(SIGNED); } + "sizeof" { RET(SIZEOF); } + "static" { RET(STATIC); } + "struct" { RET(STRUCT); } + "switch" { RET(SWITCH); } + "typedef" { RET(TYPEDEF); } + "union" { RET(UNION); } + "unsigned" { RET(UNSIGNED); } + "void" { RET(VOID); } + "volatile" { RET(VOLATILE); } + "while" { RET(WHILE); } + + L (L|D)* { RET(ID); } + + ("0" [xX] H+ IS?) | ("0" D+ IS?) | (D+ IS?) | + (['] (ESC|any\\[\\n\\\\'])* [']) + { RET(ICON); } + + (D+ E FS?) | (D* "." D+ E? FS?) | (D+ "." D* E? FS?) + { RET(FCON); } + + (["] (ESC|any\\[\\n\\\\"])* ["]) + { RET(SCON); } + + "..." 
{ RET(ELLIPSIS); } + ">>=" { RET(RSHIFTEQ); } + "<<=" { RET(LSHIFTEQ); } + "+=" { RET(ADDEQ); } + "-=" { RET(SUBEQ); } + "*=" { RET(MULEQ); } + "/=" { RET(DIVEQ); } + "%=" { RET(MODEQ); } + "&=" { RET(ANDEQ); } + "^=" { RET(XOREQ); } + "|=" { RET(OREQ); } + ">>" { RET(RSHIFT); } + "<<" { RET(LSHIFT); } + "++" { RET(INCR); } + "--" { RET(DECR); } + "->" { RET(DEREF); } + "&&" { RET(ANDAND); } + "||" { RET(OROR); } + "<=" { RET(LEQ); } + ">=" { RET(GEQ); } + "==" { RET(EQL); } + "!=" { RET(NEQ); } + ";" { RET(';'); } + "{" { RET('{'); } + "}" { RET('}'); } + "," { RET(','); } + ":" { RET(':'); } + "=" { RET('='); } + "(" { RET('('); } + ")" { RET(')'); } + "[" { RET('['); } + "]" { RET(']'); } + "." { RET('.'); } + "&" { RET('&'); } + "!" { RET('!'); } + "~" { RET('~'); } + "-" { RET('-'); } + "+" { RET('+'); } + "*" { RET('*'); } + "/" { RET('/'); } + "%" { RET('%'); } + "<" { RET('<'); } + ">" { RET('>'); } + "^" { RET('^'); } + "|" { RET('|'); } + "?" { RET('?'); } + + + [ \\t\\v\\f]+ { goto std; } + + "\\n" + { + if(cursor == s->eof) RET(EOI); + s->pos = cursor; s->line++; + goto std; + } + + any + { + printf("unexpected character: %c\\n", *s->tok); + goto std; + } +*/ + +comment: +/*!re2c + "*/" { goto std; } + "\\n" + { + if(cursor == s->eof) RET(EOI); + s->tok = s->pos = cursor; s->line++; + goto comment; + } + any { goto comment; } +*/ +} + +main(){ + Scanner in; + int t; + memset((char*) &in, 0, sizeof(in)); + in.fd = 0; + while((t = scan(&in)) != EOI){ +/* + printf("%d\\t%.*s\\n", t, in.cur - in.tok, in.tok); + printf("%d\\n", t); +*/ + } + close(in.fd); +} +.fi +.in -3 + +.SH "SEE ALSO" +.LP +flex(1), lex(1). + +.SH FEATURES +.LP +\*(re does not provide a default action: +the generated code assumes that the input +will consist of a sequence of tokens. +Typically this can be dealt with by adding a rule such as the one for +unexpected characters in the example above. 
+.LP +The user must arrange for a sentinel token to appear at the end of input +(and provide a rule for matching it): +\*(re does not provide an \fC<<EOF>>\fP expression. +If the source is from a null-byte terminated string, a +rule matching a null character will suffice. If the source is from a +file then the approach taken in the example can be used: pad the input with +a newline (or some other character that can't appear within another token); +upon recognizing such a character check to see if it is the sentinel +and act accordingly. +.LP +\*(re does not provide start conditions: use a separate scanner +specification for each start condition (as illustrated in the above example). +.LP +Negated character classes such as \fC[^x]\fP are not supported; use difference instead. +.SH BUGS +.LP +Only fixed length trailing context can be handled. +.LP +The maximum value appearing as a parameter \fIn\fP to \fCYYFILL\fP is not +provided to the generated code (this value is needed for constructing +the interface code). +Note that this value is usually relatively small: for +typical programming languages \fIn\fP will be the length of the longest +keyword plus one. +.LP +Difference only works for character sets. +.LP +The \*(re internal algorithms need documentation. 
+ +.SH AUTHOR +.LP +Please send bug reports, fixes and feedback to: +.LP +.nf +Peter Bumbulis +Computer Systems Group +University of Waterloo +Waterloo, Ontario +N2L 3G1 +Internet: peterr@csg.uwaterloo.ca +.fi diff --git a/tools/re2c/scanner.cc b/tools/re2c/scanner.cc new file mode 100644 index 00000000..19b42597 --- /dev/null +++ b/tools/re2c/scanner.cc @@ -0,0 +1,470 @@ +/* Generated by re2c 0.5 on Sat May 15 11:35:52 1999 */ +#line 1 "scanner.re" +#include +#include +#include +#include +#include "scanner.h" +#include "parser.h" +#include "y.tab.h" + +extern YYSTYPE yylval; + +#define BSIZE 8192 + +#define YYCTYPE uchar +#define YYCURSOR cursor +#define YYLIMIT lim +#define YYMARKER ptr +#define YYFILL(n) {cursor = fill(cursor);} + +#define RETURN(i) {cur = cursor; return i;} + + +Scanner::Scanner(int i) : in(i), + bot(NULL), tok(NULL), ptr(NULL), cur(NULL), pos(NULL), lim(NULL), + top(NULL), eof(NULL), tchar(0), tline(0), cline(1) { + ; +} + +uchar *Scanner::fill(uchar *cursor){ + if(!eof){ + uint cnt = tok - bot; + if(cnt){ + memcpy(bot, tok, lim - tok); + tok = bot; + ptr -= cnt; + cursor -= cnt; + pos -= cnt; + lim -= cnt; + } + if((top - lim) < BSIZE){ + uchar *buf = new uchar[(lim - bot) + BSIZE]; + memcpy(buf, tok, lim - tok); + tok = buf; + ptr = &buf[ptr - bot]; + cursor = &buf[cursor - bot]; + pos = &buf[pos - bot]; + lim = &buf[lim - bot]; + top = &lim[BSIZE]; + delete [] bot; + bot = buf; + } + if((cnt = read(in, (char*) lim, BSIZE)) != BSIZE){ + eof = &lim[cnt]; *eof++ = '\n'; + } + lim += cnt; + } + return cursor; +} + +#line 68 + + +int Scanner::echo(ostream &out){ + uchar *cursor = cur; + tok = cursor; +echo: +{ + YYCTYPE yych; + unsigned int yyaccept; + goto yy0; +yy1: ++YYCURSOR; +yy0: + if((YYLIMIT - YYCURSOR) < 7) YYFILL(7); + yych = *YYCURSOR; + if(yych == '\n') goto yy4; + if(yych != '/') goto yy6; +yy2: yyaccept = 0; + yych = *(YYMARKER = ++YYCURSOR); + if(yych == '*') goto yy7; +yy3: +#line 82 + { goto echo; } +yy4: yych = *++YYCURSOR; 
+yy5: +#line 78 + { if(cursor == eof) RETURN(0); + out.write(tok, cursor - tok); + tok = pos = cursor; cline++; + goto echo; } +yy6: yych = *++YYCURSOR; + goto yy3; +yy7: yych = *++YYCURSOR; + if(yych == '!') goto yy9; +yy8: YYCURSOR = YYMARKER; + switch(yyaccept){ + case 0: goto yy3; + } +yy9: yych = *++YYCURSOR; + if(yych != 'r') goto yy8; +yy10: yych = *++YYCURSOR; + if(yych != 'e') goto yy8; +yy11: yych = *++YYCURSOR; + if(yych != '2') goto yy8; +yy12: yych = *++YYCURSOR; + if(yych != 'c') goto yy8; +yy13: yych = *++YYCURSOR; +yy14: +#line 75 + { out.write(tok, &cursor[-7] - tok); + tok = cursor; + RETURN(1); } +} +#line 83 + +} + + +int Scanner::scan(){ + uchar *cursor = cur; + uint depth; + +scan: + tchar = cursor - pos; + tline = cline; + tok = cursor; +{ + YYCTYPE yych; + unsigned int yyaccept; + goto yy15; +yy16: ++YYCURSOR; +yy15: + if((YYLIMIT - YYCURSOR) < 2) YYFILL(2); + yych = *YYCURSOR; + if(yych <= ':'){ + if(yych <= '"'){ + if(yych <= '\n'){ + if(yych <= '\b') goto yy35; + if(yych <= '\t') goto yy31; + goto yy33; + } else { + if(yych == ' ') goto yy31; + if(yych <= '!') goto yy35; + goto yy23; + } + } else { + if(yych <= '*'){ + if(yych <= '\'') goto yy35; + if(yych <= ')') goto yy27; + goto yy21; + } else { + if(yych <= '+') goto yy28; + if(yych == '/') goto yy19; + goto yy35; + } + } + } else { + if(yych <= 'Z'){ + if(yych <= '='){ + if(yych == '<') goto yy35; + goto yy27; + } else { + if(yych == '?') goto yy28; + if(yych <= '@') goto yy35; + goto yy29; + } + } else { + if(yych <= '`'){ + if(yych <= '[') goto yy25; + if(yych <= '\\') goto yy27; + goto yy35; + } else { + if(yych <= 'z') goto yy29; + if(yych <= '{') goto yy17; + if(yych <= '|') goto yy27; + goto yy35; + } + } + } +yy17: yych = *++YYCURSOR; +yy18: +#line 96 + { depth = 1; + goto code; + } +yy19: yych = *++YYCURSOR; + if(yych == '*') goto yy54; +yy20: +#line 115 + { RETURN(*tok); } +yy21: yych = *++YYCURSOR; + if(yych == '/') goto yy52; +yy22: +#line 117 + { yylval.op = *tok; + 
RETURN(CLOSE); } +yy23: yyaccept = 0; + yych = *(YYMARKER = ++YYCURSOR); + if(yych != '\n') goto yy48; +yy24: +#line 108 + { fatal("bad string"); } +yy25: yyaccept = 1; + yych = *(YYMARKER = ++YYCURSOR); + if(yych != '\n') goto yy42; +yy26: +#line 113 + { fatal("bad character constant"); } +yy27: yych = *++YYCURSOR; + goto yy20; +yy28: yych = *++YYCURSOR; + goto yy22; +yy29: yych = *++YYCURSOR; + goto yy40; +yy30: +#line 120 + { cur = cursor; + yylval.symbol = Symbol::find(token()); + return ID; } +yy31: yych = *++YYCURSOR; + goto yy38; +yy32: +#line 124 + { goto scan; } +yy33: yych = *++YYCURSOR; +yy34: +#line 126 + { if(cursor == eof) RETURN(0); + pos = cursor; cline++; + goto scan; + } +yy35: yych = *++YYCURSOR; +yy36: +#line 131 + { cerr << "unexpected character: " << *tok << endl; + goto scan; + } +yy37: ++YYCURSOR; + if(YYLIMIT == YYCURSOR) YYFILL(1); + yych = *YYCURSOR; +yy38: if(yych == '\t') goto yy37; + if(yych == ' ') goto yy37; + goto yy32; +yy39: ++YYCURSOR; + if(YYLIMIT == YYCURSOR) YYFILL(1); + yych = *YYCURSOR; +yy40: if(yych <= '@'){ + if(yych <= '/') goto yy30; + if(yych <= '9') goto yy39; + goto yy30; + } else { + if(yych <= 'Z') goto yy39; + if(yych <= '`') goto yy30; + if(yych <= 'z') goto yy39; + goto yy30; + } +yy41: ++YYCURSOR; + if(YYLIMIT == YYCURSOR) YYFILL(1); + yych = *YYCURSOR; +yy42: if(yych <= '['){ + if(yych != '\n') goto yy41; + } else { + if(yych <= '\\') goto yy44; + if(yych <= ']') goto yy45; + goto yy41; + } +yy43: YYCURSOR = YYMARKER; + switch(yyaccept){ + case 0: goto yy24; + case 1: goto yy26; + } +yy44: ++YYCURSOR; + if(YYLIMIT == YYCURSOR) YYFILL(1); + yych = *YYCURSOR; + if(yych == '\n') goto yy43; + goto yy41; +yy45: yych = *++YYCURSOR; +yy46: +#line 110 + { cur = cursor; + yylval.regexp = ranToRE(token()); + return RANGE; } +yy47: ++YYCURSOR; + if(YYLIMIT == YYCURSOR) YYFILL(1); + yych = *YYCURSOR; +yy48: if(yych <= '!'){ + if(yych == '\n') goto yy43; + goto yy47; + } else { + if(yych <= '"') goto yy50; + if(yych != 
'\\') goto yy47; + } +yy49: ++YYCURSOR; + if(YYLIMIT == YYCURSOR) YYFILL(1); + yych = *YYCURSOR; + if(yych == '\n') goto yy43; + goto yy47; +yy50: yych = *++YYCURSOR; +yy51: +#line 105 + { cur = cursor; + yylval.regexp = strToRE(token()); + return STRING; } +yy52: yych = *++YYCURSOR; +yy53: +#line 102 + { tok = cursor; + RETURN(0); } +yy54: yych = *++YYCURSOR; +yy55: +#line 99 + { depth = 1; + goto comment; } +} +#line 134 + + +code: +{ + YYCTYPE yych; + unsigned int yyaccept; + goto yy56; +yy57: ++YYCURSOR; +yy56: + if((YYLIMIT - YYCURSOR) < 2) YYFILL(2); + yych = *YYCURSOR; + if(yych <= '&'){ + if(yych <= '\n'){ + if(yych <= '\t') goto yy64; + goto yy62; + } else { + if(yych == '"') goto yy66; + goto yy64; + } + } else { + if(yych <= '{'){ + if(yych <= '\'') goto yy67; + if(yych <= 'z') goto yy64; + goto yy60; + } else { + if(yych != '}') goto yy64; + } + } +yy58: yych = *++YYCURSOR; +yy59: +#line 138 + { if(--depth == 0){ + cur = cursor; + yylval.token = new Token(token(), tline); + return CODE; + } + goto code; } +yy60: yych = *++YYCURSOR; +yy61: +#line 144 + { ++depth; + goto code; } +yy62: yych = *++YYCURSOR; +yy63: +#line 146 + { if(cursor == eof) fatal("missing '}'"); + pos = cursor; cline++; + goto code; + } +yy64: yych = *++YYCURSOR; +yy65: +#line 150 + { goto code; } +yy66: yyaccept = 0; + yych = *(YYMARKER = ++YYCURSOR); + if(yych == '\n') goto yy65; + goto yy73; +yy67: yyaccept = 0; + yych = *(YYMARKER = ++YYCURSOR); + if(yych == '\n') goto yy65; + goto yy69; +yy68: ++YYCURSOR; + if(YYLIMIT == YYCURSOR) YYFILL(1); + yych = *YYCURSOR; +yy69: if(yych <= '&'){ + if(yych != '\n') goto yy68; + } else { + if(yych <= '\'') goto yy64; + if(yych == '\\') goto yy71; + goto yy68; + } +yy70: YYCURSOR = YYMARKER; + switch(yyaccept){ + case 0: goto yy65; + } +yy71: ++YYCURSOR; + if(YYLIMIT == YYCURSOR) YYFILL(1); + yych = *YYCURSOR; + if(yych == '\n') goto yy70; + goto yy68; +yy72: ++YYCURSOR; + if(YYLIMIT == YYCURSOR) YYFILL(1); + yych = *YYCURSOR; +yy73: if(yych 
<= '!'){ + if(yych == '\n') goto yy70; + goto yy72; + } else { + if(yych <= '"') goto yy64; + if(yych != '\\') goto yy72; + } +yy74: ++YYCURSOR; + if(YYLIMIT == YYCURSOR) YYFILL(1); + yych = *YYCURSOR; + if(yych == '\n') goto yy70; + goto yy72; +} +#line 151 + + +comment: +{ + YYCTYPE yych; + unsigned int yyaccept; + goto yy75; +yy76: ++YYCURSOR; +yy75: + if((YYLIMIT - YYCURSOR) < 2) YYFILL(2); + yych = *YYCURSOR; + if(yych <= ')'){ + if(yych == '\n') goto yy80; + goto yy82; + } else { + if(yych <= '*') goto yy77; + if(yych == '/') goto yy79; + goto yy82; + } +yy77: yych = *++YYCURSOR; + if(yych == '/') goto yy85; +yy78: +#line 165 + { goto comment; } +yy79: yych = *++YYCURSOR; + if(yych == '*') goto yy83; + goto yy78; +yy80: yych = *++YYCURSOR; +yy81: +#line 161 + { if(cursor == eof) RETURN(0); + tok = pos = cursor; cline++; + goto comment; + } +yy82: yych = *++YYCURSOR; + goto yy78; +yy83: yych = *++YYCURSOR; +yy84: +#line 159 + { ++depth; + goto comment; } +yy85: yych = *++YYCURSOR; +yy86: +#line 155 + { if(--depth == 0) + goto scan; + else + goto comment; } +} +#line 166 + +} + +void Scanner::fatal(char *msg){ + cerr << "line " << tline << ", column " << (tchar + 1) << ": " + << msg << endl; + exit(1); +} diff --git a/tools/re2c/scanner.h b/tools/re2c/scanner.h new file mode 100644 index 00000000..cf5bb1f2 --- /dev/null +++ b/tools/re2c/scanner.h @@ -0,0 +1,30 @@ +#ifndef _scanner_h +#define _scanner_h + +#include "token.h" + +class Scanner { + private: + int in; + uchar *bot, *tok, *ptr, *cur, *pos, *lim, *top, *eof; + uint tchar, tline, cline; + private: + uchar *fill(uchar*); + public: + Scanner(int); + int echo(ostream&); + int scan(); + void fatal(char*); + SubStr token(); + uint line(); +}; + +inline SubStr Scanner::token(){ + return SubStr(tok, cur - tok); +} + +inline uint Scanner::line(){ + return cline; +} + +#endif diff --git a/tools/re2c/scanner.re b/tools/re2c/scanner.re new file mode 100644 index 00000000..f7b48cbb --- /dev/null +++ 
b/tools/re2c/scanner.re @@ -0,0 +1,173 @@ +#include +#include +#include +#include +#include "scanner.h" +#include "parser.h" +#include "y.tab.h" + +extern YYSTYPE yylval; + +#define BSIZE 8192 + +#define YYCTYPE uchar +#define YYCURSOR cursor +#define YYLIMIT lim +#define YYMARKER ptr +#define YYFILL(n) {cursor = fill(cursor);} + +#define RETURN(i) {cur = cursor; return i;} + + +Scanner::Scanner(int i) : in(i), + bot(NULL), tok(NULL), ptr(NULL), cur(NULL), pos(NULL), lim(NULL), + top(NULL), eof(NULL), tchar(0), tline(0), cline(1) { + ; +} + +uchar *Scanner::fill(uchar *cursor){ + if(!eof){ + uint cnt = tok - bot; + if(cnt){ + memcpy(bot, tok, lim - tok); + tok = bot; + ptr -= cnt; + cursor -= cnt; + pos -= cnt; + lim -= cnt; + } + if((top - lim) < BSIZE){ + uchar *buf = new uchar[(lim - bot) + BSIZE]; + memcpy(buf, tok, lim - tok); + tok = buf; + ptr = &buf[ptr - bot]; + cursor = &buf[cursor - bot]; + pos = &buf[pos - bot]; + lim = &buf[lim - bot]; + top = &lim[BSIZE]; + delete [] bot; + bot = buf; + } + if((cnt = read(in, (char*) lim, BSIZE)) != BSIZE){ + eof = &lim[cnt]; *eof++ = '\n'; + } + lim += cnt; + } + return cursor; +} + +/*!re2c +any = [\000-\377]; +dot = any \ [\n]; +esc = dot \ [\\]; +cstring = "[" ((esc \ [\]]) | "\\" dot)* "]" ; +dstring = "\"" ((esc \ ["] ) | "\\" dot)* "\""; +sstring = "'" ((esc \ ['] ) | "\\" dot)* "'" ; +letter = [a-zA-Z]; +digit = [0-9]; +*/ + +int Scanner::echo(ostream &out){ + uchar *cursor = cur; + tok = cursor; +echo: +/*!re2c + "/*!re2c" { out.write(tok, &cursor[-7] - tok); + tok = cursor; + RETURN(1); } + "\n" { if(cursor == eof) RETURN(0); + out.write(tok, cursor - tok); + tok = pos = cursor; cline++; + goto echo; } + any { goto echo; } +*/ +} + + +int Scanner::scan(){ + uchar *cursor = cur; + uint depth; + +scan: + tchar = cursor - pos; + tline = cline; + tok = cursor; +/*!re2c + "{" { depth = 1; + goto code; + } + "/*" { depth = 1; + goto comment; } + + "*/" { tok = cursor; + RETURN(0); } + + dstring { cur = cursor; + 
yylval.regexp = strToRE(token()); + return STRING; } + "\"" { fatal("bad string"); } + + cstring { cur = cursor; + yylval.regexp = ranToRE(token()); + return RANGE; } + "[" { fatal("bad character constant"); } + + [()|=;/\\] { RETURN(*tok); } + + [*+?] { yylval.op = *tok; + RETURN(CLOSE); } + + letter (letter|digit)* { cur = cursor; + yylval.symbol = Symbol::find(token()); + return ID; } + + [ \t]+ { goto scan; } + + "\n" { if(cursor == eof) RETURN(0); + pos = cursor; cline++; + goto scan; + } + + any { cerr << "unexpected character: " << *tok << endl; + goto scan; + } +*/ + +code: +/*!re2c + "}" { if(--depth == 0){ + cur = cursor; + yylval.token = new Token(token(), tline); + return CODE; + } + goto code; } + "{" { ++depth; + goto code; } + "\n" { if(cursor == eof) fatal("missing '}'"); + pos = cursor; cline++; + goto code; + } + dstring | sstring | any { goto code; } +*/ + +comment: +/*!re2c + "*/" { if(--depth == 0) + goto scan; + else + goto comment; } + "/*" { ++depth; + goto comment; } + "\n" { if(cursor == eof) RETURN(0); + tok = pos = cursor; cline++; + goto comment; + } + any { goto comment; } +*/ +} + +void Scanner::fatal(char *msg){ + cerr << "line " << tline << ", column " << (tchar + 1) << ": " + << msg << endl; + exit(1); +} diff --git a/tools/re2c/substr.cc b/tools/re2c/substr.cc new file mode 100644 index 00000000..3275660e --- /dev/null +++ b/tools/re2c/substr.cc @@ -0,0 +1,30 @@ +#include +#include "substr.h" + +void SubStr::out(ostream& o) const { /* writes exactly len bytes; str is not required to be NUL-terminated */ + o.write(str, len); +} + +bool operator==(const SubStr &s1, const SubStr &s2){ /* equal when lengths match and the bytes compare equal */ + return (bool) (s1.len == s2.len && memcmp(s1.str, s2.str, s1.len) == 0); +} + +Str::Str(const SubStr& s) : SubStr(new char[s.len], s.len) { /* owning copy of s's bytes */ + memcpy(str, s.str, s.len); +} + +Str::Str(Str& s) : SubStr(s.str, s.len) { /* takes over s's buffer and leaves s empty, so only one Str frees it */ + s.str = NULL; + s.len = 0; +} + +Str::Str() : SubStr((char*) NULL, 0) { + ; +} + + +Str::~Str() { /* the buffer comes from new char[] in Str(const SubStr&), so it needs the array form of delete */ + delete [] str; + str = (char*)-1; + len = (uint)-1; +} diff --git a/tools/re2c/substr.h b/tools/re2c/substr.h 
new file mode 100644 index 00000000..fb5e2cc2 --- /dev/null +++ b/tools/re2c/substr.h @@ -0,0 +1,45 @@ +#ifndef _substr_h +#define _substr_h + +#include +#include "basics.h" + +class SubStr { +public: + char *str; + uint len; +public: + friend bool operator==(const SubStr &, const SubStr &); + SubStr(uchar*, uint); + SubStr(char*, uint); + SubStr(const SubStr&); + void out(ostream&) const; +}; + +class Str: public SubStr { +public: + Str(const SubStr&); + Str(Str&); + Str(); + ~Str(); +}; + +inline ostream& operator<<(ostream& o, const SubStr &s){ + s.out(o); + return o; +} + +inline ostream& operator<<(ostream& o, const SubStr* s){ + return o << *s; +} + +inline SubStr::SubStr(uchar *s, uint l) + : str((char*) s), len(l) { } + +inline SubStr::SubStr(char *s, uint l) + : str(s), len(l) { } + +inline SubStr::SubStr(const SubStr &s) + : str(s.str), len(s.len) { } + +#endif diff --git a/tools/re2c/token.h b/tools/re2c/token.h new file mode 100644 index 00000000..de51eb48 --- /dev/null +++ b/tools/re2c/token.h @@ -0,0 +1,18 @@ +#ifndef _token_h +#define _token_h + +#include "substr.h" + +class Token { + public: + Str text; + uint line; + public: + Token(SubStr, uint); +}; + +inline Token::Token(SubStr t, uint l) : text(t), line(l) { + ; +} + +#endif diff --git a/tools/re2c/translate.cc b/tools/re2c/translate.cc new file mode 100644 index 00000000..2eeaabf0 --- /dev/null +++ b/tools/re2c/translate.cc @@ -0,0 +1,61 @@ +#include "globals.h" + +uchar asc2asc[256] = { +0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, +0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f, +0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f, +0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f, +0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f, +0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5a,0x5b,0x5c,0x5d,0x5e,0x5f, 
+0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f, +0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x7b,0x7c,0x7d,0x7e,0x7f, +0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f, +0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f, +0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf, +0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf, +0xc0,0xc1,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xcb,0xcc,0xcd,0xce,0xcf, +0xd0,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xdb,0xdc,0xdd,0xde,0xdf, +0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef, +0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff +}; + +uchar *xlat = asc2asc; +uchar *talx = asc2asc; + +uchar asc2ebc[256] = { /* Based on ISO 8859/1 and Code Page 37 */ +0x00,0x01,0x02,0x03,0x37,0x2d,0x2e,0x2f,0x16,0x05,0x25,0x0b,0x0c,0x0d,0x0e,0x0f, +0x10,0x11,0x12,0x13,0x3c,0x3d,0x32,0x26,0x18,0x19,0x3f,0x27,0x1c,0x1d,0x1e,0x1f, +0x40,0x5a,0x7f,0x7b,0x5b,0x6c,0x50,0x7d,0x4d,0x5d,0x5c,0x4e,0x6b,0x60,0x4b,0x61, +0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0x7a,0x5e,0x4c,0x7e,0x6e,0x6f, +0x7c,0xc1,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6, +0xd7,0xd8,0xd9,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xba,0xe0,0xbb,0xb0,0x6d, +0x79,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x91,0x92,0x93,0x94,0x95,0x96, +0x97,0x98,0x99,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xc0,0x4f,0xd0,0xa1,0x07, +0x20,0x21,0x22,0x23,0x24,0x15,0x06,0x17,0x28,0x29,0x2a,0x2b,0x2c,0x09,0x0a,0x1b, +0x30,0x31,0x1a,0x33,0x34,0x35,0x36,0x08,0x38,0x39,0x3a,0x3b,0x04,0x14,0x3e,0xff, +0x41,0xaa,0x4a,0xb1,0x9f,0xb2,0x6a,0xb5,0xbd,0xb4,0x9a,0x8a,0x5f,0xca,0xaf,0xbc, +0x90,0x8f,0xea,0xfa,0xbe,0xa0,0xb6,0xb3,0x9d,0xda,0x9b,0x8b,0xb7,0xb8,0xb9,0xab, 
+0x64,0x65,0x62,0x66,0x63,0x67,0x9e,0x68,0x74,0x71,0x72,0x73,0x78,0x75,0x76,0x77, +0xac,0x69,0xed,0xee,0xeb,0xef,0xec,0xbf,0x80,0xfd,0xfe,0xfb,0xfc,0xad,0x8e,0x59, +0x44,0x45,0x42,0x46,0x43,0x47,0x9c,0x48,0x54,0x51,0x52,0x53,0x58,0x55,0x56,0x57, +0x8c,0x49,0xcd,0xce,0xcb,0xcf,0xcc,0xe1,0x70,0xdd,0xde,0xdb,0xdc,0x8d,0xae,0xdf +}; + +uchar ebc2asc[256] = { /* Based on ISO 8859/1 and Code Page 37 */ +0x00,0x01,0x02,0x03,0x9c,0x09,0x86,0x7f,0x97,0x8d,0x8e,0x0b,0x0c,0x0d,0x0e,0x0f, +0x10,0x11,0x12,0x13,0x9d,0x85,0x08,0x87,0x18,0x19,0x92,0x8f,0x1c,0x1d,0x1e,0x1f, +0x80,0x81,0x82,0x83,0x84,0x0a,0x17,0x1b,0x88,0x89,0x8a,0x8b,0x8c,0x05,0x06,0x07, +0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9a,0x9b,0x14,0x15,0x9e,0x1a, +0x20,0xa0,0xe2,0xe4,0xe0,0xe1,0xe3,0xe5,0xe7,0xf1,0xa2,0x2e,0x3c,0x28,0x2b,0x7c, +0x26,0xe9,0xea,0xeb,0xe8,0xed,0xee,0xef,0xec,0xdf,0x21,0x24,0x2a,0x29,0x3b,0xac, +0x2d,0x2f,0xc2,0xc4,0xc0,0xc1,0xc3,0xc5,0xc7,0xd1,0xa6,0x2c,0x25,0x5f,0x3e,0x3f, +0xf8,0xc9,0xca,0xcb,0xc8,0xcd,0xce,0xcf,0xcc,0x60,0x3a,0x23,0x40,0x27,0x3d,0x22, +0xd8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xab,0xbb,0xf0,0xfd,0xde,0xb1, +0xb0,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,0x70,0x71,0x72,0xaa,0xba,0xe6,0xb8,0xc6,0xa4, +0xb5,0x7e,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0xa1,0xbf,0xd0,0xdd,0xfe,0xae, +0x5e,0xa3,0xa5,0xb7,0xa9,0xa7,0xb6,0xbc,0xbd,0xbe,0x5b,0x5d,0xaf,0xa8,0xb4,0xd7, +0x7b,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xad,0xf4,0xf6,0xf2,0xf3,0xf5, +0x7d,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f,0x50,0x51,0x52,0xb9,0xfb,0xfc,0xf9,0xfa,0xff, +0x5c,0xf7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5a,0xb2,0xd4,0xd6,0xd2,0xd3,0xd5, +0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xb3,0xdb,0xdc,0xd9,0xda,0x9f +}; diff --git a/tools/re2c/y.tab.h b/tools/re2c/y.tab.h new file mode 100644 index 00000000..d7b3702d --- /dev/null +++ b/tools/re2c/y.tab.h @@ -0,0 +1,12 @@ +#define CLOSE 257 +#define ID 258 +#define CODE 259 +#define RANGE 260 +#define STRING 261 +typedef union { + Symbol *symbol; + 
RegExp *regexp; + Token *token; + char op; +} YYSTYPE; +extern YYSTYPE yylval;