From: nuffer Date: Sat, 31 Jan 2004 15:44:39 +0000 (+0000) Subject: Applied patch from Marcus Boerger X-Git-Tag: 0.13.6~746 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=41a5c492d938848873bfabb3062911f17873ae9d;p=re2c Applied patch from Marcus Boerger --- diff --git a/.cvsignore b/.cvsignore new file mode 100644 index 00000000..92d11064 --- /dev/null +++ b/.cvsignore @@ -0,0 +1 @@ +re2c diff --git a/Makefile b/Makefile index 78683496..b1be0d52 100644 --- a/Makefile +++ b/Makefile @@ -1,63 +1,81 @@ -# $Log$ -# Revision 1.2 2004/01/12 04:36:00 nuffer -# Integrated in bug fixes from yasm. 1. Fix detection of EOF in the re2c scanner. 2. Correctly output #line directives so that debugging the generated output is possible. -# -# Revision 1.1.1.1 2003/12/13 04:58:19 nuffer -# Initial import -# -#Revision 1.1 1994/04/08 16:30:37 peter -#Initial revision -# +# $Id$ BIN = /usr/local/bin MAN = /usr/local/man +RE2C_VERSION = 0.9.2 %.o : %.cc ; $(CC) -o $@ $(CFLAGS) -c $< %.cc : %.y ; $(YACC) $(YFLAGS) $<; mv $(YTAB).c $@ -%.cc : %.l ; $(LEX) $(LFLAGS) $<; mv $(LEXYY).c $@ %.cc: %.re - ./re2c -s $< >$@ + -@if test -x re2c; then \ + echo "re2c -s $< >$@"; \ + re2c -s $< >$@; \ + else \ + echo "cp -f bootstrap/$@ $@"; \ + cp -f bootstrap/$@ $@; \ + fi SOURCES = code.cc dfa.cc main.cc parser.y actions.cc scanner.re substr.cc\ translate.cc OBJS = code.o dfa.o main.o parser.o actions.o scanner.o substr.o\ translate.o -CC = g++ -CFLAGS = -O2 -Wall -I. -Wno-unused -Wno-parentheses +CC = g++ +CFLAGS = -O2 -Wall -I. 
-Wno-unused -Wno-parentheses -Wno-deprecated +YACC = bison -y YFLAGS = -d LDFLAGS = default: re2c -clean: - rm -f *.o *.s y.tab.c y.tab.h scanner.cc parser.cc .version version.h re2c +dist-clean: + rm -f *.o *.s y.tab.c y.tab.h parser.cc .version version.h + +clean: dist-clean + rm -f re2c scanner.cc README re2c.1 re2c.ps re2c*.spec makerpm parser.cc: parser.y - bison -d parser.y - mv -f parser.tab.c parser.cc + $(YACC) $(YFLAGS) parser.y + mv -f y.tab.c parser.cc -re2c: $(OBJS) +re2c: README re2c.1 $(OBJS) $(CC) -o $@ $(OBJS) $(LDFLAGS) -lstdc++ -.version: README - egrep "^Version" README | sed 's/Version //' > .version +re2c.ps: + gunzip -c doc/loplas.ps.gz > re2c.ps + +.version: + echo $(RE2C_VERSION) > .version -version.h: .version - echo "#define RE2C_VERSION" `cat .version` > version.h +version.h: + echo "#define RE2C_VERSION \"$(RE2C_VERSION)\"" > version.h -install: re2c +README: + cat README.in | sed 's/RE2C_VERSION/$(RE2C_VERSION)/g' > README + +makerpm: + cat makerpm.in | sed 's/RE2C_VERSION/$(RE2C_VERSION)/g' > makerpm + chmod +x makerpm + +re2c.1: + cat re2c.1.in | sed 's/RE2C_VERSION/$(RE2C_VERSION)/g' > re2c.1 + +install: re2c re2c.1 install -d $(BIN) install -s re2c $(BIN) install -d $(MAN)/man1 install -m 0644 re2c.1 $(MAN)/man1 -dist: re2c scanner.cc .version - mkdir re2c-`cat .version` - cp -P `p4 files ... | sed s/\\\\/\\\\/depot\\\\/home\\\\/re2c\\\\/// | sed '/- delete/d' | sed s/#.*$$//` re2c-`cat .version`/ - tar zcf re2c-`cat .version`.tar.gz re2c-`cat .version`/ - rm -rf re2c-`cat .version` +uninstall: + rm -f $(BIN)/re2c + rm -f $(MAN)/man1/re2c.1* + +dist: re2c scanner.cc + mkdir re2c-$(RE2C_VERSION) + cp -P `p4 files ... | sed s/\\\\/\\\\/depot\\\\/home\\\\/re2c\\\\/// | sed '/- delete/d' | sed s/#.*$$//` re2c-$(RE2C_VERSION)/ + tar zcf re2c-$(RE2C_VERSION).tar.gz re2c-$(RE2C_VERSION)/ + rm -rf re2c-$(RE2C_VERSION) # # generated with "gcc -I. 
-MM -x c++ *.cc *.y *.re" @@ -76,4 +94,4 @@ translate.o : translate.cc globals.h basics.h scanner.o : scanner.re scanner.h token.h substr.h basics.h \ parser.h re.h ins.h ./parser.o parser.o : parser.y globals.h basics.h parser.h scanner.h token.h \ - substr.h re.h ins.h + substr.h re.h ins.h version.h diff --git a/README.in b/README.in new file mode 100644 index 00000000..bf77fe1d --- /dev/null +++ b/README.in @@ -0,0 +1,163 @@ +re2c Version RE2C_VERSION +------------------ + +Originally written by Peter Bumbulis (peter@csg.uwaterloo.ca) +Currently maintained by Brian Young (bayoung@acm.org) + +The re2c distribution can be found at: + + http://sourceforge.net/projects/re2c/ + +This distribution is a cleaned up version of the 0.5 release +maintained by me (Brian Young). Several bugs were fixed as well +as code cleanup for warning free compilation. It has been developed +and tested with egcs 1.0.2 and gcc 2.7.2.3, 2.96 and 3.3.1 on Linux +x86. You can compile your own version with other gcc versions if you +have yacc or any working bison version (tested up to bison 1.875). + +You can install this software by simply typing the following commands: + make + make install + +Or you can create a rpm package and install it by the following commands: + make makerpm + ./makerpm <release> + rpm -Uhv <packagedir>/re2c-RE2C_VERSION-<release>.rpm + +Here <release> should be a number like 1. And <packagedir> must equal +the directory where the makerpm step has written the generated rpm to. + +Peter Bumbulis' original release can be found at: + + ftp://csg.uwaterloo.ca/pub/peter/re2c.0.5.tar.gz + +re2c is a great tool for writing fast and flexible lexers. It has +served many people well for many years and it deserves to be +maintained more actively. re2c is on the order of 2-3 times faster +than a flex based scanner, and its input model is much more +flexible. + +Patches and requests for features will be entertained.
Areas of +particular interest to me are porting (a Solaris and an NT +version will be forthcoming) and wide character support. Note +that the code is already quite portable and should be buildable +on any platform with minor makefile changes. + +Peter's original version 0.5 ANNOUNCE and README follows. + +Brian + +-- + +re2c is a tool for generating C-based recognizers from regular +expressions. re2c-based scanners are efficient: for programming +languages, given similar specifications, an re2c-based scanner is +typically almost twice as fast as a flex-based scanner with little or no +increase in size (possibly a decrease on cisc architectures). Indeed, +re2c-based scanners are quite competitive with hand-crafted ones. + +Unlike flex, re2c does not generate complete scanners: the user must +supply some interface code. While this code is not bulky (about 50-100 +lines for a flex-like scanner; see the man page and examples in the +distribution) careful coding is required for efficiency (and +correctness). One advantage of this arrangement is that the generated +code is not tied to any particular input model. For example, re2c +generated code can be used to scan data from a null-byte terminated +buffer as illustrated below. 
+ +Given the following source + + #define NULL ((char*) 0) + char *scan(char *p){ + char *q; + #define YYCTYPE char + #define YYCURSOR p + #define YYLIMIT p + #define YYMARKER q + #define YYFILL(n) + /*!re2c + [0-9]+ {return YYCURSOR;} + [\000-\377] {return NULL;} + */ + } + +re2c will generate + + /* Generated by re2c on Sat Apr 16 11:40:58 1994 */ + #line 1 "simple.re" + #define NULL ((char*) 0) + char *scan(char *p){ + char *q; + #define YYCTYPE char + #define YYCURSOR p + #define YYLIMIT p + #define YYMARKER q + #define YYFILL(n) + { + YYCTYPE yych; + unsigned int yyaccept; + goto yy0; + yy1: ++YYCURSOR; + yy0: + if((YYLIMIT - YYCURSOR) < 2) YYFILL(2); + yych = *YYCURSOR; + if(yych <= '/') goto yy4; + if(yych >= ':') goto yy4; + yy2: yych = *++YYCURSOR; + goto yy7; + yy3: + #line 10 + {return YYCURSOR;} + yy4: yych = *++YYCURSOR; + yy5: + #line 11 + {return NULL;} + yy6: ++YYCURSOR; + if(YYLIMIT == YYCURSOR) YYFILL(1); + yych = *YYCURSOR; + yy7: if(yych <= '/') goto yy3; + if(yych <= '9') goto yy6; + goto yy3; + } + #line 12 + + } + +Note that most compilers will perform dead-code elimination to remove +all YYCURSOR, YYLIMIT comparisons. + +re2c was developed for a particular project (constructing a fast REXX +scanner of all things!) and so while it has some rough edges, it should +be quite usable. More information about re2c can be found in the +(admittedly skimpy) man page; the algorithms and heuristics used are +described in an upcoming LOPLAS article (included in the distribution). +Probably the best way to find out more about re2c is to try the supplied +examples. re2c is written in C++, and is currently being developed +under Linux using gcc 2.5.8. + +Peter + +-- + +re2c is distributed with no warranty whatever. The code is certain to +contain errors. Neither the author nor any contributor takes +responsibility for any consequences of its use. + +re2c is in the public domain.
The data structures and algorithms used +in re2c are all either taken from documents available to the general +public or are inventions of the author. Programs generated by re2c may +be distributed freely. re2c itself may be distributed freely, in source +or binary, unchanged or modified. Distributors may charge whatever fees +they can obtain for re2c. + +If you do make use of re2c, or incorporate it into a larger project an +acknowledgement somewhere (documentation, research report, etc.) would +be appreciated. + +Please send bug reports and feedback (including suggestions for +improving the distribution) to + + peter@csg.uwaterloo.ca + +Include a small example and the banner from parser.y with bug reports. + diff --git a/actions.cc b/actions.cc index 0260b5fb..bd82d493 100644 --- a/actions.cc +++ b/actions.cc @@ -1,3 +1,4 @@ +/* $Id$ */ #include #include #include diff --git a/basics.h b/basics.h index 2adaeb74..a6bcf331 100644 --- a/basics.h +++ b/basics.h @@ -1,3 +1,4 @@ +/* $Id$ */ #ifndef _basics_h #define _basics_h diff --git a/code.cc b/code.cc index 8cd93727..f114d465 100644 --- a/code.cc +++ b/code.cc @@ -1,3 +1,4 @@ +/* $Id$ */ #include #include #include diff --git a/dfa.h b/dfa.h index edd018c3..6ca66360 100644 --- a/dfa.h +++ b/dfa.h @@ -1,3 +1,4 @@ +/* $Id$ */ #ifndef _dfa_h #define _dfa_h diff --git a/globals.h b/globals.h index f3249969..a3c2ebe3 100644 --- a/globals.h +++ b/globals.h @@ -1,3 +1,4 @@ +/* $Id$ */ #ifndef _globals_h #define _globals_h diff --git a/ins.h b/ins.h index 5d08cca2..146bef67 100644 --- a/ins.h +++ b/ins.h @@ -1,3 +1,4 @@ +/* $Id$ */ #ifndef _ins_h #define _ins_h diff --git a/main.cc b/main.cc index 0c564235..dc485d40 100644 --- a/main.cc +++ b/main.cc @@ -1,11 +1,14 @@ +/* $Id$ */ #include #include #include #include +#include #include "globals.h" #include "parser.h" #include "dfa.h" +#include "version.h" char *fileName; bool sFlag = false; @@ -14,44 +17,62 @@ unsigned int oline = 1; using namespace std; -int main(unsigned 
argc, char *argv[]){ +int main(unsigned argc, char *argv[]) +{ fileName = NULL; - if(argc == 1) - goto usage; - while(--argc > 1){ - char *p = *++argv; - while(*++p != '\0'){ - switch(*p){ - case 'e': - xlat = asc2ebc; - talx = ebc2asc; - break; - case 's': - sFlag = true; - break; - case 'b': - sFlag = true; - bFlag = true; - break; - default: + if (argc == 1) { goto usage; - } } + while(--argc > 1) { + char *p = *++argv; + if (*p != '-' || *(p+1) == '\0') { + goto usage; + } + while (*++p != '\0') { + switch(*p){ + case 'e': + xlat = asc2ebc; + talx = ebc2asc; + break; + case 's': + sFlag = true; + break; + case 'b': + sFlag = true; + bFlag = true; + break; + case '-': + if (!strcmp(p, "-version")) { + goto version; + } + goto usage; + default: + goto usage; + } + } } fileName = *++argv; + if (!strcmp(fileName, "--version")) { + goto version; + } int fd; - if(fileName[0] == '-' && fileName[1] == '\0'){ - fileName = ""; - fd = 0; + if (fileName[0] == '-' && fileName[1] == '\0') { + fileName = ""; + fd = 0; } else { - if((fd = open(fileName, O_RDONLY)) < 0){ - cerr << "can't open " << fileName << "\n"; - return 1; - } + if ((fd = open(fileName, O_RDONLY)) < 0) { + cerr << "can't open " << fileName << "\n"; + return 1; + } } parse(fd, cout); return 0; + usage: cerr << "usage: re2c [-esb] name\n"; return 2; + +version: + cerr << "re2c " << RE2C_VERSION << "\n"; + return 2; } diff --git a/makerpm.in b/makerpm.in new file mode 100644 index 00000000..6234f3bc --- /dev/null +++ b/makerpm.in @@ -0,0 +1,105 @@ +#! /bin/sh + +# $Id$ +# M.Boerger + +PREFIX="re2c" +#VERSION=RE2C_VERSION +TARDIR="`basename \`pwd\``" +RELEASE=${1:-1} +VERSION=${2:-`echo $TARDIR | sed "s/$PREFIX-//g"`} + +echo "Usage:" +echo "$0 " +echo +echo "e.g.:" +echo "$0" +echo -n "Building RPM version $VERSION, release: $RELEASE " +sleep 1 ; echo -n . ; sleep 1 ; echo -n . ; sleep 1 ; echo -n . 
+echo + +TAR=re2c-$VERSION.tar.gz +SPEC=re2c-$VERSION.spec + +# write out the .spec file +sed -e "s/RPMVERSION/$VERSION/g" \ + -e "s/RPMRELEASE/$RELEASE/g" \ + -e "s/RPMTARDIR/$TARDIR/g" \ + > $SPEC <<'EOF' +Summary: re2c - A tool for generating C-based recognizers from regular expressions +Name: re2c +Version: RPMVERSION +Release: RPMRELEASE +Group: Development +License: public domain +URL: http://sourceforge.net/projects/re2c/ +Source: %{name}-%{version}.tar.gz +BuildRoot: %{_tmppath}/%{name}-%{version}-root + +%description +re2c is a great tool for writing fast and flexible lexers. It has +served many people well for many years and it deserves to be +maintained more actively. re2c is on the order of 2-3 times faster +than a flex based scanner, and its input model is much more +flexible. + +%prep +%setup -q -n RPMTARDIR + +%build +make clean +make re2c +#regenerate file scanner.cc +rm -f scanner.cc +make scanner.cc +#regenerate re2c itself +rm -f re2c +make re2c +make dist-clean + +%install +rm -rf $RPM_BUILD_ROOT +mkdir -p $RPM_BUILD_ROOT%{_bindir} +install -m 0755 re2c $RPM_BUILD_ROOT%{_bindir} + +mkdir -p $RPM_BUILD_ROOT%{_mandir}/man1 +install -m 0755 re2c.1 $RPM_BUILD_ROOT%{_mandir}/man1 + +%clean +rm -rf $RPM_BUILD_ROOT + +%changelog +* Sun Jan 04 2003 Marcus Boerger +- Initial version. + +%files +%defattr(-,root,root) +%{_bindir}/re2c +%{_mandir}/man1/re2c.1* + +%doc README examples doc/* +EOF + +RPMBASE=/usr/src/redhat +for i in /usr/src/redhat /usr/src/packages /usr/src/RPM; do + if test -d $i; then + RPMBASE=$i + break + fi +done + +RPMDIR=${RPMBASE}/RPMS +SPECDIR=${RPMBASE}/SPECS +SRCDIR=${RPMBASE}/SOURCES + +( +make clean +cd .. 
+tar czvf ${SRCDIR}/${TAR} $TARDIR ) + +echo "CP: (`pwd`) cp -a $SPEC $SPECDIR/${SPEC}" +cp -a $SPEC $SPECDIR/${SPEC} +#cp -a *.patch $SRCDIR +cd $SPECDIR +echo "RPM: rpm -ba -vv ${SPEC}" +rpm -ba ${SPEC} diff --git a/parser.h b/parser.h index 0225228f..34eae0e8 100644 --- a/parser.h +++ b/parser.h @@ -1,3 +1,4 @@ +/* $Id$ */ #ifndef _parser_h #define _parser_h diff --git a/parser.y b/parser.y index 0202501a..11f3be4a 100644 --- a/parser.y +++ b/parser.y @@ -1,5 +1,7 @@ %{ +/* $Id$ */ + #include #include #include @@ -19,6 +21,13 @@ static uint accept; static RegExp *spec; static Scanner *in; +/* Bison version 1.875 emits a definition that is not working + * with several g++ version. Hence we disable it here. + */ +#if defined(__GNUC__) +#define __attribute__(x) +#endif + %} %start spec diff --git a/re.h b/re.h index 2ea6e63b..70202a5b 100644 --- a/re.h +++ b/re.h @@ -1,3 +1,4 @@ +/* $Id$ */ #ifndef _re_h #define _re_h diff --git a/re2c.1.in b/re2c.1.in new file mode 100644 index 00000000..8ae5e87f --- /dev/null +++ b/re2c.1.in @@ -0,0 +1,539 @@ +./" +./" $Id$ +./" +.TH RE2C 1 "8 April 1994" "Version RE2C_VERSION" +.ds re \fBre2c\fP +.ds le \fBlex\fP +.ds rx regular expression +.ds lx \fIl\fP-expression +\"$Log$ +\"Revision 1.1 2004/01/31 15:44:39 nuffer +\"Applied patch from Marcus Boerger +\" +\"Revision 1.2 1994/04/16 15:50:32 peter +\"Fix bug in simple example. +\" +\"Revision 1.1 1994/04/08 15:39:09 peter +\"Initial revision +\" +.SH NAME +re2c \- convert regular expressions to C/C++ + +.SH SYNOPSIS +\*(re [\fB-esb\fP] \fIname\fP + +.SH DESCRIPTION +\*(re is a preprocessor that generates C-based recognizers from regular +expressions. +The input to \*(re consists of C/C++ source interleaved with +comments of the form \fC/*!re2c\fP ... \fC*/\fP which contain +scanner specifications. +In the output these comments are replaced with code that, when +executed, will find the next input token and then execute +some user-supplied token-specific code. 
+ +For example, given the following code + +.in +3 +.nf +#define NULL ((char*) 0) +char *scan(char *p){ +char *q; +#define YYCTYPE char +#define YYCURSOR p +#define YYLIMIT p +#define YYMARKER q +#define YYFILL(n) +/*!re2c + [0-9]+ {return YYCURSOR;} + [\\000-\\377] {return NULL;} +*/ +} +.fi +.in -3 + +\*(re will generate + +.in +3 +.nf +/* Generated by re2c on Sat Apr 16 11:40:58 1994 */ +#line 1 "simple.re" +#define NULL ((char*) 0) +char *scan(char *p){ +char *q; +#define YYCTYPE char +#define YYCURSOR p +#define YYLIMIT p +#define YYMARKER q +#define YYFILL(n) +{ + YYCTYPE yych; + unsigned int yyaccept; + goto yy0; +yy1: ++YYCURSOR; +yy0: + if((YYLIMIT - YYCURSOR) < 2) YYFILL(2); + yych = *YYCURSOR; + if(yych <= '/') goto yy4; + if(yych >= ':') goto yy4; +yy2: yych = *++YYCURSOR; + goto yy7; +yy3: +#line 10 + {return YYCURSOR;} +yy4: yych = *++YYCURSOR; +yy5: +#line 11 + {return NULL;} +yy6: ++YYCURSOR; + if(YYLIMIT == YYCURSOR) YYFILL(1); + yych = *YYCURSOR; +yy7: if(yych <= '/') goto yy3; + if(yych <= '9') goto yy6; + goto yy3; +} +#line 12 + +} +.fi +.in -3 + +.SH OPTIONS +\*(re provides the following options: +.TP +\fB-e\fP +Cross-compile from an ASCII platform to an EBCDIC one. +.TP +\fB-s\fP +Generate nested \fCif\fPs for some \fCswitch\fPes. Many compilers need this +assist to generate better code. +.TP +\fB-b\fP +Implies \fB-s\fP. Use bit vectors as well in the attempt to coax better +code out of the compiler. Most useful for specifications with more than a +few keywords (e.g. for most programming languages). + +.SH "INTERFACE CODE" +Unlike other scanner generators, \*(re does not generate complete scanners: +the user must supply some interface code. +In particular, the user must define the following macros: +.TP +\fCYYCTYPE\fP +Type used to hold an input symbol. +Usually \fCchar\fP or \fCunsigned char\fP. +.TP +\fCYYCURSOR\fP +\*(lx of type \fC*YYCTYPE\fP that points to the current input symbol.
+The generated code advances \fCYYCURSOR\fP as symbols are matched. +On entry, \fCYYCURSOR\fP is assumed to point to the first character of the +current token. On exit, \fCYYCURSOR\fP will point to the first character of +the following token. +.TP +\fCYYLIMIT\fP +Expression of type \fC*YYCTYPE\fP that marks the end of the buffer +(\fCYYLIMIT[-1]\fP is the last character in the buffer). +The generated code repeatedly compares \fCYYCURSOR\fP to \fCYYLIMIT\fP +to determine when the buffer needs (re)filling. +.TP +\fCYYMARKER\fP +\*(lx of type \fC*YYCTYPE\fP. +The generated code saves backtracking information in \fCYYMARKER\fP. +.TP +\fCYYFILL(\fP\fIn\fP\fC)\fP +The generated code "calls" \fCYYFILL\fP when the buffer needs +(re)filling: at least \fIn\fP additional characters should +be provided. \fCYYFILL\fP should adjust \fCYYCURSOR\fP, \fCYYLIMIT\fP and +\fCYYMARKER\fP as needed. Note that for typical programming languages +\fIn\fP will be the length of the longest keyword plus one. + +.SH "SCANNER SPECIFICATIONS" +Each scanner specification consists of a set of \fIrules\fP and name +definitions. +Rules consist of a regular expression along with a block of C/C++ code that +is to be executed when the associated regular expression is matched. +Name definitions are of the form +``\fIname\fP \fC=\fP \fIregular expression\fP\fC;\fP''. + +.SH "SUMMARY OF RE2C REGULAR EXPRESSIONS" +.TP +\fC"foo"\fP +the literal string \fCfoo\fP. +ANSI-C escape sequences can be used. +.TP +\fC[xyz]\fP +a "character class"; in this case, +the \*(rx matches either an '\fCx\fP', a '\fCy\fP', or a '\fCz\fP'. +.TP +\fC[abj-oZ]\fP +a "character class" with a range in it; +matches an '\fCa\fP', a '\fCb\fP', any letter from '\fCj\fP' through '\fCo\fP', +or a '\fCZ\fP'. +.TP +\fIr\fP\fC\e\fP\fIs\fP +match any \fIr\fP which isn't an \fIs\fP. \fIr\fP and \fIs\fP must be regular expressions +which can be expressed as character classes.
+.TP +\fIr\fP\fC*\fP +zero or more \fIr\fP's, where \fIr\fP is any regular expression +.TP +\fC\fIr\fP\fC+\fP +one or more \fIr\fP's +.TP +\fC\fIr\fP\fC?\fP +zero or one \fIr\fP's (that is, "an optional \fIr\fP") +.TP +name +the expansion of the "name" definition (see above) +.TP +\fC(\fP\fIr\fP\fC)\fP +an \fIr\fP; parentheses are used to override precedence +(see below) +.TP +\fIrs\fP +an \fIr\fP followed by an \fIs\fP ("concatenation") +.TP +\fIr\fP\fC|\fP\fIs\fP +either an \fIr\fP or an \fIs\fP +.TP +\fIr\fP\fC/\fP\fIs\fP +an \fIr\fP but only if it is followed by an \fIs\fP. The s is not part of +the matched text. This type of \*(rx is called "trailing context". +.LP +The regular expressions listed above are grouped according to +precedence, from highest precedence at the top to lowest at the bottom. +Those grouped together have equal precedence. + +.SH "A LARGER EXAMPLE" +.LP +.in +3 +.nf +#include +#include +#include +#include + +#define ADDEQ 257 +#define ANDAND 258 +#define ANDEQ 259 +#define ARRAY 260 +#define ASM 261 +#define AUTO 262 +#define BREAK 263 +#define CASE 264 +#define CHAR 265 +#define CONST 266 +#define CONTINUE 267 +#define DECR 268 +#define DEFAULT 269 +#define DEREF 270 +#define DIVEQ 271 +#define DO 272 +#define DOUBLE 273 +#define ELLIPSIS 274 +#define ELSE 275 +#define ENUM 276 +#define EQL 277 +#define EXTERN 278 +#define FCON 279 +#define FLOAT 280 +#define FOR 281 +#define FUNCTION 282 +#define GEQ 283 +#define GOTO 284 +#define ICON 285 +#define ID 286 +#define IF 287 +#define INCR 288 +#define INT 289 +#define LEQ 290 +#define LONG 291 +#define LSHIFT 292 +#define LSHIFTEQ 293 +#define MODEQ 294 +#define MULEQ 295 +#define NEQ 296 +#define OREQ 297 +#define OROR 298 +#define POINTER 299 +#define REGISTER 300 +#define RETURN 301 +#define RSHIFT 302 +#define RSHIFTEQ 303 +#define SCON 304 +#define SHORT 305 +#define SIGNED 306 +#define SIZEOF 307 +#define STATIC 308 +#define STRUCT 309 +#define SUBEQ 310 +#define SWITCH 311 +#define 
TYPEDEF 312 +#define UNION 313 +#define UNSIGNED 314 +#define VOID 315 +#define VOLATILE 316 +#define WHILE 317 +#define XOREQ 318 +#define EOI 319 + +typedef unsigned int uint; +typedef unsigned char uchar; + +#define BSIZE 8192 + +#define YYCTYPE uchar +#define YYCURSOR cursor +#define YYLIMIT s->lim +#define YYMARKER s->ptr +#define YYFILL(n) {cursor = fill(s, cursor);} + +#define RET(i) {s->cur = cursor; return i;} + +typedef struct Scanner { + int fd; + uchar *bot, *tok, *ptr, *cur, *pos, *lim, *top, *eof; + uint line; +} Scanner; + +uchar *fill(Scanner *s, uchar *cursor){ + if(!s->eof){ + uint cnt = s->tok - s->bot; + if(cnt){ + memcpy(s->bot, s->tok, s->lim - s->tok); + s->tok = s->bot; + s->ptr -= cnt; + cursor -= cnt; + s->pos -= cnt; + s->lim -= cnt; + } + if((s->top - s->lim) < BSIZE){ + uchar *buf = (uchar*) + malloc(((s->lim - s->bot) + BSIZE)*sizeof(uchar)); + memcpy(buf, s->tok, s->lim - s->tok); + s->tok = buf; + s->ptr = &buf[s->ptr - s->bot]; + cursor = &buf[cursor - s->bot]; + s->pos = &buf[s->pos - s->bot]; + s->lim = &buf[s->lim - s->bot]; + s->top = &s->lim[BSIZE]; + free(s->bot); + s->bot = buf; + } + if((cnt = read(s->fd, (char*) s->lim, BSIZE)) != BSIZE){ + s->eof = &s->lim[cnt]; *(s->eof)++ = '\\n'; + } + s->lim += cnt; + } + return cursor; +} + +int scan(Scanner *s){ + uchar *cursor = s->cur; +std: + s->tok = cursor; +/*!re2c +any = [\\000-\\377]; +O = [0-7]; +D = [0-9]; +L = [a-zA-Z_]; +H = [a-fA-F0-9]; +E = [Ee] [+-]? 
D+; +FS = [fFlL]; +IS = [uUlL]*; +ESC = [\\\\] ([abfnrtv?'"\\\\] | "x" H+ | O+); +*/ + +/*!re2c + "/*" { goto comment; } + + "auto" { RET(AUTO); } + "break" { RET(BREAK); } + "case" { RET(CASE); } + "char" { RET(CHAR); } + "const" { RET(CONST); } + "continue" { RET(CONTINUE); } + "default" { RET(DEFAULT); } + "do" { RET(DO); } + "double" { RET(DOUBLE); } + "else" { RET(ELSE); } + "enum" { RET(ENUM); } + "extern" { RET(EXTERN); } + "float" { RET(FLOAT); } + "for" { RET(FOR); } + "goto" { RET(GOTO); } + "if" { RET(IF); } + "int" { RET(INT); } + "long" { RET(LONG); } + "register" { RET(REGISTER); } + "return" { RET(RETURN); } + "short" { RET(SHORT); } + "signed" { RET(SIGNED); } + "sizeof" { RET(SIZEOF); } + "static" { RET(STATIC); } + "struct" { RET(STRUCT); } + "switch" { RET(SWITCH); } + "typedef" { RET(TYPEDEF); } + "union" { RET(UNION); } + "unsigned" { RET(UNSIGNED); } + "void" { RET(VOID); } + "volatile" { RET(VOLATILE); } + "while" { RET(WHILE); } + + L (L|D)* { RET(ID); } + + ("0" [xX] H+ IS?) | ("0" D+ IS?) | (D+ IS?) | + (['] (ESC|any\\[\\n\\\\'])* [']) + { RET(ICON); } + + (D+ E FS?) | (D* "." D+ E? FS?) | (D+ "." D* E? FS?) + { RET(FCON); } + + (["] (ESC|any\\[\\n\\\\"])* ["]) + { RET(SCON); } + + "..." { RET(ELLIPSIS); } + ">>=" { RET(RSHIFTEQ); } + "<<=" { RET(LSHIFTEQ); } + "+=" { RET(ADDEQ); } + "-=" { RET(SUBEQ); } + "*=" { RET(MULEQ); } + "/=" { RET(DIVEQ); } + "%=" { RET(MODEQ); } + "&=" { RET(ANDEQ); } + "^=" { RET(XOREQ); } + "|=" { RET(OREQ); } + ">>" { RET(RSHIFT); } + "<<" { RET(LSHIFT); } + "++" { RET(INCR); } + "--" { RET(DECR); } + "->" { RET(DEREF); } + "&&" { RET(ANDAND); } + "||" { RET(OROR); } + "<=" { RET(LEQ); } + ">=" { RET(GEQ); } + "==" { RET(EQL); } + "!=" { RET(NEQ); } + ";" { RET(';'); } + "{" { RET('{'); } + "}" { RET('}'); } + "," { RET(','); } + ":" { RET(':'); } + "=" { RET('='); } + "(" { RET('('); } + ")" { RET(')'); } + "[" { RET('['); } + "]" { RET(']'); } + "." { RET('.'); } + "&" { RET('&'); } + "!" 
{ RET('!'); } + "~" { RET('~'); } + "-" { RET('-'); } + "+" { RET('+'); } + "*" { RET('*'); } + "/" { RET('/'); } + "%" { RET('%'); } + "<" { RET('<'); } + ">" { RET('>'); } + "^" { RET('^'); } + "|" { RET('|'); } + "?" { RET('?'); } + + + [ \\t\\v\\f]+ { goto std; } + + "\\n" + { + if(cursor == s->eof) RET(EOI); + s->pos = cursor; s->line++; + goto std; + } + + any + { + printf("unexpected character: %c\\n", *s->tok); + goto std; + } +*/ + +comment: +/*!re2c + "*/" { goto std; } + "\\n" + { + if(cursor == s->eof) RET(EOI); + s->tok = s->pos = cursor; s->line++; + goto comment; + } + any { goto comment; } +*/ +} + +main(){ + Scanner in; + int t; + memset((char*) &in, 0, sizeof(in)); + in.fd = 0; + while((t = scan(&in)) != EOI){ +/* + printf("%d\\t%.*s\\n", t, in.cur - in.tok, in.tok); + printf("%d\\n", t); +*/ + } + close(in.fd); +} +.fi +.in -3 + +.SH "SEE ALSO" +.LP +flex(1), lex(1). + +.SH FEATURES +.LP +\*(re does not provide a default action: +the generated code assumes that the input +will consist of a sequence of tokens. +Typically this can be dealt with by adding a rule such as the one for +unexpected characters in the example above. +.LP +The user must arrange for a sentinel token to appear at the end of input +(and provide a rule for matching it): +\*(re does not provide an \fC<>\fP expression. +If the source is from a null-byte terminated string, a +rule matching a null character will suffice. If the source is from a +file then the approach taken in the example can be used: pad the input with +a newline (or some other character that can't appear within another token); +upon recognizing such a character check to see if it is the sentinel +and act accordingly. +.LP +\*(re does not provide start conditions: use a separate scanner +specification for each start condition (as illustrated in the above example). +.LP +No [^x]. Use difference instead. +.SH BUGS +.LP +Only fixed length trailing context can be handled. 
+.LP +The maximum value appearing as a parameter \fIn\fP to \fCYYFILL\fP is not +provided to the generated code (this value is needed for constructing +the interface code). +Note that this value is usually relatively small: for +typical programming languages \fIn\fP will be the length of the longest +keyword plus one. +.LP +Difference only works for character sets. +.LP +The \*(re internal algorithms need documentation. + +.SH AUTHOR +.LP +Please send bug reports, fixes and feedback to: +.LP +.nf +Peter Bumbulis +Computer Systems Group +University of Waterloo +Waterloo, Ontario +N2L 3G1 +Internet: peter@csg.uwaterloo.ca +.fi diff --git a/scanner.h b/scanner.h index 3084999a..0032558c 100644 --- a/scanner.h +++ b/scanner.h @@ -1,3 +1,4 @@ +/* $Id$ */ #ifndef _scanner_h #define _scanner_h @@ -6,10 +7,10 @@ class Scanner { private: int in; - char *bot, *tok, *ptr, *cur, *pos, *lim, *top, *eof; + uchar *bot, *tok, *ptr, *cur, *pos, *lim, *top, *eof; uint tchar, tline, cline; private: - char *fill(char*); + uchar *fill(uchar*); public: Scanner(int); int echo(ostream&); diff --git a/scanner.re b/scanner.re index a8112562..ef572017 100644 --- a/scanner.re +++ b/scanner.re @@ -1,4 +1,6 @@ -#include #include +/* $Id$ */ +#include +#include #include #include #include "scanner.h" @@ -9,7 +11,7 @@ extern YYSTYPE yylval; #define BSIZE 8192 -#define YYCTYPE char +#define YYCTYPE uchar #define YYCURSOR cursor #define YYLIMIT lim #define YYMARKER ptr @@ -24,7 +26,7 @@ Scanner::Scanner(int i) : in(i), ; } -char *Scanner::fill(char *cursor){ +uchar *Scanner::fill(uchar *cursor){ if(!eof){ uint cnt = tok - bot; if(cnt){ @@ -36,7 +38,7 @@ char *Scanner::fill(char *cursor){ lim -= cnt; } if((top - lim) < BSIZE){ - char *buf = new char[(lim - bot) + BSIZE]; + uchar *buf = new uchar[(lim - bot) + BSIZE]; memcpy(buf, tok, lim - tok); tok = buf; ptr = &buf[ptr - bot]; @@ -67,7 +69,7 @@ digit = [0-9]; */ int Scanner::echo(ostream &out){ - char *cursor = cur; + uchar *cursor = cur; // Catch 
EOF if (eof && cursor == eof) @@ -76,11 +78,11 @@ int Scanner::echo(ostream &out){ tok = cursor; echo: /*!re2c - "/*!re2c" { out.write((char *)tok, &cursor[-7] - tok); + "/*!re2c" { out.write((const char*)(tok), &cursor[-7] - tok); tok = cursor; RETURN(1); } "\n" { if(cursor == eof) RETURN(0); - out.write((char *)tok, cursor - tok); + out.write((const char*)(tok), cursor - tok); tok = pos = cursor; cline++; goto echo; } any { goto echo; } @@ -89,7 +91,7 @@ echo: int Scanner::scan(){ - char *cursor = cur; + uchar *cursor = cur; uint depth; scan: diff --git a/substr.cc b/substr.cc index cbe48d1f..44d529b8 100644 --- a/substr.cc +++ b/substr.cc @@ -1,10 +1,11 @@ +/* $Id$ */ #include #include "substr.h" #include "globals.h" void SubStr::out(ostream& o) const { o.write(str, len); - for (int i = 0; i < len; ++i) + for (size_t i = 0; i < (size_t)len; ++i) { if (str[i] == '\n') ++oline; diff --git a/substr.h b/substr.h index fb5e2cc2..bb441ea0 100644 --- a/substr.h +++ b/substr.h @@ -1,3 +1,4 @@ +/* $Id$ */ #ifndef _substr_h #define _substr_h diff --git a/token.h b/token.h index de51eb48..3cbc5956 100644 --- a/token.h +++ b/token.h @@ -1,3 +1,4 @@ +/* $Id$ */ #ifndef _token_h #define _token_h diff --git a/translate.cc b/translate.cc index 2eeaabf0..9c0c3aa4 100644 --- a/translate.cc +++ b/translate.cc @@ -1,3 +1,4 @@ +/* $Id$ */ #include "globals.h" uchar asc2asc[256] = {