From 32d6931a841826a22f2cc752656bcf085eea554b Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Mon, 19 Oct 2015 12:47:50 +0100 Subject: [PATCH] New re2c website starts here. --- gen.sh | 12 + src/css/default.css | 41 ++ src/index.rst | 54 +++ src/manpage.rst | 929 ++++++++++++++++++++++++++++++++++++++++++++ src/options.rst | 153 ++++++++ 5 files changed, 1189 insertions(+) create mode 100755 gen.sh create mode 100644 src/css/default.css create mode 100644 src/index.rst create mode 100644 src/manpage.rst create mode 100644 src/options.rst diff --git a/gen.sh b/gen.sh new file mode 100755 index 00000000..aa0e0fe2 --- /dev/null +++ b/gen.sh @@ -0,0 +1,12 @@ +#!/bin/sh + +srcdir="src" +objdir="obj" + +rm -rf "$objdir" +cp -R "$srcdir" "$objdir" + +for f in `find "$objdir" -type f -name "*.rst"` +do + rst2html.py --link-stylesheet --stylesheet="css/default.css" "$f" > "${f%.rst}.html" +done diff --git a/src/css/default.css b/src/css/default.css new file mode 100644 index 00000000..9ab661be --- /dev/null +++ b/src/css/default.css @@ -0,0 +1,41 @@ +body { + text-align: justify; + margin: auto auto auto auto; + width: 85%; +} + +h1, h2, h3, h4, h5, h6 { + color: #557799; +} + +h1.title { + font-size: 2.5em; + text-align: center; +} + +table, tr, td, th { + border: none; + padding: 0em 0.2em 0em 0.2em; +} + +pre.code { + color: black; + padding: 1em 1em 1em 1em; + margin: 0 0 0 0; + + border: 1px solid #006699; + border-left: 8px solid #006699; + + background-color: #f7f3e0; + display: block; + white-space: pre; + overflow: auto; +} + +pre.code .ln { color: grey; } /* line numbers */ +pre.code .comment, code .comment { color: #0077ff; } +pre.code .keyword, code .keyword { color: #006699; font-weight: bold; } +pre.code .literal.string, code .literal.string { color: #105000 } +pre.code .name.builtin, code .name.builtin { color: #352B84 } +pre.code .deleted, code .deleted { background-color: #DEB0A1} +pre.code .inserted, code .inserted { background-color: #A3D289} diff 
--git a/src/index.rst b/src/index.rst new file mode 100644 index 00000000..5ec37b82 --- /dev/null +++ b/src/index.rst @@ -0,0 +1,54 @@ + +==== +re2c +==== + +*re2c* is a tool for writing very fast and flexible scanners. Unlike +any other such tool, *re2c* focuses on generating high efficient code for +regular expression matching. As a result this allows a much broader range of +use than any traditional lexer offers. And last but not least *re2c* +generates warning free code that is equal to hand-written code in terms of +size, speed and quality. + +The above made the `PHP team to use `_ *re2c* in various places. + +Marcus Börger (helly@users.sourceforge.net) + +I very much welcome anyone who would like to contribute to the project, either +as a developer with source code access or by simply sending patches, bug reports, or +suggestions for improvement. + +Dan Nuffer (nuffer@users.sourceforge.net) + +Please use the `sourceforge `_ +to download re2c, report bugs, subscribe to the mailing list, etc. + +You can view the online manual `here `_. + +Other re2c links: +----------------- + +Other re2c links: +~~~~~~~~~~~~~~~~~ + +Other re2c links: ++++++++++++++++++ + +Other re2c links: +................. + +Other re2c links: +,,,,,,,,,,,,,,,,, + +Other re2c links: +***************** + +Other re2c links: +----------------- + +aaaa1 + +Other re2c links: +~~~~~~~~~~~~~~~~~ + +aaaaaaa diff --git a/src/manpage.rst b/src/manpage.rst new file mode 100644 index 00000000..c8d77ac6 --- /dev/null +++ b/src/manpage.rst @@ -0,0 +1,929 @@ +==== +re2c +==== + +----------------------------------------- +convert regular expressions to C/C++ code +----------------------------------------- + +:Manual section: 1 + +SYNOPSIS +-------- + +``re2c [OPTIONS] FILE`` + +DESCRIPTION +----------- + +``re2c`` is a lexer generator for C/C++. It finds regular expression +specifications inside of C/C++ comments and replaces them with a +hard-coded DFA. 
The user must supply some interface code in order to +control and customize the generated DFA. + +EXAMPLE +------- + +Given the following code: + +.. code-block:: c + + unsigned int stou (const char \* s) + { + # define YYCTYPE char const + YYCTYPE * YYCURSOR = s; + unsigned int result = 0; + for (;;) + { + /*!re2c + re2c:yyfill:enable = 0; + + "\x00" { return result; } + [0-9] { result = result * 10 + c; continue; } + */ + } + } + +``re2c -is`` will generate: + +.. code-block:: c + + /* Generated by re2c 0.13.7.dev on Mon Jul 14 13:37:46 2014 */ + unsigned int stou (const char * s) + { + # define YYCTYPE char + const YYCTYPE * YYCURSOR = s; + unsigned int result = 0; + + for (;;) + { + { + YYCTYPE yych; + + yych = *YYCURSOR; + if (yych <= 0x00) goto yy3; + if (yych <= '/') goto yy2; + if (yych <= '9') goto yy5; + + yy2: + yy3: + ++YYCURSOR; + { return result; } + yy5: + ++YYCURSOR; + { result = result * 10 + c; continue; } + } + } + } + +.. _OPTIONS: + +OPTIONS +------- + +.. include:: options.rst + +WARNINGS +-------- + +``-W`` + Turn on all warnings. + +``-Werror`` + Turn warnings into errors. Note that this option along + doesn't turn on any warnings, it only affects those warnings that have + been turned on so far or will be turned on later. + +``-W`` + Turn on individual ``warning``. + +``-Wno-`` + Turn off individual ``warning``. + +``-Werror-`` + Turn on individual ``warning`` and treat it as error (this implies ``-W``). + +``-Wno-error-`` + Don't treat this particular ``warning`` as error. This doesn't turn off + the warning itself. + +``-Wcondition-order`` + Warn if the generated program makes implicit + assumptions about condition numbering. One should use either ``-t, --type-header`` option or + ``/*!types:re2c*/`` directive to generate mapping of condition names to numbers and use + autogenerated condition names. + +``-Wempty-character-class`` + Warn if regular expression contains empty + character class. 
From the rational point of view trying to match empty + character class makes no sense: it should always fail. However, for + backwards compatibility reasons ``re2c`` allows empty character class and + treats it as empty string. Use ``--empty-class`` option to change default + behaviour. + +``-Wmatch-empty-string`` + Warn if regular expression in a rule is + nullable (matches empty string). If DFA runs in a loop and empty match + is unintentional (input position is not advanced manually), lexer may + get stuck in eternal loop. + +``-Wswapped-range`` + Warn if range lower bound is greater than upper + bound. Default ``re2c`` behaviour is to silently swap range bounds. + +``-Wundefined-control-flow`` + Warn if some input strings cause undefined + control flow in lexer (the faulty patterns are reported). This is the + most dangerous and common mistake. It can be easily fixed by adding + default rule ``*`` (this rule has the lowest priority, matches any code unit and consumes + exactly one code unit). + +``-Wuseless-escape`` + Warn if a symbol is escaped when it shouldn't be. + By default re2c silently ignores escape, but this may as well indicate a + typo or an error in escape sequence. + +INTERFACE CODE +-------------- + +The user must supply interface code either in the form of C/C++ code +(macros, functions, variables, etc.) or in the form of `INPLACE CONFIGURATIONS`_. +Which symbols must be defined and which are optional +depends on a particular use case. + +``YYCONDTYPE`` + In ``-c`` mode you can use ``-t`` to generate a file that + contains the enumeration used as conditions. Each of the values refers + to a condition of a rule set. + +``YYCTXMARKER`` + l-value of type ``YYCTYPE *``. + The generated code saves trailing context backtracking information in + ``YYCTXMARKER``. The user only needs to define this macro if a scanner + specification uses trailing context in one or more of its regular + expressions. 
+ +``YYCTYPE`` + Type used to hold an input symbol (code unit). Usually + ``char`` or ``unsigned char`` for ASCII, EBCDIC and UTF-8, *unsigned short* + for UTF-16 or UCS-2 and ``unsigned int`` for UTF-32. + +``YYCURSOR`` + l-value of type ``YYCTYPE *`` that points to the current input symbol. The generated code advances + ``YYCURSOR`` as symbols are matched. On entry, ``YYCURSOR`` is assumed to + point to the first character of the current token. On exit, ``YYCURSOR`` + will point to the first character of the following token. + +``YYDEBUG (state, current)`` + This is only needed if the ``-d`` flag was + specified. It allows to easily debug the generated parser by calling a + user defined function for every state. The function should have the + following signature: ``void YYDEBUG (int state, char current)``. The first + parameter receives the state or -1 and the second parameter receives the + input at the current cursor. + +``YYFILL (n)`` + The generated code "calls" ``YYFILL (n)`` when the + buffer needs (re)filling: at least ``n`` additional characters should be + provided. ``YYFILL (n)`` should adjust ``YYCURSOR``, ``YYLIMIT``, ``YYMARKER`` + and ``YYCTXMARKER`` as needed. Note that for typical programming languages + ``n`` will be the length of the longest keyword plus one. The user can + place a comment of the form ``/*!max:re2c*/`` to insert ``YYMAXFILL`` definition that is set to the maximum + length value. + +``YYGETCONDITION ()`` + This define is used to get the condition prior to + entering the scanner code when using ``-c`` switch. The value must be + initialized with a value from the enumeration ``YYCONDTYPE`` type. + +``YYGETSTATE ()`` + The user only needs to define this macro if the ``-f`` + flag was specified. In that case, the generated code "calls" + ``YYGETSTATE ()`` at the very beginning of the scanner in order to obtain + the saved state. ``YYGETSTATE ()`` must return a signed integer. 
The value + must be either -1, indicating that the scanner is entered for the first + time, or a value previously saved by ``YYSETSTATE (s)``. In the second + case, the scanner will resume operations right after where the last + ``YYFILL (n)`` was called. + +``YYLIMIT`` + Expression of type ``YYCTYPE *`` that marks the end of the buffer (``YYLIMIT[-1]`` + is the last character in the buffer). The generated code repeatedly + compares ``YYCURSOR`` to ``YYLIMIT`` to determine when the buffer needs + (re)filling. + +``YYMARKER`` + l-value of type ``YYCTYPE *``. + The generated code saves backtracking information in ``YYMARKER``. Some + easy scanners might not use this. + +``YYMAXFILL`` + This will be automatically defined by ``/*!max:re2c*/`` blocks as explained above. + +``YYSETCONDITION (c)`` + This define is used to set the condition in + transition rules. This is only being used when ``-c`` is active and + transition rules are being used. + +``YYSETSTATE (s)`` + The user only needs to define this macro if the ``-f`` + flag was specified. In that case, the generated code "calls" + ``YYSETSTATE`` just before calling ``YYFILL (n)``. The parameter to + ``YYSETSTATE`` is a signed integer that uniquely identifies the specific + instance of ``YYFILL (n)`` that is about to be called. Should the user + wish to save the state of the scanner and have ``YYFILL (n)`` return to + the caller, all he has to do is store that unique identifier in a + variable. Later, when the scanner is called again, it will call + ``YYGETSTATE ()`` and resume execution right where it left off. The + generated code will contain both ``YYSETSTATE (s)`` and ``YYGETSTATE`` even + if ``YYFILL (n)`` is being disabled. + +SYNTAX +------ + +Code for ``re2c`` consists of a set of `RULES`_, `NAMED DEFINITIONS`_ and +`INPLACE CONFIGURATIONS`_. + +.. 
_RULES: + +RULES +~~~~~ + +Rules consist of a regular expression (see `REGULAR EXPRESSIONS`_) along with a block of C/C++ code +that is to be executed when the associated regular expression is +matched. You can either start the code with an opening curly brace or +the sequence ``:=``. When the code starts with a curly brace then ``re2c`` counts the brace depth +and stops looking for code automatically. Otherwise curly braces are not +allowed and ``re2c`` stops looking for code at the first line that does +not begin with whitespace. If two or more rules overlap, the first rule +is preferred. + + ``regular-expression { C/C++ code }`` + + ``regular-expression := C/C++ code`` + +There is one special rule: default rule ``*`` + + ``* { C/C++ code }`` + + ``* := C/C++ code`` + +Note that default rule ``*`` differs from ``[^]``: default rule has the lowest priority, +matches any code unit (either valid or invalid) and always consumes one character; +while ``[^]`` matches any valid code point (not code unit) and can consume multiple +code units. In fact, when variable-length encoding is used, ``*`` +is the only possible way to match invalid input character (see `ENCODINGS`_ for details). + +If ``-c`` is active then each regular expression is preceded by a list +of comma separated condition names. Besides normal naming rules there +are two special cases: ``<*>`` (such rules are merged to all conditions) +and ``<>`` (such a rule cannot have an associated regular expression, +its code is merged to all actions). Non empty rules may furthermore specify the new +condition. In that case ``re2c`` will generate the necessary code to +change the condition automatically. Rules can use ``:=>`` as a shortcut +to automatically generate code that not only sets the +new condition state but also continues execution with the new state. 
A +shortcut rule should not be used in a loop where there is code between +the start of the loop and the ``re2c`` block unless ``re2c:cond:goto`` +is changed to ``continue``. If code is necessary before all rules (though not simple jumps) you +can doso by using ```` pseudo-rules. + + `` regular-expression { C/C++ code }`` + + `` regular-expression := C/C++ code`` + + `` * { C/C++ code }`` + + `` * := C/C++ code`` + + `` regular-expression => condition { C/C++ code }`` + + `` regular-expression => condition := C/C++ code`` + + `` * => condition { C/C++ code }`` + + `` * => condition := C/C++ code`` + + `` regular-expression :=> condition`` + + + ``<*> regular-expression { C/C++ code }`` + + ``<*> regular-expression := C/C++ code`` + + ``<*> * { C/C++ code }`` + + ``<*> * := C/C++ code`` + + ``<*> regular-expression => condition { C/C++ code }`` + + ``<*> regular-expression => condition := C/C++ code`` + + ``<*> * => condition { C/C++ code }`` + + ``<*> * => condition := C/C++ code`` + + ``<*> regular-expression :=> condition`` + + + ``<> { C/C++ code }`` + + ``<> := C/C++ code`` + + ``<> => condition { C/C++ code }`` + + ``<> => condition := C/C++ code`` + + ``<> :=> condition`` + + ``<> :=> condition`` + + + `` { C/C++ code }`` + + `` := C/C++ code`` + + `` { C/C++ code }`` + + `` := C/C++ code`` + +.. _NAMED DEFINITIONS: + +NAMED DEFINITIONS +~~~~~~~~~~~~~~~~~ + +Named definitions are of the form: + + ``name = regular-expression;`` + +If ``-F`` is active, then named definitions are also of the form: + + ``name { regular-expression }`` + + + +.. _INPLACE CONFIGURATIONS: + +INPLACE CONFIGURATIONS +~~~~~~~~~~~~~~~~~~~~~~ + +``re2c:condprefix = yyc;`` + Allows to specify the prefix used for + condition labels. That is this text is prepended to any condition label + in the generated output file. + +``re2c:condenumprefix = yyc;`` + Allows to specify the prefix used for + condition values. 
That is this text is prepended to any condition enum + value in the generated output file. + +``re2c:cond:divider = "/* *********************************** */";`` + Allows to customize the divider for condition blocks. You can use ``@@`` + to put the name of the condition or customize the placeholder using + ``re2c:cond:divider@cond``. + +``re2c:cond:divider@cond = @@;`` + Specifies the placeholder that will be + replaced with the condition name in ``re2c:cond:divider``. + +``re2c:cond:goto = "goto @@;";`` + Allows to customize the condition goto statements used with ``:=>`` style rules. You can use ``@@`` + to put the name of the condition or customize the placeholder using + ``re2c:cond:goto@cond``. You can also change this to ``continue;``, which + would allow you to continue with the next loop cycle including any code + between loop start and re2c block. + +``re2c:cond:goto@cond = @@;`` + Specifies the placeholder that will be replaced with the condition label in ``re2c:cond:goto``. + +``re2c:indent:top = 0;`` + Specifies the minimum amount of indentation to + use. Requires a numeric value greater than or equal to zero. + +``re2c:indent:string = "\t";`` + Specifies the string to use for indentation. Requires a string that should + contain only whitespace unless you need this for external tools. The easiest + way to specify spaces is to enclose them in single or double quotes. + If you do not want any indentation at all you can simply set this to "". + +``re2c:yych:conversion = 0;`` + When this setting is non zero, then ``re2c`` automatically generates + conversion code whenever yych gets read. In this case the type must be + defined using ``re2c:define:YYCTYPE``. + +``re2c:yych:emit = 1;`` + Generation of *yych* can be suppressed by setting this to 0. + +``re2c:yybm:hex = 0;`` + If set to zero then a decimal table is being used else a hexadecimal table will be generated. + +``re2c:yyfill:enable = 1;`` + Set this to zero to suppress generation of ``YYFILL (n)``. 
When using this be sure to verify that the generated + scanner does not read behind input. Allowing this behavior might + introduce severe security issues to your programs. + +``re2c:yyfill:check = 1;`` + This can be set to 0 to suppress output of the + pre condition using ``YYCURSOR`` and ``YYLIMIT`` which becomes useful when + ``YYLIMIT + YYMAXFILL`` is always accessible. + +``re2c:define:YYFILL = "YYFILL";`` + Substitution for ``YYFILL``. Note + that by default ``re2c`` generates argument in braces and semicolon after + ``YYFILL``. If you need to make ``YYFILL`` an arbitrary statement rather + than a call, set ``re2c:define:YYFILL:naked`` to non-zero and use + ``re2c:define:YYFILL@len`` to denote formal parameter inside of ``YYFILL`` + body. + +``re2c:define:YYFILL@len = "@@";`` + Any occurrence of this text + inside of ``YYFILL`` will be replaced with the actual argument. + +``re2c:yyfill:parameter = 1;`` + Controls argument in braces after + ``YYFILL``. If zero, argument is omitted. If non-zero, argument is + generated unless ``re2c:define:YYFILL:naked`` is set to non-zero. + +``re2c:define:YYFILL:naked = 0;`` + Controls argument in braces and + semicolon after ``YYFILL``. If non-zero, both argument and semicolon are + omitted. If zero, argument is generated unless + ``re2c:yyfill:parameter`` is set to zero and semicolon is generated + unconditionally. + +``re2c:startlabel = 0;`` + If set to a non zero integer then the start + label of the next scanner blocks will be generated even if not used by + the scanner itself. Otherwise the normal ``yy0`` like start label is only + being generated if needed. If set to a text value then a label with that + text will be generated regardless of whether the normal start label is + being used or not. This setting is being reset to 0 after a start + label has been generated. + +``re2c:labelprefix = "yy";`` + Allows to change the prefix of numbered + labels. The default is ``yy`` and can be set to any string that is a valid + label. 
+ +``re2c:state:abort = 0;`` + When not zero and switch ``-f`` is active then + the ``YYGETSTATE`` block will contain a default case that aborts and a -1 + case is used for initialization. + +``re2c:state:nextlabel = 0;`` + Used when ``-f`` is active to control + whether the ``YYGETSTATE`` block is followed by a ``yyNext:`` label line. + Instead of using ``yyNext`` you can usually also use configuration + ``startlabel`` to force a specific start label or default to ``yy0`` as + start label. Instead of using a dedicated label it is often better to + separate the ``YYGETSTATE`` code from the actual scanner code by placing a + ``/*!getstate:re2c*/`` comment. + +``re2c:cgoto:threshold = 9;`` + When ``-g`` is active this value specifies + the complexity threshold that triggers generation of jump tables rather + than using nested if's and decision bitfields. The threshold is compared + against a calculated estimation of if-s needed where every used bitmap + divides the threshold by 2. + +``re2c:yych:conversion = 0;`` + When the input uses signed characters and + ``-s`` or ``-b`` switches are in effect re2c allows to automatically convert + to the unsigned character type that is then necessary for its internal + single character. When this setting is zero or an empty string the + conversion is disabled. Using a non zero number the conversion is taken + from ``YYCTYPE``. If that is given by an inplace configuration that value + is being used. Otherwise it will be ``(YYCTYPE)`` and changes to that + configuration are no longer possible. When this setting is a string the + braces must be specified. Now assuming your input is a ``char *`` + buffer and you are using above mentioned switches you can set + ``YYCTYPE`` to ``unsigned char`` and this setting to either 1 or ``(unsigned char)``. + +``re2c:define:YYCONDTYPE = "YYCONDTYPE";`` + Enumeration used for condition support with ``-c`` mode. 
+ +``re2c:define:YYCTXMARKER = "YYCTXMARKER";`` + Allows to overwrite the + define ``YYCTXMARKER`` and thus avoiding it by setting the value to the + actual code needed. + +``re2c:define:YYCTYPE = "YYCTYPE";`` + Allows to overwrite the define + ``YYCTYPE`` and thus avoiding it by setting the value to the actual code + needed. + +``re2c:define:YYCURSOR = "YYCURSOR";`` + Allows to overwrite the define + ``YYCURSOR`` and thus avoiding it by setting the value to the actual code + needed. + +``re2c:define:YYDEBUG = "YYDEBUG";`` + Allows to overwrite the define + ``YYDEBUG`` and thus avoiding it by setting the value to the actual code + needed. + +``re2c:define:YYGETCONDITION = "YYGETCONDITION";`` + Substitution for + ``YYGETCONDITION``. Note that by default ``re2c`` generates braces after + ``YYGETCONDITION``. Set ``re2c:define:YYGETCONDITION:naked`` to non-zero to + omit braces. + +``re2c:define:YYGETCONDITION:naked = 0;`` + Controls braces after + ``YYGETCONDITION``. If zero, braces are generated. If non-zero, braces are + omitted. + +``re2c:define:YYSETCONDITION = "YYSETCONDITION";`` + Substitution for + ``YYSETCONDITION``. Note that by default ``re2c`` generates argument in + braces and semicolon after ``YYSETCONDITION``. If you need to make + ``YYSETCONDITION`` an arbitrary statement rather than a call, set + ``re2c:define:YYSETCONDITION:naked`` to non-zero and use + ``re2c:define:YYSETCONDITION@cond`` to denote formal parameter inside of + ``YYSETCONDITION`` body. + +``re2c:define:YYSETCONDITION@cond = "@@";`` + Any occurrence of this + text inside of ``YYSETCONDITION`` will be replaced with the actual + argument. + +``re2c:define:YYSETCONDITION:naked = 0;`` + Controls argument in braces + and semicolon after ``YYSETCONDITION``. If zero, both argument and + semicolon are generated. If non-zero, both argument and semicolon are + omitted. + +``re2c:define:YYGETSTATE = "YYGETSTATE";`` + Substitution for + ``YYGETSTATE``. 
Note that by default ``re2c`` generates braces after + ``YYGETSTATE``. Set ``re2c:define:YYGETSTATE:naked`` to non-zero to omit + braces. + +``re2c:define:YYGETSTATE:naked = 0;`` + Controls braces after + ``YYGETSTATE``. If zero, braces are generated. If non-zero, braces are + omitted. + +``re2c:define:YYSETSTATE = "YYSETSTATE";`` + Substitution for + ``YYSETSTATE``. Note that by default ``re2c`` generates argument in braces + and semicolon after ``YYSETSTATE``. If you need to make ``YYSETSTATE`` an + arbitrary statement rather than a call, set + ``re2c:define:YYSETSTATE:naked`` to non-zero and use + ``re2c:define:YYSETSTATE@state`` to denote formal parameter inside of + ``YYSETSTATE`` body. + +``re2c:define:YYSETSTATE@state = "@@";`` + Any occurrence of this text + inside of ``YYSETSTATE`` will be replaced with the actual argument. + +``re2c:define:YYSETSTATE:naked = 0;`` + Controls argument in braces and + semicolon after ``YYSETSTATE``. If zero, both argument and semicolon are + generated. If non-zero, both argument and semicolon are omitted. + +``re2c:define:YYLIMIT = "YYLIMIT";`` + Allows to overwrite the define + ``YYLIMIT`` and thus avoiding it by setting the value to the actual code + needed. + +``re2c:define:YYMARKER = "YYMARKER";`` + Allows to overwrite the define + ``YYMARKER`` and thus avoiding it by setting the value to the actual code + needed. + +``re2c:label:yyFillLabel = "yyFillLabel";`` + Allows to overwrite the name of the label ``yyFillLabel``. + +``re2c:label:yyNext = "yyNext";`` + Allows to overwrite the name of the label ``yyNext``. + +``re2c:variable:yyaccept = yyaccept;`` + Allows to overwrite the name of the variable ``yyaccept``. + +``re2c:variable:yybm = "yybm";`` + Allows to overwrite the name of the variable ``yybm``. + +``re2c:variable:yych = "yych";`` + Allows to overwrite the name of the variable ``yych``. 
+ +``re2c:variable:yyctable = "yyctable";`` + When both ``-c`` and ``-g`` are active then ``re2c`` uses this variable to generate a static jump table + for ``YYGETCONDITION``. + +``re2c:variable:yystable = "yystable";`` + Deprecated. + +``re2c:variable:yytarget = "yytarget";`` + Allows to overwrite the name of the variable ``yytarget``. + + + +.. _REGULAR EXPRESSIONS: + +REGULAR EXPRESSIONS +~~~~~~~~~~~~~~~~~~~ + +``"foo"`` + literal string ``"foo"``. ANSI-C escape sequences can be used. + +``'foo'`` + literal string ``"foo"`` (characters [a-zA-Z] treated + case-insensitive). ANSI-C escape sequences can be used. + +``[xyz]`` + character class; in this case, regular expression matches either ``x``, ``y``, or ``z``. + +``[abj-oZ]`` + character class with a range in it; matches ``a``, ``b``, any letter from ``j`` through ``o`` or ``Z``. + +``[^class]`` + inverted character class. + +``r \ s`` + match any ``r`` which isn't ``s``. ``r`` and ``s`` must be regular expressions + which can be expressed as character classes. + +``r*`` + zero or more occurences of ``r``. + +``r+`` + one or more occurences of ``r``. + +``r?`` + optional ``r``. + +``(r)`` + ``r``; parentheses are used to override precedence. + +``r s`` + ``r`` followed by ``s`` (concatenation). + +``r | s`` + either ``r`` or ``s`` (alternative). + +``r`` / ``s`` + ``r`` but only if it is followed by ``s``. Note that ``s`` is not + part of the matched text. This type of regular expression is called + "trailing context". Trailing context can only be the end of a rule + and not part of a named definition. + +``r{n}`` + matches ``r`` exactly ``n`` times. + +``r{n,}`` + matches ``r`` at least ``n`` times. + +``r{n,m}`` + matches ``r`` at least ``n`` times, but not more than ``m`` times. + +``.`` + match any character except newline. + +``name`` + matches named definition as specified by ``name`` only if ``-F`` is + off. 
If ``-F`` is active then this behaves like it was enclosed in double + quotes and matches the string "name". + +Character classes and string literals may contain octal or hexadecimal +character definitions and the following set of escape sequences: +``\a``, ``\b``, ``\f``, ``\n``, ``\r``, ``\t``, ``\v``, ``\\``. An octal character is defined by a backslash +followed by its three octal digits (e.g. ``\377``). +Hexadecimal characters from 0 to 0xFF are defined by backslash, a lower +cased ``x`` and two hexadecimal digits (e.g. ``\x12``). Hexadecimal characters from 0x100 to 0xFFFF are defined by backslash, a lower cased +``\u`` or an upper cased ``\X`` and four hexadecimal digits (e.g. ``\u1234``). +Hexadecimal characters from 0x10000 to 0xFFFFffff are defined by backslash, an upper cased ``\U`` +and eight hexadecimal digits (e.g. ``\U12345678``). + +The only portable "any" rule is the default rule ``*``. + + + +.. _SCANNER WITH STORABLE STATES: + +SCANNER WITH STORABLE STATES +---------------------------- + +When the ``-f`` flag is specified, ``re2c`` generates a scanner that can +store its current state, return to the caller, and later resume +operations exactly where it left off. + +The default operation of ``re2c`` is a +"pull" model, where the scanner asks for extra input whenever it needs it. However, this mode of operation assumes that the scanner is the "owner" +of the parsing loop, and that may not always be convenient. + +Typically, if there is a preprocessor ahead of the scanner in the +stream, or for that matter any other procedural source of data, the +scanner cannot "ask" for more data unless both scanner and source +live in separate threads. + +The ``-f`` flag is useful for just this situation: it lets users design +scanners that work in a "push" model, i.e. where data is fed to the +scanner chunk by chunk. When the scanner runs out of data to consume, it +just stores its state, and returns to the caller. 
When more input data is +fed to the scanner, it resumes operations exactly where it left off. + +Changes needed compared to the "pull" model: + +* User has to supply macros ``YYSETSTATE (state)`` and ``YYGETSTATE ()``. + +* The ``-f`` option inhibits declaration of ``yych`` and ``yyaccept``. So the + user has to declare these. Also the user has to save and restore these. + In the example ``examples/push_model/push.re`` these are declared as + fields of the (C++) class of which the scanner is a method, so they do + not need to be saved/restored explicitly. For C they could e.g. be made + macros that select fields from a structure passed in as parameter. + Alternatively, they could be declared as local variables, saved with + ``YYFILL (n)`` when it decides to return and restored at entry to the + function. Also, it could be more efficient to save the state from + ``YYFILL (n)`` because ``YYSETSTATE (state)`` is called unconditionally. + ``YYFILL (n)`` however does not get ``state`` as parameter, so we would have + to store state in a local variable by ``YYSETSTATE (state)``. + +* Modify ``YYFILL (n)`` to return (from the function calling it) if more input is needed. + +* Modify caller to recognise if more input is needed and respond appropriately. + +* The generated code will contain a switch block that is used to + restore the last state by jumping behind the corresponding ``YYFILL (n)`` + call. This code is automatically generated in the epilog of the first ``/*!re2c */`` + block. It is possible to trigger generation of the ``YYGETSTATE ()`` + block earlier by placing a ``/*!getstate:re2c*/`` comment. This is especially useful when the scanner code should be + wrapped inside a loop. + +Please see ``examples/push_model/push.re`` for "push" model scanner. The +generated code can be tweaked using inplace configurations ``state:abort`` +and ``state:nextlabel``. 
+ + + +SCANNER WITH CONDITION SUPPORT +------------------------------ + +You can precede regular expressions with a list of condition names when +using the ``-c`` switch. In this case ``re2c`` generates scanner blocks for +each condition, where each of the generated blocks has its own +precondition. The precondition is given by the interface define +``YYGETCONDITION()`` and must be of type ``YYCONDTYPE``. + +There are two special rule types. First, the rules of the condition ``<*>`` +are merged to all conditions (note that they have lower priority than +other rules of that condition). And second the empty condition list +allows to provide a code block that does not have a scanner part. +Meaning it does not allow any regular expression. The condition value +referring to this special block is always the one with the enumeration +value 0. This way the code of this special rule can be used to +initialize a scanner. It is in no way necessary to have these rules: but +sometimes it is helpful to have a dedicated uninitialized condition +state. + +Non empty rules allow to specify the new condition, which makes them +transition rules. Besides generating calls for the define +``YYSETCONDITION`` no other special code is generated. + +There is another kind of special rules that allow to prepend code to any +code block of all rules of a certain set of conditions or to all code +blocks to all rules. This can be helpful when some operation is common +among rules. For instance this can be used to store the length of the +scanned string. These special setup rules start with an exclamation mark +followed by either a list of conditions ``<!condition, ...>`` or a star +``<!*>``. When ``re2c`` generates the code for a rule whose state does not have a +setup rule and a star'd setup rule is present, then that code will be +used as setup code. + + + +.. 
_ENCODINGS: + +ENCODINGS +--------- + +``re2c`` supports the following encodings: ASCII (default), EBCDIC (``-e``), +UCS-2 (``-w``), UTF-16 (``-x``), UTF-32 (``-u``) and UTF-8 (``-8``). +See also inplace configuration ``re2c:flags``. + +The following concepts should be clarified when talking about encoding. +*Code point* is an abstract number, which represents a single encoding +symbol. *Code unit* is the smallest unit of memory, which is used in the +encoded text (it corresponds to one character in the input stream). One +or more code units can be needed to represent a single code point, +depending on the encoding. In *fixed-length* encoding, each code point +is represented with an equal number of code units. In *variable-length* +encoding, different code points can be represented with different numbers +of code units. + +* ASCII is a fixed-length encoding. Its code space includes 0x100 + code points, from 0 to 0xFF. One code point is represented with exactly one + 1-byte code unit, which has the same value as the code point. Size of + ``YYCTYPE`` must be 1 byte. + +* EBCDIC is a fixed-length encoding. Its code space includes 0x100 + code points, from 0 to 0xFF. One code point is represented with exactly + one 1-byte code unit, which has the same value as the code point. Size + of ``YYCTYPE`` must be 1 byte. + +* UCS-2 is a fixed-length encoding. Its code space includes 0x10000 + code points, from 0 to 0xFFFF. One code point is represented with + exactly one 2-byte code unit, which has the same value as the code + point. Size of ``YYCTYPE`` must be 2 bytes. + +* UTF-16 is a variable-length encoding. Its code space includes all + Unicode code points, from 0 to 0xD7FF and from 0xE000 to 0x10FFFF. One + code point is represented with one or two 2-byte code units. Size of + ``YYCTYPE`` must be 2 bytes. + +* UTF-32 is a fixed-length encoding. Its code space includes all + Unicode code points, from 0 to 0xD7FF and from 0xE000 to 0x10FFFF.
One + code point is represented with exactly one 4-byte code unit. Size of + ``YYCTYPE`` must be 4 bytes. + +* UTF-8 is a variable-length encoding. Its code space includes all + Unicode code points, from 0 to 0xD7FF and from 0xE000 to 0x10FFFF. One + code point is represented with a sequence of one, two, three or four + 1-byte code units. Size of ``YYCTYPE`` must be 1 byte. + +In Unicode, values from range 0xD800 to 0xDFFF (surrogates) are not +valid Unicode code points; any encoded sequence of code units that +would map to Unicode code points in the range 0xD800-0xDFFF is +ill-formed. The user can control how ``re2c`` treats such ill-formed +sequences with the ``--encoding-policy <policy>`` flag (see `OPTIONS`_ +for full explanation). + +For some encodings, there are code units that never occur in a valid +encoded stream (e.g. 0xFF byte in UTF-8). If the generated scanner must +check for invalid input, the only true way to do so is to use default +rule ``*``. Note that full range rule ``[^]`` won't catch invalid code units when variable-length encoding is used +(``[^]`` means "all valid code points", while default rule ``*`` means "all possible code units"). + + + +GENERIC INPUT API +----------------- + +``re2c`` usually operates on input using pointer-like primitives +``YYCURSOR``, ``YYMARKER``, ``YYCTXMARKER`` and ``YYLIMIT``. + +Generic input API (enabled with ``--input custom`` switch) allows to +customize input operations.
In this mode, ``re2c`` will express all +operations on input in terms of the following primitives: + + +---------------------+-----------------------------------------------------+ + | ``YYPEEK ()`` | get current input character | + +---------------------+-----------------------------------------------------+ + | ``YYSKIP ()`` | advance to the next character | + +---------------------+-----------------------------------------------------+ + | ``YYBACKUP ()`` | backup current input position | + +---------------------+-----------------------------------------------------+ + | ``YYBACKUPCTX ()`` | backup current input position for trailing context | + +---------------------+-----------------------------------------------------+ + | ``YYRESTORE ()`` | restore current input position | + +---------------------+-----------------------------------------------------+ + | ``YYRESTORECTX ()`` | restore current input position for trailing context | + +---------------------+-----------------------------------------------------+ + | ``YYLESSTHAN (n)`` | check if less than ``n`` input characters are left | + +---------------------+-----------------------------------------------------+ + +This `article `_ +has more details, and you can find some usage examples +`here `_. + + + +UNDERSTANDING RE2C +------------------ + +The subdirectory ``examples`` of the ``re2c`` distribution contains a few step +by step examples to get you started with ``re2c``. + + + +BUGS +---- + +* Difference only works for character sets, and not in UTF-8 mode. +* Some features don't work together (such as reusable rules, conditions, setup rules and default rules). + + + +SEE ALSO +-------- + +flex(1), lex(1), `quex `_. 
+ + + +AUTHORS +------- + +* Peter Bumbulis peter@csg.uwaterloo.ca +* Brian Young bayoung@acm.org +* Dan Nuffer nuffer@users.sourceforge.net +* Marcus Boerger helly@users.sourceforge.net +* Hartmut Kaiser hkaiser@users.sourceforge.net +* Emmanuel Mogenet mgix@mgix.com (added storable state) +* Ulya Trofimovich skvadrik@gmail.com + +VERSION INFORMATION +------------------- + +This manpage describes ``re2c`` version 0.14.1.dev, package date 15 Oct 2015. diff --git a/src/options.rst b/src/options.rst new file mode 100644 index 00000000..42a2909a --- /dev/null +++ b/src/options.rst @@ -0,0 +1,153 @@ +``-? -h --help`` + Invoke a short help. + +``-b --bit-vectors`` + Implies ``-s``. Use bit vectors as well in the + attempt to coax better code out of the compiler. Most useful for + specifications with more than a few keywords (e.g. for most programming + languages). + +``-c --conditions`` + Used to support (f)lex-like condition support. + +``-d --debug-output`` + Creates a parser that dumps information about + the current position and in which state the parser is while parsing the + input. This is useful to debug parser issues and states. If you use this + switch you need to define a macro ``YYDEBUG`` that is called like a + function with two parameters: ``void YYDEBUG (int state, char current)``. + The first parameter receives the state or ``-1`` and the second parameter + receives the input at the current cursor. + +``-D --emit-dot`` + Emit Graphviz dot data. It can then be processed + with e.g. ``dot -Tpng input.dot > output.png``. Please note that + scanners with many states may crash dot. + +``-e --ecb`` + Generate a parser that supports EBCDIC. The generated + code can deal with any character up to 0xFF. In this mode ``re2c`` assumes + that input character size is 1 byte. This switch is incompatible with + ``-w``, ``-x``, ``-u`` and ``-8``. + +``-f --storable-state`` + Generate a scanner with support for storable state. 
+ +``-F --flex-syntax`` + Partial support for flex syntax. When this flag + is active then named definitions must be surrounded by curly braces and + can be defined without an equal sign and the terminating semicolon. + Instead names are treated as direct double quoted strings. + +``-g --computed-gotos`` + Generate a scanner that utilizes GCC's + computed goto feature. That is ``re2c`` generates jump tables whenever a + decision is of a certain complexity (e.g. a lot of if conditions are + otherwise necessary). This is only usable with GCC and produces output + that cannot be compiled with any other compiler. Note that this implies + ``-b`` and that the complexity threshold can be configured using the + inplace configuration ``cgoto:threshold``. + +``-i --no-debug-info`` + Do not output ``#line`` information. This is + useful when you want to use a CMS tool with the ``re2c`` output which you + might want if you do not require your users to have ``re2c`` themselves + when building from your source. + +``-o OUTPUT --output=OUTPUT`` + Specify the ``OUTPUT`` file. + +``-r --reusable`` + Allows reuse of scanner definitions with ``/*!use:re2c */`` after ``/*!rules:re2c */``. + In this mode no ``/*!re2c */`` block and exactly one ``/*!rules:re2c */`` must be present. + The rules are being saved and used by every ``/*!use:re2c */`` block that follows. + These blocks can contain inplace configurations, especially ``re2c:flags:e``, + ``re2c:flags:w``, ``re2c:flags:x``, ``re2c:flags:u`` and ``re2c:flags:8``. + That way it is possible to create the same scanner multiple times for + different character types, different input mechanisms or different output mechanisms. + The ``/*!use:re2c */`` blocks can also contain additional rules that will be appended + to the set of rules in ``/*!rules:re2c */``. + +``-s --nested-ifs`` + Generate nested ifs for some switches. Many + compilers need this assist to generate better code.
+ +``-t HEADER --type-header=HEADER`` + Create a ``HEADER`` file that + contains types for the (f)lex-like condition support. This can only be + activated when ``-c`` is in use. + +``-u --unicode`` + Generate a parser that supports UTF-32. The generated + code can deal with any valid Unicode character up to 0x10FFFF. In this + mode ``re2c`` assumes that input character size is 4 bytes. This switch is + incompatible with ``-e``, ``-w``, ``-x`` and ``-8``. This implies ``-s``. + +``-v --version`` + Show version information. + +``-V --vernum`` + Show the version as a number XXYYZZ. + +``-w --wide-chars`` + Generate a parser that supports UCS-2. The + generated code can deal with any valid Unicode character up to 0xFFFF. + In this mode ``re2c`` assumes that input character size is 2 bytes. This + switch is incompatible with ``-e``, ``-x``, ``-u`` and ``-8``. This implies + ``-s``. + +``-x --utf-16`` + Generate a parser that supports UTF-16. The generated + code can deal with any valid Unicode character up to 0x10FFFF. In this + mode ``re2c`` assumes that input character size is 2 bytes. This switch is + incompatible with ``-e``, ``-w``, ``-u`` and ``-8``. This implies ``-s``. + +``-8 --utf-8`` + Generate a parser that supports UTF-8. The generated + code can deal with any valid Unicode character up to 0x10FFFF. In this + mode ``re2c`` assumes that input character size is 1 byte. This switch is + incompatible with ``-e``, ``-w``, ``-x`` and ``-u``. + +``--case-insensitive`` + All strings are case insensitive, so all + "-expressions are treated in the same way '-expressions are. + +``--case-inverted`` + Invert the meaning of single and double quoted + strings. With this switch single quotes are case sensitive and double + quotes are case insensitive. + +``--no-generation-date`` + Suppress date output in the generated output so + that it only shows the ``re2c`` version. + +``--encoding-policy POLICY`` + Specify how ``re2c`` must treat Unicode + surrogates. 
``POLICY`` can be one of the following: ``fail`` (abort with + error when surrogate encountered), ``substitute`` (silently substitute + surrogate with error code point 0xFFFD), ``ignore`` (treat surrogates as + normal code points). By default ``re2c`` ignores surrogates (for backward + compatibility). Unicode standard says that standalone surrogates are + invalid code points, but different libraries and programs treat them + differently. + +``--input INPUT`` + Specify re2c input API. ``INPUT`` can be one of the + following: ``default``, ``custom``. + +``-S --skeleton`` + Instead of embedding re2c-generated code into C/C++ + source, generate a self-contained program for the same DFA. Most useful + for correctness and performance testing. + +``--empty-class POLICY`` + What to do if user inputs empty character + class. ``POLICY`` can be one of the following: ``match-empty`` (match empty + input: pretty illogical, but this is the default for backwards + compatibility reasons), ``match-none`` (fail to match on any input), + ``error`` (compilation error). Note that there are various ways to + construct empty class, e.g.: [], [^\\x00-\\xFF], + [\\x00-\\xFF][\\x00-\\xFF]. + +``-1 --single-pass`` + Deprecated and does nothing (single pass is by default now). -- 2.40.0