From 69f0386ccce689e84905a122a4bdbd6e87bcf9c7 Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Thu, 22 Oct 2015 14:02:37 +0100 Subject: [PATCH] Added two examples: simple and YYMAXFILL. --- src/css/default.css | 7 +- src/examples.rst | 124 ++++++++++++++++++ .../01_recognizing_integer_literals.re | 31 +++++ src/examples/02_recognizing_strings.re | 48 +++++++ src/manual.rst | 59 +-------- 5 files changed, 214 insertions(+), 55 deletions(-) create mode 100644 src/examples.rst create mode 100644 src/examples/01_recognizing_integer_literals.re create mode 100644 src/examples/02_recognizing_strings.re diff --git a/src/css/default.css b/src/css/default.css index 4e7d579d..94742ddb 100644 --- a/src/css/default.css +++ b/src/css/default.css @@ -40,8 +40,13 @@ pre.code { pre.code .ln { color: grey; } /* line numbers */ pre.code .comment, code .comment { color: #0077ff; } -pre.code .keyword, code .keyword { color: #aa3300; font-weight: bold; } +pre.code .comment.preproc, code .comment.preproc { color: #9966ff; } +pre.code .keyword, code .keyword { color: #003399; font-weight: bold; } +pre.code .keyword.type, code .keyword.type { color: #aa3300; } +pre.code .operator, code .operator { color: #000066; } +pre.code .punctuation, code .punctuation { color: #000066; font-weight: bold; } pre.code .literal.string, code .literal.string { color: #ff5500; } +pre.code .literal.number, code .literal.number { color: #ff5500; font-weight: bold; } pre.code .name.builtin, code .name.builtin { color: #352B84 } pre.code .deleted, code .deleted { background-color: #DEB0A1} pre.code .inserted, code .inserted { background-color: #A3D289} diff --git a/src/examples.rst b/src/examples.rst new file mode 100644 index 00000000..6f3be08a --- /dev/null +++ b/src/examples.rst @@ -0,0 +1,124 @@ +.. 
_Recognizing integer literals: + +Recognizing integer literals +---------------------------- + +This program simply loops over its command-line arguments +and tries to match each argument against one of the four patterns: +binary, octal, decimal and hexadecimal integer literals. +The numbers are not *parsed* (their numeric value is not retrieved), they are merely *recognized*. + +.. include:: examples/01_recognizing_integer_literals.re + :code: cpp + :number-lines: + +A couple of things should be noted: + +* Default case (when none of the patterns matched) is handled properly (line 17). + +* Check for the end of input is disabled (line 9). + In this case it is safe, because all arguments are ``NUL``-terminated + and none of the rules matches ``NUL`` in the middle: + when lexer reaches end of input, it will see ``NUL`` and stop. + It's a common practice to use ``re2c:yyfill:enable = 0;`` + in cases when input character set is restricted and one special + character can be chosen to indicate end of input. + **But do make sure that the terminating character is not allowed in the middle of a rule!** + +* ``YYMARKER`` (line 6) is needed because rules overlap: + it backs up the input position of the longest successful match. + Say, we have overlapping rules ``"a"`` and ``"abc"`` and input string ``"abd"``: + by the time ``"a"`` matches there's still a chance to match ``"abc"``. + But when lexer sees ``'d'`` it must roll back. + (You might wonder why the programmer has to bother with ``YYMARKER`` at all: couldn't re2c generate a local variable ``YYMARKER``? + The thing is, all input pointers must be updated synchronously by ``YYFILL``). + +Generate, compile and run: + +.. code-block:: bash + + $ re2c -o example.cc example.re + $ g++ -o example example.cc + $ ./example 0 -12 +345 12345678901234567890 0xAbcDEf 0x00 007 0B0 0b110101010 0x 0b a ? 
"" + dec: 0 + dec: -12 + dec: +345 + dec: 12345678901234567890 + hex: 0xAbcDEf + hex: 0x00 + oct: 007 + bin: 0B0 + bin: 0b110101010 + err: 0x + err: 0b + err: a + err: ? + err: + +.. _Recognizing strings: the need for YYMAXFILL: + +Recognizing strings: the need for YYMAXFILL +------------------------------------------- + +This example is about recognizing strings. +Strings may contain *any* characters in the range [0 - 0xFF] except quotes (quotes should be escaped). +It means that (unlike the previous example, `Recognizing integer literals`_) +we cannot use ``NUL`` or any other character as a terminator. +We must explicitly check for end of input. + +So how does it work? +The simplest possible way is to check on each character (right before advancing input position). +But this is very slow. +Instead, re2c estimates maximal lexeme length ``YYMAXFILL`` (disregarding loops) +and generates a check that there are at least ``YYMAXFILL`` characters left: + +.. code-block:: cpp + + if ((YYLIMIT - YYCURSOR) < YYMAXFILL) + YYFILL(YYMAXFILL); + +``YYLIMIT`` must point at the end of input (so that ``YYLIMIT[-1]`` is the last input character). +These checks are inserted at start and before loops. +If there are not enough input characters, the generated lexer calls ``YYFILL(YYMAXFILL)`` +so that the programmer can supply more input or stop. + +The common practice is to pad input with ``YYMAXFILL`` fake characters. +**The padding should not form a valid lexeme suffix!** + +.. include:: examples/02_recognizing_strings.re + :code: cpp + :number-lines: + +Notes: + +* ``/*!max:re2c*/`` (line 4) tells re2c to generate ``#define YYMAXFILL <n>``. +* Input string is padded with ``YYMAXFILL`` zeroes (line 15). + Zeroes do not form a valid lexeme suffix (but padding with quotes would confuse the lexer ``;)``). +* ``YYFILL`` simply stops: there's nothing more to lex (line 30). +* We have to use ``re2c:define:YYFILL:naked = 1;`` (line 31) + in order to suppress passing parameter to ``YYFILL``. 
+ (It was an unfortunate idea to make ``YYFILL`` a call expression by default: + ``YYFILL`` has to stop the lexer eventually, that's why it has to be a macro and not a function. + One should either set ``re2c:define:YYFILL:naked = 1;``, or define ``YYFILL(n)`` as a macro.) + +Generate, compile and run: + +.. code-block:: bash + + $ re2c -o example.cc example.re + $ g++ -o example example.cc + $ ./example \"a\ momentary\" \"\" \"lap\"se\" \"of \"rea\\\"son\" + str: "a momentary" + str: "" + err: "lap"se" + err: "of + str: "rea\"son" + +.. _Arbitrary large input and YYFILL: + +Arbitrary large input and YYFILL +-------------------------------- + +Suppose the input cannot be mapped in memory at once. +The usual thing to do is to allocate a reasonably sized buffer and to read +input chunk by chunk. diff --git a/src/examples/01_recognizing_integer_literals.re b/src/examples/01_recognizing_integer_literals.re new file mode 100644 index 00000000..99a41f28 --- /dev/null +++ b/src/examples/01_recognizing_integer_literals.re @@ -0,0 +1,31 @@ +#include <stdio.h> + +static const char *lex(const char *input) /* classifies one argument: returns "bin", "oct", "dec", "hex" or "err" */ +{ + const char *YYCURSOR = input; + const char *YYMARKER; + /*!re2c + re2c:define:YYCTYPE = char; + re2c:yyfill:enable = 0; + + end = "\x00"; + bin = "0" [bB] [01]+; + oct = "0" [0-7]+; + dec = [+-]? 
("0" | [1-9][0-9]*); + hex = "0" [xX] [0-9a-fA-F]+; + + * { return "err"; } + bin end { return "bin"; } + oct end { return "oct"; } + dec end { return "dec"; } + hex end { return "hex"; } + */ +} + +int main(int argc, char **argv) /* prints "<kind>: <argument>" for every command-line argument */ +{ + for (int i = 1; i < argc; ++i) { + printf ("%s: %s\n", lex(argv[i]), argv[i]); + } + return 0; +} diff --git a/src/examples/02_recognizing_strings.re b/src/examples/02_recognizing_strings.re new file mode 100644 index 00000000..f0b39d3e --- /dev/null +++ b/src/examples/02_recognizing_strings.re @@ -0,0 +1,48 @@ +#include <stdio.h> +#include <string.h> + +/*!max:re2c*/ + +struct input_t { /* owns a copy of the string followed by YYMAXFILL zero bytes */ + size_t len; + char *str; + + input_t(const char *s) + : len(strlen(s)) + , str(new char[len + YYMAXFILL]) + { + memcpy(str, s, len); + memset(str + len, 0, YYMAXFILL); + } + ~input_t() + { + delete[]str; + } +}; + +static const char *lex(const input_t & input) /* returns "str" for a quoted string followed by a zero byte, otherwise "err" */ +{ + const char *YYCURSOR = input.str; + const char *const YYLIMIT = input.str + input.len + YYMAXFILL; + const char *YYMARKER; + /*!re2c + re2c:define:YYCTYPE = char; + re2c:define:YYFILL = "return \"err\";"; + re2c:define:YYFILL:naked = 1; + + end = "\x00"; + str = "\"" ([^"] | "\\\"")* "\""; + + * { return "err"; } + str end { return "str"; } + */ +} + +int main(int argc, char **argv) /* pads each argument via input_t, then prints "<kind>: <argument>" */ +{ + for (int i = 1; i < argc; ++i) { + input_t arg(argv[i]); + printf("%s: %s\n", lex(arg), arg.str); + } + return 0; +} diff --git a/src/manual.rst b/src/manual.rst index 35bb1a3b..2530ce07 100644 --- a/src/manual.rst +++ b/src/manual.rst @@ -877,60 +877,11 @@ Examples -------------------------------------------------------------------------------- -The subdirectory ``examples`` of the ``re2c`` distribution contains a few step -by step examples to get you started with ``re2c``. - -Given the following code: - -.. 
code-block:: c - - unsigned int stou (const char \* s) - { - # define YYCTYPE char const - YYCTYPE * YYCURSOR = s; - unsigned int result = 0; - for (;;) - { - /*!re2c - re2c:yyfill:enable = 0; - - "\x00" { return result; } - [0-9] { result = result * 10 + c; continue; } - */ - } - } - -``re2c -is`` will generate: - -.. code-block:: c - - /* Generated by re2c 0.13.7.dev on Mon Jul 14 13:37:46 2014 */ - unsigned int stou (const char * s) - { - # define YYCTYPE char - const YYCTYPE * YYCURSOR = s; - unsigned int result = 0; - - for (;;) - { - { - YYCTYPE yych; - - yych = *YYCURSOR; - if (yych <= 0x00) goto yy3; - if (yych <= '/') goto yy2; - if (yych <= '9') goto yy5; - - yy2: - yy3: - ++YYCURSOR; - { return result; } - yy5: - ++YYCURSOR; - { result = result * 10 + c; continue; } - } - } - } +All examples are written in C++-98. +`Do let me know `_ if you notice any obvious lies and errors. +You can find more examples in subdirectory ``examples`` of the ``re2c`` distribution. + +.. include:: examples.rst .. _Changelog: -- 2.40.0