--- /dev/null
+#include <stdlib.h>
+#include <stdio.h>
+
+static void lex(const char *cur, const char *lim)
+{ /* Scan the bytes in [cur, lim) and print, one per line, every match of
+   * "__" followed by one or more characters from [a-zA-Z0-9_].
+   * The end of input is detected by YYSKIP; YYFILL is switched off below
+   * with re2c:yyfill:enable = 0.
+   */
+ const char *mar, *tok; /* mar: position saved by YYBACKUP; tok: start of the current match attempt */
+# define YYCTYPE char
+# define YYPEEK() *cur /* reads the current byte; no bounds check is done here */
+# define YYSKIP() if (++cur == lim) return; /* advance, and leave lex() as soon as cur reaches lim */
+# define YYBACKUP() mar = cur
+# define YYRESTORE() cur = mar
+loop: /* every rule jumps back here to start the next match at the current position */
+ tok = cur;
+ /*!re2c
+ re2c:yyfill:enable = 0;
+
+ * { goto loop; }
+ "__" [a-zA-Z0-9_]+ {
+ printf("%.*s\n", (int) (cur - tok), tok);
+ goto loop;
+ }
+ */
+} /* NOTE(review): YYSKIP returns before any rule action runs, so a match that
+   * extends to the very last byte of the input appears never to be printed.
+   * YYPEEK also reads *cur before any check, so callers presumably must pass
+   * a non-empty range. Confirm both against the generated code.
+   */
+
+/*
+ * Read the whole file named by argv[1] into memory and run lex() over it.
+ *
+ * Returns 0 on success and 1 on any failure: missing argument, file that
+ * cannot be opened, sized or read, or failed allocation. A diagnostic is
+ * written to stderr in each failure case.
+ */
+int main(int argc, char **argv)
+{
+    if (argc < 2) {
+        fprintf(stderr, "no input files\n");
+        return 1;
+    }
+
+    FILE *file = fopen(argv[1], "rb");
+    if (file == NULL) {
+        fprintf(stderr, "cannot open file\n");
+        return 1;
+    }
+
+    /* Find the file size by seeking to the end; ftell() reports -1 on error,
+     * which must not be converted to size_t unchecked. */
+    long fsize = -1;
+    if (fseek(file, 0, SEEK_END) == 0) {
+        fsize = ftell(file);
+    }
+    if (fsize < 0 || fseek(file, 0, SEEK_SET) != 0) {
+        fprintf(stderr, "cannot determine file size\n");
+        fclose(file);
+        return 1;
+    }
+
+    int status = 0;
+
+    /* lex() reads the first byte before it performs any end-of-input check,
+     * so an empty file is skipped rather than handed to it. */
+    if (fsize > 0) {
+        const size_t size = (size_t) fsize;
+        char *buffer = (char*) malloc(size);
+
+        if (buffer == NULL) {
+            fprintf(stderr, "cannot allocate memory\n");
+            status = 1;
+        } else if (fread(buffer, 1, size, file) != size) {
+            fprintf(stderr, "cannot read file\n");
+            status = 1;
+        } else {
+            lex(buffer, buffer + size);
+        }
+
+        free(buffer);
+    }
+
+    fclose(file);
+    return status;
+}
--- /dev/null
+#include <stdio.h>
+#include <string.h>
+
+static int lex(const char *cur, const char *lim)
+{ /* Match one or more characters from [0-9a-zA-Z], then ';', then a 0 byte,
+   * over the input [cur, lim). The 0 byte is not stored in the input:
+   * YYPEEK yields 0 once cur is no longer below lim.
+   * On a match, prints the matched text minus its last character and returns 0;
+   * otherwise prints "error" and returns 1.
+   */
+ const char *mar, *tok = cur; /* mar: position saved by YYBACKUP; tok: start of input, used when printing */
+# define YYCTYPE char
+# define YYPEEK() (cur < lim ? *cur : 0) /* fake sentinel: any read at or past lim yields 0 */
+# define YYSKIP() ++cur
+# define YYBACKUP() mar = cur
+# define YYRESTORE() cur = mar
+ /*!re2c
+ re2c:yyfill:enable = 0;
+
+ * { printf("error\n"); return 1; }
+ [0-9a-zA-Z]+ [;] [\x00] {
+ printf("%.*s\n", (int) (cur - tok) - 1, tok);
+ return 0;
+ }
+ */
+}
+
+/*
+ * Lex the single command-line argument after replacing its terminating
+ * NUL with ';', so that the input handed to lex() has no real sentinel.
+ * Returns 1 when the argument count is wrong, otherwise the result of lex().
+ */
+int main(int argc, char **argv)
+{
+    if (argc != 2) {
+        return 1;
+    }
+
+    char *input = argv[1];
+    const size_t length = strlen(input);
+
+    /* overwrite terminating NULL */
+    input[length] = ';';
+
+    const char *limit = input + length + 1;
+    return lex(input, limit);
+}
--- /dev/null
+#include <fstream>
+
+static void conv(std::ifstream &in, std::ofstream &out)
+{ /* Copy characters from `in` to `out`, writing a single '\n' for every
+   * "\r\n" pair and passing every other character through unchanged.
+   * Input is read directly from the stream through the YY* macros below;
+   * YYFILL is switched off with re2c:yyfill:enable = 0.
+   */
+ std::streampos mar; /* stream position saved by YYBACKUP */
+# define YYCTYPE char
+# define YYPEEK() in.peek()
+# define YYSKIP() do { in.ignore(); if (in.eof()) return; } while(0) /* consume one character; leave conv() once the stream reports EOF */
+# define YYBACKUP() mar = in.tellg() /* backtracking relies on the stream being seekable */
+# define YYRESTORE() in.seekg(mar)
+loop: /* each rule jumps back here to process the next character(s) */
+ /*!re2c
+ re2c:yyfill:enable = 0;
+
+ * { out.put(yych); goto loop; }
+ "\r\n" { out.put('\n'); goto loop; }
+ */
+}
+
+/*
+ * Convert the file named by argv[1] into the file named by argv[2] using conv().
+ * Returns 1 on a wrong argument count, 2 when either stream is in a failed
+ * state after opening, and 0 otherwise.
+ */
+int main(int argc, char **argv)
+{
+    if (argc != 3) {
+        return 1;
+    }
+
+    const char *src_path = argv[1];
+    const char *dst_path = argv[2];
+
+    std::ifstream in(src_path, std::ios::binary);
+    std::ofstream out(dst_path, std::ios::binary);
+
+    /* operator! on a stream reports the same condition as fail(). */
+    if (!in || !out) {
+        return 2;
+    }
+
+    conv(in, out);
+    return 0;
+}
--- /dev/null
+Strings in binaries
+-------------------
+
+The program below searches for all strings starting with a double underscore in the given binary file.
+The same method can be used to search for arbitrary signatures or keywords.
+Since we are dealing with a *binary* file, we cannot use the sentinel method to check for the end of input:
+binary files can contain all kinds of characters, so no sentinel can be chosen.
+The usual way in such cases is to use ``YYLIMIT``-based checking: it requires padding input with ``YYMAXFILL`` fake characters,
+but it's not a problem since the input is buffered anyway.
+
+However, this example takes another approach:
+it uses generic API to override the default checking mechanism.
+First, it disables the usual mechanism: suppresses the generation of ``YYLESSTHAN`` and ``YYFILL`` with ``re2c:yyfill:enable = 0;`` configuration.
+Second, it redefines ``YYSKIP`` to perform checking before advancing to the next input character.
+In principle, this approach is less efficient:
+checking happens more frequently, as ``YYSKIP`` is invoked on each input character,
+while ``YYLESSTHAN`` happens only once per each strongly connected component of automaton.
+However, it makes it possible to avoid padding.
+
+:download:`[binsyms.re] <15_binsyms.re.txt>`
+
+.. literalinclude:: 15_binsyms.re.txt
+ :language: cpp
+ :linenos:
+
+Compile:
+
+.. code-block:: bash
+
+ $ re2c --input custom -o binsyms.cc binsyms.re
+ $ g++ -o binsyms binsyms.cc
+
+Run:
+
+.. code-block:: bash
+
+ $ ./binsyms binsyms
+ __gmon_start__
+ __libc_start_main
+ __off_t
+ __cxx11
+ __gnu_cxx3divExx
+ __off64_t
+ __pad1
+ __pad2
+ __pad3
+ __pad4
+ __pad5
+ __compar_fn_t
+ __gnu_cxx
+ __init_array_start
+ __libc_csu_fini
+ __libc_csu_init
+ __init_array_end
+ __GNU_EH_FRAME_HDR
+ __init_array_end
+ __init_array_start
+ __libc_csu_fini
+ __gmon_start__
+ __libc_start_main
+ __data_start
+ __TMC_END__
+ __dso_handle
+ __libc_csu_init
+ __bss_start
+
+
--- /dev/null
+Fake sentinel
+-------------
+
+This example explores the case when we know the *length* of input,
+but there is no terminating character and buffering is not possible.
+In such cases we cannot use the usual sentinel method; and we cannot use ``YYLIMIT``-based method as it requires ``YYMAXFILL`` padding.
+The choice then is to use generic API:
+disable the default checking mechanism with ``re2c:yyfill:enable = 0;``
+and use one of the primitives ``YYPEEK`` and ``YYSKIP`` to check for the end of input.
+
+In this example we use ``YYPEEK`` to emulate *fake sentinel*:
+every time the lexer peeks a new character, it first checks for the end of input:
+if it has already been reached, ``YYPEEK`` returns ``NULL`` (though the actual string has no terminating ``NULL``).
+Checking on every ``YYPEEK`` is less efficient than the usual sentinel method
+(which performs no checking at all), but it can be more efficient than copying input to buffer and padding it with a real sentinel character.
+
+Note that fake sentinel method also relies on the fact that sentinel cannot appear in the middle of well-formed input.
+If the input can contain arbitrary characters, then one should utilize ``YYSKIP`` as shown in `this example <example_15.html>`_.
+
+:download:`[fake_sentinel.re] <16_fake_sentinel.re.txt>`
+
+.. literalinclude:: 16_fake_sentinel.re.txt
+ :language: cpp
+ :linenos:
+
+Compile:
+
+.. code-block:: bash
+
+ $ re2c --input custom -o fake_sentinel.cc fake_sentinel.re
+ $ g++ -o fake_sentinel fake_sentinel.cc
+
+Run:
+
+.. code-block:: bash
+
+ $ ./fake_sentinel somestring
+ somestring;
--- /dev/null
+std::ifstream
+-------------
+
+This example shows how to override re2c input mechanism:
+instead of reading input characters from a buffer in memory, read them directly from file using STL ``std::ifstream`` class.
+Note that we use ``tellg`` / ``seekg`` and rely on the ability to move backward and forward in the input stream:
+this might not be possible, for example with ``stdin`` stream.
+The program below converts Windows-style line endings ``CR LF`` to Unix-style line endings ``LF``.
+
+This program uses a non-standard way of checking for the end of input:
+it disables the usual checking mechanism with ``re2c:yyfill:enable = 0;``
+(this suppresses the generation of ``YYLESSTHAN`` and ``YYFILL``)
+and puts the responsibility for checking on ``YYSKIP``.
+This results in more frequent checks: ``YYSKIP`` happens on each input character,
+while ``YYLESSTHAN`` happens only once per each strongly connected component of automaton.
+However, this method makes it possible to avoid padding, which would require buffering input and nullify all advantages of direct-file input.
+
+
+
+:download:`[ifstream.re] <17_ifstream.re.txt>`
+
+.. literalinclude:: 17_ifstream.re.txt
+ :language: cpp
+ :linenos:
+
+Compile:
+
+.. code-block:: bash
+
+ $ re2c --input custom -o ifstream.cc ifstream.re
+ $ g++ -o ifstream ifstream.cc
+
URI (RFC-3986) <example_10>
HTTP (RFC-7230) <example_11>
Braille patterns <example_06>
+ Strings in binaries <example_15>
+ Fake sentinel <example_16>
+ std::ifstream <example_17>
Examples have been written with two goals in mind.
First, they are practical: each example solves a distinct real-world problem,
ranging from simple recognizers to complex parsers conforming to real-world standards and specifications.
Second, examples show various aspects of using re2c API:
- Checking for the end of input:
- simple and efficient `sentinel method <example_01.html>`_
- which should be used when it is possible to mark the end of input with a *sentinel character* that never appears in the middle of well-formed input;
- and the more complex `general method <example_02.html>`_ based on ``YYLIMIT``
- which requires apending ``YYMAXFILL`` characters of padding at the end of input.
+ Checking for the end of input: this can be done in a number of different ways.
+ The simplest and the most efficient way is the sentinel method demonstrated by `lexing numbers <example_01.html>`_ example:
+ it should be used when there is a *sentinel character* that never appears in the middle of well-formed input,
+ such as the ``NULL`` character in null-terminated strings.
+ If the input is buffered, sentinel should be `appended at the end of buffer <example_09.html>`_.
+ If appending is not possible, one can `emulate fake sentinel <example_16.html>`_ using generic API.
+ Another, more general (but also less efficient) method is based on comparison of current input position and the end position:
+ that is, comparison of ``YYCURSOR`` and ``YYLIMIT`` as explained in `parsing strings <example_02.html>`_ example,
+ or using ``YYLESSTHAN`` in case of generic API.
+ By default, this method requires padding input with ``YYMAXFILL`` fake characters;
+ if padding is undesirable or impossible, one can override the checking mechanism using generic API
+ and `perform checks on each input character <example_15.html>`_
+ (also used in `std::ifstream <example_17.html>`_ example).
Handling `large input <example_03.html>`_: how to organize buffering and how to refill buffer with ``YYFILL``.
Some additional details of handling *tags* in ``YYFILL`` are illustrated
using *m-tags* to handle repeated submatch and store repeated values efficiently in the form of a prefix tree
(outlined by parsing non-recursive `records and structures <example_13.html>`_ and also used in parsing `HTTP messages <example_11.html>`_).
+ Using `generic API </manual/features/generic_api/generic_api.html>`_,
+ either to override the input mechanism (outlined by `std::ifstream <example_17.html>`_ example),
+ or to tweak it (as explained in
+ `fake sentinel <example_16.html>`_ and
+ `strings in binaries <example_15.html>`_
+ examples).
+
Switching between different lexing *modes* using multiple interrelated sub-lexers:
either in semi-automated manner with re2c `conditions <../manual/features/conditions/conditions.html>`_ feature
(outlined by a simple example of `parsing integers <example_05.html>`_ and also used in parsing `Braille patterns <example_06.html>`_),
--- /dev/null
+#include <fstream>
+
+static void conv(std::ifstream &in, std::ofstream &out)
+{ /* Copy characters from `in` to `out`, writing a single '\n' for every
+   * "\r\n" pair and passing every other character through unchanged.
+   * Input is read directly from the stream through the YY* macros below;
+   * YYFILL is switched off with re2c:yyfill:enable = 0.
+   */
+ std::streampos mar; /* stream position saved by YYBACKUP */
+# define YYCTYPE char
+# define YYPEEK() in.peek()
+# define YYSKIP() do { in.ignore(); if (in.eof()) return; } while(0) /* consume one character; leave conv() once the stream reports EOF */
+# define YYBACKUP() mar = in.tellg() /* backtracking relies on the stream being seekable */
+# define YYRESTORE() in.seekg(mar)
+loop: /* each rule jumps back here to process the next character(s) */
+ /*!re2c
+ re2c:yyfill:enable = 0;
+
+ * { out.put(yych); goto loop; }
+ "\r\n" { out.put('\n'); goto loop; }
+ */
+}
+
+/*
+ * Convert the file named by argv[1] into the file named by argv[2] using conv().
+ * Returns 1 on a wrong argument count, 2 when either stream is in a failed
+ * state after opening, and 0 otherwise.
+ */
+int main(int argc, char **argv)
+{
+    if (argc != 3) {
+        return 1;
+    }
+
+    const char *src_path = argv[1];
+    const char *dst_path = argv[2];
+
+    std::ifstream in(src_path, std::ios::binary);
+    std::ofstream out(dst_path, std::ios::binary);
+
+    /* operator! on a stream reports the same condition as fail(). */
+    if (!in || !out) {
+        return 2;
+    }
+
+    conv(in, out);
+    return 0;
+}