* Input string is padded with ``YYMAXFILL`` characters ``'a'`` (line 15).
Sequence of ``'a'`` does not form a valid lexeme suffix (but padding like ``"\0`` would cause false match on incorrect input like ``"aha``).
* ``YYLIMIT`` points to the end of padding (line 26).
-* ``YYFILL`` simply stops: there's nothing more to lex (line 30).
+* ``YYFILL`` returns an error (line 30): if the input was correct, the lexer should have stopped
+ at the beginning of the padding.
+* The lexer should consume *all* input characters (line 37).
* We have to use ``re2c:define:YYFILL:naked = 1;`` (line 31)
in order to suppress passing parameter to ``YYFILL``.
(It was an unfortunate idea to make ``YYFILL`` a call expression by default:
* Each condition is a standalone lexer (DFA).
* Conditions are interconnected: transitions are allowed between final states of one DFA
and start state of another DFA (but no transitions between inner states of different DFAs).
+ The generated code starts with dispatch on conditions.
* Each condition has a unique identifier: ``/*!types:re2c*/`` directive (line 3)
tells re2c to generate enumeration of them (names are prefixed with ``yyc`` by default).
These identifiers are used in the initial dispatch on conditions:
error :[
error :[
+
+.. Braille patterns (encodings):
+
+Braille patterns (encodings)
+----------------------------
+
+This example is about encoding support in re2c.
+It's a simple decoder from Grade-1 (uncontracted) Unicode English Braille to plain English.
+The input may be encoded in UTF-8, UTF-16, UTF-32 or UCS-2:
+all of these encodings are capable of representing Braille patterns (code points ``[0x2800 - 0x28ff]``).
+We use the ``-r`` option to reuse the same block of re2c rules with different encodings.
+
+So. We have a file `[06_braille.utf8.txt] <examples/06_braille.utf8.txt.html>`_ (encoded in UTF-8) with a message:
+
+.. include:: examples/06_braille.utf8.txt
+
+Let's translate it into UTF-16, UTF-32 or UCS-2:
+
+.. code-block:: bash
+
+ $ iconv -f utf8 -t utf16le 06_braille.utf8.txt > 06_braille.utf16.txt
+ $ iconv -f utf8 -t utf32le 06_braille.utf8.txt > 06_braille.utf32.txt
+ $ iconv -f utf8 -t ucs2 06_braille.utf8.txt > 06_braille.ucs2.txt
+
+Uncontracted Braille is simple (compared to Grade-2 Braille).
+Patterns (mostly) map directly to symbols: alphabet letters, digits and punctuators.
+There are a couple of patterns that don't map to symbols:
+start of numeric mode (⠼), end of numeric mode (⠰), capital letter (⠠) (and some others, which are not covered by this example).
+Ambiguous punctuation patterns are also excluded.
+Grade-2 Braille allows contractions; they obey complex rules (like those of a natural language)
+and are much harder to implement.
+
+`[06_braille.re] <examples/06_braille.re>`_
+
+.. include:: examples/06_braille.re
+ :code: cpp
+ :number-lines:
+
+Notes:
+
+* Reuse mode allows two types of blocks: a single ``/*!rules:re2c ... */`` block (lines 49 - 129)
+ and multiple ``/*!use:re2c ... */`` blocks (lines 140 - 148, 157 - 167, 176 - 186 and 195 - 205).
+ All blocks can have their own configurations, definitions and rules.
+* Conditions are used to emulate transitions between numeric and normal modes (lines 76 and 104).
+* Each encoding has an appropriate code unit type (``YYCTYPE``).
+
+Generate, compile and run:
+
+.. code-block:: bash
+
+ $ re2c -cr8 -o example.cc 06_braille.re
+ $ g++ -o example example.cc
+ $ ./example
+ utf8:
+ All human beings are born free and equal in dignity and rights.
+ They are endowed with reason and conscience and should act towards
+ one another in a spirit of brotherhood.
+
+ utf16:
+ All human beings are born free and equal in dignity and rights.
+ They are endowed with reason and conscience and should act towards
+ one another in a spirit of brotherhood.
+
+ utf32:
+ All human beings are born free and equal in dignity and rights.
+ They are endowed with reason and conscience and should act towards
+ one another in a spirit of brotherhood.
+
+ ucs2:
+ All human beings are born free and equal in dignity and rights.
+ They are endowed with reason and conscience and should act towards
+ one another in a spirit of brotherhood.
+
+
--- /dev/null
+#include <ctype.h>
+#include <stdio.h>
+#include <string.h>
+
+/*!max:re2c*/
+
+template<typename char_t>
+struct input_t { // whole-file input buffer of char_t code units, followed by YYMAXFILL zero units
+    size_t len;  // number of code units actually read from the file (padding excluded)
+    char_t *str; // heap buffer of len + YYMAXFILL code units, owned by this object
+
+    input_t(FILE *f)
+        : len(0)
+        , str(NULL) // the only allocation happens in the body, once the file size is known
+    {
+        fseek(f, 0, SEEK_END);
+        len = ftell(f) / sizeof(char_t); // file size in code units; a trailing partial unit is dropped
+        fseek(f, 0, SEEK_SET);
+        str = new char_t[len + YYMAXFILL];
+        len = fread(str, sizeof(char_t), len, f); // on a short read, keep only the units really read
+        memset(str + len, 0, YYMAXFILL * sizeof(char_t)); // memset counts bytes: zero every padding unit
+    }
+    ~input_t()
+    {
+        delete[]str;
+    }
+};
+
+typedef input_t<unsigned char> iutf8_t; // buffer of unsigned char units, consumed by lex_utf8
+typedef input_t<unsigned short> iutf16_t; // buffer of unsigned short units, consumed by lex_utf16
+typedef input_t<unsigned int> iutf32_t; // buffer of unsigned int units, consumed by lex_utf32
+typedef input_t<unsigned short> iucs2_t; // same underlying type as iutf16_t, consumed by lex_ucs2
+
+struct out_t {
+ bool caps;
+
+ out_t() : caps(false) {}
+ void prt(char c)
+ {
+ printf("%c", caps ? toupper(c) : c);
+ caps = false;
+ }
+ void err()
+ {
+ printf(" ... error\n");
+ }
+};
+
+/*!rules:re2c
+
+ // end of numeric mode indicator (l) and letters
+ l = "\u2830";
+ la = "\u2801"; lb = "\u2803"; lc = "\u2809"; ld = "\u2819"; le = "\u2811";
+ lf = "\u280b"; lg = "\u281b"; lh = "\u2813"; li = "\u280a"; lj = "\u281a";
+ lk = "\u2805"; ll = "\u2807"; lm = "\u280d"; ln = "\u281d"; lo = "\u2815";
+ lp = "\u280f"; lq = "\u281f"; lr = "\u2817"; ls = "\u280e"; lt = "\u281e";
+ lu = "\u2825"; lv = "\u2827"; lw = "\u283a"; lx = "\u282d"; ly = "\u283d";
+ lz = "\u2835";
+
+ // start of numeric mode indicator (n) and digits (same patterns as letters a-j)
+ n = "\u283c";
+ n1 = "\u2801"; n2 = "\u2803"; n3 = "\u2809"; n4 = "\u2819"; n5 = "\u2811";
+ n6 = "\u280b"; n7 = "\u281b"; n8 = "\u2813"; n9 = "\u280a"; n0 = "\u281a";
+
+ // punctuation
+ pcom = "\u2802"; psem = "\u2806"; pcln = "\u2812";
+ pdot = "\u2832"; pxcl = "\u2816"; pqst = "\u2826";
+ past = "\u2814"; pdsh = "\u2804"; phyp = "\u2824";
+
+ // formatting: capital letter indicator (fcp), space (fsp), newline (fnl)
+ fcp = "\u2820"; fsp = "\u2800" | "\x20"; fnl = "\n" | "\n\r";
+
+ <*> * { out.err(); return; }
+ <*> "\x00" { if (YYLIMIT - YYCURSOR != YYMAXFILL - 1) out.err(); return; }
+
+ <*> l :=> l
+ <l> la { out.prt('a'); goto yyc_l; }
+ <l> lb { out.prt('b'); goto yyc_l; }
+ <l> lc { out.prt('c'); goto yyc_l; }
+ <l> ld { out.prt('d'); goto yyc_l; }
+ <l> le { out.prt('e'); goto yyc_l; }
+ <l> lf { out.prt('f'); goto yyc_l; }
+ <l> lg { out.prt('g'); goto yyc_l; }
+ <l> lh { out.prt('h'); goto yyc_l; }
+ <l> li { out.prt('i'); goto yyc_l; }
+ <l> lj { out.prt('j'); goto yyc_l; }
+ <l> lk { out.prt('k'); goto yyc_l; }
+ <l> ll { out.prt('l'); goto yyc_l; }
+ <l> lm { out.prt('m'); goto yyc_l; }
+ <l> ln { out.prt('n'); goto yyc_l; }
+ <l> lo { out.prt('o'); goto yyc_l; }
+ <l> lp { out.prt('p'); goto yyc_l; }
+ <l> lq { out.prt('q'); goto yyc_l; }
+ <l> lr { out.prt('r'); goto yyc_l; }
+ <l> ls { out.prt('s'); goto yyc_l; }
+ <l> lt { out.prt('t'); goto yyc_l; }
+ <l> lu { out.prt('u'); goto yyc_l; }
+ <l> lv { out.prt('v'); goto yyc_l; }
+ <l> lw { out.prt('w'); goto yyc_l; }
+ <l> lx { out.prt('x'); goto yyc_l; }
+ <l> ly { out.prt('y'); goto yyc_l; }
+ <l> lz { out.prt('z'); goto yyc_l; }
+
+ <*> n :=> n
+ <n> n1 { out.prt('1'); goto yyc_n; }
+ <n> n2 { out.prt('2'); goto yyc_n; }
+ <n> n3 { out.prt('3'); goto yyc_n; }
+ <n> n4 { out.prt('4'); goto yyc_n; }
+ <n> n5 { out.prt('5'); goto yyc_n; }
+ <n> n6 { out.prt('6'); goto yyc_n; }
+ <n> n7 { out.prt('7'); goto yyc_n; }
+ <n> n8 { out.prt('8'); goto yyc_n; }
+ <n> n9 { out.prt('9'); goto yyc_n; }
+ <n> n0 { out.prt('0'); goto yyc_n; }
+
+ <*> pcom { out.prt(','); goto yyc_l; }
+ <*> psem { out.prt(';'); goto yyc_l; }
+ <*> pcln { out.prt(':'); goto yyc_l; }
+ <*> pdot { out.prt('.'); goto yyc_l; }
+ <*> pxcl { out.prt('!'); goto yyc_l; }
+ <*> pqst { out.prt('?'); goto yyc_l; }
+ <*> past { out.prt('*'); goto yyc_l; }
+ <*> pdsh { out.prt('\''); goto yyc_l; }
+ <*> phyp { out.prt('-'); goto yyc_l; }
+
+ <*> fcp { out.caps = true; goto yyc_l; }
+ <*> fsp { out.prt(' '); goto yyc_l; }
+ <*> fnl { out.prt('\n'); goto yyc_l; }
+*/
+
+/*!types:re2c*/
+
+static void lex_utf8(const iutf8_t & input) // decodes a UTF-8 buffer with the shared rules, printing via out_t
+{
+ const unsigned char *YYCURSOR = input.str; // current read position, starts at the first code unit
+ const unsigned char *const YYLIMIT = input.str + input.len + YYMAXFILL; // one past the end of the padding
+ const unsigned char *YYMARKER; // NOTE(review): only this lexer declares it; presumably used for backtracking inside multi-byte sequences - confirm
+ int c = yycl; // current condition, read and written through YYGETCONDITION / YYSETCONDITION; starts in condition l
+ out_t out;
+ /*!use:re2c
+ re2c:define:YYCTYPE = "unsigned char";
+ re2c:define:YYFILL = "{ out.err(); return; }";
+ re2c:define:YYFILL:naked = 1;
+ re2c:define:YYGETCONDITION = "c";
+ re2c:define:YYGETCONDITION:naked = 1;
+ re2c:define:YYSETCONDITION = "c = @@;";
+ re2c:define:YYSETCONDITION:naked = 1;
+ */
+}
+
+static void lex_utf16(const iutf16_t & input) // UTF-16 variant: the use block below turns flag 8 off and flag x on
+{
+ const unsigned short *YYCURSOR = input.str; // NOTE(review): units are unsigned short while YYCTYPE below is "unsigned int" - confirm intended
+ const unsigned short *const YYLIMIT = input.str + input.len + YYMAXFILL; // one past the end of the padding
+ int c = yycl; // current condition; starts in condition l
+ out_t out;
+ /*!use:re2c
+ re2c:define:YYCTYPE = "unsigned int";
+ re2c:define:YYFILL = "{ out.err(); return; }";
+ re2c:define:YYFILL:naked = 1;
+ re2c:define:YYGETCONDITION = "c";
+ re2c:define:YYGETCONDITION:naked = 1;
+ re2c:define:YYSETCONDITION = "c = @@;";
+ re2c:define:YYSETCONDITION:naked = 1;
+ re2c:flags:8 = 0;
+ re2c:flags:x = 1;
+ */
+}
+
+static void lex_utf32(const iutf32_t & input) // UTF-32 variant: the use block below turns flag x off and flag u on
+{
+ const unsigned int *YYCURSOR = input.str; // current read position, starts at the first code unit
+ const unsigned int *const YYLIMIT = input.str + input.len + YYMAXFILL; // one past the end of the padding
+ int c = yycl; // current condition; starts in condition l
+ out_t out;
+ /*!use:re2c
+ re2c:define:YYCTYPE = "unsigned int";
+ re2c:define:YYFILL = "{ out.err(); return; }";
+ re2c:define:YYFILL:naked = 1;
+ re2c:define:YYGETCONDITION = "c";
+ re2c:define:YYGETCONDITION:naked = 1;
+ re2c:define:YYSETCONDITION = "c = @@;";
+ re2c:define:YYSETCONDITION:naked = 1;
+ re2c:flags:x = 0;
+ re2c:flags:u = 1;
+ */
+}
+
+static void lex_ucs2(const iucs2_t & input) // UCS-2 variant: the use block below turns flag u off and flag w on
+{
+ const unsigned short *YYCURSOR = input.str; // NOTE(review): units are unsigned short while YYCTYPE below is "unsigned int" - confirm intended
+ const unsigned short *const YYLIMIT = input.str + input.len + YYMAXFILL; // one past the end of the padding
+ int c = yycl; // current condition; starts in condition l
+ out_t out;
+ /*!use:re2c
+ re2c:define:YYCTYPE = "unsigned int";
+ re2c:define:YYFILL = "{ out.err(); return; }";
+ re2c:define:YYFILL:naked = 1;
+ re2c:define:YYGETCONDITION = "c";
+ re2c:define:YYGETCONDITION:naked = 1;
+ re2c:define:YYSETCONDITION = "c = @@;";
+ re2c:define:YYSETCONDITION:naked = 1;
+ re2c:flags:u = 0;
+ re2c:flags:w = 1;
+ */
+}
+
+// Opens `path` in binary mode. If that succeeds, prints `title`, loads the
+// whole file into an input buffer and runs `lex` over it, then closes the
+// file. Files that cannot be opened are skipped silently.
+template<typename input_type>
+static void decode_file(const char *path, const char *title, void (*lex)(const input_type &))
+{
+    FILE *f = fopen(path, "rb");
+    if (!f) return;
+
+    fputs(title, stdout);
+    {
+        input_type input(f);
+        lex(input);
+        fclose(f);
+    } // the input buffer is released here, after the file has been closed
+}
+
+// Decodes each of the four sample files in turn (UTF-8, UTF-16, UTF-32,
+// UCS-2), each preceded by a header naming its encoding. Always returns 0.
+int main()
+{
+    decode_file("06_braille.utf8.txt", "utf8:\n", lex_utf8);
+    decode_file("06_braille.utf16.txt", "utf16:\n", lex_utf16);
+    decode_file("06_braille.utf32.txt", "utf32:\n", lex_utf32);
+    decode_file("06_braille.ucs2.txt", "ucs2:\n", lex_ucs2);
+
+    return 0;
+}