----------------------------
This example is about encoding support in re2c.
-It's a simple decoder from Grade-1 (uncontracted) Unicode English Braille to plain English.
+It's a partial decoder from Grade-1 (uncontracted) Unicode English Braille to plain English.
The input may be encoded in UTF-8, UTF-16, UTF-32 or UCS-2:
all of these encodings are capable of representing Braille patterns (code points ``[0x2800 - 0x28ff]``).
We use the ``-r`` option to reuse the same block of re2c rules with different encodings.
-So. We have a file `[06_braille.utf8.txt] <examples/06_braille.utf8.txt.html>`_ (encoded in UTF-8) with a message:
+So. The hardest part is to get some input.
+Here is a message out of the void:
.. include:: examples/06_braille.utf8.txt
-Let's translate it into UTF-16, UTF-32 or UCS-2:
+It appears to be UTF-8 encoded `[06_braille.utf8.txt] <examples/06_braille.utf8.txt.html>`_.
+Now translate it into UTF-16, UTF-32 or UCS-2:
.. code-block:: bash
$ iconv -f utf8 -t utf32le 06_braille.utf8.txt > 06_braille.utf32.txt
$ iconv -f utf8 -t ucs2 06_braille.utf8.txt > 06_braille.ucs2.txt
-Uncontracted Braille is simple (compared to Grade-2 Braille).
-Patterns (mostly) map directly to symbols: alphabet letters, digits and punctuators.
-There is a couple of patterns that don't map to symbols:
-start of numeric mode (⠼), end of numeric mode (⠰), capital letter (⠠) (and some other, which are not covered by this example).
-Ambiguous punctuation patterns are also excluded.
+And the input is ready.
+
+Grade-1 Braille is quite simple (compared to Grade-2 Braille).
+Patterns map directly to symbols (letters, digits and punctuators) except for a couple of special patterns:
+numeric mode indicator (⠼), letter mode indicator (⠰), capital letter (⠠)
+and some others, which we omit for simplicity (as well as a few ambiguous punctuation patterns).
Grade-2 Braille allows contractions; they obey complex rules (like those of a natural language)
and are much harder to implement.
Notes:
-* Reuse mode allows two types of blocks: a single ``/*!rules:re2c ... */`` block (lines 49 - 129)
- and multiple ``/*!use:re2c ... */`` blocks (lines 140 - 148, 157 - 167 and 176 - 186).
+* Reuse mode is enabled with the ``-r`` option.
+* In reuse mode re2c expects a single ``/*!rules:re2c ... */`` block (line 49)
+ followed by multiple ``/*!use:re2c ... */`` blocks (lines 140, 157 and 176).
All blocks can have their own configurations, definitions and rules.
-* Conditions are used to emulate transitions between numeric and normal modes (lines 76 and 104).
-* Each encoding has an appropriate code unit type (``YYCTYPE``).
+* Encoding can be enabled either with a command-line option or with a configuration.
+* Each encoding needs an appropriate code unit type (``YYCTYPE``).
+* We use conditions to switch between numeric and normal modes (lines 76 and 104).
Generate, compile and run:
one another in a spirit of brotherhood.
+.. _C++98 lexer:
+
+C++98 lexer
+-----------
+
+`[07_c++98.re] <examples/07_c++98.re>`_
+
+.. include:: examples/07_c++98.re
+ :code: cpp
+ :number-lines:
+
+Generate, compile and run:
+
+.. code-block:: bash
+
+ $ re2c -o example.cc 07_c++98.re
+ $ g++ -o example example.cc
+ $ ./example 07_c++98.re | fold
+ STATIC CONST size_t SIZE = 64 * 1024; STRUCT input_t { UNSIGNED CHAR buf[SIZE +
+ YYMAXFILL]; UNSIGNED CHAR *lim; UNSIGNED CHAR *cur; UNSIGNED CHAR *mar; UNSIGNE
+ D CHAR *tok; BOOL eof; FILE *CONST file; input_t(FILE *f) : buf() , lim(buf + SI
+ ZE) , cur(lim) , mar(lim) , tok(lim) , eof(false) , file(f) {} BOOL fill(size_t
+ need) { IF (eof) { RETURN false; } CONST size_t free = tok - buf; IF (free < nee
+ d) { RETURN false; } memmove(buf, tok, lim - tok); lim -= free; cur -= free; mar
+ -= free; tok -= free; lim += fread(lim, 1, free, file); IF (lim < buf + SIZE) {
+ eof = true; memset(lim, 0, YYMAXFILL); lim += YYMAXFILL; } RETURN true; } }; TE
+ MPLATE<INT base> STATIC BOOL adddgt(UNSIGNED LONG &u, UNSIGNED LONG d) { IF (u >
+ (ULONG_MAX - d) / base) { RETURN false; } u = u * base + d; RETURN true; } STAT
+ IC BOOL lex_int_sfx(CONST UNSIGNED CHAR *s, UNSIGNED LONG u) { } STATIC BOOL lex
+ _oct(CONST UNSIGNED CHAR *s, BOOL sfx, UNSIGNED LONG &u) { FOR (u = 0, ++s;;) {
+ } } STATIC BOOL lex_dec(CONST UNSIGNED CHAR *s, BOOL sfx, UNSIGNED LONG &u) { FO
+ R (u = 0;;) { } } STATIC BOOL lex_hex(CONST UNSIGNED CHAR *s, BOOL sfx, UNSIGNED
+ LONG &u) { FOR (u = 0, s += 2;;) { } } STATIC BOOL lex_str(input_t &in, UNSIGNE
+ D CHAR q) { printf("\x25\x63", q); FOR (UNSIGNED LONG u = q;; printf("\x5c\x78\x
+ 25\x6c\x78", u)) { in.tok = in.cur; } printf("\x25\x63", q); RETURN true; } STAT
+ IC BOOL lex_flt(CONST UNSIGNED CHAR *s) { DOUBLE d = 0; DOUBLE x = 1; INT e = 0;
+ mant_int: mant_frac: exp_sign: exp: sfx: end: printf("\x25\x67", d); RETURN tru
+ e; } STATIC BOOL lex(input_t &in) { UNSIGNED LONG u; FOR (;;) { in.tok = in.cur;
+ } } INT main(INT argc, CHAR **argv) { IF (argc != 2) { printf ("\x75\x73\x61\x6
+ 7\x65\x3a\x20\x2e\x2f\x65\x78\x61\x6d\x70\x6c\x65\x20\x3c\x66\x69\x6c\x65\x6e\x6
+ 1\x6d\x65\x3e\xa"); RETURN 1; } FILE *file = fopen(argv[1], "\x72\x62"); IF (!fi
+ le) { printf("\x65\x72\x72\x6f\x72\x3a\x20\x63\x61\x6e\x6e\x6f\x74\x20\x6f\x70\x
+ 65\x6e\x20\x66\x69\x6c\x65\x3a\x20\x25\x73\xa", argv[1]); RETURN 1; } input_t in
+ (file); IF (!lex(in)) { printf("\x2e\x2e\x2e\x20\x65\x72\x72\x6f\x72\xa"); } ELS
+ E { printf("\xa"); } fclose(file); RETURN 0; }
+
+
--- /dev/null
+#include <float.h>
+#include <limits.h>
+#include <stdio.h>
+#include <string.h>
+
+// NOTE(review): YYMAXFILL is used below but not defined anywhere in view;
+// presumably re2c emits its definition in place of this directive -- confirm
+// against the re2c manual.
+/*!max:re2c*/
+// Number of input bytes the buffer holds, not counting the YYMAXFILL padding.
+static const size_t SIZE = 64 * 1024;
+
+// Buffered reader that keeps a sliding window of the input file in memory
+// for the lexers below. All pointer members point into buf.
+struct input_t {
+ // SIZE bytes of data plus room for the YYMAXFILL zero bytes that fill()
+ // appends once the end of the file has been reached.
+ unsigned char buf[SIZE + YYMAXFILL];
+ // One past the last valid byte in buf (used as YYLIMIT by the lexers).
+ unsigned char *lim;
+ // Current read position (used as YYCURSOR by the lexers).
+ unsigned char *cur;
+ // Backtracking position (used as YYMARKER by the lexers).
+ unsigned char *mar;
+ // Start of the token being lexed; bytes before it may be discarded.
+ unsigned char *tok;
+ // Set by fill() after fread() returned fewer bytes than requested.
+ bool eof;
+
+ FILE *const file;
+
+ // All pointers start at buf + SIZE, so the first fill() call sees the
+ // whole buffer as free space and reads up to SIZE bytes.
+ input_t(FILE *f)
+ : buf()
+ , lim(buf + SIZE)
+ , cur(lim)
+ , mar(lim)
+ , tok(lim)
+ , eof(false)
+ , file(f)
+ {}
+ // Discards the bytes before tok and reads more input into the space
+ // that this frees at the end of the buffer.
+ // Returns false if the end of input was already reached, or if fewer
+ // than `need` bytes can be freed; returns true otherwise.
+ bool fill(size_t need)
+ {
+ if (eof) {
+ return false;
+ }
+
+ // Only the bytes in front of the current token can be dropped.
+ const size_t free = tok - buf;
+ if (free < need) {
+ return false;
+ }
+
+ // Move the unconsumed bytes [tok, lim) to the start of buf and shift
+ // every pointer by the same distance.
+ memmove(buf, tok, lim - tok);
+ lim -= free;
+ cur -= free;
+ mar -= free;
+ tok -= free;
+ lim += fread(lim, 1, free, file);
+ // A short read marks the end of input: append YYMAXFILL zero bytes
+ // and include them in the valid range.
+ if (lim < buf + SIZE) {
+ eof = true;
+ memset(lim, 0, YYMAXFILL);
+ lim += YYMAXFILL;
+ }
+ return true;
+ }
+};
+
+// Configures the re2c code unit type (YYCTYPE) as unsigned char.
+/*!re2c re2c:define:YYCTYPE = "unsigned char"; */
+
+// Appends digit d to u in the given base, i.e. u = u * base + d.
+// Returns false and leaves u unchanged if the result would not fit in an
+// unsigned long; returns true after updating u otherwise.
+template<int base>
+static bool adddgt(unsigned long &u, unsigned long d)
+{
+ // Same condition as u * base + d > ULONG_MAX, rearranged so that the
+ // check itself cannot overflow.
+ if (u > (ULONG_MAX - d) / base) {
+ return false;
+ }
+ u = u * base + d;
+ return true;
+}
+
+// Checks that the value u of an integer literal fits the type selected by
+// the suffix found at s (the first character after the digits):
+//   no suffix  - int
+//   u          - unsigned int
+//   l          - long
+//   ul or lu   - unsigned long, which always fits because u has that type
+// Returns true if u is representable in the selected type.
+static bool lex_int_sfx(const unsigned char *s, unsigned long u)
+{
+    /*!re2c
+        re2c:yyfill:enable = 0;
+        re2c:define:YYCURSOR = s;
+        // the limits themselves are representable, hence <= rather than <
+        *           { return u <= INT_MAX; }
+        'u'         { return u <= UINT_MAX; }
+        'l'         { return u <= LONG_MAX; }
+        'ul' | 'lu' { return true; }
+    */
+}
+
+// Reads the octal digits that follow s[0] (s[0] itself is skipped) and
+// accumulates their value in u.
+// Returns false if the value overflows unsigned long. Once the digits end,
+// returns true when sfx is false, and otherwise the verdict of
+// lex_int_sfx() on whatever follows the digits.
+static bool lex_oct(const unsigned char *s, bool sfx, unsigned long &u)
+{
+    for (u = 0, ++s;;) {
+        /*!re2c
+            re2c:yyfill:enable = 0;
+            re2c:define:YYCURSOR = s;
+            [0-7] {
+                if (!adddgt<8>(u, s[-1] - 0x30u)) return false;
+                continue;
+            }
+            "" { return sfx ? lex_int_sfx(s, u) : true; }
+        */
+    }
+}
+
+// Reads the decimal digits starting at s and accumulates their value in u.
+// Returns false if the value overflows unsigned long. Once the digits end,
+// returns true when sfx is false, and otherwise the verdict of
+// lex_int_sfx() on whatever follows the digits.
+static bool lex_dec(const unsigned char *s, bool sfx, unsigned long &u)
+{
+    for (u = 0;;) {
+        /*!re2c
+            re2c:yyfill:enable = 0;
+            re2c:define:YYCURSOR = s;
+            [0-9] {
+                if (!adddgt<10>(u, s[-1] - 0x30u)) return false;
+                continue;
+            }
+            "" { return sfx ? lex_int_sfx(s, u) : true; }
+        */
+    }
+}
+
+// Reads the hexadecimal digits that start two characters after s (the
+// two-character prefix is skipped without being inspected) and accumulates
+// their value in u.
+// Returns false if the value overflows unsigned long. Once the digits end,
+// returns true when sfx is false, and otherwise the verdict of
+// lex_int_sfx() on whatever follows the digits.
+static bool lex_hex(const unsigned char *s, bool sfx, unsigned long &u)
+{
+    for (u = 0, s += 2;;) {
+        /*!re2c
+            re2c:yyfill:enable = 0;
+            re2c:define:YYCURSOR = s;
+            [0-9] {
+                if (!adddgt<16>(u, s[-1] - 0x30u)) return false;
+                continue;
+            }
+            [a-f] {
+                if (!adddgt<16>(u, s[-1] - 0x61u + 10)) return false;
+                continue;
+            }
+            [A-F] {
+                if (!adddgt<16>(u, s[-1] - 0x41u + 10)) return false;
+                continue;
+            }
+            "" { return sfx ? lex_int_sfx(s, u) : true; }
+        */
+    }
+}
+
+// Lexes the body of a character or string literal whose opening quote q has
+// already been consumed by the caller. Prints q, then every character of the
+// literal as a hexadecimal escape, then q again.
+// Returns true when the closing quote is found; returns false on a newline,
+// an unrecognized escape, an overflowing hex escape, or when in.fill() fails.
+static bool lex_str(input_t &in, unsigned char q)
+{
+ printf("%c", q);
+ // The increment expression prints the character decoded by the iteration
+ // that just ended: every `continue` in an action runs it, while the
+ // `break` taken on the closing quote skips it. The initial value of u is
+ // never printed because each action assigns u before continuing.
+ for (unsigned long u = q;; printf("\\x%lx", u)) {
+ in.tok = in.cur;
+ /*!re2c
+ re2c:define:YYCURSOR = in.cur;
+ re2c:define:YYMARKER = in.mar;
+ re2c:define:YYLIMIT = in.lim;
+ re2c:yyfill:enable = 1;
+ re2c:define:YYFILL = "if (!in.fill(@@)) return false;";
+ re2c:define:YYFILL:naked = 1;
+ // anything the rules below do not match ends the literal with an error
+ * { return false; }
+ // ordinary character; a quote equal to q closes the literal
+ [^\n\\] { u = in.tok[0]; if (u == q) break; continue; }
+ "\\a" { u = '\a'; continue; }
+ "\\b" { u = '\b'; continue; }
+ "\\f" { u = '\f'; continue; }
+ "\\n" { u = '\n'; continue; }
+ "\\r" { u = '\r'; continue; }
+ "\\t" { u = '\t'; continue; }
+ "\\v" { u = '\v'; continue; }
+ "\\\\" { u = '\\'; continue; }
+ "\\'" { u = '\''; continue; }
+ "\\\"" { u = '"'; continue; }
+ "\\?" { u = '?'; continue; }
+ // the digit count of these three escapes is bounded by the pattern and
+ // the result of the conversion is not checked
+ "\\" [0-7]{1,3} { lex_oct(in.tok, false, u); continue; }
+ "\\u" [0-9a-fA-F]{4} { lex_hex(in.tok, false, u); continue; }
+ "\\U" [0-9a-fA-F]{8} { lex_hex(in.tok, false, u); continue; }
+ // unbounded number of digits, so the overflow result is checked here
+ "\\x" [0-9a-fA-F]+ { if (!lex_hex(in.tok, false, u)) return false; continue; }
+ */
+ }
+ printf("%c", q);
+ return true;
+}
+
+// Computes the value of the floating-point literal that starts at s and
+// prints it with "%g". The only caller in this file passes text that has
+// already matched the `flt` pattern, and the rules below rely on that.
+// Returns false only when the literal has an f or F suffix and its value
+// exceeds FLT_MAX; returns true otherwise.
+static bool lex_flt(const unsigned char *s)
+{
+ // d: value accumulated so far.
+ double d = 0;
+ // x: weight of the next fractional digit; later reused as the factor
+ // (10 or 0.1) applied once per unit of exponent.
+ double x = 1;
+ // e: magnitude of the decimal exponent.
+ int e = 0;
+ // This block holds configuration only; it applies to the blocks below.
+ /*!re2c
+ re2c:yyfill:enable = 0;
+ re2c:define:YYCURSOR = s;
+ */
+ // Integer part of the mantissa.
+mant_int:
+ /*!re2c
+ "." { goto mant_frac; }
+ [eE] { goto exp_sign; }
+ // every other character is taken to be a decimal digit
+ * { d = (d * 10) + (s[-1] - '0'); goto mant_int; }
+ */
+ // Fractional part of the mantissa.
+mant_frac:
+ /*!re2c
+ "" { goto sfx; }
+ [eE] { goto exp_sign; }
+ [0-9] { d += (x /= 10) * (s[-1] - '0'); goto mant_frac; }
+ */
+ // Optional exponent sign selects the per-step factor.
+exp_sign:
+ /*!re2c
+ "+"? { x = 1e+1; goto exp; }
+ "-" { x = 1e-1; goto exp; }
+ */
+ // Exponent digits; once they end, d is scaled by x exactly e times.
+exp:
+ /*!re2c
+ "" { for (; e > 0; --e) d *= x; goto sfx; }
+ [0-9] { e = (e * 10) + (s[-1] - '0'); goto exp; }
+ */
+ // Suffix: only f and F trigger a range check.
+sfx:
+ /*!re2c
+ * { goto end; }
+ [fF] { if (d > FLT_MAX) return false; goto end; }
+ */
+end:
+ printf("%g", d);
+ return true;
+}
+
+// Lexes the whole input as C++98 source and prints a normalized token stream
+// to stdout:
+//   - preprocessor lines are dropped;
+//   - every run of whitespace and comments becomes a single space;
+//   - character and string literals are re-printed by lex_str();
+//   - integer literals are printed in decimal, floating literals by lex_flt();
+//   - keywords are printed in upper case;
+//   - digraphs and alternative tokens are printed in their primary spelling;
+//   - identifiers are printed unchanged.
+// Returns true when the end of input is reached cleanly, false on a lexing
+// error or when in.fill() cannot provide more input.
+static bool lex(input_t &in)
+{
+    unsigned long u;
+    for (;;) {
+        in.tok = in.cur;
+        /*!re2c
+            re2c:define:YYCURSOR = in.cur;
+            re2c:define:YYMARKER = in.mar;
+            re2c:define:YYLIMIT = in.lim;
+            re2c:yyfill:enable = 1;
+            re2c:define:YYFILL = "if (!in.fill(@@)) return false;";
+            re2c:define:YYFILL:naked = 1;
+
+            end = "\x00";
+
+            *   { return false; }
+            // a NUL byte is the end of input only if it is the first byte
+            // of the zero padding appended by in.fill()
+            end { return in.lim - in.tok == YYMAXFILL; }
+
+            // macros
+            macro = ("#" | "%:") ([^\n] | "\\\n")* "\n";
+            macro { continue; }
+
+            // whitespaces
+            // a run of stars may precede the closing slash, so comments
+            // that end in two or more stars are matched as well
+            mcm = "/*" ([^*] | ("*"+ [^/*]))* "*"+ "/";
+            scm = "//" [^\n]* "\n";
+            wsp = ([ \t\v\n\r] | scm | mcm)+;
+            wsp { printf(" "); continue; }
+
+            // character and string literals
+            "L"? ['"] { if (!lex_str(in, in.cur[-1])) return false; continue; }
+            "L"? "''" { return false; }
+
+            // integer literals
+            int_sfx = 'u' | 'l' | 'ul' | 'lu';
+            oct = "0" [0-7]* int_sfx?;
+            dec = [1-9][0-9]* int_sfx?;
+            hex = '0x' [0-9a-fA-F]+ int_sfx?;
+            oct { if (!lex_oct(in.tok, true, u)) return false; printf("%lu", u); continue; }
+            dec { if (!lex_dec(in.tok, true, u)) return false; printf("%lu", u); continue; }
+            hex { if (!lex_hex(in.tok, true, u)) return false; printf("%lu", u); continue; }
+
+            // floating literals
+            frc = [0-9]* "." [0-9]+ | [0-9]+ ".";
+            exp = 'e' [+-]? [0-9]+;
+            flt = (frc exp? | [0-9]+ exp) [fFlL]?;
+            flt { if (lex_flt(in.tok)) continue; return false; }
+
+            // boolean literals
+            "false" { printf("false"); continue; }
+            "true"  { printf("true"); continue; }
+
+            // keywords
+            "asm"              { printf("ASM"); continue; }
+            "auto"             { printf("AUTO"); continue; }
+            "bool"             { printf("BOOL"); continue; }
+            "break"            { printf("BREAK"); continue; }
+            "case"             { printf("CASE"); continue; }
+            "catch"            { printf("CATCH"); continue; }
+            "char"             { printf("CHAR"); continue; }
+            "class"            { printf("CLASS"); continue; }
+            "const"            { printf("CONST"); continue; }
+            "const_cast"       { printf("CONST_CAST"); continue; }
+            "continue"         { printf("CONTINUE"); continue; }
+            "default"          { printf("DEFAULT"); continue; }
+            "do"               { printf("DO"); continue; }
+            "double"           { printf("DOUBLE"); continue; }
+            "dynamic_cast"     { printf("DYNAMIC_CAST"); continue; }
+            "else"             { printf("ELSE"); continue; }
+            "enum"             { printf("ENUM"); continue; }
+            "explicit"         { printf("EXPLICIT"); continue; }
+            "export"           { printf("EXPORT"); continue; }
+            "extern"           { printf("EXTERN"); continue; }
+            "float"            { printf("FLOAT"); continue; }
+            "for"              { printf("FOR"); continue; }
+            "friend"           { printf("FRIEND"); continue; }
+            "goto"             { printf("GOTO"); continue; }
+            "if"               { printf("IF"); continue; }
+            "inline"           { printf("INLINE"); continue; }
+            "int"              { printf("INT"); continue; }
+            "long"             { printf("LONG"); continue; }
+            "mutable"          { printf("MUTABLE"); continue; }
+            "namespace"        { printf("NAMESPACE"); continue; }
+            "operator"         { printf("OPERATOR"); continue; }
+            "private"          { printf("PRIVATE"); continue; }
+            "protected"        { printf("PROTECTED"); continue; }
+            "public"           { printf("PUBLIC"); continue; }
+            "register"         { printf("REGISTER"); continue; }
+            "reinterpret_cast" { printf("REINTERPRET_CAST"); continue; }
+            "return"           { printf("RETURN"); continue; }
+            "short"            { printf("SHORT"); continue; }
+            "signed"           { printf("SIGNED"); continue; }
+            "sizeof"           { printf("SIZEOF"); continue; }
+            "static"           { printf("STATIC"); continue; }
+            "static_cast"      { printf("STATIC_CAST"); continue; }
+            "struct"           { printf("STRUCT"); continue; }
+            "switch"           { printf("SWITCH"); continue; }
+            "template"         { printf("TEMPLATE"); continue; }
+            "this"             { printf("THIS"); continue; }
+            "throw"            { printf("THROW"); continue; }
+            "try"              { printf("TRY"); continue; }
+            "typedef"          { printf("TYPEDEF"); continue; }
+            "typeid"           { printf("TYPEID"); continue; }
+            "typename"         { printf("TYPENAME"); continue; }
+            "union"            { printf("UNION"); continue; }
+            "unsigned"         { printf("UNSIGNED"); continue; }
+            "using"            { printf("USING"); continue; }
+            "virtual"          { printf("VIRTUAL"); continue; }
+            "void"             { printf("VOID"); continue; }
+            "volatile"         { printf("VOLATILE"); continue; }
+            "wchar_t"          { printf("WCHAR_T"); continue; }
+            "while"            { printf("WHILE"); continue; }
+
+            // operators and punctuation (including preprocessor)
+            ("{" | "<%")       { printf("{"); continue; }
+            ("}" | "%>")       { printf("}"); continue; }
+            ("[" | "<:")       { printf("["); continue; }
+            ("]" | ":>")       { printf("]"); continue; }
+            "("                { printf("("); continue; }
+            ")"                { printf(")"); continue; }
+            ";"                { printf(";"); continue; }
+            ":"                { printf(":"); continue; }
+            "..."              { printf("..."); continue; }
+            "new"              { printf("new"); continue; }
+            "delete"           { printf("delete"); continue; }
+            "?"                { printf("?"); continue; }
+            "::"               { printf("::"); continue; }
+            "."                { printf("."); continue; }
+            ".*"               { printf(".*"); continue; }
+            "+"                { printf("+"); continue; }
+            "-"                { printf("-"); continue; }
+            "*"                { printf("*"); continue; }
+            "/"                { printf("/"); continue; }
+            "%"                { printf("%%"); continue; }
+            ("^" | "xor")      { printf("^"); continue; }
+            ("&" | "bitand")   { printf("&"); continue; }
+            ("|" | "bitor")    { printf("|"); continue; }
+            ("~" | "compl")    { printf("~"); continue; }
+            ("!" | "not")      { printf("!"); continue; }
+            "="                { printf("="); continue; }
+            "<"                { printf("<"); continue; }
+            ">"                { printf(">"); continue; }
+            "+="               { printf("+="); continue; }
+            "-="               { printf("-="); continue; }
+            "*="               { printf("*="); continue; }
+            "/="               { printf("/="); continue; }
+            "%="               { printf("%%="); continue; }
+            ("^=" | "xor_eq")  { printf("^="); continue; }
+            ("&=" | "and_eq")  { printf("&="); continue; }
+            ("|=" | "or_eq")   { printf("|="); continue; }
+            "<<"               { printf("<<"); continue; }
+            ">>"               { printf(">>"); continue; }
+            ">>="              { printf(">>="); continue; }
+            "<<="              { printf("<<="); continue; }
+            "=="               { printf("=="); continue; }
+            ("!=" | "not_eq")  { printf("!="); continue; }
+            "<="               { printf("<="); continue; }
+            ">="               { printf(">="); continue; }
+            ("&&" | "and")     { printf("&&"); continue; }
+            ("||" | "or")      { printf("||"); continue; }
+            "++"               { printf("++"); continue; }
+            "--"               { printf("--"); continue; }
+            ","                { printf(","); continue; }
+            "->*"              { printf("->*"); continue; }
+            "->"               { printf("->"); continue; }
+
+            // identifiers
+            id = [a-zA-Z_][a-zA-Z_0-9]*;
+            // printf requires an int for the precision of a %.*s conversion
+            id { printf("%.*s", static_cast<int>(in.cur - in.tok), in.tok); continue; }
+        */
+    }
+}
+
+// Entry point: lexes the file named by the single command-line argument and
+// prints the resulting token stream to stdout.
+// Returns 1 on a usage error or when the file cannot be opened, 0 otherwise.
+int main(int argc, char **argv)
+{
+ if (argc != 2) {
+ printf ("usage: ./example <filename>\n");
+ return 1;
+ }
+
+ // Binary mode, so that bytes reach the lexer unchanged.
+ FILE *file = fopen(argv[1], "rb");
+ if (!file) {
+ printf("error: cannot open file: %s\n", argv[1]);
+ return 1;
+ }
+
+ input_t in(file);
+ // Either outcome ends the output line: an error marker on failure, a plain
+ // newline on success.
+ if (!lex(in)) {
+ printf("... error\n");
+ } else {
+ printf("\n");
+ }
+
+ // NOTE(review): the exit status is 0 even when lex() failed -- confirm
+ // whether callers are expected to detect failure from the output alone.
+ fclose(file);
+ return 0;
+}