From: Ulya Trofimovich Date: Fri, 30 Oct 2015 15:19:00 +0000 (+0000) Subject: Added example with encodings and reuse mode. X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=168d98f847172c8e645f5debcd3791c8ce1eaf10;p=re2c Added example with encodings and reuse mode. --- diff --git a/src/examples.rst b/src/examples.rst index 1ef1c8c9..a9e7495c 100644 --- a/src/examples.rst +++ b/src/examples.rst @@ -109,7 +109,9 @@ Notes: * Input string is padded with ``YYMAXFILL`` characters ``'a'`` (line 15). Sequence of ``'a'`` does not form a valid lexeme suffix (but padding like ``"\0`` would cause false match on incorrect input like ``"aha``). * ``YYLIMIT`` points to the end of padding (line 26). -* ``YYFILL`` simply stops: there's nothing more to lex (line 30). +* ``YYFILL`` returns an error (line 30): if the input was correct, lexer should have stopped + at the beginning of padding. +* Lexer should consume *all* input characters (line 37). * We have to use ``re2c:define:YYFILL:naked = 1;`` (line 31) in order to suppress passing parameter to ``YYFILL``. (It was an unfortunate idea to make ``YYFILL`` a call expression by default: @@ -312,6 +314,7 @@ Notes: * Each condition is a standalone lexer (DFA). * Conditions are interconnected: transitions are allowed between final states of one DFA and start state of another DFA (but no transitions between inner states of different DFAs). + The generated code starts with dispatch on conditions. * Each condition has a unique identifier: ``/*!types:re2c*/`` directive (line 3) tells re2c to generate enumeration of them (names are prefixed with ``yyc`` by default). These identifiers are used in the initial dispatch on conditions: @@ -342,3 +345,77 @@ Generate, compile and run: error :[ error :[ + +.. Braille patterns (encodings): + +Braille patterns (encodings) +---------------------------- + +This example is about encoding support in re2c. 
+It's a simple decoder from Grade-1 (uncontracted) Unicode English Braille to plain English. +The input may be encoded in UTF-8, UTF-16, UTF-32 or UCS-2: +all of these encodings are capable of representing Braille patterns (code points ``[0x2800 - 0x28ff]``). +We use the ``-r`` option to reuse the same block of re2c rules with different encodings. + +So. We have a file `[06_braille.utf8.txt] `_ (encoded in UTF-8) with a message: + +.. include:: examples/06_braille.utf8.txt + +Let's translate it into UTF-16, UTF-32 or UCS-2: + +.. code-block:: bash + + $ iconv -f utf8 -t utf16le 06_braille.utf8.txt > 06_braille.utf16.txt + $ iconv -f utf8 -t utf32le 06_braille.utf8.txt > 06_braille.utf32.txt + $ iconv -f utf8 -t ucs2 06_braille.utf8.txt > 06_braille.ucs2.txt + +Uncontracted Braille is simple (compared to Grade-2 Braille). +Patterns (mostly) map directly to symbols: alphabet letters, digits and punctuators. +There are a couple of patterns that don't map to symbols: +start of numeric mode (⠼), end of numeric mode (⠰), capital letter (⠠) (and some others, which are not covered by this example). +Ambiguous punctuation patterns are also excluded. +Grade-2 Braille allows contractions; they obey complex rules (like those of a natural language) +and are much harder to implement. + +`[06_braille.re] `_ + +.. include:: examples/06_braille.re + :code: cpp + :number-lines: + +Notes: + +* Reuse mode allows two types of blocks: a single ``/*!rules:re2c ... */`` block (lines 49 - 129) + and multiple ``/*!use:re2c ... */`` blocks (lines 140 - 148, 157 - 167 and 176 - 186). + All blocks can have their own configurations, definitions and rules. +* Conditions are used to emulate transitions between numeric and normal modes (lines 76 and 104). +* Each encoding has an appropriate code unit type (``YYCTYPE``). + +Generate, compile and run: + +.. 
code-block:: bash + + $ re2c -cr8 -o example.cc 06_braille.re + $ g++ -o example example.cc + $ ./example + utf8: + All human beings are born free and equal in dignity and rights. + They are endowed with reason and conscience and should act towards + one another in a spirit of brotherhood. + + utf16: + All human beings are born free and equal in dignity and rights. + They are endowed with reason and conscience and should act towards + one another in a spirit of brotherhood. + + utf32: + All human beings are born free and equal in dignity and rights. + They are endowed with reason and conscience and should act towards + one another in a spirit of brotherhood. + + ucs2: + All human beings are born free and equal in dignity and rights. + They are endowed with reason and conscience and should act towards + one another in a spirit of brotherhood. + + diff --git a/src/examples/02_recognizing_strings.re b/src/examples/02_recognizing_strings.re index 83157319..764543a5 100644 --- a/src/examples/02_recognizing_strings.re +++ b/src/examples/02_recognizing_strings.re @@ -34,7 +34,7 @@ static const char *lex(const input_t & input) str = "'" [^']* "'"; * { return "err"; } - str end { return "str"; } + str end { return YYLIMIT - YYCURSOR == YYMAXFILL ? 
"str" : "err"; } */ } diff --git a/src/examples/06_braille.re b/src/examples/06_braille.re new file mode 100644 index 00000000..9e7a9b87 --- /dev/null +++ b/src/examples/06_braille.re @@ -0,0 +1,245 @@ +#include +#include +#include + +/*!max:re2c*/ + +template +struct input_t { + size_t len; + char_t *str; + + input_t(FILE *f) + : len(0) + , str(new char_t[len + YYMAXFILL]) + { + fseek(f, 0, SEEK_END); + len = ftell(f) / sizeof(char_t); + fseek(f, 0, SEEK_SET); + str = new char_t[len + YYMAXFILL]; + fread(str, sizeof(char_t), len, f); + memset(str + len, 0, YYMAXFILL); + } + ~input_t() + { + delete[]str; + } +}; + +typedef input_t iutf8_t; +typedef input_t iutf16_t; +typedef input_t iutf32_t; +typedef input_t iucs2_t; + +struct out_t { + bool caps; + + out_t() : caps(false) {} + void prt(char c) + { + printf("%c", caps ? toupper(c) : c); + caps = false; + } + void err() + { + printf(" ... error\n"); + } +}; + +/*!rules:re2c + + // letters + l = "\u2830"; + la = "\u2801"; lb = "\u2803"; lc = "\u2809"; ld = "\u2819"; le = "\u2811"; + lf = "\u280b"; lg = "\u281b"; lh = "\u2813"; li = "\u280a"; lj = "\u281a"; + lk = "\u2805"; ll = "\u2807"; lm = "\u280d"; ln = "\u281d"; lo = "\u2815"; + lp = "\u280f"; lq = "\u281f"; lr = "\u2817"; ls = "\u280e"; lt = "\u281e"; + lu = "\u2825"; lv = "\u2827"; lw = "\u283a"; lx = "\u282d"; ly = "\u283d"; + lz = "\u2835"; + + // numbers + n = "\u283c"; + n1 = "\u2801"; n2 = "\u2803"; n3 = "\u2809"; n4 = "\u2819"; n5 = "\u2811"; + n6 = "\u280b"; n7 = "\u281b"; n8 = "\u2813"; n9 = "\u280a"; n0 = "\u281a"; + + // punctuation + pcom = "\u2802"; psem = "\u2806"; pcln = "\u2812"; + pdot = "\u2832"; pxcl = "\u2816"; pqst = "\u2826"; + past = "\u2814"; pdsh = "\u2804"; phyp = "\u2824"; + + // formatting + fcp = "\u2820"; fsp = "\u2800" | "\x20"; fnl = "\n" | "\n\r"; + + <*> * { out.err(); return; } + <*> "\x00" { if (YYLIMIT - YYCURSOR != YYMAXFILL - 1) out.err(); return; } + + <*> l :=> l + la { out.prt('a'); goto yyc_l; } + lb { out.prt('b'); 
goto yyc_l; } + lc { out.prt('c'); goto yyc_l; } + ld { out.prt('d'); goto yyc_l; } + le { out.prt('e'); goto yyc_l; } + lf { out.prt('f'); goto yyc_l; } + lg { out.prt('g'); goto yyc_l; } + lh { out.prt('h'); goto yyc_l; } + li { out.prt('i'); goto yyc_l; } + lj { out.prt('j'); goto yyc_l; } + lk { out.prt('k'); goto yyc_l; } + ll { out.prt('l'); goto yyc_l; } + lm { out.prt('m'); goto yyc_l; } + ln { out.prt('n'); goto yyc_l; } + lo { out.prt('o'); goto yyc_l; } + lp { out.prt('p'); goto yyc_l; } + lq { out.prt('q'); goto yyc_l; } + lr { out.prt('r'); goto yyc_l; } + ls { out.prt('s'); goto yyc_l; } + lt { out.prt('t'); goto yyc_l; } + lu { out.prt('u'); goto yyc_l; } + lv { out.prt('v'); goto yyc_l; } + lw { out.prt('w'); goto yyc_l; } + lx { out.prt('x'); goto yyc_l; } + ly { out.prt('y'); goto yyc_l; } + lz { out.prt('z'); goto yyc_l; } + + <*> n :=> n + n1 { out.prt('1'); goto yyc_n; } + n2 { out.prt('2'); goto yyc_n; } + n3 { out.prt('3'); goto yyc_n; } + n4 { out.prt('4'); goto yyc_n; } + n5 { out.prt('5'); goto yyc_n; } + n6 { out.prt('6'); goto yyc_n; } + n7 { out.prt('7'); goto yyc_n; } + n8 { out.prt('8'); goto yyc_n; } + n9 { out.prt('9'); goto yyc_n; } + n0 { out.prt('0'); goto yyc_n; } + + <*> pcom { out.prt(','); goto yyc_l; } + <*> psem { out.prt(';'); goto yyc_l; } + <*> pcln { out.prt(':'); goto yyc_l; } + <*> pdot { out.prt('.'); goto yyc_l; } + <*> pxcl { out.prt('!'); goto yyc_l; } + <*> pqst { out.prt('?'); goto yyc_l; } + <*> past { out.prt('*'); goto yyc_l; } + <*> pdsh { out.prt('\''); goto yyc_l; } + <*> phyp { out.prt('-'); goto yyc_l; } + + <*> fcp { out.caps = true; goto yyc_l; } + <*> fsp { out.prt(' '); goto yyc_l; } + <*> fnl { out.prt('\n'); goto yyc_l; } +*/ + +/*!types:re2c*/ + +static void lex_utf8(const iutf8_t & input) +{ + const unsigned char *YYCURSOR = input.str; + const unsigned char *const YYLIMIT = input.str + input.len + YYMAXFILL; + const unsigned char *YYMARKER; + int c = yycl; + out_t out; + /*!use:re2c + 
re2c:define:YYCTYPE = "unsigned char"; + re2c:define:YYFILL = "{ out.err(); return; }"; + re2c:define:YYFILL:naked = 1; + re2c:define:YYGETCONDITION = "c"; + re2c:define:YYGETCONDITION:naked = 1; + re2c:define:YYSETCONDITION = "c = @@;"; + re2c:define:YYSETCONDITION:naked = 1; + */ +} + +static void lex_utf16(const iutf16_t & input) +{ + const unsigned short *YYCURSOR = input.str; + const unsigned short *const YYLIMIT = input.str + input.len + YYMAXFILL; + int c = yycl; + out_t out; + /*!use:re2c + re2c:define:YYCTYPE = "unsigned int"; + re2c:define:YYFILL = "{ out.err(); return; }"; + re2c:define:YYFILL:naked = 1; + re2c:define:YYGETCONDITION = "c"; + re2c:define:YYGETCONDITION:naked = 1; + re2c:define:YYSETCONDITION = "c = @@;"; + re2c:define:YYSETCONDITION:naked = 1; + re2c:flags:8 = 0; + re2c:flags:x = 1; + */ +} + +static void lex_utf32(const iutf32_t & input) +{ + const unsigned int *YYCURSOR = input.str; + const unsigned int *const YYLIMIT = input.str + input.len + YYMAXFILL; + int c = yycl; + out_t out; + /*!use:re2c + re2c:define:YYCTYPE = "unsigned int"; + re2c:define:YYFILL = "{ out.err(); return; }"; + re2c:define:YYFILL:naked = 1; + re2c:define:YYGETCONDITION = "c"; + re2c:define:YYGETCONDITION:naked = 1; + re2c:define:YYSETCONDITION = "c = @@;"; + re2c:define:YYSETCONDITION:naked = 1; + re2c:flags:x = 0; + re2c:flags:u = 1; + */ +} + +static void lex_ucs2(const iucs2_t & input) +{ + const unsigned short *YYCURSOR = input.str; + const unsigned short *const YYLIMIT = input.str + input.len + YYMAXFILL; + int c = yycl; + out_t out; + /*!use:re2c + re2c:define:YYCTYPE = "unsigned int"; + re2c:define:YYFILL = "{ out.err(); return; }"; + re2c:define:YYFILL:naked = 1; + re2c:define:YYGETCONDITION = "c"; + re2c:define:YYGETCONDITION:naked = 1; + re2c:define:YYSETCONDITION = "c = @@;"; + re2c:define:YYSETCONDITION:naked = 1; + re2c:flags:u = 0; + re2c:flags:w = 1; + */ +} + +int main() +{ + FILE *f; + + f = fopen("06_braille.utf8.txt", "rb"); + if (f) { + 
printf("utf8:\n"); + iutf8_t input(f); + lex_utf8(input); + fclose(f); + } + + f = fopen("06_braille.utf16.txt", "rb"); + if (f) { + printf("utf16:\n"); + iutf16_t input(f); + lex_utf16(input); + fclose(f); + } + + f = fopen("06_braille.utf32.txt", "rb"); + if (f) { + printf("utf32:\n"); + iutf32_t input(f); + lex_utf32(input); + fclose(f); + } + + f = fopen("06_braille.ucs2.txt", "rb"); + if (f) { + printf("ucs2:\n"); + iucs2_t input(f); + lex_ucs2(input); + fclose(f); + } + + return 0; +} diff --git a/src/examples/06_braille.ucs2.txt b/src/examples/06_braille.ucs2.txt new file mode 100644 index 00000000..b938e17e Binary files /dev/null and b/src/examples/06_braille.ucs2.txt differ diff --git a/src/examples/06_braille.utf16.txt b/src/examples/06_braille.utf16.txt new file mode 100644 index 00000000..b938e17e Binary files /dev/null and b/src/examples/06_braille.utf16.txt differ diff --git a/src/examples/06_braille.utf32.txt b/src/examples/06_braille.utf32.txt new file mode 100644 index 00000000..0105fef0 Binary files /dev/null and b/src/examples/06_braille.utf32.txt differ diff --git a/src/examples/06_braille.utf8.txt b/src/examples/06_braille.utf8.txt new file mode 100644 index 00000000..5ede72d1 --- /dev/null +++ b/src/examples/06_braille.utf8.txt @@ -0,0 +1,4 @@ +⠠⠁⠇⠇⠀⠓⠥⠍⠁⠝⠀⠃⠑⠊⠝⠛⠎⠀⠁⠗⠑⠀⠃⠕⠗⠝⠀⠋⠗⠑⠑⠀⠁⠝⠙⠀⠑⠟⠥⠁⠇⠀⠊⠝⠀⠙⠊⠛⠝⠊⠞⠽⠀⠁⠝⠙⠀⠗⠊⠛⠓⠞⠎⠲⠀ +⠠⠞⠓⠑⠽⠀⠁⠗⠑⠀⠑⠝⠙⠕⠺⠑⠙⠀⠺⠊⠞⠓⠀⠗⠑⠁⠎⠕⠝⠀⠁⠝⠙⠀⠉⠕⠝⠎⠉⠊⠑⠝⠉⠑⠀⠁⠝⠙⠀⠎⠓⠕⠥⠇⠙⠀⠁⠉⠞⠀⠞⠕⠺⠁⠗⠙⠎⠀ +⠕⠝⠑⠀⠁⠝⠕⠞⠓⠑⠗⠀⠊⠝⠀⠁⠀⠎⠏⠊⠗⠊⠞⠀⠕⠋⠀⠃⠗⠕⠞⠓⠑⠗⠓⠕⠕⠙⠲ + diff --git a/src/examples/06_braille.utf8.txt.rst b/src/examples/06_braille.utf8.txt.rst new file mode 100644 index 00000000..5ede72d1 --- /dev/null +++ b/src/examples/06_braille.utf8.txt.rst @@ -0,0 +1,4 @@ +⠠⠁⠇⠇⠀⠓⠥⠍⠁⠝⠀⠃⠑⠊⠝⠛⠎⠀⠁⠗⠑⠀⠃⠕⠗⠝⠀⠋⠗⠑⠑⠀⠁⠝⠙⠀⠑⠟⠥⠁⠇⠀⠊⠝⠀⠙⠊⠛⠝⠊⠞⠽⠀⠁⠝⠙⠀⠗⠊⠛⠓⠞⠎⠲⠀ +⠠⠞⠓⠑⠽⠀⠁⠗⠑⠀⠑⠝⠙⠕⠺⠑⠙⠀⠺⠊⠞⠓⠀⠗⠑⠁⠎⠕⠝⠀⠁⠝⠙⠀⠉⠕⠝⠎⠉⠊⠑⠝⠉⠑⠀⠁⠝⠙⠀⠎⠓⠕⠥⠇⠙⠀⠁⠉⠞⠀⠞⠕⠺⠁⠗⠙⠎⠀ +⠕⠝⠑⠀⠁⠝⠕⠞⠓⠑⠗⠀⠊⠝⠀⠁⠀⠎⠏⠊⠗⠊⠞⠀⠕⠋⠀⠃⠗⠕⠞⠓⠑⠗⠓⠕⠕⠙⠲ +