]> granicus.if.org Git - re2c/commitdiff
Added final example: C++98 lexer.
authorUlya Trofimovich <skvadrik@gmail.com>
Mon, 2 Nov 2015 13:31:43 +0000 (13:31 +0000)
committerUlya Trofimovich <skvadrik@gmail.com>
Mon, 2 Nov 2015 13:31:43 +0000 (13:31 +0000)
src/examples.rst
src/examples/07_c++98.re [new file with mode: 0644]

index a9e7495cabed7e24b2649be9c5d204e3c255bb65..636043cee1decf700e13472529f0dbe6ac34710f 100644 (file)
@@ -352,16 +352,18 @@ Braille patterns (encodings)
 ----------------------------
 
 This example is about encoding support in re2c.
-It's a simple decoder from Grade-1 (uncontracted) Unicode English Braille to plain English.
+It's a partial decoder from Grade-1 (uncontracted) Unicode English Braille to plain English.
 The input may be encoded in UTF-8, UTF-16, UTF-32 or UCS-2:
 all of these encodings are capable of representing Braille patterns (code points ``[0x2800 - 0x28ff]``).
 We use ``-r`` option to reuse the same block of re2c rules with different encodings.
 
-So. We have a file `[06_braille.utf8.txt] <examples/06_braille.utf8.txt.html>`_ (encoded in UTF-8) with a message:
+So. The hardest part is to get some input.
+Here is a message out of the void:
 
 .. include:: examples/06_braille.utf8.txt
 
-Let's translate it into UTF-16, UTF-32 or UCS-2:
+It appears to be UTF-8 encoded `[06_braille.utf8.txt] <examples/06_braille.utf8.txt.html>`_.
+Now translate it into UTF-16, UTF-32 or UCS-2:
 
 .. code-block:: bash
 
@@ -369,11 +371,12 @@ Let's translate it into UTF-16, UTF-32 or UCS-2:
     $ iconv -f utf8 -t utf32le 06_braille.utf8.txt > 06_braille.utf32.txt
     $ iconv -f utf8 -t ucs2 06_braille.utf8.txt > 06_braille.ucs2.txt
 
-Uncontracted Braille is simple (compared to Grade-2 Braille).
-Patterns (mostly) map directly to symbols: alphabet letters, digits and punctuators.
-There is a couple of patterns that don't map to symbols:
-start of numeric mode (⠼), end of numeric mode (⠰), capital letter (⠠) (and some other, which are not covered by this example).
-Ambiguous punctuation patterns are also excluded.
+And the input is ready.
+
+Grade-1 Braille is quite simple (compared to Grade-2 Braille).
+Patterns map directly to symbols (letters, digits and punctuators) except for a couple of special patterns:
+numeric mode indicator (⠼), letter mode indicator (⠰), capital letter (⠠)
+and some others, which we omit for simplicity (as well as a few ambiguous punctuation patterns).
 Grade-2 Braille allows contractions; they obey complex rules (like those of a natural language)
 and are much harder to implement.
 
@@ -385,11 +388,13 @@ and are much harder to implement.
 
 Notes:
 
-* Reuse mode allows two types of blocks: a single ``/*!rules:re2c ... */`` block (lines 49 - 129)
-  and multiple ``/*!use:re2c ... */`` blocks (lines 140 - 148, 157 - 167 and 176 - 186).
+* Reuse mode is enabled with the ``-r`` option.
+* In reuse mode re2c expects a single ``/*!rules:re2c ... */`` block (line 49)
+  followed by multiple ``/*!use:re2c ... */`` blocks (lines 140, 157 and 176).
   All blocks can have their own configurations, definitions and rules.
-* Conditions are used to emulate transitions between numeric and normal modes (lines 76 and 104).
-* Each encoding has an appropriate code unit type (``YYCTYPE``).
+* An encoding can be enabled either with a command-line option or with a configuration.
+* Each encoding needs an appropriate code unit type (``YYCTYPE``).
+* We use conditions to switch between numeric and normal modes (lines 76 and 104).
 
 Generate, compile and run:
 
@@ -419,3 +424,50 @@ Generate, compile and run:
     one another in a spirit of brotherhood.
 
 
+.. C++98 lexer:
+
+C++98 lexer
+-----------
+
+`[07_c++98.re] <examples/07_c++98.re>`_
+
+.. include:: examples/07_c++98.re
+    :code: cpp
+    :number-lines:
+
+Generate, compile and run:
+
+.. code-block:: bash
+
+    $ re2c -o example.cc 07_c++98.re
+    $ g++ -o example example.cc
+    $ ./example 07_c++98.re | fold
+     STATIC CONST size_t SIZE = 64 * 1024; STRUCT input_t { UNSIGNED CHAR buf[SIZE +
+     YYMAXFILL]; UNSIGNED CHAR *lim; UNSIGNED CHAR *cur; UNSIGNED CHAR *mar; UNSIGNE
+    D CHAR *tok; BOOL eof; FILE *CONST file; input_t(FILE *f) : buf() , lim(buf + SI
+    ZE) , cur(lim) , mar(lim) , tok(lim) , eof(false) , file(f) {} BOOL fill(size_t 
+    need) { IF (eof) { RETURN false; } CONST size_t free = tok - buf; IF (free < nee
+    d) { RETURN false; } memmove(buf, tok, lim - tok); lim -= free; cur -= free; mar
+     -= free; tok -= free; lim += fread(lim, 1, free, file); IF (lim < buf + SIZE) {
+     eof = true; memset(lim, 0, YYMAXFILL); lim += YYMAXFILL; } RETURN true; } }; TE
+    MPLATE<INT base> STATIC BOOL adddgt(UNSIGNED LONG &u, UNSIGNED LONG d) { IF (u >
+     (ULONG_MAX - d) / base) { RETURN false; } u = u * base + d; RETURN true; } STAT
+    IC BOOL lex_int_sfx(CONST UNSIGNED CHAR *s, UNSIGNED LONG u) { } STATIC BOOL lex
+    _oct(CONST UNSIGNED CHAR *s, BOOL sfx, UNSIGNED LONG &u) { FOR (u = 0, ++s;;) { 
+    } } STATIC BOOL lex_dec(CONST UNSIGNED CHAR *s, BOOL sfx, UNSIGNED LONG &u) { FO
+    R (u = 0;;) { } } STATIC BOOL lex_hex(CONST UNSIGNED CHAR *s, BOOL sfx, UNSIGNED
+     LONG &u) { FOR (u = 0, s += 2;;) { } } STATIC BOOL lex_str(input_t &in, UNSIGNE
+    D CHAR q) { printf("\x25\x63", q); FOR (UNSIGNED LONG u = q;; printf("\x5c\x78\x
+    25\x6c\x78", u)) { in.tok = in.cur; } printf("\x25\x63", q); RETURN true; } STAT
+    IC BOOL lex_flt(CONST UNSIGNED CHAR *s) { DOUBLE d = 0; DOUBLE x = 1; INT e = 0;
+     mant_int: mant_frac: exp_sign: exp: sfx: end: printf("\x25\x67", d); RETURN tru
+    e; } STATIC BOOL lex(input_t &in) { UNSIGNED LONG u; FOR (;;) { in.tok = in.cur;
+     } } INT main(INT argc, CHAR **argv) { IF (argc != 2) { printf ("\x75\x73\x61\x6
+    7\x65\x3a\x20\x2e\x2f\x65\x78\x61\x6d\x70\x6c\x65\x20\x3c\x66\x69\x6c\x65\x6e\x6
+    1\x6d\x65\x3e\xa"); RETURN 1; } FILE *file = fopen(argv[1], "\x72\x62"); IF (!fi
+    le) { printf("\x65\x72\x72\x6f\x72\x3a\x20\x63\x61\x6e\x6e\x6f\x74\x20\x6f\x70\x
+    65\x6e\x20\x66\x69\x6c\x65\x3a\x20\x25\x73\xa", argv[1]); RETURN 1; } input_t in
+    (file); IF (!lex(in)) { printf("\x2e\x2e\x2e\x20\x65\x72\x72\x6f\x72\xa"); } ELS
+    E { printf("\xa"); } fclose(file); RETURN 0; }
+
+
diff --git a/src/examples/07_c++98.re b/src/examples/07_c++98.re
new file mode 100644 (file)
index 0000000..6b79f01
--- /dev/null
@@ -0,0 +1,386 @@
+#include <float.h>
+#include <limits.h>
+#include <stdio.h>
+#include <string.h>
+
+/*!max:re2c*/
+static const size_t SIZE = 64 * 1024; // capacity of the refillable data area of input_t::buf
+
+struct input_t { // buffered reader over a FILE; fill() tops the buffer up on demand
+    unsigned char buf[SIZE + YYMAXFILL]; // SIZE data bytes plus room for YYMAXFILL zero bytes written at end of input
+    unsigned char *lim; // one past the last byte in buf that may be scanned (used as YYLIMIT)
+    unsigned char *cur; // current scan position (used as YYCURSOR)
+    unsigned char *mar; // backtrack position (used as YYMARKER)
+    unsigned char *tok; // start of the token being scanned; bytes before it may be discarded
+    bool eof; // set by fill() once a read returns fewer bytes than requested
+
+    FILE *const file; // input stream; this struct never closes it
+
+    input_t(FILE *f)
+        : buf()
+        , lim(buf + SIZE) // every pointer starts at the end of the data area, i.e. the buffer holds no data yet
+        , cur(lim)
+        , mar(lim)
+        , tok(lim)
+        , eof(false)
+        , file(f)
+    {}
+    bool fill(size_t need) // discards bytes before tok and reads more input; false if `need` bytes cannot be made available
+    {
+        if (eof) { // the file has already been read to its end
+            return false;
+        }
+
+        const size_t free = tok - buf; // number of already consumed bytes in front of the current token
+        if (free < need) { // the current token occupies too much of the buffer
+            return false;
+        }
+
+        memmove(buf, tok, lim - tok); // move the unconsumed tail to the start of the buffer
+        lim -= free; // shift all pointers by the same distance
+        cur -= free;
+        mar -= free;
+        tok -= free;
+        lim += fread(lim, 1, free, file); // read into the space that was just reclaimed
+        if (lim < buf + SIZE) { // short read: the data area could not be filled completely
+            eof = true;
+            memset(lim, 0, YYMAXFILL); // pad with YYMAXFILL zero bytes; lex() treats a zero byte as its end rule
+            lim += YYMAXFILL;
+        }
+        return true;
+    }
+};
+
+/*!re2c re2c:define:YYCTYPE = "unsigned char"; */
+
+template<int base> // base: radix of the number being accumulated
+static bool adddgt(unsigned long &u, unsigned long d) // appends digit d to u, i.e. u = u * base + d
+{
+    if (u > (ULONG_MAX - d) / base) { // u * base + d would not fit in unsigned long
+        return false; // u is left unchanged in this case
+    }
+    u = u * base + d;
+    return true;
+}
+
+static bool lex_int_sfx(const unsigned char *s, unsigned long u) // true if value u fits the type selected by the integer suffix at s
+{
+    /*!re2c
+        re2c:yyfill:enable = 0;
+        re2c:define:YYCURSOR = s;
+        *           { return u <= INT_MAX; }  // no suffix: must fit in int (upper bound is inclusive)
+        'u'         { return u <= UINT_MAX; } // unsigned suffix, either case
+        'l'         { return u <= LONG_MAX; } // long suffix, either case
+        'ul' | 'lu' { return true; }          // unsigned long: u is already an unsigned long
+    */
+}
+
+static bool lex_oct(const unsigned char *s, bool sfx, unsigned long &u) // parses octal digits into u; false on overflow or (if sfx) on a failed suffix check
+{
+    for (u = 0, ++s;;) { // ++s skips the one-character prefix: the leading zero of a literal or the backslash of an escape
+    /*!re2c
+        re2c:yyfill:enable = 0;
+        re2c:define:YYCURSOR = s;
+        [0-7] { if (adddgt<8>(u, s[-1] - 0x30u)) continue; return false; } // s[-1] is the digit just consumed, 0x30 is the code of '0'
+        ""    { return !sfx || lex_int_sfx(s, u); } // no further digit: validate the suffix only when asked to
+    */
+    }
+}
+
+static bool lex_dec(const unsigned char *s, bool sfx, unsigned long &u) // parses decimal digits into u; false on overflow or (if sfx) on a failed suffix check
+{
+    for (u = 0;;) { // decimal literals have no prefix, so scanning starts at s itself
+    /*!re2c
+        re2c:yyfill:enable = 0;
+        re2c:define:YYCURSOR = s;
+        [0-9] { if (adddgt<10>(u, s[-1] - 0x30u)) continue; return false; } // s[-1] is the digit just consumed, 0x30 is the code of '0'
+        ""    { return !sfx || lex_int_sfx(s, u); } // no further digit: validate the suffix only when asked to
+    */
+    }
+}
+
+static bool lex_hex(const unsigned char *s, bool sfx, unsigned long &u) // parses hexadecimal digits into u; false on overflow or (if sfx) on a failed suffix check
+{
+    for (u = 0, s += 2;;) { // s += 2 skips the two-character prefix (the 0x of a literal, or backslash plus x, u or U in an escape)
+    /*!re2c
+        re2c:yyfill:enable = 0;
+        re2c:define:YYCURSOR = s;
+        [0-9] { if (adddgt<16>(u, s[-1] - 0x30u))      continue; return false; } // 0x30 is the code of '0'
+        [a-f] { if (adddgt<16>(u, s[-1] - 0x61u + 10)) continue; return false; } // 0x61 is the code of 'a'
+        [A-F] { if (adddgt<16>(u, s[-1] - 0x41u + 10)) continue; return false; } // 0x41 is the code of 'A'
+        ""    { return !sfx || lex_int_sfx(s, u); } // no further digit: validate the suffix only when asked to
+    */
+    }
+}
+
+static bool lex_str(input_t &in, unsigned char q) // scans a character or string literal body up to the closing quote q, echoing it to stdout
+{
+    printf("%c", q); // echo the opening quote
+    for (unsigned long u = q;; printf("\\x%lx", u)) { // the increment runs after every `continue` and prints the decoded character as a hex escape
+        in.tok = in.cur;
+        /*!re2c
+            re2c:define:YYCURSOR = in.cur;
+            re2c:define:YYMARKER = in.mar;
+            re2c:define:YYLIMIT = in.lim;
+            re2c:yyfill:enable = 1;
+            re2c:define:YYFILL = "if (!in.fill(@@)) return false;";
+            re2c:define:YYFILL:naked = 1;
+            *                    { return false; } // anything not matched below, e.g. a raw newline
+            [^\n\\]              { u = in.tok[0]; if (u == q) break; continue; } // ordinary character; the closing quote leaves the loop without being printed in hex
+            "\\a"                { u = '\a'; continue; }
+            "\\b"                { u = '\b'; continue; }
+            "\\f"                { u = '\f'; continue; }
+            "\\n"                { u = '\n'; continue; }
+            "\\r"                { u = '\r'; continue; }
+            "\\t"                { u = '\t'; continue; }
+            "\\v"                { u = '\v'; continue; }
+            "\\\\"               { u = '\\'; continue; }
+            "\\'"                { u = '\''; continue; }
+            "\\\""               { u = '"';  continue; }
+            "\\?"                { u = '?';  continue; }
+            "\\" [0-7]{1,3}      { lex_oct(in.tok, false, u); continue; } // result ignored: at most three octal digits are parsed
+            "\\u" [0-9a-fA-F]{4} { lex_hex(in.tok, false, u); continue; } // result ignored: exactly four hex digits are parsed
+            "\\U" [0-9a-fA-F]{8} { lex_hex(in.tok, false, u); continue; } // result ignored: exactly eight hex digits are parsed
+            "\\x" [0-9a-fA-F]+   { if (!lex_hex(in.tok, false, u)) return false; continue; } // digit count is unbounded here, so the overflow result is checked
+        */
+    }
+    printf("%c", q); // echo the closing quote
+    return true;
+}
+
+static bool lex_flt(const unsigned char *s) // converts the floating literal at s to a double and prints it with %g
+{
+    double d = 0; // accumulated value
+    double x = 1; // scale of the next fraction digit; later reused as the per-step exponent factor
+    int e = 0; // absolute value of the decimal exponent
+    /*!re2c
+        re2c:yyfill:enable = 0;
+        re2c:define:YYCURSOR = s;
+    */
+mant_int: // integer part of the mantissa
+    /*!re2c
+        "."   { goto mant_frac; }
+        [eE]  { goto exp_sign; }
+        *     { d = (d * 10) + (s[-1] - '0'); goto mant_int; } // treats any other character as a digit; assumes the caller matched a valid literal
+    */
+mant_frac: // fractional part of the mantissa
+    /*!re2c
+        ""    { goto sfx; }
+        [eE]  { goto exp_sign; }
+        [0-9] { d += (x /= 10) * (s[-1] - '0'); goto mant_frac; } // x shrinks by a factor of ten per digit
+    */
+exp_sign: // optional sign of the exponent selects the factor applied per exponent step
+    /*!re2c
+        "+"?  { x = 1e+1; goto exp; }
+        "-"   { x = 1e-1; goto exp; }
+    */
+exp: // exponent digits
+    /*!re2c
+        ""    { for (; e > 0; --e) d *= x;    goto sfx; } // apply the exponent by repeated multiplication
+        [0-9] { e = (e * 10) + (s[-1] - '0'); goto exp; }
+    */
+sfx: // optional type suffix
+    /*!re2c
+        *     { goto end; } // any other character: no range check is made
+        [fF]  { if (d > FLT_MAX) return false; goto end; } // float suffix: the value must not exceed FLT_MAX
+    */
+end:
+    printf("%g", d);
+    return true;
+}
+
+static bool lex(input_t &in) // tokenizes the whole input, echoing a normalized form of every token to stdout
+{ // returns true only if the end of input is reached cleanly; false on any lexing error or refill failure
+    unsigned long u; // receives the value of each integer literal
+    for (;;) {
+        in.tok = in.cur;
+        /*!re2c
+            re2c:define:YYCURSOR = in.cur;
+            re2c:define:YYMARKER = in.mar;
+            re2c:define:YYLIMIT = in.lim;
+            re2c:yyfill:enable = 1;
+            re2c:define:YYFILL = "if (!in.fill(@@)) return false;";
+            re2c:define:YYFILL:naked = 1;
+
+            end = "\x00";
+
+            *   { return false; } // a character that starts no known token
+            end { return in.lim - in.tok == YYMAXFILL; } // zero byte: success only if it is the first byte of the end-of-input padding
+
+            // macros
+            macro = ("#" | "%:") ([^\n] | "\\\n")* "\n";
+            macro { continue; }
+
+            // whitespaces (comments included; a block comment may end with any run of stars before the slash)
+            mcm = "/*" ([^*] | ("*"+ [^/*]))* "*"+ "/";
+            scm = "//" [^\n]* "\n";
+            wsp = ([ \t\v\n\r] | scm | mcm)+;
+            wsp { printf(" "); continue; }
+
+            // character and string literals
+            "L"? ['"] { if (!lex_str(in, in.cur[-1])) return false; continue; } // in.cur[-1] is the opening quote
+            "L"? "''" { return false; } // empty character literal is an error
+
+            // integer literals
+            int_sfx = 'u' | 'l' | 'ul' | 'lu';
+            oct = "0" [0-7]*        int_sfx?;
+            dec = [1-9][0-9]*       int_sfx?;
+            hex = '0x' [0-9a-fA-F]+ int_sfx?;
+            oct { if (!lex_oct(in.tok, true, u)) return false; printf("%lu", u); continue; }
+            dec { if (!lex_dec(in.tok, true, u)) return false; printf("%lu", u); continue; }
+            hex { if (!lex_hex(in.tok, true, u)) return false; printf("%lu", u); continue; }
+
+            // floating literals
+            frc = [0-9]* "." [0-9]+ | [0-9]+ ".";
+            exp = 'e' [+-]? [0-9]+;
+            flt = (frc exp? | [0-9]+ exp) [fFlL]?;
+            flt { if (lex_flt(in.tok)) continue; return false; }
+
+            // boolean literals
+            "false" { printf("false"); continue; }
+            "true"  { printf("true");  continue; }
+
+            // keywords
+            "asm"              { printf("ASM");              continue; }
+            "auto"             { printf("AUTO");             continue; }
+            "bool"             { printf("BOOL");             continue; }
+            "break"            { printf("BREAK");            continue; }
+            "case"             { printf("CASE");             continue; }
+            "catch"            { printf("CATCH");            continue; }
+            "char"             { printf("CHAR");             continue; }
+            "class"            { printf("CLASS");            continue; }
+            "const"            { printf("CONST");            continue; }
+            "const_cast"       { printf("CONST_CAST");       continue; }
+            "continue"         { printf("CONTINUE");         continue; }
+            "default"          { printf("DEFAULT");          continue; }
+            "do"               { printf("DO");               continue; }
+            "double"           { printf("DOUBLE");           continue; }
+            "dynamic_cast"     { printf("DYNAMIC_CAST");     continue; }
+            "else"             { printf("ELSE");             continue; }
+            "enum"             { printf("ENUM");             continue; }
+            "explicit"         { printf("EXPLICIT");         continue; }
+            "export"           { printf("EXPORT");           continue; }
+            "extern"           { printf("EXTERN");           continue; }
+            "float"            { printf("FLOAT");            continue; }
+            "for"              { printf("FOR");              continue; }
+            "friend"           { printf("FRIEND");           continue; }
+            "goto"             { printf("GOTO");             continue; }
+            "if"               { printf("IF");               continue; }
+            "inline"           { printf("INLINE");           continue; }
+            "int"              { printf("INT");              continue; }
+            "long"             { printf("LONG");             continue; }
+            "mutable"          { printf("MUTABLE");          continue; }
+            "namespace"        { printf("NAMESPACE");        continue; }
+            "operator"         { printf("OPERATOR");         continue; }
+            "private"          { printf("PRIVATE");          continue; }
+            "protected"        { printf("PROTECTED");        continue; }
+            "public"           { printf("PUBLIC");           continue; }
+            "register"         { printf("REGISTER");         continue; }
+            "reinterpret_cast" { printf("REINTERPRET_CAST"); continue; }
+            "return"           { printf("RETURN");           continue; }
+            "short"            { printf("SHORT");            continue; }
+            "signed"           { printf("SIGNED");           continue; }
+            "sizeof"           { printf("SIZEOF");           continue; }
+            "static"           { printf("STATIC");           continue; }
+            "static_cast"      { printf("STATIC_CAST");      continue; }
+            "struct"           { printf("STRUCT");           continue; }
+            "switch"           { printf("SWITCH");           continue; }
+            "template"         { printf("TEMPLATE");         continue; }
+            "this"             { printf("THIS");             continue; }
+            "throw"            { printf("THROW");            continue; }
+            "try"              { printf("TRY");              continue; }
+            "typedef"          { printf("TYPEDEF");          continue; }
+            "typeid"           { printf("TYPEID");           continue; }
+            "typename"         { printf("TYPENAME");         continue; }
+            "union"            { printf("UNION");            continue; }
+            "unsigned"         { printf("UNSIGNED");         continue; }
+            "using"            { printf("USING");            continue; }
+            "virtual"          { printf("VIRTUAL");          continue; }
+            "void"             { printf("VOID");             continue; }
+            "volatile"         { printf("VOLATILE");         continue; }
+            "wchar_t"          { printf("WCHAR_T");          continue; }
+            "while"            { printf("WHILE");            continue; }
+
+            // operators and punctuation (including preprocessor)
+            ("{" | "<%")      { printf("{");      continue; }
+            ("}" | "%>")      { printf("}");      continue; }
+            ("[" | "<:")      { printf("[");      continue; }
+            ("]" | ":>")      { printf("]");      continue; }
+            "("               { printf("(");      continue; }
+            ")"               { printf(")");      continue; }
+            ";"               { printf(";");      continue; }
+            ":"               { printf(":");      continue; }
+            "..."             { printf("...");    continue; }
+            "new"             { printf("new");    continue; }
+            "delete"          { printf("delete"); continue; }
+            "?"               { printf("?");      continue; }
+            "::"              { printf("::");     continue; }
+            "."               { printf(".");      continue; }
+            ".*"              { printf(".*");     continue; }
+            "+"               { printf("+");      continue; }
+            "-"               { printf("-");      continue; }
+            "*"               { printf("*");      continue; }
+            "/"               { printf("/");      continue; }
+            "%"               { printf("%%");     continue; }
+            ("^" | "xor")     { printf("^");      continue; }
+            ("&" | "bitand")  { printf("&");      continue; }
+            ("|" | "bitor")   { printf("|");      continue; }
+            ("~" | "compl")   { printf("~");      continue; }
+            ("!" | "not")     { printf("!");      continue; }
+            "="               { printf("=");      continue; }
+            "<"               { printf("<");      continue; }
+            ">"               { printf(">");      continue; }
+            "+="              { printf("+=");     continue; }
+            "-="              { printf("-=");     continue; }
+            "*="              { printf("*=");     continue; }
+            "/="              { printf("/=");     continue; }
+            "%="              { printf("%%=");    continue; }
+            ("^=" | "xor_eq") { printf("^=");     continue; }
+            ("&=" | "and_eq") { printf("&=");     continue; }
+            ("|=" | "or_eq")  { printf("|=");     continue; }
+            "<<"              { printf("<<");     continue; }
+            ">>"              { printf(">>");     continue; }
+            ">>="             { printf(">>=");    continue; }
+            "<<="             { printf("<<=");    continue; }
+            "=="              { printf("==");     continue; }
+            ("!=" | "not_eq") { printf("!=");     continue; }
+            "<="              { printf("<=");     continue; }
+            ">="              { printf(">=");     continue; }
+            ("&&" | "and")    { printf("&&");     continue; }
+            ("||" | "or")     { printf("||");     continue; }
+            "++"              { printf("++");     continue; }
+            "--"              { printf("--");     continue; }
+            ","               { printf(",");      continue; }
+            "->*"             { printf("->*");    continue; }
+            "->"              { printf("->");     continue; }
+
+            // identifiers (the precision argument of %.*s must be an int, hence the cast)
+            id = [a-zA-Z_][a-zA-Z_0-9]*;
+            id { printf("%.*s", static_cast<int>(in.cur - in.tok), in.tok); continue; }
+        */
+    }
+}
+
+int main(int argc, char **argv) // lexes the file named by the single command-line argument
+{
+    if (argc != 2) { // exactly one argument, the input file name, is expected
+        printf ("usage: ./example <filename>\n");
+        return 1;
+    }
+
+    FILE *file = fopen(argv[1], "rb"); // binary mode: bytes are passed to the lexer unmodified
+    if (!file) {
+        printf("error: cannot open file: %s\n", argv[1]);
+        return 1;
+    }
+
+    input_t in(file);
+    if (!lex(in)) { // a lexing failure is reported on stdout after whatever output was already produced
+        printf("... error\n");
+    } else {
+        printf("\n");
+    }
+
+    fclose(file);
+    return 0; // NOTE(review): the exit status is 0 even when lex() failed
+}