Paper on Lookahead TDFA: added benchmarks.

author Ulya Trofimovich <skvadrik@gmail.com>

Wed, 19 Jul 2017 17:43:33 +0000 (18:43 +0100)

committer Ulya Trofimovich <skvadrik@gmail.com>

Wed, 19 Jul 2017 17:43:33 +0000 (18:43 +0100)
author Ulya Trofimovich <skvadrik@gmail.com>
Wed, 19 Jul 2017 17:43:33 +0000 (18:43 +0100)
committer Ulya Trofimovich <skvadrik@gmail.com>
Wed, 19 Jul 2017 17:43:33 +0000 (18:43 +0100)
diff --git a/re2c/benchmarks/http/gen/gen_http.hs b/re2c/benchmarks/http/gen/gen_http.hs

new file mode 100644 (file)

index 0000000..6f6403a
--- /dev/null
+++ b/re2c/benchmarks/http/gen/gen_http.hs
@@ -0,0 +1,365 @@
+import qualified Test.QuickCheck as Q
+import           Data.List (intercalate)
+import           Control.Monad (forM_, when, replicateM)
+
+    {-!re2c
+
+        crlf  = "\n";
+        sp    = " ";
+        htab  = "\t";
+        ows   = (sp | htab)*;
+        rws   = (sp | htab)+;
+        bws   = ows;
+        digit = [0-9];
+        alpha = [a-zA-Z];
+        hexdigit    = [0-9a-fA-F];
+        unreserved  = alpha | digit | [-._~];
+        pct_encoded = "%" hexdigit{2};
+        sub_delims  = [!$&'()*+,;=];
+        pchar       = unreserved | pct_encoded | sub_delims | [:@];
+        vchar = [\x1f-\x7e];
+        tchar = [-!#$%&'*+.^_`|~] | digit | alpha;
+        token = tchar+;
+        octet = [^\x00];
+
+
+        obs_fold       = crlf (sp | htab)+;
+        obs_text       = [\x80-\xff];
+        field_name     = token;
+        field_vchar    = vchar | obs_text;
+        field_content  = field_vchar ((sp | htab)+ field_vchar)?;
+        field_value    = (field_content | obs_fold)*;
+        header_field   = #h1 field_name #h2 ":" ows field_value ows #h3;
+        method         = token;
+        segment        = pchar*;
+        scheme         = alpha (alpha | digit | [-+.])*;
+        userinfo       = (unreserved | pct_encoded | sub_delims | ":")*;
+        dec_octet
+            = digit
+            | [\x31-\x39] digit
+            | "1" digit{2}
+            | "2" [\x30-\x34] digit
+            | "25" [\x30-\x35];
+        ipv4address    = dec_octet "." dec_octet "." dec_octet "." dec_octet;
+        h16            = hexdigit{1,4};
+        ls32           = h16 ":" h16 | ipv4address;
+        ipv6address
+            =                            (h16 ":"){6} ls32
+            |                       "::" (h16 ":"){5} ls32
+            | (               h16)? "::" (h16 ":"){4} ls32
+            | ((h16 ":"){0,1} h16)? "::" (h16 ":"){3} ls32
+            | ((h16 ":"){0,2} h16)? "::" (h16 ":"){2} ls32
+            | ((h16 ":"){0,3} h16)? "::"  h16 ":"     ls32
+            | ((h16 ":"){0,4} h16)? "::"              ls32
+            | ((h16 ":"){0,5} h16)? "::"              h16
+            | ((h16 ":"){0,6} h16)? "::";
+        ipvfuture      = "v" hexdigit+ "." (unreserved | sub_delims | ":" )+;
+        ip_literal     = "[" ( ipv6address | ipvfuture ) "]";
+        reg_name       = (unreserved | pct_encoded | sub_delims)*;
+        host           = ip_literal | ipv4address | reg_name;
+        port           = digit*;
+        authority      = (userinfo "@")? host (":" port)?;
+        query          = (pchar | [/?])*;
+        path_abempty   = ("/" pchar*)*;
+        path_absolute  = "/" (pchar+ ("/" pchar*)*)?;
+        path_rootless  = pchar+ ("/" pchar*)*;
+        path_empty     = "";
+        hier_part
+            = "//" authority path_abempty
+            | (path_absolute | path_rootless | path_empty);
+        absolute_uri   = scheme ":" hier_part ("?" query)?;
+        absolute_path  = ("/" segment)*;
+        origin_form    = absolute_path ("?" query)?;
+        http_name      = "HTTP";
+        http_version   = http_name "/" digit "." digit;
+        request_target = origin_form | absolute_uri | authority | "*";
+        request_line   = @m1 method @m2 sp @rt1 request_target @rt2 sp @v3 http_version @v4 crlf;
+        status_code    = digit{3};
+        reason_phrase  = (htab | sp | vchar | obs_text)*;
+        status_line    = @v1 http_version @v2 sp @s1 status_code @s2 sp @rp1 reason_phrase @rp2 crlf;
+        start_line     = (request_line | status_line);
+        message_body   = octet*;
+        message_head   = start_line (header_field crlf)* crlf;
+    -}
+
+-- Components of a request target; all of them are plain strings.
+type Scheme = String
+type User = String
+type Password = String
+type Host = String
+type Port = String
+type Path = String
+type Query = String
+-- | The four request-target shapes produced by the generator.
+data RequestTarget
+    = OriginForm Path Query
+    | AbsoluteURI Scheme User Password Host Port Path Query
+    | Authority User Password Host Port
+    | Asterisk
+type ReasonPhrase = String
+type HTTPVersion = String
+type Method = String
+type StatusCode = Int
+-- | First line of a message: either a request line or a status line.
+data StartLine
+    = RequestLine Method RequestTarget HTTPVersion
+    | StatusLine HTTPVersion StatusCode ReasonPhrase
+-- | A complete, already formatted "Name: value" header line (without newline).
+data HeaderField = HeaderField String
+-- | A message head: start line followed by its header fields.
+data HTTPMsg = HTTPMsg StartLine [HeaderField]
+
+
+-- | Renders a request target the way it appears on an HTTP request line.
+--
+-- Empty components are omitted together with their separators.  An absolute
+-- URI only gets the "//" prefix when it has a host.
+instance Show RequestTarget where
+    show (OriginForm p q) = p ++ showQuery q
+    show (AbsoluteURI sc us pw ho pr pa q) = concat
+        [ sc
+        , ":"
+        , if ho == "" then "" else "//"
+        , showUserInfo us pw
+        , ho
+        , showPort pr
+        , pa
+        , showQuery q
+        ]
+    show (Authority us pw ho pr) = concat
+        [ showUserInfo us pw
+        , ho
+        , showPort pr
+        ]
+    show Asterisk = "*"
+
+-- | User info including the trailing "@", or nothing when both parts are
+-- empty.  The "@" is always emitted when any part is present, so a user with
+-- an empty password no longer runs into the host ("user:host").
+showUserInfo :: User -> Password -> String
+showUserInfo "" "" = ""
+showUserInfo us "" = us ++ "@"
+showUserInfo us pw = us ++ ":" ++ pw ++ "@"
+
+-- | Port with its leading ":", or nothing for an empty port.
+showPort :: Port -> String
+showPort "" = ""
+showPort pr = ":" ++ pr
+
+-- | Query with its leading "?", or nothing for an empty query.
+showQuery :: Query -> String
+showQuery "" = ""
+showQuery q  = "?" ++ q
+-- | A start line is its three parts separated by single spaces and
+-- terminated by a newline.
+instance Show StartLine where
+    show sl = intercalate " " parts ++ "\n"
+      where
+        parts = case sl of
+            RequestLine m rt v -> [m, show rt, v]
+            StatusLine v sc rp -> [v, show sc, rp]
+-- | A header field is shown verbatim (no quoting, no trailing newline).
+instance Show HeaderField where
+    show (HeaderField text) = text
+-- | Start line, then one line per header field, then an empty line.
+instance Show HTTPMsg where
+    show (HTTPMsg sl hfs) = concat
+        [ show sl
+        , concatMap (\hf -> show hf ++ "\n") hfs
+        , "\n"
+        ]
+
+
+-- | Random HTTP message heads.
+--
+-- All URI components are drawn up front; the request target is then picked
+-- from a weighted list of shapes built from them.  The start line is a request
+-- line or a status line with equal weight, and the header fields are a
+-- non-empty random selection from a fixed table of example headers.
+instance Q.Arbitrary HTTPMsg where
+    arbitrary = do
+        scheme   <- Q.elements ["http", "https", "ftp", "file"]
+        user     <- lstr
+        password <- anstr
+        host     <- Q.frequency [(10, ipv4), (10, hostname), (1, ipv6)]
+        -- port and query may each be empty (see 'mayb')
+        port     <- mayb $ show <$> (Q.choose (0, 10000) :: Q.Gen Int)
+        path     <- filepath
+        query    <- mayb anstr
+        request_target <- Q.frequency
+            [ (1, pure $ OriginForm path query)
+            , (1, pure $ OriginForm path "")
+
+            , (1, pure $ Authority user password host port)
+            , (1, pure $ Authority user password host "")
+            , (1, pure $ Authority ""   ""       host port)
+            , (1, pure $ Authority ""   ""       host "")
+
+            , (1, pure $ AbsoluteURI scheme user  password host port path query)
+            , (2, pure $ AbsoluteURI scheme ""    ""       host port path query)
+            , (1, pure $ AbsoluteURI scheme ""    ""       ""   ""   path query)
+
+            , (1, pure Asterisk)
+            ]
+        -- possibly empty mix of letters, digits, spaces and tabs
+        reason_phrase <- Q.listOf (Q.elements (alphanum ++ space)) :: Q.Gen [Char]
+        method <- Q.elements
+            [ "OPTIONS"
+            , "GET"
+            , "HEAD"
+            , "POST"
+            , "PUT"
+            , "DELETE"
+            , "TRACE"
+            , "CONNECT"
+            , "PATCH"
+            ]
+        -- any three-digit code
+        status_code <- Q.choose (100, 999) :: Q.Gen Int
+        -- "HTTP/<digit>.<digit>"
+        http_version <- do
+            n1 <- Q.choose (0, 9) :: Q.Gen Int
+            n2 <- Q.choose (0, 9) :: Q.Gen Int
+            return $ "HTTP/" ++ show n1 ++ "." ++ show n2
+        start_line <- Q.frequency
+            [ (1, pure $ RequestLine method request_target http_version)
+            , (1, pure $ StatusLine http_version status_code reason_phrase)
+            ]
+        -- copy-pasted from https://en.wikipedia.org/wiki/List_of_HTTP_header_fields
+        header_fields <- Q.listOf1 $ HeaderField <$> Q.elements
+            -- request examples
+            [ "Accept: text/plain"
+            , "Accept-Charset: utf-8"
+            , "Accept-Encoding: gzip, deflate"
+            , "Accept-Language: en-US"
+            , "Accept-Datetime: Thu, 31 May 2007 20:35:00 GMT"
+            , "Authorization: Basic QWxhZGRpbjpvcGVuIHNlc2FtZQ=="
+            , "Cache-Control: no-cache"
+            , "Connection: keep-alive"
+            , "Connection: Upgrade"
+            , "Cookie: $Version=1; Skin=new;"
+            , "Content-Length: 348"
+            , "Content-MD5: Q2hlY2sgSW50ZWdyaXR5IQ=="
+            , "Content-Type: application/x-www-form-urlencoded"
+            , "Date: Tue, 15 Nov 1994 08:12:31 GMT"
+            , "Expect: 100-continue"
+            , "Forwarded: for=192.0.2.60;proto=http;by=203.0.113.43"
+            , "Forwarded: for=192.0.2.43, for=198.51.100.17"
+            , "From: user@example.com"
+            , "Host: en.wikipedia.org:8080"
+            , "Host: en.wikipedia.org"
+            , "If-Match: \"737060cd8c284d8af7ad3082f209582d\""
+            , "If-Modified-Since: Sat, 29 Oct 1994 19:43:31 GMT"
+            , "If-None-Match: \"737060cd8c284d8af7ad3082f209582d\""
+            , "If-Range: \"737060cd8c284d8af7ad3082f209582d\""
+            , "If-Unmodified-Since: Sat, 29 Oct 1994 19:43:31 GMT"
+            , "Max-Forwards: 10"
+            , "Origin: http://www.example-social-network.com"
+            , "Pragma: no-cache"
+            , "Proxy-Authorization: Basic QWxhZGRpbjpvcGVuIHNlc2FtZQ=="
+            , "Range: bytes=500-999"
+            , "Referer: http://en.wikipedia.org/wiki/Main_Page"
+            , "TE: trailers, deflate"
+            , "User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/12.0"
+            , "Upgrade: HTTPS/1.3, IRC/6.9, RTA/x11, websocket"
+            , "Via: 1.0 fred, 1.1 example.com (Apache/1.1)"
+            , "Warning: 199 Miscellaneous warning"
+            , "X-Requested-With: XMLHttpRequest"
+            , "DNT: 1 (Do Not Track Enabled)"
+            , "DNT: 0 (Do Not Track Disabled)"
+            , "X-Forwarded-For: client1, proxy1, proxy2"
+            , "X-Forwarded-For: 129.78.138.66, 129.78.64.103"
+            , "X-Forwarded-Host: en.wikipedia.org:8080"
+            , "X-Forwarded-Host: en.wikipedia.org"
+            , "X-Forwarded-Proto: https"
+            , "Front-End-Https: on"
+            , "X-HTTP-Method-Override: DELETE"
+            , "X-Att-Deviceid: GT-P7320/P7320XXLPG"
+            , "x-wap-profile: http://wap.samsungmobile.com/uaprof/SGH-I777.xml"
+            , "Proxy-Connection: keep-alive"
+            , "X-UIDH: ..."
+            , "X-Csrf-Token: i8XNjC4b8KVok4uw5RftR38Wgp2BFwql"
+            , "X-Request-ID: f058ebd6-02f7-4d3f-942e-904344e8cde5"
+            -- response examples
+            , "Access-Control-Allow-Origin: *"
+            , "Accept-Patch: text/example;charset=utf-8"
+            , "Accept-Ranges: bytes"
+            , "Age: 12"
+            , "Allow: GET, HEAD"
+            , "Alt-Svc: http/1.1=\"http2.example.com:8001\"; ma=7200"
+            , "Cache-Control: max-age=3600"
+            , "Connection: close"
+            , "Content-Disposition: attachment; filename=\"fname.ext\""
+            , "Content-Encoding: gzip"
+            , "Content-Language: da"
+            , "Content-Length: 51"
+            , "Content-Location: /index.htm"
+            , "Content-MD5: Q2hlY2sgSW50ZWdyaXR5IQ=="
+            , "Content-Range: bytes 21010-47021/47022"
+            , "Content-Type: text/html; charset=utf-8"
+            , "Date: Tue, 15 Nov 1994 08:12:31 GMT"
+            , "ETag: \"737060cd8c284d8af7ad3082f209582d\""
+            , "Expires: Thu, 01 Dec 1994 16:00:00 GMT"
+            , "Last-Modified: Tue, 15 Nov 1994 12:45:26 GMT"
+            , "Link: </feed>; rel=\"alternate\""
+            , "Location: http://www.w3.org/pub/WWW/People.html"
+            , "P3P: CP=\"This is not a P3P policy! See http://www.google.com/support/accounts/bin/answer.py?hl=en&answer=151657 for more info.\""
+            , "Pragma: no-cache"
+            , "Proxy-Authenticate: Basic"
+            , "Public-Key-Pins: max-age=2592000; pin-sha256=\"E9CZ9INDbd+2eRQozYqqbQ2yXLVKB9+xcprMF+44U1g=\";"
+            , "Refresh: 5; url=http://www.w3.org/pub/WWW/People.html"
+            , "Retry-After: 120"
+            , "Retry-After: Fri, 07 Nov 2014 23:59:59 GMT"
+            , "Server: Apache/2.4.1 (Unix)"
+            , "Set-Cookie: UserID=JohnDoe; Max-Age=3600; Version=1"
+            , "Strict-Transport-Security: max-age=16070400; includeSubDomains"
+            , "Trailer: Max-Forwards"
+            , "Transfer-Encoding: chunked"
+            , "Tk: ?"
+            , "Upgrade: HTTPS/1.3, IRC/6.9, RTA/x11, websocket"
+            , "Vary: *"
+            , "Vary: Accept-Language"
+            , "Via: 1.0 fred, 1.1 example.com (Apache/1.1)"
+            , "Warning: 199 Miscellaneous warning"
+            , "WWW-Authenticate: Basic"
+            , "X-Frame-Options: deny"
+            , "Status: 200 OK"
+            , "X-XSS-Protection: 1; mode=block"
+            , "X-WebKit-CSP: default-src 'self'"
+            , "X-Content-Type-Options: nosniff"
+            , "X-Powered-By: PHP/5.4.0"
+            , "X-UA-Compatible: IE=EmulateIE7"
+            , "X-UA-Compatible: IE=edge"
+            , "X-UA-Compatible: Chrome=1"
+            , "X-Content-Duration: 42.666"
+            , "Upgrade-Insecure-Requests: 1"
+            , "X-Request-ID: f058ebd6-02f7-4d3f-942e-904344e8cde5"
+            ]
+        return $ HTTPMsg start_line header_fields
+
+
+-- | Lowercase ASCII letters.
+lalpha :: [Char]
+lalpha = ['a'..'z']
+
+-- | Uppercase ASCII letters.
+ualpha :: [Char]
+ualpha = ['A'..'Z']
+
+-- | All ASCII letters, lowercase first.
+alpha :: [Char]
+alpha = lalpha ++ ualpha
+
+-- | Decimal digits.
+digit :: [Char]
+digit = ['0'..'9']
+
+-- | Hexadecimal digits in both letter cases.
+hexdigit :: [Char]
+hexdigit = ['a'..'f'] ++ ['A'..'F'] ++ digit
+
+-- | ASCII letters and decimal digits.
+alphanum :: [Char]
+alphanum = alpha ++ digit
+
+-- | Horizontal whitespace: space and tab.
+space :: [Char]
+space = [' ', '\t']
+
+-- | Non-empty random string of lowercase letters.
+lstr :: Q.Gen String
+lstr = Q.listOf1 lowercase
+  where
+    lowercase = Q.elements lalpha
+
+-- | Non-empty random string of letters and digits.
+anstr :: Q.Gen String
+anstr = Q.listOf1 alnum
+  where
+    alnum = Q.elements alphanum
+
+-- | Dotted-quad IPv4 address built from four random numbers in 0..255.
+ipv4 :: Q.Gen String
+ipv4 = render <$> replicateM 4 (Q.choose (0, 255) :: Q.Gen Int)
+  where
+    render = intercalate "." . map show
+
+-- | Bracketed IPv6 literal made of @n@ random groups of 1 to 4 hex digits.
+--
+-- With 8 groups the address is written out in full; with fewer groups a "::"
+-- is inserted after the first @m@ groups.
+ipv6 :: Q.Gen String
+ipv6 = do
+    let hexgroup = do
+            k <- Q.choose (1, 4) :: Q.Gen Int
+            replicateM k (Q.elements hexdigit)
+    n <- Q.choose (0, 8) :: Q.Gen Int
+    -- Clamp the lower bound: with n == 0 the range (1, n) would be reversed,
+    -- which 'Q.choose' is not documented to support.  The result for n == 0
+    -- is "[::]" whatever m is.
+    m <- Q.choose (min 1 n, n) :: Q.Gen Int
+    hh <- replicateM n hexgroup
+    return $ (++ "]") . ("[" ++) $ case n of
+        8 -> intercalate ":" hh
+        _ -> concat [intercalate ":" (take m hh), "::", intercalate ":" (drop m hh)]
+
+-- | Dot-separated host name: at most 5 labels of at most 10 lowercase
+-- letters each.
+hostname :: Q.Gen String
+hostname = intercalate "." . take 5 <$> Q.listOf1 label
+  where
+    label = take 10 <$> lstr
+
+-- | Absolute path of at most 10 lowercase segments, each preceded by "/".
+filepath :: Q.Gen String
+filepath = concatMap ("/" ++) . take 10 <$> Q.listOf1 lstr
+
+-- | Either a string from the given generator or the empty string, with
+-- equal weight.
+mayb :: Q.Gen String -> Q.Gen String
+mayb gen = Q.oneof alternatives
+  where
+    alternatives = [gen, return ""]
+
+-- | Truncates "http.dat" and appends one random message per index in
+-- 0..170000, printing the index every 1000 messages as a progress marker.
+main :: IO ()
+main = do
+    writeFile out ""
+    forM_ [0 .. 170000 :: Int] step
+  where
+    out = "http.dat"
+    step i = do
+        when (i `mod` 1000 == 0) (print i)
+        msg <- Q.generate (Q.arbitrary :: Q.Gen HTTPMsg)
+        appendFile out (show msg)
diff --git a/re2c/benchmarks/http/gen/mk_gen.sh b/re2c/benchmarks/http/gen/mk_gen.sh

new file mode 100755 (executable)

index 0000000..fc6dd78
--- /dev/null
+++ b/re2c/benchmarks/http/gen/mk_gen.sh
@@ -0,0 +1 @@
+ghc -O2 -Wall gen_http.hs
diff --git a/re2c/benchmarks/http/rfc7230/http_rfc7230.re b/re2c/benchmarks/http/rfc7230/http_rfc7230.re

new file mode 100644 (file)

index 0000000..32184fd
--- /dev/null
+++ b/re2c/benchmarks/http/rfc7230/http_rfc7230.re
@@ -0,0 +1,318 @@
+#include <assert.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+/* One recorded tag value; nodes are chained newest-first through `pred`. */
+typedef struct taglist_t {
+    struct taglist_t *pred; /* previously recorded node of the same list, or NULL */
+    long dist;              /* tag position as an offset from the token start */
+} taglist_t;
+
+/* Bump allocator for taglist_t nodes backed by one contiguous array. */
+typedef struct taglistpool_t {
+    taglist_t *head; /* first node of the array */
+    taglist_t *next; /* next node to hand out */
+    taglist_t *last; /* one past the final node of the array */
+} taglistpool_t;
+
+/* Lexer state: the input stream, its buffer window and the tag storage. */
+typedef struct {
+    FILE *file; /* input stream */
+    char *buf;  /* start of the buffer */
+    char *lim;  /* end of the valid data in the buffer */
+    char *cur;  /* current lexer position (YYCURSOR) */
+    char *mar;  /* backtrack position (YYMARKER) */
+    char *tok;  /* start of the token being matched */
+    /* re2c template: one `char *` member per tag variable */
+    /*!tags:re2c format = "char *@@;\n"; */
+    /* re2c template: one `taglist_t *` member per tag-list variable */
+    /*!taglists:re2c format = "taglist_t *@@;\n"; */
+    taglistpool_t tlp; /* storage for the tag-list nodes */
+    int eof;           /* set once the end of the file has been read */
+} input_t;
+
+/*
+ * Recycles all pool nodes by rewinding the allocation cursor to the start of
+ * the array.  The re2c template that follows is expected to expand to one
+ * `in->NAME = 0;` statement per tag-list variable, so no list head keeps
+ * pointing at a recycled node.
+ */
+static void taglistpool_clear(taglistpool_t *tlp, input_t *in)
+{
+    tlp->next = tlp->head;
+    /*!taglists:re2c format = "in->@@ = 0;\n"; */
+}
+
+/*
+ * Allocates the initial node array (1024 * 1024 nodes) and marks all of it as
+ * unused.  Terminates the program if the allocation fails, because every
+ * later pool operation dereferences the array.
+ */
+static void taglistpool_init(taglistpool_t *tlp)
+{
+    static const unsigned size = 1024 * 1024;
+    tlp->head = (taglist_t*)malloc(size * sizeof(taglist_t));
+    if (tlp->head == NULL) {
+        fprintf(stderr, "taglistpool: out of memory\n");
+        exit(EXIT_FAILURE);
+    }
+    tlp->next = tlp->head;
+    tlp->last = tlp->head + size;
+}
+
+/* Releases the node array and leaves the pool with all pointers NULL. */
+static void taglistpool_free(taglistpool_t *tlp)
+{
+    taglist_t *storage = tlp->head;
+
+    tlp->last = NULL;
+    tlp->next = NULL;
+    tlp->head = NULL;
+    free(storage);
+}
+
+/*
+ * Hands out the next unused node of the pool.
+ *
+ * When the current array is exhausted, a new array of twice the size becomes
+ * the active one.  The exhausted array is deliberately neither copied nor
+ * freed: nodes already handed out are still referenced through `pred` links
+ * and through list heads kept outside the pool, so moving or releasing them
+ * would leave those pointers dangling.  The retired array is no longer tracked
+ * by the pool and stays allocated until the process exits; as sizes double,
+ * the retired memory is always less than the active array.
+ *
+ * Terminates the program if the new array cannot be allocated.
+ */
+static taglist_t *taglistpool_next(taglistpool_t *tlp)
+{
+    size_t size;
+    taglist_t *chunk;
+
+    if (tlp->next < tlp->last) {
+        return tlp->next++;
+    }
+
+    size = 2 * (size_t)(tlp->last - tlp->head);
+    chunk = (taglist_t*)malloc(size * sizeof(taglist_t));
+    if (chunk == NULL) {
+        fprintf(stderr, "taglistpool: out of memory\n");
+        exit(EXIT_FAILURE);
+    }
+    tlp->head = chunk;
+    tlp->next = chunk;
+    tlp->last = chunk + size;
+    return tlp->next++;
+}
+
+/*
+ * Prepends a new node to the list whose head is *ptl.  The node stores the
+ * offset of `t` from `b` and links to the previous head; *ptl is updated to
+ * the new node.
+ */
+static void taglist(taglist_t **ptl, const char *b, const char *t, taglistpool_t *tlp)
+{
+    taglist_t *node = taglistpool_next(tlp);
+
+    node->dist = t - b;
+    node->pred = *ptl;
+    *ptl = node;
+}
+
+/*!max:re2c*/
+static const size_t SIZE = 4096;
+
+/*
+ * Opens `fname` for reading and sets up an empty buffer: all positions start
+ * at the buffer limit, so the first YYFILL shifts nothing and reads a full
+ * buffer.  Terminates the program with a diagnostic if no file name is given,
+ * the file cannot be opened or the buffer cannot be allocated.
+ */
+static void init_input(input_t *in, const char *fname)
+{
+    if (fname == NULL) {
+        fprintf(stderr, "no input file given\n");
+        exit(EXIT_FAILURE);
+    }
+    in->file = fopen(fname, "r");
+    if (in->file == NULL) {
+        perror(fname);
+        exit(EXIT_FAILURE);
+    }
+    /* YYMAXFILL extra bytes hold the zero padding written at end of file */
+    in->buf = (char*) malloc(SIZE + YYMAXFILL);
+    if (in->buf == NULL) {
+        fprintf(stderr, "out of memory\n");
+        fclose(in->file);
+        exit(EXIT_FAILURE);
+    }
+    in->lim = in->buf + SIZE;
+    in->cur = in->lim;
+    in->mar = in->lim;
+    in->tok = in->lim;
+    /*!tags:re2c format = "in->@@ = 0;\n"; */
+    /*!taglists:re2c format = "in->@@ = 0;\n"; */
+    taglistpool_init(&in->tlp);
+    in->eof = 0;
+}
+
+/* Releases everything init_input acquired: tag pool, buffer and stream. */
+static void free_input(input_t *in)
+{
+    taglistpool_free(&in->tlp);
+    free(in->buf);
+    fclose(in->file);
+}
+
+/*
+ * Refills the buffer: discards everything before the current token, moves the
+ * rest to the start of the buffer and reads new data into the freed space.
+ *
+ * Returns 0 on success, 1 if the end of the file was already reached, and 2
+ * if the space in front of the token is smaller than `need`.
+ */
+static int fill(input_t *in, size_t need)
+{
+    size_t free; /* bytes in front of the token (shadows free() in this scope) */
+
+    if (in->eof) return 1;
+
+    free = in->tok - in->buf;
+    if (free < need) return 2;
+
+    /* slide the live part of the buffer down and adjust every position */
+    memmove(in->buf, in->tok, in->lim - in->tok);
+    in->lim -= free;
+    in->cur -= free;
+    in->mar -= free;
+    in->tok -= free;
+    /* re2c template: shift each tag variable that is set */
+    /*!tags:re2c format = "if (in->@@) in->@@ -= free;\n"; */
+    in->lim += fread(in->lim, 1, free, in->file);
+    if (in->lim < in->buf + SIZE) {
+        /* short read: mark end of file and pad with YYMAXFILL zero bytes */
+        in->eof = 1;
+        memset(in->lim, 0, YYMAXFILL);
+        in->lim += YYMAXFILL;
+    }
+    return 0;
+}
+
+/*
+ * Prints one line per header field, oldest first.  The five lists are walked
+ * in lockstep; recursing before printing reverses the newest-first order of
+ * the lists.  Each line is the four consecutive pieces delimited by the five
+ * offsets, taken relative to `tok`.
+ */
+static void print_headers(const char *tok,
+    const taglist_t *h1, const taglist_t *h2,
+    const taglist_t *h3, const taglist_t *h4,
+    const taglist_t *h5)
+{
+    if (h1 && h2 && h3 && h4 && h5) {
+        const int len1 = (int)(h2->dist - h1->dist);
+        const int len2 = (int)(h3->dist - h2->dist);
+        const int len3 = (int)(h4->dist - h3->dist);
+        const int len4 = (int)(h5->dist - h4->dist);
+
+        print_headers(tok, h1->pred, h2->pred, h3->pred, h4->pred, h5->pred);
+        printf("%.*s%.*s%.*s%.*s\n",
+            len1, tok + h1->dist,
+            len2, tok + h2->dist,
+            len3, tok + h3->dist,
+            len4, tok + h4->dist);
+    } else {
+        /* the lists must run out together */
+        assert(!h1 && !h2 && !h3 && !h4 && !h5);
+    }
+}
+
+#define YYCTYPE            char
+#define YYCURSOR           in->cur
+#define YYMARKER           in->mar
+#define YYLIMIT            in->lim
+#define YYTAGLISTP(tl)     taglist(&tl, in->tok, in->cur, &in->tlp)
+#define YYTAGLISTN(tl)     taglist(&tl, in->tok, NULL, &in->tlp)
+#define YYFILL(n)          if (fill(in, n) != 0) return 2;
+
+/*
+ * Matches HTTP message heads one after another until the input ends.
+ *
+ * Returns 0 when the terminating NUL byte is matched, after storing the number
+ * of matched messages in *count and the accumulated length of the extracted
+ * pieces in *total; returns 1 when the default rule matches (input that is no
+ * message head); returns 2 when YYFILL fails.
+ *
+ * Without VERIFY the extracted pieces are only measured; with VERIFY each
+ * message is printed back from its extracted pieces instead.
+ */
+static int lex(input_t *in, long *count, long *total)
+{
+    /* positions of the single-valued tags used in the rules below */
+    const char *of, *au, *at,
+        *hs1, *hs3, *m1, *p1, *p3, *p5, *q1, *q3,
+        *hs2, *hs4, *m2, *p2, *p4, *p6, *q2, *q4,
+        *r1, *r3, *rp1, *s1, *st1, *u1, *u3, *v1, *v3,
+        *r2, *r4, *rp2, *s2, *st2, *u2, *u4, *v2, *v4;
+    /* histories of the repeated header-field tags */
+    taglist_t *h1, *h2, *h3, *h4, *h5;
+    long c, t;
+
+    c = 0; t = 0;
+    of = au = at
+        = hs1 = hs3 = m1 = p1 = p3 = p5 = q1 = q3
+        = hs2 = hs4 = m2 = p2 = p4 = p6 = q2 = q4
+        = r1 = r3 = rp1 = s1 = st1 = u1 = u3 = v1 = v3
+        = r2 = r4 = rp2 = s2 = st2 = u2 = u4 = v2 = v4 = NULL;
+loop:
+    /* every match attempt starts a new token at the current position */
+    in->tok = in->cur;
+/*!re2c
+    re2c:flags:tags = 1;
+    re2c:tags:expression = "in->@@";
+
+    end = "\x00";
+    eol = "\n";
+
+    crlf        = eol;
+    sp          = " ";
+    htab        = "\t";
+    ows         = (sp | htab)*;
+    digit       = [0-9];
+    alpha       = [a-zA-Z];
+    hexdigit    = [0-9a-fA-F];
+    unreserved  = alpha | digit | [-._~];
+    pct_encoded = "%" hexdigit{2};
+    sub_delims  = [!$&'()*+,;=];
+    pchar       = unreserved | pct_encoded | sub_delims | [:@];
+    vchar       = [\x1f-\x7e];
+    tchar       = [-!#$%&'*+.^_`|~] | digit | alpha;
+
+    obs_fold       = crlf (sp | htab)+;
+    obs_text       = [\x80-\xff];
+    field_name     = tchar+;
+    field_vchar    = vchar | obs_text;
+    field_content  = field_vchar ((sp | htab)+ field_vchar)?;
+    field_value    = (field_content | obs_fold)*;
+    header_field   = #h1 field_name #h2 ":" ows #h3 field_value #h4 ows #h5;
+    scheme         = alpha (alpha | digit | [-+.])*;
+    userinfo       = (unreserved | pct_encoded | sub_delims | ":")*;
+    dec_octet
+        = digit
+        | [\x31-\x39] digit
+        | "1" digit{2}
+        | "2" [\x30-\x34] digit
+        | "25" [\x30-\x35];
+    ipv4address    = dec_octet "." dec_octet "." dec_octet "." dec_octet;
+    h16            = hexdigit{1,4};
+    ls32           = h16 ":" h16 | ipv4address;
+    ipv6address
+        =                            (h16 ":"){6} ls32
+        |                       "::" (h16 ":"){5} ls32
+        | (               h16)? "::" (h16 ":"){4} ls32
+        | ((h16 ":"){0,1} h16)? "::" (h16 ":"){3} ls32
+        | ((h16 ":"){0,2} h16)? "::" (h16 ":"){2} ls32
+        | ((h16 ":"){0,3} h16)? "::"  h16 ":"     ls32
+        | ((h16 ":"){0,4} h16)? "::"              ls32
+        | ((h16 ":"){0,5} h16)? "::"              h16
+        | ((h16 ":"){0,6} h16)? "::";
+    ipvfuture      = "v" hexdigit+ "." (unreserved | sub_delims | ":" )+;
+    ip_literal     = "[" ( ipv6address | ipvfuture ) "]";
+    reg_name       = (unreserved | pct_encoded | sub_delims)*;
+    path_abempty   = ("/" pchar*)*;
+    path_absolute  = "/" (pchar+ ("/" pchar*)*)?;
+    path_rootless  = pchar+ ("/" pchar*)*;
+    path_empty     = "";
+    host           = ip_literal | ipv4address | reg_name;
+    port           = digit*;
+    query          = (pchar | [/?])*;
+    absolute_uri   = @s1 scheme @s2 ":"
+        ( "//" (@u1 userinfo @u2 "@")? @hs1 host @hs2 (":" @r1 port @r2)? @p1 path_abempty @p2
+        | @p3 (path_absolute | path_rootless | path_empty) @p4
+        ) ("?" @q1 query @q2)?;
+    authority      = (@u3 userinfo @u4 "@")? @hs3 host @hs4 (":" @r3 port @r4)?;
+    origin_form    = @p5 path_abempty @p6 ("?" @q3 query @q4)?;
+    http_name      = "HTTP";
+    http_version   = http_name "/" digit "." digit;
+    request_target
+        = @at authority
+        | @au absolute_uri
+        | @of origin_form
+        | "*";
+    method         = tchar+;
+    request_line   = @m1 method @m2 sp request_target sp @v3 http_version @v4 crlf;
+    status_code    = digit{3};
+    reason_phrase  = (htab | sp | vchar | obs_text)*;
+    status_line    = @v1 http_version @v2 sp @st1 status_code @st2 sp @rp1 reason_phrase @rp2 crlf;
+    start_line     = (request_line | status_line);
+    message_head   = start_line (header_field crlf)* crlf;
+
+    *   { return 1; }
+    end { *count = c; *total = t; return 0; }
+    eol { goto loop; }
+    message_head {
+#ifndef VERIFY
+        c += 1;
+        if (st1) t += (v2 - v1) + (st2 - st1) + (rp2 - rp1);
+        if (m1) {
+            if (of) t += (p6 - p5) + (q4 - q3);
+            if (au) t += (s2 - s1) + (u2 - u1) + (hs2 - hs1)
+                + (r2 - r1) + (p2 - p1) + (p4 - p3) + (q2 - q1);
+            if (at) t += (u4 - u3) + (hs4 - hs3) + (r4 - r3);
+            t += (v4 - v3);
+        }
+        for (; h1 != 0; h1 = h1->pred, h2 = h2->pred,
+            h3 = h3->pred, h4 = h4->pred, h5 = h5->pred) {
+            t += (h2->dist - h1->dist) + (h3->dist - h2->dist)
+                + (h4->dist - h3->dist) + (h5->dist - h4->dist);
+        }
+#else
+        if (st1) {
+            printf("%.*s %.*s %.*s\n",
+                (int)(v2 - v1), v1,
+                (int)(st2 - st1), st1,
+                (int)(rp2 - rp1), rp1);
+        } else if (m1) {
+            printf("%.*s ", (int)(m2 - m1), m1);
+            if (of) {
+                printf("%.*s", (int)(p6 - p5), p5);
+                if (q3) printf("?%.*s", (int)(q4 - q3), q3);
+            } else if (au) {
+                printf("%.*s:", (int)(s2 - s1), s1);
+                if (p1) printf("//");
+                if (u1) printf("%.*s@", (int)(u2 - u1), u1);
+                printf("%.*s", (int)(hs2 - hs1), hs1);
+                if (r1) printf(":%.*s", (int)(r2 - r1), r1);
+                if (p1) printf("%.*s",  (int)(p2 - p1), p1);
+                if (p3) printf("%.*s",  (int)(p4 - p3), p3);
+                if (q1) printf("?%.*s", (int)(q2 - q1), q1);
+            } else if (at) {
+                if (u3) printf("%.*s@", (int)(u4 - u3), u3);
+                printf("%.*s", (int)(hs4 - hs3), hs3);
+                if (r3) printf(":%.*s", (int)(r4 - r3), r3);
+            } else {
+                printf("*");
+            }
+            printf(" %.*s\n", (int)(v4 - v3), v3);
+        }
+        print_headers(in->tok, h1, h2, h3, h4, h5);
+        printf("\n");
+#endif
+        taglistpool_clear(&in->tlp, in);
+        goto loop;
+    }
+*/
+}
+
+/*
+ * Lexes the file named by the first command-line argument and reports the
+ * outcome on stderr.  Returns 1 if no file name was given; otherwise returns
+ * 0 whatever the lexer reported, as before.
+ */
+int main(int argc, char **argv)
+{
+    long count = 0, total = 0;
+    input_t in;
+
+    if (argc < 2) {
+        fprintf(stderr, "usage: %s <file>\n", argc > 0 ? argv[0] : "lexer");
+        return 1;
+    }
+    init_input(&in, argv[1]);
+
+    switch (lex(&in, &count, &total)) {
+        case 0: fprintf(stderr, "ok, parsed %ld HTTPs, peeked %ld chars\n", count, total); break;
+        /* the buffer is not NUL-terminated in general: bound the print by the buffer limit */
+        case 1: fprintf(stderr, "syntax error: %.*s\n", (int)(in.lim - in.cur), in.cur); break;
+        case 2: fprintf(stderr, "fill failed\n"); break;
+        default: fprintf(stderr, "panic!\n"); break;
+    }
+
+    free_input(&in);
+    return 0;
+}
diff --git a/re2c/benchmarks/http/rfc7230/http_rfc7230_notags.re b/re2c/benchmarks/http/rfc7230/http_rfc7230_notags.re

new file mode 100644 (file)

index 0000000..e97909b
--- /dev/null
+++ b/re2c/benchmarks/http/rfc7230/http_rfc7230_notags.re
@@ -0,0 +1,180 @@
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Lexer state: the input stream and its buffer window. */
+typedef struct {
+    FILE *file; /* input stream */
+    char *buf;  /* start of the buffer */
+    char *lim;  /* end of the valid data in the buffer */
+    char *cur;  /* current lexer position (YYCURSOR) */
+    char *mar;  /* backtrack position (YYMARKER) */
+    char *tok;  /* start of the token being matched */
+    int eof;    /* set once the end of the file has been read */
+} input_t;
+
+/*!max:re2c*/
+static const size_t SIZE = 4096;
+
+/*
+ * Opens `fname` for reading and sets up an empty buffer: all positions start
+ * at the buffer limit, so the first YYFILL shifts nothing and reads a full
+ * buffer.  Terminates the program with a diagnostic if no file name is given,
+ * the file cannot be opened or the buffer cannot be allocated.
+ */
+static void init_input(input_t *in, const char *fname)
+{
+    if (fname == NULL) {
+        fprintf(stderr, "no input file given\n");
+        exit(EXIT_FAILURE);
+    }
+    in->file = fopen(fname, "r");
+    if (in->file == NULL) {
+        perror(fname);
+        exit(EXIT_FAILURE);
+    }
+    /* YYMAXFILL extra bytes hold the zero padding written at end of file */
+    in->buf = (char*) malloc(SIZE + YYMAXFILL);
+    if (in->buf == NULL) {
+        fprintf(stderr, "out of memory\n");
+        fclose(in->file);
+        exit(EXIT_FAILURE);
+    }
+    in->lim = in->buf + SIZE;
+    in->cur = in->lim;
+    in->mar = in->lim;
+    in->tok = in->lim;
+    in->eof = 0;
+}
+
+/* Releases the buffer and closes the stream acquired by init_input. */
+static void free_input(input_t *in)
+{
+    free(in->buf);
+    fclose(in->file);
+}
+
+/*
+ * Refills the buffer: discards everything before the current token, moves the
+ * rest to the start of the buffer and reads new data into the freed space.
+ *
+ * Returns 0 on success, 1 if the end of the file was already reached, and 2
+ * if the space in front of the token is smaller than `need`.
+ */
+static int fill(input_t *in, size_t need)
+{
+    size_t shift;
+    size_t got;
+
+    if (in->eof) {
+        return 1;
+    }
+
+    shift = in->tok - in->buf;
+    if (shift < need) {
+        return 2;
+    }
+
+    /* slide the live part of the buffer down and adjust every position */
+    memmove(in->buf, in->tok, in->lim - in->tok);
+    in->tok -= shift;
+    in->mar -= shift;
+    in->cur -= shift;
+    in->lim -= shift;
+
+    got = fread(in->lim, 1, shift, in->file);
+    in->lim += got;
+    if (in->lim < in->buf + SIZE) {
+        /* short read: mark end of file and pad with YYMAXFILL zero bytes */
+        in->eof = 1;
+        memset(in->lim, 0, YYMAXFILL);
+        in->lim += YYMAXFILL;
+    }
+    return 0;
+}
+
+#define YYCTYPE   char
+#define YYCURSOR  in->cur
+#define YYMARKER  in->mar
+#define YYLIMIT   in->lim
+#define YYFILL(n) if (fill(in, n) != 0) return 2;
+
+/*
+ * Matches HTTP message heads one after another until the input ends, without
+ * extracting any submatches.
+ *
+ * Returns 0 when the terminating NUL byte is matched, after storing the number
+ * of matched messages in *count and their accumulated length in *total;
+ * returns 1 when the default rule matches (input that is no message head);
+ * returns 2 when YYFILL fails.
+ *
+ * With VERIFY defined each matched message is printed verbatim instead of
+ * being counted.
+ */
+static int lex(input_t *in, long *count, long *total)
+{
+    long c, t;
+
+    c = 0; t = 0;
+loop:
+    in->tok = in->cur;
+/*!re2c
+
+    end = "\x00";
+    eol = "\n";
+
+    crlf        = eol;
+    sp          = " ";
+    htab        = "\t";
+    ows         = (sp | htab)*;
+    digit       = [0-9];
+    alpha       = [a-zA-Z];
+    hexdigit    = [0-9a-fA-F];
+    unreserved  = alpha | digit | [-._~];
+    pct_encoded = "%" hexdigit{2};
+    sub_delims  = [!$&'()*+,;=];
+    pchar       = unreserved | pct_encoded | sub_delims | [:@];
+    vchar       = [\x1f-\x7e];
+    tchar       = [-!#$%&'*+.^_`|~] | digit | alpha;
+
+    obs_fold       = crlf (sp | htab)+;
+    obs_text       = [\x80-\xff];
+    field_name     = tchar+;
+    field_vchar    = vchar | obs_text;
+    field_content  = field_vchar ((sp | htab)+ field_vchar)?;
+    field_value    = (field_content | obs_fold)*;
+    header_field   = field_name ":" ows field_value ows;
+    scheme         = alpha (alpha | digit | [-+.])*;
+    userinfo       = (unreserved | pct_encoded | sub_delims | ":")*;
+    dec_octet
+        = digit
+        | [\x31-\x39] digit
+        | "1" digit{2}
+        | "2" [\x30-\x34] digit
+        | "25" [\x30-\x35];
+    ipv4address    = dec_octet "." dec_octet "." dec_octet "." dec_octet;
+    h16            = hexdigit{1,4};
+    ls32           = h16 ":" h16 | ipv4address;
+    ipv6address
+        =                            (h16 ":"){6} ls32
+        |                       "::" (h16 ":"){5} ls32
+        | (               h16)? "::" (h16 ":"){4} ls32
+        | ((h16 ":"){0,1} h16)? "::" (h16 ":"){3} ls32
+        | ((h16 ":"){0,2} h16)? "::" (h16 ":"){2} ls32
+        | ((h16 ":"){0,3} h16)? "::"  h16 ":"     ls32
+        | ((h16 ":"){0,4} h16)? "::"              ls32
+        | ((h16 ":"){0,5} h16)? "::"              h16
+        | ((h16 ":"){0,6} h16)? "::";
+    ipvfuture      = "v" hexdigit+ "." (unreserved | sub_delims | ":" )+;
+    ip_literal     = "[" ( ipv6address | ipvfuture ) "]";
+    reg_name       = (unreserved | pct_encoded | sub_delims)*;
+    path_abempty   = ("/" pchar*)*;
+    path_absolute  = "/" (pchar+ ("/" pchar*)*)?;
+    path_rootless  = pchar+ ("/" pchar*)*;
+    path_empty     = "";
+    host           = ip_literal | ipv4address | reg_name;
+    port           = digit*;
+    query          = (pchar | [/?])*;
+    absolute_uri   =  scheme  ":"
+        ( "//" (userinfo "@")? host (":" port)? path_abempty
+        | (path_absolute | path_rootless | path_empty)
+        ) ("?" query)?;
+    authority      = (userinfo "@")? host (":" port)?;
+    origin_form    = path_abempty ("?" query)?;
+    http_name      = "HTTP";
+    http_version   = http_name "/" digit "." digit;
+    request_target
+        = authority
+        | absolute_uri
+        | origin_form
+        | "*";
+    method         = tchar+;
+    request_line   = method sp request_target sp http_version crlf;
+    status_code    = digit{3};
+    reason_phrase  = (htab | sp | vchar | obs_text)*;
+    status_line    = http_version sp status_code sp reason_phrase crlf;
+    start_line     = (request_line | status_line);
+    message_head   = start_line (header_field crlf)* crlf;
+
+    *   { return 1; }
+    end { *count = c; *total = t; return 0; }
+    eol { goto loop; }
+    message_head {
+#ifndef VERIFY
+        c += 1;
+        t += in->cur - in->tok;
+#else
+        printf("%.*s", (int)(in->cur - in->tok), in->tok);
+#endif
+        goto loop;
+    }
+*/
+}
+
+/*
+ * Lexes the file named by the first command-line argument and reports the
+ * outcome on stderr.  Returns 1 if no file name was given; otherwise returns
+ * 0 whatever the lexer reported, as before.
+ */
+int main(int argc, char **argv)
+{
+    /* zero-initialized so the report never prints an indeterminate value */
+    long count = 0, total = 0;
+    input_t in;
+
+    if (argc < 2) {
+        fprintf(stderr, "usage: %s <file>\n", argc > 0 ? argv[0] : "lexer");
+        return 1;
+    }
+    init_input(&in, argv[1]);
+
+    switch (lex(&in, &count, &total)) {
+        case 0: fprintf(stderr, "ok, parsed %ld HTTPs, peeked %ld chars\n", count, total); break;
+        /* the buffer is not NUL-terminated in general: bound the print by the buffer limit */
+        case 1: fprintf(stderr, "syntax error: %.*s\n", (int)(in.lim - in.cur), in.cur); break;
+        case 2: fprintf(stderr, "fill failed\n"); break;
+        default: fprintf(stderr, "panic!\n"); break;
+    }
+
+    free_input(&in);
+    return 0;
+}
diff --git a/re2c/benchmarks/http/simple/http_simple.re b/re2c/benchmarks/http/simple/http_simple.re

new file mode 100644 (file)

index 0000000..d4b4613
--- /dev/null
+++ b/re2c/benchmarks/http/simple/http_simple.re
@@ -0,0 +1,231 @@
+#include <assert.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+/* One entry of a singly linked tag history; entries are chained through
+ * `pred`, newest first (see taglist()). */
+typedef struct taglist_t {
+    struct taglist_t *pred; /* previously recorded entry, or NULL at the end of the chain */
+    long dist;              /* offset stored by taglist() as `t - b` */
+} taglist_t;
+
+/* Bump allocator for taglist_t entries backed by one contiguous array. */
+typedef struct taglistpool_t {
+    taglist_t *head; /* start of the array */
+    taglist_t *next; /* next unused slot */
+    taglist_t *last; /* one past the end of the array */
+} taglistpool_t;
+
+/* Lexer input state: the file being read, the sliding buffer and the
+ * pointers into it, plus the tag variables that re2c generates. */
+typedef struct {
+    FILE *file; /* input stream */
+    char *buf;  /* start of the buffer */
+    char *lim;  /* end of the valid data in the buffer (YYLIMIT) */
+    char *cur;  /* current position (YYCURSOR) */
+    char *mar;  /* backtrack position (YYMARKER) */
+    char *tok;  /* start of the lexeme being matched */
+    /* re2c expands the two directives below into one member per tag variable */
+    /*!tags:re2c format = "char *@@;\n"; */
+    /*!taglists:re2c format = "taglist_t *@@;\n"; */
+    taglistpool_t tlp; /* storage for the tag list entries */
+    int eof;    /* set by fill() once the input is exhausted */
+} input_t;
+
+/* Makes the whole pool available again and resets every tag list variable of
+ * `in` to 0 (the directive below expands to one assignment per variable). */
+static void taglistpool_clear(taglistpool_t *tlp, input_t *in)
+{
+    tlp->next = tlp->head;
+    /*!taglists:re2c format = "in->@@ = 0;\n"; */
+}
+
+/*
+ * Allocates the initial pool of 1024 * 1024 entries and marks it empty.
+ * Terminates the program if the allocation fails, because the pointer
+ * arithmetic below and every later use of the pool require a valid array.
+ */
+static void taglistpool_init(taglistpool_t *tlp)
+{
+    static const unsigned size = 1024 * 1024;
+    tlp->head = (taglist_t*)malloc(size * sizeof(taglist_t));
+    if (!tlp->head) {
+        fprintf(stderr, "taglistpool_init: out of memory\n");
+        exit(1);
+    }
+    tlp->next = tlp->head;
+    tlp->last = tlp->head + size;
+}
+
+/* Releases the pool's array and leaves all three pool pointers NULL. */
+static void taglistpool_free(taglistpool_t *tlp)
+{
+    taglist_t *storage = tlp->head;
+
+    tlp->last = NULL;
+    tlp->next = NULL;
+    tlp->head = NULL;
+    free(storage);
+}
+
+/*
+ * Returns the next unused entry of the pool, doubling the pool's capacity
+ * when it is exhausted. Terminates the program if the pool cannot grow.
+ *
+ * NOTE(review): growing may move the array. Entries already handed out keep
+ * `pred` pointers into the old array, and so do the tag list variables stored
+ * in input_t; nothing here can rebase them, so a chain that spans a growth
+ * step is left dangling. Fixing this needs index-based links or access to the
+ * tag list variables.
+ */
+static taglist_t *taglistpool_next(taglistpool_t *tlp)
+{
+    size_t size;
+    taglist_t *head;
+
+    if (tlp->next < tlp->last) {
+        return tlp->next++;
+    }
+
+    size = (size_t)(tlp->last - tlp->head);
+    /* realloc preserves the first `size` entries; the old array stays valid on failure */
+    head = (taglist_t*)realloc(tlp->head, 2 * size * sizeof(taglist_t));
+    if (!head) {
+        fprintf(stderr, "taglistpool_next: out of memory\n");
+        exit(1);
+    }
+    tlp->head = head;
+    tlp->next = head + size;
+    tlp->last = head + size * 2;
+    return tlp->next++;
+}
+
+/*
+ * Pushes a new entry onto the list whose head is *ptl: the entry records the
+ * offset `t - b` and links to the previous head, then becomes the new head.
+ */
+static void taglist(taglist_t **ptl, const char *b, const char *t, taglistpool_t *tlp)
+{
+    taglist_t *node = taglistpool_next(tlp);
+
+    node->dist = t - b;
+    node->pred = *ptl;
+    *ptl = node;
+}
+
+/*!max:re2c*/
+static const size_t SIZE = 4096;
+
+/*
+ * Opens `fname` for reading and prepares an empty input buffer.
+ *
+ * The buffer holds SIZE bytes plus YYMAXFILL bytes of room for the
+ * end-of-input padding written by fill(). cur, mar and tok all start at lim,
+ * so the first fill() call treats the whole buffer as free space.
+ * Terminates the program if no file name is given, the file cannot be
+ * opened, or the buffer cannot be allocated.
+ */
+static void init_input(input_t *in, const char *fname)
+{
+    if (!fname) {
+        fprintf(stderr, "init_input: no input file given\n");
+        exit(1);
+    }
+    in->file = fopen(fname, "r");
+    if (!in->file) {
+        perror(fname);
+        exit(1);
+    }
+    in->buf = (char*) malloc(SIZE + YYMAXFILL);
+    if (!in->buf) {
+        fprintf(stderr, "init_input: out of memory\n");
+        fclose(in->file);
+        exit(1);
+    }
+    in->lim = in->buf + SIZE;
+    in->cur = in->lim;
+    in->mar = in->lim;
+    in->tok = in->lim;
+    /*!tags:re2c format = "in->@@ = 0;\n"; */
+    /*!taglists:re2c format = "in->@@ = 0;\n"; */
+    taglistpool_init(&in->tlp);
+    in->eof = 0;
+}
+
+/* Tears down everything init_input() set up: tag list pool, buffer and file. */
+static void free_input(input_t *in)
+{
+    taglistpool_free(&in->tlp);
+    free(in->buf);
+    fclose(in->file);
+}
+
+/*
+ * Refills the buffer: discards the bytes before the current lexeme, shifts
+ * the rest to the start of the buffer and reads more data from the file.
+ *
+ * Returns 0 on success, 1 if end of input was already reached, and 2 if the
+ * space before the lexeme is smaller than `need`.
+ */
+static int fill(input_t *in, size_t need)
+{
+    /* number of bytes that can be discarded (note: shadows free() in this scope) */
+    size_t free;
+
+    if (in->eof) return 1;
+
+    free = in->tok - in->buf;
+    if (free < need) return 2;
+
+    /* keep everything from the lexeme start onwards and move all pointers with it */
+    memmove(in->buf, in->tok, in->lim - in->tok);
+    in->lim -= free;
+    in->cur -= free;
+    in->mar -= free;
+    in->tok -= free;
+    /* the directive expands to the same adjustment for every non-zero tag variable */
+    /*!tags:re2c format = "if (in->@@) in->@@ -= free;\n"; */
+    in->lim += fread(in->lim, 1, free, in->file);
+    /* a short read marks the end of input: append YYMAXFILL zero bytes as padding */
+    if (in->lim < in->buf + SIZE) {
+        in->eof = 1;
+        memset(in->lim, 0, YYMAXFILL);
+        in->lim += YYMAXFILL;
+    }
+    return 0;
+}
+
+/*
+ * Prints one line per recorded header, oldest first. The three lists are
+ * walked in lockstep through `pred`; for each triple the text between the h1
+ * and h2 offsets and the text between the h2 and h3 offsets (all relative to
+ * `tok`) are printed back to back. The lists must end together.
+ */
+static void print_headers(const char *tok,
+    const taglist_t *h1, const taglist_t *h2, const taglist_t *h3)
+{
+    if (h1 && h2 && h3) {
+        const int name_len = (int)(h2->dist - h1->dist);
+        const int value_len = (int)(h3->dist - h2->dist);
+
+        /* recurse first: the lists are linked newest-to-oldest */
+        print_headers(tok, h1->pred, h2->pred, h3->pred);
+        printf("%.*s%.*s\n",
+            name_len, tok + h1->dist,
+            value_len, tok + h2->dist);
+    } else {
+        assert(!h1 && !h2 && !h3);
+    }
+}
+
+/* Interface between the code generated by re2c and the input_t state. */
+#define YYCTYPE            char
+#define YYCURSOR           in->cur
+#define YYMARKER           in->mar
+#define YYLIMIT            in->lim
+/* record the current position, as an offset from the lexeme start, in list tl */
+#define YYTAGLISTP(tl)     taglist(&tl, in->tok, in->cur, &in->tlp)
+/* same as above, but with NULL passed in place of the current position */
+#define YYTAGLISTN(tl)     taglist(&tl, in->tok, NULL, &in->tlp)
+/* a failed refill makes the enclosing function return 2 */
+#define YYFILL(n)          if (fill(in, n) != 0) return 2;
+
+/*
+ * Lexes the input as a sequence of HTTP message heads.
+ *
+ * Returns 0 when the `end` rule (a NUL byte) matches, after storing the
+ * number of matched message heads in *count and the accumulated length of
+ * the tagged submatches in *total; returns 1 when only the default rule `*`
+ * matches; returns 2 when the fill() call made by YYFILL fails.
+ * With VERIFY defined the submatches are printed to stdout instead of being
+ * counted.
+ */
+static int lex(input_t *in, long *count, long *total)
+{
+    /* submatch boundaries set by the @tags and #tags of the matched rule */
+    const char *s1, *s2, *v1, *v2, *v3, *v4, *m1, *m2, *rp1, *rp2, *rt1, *rt2;
+    taglist_t *h1, *h2, *h3;
+    long c, t;
+
+    c = 0; t = 0;
+loop:
+    /* every match attempt starts a new lexeme at the current position */
+    in->tok = in->cur;
+/*!re2c
+    re2c:flags:tags = 1;
+    re2c:tags:expression = "in->@@";
+
+    end            = "\x00";
+    eol            = "\n";
+    sp             = " ";
+    htab           = "\t";
+    ows            = (sp | htab)*;
+    char           = [-._~%!$&'()*+,;=a-zA-Z0-9];
+    tchar          = [-._~%!$&'*+#^`|a-zA-Z0-9];
+    vchar          = [\x1f-\x7e\x80-\xff];
+    scheme         = [-+.a-zA-Z0-9]+;
+    userinfo       = (char | [:])+;
+    host           = (char | [:[\]])+;
+    port           = [0-9]*;
+    path           = (char | [:@/])*;
+    query          = (char | [:@?/])*;
+    obs_fold       = eol (sp | htab)+;
+    field_content  = vchar ((sp | htab)+ vchar)?;
+    header_field   = #h1 tchar+ ":" #h2 ows (field_content | obs_fold)* ows #h3;
+    authority      = (userinfo "@")? host (":" port)?;
+    absolute_uri   = scheme ":" ("//" (userinfo "@")? host (":" port)?)? path ("?" query)?;
+    origin_form    = "/" path ("?" query)?;
+    http_version   = "HTTP/" [0-9] "." [0-9];
+    request_target = authority | absolute_uri | origin_form | "*";
+    method         = tchar+;
+    request_line   = @m1 method @m2 sp @rt1 request_target @rt2 sp @v3 http_version @v4 eol;
+    status_code    = [0-9]{3};
+    reason_phrase  = (htab | sp | vchar)*;
+    status_line    = @v1 http_version @v2 sp @s1 status_code @s2 sp @rp1 reason_phrase @rp2 eol;
+    start_line     = (request_line | status_line);
+    message_head   = start_line (header_field eol)* eol;
+
+    *   { return 1; }
+    end { *count = c; *total = t; return 0; }
+    eol { goto loop; }
+    message_head {
+#ifndef VERIFY
+        c += 1;
+        t += (v2 - v1) + (s2 - s1) + (rp2 - rp1)
+            + (m2 - m1) + (rt2 - rt1) + (v4 - v3);
+        for (; h1 != 0; h1 = h1->pred, h2 = h2->pred, h3 = h3->pred) {
+            t += (h2->dist - h1->dist) + (h3->dist - h2->dist);
+        }
+#else
+        if (s1) printf("%.*s %.*s %.*s\n",
+            (int)(v2 - v1), v1,
+            (int)(s2 - s1), s1,
+            (int)(rp2 - rp1), rp1);
+        if (m1) printf("%.*s %.*s %.*s\n",
+            (int)(m2 - m1), m1,
+            (int)(rt2 - rt1), rt1,
+            (int)(v4 - v3), v3);
+        print_headers(in->tok, h1, h2, h3);
+        printf("\n");
+#endif
+        taglistpool_clear(&in->tlp, in);
+        goto loop;
+    }
+*/
+}
+
+/*
+ * Entry point: lexes the file named by argv[1] and reports the outcome on
+ * stderr according to the value returned by lex().
+ */
+int main(int argc, char **argv)
+{
+    /* lex() stores these only when it returns 0; start from defined values */
+    long count = 0, total = 0;
+    input_t in;
+
+    if (argc < 2) {
+        fprintf(stderr, "usage: %s <input-file>\n", argc > 0 ? argv[0] : "http_simple");
+        return 1;
+    }
+    init_input(&in, argv[1]);
+
+    switch (lex(&in, &count, &total)) {
+        case 0: fprintf(stderr, "ok, parsed %ld HTTPs, peeked %ld chars\n", count, total); break;
+        /* The buffer is not NUL-terminated until end of input is reached, so
+         * bound the excerpt by the end of the valid data instead of using a
+         * bare %s. */
+        case 1: fprintf(stderr, "syntax error: %.*s\n", (int)(in.lim - in.cur), in.cur); break;
+        case 2: fprintf(stderr, "fill failed\n"); break;
+        default: fprintf(stderr, "panic!\n"); break;
+    }
+
+    free_input(&in);
+    return 0;
+}
diff --git a/re2c/benchmarks/http/simple/http_simple_notags.re b/re2c/benchmarks/http/simple/http_simple_notags.re

new file mode 100644 (file)

index 0000000..ac7ba07
--- /dev/null
+++ b/re2c/benchmarks/http/simple/http_simple_notags.re
@@ -0,0 +1,134 @@
+#include <assert.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Lexer input state: the file being read, the sliding buffer and the
+ * pointers into it. */
+typedef struct {
+    FILE *file; /* input stream */
+    char *buf;  /* start of the buffer */
+    char *lim;  /* end of the valid data in the buffer (YYLIMIT) */
+    char *cur;  /* current position (YYCURSOR) */
+    char *mar;  /* backtrack position (YYMARKER) */
+    char *tok;  /* start of the lexeme being matched */
+    int eof;    /* set by fill() once the input is exhausted */
+} input_t;
+
+/*!max:re2c*/
+static const size_t SIZE = 4096;
+
+/*
+ * Opens `fname` for reading and prepares an empty input buffer.
+ *
+ * The buffer holds SIZE bytes plus YYMAXFILL bytes of room for the
+ * end-of-input padding written by fill(). cur, mar and tok all start at lim,
+ * so the first fill() call treats the whole buffer as free space.
+ * Terminates the program if no file name is given, the file cannot be
+ * opened, or the buffer cannot be allocated.
+ */
+static void init_input(input_t *in, const char *fname)
+{
+    if (!fname) {
+        fprintf(stderr, "init_input: no input file given\n");
+        exit(1);
+    }
+    in->file = fopen(fname, "r");
+    if (!in->file) {
+        perror(fname);
+        exit(1);
+    }
+    in->buf = (char*) malloc(SIZE + YYMAXFILL);
+    if (!in->buf) {
+        fprintf(stderr, "init_input: out of memory\n");
+        fclose(in->file);
+        exit(1);
+    }
+    in->lim = in->buf + SIZE;
+    in->cur = in->lim;
+    in->mar = in->lim;
+    in->tok = in->lim;
+    in->eof = 0;
+}
+
+/* Tears down what init_input() set up: the buffer and the open file. */
+static void free_input(input_t *in)
+{
+    free(in->buf);
+    fclose(in->file);
+}
+
+/*
+ * Refills the buffer: discards the bytes before the current lexeme, shifts
+ * the rest to the start of the buffer and reads more data from the file.
+ *
+ * Returns 0 on success, 1 if end of input was already reached, and 2 if the
+ * space before the lexeme is smaller than `need`.
+ */
+static int fill(input_t *in, size_t need)
+{
+    size_t shift, kept;
+
+    if (in->eof) {
+        return 1;
+    }
+
+    /* everything before the lexeme start can be dropped */
+    shift = in->tok - in->buf;
+    if (shift < need) {
+        return 2;
+    }
+
+    kept = in->lim - in->tok;
+    memmove(in->buf, in->tok, kept);
+    in->tok -= shift;
+    in->mar -= shift;
+    in->cur -= shift;
+    in->lim -= shift;
+
+    in->lim += fread(in->lim, 1, shift, in->file);
+    /* a short read marks the end of input: append YYMAXFILL zero bytes as padding */
+    if (in->lim < in->buf + SIZE) {
+        in->eof = 1;
+        memset(in->lim, 0, YYMAXFILL);
+        in->lim += YYMAXFILL;
+    }
+    return 0;
+}
+
+/* Interface between the code generated by re2c and the input_t state. */
+#define YYCTYPE            char
+#define YYCURSOR           in->cur
+#define YYMARKER           in->mar
+#define YYLIMIT            in->lim
+/* a failed refill makes the enclosing function return 2 */
+#define YYFILL(n)          if (fill(in, n) != 0) return 2;
+
+/*
+ * Lexes the input as a sequence of HTTP message heads without submatch tags.
+ *
+ * Returns 0 when the `end` rule (a NUL byte) matches, after storing the
+ * number of matched message heads in *count and their accumulated length in
+ * *total; returns 1 when only the default rule `*` matches; returns 2 when
+ * the fill() call made by YYFILL fails.
+ * With VERIFY defined each matched message head is printed to stdout instead
+ * of being counted.
+ */
+static int lex(input_t *in, long *count, long *total)
+{
+    long c, t;
+
+    c = 0; t = 0;
+loop:
+    /* every match attempt starts a new lexeme at the current position */
+    in->tok = in->cur;
+/*!re2c
+    end            = "\x00";
+    eol            = "\n";
+    sp             = " ";
+    htab           = "\t";
+    ows            = (sp | htab)*;
+    char           = [-._~%!$&'()*+,;=a-zA-Z0-9];
+    tchar          = [-._~%!$&'*+#^`|a-zA-Z0-9];
+    vchar          = [\x1f-\x7e\x80-\xff];
+    scheme         = [-+.a-zA-Z0-9]+;
+    userinfo       = (char | [:])+;
+    host           = (char | [:[\]])+;
+    port           = [0-9]*;
+    path           = (char | [:@/])*;
+    query          = (char | [:@?/])*;
+    obs_fold       = eol (sp | htab)+;
+    field_content  = vchar ((sp | htab)+ vchar)?;
+    header_field   = tchar+ ":" ows (field_content | obs_fold)* ows;
+    authority      = (userinfo "@")? host (":" port)?;
+    absolute_uri   = scheme ":" ("//" (userinfo "@")? host (":" port)?)? path ("?" query)?;
+    origin_form    = "/" path ("?" query)?;
+    http_version   = "HTTP/" [0-9] "." [0-9];
+    request_target = authority | absolute_uri | origin_form | "*";
+    method         = tchar+;
+    request_line   = method sp request_target sp http_version eol;
+    status_code    = [0-9]{3};
+    reason_phrase  = (htab | sp | vchar)*;
+    status_line    = http_version sp status_code sp reason_phrase eol;
+    start_line     = (request_line | status_line);
+    message_head   = start_line (header_field eol)* eol;
+
+    *   { return 1; }
+    end { *count = c; *total = t; return 0; }
+    eol { goto loop; }
+    message_head {
+#ifndef VERIFY
+        c += 1;
+        t += in->cur - in->tok;
+#else
+        printf("%.*s", (int)(in->cur - in->tok), in->tok);
+#endif
+        goto loop;
+    }
+*/
+}
+
+/*
+ * Entry point: lexes the file named by argv[1] and reports the outcome on
+ * stderr according to the value returned by lex().
+ */
+int main(int argc, char **argv)
+{
+    /* lex() stores these only when it returns 0; start from defined values */
+    long count = 0, total = 0;
+    input_t in;
+
+    if (argc < 2) {
+        fprintf(stderr, "usage: %s <input-file>\n", argc > 0 ? argv[0] : "http_simple_notags");
+        return 1;
+    }
+    init_input(&in, argv[1]);
+
+    switch (lex(&in, &count, &total)) {
+        case 0: fprintf(stderr, "ok, parsed %ld HTTPs, peeked %ld chars\n", count, total); break;
+        /* The buffer is not NUL-terminated until end of input is reached, so
+         * bound the excerpt by the end of the valid data instead of using a
+         * bare %s. */
+        case 1: fprintf(stderr, "syntax error: %.*s\n", (int)(in.lim - in.cur), in.cur); break;
+        case 2: fprintf(stderr, "fill failed\n"); break;
+        default: fprintf(stderr, "panic!\n"); break;
+    }
+
+    free_input(&in);
+    return 0;
+}
diff --git a/re2c/benchmarks/uri/gen/gen_uri.hs b/re2c/benchmarks/uri/gen/gen_uri.hs

new file mode 100644 (file)

index 0000000..18e2afe
--- /dev/null
+++ b/re2c/benchmarks/uri/gen/gen_uri.hs
@@ -0,0 +1,111 @@
+{-# LANGUAGE LambdaCase #-}
+
+import qualified Test.QuickCheck as Q
+import qualified Test.QuickCheck.Monadic as QM
+import qualified System.Random as SR
+import qualified System.Process as SP
+import qualified System.Exit as SE
+import qualified Data.ByteString.Char8 as BS
+import           Data.Char (chr)
+import           Data.List (intercalate)
+import           Control.Monad (forM_, when, replicateM)
+import           Data.Foldable (foldlM)
+
+-- | Components of a generated URI. Every component is kept as a plain
+-- string; an empty string means the component is absent (see 'show').
+type Scheme = String
+type User = String
+type Password = String
+type Host = String
+type Port = String
+type Path = String
+type Query = String
+type Fragment = String
+-- | A URI as the ordered list of its components.
+data URI = URI Scheme User Password Host Port Path Query Fragment
+
+-- | Renders the URI as text. Each delimiter is emitted only when the
+-- component that decides it is non-empty: "//" depends on the host, the ":"
+-- after the user on the user, "@" on the password, the ":" before the port on
+-- the port, "?" on the query and "#" on the fragment.
+instance Show URI where
+    show (URI sc us pw ho po pa qu fr) = concat
+        [ sc
+        , ":"
+        , if ho == "" then "" else "//"
+        , us
+        , if us == "" then "" else ":"
+        , pw
+        , if pw == "" then "" else "@"
+        , ho
+        , if po == "" then "" else ":"
+        , po
+        , pa
+        , if qu == "" then "" else "?"
+        , qu
+        , if fr == "" then "" else "#"
+        , fr
+        ]
+
+-- Character sets from which the generators below draw.
+lalpha = ['a'..'z']
+ualpha = ['A'..'Z']
+alpha = lalpha ++ ualpha
+digit = ['0'..'9']
+-- hexadecimal digits in both letter cases
+hexdigit = ['a'..'f'] ++ ['A'..'F'] ++ digit
+alphanum = alpha ++ digit
+
+-- | Non-empty string of lowercase letters.
+lstr :: Q.Gen String
+lstr = Q.listOf1 $ Q.elements lalpha
+
+-- | Non-empty string of letters and digits.
+anstr :: Q.Gen String
+anstr = Q.listOf1 $ Q.elements alphanum
+
+-- | Dotted-quad IPv4 address: four decimal numbers from 0 to 255.
+ipv4 :: Q.Gen String
+ipv4 = intercalate "." . map show <$> replicateM 4 octet
+  where
+    octet = Q.choose (0, 255) :: Q.Gen Int
+
+-- | Bracketed IPv6 literal made of @n@ groups of 1 to 4 hex digits.
+-- With 8 groups the address is written in full; with fewer, a "::" is
+-- inserted after the first @m@ groups.
+ipv6 :: Q.Gen String
+ipv6 = do
+    let hexgroup = do
+            k <- Q.choose (1, 4) :: Q.Gen Int
+            replicateM k (Q.elements hexdigit)
+    n <- Q.choose (0, 8) :: Q.Gen Int
+    -- Clamp the lower bound so that n == 0 asks for the range (0, 0) rather
+    -- than the inverted range (1, 0); the result for n == 0 is "[::]" either way.
+    m <- Q.choose (min 1 n, n) :: Q.Gen Int
+    hh <- replicateM n hexgroup
+    return $ (++ "]") . ("[" ++) $ case n of
+        8 -> intercalate ":" hh
+        _ -> concat [intercalate ":" (take m hh), "::", intercalate ":" (drop m hh)]
+
+-- | Dot-separated host name: at most 5 labels, each a lowercase string of at
+-- most 10 characters.
+hostname :: Q.Gen String
+hostname = intercalate "." . take 5 <$> Q.listOf1 label
+  where
+    label = take 10 <$> lstr
+
+-- | Absolute path of at most 10 lowercase segments, each preceded by "/".
+filepath :: Q.Gen String
+filepath = concatMap ("/" ++) . take 10 <$> Q.listOf1 lstr
+
+-- | Either the string produced by the given generator or the empty string,
+-- chosen by 'Q.oneof'.
+mayb :: Q.Gen String -> Q.Gen String
+mayb gen = Q.oneof [gen, return ""]
+
+-- | Random URIs in three shapes, weighted 1 : 2 : 1: with user, password and
+-- host; with host only; with neither authority nor port.
+instance Q.Arbitrary URI where
+    arbitrary = do
+        -- every component is drawn up front, including those a shape leaves out
+        scheme   <- Q.elements ["http", "https", "ftp", "file"]
+        user     <- lstr
+        password <- anstr
+        host     <- Q.frequency [(10, ipv4), (10, hostname), (1, ipv6)]
+        port     <- mayb $ show <$> (Q.choose (0, 10000) :: Q.Gen Int)
+        path     <- filepath
+        query    <- mayb anstr
+        fragment <- mayb anstr
+
+        Q.frequency
+            [ (1, pure $ URI scheme user  password host port path query fragment)
+            , (2, pure $ URI scheme ""    ""       host port path query fragment)
+            , (1, pure $ URI scheme ""    ""       ""   ""   path query fragment)
+            ]
+
+-- | Truncates "uri.dat", then appends one generated URI per line for every
+-- index in [0..600000], printing the index on every 100th iteration.
+main = do
+    writeFile out ""
+    forM_ [0..600000] step
+  where
+    out = "uri.dat"
+    step i = do
+        when (i `mod` 100 == 0) (print i)
+        uri <- Q.generate (Q.arbitrary :: Q.Gen URI)
+        appendFile out (show uri ++ "\n")
diff --git a/re2c/benchmarks/uri/gen/mk_gen.sh b/re2c/benchmarks/uri/gen/mk_gen.sh

new file mode 100755 (executable)

index 0000000..233725d
--- /dev/null
+++ b/re2c/benchmarks/uri/gen/mk_gen.sh
@@ -0,0 +1 @@
+#!/bin/sh
+# Builds the URI generator with optimisations and all warnings enabled.
+ghc -O2 -Wall gen_uri.hs
diff --git a/re2c/benchmarks/uri/rfc3986/uri_rfc3986.re b/re2c/benchmarks/uri/rfc3986/uri_rfc3986.re

new file mode 100644 (file)

index 0000000..15e358c
--- /dev/null
+++ b/re2c/benchmarks/uri/rfc3986/uri_rfc3986.re
@@ -0,0 +1,178 @@
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+/*!max:re2c*/
+static const size_t SIZE = 4096;
+
+/* Lexer input state: the file being read, the sliding buffer and the
+ * pointers into it, plus the tag variables that re2c generates. */
+typedef struct {
+    FILE *file; /* input stream */
+    char *buf;  /* start of the buffer */
+    char *lim;  /* end of the valid data in the buffer (YYLIMIT) */
+    char *cur;  /* current position (YYCURSOR) */
+    char *mar;  /* backtrack position (YYMARKER) */
+    char *tok;  /* start of the lexeme being matched */
+    /* re2c expands the directive below into one member per tag variable */
+    /*!tags:re2c format = "char *@@;\n"; */
+    int eof;    /* set by fill() once the input is exhausted */
+} input_t;
+
+/*
+ * Opens `fname` for reading and prepares an empty input buffer.
+ *
+ * The buffer holds SIZE bytes plus YYMAXFILL bytes of room for the
+ * end-of-input padding written by fill(). cur, mar and tok all start at lim,
+ * so the first fill() call treats the whole buffer as free space.
+ * Terminates the program if no file name is given, the file cannot be
+ * opened, or the buffer cannot be allocated.
+ */
+static void init_input(input_t *in, const char *fname)
+{
+    if (!fname) {
+        fprintf(stderr, "init_input: no input file given\n");
+        exit(1);
+    }
+    in->file = fopen(fname, "r");
+    if (!in->file) {
+        perror(fname);
+        exit(1);
+    }
+    in->buf = (char*) malloc(SIZE + YYMAXFILL);
+    if (!in->buf) {
+        fprintf(stderr, "init_input: out of memory\n");
+        fclose(in->file);
+        exit(1);
+    }
+    in->lim = in->buf + SIZE;
+    in->cur = in->lim;
+    in->mar = in->lim;
+    in->tok = in->lim;
+    /*!tags:re2c format = "in->@@ = 0;\n"; */
+    in->eof = 0;
+}
+
+/* Tears down what init_input() set up: the open file and the buffer. */
+static void free_input(input_t *in)
+{
+    fclose(in->file);
+    free(in->buf);
+}
+
+/*
+ * Refills the buffer: discards the bytes before the current lexeme, shifts
+ * the rest to the start of the buffer and reads more data from the file.
+ *
+ * Returns 0 on success, 1 if end of input was already reached, and 2 if the
+ * space before the lexeme is smaller than `need`.
+ */
+static int fill(input_t *in, size_t need)
+{
+    /* number of bytes that can be discarded (note: shadows free() in this scope) */
+    size_t free;
+
+    if (in->eof) return 1;
+
+    free = in->tok - in->buf;
+    if (free < need) return 2;
+
+    /* keep everything from the lexeme start onwards and move all pointers with it */
+    memmove(in->buf, in->tok, in->lim - in->tok);
+    in->lim -= free;
+    in->cur -= free;
+    in->mar -= free;
+    in->tok -= free;
+    /* the directive expands to the same adjustment for every non-zero tag variable */
+    /*!tags:re2c format = "if (in->@@) in->@@ -= free;\n"; */
+    in->lim += fread(in->lim, 1, free, in->file);
+    /* a short read marks the end of input: append YYMAXFILL zero bytes as padding */
+    if (in->lim < in->buf + SIZE) {
+        in->eof = 1;
+        memset(in->lim, 0, YYMAXFILL);
+        in->lim += YYMAXFILL;
+    }
+    return 0;
+}
+
+/*
+ * Lexes the input as a sequence of URIs, one per line, recording the
+ * boundaries of each URI component with tags.
+ *
+ * Returns 0 when the `end` rule (a NUL byte) matches, after storing the
+ * number of matched URIs in *count and the accumulated length of the tagged
+ * components in *total; returns 1 when only the default rule `*` matches;
+ * returns 2 when the fill() call configured as YYFILL fails.
+ * With VERIFY defined the components are printed to stdout instead of being
+ * counted.
+ */
+static int lex(input_t *in, long *count, long *total)
+{
+    /* component boundaries set by the @tags of the matched rule */
+    const char
+        *s1, *u1, *h1, *h3, *h5, *r1, *p1, *p3, *q1, *f1,
+        *s2, *u2, *h2, *h4, *h6, *r2, *p2, *p4, *q2, *f2;
+    long c, t;
+
+    c = 0; t = 0;
+loop:
+    /* every match attempt starts a new lexeme at the current position */
+    in->tok = in->cur;
+/*!re2c
+
+    re2c:define:YYCTYPE = char;
+    re2c:define:YYCURSOR = in->cur;
+    re2c:define:YYMARKER = in->mar;
+    re2c:define:YYLIMIT = in->lim;
+    re2c:define:YYFILL = "if (fill(in, @@) != 0) return 2;";
+    re2c:define:YYFILL:naked = 1;
+    re2c:flags:tags = 1;
+    re2c:tags:expression = "in->@@";
+
+    end = "\x00";
+    eol = "\n";
+
+    alpha       = [a-zA-Z];
+    digit       = [0-9];
+    hexdigit    = [0-9a-fA-F];
+    unreserved  = alpha | digit | [-._~];
+    pct_encoded = "%" hexdigit{2};
+    sub_delims  = [!$&'()*+,;=];
+    pchar       = unreserved | pct_encoded | sub_delims | [:@];
+
+    scheme = @s1 alpha (alpha | digit | [-+.])* @s2;
+    userinfo = @u1 (unreserved | pct_encoded | sub_delims | ":")* @u2;
+    dec_octet
+        = digit
+        | [\x31-\x39] digit
+        | "1" digit{2}
+        | "2" [\x30-\x34] digit
+        | "25" [\x30-\x35];
+    ipv4address = dec_octet "." dec_octet "." dec_octet "." dec_octet;
+    h16         = hexdigit{1,4};
+    ls32        = h16 ":" h16 | ipv4address;
+    ipv6address
+        =                            (h16 ":"){6} ls32
+        |                       "::" (h16 ":"){5} ls32
+        | (               h16)? "::" (h16 ":"){4} ls32
+        | ((h16 ":"){0,1} h16)? "::" (h16 ":"){3} ls32
+        | ((h16 ":"){0,2} h16)? "::" (h16 ":"){2} ls32
+        | ((h16 ":"){0,3} h16)? "::"  h16 ":"     ls32
+        | ((h16 ":"){0,4} h16)? "::"              ls32
+        | ((h16 ":"){0,5} h16)? "::"              h16
+        | ((h16 ":"){0,6} h16)? "::";
+    ipvfuture   = "v" hexdigit+ "." (unreserved | sub_delims | ":" )+;
+    ip_literal  = "[" ( ipv6address | ipvfuture ) "]";
+    reg_name    = (unreserved | pct_encoded | sub_delims)*;
+    host
+        = @h1 ip_literal  @h2
+        | @h3 ipv4address @h4
+        | @h5 reg_name    @h6;
+    port      = @r1 digit* @r2;
+    authority = (userinfo "@")? host (":" port)?;
+    path_abempty  = ("/" pchar*)*;
+    path_absolute = "/" (pchar+ ("/" pchar*)*)?;
+    path_rootless = pchar+ ("/" pchar*)*;
+    path_empty    = "";
+    hier_part
+        = "//" authority @p1 path_abempty @p2
+        | @p3 (path_absolute | path_rootless | path_empty) @p4;
+    query    = @q1 (pchar | [/?])* @q2;
+    fragment = @f1 (pchar | [/?])* @f2;
+    uri = scheme ":" hier_part ("?" query)? ("#" fragment)?;
+
+    *   { return 1; }
+    end { *count = c; *total = t; return 0; }
+    eol { goto loop; }
+    uri {
+#ifndef VERIFY
+        c += 1;
+        t += (s2 - s1) + (u2 - u1) + (h2 - h1) + (h4 - h3)
+            + (h6 - h5) + (r2 - r1) + (p2 - p1) + (p4 - p3)
+            + (q2 - q1) + (f2 - f1);
+#else
+                printf("%.*s:", (int)(s2 - s1), s1);
+        if (p1) printf("//");
+        if (u1) printf("%.*s@", (int)(u2 - u1), u1);
+        if (h1) printf("%.*s",  (int)(h2 - h1), h1);
+        if (h3) printf("%.*s",  (int)(h4 - h3), h3);
+        if (h5) printf("%.*s",  (int)(h6 - h5), h5);
+        if (r1) printf(":%.*s", (int)(r2 - r1), r1);
+        if (p1) printf("%.*s",  (int)(p2 - p1), p1);
+        if (p3) printf("%.*s",  (int)(p4 - p3), p3);
+        if (q1) printf("?%.*s", (int)(q2 - q1), q1);
+        if (f1) printf("#%.*s", (int)(f2 - f1), f1);
+        printf("\n");
+#endif
+        goto loop;
+    }
+*/
+}
+
+/*
+ * Entry point: lexes the file named by argv[1] and reports the outcome on
+ * stderr according to the value returned by lex().
+ */
+int main(int argc, char **argv)
+{
+    /* lex() stores these only when it returns 0; start from defined values */
+    long count = 0, total = 0;
+    input_t in;
+
+    if (argc < 2) {
+        fprintf(stderr, "usage: %s <input-file>\n", argc > 0 ? argv[0] : "uri_rfc3986");
+        return 1;
+    }
+    init_input(&in, argv[1]);
+
+    switch (lex(&in, &count, &total)) {
+        case 0: fprintf(stderr, "ok, parsed %ld URIs, peeked %ld chars\n", count, total); break;
+        case 1: fprintf(stderr, "syntax error\n"); break;
+        case 2: fprintf(stderr, "yyfill failed\n"); break;
+        default: fprintf(stderr, "panic!\n"); break;
+    }
+
+    free_input(&in);
+    return 0;
+}
diff --git a/re2c/benchmarks/uri/rfc3986/uri_rfc3986_notags.re b/re2c/benchmarks/uri/rfc3986/uri_rfc3986_notags.re

new file mode 100644 (file)

index 0000000..4008e7c
--- /dev/null
+++ b/re2c/benchmarks/uri/rfc3986/uri_rfc3986_notags.re
@@ -0,0 +1,153 @@
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+/*!max:re2c*/
+static const size_t SIZE = 4096;
+
+/* Lexer input state: the file being read, the sliding buffer and the
+ * pointers into it. */
+typedef struct {
+    FILE *file; /* input stream */
+    char *buf;  /* start of the buffer */
+    char *lim;  /* end of the valid data in the buffer (YYLIMIT) */
+    char *cur;  /* current position (YYCURSOR) */
+    char *mar;  /* backtrack position (YYMARKER) */
+    char *tok;  /* start of the lexeme being matched */
+    int eof;    /* set by fill() once the input is exhausted */
+} input_t;
+
+/*
+ * Opens `fname` for reading and prepares an empty input buffer.
+ *
+ * The buffer holds SIZE bytes plus YYMAXFILL bytes of room for the
+ * end-of-input padding written by fill(). cur, mar and tok all start at lim,
+ * so the first fill() call treats the whole buffer as free space.
+ * Terminates the program if no file name is given, the file cannot be
+ * opened, or the buffer cannot be allocated.
+ */
+static void init_input(input_t *in, const char *fname)
+{
+    if (!fname) {
+        fprintf(stderr, "init_input: no input file given\n");
+        exit(1);
+    }
+    in->file = fopen(fname, "r");
+    if (!in->file) {
+        perror(fname);
+        exit(1);
+    }
+    in->buf = (char*) malloc(SIZE + YYMAXFILL);
+    if (!in->buf) {
+        fprintf(stderr, "init_input: out of memory\n");
+        fclose(in->file);
+        exit(1);
+    }
+    in->lim = in->buf + SIZE;
+    in->cur = in->lim;
+    in->mar = in->lim;
+    in->tok = in->lim;
+    in->eof = 0;
+}
+
+/* Tears down what init_input() set up: the open file and the buffer. */
+static void free_input(input_t *in)
+{
+    fclose(in->file);
+    free(in->buf);
+}
+
+/*
+ * Refills the buffer: discards the bytes before the current lexeme, shifts
+ * the rest to the start of the buffer and reads more data from the file.
+ *
+ * Returns 0 on success, 1 if end of input was already reached, and 2 if the
+ * space before the lexeme is smaller than `need`.
+ */
+static int fill(input_t *in, size_t need)
+{
+    size_t shift, kept;
+
+    if (in->eof) {
+        return 1;
+    }
+
+    /* everything before the lexeme start can be dropped */
+    shift = in->tok - in->buf;
+    if (shift < need) {
+        return 2;
+    }
+
+    kept = in->lim - in->tok;
+    memmove(in->buf, in->tok, kept);
+    in->tok -= shift;
+    in->mar -= shift;
+    in->cur -= shift;
+    in->lim -= shift;
+
+    in->lim += fread(in->lim, 1, shift, in->file);
+    /* a short read marks the end of input: append YYMAXFILL zero bytes as padding */
+    if (in->lim < in->buf + SIZE) {
+        in->eof = 1;
+        memset(in->lim, 0, YYMAXFILL);
+        in->lim += YYMAXFILL;
+    }
+    return 0;
+}
+
+/*
+ * Lexes the input as a sequence of URIs, one per line, without submatch tags.
+ *
+ * Returns 0 when the `end` rule (a NUL byte) matches, after storing the
+ * number of matched URIs in *count and their accumulated length in *total;
+ * returns 1 when only the default rule `*` matches; returns 2 when the
+ * fill() call configured as YYFILL fails.
+ * With VERIFY defined each matched URI is printed to stdout instead of being
+ * counted.
+ */
+static int lex(input_t *in, long *count, long *total)
+{
+    long c, t;
+
+    c = 0; t = 0;
+loop:
+    /* every match attempt starts a new lexeme at the current position */
+    in->tok = in->cur;
+/*!re2c
+    re2c:define:YYCTYPE = char;
+    re2c:define:YYCURSOR = in->cur;
+    re2c:define:YYMARKER = in->mar;
+    re2c:define:YYLIMIT = in->lim;
+    re2c:define:YYFILL = "if (fill(in, @@) != 0) return 2;";
+    re2c:define:YYFILL:naked = 1;
+
+    end = "\x00";
+    eol = "\n";
+
+    alpha         = [a-zA-Z];
+    digit         = [0-9];
+    hexdigit      = [0-9a-fA-F];
+    unreserved    = alpha | digit | [-._~];
+    pct_encoded   = "%" hexdigit{2};
+    sub_delims    = [!$&'()*+,;=];
+    pchar         = unreserved | pct_encoded | sub_delims | [:@];
+
+    scheme        = alpha (alpha | digit | [-+.])*;
+    userinfo      = (unreserved | pct_encoded | sub_delims | ":")*;
+    dec_octet
+        = digit
+        | [\x31-\x39] digit
+        | "1" digit{2}
+        | "2" [\x30-\x34] digit
+        | "25" [\x30-\x35];
+    ipv4address   = dec_octet "." dec_octet "." dec_octet "." dec_octet;
+    h16           = hexdigit{1,4};
+    ls32          = h16 ":" h16 | ipv4address;
+    ipv6address
+        =                            (h16 ":"){6} ls32
+        |                       "::" (h16 ":"){5} ls32
+        | (               h16)? "::" (h16 ":"){4} ls32
+        | ((h16 ":"){0,1} h16)? "::" (h16 ":"){3} ls32
+        | ((h16 ":"){0,2} h16)? "::" (h16 ":"){2} ls32
+        | ((h16 ":"){0,3} h16)? "::"  h16 ":"     ls32
+        | ((h16 ":"){0,4} h16)? "::"              ls32
+        | ((h16 ":"){0,5} h16)? "::"              h16
+        | ((h16 ":"){0,6} h16)? "::";
+    ipvfuture     = "v" hexdigit+ "." (unreserved | sub_delims | ":" )+;
+    ip_literal    = "[" ( ipv6address | ipvfuture ) "]";
+    reg_name      = (unreserved | pct_encoded | sub_delims)*;
+    host          = ip_literal | ipv4address | reg_name;
+    port          = digit*;
+    authority     = (userinfo "@")? host (":" port)?;
+    path_abempty  = ("/" pchar*)*;
+    path_absolute = "/" (pchar+ ("/" pchar*)*)?;
+    path_rootless = pchar+ ("/" pchar*)*;
+    path_empty    = "";
+    hier_part
+        = "//" authority path_abempty
+        | (path_absolute | path_rootless | path_empty);
+    query         = (pchar | [/?])*;
+    fragment      = (pchar | [/?])*;
+    uri           = scheme ":" hier_part ("?" query)? ("#" fragment)?;
+
+    *   { return 1; }
+    end { *count = c; *total = t; return 0; }
+    eol { goto loop; }
+    uri {
+#ifndef VERIFY
+        c += 1;
+        t += in->cur - in->tok;
+#else
+        printf("%.*s\n", (int)(in->cur - in->tok), in->tok);
+#endif
+        goto loop;
+    }
+*/
+}
+
+/*
+ * Entry point: lexes the file named by argv[1] and reports the outcome on
+ * stderr according to the value returned by lex().
+ */
+int main(int argc, char **argv)
+{
+    /* lex() stores these only when it returns 0; start from defined values */
+    long count = 0, total = 0;
+    input_t in;
+
+    if (argc < 2) {
+        fprintf(stderr, "usage: %s <input-file>\n", argc > 0 ? argv[0] : "uri_rfc3986_notags");
+        return 1;
+    }
+    init_input(&in, argv[1]);
+
+    switch (lex(&in, &count, &total)) {
+        case 0: fprintf(stderr, "ok, parsed %ld URIs, peeked %ld chars\n", count, total); break;
+        case 1: fprintf(stderr, "syntax error\n"); break;
+        case 2: fprintf(stderr, "yyfill failed\n"); break;
+        default: fprintf(stderr, "panic!\n"); break;
+    }
+
+    free_input(&in);
+    return 0;
+}
diff --git a/re2c/benchmarks/uri/simple/uri_simple.re b/re2c/benchmarks/uri/simple/uri_simple.re

new file mode 100644 (file)

index 0000000..63a9ea8
--- /dev/null
+++ b/re2c/benchmarks/uri/simple/uri_simple.re
@@ -0,0 +1,134 @@
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+/*!max:re2c*/
+static const size_t SIZE = 4096;
+
+/* Lexer input state: the file being read, the sliding buffer and the
+ * pointers into it, plus the tag variables that re2c generates. */
+typedef struct {
+    FILE *file; /* input stream */
+    char *buf;  /* start of the buffer */
+    char *lim;  /* end of the valid data in the buffer (YYLIMIT) */
+    char *cur;  /* current position (YYCURSOR) */
+    char *mar;  /* backtrack position (YYMARKER) */
+    char *tok;  /* start of the lexeme being matched */
+    /* re2c expands the directive below into one member per tag variable */
+    /*!tags:re2c format = "char *@@;\n"; */
+    int eof;    /* set by fill() once the input is exhausted */
+} input_t;
+
+/*
+ * Opens `fname` for reading and prepares an empty input buffer.
+ *
+ * The buffer holds SIZE bytes plus YYMAXFILL bytes of room for the
+ * end-of-input padding written by fill(). cur, mar and tok all start at lim,
+ * so the first fill() call treats the whole buffer as free space.
+ * Terminates the program if no file name is given, the file cannot be
+ * opened, or the buffer cannot be allocated.
+ */
+static void init_input(input_t *in, const char *fname)
+{
+    if (!fname) {
+        fprintf(stderr, "init_input: no input file given\n");
+        exit(1);
+    }
+    in->file = fopen(fname, "r");
+    if (!in->file) {
+        perror(fname);
+        exit(1);
+    }
+    in->buf = (char*) malloc(SIZE + YYMAXFILL);
+    if (!in->buf) {
+        fprintf(stderr, "init_input: out of memory\n");
+        fclose(in->file);
+        exit(1);
+    }
+    in->lim = in->buf + SIZE;
+    in->cur = in->lim;
+    in->mar = in->lim;
+    in->tok = in->lim;
+    /*!tags:re2c format = "in->@@ = 0;\n"; */
+    in->eof = 0;
+}
+
+/* Tears down what init_input() set up: the open file and the buffer. */
+static void free_input(input_t *in)
+{
+    fclose(in->file);
+    free(in->buf);
+}
+
+/*
+ * Refills the buffer: discards the bytes before the current lexeme, shifts
+ * the rest to the start of the buffer and reads more data from the file.
+ *
+ * Returns 0 on success, 1 if end of input was already reached, and 2 if the
+ * space before the lexeme is smaller than `need`.
+ */
+static int fill(input_t *in, size_t need)
+{
+    /* number of bytes that can be discarded (note: shadows free() in this scope) */
+    size_t free;
+
+    if (in->eof) return 1;
+
+    free = in->tok - in->buf;
+    if (free < need) return 2;
+
+    /* keep everything from the lexeme start onwards and move all pointers with it */
+    memmove(in->buf, in->tok, in->lim - in->tok);
+    in->lim -= free;
+    in->cur -= free;
+    in->mar -= free;
+    in->tok -= free;
+    /* the directive expands to the same adjustment for every non-zero tag variable */
+    /*!tags:re2c format = "if (in->@@) in->@@ -= free;\n"; */
+    in->lim += fread(in->lim, 1, free, in->file);
+    /* a short read marks the end of input: append YYMAXFILL zero bytes as padding */
+    if (in->lim < in->buf + SIZE) {
+        in->eof = 1;
+        memset(in->lim, 0, YYMAXFILL);
+        in->lim += YYMAXFILL;
+    }
+    return 0;
+}
+
+/*
+ * Lexes the input as a sequence of URIs, one per line, using a simplified
+ * grammar and recording the boundaries of each component with tags.
+ *
+ * Returns 0 when the `end` rule (a NUL byte) matches, after storing the
+ * number of matched URIs in *count and the accumulated length of the tagged
+ * components in *total; returns 1 when only the default rule `*` matches;
+ * returns 2 when the fill() call configured as YYFILL fails.
+ * With VERIFY defined the components are printed to stdout instead of being
+ * counted.
+ */
+static int lex(input_t *in, long *count, long *total)
+{
+    /* component boundaries set by the @tags of the matched rule */
+    const char
+        *s1, *u1, *h1, *r1, *p1, *q1, *f1,
+        *s2, *u2, *h2, *r2, *p2, *q2, *f2;
+    long c, t;
+
+    c = 0; t = 0;
+loop:
+    /* every match attempt starts a new lexeme at the current position */
+    in->tok = in->cur;
+/*!re2c
+
+    re2c:define:YYCTYPE = char;
+    re2c:define:YYCURSOR = in->cur;
+    re2c:define:YYMARKER = in->mar;
+    re2c:define:YYLIMIT = in->lim;
+    re2c:define:YYFILL = "if (fill(in, @@) != 0) return 2;";
+    re2c:define:YYFILL:naked = 1;
+    re2c:flags:tags = 1;
+    re2c:tags:expression = "in->@@";
+
+    end       = "\x00";
+    eol       = "\n";
+    char      = [-._~%!$&'()*+,;=a-zA-Z0-9];
+    scheme    = @s1 [-+.a-zA-Z0-9]+ @s2;
+    userinfo  = @u1 (char | [:])+ @u2;
+    host      = @h1 (char | [:[\]])+ @h2;
+    port      = @r1 [0-9]* @r2;
+    path      = @p1 (char | [:@/])* @p2;
+    query     = @q1 (char | [:@?/])* @q2;
+    fragment  = @f1 (char | [:@?/])* @f2;
+    uri       = scheme ":" ("//" (userinfo "@")? host (":" port)?)? path ("?" query)? ("#" fragment)?;
+
+    *   { return 1; }
+    end { *count = c; *total = t; return 0; }
+    eol { goto loop; }
+    uri {
+#ifndef VERIFY
+        c += 1;
+        t += (s2 - s1) + (u2 - u1) + (h2 - h1)
+            + (r2 - r1) + (p2 - p1) + (q2 - q1) + (f2 - f1);
+#else
+                printf("%.*s:", (int)(s2 - s1), s1);
+        if (h1) printf("//");
+        if (u1) printf("%.*s@", (int)(u2 - u1), u1);
+        if (h1) printf("%.*s",  (int)(h2 - h1), h1);
+        if (r1) printf(":%.*s", (int)(r2 - r1), r1);
+                printf("%.*s",  (int)(p2 - p1), p1);
+        if (q1) printf("?%.*s", (int)(q2 - q1), q1);
+        if (f1) printf("#%.*s", (int)(f2 - f1), f1);
+        printf("\n");
+#endif
+        goto loop;
+    }
+*/
+}
+
+/*
+ * Entry point: lexes the file named by argv[1] and reports the outcome on
+ * stderr according to the value returned by lex().
+ */
+int main(int argc, char **argv)
+{
+    /* lex() stores these only when it returns 0; start from defined values */
+    long count = 0, total = 0;
+    input_t in;
+
+    if (argc < 2) {
+        fprintf(stderr, "usage: %s <input-file>\n", argc > 0 ? argv[0] : "uri_simple");
+        return 1;
+    }
+    init_input(&in, argv[1]);
+
+    switch (lex(&in, &count, &total)) {
+        case 0: fprintf(stderr, "ok, parsed %ld URIs, peeked %ld chars\n", count, total); break;
+        case 1: fprintf(stderr, "syntax error\n"); break;
+        case 2: fprintf(stderr, "yyfill failed\n"); break;
+        default: fprintf(stderr, "panic!\n"); break;
+    }
+
+    free_input(&in);
+    return 0;
+}
diff --git a/re2c/benchmarks/uri/simple/uri_simple_notags.re b/re2c/benchmarks/uri/simple/uri_simple_notags.re

new file mode 100644 (file)

index 0000000..ed551da
--- /dev/null
+++ b/re2c/benchmarks/uri/simple/uri_simple_notags.re
@@ -0,0 +1,116 @@
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+/*!max:re2c*/
+static const size_t SIZE = 4096;
+
+/* Lexer input state: the file being read, the sliding buffer and the
+ * pointers into it. */
+typedef struct {
+    FILE *file; /* input stream */
+    char *buf;  /* start of the buffer */
+    char *lim;  /* end of the valid data in the buffer (YYLIMIT) */
+    char *cur;  /* current position (YYCURSOR) */
+    char *mar;  /* backtrack position (YYMARKER) */
+    char *tok;  /* start of the lexeme being matched */
+    int eof;    /* set by fill() once the input is exhausted */
+} input_t;
+
+/*
+ * Opens `fname` for reading and prepares an empty input buffer.
+ *
+ * The buffer holds SIZE bytes plus YYMAXFILL bytes of room for the
+ * end-of-input padding written by fill(). cur, mar and tok all start at lim,
+ * so the first fill() call treats the whole buffer as free space.
+ * Terminates the program if no file name is given, the file cannot be
+ * opened, or the buffer cannot be allocated.
+ */
+static void init_input(input_t *in, const char *fname)
+{
+    if (!fname) {
+        fprintf(stderr, "init_input: no input file given\n");
+        exit(1);
+    }
+    in->file = fopen(fname, "r");
+    if (!in->file) {
+        perror(fname);
+        exit(1);
+    }
+    in->buf = (char*) malloc(SIZE + YYMAXFILL);
+    if (!in->buf) {
+        fprintf(stderr, "init_input: out of memory\n");
+        fclose(in->file);
+        exit(1);
+    }
+    in->lim = in->buf + SIZE;
+    in->cur = in->lim;
+    in->mar = in->lim;
+    in->tok = in->lim;
+    in->eof = 0;
+}
+
+/* Tears down what init_input() set up: the open file and the buffer. */
+static void free_input(input_t *in)
+{
+    fclose(in->file);
+    free(in->buf);
+}
+
+/*
+ * Refills the buffer: discards the bytes before the current lexeme, shifts
+ * the rest to the start of the buffer and reads more data from the file.
+ *
+ * Returns 0 on success, 1 if end of input was already reached, and 2 if the
+ * space before the lexeme is smaller than `need`.
+ */
+static int fill(input_t *in, size_t need)
+{
+    size_t shift, kept;
+
+    if (in->eof) {
+        return 1;
+    }
+
+    /* everything before the lexeme start can be dropped */
+    shift = in->tok - in->buf;
+    if (shift < need) {
+        return 2;
+    }
+
+    kept = in->lim - in->tok;
+    memmove(in->buf, in->tok, kept);
+    in->tok -= shift;
+    in->mar -= shift;
+    in->cur -= shift;
+    in->lim -= shift;
+
+    in->lim += fread(in->lim, 1, shift, in->file);
+    /* a short read marks the end of input: append YYMAXFILL zero bytes as padding */
+    if (in->lim < in->buf + SIZE) {
+        in->eof = 1;
+        memset(in->lim, 0, YYMAXFILL);
+        in->lim += YYMAXFILL;
+    }
+    return 0;
+}
+
+/*
+ * Lexes the input as a sequence of URIs, one per line, using a simplified
+ * grammar without submatch tags.
+ *
+ * Returns 0 when the `end` rule (a NUL byte) matches, after storing the
+ * number of matched URIs in *count and their accumulated length in *total;
+ * returns 1 when only the default rule `*` matches; returns 2 when the
+ * fill() call configured as YYFILL fails.
+ * With VERIFY defined each matched URI is printed to stdout instead of being
+ * counted.
+ */
+static int lex(input_t *in, long *count, long *total)
+{
+    long c, t;
+
+    c = 0; t = 0;
+loop:
+    /* every match attempt starts a new lexeme at the current position */
+    in->tok = in->cur;
+/*!re2c
+    re2c:define:YYCTYPE = char;
+    re2c:define:YYCURSOR = in->cur;
+    re2c:define:YYMARKER = in->mar;
+    re2c:define:YYLIMIT = in->lim;
+    re2c:define:YYFILL = "if (fill(in, @@) != 0) return 2;";
+    re2c:define:YYFILL:naked = 1;
+
+    end       = "\x00";
+    eol       = "\n";
+    char      = [-._~%!$&'()*+,;=a-zA-Z0-9];
+    scheme    = [-+.a-zA-Z0-9]+;
+    userinfo  = (char | [:])+;
+    host      = (char | [:[\]])+;
+    port      = [0-9]*;
+    path      = (char | [:@/])*;
+    query     = (char | [:@?/])*;
+    fragment  = (char | [:@?/])*;
+    uri       = scheme ":" ("//" (userinfo "@")? host (":" port)?)? path ("?" query)? ("#" fragment)?;
+
+    *   { return 1; }
+    end { *count = c; *total = t; return 0; }
+    eol { goto loop; }
+    uri {
+#ifndef VERIFY
+        c += 1;
+        t += in->cur - in->tok;
+#else
+        printf("%.*s\n", (int)(in->cur - in->tok), in->tok);
+#endif
+        goto loop;
+    }
+*/
+}
+
+/*
+ * Entry point: lexes the file named by argv[1] and reports the outcome on
+ * stderr according to the value returned by lex().
+ */
+int main(int argc, char **argv)
+{
+    /* lex() stores these only when it returns 0; start from defined values */
+    long count = 0, total = 0;
+    input_t in;
+
+    if (argc < 2) {
+        fprintf(stderr, "usage: %s <input-file>\n", argc > 0 ? argv[0] : "uri_simple_notags");
+        return 1;
+    }
+    init_input(&in, argv[1]);
+
+    switch (lex(&in, &count, &total)) {
+        case 0: fprintf(stderr, "ok, parsed %ld URIs, peeked %ld chars\n", count, total); break;
+        case 1: fprintf(stderr, "syntax error\n"); break;
+        case 2: fprintf(stderr, "yyfill failed\n"); break;
+        default: fprintf(stderr, "panic!\n"); break;
+    }
+
+    free_input(&in);
+    return 0;
+}
diff --git a/re2c/doc/tdfa/img/bench/__mk.sh b/re2c/doc/tdfa/img/bench/__mk.sh

new file mode 100755 (executable)

index 0000000..71fddd7
--- /dev/null
+++ b/re2c/doc/tdfa/img/bench/__mk.sh
@@ -0,0 +1,12 @@
+gnuplot plot.gnuplot
+
+montage size_gcc.png size_clang.png -tile 2x1 -geometry +0+0 size_gcc_clang.png
+montage size_tcc.png size_pcc.png   -tile 2x1 -geometry +0+0 size_tcc_pcc.png
+montage time_gcc.png time_clang.png -tile 2x1 -geometry +0+0 time_gcc_clang.png
+montage time_tcc.png time_pcc.png   -tile 2x1 -geometry +0+0 time_tcc_pcc.png
+
+for f in *.png; do convert $f $f; done
+
+rm size_{gcc,clang,tcc,pcc}.png
+rm time_{gcc,clang,tcc,pcc}.png
+
diff --git a/re2c/doc/tdfa/img/bench/plot.gnuplot b/re2c/doc/tdfa/img/bench/plot.gnuplot

new file mode 100644 (file)

index 0000000..23959f5
--- /dev/null
+++ b/re2c/doc/tdfa/img/bench/plot.gnuplot
@@ -0,0 +1,118 @@
+# Common setup for all benchmark plots: PNG output, one x tic per benchmark
+# program, and one line style per re2c configuration.
+set terminal pngcairo dashed font "Courier,mono"
+# tic 1 is the RFC 7230 (HTTP/1.1) parser benchmark
+set xtics ("HTTP-RFC7230" 1, "HTTP-simple" 2, "URI-RFC3986" 3, "URI-simple" 4)
+
+set style line 8 lc rgb '#888888' lw 3 lt 1    # TDFA(0)
+set style line 1 lc rgb '#888888' lw 3 lt 2    # TDFA(1)
+set style line 2 lc rgb '#888888' lw 3 lt 3    # DFA
+set style line 3 lc rgb '#000000' lw 1 lt 1    # TDFA(0) -b
+set style line 4 lc rgb '#000000' lw 1 lt 2    # TDFA(1) -b
+set style line 5 lc rgb '#000000' lw 1 lt 3    # DFA -b
+set style line 6 lc rgb '#dddddd' lw 5 lt 1    # TDFA(0) --no-optimize-tags
+set style line 7 lc rgb '#dddddd' lw 5 lt 2    # TDFA(1) --no-optimize-tags
+
+# binary sizes are plotted on a base-2 logarithmic y axis
+set logscale y 2
+set ylabel "binary size (K, logscale)"
+
+# Binary size plots, one image per C compiler.
+# Every curve reads one data set ("index N") of the file "data": column 1 is
+# the x position, columns 6..9 are plotted for gcc, clang, tcc and pcc.
+# Data set numbering, as given by the titles of the gcc plot:
+#   0 TDFA(0), 1 TDFA(1), 2 DFA, 3 TDFA(0) -b, 4 TDFA(1) -b, 5 DFA -b,
+#   6 TDFA(0) --no-optimize-tags, 7 TDFA(1) --no-optimize-tags
+# Only the gcc plot carries a legend; the others use notitle.
+set output 'size_gcc.png'
+set title "gcc"
+plot \
+     "data" index 6 using 1:6 ls 6 with lines title "TDFA(0) --no-optimize-tags", \
+     "data" index 7 using 1:6 ls 7 with lines title "TDFA(1) --no-optimize-tags", \
+     "data" index 3 using 1:6 ls 3 with lines title "TDFA(0) -b", \
+     "data" index 4 using 1:6 ls 4 with lines title "TDFA(1) -b", \
+     "data" index 5 using 1:6 ls 5 with lines title "DFA -b", \
+     "data" index 0 using 1:6 ls 8 with lines title "TDFA(0)", \
+     "data" index 1 using 1:6 ls 1 with lines title "TDFA(1)", \
+     "data" index 2 using 1:6 ls 2 with lines title "DFA"
+
+# clang: column 7
+set output 'size_clang.png'
+set title "clang"
+plot \
+     "data" index 6 using 1:7 ls 6 with lines notitle, \
+     "data" index 7 using 1:7 ls 7 with lines notitle, \
+     "data" index 3 using 1:7 ls 3 with lines notitle, \
+     "data" index 4 using 1:7 ls 4 with lines notitle, \
+     "data" index 5 using 1:7 ls 5 with lines notitle, \
+     "data" index 0 using 1:7 ls 8 with lines notitle, \
+     "data" index 1 using 1:7 ls 1 with lines notitle, \
+     "data" index 2 using 1:7 ls 2 with lines notitle
+
+# tcc: column 8
+set output 'size_tcc.png'
+set title "tcc"
+plot \
+     "data" index 6 using 1:8 ls 6 with lines notitle, \
+     "data" index 7 using 1:8 ls 7 with lines notitle, \
+     "data" index 3 using 1:8 ls 3 with lines notitle, \
+     "data" index 4 using 1:8 ls 4 with lines notitle, \
+     "data" index 5 using 1:8 ls 5 with lines notitle, \
+     "data" index 0 using 1:8 ls 8 with lines notitle, \
+     "data" index 1 using 1:8 ls 1 with lines notitle, \
+     "data" index 2 using 1:8 ls 2 with lines notitle
+
+# pcc: column 9
+set output 'size_pcc.png'
+set title "pcc"
+plot \
+     "data" index 6 using 1:9 ls 6 with lines notitle, \
+     "data" index 7 using 1:9 ls 7 with lines notitle, \
+     "data" index 3 using 1:9 ls 3 with lines notitle, \
+     "data" index 4 using 1:9 ls 4 with lines notitle, \
+     "data" index 5 using 1:9 ls 5 with lines notitle, \
+     "data" index 0 using 1:9 ls 8 with lines notitle, \
+     "data" index 1 using 1:9 ls 1 with lines notitle, \
+     "data" index 2 using 1:9 ls 2 with lines notitle
+
+
+
+
+# Run time plots, one image per C compiler, on a linear y axis.
+# Same data sets and line styles as the size plots; columns 10..13 of the
+# file "data" are plotted for gcc, clang, tcc and pcc.
+unset logscale
+set ylabel "run time (s)"
+# y axis starts at 0; the upper end is autoscaled but constrained by the
+# bound 16 (see gnuplot's autoscale-bounds syntax for the exact meaning)
+set yrange [0:16<*]
+
+# gcc: column 10 (the only time plot with a legend)
+set output 'time_gcc.png'
+set title "gcc"
+plot \
+     "data" index 6 using 1:10 ls 6 with lines title "TDFA(0) --no-optimize-tags", \
+     "data" index 7 using 1:10 ls 7 with lines title "TDFA(1) --no-optimize-tags", \
+     "data" index 3 using 1:10 ls 3 with lines title "TDFA(0) -b", \
+     "data" index 4 using 1:10 ls 4 with lines title "TDFA(1) -b", \
+     "data" index 5 using 1:10 ls 5 with lines title "DFA -b", \
+     "data" index 0 using 1:10 ls 8 with lines title "TDFA(0)", \
+     "data" index 1 using 1:10 ls 1 with lines title "TDFA(1)", \
+     "data" index 2 using 1:10 ls 2 with lines title "DFA"
+
+# clang: column 11
+set output 'time_clang.png'
+set title "clang"
+plot \
+     "data" index 6 using 1:11 ls 6 with lines notitle, \
+     "data" index 7 using 1:11 ls 7 with lines notitle, \
+     "data" index 3 using 1:11 ls 3 with lines notitle, \
+     "data" index 4 using 1:11 ls 4 with lines notitle, \
+     "data" index 5 using 1:11 ls 5 with lines notitle, \
+     "data" index 0 using 1:11 ls 8 with lines notitle, \
+     "data" index 1 using 1:11 ls 1 with lines notitle, \
+     "data" index 2 using 1:11 ls 2 with lines notitle
+
+# tcc: column 12
+set output 'time_tcc.png'
+set title "tcc"
+plot \
+     "data" index 6 using 1:12 ls 6 with lines notitle, \
+     "data" index 7 using 1:12 ls 7 with lines notitle, \
+     "data" index 3 using 1:12 ls 3 with lines notitle, \
+     "data" index 4 using 1:12 ls 4 with lines notitle, \
+     "data" index 5 using 1:12 ls 5 with lines notitle, \
+     "data" index 0 using 1:12 ls 8 with lines notitle, \
+     "data" index 1 using 1:12 ls 1 with lines notitle, \
+     "data" index 2 using 1:12 ls 2 with lines notitle
+
+# pcc: column 13
+set output 'time_pcc.png'
+set title "pcc"
+plot \
+     "data" index 6 using 1:13 ls 6 with lines notitle, \
+     "data" index 7 using 1:13 ls 7 with lines notitle, \
+     "data" index 3 using 1:13 ls 3 with lines notitle, \
+     "data" index 4 using 1:13 ls 4 with lines notitle, \
+     "data" index 5 using 1:13 ls 5 with lines notitle, \
+     "data" index 0 using 1:13 ls 8 with lines notitle, \
+     "data" index 1 using 1:13 ls 1 with lines notitle, \
+     "data" index 2 using 1:13 ls 2 with lines notitle
+
diff --git a/re2c/doc/tdfa/tdfa.tex b/re2c/doc/tdfa/tdfa.tex

index 6a648f79dd868ea2b2296ebbc8bb05c51153dfce..3bf31441dd63959ebd50401862fcfe4cccbc5293 100644 (file)
--- a/re2c/doc/tdfa/tdfa.tex
+++ b/re2c/doc/tdfa/tdfa.tex
@@ -74,8 +74,11 @@
  \noindent
  This paper extends the work of Laurikari [Lau00] [Lau01] and Kuklewicz [Kuk??] on tagged deterministic finite automata (TDFA)
  in connection with submatch extraction in regular expressions.
-I augment TDFA with 1-symbol lookahead, which results in significant reduction of tag variables and operations.
-Lookahead-aware automata may have slightly more states, but they are more amenable to optimizations and as a rule result in both smaller and faster code.
+The main goal of this work is application of TDFA to lexer generators that optimize for speed of the generated code.
+I suggest a number of practical improvements to Laurikari algorithm;
+notably, the use of 1-symbol lookahead, which results in significant reduction of tag variables and operations.
+Experimental results confirm that lookahead-aware TDFA are considerably faster and usually smaller than baseline TDFA;
+and they are reasonably close in speed and size to canonical DFA used for recognition.
  The proposed algorithm can handle repeated submatch and therefore is applicable to full parsing.
  Furthermore, I consider two disambiguation policies: leftmost greedy and POSIX.
  I formalize the algorithm suggested by Kuklewicz
@@ -88,10 +91,10 @@ All discussed models and algorithms are implemented in the open source lexer gen
  
  \section*{Introduction}
  
-RE2C [Bum94] [web??] is a lexer generator for C: it compiles regular expressions into code.
+RE2C [Bum94] [web??] is a lexer generator for C: it compiles regular expressions to C code.
  Unlike regular expression libraries such as TRE [Lau01] or RE2 [Cox??], RE2C has no restriction on preprocessing time
  and concentrates fully on the quality of generated code.
-RE2C takes pride in generating fast lexers: at least as fast as reasonably optimized lexers coded by hand.
+It takes pride in generating fast lexers: at least as fast as reasonably optimized lexers coded by hand.
  This is not an easy goal; hand-written code is specialized for a particular environment, while autogenerated code must fit anywhere.
  RE2C has a highly configurable interface and quite a few optimizations ranging from
  high-level program transformations to low-level tweaks of conditional jumps.
@@ -99,7 +102,7 @@ In such setting it is undesirable to add extensions that affect performance.
  \\ \\
  One useful extension of regular expressions is submatch extraction and parsing.
  Many authors studied this subject and developed algorithms suitable for their particular settings and problem domains.
-Their approaches differ in many respects:
+Their approaches differ in various respects:
 the specific subtype of problem (full parsing, submatch extraction with or without history of repetitions);
 the underlying formalism (backtracking,
  nondeterministic automata, deterministic automata, 
@@ -123,7 +126,7 @@ Laurikari algorithm is special in this respect.
  It is based on a single deterministic automaton, runs in one pass and linear time,
  and the consumed space does not depend on the input length.
  What is most important, the overhead on submatch extraction depends on the detalization of submatch:
-on regular expressions without submatches Laurikari automaton shrinks to a simple DFA.
+on submatch-free regular expressions Laurikari automaton shrinks to a simple DFA.
  \\ \\
  From RE2C point of view this is close enough to hand-written code:
  you only pay for what you need, like a reasonable programmer would do.
@@ -179,7 +182,7 @@ Even lookahead-aware automata contain a lot of redundant operations,
  which can be dramatically reduced by the most basic optimizations like liveness analysis and dead code elimination.
  The overall number of submatch records can be minimized using technique similar to register allocation.
 I suggest another tweak of Laurikari algorithm that makes optimizations particularly easy
-and show that they have crucial impact on the quality of code, even in the presence of an optimizing C compiler.
+and show that they are useful even in the presence of an optimizing C compiler.
  RE2C implementation of submatch extraction is the motivation and the main goal of this work.
  \\
  
@@ -2475,7 +2478,6 @@ What is most important, copy operations are cheap for simple tags.
  
  \subsection*{Scalar representation of histories}
  
-For non-simple tags we need to track their full history.
  The most naive representation of history is a list of offsets;
  however, copy operations on lists are very inefficient.
  Fortunately, a better representation is possible: as observed by [Kar], histories form a \emph{prefix tree}:
@@ -2518,7 +2520,8 @@ Then it applies register optimizations;
  they are aimed at reducing the number of registers and copy operations.
  This is done by the usual means:
  liveness analysis, followed by dead code elimination,
-followed by interference analysis and finally register allocation.
+followed by interference analysis and finally register allocation
+with biased coalescing of registers bound by copy operations.
  The full cycle is run twice (first iteration is enough in most cases,
  but subsequent iterations are cheap as they run on an already optimized program and reuse the same infrastructure).
  Prior to the first iteration RE2C renames registers so that they occupy consequent numbers;
@@ -2536,16 +2539,288 @@ Then RE2C examines TDFA states and, if all outgoing transitions have the same op
  this operation is hoisted out of transitions into the state itself.
  \\
  
-Finally, RE2C converts TDFA to a tunnel automaton [??].
+Finally, RE2C converts TDFA to a tunnel automaton [??]
+that allows further reduction of TDFA size by merging similar states and reusing duplicated pieces of code.
  \\
  
-All these optimizations are basic and some are even primitive, yet they result in great reduction of registers, operations and TDFA states.
-Furthermore, experiments show that optimizing C compilers (such as GCC or Clang) are not a substitution for RE2C optimizations;
-they don't have the special knowledge of the program that RE2C has.
+Most of these optimizations are basic and some are even primitive, yet put all together and in correct order
+they result in a great reduction of registers, operations and TDFA states
+(see the section \ref{section_tests_and_benchmarks} for experimental results).
  
  \section{Tests and benchmarks}\label{section_tests_and_benchmarks}
  
-(1.5x - 2x speedup on in case of RFC-3986 compliant URI parser).
+Correctness testing of RE2C was done in several different ways.
+First, RE2C has a test suite with over a thousand tests;
+most of them are hand-written snippets of code that used to trigger RE2C errors,
+or just examples of useful real-world programs.
+\\
+
+Second, RE2C implementation of POSIX captures was verified on the canonical POSIX test suite compiled by Glenn Fowler [??].
+I used the augmented version provided by Kuklewicz [??] and excluded some tests from the ``basic'' subset
+that use start and end anchors \texttt{\^} and \texttt{\$}, which are not supported by RE2C.
+\\
+
+Third, I used RE2C self-validation mode [??].
+In this mode, instead of generating normal code, RE2C generates a special self-contained \emph{skeleton} program
+and two input files: one with input strings and one with compressed match results
+that are used to verify program behaviour on all inputs.
+Strings are generated so that they cover all TDFA transitions and many TDFA paths (including incorrect inputs that cause match failure).
+Generation of input data happens right after TDFA construction and before any optimizations,
+but the program itself is fully optimized.
+Thus skeleton programs are capable of revealing any errors in optimization and code generation.
+\\
+
+Fourth, I compared TDFA(0) against TDFA(1) on various RE and inputs:
+they result in very different programs, but must yield identical results.
+\\
+
+Last, and most important, I used a fuzzer contributed by Sergei Trofimovich [??] and based on the Haskell QuickCheck library [??].
+I applied it in many different settings:
+ran TDFA(0) programs on skeleton inputs generated for TDFA(1) programs and vice versa;
+fuzz-tested RE2C against Kuklewicz library Regex-TDFA [??];
+verified or disproved numerous assumptions and hypotheses;
+generated minimal triggers for bugs and special cases
+and otherwise compensated for the lack of imagination with the use of a random generator.
+\\
+
+Benchmarks are aimed at comparison of TDFA(0) and TDFA(1).
+We have already seen on numerous examples in section \ref{section_determinization}
+that TDFA(1) has every reason to result in faster code;
+however, only a real-world program can show if there is any perceivable difference in practice.
+As examples of such real-world programs I used two canonical use cases for submatch extraction in RE: a URI parser and an HTTP parser.
+Both examples are used by many authors (see e.g. [ThoBra] and [SohTho]),
+as their syntax is simple enough to admit regular grammars [RFC-3986] [RFC7230],
+but at the same time they have non-trivial structure composed of multiple components of varying length and form.
+Each example comes in two flavors: RFC-compliant parser that performs full validation,
+and simplified parser that skips most of the validation and barely parses the input; both forms may be useful in practice.
+The input to each parser is a 1G file of randomly generated URIs or HTTP messages; it is buffered in 4K chunks.
+Programs are written so that they spend most of the time on parsing
+and do only the bare minimum of work necessary to convince the compiler that parse results cannot be optimized out ---
+this way benchmarks measure the efficiency of parsing, not the accompanying code or operating system.
+Alternatively all parsers can be built in ``verification mode'' and will print out parse results.
+For each parser there is a corresponding recognizer based on a simple DFA:
+it sets a baseline for expectations of how fast and small the lexer can be and what the real overhead of submatch extraction is.
+\\
+
+All benchmarks were run on 64-bit Intel Core i3 machine with 350M RAM and 32K L1d, 32K L1i, 256K L2 and 3072K L3 caches;
+each result is the average of 4 subsequent runs after two ``warmup'' runs.
+\\
+
+Benchmarks are written in C-90.
+I used four different C compilers:
+gcc-7.1.10 (Gnu Compiler Collection [??]),
+clang-4.0.1 ([??]),
+tcc-0.9.26 (Tiny C Compiler [??])
+and pcc-1.1.0 (Portable C Compiler [??]).
+All compilers were run with optimization level \texttt{-O2} (though some of them probably ignore it).
+\\
+
+RE2C was run in three different settings:
+default mode, with \texttt{-b} option (generate bit masks and nested \texttt{if}-s instead of plain \texttt{switch}-es),
+and with \texttt{--no-optimize-tags} option (it suppresses all optimizations
+of tag variables described in section \ref{section_implementation}, except compaction).
+\\
+
+\begin{table*}\label{table1}
+\begin{center}
+    \begin{tabular}{|c|ccccccccccc|}
+    \hline
+    & registers & states & code size (K) & \multicolumn{4}{c}{binary size (K, stripped)} & \multicolumn{4}{c|}{run time (s)} \\
+    & & &
+        & gcc & clang & tcc & pcc
+        & gcc & clang & tcc & pcc \\
+    \hline \hline
+    \multicolumn{12}{|c|}{re2c} \\
+    \hline
+    TDFA(0) & 45 & 452 & 250 & 63 & 135 & 339 & 247 & 12.88 & 10.31 & 99.12 & 55.91 \\
+    TDFA(1) & 42 & 457 & 183 & 55 & 139 & 213 & 151 &  6.42 &  5.59 & 67.04 & 27.96 \\
+    DFA     & -- & 414 & 135 & 35 & 111 & 145 &  91 &  4.96 &  4.46 & 62.15 & 23.74 \\
+%    TDFA(0) & 45 & 452 & 255712 & 63544 & 137320 & 346408 & 252024 & 12.88 & 10.31 & 99.12 & 55.91 \\
+%    TDFA(1) & 42 & 457 & 186600 & 55352 & 141416 & 217160 & 153720 &  6.42 &  5.59 & 67.04 & 27.96 \\
+%    DFA     & -- & 414 & 137816 & 34864 & 112728 & 148048 &  92256 &  4.96 &  4.46 & 62.15 & 23.74 \\
+    \hline \hline
+    \multicolumn{12}{|c|}{re2c -b} \\
+    \hline
+    TDFA(0) & 45 & 452 & 295 & 63 & 59 & 352 & 267 & 11.96 & 10.31 & 65.53 & 36.98 \\
+    TDFA(1) & 42 & 457 & 171 & 55 & 51 & 144 & 111 &  6.01 &  5.40 & 15.96 & 10.59 \\
+    DFA     & -- & 414 & 123 & 35 & 39 &  75 &  51 &  4.73 &  4.78 & 10.93 &  5.63 \\
+%    TDFA(0) & 45 & 452 & 301968 & 63544 & 59496 & 360136 & 272504 & 11.96 & 10.31 & 65.53 & 36.98 \\
+%    TDFA(1) & 42 & 457 & 174903 & 55352 & 51304 & 147016 & 112760 &  6.01 &  5.40 & 15.96 & 10.59 \\
+%    DFA     & -- & 414 & 125389 & 34864 & 39000 &  76272 &  51296 &  4.73 &  4.78 & 10.93 &  5.63 \\
+    \hline \hline
+    \multicolumn{12}{|c|}{re2c --no-optimize-tags} \\
+    \hline
+    TDFA(0) & 2054 & 625 & 816 & 275 & 267 & 1107 & 839 & 14.14 & 13.24 & 105.87 & 59.71 \\
+    TDFA(1) &  149 & 462 & 200 &  63 & 147 &  233 & 167 &  6.64 &  5.90 &  68.50 & 29.39 \\
+%    TDFA(0) & 2054 & 625 & 835285 & 280632 & 272488 & 1132616 & 858232 & 14.14 & 13.24 & 105.87 & 59.71 \\
+%    TDFA(1) &  149 & 462 & 204119 &  63544 & 149608 &  238568 & 170104 &  6.64 &  5.90 &  68.50 & 29.39 \\
+    \hline
+    \end{tabular}\\
+    \caption{RFC-7230 compliant HTTP parser.}
+    \smallskip
+    \footnotesize{Total 39 tags: 34 simple and 5 with history.
+    Nondeterminism for TDFA(0): 23 tags with degree 2, 12 tags with degree 3 and 1 tag with degree 4.
+    Nondeterminism for TDFA(1): 18 tags with degree 2, 2 tags with degree 3.}
+\end{center}
+\end{table*}
+
+
+
+\begin{table*}\label{table2}
+\begin{center}
+    \begin{tabular}{|c|ccccccccccc|}
+    \hline
+    & registers & states & code size (K) & \multicolumn{4}{c}{binary size (K, stripped)} & \multicolumn{4}{c|}{run time (s)} \\
+    & & &
+        & gcc & clang & tcc & pcc
+        & gcc & clang & tcc & pcc \\
+    \hline \hline
+    \multicolumn{12}{|c|}{re2c} \\
+    \hline
+    TDFA(0) & 18 & 70 & 32 & 15 & 31 & 41 & 31 & 7.65 & 5.50 & 71.60 & 33.96 \\
+    TDFA(1) & 16 & 73 & 33 & 15 & 35 & 41 & 31 & 5.31 & 3.83 & 63.36 & 26.78 \\
+    DFA     & -- & 69 & 25 & 15 & 31 & 31 & 23 & 4.90 & 3.34 & 62.12 & 23.64 \\
+%    TDFA(0) &  &  &           & 14392 & 30816 & 41160 & 30840 & 7.65 & 5.50 & 71.60 & 33.96 \\
+%    TDFA(1) &  &  &           & 14392 & 34912 & 41704 & 30840 & 5.31 & 3.83 & 63.36 & 26.78 \\
+%    DFA     & -- & 69 & 24937 & 14384 & 30808 & 31280 & 22624 & 4.90 & 3.34 & 62.12 & 23.64 \\
+    \hline \hline
+    \multicolumn{12}{|c|}{re2c -b} \\
+    \hline
+    TDFA(0) & 18 & 70 & 32 & 15 & 19 & 31 & 31 & 7.12 & 7.31 & 31.85 & 17.47 \\
+    TDFA(1) & 16 & 73 & 29 & 15 & 19 & 29 & 27 & 5.25 & 4.42 & 13.52 &  8.86 \\
+    DFA     & -- & 69 & 19 & 11 & 15 & 15 & 15 & 4.66 & 3.96 & 11.00 &  5.79 \\
+%    TDFA(0) &  &  &           & 14392 & 18528 & 31336 & 30840 & 7.12 & 7.31 & 31.85 & 17.47 \\
+%    TDFA(1) &  &  &           & 14392 & 18528 & 29288 & 26744 & 5.25 & 4.42 & 13.52 &  8.86 \\
+%    DFA     & -- & 69 & 18472 & 10288 & 14424 & 14832 & 14432 & 4.66 & 3.96 & 11.00 &  5.79 \\
+    \hline \hline
+    \multicolumn{12}{|c|}{re2c --no-optimize-tags} \\
+    \hline
+    TDFA(0) & 72 & 106 & 57 & 23 & 55 & 73 & 55 & 8.61 & 6.77 & 73.05 & 34.68 \\
+    TDFA(1) & 44 &  82 & 39 & 19 & 43 & 49 & 39 & 6.01 & 5.38 & 63.87 & 27.44 \\
+%    TDFA(0) & 72 & 106 & 57956 & 22584 & 55400 & 73928 & 55416 & 8.61 & 6.77 & 73.05 & 34.68 \\
+%    TDFA(1) & 44 &  82 & 39674 & 18488 & 43112 & 49480 & 39032 & 6.01 & 5.38 & 63.87 & 27.44 \\
+    \hline
+    \end{tabular}\\
+    \caption{Simplified HTTP parser.}
+    \smallskip
+    \footnotesize{Total 15 tags: 12 simple and 3 with history.
+    Nondeterminism for TDFA(0): 8 tags with degree 2.
+    Nondeterminism for TDFA(1): 3 tags with degree 2.}
+\end{center}
+\end{table*}
+
+
+
+\begin{table*}\label{table3}
+\begin{center}
+    \begin{tabular}{|c|ccccccccccc|}
+    \hline
+    & registers & states & code size (K) & \multicolumn{4}{c}{binary size (K, stripped)} & \multicolumn{4}{c|}{run time (s)} \\
+    & & &
+        & gcc & clang & tcc & pcc
+        & gcc & clang & tcc & pcc \\
+    \hline \hline
+    \multicolumn{12}{|c|}{re2c} \\
+    \hline
+    TDFA(0) & 23 & 252 & 152 & 39 & 75 & 203 & 155 & 10.03 & 6.10 & 111.90 & 73.81 \\
+    TDFA(1) & 20 & 256 & 115 & 35 & 75 & 138 & 103 &  6.75 & 3.24 & 104.56 & 50.90 \\
+    DFA     & -- & 198 &  67 & 23 & 55 &  73 &  55 &  7.05 & 3.21 &  97.89 & 51.43 \\
+%    TDFA(0) & 23 & 252 & 154776 & 38960 & 75864 & 207600 & 157792 & 10.03 & 6.10 & 111.90 & 73.81 \\
+%    TDFA(1) & 20 & 256 & 117498 & 34864 & 75864 & 140560 & 104544 &  6.75 & 3.24 & 104.56 & 50.90 \\
+%    DFA     & -- & 198 &  67617 & 22576 & 55384 &  74384 &  55392 &  7.05 & 3.21 &  97.89 & 51.43 \\
+    \hline \hline
+    \multicolumn{12}{|c|}{re2c -b} \\
+    \hline
+    TDFA(0) & 23 & 252 & 165 & 39 & 35 & 181 & 151 & 8.40 & 8.56 & 39.56 & 31.84 \\
+    TDFA(1) & 20 & 256 & 127 & 55 & 31 & 130 & 107 & 5.23 & 4.83 & 12.04 & 10.02 \\
+    DFA     & -- & 198 &  60 & 19 & 23 &  39 &  35 & 4.05 & 4.08 &  9.23 &  8.19 \\
+%    TDFA(0) & 23 & 252 & 168684 & 38960 & 34904 & 186704 & 153696 & 8.40 & 8.56 & 39.56 & 31.84 \\
+%    TDFA(1) & 20 & 256 & 129322 & 55344 & 30808 & 132912 & 108640 & 5.23 & 4.83 & 12.04 & 10.02 \\
+%    DFA     & -- & 198 &  60759 & 18480 & 22616 &  39376 &  34912 & 4.05 & 4.08 &  9.23 &  8.19 \\
+    \hline \hline
+    \multicolumn{12}{|c|}{re2c --no-optimize-tags} \\
+    \hline
+    TDFA(0) & 611 & 280 & 426 & 127 & 151 & 536 & 463 & 10.41 & 7.56 & 127.48 & 75.46 \\
+    TDFA(1) &  64 & 256 & 131 &  43 &  87 & 156 & 123 &  6.74 & 3.55 & 103.98 & 51.12 \\
+%    TDFA(0) & 611 & 280 & 435350 & 129072 & 153696 & 548272 & 473184 & 10.41 & 7.56 & 127.48 & 75.46 \\
+%    TDFA(1) &  64 & 256 & 133518 &  43056 &  88160 & 159248 & 125024 &  6.74 & 3.55 & 103.98 & 51.12 \\
+    \hline
+    \end{tabular}\\
+    \caption{RFC-3986 compliant URI parser.}
+    \smallskip
+    \footnotesize{Total 20 tags (all simple).
+    Nondeterminism for TDFA(0): 15 tags with degree 2 and 4 tags with degree 3.
+    Nondeterminism for TDFA(1): 10 tags with degree 2.}
+\end{center}
+\end{table*}
+
+
+
+\begin{table*}\label{table4}
+\begin{center}
+    \begin{tabular}{|c|ccccccccccc|}
+    \hline
+    & registers & states & code size (K) & \multicolumn{4}{c}{binary size (K, stripped)} & \multicolumn{4}{c|}{run time (s)} \\
+    & & &
+        & gcc & clang & tcc & pcc
+        & gcc & clang & tcc & pcc \\
+    \hline \hline
+    \multicolumn{12}{|c|}{re2c} \\
+    \hline
+    TDFA(0) & 16 & 26 & 17 & 11 & 19 & 23 & 19 & 8.34 & 3.57 & 102.84 & 59.88 \\
+    TDFA(1) & 13 & 28 & 19 & 11 & 19 & 25 & 23 & 6.06 & 3.14 & 100.33 & 48.02 \\
+    DFA     & -- & 22 & 10 & 11 & 15 & 14 & 15 & 5.91 & 2.68 &  98.10 & 47.25 \\
+%    TDFA(0) &    &  &  & 10288 & 18520 & 22960 & 18528 & 8.34 & 3.57 & 102.84 & 59.88 \\
+%    TDFA(1) &    &  &  & 10288 & 18520 & 25424 & 22624 & 6.06 & 3.14 & 100.33 & 48.02 \\
+%    DFA     & -- &  &  & 10288 & 14424 & 14256 & 14432 & 5.91 & 2.68 &  98.10 & 47.25 \\
+    \hline \hline
+    \multicolumn{12}{|c|}{re2c -b} \\
+    \hline
+    TDFA(0) & 16 & 26 & 20 & 11 & 11 & 22 & 23 & 7.17 & 6.66 & 23.21 & 18.77 \\
+    TDFA(1) & 13 & 28 & 17 & 11 & 11 & 19 & 19 & 4.05 & 3.09 &  8.59 &  6.94 \\
+    DFA     & -- & 22 &  7 & 11 & 11 &  8 & 11 & 3.92 & 2.56 &  8.06 &  4.42 \\
+%    TDFA(0) &    &  &       & 10288 & 10328 & 22352 & 22624 & 7.17 & 6.66 & 23.21 & 18.77 \\
+%    TDFA(1) &    &  &       & 10288 & 10328 & 18960 & 18528 & 4.05 & 3.09 &  8.59 &  6.94 \\
+%    DFA     & -- &  &  6483 & 10288 & 10328 &  7888 & 10336 & 3.92 & 2.56 &  8.06 &  4.42 \\
+    \hline \hline
+    \multicolumn{12}{|c|}{re2c --no-optimize-tags} \\
+    \hline
+    TDFA(0) & 79 & 29 & 33 & 19 & 23 & 43 & 39 & 7.46 & 3.94 & 105.22 & 61.72 \\
+    TDFA(1) & 40 & 31 & 28 & 15 & 23 & 36 & 31 & 6.29 & 3.33 & 102.00 & 48.22 \\
+%    TDFA(0) & 79 & 29 & 33745 & 18480 & 22624 & 43504 & 39008 & 7.46 & 3.94 & 105.22 & 61.72 \\
+%    TDFA(1) & 40 & 31 & 28013 & 14384 & 22624 & 36080 & 30816 & 6.29 & 3.33 & 102.00 & 48.22 \\
+    \hline
+    \end{tabular}\\
+    \caption{Simplified URI parser.}
+    \smallskip
+    \footnotesize{Total 14 tags (all simple).
+    Nondeterminism for TDFA(0): 8 tags with degree 2 and 5 tags with degree 3.
+    Nondeterminism for TDFA(1): 7 tags with degree 2.}
+\end{center}
+\end{table*}
+
+\begin{table*}
+\begin{center}
+\includegraphics[width=\linewidth]{img/bench/size_gcc_clang.png}\\
+\includegraphics[width=\linewidth]{img/bench/size_tcc_pcc.png}\\
+\end{center}
+\end{table*}
+
+\begin{table*}
+\begin{center}
+\includegraphics[width=\linewidth]{img/bench/time_gcc_clang.png}\\
+\includegraphics[width=\linewidth]{img/bench/time_tcc_pcc.png}\\
+\end{center}
+\end{table*}
+
+Benchmark results are summarized in tables \ref{table1}, \ref{table2}, \ref{table3} and \ref{table4}
+and visualized on subsequent plots.
+They demonstrate that TDFA(1) can result in 1.5x - 2x speedup compared to TDFA(0), especially in the presence of tags with history;
+TDFA(1) incurs only modest overhead on submatch extraction compared to basic DFA-based recognition;
+nondeterminism levels are not so high in (at least some) real-world programs;
+RE2C optimizations reduce binary size, especially in complex cases with large automata and high submatch detalization,
+and even optimizing C compilers are not a substitution for them, as they lack the special knowledge of the program that RE2C has;
+RE2C optimizations have less effect on execution time (it is also reduced, but not by much).
  
  \section{Future work}\label{section_future_work}
  
@@ -2568,7 +2843,8 @@ which originates in LR parsing invented by Knuth [Knu65]
  \item Karper
  \item Kuklewicz
  
-    \item [Cox10] Russ Cox, "Regular Expression Matching in the Wild", March 2010, https://swtch.com/~rsc/regexp/regexp3.html
+    \item \! [Cox10] Russ Cox, \textit{"Regular Expression Matching in the Wild"}, March 2010, \\
+        https://swtch.com/\textasciitilde rsc/regexp/regexp3.html
  
  \end{enumerate}
author	Ulya Trofimovich <skvadrik@gmail.com>
	Wed, 19 Jul 2017 17:43:33 +0000 (18:43 +0100)
committer	Ulya Trofimovich <skvadrik@gmail.com>
	Wed, 19 Jul 2017 17:43:33 +0000 (18:43 +0100)
re2c/benchmarks/http/gen/gen_http.hs	[new file with mode: 0644]	patch \| blob
re2c/benchmarks/http/gen/mk_gen.sh	[new file with mode: 0755]	patch \| blob
re2c/benchmarks/http/rfc7230/http_rfc7230.re	[new file with mode: 0644]	patch \| blob
re2c/benchmarks/http/rfc7230/http_rfc7230_notags.re	[new file with mode: 0644]	patch \| blob
re2c/benchmarks/http/simple/http_simple.re	[new file with mode: 0644]	patch \| blob
re2c/benchmarks/http/simple/http_simple_notags.re	[new file with mode: 0644]	patch \| blob
re2c/benchmarks/uri/gen/gen_uri.hs	[new file with mode: 0644]	patch \| blob
re2c/benchmarks/uri/gen/mk_gen.sh	[new file with mode: 0755]	patch \| blob
re2c/benchmarks/uri/rfc3986/uri_rfc3986.re	[new file with mode: 0644]	patch \| blob
re2c/benchmarks/uri/rfc3986/uri_rfc3986_notags.re	[new file with mode: 0644]	patch \| blob
re2c/benchmarks/uri/simple/uri_simple.re	[new file with mode: 0644]	patch \| blob
re2c/benchmarks/uri/simple/uri_simple_notags.re	[new file with mode: 0644]	patch \| blob
re2c/doc/tdfa/img/bench/__mk.sh	[new file with mode: 0755]	patch \| blob
re2c/doc/tdfa/img/bench/plot.gnuplot	[new file with mode: 0644]	patch \| blob
re2c/doc/tdfa/tdfa.tex		patch \| blob \| history