From fd0195dd4bf37b1eb5004d0311e91ad24fa3fc94 Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Thu, 9 May 2019 18:30:32 +0100 Subject: [PATCH] libre2c: updated benchmark. --- lib/bench.cc | 444 +++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 359 insertions(+), 85 deletions(-) diff --git a/lib/bench.cc b/lib/bench.cc index ab4f0801..076a16d3 100644 --- a/lib/bench.cc +++ b/lib/bench.cc @@ -1,83 +1,124 @@ /* * This is not a real benchmark, just some very early attempt at comparing - * various re2clib algorithms and re2. + * various libre2c algorithms and re2. */ #include "config.h" +#include "src/util/c99_stdint.h" #include "lib/regex.h" +#include #include #include #include #include #include +#include #ifdef HAVE_RE2_RE2_H #include "re2/re2.h" #endif +struct Result +{ + const char *title; + uint64_t ticks; +}; + +static const uint64_t UNAVAIL = 0xFFFFffffFFFFffff; static const int XREG_RE2 = 1 << 31; -static inline void show(const char *prefix1, const char *prefix2 - , clock_t t1, clock_t t2, bool ok) +static inline void show(const std::vector &results) { - fprintf(stderr, "%30s%10s:%10ld ticks, %lfs\n" - , prefix1, prefix2, t2 - t1, double(t2 - t1) / CLOCKS_PER_SEC); - - if (!ok) { - fprintf(stderr, "*** %s %s failed\n", prefix1, prefix2); - exit(1); + const size_t n = results.size(); + assert(n > 0); + + const uint64_t ticks0 = results[0].ticks; + + for (size_t i = 0; i < n; ++i) { + const Result &r = results[i]; + if (r.ticks == UNAVAIL) { + fprintf(stderr, "%30s:%10s%10s\n", r.title, "-", "-"); + } + else { + fprintf(stderr, "%30s:%10.2lf%10.3lfs\n", r.title + , double(r.ticks) / double(ticks0) + , double(r.ticks) / CLOCKS_PER_SEC); + } } } -static void bench_re2c(const char *regexp, const char *string, size_t ntimes - , int flags, int mask, int need, const char *prefix) +static Result bench_re2c(const char *regexp, std::vector &strings + , size_t ntimes, int flags, int mask, int need, const char *prefix) { + Result x; + x.title = prefix; + if ((flags & mask) || (need != 0 && !(flags & need))) { - return; + x.ticks = UNAVAIL; + return x; } regex_t re; - int result; - clock_t t1, t2; - - t1 = clock(); - result = regcomp(&re, regexp, flags); - t2 = clock(); - show(prefix, "compile", t1, t2, result == 0); + int err; + clock_t /*t1, t2,*/ t3, t4; + + //t1 = clock(); + err = regcomp(&re, regexp, flags); + //t2 = clock(); + if (err) { + fprintf(stderr, "*** %s compile failed\n", prefix); + exit(1); + } const size_t nmatch = re.re_nsub; regmatch_t *pmatch = new regmatch_t[nmatch]; - t1 = clock(); + t3 = clock(); + err = 0; for (size_t i = 0; i < ntimes; ++i) { - result = regexec(&re, string, nmatch, pmatch, 0); + for (size_t j = 0; j < strings.size(); ++j) { + err |= regexec(&re, strings[j].c_str(), nmatch, pmatch, 0); + } + } + t4 = clock(); + if (err) { + fprintf(stderr, "*** %s run failed\n", prefix); + exit(1); } - t2 = clock(); - show("", "run", t1, t2, result == 0); delete[] pmatch; regfree(&re); + + x.ticks = uint64_t(t4 - t3); + return x; } #ifdef HAVE_RE2_RE2_H -static void bench_re2(const char *regexp, const char *string, size_t ntimes - , int mask, const char *prefix) +static Result bench_re2(const char *regexp, std::vector &strings + , size_t ntimes, int mask, const char *prefix) { + Result x; + x.title = prefix; + if (mask & XREG_RE2) { - return; + x.ticks = UNAVAIL; + return x; } RE2 *re2; - clock_t t1, t2; - bool err; + clock_t /*t1, t2,*/ t3, t4; + bool ok = true; - t1 = clock(); + //t1 = clock(); re2 = new RE2(regexp, RE2::POSIX); - t2 = clock(); - show(prefix, "compile", t1, t2, re2->ok()); + //t2 = clock(); + if (!re2->ok()) { + fprintf(stderr, "*** %s compile failed\n", prefix); + exit(1); + } const int argc = re2->NumberOfCapturingGroups(); RE2::Arg *args = new RE2::Arg[argc]; @@ -88,77 +129,306 @@ static void bench_re2(const char *regexp, const char *string, size_t ntimes argps[i] = &args[i]; } - t1 = clock(); + t3 = clock(); for (size_t i = 0; i < ntimes; ++i) { - err = RE2::FullMatchN(string, *re2, argps, argc); + for (size_t j = 0; j < strings.size(); ++j) { + ok = ok && RE2::FullMatchN(strings[j].c_str(), *re2, argps, argc); + } + } + t4 = clock(); + if (!ok) { + fprintf(stderr, "*** %s run failed\n", prefix); + exit(1); } - t2 = clock(); - show("", "run", t1, t2, err); delete[] results; delete[] argps; delete[] args; delete re2; + + x.ticks = uint64_t(t4 - t3); + return x; } #endif -static void bench(const char *r, const char *s, size_t n, int mask, int need) +static void bench(const char *r, std::vector &ss, size_t n + , int mask, int need) { - fprintf(stderr, "\nr: %.*s..., s: %.*s..., n: %lu\n", 30, r, 30, s, n); - - bench_re2c(r, s, n, 0, mask, need, "re2c-tdfa"); - bench_re2c(r, s, n, REG_NFA, mask, need, "re2c-tnfa-posix-gor1"); - bench_re2c(r, s, n, REG_NFA | REG_GTOP, mask, need, "re2c-tnfa-posix-gtop"); - bench_re2c(r, s, n, REG_NFA | REG_KUKLEWICZ, mask, need, "re2c-tnfa-posix-kukl-gor1"); - bench_re2c(r, s, n, REG_NFA | REG_KUKLEWICZ | REG_GTOP, mask, need, "re2c-tnfa-posix-kukl-gtop"); - bench_re2c(r, s, n, REG_NFA | REG_SLOWPREC, mask, need, "re2c-tnfa-posix-gor1-slow"); - bench_re2c(r, s, n, REG_NFA | REG_LEFTMOST, mask, need, "re2c-tnfa-leftmost"); - bench_re2c(r, s, n, REG_NFA | REG_TRIE, mask, need, "re2c-tnfa-posix-gtop-trie"); - bench_re2c(r, s, n, REG_NFA | REG_TRIE | REG_LEFTMOST, mask, need, "re2c-tnfa-leftmost-trie"); - bench_re2c(r, s, n, REG_NFA | REG_BACKWARD, mask, need, "re2c-tnfa-posix-back-gor1"); - bench_re2c(r, s, n, REG_NFA | REG_BACKWARD | REG_GTOP, mask, need, "re2c-tnfa-posix-back-gtop"); + assert(!ss.empty()); + const char *s0 = ss[0].c_str(); + + fprintf(stderr, "\nr: %.*s..., s: %.*s..., n: %lu\n", 30, r, 30, s0, n); + std::vector rs; + + rs.push_back(bench_re2c(r, ss, n, REG_NFA | REG_LEFTMOST, mask, need, "re2c-tnfa-leftmost")); #ifdef HAVE_RE2_RE2_H - bench_re2(r, s, n, mask, "re2"); + rs.push_back(bench_re2(r, ss, n, mask, "re2")); #endif + rs.push_back(bench_re2c(r, ss, n, REG_NFA, mask, need, "re2c-tnfa-posix-gor1")); + rs.push_back(bench_re2c(r, ss, n, REG_NFA | REG_GTOP, mask, need, "re2c-tnfa-posix-gtop")); + rs.push_back(bench_re2c(r, ss, n, REG_NFA | REG_KUKLEWICZ, mask, need, "re2c-tnfa-posix-kukl-gor1")); + rs.push_back(bench_re2c(r, ss, n, REG_NFA | REG_SLOWPREC, mask, need, "re2c-tnfa-posix-gor1-slow")); + rs.push_back(bench_re2c(r, ss, n, REG_NFA | REG_TRIE | REG_GTOP, mask, need, "re2c-tnfa-posix-gtop-trie")); + rs.push_back(bench_re2c(r, ss, n, REG_NFA | REG_BACKWARD, mask, need, "re2c-tnfa-posix-back-gor1")); + //rs.push_back(bench_re2c(r, ss, n, 0, mask, need, "re2c-tdfa")); + //rs.push_back(bench_re2c(r, ss, n, REG_NFA | REG_TRIE, mask, need, "re2c-tnfa-posix-gor1-trie")); + //rs.push_back(bench_re2c(r, ss, n, REG_NFA | REG_TRIE | REG_LEFTMOST, mask, need, "re2c-tnfa-leftmost-trie")); + //rs.push_back(bench_re2c(r, ss, n, REG_NFA | REG_KUKLEWICZ | REG_GTOP, mask, need, "re2c-tnfa-posix-kukl-gtop")); + //rs.push_back(bench_re2c(r, ss, n, REG_NFA | REG_BACKWARD | REG_GTOP, mask, need, "re2c-tnfa-posix-back-gtop")); + + show(rs); +} + +static void load_strings(const char *fname, const char *delim + , std::vector &strings) +{ + FILE *f = fopen(fname, "r"); + if (!f) { + fprintf(stderr, "cannot open file %s\n", fname); + exit(1); + } + + fseek(f, 0, SEEK_END); + const size_t flen = static_cast(ftell(f)); + fseek(f, 0, SEEK_SET); + + char *fbuf = new char[flen]; + if (flen != fread(fbuf, 1, flen, f)) { + fprintf(stderr, "cannot read file %s\n", fname); + exit(1); + } + + strings.clear(); + const size_t dlen = strlen(delim); + for (char *p1 = fbuf, *p2 = fbuf, *p3 = fbuf + flen; p2 < p3; ) { + for (p1 = p2; p2 + dlen <= p3; ++p2) { + if (memcmp(p2, delim, dlen) == 0) { + p2 += dlen; + break; + } + } + strings.push_back(std::string(p1, static_cast(p2 - p1 - 1))); + } + + delete[] fbuf; + fclose(f); } +// RFC URI and HTTP +#define CRLF "[\\n]" +#define OCT "([0-9]|[1-9][0-9]|[1][0-9]{2}|[2][0-4][0-9]|[2][5][0-5])" +#define IPV4 "(" OCT "[.]" OCT "[.]" OCT "[.]" OCT ")" +#define H16 "[0-9a-fA-F]{1,4}" +#define LS32 "(" H16 ":" H16 "|" IPV4 ")" +#define IPV6 "((" H16 ":){6}" LS32 \ + "|" "::(" H16 ":){5}" LS32 \ + "|(" H16 ")?::(" H16 ":){4}" LS32 \ + "|((" H16 ":){0,1}" H16 ")?::(" H16 ":){3}" LS32 \ + "|((" H16 ":){0,2}" H16 ")?::(" H16 ":){2}" LS32 \ + "|((" H16 ":){0,3}" H16 ")?::" H16 ":" LS32 \ + "|((" H16 ":){0,4}" H16 ")?::" LS32 \ + "|((" H16 ":){0,5}" H16 ")?::" H16 \ + "|((" H16 ":){0,6}" H16 ")?::)" +#define IPVFUTURE "(v[0-9a-fA-F]+[.][-0-9a-zA-Z._~!$&'()*+,;=:]+)" +#define IPLITERAL "([[](" IPV6 "|" IPVFUTURE ")[\\]])" +#define REGNAME "(([-0-9a-zA-Z._~!$&'()*+,;=]|%[0-9a-fA-F]{2})*)" +#define HOST "(" IPLITERAL "|" IPV4 "|" REGNAME ")" +#define PORT "(:[0-9]*)?" +#define USERINFO "(([-a-zA-Z0-9._~!$&'()*+,;=:]|%[0-9a-fA-F]{2})*@)?" +#define SCHEME "([a-zA-Z][a-zA-Z0-9+.-]*):" +#define QUERY "([?]([-a-zA-Z0-9._~!$&'()*+,;=:@/?]|%[0-9a-fA-F]{2})*)?" +#define FRAGMENT "([#]([-a-zA-Z0-9._~!$&'()*+,;=:@/?]|%[0-9a-fA-F]{2})*)?" +#define PCHAR "([-0-9a-zA-Z._~!$&'()*+,;=:@]|%[0-9a-fA-F]{2})" +#define PATH_ABEMPTY "(/" PCHAR "*)*" +#define PATH_ABSOLUTE "/(" PCHAR "+(/" PCHAR "*)*)?" +#define PATH_ROOTLESS PCHAR "+(/" PCHAR "*)*" +#define PATH_EMPTY "([/]?)" +#define AUTHORITY "(" USERINFO HOST PORT ")" +#define HIER_PART "(//" AUTHORITY PATH_ABEMPTY "|" PATH_ABSOLUTE "|" PATH_ROOTLESS "|" PATH_EMPTY ")" +#define URI SCHEME HIER_PART QUERY FRAGMENT +#define ABSOLUTE_URI SCHEME HIER_PART QUERY +#define ORIGIN_FROM PATH_ABEMPTY QUERY +#define REQUEST_TARGET "(" AUTHORITY "|" ABSOLUTE_URI "|" ORIGIN_FROM "|([*]))" +#define METHOD "([-!#$%&'*+.^_`|~0-9a-zA-Z]+)" +#define HTTP_VERSION "(HTTP/[0-9][.][0-9])" +#define STATUS_CODE "([0-9]{3})" +#define REASON_PHRASE "([ \\t \\x1f-\\x7e\\x80-\\xff]*)" +#define REQUEST_LINE METHOD "[ ]" REQUEST_TARGET "[ ]" HTTP_VERSION CRLF +#define STATUS_LINE HTTP_VERSION "[ ]" STATUS_CODE "[ ]" REASON_PHRASE CRLF +#define FIELD_NAME "([-!#$%&'*+.^_`|~0-9a-zA-Z]+)" +#define FIELD_VCHAR "[\\x1f-\\x7e\\x80-\\xff]" +#define FIELD_CONTENT "(" FIELD_VCHAR "([ \\t]+" FIELD_VCHAR")?)" +#define FIELD_VALUE "(" FIELD_CONTENT "|" CRLF "[ \\t]+)*" +#define HEADER_FIELD "(" FIELD_NAME ":[ \\t]*" FIELD_VALUE "[ \\t]*)" +#define MESSAGE_HEAD "(" REQUEST_LINE "|" STATUS_LINE ")" "(" HEADER_FIELD CRLF ")*" + +// simplified URI and HTTP +#define CHAR2 "[-._~%!$&'()*+,;=a-zA-Z0-9]" +#define SCHEME2 "([-+.a-zA-Z0-9]+):" +#define USERINFO2 "((" CHAR2 "|[:])+@)?" +#define HOST2 "((" CHAR2 "|[:[\\]])+)" +#define PORT2 "(:[0-9]*)?" +#define AUTHORITY2 USERINFO2 HOST2 PORT2 +#define PATH2 "((" CHAR2 "|[:@/])*)" +#define QUERY2 "([?](" CHAR2 "|[:@?/])*)?" +#define FRAGMENT2 "([#](" CHAR2 "|[:@?/])*)?" +#define URI2 SCHEME2 "(//" AUTHORITY2 ")?" PATH2 QUERY2 FRAGMENT2 +#define ABSOLUTE_URI2 SCHEME2 "(//" AUTHORITY2 PORT2 ")?" PATH2 QUERY2 +#define ORIGIN_FROM2 "/" PATH2 QUERY2 +#define REQUEST_TARGET2 "(" AUTHORITY2 "|" ABSOLUTE_URI2 "|" ORIGIN_FROM2 "|[*])" +#define HTTP_VERSION2 "(HTTP/[0-9][.][0-9])" +#define METHOD2 "([-._~%!$&'*+#^`|a-zA-Z0-9]+)" +#define REQUEST_LINE2 "(" METHOD2 "[ ]" REQUEST_TARGET2 "[ ]" HTTP_VERSION2 CRLF ")" +#define STATUS_LINE2 "(" HTTP_VERSION2 "[ ]([0-9]{3})[ ]([\\t \\x1f-\\x7e\\x80-\\xff]*)" CRLF ")" +#define HEADER_FIELD2 "([-._~%!$&'*+#^`|a-zA-Z0-9]+:[\\x1f-\\x7e\\x80-\\xff \\t\\n]*)" +#define MESSAGE_HEAD2 "(" REQUEST_LINE2 "|" STATUS_LINE2 ")" "(" HEADER_FIELD2 CRLF ")*" + int main() { static const size_t VERY_LONG = 1 << 16; const char *regexp, *string; + std::vector strings; char *longstring = new char[VERY_LONG + 1]; longstring[VERY_LONG] = 0; - regexp = "([a-zA-Z]([a-zA-Z0-9+.-])*)[:]([/][/](((([a-zA-Z0-9._~-]|[%][0-9a-fA-F]{2}|[!$&'()*+,;=:])*)[@])?(([[]((([0-9a-fA-F]{1,4}[:]){6}([0-9a-fA-F]{1,4}[:][0-9a-fA-F]{1,4}|(([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])))|[:][:]([0-9a-fA-F]{1,4}[:]){5}([0-9a-fA-F]{1,4}[:][0-9a-fA-F]{1,4}|(([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])))|([0-9a-fA-F]{1,4})?[:][:]([0-9a-fA-F]{1,4}[:]){4}([0-9a-fA-F]{1,4}[:][0-9a-fA-F]{1,4}|(([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])))|(([0-9a-fA-F]{1,4}[:]){0,1} [0-9a-fA-F]{1,4})?[:][:]([0-9a-fA-F]{1,4}[:]){3}([0-9a-fA-F]{1,4}[:][0-9a-fA-F]{1,4}|(([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])))|(([0-9a-fA-F]{1,4}[:]){0,2} [0-9a-fA-F]{1,4})?[:][:]([0-9a-fA-F]{1,4}[:]){2}([0-9a-fA-F]{1,4}[:][0-9a-fA-F]{1,4}|(([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])))|(([0-9a-fA-F]{1,4}[:]){0,3} [0-9a-fA-F]{1,4})?[:][:][0-9a-fA-F]{1,4}[:]([0-9a-fA-F]{1,4}[:][0-9a-fA-F]{1,4}|(([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])))|(([0-9a-fA-F]{1,4}[:]){0,4} [0-9a-fA-F]{1,4})?[:][:]([0-9a-fA-F]{1,4}[:][0-9a-fA-F]{1,4}|(([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])))|(([0-9a-fA-F]{1,4}[:]){0,5} [0-9a-fA-F]{1,4})?[:][:][0-9a-fA-F]{1,4}|(([0-9a-fA-F]{1,4}[:]){0,6} [0-9a-fA-F]{1,4})?[:][:])|([v][0-9a-fA-F]+[.]([a-zA-Z0-9._~-]|[!$&'()*+,;=]|[:])+))[\\]])|(([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35]))|(([a-zA-Z0-9._~-]|[%][0-9a-fA-F]{2}|[!$&'()*+,;=])*))([:][0-9]*)?)(([/]([a-zA-Z0-9._~-]|[%][0-9a-fA-F]{2}|[!$&'()*+,;=:@])*)*)|([/](([a-zA-Z0-9._~-]|[%][0-9a-fA-F]{2}|[!$&'()*+,;=:@])+([/]([a-zA-Z0-9._~-]|[%][0-9a-fA-F]{2}|[!$&'()*+,;=:@])*)*)?)|(([a-zA-Z0-9._~-]|[%][0-9a-fA-F]{2}|[!$&'()*+,;=:@])+([/]([a-zA-Z0-9._~-]|[%][0-9a-fA-F]{2}|[!$&'()*+,;=:@])*)*)|([/]?))([?](([a-zA-Z0-9._~-]|[%][0-9a-fA-F]{2}|[!$&'()*+,;=:@])|[/?])*)?([#](([a-zA-Z0-9._~-]|[%][0-9a-fA-F]{2}|[!$&'()*+,;=:@])|[/?])*)?"; - string = "http://240.147.163.34:2124/z/dibws/hkwfcdsvlghmmyd/ltiyxrfuam?fshxiix3zNzrBJNnuLDXnOruEmp4fX"; - bench(regexp, string, 1, 0, 0); - bench(regexp, string, 1000, REG_BACKWARD, 0); - - regexp = "(a|aa)*"; - memset(longstring, 'a', VERY_LONG); - longstring[VERY_LONG - 1] = 0; - bench(regexp, longstring, 1, REG_TRIE, 0); + // http + load_strings("../lib/bench.data_http", "\n\n", strings); + regexp = MESSAGE_HEAD; + bench(regexp, strings, 1, REG_BACKWARD, 0); + regexp = MESSAGE_HEAD2; + bench(regexp, strings, 1, 0, 0); + + // uri + load_strings("../lib/bench.data_uri", "\n", strings); + regexp = URI; + bench(regexp, strings, 1, REG_BACKWARD, 0); + regexp = URI2; + bench(regexp, strings, 1, 0, 0); + + // ipv6 + strings.clear(); + strings.push_back("fee2:586:10aa:fd03:37f:76ee:bed7:880"); + strings.push_back("::fee2:586:fd03:37f:76ee:bed7:880"); + strings.push_back("fee2::586:fd03:37f:76ee:bed7:880"); + strings.push_back("fee2:586::fd03:37f:76ee:bed7:880"); + strings.push_back("fee2:586:fd03::37f:76ee:bed7:880"); + strings.push_back("fee2:586:fd03:37f::76ee:bed7:880"); + strings.push_back("fee2:586:fd03:37f:76ee::bed7:880"); + strings.push_back("fee2:586:fd03:37f:76ee:bed7::880"); + strings.push_back("fee2:586:fd03:37f:76ee:bed7:880::"); + strings.push_back("::586:fd03:37f:76ee:bed7:880"); + strings.push_back("fee2::fd03:37f:76ee:bed7:880"); + strings.push_back("fee2:fd03::37f:76ee:bed7:880"); + strings.push_back("fee2:fd03:37f::76ee:bed7:880"); + strings.push_back("fee2:fd03:37f:76ee::bed7:880"); + strings.push_back("fee2:fd03:37f:76ee:bed7::880"); + strings.push_back("fee2:fd03:37f:76ee:bed7:880::"); + strings.push_back("::fd03:37f:76ee:bed7:880"); + strings.push_back("fd03::37f:76ee:bed7:880"); + strings.push_back("fd03:37f::76ee:bed7:880"); + strings.push_back("fd03:37f:76ee::bed7:880"); + strings.push_back("fd03:37f:76ee:bed7::880"); + strings.push_back("fd03:fd03:37f:76ee:bed7::"); + strings.push_back("::37f:76ee:bed7:880"); + strings.push_back("fd03::76ee:bed7:880"); + strings.push_back("fd03:37f::bed7:880"); + strings.push_back("fd03:37f:76ee::880"); + strings.push_back("fd03:37f:76ee:bed7::"); + strings.push_back("::76ee:bed7:880"); + strings.push_back("fd03::bed7:880"); + strings.push_back("fd03:37f::880"); + strings.push_back("fd03:37f:76ee::"); + strings.push_back("::bed7:880"); + strings.push_back("fd03::880"); + strings.push_back("fd03:37f::"); + strings.push_back("::bed7"); + strings.push_back("fd03::"); + strings.push_back("::"); + regexp = IPV6; + bench(regexp, strings, 100, REG_BACKWARD, 0); + + strings.clear(); + strings.push_back("127.0.0.1"); + strings.push_back("192.168.1.200"); + strings.push_back("255.255.255.255"); + strings.push_back("8.8.8.8"); + strings.push_back("240.147.163.34"); + regexp = IPV4; + bench(regexp, strings, 10000, 0, 0); + regexp = "([0-9]{1,3})[.]([0-9]{1,3})[.]([0-9]{1,3})[.]([0-9]{1,3})"; + bench(regexp, strings, 10000, 0, 0); + regexp = "([^.]+)[.]([^.]+)[.]([^.]+)[.]([^.]+)"; + bench(regexp, strings, 10000, 0, 0); + + strings.clear(); + strings.push_back("Mon Jan 01 00:24:00 GMT 2019"); + strings.push_back("Tue Feb 02 01:24:00 GMT 2018"); + strings.push_back("Wed Mar 03 02:24:00 GMT 2017"); + strings.push_back("Thu Apr 04 03:24:00 GMT 2016"); + strings.push_back("Fri May 05 04:24:00 GMT 2015"); + strings.push_back("Sat Jun 06 05:24:00 GMT 2014"); + strings.push_back("Sun Jul 07 06:24:00 GMT 2013"); + regexp = "(Mon|Tue|Wed|Thu|Fri|Sat|Sun)" + " (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)" + " ([0-9]+)" + " ([0-9]{2}:[0-9]{2}:[0-9]{2})" + " ([A-Z]{3}([+-][0-9]{2})?)" + " ([0-9]{4})"; + bench(regexp, strings, 10000, 0, 0); + regexp = "([a-zA-Z]+) ([a-zA-Z]+) ([0-9]+) ([0-9:]+) ([A-Z0-9+-]+) ([0-9]+)"; + bench(regexp, strings, 10000, 0, 0); + regexp = "([^ ]+) ([^ ]+) ([^ ]+) ([^ ]+) ([^ ]+) ([^ ]+)"; + bench(regexp, strings, 10000, 0, 0); - regexp = "((((a)*))*|(((((a)))*))+)*"; memset(longstring, 'a', VERY_LONG); - bench(regexp, longstring, 1, REG_TRIE, 0); - - regexp = "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*"; - longstring[0] = 'X'; - for (size_t i = 1; i < VERY_LONG - 1; ) { - longstring[i++] = 'a'; - longstring[i++] = 'b'; - } - longstring[VERY_LONG - 1] = 'Y'; - bench(regexp, longstring, 1, REG_TRIE, 0); - - regexp = "(((..)|(.))((..)|(.))((..)|(.)))*"; - memset(longstring, 'a', VERY_LONG); - bench(regexp, longstring, 1, REG_TRIE, 0); - - regexp = "(((){0,100}){0,100}){0,100}"; - bench(regexp, "", 1, REG_LEFTMOST | REG_BACKWARD | XREG_RE2, 0); + longstring[VERY_LONG] = 0; + strings.clear(); + strings.push_back(longstring); + + regexp = "(a{2}|a{3}|a{5})*"; + bench(regexp, strings, 1, 0, 0); + regexp = "(a{7}|a{11}|a{13})*"; + bench(regexp, strings, 1, 0, 0); + regexp = "(a{17}|a{19}|a{23})*"; + bench(regexp, strings, 1, 0, 0); + regexp = "(a{29}|a{31}|a{37})*"; + bench(regexp, strings, 1, 0, 0); + regexp = "(((a){2})|((a){3})|((a){5}))*"; + bench(regexp, strings, 1, 0, 0); + regexp = "(((a){7})|((a){11})|((a){13}))*"; + bench(regexp, strings, 1, 0, 0); + regexp = "(((a){17})|((a){19})|((a){23}))*"; + bench(regexp, strings, 1, 0, 0); + regexp = "(((a){29})|((a){31})|((a){37}))*"; + bench(regexp, strings, 1, 0, 0); + + regexp = "((((((((((a*)*)*)*)*)*)*)*)*)*)*"; + bench(regexp, strings, 1, 0, 0); + regexp = "(a*)(a*)(a*)(a*)(a*)(a*)(a*)(a*)"; + bench(regexp, strings, 1, 0, 0); + regexp = "(((a*)(a*)(a*))*((a*)(a*)(a*))*)*"; + bench(regexp, strings, 1, 0, 0); + regexp = "(((((a*)*)*((a*)*)*((a*)*)*)*)*)*"; + bench(regexp, strings, 1, 0, 0); + regexp = "(((((a*)*(a*))*(a*))*(a*))*(a*))*"; + bench(regexp, strings, 1, 0, 0); + regexp = "((a*)((a*)((a*)((a*)(a*)*)*)*)*)*"; + bench(regexp, strings, 1, 0, 0); + regexp = "(a*)|(a*)|(a*)|(a*)|(a*)|(a*)|(a*)"; + bench(regexp, strings, 1, 0, 0); + regexp = "((a*)|(a*)|(a*))((a*)|(a*)|(a*))"; + bench(regexp, strings, 1, 0, 0); + regexp = "((a*)|(a*))((a*)|(a*))((a*)|(a*))"; + bench(regexp, strings, 1, 0, 0); + regexp = "((a*)|(a*)|(a*))*|((a*)|(a*)|(a*))*"; + bench(regexp, strings, 1, 0, 0); + regexp = "(((((a*)*)*|((a*)*)*|((a*)*)*)*)*)*"; + bench(regexp, strings, 1, 0, 0); + regexp = "((a*)|((a*)(a*))|((a*)(a*)(a*)))*"; + bench(regexp, strings, 1, 0, 0); + regexp = "(((a*)(a*)(a*))|((a*)(a*))|(a*))*"; + bench(regexp, strings, 1, 0, 0); // Pathological case for constant-memory POSIX algorithms that use naive // (worst-case cubic in the size of TNFA) algorithm for precedence matrix @@ -168,14 +438,18 @@ int main() // (which also approaches TNFA size). Trie-based algorithms are not // affected, but they consume memory proportional to the length of input, // and so are also not practical. - regexp = "((a?){1,1000})*"; - bench(regexp, "aaaa", 1, 0, REG_NFA); - - regexp = "((a?){1,300})*"; - bench(regexp, "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 4, 0, 0); - - regexp = "((((a?){1,10}){1,10}){1,10})*"; - bench(regexp, "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 1, 0, REG_NFA); + strings.clear(); + strings.push_back("aaaaaaaaaa"); + regexp = "((a?){0,125})*"; + bench(regexp, strings, 1, 0, REG_NFA); + regexp = "((a?){0,250})*"; + bench(regexp, strings, 1, 0, REG_NFA); + regexp = "((a?){0,500})*"; + bench(regexp, strings, 1, 0, REG_NFA); + regexp = "((a?){0,1000})*"; + bench(regexp, strings, 1, 0, REG_NFA); + regexp = "((a?){0,2000})*"; + bench(regexp, strings, 1, XREG_RE2, REG_NFA); delete[] longstring; return 0; -- 2.40.0