-
-# NUM LEFT RE2 GOR1 GTOP KUKL SLOW LAZY BACK
-
-1 1.00 1.38 3.98 3.74 2.71 5.21 3.78 3.48
-2 1.00 1.41 6.71 7.29 3.74 14.14 4.17 3.25
-3 1.00 1.59 10.69 11.52 4.14 27.27 4.17 3.19
-4 1.00 1.52 15.48 16.51 4.36 44.42 4.14 3.12
-5 1.00 0.96 3.47 3.62 5.89 3.77 5.19 5.45
-6 1.00 1.36 5.04 5.75 8.18 8.63 8.14 4.73
-7 1.00 1.51 7.33 8.35 9.21 14.99 12.84 4.56
-8 1.00 1.50 9.92 11.07 9.44 22.33 15.00 4.48
-9 1.00 0.30 1.87 1.94 2.26 1.84 3.82 6.44
-10 1.00 0.88 4.23 4.53 16.22 4.47 11.18 8.74
-11 1.00 0.78 3.37 3.80 14.78 3.29 7.39 11.64
-12 1.00 0.53 2.98 3.33 12.90 2.74 7.08 14.04
-13 1.00 0.58 3.11 3.61 11.26 2.94 7.17 12.73
-14 1.00 0.61 3.38 3.98 12.81 3.23 7.51 12.52
-15 1.00 0.71 3.24 3.93 5.82 3.43 7.69 19.04
-16 1.00 0.80 3.48 3.93 8.56 3.34 6.45 7.58
-17 1.00 0.79 3.20 3.60 9.12 3.15 6.64 6.72
-18 1.00 0.73 2.79 3.20 8.36 2.80 7.32 9.03
-19 1.00 0.48 3.46 2.77 10.52 3.44 6.26 10.92
-20 1.00 0.54 2.81 2.37 5.68 2.80 5.08 6.41
-21 1.00 0.60 2.70 2.68 5.71 2.66 5.53 8.81
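+# NUM LEFT RE2 GOR1 GTOP KUKL SLOW LAZY BACK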
+1 1.00 1.27 3.72 3.49 2.60 4.56 2.98 3.21
+2 1.00 1.33 6.34 6.80 3.44 11.92 2.78 3.07
+3 1.00 1.44 10.19 10.94 3.87 22.45 2.31 3.01
+4 1.00 1.44 15.15 16.01 4.17 36.72 2.28 2.97
+5 1.00 1.12 2.96 3.28 5.68 3.22 4.94 4.36
+6 1.00 1.38 4.47 5.36 7.86 7.33 10.65 4.11
+7 1.00 1.46 6.39 7.43 8.61 13.06 14.67 3.99
+8 1.00 1.44 8.66 9.80 8.82 19.78 18.85 3.92
+9 1.00 0.78 1.91 1.98 3.50 1.86 5.96 7.22
+10 1.00 0.76 3.53 3.95 13.80 3.85 11.95 7.53
+11 1.00 0.89 3.16 3.76 15.35 3.16 20.59 9.86
+12 1.00 0.86 2.92 3.49 17.82 2.57 21.64 13.05
+13 1.00 0.79 2.87 3.45 12.61 2.72 24.53 11.54
+14 1.00 0.81 3.07 3.62 14.90 2.94 16.27 9.92
+15 1.00 0.78 3.18 3.92 5.99 3.33 8.01 8.81
+16 1.00 0.89 2.99 3.41 7.17 2.89 11.49 6.29
+17 1.00 0.76 2.75 3.02 7.95 2.70 9.96 5.83
+18 1.00 0.93 2.42 3.00 8.00 2.44 19.99 7.73
+19 1.00 0.87 3.27 2.57 11.24 3.22 24.15 10.08
+20 1.00 0.71 2.50 2.19 6.02 2.46 14.57 5.55
+21 1.00 0.79 2.51 2.52 6.20 2.47 16.15 7.50
set style line 1 lc rgb '#000000' lw 1
set style line 2 lc rgb '#000000' lw 1 dt ' -'
-set style line 3 lc rgb '#000000' lw 1 dt (40.00, 10.00)
-set style line 4 lc rgb '#000000' lw 1 dt (20.00, 15.00)
-set style line 5 lc rgb '#000000' lw 1 dt (4.00, 20.00, 40, 20)
-set style line 6 lc rgb '#000000' lw 1 dt ' - '
-set style line 7 lc rgb '#000000' lw 1 dt (60.00, 15.00)
-set style line 8 lc rgb '#000000' lw 1 dt (4, 16)
+set style line 3 lc rgb '#000000' lw 1 dt (70.00, 15.00)
+set style line 4 lc rgb '#000000' lw 1 dt (40.00, 15.00)
+set style line 5 lc rgb '#000000' lw 1 dt (4, 20, 40, 20)
+set style line 6 lc rgb '#000000' lw 1 dt (20.00, 15.00)
+set style line 7 lc rgb '#000000' lw 1 dt (10.00, 30.00)
+set style line 8 lc rgb '#000000' lw 1 dt (40, 20, 5, 20, 5, 20, 5, 20)
set output 'plot_realworld.png'
-set terminal pngcairo dashed font "Courier,mono" size 800,600
+set terminal pngcairo dashed font "Courier,mono" size 750,550
set title "real-world RE"
set xtics (\
"HTTP 6204-198" 1, \
set tmargin 2
set lmargin 12
set rmargin 1
-set yrange [-1:30]
+set yrange [-1:25]
plot \
"data_realworld" using 1:2 ls 1 with lines title "leftmost greedy", \
"data_realworld" using 1:3 ls 2 with lines title "RE2", \
set output 'plot_artificial.png'
-set terminal pngcairo dashed font "Courier" size 1300,700
-set title "artificial highly ambiguous RE on long (64K) input strings"
+set terminal pngcairo dashed font "Courier" size 1150,650
+set title "artificial highly ambiguous RE on long (16K) input strings"
set xtics (\
'(a\{2\}|a\{3\}|a\{5\})*' 1, \
'(a\{7\}|a\{11\}|a\{13\})*' 2, \
set tmargin 2
set lmargin 15
set rmargin 1
-set yrange [-1:30]
+set yrange [-1:25]
plot \
"data_artificial" using 1:2 ls 1 with lines title "leftmost greedy", \
"data_artificial" using 1:3 ls 2 with lines title "RE2", \
set output 'plot_pathological.png'
-set terminal pngcairo dashed font "Courier" size 500,600
+set terminal pngcairo dashed font "Courier" size 400,550
set title "pathological RE"
set xtics (\
'((a?)\{0,125\})*' 1, \
set tmargin 2
set lmargin 12
set rmargin 1
-set yrange [-50:32<*]
+set yrange [-50:1000]
plot \
"data_pathological" using 1:2 ls 1 with lines title "leftmost greedy", \
"data_pathological" using 1:3 ls 2 with lines title "RE2", \
where $n$ is the length of input, $m$ is the size of the regular expression with counted repetition subexpressions ``unrolled'',
and $t$ is the number of capturing groups and subexpressions that contain them.
%
-Benchmarks show that in practice our algorithm is 2x-10x slower than leftmost greedy matching
+Benchmarks show that in practice our algorithm is about 5x slower than leftmost greedy matching
(which has no overhead on disambiguation).
%
We present a lazy variation that is much faster, but requires memory proportional to the size of input.
\BlankLine
$t_1 = tag(U, n_1), \; t_2 = tag(U, n_2)$ \;
- \BlankLine
- \lIf {$t_1 mod \, 2 \equiv 0$} { \Return $-1$ }
- \lIf {$t_2 mod \, 2 \equiv 0$} { \Return $1$ }
-
\BlankLine
\lIf {$t_1 < 0$} { \Return $1$ }
\lIf {$t_2 < 0$} { \Return $-1$ }
+ \BlankLine
+ \lIf {$t_1 \bmod 2 \equiv 0$} { \Return $-1$ }
+ \lIf {$t_2 \bmod 2 \equiv 0$} { \Return $1$ }
+
\BlankLine
\Return $0$
}
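+For readers who prefer plain code, the comparison above corresponds to the following C sketch
+(\texttt{compare\_tags()} is our placeholder name; $t_1$ and $t_2$ are the tag values obtained via $tag(U, n_1)$ and $tag(U, n_2)$):
+\begin{verbatim}
+/* Mirror of the pseudocode above: negative tags lose immediately,
+ * then even tags win; by the usual comparator convention -1 favors
+ * the first argument, 1 the second, 0 means no decision here. */
+int compare_tags(int t1, int t2)
+{
+    if (t1 < 0) return 1;
+    if (t2 < 0) return -1;
+    if (t1 % 2 == 0) return -1;
+    if (t2 % 2 == 0) return 1;
+    return 0;
+}
+\end{verbatim}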
\includegraphics[width=\linewidth]{img/bench/plot.png}
\vspace{-2em}
\caption{
-Benchmarks.
+Benchmarks.\\
+Real-world tests have labels of the form ``title $m$-$k$'', where $m$ is RE size and $k$ is the number of capturing groups.
%: real-world RE (upper left),
%pathological RE for naive precedence table algorithm (upper right),
%artificial highly ambiguous RE on very long inputs (lower).
\begin{itemize}[itemsep=0.5em]
\item Okui-Suzuki algorithm degrades with increased closure size.
- This is understandable, as the algorithm performs pairwise comparison of closure states.
+ This is understandable, as the algorithm performs pairwise comparison of closure states to compute precedence matrices (see the first code sketch after this list).
Naive $update \Xund ptables ()$ algorithm degrades extremely fast,
- and the advanced algorithm behaves much better (but it may incur slight overhead in simple cases).
-
- \item Cox and Kuklewicz algorithms degrade as the number of tags increases.
- This is not surprizing, as both algorithms have per-tag inner loops in their core.
- On large real-world RE Kuklewicz algorithm is much slower than all Okui-Suzuki variations,
- and Cox algorithm is so slow that it did not fit into the plot space.
-
- \item The bottleneck of Cox algorithm is copying of offset arrays.
- Using GOR1 instead of naive depth-first search, though asymptotically faster,
- increases the amount of copying because depth-dirst scan order allows to use a single buffer array that is updated and restored in-place.
- However, copying offset arrays is also required in other parts of the algorithm,
- and in general Cox algorithm is not suited for RE with many submatch groups.
-
- \item Lazy variation of Okui-Suzuki is much faster than the main variation on real-world tests and not very long inputs.
-
- \item GTOP is somewhat faster than GOR1 on real-world RE, but can be slower on artificial RE.
-
- \item RE2 performs close to our implementations (sometimes better, sometimes worse).
+ and the advanced algorithm behaves much better (though it may incur slight overhead in simple cases).
+
+ \item Kuklewicz algorithm degrades with increased closure size and increased number of tags.
+ This is not surprising, as the algorithm has a per-state and per-tag loop that computes the precedence matrix.
+ On real-world tests with many capturing groups Kuklewicz algorithm is much slower than Okui-Suzuki algorithm.
+
+ \item Cox algorithm degrades with increased number of tags.
+ The bottleneck of the algorithm is the copying of offset arrays
+ (each array contains a pair of offsets per tag; see the second code sketch after this list).
+ Using GOR1 instead of naive depth-first search, though asymptotically faster, increases the amount of copying,
+ because depth-first scan order allows the use of a single buffer array that is updated and restored in-place.
+ However, copying is required elsewhere in the algorithm,
+ and in general the algorithm is not suited for RE with many submatch groups.
+ On real-world tests Cox algorithm is so slow that it did not fit into the plot space.
+
+ \item Lazy variation of Okui-Suzuki degrades with increased cache size and path context size.
+ This may happen with long input strings and a high level of ambiguity in RE
+ (in such cases the lazy algorithm does all the work of the non-lazy algorithm,
+ but with additional overhead for cache lookups/insertions and for accumulating data from previous steps).
+ On real-world tests the lazy variation of Okui-Suzuki is fast.
+
+ \item GOR1 and GTOP performance is similar.
+
+ \item RE2 performance is close to that of our leftmost greedy implementation.
\\[-0.5em]
\end{itemize}
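+The quadratic behavior of the pairwise comparison mentioned in the first item can be summarized by the following schematic C fragment
+(illustrative only; \texttt{state\_t} and \texttt{compare()} are our placeholder names, not identifiers from the actual implementation):
+\begin{verbatim}
+#include <stddef.h>
+
+typedef struct state state_t;                    /* closure state (placeholder) */
+int compare(const state_t *x, const state_t *y); /* precedence test (placeholder) */
+
+/* The precedence matrix is filled by comparing every pair of closure
+ * states: O(n^2) calls to compare() for a closure of size n, which is
+ * why all Okui-Suzuki variations degrade as the closure grows. */
+void update_ptables(int **prec, const state_t **closure, size_t n)
+{
+    for (size_t i = 0; i < n; ++i)
+        for (size_t j = 0; j < n; ++j)
+            prec[i][j] = compare(closure[i], closure[j]);
+}
+\end{verbatim}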
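+Similarly, the copying bottleneck of Cox algorithm can be sketched as follows
+(again schematic, with placeholder names; each submatch array stores a pair of offsets per tag, so a single copy costs $O(t)$ for $t$ tags):
+\begin{verbatim}
+#include <string.h>
+
+/* Forking the submatch data of an NFA thread copies 2*ntags offsets;
+ * with many tags (capturing groups) these copies dominate run time. */
+void fork_offsets(long *dst, const long *src, size_t ntags)
+{
+    memcpy(dst, src, 2 * ntags * sizeof(long));
+}
+\end{verbatim}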
-One interesting test is RE of the form $(a^{k_1}|\hdots|a^{k_n})^{0,\infty}$,
-e.g. \texttt{(a\{2\}|a\{3\}|a\{5\})*}.
+One particularly interesting group of tests that illustrates the above points
+consists of RE of the form $(a^{k_1}|\hdots|a^{k_n})^{0,\infty}$
+(artificial tests 1-4)
+and their variations with more capturing groups
+(artificial tests 5-8).
+For example, consider \texttt{(a\{2\}|a\{3\}|a\{5\})*} and \texttt{(((a)\{2\})|((a)\{3\})|((a)\{5\}))*}.
Given an input string \texttt{a...a},
submatch on the last iteration varies with the length of the input:
it equals \texttt{aaaaa} for a $5n$-character string,
and \texttt{aaa} for strings of length $5n - 2$ and $5n + 1$ ($n \in \YN$).
Variation continues infinitely with a period of five characters.
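+This variation can be observed directly with any engine that implements correct POSIX disambiguation;
+a small illustrative C program using the standard \texttt{regexec()} interface
+(assuming a POSIX-correct libc; the reported length of the last iteration should vary with a period of five):
+\begin{verbatim}
+#include <regex.h>
+#include <stdio.h>
+#include <string.h>
+
+int main(void)
+{
+    regex_t re;
+    regmatch_t pm[2]; /* pm[0]: whole match, pm[1]: last iteration */
+    char buf[32];
+
+    if (regcomp(&re, "(a{2}|a{3}|a{5})*", REG_EXTENDED) != 0) return 1;
+
+    for (size_t len = 8; len <= 12; ++len) { /* one full period */
+        memset(buf, 'a', len);
+        buf[len] = '\0';
+        if (regexec(&re, buf, 2, pm, 0) == 0)
+            printf("length %2zu: last iteration is %ld chars\n",
+                   len, (long)(pm[1].rm_eo - pm[1].rm_so));
+    }
+    regfree(&re);
+    return 0;
+}
+\end{verbatim}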
%
-We can increase variation period and the range of possible submatch results by choosing different counter values.
+We can increase the variation period and the range of possible submatch results by choosing larger counter values.
%
-Large period and wide range correspond to a higher level of ambiguity and many parallel competing paths,
-which means increased closure size (hence the slowdown of Okui-Suzuki algorithm, especially the ``naive Okui-Suzuki'' variation).
-Adding more capturing groups increases the number of tags (hence the slowdown of Kuklewicz and Cox algorithms).
+This causes increased closure size ---
+hence the slowdown of Okui-Suzuki algorithm on tests 1 to 4 and 5 to 8 (especially pronounced for the ``naive Okui-Suzuki'' variation),
+and the more gentle slowdown of Kuklewicz algorithm on the same ranges.
+%
+Adding more capturing groups increases the number of tags ---
+hence the slowdown of Kuklewicz and Cox algorithms on test group 5-8 compared to group 1-4.
+%
+%Note that Cox algorithm performs very well on this test and slows down at the same pace as leftmost greedy.
\\
In closing, we would like to point out that correctness
\FloatBarrier
+
\section{Conclusions and future work}
+The main result of our work is a practical POSIX matching algorithm
+that can be used on real-world regular expressions,
+does not require complex preprocessing,
+and incurs a relatively modest disambiguation overhead compared to other algorithms.
+%
+We tried to present the algorithm in full, with a few useful variations,
+in order to make implementation easy for the reader.
+\\
+
+We see a certain tradeoff between speed and memory usage:
+the bounded-memory version of the algorithm performs a lot of redundant work,
+while the lazy version avoids redundant work at the expense of potentially unbounded memory usage.
+Neither approach seems ideal;
+perhaps a hybrid approach could be used in practice.
+\\
+
+It is still an open question to us
+whether it is possible to combine the elegance of the derivative-based approach to POSIX disambiguation
+with the practical efficiency of NFA-based methods.
+%
+The derivative-based approach constructs match results in such an order that the longest-leftmost result always comes first.
+%
+We experimented with recursive descent parsers that embrace the same ordering idea,
+but the resulting algorithm was rather complex and slow in practice.
+\\
+
+It would be interesting to apply our approach to automata with counters
+instead of unrolling bounded repetition.
+
\vfill\null
\clearpage