From b0c0edd5011708b5ad8ff06bbf940a40c631f54e Mon Sep 17 00:00:00 2001
From: Ulya Trofimovich <skvadrik@gmail.com>
Date: Sat, 16 Mar 2019 23:45:48 +0000
Subject: [PATCH] libre2c: added POSIX tests that show why closure can't use
 DFS and needs worst-case quadratic shortest path algorithm.

In particular, test ((((a?)+)|(aa))+) and string "aaa" demonstrates why
we cannot replace SSSP (GOR1 or GTOP) with a liner-time DFS that sets
predecessor chains of TNFA states and sorts initial closure configurations
according to POSIX relation imposed by precedence matrix.

Consider what happens after consuming the first to 'a's (see the .dot
encoded TNFA below):

- initial states are 8, 14 and 9 (in the order imposed by D-matrix)

- start DFS from 8

- find path 8->7->6->5->4->3->2->22->21->11->10

- find path 8->7->6->5->4->3->2->22->21->20->19->18->17->16->15

- find path
  8->7->6->5->4->3->2->22->21->20->19->18->17->16->14->13->17
  compare it to the previous path to 17 and discard the new looping path

- find path
  8->7->6->5->4->3->2->22->21->20->19->18->17->16->14->13->12->3
  compare it to the previous path to 3 and discard the new looping path

- find path 8->7->6->5->4->3->2->1->0

- DFS from 8 complete, start DFS from 14

- find path 14, compare it to the previous path to 14 and accept the new
  one as it is better (but do not attempt to propagate the improvement
  and rescan states: this is DFS, not GOR1, and the whole idea is to
  avoid rescanning and have linear complexity)

- DFS from 14 complete, start DFS from 9

- find path 9

The end, and we have an incorrect path to 15: the long path through the
outer loop from 8, rather than the short path through the inner loop
from 14. This further leads to incorrect results.

digraph NFA {
  rankdir=LR
  node[shape=Mrecord fontname=Courier height=0.2 width=0.2]
  edge[arrowhead=vee fontname=Courier label=" "]

  n23 [label="23"]
  n23 -> n22 [label="/0&uarr;(1)"]
  n22 [label="22"]
  n22 -> n21 [label="/2&uarr;(2)"]
  n21 [label="21"]
  n21 -> n11
  n21 -> n20 [color=lightgray]
  n20 [label="20"]
  n20 -> n19 [label="/4&darr;(3)"]
  n19 [label="19"]
  n19 -> n18 [label="/5&darr;(2)"]
  n18 [label="18"]
  n18 -> n17 [label="/6&uarr;(3)"]
  n17 [label="17"]
  n17 -> n16 [label="/8&uarr;(4)"]
  n16 [label="16"]
  n16 -> n15
  n16 -> n14 [color=lightgray]
  n15 [label="15"]
  n15 -> n14 [label="97"]
  n14 [label="14"]
  n14 -> n13 [label="/9&uarr;(3)"]
  n13 [label="13"]
  n13 -> n17
  n13 -> n12 [color=lightgray]
  n12 [label="12"]
  n12 -> n3 [label="/7&uarr;(2)"]
  n11 [label="11"]
  n11 -> n10 [label="/4&uarr;(3)"]
  n10 [label="10"]
  n10 -> n9 [label="97"]
  n9 [label="9"]
  n9 -> n8 [label="97"]
  n8 [label="8"]
  n8 -> n7 [label="/5&uarr;(2)"]
  n7 [label="7"]
  n7 -> n6 [label="/6&darr;(3)"]
  n6 [label="6"]
  n6 -> n5 [label="/8&darr;(4)"]
  n5 [label="5"]
  n5 -> n4 [label="/9&darr;(3)"]
  n4 [label="4"]
  n4 -> n3 [label="/7&darr;(2)"]
  n3 [label="3"]
  n3 -> n2 [label="/3&uarr;(1)"]
  n2 [label="2"]
  n2 -> n22
  n2 -> n1 [color=lightgray]
  n1 [label="1"]
  n1 -> n0 [label="/1&uarr;(0)"]
  n0 [label="0"] [fillcolor=gray]
}
---
 lib/test.cc | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/lib/test.cc b/lib/test.cc
index 9a5c7ec9..9d0eb43e 100644
--- a/lib/test.cc
+++ b/lib/test.cc
@@ -244,7 +244,9 @@ static int test_all_posix(int flags)
     T2("(.?.?){2}",                      "xxx",     0,3, 2,3);
     T4("(.?.?)(.?.?)(.?.?)",             "xxx",     0,3, 0,2, 2,3, 3,3);
     T2("(.?.?){3}",                      "xxx",     0,3, 3,3);
+    T2("(.?.?)*",                        "xx",      0,2, 0,2);
     T2("(.?.?)*",                        "xxx",     0,3, 2,3);
+    T2("(.?.?)*",                        "xxxx",    0,4, 2,4);
     T4("a?((ab)?)(b?)",                  "ab",      0,2, 1,1, -1,-1, 1,2);
     T4("(a?)((ab)?)b?",                  "ab",      0,2, 0,1, 1,1, -1,-1);
     T3("a?((ab)?)b?",                    "ab",      0,2, 1,1, -1,-1);
@@ -383,6 +385,7 @@ static int test_all_posix(int flags)
     T2("(y?){2}",                            "y",           0,1, 1,1);
     T2("(y?){2}",                            "yy",          0,2, 1,2);
     T2("(y?){0,2}",                          "NULL",        0,0, 0,0);
+    T3("(y?){0,2}|(y?)",                     "y",           0,1, 0,1, -1,-1);
     T2("(y?){0,2}",                          "y",           0,1, 0,1);
     T2("(y?){0,2}",                          "yy",          0,2, 1,2);
     T2("(y?){1,2}",                          "NULL",        0,0, 0,0);
@@ -516,6 +519,15 @@ static int test_all_posix(int flags)
     else if (!(flags & REG_SLOWPREC)) {
         T3("((a?){1,1000})*", "aaaa", 0,4, 0,4, 3,4);
     }
+    T6("((((a?)+)|(aa))+)", "aaa", 0,3, 0,3, 0,3, 0,3, 2,3, -1,-1);
+    T6("(((aa)|((a?)+))+)", "aaa", 0,3, 0,3, 0,3, -1,-1, 0,3, 2,3);
+    T4("((a?){1,2}|(a)*)*", "aaaa", 0,4, 0,4, -1,-1, 3,4);
+    T5("(((a?){2,3}|(a)*))*", "aaaaa", 0,5, 0,5, 0,5, -1,-1, 4,5);
+    T5("(((a?)|(a?a?))+)", "aa", 0,2, 0,2, 0,2, -1,-1, 0,2);
+    T9("((((a)*))*|((((a))*))+)*", "aa", 0,2, 0,2, 0,2, 0,2, 1,2, -1,-1, -1,-1, -1,-1, -1,-1);
+    T6("(((a)*)*|((a)*)+)*", "aa", 0,2, 0,2, 0,2, 1,2, -1,-1, -1,-1);
+    T7("(((a)+)|(((a)+)?))+", "aa", 0,2, 0,2, 0,2, 1,2, -1,-1, -1,-1, -1,-1);
+    T3("(a*|(a)*)*", "aa", 0,2, 0,2, -1,-1);
 
     return e;
 }
@@ -948,6 +960,9 @@ static int test_all_leftmost(int flags)
 #undef T5
 #undef T6
 #undef T7
+#undef T8
+#undef T9
+#undef T10
 
 int main()
 {
-- 
2.50.1