From b0c0edd5011708b5ad8ff06bbf940a40c631f54e Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Sat, 16 Mar 2019 23:45:48 +0000 Subject: [PATCH] libre2c: added POSIX tests that show why closure can't use DFS and needs worst-case quadratic shortest path algorithm. In particular, test ((((a?)+)|(aa))+) and string "aaa" demonstrates why we cannot replace SSSP (GOR1 or GTOP) with a liner-time DFS that sets predecessor chains of TNFA states and sorts initial closure configurations according to POSIX relation imposed by precedence matrix. Consider what happens after consuming the first to 'a's (see the .dot encoded TNFA below): - initial states are 8, 14 and 9 (in the order imposed by D-matrix) - start DFS from 8 - find path 8->7->6->5->4->3->2->22->21->11->10 - find path 8->7->6->5->4->3->2->22->21->20->19->18->17->16->15 - find path 8->7->6->5->4->3->2->22->21->20->19->18->17->16->14->13->17 compare it to the previous path to 17 and discard the new looping path - find path 8->7->6->5->4->3->2->22->21->20->19->18->17->16->14->13->12->3 compare it to the previous path to 3 and discard the new looping path - find path 8->7->6->5->4->3->2->1->0 - DFS from 8 complete, start DFS from 14 - find path 14, compare it to the previous path to 14 and accept the new one as it is better (but do not attempt to propagate the improvement and rescan states: this is DFS, not GOR1, and the whole idea is to avoid rescanning and have linear complexity) - DFS from 14 complete, start DFS from 9 - find path 9 The end, and we have an incorrect path to 15: the long path through the outer loop from 8, rather than the short path through the inner loop from 14. This further leads to incorrect results. digraph NFA { rankdir=LR node[shape=Mrecord fontname=Courier height=0.2 width=0.2] edge[arrowhead=vee fontname=Courier label=" "] n23 [label="23"] n23 -> n22 [label="/0↑(1)"] n22 [label="22"] n22 -> n21 [label="/2↑(2)"] n21 [label="21"] n21 -> n11 n21 -> n20 [color=lightgray] n20 [label="20"] n20 -> n19 [label="/4↓(3)"] n19 [label="19"] n19 -> n18 [label="/5↓(2)"] n18 [label="18"] n18 -> n17 [label="/6↑(3)"] n17 [label="17"] n17 -> n16 [label="/8↑(4)"] n16 [label="16"] n16 -> n15 n16 -> n14 [color=lightgray] n15 [label="15"] n15 -> n14 [label="97"] n14 [label="14"] n14 -> n13 [label="/9↑(3)"] n13 [label="13"] n13 -> n17 n13 -> n12 [color=lightgray] n12 [label="12"] n12 -> n3 [label="/7↑(2)"] n11 [label="11"] n11 -> n10 [label="/4↑(3)"] n10 [label="10"] n10 -> n9 [label="97"] n9 [label="9"] n9 -> n8 [label="97"] n8 [label="8"] n8 -> n7 [label="/5↑(2)"] n7 [label="7"] n7 -> n6 [label="/6↓(3)"] n6 [label="6"] n6 -> n5 [label="/8↓(4)"] n5 [label="5"] n5 -> n4 [label="/9↓(3)"] n4 [label="4"] n4 -> n3 [label="/7↓(2)"] n3 [label="3"] n3 -> n2 [label="/3↑(1)"] n2 [label="2"] n2 -> n22 n2 -> n1 [color=lightgray] n1 [label="1"] n1 -> n0 [label="/1↑(0)"] n0 [label="0"] [fillcolor=gray] } --- lib/test.cc | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/lib/test.cc b/lib/test.cc index 9a5c7ec9..9d0eb43e 100644 --- a/lib/test.cc +++ b/lib/test.cc @@ -244,7 +244,9 @@ static int test_all_posix(int flags) T2("(.?.?){2}", "xxx", 0,3, 2,3); T4("(.?.?)(.?.?)(.?.?)", "xxx", 0,3, 0,2, 2,3, 3,3); T2("(.?.?){3}", "xxx", 0,3, 3,3); + T2("(.?.?)*", "xx", 0,2, 0,2); T2("(.?.?)*", "xxx", 0,3, 2,3); + T2("(.?.?)*", "xxxx", 0,4, 2,4); T4("a?((ab)?)(b?)", "ab", 0,2, 1,1, -1,-1, 1,2); T4("(a?)((ab)?)b?", "ab", 0,2, 0,1, 1,1, -1,-1); T3("a?((ab)?)b?", "ab", 0,2, 1,1, -1,-1); @@ -383,6 +385,7 @@ static int test_all_posix(int flags) T2("(y?){2}", "y", 0,1, 1,1); T2("(y?){2}", "yy", 0,2, 1,2); T2("(y?){0,2}", "NULL", 0,0, 0,0); + T3("(y?){0,2}|(y?)", "y", 0,1, 0,1, -1,-1); T2("(y?){0,2}", "y", 0,1, 0,1); T2("(y?){0,2}", "yy", 0,2, 1,2); T2("(y?){1,2}", "NULL", 0,0, 0,0); @@ -516,6 +519,15 @@ static int test_all_posix(int flags) else if (!(flags & REG_SLOWPREC)) { T3("((a?){1,1000})*", "aaaa", 0,4, 0,4, 3,4); } + T6("((((a?)+)|(aa))+)", "aaa", 0,3, 0,3, 0,3, 0,3, 2,3, -1,-1); + T6("(((aa)|((a?)+))+)", "aaa", 0,3, 0,3, 0,3, -1,-1, 0,3, 2,3); + T4("((a?){1,2}|(a)*)*", "aaaa", 0,4, 0,4, -1,-1, 3,4); + T5("(((a?){2,3}|(a)*))*", "aaaaa", 0,5, 0,5, 0,5, -1,-1, 4,5); + T5("(((a?)|(a?a?))+)", "aa", 0,2, 0,2, 0,2, -1,-1, 0,2); + T9("((((a)*))*|((((a))*))+)*", "aa", 0,2, 0,2, 0,2, 0,2, 1,2, -1,-1, -1,-1, -1,-1, -1,-1); + T6("(((a)*)*|((a)*)+)*", "aa", 0,2, 0,2, 0,2, 1,2, -1,-1, -1,-1); + T7("(((a)+)|(((a)+)?))+", "aa", 0,2, 0,2, 0,2, 1,2, -1,-1, -1,-1, -1,-1); + T3("(a*|(a)*)*", "aa", 0,2, 0,2, -1,-1); return e; } @@ -948,6 +960,9 @@ static int test_all_leftmost(int flags) #undef T5 #undef T6 #undef T7 +#undef T8 +#undef T9 +#undef T10 int main() { -- 2.40.0