From 40951cd1c287296b71c1b52d295eab2eddd5f89c Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Thu, 25 Jul 2019 09:21:20 +0100 Subject: [PATCH] Fixed code generation for EOF rule with '-f --storable-state' option. --- doc/tdfa_v2/part_1_tnfa.tex | 3222 ------------------------------- src/codegen/emit_action.cc | 73 +- src/codegen/emit_dfa.cc | 2 +- src/codegen/output.cc | 25 +- src/options/opt.cc | 1 + src/parse/validate.cc | 7 +- test/eof/nonblocking_push.fi.c | 273 +-- test/eof/nonblocking_push.fi.re | 5 +- 8 files changed, 203 insertions(+), 3405 deletions(-) delete mode 100644 doc/tdfa_v2/part_1_tnfa.tex diff --git a/doc/tdfa_v2/part_1_tnfa.tex b/doc/tdfa_v2/part_1_tnfa.tex deleted file mode 100644 index 21b16704..00000000 --- a/doc/tdfa_v2/part_1_tnfa.tex +++ /dev/null @@ -1,3222 +0,0 @@ -\documentclass[AMA,STIX1COL]{WileyNJD-v2} - -\articletype{Research Article} - -\received{xx July 2019} -\revised{xx July 2019} -\accepted{xx July 2019} - -\raggedbottom - -\let\procedure\relax -\let\endprocedure\relax - -\usepackage[utf8]{inputenc} -\usepackage{amsmath, amssymb, amsthm, amsfonts} -\usepackage{accents} -\usepackage{mathtools} -\usepackage{graphicx} -\usepackage{enumitem} -\usepackage[justification=centering]{caption} -\usepackage{url} -\usepackage[section]{placeins} -\usepackage{proof-at-the-end} -\usepackage{setspace} -\usepackage{tikz, pgfplots} - -\usepackage[noline, noend, nofillcomment, linesnumbered]{algorithm2e} -\SetArgSty{textnormal} -\newcommand\Xcommentfont[1]{\selectfont\textnormal{#1}} -\SetCommentSty{Xcommentfont} -\SetNoFillComment -\SetNlSty{textnormal}{}{} -\renewcommand{\algorithmcfname}{ALGORITHM} - - -\let\oldnl\nl -\newcommand{\nonl}{\renewcommand{\nl}{\let\nl\oldnl}} % Remove line number for one line - -\newcommand{\Xl}{\langle} -\newcommand{\Xr}{\rangle} -\newcommand{\Xm}{\langle\!\rangle} -\newcommand{\Xset}{\!\leftarrow\!} -\newcommand{\Xund}{\rule{.4em}{.4pt}} -\newcommand{\Xlb}{[\![} -\newcommand{\Xrb}{]\!]} -\newcommand{\Xmap}{\!\mapsto\!} -\newcommand{\XB}{\mathcal{B}} -\newcommand{\XD}{\mathcal{D}} -\newcommand{\XE}{\mathcal{E}} -\newcommand{\XF}{\mathcal{F}} -\newcommand{\XI}{\mathcal{I}} -\newcommand{\XPT}{\XP\!\XT} -\newcommand{\XIT}{\XI\!\XT} -\newcommand{\XIR}{\XI\!\XR} -\newcommand{\XL}{\mathcal{L}} -\newcommand{\XN}{\mathcal{N}} -\newcommand{\XM}{\mathcal{M}} -\newcommand{\XO}{\mathcal{O}} -\newcommand{\XP}{\mathcal{P}} -\newcommand{\XR}{\mathcal{R}} -\newcommand{\XS}{\mathcal{S}} -\newcommand{\XT}{\mathcal{T}} -\newcommand{\XX}{\mathcal{X}} -\newcommand{\YB}{\mathbb{B}} -\newcommand{\YC}{\mathbb{C}} -\newcommand{\YK}{\mathbb{K}} -\newcommand{\YF}{\mathbb{F}} -\newcommand{\YN}{\mathbb{N}} -\newcommand{\YT}{\mathbb{T}} -\newcommand{\YQ}{\mathbb{Q}} -\newcommand{\YP}{\mathbb{P}} -\newcommand{\YZ}{\mathbb{Z}} -\newcommand{\PT}{PT} -\newcommand{\PE}{P\!E} -\newcommand{\PR}{P\!R} -\newcommand{\IPT}{I\!PT} -\newcommand{\IRE}{I\!RE} - -\newcommand{\Xstirling}[2]{\genfrac{\{}{\}}{0pt}{}{#1}{#2}} -\newcommand*{\Xbar}[1]{\overline{#1}} -\newcommand{\pnorm}[2]{\|{#1}\|^{pos}_{#2}} -\newcommand{\snorm}[2]{\|{#1}\|^{sub}_{#2}} - -\DeclarePairedDelimiter\ceil{\lceil}{\rceil} -\DeclarePairedDelimiter\floor{\lfloor}{\rfloor} - -\setlist{nosep} - -\newenvironment{Xfig} - {\par\medskip\noindent\minipage{\linewidth}\begin{center}} - {\end{center}\endminipage\par\medskip} -\newenvironment{Xtab} - {\par\medskip\noindent\minipage{\linewidth}\begin{center}} - {\end{center}\endminipage\par\medskip} - -\setlength{\parindent}{0pt} -\setlength{\belowcaptionskip}{-1em} - 
- -\begin{document} - -\title{Efficient POSIX Submatch Extraction on NFA} - -\author[1]{Angelo Borsotti} -\author[2]{Ulya Trofimovich} - -\address[1]{\email{angelo.borsotti@mail.polimi.it}} -\address[2]{\email{skvadrik@gmail.com}} - -\abstract[Summary]{ -In this paper we further develop the POSIX disambiguation algorithm by Okui and Suzuki. -We extend its theoretical foundations on a few important practical cases -and introduce numerous performance improvements. -% -Our algorithm works in worst-case $O(n \, m^2 \, t)$ time and $O(m^2)$ space, -where $n$ is the length of input, $m$ is the size of the regular expression with bounded repetition expanded -and $t$ is the number of capturing groups and subexpressions that contain groups. -% -Benchmarks show that in practice our algorithm is \textasciitilde{}5x slower than leftmost-greedy matching. -% -We present a lazy version of the algorithm that is much faster, but requires memory proportional to the size of input. -% -We study other NFA-based algorithms -and show that the Kuklewicz algorithm is slower in practice, -and the backward matching algorithm by Cox is incorrect. -} - -\keywords{Regular Expressions, Parsing, Submatch Extraction, Finite-State Automata, POSIX} - -\maketitle - - -\section{Introduction} - -In this paper we study NFA-based approaches to the problem of POSIX regular expression parsing and submatch extraction. -A number of algorithms have been proposed in recent years, -but not all of them have been properly studied and formalized. -We experimented with different approaches and found that in practice the algorithm by Okui and Suzuki \cite{OS13} is the most efficient one. -In the process, we have discovered a number of improvements -that require careful reconstruction of the underlying theory and introduction of new algorithms and proofs. -In our experience, the Okui and Suzuki approach is not easy to understand, -therefore we include illustrative examples and detailed pseudocode of the extended algorithm. -% -It should be noted that there is a totally different (and very elegant) approach to the problem -based on Brzozowski derivatives \cite{SL14}. -We choose to focus on NFA-based approach because -in our experience, derivative-based approach is slow in practice -(we also discuss theoretical bounds below). -% -Both NFA and derivatives can be used to construct DFA with POSIX longest-match semantics \cite{SL13} \cite{Bor15} \cite{Tro17}. -The resulting DFA-based algorithms are very fast, because there is no run-time overhead on disambiguation. -However, DFA construction is not always viable due to its exponential worst-case complexity, -and if viable, it needs to be efficient. -Therefore we concentrate on NFA-based algorithms -that can be used directly for matching or serve as a basis for DFA construction. -We give an overview of existing algorithms, including some that are incorrect but interesting. - -\subparagraph{Laurikari, 2001 (incorrect).} - -Laurikari described an algorithm based on TNFA, which is an $\epsilon$-NFA with tagged transitions \cite{Lau01}. -In his algorithm each submatch group is represented with a pair of \emph{tags} (opening and closing). -Disambiguation is based on minimizing the value of opening tags and maximizing the value of closing tags, where -different tags have priority according to POSIX subexpression hierarchy. -Notably, Laurikari used the idea of topological order to avoid worst-case exponential time of $\epsilon$-closure construction. 
-His algorithm doesn't track the history of iteration subexpressions and gives incorrect results in cases like \texttt{(a|aa)*} and string \texttt{aa}.
-The reported computational complexity is $O(n \, m \, c \, t \, log(t))$, where
-$n$ is the input length,
-$m$ is the TNFA size,
-$c$ is the time for comparing tag values
-and $t$ is the number of tags.
-The memory requirement is $O(m \, t)$.
-
-\subparagraph{Kuklewicz, 2007.}
-
-Kuklewicz fixed the Laurikari algorithm by introducing \emph{orbit} tags for iteration subexpressions.
-He gave only an informal description \cite{Kuk07}, but the algorithm was later formalized in \cite{Tro17}.
-It works in the same way as the Laurikari algorithm,
-except that comparison of orbit tags is based on their previous history, not just the most recent value.
-The clever idea is to avoid recording the full history
-by compressing histories in a matrix of size $t \times m$, where $m$ is the TNFA size and $t$ is the number of tags.
-The $t$-th row of the matrix represents the ordering of closure states with respect to the $t$-th tag
-(with possible ties --- different states may have the same order).
-The matrix is updated at each step using continuations of tag histories.
-The algorithm requires $O(m \, t)$ memory and $O(n \, m \, t \, (m + t \, log(m)))$ time, where $n$ is the input length
-(we assume that the worst-case optimal $O(m^2 \, t)$ algorithm for $\epsilon$-closure is used,
-and that the matrix update takes $O(m \, log(m) \, t^2)$ because for $t$ tags we need to sort $m$ states with an $O(t)$ comparison function).
-%Kuklewicz disambiguation is combined with Laurikari determinization [Lau00] in [Tro17].
-
-\subparagraph{Cox, 2009 (incorrect).}
-
-Cox came up with the idea of backward POSIX matching \cite{Cox09},
-which is based on the observation that it is easier to maximize submatch on the last iteration than on the first one,
-because we do not need to remember the history of previous iterations.
-The algorithm consumes input from right to left
-and tracks two pairs of offsets for each submatch group:
-the \emph{active} pair of the most recent offsets (used in disambiguation)
-and the \emph{final} pair of offsets on the backwards-first (i.e. the last) iteration.
-The algorithm gives incorrect results under two conditions:
-(1) ambiguous matches have equal offsets on some iteration,
-and (2) disambiguation happens too late, when active offsets have already been updated and the difference between ambiguous matches is erased.
-We found that such situations may occur for two reasons.
-First, the $\epsilon$-closure algorithm sometimes compares ambiguous paths \emph{after} their join point,
-when both paths have a common suffix with tagged transitions.
-This is the case with Cox's prototype implementation \cite{Cox09}; for example, it gives incorrect results for \texttt{(aa|a)*} and string \texttt{aaaaa}.
-Most such failures can be repaired by exploring states in topological order,
-but a topological order does not exist in the presence of $\epsilon$-loops.
-The second reason is bounded repetition: ambiguous paths may not have an intermediate join point at all.
-For example, in the case of \texttt{(aaaa|aaa|a)\{3,4\}} and string \texttt{aaaaaaaaaa}
-we have matches \texttt{(aaaa)(aaaa)(a)(a)} and \texttt{(aaaa)(aaa)(aaa)}
-with different numbers of iterations.
-Assuming that bounded repetition is modeled by chaining three non-optional sub-automata for \texttt{(aaaa|aaa|a)} and an optional fourth one,
-by the time the ambiguous paths meet, both have active offsets \texttt{(0,4)}.
-Despite the flaw, the Cox algorithm is interesting: if the delayed comparison problem were somehow fixed, it would work.
-The algorithm requires $O(m \, t)$ memory and $O(n \, m^2 \, t)$ time
-(assuming a worst-case optimal closure algorithm),
-where $n$ is the length of input,
-$m$ is the size of the regular expression
-and $t$ is the number of submatch groups plus enclosing subexpressions.
-
-\subparagraph{Okui and Suzuki, 2013.}
-
-Okui and Suzuki view the disambiguation problem from the viewpoint of comparison of parse trees \cite{OS13}.
-Ambiguous trees have the same frontier of leaf symbols, but their branching structure is different.
-Each subtree corresponds to a subexpression.
-The \emph{norm} of a subtree is the number of alphabet symbols in it (a.k.a. submatch length).
-The longest match corresponds to a tree in which the norm of each subtree in leftmost in-order traversal is maximized.
-The clever idea of Okui and Suzuki is to relate the norm of subtrees to their \emph{height} (distance from the root).
-Namely, if we walk through the leaves of two ambiguous trees, tracking the height of each complete subtree,
-then at some step the heights will diverge:
-the subtree with a smaller norm will already be complete, but the one with a greater norm will not.
-The height of subtrees is easy to track by attributing it to parentheses and encoding it in automaton transitions.
-Okui and Suzuki use PAT --- an $\epsilon$-free position automaton with transitions labeled by sequences of parentheses.
-Disambiguation is based on comparing parentheses along ambiguous PAT paths.
-Similarly to Kuklewicz, Okui and Suzuki avoid recording full-length paths
-by pre-comparing them at each step and storing comparison results in a pair of matrices indexed by PAT states.
-The authors report complexity $O(n(m^2 + c))$, where
-$n$ is the input length,
-$m$ is the number of occurrences of the most frequent symbol in the regular expression
-and $c$ is the number of submatch groups and repetition operators.
-However, this estimate leaves out the construction of PAT and the precomputation of the precedence relation.
-The memory requirement is $O(m^2)$.
-Okui-Suzuki disambiguation is combined with the Berry-Sethi construction in \cite{Bor15} to build a parsing DFA.
-
-\subparagraph{Sulzmann and Lu, 2013.}
-
-Sulzmann and Lu based their algorithm on Brzozowski derivatives \cite{SL14}
-(a correctness proof is given in \cite{ADU16}).
-The algorithm unfolds a regular expression into a sequence of derivatives
-and then folds it back into a parse tree.
-Each derivative is obtained from the previous one by consuming input symbols in left-to-right order,
-and each tree is built from the next tree by injecting symbols in reverse right-to-left order.
-In practice, Sulzmann and Lu fuse the backward and forward passes,
-which makes it possible to avoid potentially unbounded memory usage for keeping all intermediate derivatives.
-The algorithm is elegant in that it does not require explicit disambiguation:
-parse trees are naturally ordered by the longest criterion.
-The time and space complexity is not entirely clear.
-In \cite{SL14} Sulzmann and Lu consider the size of the regular expression as a constant.
-In \cite{SL13} they give more precise estimates: $O(2^m \, t)$ space and $O(n \, log(2^m) \, 2^m \, t^2)$ time,
-where $m$ is the size of the regular expression,
-$n$ is the length of input
-and $t$ is the number of submatch groups (the authors do not differentiate between $m$ and $t$).
-However, this estimate assumes worst-case $O(2^m)$ derivative size and on-the-fly DFA construction. -The authors also mention a better $O(m^2)$ theoretical bound for derivative size. -If we adopt this bound and exclude DFA construction, we get $O(m^2 \, t)$ memory requirement and $O(n \, m^2 \, t^2)$ time, -which seems reasonably close to (but worse than) NFA-based approaches. -\\ - -Undoubtedly, other approaches exist, -but many of them produce incorrect results or require memory proportional to the length of input -(e.g. Glibc implementation \cite{Glibc}). -% -Our contributions are the following: -\begin{itemize}[itemsep=0.5em, topsep=0.5em] - - \item We extend Okui-Suzuki algorithm on the case of partially ordered parse trees - and introduce the notion of \emph{explicit} and \emph{implicit} submatch groups. - This greatly reduces the overhead on disambiguation - for regular expressions with only a few submatch groups, - which is a common case in practice. - - \item We extend Okui-Suzuki algorithm on the case of bounded repetition. - - \item We combine Okui-Suzuki algorithm with Laurikari TNFA. - It allows us to omit the preprocessing step - at the cost of $\epsilon$-closure construction, - which may be preferable in cases when preprocessing time is included in match time. - - \item We introduce \emph{negative tags} that allow us to handle - no-match situation in the same way as match. - Negative tags provide a simple way to reset obsolete offsets from earlier iterations, - in cases like \texttt{(a(b)?)*} and string \texttt{aba}. - - \item We consider $\epsilon$-closure construction as a shortest-path problem - and show that path concatenation is right-distributive over path comparison - for the subset of paths considered by closure algorithm. - This justifies the use of well-known Goldberg-Radzik algorithm based on the idea of topological order, - which has worst-case optimal quadratic complexity in the size of closure - and guaranteed linear complexity if the closure has no $\epsilon$-loops. - This is an improvement over naive exhaustive depth-first search with backtracking, - and also an improvement over Laurikari algorithm \cite{Tro17}. - - \item We give a faster algorithm for updating precedence matrices. - The straightforward algorithm described by Okui and Suzuki involves pairwise comparison of all states in closure - and takes $O(m^2 \, t)$ time, assuming $m$ states and $O(t)$ comparison function. - We show a pathological example \texttt{((a?)\{0,1000\})*} where $t \approx m$. - Our algorithm takes $O(m^2)$ time. - - \item We show how to use our algorithm in order to build either parse trees or POSIX-style offsets. - - \item We present a lazy version of our algorithm - that reduces the overhead on disambiguation - at the cost of memory usage that grows with the length of input. - The lazy algorithm is well-suited for small inputs. - - \item We provide a C++ implementation of different NFA-based algorithms \cite{RE2C} - and benchmark them against each other and against leftmost greedy implementation - that has no overhead on disambiguation and serves as a baseline. - We also provide a completely independent Java implementation - and a web-page for interactive experiments with our algorithms. -\end{itemize} - -The rest of this paper is arranged as follows. -In section \ref{section_main} we present the main idea and the skeleton of our algorithm. -In section \ref{section_formalization} we provide theoretical foundations for the rest of the paper. 
-After that, we go into specific details: -section \ref{section_closure} is concerned with $\epsilon$-closure construction, -section \ref{section_pathtree} discusses data structures used to represent TNFA paths, -section \ref{section_results} discusses possible output formats (parse trees or POSIX-style offsets), -section \ref{section_comparison} gives the core disambiguation algorithms, -section \ref{section_lazy} presents lazy variation of our algorithm, -and section \ref{section_tnfa} gives specific TNFA construction. -The remaining sections \ref{section_complexity}, \ref{section_benchmarks} and \ref{section_conclusions} -contain complexity analysis, benchmarks, conclusions and directions for future work. - - -\section{Skeleton of the algorithm}\label{section_main} - -Our algorithm is based on four cornerstone concepts: -regular expressions, parse trees, parenthesized expressions and tagged NFA. -% -As usual, we formalize matching problem -by giving the interpretation of regular expressions as sets of parse trees. -% -Then we define POSIX disambiguation semantics in terms of order on parse trees. -This definition reflects the POSIX standard, -but it is too high-level to be used in a practical matching algorithm. -% -Therefore we go from parse trees to their linearized representation --- parenthesized expressions. -We define an order on parenthesized expressions and show its equivalence to the order on parse trees. -The latter definition of order is more low-level and can be easily converted to an efficient comparison procedure. -% -Finally, we construct TNFA and map parenthesized expressions to its paths, -which allows us to compare ambiguous paths using the definition of order on parenthesized expressions. -% -In this section we give the four basic definitions and the skeleton of the algorithm. -In the following sections we formalize the relation between different representations and fill in all the details. - - \begin{definition} - \emph{Regular expressions (RE)} over finite alphabet $\Sigma$, denoted $\XR_\Sigma$: - \begin{enumerate} - \item - Empty RE $\epsilon$ and - unit RE $\alpha$ (where $\alpha \in \Sigma$) are in $\XR_\Sigma$. - \item If $e_1, e_2 \in \XR_\Sigma$, then - union $e_1 | e_2$, - product $e_1 e_2$, - repetition $e_1^{n, m}$ (where $0 \leq n \leq m \leq \infty$), and - submatch group $(e_1)$ - are in $\XR_\Sigma$. - \end{enumerate} - \end{definition} - - - \begin{definition} - \emph{Parse trees (PT)} over finite alphabet $\Sigma$, denoted $\XT_\Sigma$: - \begin{enumerate} - \item - Nil tree ${\varnothing}^i$, - empty tree ${\epsilon}^i$ and - unit tree ${\alpha}^i$ (where $\alpha \in \Sigma$ and $i \in \YZ$) - are in $\XT_\Sigma$. - \item If $t_1, \dots, t_n \in \XT_\Sigma$ (where $n \geq 1$, and $i \in \YZ$), then - ${T}^i(t_1, \dots, t_n)$ - is in $\XT_\Sigma$. - \end{enumerate} - \end{definition} - - - \begin{definition} - \emph{Parenthesized expressions (PE)} over finite alphabet $\Sigma$, denoted $\XP_\Sigma$: - \begin{enumerate} - \item - Nil expression $\Xm$, - empty expression $\epsilon$ and - unit expression $\alpha$ (where $\alpha \in \Sigma$) - are in $\XP_\Sigma$. - \item If $e_1, e_2 \in \XP_\Sigma$, then - $e_1 e_2$ and - $\Xl e_1 \Xr$ - are in $\XP_\Sigma$. 
- \end{enumerate}
- \end{definition}
-
-
- \begin{definition}
- \emph{Tagged Nondeterministic Finite Automaton (TNFA)}
- is a structure $(\Sigma, Q, T, \Delta, q_0, q_f)$, where:
- \begin{itemize}
- \item[] $\Sigma$ is a finite set of symbols (\emph{alphabet})
- \item[] $Q$ is a finite set of \emph{states}
- \item[] $T \subset \YN \times \YZ \times \YN \times \YN$ is a mapping of \emph{tags} to their submatch group, lower nested tag and upper nested tag
- \item[] $\Delta = \Delta^\Sigma \sqcup \Delta^\epsilon$ is the \emph{transition} relation,
- consisting of two parts:
- \begin{itemize}
- \item[] $\Delta^\Sigma \subseteq Q \times \Sigma \times \{\epsilon\} \times Q$ (transitions on symbols)
- \item[] $\Delta^\epsilon \subseteq Q \times \YN \times \big( \YZ \cup \{\epsilon\} \big) \times Q$
- ($\epsilon$-transitions, where $\forall (q, n, \Xund, \Xund), (q, m, \Xund, \Xund) \in \Delta^\epsilon: n \neq m$)
- \end{itemize}
- \item[] $q_0 \in Q$ is the \emph{initial} state
- \item[] $q_f \in Q$ is the \emph{final} state
- \end{itemize}
- \end{definition}
-
-As the reader might notice, our definitions are subtly different from the usual ones in the literature.
-Regular expressions are extended with a submatch operator
-and generalized repetition (note that it is not just syntactic sugar: in POSIX \texttt{(a)(a)} is semantically different from \texttt{(a)\{2\}},
-and \texttt{(a)} is not the same as \texttt{a}).
-Parse trees have a special \emph{nil-tree} constructor
-and an upper index, which allows us to distinguish between submatch and non-submatch subtrees.
-Mirroring parse trees, parenthesized expressions also have a special \emph{nil-parenthesis}.
-TNFA is in essence a nondeterministic finite-state transducer
-in which some of the $\epsilon$-transitions are marked with \emph{tags} ---
-integer numbers that denote opening and closing parentheses of submatch groups.
-For the $i$-th group, the opening tag is $2i - 1$ and the closing tag is $2i$ (where $i \in \YN$).
-Tags can be negative, which represents the absence of a match and corresponds to the nil-parenthesis $\Xm$ and the nil-tree $\varnothing$.
-Additionally, all $\epsilon$-transitions are marked with a \emph{priority},
-which allows us to impose a specific order of TNFA traversal
-(all $\epsilon$-transitions from the same state have different priorities).
-\\
-
-\begin{algorithm}[H] \DontPrintSemicolon \SetKwProg{Fn}{}{}{} \SetAlgoInsideSkip{medskip}
-\setstretch{0.8}
-\Fn {$\underline{match \big( N \!\!=\! (\Sigma, Q, T, \Delta, q_0, q_f), \; \alpha_1 \!\hdots\! \alpha_n \big)} \smallskip$} {
-
-    $B, D : \text{uninitialized matrices in } \YZ^{|Q| \times |Q|}, \; U: \text{path context}$ \;
-    $r_0 = initial \Xund result(T)$ \;
-    $u_0 = empty \Xund path(\,)$ \;
-    $X = \big\{ (q_0, \varnothing, u_0, r_0) \big\}, \; i = 1$ \;
-
-    \BlankLine
-    \While {$i \leq n \wedge X \neq \emptyset$} {
-        $X = closure(N, X, U, B, D)$ \;
-        $X = update \Xund result(T, X, U, i, \alpha_i)$ \;
-        $(B, D) = update \Xund ptables(N, X, U, B, D)$ \;
-        $X = \big\{ (q, o, u_0, r) \mid (o, \Xund, \Xund, r) \in X \wedge (o, \alpha_i, \epsilon, q) \in \Delta^\Sigma \big\}$ \;
-        $i = i + 1$ \;
-    }
-
-    \BlankLine
-    $X = closure(N, X, U, B, D)$ \;
-    \If {$(q_f, \Xund, u, r) \in X$} {
-        \Return $f\!inal \Xund result (T, U, u, r, n)$
-    } \lElse {
-        \Return $\varnothing$
-    }
-
-    \BlankLine
-}
-\caption{TNFA simulation on a string.}\label{alg_match}
-\end{algorithm}
-\medskip
-
-The algorithm takes an automaton $N$ and string $\alpha_1 \!\hdots\!
\alpha_n$ as input,
-and outputs the match result in some form: it can be a parse tree or a POSIX array of offsets,
-but for now we leave it unspecified and hide it behind the functions
-$initial \Xund result ()$, $update \Xund result ()$ and $f\!inal \Xund result ()$.
-The algorithm works by consuming input symbols,
-tracking a set of active \emph{configurations}
-and updating \emph{precedence tables} $B$ and $D$.
-A configuration is a tuple $(q, o, u, r)$.
-The first component $q$ is a TNFA state that is unique for each configuration in the current set.
-Components $o$ and $u$ keep information about the path by which $q$ was reached:
-$o$ is the \emph{origin} state used as an index into the precedence tables,
-and $u$ is a path fragment constructed by $closure()$.
-The specific representation of path fragments is hidden behind the path context $U$ and the function stub $empty \Xund path ()$.
-Finally, the $r$-component is a partial match result associated with state $q$.
-Most of the real work happens inside $closure()$ and $update \Xund ptables ()$, both of which remain undefined for now.
-The $closure()$ function builds the $\epsilon$-closure of the current configuration set:
-it explores all states reachable by $\epsilon$-transitions from the $q$-components
-and tracks the best path to each reachable state.
-The $update \Xund ptables ()$ function
-performs pairwise comparison of all configurations in the new set,
-recording the results in the $B$ and $D$ matrices.
-At the next step $q$-components become $o$-components.
-If paths originating from the current configurations join at some future step,
-$closure ()$ will use the origin states to look up comparison results in the $B$ and $D$ matrices.
-If the paths do not join, then the comparison performed by $update \Xund ptables ()$ is redundant ---
-unfortunately, we do not know in advance which configurations will spawn ambiguous paths.
-\\
-
-
-\section{Formalization}\label{section_formalization}
-
-In this section we establish the relation between all intermediate representations.
-For brevity, all proofs are moved to the appendix.
-%
-First of all, we rewrite REs in a form that makes submatch information explicit:
-to each subexpression we assign an \emph{implicit} and an \emph{explicit} submatch index.
-Explicit indices enumerate submatch groups (for all other subexpressions they are zero).
-Implicit indices enumerate submatch groups and subexpressions that are not submatch groups,
-but contain nested or sibling groups and need to be considered by disambiguation.
-This form reflects the POSIX standard, which states that
-submatch extraction applies only to parenthesized subexpressions,
-but the longest-match rule applies to all subexpressions regardless of parentheses.
-
- \begin{definition}
- \emph{Indexed regular expressions (IRE)} over finite alphabet $\Sigma$, denoted $\XIR_\Sigma$:
- \begin{enumerate}
- \item
- Empty IRE $(i, j, \epsilon)$ and
- unit IRE $(i, j, \alpha)$, where $\alpha \in \Sigma$ and $i, j \in \YZ$,
- are in $\XIR_\Sigma$.
-
- \item If $r_1, r_2 \in \XIR_\Sigma$ and $i, j \in \YZ$, then
- union $(i, j, r_1 \mid r_2)$,
- product $(i, j, r_1 \cdot r_2)$ and
- repetition $(i, j, r_1^{n, m})$, where $0 \leq n \leq m \leq \infty$,
- are in $\XIR_\Sigma$.
- \end{enumerate}
- \end{definition}
-
-Function $\IRE$ transforms a RE into an IRE.
-It is defined via a composition of two functions:
-$mark()$, which transforms a RE into an IRE with submatch indices in the boolean range $\{0, 1\}$,
-and $enum()$, which substitutes boolean indices with consecutive numbers.
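-For illustration, note that an IRE is a plain algebraic data type.
-The following C++ sketch (given as an example only; it is not the data structure used in our implementation \cite{RE2C})
-mirrors the definition above, storing the implicit and explicit submatch indices $i$ and $j$ in each node:
-\begin{verbatim}
-// A hypothetical IRE node (illustration only).
-#include <vector>
-
-struct IRE {
-    enum Kind { EMPTY, UNIT, UNION, PRODUCT, REPEAT } kind;
-    int i;                 // implicit submatch index
-    int j;                 // explicit submatch index
-    char sym;              // alphabet symbol (UNIT only)
-    unsigned lo, hi;       // repetition bounds n, m (REPEAT only; a sentinel
-                           // value of hi may encode unbounded repetition)
-    std::vector<IRE> sub;  // two children for UNION/PRODUCT, one for REPEAT
-};
-\end{verbatim}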
-An example of constructing an IRE from a RE is given on figure \ref{fig:mark_enum}. -% - \begin{align*} - &\begin{aligned} - mark &: \XR_\Sigma \longrightarrow \XIR_\Sigma \\ - mark &(x) \mid_{x \in \{\epsilon, \alpha\}} = (0, 0, x) \\[-0.2em] - % - mark &(e_1 \circ e_2) \mid_{\circ \in \{|,\cdot\}} = (i, 0, - (i, j_1, r_1) \circ - (i, j_2, r_2) - ) \\[-0.2em] - &\text{where } (i_1, j_1, r_1) = mark(e_1) \\[-0.2em] - &\space{\hphantom{where }}(i_2, j_2, r_2) = mark(e_2) \\[-0.2em] - &\space{\hphantom{where }}i = i_1 \vee i_2 \\[-0.2em] - % - mark &(e^{n, m}) \mid_{e = (e_1)} = (1, 0, (1, 1, r)) \\[-0.2em] - &\text{where } (\Xund, \Xund, r) = mark(e_1) \\[-0.2em] - % - mark &(e^{n, m}) \mid_{e \neq (e_1)} = (i, 0, (i, j, r)) \\[-0.2em] - &\text{where } (i, j, r) = mark(e) \\[-0.2em] - % - mark &((e)) = mark((e)^{1, 1}) - \end{aligned} - % - &&\begin{aligned} - enum &: \YZ \times \YZ \times \XIR_\Sigma \longrightarrow \YZ \times \YZ \times \XIR_\Sigma \\ - enum &(\bar{i}, \bar{j}, (i, j, x)) \mid_{x \in \{\epsilon, \alpha\}} - = (\bar{i} + i, \bar{j} + j, (\bar{i} \times i, \bar{j} \times j, x)) - \\[-0.2em] - enum &(\bar{i}, \bar{j}, (i, j, r_1 \circ r_2)) \mid_{\circ \in \{|,\cdot\}} - = (i_2, j_2, (\bar{i} \times i, \bar{j} \times j, \bar{r}_1 \circ \bar{r}_2)) \\[-0.2em] - &\text{where } (i_1, j_1, \bar{r}_1) = enum(\bar{i} + i, \bar{j} + j, r_1) \\[-0.2em] - &\space{\hphantom{where }}(i_2, j_2, \bar{r}_2) = enum(i_1, j_1, r_2) - \\[-0.2em] - enum &(\bar{i}, \bar{j}, (i, j, r^{n,m})) = (i_1, j_1, (\bar{i} \times i, \bar{j} \times j, \bar{r}^{n,m})) \\[-0.2em] - &\text{where } - (i_1, j_1, \bar{r}) = enum(\bar{i} + i, \bar{j} + j, r) - \\[-0.2em] - \\[-0.2em] - \IRE &: \XR_\Sigma \rightarrow \XIR_\Sigma \\[-0.2em] - \IRE&(e) = r \\[-0.2em] - &\text{where }(\Xund, \Xund, r) = enum(1, 1, mark(e)) - \\[-0.2em] - \end{aligned} - \end{align*} - -The relation between regular expressions and parse trees is given by the operator $\PT$. -Each IRE denotes a set of PTs. -% -We write $str(t)$ to denote the string formed by concatenation of all alphabet symbols in the left-to-right traversal of $t$, -and $\PT(r, w)$ denotes the set $\big\{ t \in \PT(r) \mid str(t) = w \big\}$ of all PTs for IRE $r$ and a string $w$. -% - \begin{align*} - \PT &: \XIR_\Sigma \rightarrow 2^{\XT_\Sigma} - \\ - \PT\big((i, \Xund, \epsilon)\big) &= \{ {\epsilon}^{i} \} - \\[-0.2em] - \PT\big((i, \Xund, \alpha)\big) &= \{ {\alpha}^{i} \} - \\[-0.2em] - \PT\big((i, \Xund, (i_1, j_1, r_1) \mid (i_2, j_2, r_2))\big) &= - \big\{ {T}^{i}(t, \varnothing^{i_2}) \mid t \in \PT\big((i_1, j_1, r_1)\big) \big\} \cup - \big\{ {T}^{i}(\varnothing^{i_1}, t) \mid t \in \PT\big((i_2, j_2, r_2)\big) \big\} - \\[-0.2em] - \PT\big((i, \Xund, (i_1, j_1, r_1) \cdot (i_2, j_2, r_2))\big) &= - \big\{ {T}^{i}(t_1, t_2) \mid - t_1 \in \PT\big((i_1, j_1, r_1)\big), - t_2 \in \PT\big((i_2, j_2, r_2)\big) - \big\} \\[-0.2em] - \PT\big((i, \Xund, (i_1, j_1, r_1)^{n, m})\big) &= - \begin{cases} - \big\{ {T}^{i}(t_1, \dots, t_m) \mid t_k \in \PT\big((i_1, j_1, r_1)\big) \; - \forall k = \overline{1, m} \big\} \cup \{ {T}^{i}(\varnothing^{i_1}) \} &\text{if } n = 0 \\[-0.2em] - \big\{ {T}^{i}(t_n, \dots, t_m) \mid t_k \in \PT\big((i_1, j_1, r_1)\big) \; - \forall k = \overline{n, m} \big\} &\text{if } n > 0 - \end{cases} - \end{align*} - \medskip - -Following Okui and Suzuki, we assign \emph{positions} to the nodes of IRE and PT. 
-The root position is $\Lambda$, and position of the $i$-th subtree of a tree with position $p$ is $p.i$ -(we shorten $\|t\|_\Lambda$ as $\|t\|$). -The \emph{length} of position $p$, denoted $|p|$, is defined as $0$ for $\Lambda$ and $|p| + 1$ for $p.i$. -%The set of all positions is denoted $\XP$. -The subtree of a tree $t$ at position $p$ is denoted $t|_p$. -Position $p$ is a \emph{prefix} of position $q$ iff $q = p.p'$ for some $p'$, -and a \emph{proper prefix} if additionally $p \neq q$. -Position $p$ is a \emph{sibling} of position $q$ iff $q = q'.i, p = q'.j$ for some $q'$ and $i,j \in \YN$. -Positions are ordered lexicographically. -The set of all positions of a tree $t$ is denoted $Pos(t)$. -Additionally, we define a set of \emph{submatch positions} as -$Sub(t) = \big\{ p \mid \exists t|_p = s^i : i \neq 0 \big\}$ --- -a subset of $Pos(t)$ that contains positions of subtrees with nonzero implicit submatch index. -Intuitively, this is the set of positions important from disambiguation perspective: -in the case of ambiguity we do not need to consider the full trees, -just the relevant parts of them. -% -PTs have two definitions of norm, one for $Pos$ and one for $Sub$, -which we call \emph{p-norm} and \emph{s-norm} respectively: - -\begin{figure} -\includegraphics[width=\linewidth]{img/mark_enum.pdf} -\vspace{-2em} -\caption{ -IRE for RE $(\epsilon|a^{0,\infty})(a|\epsilon)^{0,3}$ -and examples of PTs for string $a$. -S-norm is marked with $\#$. -}\label{fig:mark_enum} -\end{figure} - - \begin{definition}\label{tnorm_of_PTs} - The \emph{p-norm} and \emph{s-norm} of a PT $t$ at position $p$ are: - \begin{align*} - \pnorm{t}{p} = - \begin{cases} - -1 &\text{if } p \in Pos(t) \text{ and } t|_p = \varnothing^i \\[-0.2em] - |str(t|_p)| &\text{if } p \in Pos(t) \text{ and } t|_p \neq \varnothing^i \\[-0.2em] - \infty &\text{if } p \not\in Pos(t) - \end{cases} - \quad\quad\quad - \snorm{t}{p} = - \begin{cases} - -1 &\text{if } p \in Sub(t) \text{ and } t|_p = \varnothing^i \\[-0.2em] - |str(t|_p)| &\text{if } p \in Sub(t) \text{ and } t|_p \neq \varnothing^i \\[-0.2em] - \infty &\text{if } p \not\in Sub(t) - \end{cases} - \end{align*} - \end{definition} - -Generally, the norm of a subtree means the number of alphabet symbols in its leaves, with two exceptions. -First, for nil subtrees the norm is $-1$: intuitively, they have the lowest ``ranking'' among all possible subtrees. -Second, for nonexistent subtrees (those with positions not in $Pos(t)$) the norm is infinite. -This may seem counter-intuitive at first, but it makes sense in the presence of REs with empty repetitions. -According to POSIX, optional empty repetitions are not allowed, and our definition reflects this: -if a tree $s$ has a subtree $s|_p$ corresponding to an empty repetition, -and another tree $t$ has no subtree at position $p$, -then the infinite norm $\|t\|_p$ ``outranks'' $\|s\|_p$. -We define two orders on PTs: - - \begin{definition}[P-order on PTs] - \label{total_order_on_PTs} - Given parse trees $t, s \in PT(r, w)$ for some IRE $r$ and string $w$, we say that $t <_p s$ w.r.t. \emph{decision position} $p$ - iff $\pnorm{t}{p} > \pnorm{s}{p}$ and $\pnorm{t}{q} = \pnorm{s}{q} \; \forall q < p$. - We say that $t < s$ iff $t <_p s$ for some $p$. - \end{definition} - - \begin{definition}[S-order on PTs] - \label{partial_order_on_PTs} - Given parse trees $t, s \in PT(r, w)$ for some IRE $r$ and string $w$, we say that $t \prec_p s$ w.r.t. 
\emph{decision position} $p$ % $p \in Sub(t) \cup Sub(s)$
- iff $\snorm{t}{p} > \snorm{s}{p}$ and $\snorm{t}{q} = \snorm{s}{q} \; \forall q < p$.
- We say that $t \prec s$ iff $t \prec_p s$ for some $p$.
- \end{definition}
-
- \begin{definition}\label{incomparable_PTs}
- PTs $t$ and $s$ are \emph{incomparable}, denoted $t \sim s$,
- iff neither $t \prec s$, nor $s \prec t$.
- \end{definition}
-
-\begin{theoremEnd}[restate, no link to proof, no link to theorem, category=theorem_porder_on_PTs]{theorem}
- \label{theorem_porder_on_PTs}
- P-order $<$ is a strict total order on $\PT(e, w)$ for any IRE $e$ and string $w$.
-\end{theoremEnd}
-\begin{proofEnd}
- We need to show that $<$ is transitive and trichotomous.
- \begin{itemize}[itemsep=0.5em, topsep=0.5em]
- \item[(1)]
- Transitivity: we need to show that $\forall t, s, u \in \PT(e,w): (t < s \wedge s < u) \implies t < u$.
- \\[0.5em]
- Let $t <_p s$ and $s <_q u$ for some positions $p$, $q$, and let $r = min (p, q)$.
- \\[0.5em]
- First, we show that $\pnorm{t}{r} > \pnorm{u}{r}$.
- If $p \leq q$, we have $\pnorm{t}{p} > \pnorm{s}{p}$ (implied by $t <_p s$)
- and $\pnorm{s}{p} \geq \pnorm{u}{p}$ (implied by $s <_q u \wedge p \leq q$),
- therefore $\pnorm{t}{p} > \pnorm{u}{p}$.
- Otherwise $p > q$, and we have $\pnorm{s}{q} > \pnorm{u}{q}$ (implied by $s <_q u$)
- and $\pnorm{t}{q} = \pnorm{s}{q}$ (implied by $t <_p s \wedge q < p$),
- therefore $\pnorm{t}{q} > \pnorm{u}{q}$.
- \\[0.5em]
- Second, we show that $\forall r' < r : \pnorm{t}{r'} = \pnorm{u}{r'}$.
- We have $\pnorm{t}{r'} = \pnorm{s}{r'}$ (implied by $t <_p s \wedge r' < p$)
- and $\pnorm{s}{r'} = \pnorm{u}{r'}$ (implied by $s <_q u \wedge r' < q$),
- therefore $\pnorm{t}{r'} = \pnorm{u}{r'}$.
-
- \item[(2)]
- Trichotomy: we need to show that $\forall t, s \in \PT(e,w)$
- exactly one of $t < s$, $s < t$ or $t = s$ holds.
- Consider the set of positions where the norms of $t$ and $s$ disagree:
- $P = \{p \in Pos(t) \cup Pos(s) : \pnorm{t}{p} \neq \pnorm{s}{p}\}$.
- %
- \begin{itemize}[itemsep=0.5em]
- \item[(2.1)] First case: $P \neq \emptyset$.
- We show that in this case exactly one of $t < s$ or $s < t$ is true
- ($t \neq s$ is obvious).
- \\[0.5em]
- First, we show that at least one of $t < s$ or $s < t$ is true.
- %
- Let $p = min(P)$; it is well-defined since $P$ is non-empty, finite and lexicographically ordered.
- %
- For all $q < p$ we have $\pnorm{t}{q} = \pnorm{s}{q}$ (by definition of $p$
- and because $\pnorm{t}{q} = \infty = \pnorm{s}{q}$ if $q \not\in Pos(t) \cup Pos(s)$).
- %
- Since $\pnorm{t}{p} \neq \pnorm{s}{p}$, we have either $t <_p s$ or $s <_p t$.
- \\[0.5em]
- Second, we show that at most one of $t < s$ or $s < t$ is true,
- i.e. $<$ is asymmetric: $\forall t, s \in \PT(e,w) : t < s \implies s \not< t$.
- %
- Suppose, on the contrary, that $t <_p s$ and $s <_q t$ for some $p$, $q$.
- Without loss of generality let $p \leq q$.
- On one hand $t <_p s$ implies $\pnorm{t}{p} > \pnorm{s}{p}$.
- But on the other hand $s <_q t \wedge p \leq q$ implies $\pnorm{t}{p} \leq \pnorm{s}{p}$.
- Contradiction.
-
- \item[(2.2)] Second case: $P = \emptyset$.
- We show that in this case $t = s$.
- \\[0.5em]
- We have $Pos(t) = Pos(s)$ --- otherwise there is a position with norm $\infty$ in only one of the trees.
- Therefore $t$ and $s$ have identical node structure.
- %
- By lemma \ref{lemma_positions} any position in $t$ and $s$ corresponds to the same position in $e$.
- %
- Since any position in $e$ corresponds to a unique explicit submatch index,
- it must be that the submatch indices of all nodes in $t$ and $s$ coincide.
- %
- Consider some position $p \in Pos(t)$.
- If $p$ corresponds to an inner node, then both $t|_p$ and $s|_p$ are of the form $T^i(\hdots)$.
- Otherwise, $p$ corresponds to a leaf node, which can be either $\varnothing$ or $\epsilon$ or $\alpha$.
- Since all three have different norms ($-1$, $0$ and $1$ respectively),
- and since $\pnorm{t}{p} = \pnorm{s}{p}$, it must be that $t|_p$ and $s|_p$ are identical.
- \end{itemize}
- \end{itemize}
-\end{proofEnd}
-\vspace{-0.5em}
-
-\begin{theoremEnd}[restate, no link to proof, no link to theorem, category=theorem_sorder_on_PTs]{theorem}
- \label{theorem_sorder_on_PTs}
- S-order $\prec$ is a strict weak order on $\PT(e, w)$ for any IRE $e$ and string $w$.
-\end{theoremEnd}
-\begin{proofEnd}
- We need to show that $\prec$ is asymmetric and transitive, and that the incomparability relation $\sim$ is transitive.
- \begin{itemize}[itemsep=0.5em, topsep=0.5em]
- \item[(1)]
- Asymmetry: we need to show that $\forall t, s \in \PT(e,w): t \prec s \implies s \not\prec t$.
- \\[0.5em]
- Suppose, on the contrary, that $t \prec_p s$ and $s \prec_q t$ for some $p$, $q$.
- Without loss of generality let $p \leq q$.
- On one hand $t \prec_p s$ implies $\snorm{t}{p} > \snorm{s}{p}$.
- But on the other hand $s \prec_q t \wedge p \leq q$ implies $\snorm{t}{p} \leq \snorm{s}{p}$.
- Contradiction.
-
- \item[(2)]
- Transitivity: we need to show that $\forall t, s, u \in \PT(e,w): (t \prec s \wedge s \prec u) \implies t \prec u$.
- \\[0.5em]
- Let $t \prec_p s$ and $s \prec_q u$ for some positions $p$, $q$, and let $r = min (p, q)$.
- \\[0.5em]
- First, we show that $\snorm{t}{r} > \snorm{u}{r}$.
- If $p \leq q$, we have $\snorm{t}{p} > \snorm{s}{p}$ (implied by $t \prec_p s$)
- and $\snorm{s}{p} \geq \snorm{u}{p}$ (implied by $s \prec_q u \wedge p \leq q$),
- therefore $\snorm{t}{p} > \snorm{u}{p}$.
- Otherwise $p > q$, and we have $\snorm{s}{q} > \snorm{u}{q}$ (implied by $s \prec_q u$)
- and $\snorm{t}{q} = \snorm{s}{q}$ (implied by $t \prec_p s \wedge q < p$),
- therefore $\snorm{t}{q} > \snorm{u}{q}$.
- \\[0.5em]
- Second, we show that $\forall r' < r : \snorm{t}{r'} = \snorm{u}{r'}$.
- We have $\snorm{t}{r'} = \snorm{s}{r'}$ (implied by $t \prec_p s \wedge r' < p$)
- and $\snorm{s}{r'} = \snorm{u}{r'}$ (implied by $s \prec_q u \wedge r' < q$),
- therefore $\snorm{t}{r'} = \snorm{u}{r'}$.
-
- \item[(3)]
- Transitivity of incomparability: we need to show that $\forall t, s, u \in \PT(e,w): (t \sim s \wedge s \sim u) \implies t \sim u$.
- \\[0.5em]
- By the forward implication of lemma \ref{lemma_incomparability_equivdef}
- $t \sim s \Rightarrow \forall p : \snorm{t}{p} = \snorm{s}{p}$ and
- $s \sim u \Rightarrow \forall p : \snorm{s}{p} = \snorm{u}{p}$, therefore
- $(t \sim s \wedge s \sim u) \Rightarrow \forall p : \snorm{t}{p} = \snorm{u}{p} \Rightarrow t \sim u$
- by the backward implication of lemma \ref{lemma_incomparability_equivdef}.
- \end{itemize}
-\end{proofEnd}
-
-The following theorem \ref{theorem_order_compat} establishes an important relation between P-order and S-order.
-P-order is total, and there is a unique $<$-minimal tree $t_{min}$.
-S-order is partial: it partitions all trees into equivalence classes,
-and there is a whole class of $\prec$-minimal trees $T_{min}$
-(such trees coincide in submatch positions, but differ in some non-submatch positions).
-Theorem \ref{theorem_order_compat} shows that $t_{min} \in T_{min}$.
-This means that P-order and S-order ``agree'' on the notion of minimal tree:
-we can continuously narrow down $T_{min}$ until we are left with $t_{min}$.
-In practice, this means that adding more parentheses to a RE does not drastically change submatch results.
-%
-Note that this doesn't mean that P-order is an extension of S-order:
-the two orders may disagree.
-For example, consider trees $t$ and $u$ on figure \ref{fig:mark_enum}:
-on one hand $t \prec_{2.2} u$, because $\snorm{t}{2.2} = \infty > 0 = \snorm{u}{2.2}$ and s-norms at all preceding submatch positions agree;
-on the other hand $u <_{1.1} t$, because $\pnorm{t}{1.1} = -1 < 0 = \pnorm{u}{1.1}$
-and p-norms at all preceding positions agree.
-
-\begin{theoremEnd}[restate, no link to proof, no link to theorem, category=theorem_order_compat]{theorem}
- \label{theorem_order_compat}
- Let $t_{min}$ be the $<$-minimal tree in $\PT(e,w)$ for some IRE $e$ and string $w$,
- and let $T_{min}$ be the class of the $\prec$-minimal trees in $\PT(e,w)$.
- Then $t_{min} \in T_{min}$.
-\end{theoremEnd}
-\begin{proofEnd}
- Consider any $t \in T_{min}$.
- From $t$ we can construct another tree $t'$ in the following way.
- Consider all positions $p \in Sub(t)$ which are not proper prefixes of another position in $Sub(t)$.
- For each such position, $t|_p$ is itself a PT for some sub-IRE $r'$ and substring $w'$: $t|_p \in \PT(r', w')$.
- Let $t'_{min}$ be the $<$-minimal tree in $\PT(r', w')$ and substitute $t|_p$ with $t'_{min}$.
- Let $t'$ be the tree resulting from all such substitutions
- (note that they are independent of the order in which we consider positions $p$).
- Since the substitutions preserve s-norm at submatch positions, we have $t' \in T_{min}$.
- We will show that $t' = t_{min}$.
- \\[0.5em]
- Suppose, on the contrary, that $t' \neq t_{min}$.
- %
- Then $t_{min} <_p t'$ for some decision position $p$.
- %
- It must be that $p \not\in Sub(t') \cup Sub(t_{min})$, because
- otherwise $\snorm{t_{min}}{p} = \pnorm{t_{min}}{p} > \pnorm{t'}{p} = \snorm{t'}{p}$
- and $\pnorm{t_{min}}{q} = \pnorm{t'}{q} \; \forall q < p$ implies $\snorm{t_{min}}{q} = \snorm{t'}{q} \; \forall q < p$,
- which means that $t_{min} \prec_p t'$, which contradicts $t' \in T_{min}$.
- Thus $p$ is a non-submatch position.
- %
- Let $p = p'.p''$, where $p'$ is the longest proper prefix of $p$ in $Sub(t') \cup Sub(t_{min})$.
- %
- For all $q \leq p'$ it must be that $\snorm{t'}{q} = \snorm{t_{min}}{q}$,
- otherwise $\snorm{t'}{q} \neq \snorm{t_{min}}{q}$ implies $\pnorm{t'}{q} \neq \pnorm{t_{min}}{q}$,
- which contradicts $t_{min} <_p t'$ because $q \leq p' < p$.
- %
- By lemma \ref{lemma_subtrees}, the subtrees $t'|_{p'}$ and $t_{min}|_{p'}$ are comparable:
- $\exists r', w' : t'|_{p'}, t_{min}|_{p'} \in \PT(r', w')$.
- By construction of $t'$, the subtree $t'|_{p'}$ is $<$-minimal in $\PT(r', w')$,
- but at the same time $t_{min} <_{p'.p''} t'$ implies $t_{min}|_{p'} <_{p''} t'|_{p'}$.
- Contradiction.
-\end{proofEnd}
-
-Following the idea of Okui and Suzuki,
-we go from comparison of parse trees to comparison of their linearized representation --- parenthesized expressions.
-Parenthesis $\Xl$ is opening, and
-parenthesis $\Xr$ is closing;
-the \emph{nil}-parenthesis $\Xm$ is both opening and closing.
-For convenience we sometimes annotate parentheses with \emph{height}, -which we define as the number of preceding opening parentheses (including this one) -minus the number of preceding closing parentheses (including this one). -Explicit height annotations allow us to consider PE fragments in isolation -without losing the context of the whole expression. -However, height is not a part of parenthesis itself, -and it is not taken into account when comparing the elements of PEs. -Function $\Phi$ transforms PT at the given height into PE: -% - \begin{align*} - \Phi &: \YZ \times \XT_\Sigma \rightarrow \XP_\Sigma - \\ - \Phi_{h}(t^{i}) &= \begin{cases} - str(t^{i}) &\text{if } i = 0 \\[-0.2em] - \Xm_h &\text{if } i \neq 0 \wedge t = \varnothing \\[-0.2em] - \Xl_{h+1} \Xr_h &\text{if } i \neq 0 \wedge t = \epsilon \\[-0.2em] - \Xl_{h+1} a \Xr_h &\text{if } i \neq 0 \wedge t = a \in \Sigma \\[-0.2em] - \Xl_{h+1} \Phi_{h+1}(t_1) \dots \Phi_{h+1}(t_n) \Xr_h &\text{if } i \neq 0 \wedge t = T(t_1, \dots, t_n) - \end{cases} - \end{align*} - -For a given IRE $r$ and string $w$ the set of all PEs $\big\{ \Phi_{0}(t) \mid t \in \PT(r, w) \big\}$ is denoted $\PE(r, w)$, -and the set of all prefixes in $\PE(r, w)$ is denoted $\PR(r, w)$. -Each PE $\alpha$ can be represented as $\alpha_0 a_1 \alpha_1 \dots a_n \alpha_n$, -where $\alpha_i$ is the $i$-th \emph{frame} --- a possibly empty sequence of parentheses between -subsequent alphabet symbols $a_i$ and $a_{i+1}$ (or the beginning and end of $\alpha$). -PE fragments $\alpha$ and $\beta$ are \emph{comparable} -if they have the same number of frames and $\alpha, \beta \in \PR(r, w)$ for some $r$ and $w$. -% -For fragments $\alpha$ and $\beta$, -$\alpha \sqcap \beta$ denotes their longest common prefix, -$\alpha \backslash \beta$ denotes the suffix of $\alpha$ after removing $\alpha \sqcap \beta$, -$lasth(\alpha)$ denotes the height of the last parenthesis in $\alpha$ (or $\infty$ if $\alpha$ is empty or begins with an alphabet symbol), -$minh(\alpha)$ denotes the minimal height of parenthesis in $\alpha$ (or $\infty$ if $\alpha$ is empty or begins with an alphabet symbol), -$f\!irst(\alpha)$ denotes the first parenthesis in $\alpha$ (or $\bot$ if $\alpha$ is empty or begins with an alphabet symbol). -For comparable PE fragments $\alpha$ and $\beta$ the index of the first distinct pair of frames is called \emph{fork}. - -\begin{figure} -\includegraphics[width=\linewidth]{img/pe.pdf} -\vspace{-2em} -\caption{ -Examples: (a) -- (d): four main rules of POSIX comparison, -(e) -- pairwise comparison of PEs. -}\label{fig_pe} -\end{figure} - - \begin{definition} - \label{def_traces} - Let $\alpha$, $\beta$ be comparable PE prefixes, such that - $\alpha = \alpha_0 a_1 \alpha_1 \dots a_n \alpha_n$, - $\beta = \beta_0 a_1 \beta_1 \dots a_n \beta_n$ and $k$ is the fork. - We define $trace (\alpha, \beta)$ as the sequence $(\rho_0, \dots, \rho_n)$, where: - % - \begin{align*} - \rho_i = \begin{cases} - -1 &\text{if } i < k \\[-0.2em] - min (lasth (\alpha_i \sqcap \beta_i), minh(\alpha_i \backslash \beta_i)) &\text{if } i = k \\[-0.2em] - min (\rho_{i-1}, minh(\alpha_i)) &\text{if } i > k - \end{cases} - \end{align*} - - We write $traces(\alpha, \beta)$ to denote $\big( trace (\alpha, \beta), trace (\beta, \alpha) \big)$. - \end{definition} - - \begin{definition}\label{prec1} - (Longest precedence.) - Let $\alpha$, $\beta$ be comparable PE prefixes and - $traces(\alpha, \beta) = \big( (\rho_0, \dots, \rho_n), (\rho'_0, \dots, \rho'_n) \big)$. 
- Then $\alpha \sqsubset \beta \Leftrightarrow \exists i \leq n: - \big( \rho_i > \rho'_i \big) \wedge - \big( \rho_j = \rho'_j \; \forall j > i \big)$. - If neither $\alpha \sqsubset \beta$, nor $\beta \sqsubset \alpha$, - then $\alpha$, $\beta$ are \emph{longest-equivalent}: $\alpha \sim \beta$ - (note that in this case $\rho_i = \rho'_i \; \forall i = \overline {1, n}$). - \end{definition} - - \begin{definition}\label{prec2} - (Leftmost precedence.) - Let $\alpha$, $\beta$ be comparable PE prefixes, and let - $x = first (\alpha \backslash \beta)$, - $y = first (\beta \backslash \alpha)$. - Then $\alpha \subset \beta \Leftrightarrow x < y$, where - the set of possible values of $x$ and $y$ is ordered as follows: - $\bot < \Xr < \Xl < \Xm$. - \end{definition} - - \begin{definition}\label{pe_order} - (Longest-leftmost precedence.) - Let $\alpha$, $\beta$ be comparable PE prefixes, then - $\alpha < \beta \Leftrightarrow - \big( \alpha \sqsubset \beta \big) \vee - \big( \alpha \sim \beta \wedge \alpha \subset \beta \big)$. - \end{definition} - -\begin{theoremEnd}[restate, no link to proof, no link to theorem, category=theorem_order_on_pe_same_as_on_pt]{theorem} - \label{theorem_order_on_pe_same_as_on_pt} - If $s, t \in \PT(e, w)$ for some IRE $e$ and string $w$, then - $s \prec t \Leftrightarrow \Phi_{h}(s) < \Phi_{h}(t) \; \forall h$. -\end{theoremEnd} -\begin{proofEnd} - Forward implication is given by lemma \ref{lemma_pe_less}. - Backward implication: - suppose, on the contrary, that $\Phi_{h}(s) < \Phi_{h}(t) \; \forall h$, but $s \not\prec t$. - Since $\prec$ is a strict weak order (by theorem \ref{theorem_sorder_on_PTs}), - it must be that either $s \sim t$ - (then $\Phi_{h}(s) = \Phi_{h}(t) \; \forall h$ by lemma \ref{lemma_pe_equiv}), - or $t \prec s$ - (then $\Phi_{h}(t) < \Phi_{h}(s) \; \forall h$ by lemma \ref{lemma_pe_less}). - Both cases contradict $\Phi_{h}(s) < \Phi_{h}(t) \; \forall h$, - therefore assumption $s \not\prec t$ is incorrect. -\end{proofEnd} - -Next, we go from comparison of PEs to comparison of TNFA paths. -% -A \emph{path} in TNFA $(\Sigma, Q, T, \Delta, q_0, q_f)$ -is a sequence of transitions $\{(q_i, a_i, b_i, q_{i + 1})\}_{i=1}^{n-1} \subseteq \Delta$, where $n \in \YN$. -% -Every path induces a string of alphabet symbols -and a mixed string of symbols and tags which corresponds to a fragment of PE: -positive opening tags map to $\Xl$, -positive closing tags map to $\Xr$, -and negative tags map to $\Xm$. -We write $q_1 \overset {s|\alpha} {\rightsquigarrow} q_2$ -to denote the fact that a path from $q_1$ to $q_2$ induces alphabet string $s$ and PE fragment $\alpha$. -% -We extend the notion of order from PEs to paths: given paths -$\pi_1 = q_1 \overset {s|\alpha} {\rightsquigarrow} q_2$ and -$\pi_2 = q_1 \overset {s|\beta} {\rightsquigarrow} q_3$ -we say that $\pi_1 < \pi_2$ if $\alpha < \beta$. -% -For a given IRE $e$ we say that a path in TNFA for $e$ is \emph{minimal} if it induces -$\alpha = \PE(t)$ for some minimal tree $t \in \PT(e)$. -% -Two paths are \emph{ambiguous} if their start and end states coincide and they induce the same alphabet string. -Two paths have a \emph{join point} if they have ambiguous prefixes. -% -In order to justify our TNFA simulation algorithm, -we need to show that PEs induced by TNFA paths can be compared incrementally -(otherwise we would have to keep full-length PEs, which requires the amount of memory proportional to the length of input). 
-Justification of incremental comparison consists of two parts: -the following lemma \ref{lemma_incr_cmp_frames} justifies comparison between frames, -and lemmas \ref{lemma_closure_minpaths}, \ref{lemma_closure_noloop}, \ref{lemma_closure_rightdist} -in section \ref{section_closure} justify comparison at join points inside of one frame -(this is necessary as the number of paths in closure may be exponential in the number of states). - -\begin{theoremEnd}[restate, no link to proof, no link to theorem, category=lemma_frames]{lemma} -[Frame-by-frame comparison of PEs] - \label{lemma_incr_cmp_frames} - If $\alpha$, $\beta$ are comparable PE prefixes, - $c$ is an alphabet symbol and - $\gamma$ is a single-frame PE fragment, - then $\alpha < \beta$ implies $\alpha c \gamma < \beta c \gamma$. -\end{theoremEnd} -\begin{proofEnd} - Let $\big((\rho_1, \dots, \rho_n), (\rho'_1, \dots, \rho'_n)\big) = traces(\alpha, \beta)$ where $n \geq 1$. - Since $\alpha c \gamma$, $\beta c \gamma$ have one more frame than $\alpha$, $\beta$ - and the first $n$ frames are identical to frames of $\alpha$, $\beta$, - we can represent $traces(\alpha c \gamma, \beta c \gamma)$ - as $\big((\rho_1, \dots, \rho_n, \rho_{n+1}), (\rho'_1, \dots, \rho'_n, \rho'_{n+1})\big)$. - % - \begin{itemize}[itemsep=0.5em, topsep=0.5em] - \item[(1)] - Case $\alpha \sim \beta \wedge \alpha \subset \beta$. - In this case $\rho_i = \rho'_i \;\forall i \leq n$, - therefore $\rho_{n+1} = min(\rho_n, minh(\gamma)) = min(\rho'_n, minh(\gamma)) = \rho'_{n+1}$ - and $\alpha c \gamma \sim \beta c \gamma$. - Furthermore, - $first (\alpha c \gamma \backslash \beta c \gamma) = first (\alpha \backslash \beta)$ and - $first (\beta c \gamma \backslash \alpha c \gamma) = first (\beta \backslash \alpha)$, - therefore $\alpha \subset \beta \implies \alpha c \gamma \subset \beta c \gamma$. - \item[(2)] - Case $\alpha \sqsubset \beta$. - In this case $\exists j \leq n$ such that $\rho_j > \rho'_j$ and $\rho_i = \rho'_i \;\forall j < i \leq n$. - We show that $\exists l \leq n + 1$ such that $\rho_l > \rho'_l$ and $\rho_i = \rho'_i \;\forall l < i \leq n + 1$, - which by definition means that $\alpha c \gamma \sqsubset \beta c \gamma$. - \begin{itemize} - \item[(2a)] - Case $j < n$. - In this case $\rho_n = \rho'_n$ and - $\rho_{n+1} = min(\rho_n, minh(\gamma)) = min(\rho'_n, minh(\gamma)) = \rho'_{n+1}$, - therefore $l = j$. - \item[(2b)] - Case $j = n$ and $minh(\gamma) > \rho'_n$. - In this case $\rho_n > \rho'_n$ and we have - $\rho_{n+1} = min(\rho_n, minh(\gamma)) > \rho'_n$ and - $\rho'_{n+1} = min(\rho'_n, minh(\gamma)) = \rho'_n$, - therefore $\rho_{n+1} > \rho'_{n+1}$ - and $l = n + 1$. - \item[(2c)] - Case $j = n$ and $minh(\gamma) \leq \rho'_n$. - In this case $\rho_n > \rho'_n$ and we have - $\rho_{n+1} = min(\rho_n, minh(\gamma)) = minh(\gamma)$ and - $\rho'_{n+1} = min(\rho'_n, minh(\gamma)) = minh(\gamma)$, - therefore $\rho_{n+1} = \rho'_{n+1}$ - and $l = n$. - \end{itemize} - \end{itemize} - In both cases $\alpha c \gamma < \beta c \gamma$. -\end{proofEnd} - - -\section{closure construction}\label{section_closure} - -The problem of constructing $\epsilon$-closure with POSIX disambiguation -can be formulated as a shortest path problem on directed graph with weighted arcs. -In our case weight is not a number --- it is the PE fragment induced by the path. -% -We give two algorithms for closure construction: GOR1, named after the well-known Goldberg-Radzik algorithm \cite{GR93}, -and GTOP, named after ``global topological order''. 
-%
-Both have the usual structure of shortest-path finding algorithms.
-The algorithm starts with a set of initial configurations, an empty queue and an empty set of resulting configurations.
-Initial configurations are enqueued and the algorithm loops until the queue becomes empty.
-At each iteration it dequeues a configuration $(q, o, u, r)$ and scans the $\epsilon$-transitions from state $q$.
-For a transition $(q, \Xund, \gamma, p)$ it constructs a new configuration $(p, o, v, r)$
-that combines $u$ and $\gamma$ into an extended path $v$.
-If the resulting set contains another configuration for state $p$,
-then the algorithm chooses the configuration which has a better path from the POSIX perspective.
-Otherwise it adds the new configuration to the resulting set.
-If the resulting set has changed, the new configuration is enqueued for further scanning.
-Eventually all states in the $\epsilon$-closure are explored, no improvements can be made, and the algorithm terminates.
-%
-%Lemma \ref{lemma_closure_rightdist} allows us to skip comparison in non-join states (with in-degree 1), because
-%any path to such state is formed by concatenation of the unique transition and the shortest known path to the previous state.
-\\
-
-The difference between GOR1 and GTOP is in the order in which they inspect configurations.
-%
-Both algorithms are based on the idea of topological ordering.
-Unlike other shortest-path algorithms, their queuing discipline is based on the graph structure, not on distance estimates.
-This is crucial, because we do not have any distance estimates:
-paths can be compared, but there is no absolute ``POSIX-ness'' value that we can attribute to each path.
-%
-GOR1 is described in \cite{GR93}.
-It uses two stacks and makes a number of passes;
-each pass consists of a depth-first search on the admissible subgraph
-followed by a linear scan of states that are topologically ordered by the depth-first search.
-The algorithm is one of the most efficient shortest-path algorithms \cite{CGR96}.
-The $n$-pass structure guarantees the $O(n \, m)$ worst-case complexity of the Bellman-Ford algorithm,
-where $n$ is the number of states and $m$ is the number of transitions in the $\epsilon$-closure
-(both can be approximated by TNFA size) \cite{CGGTW09}.
-%
-GTOP is a simple algorithm that maintains one global priority queue (e.g. a binary heap)
-ordered by the topological index of states (for graphs with cycles, we assume reverse depth-first post-order).
-Since GTOP does not have the $n$-pass structure, its worst-case complexity is not clear.
-However, it is much simpler to implement,
-and in practice it performs almost identically to GOR1 on graphs induced by TNFA $\epsilon$-closures.
-%
-On acyclic graphs, both GOR1 and GTOP have linear $O(n + m)$ complexity.
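-%
-Before giving the full pseudocode, we illustrate the GTOP queuing discipline with a deliberately simplified C++ sketch.
-It is an illustration only, not our actual implementation:
-path extension, the precedence tables and result pruning are omitted,
-and the placeholder $improves()$ stands for the POSIX path comparison described in section \ref{section_comparison}.
-\begin{verbatim}
-// Simplified GTOP-style closure: one global priority queue keyed by the
-// topological index of states (illustration only).
-#include <cstddef>
-#include <functional>
-#include <queue>
-#include <utility>
-#include <vector>
-
-struct Conf { /* origin state, path fragment and result are omitted */ };
-static bool improves(const Conf &, const Conf &)
-{ return false; } // placeholder for the POSIX path comparison
-
-void gtop_closure(const std::vector<std::vector<int> > &esucc, // eps-successors
-                  const std::vector<int> &topord,   // topological index per state
-                  std::vector<const Conf*> &result, // best configuration per state
-                  const std::vector<int> &init)     // states of initial configurations
-{
-    typedef std::pair<int, int> item_t; // (topological index, state)
-    std::priority_queue<item_t, std::vector<item_t>,
-                        std::greater<item_t> > queue; // min-heap: smallest index first
-    std::vector<bool> inqueue(esucc.size(), false);
-
-    for (size_t k = 0; k < init.size(); ++k) {
-        queue.push(item_t(topord[init[k]], init[k]));
-        inqueue[init[k]] = true;
-    }
-    while (!queue.empty()) {
-        const int q = queue.top().second;
-        queue.pop();
-        inqueue[q] = false;
-        for (size_t k = 0; k < esucc[q].size(); ++k) {
-            const int p = esucc[q][k];
-            const Conf *y = result[q]; // a real implementation extends the path here
-            if (result[p] == NULL || improves(*y, *result[p])) {
-                result[p] = y;     // found a better configuration for state p
-                if (!inqueue[p]) { // (re)enqueue p, keyed by its topological index
-                    queue.push(item_t(topord[p], p));
-                    inqueue[p] = true;
-                }
-            }
-        }
-    }
-}
-\end{verbatim}
-The full algorithms, including GOR1 and the handling of configurations, paths and precedence tables, are given below.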
-\\ - -\begin{algorithm}[] \DontPrintSemicolon \SetKwProg{Fn}{}{}{} \SetAlgoInsideSkip{medskip} -\begin{multicols}{2} - - \newcommand \NOPASS {O\!F\!F} - \newcommand \TOPSORT {T\!O\!P} - \newcommand \LINEAR {L\!I\!N} - \newcommand \INQUEUE {I\!N} - \newcommand \OFFQUEUE {OUT} - \newcommand \Xfalse {f\!al\!se} - - \setstretch{0.85} - - \Fn {$\underline{closure \Xund gor1(N\!=\!(\Sigma, Q, T, \Delta, q_0, q_f), X, U, B, D)} \smallskip$} { - - \Indm - context: $C = (N, U, B, D$ \; - \Indp - $,\, topsort, linear : \text{stacks of states } q \in Q$ \; - $,\, result : Q \rightarrow \YC \cup \{ \varnothing \}$ \; - $,\, status : Q \rightarrow \{ \NOPASS, \TOPSORT, \LINEAR \}$ \; - $,\, indeg : Q \rightarrow \YZ$ \tcp{in-degree of state} - $,\, active : Q \rightarrow \YB$ \tcp{true if state needs rescan} - $,\, etrans : Q \rightarrow 2^{\Delta^\epsilon}$ \tcp{$\epsilon$-transitions ordered by priority} - $,\, next : Q \rightarrow \YZ)$ \tcp{index of current transition} - \Indm - \Indp - - \BlankLine - $result(q) \equiv \varnothing$ \; - $status(q) \equiv \NOPASS$ \; - $active(q) \equiv \Xfalse$ \; - $next(q) \equiv 1$ \; - - \BlankLine - \For {$x = (\Xund, q, \Xund, \Xund) \in X$ sorted by inverted $prec(\,)$} { - $result(q) = x$ \; - $push(topsort, q)$ - } - - \BlankLine - \While {$topsort$ is not empty} { - - \BlankLine - \While {$topsort$ is not empty} { - $q = pop(topsort)$ \; - - \If {$status(q) \neq \LINEAR$} { - - $status(q) = \TOPSORT$ \; - $push(topsort, q)$ \; - - \BlankLine - \If {$\neg scan(q, C, \Xfalse)$} { - $status(q) = \LINEAR$ \; - $pop(topsort)$ \; - $push(linear, q)$ - } - } - } - - \BlankLine - \While {$linear$ is not empty} { - $q = pop(linear)$ \; - - \If {$active(q)$} { - $next(q) = 1$ \; - $active(q) = \Xfalse$ \; - $scan(q, C, true)$ \; - } - - $status(q) = \NOPASS$ \; - } - } - - \BlankLine - \Return $prune(result, N)$ - } - \BlankLine - \BlankLine - - \Fn {$\underline{scan (q, C, all)} \smallskip$} { - $any = \Xfalse$ \; - - \While {$next(q) < n$} { - $(q, \epsilon, \tau, p) = etrans (q)_{next(q)}$ \; - $next(q) = next(q) + 1$ \; - $x = result(p), \; (o, q, u, r) = result(q)$ \; - $y = (o, p, extend \Xund path (H, u, \tau), r)$ \; - - \BlankLine - \If {$x \!=\! \varnothing \vee indeg(p) \!<\! 2 \vee less(y, x, C)$} { - $result(p) = y$ \; - \If {$status(q) = \NOPASS$} { - $any = true$ \; - $next(p) = 1$ \; - $push(topsort, p)$ \; - \lIf {$\neg all$} {$break$} - } - \lElse { - $active(p) = 1$ - } - } - } - - \Return $any$ \; - } - \BlankLine - \BlankLine - -\columnbreak - - \Fn {$\underline{closure \Xund gtop(N\!=\!(\Sigma, Q, T, \Delta, q_0, q_f), X, U, B, D)} \smallskip$} { - - \Indm - context: $C = (N, U, B, D$ \; - \Indp - $,\, queue : \text{priority queue of states } q \in Q$ \; - $,\, result : Q \rightarrow \YC \cup \{ \varnothing \}$ \; - $,\, status : Q \rightarrow \{ \INQUEUE, \OFFQUEUE\}$ \; - $,\, indeg : Q \rightarrow \YZ$ \tcp{in-degree of state} - $,\, topord : Q \rightarrow \YZ$ \tcp{topological index of state} - $,\, etrans : Q \rightarrow 2^{\Delta^\epsilon}$ \tcp{$\epsilon$-transitions} - \Indm - \Indp - - \BlankLine - $result(q) \equiv \varnothing$ \; - $status(q) \equiv \OFFQUEUE$ \; - - \BlankLine - \For {$x = (\Xund, q, \Xund, \Xund) \in X$} { - $y = result(q)$ \; - \If {$y \!=\! 
\bot \vee less(x, y, C)$} { - $result(q) = x$ \; - \If {$status(q) \neq \INQUEUE$} { - $insert \Xund with \Xund priority(queue, q, topord(q))$ \; - $status(q) = \INQUEUE$ \; - } - } - } - - \BlankLine - \While {$queue$ is not empty} { - - $q = extract \Xund min(queue)$ \; - $status(q) = \OFFQUEUE$ \; - - \BlankLine - \For {$(q, \epsilon, \tau, p) \in etrans (q)$} { - $x = result(p), \; (o, q, u, r) = result(q)$ \; - $y = (o, p, extend \Xund path (H, u, \tau), r)$ \; - - \BlankLine - \If {$x \!=\! \varnothing \vee indeg(p) \!<\! 2 \vee less(y, x, C)$} { - $result(p) = y$ \; - \If {$status(p) \neq \INQUEUE$} { - $insert \Xund with \Xund priority(queue, p, topord(p))$ \; - $status(p) = \INQUEUE$ \; - } - } - } - } - - \BlankLine - \Return $prune(result, N)$ - } - \BlankLine - \BlankLine - - \Fn {$\underline{prune (X, N)} \smallskip$} { - \Return $\big\{ (\Xund, q, \Xund, \Xund) \in X \mid - q \in F \vee \exists (q, \alpha, \Xund, \Xund) \in \Delta^\Sigma \}$ - } - \BlankLine - \BlankLine - - \Fn {$\underline{less (x, y, C)} \smallskip$} { - $(\Xund, \Xund, l) = compare (x, y, U, B, D)$ \; - \Return $l < 0$ - } - \BlankLine - \BlankLine - - \Fn {$\underline{prec (x, y, D)} \smallskip$} { - $(q, \Xund, \Xund, \Xund) = x, \; (p, \Xund, \Xund, \Xund) = y$ \; - \Return $D[q][p] < 0$ - } - \BlankLine - \BlankLine - -\end{multicols} -\vspace{1em} -\caption{ -Closure algorithms GOR1 (on the left) and GTOP (on the right). -Definition of functions of $push()$, $pop()$, $insert \Xund with \Xund priority()$, $extract \Xund min()$, -$indeg()$ and $topord()$ is omitted for brevity. -Definitions of $compare ()$ and $extend \Xund path ()$ are given in sections \ref{section_comparison} and \ref{section_pathtree}. -$\YC$ is the set of all configurations.} -\end{algorithm} - -The general proof of correctness of shortest-path algorithms is out of the scope of this paper. -However, we need to justify the application of these algorithms to our setting. -% -In order to do that, we recall the framework for solving shortest-path algorithms based on \emph{closed semirings} -described in \cite{CLR} (section 26.4) -and show that our problem fits into this framework. -% -A \emph{semiring} is a structure $(\YK, \oplus, \otimes, \Xbar{0}, \Xbar{1})$, where -$\YK$ is a set, -$\oplus \!\!:\!\! \YK \times \YK \rightarrow \YK$ is an associative and commutative operation with identity element $\Xbar{0}$, -$\otimes \!\!:\!\! \YK \times \YK \rightarrow \YK$ is an associative operation with identity element $\Xbar{1}$, -$\otimes$ distributes over $\oplus$ -and $\Xbar{0}$ is annihilator for $\otimes$. -% -Additionally, \emph{closed} semiring requires that -$\oplus$ is idempotent, -any countable $\oplus$-sum of $\YK$ elements is in $\YK$, -and associativity, commutativity, distributivity and idempotence apply to countable $\oplus$-sums. -Mohri generalizes this definition and notes that either left or right distributivity is sufficient \cite{Moh02}. -% -In our case $\YK$ is the set of closure paths without tagged $\epsilon$-loops: -the following lemma \ref{lemma_closure_minpaths} and \ref{lemma_closure_noloop} -show that, on one hand, paths with tagged $\epsilon$-loops are not minimal, -and on the other hand such paths are discarded by the algorithm, -so they can be removed from consideration. -% -Consequently $\YK$ is finite. 
-We have semiring $(\YK, min, \cdot, \varnothing, \epsilon)$, where -$min$ is POSIX comparison of ambiguous paths, -$\cdot$ is concatenation of paths at the join points -(subject to restriction that paths do not contain tagged $\epsilon$-loops -and remain within TNFA bounds --- concatenation of arbitrary paths is not in $\YK$), -$\varnothing$ corresponds to artificial infinitely long path, -and $\epsilon$ is the empty path. -% -It is easy to show that -$min$ is commutative and associative, -$\varnothing$ is identity for $min$ ($min(\pi, \varnothing) = min(\varnothing, \pi) = \pi$), -$\cdot$ is associative, -$\epsilon$ is identity for $\cdot$ ($\pi \cdot \epsilon = \epsilon \cdot \pi = \pi$), -$\varnothing$ is annihilator for $\cdot$ ($\pi \cdot \varnothing = \varnothing \cdot \pi = \varnothing$), -and right distributivity of $\cdot$ over $min$ for paths with at most one $\epsilon$-loop is given by lemma \ref{lemma_closure_rightdist}. -% -Idempotence holds because $min(\pi, \pi) = \pi$. -% -Since $\YK$ is finite, the properties for $\oplus$-sums over countable subsets are satisfied. - -\begin{theoremEnd}[restate, no link to proof, no link to theorem, category=lemmata_closure]{lemma} - \label{lemma_closure_minpaths} - Minimal paths do not contain tagged $\epsilon$-loops. -\end{theoremEnd} -\begin{proofEnd} - % Proof in terms of REs and correspondence between subexpression and loop - % is a bit hard because of unrolling of repetition in TNFA construction - % (there is no direct correspondence between sub-RE and sub-TNFA). - % - Suppose, on the contrary, that $\pi$ is a minimal path in some TNFA - and that $\pi$ contains at least one tagged $\epsilon$-loop. - We show that it is possible to construct another path $\pi'$ such that $\pi' < \pi$. - % - Path $\pi$ can be represented as - $\pi = \pi_1 \pi_2 \pi_3$, where - $\pi_1 = q_0 \overset {u | \alpha} {\rightsquigarrow} q$, - $\pi_2 = q \overset {\epsilon | \beta} {\rightsquigarrow} q$ is the last tagged $\epsilon$-loop on $\pi$ and - $\pi_3 = q \overset {v | \gamma} {\rightsquigarrow} q_f$. - Let $\pi' = \pi_1 \pi_3$ be the path that is obtained from $\pi$ by removing the loop $\pi_2$. - Paths $\pi$ and $\pi'$ consume the same input string $uv$ - and induce comparable PEs $\alpha \beta \gamma$ and $\alpha \gamma$. - Let $\big( (\rho_1, \hdots, \rho_n), (\rho'_1, \hdots, \rho'_n) \big) = traces (\alpha \beta \gamma, \alpha \gamma)$ - and let $k$ be the index of the fork frame. - % - By construction of TNFA the loop $\pi_2$ must be contained in a sub-TNFA $f$ - for sub-IRE of the form $e = (\Xund, \Xund, e_1^{1,\infty})$, - as this is the only looping TNFA construct --- see algorithm \ref{alg_tnfa}. - Let $f_1$ be the sub-TNFA for $e_1$. - Path $\pi$ enters $f$ and iterates through $f_1$ at least twice before leaving $f$ - (single iteration is not enough to create a loop by TNFA construction). - Let $j$ be the total number of iterations through $f_1$, - and let $i$ be the index of the last $\epsilon$-loop iteration - (note that not all iterations are necessarily $\epsilon$-loops). - Consider two possible cases: - % - \begin{enumerate}[itemsep=0.5em, topsep=0.5em] - \item[(1)] - Case $i = j$. 
- In this case fork of $\alpha \beta \gamma$ and $\alpha \gamma$ happens immediately after $(i-1)$-th iteration: - % - \begin{alignat*}{10} - \alpha \beta \gamma &= x_0 \Xl_{h-1} \;&&\; \Xl_h x_1 \Xr_h \hdots \Xl_h x_{i-1} \Xr_h \;&&\big|\; \Xl_h x_{i} \Xr_h \;&&\; \Xr_{h-1} x_{j+1} \\[-0.5em] - \alpha \gamma &= x_0 \Xl_{h-1} \;&&\; \Xl_h x_1 \Xr_h \hdots \Xl_h x_{i-1} \Xr_h \;&&\big|\; \;&&\; \Xr_{h-1} x_{j+1} - \end{alignat*} - % - Since $x_i$ is an $\epsilon$-loop, it is contained in the fork frame of $\alpha \beta \gamma$. - We have $minh (\beta) = h$ and $minh (\gamma) \leq h - 1$, therefore $\rho_k = \rho'_k \leq h - 1$. - Subsequent frames $l > k$ (if any) are identical and thus $\rho_l = \rho'_l$. - Furthermore, $first (\gamma) = \Xr < \Xl = first (\beta)$. - Therefore $\alpha \beta \gamma \sim \alpha \gamma$ and $\alpha \gamma \subset \alpha \beta \gamma$. - - \item[(2)] - Case $i < j$. - In this case $(i + 1)$-th iteration cannot be an $\epsilon$-loop - (because we assumed that $i$-th iteration is the last $\epsilon$-loop), - therefore the fork of $\alpha \beta \gamma$ and $\alpha \gamma$ happens - inside of $i$-th iteration of $\alpha \beta \gamma$ - and $(i + 1)$-th iteration of $\alpha \gamma$: - % - \begin{alignat*}{10} - \alpha \beta \gamma &= x_0 \Xl_{h-1} \;&&\; \Xl_h x_1 \Xr_h \hdots \Xl_h x_{i-1} \Xr_h \Xl_h y_1 \;&&\big|\; y_2 \Xr_h \Xl_h x_{i+1} && \Xr_h \Xl_h x_{i+2} \Xr_h \hdots \Xl_h x_j \Xr_h \;&&\; \Xr_{h-1} x_{j+1} \\[-0.5em] - \alpha \gamma &= x_0 \Xl_{h-1} \;&&\; \Xl_h x_1 \Xr_h \hdots \Xl_h x_{i-1} \Xr_h \Xl_h y_1 \;&&\big|\; y_3 && \Xr_h \Xl_h x_{i+2} \Xr_h \hdots \Xl_h x_j \Xr_h \;&&\; \Xr_{h-1} x_{j+1} - \end{alignat*} - % - Here $y_1 y_2 = x_i$ and $y_1 y_3 = x_{i+1}$ ($i$-th iteration is missing from $\alpha \gamma$ by construction of $\pi'$). - Fragment $y_2$ is part of the $\epsilon$-loop, - therefore fork frame of $\alpha \beta \gamma$ contains a parenthesis $\Xr_h$ and we have $\rho_k = h$. - On the other hand, $y_3$ contains alphabet symbols, - because $x_{i+1}$ is not an $\epsilon$-loop and $y_1$ is a part of the $\epsilon$-loop. - Therefore fork frame of $\alpha \gamma$ ends in $y_3$ and we have $\rho'_k > h$. - % - %In this case - %fork frame of $\alpha \beta \gamma$ contains $y_2 \Xr_h \Xl_h$ fragment, because $y_2$ is part of the $\epsilon$-loop. - %But the fork frame of $\alpha \gamma$ ends inside of $y_3$, because $(i+1)$-th repetiton is not an $\epsilon$-loop and must contain alphabet symbols. - %Therefore at the fork frame $k$ we have $\rho_k = h$ and $\rho'_k > h$. - % - All subsequent frames $l > k$ are identical: - if they contain parentheses of height less than $h$, then $\rho_l = \rho'_l < h$; - otherwise $\rho_l \leq h$ and $\rho'_l > h$. - Therefore $\alpha \gamma \sqsubset \alpha \beta \gamma$. - \end{enumerate} - % - In both cases $\alpha \gamma < \alpha \beta \gamma$, - which contradicts the fact that $\pi$ is a minimal path. -\end{proofEnd} -\vspace{-0.5em} - -\begin{theoremEnd}[restate, no link to proof, no link to theorem, category=lemmata_closure]{lemma} - \label{lemma_closure_noloop} - GOR1 and GTOP discard paths with tagged $\epsilon$-loops. -\end{theoremEnd} -\begin{proofEnd} - Suppose that GOR1/GTOP finds path $\pi_1 \pi_2$ - where $\pi_1 = q_0 \overset {s | \alpha} {\rightsquigarrow} q_1$ - and $\pi_2 = q_1 \overset {\epsilon | \gamma} {\rightsquigarrow} q_1$ is a tagged $\epsilon$-loop. 
- Both algorithms construct new paths by exploring transitions from the end state of existing paths, - so they can only find $\pi_1 \pi_2$ after they find $\pi_1$. - Therefore when GOR1/GTOP finds $\pi_1 \pi_2$, - it already has some shortest-path candidate $\pi'_1 = q_0 \overset {s | \alpha'} {\rightsquigarrow} q_1$ - and must compare ambiguous paths $\pi_1 \pi_2$ and $\pi'_1$. - There are two possibilities: either $\alpha' = \alpha$ - or $\alpha' < \alpha$ (the latter means that the algorithm has found - a shorter path to $q_1$ in between finding $\pi_1$ and $\pi_1 \pi_2$). - Let $\big( (\rho_1, \hdots, \rho_k), (\rho'_1, \hdots, \rho'_k) \big) = traces (\alpha', \alpha \gamma)$. - % - \begin{itemize}[itemsep=0.5em, topsep=0.5em] - \item[(1)] - Case $\alpha' = \alpha$. - Because $\alpha$ is a proper prefix of $\alpha \gamma$, - fork happens at the last frame and we have - $\rho_k = lasth(\alpha)$ and - $\rho'_k = min (lasth(\alpha), minh(\gamma))$. - If $lasth(\alpha) > minh(\gamma)$, then $\rho_k > \rho'_k$ and $\alpha \sqsubset \alpha \gamma$. - Otherwise $\rho_k = \rho'_k$ and $\alpha \sim \alpha \gamma$, - and we have $first(\alpha \backslash \alpha \gamma) = \bot$ and $first(\alpha \gamma \backslash \alpha) \neq \bot$, - therefore $\alpha \subset \alpha \gamma$. - In both cases $\alpha < \alpha \gamma$. - - \item[(2)] - Case $\alpha' < \alpha$. - Let $\big( (\sigma_1, \hdots, \sigma_k), (\sigma'_1, \hdots, \sigma'_k) \big) = traces (\alpha', \alpha)$. - We have $\rho_k = \sigma_k$ and $\rho'_k = min (\sigma'_k, minh(\gamma)) \leq \sigma_k$. - If $minh(\gamma) < \sigma'_k$ then $\rho_k > \rho'_k$ and $\alpha' \sqsubset \alpha \gamma$. - Otherwise $\rho'_k = \sigma'_k$. - If $\alpha' \sqsubset \alpha$ then $\alpha' \sqsubset \alpha \gamma$. - Otherwise $\alpha' \sim \alpha$ and $\alpha' \subset \alpha$. - None of $\alpha$ and $\alpha'$ is a proper prefix of the other - because otherwise the longer path has an $\epsilon$-loop through $q_1$, which contradicts our assumption about $\pi_1$ and $\pi'_1$. - Therefore $first (\alpha' \backslash \alpha) = first (\alpha' \backslash \alpha \gamma)$ - and $first (\alpha \backslash \alpha') = first (\alpha \gamma \backslash \alpha')$. - Consequently $\alpha' \subset \alpha \implies \alpha' \subset \alpha \gamma$. - Thus $\alpha' < \alpha \gamma$. - \end{itemize} - % - In both cases $\alpha' < \alpha \gamma$, therefore path $\pi_1 \pi_2$ is discarded. -\end{proofEnd} -\vspace{-0.5em} - -\begin{theoremEnd}[restate, no link to proof, no link to theorem, category=lemmata_closure]{lemma} -[Right distributivity of comparison over concatenation for paths without tagged $\epsilon$-loops] - \label{lemma_closure_rightdist} - Let - $\pi_\alpha = q_0 \overset {u | \alpha} {\rightsquigarrow} q_1$ and - $\pi_\beta = q_0 \overset {u | \beta} {\rightsquigarrow} q_1$ - be ambiguous paths in TNFA $f$ for IRE $e$, - and let $\pi_\gamma = q_1 \overset {\epsilon | \gamma} {\rightsquigarrow} q_2$ - be their common $\epsilon$-suffix, - such that $\pi_\alpha \pi_\gamma$ and $\pi_\beta \pi_\gamma$ do not contain tagged $\epsilon$-loops. - If $\alpha < \beta$ then $\alpha \gamma < \beta \gamma$. -\end{theoremEnd} -\begin{proofEnd} - Let - $\big( (\rho_1, \hdots, \rho_k),$ $(\rho'_1, \hdots, \rho'_k) \big) = traces (\alpha, \beta)$ and - $\big( (\sigma_1, \hdots, \sigma_k),$ $(\sigma'_1, \hdots, \sigma'_k) \big) = traces (\alpha \gamma, \beta \gamma)$. 
- Appending $\gamma$ to $\alpha$ and $\beta$ changes only the last frame, therefore - for frames $i < k$ we have $\rho_i = \sigma_i$ and $\rho'_i = \sigma'_i$. - Consider two possible cases. - % - \begin{itemize}[itemsep=0.5em, topsep=0.5em] - \item[(1)] - Case $\alpha \sim \beta \wedge \alpha \subset \beta$. - % - We show that $\alpha \gamma \sim \beta \gamma \wedge \alpha \gamma \subset \beta \gamma$. - % - We have $\rho_i = \rho'_i \; \forall i$ (implied by $\alpha \sim \beta$), therefore - $\sigma_i = \sigma'_i \; \forall i$ and consequently $\alpha \gamma \sim \beta \gamma$. - Let - $x = first (\alpha \backslash \beta)$, - $y = first (\beta \backslash \alpha)$, - $x' = first (\alpha \gamma \backslash \beta \gamma)$ and - $y' = first (\beta \gamma \backslash \alpha \gamma)$. - If one of $\pi_\alpha$ and $\pi_\beta$ is a proper prefix of another, - then the longer path contains tagged $\epsilon$-loop through $q_1$, - which contradicts lemma conditions - (the suffix of the longer path must be an $\epsilon$-path, - because $\alpha$ and $\beta$ have the same number of frames - and the suffix is contained in the last frame). - Therefore none of $\pi_\alpha$ and $\pi_\beta$ is a proper prefix of another. - Consequently $x = x'$ and $y = y'$, and we have - $\alpha \subset \beta$ - $\implies$ - $x < y$ - $\implies$ - $x' < y'$ - $\implies$ - $\alpha \gamma \subset \beta \gamma$. - - \item[(2)] - Case $\alpha \sqsubset \beta$: - by definition this means that $\exists j \leq k$ such that $\rho_j > \rho'_j$ and $\rho_i = \rho'_i \;\forall i > j$. - We show that $\alpha \gamma \sqsubset \beta \gamma$. - % - \begin{itemize} - \item[(2a)] - Case $j < k$. In this case $\rho_k = \rho'_k$ - and appending $\gamma$ does not change relation on the last frame: - $\sigma_k = min (\rho_k, minh (\gamma)) = min (\rho'_k, minh (\gamma)) = \sigma'_k$. - Since $\sigma_i = \rho_i$ and $\sigma'_i = \rho'_i$ for all preceding frames $i < k$, - we have $\alpha \gamma \sqsubset \beta \gamma$. - - \item[(2b)] - Case $j = k$ and $minh (\gamma) > \rho'_k$. - In this case $\rho_k > \rho'_k$ - and again appending $\gamma$ does not change relation on the last frame: - $\sigma_k = min (\rho_k, minh (\gamma)) > \rho'_k$ and - $\sigma'_k = min (\rho'_k, minh (\gamma)) = \rho'_k$, therefore - $\sigma_k > \sigma'_k$. - Therefore $\alpha \gamma \sqsubset \beta \gamma$. - - \item[(2c)] - Case $j = k$ and $minh (\gamma) \leq \rho'_k$ - and $\exists l < k$ such that $\rho_l > \rho'_l$ and $\rho_i = \rho'_i$ for $l < i < k$. - In this case $\gamma$ contains parentheses of low height - and appending it makes height on the last frame equal: - $\sigma_k = \sigma'_k = minh (\gamma)$. - However, the relation on the last preceding differing frame is the same: - $\sigma_l = \rho_l > \rho'_l = \sigma'_l$. - Therefore $\alpha \gamma \sqsubset \beta \gamma$. - - \item[(2d)] - Case $j = k$ and $minh (\gamma) \leq \rho'_k$ - and $\nexists l < k$ such that $\rho_l > \rho'_l$ and $\rho_i = \rho'_i$ for $l < i < k$. - In this case $\gamma$ contains parentheses of low height, - appending it makes height on the last frame equal: - $\sigma_k = \sigma'_k = minh (\gamma)$, - and this may change comparison result - as the relation on the last preceding differing frame may be different. - % - We show that in this case the extended path $\pi_\beta \pi_\gamma$ contains a tagged $\epsilon$-loop. 
- %
- Consider the fragments of paths $\pi_\alpha$ and $\pi_\beta$ from fork to join,
- including (if it exists) the common $\epsilon$-transition to the fork state:
- $\pi_\alpha'$ and $\pi_\beta'$.
- %
- The minimal parenthesis height on $\pi_\alpha'$ is $\rho_k$.
- By TNFA construction this means that $\pi_\alpha'$ is contained
- in a sub-TNFA $f'$ for $e|_p$ at some position $p$ with length $|p| = \rho_k$.
- %
- As for $\pi_\beta'$, its start state coincides with that of $\pi_\alpha'$ and thus is in $f'$.
- The minimal height of all but the last frames of $\pi_\beta'$ is at least $\rho_k$:
- by conditions of (2d) either $k = 1$ and there are no such frames,
- or $\rho'_{k-1} \geq \rho_{k-1}$ which implies $\rho'_{k-1} \geq \rho_k$
- (because by definition $\rho_k = min(\rho_{k-1}, minh(\alpha_k)) \leq \rho_{k-1}$).
- On the last frame of $\pi_\beta'$ the minimal height is $\rho'_k < \rho_k$.
- Therefore all but the last frames of $\pi_\beta'$ are contained in $f'$,
- but the last frame is not.
- %
- Now consider $\pi_\gamma$: by conditions of (2d) its minimal height is less than $\rho_k$,
- therefore it is not contained in $f'$,
- %
- but its start state is the join point of $\pi_\alpha'$ and $\pi_\beta'$ and thus is in $f'$.
- %
- Taken together, the above facts imply that the last frame of $\pi_\beta \pi_\gamma$
- starts in $f'$, then leaves $f'$, then returns to $f'$ and joins with $\pi_\alpha \pi_\gamma$,
- and then leaves $f'$ a second time.
- Since the end state of $f'$ is unique (by TNFA construction),
- $\pi_\beta \pi_\gamma$ must contain a tagged $\epsilon$-loop through it,
- which contradicts lemma conditions.
- \end{itemize}
- \end{itemize}
- %
- (Note that in the presence of tagged $\epsilon$-loops right distributivity may not hold:
- we may have paths $\pi_1$, $\pi_2$ and $\pi_3$
- such that $\pi_2$ and $\pi_3$ are two different $\epsilon$-loops through the same subautomaton
- and $\pi_1 \pi_2 < \pi_1 \pi_3$,
- in which case $\pi_1 \pi_2 \pi_3 < \pi_1 \pi_3$,
- but $\pi_1 < \pi_1 \pi_2$ because the first is a proper prefix of the second.)
- %
-\end{proofEnd}
-
-
-\section{Tree representation of paths}\label{section_pathtree}
-
-In this section we specify the representation of path fragments in configurations
-and define path context $U$ and functions $empty \Xund path ()$ and $extend \Xund path ()$
-used in previous sections.
-%
-An obvious way to represent a tagged path is to use a sequence of tags, such as a list or an array:
-in that case $empty \Xund path ()$ can be implemented as an empty sequence,
-and $extend \Xund path ()$ is just an append operation.
-%
-However, a more efficient representation is possible
-if we consider the structure formed by paths in the $\epsilon$-closure.
-This structure is a \emph{prefix tree} of tags.
-Some care is necessary with TNFA construction in order to ensure prefixness,
-but that is easy to accommodate and we give the details in section \ref{section_tnfa}.
-Storing paths in a prefix tree achieves two purposes:
-first, we save on the duplicated prefixes,
-and second, copying paths becomes as simple as copying a pointer to a tree leaf --- no need to copy the full sequence.
-This technique was used by many researchers, e.g. Laurikari mentions a \emph{functional data structure} \cite{Lau01}
-and Karper describes it as the \emph{flyweight pattern} \cite{Kar14}.
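-%
-As an illustration, below is a minimal C++ sketch of such a prefix tree
-(forward links are omitted; they are only needed for the advanced algorithm of section \ref{section_comparison}).
-The sketch is a simplified model of the structure described above, not the actual implementation;
-tag value $0$ is used here to represent the absence of a tag ($\epsilon$).
-\begin{verbatim}
-// A sketch of the prefix tree of tags: each node stores its predecessor
-// index and a (positive or negative) tag, a path is identified by the
-// index of its last node, and copying a path is just copying an index.
-#include <algorithm>
-#include <vector>
-
-struct TagTree {
-    struct Node { int pred; int tag; };
-    std::vector<Node> nodes;
-
-    TagTree() { nodes.push_back(Node{0, 0}); }  // index 0 = empty path
-
-    int empty_path() const { return 0; }
-
-    // append a tag to the path ending at node n;
-    // untagged (epsilon) transitions add nothing
-    int extend_path(int n, int tag) {
-        if (tag == 0) return n;
-        nodes.push_back(Node{n, tag});
-        return static_cast<int>(nodes.size()) - 1;
-    }
-
-    // reconstruct the tag sequence by walking predecessor links to root
-    std::vector<int> unroll_path(int n) const {
-        std::vector<int> tags;
-        for (; n != 0; n = nodes[n].pred) tags.push_back(nodes[n].tag);
-        std::reverse(tags.begin(), tags.end());
-        return tags;
-    }
-};
-\end{verbatim}
-The operations mirror $empty \Xund path ()$, $extend \Xund path ()$ and $unroll \Xund path ()$ in the pseudocode below.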
-\\ - -\begin{algorithm}[H] \DontPrintSemicolon \SetKwProg{Fn}{}{}{} \SetAlgoInsideSkip{medskip} -\begin{multicols}{2} - \setstretch{0.8} - - \Fn {$\underline {empty \Xund path (\,)} \smallskip$} { - \Return $0$ \; - } - \BlankLine - - \Fn {$\underline {extend \Xund path (U, n, \tau)} \smallskip$} { - \If {$\tau \neq \epsilon$} { - $m = |U| + 1$ \; - append $m$ to $succ(U, n)$ \; - append $(n, \emptyset, \tau)$ to $U$ \; - \Return $m$ \; - } - \lElse { - \Return $n$ - } - } - \BlankLine - -\columnbreak - - \Fn {$\underline {unroll \Xund path (U, n)} \smallskip$} { - $u = \epsilon$ \; - \While { $n \neq 0$ } { - $u = u \cdot tag(U, n)$ \; - $n = pred(U, n)$ \; - } - \Return $reverse(u)$ \; - } - \BlankLine - - \vfill - -\end{multicols} -\caption{Operations on tag tree.} -\end{algorithm} -\medskip - -A convenient representation of tag tree is an indexed sequence of nodes. -Each node is a triple $(p, s, t)$ where -$p$ is the index of predecessor node, -$s$ is a set of indices of successor nodes -and $t$ is a tag (positive or negative). -% -Forward links are only necessary if the advanced algorithm for $update \Xund ptables ()$ is used -(section \ref{section_comparison}), otherwise successor component can be omitted. -% -Now we can represent $u$-components of configurations with indices in the $U$-tree: -root index is $0$ (which corresponds to the empty path), -and each $u$-component is a tree index from which we can trace predecessors to the root -(function $unroll \Xund path ()$ demonstrates this). -% -In the implementation, it is important to use numeric indices rather than pointers -because it allows to use the ``two-fingers'' algorithm to find fork of two paths (section \ref{section_comparison}). -% -We assume the existence of functions -$pred(U, n)$ that returns $p$-component of $n$-th node, -$succ(U, n)$ that returns $s$-component of $n$-th node and -$tag(U, n)$ that returns $t$-component of $n$-th node. - - -\section{Representation of match results}\label{section_results} - -In this section we show two ways to construct match results: POSIX offsets and a parse tree. -% -In the first case, $r$-component of configurations is an array of offset pairs $pmatch$. -Offsets are updated incrementally at each step by scanning the corresponding path fragment -and setting negative tags to $-1$ and positive tags to the current step number. -We need the most recent value of each tag, therefore we take care to update tags at most once. -Negative tags are updated using helper functions $low()$ and $upp()$ that map each tag to the range of tags covered by it -(which includes itself, its pair tag and all nested tags). -Helper function $sub()$ maps each tag to the corresponding submatch group. -For a given tag $t$, functions $sub()$, $low()$ and $upp()$ are defined as the 2nd, 3rd and 4th components of $(t, s, l, u) \in T$. -Section \ref{section_tnfa} shows how this mapping is constructed. -\\ - -In the second case, $r$-component of configurations is a tagged string that is accumulated at each step, -and eventually converted to a parse tree at the end of match. -The resulting parse tree is only partially structured: -leaves that correspond to subexpressions with zero implicit submatch index contain ``flattened'' substring of alphabet symbols. -It is possible to construct parse trees incrementally as well, -but this is more complex and the partial trees may require even more space than tagged strings. 
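-%
-Returning to the first (offset-based) representation,
-the following minimal C++ sketch shows the core of the offset update.
-It is a simplified model of the $apply()$ procedure in the pseudocode below, not the actual implementation:
-the \texttt{TagInfo} table plays the role of the $sub()$, $low()$ and $upp()$ helpers,
-and the odd/even convention (odd tags mark the start of a group, even tags mark its end)
-is the one used throughout the paper.
-\begin{verbatim}
-// A sketch of applying one path fragment of tags to POSIX-style offsets.
-// Tags are visited from the most recent one backwards, so each submatch
-// group is set at most once (the most recent value wins); a negative tag
-// clears the whole range of groups that it covers.
-#include <cstdlib>
-#include <vector>
-
-struct Offsets { long rm_so, rm_eo; };
-
-struct TagInfo {           // per-tag data, playing the role of T in the text
-    std::vector<int> sub;  // submatch group of a tag (0 = no group)
-    std::vector<int> low;  // first tag covered by a negative tag
-    std::vector<int> upp;  // last tag covered by a negative tag
-};
-
-// 'path' lists the tags of the fragment, most recent first;
-// 'pos' is the current step number (the would-be offset).
-void apply(const TagInfo &tags, const std::vector<int> &path, long pos,
-           std::vector<Offsets> &pmatch)
-{
-    std::vector<bool> done(pmatch.size(), false);
-    for (size_t i = 0; i < path.size(); ++i) {
-        const int t = path[i];
-        const int a = std::abs(t);
-        const int s = tags.sub[a];
-        if (t < 0 && (s == 0 || !done[s])) {
-            // negative tag: mark every covered group as unmatched (-1)
-            for (int t2 = tags.low[a]; t2 <= tags.upp[a]; ++t2) {
-                const int s2 = tags.sub[t2];
-                if (s2 != 0 && !done[s2]) {
-                    done[s2] = true;
-                    if (t2 % 2 == 1) pmatch[s2].rm_so = -1; // odd: start
-                    else             pmatch[s2].rm_eo = -1; // even: end
-                }
-            }
-        } else if (s != 0 && !done[s]) {
-            done[s] = true;
-            if (t % 2 == 1) pmatch[s].rm_so = pos;
-            else            pmatch[s].rm_eo = pos;
-        }
-    }
-}
-\end{verbatim}
-As in the pseudocode, $done$ is local to a single path fragment,
-so within a fragment each group is updated at most once, by its most recent tag.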
-\\ - -\begin{algorithm}[H] \DontPrintSemicolon \SetKwProg{Fn}{}{}{} -\begin{multicols}{2} - \setstretch{0.8} - - \Fn {$\underline{initial \Xund result (T)} \smallskip$} { - \Return uninitialized array of pairs $(rm \Xund so, rm \Xund eo)$\; - } - \BlankLine - \BlankLine - - \Fn {$\underline{update \Xund result (T, X, U, k, \Xund)} \smallskip$} { - \Return $\big\{ (q, o, u, apply (T, U, u, r, k)) \mid (q, o, u, r) \in X \big\}$ \; - } - \BlankLine - \BlankLine - - \Fn {$\underline{f\!inal \Xund result (T, U, u, r, k)} \smallskip$} { - $pmatch = apply (T, U, u, r, k)$ \; - $pmatch[0].rm \Xund so = 0, \; pmatch[0].rm \Xund eo = k$ \; - \Return $pmatch$ \; - } - \BlankLine - \BlankLine - - \Fn {$\underline{apply (T, U, n, pmatch, k)} \smallskip$} { - $done(\Xund) \equiv f\!alse$ \; - \While {$n \neq 0$} { - $t = tag(U, n) \; s = sub(T, |t|)$ \; - \If {$t < 0 \wedge \big( s = 0 \vee \neg done(s) \big)$} { - \For {$t' = \overline{low(T, |t|), upp(T, |t|)}$} { - $s' = sub(T, t')$ \; - \If {$s' \neq 0 \wedge \neg done(s')$} { - $done(s') = true$ \; - $set \Xund tag (pmatch, t', s', -1)$ \; - } - } - } - \ElseIf {$s \neq 0 \wedge \neg done(s)$} { - $done(s) = true$ \; - $set \Xund tag (pmatch, t, s, k)$ \; - } - $n = pred(U, n)$ \; - } - \Return $pmatch$ \; - } - \BlankLine - \BlankLine - - \Fn {$\underline{set \Xund tag (pmatch, t, s, pos)} \smallskip$} { - \lIf {$t \, mod \, 2 \equiv 1$} {$pmatch[s].rm \Xund so = pos$} - \lElse {$pmatch[s].rm \Xund eo = pos$} - } - \BlankLine - - \vfill - -\columnbreak - - \Fn {$\underline{initial \Xund result (\Xund)} \smallskip$} { - \Return $\epsilon$ \; - } - \BlankLine - \BlankLine - - \Fn {$\underline{update \Xund result (\Xund, X, U, \Xund, \alpha)} \smallskip$} { - \Return $\big\{ (q, o, u, r \cdot unroll \Xund path (U, u) \cdot \alpha)$ \; - \Indp\Indp\Indp\Indp\Indp\Indp\Indp\Indp $\mid (q, o, u, r) \in X \big\}$ \; \Indm\Indm\Indm\Indm\Indm\Indm\Indm\Indm - } - \BlankLine - \BlankLine - - \Fn {$\underline{f\!inal \Xund result (\Xund, U, u, r, \Xund)} \smallskip$} { - \Return $parse \Xund tree (r \cdot unroll \Xund path (U, u), 1)$ \; - } - \BlankLine - \BlankLine - - \Fn {$\underline{parse \Xund tree (u, i)} \smallskip$} { - \If {$u = (2i \!-\! 1) \cdot (2i)$} { - \Return $T^i(\epsilon)$ - } - \If {$u = (1 \!-\! 2i) \cdot \hdots $} { - \Return $T^i(\varnothing)$ - } - \If {$u = (2i \!-\! 1) \cdot \alpha_1 \hdots \alpha_n \cdot (2i) \wedge \alpha_1, \hdots, \alpha_n \in \Sigma $} { - \Return $T^i(a_1, \hdots, a_n)$ - } - \If {$u = (2i \!-\! 1) \cdot \beta_1 \hdots \beta_m \cdot (2i) \wedge \beta_1 = 2j \!-\! 1 \in T$} { - $n = 0, k = 1$ \; - \While {$k \leq m$} { - $l = k$ \; - \lWhile {$|\beta_{k+1}| > 2j$} { - $k = k + 1$ - } - $n = n + 1$ \; - $t_n = parse \Xund tree (\beta_l \dots \beta_k, j)$ - } - \Return $T^i(t_1, \dots, t_n)$ - } - \Return $\varnothing$ \tcp{ill-formed PE} - } - \BlankLine - - \vfill - -\end{multicols} -\vspace{1.5em} -\caption{Construction of match results: POSIX offsets (on the left) and parse tree (on the right).} -\end{algorithm} -\medskip - - -\section{Disambiguation procedures}\label{section_comparison} - -In this section we define disambiguation procedures $compare ()$ and $update \Xund ptables ()$. -The pseudocode follows definition \ref{pe_order} closely -and relies on the prefix tree representation of paths given in section \ref{section_results}. 
-% -In order to find fork of two paths in $compare ()$ we use so-called ``two-fingers'' algorithm, -which is based on the observation that parent index is always less than child index. -Given two indices $n_1$ and $n_2$, we continuously set the greater index to its parent until the indices become equal, -at which point we have either found fork or the root of $U$-tree. -We track minimal height of each path along the way -and memorize the pair of indices right after the fork --- they are used to determine the leftmost path in case of equal heights. -% -We assume the existence of helper function $height(T, t)$ that maps each tag to its height. -\\ - -\begin{algorithm}[H] \DontPrintSemicolon \SetKwProg{Fn}{}{}{} \SetAlgoInsideSkip{medskip} -\begin{multicols}{2} - \setstretch{0.8} - - \Fn {$\underline {compare (c_1, c_2, U, B, D)} \smallskip$} { - $(\Xund, o_1, n_1, \Xund) = c_1, \; (\Xund, o_2, n_2, \Xund) = c_2$ \; - - \lIf { $o_1 = o_2 \wedge n_1 = n_2$ } { - \Return $(\infty, \infty, 0)$ - } - - \BlankLine - $f\!ork = o_1 = o_2$ \; - \lIf {$f\!ork$ } { - $h_1 = h_2 = \infty$ - } - \lElse { - $h_1 = B[o_1][o_2], \; h_2 = B[o_2][o_1]$ - } - - \BlankLine - $m_1 = m_2 = \bot$ \; - \While {$n_1 \neq n_2$} { - \If {$n_1 > n_2$} { - $h_1 = min(h_1, height(T, tag(U, n_1)))$ \; - $m_1 = n_1, \; n_1 = pred(U, n_1)$ \; - } - \Else { - $h_2 = min(h_2, height(T, tag(U, n_2)))$ \; - $m_2 = n_2, \; n_2 = pred(U, n_2)$ \; - } - } - \If {$n_1 \neq \bot$} { - $h = height(T, tag(U, n_1))$ \; - $h_1 = min(h_1, h), \; h_2 = min(h_2, h)$ \; - } - - \BlankLine - \lIf {$h_1 > h_2$} {$l = -1$} - \lElseIf {$h_1 < h_2$} {$l = 1$ } - \lElseIf {$\neg f\!ork$} {$l = D[o_1][o_2]$} - \lElse {$l = le\!f\!tprec(m_1, m_2, U)$} - - \BlankLine - \Return $(h_1, h_2, l)$ \; - } - \BlankLine - \BlankLine - - \Fn {$\underline {le\!f\!tprec (n_1, n_2, U)} \smallskip$} { - - \lIf {$n_1 = n_2$} { \Return $0$ } - \lIf {$n_1 = \bot$} { \Return $-1$ } - \lIf {$n_2 = \bot$} { \Return $1$ } - - \BlankLine - $t_1 = tag(U, n_1), \; t_2 = tag(U, n_2)$ \; - - \BlankLine - \lIf {$t_1 < 0$} { \Return $1$ } - \lIf {$t_2 < 0$} { \Return $-1$ } - - \BlankLine - \lIf {$t_1 mod \, 2 \equiv 0$} { \Return $-1$ } - \lIf {$t_2 mod \, 2 \equiv 0$} { \Return $1$ } - - \BlankLine - \Return $0$ - } - \BlankLine - \BlankLine - - \Fn {$\underline {update \Xund ptables (N, X, U, B, D)} \smallskip$} { - \For {$x_1 = (q_1, \Xund, \Xund, \Xund) \in X$} { - \For {$x_2 = (q_2, \Xund, \Xund, \Xund) \in X$} { - $(h_1, h_2, l) = compare (x_1, x_2, U, B, D)$ \; - $B' [q_1] [q_2] = h_1, \; D' [q_1] [q_2] = l$ \; - $B' [q_2] [q_1] = h_2, \; D' [q_2] [q_1] = -l$ - } - } - \BlankLine - \Return $(B', D')$ \; - } - - \vfill - \columnbreak - - \Fn {$\underline {update \Xund ptables (N, X, U, B, D)} \smallskip$} { - $i = 0, \; next(n) \equiv 1, \; \text{empty stack } S, \; \text{empty array } L$ \; - $push(S, 0)$ \; - - \BlankLine - \While {$S$ is not empty} { - $n = pop(S)$ \; - - \BlankLine - \If {$next(n) < k$} { - $push(S, n)$ \; - $push(S, succ(U, n)_{next(n)})$ \; - $next(n) = next(n) + 1$ \; - $continue$ \; - } - - \BlankLine - $h = height(T, tag(U, n)), \; i_1 = i$ \; - - \BlankLine - \For {$(q, o, n_1, \Xund) \in X \mid n_1 = n$} { - $i = i + 1, \; L[i] = (q, o, \bot, h)$ \; - } - \For {$j_1 = \overline{i_1 + 1, i}$} { - \For {$j_2 = \overline{j_1, i}$} { - $(q_1, o_1, \Xund, \Xund) = L[j_1]$ \; - $(q_2, o_2, \Xund, \Xund) = L[j_2]$ \; - - \BlankLine - \If {$n = 0 \wedge o_1 \neq o_2$} { - $h_1 = B[o_1][o_2], \; h_2 = B[o_2][o_1]$ \; - $l = D[o_1][o_2]$ \; - } - 
\lElse { - $h_1 = h_2 = h, \; l = 0$ - } - - \BlankLine - $B'[q_1][q_2] = h_1, \; D'[q_1][q_2] = l$ \; - $B'[q_2][q_1] = h_2, \; D'[q_2][q_1] = -l$ \; - } - } - - \BlankLine - \For {$m \in succ(U, n)$ in reverse} { - $i_2 = i_1$ \; - \lWhile {$i_2 > 0 \wedge L[i_2].n = m$} { - $i_2 = i_2 - 1$ - } - - \BlankLine - \For {$j_1 = \overline{i_2, i_1}$} { - $L[j_1].h = min(L[j_1].h, h)$; \; - - \BlankLine - \For {$j_2 = \overline{i_1, i}$} { - $(q_1, o_1, n_1, h_1) = L[j_1]$ \; - $(q_2, o_2, n_2, h_2) = L[j_2]$ \; - - \BlankLine - \If {$n = 0 \wedge o_1 \neq o_2$} { - $h_1 = min(h_1, B[o_1][o_2])$ \; - $h_2 = min(h_2, B[o_2][o_1])$ \; - } - - \BlankLine - \lIf {$h_1 > h_2$} {$l = -1$} - \lElseIf {$h_1 < h_2$} {$l = 1$ } - \lElseIf {$o_1 \neq o_2$} {$l = D[o_1][o_2]$} - \lElse {$l = le\!f\!tprec(n_1, n_2, U)$} - } - - \BlankLine - $B'[q_1][q_2] = h_1, \; D'[q_1][q_2] = l$ \; - $B'[q_2][q_1] = h_2, \; D'[q_2][q_1] = -l$ \; - } - - $i_1 = i_2$ \; - } - - \BlankLine - \lFor {$j = \overline{i_1, i}$} { - $L[j].n = n$ - } - } - - \BlankLine - \Return $(B', D')$ \; - } - -\end{multicols} -\caption{Disambiguation procedures.} -\end{algorithm} -\medskip - -We give two alternative algorithms for $update \Xund ptables ()$: -a simple one with $O(m^2 \, t)$ complexity (on the left) and a complex one with $O(m^2)$ complexity (on the right). -Worst case is demonstrated by RE $((a|\epsilon)^{0,k})^{0,\infty}$ where $n \in \YN$, -for which the simple algorithm takes $O(k^3)$ time and the complex algorithm takes $O(k^2)$ time. -% -The idea of complex algorithm is to avoid repeated re-scanning of path prefixes in the $U$-tree. -It makes one pass over the tree, -constructing an array $L$ of \emph{level items} $(q, o, u, h)$, where -$q$ and $o$ are state and origin as in configurations, -$u$ is the current tree index and $h$ is the current minimal height. -One item is added per each closure configuration $(q, o, u, r)$ when traversal reaches the tree node with index $u$. -After a subtree has been traversed, -the algorithm scans level items added during traversal of this subtree (such items are distinguished by their $u$-component), -sets their $h$-component to the minimum of $h$ and the height of tag at the current node, -and computes the new value of $B$ and $D$ matrices for each pair of $q$-states in items from different branches. -After that, $u$-component of all scanned items is downgraded to the tree index of the current node -(erasing the difference between items from different branches). - - -\section{Lazy disambiguation}\label{section_lazy} - -Most of the overhead in our algorithm comes from updating $B$ and $D$ matrices at each step. -It is all the more unfortunate since many comparisons performed by $update \Xund ptables ()$ are useless --- -the compared paths may never meet. -In fact, if the input is unambiguous, all comparisons are useless. -% -A natural idea, therefore, is to compare paths only in case of real ambiguity (when they meet in closure) -and avoid computation of precedence matrices altogether. -% -We can do it with a few modifications to our original algorithm. -% -First, we no longer need $B$ and $D$ matrices and $update \Xund ptables ()$ function. -Instead, we introduce cache $C$ that maps a pair of tree indices $(n_1, n_2)$ to a triple of precedence values $(h_1, h_2, l)$. -Cache stores the ``useful'' part of $B$ and $D$ matrices on multiple preceding steps. -It is populated lazily during disambiguation -and allows us to avoid re-computing the same values multiple times. 
-% -Second, we need to modify the tree representation of paths in the following way: -forward links are no longer needed (parent links are sufficient), -and tree nodes must be augmented with information about current step and origin state (we assume the existence of helper functions $step()$ and $origin()$). -% -Third, instead of using $empty \Xund path ()$ to initialize path fragments in configurations -we need to set them to path fragments of their parent configurations, -so that paths are accumulated rather than reset at each step. -% -Fourth, we no longer need to call $update \Xund result ()$ at each step --- this can be done once at the end of match. -% -Below is the modified lazy version of $compare()$, the only part of the algorithm that requires non-trivial change. -\\ - -\begin{algorithm}[H] \DontPrintSemicolon \SetKwProg{Fn}{}{}{} \SetAlgoInsideSkip{medskip} -\begin{multicols}{2} - \setstretch{0.8} - - \Fn {$\underline {compare (c_1, c_2, U, C)} \smallskip$} { - $(\Xund, \Xund, n_1, \Xund) = c_1, \; (\Xund, \Xund, n_2, \Xund) = c_2$ \; - - \BlankLine - \Return $compare1 (n_1, n_2, U, C)$ \; - } - \BlankLine - \BlankLine - - \Fn {$\underline {compare1 (n_1, n_2, U, C)} \smallskip$} { - \If {$C(n_1, n_2) = \varnothing $} { - $C(n_1, n_2) = compare2 (n_1, n_2, U, C)$ \; - } - - \BlankLine - \Return $C(n_1, n_2)$ \; - } - \BlankLine - \BlankLine - - \vfill - \columnbreak - - \Fn {$\underline {compare2 (n_1, n_2, U, C)} \smallskip$} { - \lIf { $n_1 = n_2$ } { - \Return $(\infty, \infty, 0)$ - } - - \BlankLine - $h_1 = h_2 = \infty$ \; - $o_1 = origin(U, n_1), \; o_2 = origin(U, n_2)$ \; - $s_1 = step(U, n_1), \; s_2 = step(U, n_2), \; s = max (s_1, s_2)$ \; - $f\!ork = o_1 = o_2 \wedge s_1 = s_2$ \; - - \BlankLine - $m_1 = m_2 = \bot$ \; - \While {$n_1 \neq n_2 \wedge (s_1 \geq s \vee s_2 \geq s)$} { - \If {$s_1 \geq s \wedge (n_1 > n_2 \vee s_2 < s)$} { - $h_1 = min(h_1, height(T, tag(U, n_1)))$ \; - $m_1 = n_1, \; n_1 = pred(U, n_1), \; s_1 = step(U, n_1)$ \; - } - \Else { - $h_2 = min(h_2, height(T, tag(U, n_2)))$ \; - $m_2 = n_2, \; n_2 = pred(U, n_2), \; s_2 = step(U, n_2)$ \; - } - } - - \BlankLine - \If {$\neg f\!ork$ } { - $(h'_1, h'_2, l) = compare1(n_1, n_2, U, C)$ \; - $h_1 = min(h_1, h'_1), \; h_2 = min(h_2, h'_2)$ \; - } - \ElseIf {$n_1 \neq \bot$} { - $h = height(T, tag(U, n_1))$ \; - $h_1 = min(h_1, h), \; h_2 = min(h_2, h)$ \; - } - - \BlankLine - \lIf {$h_1 > h_2$} {$l = -1$} - \lElseIf {$h_1 < h_2$} {$l = 1$ } - \lElseIf {$f\!ork$} {$l = le\!f\!tprec(m_1, m_2, U)$} - - \BlankLine - \Return $(h_1, h_2, l)$ \; - } - \BlankLine - \BlankLine - -\end{multicols} -\vspace{1em} -\caption{Lazy disambiguation procedures (we assume that cache $C$ is modified in-place).} -\end{algorithm} -\medskip - -The problem with this approach is that we need to keep full-length history of each active path: -at the point of ambiguity we may need to look an arbitrary number of steps back -in order to find the fork of ambiguous paths. -% -This may be acceptable for small inputs (and memory footprint may even be smaller due to reduction of precedence matrices), -but it is definitely infeasible for very long or streaming inputs. -% -A possible solution may be a hybrid approach that uses lazy disambiguation, -but every $k$ steps fully calculates precedence matrices and ``forgets'' path prefixes. -Another possible solution is to keep both algorithms and choose between them depending on the length of input. 
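-%
-As an implementation note on the cache $C$:
-a natural representation is a hash table keyed by the pair of tree indices.
-Below is a minimal C++ sketch of the memoization wrapper corresponding to $compare1()$;
-it is an illustration under the stated assumptions, not the actual implementation,
-and the \texttt{compare2} callback stands for the comparison procedure of the pseudocode above
-(in the recursive case it calls back into the wrapper with the same cache).
-\begin{verbatim}
-// A sketch of the memoization cache used by lazy disambiguation:
-// precedence values are computed on demand for a pair of path-tree
-// indices and reused on subsequent lookups.
-#include <cstdint>
-#include <unordered_map>
-#include <utility>
-
-struct Prec { int h1, h2, l; };   // precedence triple (h_1, h_2, l)
-
-struct PrecCache {
-    std::unordered_map<std::uint64_t, Prec> cache;
-
-    static std::uint64_t key(std::uint32_t n1, std::uint32_t n2) {
-        return (static_cast<std::uint64_t>(n1) << 32) | n2;
-    }
-
-    // compare1(): look up the pair of indices, computing it on a miss
-    template <typename Compare2>
-    Prec compare1(std::uint32_t n1, std::uint32_t n2, Compare2 compare2) {
-        const std::uint64_t k = key(n1, n2);
-        std::unordered_map<std::uint64_t, Prec>::iterator it = cache.find(k);
-        if (it == cache.end()) {
-            it = cache.insert(std::make_pair(k, compare2(n1, n2))).first;
-        }
-        return it->second;
-    }
-};
-\end{verbatim}
-Packing the two 32-bit indices into a single 64-bit key is just one possible choice;
-any map from pairs of indices to precedence triples would do.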
-
-
-\section{TNFA construction}\label{section_tnfa}
-
-TNFA construction is given by the function $tn\!f\!a()$
-that accepts an IRE $r$ and a state $y$ and returns a TNFA for $r$ with final state $y$
-(algorithm \ref{alg_tnfa}).
-%
-This precise construction is not necessary for the algorithms to work,
-but it has a number of important properties.
-\begin{itemize}[itemsep=0.5em, topsep=0.5em]
-    \item Non-essential $\epsilon$-transitions are removed, as they make closure algorithms slower.
-
-    \item Bounded repetition $r^{n,m}$ is unrolled in a way
-    that duplicates $r$ exactly $m$ times
-    and factors out common path prefixes:
-    the subautomaton for the $(k+1)$-th iteration is only reachable from the subautomaton for the $k$-th iteration.
-    For example, $a^{2,5}$ is unrolled as $aa(\epsilon | a (\epsilon | a (\epsilon | a)))$, not as $aa(\epsilon|a|aa|aaa)$.
-    This ensures that the tag tree built by the $\epsilon$-closure is a prefix tree.
-
-    \item Priorities are assigned so as to make it more likely
-    that the depth-first traversal of the $\epsilon$-closure finds short paths before long paths.
-    %
-    This is an optimization that makes GOR1 much faster in specific cases
-    with many ambiguous paths that are longest-equivalent and must be compared by the leftmost criterion.
-    An example of such a case is $(((\epsilon)^{0,k})^{0,k})^{0,k}$ for some large $k$.
-    %
-    Because GOR1 has a depth-first component, it is sensitive to the order of transitions in the TNFA.
-    If it finds the shortest path early, then all other paths are just canceled at the first join point with the shortest path
-    (because there is no improvement and further scanning is pointless).
-    In the opposite case GOR1 finds long paths before short ones,
-    and whenever it finds an improved (shorter) path, it has to schedule configurations for re-scan on the next pass.
-    This causes GOR1 to make more passes and scan more configurations on each pass,
-    which makes it significantly slower.
-    Arguably this bias is a weakness of GOR1 --- GTOP is more robust in this respect.
-
-    \item When adding negative tags, we add a single transition for the topmost closing tag
-    (it corresponds to the nil-parenthesis, which has the height of a closing parenthesis).
-    Then we map this tag to the full range of its nested tags, including itself and the paired opening tag.
-    An alternative approach is to add all nested negative tags as TNFA transitions and get rid of the mapping,
-    but this may result in a significant increase of TNFA size and a major slowdown
-    (we observed 2x slowdown on large tests with hundreds of submatch groups).
-
-    \item The compact representation of nested tags as ranges in $T$
-    is possible because nested tags occupy consecutive numbers.
-
-    \item Passing the final state $y$ to the $tn\!f\!a()$ function allows linking subautomata in a simple and efficient way:
-    there is no need to track and patch subautomaton transitions that go to the final state
-    (when this final state needs to be changed).
-\end{itemize}
-
-
-\section{Complexity analysis}\label{section_complexity}
-
-Our algorithm consists of three steps: conversion of the RE to an IRE,
-construction of a TNFA from the IRE,
-and simulation of the TNFA on the input string.
-We discuss time and space complexity of each step
-in terms of the following parameters:
-$n$ --- the length of the input,
-$m$ --- the size of the RE with counted repetition subexpressions expanded
-(each such subexpression is duplicated the number of times given by the repetition counter),
-and $t$ --- the number of capturing groups and subexpressions that contain them.
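-%
-For a rough illustration (the exact value depends on how IRE size is measured),
-consider the RE \texttt{(a)\{100\}}:
-after expansion of the counted repetition $m$ is on the order of a hundred,
-even though the RE itself is short,
-while $t$ stays small (one capturing group plus the subexpressions that contain it),
-and $n$ is simply the length of the input string.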
-\\ - -\begin{algorithm}[] \DontPrintSemicolon \SetKwProg{Fn}{}{}{} -\begin{multicols}{2} -\setstretch{0.85} - - \newcommand \retonfa {tn\!f\!a} - \newcommand \ntag {ntags} - - \Fn {$\underline{\retonfa(r, y)} \smallskip$} { - \If {$r = (0, 0, \epsilon)$} { - \Return $(\Sigma, \{y\}, \emptyset, \emptyset, y, y)$ - } - \BlankLine - \ElseIf {$r = (0, 0, \alpha) \mid_{\alpha \in \Sigma}$} { - \Return $(\Sigma, \{x,y\}, \emptyset, \{(x, \alpha, \epsilon, y)\}, x, y)$ - } - \BlankLine - \ElseIf {$r = (0, 0, r_1 \cdot r_2)$} { - $(\Sigma, Q_2, T_2, \Delta_2, x, y) = \retonfa (r_2, y)$ \; - $(\Sigma, Q_u, T_u, \Delta_2, z, x) = \retonfa (r_1, x)$ \; - \Return $(\Sigma, Q_1 \cup Q_2, T_1 \cup T_2, \Delta_1 \cup \Delta_2, z, y)$ - } - \BlankLine - \ElseIf {$r = (0, 0, r_1 \mid r_2)$} { - $(\Sigma, Q_2, T_2, \Delta_2, x_2, y) = \retonfa (r_2, y)$ \; - $(\Sigma, Q'_2, T_2, \Delta'_2, x'_2, y) = \ntag (T_2, y)$ \; - $(\Sigma, Q_1, T_1, \Delta_1, x_1, x'_2) = \retonfa (r_2, x'_2)$ \; - $(\Sigma, Q'_1, T_1, \Delta'_1, x'_1, x_2) = \ntag (T_1, x_2)$ \; - $Q = Q_1 \cup Q'_1 \cup Q_2 \cup Q'_2 \cup \{x\}$ \; - $\Delta = \Delta_1 \cup \Delta'_1 \cup \Delta_2 \cup \Delta'_2 \cup \big\{ (x,1,\epsilon,x_1), (x,2,\epsilon,x'_1) \big\}$ \; - \Return $(\Sigma, Q, T_1 \cup T_2, \Delta, x, y)$ - } - \BlankLine - \ElseIf {$r = (0, 0, r_1^{n,m}) \mid_{1 < n \leq m \leq \infty}$} { - $(\Sigma, Q_1, T_1, \Delta_1, x, y) = \retonfa ((0, 0, r_1^{n-1,m-1}), y)$ \; - $(\Sigma, Q_2, T_2, \Delta_2, z, x) = \retonfa (r_1, x)$ \; - \Return $(\Sigma, Q_1 \cup Q_2, T_1 \cup T_2, \Delta_1 \cup \Delta_2, z, y)$ - } - \BlankLine - \ElseIf {$r = (0, 0, r_1^{0,m})$} { - $(\Sigma, Q_1, T_1, \Delta_1, x_1, y) = \retonfa ((0, 0, r_1^{1,m}), y)$ \; - $(\Sigma, Q'_1, T_1, \Delta'_1, x'_1, y) = \ntag (T_1, y)$ \; - $Q = Q_1 \cup Q'_1 \cup \{x\}$ \; - $\Delta = \Delta_1 \cup \Delta'_1 \cup \big\{ (x, 1, \epsilon, x_1), (x, 2, \epsilon, x'_1) \big\}$ \; - \Return $(\Sigma, Q, T_1, \Delta, x, y)$ - } - \BlankLine - \ElseIf {$r = (0, 0, r_1^{1,\infty})$} { - $(\Sigma, Q_1, T_1, \Delta_1, z, x) = \retonfa (r_1, \Xund)$ \; - $Q = Q_1 \cup \{y\}$ \; - $\Delta = \Delta_1 \cup \big\{ (x, 1, \epsilon, z), (x, 2, \epsilon, y) \big\}$ \; - \Return $(\Sigma, Q, T_1, \Delta, z, y)$ - } - \BlankLine - \ElseIf {$r = (0, 0, r_1^{1,1})$} { - \Return $\retonfa (r_1, y)$ - } - \BlankLine - \ElseIf {$r = (0, 0, r_1^{1,m}) \mid_{1 < m < \infty}$} { - $(\Sigma, Q_1, T_1, \Delta_1, x, y) = \retonfa ((0, 0, r_1^{1,m-1}), y)$ \; - $(\Sigma, Q_2, T_2, \Delta_2, w, z) = \retonfa (r_1, z)$ \; - $\Delta = \Delta_1 \cup \Delta_2 \cup \big\{ (z, 1, \epsilon, y), (z, 2, \epsilon, x) \big\}$ \; - \Return $(\Sigma, Q_1 \cup Q_2, T_1 \cup T_2, \Delta, w, y)$ - } - \BlankLine - \ElseIf {$r = (i, j, r_1) \mid_{i \neq 0}$} { - $(\Sigma, Q_1, T_1, \Delta_1, z, x) = \retonfa ((0, 0, r_1), x)$ \; - $Q = Q_1 \cup \{w, y\}$ \; - $T = T_1 \cup \big\{ (2i\!-\!1, j, 0, -1), (2i, j, 0, -1) \big\}$ \; - $\Delta = \Delta_1 \cup \big\{ (w, 1, 2i\!-\!1, z), (x, 1, 2i, y) \big\}$ \; - \Return $(\Sigma, Q, T, \Delta, w, y)$ - } - } - \BlankLine - \BlankLine - - \Fn {$\underline{\ntag(T, y)} \smallskip$} { - $(t_{open}, t_{last}) = min \Xund max \big\{ t \mid (t, \Xund, \Xund, \Xund) \in T \big\}$ \; - $(t_{clos}, s, \Xund, \Xund) = (t, \Xund, \Xund, \Xund) \in T \mid t = t_{open} + 1$ \; - $T' = \big\{ (t, \Xund, \Xund, \Xund) \in T \mid t \neq t_{clos} \big\} \cup \big\{ (t_{clos}, s, t_{open}, t_{last}) \big\}$ \; - $\Delta = \big\{ (x, 1, -t_{clos}, y) \big\}$ \; - \Return $(\Sigma, \{x, 
y\}, T', \Delta, x, y)$ \;
-    }
-
-    \vfill
-
-\columnbreak
-
-    \nonl \includegraphics[width=\linewidth]{img/tnfa_construction.pdf}
-
-\end{multicols}
-\vspace{0.5em}
-\caption{TNFA construction.}\label{alg_tnfa}
-\end{algorithm}
-
-The first step, conversion of the RE to an IRE, is given by the functions $mark()$ and $enum()$ from section \ref{section_formalization}.
-%
-For each sub-RE, $mark()$ constructs a corresponding sub-IRE,
-and $enum()$ performs a linear visit of the IRE (which does not change its structure),
-therefore the IRE size is $O(m)$.
-%
-Each subexpression is processed twice (once by $mark()$ and once by $enum()$)
-and processing takes $O(1)$ time, therefore the total time is $O(m)$.
-\\
-
-The second step, TNFA construction, is given by the $tn\!f\!a()$ function (algorithm \ref{alg_tnfa}).
-At this step counted repetition is unrolled, so the TNFA may be much larger than the IRE.
-For each subexpression of the form $(i, j, r^{n,m})$ the automaton for $r$ is duplicated exactly $m$ times if $m \neq \infty$, or $max(1, n)$ times otherwise
-(at each step of recursion the counter is decremented and one copy of the automaton is constructed).
-Other $tn\!f\!a()$ clauses are in one-to-one correspondence with sub-IREs.
-Each clause adds only a constant number of states and transitions (including calls to $ntags()$ and excluding recursive calls).
-Therefore the TNFA contains $O(m)$ states and transitions.
-The size of the mapping $T$ is $O(t)$, which is $O(m)$.
-Therefore the total TNFA size is $O(m)$.
-Time complexity is $O(m)$, because each clause takes $O(1)$ time (excluding recursive calls), and a total of $O(m)$ clauses are executed.
-\\
-
-The third step, TNFA simulation, is given by algorithm \ref{alg_match}.
-Initialization at lines 2-5 takes $O(1)$ time.
-The main loop at lines 6-11 is executed at most $n$ times.
-The size of the closure is bounded by the TNFA size, which is $O(m)$ (typically the closure is much smaller than that).
-Each iteration of the loop includes the following steps.
-%
-At line 7 the call to $closure()$ takes $O(m^2 \, t)$ time if GOR1 from section \ref{section_closure} is used,
-because GOR1 makes $O(m^2)$ scans (the closure contains $O(m)$ states and $O(m)$ transitions),
-and at each scan we may need to compare the tag sequences using $compare()$ from section \ref{section_comparison},
-which takes $O(t)$ time
-(there are $O(t)$ unique tags and we consider paths with at most one $\epsilon$-loop,
-so the number of occurrences of each tag is bounded by the repetition counters,
-which amounts to a constant factor).
-%
-At line 8 the call to $update \Xund result()$ takes $O(m \, t)$ time,
-because the closure size is $O(m)$,
-and the length of the tag sequence is $O(t)$ as argued above.
-%
-At line 9 the call to $update \Xund ptables ()$ takes $O(m^2)$ time
-if the advanced algorithm from section \ref{section_comparison} is used.
-%
-At line 10 scanning the closure for reachable states takes $O(m)$ time,
-because the closure size is $O(m)$.
-%
-The sum of the above steps is $O(m^2 \, t)$, so the total time of the loop at lines 6-11 is $O(n \, m^2 \, t)$.
-The final call to $closure()$ at line 12 takes $O(m^2 \, t)$,
-and $f\!inal \Xund result ()$ at line 14 takes $O(m \, t)$.
-The total time taken by $match()$ is therefore $O(n \, m^2 \, t)$.
-\\
-
-Space complexity of $match()$ is as follows.
-%
-The size of the closure is $O(m)$.
-%
-Precedence matrices $B$ and $D$ take $O(m^2)$ space.
-% -Match results take $O(m \, t)$ space in case of POSIX-style offsets, -because the number of parallel results is bounded by the closure size, -and each result takes $O(t)$ space. -In case of parse trees, match results take $O(m \, n)$ space, because each result accumulates all loop iterations. -% -The size of the path context $U$ is $O(m^2)$ -because GOR1 makes $O(m^2)$ scans and thus adds no more than $O(m^2)$ tags in the tree. -The total space taken by $match()$ is therefore $O(m^2)$ -for POSIX-style offsets and $O(m (m + n))$ for parse trees. - - -\section{Benchmarks}\label{section_benchmarks} - -In order to present benchmark results in a meaningful way, -we show the time of each algorithm relative to the ``baseline'' leftmost greedy algorithm, -which has no overhead on disambiguation and thus represents best-case matching time. -% -We measure the speed of each algorithm in characters per second -and divide it by the speed of leftmost greedy algorithm. -% -This allows us to show the net overhead on POSIX disambiguation. -% -To relate our implementations to the real world, -we include Google RE2 library (it uses leftmost greedy disambiguation and serves as a reference, not as a competing implementation). -% -All implementations can be found in RE2C source code \cite{RE2C}. -% -Our set of benchmarks consists of three subsets: -\begin{enumerate}[itemsep=0.5em, topsep=0.5em] - \item Real-world benchmarks. - These include very large REs containing thousands of characters and hundreds of capturing groups - (parser for HTTP message headers conforming to RFC-7230, - URI parser conforming to RFC-3986, - IPv6 address parser); - medium-sized REs containing hundreds of characters and dozens of capturing groups - (simplified parsers for HTTP headers and URI, IPv4 address parser, simple date parser); - and small REs with less than a hundred characters and about five capturing groups - (simplified parsers for IPv4 and date). - - \item Artificial benchmarks with high level of ambiguity. - All these REs are restricted to a single alphabet symbol - used with various combinations of RE operators (union, product, iteration and bounded repetition). - - \item A series of pathological examples that demonstrates worst-case behavior of the naive $update \Xund ptables ()$ algorithm. -\end{enumerate} - -We benchmark four variations of our algorithm: -``Okui-Suzuki'' is the main variation (it uses advanced $update \Xund ptables ()$ algorithm and GOR1), -``GTOP Okui-Suzuki'' uses GTOP, -``naive Okui-Suzuki'' uses naive $update \Xund ptables ()$ algorithm, -and ``lazy Okui-Suzuki'' differs from the main variation as described in section \ref{section_lazy}. -% -Besides our algorithm, we also benchmark Kuklewicz and Cox algorithms. -Kuklewicz algorithm is described in detail in \cite{Tro17}. -As for the Cox algorithm, the only description we are aware of is the prototype implementation \cite{Cox09}. -We found a number of flaws in it, as described in the introduction. -Our implementation, therefore, differs from the original: -we add support for bounded repetition, -we use GOR1 for closure construction, -and we use a fast forward pre-processing phase to find the matching string prefix before running the backward phase -(forward phase ignores submatch and merely performs recognition). 
-% -Benchmark results show the following: -\\[-0.5em] - -\begin{figure} -\pgfplotstableread[col sep = semicolon, trim cells]{img/bench/data_realworld} \datatable -\begin{tikzpicture} - \begin{axis}[ - width=330, - height=250, - ylabel=slowdown vs leftmost greedy (times), - xmin=1, xmax=11, ymin=0, ymax=30, - xticklabels from table={\datatable}{Alg}, - xtick=data, - xticklabel style={xshift=10pt, yshift=-5pt, font=\footnotesize}, - ylabel style={yshift=-10pt}, - legend cell align={left}, - legend style={fill=none}, - grid=major, - grid style={dashed, gray!20}, - x tick label style={rotate=30, anchor=east} - ] - \addplot[color=black, mark=] table[y = LG] from \datatable ; - \addlegendentry{leftmost greedy} ; - \addplot[color=black, mark=, dash pattern=on 1pt off 1pt] table[y = RE2] from \datatable ; - \addlegendentry{RE2} ; - \addplot[color=black, mark=, dash pattern=on 8pt off 1.5pt] table[y = OS] from \datatable ; - \addlegendentry{Okui-Suzuki} ; - \addplot[color=black, mark=, dash pattern=on 5pt off 1.5pt] table[y = OS_GTOP] from \datatable ; - \addlegendentry{GTOP Okui-Suzuki} ; - \addplot[color=black, mark=, dash pattern=on 3pt off 1pt] table[y = OS_naive] from \datatable ; - \addlegendentry{naive Okui-Suzuki} ; - \addplot[color=black, mark=, dash pattern=on 2.5pt off 2.5pt] table[y = OS_lazy] from \datatable ; - \addlegendentry{lazy Okui-Suzuki} ; - \addplot[color=black, mark=, dash pattern=on 1pt off 1pt on 1pt off 1pt on 1pt off 2pt on 4pt off 2pt] table[y = Kuklewicz] from \datatable ; - \addlegendentry{Kuklewicz} ; - \addplot[color=black, mark=, dash pattern=on 1pt off 2pt on 4pt off 2pt] table[y = Cox] from \datatable ; - \addlegendentry{Cox} ; - \end{axis} -\end{tikzpicture} -\pgfplotstableread[col sep = semicolon, trim cells]{img/bench/data_pathological} \datatable -\begin{tikzpicture} - \begin{axis}[ - height=250, - width=170, - ylabel=slowdown vs leftmost greedy (times), - xmin=33, xmax=37, ymin=-50, - xticklabels from table={\datatable}{Alg}, - xtick=data, - xticklabel style={xshift=10pt, yshift=-5pt, font=\footnotesize}, - ylabel style={yshift=-5pt}, - legend cell align={left}, - legend style={fill=none}, - grid=major, - grid style={dashed, gray!20}, - x tick label style={rotate=30, anchor=east} - ] - \addplot[color=black, mark=] table[y = LG] from \datatable ; - \addlegendentry{leftmost greedy} ; - \addplot[color=black, mark=, dash pattern=on 1pt off 1pt] table[y = RE2] from \datatable ; - \addlegendentry{RE2} ; - \addplot[color=black, mark=, dash pattern=on 8pt off 1.5pt] table[y = OS] from \datatable ; - \addlegendentry{Okui-Suzuki} ; - \addplot[color=black, mark=, dash pattern=on 5pt off 1.5pt] table[y = OS_GTOP] from \datatable ; - \addlegendentry{GTOP Okui-Suzuki} ; - \addplot[color=black, mark=, dash pattern=on 3pt off 1pt] table[y = OS_naive] from \datatable ; - \addlegendentry{naive Okui-Suzuki} ; - \addplot[color=black, mark=, dash pattern=on 2.5pt off 2.5pt] table[y = OS_lazy] from \datatable ; - \addlegendentry{lazy Okui-Suzuki} ; - \addplot[color=black, mark=, dash pattern=on 1pt off 1pt on 1pt off 1pt on 1pt off 2pt on 4pt off 2pt] table[y = Kuklewicz] from \datatable ; - \addlegendentry{Kuklewicz} ; - \addplot[color=black, mark=, dash pattern=on 1pt off 2pt on 4pt off 2pt] table[y = Cox] from \datatable ; - \addlegendentry{Cox} ; - \end{axis} -\end{tikzpicture} -\pgfplotstableread[col sep = semicolon, trim cells]{img/bench/data_artificial} \datatable -\begin{tikzpicture} - \begin{axis}[ - height=250, - width=500, - ylabel=slowdown vs leftmost greedy (times), - 
xmin=12, xmax=32, ymin=0, ymax=30,
-        xticklabels from table={\datatable}{Alg},
-        xtick=data,
-        xticklabel style={xshift=10pt, yshift=-5pt, font=\footnotesize},
-        ylabel style={yshift=-10pt},
-        legend cell align={left},
-        legend style={fill=none},
-        grid=major,
-        grid style={dashed, gray!20},
-        x tick label style={rotate=30, anchor=east}
-    ]
-    \addplot[color=black, mark=] table[y = LG] from \datatable ;
-    \addlegendentry{leftmost greedy} ;
-    \addplot[color=black, mark=, dash pattern=on 1pt off 1pt] table[y = RE2] from \datatable ;
-    \addlegendentry{RE2} ;
-    \addplot[color=black, mark=, dash pattern=on 8pt off 1.5pt] table[y = OS] from \datatable ;
-    \addlegendentry{Okui-Suzuki} ;
-    \addplot[color=black, mark=, dash pattern=on 5pt off 1.5pt] table[y = OS_GTOP] from \datatable ;
-    \addlegendentry{GTOP Okui-Suzuki} ;
-    \addplot[color=black, mark=, dash pattern=on 3pt off 1pt] table[y = OS_naive] from \datatable ;
-    \addlegendentry{naive Okui-Suzuki} ;
-    \addplot[color=black, mark=, dash pattern=on 2.5pt off 2.5pt] table[y = OS_lazy] from \datatable ;
-    \addlegendentry{lazy Okui-Suzuki} ;
-    \addplot[color=black, mark=, dash pattern=on 1pt off 1pt on 1pt off 1pt on 1pt off 2pt on 4pt off 2pt] table[y = Kuklewicz] from \datatable ;
-    \addlegendentry{Kuklewicz} ;
-    \addplot[color=black, mark=, dash pattern=on 1pt off 2pt on 4pt off 2pt] table[y = Cox] from \datatable ;
-    \addlegendentry{Cox} ;
-    \end{axis}
-\end{tikzpicture}
-\caption{
-Benchmarks.\\
-Real-world tests have labels of the form ``title $m$,$k$'', where $m$ is RE size and $k$ is the number of capturing groups.
-}\label{fig:bench}
-\end{figure}
-
-\begin{itemize}[itemsep=0.5em]
-    \item The Okui-Suzuki algorithm degrades with increased closure size.
-    This is understandable, as the algorithm performs pairwise comparison of closure states to compute precedence matrices.
-    The naive $update \Xund ptables ()$ algorithm degrades extremely fast,
-    and the advanced algorithm behaves much better (though it may incur a slight overhead in simple cases).
-
-    \item The Kuklewicz algorithm degrades with increased closure size and an increased number of tags.
-    This is not surprising, as the algorithm has a per-state and per-tag loop used to compute the precedence matrix.
-    On real-world tests with many capturing groups the Kuklewicz algorithm is much slower than the Okui-Suzuki algorithm.
-
-    \item The Cox algorithm degrades with an increased number of tags.
-    The bottleneck of the algorithm is the copying of offset arrays
-    (each array contains a pair of offsets per tag).
-    Using GOR1 instead of naive depth-first search increases the amount of copying (although it is asymptotically faster),
-    because the depth-first scan order allows the use of a single buffer array that is updated and restored in place.
-    However, copying is required elsewhere in the algorithm,
-    and in general it is not suited to REs with many submatch groups.
-    On real-world tests the Cox algorithm is so slow that it did not fit into the plot space.
-
-    \item The lazy variation of Okui-Suzuki degrades with increased cache size and path context size.
-    This may happen with long input strings and a high level of ambiguity in the RE
-    (in such cases the lazy algorithm does all the work of the non-lazy algorithm,
-    but with additional overhead for cache lookups/insertions and for accumulating data from the previous steps).
-    On real-world tests the lazy variation of Okui-Suzuki is fast.
-
-    \item GOR1 and GTOP show similar performance.
-
-    \item RE2 performance is close to that of our leftmost greedy implementation.
-    \\[-0.5em]
-\end{itemize}
-
-One particularly interesting group of tests that shows the above points
-consists of REs of the form $(a^{k_1}|\hdots|a^{k_n})^{0,\infty}$
-(artificial tests 1-4)
-and their variations with more capturing groups
-(artificial tests 5-8).
-For example, consider \texttt{(a\{2\}|a\{3\}|a\{5\})*} and \texttt{(((a)\{2\})|((a)\{3\})|((a)\{5\}))*}.
-Given an input string \texttt{a...a},
-the submatch on the last iteration varies with the length of the input:
-it equals \texttt{aaaaa} for strings of length $5n$,
-\texttt{aa} for strings of length $5n - 3$ and $5n - 1$,
-and \texttt{aaa} for strings of length $5n - 2$ and $5n + 1$ ($n \in \YN$).
-The variation continues indefinitely with a period of five characters.
-%
-We can increase the variation period and the range of possible submatch results by choosing larger counter values.
-%
-This increases closure size ---
-hence the slowdown of the Okui-Suzuki algorithm on tests 1 to 4 and 5 to 8 (especially pronounced for the ``naive Okui-Suzuki'' variation),
-and the gentler slowdown of the Kuklewicz algorithm on the same ranges.
-%
-Adding more capturing groups increases the number of tags ---
-hence the slowdown of the Kuklewicz and Cox algorithms on tests 5 to 8 compared to tests 1 to 4.
-\\
-
-In closing, we would like to point out that the correctness
-of all benchmarked implementations has been tested on a subset of the Glenn Fowler test suite \cite{Fow03}
-(we removed tests for backreferences and start/end anchors),
-extended by Kuklewicz and further extended by ourselves to some 500 tests.
-All algorithms except the Cox algorithm have passed the tests
-(the Cox algorithm fails in about 10 cases for the reasons discussed in the introduction).
-
-\FloatBarrier
-
-
-\section{Conclusions and future work}\label{section_conclusions}
-
-The main result of our work is a practical POSIX matching algorithm
-that can be used on real-world regular expressions,
-does not require complex preprocessing
-and incurs a relatively modest disambiguation overhead compared to other algorithms.
-%
-We tried to present the algorithm in full, with a few useful variations,
-in order to make implementation easy for the reader.
-\\
-
-We see a certain tradeoff between speed and memory usage:
-the bounded-memory version of the algorithm performs a lot of redundant work,
-and the lazy version avoids redundant work at the expense of potentially unbounded memory usage.
-Neither approach seems ideal;
-perhaps in practice a hybrid approach can be used.
-\\
-
-It is still an open question to us
-whether it is possible to combine the elegance of the derivative-based approach to POSIX disambiguation
-with the practical efficiency of NFA-based methods.
-%
-The derivative-based approach constructs match results in such an order that the longest-leftmost result is always first.
-%
-We experimented with recursive descent parsers that embrace the same ordering idea
-and constructed a prototype implementation.
-\\
-
-It would be interesting to apply our approach to automata with counters
-instead of unrolling bounded repetition.
-
-
-\section*{Acknowledgements}
-
-I want to thank my parents Vladimir and Elina,
-my teachers Tatyana Leonidovna and Demian Vladimirovich
-and the Belarusian State University
-for the love of mathematics.
-This work would not have been possible without the help of my husband and dearest friend Sergei Trofimovich.
-Finally, many thanks to the RE2C users!
-\null\hfill\textit{Ulya Trofimovich} - - -\begin{thebibliography}{99} - -\bibitem{OS13} - Satoshi Okui and Taro Suzuki, - \textit{Disambiguation in Regular Expression Matching via Position Automata with Augmented Transitions}, - International Conference on Implementation and Application of Automata, pp. 231-240, Springer, Berlin, Heidelberg, - 2013. - -\bibitem{Lau01} - Ville Laurikari, - \textit{Efficient submatch addressing for regular expressions}, - Helsinki University of Technology, - 2001. - -\bibitem{Kuk07} - Chris Kuklewicz, - \textit{Regular expressions/bounded space proposal}, - \url{http://wiki.haskell.org/index.php?title=Regular_expressions/Bounded_space_proposal&oldid=11475} - 2007. - -\bibitem{Cox09} - Russ Cox, - backward POSIX matching algorithm (source code), - \url{https://swtch.com/~rsc/regexp/nfa-posix.y.txt} - 2009. - -\bibitem{SL14} - Martin Sulzmann, Kenny Zhuo Ming Lu, - \textit{POSIX Regular Expression Parsing with Derivatives}, - International Symposium on Functional and Logic Programming, pp. 203-220, Springer, Cham, - 2014. - -\bibitem{SL13} - Martin Sulzmann, Kenny Zhuo Ming Lu, - \textit{Correct and Efficient POSIX Submatch Extraction with Regular Expression Derivatives}, - \url{https://www.home.hs-karlsruhe.de/~suma0002/publications/posix-derivatives.pdf}, - 2013. - -\bibitem{Bor15} - Angelo Borsotti1, Luca Breveglieri, Stefano Crespi Reghizzi, Angelo Morzenti, - \textit{From Ambiguous Regular Expressions to Deterministic Parsing Automata}, - International Conference on Implementation and Application of Automata. Springer, Cham, pp.35-48, - 2015. - -\bibitem{ADU16} - Fahad Ausaf, Roy Dyckhoff, Christian Urban, - \textit{POSIX Lexing with Derivatives of Regular Expressions}, - International Conference on Interactive Theorem Proving. Springer, Cham, pp. 69-86, - 2016. - -\bibitem{Tro17} - Ulya Trofimovich, - \textit{Tagged Deterministic Finite Automata with Lookahead}, - \url{http://re2c.org/2017_trofimovich_tagged_deterministic_finite_automata_with_lookahead.pdf}, - 2017. - -\bibitem{CLR} - Thomas H. Cormen, Charles E. Leiserson, Ronald L. Rivest, - \textit{Introduction to algorithms}, - 1st edition, - MIT Press and McGraw-Hill, - ISBN 0-262-03141-8. - -\bibitem{Moh02} - Mehryar Mohri, - \textit{Semiring frameworks and algorithms for shortest-distance problems}, - Journal of Automata, Languages and Combinatorics 7 (2002) 3, 321–350, Otto-von-Guericke-Universitat, Magdeburg, - 2002. - -\bibitem{Kar14} - Aaron Karper, - \textit{Efficient regular expressions that produce parse trees}, - Master's thesis, - University of Bern, - 2014. - -\bibitem{GR93} - Andrew V. Goldberg, Tomasz Radzik, - \textit{A heuristic improvement of the Bellman-Ford algorithm}, - Elsevier, Applied Mathematics Letters, vol. 6, no. 3, pp. 3-6, - 1993. - -\bibitem{CGR96} - Boris V. Cherkassky, Andrew V. Goldberg, Tomasz Radzik, - \textit{Shortest paths algorithms: Theory and experimental evaluation}, - Springer, Mathematical programming, vol. 73, no. 2, pp. 129-174, - 1996. - -\bibitem{CGGTW09} - Boris V. Cherkassky, Loukas Georgiadis, Andrew V. Goldberg, Robert E. Tarjan, and Renato F. Werneck. - \textit{Shortest Path Feasibility Algorithms: An Experimental Evaluation}, - Journal of Experimental Algorithmics (JEA), 14, 7, - 2009. - -\bibitem{Cox09} - Aaron Karper, - \textit{Efficient regular expressions that produce parse trees}, - epubli GmbH, - 2014 - -\bibitem{POSIX} - POSIX standard: \textit{POSIX-1.2008} - a.k.a. \textit{IEEE Std 1003.1-2008} - The IEEE and The Open Group, - 2008. 
- -\bibitem{Fow03} - Glenn Fowler, - \textit{An Interpretation of the POSIX Regex Standard}, - \url{https://web.archive.org/web/20050408073627/http://www.research.att.com/~gsf/testregex/re-interpretation.html}, - 2003. - -\bibitem{RE2C} - \textit{RE2C}, lexer generator for C/C++. - Website: \url{http://re2c.org}, - source code: \url{http://github.com/skvadrik/re2c}. - -\bibitem{RE2} - \textit{RE2}, regular expression library. - Source code: \url{http://github.com/google/re2}. - -\bibitem{Glibc} - \textit{The GNU C library}, - \url{https://www.gnu.org/software/libc/}. - -\end{thebibliography} - - -\vfill\null -\clearpage - - -\section*{Appendix} - -\subsection*{Proof of theorem \ref{theorem_porder_on_PTs}} - -\begin{theoremEnd}[normal, no link to proof, no link to theorem]{lemma} -[Unique position mapping from all PTs to IRE] - \label{lemma_positions} - If $t, s \in PT(r)$ for some IRE $r$ - and there is a common position $p \in Pos(t) \cap Pos(s)$, - then $p$ corresponds to the same position $p' \in Pos(r)$ for both $t$ and $s$. -\end{theoremEnd} -\begin{proofEnd} - The proof is by induction on the length of $p$. - Induction basis: $p = p' = \Lambda$ (the roots of $t$ and $s$ correspond to the root of $e$). - Induction step: suppose that for any position $p$ of length $|p| < h$ the lemma is true. - We will show that if exists a $k \in \YN$ such that $p.k \in Pos(t) \cap Pos(s)$, - then $p.k$ corresponds to the same position $p'.k'$ in $r$ for both $t$ and $s$ (for some $k' \in \YN$). - If $r|_{p'}$ is an elementary IRE of the form $(i, j, \epsilon)$ or $(i, j, \alpha)$, - or if at least one of $t|_p$ and $s|_p$ is $\varnothing$, - then $k$ doesn't exist. - Otherwise $r|_{p'}$ is a compound IRE and both $t|_p$ and $s|_p$ are not $\varnothing$. - If $r|_{p'}$ is a union $(i, j, (i_1, j_1, r_1)|(i_2, r_2, j_2))$ - or a product $(i, j, (i_1, j_1, r_1)\cdot(i_2, r_2, j_2))$, - then both $t|_p$ and $s|_p$ have exactly two subtrees, - and positions $p.1$ and $p.2$ in $t$ and $s$ correspond to positions $p'.1$ and $p'.2$ in $r$. - Otherwise, $r|_{p'}$ is a repetition $(i, j, r_1^{n,m})$ - and for any $k \geq 1$ position $p.k$ in $t$ and $s$ corresponds to position $p'.1$ in $r$. -\end{proofEnd} - -\printProofs[theorem_porder_on_PTs] - - -\subsection*{Proof of theorem \ref{theorem_sorder_on_PTs}} - -\begin{theoremEnd}[normal, no link to proof, no link to theorem]{lemma} - \label{lemma_incomparability_equivdef} - If $t, s \in \PT(e, w)$ for some IRE $e$ and string $w$, - then $t \sim s \Leftrightarrow \; \forall p : \snorm{t}{p} = \snorm{s}{p}$. -\end{theoremEnd} -\begin{proofEnd} - Forward implication: let $t \sim s$ and suppose, on the contrary, that $\exists p = min \{ q \mid \snorm{t}{q} \neq \snorm{s}{q} \}$, - then either $t \prec_p s$ (if $\snorm{t}{p} > \snorm{s}{p}$) or $s \prec_p t$ (if $\snorm{t}{p} < \snorm{s}{p}$), - both cases contradict $t \sim s$. - % - Backward implication: $\forall p : \snorm{t}{p} = \snorm{s}{p}$ - implies $\nexists p : t \prec_p s$ and $\nexists q : s \prec_q t$, - which implies $t \sim s$. -\end{proofEnd} - -\printProofs[theorem_sorder_on_PTs] - - -\subsection*{Proof of Theorem \ref{theorem_order_compat}} - -\begin{theoremEnd}[normal, no link to proof, no link to theorem]{lemma} -[Comparability of subtrees] - \label{lemma_subtrees} - For a given IRE $e$, string $w$ and position $p$, - if $t, s \in \PT(e, w)$, $p \in Sub(t) \cup Sub(s)$ and $\snorm{t}{q} = \snorm{s}{q} \; \forall q \leq p$, - then $\exists e', w' : t|_p, s|_p \in \PT(e', w')$. 
-\end{theoremEnd} -\begin{proofEnd} - By induction on the length of $p$. - Induction basis: $p = \Lambda$, let $e' = e$ and $w' = w$. - % - Induction step: suppose that the lemma is true for any position $p$ of length - $|p| < h$, we will show that it is true for any position $p.k$ of length $h$ - ($k \in \YN$). - % - Assume that $p.k \in Sub(t) \cap Sub(s)$ - (otherwise either $p.k \not\in Sub(t) \cup Sub(s)$, - or exactly one of $\snorm{t}{p.k}$, $\snorm{s}{p.k}$ is $\infty$ --- in both - cases lemma conditions are not satisfied). - Then both $t|_p$ and $s|_p$ have at least one subtree: let - $t|_{p} = T(t_1, \dots, t_n)$ and - $s|_{p} = T(s_1, \dots, s_m)$, where $n, m \geq k$. - % - By induction hypothesis $\exists e', w' : t|_p, s|_p \in \PT(e', w')$. - We have $w' = str(t_1) \dots str(t_n) = str(s_1) \dots str(s_m)$. - % - We show that $str(t_k) = str(s_k)$. - Consider positions $p.j$ for $j \leq k$. - By definition the set of submatch positions contains siblings, - therefore $p.j \in Sub(t) \cap Sub(s)$. - By lemma conditions $\snorm{t}{p.j} = \snorm{s}{p.j}$ (because $p.j \leq p.k$), - therefore $|str(t_1) \dots str(t_{k-1})|$ - $= \sum\nolimits_{j=1}^{k-1}\snorm{t}{j}$ - $= \sum\nolimits_{j=1}^{k-1}\snorm{s}{j}$ - $= |str(s_1) \dots str(s_{k-1})|$ and - $|str(t_k)| = |str(s_k)|$. - Consequently, $str(t_k)$ and $str(s_k)$ start and end at the same character in $w'$ and therefore are equal. - % - Finally, have $t|_{p.k}, s|_{p.k} \in \PT(r|_{p.k}, str(t_k))$ and induction step is complete. -\end{proofEnd} - -\printProofs[theorem_order_compat] - - -\subsection*{Proof of Theorem \ref{theorem_order_on_pe_same_as_on_pt}} - -\begin{theoremEnd}[normal, no link to proof, no link to theorem]{lemma} - \label{lemma_pe_equiv} - Let $s, t \in \PT(e, w)$ for some IRE $e$ and string $w$. - If $s \sim t$, then $\Phi_{h}(s) = \Phi_{h}(t) \; \forall h$. -\end{theoremEnd} -\begin{proofEnd} - By induction on the height of $e$. - % - Induction basis: for height $1$ we have - $| \PT(e, w) | \leq 1 \; \forall w$, - therefore $s = t$ and $\Phi_{h}(s) = \Phi_{h}(t)$. - % - Induction step: - height is greater than 1, therefore - $s = T^{d} (s_1, \dots, s_n)$ and - $t = T^{d} (t_1, \dots, t_m)$. - If $d = 0$, then $\Phi_{h}(s) = str(s) = w = str(t) = \Phi_{h}(t)$. - Otherwise $d \neq 0$. - By lemma \ref{lemma_incomparability_equivdef} we have $s \sim t \Rightarrow \snorm{s}{p} = \snorm{t}{p} \;\forall p$. - This implies $n = m$ (otherwise the norm of subtree at position $min(n,m)+1$ is $\infty$ for only one of $s$, $t$). - Therefore - $\Phi_{h}(s) = \Xl_{h+1} \Phi_{h+1}(s_1), \dots, \Phi_{h+1}(s_n) \Xr_h$ and - $\Phi_{h}(t) = \Xl_{h+1} \Phi_{h+1}(t_1), \dots, \Phi_{h+1}(t_n) \Xr_h$. - It suffices to show that $\forall i \leq n: \Phi_{h+1}(s_i) = \Phi_{h+1}(t_i)$. - We have $\snorm{s_i}{p} = \snorm{t_i}{p} \;\forall p$ (implied by $\snorm{s}{p} = \snorm{t}{p} \;\forall p$), - therefore by lemma \ref{lemma_incomparability_equivdef} $s_i \sim t_i$, - and by lemma \ref{lemma_subtrees} $\exists e', w': s_i, t_i \in \PT(e', w')$, - where the height of $e'$ is less than the height of $e$. - By induction hypothesis $\Phi_{h+1}(s_i) = \Phi_{h+1}(t_i)$. -\end{proofEnd} - -\begin{theoremEnd}[normal, no link to proof, no link to theorem]{lemma} - \label{lemma_pe_less_1} - Let $s, t \in \PT(e, w)$ for some IRE $e$ and string $w$. - If $s \prec_p t$ and $|p| = 1$, then $\Phi_{h}(s) < \Phi_{h}(t) \; \forall h$. -\end{theoremEnd} -\begin{proofEnd} - By lemma conditions $|p| = 1$, therefore $p \in \YN$. 
- At least one of $s|_p$ and $t|_p$ must exist (otherwise $\snorm{s}{p} = \infty = \snorm{t}{p}$ which contradicts $s \prec_p t$), - therefore $e$ is a compound IRE and $s$, $t$ can be represented as - $s = T^{d} (s_1, \dots, s_n)$ and - $t = T^{d} (t_1, \dots, t_m)$ - where $d \neq 0$ because $\Lambda$ is a prefix of decision position $p$. - Let $k$ be the number of frames and let $j$ be the fork, then: - \begin{alignat*}{7} - \Phi_{h}(s) &\;=\; \Xl_{h+1} &&\Phi_{h+1}(s_1) &&\dots &&\Phi_{h+1}(s_n) \Xr_h - &&\;=\; \beta_0 a_1 \dots a_j \beta_j &&\;\big|\; && \gamma_j a_{j + 1} \dots a_k \gamma_k \\[-0.5em] - \Phi_{h}(t) &\;=\; \Xl_{h+1} &&\Phi_{h+1}(t_1) &&\dots &&\Phi_{h+1}(t_m) \Xr_h - &&\;=\; \beta_0 a_1 \dots a_j \beta_j &&\;\big|\; && \delta_j a_{j + 1} \dots a_k \delta_k - \end{alignat*} -% - Consider any $i < p$ ($i \in \YN$). - By lemma conditions $s \prec_p t$, therefore $\snorm{s}{q} = \snorm{t}{q} \;\forall q < p$, and - in particular $\snorm{s_i}{q} = \snorm{t_i}{q} \;\forall q$, therefore - by lemma \ref{lemma_incomparability_equivdef} $s_i \sim t_i$, - therefore by lemma \ref{lemma_pe_equiv} $\Phi_{h+1}(s_i) = \Phi_{h+1}(t_i)$. - Let $traces (\Phi_{h}(s), \Phi_{h}(t)) = \big( (\rho_0, \dots, \rho_k), (\rho'_0, \dots, \rho'_k) \big)$. - - \begin{itemize}[itemsep=0.5em, topsep=0.5em] - \item[(1)] - Case $\infty = \snorm{s}{p} > \snorm{t}{p}$. - In this case $s_p$ does not exist - and fork happens immediately after $\Phi_{h+1}(s_{p-1})$, $\Phi_{h+1}(t_{p-1})$: - \begin{alignat*}{7} - \Phi_{h}(s) &\;=\; \Xl_{h+1} &&\Phi_{h+1}(s_1) &&\dots &&\Phi_{h+1}(s_{p-1}) - &&\;\big|\; \Xr_{h} && && \\[-0.5em] - \Phi_{h}(t) &\;=\; \Xl_{h+1} &&\Phi_{h+1}(t_1) &&\dots &&\Phi_{h+1}(t_{p-1}) - &&\;\big|\; \Phi_{h+1}(t_p) &&\dots &&\Phi_{h+1}(t_m) \Xr_{h} - \end{alignat*} - % - Fork frame is the last one, - therefore both $\gamma_j$ and $\delta_j$ contain the closing parenthesis $\Xr_{h}$ - and we have $\rho_j = \rho'_j = h$. - For all $i < j$ we have $\rho_i = \rho'_i = -1$. - Therefore $\rho_i = \rho'_i \;\forall i$ and $\Phi_{h}(s) \sim \Phi_{h}(t)$. - Since $first(\gamma_j)$ is $\Xr$ and $first(\delta_j)$ is one of $\Xl$ and $\Xm$, - we have $\Phi_{h}(s) \subset \Phi_{h}(t)$. - Therefore $\Phi_{h}(s) < \Phi_{h}(t)$. - - \item[(2)] - Case $\infty > \snorm{s}{p} > \snorm{t}{p} = -1$. - In this case both $s_p$ and $t_p$ exist, - $s_p$ is not $\varnothing$ and $t_p$ is $\varnothing$, - and fork happens immediately after $\Phi_{h+1}(s_{p-1})$, $\Phi_{h+1}(t_{p-1})$: - \begin{alignat*}{8} - \Phi_{h}(s) &\;=\; \Xl_{h+1} &&\Phi_{h+1}(s_1) &&\dots &&\Phi_{h+1}(s_{p-1}) - &&\;\big|\; \Xl_{h+2} \; x \; \Xr_{h+1} \; &&\Phi_{h+1}(s_{p+1}) &&\dots &&\Phi_{h+1}(s_n) \Xr_{h} \\[-0.5em] - \Phi_{h}(t) &\;=\; \Xl_{h+1} &&\Phi_{h+1}(t_1) &&\dots &&\Phi_{h+1}(t_{p-1}) - &&\;\big|\; \Xm_{h+1} \;\;\;\;\;\; &&\Phi_{h+1}(t_{p+1}) &&\dots &&\Phi_{h+1}(t_m) \Xr_{h} - \end{alignat*} - % - \begin{itemize} - \item[(2.1)] - If the fork frame is the last one, - then both $\gamma_j$ and $\delta_j$ contain the closing parenthesis $\Xr_{h}$ - and we have $\rho_j = \rho'_j = h$. - - \item[(2.2)] - Otherwise the fork frame is not the last one. - We have $minh(\gamma_j)$, $minh(\delta_j) \geq h + 1$ - and $lasth (\beta_j) = h + 1$ - (the last parenthesis in $\beta_j$ is either $\Xl_{h+1}$ if $p = 1$ and $s_{p-1}$ does not exist, - or else one of $\Xr_{h+1}$ and $\Xm_{h+1}$), - therefore $\rho_j = \rho'_j = h + 1$. 
- % - For subsequent frames $i$ such that $j < i < k$ we have $\rho_i = \rho'_i = h + 1$ - (on one hand $\rho_i, \rho'_i \leq h + 1$ because $\rho_j = \rho'_j = h + 1$, - but on the other hand $minh(\gamma_i)$, $minh(\delta_i) \geq h + 1$). - % - For the last pair of frames we have $\rho_k = \rho'_k = h$ (they both contain the closing parenthesis $\Xr_{h}$). - \end{itemize} - - In both cases $\rho_i = \rho'_i \;\forall i \geq j$. - Since $\rho_i = \rho'_i = -1 \;\forall i < j$, - we have $\rho_i = \rho'_i \;\forall i$ and therefore $\Phi_{h}(s) \sim \Phi_{h}(t)$. - % - Since $first (\gamma_j) = \Xl < \Xm = first (\delta_j)$ we have $\Phi_{h}(s) \subset \Phi_{h}(t)$. - Therefore $\Phi_{h}(s) < \Phi_{h}(t)$. - - \item[(3)] - Case $\infty > \snorm{s}{p} > \snorm{t}{p} \geq 0$. - In this case both $s_p$ and $t_p$ exist and none of them is $\varnothing$, - and fork happens somewhere after the opening parenthesis $\Xl_{h+2}$ - and before the closing parenthesis $\Xr_{h+1}$ in $\Phi_{h}(s_p)$, $\Phi_{h}(t_p)$: - \begin{alignat*}{9} - \Phi_{h}(s) &\;=\; \Xl_{h+1} &&\Phi_{h+1}(s_1) &&\dots &&\Phi_{h+1}(s_{p-1}) &&\; \Xl_{h+2} \; x - &&\;\big|\; y \; \Xr_{h+1} \; &&\Phi_{h+1}(s_{p+1}) &&\dots &&\Phi_{h+1}(s_n) \Xr_{h} \\[-0.5em] - \Phi_{h}(t) &\;=\; \Xl_{h+1} &&\Phi_{h+1}(t_1) &&\dots &&\Phi_{h+1}(t_{p-1}) &&\; \Xl_{h+2} \; x - &&\;\big|\; z \; \Xr_{h+1} \; &&\Phi_{h+1}(t_{p+1}) &&\dots &&\Phi_{h+1}(t_m) \Xr_{h} - \end{alignat*} - % - From $\snorm{s}{p} > \snorm{t}{p} \geq 0$ it follows that - $s_p$ contains more alphabet symbols than $t_p$. - Consequently $\Phi_{h+1}(s_p)$ contains more alphabet symbols, and thus spans more frames than $\Phi_{h+1}(t_p)$. - % - Let $l$ be the index of the frame $\delta_l$ that contains the closing parenthesis $\Xr_{h+1}$ of $\Phi_{h+1}(t_p)$. - By the above reasoning $\Phi_{h+1}(s_p)$ does not end in frame $\gamma_l$, - therefore $\gamma_l$ does not contain the closing parenthesis $\Xr_{h+1}$ - and we have $minh (\gamma_l) \geq h+2$ and $minh (\delta_l) = h+1$. - % - Furthermore, note that $minh(x)$, $minh(y)$, $minh(z) \geq h + 2$, - therefore $lasth(\beta_j) \geq h+2$ (including the case when $x$ is empty), - and for all frames $i$ such that $j \leq i < l$ (if any) we have $\rho_i, \rho'_i \geq h+2$. - % - Consequently, for $l$-th frame we have $\rho_l \geq h+2$ and $\rho'_l = h + 1$, thus $\rho_l > \rho'_l$. - % - For subsequent frames $i$ such that $l < i < k$ we have $minh(\gamma_i)$, $minh(\delta_i) \geq h + 1$, - therefore $\rho_i \geq h+1$ and $\rho'_i = h + 1$, thus $\rho_i \geq \rho'_i$. - % - For the last pair of frames we have $\rho_k = \rho'_k = h$, as they both contain the closing parenthesis $\Xr_{h}$. - % - Therefore $\Phi_{h}(s) \sqsubset \Phi_{h}(t)$, - which implies $\Phi_{h}(s) < \Phi_{h}(t)$. - \end{itemize} -\end{proofEnd} - -\begin{theoremEnd}[normal, no link to proof, no link to theorem]{lemma} - \label{lemma_pe_less} - Let $s, t \in \PT(r, w)$ for some IRE $r$ and string $w$. - If $s \prec_p t$, then $\Phi_{h}(s) < \Phi_{h}(t) \; \forall h$. -\end{theoremEnd} -\begin{proofEnd} - The proof is by induction on the length of $p$. - % - Induction basis for $|p| = 1$ is given by lemma \ref{lemma_pe_less_1}. - % - Induction step: suppose that the lemma is correct for all $p$ of length $|p| < h$ and let $|p| = h$ ($h \geq 2$). - % - Let $p = p'.p''$ where $p' \in \YN$. 
- % - At least one of $s|_p$ and $t|_p$ must exist (otherwise $\snorm{s}{p} = \infty = \snorm{t}{p}$ which contradicts $s \prec_p t$), - therefore both $e$ and $e|_{p'}$ are compound IREs and $s$, $t$ can be represented as - $s = T^{d} (s_1, \dots, s_n)$ and - $t = T^{d} (t_1, \dots, t_m)$ where - $s' = s_{p'} = T^{d'} (s'_1, \dots, s'_{n'})$ and - $t' = t_{p'} = T^{d'} (t'_1, \dots, t'_{m'})$ - and both $d, d' \neq 0$ (because $\Lambda$ and $p'$ are prefixes of decision position $p$). - % - Therefore $\Phi_{h}(s)$, $\Phi_{h}(t)$ can be represented as follows: - \begin{alignat*}{9} - \Phi_{h}(s) - \;&= - \;&& \Xl_{h+1} \Phi_{h+1}(s_1) \dots \Phi_{h+1}(s_{p'-1}) - \;&& \overbrace {\Xl_{h+2} \Phi_{h+2}(s'_1) \dots \Phi_{h+2}(s'_{n'}) \Xr_{h+1}}^{\Phi_{h+1}(s')} - \;&& \Phi_{h+1}(s_{p'+1}) \Phi_{h+1}(s_n) \Xr_{h} - \\ - \Phi_{h}(t) - \;&= - \;&& \Xl_{h+1} \Phi_{h+1}(t_1) \dots \Phi_{h+1}(t_{p'-1}) - \;&& \underbrace {\Xl_{h+2} \Phi_{h+2}(t'_1) \dots \Phi_{h+2}(t'_{m'}) \Xr_{h+1}}_{\Phi_{h+1}(t')} - \;&& \Phi_{h+1}(t_{p'+1}) \Phi_{h+1}(t_m) \Xr_{h} - \end{alignat*} - % - Consider any $i < p'$. - By lemma conditions $s \prec_p t$, therefore $\snorm{s}{q} = \snorm{t}{q} \;\forall q < p$, and - in particular $\snorm{s_i}{q} = \snorm{t_i}{q} \;\forall q$, therefore - by lemma \ref{lemma_incomparability_equivdef} $s_i \sim t_i$, - therefore by lemma \ref{lemma_pe_equiv} $\Phi_{h+1}(s_i) = \Phi_{h+1}(t_i)$. - % - Since $p' < p$ we have $\snorm{s}{q} = \snorm{t}{q} \;\forall q \leq p'$ and - by lemma \ref{lemma_subtrees} $\exists e', w' : s', t' \in \PT(e', w')$. - Since $\snorm{s'}{q} = \snorm{s}{p'.q} \;\forall q$ - and $\snorm{t'}{q} = \snorm{t}{p'.q} \;\forall q$, - we have $s' \prec_{p''} t'$. - Since $|p''| < |p|$ by induction hypothesis we have $\Phi_{h+1}(s') < \Phi_{h+1}(t')$. - % - If $j$ is the fork and $f \leq j \leq k$, then - $\Phi_{h}(s)$, $\Phi_{h}(t)$ can be represented as: - \begin{alignat*}{9} - \Phi_{h}(s) - \;&= - \;&& \beta_0 a_1 \dots a_f \beta_f^1 - \;&& \overbrace {\beta_f^2 a_{f+1} \dots a_j \beta_j \;\big|\; \gamma_j a_{j+1} \dots a_k \gamma_k^1}^{\Phi_{h+1}(s')} - \;&& \gamma_k^2 a_{k+1} \dots a_l \gamma_l - \\[-0.5em] - \Phi_{h}(t) - \;&= - \;&& \beta_0 a_1 \dots a_f \beta_f^1 - \;&& \underbrace {\beta_f^2 a_{f+1} \dots a_j \beta_j \;\big|\; \delta_j a_{j+1} \dots a_k \delta_k^1}_{\Phi_{h+1}(t')} - \;&& \delta_k^2 a_{k+1} \dots a_l \delta_l - \end{alignat*} - % - Let $traces (\Phi_{h}(s), \Phi_{h}(t)) = \big( (\rho_0, \dots, \rho_l), (\rho'_0, \dots, \rho'_l) \big)$ - and $traces (\Phi_{h+1}(s'), \Phi_{h+1}(t')) = \big( (\sigma_h, \dots, \sigma_k), (\sigma'_h, \dots, \sigma'_k) \big)$. - % - We show that for frames $i$ such that $j \leq i < k$ we have - $\rho_i = \sigma_i \wedge \rho'_i = \sigma'_i$ - and for subsequent frames $k \leq i \leq l$ we have $\rho_i = \rho'_i$. - % - \begin{itemize}[itemsep=0.5em, topsep=0.5em] - \item[(1)] - Case $i = j < k \leq l$ (the fork frame). - Since we have shown that $\Phi_{h+1}(s_i) = \Phi_{h+1}(t_i) \;\forall i < p'$, - and since $\Phi_{h+1}(s')$ and $\Phi_{h+1}(t')$ have nonempty common prefix $\Xl_{h+2}$, - it follows that $lasht (\Phi_{h}(s) \sqcap \Phi_{h}(t)) = lasth (\Phi_{h+1}(s') \sqcap \Phi_{h+1}(t'))$. - % - From $j < k$ it follows that $\gamma_j$ and $\delta_j$ end before $a_k$ - and are not changed by appending $\gamma^2_k$ and $\delta^2_k$. - % - Therefore $\rho_j = \sigma_j \wedge \rho'_j = \sigma'_j$. - - \item[(2)] - Case $j < i < k \leq l$. 
- The computation of $\rho_i$, $\rho'_i$ depends only on - $\rho_j$, $\rho'_j$, - or which we have shown $\rho_j = \sigma_j \wedge \rho'_j = \sigma'_j$ in case (1), - and on $\Phi_{h+1}(s')$, $\Phi_{h+1}(t')$, - which are not changed by appending $\gamma^2_k$ and $\delta^2_k$ since $i < k$. - Therefore $\rho_i = \sigma_i \wedge \rho'_i = \sigma'_i$. - - \item[(3)] - Case $j \leq i = k < l$. We have - $minh (\gamma^1_k) = minh (\delta^1_k) = h + 1$ and - $minh (\gamma^2_k) = minh (\delta^2_k) \geq h + 1$. - None of the preceding frames after the fork contain parentheses with height less than $h + 1$, - therefore $\rho_k = \rho'_k = h + 1$. - - \item[(4)] - Case $j \leq k < i < l$. - We have $\rho_i = \rho'_i = h + 1$, - because $\rho_k = \rho'_k = h + 1$ and $minh(\gamma_i)$, $minh(\delta_i) \geq h + 1$. - - \item[(5)] - Case $j \leq k \leq i = l$. - We have $\rho_l = \rho'_l = h$, - because both $\gamma_l$ and $\delta_l$ contain the closing parenthesis $\Xr_{h}$. - \end{itemize} - % - We have shown that $\rho_i = \sigma_i \wedge \rho'_i = \sigma'_i \;\forall i: j \leq i < k$ - and $\rho_i = \rho'_i \;\forall i: k \leq i \leq l$. - It trivially follows that $\Phi_{h+1}(s') \sqsubset \Phi_{h+1}(t')$ $\Rightarrow \Phi_{h}(s) \sqsubset \Phi_{h}(t)$ - and $\Phi_{h+1}(s') \sim \Phi_{h+1}(t')$ $\Rightarrow \Phi_{h}(s) \sim \Phi_{h}(t)$. - Because none of $\Phi_{h+1}(s')$, $\Phi_{h+1}(t')$ is a proper prefix of another, - $\Phi_{h+1}(s') \subset \Phi_{h+1}(t')$ $\Rightarrow \Phi_{h}(s) \subset \Phi_{h}(t)$. - Therefore $\Phi_{h+1}(s') < \Phi_{h+1}(t') \Rightarrow \Phi_{h}(s) < \Phi_{h}(t)$ - (the premise has been shown). -\end{proofEnd} - -\printProofs[theorem_order_on_pe_same_as_on_pt] - - -\subsection*{Correctness of incremental path comparison} - -\printProofs[lemma_frames] -\printProofs[lemmata_closure] - - -\end{document} - diff --git a/src/codegen/emit_action.cc b/src/codegen/emit_action.cc index 0201946a..07bb2fb0 100644 --- a/src/codegen/emit_action.cc +++ b/src/codegen/emit_action.cc @@ -35,6 +35,7 @@ static void emit_rule (Output &o, uint32_t ind, const DFA &dfa, size_t static void gen_fintags (Output &o, uint32_t ind, const DFA &dfa, const Rule &rule); static void gen_goto (code_lines_t &, const State *, const State *, const DFA &, tcid_t, const opt_t *, bool, bool, uint32_t); static void gen_on_eof (code_lines_t &, const opt_t *, const DFA &, const State *, const State *, uint32_t); +static void gen_on_eof_fail (code_lines_t &, const opt_t *, const DFA &, const State *, const State *, std::ostringstream &); static bool endstate (const State *s); static void flushln (code_lines_t &code, std::ostringstream &o); @@ -330,24 +331,45 @@ void gen_goto(code_lines_t &code, const State *from, const State *to } } -void gen_on_eof(code_lines_t &code, const opt_t *opts, const DFA &dfa - , const State *from, const State *to, uint32_t fillidx) +void gen_on_eof_fail(code_lines_t &code, const opt_t *opts, const DFA &dfa + , const State *from, const State *to, std::ostringstream &o) { - const State *retry = from->action.type == Action::MOVE ? from->prev : from; const State *fallback = from->rule == Rule::NONE ? dfa.defstate : dfa.finstates[from->rule]; const tcid_t falltags = from->rule == Rule::NONE ? 
from->fall_tags : from->rule_tags; - std::ostringstream o; - o << "if ("; - if (opts->input_api == INPUT_CUSTOM) { - o << opts->yylessthan << " ()"; + if (from->action.type == Action::INITIAL) { + o << opts->indString << "goto " << opts->labelPrefix << "eof;"; + flushln(code, o); } - else { - o << opts->yylimit << " <= " << opts->yycursor; + else if (fallback != to) { + code_lines_t tagcode; + gen_settags(tagcode, dfa, falltags, opts); + + if (tagcode.empty()) { + o << opts->indString + << "goto " << opts->labelPrefix << fallback->label << ";"; + flushln(code, o); + } + else { + for (uint32_t i = 0; i < tagcode.size(); ++i) { + code.push_back(opts->indString + tagcode[i]); + } + o << opts->indString + << "goto " << opts->labelPrefix << fallback->label << ";"; + flushln(code, o); + } } - o << ") {"; +} + +void gen_on_eof(code_lines_t &code, const opt_t *opts, const DFA &dfa + , const State *from, const State *to, uint32_t fillidx) +{ + const State *retry = from->action.type == Action::MOVE ? from->prev : from; + std::ostringstream o; + + o << "if (" << output_expr_lessthan(1, opts) << ") {"; flushln(code, o); if (opts->fFlag) { @@ -360,8 +382,13 @@ void gen_on_eof(code_lines_t &code, const opt_t *opts, const DFA &dfa } flushln(code, o); - o << opts->indString << opts->fill << " ();"; + o << opts->indString << opts->fill << "();"; flushln(code, o); + + o << opts->indString << opts->labelPrefix << "eof" << fillidx << ":;"; + flushln(code, o); + + gen_on_eof_fail(code, opts, dfa, from, to, o); } else { if (opts->fill_use) { @@ -369,29 +396,7 @@ void gen_on_eof(code_lines_t &code, const opt_t *opts, const DFA &dfa << "goto " << opts->labelPrefix << retry->label << "_;"; flushln(code, o); } - - if (from->action.type == Action::INITIAL) { - o << opts->indString << "goto " << opts->labelPrefix << "eof;"; - flushln(code, o); - } - else if (fallback != to) { - code_lines_t tagcode; - gen_settags(tagcode, dfa, falltags, opts); - - if (tagcode.empty()) { - o << opts->indString - << "goto " << opts->labelPrefix << fallback->label << ";"; - flushln(code, o); - } - else { - for (uint32_t i = 0; i < tagcode.size(); ++i) { - code.push_back(opts->indString + tagcode[i]); - } - o << opts->indString - << "goto " << opts->labelPrefix << fallback->label << ";"; - flushln(code, o); - } - } + gen_on_eof_fail(code, opts, dfa, from, to, o); } o << "}"; diff --git a/src/codegen/emit_dfa.cc b/src/codegen/emit_dfa.cc index f977dfa0..f239894f 100644 --- a/src/codegen/emit_dfa.cc +++ b/src/codegen/emit_dfa.cc @@ -48,7 +48,7 @@ void emit_eof(Output & o, uint32_t ind, const Code *code) { const opt_t *opts = o.block().opts; - if (opts->eof == NOEOF || opts->fFlag) return; + if (opts->eof == NOEOF) return; o.wstring(opts->labelPrefix).ws("eof:\n"); o.wdelay_line_info_input(code->loc); diff --git a/src/codegen/output.cc b/src/codegen/output.cc index 0d469a95..679d48f3 100644 --- a/src/codegen/output.cc +++ b/src/codegen/output.cc @@ -457,6 +457,9 @@ bool Output::emit_blocks(const std::string &fname, blocks_t &blocks, fix_first_block_opts(blocks); + // global options are last block's options + const opt_t *globopt = block().opts; + unsigned int line_count = 1; for (unsigned int j = 0; j < blocks.size(); ++j) { OutputBlock & b = *blocks[j]; @@ -490,7 +493,7 @@ bool Output::emit_blocks(const std::string &fname, blocks_t &blocks, output_cond_table(o, ind, b.types, bopt); break; case OutputFragment::STATE_GOTO: - output_state_goto(o, ind, 0, fill_index, bopt); + output_state_goto(o, ind, 0, fill_index, globopt); break; case 
OutputFragment::STAGS: output_tags(o, ind, *f.tags, global_stags, bopt); @@ -648,22 +651,24 @@ void output_state_goto(std::ostream & o, uint32_t ind, : opts->state_get + "()"; o << indstr << "switch (" << getstate << ") {\n"; - if (opts->bUseStateAbort) - { + if (opts->bUseStateAbort) { o << indstr << "default: abort();\n"; o << indstr << "case -1: goto " << opts->labelPrefix << start_label << ";\n"; } - else - { + else { o << indstr << "default: goto " << opts->labelPrefix << start_label << ";\n"; } - for (uint32_t i = 0; i < fill_index; ++i) - { - o << indstr << "case " << i << ": goto " << opts->yyfilllabel << i << ";\n"; + for (uint32_t i = 0; i < fill_index; ++i) { + o << indstr << "case " << i << ":"; + if (opts->eof != NOEOF) { + o << " if (" << output_expr_lessthan(1, opts) << ")" + << " goto " << opts->labelPrefix << "eof" << i << ";"; + } + o << " goto " << opts->yyfilllabel << i << ";\n"; } o << indstr << "}\n"; - if (opts->bUseStateNext) - { + + if (opts->bUseStateNext) { o << opts->yynext << ":\n"; } } diff --git a/src/options/opt.cc b/src/options/opt.cc index a8e1743b..95e2f1a7 100644 --- a/src/options/opt.cc +++ b/src/options/opt.cc @@ -35,6 +35,7 @@ void mutopt_t::fix(const conopt_t *globopts) bFlag = Opt::baseopt.bFlag; gFlag = Opt::baseopt.gFlag; cGotoThreshold = Opt::baseopt.cGotoThreshold; + eof = Opt::baseopt.eof; // default environment-insensitive formatting yybmHexTable = Opt::baseopt.yybmHexTable; // fallthrough diff --git a/src/parse/validate.cc b/src/parse/validate.cc index ed94631c..eb913af0 100644 --- a/src/parse/validate.cc +++ b/src/parse/validate.cc @@ -50,15 +50,10 @@ void validate_ast(const specs_t &specs, const opt_t *opts, Msg &msg) "%sEOF rule found, but 're2c:eof' configuration is not set", incond(i->name).c_str()); } - else if (i->eofs.empty() && opts->eof != NOEOF && !opts->fFlag) { + else if (i->eofs.empty() && opts->eof != NOEOF) { fatal("%s're2c:eof' configuration is set, but no EOF rule found", incond(i->name).c_str()); } - else if (!i->eofs.empty() && opts->fFlag) { - msg.fatal(i->eofs[0]->loc, - "%sEOF rule is unreachable in push-model lexers", - incond(i->name).c_str()); - } else if (i->eofs.size() > 1) { msg.fatal(i->eofs[1]->loc, "EOF rule %sis already defined at line %u", diff --git a/test/eof/nonblocking_push.fi.c b/test/eof/nonblocking_push.fi.c index 3caaeddd..bdfc740e 100644 --- a/test/eof/nonblocking_push.fi.c +++ b/test/eof/nonblocking_push.fi.c @@ -50,9 +50,6 @@ struct input_t { lim += bytes_read; lim[0] = 0; - // simulate the END packet (can as well send a normal packet) - if (feof(file)) ++lim; - // quick make a copy of buffer with newlines replaced w/ _ char b[40]; snprintf(b, 40, "%s", buf); @@ -93,27 +90,27 @@ static status_t lex(input_t &in) { switch (YYGETSTATE()) { default: goto yy0; -case 0: goto yyFillLabel0; -case 1: goto yyFillLabel1; -case 2: goto yyFillLabel2; -case 3: goto yyFillLabel3; -case 4: goto yyFillLabel4; -case 5: goto yyFillLabel5; -case 6: goto yyFillLabel6; -case 7: goto yyFillLabel7; -case 8: goto yyFillLabel8; -case 9: goto yyFillLabel9; -case 10: goto yyFillLabel10; -case 11: goto yyFillLabel11; -case 12: goto yyFillLabel12; -case 13: goto yyFillLabel13; -case 14: goto yyFillLabel14; -case 15: goto yyFillLabel15; -case 16: goto yyFillLabel16; -case 17: goto yyFillLabel17; -case 18: goto yyFillLabel18; -case 19: goto yyFillLabel19; -case 20: goto yyFillLabel20; +case 0: if (in.lim <= in.cur) goto yyeof0; goto yyFillLabel0; +case 1: if (in.lim <= in.cur) goto yyeof1; goto yyFillLabel1; +case 2: if (in.lim 
<= in.cur) goto yyeof2; goto yyFillLabel2; +case 3: if (in.lim <= in.cur) goto yyeof3; goto yyFillLabel3; +case 4: if (in.lim <= in.cur) goto yyeof4; goto yyFillLabel4; +case 5: if (in.lim <= in.cur) goto yyeof5; goto yyFillLabel5; +case 6: if (in.lim <= in.cur) goto yyeof6; goto yyFillLabel6; +case 7: if (in.lim <= in.cur) goto yyeof7; goto yyFillLabel7; +case 8: if (in.lim <= in.cur) goto yyeof8; goto yyFillLabel8; +case 9: if (in.lim <= in.cur) goto yyeof9; goto yyFillLabel9; +case 10: if (in.lim <= in.cur) goto yyeof10; goto yyFillLabel10; +case 11: if (in.lim <= in.cur) goto yyeof11; goto yyFillLabel11; +case 12: if (in.lim <= in.cur) goto yyeof12; goto yyFillLabel12; +case 13: if (in.lim <= in.cur) goto yyeof13; goto yyFillLabel13; +case 14: if (in.lim <= in.cur) goto yyeof14; goto yyFillLabel14; +case 15: if (in.lim <= in.cur) goto yyeof15; goto yyFillLabel15; +case 16: if (in.lim <= in.cur) goto yyeof16; goto yyFillLabel16; +case 17: if (in.lim <= in.cur) goto yyeof17; goto yyFillLabel17; +case 18: if (in.lim <= in.cur) goto yyeof18; goto yyFillLabel18; +case 19: if (in.lim <= in.cur) goto yyeof19; goto yyFillLabel19; +case 20: if (in.lim <= in.cur) goto yyeof20; goto yyFillLabel20; } @@ -122,14 +119,8 @@ yy0: yyFillLabel0: in.yych = *in.cur; switch (in.yych) { - case 0x00: - if (in.lim <= in.cur) { - YYSETSTATE(0); - YYFILL (); - } - goto yy2; case '\n': - case ' ': goto yy6; + case ' ': goto yy4; case 'A': case 'B': case 'C': @@ -180,37 +171,42 @@ yyFillLabel0: case 'w': case 'x': case 'y': - case 'z': goto yy9; - case 'T': goto yy12; - default: goto yy4; + case 'z': goto yy7; + case 'T': goto yy10; + default: + if (in.lim <= in.cur) { + YYSETSTATE(0); + YYFILL(); + yyeof0:; + goto yyeof; + } + goto yy2; } yy2: - ++in.cur; - { printf("< EOF\n"); return OK; } -yy4: ++in.cur; { printf("< Unexpected character >%c<\n", in.yych); return FAIL; } -yy6: +yy4: ++in.cur; yyFillLabel1: in.yych = *in.cur; switch (in.yych) { case '\n': - case ' ': goto yy6; + case ' ': goto yy4; default: if (in.lim <= in.cur) { YYSETSTATE(1); - YYFILL (); + YYFILL(); + yyeof1:; } - goto yy8; + goto yy6; } -yy8: +yy6: { printf("< whitespace\n"); return WHITESPACE; } -yy9: +yy7: ++in.cur; yyFillLabel2: in.yych = *in.cur; -yy10: +yy8: switch (in.yych) { case 'A': case 'B': @@ -263,17 +259,18 @@ yy10: case 'w': case 'x': case 'y': - case 'z': goto yy9; + case 'z': goto yy7; default: if (in.lim <= in.cur) { YYSETSTATE(2); - YYFILL (); + YYFILL(); + yyeof2:; } - goto yy11; + goto yy9; } -yy11: +yy9: { printf("< word\n"); return WORD; } -yy12: +yy10: ++in.cur; yyFillLabel3: in.yych = *in.cur; @@ -281,13 +278,14 @@ yyFillLabel3: case 0x00: if (in.lim <= in.cur) { YYSETSTATE(3); - YYFILL (); + YYFILL(); + yyeof3:; } - goto yy11; - case 'H': goto yy13; - default: goto yy10; + goto yy9; + case 'H': goto yy11; + default: goto yy8; } -yy13: +yy11: ++in.cur; yyFillLabel4: in.yych = *in.cur; @@ -295,13 +293,14 @@ yyFillLabel4: case 0x00: if (in.lim <= in.cur) { YYSETSTATE(4); - YYFILL (); + YYFILL(); + yyeof4:; } - goto yy11; - case 'I': goto yy14; - default: goto yy10; + goto yy9; + case 'I': goto yy12; + default: goto yy8; } -yy14: +yy12: ++in.cur; yyFillLabel5: in.yych = *in.cur; @@ -309,13 +308,14 @@ yyFillLabel5: case 0x00: if (in.lim <= in.cur) { YYSETSTATE(5); - YYFILL (); + YYFILL(); + yyeof5:; } - goto yy11; - case 'N': goto yy15; - default: goto yy10; + goto yy9; + case 'N': goto yy13; + default: goto yy8; } -yy15: +yy13: ++in.cur; yyFillLabel6: in.yych = *in.cur; @@ -323,13 +323,14 @@ yyFillLabel6: case 0x00: if 
(in.lim <= in.cur) { YYSETSTATE(6); - YYFILL (); + YYFILL(); + yyeof6:; } - goto yy11; - case 'G': goto yy16; - default: goto yy10; + goto yy9; + case 'G': goto yy14; + default: goto yy8; } -yy16: +yy14: in.mark = ++in.cur; yyFillLabel7: in.yych = *in.cur; @@ -337,187 +338,203 @@ yyFillLabel7: case 0x00: if (in.lim <= in.cur) { YYSETSTATE(7); - YYFILL (); + YYFILL(); + yyeof7:; } - goto yy11; - case '\n': goto yy17; - default: goto yy10; + goto yy9; + case '\n': goto yy15; + default: goto yy8; } -yy17: +yy15: ++in.cur; yyFillLabel8: in.yych = *in.cur; switch (in.yych) { - case 'W': goto yy19; + case 'W': goto yy17; default: if (in.lim <= in.cur) { YYSETSTATE(8); - YYFILL (); + YYFILL(); + yyeof8:; } - goto yy18; + goto yy16; } -yy18: +yy16: in.cur = in.mark; - goto yy11; -yy19: + goto yy9; +yy17: ++in.cur; yyFillLabel9: in.yych = *in.cur; switch (in.yych) { - case 'I': goto yy20; + case 'I': goto yy18; default: if (in.lim <= in.cur) { YYSETSTATE(9); - YYFILL (); + YYFILL(); + yyeof9:; } - goto yy18; + goto yy16; } -yy20: +yy18: ++in.cur; yyFillLabel10: in.yych = *in.cur; switch (in.yych) { - case 'T': goto yy21; + case 'T': goto yy19; default: if (in.lim <= in.cur) { YYSETSTATE(10); - YYFILL (); + YYFILL(); + yyeof10:; } - goto yy18; + goto yy16; } -yy21: +yy19: ++in.cur; yyFillLabel11: in.yych = *in.cur; switch (in.yych) { - case 'H': goto yy22; + case 'H': goto yy20; default: if (in.lim <= in.cur) { YYSETSTATE(11); - YYFILL (); + YYFILL(); + yyeof11:; } - goto yy18; + goto yy16; } -yy22: +yy20: ++in.cur; yyFillLabel12: in.yych = *in.cur; switch (in.yych) { - case '\n': goto yy23; + case '\n': goto yy21; default: if (in.lim <= in.cur) { YYSETSTATE(12); - YYFILL (); + YYFILL(); + yyeof12:; } - goto yy18; + goto yy16; } -yy23: +yy21: ++in.cur; yyFillLabel13: in.yych = *in.cur; switch (in.yych) { - case 'N': goto yy24; + case 'N': goto yy22; default: if (in.lim <= in.cur) { YYSETSTATE(13); - YYFILL (); + YYFILL(); + yyeof13:; } - goto yy18; + goto yy16; } -yy24: +yy22: ++in.cur; yyFillLabel14: in.yych = *in.cur; switch (in.yych) { - case 'E': goto yy25; + case 'E': goto yy23; default: if (in.lim <= in.cur) { YYSETSTATE(14); - YYFILL (); + YYFILL(); + yyeof14:; } - goto yy18; + goto yy16; } -yy25: +yy23: ++in.cur; yyFillLabel15: in.yych = *in.cur; switch (in.yych) { - case 'W': goto yy26; + case 'W': goto yy24; default: if (in.lim <= in.cur) { YYSETSTATE(15); - YYFILL (); + YYFILL(); + yyeof15:; } - goto yy18; + goto yy16; } -yy26: +yy24: ++in.cur; yyFillLabel16: in.yych = *in.cur; switch (in.yych) { - case 'L': goto yy27; + case 'L': goto yy25; default: if (in.lim <= in.cur) { YYSETSTATE(16); - YYFILL (); + YYFILL(); + yyeof16:; } - goto yy18; + goto yy16; } -yy27: +yy25: ++in.cur; yyFillLabel17: in.yych = *in.cur; switch (in.yych) { - case 'I': goto yy28; + case 'I': goto yy26; default: if (in.lim <= in.cur) { YYSETSTATE(17); - YYFILL (); + YYFILL(); + yyeof17:; } - goto yy18; + goto yy16; } -yy28: +yy26: ++in.cur; yyFillLabel18: in.yych = *in.cur; switch (in.yych) { - case 'N': goto yy29; + case 'N': goto yy27; default: if (in.lim <= in.cur) { YYSETSTATE(18); - YYFILL (); + YYFILL(); + yyeof18:; } - goto yy18; + goto yy16; } -yy29: +yy27: ++in.cur; yyFillLabel19: in.yych = *in.cur; switch (in.yych) { - case 'E': goto yy30; + case 'E': goto yy28; default: if (in.lim <= in.cur) { YYSETSTATE(19); - YYFILL (); + YYFILL(); + yyeof19:; } - goto yy18; + goto yy16; } -yy30: +yy28: ++in.cur; yyFillLabel20: in.yych = *in.cur; switch (in.yych) { - case 'S': goto yy31; + case 'S': goto yy29; default: 
if (in.lim <= in.cur) { YYSETSTATE(20); - YYFILL (); + YYFILL(); + yyeof20:; } - goto yy18; + goto yy16; } -yy31: +yy29: ++in.cur; { printf("< Thing w/ newlines\n"); return THING; } +yyeof: + { printf("< EOF\n"); return OK; } } diff --git a/test/eof/nonblocking_push.fi.re b/test/eof/nonblocking_push.fi.re index 1537aa6f..9c03e90e 100644 --- a/test/eof/nonblocking_push.fi.re +++ b/test/eof/nonblocking_push.fi.re @@ -48,9 +48,6 @@ struct input_t { lim += bytes_read; lim[0] = 0; - // simulate the END packet (can as well send a normal packet) - if (feof(file)) ++lim; - // quick make a copy of buffer with newlines replaced w/ _ char b[40]; snprintf(b, 40, "%s", buf); @@ -99,7 +96,7 @@ static status_t lex(input_t &in) re2c:eof = 0; * { printf("< Unexpected character >%c<\n", in.yych); return FAIL; } - [\x00] { printf("< EOF\n"); return OK; } + $ { printf("< EOF\n"); return OK; } [\n ]+ { printf("< whitespace\n"); return WHITESPACE; } [a-zA-Z]+ { printf("< word\n"); return WORD; } "THING\nWITH\nNEWLINES" { printf("< Thing w/ newlines\n"); return THING; } -- 2.50.1
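For context, the sketch below is a hand-written miniature (not re2c output) of the storable-state/EOF protocol that this patch makes work. The names (input_t, status_t, yyFillLabelN, yyeofN) only mimic test/eof/nonblocking_push.fi.c above; the grammar is reduced to a single [a]+ token plus '*' and '$' rules, and the state bookkeeping is simplified (the driver resets the stored state after every completed action). The point it illustrates is the generated "if (lim <= cur) goto yyeofN" dispatch: when a refill adds no bytes, re-entering the lexer falls through to the EOF rule instead of calling YYFILL again.

// Hypothetical sketch of a '-f --storable-state' lexer with an EOF rule.
#include <cstdio>
#include <cstring>

enum status_t { OK, FAIL, NEED_MORE_INPUT, TOKEN };

struct input_t {
    const char *cur;   // YYCURSOR
    const char *lim;   // YYLIMIT; the buffer is NUL-terminated at lim (re2c:eof = 0)
    int state;         // stored lexer state (YYSETSTATE / YYGETSTATE)
};

static status_t lex(input_t &in)
{
    switch (in.state) {                     // YYGETSTATE() dispatch on re-entry
    default: goto yy0;                      // fresh start
    case 0: if (in.lim <= in.cur) goto yyeof; goto yyFillLabel0;
    case 1: if (in.lim <= in.cur) goto yyeof1; goto yyFillLabel1;
    }

yy0:
yyFillLabel0:
    switch (*in.cur) {
    case 'a': goto yy1;
    default:
        if (in.lim <= in.cur) {             // hit the sentinel: need more input
            in.state = 0;                   // YYSETSTATE(0)
            return NEED_MORE_INPUT;         // YYFILL() returns control to the caller
        }
        ++in.cur;
        return FAIL;                        // '*' rule: unexpected character
    }

yy1:
    ++in.cur;
yyFillLabel1:
    switch (*in.cur) {
    case 'a': goto yy1;
    default:
        if (in.lim <= in.cur) {
            in.state = 1;
            return NEED_MORE_INPUT;
yyeof1:     ;                               // re-entered with no new data: finish the match
        }
        return TOKEN;                       // end of an 'a'-run
    }

yyeof:
    return OK;                              // '$' (EOF) rule
}

int main()
{
    // Input arrives in chunks; when the source is exhausted the refill adds
    // zero bytes, which is exactly what routes the lexer to its EOF rule.
    const char *chunks[] = { "aaa", "aa" };
    const size_t nchunks = sizeof(chunks) / sizeof(chunks[0]);
    size_t next = 0;

    const char *empty = "";
    input_t in = { empty, empty, -1 };

    for (;;) {
        status_t st = lex(in);
        if (st == NEED_MORE_INPUT) {
            if (next < nchunks) {            // refill with the next chunk
                in.cur = chunks[next];
                in.lim = in.cur + strlen(chunks[next]);
                ++next;
            }                                // else: zero bytes added, EOF on re-entry
            continue;
        }
        in.state = -1;                       // simplification: restart cleanly after an action
        if (st == TOKEN) { printf("token\n"); continue; }
        printf(st == OK ? "eof\n" : "error\n");
        break;
    }
    return 0;
}

In a real lexer this corresponds to "re2c:eof = 0;" together with a '$' rule, as in test/eof/nonblocking_push.fi.re above; re2c generates the state switch and the yyeofN labels, and the user supplies only the refill loop.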