From eb94bfe01f38bd89f3f79ca6e3f2ed97c40aa827 Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Sat, 30 Jun 2018 21:23:13 +0100 Subject: [PATCH] Paper: added example of PEs and traces computation. --- re2c/doc/tdfa_v2/TODO | 15 + re2c/doc/tdfa_v2/img/Makefile | 2 +- re2c/doc/tdfa_v2/img/pe.tex | 165 +++ re2c/doc/tdfa_v2/part_1_tnfa.tex | 1983 +++++++++++++++--------------- 4 files changed, 1180 insertions(+), 985 deletions(-) create mode 100644 re2c/doc/tdfa_v2/img/pe.tex diff --git a/re2c/doc/tdfa_v2/TODO b/re2c/doc/tdfa_v2/TODO index 6f0b9062..b5e7030f 100644 --- a/re2c/doc/tdfa_v2/TODO +++ b/re2c/doc/tdfa_v2/TODO @@ -55,3 +55,18 @@ - The question is that Okui defined the ordering on trees that are a bit different from ours (and that were similar to our PTs rather than to IPTs). Perhaps we should tell how his trees correspond to ours? +- Lemma 1, last line. Before "If r' ..." I would suggest to add: "Then r~ can be defined as follows." + +- Theorem 2: I think that is proof is just above it. Perhaps we should move it below. + +- in Figure 3. closure() is missing + +- > Hmm... I think that would be hard, and also needless effort. + I was trying to prevent a question from some reviewer. Well, we could tell that we deliver an unspecified one that provides the correct submatches. + +- > You mean, mentioning that the user can specify the set of the implicit + > submatch indices using a different mechanism than parentheses? + Yes, this is what tagged automata are all about: letting the user specify the parts for which the submatch has to + be delivered. What I mean is that our algorithm can do it, and this is more general than Posix subexpressions. + Indeed, it can go from full-parsing to any user-defined submatch extraction. + diff --git a/re2c/doc/tdfa_v2/img/Makefile b/re2c/doc/tdfa_v2/img/Makefile index c2933cfa..120d8c27 100644 --- a/re2c/doc/tdfa_v2/img/Makefile +++ b/re2c/doc/tdfa_v2/img/Makefile @@ -1,7 +1,7 @@ %.pdf : %.tex lualatex -shell-escape $< $<.build_log -all : mark_enum.pdf trees.pdf tnfa.pdf gor1.pdf +all : mark_enum.pdf pe.pdf trees.pdf tnfa.pdf gor1.pdf clean: rm *.log *.build_log *.aux *.pdf *.pag diff --git a/re2c/doc/tdfa_v2/img/pe.tex b/re2c/doc/tdfa_v2/img/pe.tex new file mode 100644 index 00000000..e4b69712 --- /dev/null +++ b/re2c/doc/tdfa_v2/img/pe.tex @@ -0,0 +1,165 @@ + +\documentclass[tikz,border=10pt]{standalone} + + +\RequirePackage{luatex85} +\usepackage[utf8]{inputenc} +\usepackage{amsmath, amssymb, amsfonts, accents} +\usetikzlibrary{graphdrawing, graphs, arrows, shapes, automata, calc} +\usegdlibrary{trees, layered} +\usepackage{stix} + + +%\newcommand{\Xund}{\rule{.4em}{.4pt}} +%\newcommand{\IRE}{I\!RE} + +\newcommand{\Xund}{\rule{.4em}{.4pt}} +\newcommand{\Xl}{\langle} +\newcommand{\Xr}{\rangle} +\newcommand{\Xm}{\langle\!\rangle} + + +\begin{document} + +\begin{tikzpicture}[>=stealth, ->, auto, node distance=0.2in] + +\tikzstyle{every node}=[draw=none] + +% $(\epsilon|a^{0,\infty})(a|\epsilon)^{0,\infty}$ + +\begin{scope}[xshift=0in, yshift=0in] + \node (a) {{ + $\begin{aligned} + \quad + & \Phi_0 \big( + {T}^{1} ( + {T}^{2} ( + {\varnothing}^{0}, + {T}^{0}({a}^{0}) + ), {T}^{3}( + {\varnothing}^{4} + ) + )\big) &&= + \Xl_1 + \Xl_2 + a + \Xr_1 + \Xl_2 + \Xm_2 + \Xr_1 + \Xr_0 + &&= \overbracket {\Xl_1 \Xl_2 }%^{\alpha_0} + && a + && \overbracket { \Xr_1 \Xl_2 \Xm_2 \Xr_1 \Xr_0 }%^{\alpha_1} +% &&= \alpha_0 a \alpha_1 + &&= \alpha + \\[-0.3em] + & \Phi_0 \big( + {T}^{1} ( + {T}^{2}( + {\varnothing}^{0}, + {T}^{0}({\varnothing}^{0}) + ), + {T}^{3}\big( + {T}^{4}({a}^{0},{\varnothing}^{0}) + ) + )\big) &&= + \Xl_1 + \Xl_2 + \Xr_1 + \Xl_2 + \Xl_3 + a + \Xr_2 + \Xr_1 + \Xr_0 + &&= \overbracket { \Xl_1 \Xl_2 \Xr_1 \Xl_2 \Xl_3 }%^{\beta_0} + && a + && \overbracket { \Xr_2 \Xr_1 \Xr_0 }%^{\beta_1} +% &&= \beta_0 a \beta_1 + &&= \beta + \\[-0.3em] + & \Phi_0 \big( + {T}^{1} ( + {T}^{2}( + {\epsilon}^{0}, + {\varnothing}^{0} + ), + {T}^{3} ( + {T}^{4}({a}^{0},{\varnothing}^{0}), + {T}^{4}({\varnothing}^{0}, {\epsilon}^{0}) + ) + )\big) &&= + \Xl_1 + \Xl_2 + \Xr_1 + \Xl_2 + \Xl_3 + a + \Xr_2 + \Xl_3 + \Xr_2 + \Xr_1 + \Xr_0 + &&= \overbracket { \Xl_1 \Xl_2 \Xr_1 \Xl_2 \Xl_3 }%^{\gamma_0} + && a + && \overbracket { \Xr_2 \Xl_3 \Xr_2 \Xr_1 \Xr_0 }%^{\gamma_1} +% &&= \gamma_0 a \gamma_1 + &&= \gamma + \quad + \end{aligned}$ + }}; +\end{scope} + +\begin{scope}[xshift=0in, yshift=-2.1in] + \node (a) { + $\begin{aligned} + &\begin{tabular}{c|ll} + $traces (\alpha, \beta)$ & 0 & 1 \\ + \hline \\[-0.5em] + $\rho^{\alpha/\beta}$ & 2 & 0 \\ + $\rho^{\beta/\alpha}$ & 1 & 0 \\ + \end{tabular} + &&\quad + \left[\begin{aligned} + \rho^{\alpha/\beta}_0 &= min (lasth (\Xl_1 \Xl_2), minh (\epsilon)) = min (2, \infty) = 2 \\[-0.3em] + \rho^{\beta/\alpha}_0 &= min (lasth (\Xl_1 \Xl_2), minh (\Xr_1 \Xl_2 \Xl_3)) = min (2, 1) = 1 \\[-0.3em] + \rho^{\alpha/\beta}_1 &= min (\rho^{\alpha/\beta}_0, minh (\Xr_1 \Xl_2 \Xm_2 \Xr_1 \Xr_0)) = min (2,0) = 0 \\[-0.3em] + \rho^{\beta/\alpha}_1 &= min (\rho^{\beta/\alpha}_0, minh (\Xr_2 \Xr_1 \Xr_0)) = min (1,0) = 0 + \end{aligned}\right. + \\[1em] + &\begin{tabular}{c|ll} + $traces (\beta, \gamma)$ & 0 & 1 \\ + \hline \\[-0.5em] + $\rho^{\beta/\gamma}$ & -1 & 0 \\ + $\rho^{\gamma/\beta}$ & -1 & 0 \\ + \end{tabular} + &&\quad + \left[\begin{aligned} + \rho^{\beta/\gamma}_0 &= -1 \\[-0.3em] + \rho^{\gamma/\beta}_0 &= -1 \\[-0.3em] + \rho^{\beta/\gamma}_1 &= min (lasth (\Xr_2), minh (\Xr_1 \Xr_0)) = min (2,0) = 0 \\[-0.3em] + \rho^{\gamma/\beta}_1 &= min (lasth (\Xr_2), minh (\Xl_3 \Xr_2 \Xr_1 \Xr_0)) = min (2,0) = 0 + \end{aligned}\right. + \\[1em] + &\begin{tabular}{c|ll} + $traces (\alpha, \gamma)$ & 0 & 1 \\ + \hline \\[-0.5em] + $\rho^{\alpha/\gamma}$ & 2 & 0 \\ + $\rho^{\gamma/\alpha}$ & 1 & 0 \\ + \end{tabular} + &&\quad + \left[\begin{aligned} + \rho^{\alpha/\gamma}_0 &= min (lasth (\Xl_1 \Xl_2), minh (\epsilon)) = min (2, \infty) = 2 \\[-0.3em] + \rho^{\gamma/\alpha}_0 &= min (lasth (\Xl_1 \Xl_2), minh (\Xr_1 \Xl_2 \Xl_3)) = min (2, 1) = 1 \\[-0.3em] + \rho^{\alpha/\gamma}_1 &= min (\rho^{\alpha/\gamma}_0, minh (\Xr_1 \Xl_2 \Xm_2 \Xr_1 \Xr_0)) = min (2,0) = 0 \\[-0.3em] + \rho^{\gamma/\alpha}_1 &= min (\rho^{\gamma/\alpha}_0, minh (\Xr_2 \Xl_3 \Xr_2 \Xr_1 \Xr_0)) = min (1,0) = 0 + \end{aligned}\right. + \end{aligned}$ + }; +\end{scope} + +\end{tikzpicture} + +\end{document} + diff --git a/re2c/doc/tdfa_v2/part_1_tnfa.tex b/re2c/doc/tdfa_v2/part_1_tnfa.tex index 8f0eaa28..3bbf0ad1 100644 --- a/re2c/doc/tdfa_v2/part_1_tnfa.tex +++ b/re2c/doc/tdfa_v2/part_1_tnfa.tex @@ -347,7 +347,7 @@ If both indices are non-zero, then the sub-RE is a submatch group. Indices are convenient because they allow us to consider individual sub-REs without loosing the submatch context of the whole RE. We call this representation \emph{indexed regular expressions (IREs)}. -An example of IRE can be seen on figure \ref{fig_parse_trees}. +%An example of IRE can be seen on figure \ref{fig_parse_trees}. \begin{Xdef} \emph{Indexed regular expressions (IREs)} over finite alphabet $\Sigma$, denoted $\XIR_\Sigma$, are: @@ -479,8 +479,8 @@ Therefore RE and IRE are equivalent representations. \begin{figure}\label{fig_mark_enum} \includegraphics[width=\linewidth]{img/mark_enum.pdf} \caption{ -An example of constructing IRE for RE $(\epsilon|a^{0,\infty})(a|\epsilon)^{0,3} )$ using $mark$ and $enum$\\ -and some examples of IPT for the resulting IRE. +An example of constructing IRE for RE $(\epsilon|a^{0,\infty})(a|\epsilon)^{0,3}$ using $mark$ and $enum$\\ +and some examples of IPT for the resulting IRE and string ``$a$''. } \end{figure} @@ -643,10 +643,12 @@ it will only add more details. \begin{XThe}\label{theorem_order_compat} For an IRE $r$ and string $w$, let $t_{min}$ be the $<$-minimal tree in $\IPT(r,w)$ - and let $T_{min}$ be the class of the $\prec$-minimal trees in $\IPT(r,w)$. + and let $T_{min} = \IPT_{min}(r,w)$ be the class of the $\prec$-minimal trees in $\IPT(r,w)$. Then $t_{min} \in T_{min}$. \end{XThe} +In the rest of the paper the words ``order'' and ``norm`` refer to s-order and s-norm. + \section{Parenthesized expressions} @@ -694,8 +696,8 @@ If $\alpha, \beta \in PE(r, w)$ for some $r$ and $w$, we say that $\alpha$ and $ For PE $\alpha$, $\beta$, $\alpha \sqcap \beta$ denotes the longest common prefix of $\alpha$ and $\beta$, $\alpha \backslash \beta$ denotes the suffix of $\alpha$ after removing $\alpha \sqcap \beta$, -$lastht(\alpha)$ denotes the height of the last parenthesis in $\alpha$ or $\infty$ if $\alpha$ is empty or begins with a letter, -$minht(\alpha)$ denotes the minimal height of parenthesis in $\alpha$ or $\infty$ if $\alpha$ is empty or begins with a letter, +$lasth(\alpha)$ denotes the height of the last parenthesis in $\alpha$ or $\infty$ if $\alpha$ is empty or begins with a letter, +$minh(\alpha)$ denotes the minimal height of parenthesis in $\alpha$ or $\infty$ if $\alpha$ is empty or begins with a letter, $first(\alpha)$ denotes the first parenthesis in $\alpha$ or $\bot$ if $\alpha$ is empty or begins with a letter. Each PE $\alpha$ can be represented as $\alpha_0 a_1 \alpha_1 \dots a_n \alpha_n$, where $\alpha_i$ is the $i$-th \emph{frame} --- the possibly empty sequence of parentheses between @@ -712,14 +714,24 @@ the index of the first distinct pair of frames is called \emph{fork}. $$ \rho_i = \begin{cases} -1 &\text{if } i < k \\ - min (lastht (\alpha_i \sqcap \beta_i), minht(\alpha_i \backslash \beta_i)) &\text{if } i = k \\ - min (\rho_{i-1}, minht(\alpha_i)) &\text{if } i > k + min (lasth (\alpha_i \sqcap \beta_i), minh(\alpha_i \backslash \beta_i)) &\text{if } i = k \\ + min (\rho_{i-1}, minh(\alpha_i)) &\text{if } i > k \end{cases} $$ We write $traces(\alpha, \beta)$ to denote $\big( trace (\alpha, \beta), trace (\beta, \alpha) \big)$. - $\square$ \end{Xdef} +\begin{figure}\label{fig_pe} +\includegraphics[width=\linewidth]{img/pe.pdf} +\caption{ +An example of PEs for IPTs from figure \ref{fig_mark_enum} and the computation of $traces$ for each pair of PEs.\\ +Here $\alpha \sqsubset \beta$ and $\alpha \sqsubset \gamma$, while +$\beta \sim \gamma$ and $\beta \subset \gamma$, +because $first (\beta \backslash \gamma) = \Xr < \Xl = first (\gamma \backslash \beta)$. +Therefore $\alpha < \beta < \gamma$. +} +\end{figure} + \begin{Xdef}\label{prec1} Let $\alpha$, $\beta$ be comparable PE and $traces(\alpha, \beta) = \big( (\rho_0, \dots, \rho_n), (\rho'_0, \dots, \rho'_n) \big)$. @@ -786,1152 +798,1155 @@ the index of the first distinct pair of frames is called \emph{fork}. % \end{align*} \end{Xdef} - \begin{XLem}\label{lemma_pe_order_antisymm} - The longest-leftmost-precedence relation $<$ is antisymmetric: - if $\alpha < \beta$, then $\beta \not< \alpha$. - \\ - Proof. - Suppose, on the contrary, that $\alpha < \beta$ and $\beta < \alpha$. - Let $\big( (\rho_0, \dots, \rho_n), (\rho'_0, \dots, \rho'_n) \big) = traces(\alpha, \beta)$. + \begin{XThe}\label{theorem_order_on_pe_same_as_on_pt} + Let $s, t \in PT(r, w)$, then + $s <_p t \Leftrightarrow \Phi_{h}(s) < \Phi_{h}(t) \; \forall h$. + \end{XThe} - \medskip +\FloatBarrier - If $\exists i = max \{j \mid \rho_j \neq \rho'_j \}$, then - $\alpha < \beta \implies \alpha \sqsubset \beta \implies \rho_i > \rho'_i$, but - $\beta < \alpha \implies \beta \sqsubset \alpha \implies \rho'_i > \rho_i$. Contradiction. - \medskip +\section{Tagged NFA} - Otherwise $\rho_i = \rho'_i \; \forall i$, then - $\alpha < \beta \implies \alpha \sim \beta \wedge \alpha \subset \beta$ and - $\beta < \alpha \implies \beta \sim \alpha \wedge \beta \subset \alpha$. - Let - $x = first (\alpha \backslash \beta)$, - $y = first (\beta \backslash \alpha)$, then - $\alpha \subset \beta \implies x < y$, but - $\beta \subset \alpha \implies y < x$. Contradiction. - $\square$ - \end{XLem} + \begin{Xdef} + \emph{Tagged Nondeterministic Finite Automaton (TNFA)} + is a structure $(\Sigma, Q, F, q_0, \Delta)$, where: + \begin{itemize} + \item[] $\Sigma$ is a finite set of symbols (\emph{alphabet}) +% \item[] $T$ is a finite set of \emph{tags} +% \item[] $P$ is a finite set of \emph{priorities} + \item[] $Q$ is a finite set of \emph{states} + \item[] $F \subseteq Q$ is the set of \emph{final} states + \item[] $q_0 \in Q$ is the \emph{initial} state + + \item[] $\Delta = \Delta^\Sigma \sqcup \Delta^\epsilon$ is the \emph{transition} relation, where: + \begin{itemize} + \item[] $\Delta^\Sigma \subseteq Q \times \Sigma \times \{\epsilon\} \times Q$ + \item[] $\Delta^\epsilon \subseteq Q \times \{\epsilon\} \times \big( (\YZ \times \YZ) \cup \{\epsilon\} \big) \times Q$ + \end{itemize} +% and all $\epsilon$-transitions from the same state have different priorities: +% $\forall (x, r, \epsilon, y), (\widetilde{x}, \widetilde{r}, \epsilon, \widetilde{y}) \in \Delta^\epsilon: +% x = \widetilde{x} \wedge y = \widetilde{y} \Rightarrow r \!\neq\! \widetilde{r}$. + \end{itemize} + \end{Xdef} +TNFA construction is given by the $F$ function defined on figure \ref{fig_tnfa}. - \begin{XLem}\label{lemma_pe_equiv} - Let $s, t \in PT(r, w)$. - If $s \sim t$, then $\Phi_{h}(h, s) = \Phi_{h}(t) \; \forall h$. + \begin{align*} + otag \big( (i, j, r) \big) &= + \begin{cases} + \epsilon &\text{if } i = 0 \\[-0.5em] + (2i - 1, 2j - 1) &\text{if } i \neq 0 + \end{cases} \\ - Proof. - By induction on the height of $r$. + ctag \big( (i, j, r) \big) &= + \begin{cases} + \epsilon &\text{if } i = 0 \\[-0.5em] + (2i, 2j) \hphantom{{} - 1 {} - 1} &\text{if } i \neq 0 + \end{cases} + \\ + ntag \big( (i, j, r) \big) &= + \begin{cases} + \epsilon &\text{if } i = 0 \\[-0.5em] + (1 - 2i, 1 - 2j) &\text{if } i \neq 0 + \end{cases} + \end{align*} - \medskip +% \begin{align*} +% F \big( (0, 0, r_1 \mid r_2) \big) &= (\Sigma, Q_1 \cup Q_2 \cup \{ x, y \}, x, \{y\}, \Delta) \\ +% \text{where } +% & F(r_1) = (\Sigma, Q_1, x_1, \{y_1\}, \Delta_1), \; +% F(r_2) = (\Sigma, Q_2, x_2, \{y_2\}, \Delta_2) \\ +% & \Delta = \Delta_1 \cup \Delta_2 \cup \big\{ +% (x, \epsilon, \epsilon, x_1), (y_1, \epsilon, ntag(r_2), y), +% (x, \epsilon, ntag(r_1), x_2), (y_2, \epsilon, \epsilon, y) +% \big\} +% \\ +% F \big( (0, 0, r_1 \cdot r_2) \big) &= (\Sigma, Q_1 \cup Q_2 \setminus \{ x_2 \}, x_1, \{y_2\}, \Delta) \\ +% \text{where } +% & F(r_1) = (\Sigma, Q_1, x_1, \{y_1\}, \Delta_1), \; +% F(r_2) = (\Sigma, Q_2, x_2, \{y_2\}, \Delta_2) \\ +% & \Delta = \Delta_1 \cup +% \big\{ (y_1, \alpha, \beta, z) \mid (x_2, \alpha, \beta, z) \in \Delta_2 \big\} +% \\ +% F \big( (0, 0, r^{n, m}) \big) \mid n > 1 &= F \big( (0, 0, r \cdot r^{n - 1, m - 1}) +% \\ +% F \big( (0, 0, r^{0, m}) \big) &= (\Sigma, Q_1 \cup \{ x, y \}, x, \{y\}, \Delta) \\ +% \text{where } +% & F \big( (0, 0, r^{1, m}) \big) = (\Sigma, Q_1, x_1, \{y_1\}, \Delta_1) \\ +% & \Delta = \Delta_1 \cup +% \big\{ (x, \epsilon, \epsilon, x_1), (y_1, \epsilon, ntag(r_2), y), (x, \epsilon, ntag(r), y) \big\} +% \\ +% F \big( (0, 0, r^{1, \infty}) \big) &= (\Sigma, Q_1 \cup \{ y \}, x_1, \{y\}, \Delta) \\ +% \text{where } +% & F(r) = (\Sigma, Q_1, x_1, \{y_1\}, \Delta_1) \\ +% & \Delta = \Delta_1 \cup +% \big\{ (y_1, \epsilon, \epsilon, x_1), (y_1, \epsilon, \epsilon, y) \big\} +% \\ +% F \big( (0, 0, r^{1, m}) \big) \mid m \neq \infty &= (\Sigma, Q_1 \cup \hdots \cup Q_m, x_1, \{y_m\}, \Delta) \\ +% \text{where } +% & f_i = F(r) = (\Sigma, Q_i, \{y_i\}, x_i, \Delta_i) \; \forall i = \overline{1, m} \\ +% & \Delta = \Delta_1 \cup \hdots \cup \Delta_m +% \cup \{ (y_i, \epsilon, \epsilon, x_{i+1}), (y_i, \epsilon, \epsilon, y_m) \}_{i=1}^{m-1} +% \\ +% \end{align*} - Induction basis. - For RT of height $1$ we have - $| PT(r, w) | \leq 1 \; \forall w$, - therefore $s = t$ and $\Phi_{h}(s) = \Phi_{h}(t)$. +\begin{figure}\label{fig_tnfa} +\includegraphics[width=\linewidth]{img/tnfa.pdf} +\caption{ +TNFA construction. +} +\end{figure} - \medskip - Induction step. - We have - $s = T^{d} (s_1, \dots, s_n)$ and - $t = T^{d} (t_1, \dots, t_m)$. - If $d = 0$, then $\Phi_{h}(s) = str(s) = w = str(t) = \Phi_{h}(t)$. - Otherwise $d \neq 0$. - By lemma \ref{incomparability_equivdef} we have $s \sim t \Rightarrow \forall p: \|s\|_p = \|t\|_p$. - This implies $n = m$ (otherwise the norm of subtree at position $min(n,m)+1$ is $\infty$ for only one of $s$, $t$). - Therefore - $\Phi_{h}(s) = \Xl_{h+1} \Phi_{h+1}(s_1), \dots, \Phi_{h+1}(s_n) \Xr_h$ and - $\Phi_{h}(t) = \Xl_{h+1} \Phi_{h+1}(t_1), \dots, \Phi_{h+1}(t_n) \Xr_h$. -% Consider any $i \leq n$. - It suffices to show that $\forall i \leq n: \Phi_{h+1}(s_i) = \Phi_{h+1}(t_i)$. - We have $\forall p: \|s_i\|_p = \|t_i\|_p$ (implied by $\forall p: \|s\|_p = \|t\|_p$), - therefore by lemma \ref{incomparability_equivdef} we have $s_i \sim t_i$, - and by lemma \ref{lemma_subtrees} $\exists r', w': s_i, t_i \in PT(r', w')$, - where the height of $r'$ is less than the height of $r$. - By induction hypothesis $\Phi_{h+1}(s_i) = \Phi_{h+1}(t_i)$. - $\square$ - \end{XLem} - \begin{XLem}\label{lemma_pe_less_1} - Let $s, t \in PT(r, w)$. - If $s <_p t$ and $|p| = 1$, then $\Phi_{h}(s) < \Phi_{h}(t) \; \forall h$. - \\ - Proof. +\begin{figure*} +\begin{multicols}{2} - By lemma conditions $|p| = 1$, which implies that $s$ and $t$ are compound PT - $s = T^{d} (s_1, \dots, s_n)$ and - $t = T^{d} (t_1, \dots, t_m)$, where - $d \neq 0$ - (because $\Lambda$ is a prefix of decision position $p$). - Therefore $\Phi_{h}(s)$, $\Phi_{h}(t)$ can be represented as follows, - where $k$ is the number of frames and $j$ is the fork: - \begin{alignat*}{7} - \Phi_{h}(s) &\;=\; \Xl_{h+1} &&\Phi_{h+1}(s_1) &&\dots &&\Phi_{h+1}(s_n) \Xr_h - &&\;=\; \beta_0 a_1 \dots a_j \beta_j &&\;\big|\; && \gamma_j a_{j + 1} \dots a_k \gamma_k \\[-0.5em] - \Phi_{h}(t) &\;=\; \Xl_{h+1} &&\Phi_{h+1}(t_1) &&\dots &&\Phi_{h+1}(t_m) \Xr_h - &&\;=\; \beta_0 a_1 \dots a_j \beta_j &&\;\big|\; && \delta_j a_{j + 1} \dots a_k \delta_k - \end{alignat*} -% - By lemma conditions $|p| = 1$, therefore $p \in \YN$. - Consider any $i \in \YN$ such that $i < p$. - By lemma conditions $s <_p t$, which means - $\|s\|_p > \|t\|_p \wedge \|s\|_q = \|t\|_q \;\forall q < p$. - In particular $\|s_i\|_q = \|t_i\|_q \;\forall q$, - therefore by lemma \ref{incomparability_equivdef} $s_i \sim t_i$ - and by lemma \ref{lemma_pe_equiv} we have $\Phi(s_i) = \Phi(t_i)$. - Let $traces (\Phi_{h}(s), \Phi_{h}(t)) = \big( (\rho_0, \dots, \rho_k), (\rho'_0, \dots, \rho'_k) \big)$ - and consider the following cases. + \begin{algorithm}[H] \DontPrintSemicolon \SetKwProg{Fn}{}{}{} \SetAlgoInsideSkip{medskip} + \Fn {$\underline{match (\XN, \alpha_1 \dots \alpha_n)} \smallskip$} { + $(\Sigma, T, P, Q, F, q_0, T, \Delta) = \XN$ \; + $X = closure(\{ (\bot, q_0, \epsilon, t_0) \}, F, \Delta)$ \; + $(X, B, D) = step(0, X, B, D)$ \; - \medskip + \BlankLine + \For {$i = \overline{1,n}$} { + $X = reach(X, \Delta, \alpha_i)$ \; + $X = closure(X, B, D)$ \; + $(X, B, D) = step(i, X, B, D)$ \; + } - First case: - $\infty = \|s_p\| > \|t_p\|$. - In this case $s|_p$ does not exist - (because $p$ corresponds to a submatch position in $r$, - therefore $p \in Pos(s)$ implies $p \in Sub(s)$, - which contradicts $\|s_p\| = \infty$). - Fork happens immediately after $\Phi_{h+1}(s_{p-1})$, $\Phi_{h+1}(t_{p-1})$: - \begin{alignat*}{7} - \Phi_{h}(s) &\;=\; \Xl_{h+1} &&\Phi_{h+1}(s_1) &&\dots &&\Phi_{h+1}(s_{p-1}) - &&\;\big|\; \Xr_{h} && && \\[-0.5em] - \Phi_{h}(t) &\;=\; \Xl_{h+1} &&\Phi_{h+1}(t_1) &&\dots &&\Phi_{h+1}(t_{p-1}) - &&\;\big|\; \Phi_{h+1}(t_p) &&\dots &&\Phi_{h+1}(t_m) \Xr_{h} - \end{alignat*} - % - In this case fork frame is the last frame: $j = k$, and therefore $\rho_j = \rho'_j = h$ - (because $\gamma_j$ and $\delta_j$ contain the closing parenthesis $\Xr_{h}$). - For all $i < j$ we have $\rho_i = \rho'_i = -1$, therefore $\Phi_{h}(s) \sim \Phi_{h}(t)$. - Furthermore, $first(\gamma_j)$ is $\Xr$ and $first(\delta_j)$ is one of $\Xl$ and $\Xm$, - therefore $\Phi_{h}(s) \subset \Phi_{h}(t)$. - Consequently $\Phi_{h}(s) < \Phi_{h}(t)$. + \BlankLine + \lIf {$\exists (\Xund, p, \Xund, t) \in X \mid p \in F$} { + \Return $t$ + } \lElse { + \Return $\bot$ + } + } + \end{algorithm} - \medskip - Second case: $\infty > \|s_p\| > \|t_p\| = -1$. - In this case both $s_p$ and $t_p$ exist, - $s_p$ is not $\varnothing$ and $t_p$ is $\varnothing$, - and fork happens immediately after $\Phi_{h+1}(s_{p-1})$, $\Phi_{h+1}(t_{p-1})$: - \begin{alignat*}{8} - \Phi_{h}(s) &\;=\; \Xl_{h+1} &&\Phi_{h+1}(s_1) &&\dots &&\Phi_{h+1}(s_{p-1}) - &&\;\big|\; \Xl_{h+1} \; x \; \Xr_{h+1} \; &&\Phi_{h+1}(s_{p+1}) &&\dots &&\Phi_{h+1}(s_n) \Xr_{h} \\[-0.5em] - \Phi_{h}(t) &\;=\; \Xl_{h+1} &&\Phi_{h+1}(t_1) &&\dots &&\Phi_{h+1}(t_{p-1}) - &&\;\big|\; \Xm_{h+1} \;\;\;\;\;\; &&\Phi_{h+1}(t_{p+1}) &&\dots &&\Phi_{h+1}(t_m) \Xr_{h} - \end{alignat*} - % - If $j$-th frame is the last, we have $\rho_j = \rho'_j = h$ like in the first case. - Otherwise we have $\rho_j = \rho'_j = h + 1$, - because $minht(\gamma_j)$, $minht(\delta_j) \geq h + 1$ - and $lastht (\beta_j) = h + 1$ - (because if $p = 1$ then $\beta_j = \Xl_{h+1}$, otherwise - $s_{p-1}$ exists and the last parenthesis in $\beta_j$ - is last parenthesis of $\Phi_{h+1}(s_{p-1})$, which is either $\Xr_{h+1}$ or $\Xm_{h+1}$). - For subsequent frames $i$ such that $j < i < k$ we have $\rho_i = \rho'_i = h + 1$ - (because $minht(\gamma_j)$, $minht(\delta_j) \geq h + 1$), - and for the last pair of frames we have $\rho_k = \rho'_k = h$. - So in this case again $\Phi_{h}(s) \sim \Phi_{h}(t)$. - Furthermore, $first (\gamma_j) = \Xl < \Xm = first (\delta_j)$, therefore $\Phi_{h}(s) \subset \Phi_{h}(t)$ - and $\Phi_{h}(s) < \Phi_{h}(t)$. + \begin{algorithm}[H] \DontPrintSemicolon \SetKwProg{Fn}{}{}{} \SetAlgoInsideSkip{medskip} + \Fn {$\underline{reach (X, \alpha)} \smallskip$} { + \Return $\{ (p, p', \epsilon, t) \mid$ \; + $\qquad (\Xund, p, \Xund, t) \in X \wedge + (p, \alpha, \epsilon, p') \in \Delta^\Sigma \}$ + } + \end{algorithm} - \medskip - - Third case: $\infty > \|s_p\| > \|t_p\| \geq 0$. - In this case both $s_p$ and $t_p$ exist and none of them is $\varnothing$, - and fork happens somewhere after the opening parenthesis $\Xl$ - and before the closing parenthesis $\Xr$ in $\Phi_{h}(s_p)$, $\Phi_{h}(t_p)$: - \begin{alignat*}{9} - \Phi_{h}(s) &\;=\; \Xl_{h+1} &&\Phi_{h+1}(s_1) &&\dots &&\Phi_{h+1}(s_{p-1}) &&\; \Xl_{h+2} \; x - &&\;\big|\; y \; \Xr_{h+1} \; &&\Phi_{h+1}(s_{p+1}) &&\dots &&\Phi_{h+1}(s_n) \Xr_{h} \\[-0.5em] - \Phi_{h}(t) &\;=\; \Xl_{h+1} &&\Phi_{h+1}(t_1) &&\dots &&\Phi_{h+1}(t_{p-1}) &&\; \Xl_{h+2} \; x - &&\;\big|\; z \; \Xr_{h+1} \; &&\Phi_{h+1}(t_{p+1}) &&\dots &&\Phi_{h+1}(t_m) \Xr_{h} - \end{alignat*} - % - Let $\delta_l$ be the frame containing the closing parenthesis $\Xr_{h+1}$ of $\Phi_{h+1}(t_p)$. - Because $\|s_p\| > \|t_p\|$, - the closing parenthesis $\Xr_{h+1}$ of $\Phi_{h+1}(s_p)$ is not contained in $\gamma_{l}$, - and $l$-th frame is not the last one. - Therefore $minht (\gamma_l) \geq h+2$ and $minht (\delta_l) = h+1$. - Furthermore, $minht(x)$, $minht(y)$, $minht(z) \geq h + 2$, - therefore $lastht(\beta_j) = lastht(\Xl_{h+2} \; x) \geq h+2$ and we have - $\rho_i, \rho'_i \geq h+2$ for all frames $j \leq i < l$ - (note that it might be $\rho_i < \rho'_i$), - and for the $l$-th frame $\rho_l \geq h+2 > h+1 = \rho'_l$. - For subsequent frames $\gamma_i$, $\delta_i$ such that $l < i < k$ we have - $minht(\gamma_i)$, $minht(\delta_i) \geq h + 1$, - therefore $\rho_i \geq h+1 = \rho'_i$. - For the last pair of frames $\rho_k = \rho'_k = h$. - Therefore in this case $\Phi_{h}(s) \sqsubset \Phi_{h}(t)$, - which implies $\Phi_{h}(s) < \Phi_{h}(t)$. - $\square$ - \end{XLem} +\iffalse + \begin{algorithm}[H] \DontPrintSemicolon \SetKwProg{Fn}{}{}{} \SetAlgoInsideSkip{medskip} + \Fn {$\underline{closure(k, X, B, D)} \smallskip$} { - \begin{XLem}\label{lemma_pe_less} - Let $s, t \in PT(r, w)$. - If $s <_p t$, then $\Phi_{h}(s) < \Phi_{h}(t) \; \forall h$. - \\ - Proof. - By induction on the length of $p$. + $Y = \emptyset$ \; + \For {$x \in X$} { + Y = $rclosure (x, Y, B, D)$ + } - \medskip + \Return $Y$ \; + } + \end{algorithm} - Induction basis for $|p| = 1$ is given by lemma \ref{lemma_pe_less_1}. - \medskip + \begin{algorithm}[H] \DontPrintSemicolon \SetKwProg{Fn}{}{}{} \SetAlgoInsideSkip{medskip} + \Fn {$\underline{rclosure(x, Y, B, D)} \smallskip$} { + $(q, p, u, t) = x$ \; + \If {unmarked $p$} { + mark $p$ \; - Induction step. - Let $|p| \geq 2$, then $s$ and $t$ are compound PT - $s = T^{d} (s_1, \dots, s_n)$ and - $t = T^{d} (t_1, \dots, t_m)$, where - $d \neq 0$ (because $\Lambda$ is a prefix of decision position $p$). - % - Furthermore, let $p = p'.p''$, where $p' \in \YN$. - Subtrees $s' = s_{p'}$ and $t' = t_{p'}$ exist, because $p'$ a proper prefix of decision position $p$, - and they also must be compount PT - $s' = T^{d'} (s'_1, \dots, s'_{n'})$ and - $t' = T^{d'} (t'_1, \dots, t'_{m'})$, - because $|p''| > 0$, and it must be - $d' \neq 0$ (because $p'$ is a prefix of decision position $p$). - % - For subtrees $s_i$ and $t_i$ where $i < p'$ we have - $\|s_i\|_q = \|t_i\|_q \;\forall q$ (implied by $s <_p t$), - therefore by lemma \ref{incomparability_equivdef} - $s_i \sim t_i$, and by lemma \ref{lemma_pe_equiv} we have $\Phi_{h+1}(s_i) = \Phi_{h+1}(t_i)$. - % - Therefore $\Phi_{h}(s)$, $\Phi_{h}(t)$ can be represented as follows: - \begin{alignat*}{9} - \Phi_{h}(s) - \;&= - \;&& \Xl_{h+1} \Phi_{h+1}(s_1) \dots \Phi_{h+1}(s_{p'-1}) - \;&& \overbrace {\Xl_{h+2} \Phi_{h+2}(s'_1) \dots \Phi_{h+2}(s'_{n'}) \Xr_{h+1}}^{\Phi_{h+1}(s')} - \;&& \Phi_{h+1}(s_{p'+1}) \Phi_{h+1}(s_n) \Xr_{h} - \\ - \Phi_{h}(t) - \;&= - \;&& \Xl_{h+1} \Phi_{h+1}(t_1) \dots \Phi_{h+1}(t_{p'-1}) - \;&& \underbrace {\Xl_{h+2} \Phi_{h+2}(t'_1) \dots \Phi_{h+2}(t'_{m'}) \Xr_{h+1}}_{\Phi_{h+1}(t')} - \;&& \Phi_{h+1}(t_{p'+1}) \Phi_{h+1}(t_m) \Xr_{h} - \end{alignat*} + \If {$p \in F \vee \exists (p, \alpha, \Xund, \Xund) \in \Delta^\Sigma$} { + \If {$\nexists y = (\Xund, p, \Xund, \Xund) \in Y$} { + $Y = Y \cup \{ x \}$ + } \ElseIf { $less (x, y, B, D)$} { + $Y = Y \cup \{ x \} \setminus \{ y \}$ + } + } - We have $\|s\|_q = \|t\|_q \;\forall q < p'$ (implied by $s <_p t$), - therefore by lemma \ref{lemma_subtrees} $\exists r', w' : s', t' \in PT(r', w')$. - Moreover, $s' <_{p''} t'$ and $|p''| < |p|$, therefore by induction hypothesis $\Phi_{p+1}(s') < \Phi_{p+1}(t')$. - % - On the other hand, if $j$ is the fork and $f \leq j \leq k$ then - $\Phi_{h}(s)$, $\Phi_{h}(t)$ can be represented as: - \begin{alignat*}{9} - \Phi_{h}(s) - \;&= - \;&& \beta_0 a_1 \dots a_f \beta_f^1 - \;&& \overbrace {\beta_f^2 a_{f+1} \dots a_j \beta_j \;\big|\; \gamma_j a_{j+1} \dots a_k \gamma_k^1}^{\Phi_{h+1}(s')} - \;&& \gamma_k^2 a_{k+1} \dots a_l \gamma_l - \\[-0.5em] - \Phi_{h}(t) - \;&= - \;&& \beta_0 a_1 \dots a_f \beta_f^1 - \;&& \underbrace {\beta_f^2 a_{f+1} \dots a_j \beta_j \;\big|\; \delta_j a_{j+1} \dots a_k \delta_k^1}_{\Phi_{h+1}(t')} - \;&& \delta_k^2 a_{k+1} \dots a_l \delta_l - \end{alignat*} + \BlankLine + \ForEach {$(p, \epsilon, w, p') \in \Delta^\epsilon$} { + Y = $rclosure ((q, p', u w, t), Y, B, D)$ \; + } -% \begin{alignat*}{9} -% \Phi_{h}(s) -% \;&= -% \;&&\overbrace {\Xl_{h+1} \Phi_{h+1}(s_1) \dots \Phi_{h+1}(s_{p'-1})} -% ^{\beta_0 a_1 \dots a_i \beta_i^1} -% \;&&\overbrace {\Xl_{h+2} \Phi_{h+2}(s'_1) \dots \Phi_{h+2}(s'_{n'}) \Xr_{h+1}} -% ^{\beta_i^2 a_{i+1} \dots a_j \beta_j \;\big|\; \gamma_j a_{j+1} \dots a_k \gamma_k^1} -% \;&&\overbrace {\Phi_{h+1}(s_{p'+1}) \Phi_{h+1}(s_n) \Xr_{h}} -% ^{\gamma_k^2 a_{k+1} \dots a_l \gamma_l} -% \\ -% \Phi_{h}(t) -% \;&= -% \;&&\underbrace {\Xl_{h+1} \Phi_{h+1}(t_1) \dots \Phi_{h+1}(t_{p'-1})} -% _{\beta_0 a_1 \dots a_i \beta_i^1} -% \;&&\underbrace {\Xl_{h+2} \Phi_{h+2}(t'_1) \dots \Phi_{h+2}(t'_{m'}) \Xr_{h+1}} -% _{\beta_i^2 a_{i+1} \dots a_j \beta_j \;\big|\; \delta_j a_{j+1} \dots a_k \delta_k^1} -% \;&&\underbrace {\Phi_{h+1}(t_{p'+1}) \Phi_{h+1}(t_m) \Xr_{h}} -% _{\delta_k^2 a_{k+1} \dots a_l \delta_l} -% \end{alignat*} + unmark $p$ \; + } + \Return $Y$ \; + } + \end{algorithm} - Let $traces (\Phi_{h}(s), \Phi_{h}(t)) = \big( (\rho_0, \dots, \rho_l), (\rho'_0, \dots, \rho'_l) \big)$ - and $traces (\Phi_{h+1}(s'), \Phi_{h+1}(t')) = \big( (\sigma_h, \dots, \sigma_k), (\sigma'_h, \dots, \sigma'_k) \big)$. - % - We show that for frames $i$ such that $j \leq i < k$ we have - $\rho_i = \sigma_i \wedge \rho'_i = \sigma'_i$, - and for subsequent frames $k \leq i \leq l$ we have $\rho_i = \rho'_i$. + \begin{algorithm}[H] \DontPrintSemicolon \SetKwProg{Fn}{}{}{} + \Fn {$\underline {less (x, y, B, D)} \smallskip$} { + $(\Xund, \Xund, l) = precedence (x, y, B, D)$ \; + \Return $l$ \; + } + \end{algorithm} +\fi - \medskip - First case: $i = j \leq k \leq l$ (the fork frame). - Because $\Phi_{h+1}(s')$ and $\Phi_{h+1}(t')$ have nonempty common prefix $\Xl_{h+2}$, - we have $lasht (\Phi_{h}(s) \sqcap \Phi_{h}(t)) = lastht (\Phi_{h+1}(s') \sqcap \Phi_{h+1}(t')) \geq h + 2$. - % - If $j < k$ then $minht (\gamma_j)$, $minht (\delta_j)$ are not affected by appending - $\gamma^2_k$, $\delta^2_k$ and therefore $\rho_j = \sigma_j \wedge \rho'_j = \sigma'_j$. - % - Else if $j = k < l$ then we have $minht (\gamma^1_k) = minht (\delta^1_k) = h + 1$ and - $minht (\gamma^2_k) = minht (\delta^2_k) \geq h + 1$, and - therefore $\rho_j = \rho'_j = h + 1$. - % - Finally, if $j = l$ then $minht (\gamma_j) = minht (\delta_j) = h$ and $\rho_j = \rho'_j = h$. + \begin{algorithm}[H] \DontPrintSemicolon \SetKwProg{Fn}{}{}{} \SetAlgoInsideSkip{medskip} + \Fn {$\underline{step(k, X, B, D)} \smallskip$} { + let $\{ x_i \}_{i=1}^{n} = \{(q_i, p_i, u_i, t_i) \}_{i=1}^{n} = X$ - \medskip + \BlankLine + \tcc {update tag values in $X$} + \For {$i = \overline {1, n}$} { + $b_1 \dots b_m = u_i$ \; + \lFor {$j = \overline{1, m}$} { + $t_i[b_j] = k$ + } + } - Second case: $j < i < k$. - In this case the calculation of $\rho_i$, $\rho'_i$ depends on $\rho_j$, $\rho'_j$ - (for which we have shown $\rho_j = \sigma_j \wedge \rho'_j = \sigma'_j$) and - is not affected by the appended $\gamma^2_k$, $\delta^2_k$, therefore - $\rho_i = \sigma_i \wedge \rho'_i = \sigma'_i$. + \BlankLine + \tcc {update $B$ and $D$ matrices} + \For {$i = \overline {1, n}$} { + \For {$j = \overline {1, n}$} { + $(h_1, h_2, l) = precedence (x_i, x_j, B, D)$ \; + $B' [p_i] [p_j] = h_1, \; D' [p_i] [p_j] = l$ \; + $B' [p_j] [p_i] = h_2, \; D' [p_j] [p_i] = -l$ \; + } + } - \medskip + \BlankLine + \Return $(X, B', D')$ \; + } + \end{algorithm} - Third case: $j < i = k < l$. We have - $minht (\gamma^1_k) = minht (\delta^1_k) = h + 1$ and - $minht (\gamma^2_k) = minht (\delta^2_k) \geq h + 1$, - and none of the preceding frames after the fork contain parentheses with height less than $h + 1$, - therefore $\rho_k = \rho'_k = h + 1$. + \columnbreak - \medskip - Fourth case: $k < i < l$. - We have $\rho_i = \rho'_i = h + 1$, - because $\rho_k = \rho'_k = h + 1$ and $minht(\gamma_i)$, $minht(\delta_i) \geq h + 1$. + \begin{algorithm}[H] \DontPrintSemicolon \SetKwProg{Fn}{}{}{} + \Fn {$\underline {precedence (x_1, x_2, B, D)} \smallskip$} { + $(q_1, \Xund, a_1 \dots a_n, \Xund) = x_1$ \; + $(q_2, \Xund, b_1 \dots b_m, \Xund) = x_2$ \; + $k = 1, \; h_1 = h_2 = \infty, \; l = 0$ \; - \medskip + \BlankLine + \tcc {if fork frame, find fork} + \If { $q_1 = q_2$ } { + \While {$k < min (n, m)$ and $a_k = b_k$} { + $k = k + 1$ + } + } - Fifth case: $i = l$. - We have $\rho_l = \rho'_l = h$. + \BlankLine + \tcc {longest-precedence} + \If { $q_1 = q_2$ } { + $h_1 = h_2 = height (a_{k-1})$ + } \ElseIf {$k > 1$} { + $h_1 = B [q_1] [q_2], \; h_2 = B [q_2] [q_1]$ + } + \lFor {$i = \overline{k, n}$} { + $h_1 = min (h_1, height (a_i))$ + } + \lFor {$i = \overline{k, m}$} { + $h_2 = min (h_2, height (b_i))$ + } + \lIf {$h_1 > h_2$} {\Return $(h_1, h_2, -1)$} + \lIf {$h_1 < h_2$} {\Return $(h_1, h_2, 1)$} - \medskip + \BlankLine + \tcc {leftmost-precedence} + \If { $q_1 = q_2$ } { + \lIf {$k = n = m$} { $l = 0$ } + \lElseIf {$k = n$} { $l = -1$ } + \lElseIf {$k = m$} { $l = 1$ } + \lElseIf {$a_k mod 2 \equiv 0$} { $l = -1$ } + \lElseIf {$b_k mod 2 \equiv 0$} { $l = 1$ } + \lElseIf {$a_k > b_k$} { $l = -1$ } + \lElseIf {$a_k < b_k$} { $l = 1$ } + } \Else { + $l = D [q_1] [q_2]$ + } + \Return $(h_1, h_2, l)$ + } + \end{algorithm} - So, we have shown that $\rho_i = \sigma_i \wedge \rho'_i = \sigma'_i$ for $j \leq i < k$ - and $\rho_i = \rho'_i$ for $k \leq i \leq l$. - It trivially follows that $\Phi_{h+1}(s') \sqsubset \Phi_{h+1}(t')$ implies $\Phi_{h}(s) \sqsubset \Phi_{h}(t)$, and - $\Phi_{h+1}(s') \sim \Phi_{h+1}(t')$ implies $\Phi_{h}(s) \sim \Phi_{h}(t)$. - Because none of $\Phi_{h+1}(s')$, $\Phi_{h+1}(t')$ is a proper prefix of another, - $\Phi_{h+1}(s') \subset \Phi_{h+1}(t')$ implies $\Phi_{h}(s) \subset \Phi_{h}(t)$. - Therefore $\Phi_{h+1}(s') < \Phi_{h+1}(t')$ implies $\Phi_{h}(s) < \Phi_{h}(t)$. - $\square$ - \end{XLem} +\end{multicols} +\begin{center} +\caption{Matching algorithm.} +\end{center} +\end{figure*} - \begin{XThe} - Let $s, t \in PT(r, w)$, then - $s <_p t \Leftrightarrow \Phi_{h}(s) < \Phi_{h}(t) \; \forall h$. - \\ - Proof. - - \medskip - - $\Rightarrow$. Given by lemma \ref{lemma_pe_less}. - - \medskip - - $\Leftarrow$. - We have $\Phi_{h}(s) < \Phi_{h}(t) \; \forall h$. - Suppose that $\nexists p: s <_p t$. - By lemma \ref{lemma_pe_order_antisymm} either $s = t$ - (in which case $\Phi_{h}(s) = \Phi_{h}(t)$, contradiction) - or $t <_q s$ for some $q$ - (in which case $\Phi_{h}(t) < \Phi_{h}(s)$ by lemma \ref{lemma_pe_less}, contradiction). - Therefore $\exists p: s <_p t$. - $\square$ - \end{XThe} - - -\section{Tagged NFA} - - \begin{Xdef} - \emph{Tagged Nondeterministic Finite Automaton (TNFA)} - is a structure $(\Sigma, Q, F, q_0, \Delta)$, where: - \begin{itemize} - \item[] $\Sigma$ is a finite set of symbols (\emph{alphabet}) -% \item[] $T$ is a finite set of \emph{tags} -% \item[] $P$ is a finite set of \emph{priorities} - \item[] $Q$ is a finite set of \emph{states} - \item[] $F \subseteq Q$ is the set of \emph{final} states - \item[] $q_0 \in Q$ is the \emph{initial} state - - \item[] $\Delta = \Delta^\Sigma \sqcup \Delta^\epsilon$ is the \emph{transition} relation, where: - \begin{itemize} - \item[] $\Delta^\Sigma \subseteq Q \times \Sigma \times \{\epsilon\} \times Q$ - \item[] $\Delta^\epsilon \subseteq Q \times \{\epsilon\} \times \big( (\YZ \times \YZ) \cup \{\epsilon\} \big) \times Q$ - \end{itemize} -% and all $\epsilon$-transitions from the same state have different priorities: -% $\forall (x, r, \epsilon, y), (\widetilde{x}, \widetilde{r}, \epsilon, \widetilde{y}) \in \Delta^\epsilon: -% x = \widetilde{x} \wedge y = \widetilde{y} \Rightarrow r \!\neq\! \widetilde{r}$. - \end{itemize} - \end{Xdef} - -TNFA construction is given by the $F$ function defined on figure \ref{fig_tnfa}. - - \begin{align*} - otag \big( (i, j, r) \big) &= - \begin{cases} - \epsilon &\text{if } i = 0 \\[-0.5em] - (2i - 1, 2j - 1) &\text{if } i \neq 0 - \end{cases} - \\ - ctag \big( (i, j, r) \big) &= - \begin{cases} - \epsilon &\text{if } i = 0 \\[-0.5em] - (2i, 2j) \hphantom{{} - 1 {} - 1} &\text{if } i \neq 0 - \end{cases} - \\ - ntag \big( (i, j, r) \big) &= - \begin{cases} - \epsilon &\text{if } i = 0 \\[-0.5em] - (1 - 2i, 1 - 2j) &\text{if } i \neq 0 - \end{cases} - \end{align*} - -% \begin{align*} -% F \big( (0, 0, r_1 \mid r_2) \big) &= (\Sigma, Q_1 \cup Q_2 \cup \{ x, y \}, x, \{y\}, \Delta) \\ -% \text{where } -% & F(r_1) = (\Sigma, Q_1, x_1, \{y_1\}, \Delta_1), \; -% F(r_2) = (\Sigma, Q_2, x_2, \{y_2\}, \Delta_2) \\ -% & \Delta = \Delta_1 \cup \Delta_2 \cup \big\{ -% (x, \epsilon, \epsilon, x_1), (y_1, \epsilon, ntag(r_2), y), -% (x, \epsilon, ntag(r_1), x_2), (y_2, \epsilon, \epsilon, y) -% \big\} -% \\ -% F \big( (0, 0, r_1 \cdot r_2) \big) &= (\Sigma, Q_1 \cup Q_2 \setminus \{ x_2 \}, x_1, \{y_2\}, \Delta) \\ -% \text{where } -% & F(r_1) = (\Sigma, Q_1, x_1, \{y_1\}, \Delta_1), \; -% F(r_2) = (\Sigma, Q_2, x_2, \{y_2\}, \Delta_2) \\ -% & \Delta = \Delta_1 \cup -% \big\{ (y_1, \alpha, \beta, z) \mid (x_2, \alpha, \beta, z) \in \Delta_2 \big\} -% \\ -% F \big( (0, 0, r^{n, m}) \big) \mid n > 1 &= F \big( (0, 0, r \cdot r^{n - 1, m - 1}) -% \\ -% F \big( (0, 0, r^{0, m}) \big) &= (\Sigma, Q_1 \cup \{ x, y \}, x, \{y\}, \Delta) \\ -% \text{where } -% & F \big( (0, 0, r^{1, m}) \big) = (\Sigma, Q_1, x_1, \{y_1\}, \Delta_1) \\ -% & \Delta = \Delta_1 \cup -% \big\{ (x, \epsilon, \epsilon, x_1), (y_1, \epsilon, ntag(r_2), y), (x, \epsilon, ntag(r), y) \big\} -% \\ -% F \big( (0, 0, r^{1, \infty}) \big) &= (\Sigma, Q_1 \cup \{ y \}, x_1, \{y\}, \Delta) \\ -% \text{where } -% & F(r) = (\Sigma, Q_1, x_1, \{y_1\}, \Delta_1) \\ -% & \Delta = \Delta_1 \cup -% \big\{ (y_1, \epsilon, \epsilon, x_1), (y_1, \epsilon, \epsilon, y) \big\} -% \\ -% F \big( (0, 0, r^{1, m}) \big) \mid m \neq \infty &= (\Sigma, Q_1 \cup \hdots \cup Q_m, x_1, \{y_m\}, \Delta) \\ -% \text{where } -% & f_i = F(r) = (\Sigma, Q_i, \{y_i\}, x_i, \Delta_i) \; \forall i = \overline{1, m} \\ -% & \Delta = \Delta_1 \cup \hdots \cup \Delta_m -% \cup \{ (y_i, \epsilon, \epsilon, x_{i+1}), (y_i, \epsilon, \epsilon, y_m) \}_{i=1}^{m-1} -% \\ -% \end{align*} - -\begin{figure}\label{fig_tnfa} -\includegraphics[width=\linewidth]{img/tnfa.pdf} -\caption{ -TNFA construction. -} -\end{figure} - - - - -\begin{figure*} -\begin{multicols}{2} +\begin{figure*} +\begin{multicols}{2} \begin{algorithm}[H] \DontPrintSemicolon \SetKwProg{Fn}{}{}{} \SetAlgoInsideSkip{medskip} - \Fn {$\underline{match (\XN, \alpha_1 \dots \alpha_n)} \smallskip$} { - $(\Sigma, T, P, Q, F, q_0, T, \Delta) = \XN$ \; - $X = closure(\{ (\bot, q_0, \epsilon, t_0) \}, F, \Delta)$ \; - $(X, B, D) = step(0, X, B, D)$ \; - - \BlankLine - \For {$i = \overline{1,n}$} { - $X = reach(X, \Delta, \alpha_i)$ \; - $X = closure(X, B, D)$ \; - $(X, B, D) = step(i, X, B, D)$ \; + \Fn {$\underline{closure \Xund goldberg \Xund radzik(X, F, \Delta)} \smallskip$} { + empty stacks $topsort$, $newpass$ \; + $result(q) \equiv \bot$ \; + $status(q) \equiv \mathit{OFFSTACK}$ \; + \For {$(q, x) \in X$} { + $relax(q, x, result, topsort)$ \; } + \While {$topsort$ is not empty} { + \While {$topsort$ is not empty} { + $q = pop(topsort)$ \; - \BlankLine - \lIf {$\exists (\Xund, p, \Xund, t) \in X \mid p \in F$} { - \Return $t$ - } \lElse { - \Return $\bot$ + \If {$status(q) = \mathit{TOPSORT}$} { + $push(newpass, n)$ \; + } \ElseIf {$status(q) = \mathit{NEWPASS}$} { + $status(q) = \mathit{TOPSORT}$ \; + $push(topsort, q)$ \; + $scan(q, result, topsort)$ \; + } + } + \While {$newpass$ is not empty} { + $q = pop(newpass)$ \; + $scan(q, result, topsort)$ \; + $status(q) = \mathit{OFFSTACK}$ \; + } } + \Return $\big\{ (q, x) \mid x = result(q) \; \wedge$ \; + $\hspace{6em} \big(q \in F \vee \exists (p, \alpha, \Xund, \Xund) \in \Delta^\Sigma \big) \big\}$ \; } \end{algorithm} + \columnbreak \begin{algorithm}[H] \DontPrintSemicolon \SetKwProg{Fn}{}{}{} \SetAlgoInsideSkip{medskip} - \Fn {$\underline{reach (X, \alpha)} \smallskip$} { - \Return $\{ (p, p', \epsilon, t) \mid$ \; - $\qquad (\Xund, p, \Xund, t) \in X \wedge - (p, \alpha, \epsilon, p') \in \Delta^\Sigma \}$ - } - \end{algorithm} - - -\iffalse - \begin{algorithm}[H] \DontPrintSemicolon \SetKwProg{Fn}{}{}{} \SetAlgoInsideSkip{medskip} - \Fn {$\underline{closure(k, X, B, D)} \smallskip$} { - - $Y = \emptyset$ \; - \For {$x \in X$} { - Y = $rclosure (x, Y, B, D)$ + \Fn {$\underline{scan(q, result, topsort)} \smallskip$} { + \ForEach {outgoing arc $(q, \epsilon, \chi, p) \in \Delta$} { + $x = result(q) \chi$ \; + $relax(p, x, result, topsort)$ \; } - - \Return $Y$ \; } \end{algorithm} \begin{algorithm}[H] \DontPrintSemicolon \SetKwProg{Fn}{}{}{} \SetAlgoInsideSkip{medskip} - \Fn {$\underline{rclosure(x, Y, B, D)} \smallskip$} { - $(q, p, u, t) = x$ \; - \If {unmarked $p$} { - mark $p$ \; - - \If {$p \in F \vee \exists (p, \alpha, \Xund, \Xund) \in \Delta^\Sigma$} { - \If {$\nexists y = (\Xund, p, \Xund, \Xund) \in Y$} { - $Y = Y \cup \{ x \}$ - } \ElseIf { $less (x, y, B, D)$} { - $Y = Y \cup \{ x \} \setminus \{ y \}$ + \Fn {$\underline{relax(q, x, result, topsort, B, D)} \smallskip$} { + $(\Xund, \Xund, l) = precedence (x, result(q), B, D)$ \; + \If {$l = -1$} { + $result(q) = x$ \; + \If {$status(q) \neq \mathit{TOPSORT}$} { + $push(topsort, q)$ \; + $status(q) = \mathit{NEWPASS}$ \; } } - - \BlankLine - \ForEach {$(p, \epsilon, w, p') \in \Delta^\epsilon$} { - Y = $rclosure ((q, p', u w, t), Y, B, D)$ \; - } - - unmark $p$ \; - } - \Return $Y$ \; } \end{algorithm} - \begin{algorithm}[H] \DontPrintSemicolon \SetKwProg{Fn}{}{}{} - \Fn {$\underline {less (x, y, B, D)} \smallskip$} { - $(\Xund, \Xund, l) = precedence (x, y, B, D)$ \; - \Return $l$ \; - } - \end{algorithm} -\fi +\end{multicols} +\begin{center} +\caption{GOR1.} +\end{center} +\end{figure*} - \begin{algorithm}[H] \DontPrintSemicolon \SetKwProg{Fn}{}{}{} \SetAlgoInsideSkip{medskip} - \Fn {$\underline{step(k, X, B, D)} \smallskip$} { - let $\{ x_i \}_{i=1}^{n} = \{(q_i, p_i, u_i, t_i) \}_{i=1}^{n} = X$ +\clearpage +\pagebreak - \BlankLine - \tcc {update tag values in $X$} - \For {$i = \overline {1, n}$} { - $b_1 \dots b_m = u_i$ \; - \lFor {$j = \overline{1, m}$} { - $t_i[b_j] = k$ - } - } +\section{GOR1 correcness proof} - \BlankLine - \tcc {update $B$ and $D$ matrices} - \For {$i = \overline {1, n}$} { - \For {$j = \overline {1, n}$} { - $(h_1, h_2, l) = precedence (x_i, x_j, B, D)$ \; - $B' [p_i] [p_j] = h_1, \; D' [p_i] [p_j] = l$ \; - $B' [p_j] [p_i] = h_2, \; D' [p_j] [p_i] = -l$ \; - } - } +For a given RT $r$, +we say that PE $\alpha$ is \emph{minimal} if $\alpha = PE(t)$ for some minimal $t \in PT(r)$, +and we say that path $\pi$ in TNFA $F(r)$ is \emph{minimal} if $\pi$ induces a minimal PE. - \BlankLine - \Return $(X, B', D')$ \; - } - \end{algorithm} +GOR1 correctness proof consists of two parts. +First, we get rid of $\epsilon$-loops by showing that, +on one hand, minmal paths do not contain $\epsilon$-loops, +and on the other hand, GOR1 cancels all paths which contain $\epsilon$-loops. +Second, for paths without $\epsilon$-loops we show right distributivity of path comparison over path concatenation. +The proofs make use of the TNFA nested structure +and the fact that each sub-TNFA is has a unique entry and exit states. +TNFA construct for all possible types of RT are shown on the fugure below. - \columnbreak - \begin{algorithm}[H] \DontPrintSemicolon \SetKwProg{Fn}{}{}{} - \Fn {$\underline {precedence (x_1, x_2, B, D)} \smallskip$} { - $(q_1, \Xund, a_1 \dots a_n, \Xund) = x_1$ \; - $(q_2, \Xund, b_1 \dots b_m, \Xund) = x_2$ \; - $k = 1, \; h_1 = h_2 = \infty, \; l = 0$ \; +\begin{figure}\label{fig_gor1} +\includegraphics[width=\linewidth]{img/gor1.pdf} +\caption{ +Sub-TNFA for individual sub-RT with submatch groups: \\ +(a) -- union, (b) -- product, (c), (d) -- bounded repetition, (e), (f) -- unbounded repetition. +} +\end{figure} - \BlankLine - \tcc {if fork frame, find fork} - \If { $q_1 = q_2$ } { - \While {$k < min (n, m)$ and $a_k = b_k$} { - $k = k + 1$ - } - } + \begin{XLem}\label{gor1_path_containment} + Let $r$ be a RE, $\pi = q_1 \overset {\alpha} {\rightsquigarrow} q_2$ a tagged path in TNFA $F(r)$, + where $\alpha \neq \epsilon$, + and $h = minh (\alpha)$ the minimal tag height on path $\pi$. + The following statements are true: + \begin{enumerate} + \item There is a position $p$ of length $|p| = h$ + such that $\pi$ fully lies inside of subautomaton $F(r|_p)$. - \BlankLine - \tcc {longest-precedence} - \If { $q_1 = q_2$ } { - $h_1 = h_2 = height (a_{k-1})$ - } \ElseIf {$k > 1$} { - $h_1 = B [q_1] [q_2], \; h_2 = B [q_2] [q_1]$ - } - \lFor {$i = \overline{k, n}$} { - $h_1 = min (h_1, height (a_i))$ - } - \lFor {$i = \overline{k, m}$} { - $h_2 = min (h_2, height (b_i))$ - } - \lIf {$h_1 > h_2$} {\Return $(h_1, h_2, -1)$} - \lIf {$h_1 < h_2$} {\Return $(h_1, h_2, 1)$} + \item There no position $p$ of length $|p| > h$ + such that $\pi$ fully lies inside of subautomaton $F(r|_p)$. + \end{enumerate} + Proof. + Obvious from TNFA construction. + $\square$ + \end{XLem} - \BlankLine - \tcc {leftmost-precedence} - \If { $q_1 = q_2$ } { - \lIf {$k = n = m$} { $l = 0$ } - \lElseIf {$k = n$} { $l = -1$ } - \lElseIf {$k = m$} { $l = 1$ } - \lElseIf {$a_k mod 2 \equiv 0$} { $l = -1$ } - \lElseIf {$b_k mod 2 \equiv 0$} { $l = 1$ } - \lElseIf {$a_k > b_k$} { $l = -1$ } - \lElseIf {$a_k < b_k$} { $l = 1$ } - } \Else { - $l = D [q_1] [q_2]$ - } - \Return $(h_1, h_2, l)$ - } - \end{algorithm} -\end{multicols} -\begin{center} -\caption{Matching algorithm.} -\end{center} -\end{figure*} -\begin{figure*} -\begin{multicols}{2} + \begin{XLem}\label{gor1_minpaths} + Minimal paths do not contain tagged $\epsilon$-loops. + \\ + Proof. - \begin{algorithm}[H] \DontPrintSemicolon \SetKwProg{Fn}{}{}{} \SetAlgoInsideSkip{medskip} - \Fn {$\underline{closure \Xund goldberg \Xund radzik(X, F, \Delta)} \smallskip$} { - empty stacks $topsort$, $newpass$ \; - $result(q) \equiv \bot$ \; - $status(q) \equiv \mathit{OFFSTACK}$ \; - \For {$(q, x) \in X$} { - $relax(q, x, result, topsort)$ \; - } - \While {$topsort$ is not empty} { - \While {$topsort$ is not empty} { - $q = pop(topsort)$ \; + Suppose, on the contrary, that $\pi$ is a minimal path in TNFA $F(r)$, and that $\pi$ contains at least one $\epsilon$-loop. + Consider the \emph{last} $\epsilon$-loop in $\pi$: + it can only come from sub-TNFA of the form $F\big( (i, \Xund, (i_1, \Xund, r_1)^{n,\infty}) \big)$ where $n \geq 0$, + as this is the only looping TNFA construct. + Let $w_n$ be the final state of sub-TNFA $F\big( (i_1, \Xund, r_1) \big)$ + as shown on figure \ref{fig_gor1} (e) -- (f). + Then $\pi$ can be represented as + $\pi = \pi_1 \pi_2 \pi_3$, where $\pi_2$ is the $\epsilon$-loop: + $\pi_1 = q_0 \overset {u | \alpha} {\rightsquigarrow} w_n$ and + $\pi_2 = w_n \overset {\epsilon | \beta} {\rightsquigarrow} w_n$ and + $\pi_3 = w_n \overset {v | \gamma} {\rightsquigarrow} q_f$. + Consider path $\pi' = \pi_1 \pi_3$ that is obtained from $\pi$ by removing $\pi_2$. + It consumes the same input string $uv$ as $\pi$, + therefore PE transduced by $\pi$ and $\pi'$ are comparable: $\alpha \beta \gamma, \alpha \gamma \in PE(r, uv)$. + Let $j$ be the total number of repetitions through $F\big( (i_1, \Xund, r_1) \big)$, + and let $i$ be the index of the $\epsilon$-loop repetition. - \If {$status(q) = \mathit{TOPSORT}$} { - $push(newpass, n)$ \; - } \ElseIf {$status(q) = \mathit{NEWPASS}$} { - $status(q) = \mathit{TOPSORT}$ \; - $push(topsort, q)$ \; - $scan(q, result, topsort)$ \; - } - } - \While {$newpass$ is not empty} { - $q = pop(newpass)$ \; - $scan(q, result, topsort)$ \; - $status(q) = \mathit{OFFSTACK}$ \; - } - } - \Return $\big\{ (q, x) \mid x = result(q) \; \wedge$ \; - $\hspace{6em} \big(q \in F \vee \exists (p, \alpha, \Xund, \Xund) \in \Delta^\Sigma \big) \big\}$ \; - } - \end{algorithm} + First case: $i = j$. + In this case fork of $\alpha \beta \gamma$ and $\alpha \gamma$ happens immediately after $(i-1)$-th repetition: + % + \begin{alignat*}{10} + \alpha \beta \gamma &= x_0 \Xl_{h-1} \;&&\; \Xl_h x_1 \Xr_h \hdots \Xl_h x_{i-1} \Xr_h \;&&\big|\; \Xl_h x_{i} \Xr_h \;&&\; \Xr_{h-1} x_{j+1} \\[-0.5em] + \alpha \gamma &= x_0 \Xl_{h-1} \;&&\; \Xl_h x_1 \Xr_h \hdots \Xl_h x_{i-1} \Xr_h \;&&\big|\; \;&&\; \Xr_{h-1} x_{j+1} + \end{alignat*} + % + It must be $\alpha \beta \gamma \sim \alpha \gamma$, + because $minh (\beta) = h > h - 1 = minh (first (\gamma))$, + so at the fork frame $k$ we have $\rho_k = \rho'_k \leq h - 1$. + % + Furthermore, it must be $\alpha \gamma \subset \alpha \beta \gamma$, + because $first (\gamma) = \Xr < \Xl = first (\beta)$. - \columnbreak + Second case: $i < j$. + In this case $(i + 1)$-th repetition cannot be an $\epsilon$-loop + (because we assumed that $i$-th repetition is the \emph{last} $\epsilon$-loop), + therefore + fork of $\alpha \beta \gamma$ and $\alpha \gamma$ happens + inside of $i$-th repetition of $\alpha \beta \gamma$ + and $(i + 1)$-th repetition of $\alpha \gamma$: + % + \begin{alignat*}{10} + \alpha \beta \gamma &= x_0 \Xl_{h-1} \;&&\; \Xl_h x_1 \Xr_h \hdots \Xl_h x_{i-1} \Xr_h \Xl_h y_1 \;&&\big|\; y_2 \Xr_h \Xl_h x_{i+1} \Xr_h && \Xl_h x_{i+2} \Xr_h \hdots \Xl_h x_j \Xr_h \;&&\; \Xr_{h-1} x_{j+1} \\[-0.5em] + \alpha \gamma &= x_0 \Xl_{h-1} \;&&\; \Xl_h x_1 \Xr_h \hdots \Xl_h x_{i-1} \Xr_h \Xl_h y_1 \;&&\big|\; y_3 \Xr_h && \Xl_h x_{i+2} \Xr_h \hdots \Xl_h x_j \Xr_h \;&&\; \Xr_{h-1} x_{j+1} + \end{alignat*} + % + In this case + fork frame of $\alpha \beta \gamma$ contains $y_2 \Xr_h \Xl_h$ fragment, because $y_2$ is part of the $\epsilon$-loop. + But the fork frame of $\alpha \gamma$ ends inside of $y_3$, because $(i+1)$-th repetiton is not an $\epsilon$-loop and must contain alphabet symbols. + Therefore at the fork frame $k$ we have $\rho_k = h$ and $\rho'_k > h$. + All subsequent frames $l > k$ are identical, + so either $\rho_l = \rho'_l = h'$ (if $l$-th frame contains parentheses of height $h' \leq h$), + or else $\rho_l = \rho_k$ and $\rho'_l = \rho'_k$. + Consequently $\alpha \gamma \sqsubset \alpha \beta \gamma$. - \begin{algorithm}[H] \DontPrintSemicolon \SetKwProg{Fn}{}{}{} \SetAlgoInsideSkip{medskip} - \Fn {$\underline{scan(q, result, topsort)} \smallskip$} { - \ForEach {outgoing arc $(q, \epsilon, \chi, p) \in \Delta$} { - $x = result(q) \chi$ \; - $relax(p, x, result, topsort)$ \; - } - } - \end{algorithm} + In both cases $\alpha \gamma < \alpha \beta \gamma$, + which contradicts the fact that $\pi$ is a minimal path. + $\square$ + \end{XLem} - \begin{algorithm}[H] \DontPrintSemicolon \SetKwProg{Fn}{}{}{} \SetAlgoInsideSkip{medskip} - \Fn {$\underline{relax(q, x, result, topsort, B, D)} \smallskip$} { - $(\Xund, \Xund, l) = precedence (x, result(q), B, D)$ \; - \If {$l = -1$} { - $result(q) = x$ \; - \If {$status(q) \neq \mathit{TOPSORT}$} { - $push(topsort, q)$ \; - $status(q) = \mathit{NEWPASS}$ \; - } - } - } - \end{algorithm} -\end{multicols} -\begin{center} -\caption{GOR1.} -\end{center} -\end{figure*} + \begin{XLem}\label{gor1_loops} + GOR1 discards paths with tagged $\epsilon$-loops. + \\ + Proof. + GOR1 finds non-looping paths before their looping counterparts, + as it uses depth-first search to explore new paths and prunes ambiguous paths + immediately after exploring transitions to the join state. + So for each TNFA state, the first path to be found is a path without $\epsilon$-loops. + We will show that once GOR1 has found a path without $\epsilon$-loops, + it will never prefer a path with an $\epsilon$-loop + (though of course it might prefer some other path without $\epsilon$-loops). -\clearpage + The only TNFA construct that has a loop is unbounded repetition + $F\big( (i, \Xund, (i_1, \Xund, r_1)^{n,\infty}) \big)$ where $n \geq 0$, + shown on figure \ref{fig_gor1} (e) -- (f). + Consider arbitrary path $\pi$ that contains + $\epsilon$-loop through sub-TNFA $F\big( (i_1, \Xund, r_1) \big)$. + % + Let $q_1$ be the first state on $\pi$ that belongs to the $\epsilon$-loop. + % + Path $\pi$ can be represented as $\pi = \pi_1 \pi_2 \pi_3$, where + $\pi_1 = q_0 \overset {u | \alpha} {\rightsquigarrow} q_1$ and + $\pi_2 = q_1 \overset {\epsilon | \beta} {\rightsquigarrow} q_1$ and + $\pi_3 = q_1 \overset {v | \gamma} {\rightsquigarrow} q_f$. + % + By the time GOR1 finds path $\pi_1 \pi_2$, + it must have already found some other path $\pi'_1 = q_0 \overset {u | \alpha'} {\rightsquigarrow} q_1$ without $\epsilon$-loops. + There are two possible cases: either $\alpha' = \alpha$, or $\alpha' < \alpha$. + We will show that in both cases $\alpha' < \alpha \gamma$ + and consequently, GOR1 prefers the path without the $\epsilon$-loop. + Let $k$ be the index of the last frame + and $\big( (\rho_1, \hdots, \rho_k), (\rho'_1, \hdots, \rho'_k) \big) = traces (\alpha', \alpha \gamma)$. + + First case: $\alpha' = \alpha$. + Because $\alpha$ is a proper prefix of $\alpha \gamma$, + fork happens at the last frame and we have + $\rho_k = lasth(\alpha)$ and + $\rho'_k = min (lasth(\alpha), minh(\gamma))$. + If $lasth(\alpha) > minh(\gamma)$, then $\rho_k > \rho'_k$ and $\alpha \sqsubset \alpha \gamma$. + Otherwise $\rho_k = \rho'_k$ and $\alpha \sim \alpha \gamma$, + and we have $first(\alpha \backslash \alpha \gamma) = \bot$ and $first(\alpha \gamma \backslash \alpha) \neq \bot$, + therefore $\alpha \subset \alpha \gamma$. + In both cases $\alpha < \alpha \gamma$. + + Second case: $\alpha' < \alpha$. + Let $\big( (\sigma_1, \hdots, \sigma_k), (\sigma'_1, \hdots, \sigma'_k) \big) = traces (\alpha', \alpha)$. + We have $\rho_k = \sigma_k$ and $\rho'_k = min (\sigma'_k, minh(\gamma)) \leq \sigma_k$. + If $minh(\gamma) < \sigma'_k$ then $\rho_k > \rho'_k$ and $\alpha' \sqsubset \alpha \gamma$. + Otherwise $\rho'_k = \sigma'_k$. + If $\alpha' \sqsubset \alpha$ then $\alpha' \sqsubset \alpha \gamma$. + Otherwise $\alpha' \sim \alpha$ and $\alpha' \subset \alpha$. + None of $\alpha$ and $\alpha'$ is a proper prefix of the other, + because otherwise the longer path has an $\epsilon$-loop through $q_1$, which contradicts our assumption about $\pi_1$ and $\pi'_1$. + Therefore $first (\alpha' \backslash \alpha) = first (\alpha' \backslash \alpha \gamma)$ + and $first (\alpha \backslash \alpha') = first (\alpha \gamma \backslash \alpha')$. + Consequently $\alpha' \subset \alpha \implies \alpha' \subset \alpha \gamma$. + Thus $\alpha' < \alpha \gamma$. + $\square$ + \end{XLem} + + + + + + \begin{XLem} \emph{(Right distributivity of path comparison over path concatenation for $\epsilon$-loop free paths.)} + Let + $\pi_\alpha = q_0 \overset {u | \alpha} {\rightsquigarrow} q_1$ and + $\pi_\beta = q_0 \overset {u | \beta} {\rightsquigarrow} q_1$ + be ambiguous paths in TNFA $F(r)$ + and $\pi_\gamma = q_1 \overset {\epsilon | \gamma} {\rightsquigarrow} q_2$ + their common $\epsilon$-suffix, + such that paths $\pi_\alpha$, $\pi_\beta$ + and the extended paths $\pi_\alpha \pi_\gamma$, $\pi_\beta \pi_\gamma$ + are all free from $\epsilon$-loops. + Then $\alpha < \beta \implies \alpha \gamma < \beta \gamma$. + \\ + Proof. + + Let $k = |u|$ be the number of frames in $\alpha$ and $\beta$. + Let + $\big( (\rho_1, \hdots, \rho_k),$ $(\rho'_1, \hdots, \rho'_k) \big) = traces (\alpha, \beta)$ and + $\big( (\sigma_1, \hdots, \sigma_k),$ $(\sigma'_1, \hdots, \sigma'_k) \big) = traces (\alpha \gamma, \beta \gamma)$. + Obviously for frames $i < k$ we have $\rho_i = \sigma_i$ and $\rho'_i = \sigma'_i$, + and for the last frame $k$ we have + $\sigma_k = min (\rho_k, minh (\gamma))$ and + $\sigma'_k = min (\rho'_k, minh (\gamma))$. + Consider two possible cases. + + First case: $\alpha \sim \beta \wedge \alpha \subset \beta$. + % + We show that $\alpha \gamma \sim \beta \gamma \wedge \alpha \gamma \subset \beta \gamma$. + % + We have $\rho_i = \rho'_i \; \forall i$, therefore + $\sigma_i = \sigma'_i \; \forall i$ and consequently $\alpha \gamma \sim \beta \gamma$. + Let + $x = first (\alpha \backslash \beta)$, + $y = first (\beta \backslash \alpha)$, + $x' = first (\alpha \backslash \beta \gamma)$ and + $y' = first (\beta \backslash \alpha \gamma)$. + None of $\pi_\alpha$ and $\pi_\beta$ is a proper prefix of another, + otherwise the longer path must contain $\epsilon$-loop through $q_1$ + (because $\alpha$ and $\beta$ have the same number of frames). + Consequently $x = x'$ and $y = y'$, and we have + $\alpha \subset \beta$ + $\implies$ + $x < y$ + $\implies$ + $x' < y'$ + $\implies$ + $\alpha \gamma \subset \beta \gamma$. + + Second case: $\alpha \sqsubset \beta$. + We show that $\alpha \gamma \sqsubset \beta \gamma$. + % + If $\rho_k = \rho'_k$ then $\sigma_k = \sigma'_k$ + and obviously $\alpha \gamma \sqsubset \beta \gamma$. + Else it must be $\rho_k > \rho'_k$. + In this case, if $minh (\gamma) > \rho'_k$, then $\sigma_k > \sigma'_k$ and again $\alpha \gamma \sqsubset \beta \gamma$. + Else $minh (\gamma) \leq \rho'_k$ and $\sigma_k = \sigma'_k$. + In this case, if $k > 1$ and $\rho_{k-1} > \rho'_{k-1}$ then again $\alpha \gamma \sqsubset \beta \gamma$. + % + In other words, the only possible case when $\gamma$ can change comparison result is + when at the last frame we have $\rho_k > \rho'_k$, + the appended suffix $\gamma$ contains parentheses with low height $minh (\gamma) \leq \rho'_k$ + (so that $\sigma_k = \sigma'_k$), + and the previous frame doesn't exist + or compares differently from the last frame: $k = 1$ or $\rho_{k-1} \leq \rho'_{k-1}$. + We show that in this case the extended path $\pi_\beta \pi_\gamma$ must contain $\epsilon$-loop, + which contradicts to the lemma condiitons. + + Consider the fragments of paths $\pi_\alpha$ and $\pi_\beta$ from fork to join, + including (if it exists) the $\epsilon$-transition to the fork state: + $\pi_\alpha' = q_2 \overset {u | \alpha'} {\rightsquigarrow} q_1$ and + $\pi_\beta' = q_2 \overset {u | \beta'} {\rightsquigarrow} q_1$. + We know that $minh (\alpha') = \rho_k$. +% (because $\rho_k$ is set to the minimal parenthesis height on the path from fork to join). + Therefore by lemma \ref{gor1_path_containment} + we know that $\pi_\alpha'$ is contained in a subautomaton $f$ of height $\rho_k$. + Likewise we know that $\pi_\beta'$ is not contained in $f$, because $minh (\beta') = \rho'_k < \rho_k$. + % + Let $\pi_\beta''$ be the part of $\pi_\beta'$ containing the last $k$-th frame, + and note the following: + \begin{enumerate} + \item[(a)] the start state of $\pi_\beta''$ must be contained in $f$ + (because by our assumption + either $k = 1$ and then start state of $\pi_\beta''$ is the fork state, + or $\rho_{k-1} \leq \rho'_{k-1}$ which implies $\rho'_{k-1} \geq \rho_k$ + and then all but the last frames of $\pi_\beta'$ must be contained in $f$) + \item[(b)] $\pi$ cannot be contained in $f$ + (because by our assumption $\rho_k > \rho'_k$) + \item[(c)] the end state of $\pi_\beta''$ is contained in $f$ + (because it's the join state $q_1$ of $\pi_\alpha'$ and $\pi_\beta'$) + \item[(d)] $\pi_\gamma$ is not contained in $f$ + (because by our assumption $minh (\gamma) \leq \rho'_k$ and consequently $minh (\gamma) < \rho_k$) + \end{enumerate} + % + Put together, items (a) - (d) mean that the $\epsilon$-path $\pi_\beta'' \pi_\gamma$ + first leaves subautomaton $f$, then re-enters $f$, and then leaves $f$ second time. + Because $f$ has a unique exit state, this means that $\pi_\beta'' \pi_\gamma$ contains $\epsilon$-loop + through the exit state of $f$, which contradicts lemma conditions. + (Effectively it means that $\pi_\beta \pi_\gamma$ is non-minimal and would be discarded by GOR1 anyway.) +% Note that $\pi_\beta'$ itself does not necessarily contain $\epsilon$-loop. + % + $\square$ + \end{XLem} + + + + +\section{Matching algorithm} + +The final algorithm, complexity estimates. + +\section{Conclusions} + +\FloatBarrier \pagebreak -\section{GOR1 correcness proof} +\section*{Appendix} + +\subsection*{Proof of Theorem \ref{theorem_order_on_IPTs}} + + \begin{XLem}\label{lemma_ptorder_antisymmetry} + The order on IPTs is antisymmetric: if $t < s$, then $s \not< t$. + \\ + Proof. + + Suppose, on the contrary, that $t <_p s$ and $s <_q t$ for some $p$, $q$. + Without loss of generality let $p \leq q$. + On one hand $t <_p s$ implies $\|t\|_p > \|s\|_p$. + On the other hand $s <_q t$ implies $\|t\|_p \leq \|s\|_p$. + Contradicting the assumption. + $\square$ + \end{XLem} + + \begin{XLem}\label{lemma_ptorder_transitivity} + The order on IPTs is transitive: if $t < s$ and $s < u$, then $t < u$. + \\ + Proof. -For a given RT $r$, -we say that PE $\alpha$ is \emph{minimal} if $\alpha = PE(t)$ for some minimal $t \in PT(r)$, -and we say that path $\pi$ in TNFA $F(r)$ is \emph{minimal} if $\pi$ induces a minimal PE. + Let $t <_p s$ and $s <_q u$ for some positions $p$, $q$, and let $r = min (p, q)$. + \\[-1em] -GOR1 correctness proof consists of two parts. -First, we get rid of $\epsilon$-loops by showing that, -on one hand, minmal paths do not contain $\epsilon$-loops, -and on the other hand, GOR1 cancels all paths which contain $\epsilon$-loops. -Second, for paths without $\epsilon$-loops we show right distributivity of path comparison over path concatenation. -The proofs make use of the TNFA nested structure -and the fact that each sub-TNFA is has a unique entry and exit states. -TNFA construct for all possible types of RT are shown on the fugure below. + First, we show that $\|t\|_r > \|u\|_r$. + If $p \leq q$, we have $\|t\|_p > \|s\|_p$ (implied by $t <_p s$) + and $\|s\|_p \geq \|u\|_p$ (implied by conjunction $s <_q u \wedge p \leq q$), + therefore $\|t\|_p > \|u\|_p$. + Otherwise $p > q$, we have $\|s\|_q > \|u\|_q$ (implied by $s <_q u$) + and $\|t\|_q = \|s\|_q$ (implied by conjunction $t <_p s \wedge q < p$), + therefore $\|t\|_q > \|u\|_q$. + \\[-1em] + Second, we show that $\forall r' < r$ it holds that $\|t\|_{r'} = \|u\|_{r'}$. + We have $\|t\|_{r'} = \|s\|_{r'}$ (implied by conjunction $t <_p s \wedge r' < p$) + and $\|s\|_{r'} = \|u\|_{r'}$ (implied by conjunction $s <_q u \wedge r' < q$), + therefore $\|t\|_{r'} = \|u\|_{r'}$. + $\square$ + \end{XLem} + \begin{XLem}\label{incomparability_equivdef} + $t \sim s \Leftrightarrow \; \forall p : \|t\|_p = \|s\|_p$. + \\ + Proof. -\begin{figure}\label{fig_gor1} -\includegraphics[width=\linewidth]{img/gor1.pdf} -\caption{ -Sub-TNFA for individual sub-RT with submatch groups: \\ -(a) -- union, (b) -- product, (c), (d) -- bounded repetition, (e), (f) -- unbounded repetition. -} -\end{figure} + $\Rightarrow$. %First, we show $t \sim s \Rightarrow \forall p : \|t\|_p = \|s\|_p$. + Suppose, on the contrary, that $\exists p = min \{ q \mid \|t\|_q \neq \|s\|_q \}$, + then either $t <_p s$ (if $\|t\|_p > \|s\|_p$), or $s <_p t$ (if $\|t\|_p < \|s\|_p$). + Both cases contradict $t \sim s$. + \\[-1em] - \begin{XLem}\label{gor1_path_containment} - Let $r$ be a RE, $\pi = q_1 \overset {\alpha} {\rightsquigarrow} q_2$ a tagged path in TNFA $F(r)$, - where $\alpha \neq \epsilon$, - and $h = minht (\alpha)$ the minimal tag height on path $\pi$. - The following statements are true: - \begin{enumerate} - \item There is a position $p$ of length $|p| = h$ - such that $\pi$ fully lies inside of subautomaton $F(r|_p)$. + $\Leftarrow$. + $\forall p : \|t\|_p = \|s\|_p$ implies + $\nexists p : t <_p s$ and $\nexists q : s <_q t$, + which implies $t \sim s$. + $\square$ + \end{XLem} - \item There no position $p$ of length $|p| > h$ - such that $\pi$ fully lies inside of subautomaton $F(r|_p)$. - \end{enumerate} + \begin{XLem}\label{lemma_ptorder_transitivity_of_incomparability} + Incomparability relation on parse trees is transitive: if $t \sim s$ and $s \sim u$, then $t \sim u$. + \\ Proof. - Obvious from TNFA construction. + + By lemma \ref{incomparability_equivdef} we have + $t \sim s \Rightarrow \forall p : \|t\|_p = \|s\|_p$ and + $s \sim u \Rightarrow \forall p : \|s\|_p = \|u\|_p$, + therefore by lemma \ref{incomparability_equivdef} $\forall p : \|t\|_p = \|u\|_p \Rightarrow t \sim u$. $\square$ \end{XLem} +The proof of theorem \ref{theorem_order_on_IPTs} +follows from +the lemma \ref{lemma_ptorder_antisymmetry}, +the lemma \ref{lemma_ptorder_transitivity} and +the lemma \ref{lemma_ptorder_transitivity_of_incomparability}. + +\subsection*{Proof of Theorem \ref{theorem_order_compat}} +First, we prove an auxilary lemma \ref{lemma_subtrees} which +shows that comparison of sub-IPT is justified +if the s-norms at all preceding submatch positions are equal. - \begin{XLem}\label{gor1_minpaths} - Minimal paths do not contain tagged $\epsilon$-loops. + \begin{XLem}\label{lemma_subtrees} + If $t, s \in \IPT(r, w)$ and $\exists p \in Sub(t) \cup Sub(s)$ such that $\snorm{t}{q} = \snorm{s}{q} \; \forall q \leq p$, + then $\exists \widetilde{r}, \widetilde{w} : t|_p, s|_p \in \IPT(\widetilde{r}, \widetilde{w})$. \\ Proof. + By induction on position $p$. + \\[-1em] - Suppose, on the contrary, that $\pi$ is a minimal path in TNFA $F(r)$, and that $\pi$ contains at least one $\epsilon$-loop. - Consider the \emph{last} $\epsilon$-loop in $\pi$: - it can only come from sub-TNFA of the form $F\big( (i, \Xund, (i_1, \Xund, r_1)^{n,\infty}) \big)$ where $n \geq 0$, - as this is the only looping TNFA construct. - Let $w_n$ be the final state of sub-TNFA $F\big( (i_1, \Xund, r_1) \big)$ - as shown on figure \ref{fig_gor1} (e) -- (f). - Then $\pi$ can be represented as - $\pi = \pi_1 \pi_2 \pi_3$, where $\pi_2$ is the $\epsilon$-loop: - $\pi_1 = q_0 \overset {u | \alpha} {\rightsquigarrow} w_n$ and - $\pi_2 = w_n \overset {\epsilon | \beta} {\rightsquigarrow} w_n$ and - $\pi_3 = w_n \overset {v | \gamma} {\rightsquigarrow} q_f$. - Consider path $\pi' = \pi_1 \pi_3$ that is obtained from $\pi$ by removing $\pi_2$. - It consumes the same input string $uv$ as $\pi$, - therefore PE transduced by $\pi$ and $\pi'$ are comparable: $\alpha \beta \gamma, \alpha \gamma \in PE(r, uv)$. - Let $j$ be the total number of repetitions through $F\big( (i_1, \Xund, r_1) \big)$, - and let $i$ be the index of the $\epsilon$-loop repetition. + Induction basis: the case of $p = \Lambda$ is trivial: let $\widetilde{r} = r$, $\widetilde{w} = w$. + \\[-1em] - First case: $i = j$. - In this case fork of $\alpha \beta \gamma$ and $\alpha \gamma$ happens immediately after $(i-1)$-th repetition: - % - \begin{alignat*}{10} - \alpha \beta \gamma &= x_0 \Xl_{h-1} \;&&\; \Xl_h x_1 \Xr_h \hdots \Xl_h x_{i-1} \Xr_h \;&&\big|\; \Xl_h x_{i} \Xr_h \;&&\; \Xr_{h-1} x_{j+1} \\[-0.5em] - \alpha \gamma &= x_0 \Xl_{h-1} \;&&\; \Xl_h x_1 \Xr_h \hdots \Xl_h x_{i-1} \Xr_h \;&&\big|\; \;&&\; \Xr_{h-1} x_{j+1} - \end{alignat*} - % - It must be $\alpha \beta \gamma \sim \alpha \gamma$, - because $minht (\beta) = h > h - 1 = minht (first (\gamma))$, - so at the fork frame $k$ we have $\rho_k = \rho'_k \leq h - 1$. - % - Furthermore, it must be $\alpha \gamma \subset \alpha \beta \gamma$, - because $first (\gamma) = \Xr < \Xl = first (\beta)$. + Induction step: we have $|p| > 0$, let $p = p'.i$, where $i \in \YN$. + Let $t' = t|_{p'} = T(t_1, \dots, t_n)$, + $s' = s|_{p'} = T(s_1, \dots, s_m)$. + By induction hypothesis $\exists r', w' : t', s' \in \IPT(r', w')$, + where $w' = str(t_1) \dots str(t_n) = str(s_1) \dots str(s_m)$. + \\[-1em] - Second case: $i < j$. - In this case $(i + 1)$-th repetition cannot be an $\epsilon$-loop - (because we assumed that $i$-th repetition is the \emph{last} $\epsilon$-loop), - therefore - fork of $\alpha \beta \gamma$ and $\alpha \gamma$ happens - inside of $i$-th repetition of $\alpha \beta \gamma$ - and $(i + 1)$-th repetition of $\alpha \gamma$: - % - \begin{alignat*}{10} - \alpha \beta \gamma &= x_0 \Xl_{h-1} \;&&\; \Xl_h x_1 \Xr_h \hdots \Xl_h x_{i-1} \Xr_h \Xl_h y_1 \;&&\big|\; y_2 \Xr_h \Xl_h x_{i+1} \Xr_h && \Xl_h x_{i+2} \Xr_h \hdots \Xl_h x_j \Xr_h \;&&\; \Xr_{h-1} x_{j+1} \\[-0.5em] - \alpha \gamma &= x_0 \Xl_{h-1} \;&&\; \Xl_h x_1 \Xr_h \hdots \Xl_h x_{i-1} \Xr_h \Xl_h y_1 \;&&\big|\; y_3 \Xr_h && \Xl_h x_{i+2} \Xr_h \hdots \Xl_h x_j \Xr_h \;&&\; \Xr_{h-1} x_{j+1} - \end{alignat*} - % - In this case - fork frame of $\alpha \beta \gamma$ contains $y_2 \Xr_h \Xl_h$ fragment, because $y_2$ is part of the $\epsilon$-loop. - But the fork frame of $\alpha \gamma$ ends inside of $y_3$, because $(i+1)$-th repetiton is not an $\epsilon$-loop and must contain alphabet symbols. - Therefore at the fork frame $k$ we have $\rho_k = h$ and $\rho'_k > h$. - All subsequent frames $l > k$ are identical, - so either $\rho_l = \rho'_l = h'$ (if $l$-th frame contains parentheses of height $h' \leq h$), - or else $\rho_l = \rho_k$ and $\rho'_l = \rho'_k$. - Consequently $\alpha \gamma \sqsubset \alpha \beta \gamma$. + Next, we show that $str(t_i) = str(s_i)$. + It must be that $i \in Sub(s') \cap Sub(t')$, + otherwise only one of $\|t'\|_i$, $\|s'\|_i$ is $\infty$, + which contradicts lemma condiiton $\|t\|_p = \|s\|_p$. + Consider position $j \leq i$. + Because the set of submatch positions contains siblings, we have $j \in Sub(s') \cap Sub(t')$. + Consequently, $\|t'\|_j = |str(t_j)|$ and $\|s\|_j = |str(s_j)|$. + By lemma condition we have $\|t\|_{p'.j} = \|s\|_{p'.j}$, + therefore $\|t'\|_j = \|s'\|_j$, + therefore $|str(t_j)| = |str(s_j)|$. + Because $str(t_1) \dots str(t_n) = str(s_1) \dots str(s_m)$, + we have $str(t_i) = str(s_i)$. + \\[-1em] - In both cases $\alpha \gamma < \alpha \beta \gamma$, - which contradicts the fact that $\pi$ is a minimal path. + Now, let $\widetilde{w} = str(t_i)$. + If $r' = r_1|r_2$ or $r' = r_1 r_2$, let $\widetilde{r} = r_i$. + Otherwise, $r' = r_1^{k,l}$, let $\widetilde{r} = r_1$. $\square$ \end{XLem} + Proof of theorem \ref{theorem_order_compat}. + \\[-1em] - \begin{XLem}\label{gor1_loops} - GOR1 discards paths with tagged $\epsilon$-loops. - \\ - Proof. - - GOR1 finds non-looping paths before their looping counterparts, - as it uses depth-first search to explore new paths and prunes ambiguous paths - immediately after exploring transitions to the join state. - So for each TNFA state, the first path to be found is a path without $\epsilon$-loops. - We will show that once GOR1 has found a path without $\epsilon$-loops, - it will never prefer a path with an $\epsilon$-loop - (though of course it might prefer some other path without $\epsilon$-loops). + Consider any $t \in T_{min}$. + For each position $p \in Sub(t)$, which is not itself a prefix of another position in $Sub(t)$, + consider subtree $t' = t|_p$. + It is an IPT for some sub-IRE $r'$ and substring $w'$: $t' \in \IPT(r', w')$. + Let $t''$ be the $<$-minimal tree in $\IPT(r', w')$ and substitute $t'$ with $t''$ in $t$. + (Note that substitutions are independent and can be performed in any order.) + Let $u$ be the tree resulting from all such substitutions. + By lemma \ref{incomparability_equivdef} we have $u \sim t$ + (because substitutions preserve the s-norm of subtrees at positions in $Sub(t)$), + and so $u \in T_{min}$. + We will show that $u = t_{min}$. + \\[-1em] - The only TNFA construct that has a loop is unbounded repetition - $F\big( (i, \Xund, (i_1, \Xund, r_1)^{n,\infty}) \big)$ where $n \geq 0$, - shown on figure \ref{fig_gor1} (e) -- (f). - Consider arbitrary path $\pi$ that contains - $\epsilon$-loop through sub-TNFA $F\big( (i_1, \Xund, r_1) \big)$. - % - Let $q_1$ be the first state on $\pi$ that belongs to the $\epsilon$-loop. - % - Path $\pi$ can be represented as $\pi = \pi_1 \pi_2 \pi_3$, where - $\pi_1 = q_0 \overset {u | \alpha} {\rightsquigarrow} q_1$ and - $\pi_2 = q_1 \overset {\epsilon | \beta} {\rightsquigarrow} q_1$ and - $\pi_3 = q_1 \overset {v | \gamma} {\rightsquigarrow} q_f$. + Suppose, on the contrary, that $u \neq t_{min}$. % - By the time GOR1 finds path $\pi_1 \pi_2$, - it must have already found some other path $\pi'_1 = q_0 \overset {u | \alpha'} {\rightsquigarrow} q_1$ without $\epsilon$-loops. - There are two possible cases: either $\alpha' = \alpha$, or $\alpha' < \alpha$. - We will show that in both cases $\alpha' < \alpha \gamma$ - and consequently, GOR1 prefers the path without the $\epsilon$-loop. - Let $k$ be the index of the last frame - and $\big( (\rho_1, \hdots, \rho_k), (\rho'_1, \hdots, \rho'_k) \big) = traces (\alpha', \alpha \gamma)$. + Then we have $t_{min} <_p u$ for some non-submatch decision position $p$. + Let $p = p'.p''$, where $p'$ is the longest prefix of $p$ that is a submatch position: $p' \in Sub(u) \cup Sub(t_{min})$. + \\[-1em] - First case: $\alpha' = \alpha$. - Because $\alpha$ is a proper prefix of $\alpha \gamma$, - fork happens at the last frame and we have - $\rho_k = lastht(\alpha)$ and - $\rho'_k = min (lastht(\alpha), minht(\gamma))$. - If $lastht(\alpha) > minht(\gamma)$, then $\rho_k > \rho'_k$ and $\alpha \sqsubset \alpha \gamma$. - Otherwise $\rho_k = \rho'_k$ and $\alpha \sim \alpha \gamma$, - and we have $first(\alpha \backslash \alpha \gamma) = \bot$ and $first(\alpha \gamma \backslash \alpha) \neq \bot$, - therefore $\alpha \subset \alpha \gamma$. - In both cases $\alpha < \alpha \gamma$. + It must be that for all submatch positions $q' < p'$ we have $\snorm{u}{q'} = \snorm{t_{min}}{q'}$, + because $u \in T_{min}$ and thus either $u \sim t_{min}$, + or $u \prec_q t_{min}$ for some $q \in Sub(u) \cup Sub(t_{min})$ + (in which case it must be $q > p$, because $u \prec_q t_{min}$ implies $\snorm{u}{q} > \snorm{t_{min}}{q}$, + which in turn implies $\pnorm{u}{q} > \pnorm{t_{min}}{q}$, + which contradicts $t_{min} <_p u$ if $q \leq p$). + \\[-1em] - Second case: $\alpha' < \alpha$. - Let $\big( (\sigma_1, \hdots, \sigma_k), (\sigma'_1, \hdots, \sigma'_k) \big) = traces (\alpha', \alpha)$. - We have $\rho_k = \sigma_k$ and $\rho'_k = min (\sigma'_k, minht(\gamma)) \leq \sigma_k$. - If $minht(\gamma) < \sigma'_k$ then $\rho_k > \rho'_k$ and $\alpha' \sqsubset \alpha \gamma$. - Otherwise $\rho'_k = \sigma'_k$. - If $\alpha' \sqsubset \alpha$ then $\alpha' \sqsubset \alpha \gamma$. - Otherwise $\alpha' \sim \alpha$ and $\alpha' \subset \alpha$. - None of $\alpha$ and $\alpha'$ is a proper prefix of the other, - because otherwise the longer path has an $\epsilon$-loop through $q_1$, which contradicts our assumption about $\pi_1$ and $\pi'_1$. - Therefore $first (\alpha' \backslash \alpha) = first (\alpha' \backslash \alpha \gamma)$ - and $first (\alpha \backslash \alpha') = first (\alpha \gamma \backslash \alpha')$. - Consequently $\alpha' \subset \alpha \implies \alpha' \subset \alpha \gamma$. - Thus $\alpha' < \alpha \gamma$. + Therefore by lemma \ref{lemma_subtrees} we have $\exists r', w' : u|_{p'}, t_{min}|_{p'} \in \IPT(r', w')$. + On one hand, $t_{min} <_{p'.p''} u$ implies $t_{min}|_{p'} <_{p''} u|_{p'}$. + But on the other hand, $p' \in Sub(u)$ and $p'$ is not itself a prefix of another position in $Sub(u)$, + therefore $u|_{p'}$ is $<$-minimal by construction of $u$. + Contradiction. +% +% % +% On one hand, $u <_q \widetilde{s}$ for some submatch position $q$ (because $u \in P_{min}$). +% On the other hand, $s \lessdot_p \widetilde{u}$ for some non-submatch position $p < q$ (because $s$ is $\lessdot$-minimal). +% % +% Let $p = p'.p''$, where $p'$ is the longest prefix of $p$ in $Sub(\widetilde{s}) \cup Sub(u)$. +% It must be $\|u\|_{q'} = \|\widetilde{s}\|_{q'} \forall q' \leq p'$ (because $p' < q$ and $u <_q \widetilde{s}$), +% and $p' \in Sub(u)$ (otherwise $\|u\|_{p'} = \infty \neq \|\widetilde{s}\|_{p'}$). +% % +% Now, by lemma \ref{lemma_subtrees} we have $\exists r', w' : u|_{p'}, \widetilde{s}|_{p'} \in PT(r', w')$, +% therefore $s \lessdot_{p'.p''} u$ implies $s|_{p'} \lessdot_{p''} \widetilde{u}|_{p'}$. +% But $p' \in Sub(u)$ and $p'$ is not itself a prefix of another position in $Sub(u)$, +% therefore $\widetilde{u}|_{p'}$ is $\lessdot$-minimal by construction of $u$. +% Contradiction. $\square$ - \end{XLem} +\subsection*{Proof of Theorem \ref{theorem_order_on_pe_same_as_on_pt}} + \begin{XLem}\label{lemma_pe_order_antisymm} + The longest-leftmost-precedence relation $<$ is antisymmetric: + if $\alpha < \beta$, then $\beta \not< \alpha$. + \\ + Proof. + Suppose, on the contrary, that $\alpha < \beta$ and $\beta < \alpha$. + Let $\big( (\rho_0, \dots, \rho_n), (\rho'_0, \dots, \rho'_n) \big) = traces(\alpha, \beta)$. + \medskip - \begin{XLem} \emph{(Right distributivity of path comparison over path concatenation for $\epsilon$-loop free paths.)} + If $\exists i = max \{j \mid \rho_j \neq \rho'_j \}$, then + $\alpha < \beta \implies \alpha \sqsubset \beta \implies \rho_i > \rho'_i$, but + $\beta < \alpha \implies \beta \sqsubset \alpha \implies \rho'_i > \rho_i$. Contradiction. + + \medskip + + Otherwise $\rho_i = \rho'_i \; \forall i$, then + $\alpha < \beta \implies \alpha \sim \beta \wedge \alpha \subset \beta$ and + $\beta < \alpha \implies \beta \sim \alpha \wedge \beta \subset \alpha$. Let - $\pi_\alpha = q_0 \overset {u | \alpha} {\rightsquigarrow} q_1$ and - $\pi_\beta = q_0 \overset {u | \beta} {\rightsquigarrow} q_1$ - be ambiguous paths in TNFA $F(r)$ - and $\pi_\gamma = q_1 \overset {\epsilon | \gamma} {\rightsquigarrow} q_2$ - their common $\epsilon$-suffix, - such that paths $\pi_\alpha$, $\pi_\beta$ - and the extended paths $\pi_\alpha \pi_\gamma$, $\pi_\beta \pi_\gamma$ - are all free from $\epsilon$-loops. - Then $\alpha < \beta \implies \alpha \gamma < \beta \gamma$. + $x = first (\alpha \backslash \beta)$, + $y = first (\beta \backslash \alpha)$, then + $\alpha \subset \beta \implies x < y$, but + $\beta \subset \alpha \implies y < x$. Contradiction. + $\square$ + \end{XLem} + + + \begin{XLem}\label{lemma_pe_equiv} + Let $s, t \in PT(r, w)$. + If $s \sim t$, then $\Phi_{h}(s) = \Phi_{h}(t) \; \forall h$. \\ Proof. + By induction on the height of $r$. - Let $k = |u|$ be the number of frames in $\alpha$ and $\beta$. - Let - $\big( (\rho_1, \hdots, \rho_k),$ $(\rho'_1, \hdots, \rho'_k) \big) = traces (\alpha, \beta)$ and - $\big( (\sigma_1, \hdots, \sigma_k),$ $(\sigma'_1, \hdots, \sigma'_k) \big) = traces (\alpha \gamma, \beta \gamma)$. - Obviously for frames $i < k$ we have $\rho_i = \sigma_i$ and $\rho'_i = \sigma'_i$, - and for the last frame $k$ we have - $\sigma_k = min (\rho_k, minht (\gamma))$ and - $\sigma'_k = min (\rho'_k, minht (\gamma))$. - Consider two possible cases. + \medskip - First case: $\alpha \sim \beta \wedge \alpha \subset \beta$. - % - We show that $\alpha \gamma \sim \beta \gamma \wedge \alpha \gamma \subset \beta \gamma$. - % - We have $\rho_i = \rho'_i \; \forall i$, therefore - $\sigma_i = \sigma'_i \; \forall i$ and consequently $\alpha \gamma \sim \beta \gamma$. - Let - $x = first (\alpha \backslash \beta)$, - $y = first (\beta \backslash \alpha)$, - $x' = first (\alpha \backslash \beta \gamma)$ and - $y' = first (\beta \backslash \alpha \gamma)$. - None of $\pi_\alpha$ and $\pi_\beta$ is a proper prefix of another, - otherwise the longer path must contain $\epsilon$-loop through $q_1$ - (because $\alpha$ and $\beta$ have the same number of frames). - Consequently $x = x'$ and $y = y'$, and we have - $\alpha \subset \beta$ - $\implies$ - $x < y$ - $\implies$ - $x' < y'$ - $\implies$ - $\alpha \gamma \subset \beta \gamma$. + Induction basis. + For RT of height $1$ we have + $| PT(r, w) | \leq 1 \; \forall w$, + therefore $s = t$ and $\Phi_{h}(s) = \Phi_{h}(t)$. - Second case: $\alpha \sqsubset \beta$. - We show that $\alpha \gamma \sqsubset \beta \gamma$. - % - If $\rho_k = \rho'_k$ then $\sigma_k = \sigma'_k$ - and obviously $\alpha \gamma \sqsubset \beta \gamma$. - Else it must be $\rho_k > \rho'_k$. - In this case, if $minht (\gamma) > \rho'_k$, then $\sigma_k > \sigma'_k$ and again $\alpha \gamma \sqsubset \beta \gamma$. - Else $minht (\gamma) \leq \rho'_k$ and $\sigma_k = \sigma'_k$. - In this case, if $k > 1$ and $\rho_{k-1} > \rho'_{k-1}$ then again $\alpha \gamma \sqsubset \beta \gamma$. - % - In other words, the only possible case when $\gamma$ can change comparison result is - when at the last frame we have $\rho_k > \rho'_k$, - the appended suffix $\gamma$ contains parentheses with low height $minht (\gamma) \leq \rho'_k$ - (so that $\sigma_k = \sigma'_k$), - and the previous frame doesn't exist - or compares differently from the last frame: $k = 1$ or $\rho_{k-1} \leq \rho'_{k-1}$. - We show that in this case the extended path $\pi_\beta \pi_\gamma$ must contain $\epsilon$-loop, - which contradicts to the lemma condiitons. + \medskip - Consider the fragments of paths $\pi_\alpha$ and $\pi_\beta$ from fork to join, - including (if it exists) the $\epsilon$-transition to the fork state: - $\pi_\alpha' = q_2 \overset {u | \alpha'} {\rightsquigarrow} q_1$ and - $\pi_\beta' = q_2 \overset {u | \beta'} {\rightsquigarrow} q_1$. - We know that $minht (\alpha') = \rho_k$. -% (because $\rho_k$ is set to the minimal parenthesis height on the path from fork to join). - Therefore by lemma \ref{gor1_path_containment} - we know that $\pi_\alpha'$ is contained in a subautomaton $f$ of height $\rho_k$. - Likewise we know that $\pi_\beta'$ is not contained in $f$, because $minht (\beta') = \rho'_k < \rho_k$. - % - Let $\pi_\beta''$ be the part of $\pi_\beta'$ containing the last $k$-th frame, - and note the following: - \begin{enumerate} - \item[(a)] the start state of $\pi_\beta''$ must be contained in $f$ - (because by our assumption - either $k = 1$ and then start state of $\pi_\beta''$ is the fork state, - or $\rho_{k-1} \leq \rho'_{k-1}$ which implies $\rho'_{k-1} \geq \rho_k$ - and then all but the last frames of $\pi_\beta'$ must be contained in $f$) - \item[(b)] $\pi$ cannot be contained in $f$ - (because by our assumption $\rho_k > \rho'_k$) - \item[(c)] the end state of $\pi_\beta''$ is contained in $f$ - (because it's the join state $q_1$ of $\pi_\alpha'$ and $\pi_\beta'$) - \item[(d)] $\pi_\gamma$ is not contained in $f$ - (because by our assumption $minht (\gamma) \leq \rho'_k$ and consequently $minht (\gamma) < \rho_k$) - \end{enumerate} - % - Put together, items (a) - (d) mean that the $\epsilon$-path $\pi_\beta'' \pi_\gamma$ - first leaves subautomaton $f$, then re-enters $f$, and then leaves $f$ second time. - Because $f$ has a unique exit state, this means that $\pi_\beta'' \pi_\gamma$ contains $\epsilon$-loop - through the exit state of $f$, which contradicts lemma conditions. - (Effectively it means that $\pi_\beta \pi_\gamma$ is non-minimal and would be discarded by GOR1 anyway.) -% Note that $\pi_\beta'$ itself does not necessarily contain $\epsilon$-loop. - % + Induction step. + We have + $s = T^{d} (s_1, \dots, s_n)$ and + $t = T^{d} (t_1, \dots, t_m)$. + If $d = 0$, then $\Phi_{h}(s) = str(s) = w = str(t) = \Phi_{h}(t)$. + Otherwise $d \neq 0$. + By lemma \ref{incomparability_equivdef} we have $s \sim t \Rightarrow \forall p: \|s\|_p = \|t\|_p$. + This implies $n = m$ (otherwise the norm of subtree at position $min(n,m)+1$ is $\infty$ for only one of $s$, $t$). + Therefore + $\Phi_{h}(s) = \Xl_{h+1} \Phi_{h+1}(s_1), \dots, \Phi_{h+1}(s_n) \Xr_h$ and + $\Phi_{h}(t) = \Xl_{h+1} \Phi_{h+1}(t_1), \dots, \Phi_{h+1}(t_n) \Xr_h$. +% Consider any $i \leq n$. + It suffices to show that $\forall i \leq n: \Phi_{h+1}(s_i) = \Phi_{h+1}(t_i)$. + We have $\forall p: \|s_i\|_p = \|t_i\|_p$ (implied by $\forall p: \|s\|_p = \|t\|_p$), + therefore by lemma \ref{incomparability_equivdef} we have $s_i \sim t_i$, + and by lemma \ref{lemma_subtrees} $\exists r', w': s_i, t_i \in PT(r', w')$, + where the height of $r'$ is less than the height of $r$. + By induction hypothesis $\Phi_{h+1}(s_i) = \Phi_{h+1}(t_i)$. $\square$ \end{XLem} + \begin{XLem}\label{lemma_pe_less_1} + Let $s, t \in PT(r, w)$. + If $s <_p t$ and $|p| = 1$, then $\Phi_{h}(s) < \Phi_{h}(t) \; \forall h$. + \\ + Proof. + By lemma conditions $|p| = 1$, which implies that $s$ and $t$ are compound PT + $s = T^{d} (s_1, \dots, s_n)$ and + $t = T^{d} (t_1, \dots, t_m)$, where + $d \neq 0$ + (because $\Lambda$ is a prefix of decision position $p$). + Therefore $\Phi_{h}(s)$, $\Phi_{h}(t)$ can be represented as follows, + where $k$ is the number of frames and $j$ is the fork: + \begin{alignat*}{7} + \Phi_{h}(s) &\;=\; \Xl_{h+1} &&\Phi_{h+1}(s_1) &&\dots &&\Phi_{h+1}(s_n) \Xr_h + &&\;=\; \beta_0 a_1 \dots a_j \beta_j &&\;\big|\; && \gamma_j a_{j + 1} \dots a_k \gamma_k \\[-0.5em] + \Phi_{h}(t) &\;=\; \Xl_{h+1} &&\Phi_{h+1}(t_1) &&\dots &&\Phi_{h+1}(t_m) \Xr_h + &&\;=\; \beta_0 a_1 \dots a_j \beta_j &&\;\big|\; && \delta_j a_{j + 1} \dots a_k \delta_k + \end{alignat*} +% + By lemma conditions $|p| = 1$, therefore $p \in \YN$. + Consider any $i \in \YN$ such that $i < p$. + By lemma conditions $s <_p t$, which means + $\|s\|_p > \|t\|_p \wedge \|s\|_q = \|t\|_q \;\forall q < p$. + In particular $\|s_i\|_q = \|t_i\|_q \;\forall q$, + therefore by lemma \ref{incomparability_equivdef} $s_i \sim t_i$ + and by lemma \ref{lemma_pe_equiv} we have $\Phi(s_i) = \Phi(t_i)$. + Let $traces (\Phi_{h}(s), \Phi_{h}(t)) = \big( (\rho_0, \dots, \rho_k), (\rho'_0, \dots, \rho'_k) \big)$ + and consider the following cases. -\section{Matching algorithm} - -The final algorithm, complexity estimates. - -\section{Conclusions} + \medskip -\FloatBarrier -\pagebreak + First case: + $\infty = \|s_p\| > \|t_p\|$. + In this case $s|_p$ does not exist + (because $p$ corresponds to a submatch position in $r$, + therefore $p \in Pos(s)$ implies $p \in Sub(s)$, + which contradicts $\|s_p\| = \infty$). + Fork happens immediately after $\Phi_{h+1}(s_{p-1})$, $\Phi_{h+1}(t_{p-1})$: + \begin{alignat*}{7} + \Phi_{h}(s) &\;=\; \Xl_{h+1} &&\Phi_{h+1}(s_1) &&\dots &&\Phi_{h+1}(s_{p-1}) + &&\;\big|\; \Xr_{h} && && \\[-0.5em] + \Phi_{h}(t) &\;=\; \Xl_{h+1} &&\Phi_{h+1}(t_1) &&\dots &&\Phi_{h+1}(t_{p-1}) + &&\;\big|\; \Phi_{h+1}(t_p) &&\dots &&\Phi_{h+1}(t_m) \Xr_{h} + \end{alignat*} + % + In this case fork frame is the last frame: $j = k$, and therefore $\rho_j = \rho'_j = h$ + (because $\gamma_j$ and $\delta_j$ contain the closing parenthesis $\Xr_{h}$). + For all $i < j$ we have $\rho_i = \rho'_i = -1$, therefore $\Phi_{h}(s) \sim \Phi_{h}(t)$. + Furthermore, $first(\gamma_j)$ is $\Xr$ and $first(\delta_j)$ is one of $\Xl$ and $\Xm$, + therefore $\Phi_{h}(s) \subset \Phi_{h}(t)$. + Consequently $\Phi_{h}(s) < \Phi_{h}(t)$. -\section*{Appendix} + \medskip -\subsection*{Proof of Theorem \ref{theorem_order_on_IPTs}} + Second case: $\infty > \|s_p\| > \|t_p\| = -1$. + In this case both $s_p$ and $t_p$ exist, + $s_p$ is not $\varnothing$ and $t_p$ is $\varnothing$, + and fork happens immediately after $\Phi_{h+1}(s_{p-1})$, $\Phi_{h+1}(t_{p-1})$: + \begin{alignat*}{8} + \Phi_{h}(s) &\;=\; \Xl_{h+1} &&\Phi_{h+1}(s_1) &&\dots &&\Phi_{h+1}(s_{p-1}) + &&\;\big|\; \Xl_{h+1} \; x \; \Xr_{h+1} \; &&\Phi_{h+1}(s_{p+1}) &&\dots &&\Phi_{h+1}(s_n) \Xr_{h} \\[-0.5em] + \Phi_{h}(t) &\;=\; \Xl_{h+1} &&\Phi_{h+1}(t_1) &&\dots &&\Phi_{h+1}(t_{p-1}) + &&\;\big|\; \Xm_{h+1} \;\;\;\;\;\; &&\Phi_{h+1}(t_{p+1}) &&\dots &&\Phi_{h+1}(t_m) \Xr_{h} + \end{alignat*} + % + If $j$-th frame is the last, we have $\rho_j = \rho'_j = h$ like in the first case. + Otherwise we have $\rho_j = \rho'_j = h + 1$, + because $minh(\gamma_j)$, $minh(\delta_j) \geq h + 1$ + and $lasth (\beta_j) = h + 1$ + (because if $p = 1$ then $\beta_j = \Xl_{h+1}$, otherwise + $s_{p-1}$ exists and the last parenthesis in $\beta_j$ + is last parenthesis of $\Phi_{h+1}(s_{p-1})$, which is either $\Xr_{h+1}$ or $\Xm_{h+1}$). + For subsequent frames $i$ such that $j < i < k$ we have $\rho_i = \rho'_i = h + 1$ + (because $minh(\gamma_j)$, $minh(\delta_j) \geq h + 1$), + and for the last pair of frames we have $\rho_k = \rho'_k = h$. + So in this case again $\Phi_{h}(s) \sim \Phi_{h}(t)$. + Furthermore, $first (\gamma_j) = \Xl < \Xm = first (\delta_j)$, therefore $\Phi_{h}(s) \subset \Phi_{h}(t)$ + and $\Phi_{h}(s) < \Phi_{h}(t)$. - \begin{XLem}\label{lemma_ptorder_antisymmetry} - The order on IPTs is antisymmetric: if $t < s$, then $s \not< t$. - \\ - Proof. + \medskip - Suppose, on the contrary, that $t <_p s$ and $s <_q t$ for some $p$, $q$. - Without loss of generality let $p \leq q$. - On one hand $t <_p s$ implies $\|t\|_p > \|s\|_p$. - On the other hand $s <_q t$ implies $\|t\|_p \leq \|s\|_p$. - Contradicting the assumption. + Third case: $\infty > \|s_p\| > \|t_p\| \geq 0$. + In this case both $s_p$ and $t_p$ exist and none of them is $\varnothing$, + and fork happens somewhere after the opening parenthesis $\Xl$ + and before the closing parenthesis $\Xr$ in $\Phi_{h}(s_p)$, $\Phi_{h}(t_p)$: + \begin{alignat*}{9} + \Phi_{h}(s) &\;=\; \Xl_{h+1} &&\Phi_{h+1}(s_1) &&\dots &&\Phi_{h+1}(s_{p-1}) &&\; \Xl_{h+2} \; x + &&\;\big|\; y \; \Xr_{h+1} \; &&\Phi_{h+1}(s_{p+1}) &&\dots &&\Phi_{h+1}(s_n) \Xr_{h} \\[-0.5em] + \Phi_{h}(t) &\;=\; \Xl_{h+1} &&\Phi_{h+1}(t_1) &&\dots &&\Phi_{h+1}(t_{p-1}) &&\; \Xl_{h+2} \; x + &&\;\big|\; z \; \Xr_{h+1} \; &&\Phi_{h+1}(t_{p+1}) &&\dots &&\Phi_{h+1}(t_m) \Xr_{h} + \end{alignat*} + % + Let $\delta_l$ be the frame containing the closing parenthesis $\Xr_{h+1}$ of $\Phi_{h+1}(t_p)$. + Because $\|s_p\| > \|t_p\|$, + the closing parenthesis $\Xr_{h+1}$ of $\Phi_{h+1}(s_p)$ is not contained in $\gamma_{l}$, + and $l$-th frame is not the last one. + Therefore $minh (\gamma_l) \geq h+2$ and $minh (\delta_l) = h+1$. + Furthermore, $minh(x)$, $minh(y)$, $minh(z) \geq h + 2$, + therefore $lasth(\beta_j) = lasth(\Xl_{h+2} \; x) \geq h+2$ and we have + $\rho_i, \rho'_i \geq h+2$ for all frames $j \leq i < l$ + (note that it might be $\rho_i < \rho'_i$), + and for the $l$-th frame $\rho_l \geq h+2 > h+1 = \rho'_l$. + For subsequent frames $\gamma_i$, $\delta_i$ such that $l < i < k$ we have + $minh(\gamma_i)$, $minh(\delta_i) \geq h + 1$, + therefore $\rho_i \geq h+1 = \rho'_i$. + For the last pair of frames $\rho_k = \rho'_k = h$. + Therefore in this case $\Phi_{h}(s) \sqsubset \Phi_{h}(t)$, + which implies $\Phi_{h}(s) < \Phi_{h}(t)$. $\square$ \end{XLem} - \begin{XLem}\label{lemma_ptorder_transitivity} - The order on IPTs is transitive: if $t < s$ and $s < u$, then $t < u$. + + \begin{XLem}\label{lemma_pe_less} + Let $s, t \in PT(r, w)$. + If $s <_p t$, then $\Phi_{h}(s) < \Phi_{h}(t) \; \forall h$. \\ Proof. + By induction on the length of $p$. - Let $t <_p s$ and $s <_q u$ for some positions $p$, $q$, and let $r = min (p, q)$. - \\[-1em] + \medskip - First, we show that $\|t\|_r > \|u\|_r$. - If $p \leq q$, we have $\|t\|_p > \|s\|_p$ (implied by $t <_p s$) - and $\|s\|_p \geq \|u\|_p$ (implied by conjunction $s <_q u \wedge p \leq q$), - therefore $\|t\|_p > \|u\|_p$. - Otherwise $p > q$, we have $\|s\|_q > \|u\|_q$ (implied by $s <_q u$) - and $\|t\|_q = \|s\|_q$ (implied by conjunction $t <_p s \wedge q < p$), - therefore $\|t\|_q > \|u\|_q$. - \\[-1em] + Induction basis for $|p| = 1$ is given by lemma \ref{lemma_pe_less_1}. - Second, we show that $\forall r' < r$ it holds that $\|t\|_{r'} = \|u\|_{r'}$. - We have $\|t\|_{r'} = \|s\|_{r'}$ (implied by conjunction $t <_p s \wedge r' < p$) - and $\|s\|_{r'} = \|u\|_{r'}$ (implied by conjunction $s <_q u \wedge r' < q$), - therefore $\|t\|_{r'} = \|u\|_{r'}$. - $\square$ - \end{XLem} + \medskip - \begin{XLem}\label{incomparability_equivdef} - $t \sim s \Leftrightarrow \; \forall p : \|t\|_p = \|s\|_p$. - \\ - Proof. + Induction step. + Let $|p| \geq 2$, then $s$ and $t$ are compound PT + $s = T^{d} (s_1, \dots, s_n)$ and + $t = T^{d} (t_1, \dots, t_m)$, where + $d \neq 0$ (because $\Lambda$ is a prefix of decision position $p$). + % + Furthermore, let $p = p'.p''$, where $p' \in \YN$. + Subtrees $s' = s_{p'}$ and $t' = t_{p'}$ exist, because $p'$ a proper prefix of decision position $p$, + and they also must be compount PT + $s' = T^{d'} (s'_1, \dots, s'_{n'})$ and + $t' = T^{d'} (t'_1, \dots, t'_{m'})$, + because $|p''| > 0$, and it must be + $d' \neq 0$ (because $p'$ is a prefix of decision position $p$). + % + For subtrees $s_i$ and $t_i$ where $i < p'$ we have + $\|s_i\|_q = \|t_i\|_q \;\forall q$ (implied by $s <_p t$), + therefore by lemma \ref{incomparability_equivdef} + $s_i \sim t_i$, and by lemma \ref{lemma_pe_equiv} we have $\Phi_{h+1}(s_i) = \Phi_{h+1}(t_i)$. + % + Therefore $\Phi_{h}(s)$, $\Phi_{h}(t)$ can be represented as follows: + \begin{alignat*}{9} + \Phi_{h}(s) + \;&= + \;&& \Xl_{h+1} \Phi_{h+1}(s_1) \dots \Phi_{h+1}(s_{p'-1}) + \;&& \overbrace {\Xl_{h+2} \Phi_{h+2}(s'_1) \dots \Phi_{h+2}(s'_{n'}) \Xr_{h+1}}^{\Phi_{h+1}(s')} + \;&& \Phi_{h+1}(s_{p'+1}) \Phi_{h+1}(s_n) \Xr_{h} + \\ + \Phi_{h}(t) + \;&= + \;&& \Xl_{h+1} \Phi_{h+1}(t_1) \dots \Phi_{h+1}(t_{p'-1}) + \;&& \underbrace {\Xl_{h+2} \Phi_{h+2}(t'_1) \dots \Phi_{h+2}(t'_{m'}) \Xr_{h+1}}_{\Phi_{h+1}(t')} + \;&& \Phi_{h+1}(t_{p'+1}) \Phi_{h+1}(t_m) \Xr_{h} + \end{alignat*} - $\Rightarrow$. %First, we show $t \sim s \Rightarrow \forall p : \|t\|_p = \|s\|_p$. - Suppose, on the contrary, that $\exists p = min \{ q \mid \|t\|_q \neq \|s\|_q \}$, - then either $t <_p s$ (if $\|t\|_p > \|s\|_p$), or $s <_p t$ (if $\|t\|_p < \|s\|_p$). - Both cases contradict $t \sim s$. - \\[-1em] + We have $\|s\|_q = \|t\|_q \;\forall q < p'$ (implied by $s <_p t$), + therefore by lemma \ref{lemma_subtrees} $\exists r', w' : s', t' \in PT(r', w')$. + Moreover, $s' <_{p''} t'$ and $|p''| < |p|$, therefore by induction hypothesis $\Phi_{p+1}(s') < \Phi_{p+1}(t')$. + % + On the other hand, if $j$ is the fork and $f \leq j \leq k$ then + $\Phi_{h}(s)$, $\Phi_{h}(t)$ can be represented as: + \begin{alignat*}{9} + \Phi_{h}(s) + \;&= + \;&& \beta_0 a_1 \dots a_f \beta_f^1 + \;&& \overbrace {\beta_f^2 a_{f+1} \dots a_j \beta_j \;\big|\; \gamma_j a_{j+1} \dots a_k \gamma_k^1}^{\Phi_{h+1}(s')} + \;&& \gamma_k^2 a_{k+1} \dots a_l \gamma_l + \\[-0.5em] + \Phi_{h}(t) + \;&= + \;&& \beta_0 a_1 \dots a_f \beta_f^1 + \;&& \underbrace {\beta_f^2 a_{f+1} \dots a_j \beta_j \;\big|\; \delta_j a_{j+1} \dots a_k \delta_k^1}_{\Phi_{h+1}(t')} + \;&& \delta_k^2 a_{k+1} \dots a_l \delta_l + \end{alignat*} - $\Leftarrow$. - $\forall p : \|t\|_p = \|s\|_p$ implies - $\nexists p : t <_p s$ and $\nexists q : s <_q t$, - which implies $t \sim s$. - $\square$ - \end{XLem} +% \begin{alignat*}{9} +% \Phi_{h}(s) +% \;&= +% \;&&\overbrace {\Xl_{h+1} \Phi_{h+1}(s_1) \dots \Phi_{h+1}(s_{p'-1})} +% ^{\beta_0 a_1 \dots a_i \beta_i^1} +% \;&&\overbrace {\Xl_{h+2} \Phi_{h+2}(s'_1) \dots \Phi_{h+2}(s'_{n'}) \Xr_{h+1}} +% ^{\beta_i^2 a_{i+1} \dots a_j \beta_j \;\big|\; \gamma_j a_{j+1} \dots a_k \gamma_k^1} +% \;&&\overbrace {\Phi_{h+1}(s_{p'+1}) \Phi_{h+1}(s_n) \Xr_{h}} +% ^{\gamma_k^2 a_{k+1} \dots a_l \gamma_l} +% \\ +% \Phi_{h}(t) +% \;&= +% \;&&\underbrace {\Xl_{h+1} \Phi_{h+1}(t_1) \dots \Phi_{h+1}(t_{p'-1})} +% _{\beta_0 a_1 \dots a_i \beta_i^1} +% \;&&\underbrace {\Xl_{h+2} \Phi_{h+2}(t'_1) \dots \Phi_{h+2}(t'_{m'}) \Xr_{h+1}} +% _{\beta_i^2 a_{i+1} \dots a_j \beta_j \;\big|\; \delta_j a_{j+1} \dots a_k \delta_k^1} +% \;&&\underbrace {\Phi_{h+1}(t_{p'+1}) \Phi_{h+1}(t_m) \Xr_{h}} +% _{\delta_k^2 a_{k+1} \dots a_l \delta_l} +% \end{alignat*} - \begin{XLem}\label{lemma_ptorder_transitivity_of_incomparability} - Incomparability relation on parse trees is transitive: if $t \sim s$ and $s \sim u$, then $t \sim u$. - \\ - Proof. + Let $traces (\Phi_{h}(s), \Phi_{h}(t)) = \big( (\rho_0, \dots, \rho_l), (\rho'_0, \dots, \rho'_l) \big)$ + and $traces (\Phi_{h+1}(s'), \Phi_{h+1}(t')) = \big( (\sigma_h, \dots, \sigma_k), (\sigma'_h, \dots, \sigma'_k) \big)$. + % + We show that for frames $i$ such that $j \leq i < k$ we have + $\rho_i = \sigma_i \wedge \rho'_i = \sigma'_i$, + and for subsequent frames $k \leq i \leq l$ we have $\rho_i = \rho'_i$. - By lemma \ref{incomparability_equivdef} we have - $t \sim s \Rightarrow \forall p : \|t\|_p = \|s\|_p$ and - $s \sim u \Rightarrow \forall p : \|s\|_p = \|u\|_p$, - therefore by lemma \ref{incomparability_equivdef} $\forall p : \|t\|_p = \|u\|_p \Rightarrow t \sim u$. - $\square$ - \end{XLem} + \medskip -The proof of theorem \ref{theorem_order_on_IPTs} -follows from -the lemma \ref{lemma_ptorder_antisymmetry}, -the lemma \ref{lemma_ptorder_transitivity} and -the lemma \ref{lemma_ptorder_transitivity_of_incomparability}. + First case: $i = j \leq k \leq l$ (the fork frame). + Because $\Phi_{h+1}(s')$ and $\Phi_{h+1}(t')$ have nonempty common prefix $\Xl_{h+2}$, + we have $lasht (\Phi_{h}(s) \sqcap \Phi_{h}(t)) = lasth (\Phi_{h+1}(s') \sqcap \Phi_{h+1}(t')) \geq h + 2$. + % + If $j < k$ then $minh (\gamma_j)$, $minh (\delta_j)$ are not affected by appending + $\gamma^2_k$, $\delta^2_k$ and therefore $\rho_j = \sigma_j \wedge \rho'_j = \sigma'_j$. + % + Else if $j = k < l$ then we have $minh (\gamma^1_k) = minh (\delta^1_k) = h + 1$ and + $minh (\gamma^2_k) = minh (\delta^2_k) \geq h + 1$, and + therefore $\rho_j = \rho'_j = h + 1$. + % + Finally, if $j = l$ then $minh (\gamma_j) = minh (\delta_j) = h$ and $\rho_j = \rho'_j = h$. + \medskip -\subsection*{Proof of Theorem \ref{theorem_order_compat}} + Second case: $j < i < k$. + In this case the calculation of $\rho_i$, $\rho'_i$ depends on $\rho_j$, $\rho'_j$ + (for which we have shown $\rho_j = \sigma_j \wedge \rho'_j = \sigma'_j$) and + is not affected by the appended $\gamma^2_k$, $\delta^2_k$, therefore + $\rho_i = \sigma_i \wedge \rho'_i = \sigma'_i$. -First, we prove an auxilary lemma \ref{lemma_subtrees} which -shows that comparison of sub-IPT is justified -if the s-norms at all preceding submatch positions are equal. + \medskip - \begin{XLem}\label{lemma_subtrees} - If $t, s \in \IPT(r, w)$ and $\exists p \in Sub(t) \cup Sub(s)$ such that $\snorm{t}{q} = \snorm{s}{q} \; \forall q \leq p$, - then $\exists \widetilde{r}, \widetilde{w} : t|_p, s|_p \in \IPT(\widetilde{r}, \widetilde{w})$. - \\ - Proof. - By induction on position $p$. - \\[-1em] + Third case: $j < i = k < l$. We have + $minh (\gamma^1_k) = minh (\delta^1_k) = h + 1$ and + $minh (\gamma^2_k) = minh (\delta^2_k) \geq h + 1$, + and none of the preceding frames after the fork contain parentheses with height less than $h + 1$, + therefore $\rho_k = \rho'_k = h + 1$. - Induction basis: the case of $p = \Lambda$ is trivial: let $\widetilde{r} = r$, $\widetilde{w} = w$. - \\[-1em] + \medskip - Induction step: we have $|p| > 0$, let $p = p'.i$, where $i \in \YN$. - Let $t' = t|_{p'} = T(t_1, \dots, t_n)$, - $s' = s|_{p'} = T(s_1, \dots, s_m)$. - By induction hypothesis $\exists r', w' : t', s' \in \IPT(r', w')$, - where $w' = str(t_1) \dots str(t_n) = str(s_1) \dots str(s_m)$. - \\[-1em] + Fourth case: $k < i < l$. + We have $\rho_i = \rho'_i = h + 1$, + because $\rho_k = \rho'_k = h + 1$ and $minh(\gamma_i)$, $minh(\delta_i) \geq h + 1$. - Next, we show that $str(t_i) = str(s_i)$. - It must be that $i \in Sub(s') \cap Sub(t')$, - otherwise only one of $\|t'\|_i$, $\|s'\|_i$ is $\infty$, - which contradicts lemma condiiton $\|t\|_p = \|s\|_p$. - Consider position $j \leq i$. - Because the set of submatch positions contains siblings, we have $j \in Sub(s') \cap Sub(t')$. - Consequently, $\|t'\|_j = |str(t_j)|$ and $\|s\|_j = |str(s_j)|$. - By lemma condition we have $\|t\|_{p'.j} = \|s\|_{p'.j}$, - therefore $\|t'\|_j = \|s'\|_j$, - therefore $|str(t_j)| = |str(s_j)|$. - Because $str(t_1) \dots str(t_n) = str(s_1) \dots str(s_m)$, - we have $str(t_i) = str(s_i)$. - \\[-1em] + \medskip - Now, let $\widetilde{w} = str(t_i)$. - If $r' = r_1|r_2$ or $r' = r_1 r_2$, let $\widetilde{r} = r_i$. - Otherwise, $r' = r_1^{k,l}$, let $\widetilde{r} = r_1$. - $\square$ - \end{XLem} + Fifth case: $i = l$. + We have $\rho_l = \rho'_l = h$. + \medskip - Proof of theorem \ref{theorem_order_compat}. - \\[-1em] + So, we have shown that $\rho_i = \sigma_i \wedge \rho'_i = \sigma'_i$ for $j \leq i < k$ + and $\rho_i = \rho'_i$ for $k \leq i \leq l$. + It trivially follows that $\Phi_{h+1}(s') \sqsubset \Phi_{h+1}(t')$ implies $\Phi_{h}(s) \sqsubset \Phi_{h}(t)$, and + $\Phi_{h+1}(s') \sim \Phi_{h+1}(t')$ implies $\Phi_{h}(s) \sim \Phi_{h}(t)$. + Because none of $\Phi_{h+1}(s')$, $\Phi_{h+1}(t')$ is a proper prefix of another, + $\Phi_{h+1}(s') \subset \Phi_{h+1}(t')$ implies $\Phi_{h}(s) \subset \Phi_{h}(t)$. + Therefore $\Phi_{h+1}(s') < \Phi_{h+1}(t')$ implies $\Phi_{h}(s) < \Phi_{h}(t)$. + $\square$ + \end{XLem} - Consider any $t \in T_{min}$. - For each position $p \in Sub(t)$, which is not itself a prefix of another position in $Sub(t)$, - consider subtree $t' = t|_p$. - It is an IPT for some sub-IRE $r'$ and substring $w'$: $t' \in \IPT(r', w')$. - Let $t''$ be the $<$-minimal tree in $\IPT(r', w')$ and substitute $t'$ with $t''$ in $t$. - (Note that substitutions are independent and can be performed in any order.) - Let $u$ be the tree resulting from all such substitutions. - By lemma \ref{incomparability_equivdef} we have $u \sim t$ - (because substitutions preserve the s-norm of subtrees at positions in $Sub(t)$), - and so $u \in T_{min}$. - We will show that $u = t_{min}$. - \\[-1em] - Suppose, on the contrary, that $u \neq t_{min}$. - % - Then we have $t_{min} <_p u$ for some non-submatch decision position $p$. - Let $p = p'.p''$, where $p'$ is the longest prefix of $p$ that is a submatch position: $p' \in Sub(u) \cup Sub(t_{min})$. + Proof of theorem \ref{theorem_order_on_pe_same_as_on_pt}. \\[-1em] - It must be that for all submatch positions $q' < p'$ we have $\snorm{u}{q'} = \snorm{t_{min}}{q'}$, - because $u \in T_{min}$ and thus either $u \sim t_{min}$, - or $u \prec_q t_{min}$ for some $q \in Sub(u) \cup Sub(t_{min})$ - (in which case it must be $q > p$, because $u \prec_q t_{min}$ implies $\snorm{u}{q} > \snorm{t_{min}}{q}$, - which in turn implies $\pnorm{u}{q} > \pnorm{t_{min}}{q}$, - which contradicts $t_{min} <_p u$ if $q \leq p$). + $\Rightarrow$. Given by lemma \ref{lemma_pe_less}. \\[-1em] - Therefore by lemma \ref{lemma_subtrees} we have $\exists r', w' : u|_{p'}, t_{min}|_{p'} \in \IPT(r', w')$. - On one hand, $t_{min} <_{p'.p''} u$ implies $t_{min}|_{p'} <_{p''} u|_{p'}$. - But on the other hand, $p' \in Sub(u)$ and $p'$ is not itself a prefix of another position in $Sub(u)$, - therefore $u|_{p'}$ is $<$-minimal by construction of $u$. - Contradiction. -% -% % -% On one hand, $u <_q \widetilde{s}$ for some submatch position $q$ (because $u \in P_{min}$). -% On the other hand, $s \lessdot_p \widetilde{u}$ for some non-submatch position $p < q$ (because $s$ is $\lessdot$-minimal). -% % -% Let $p = p'.p''$, where $p'$ is the longest prefix of $p$ in $Sub(\widetilde{s}) \cup Sub(u)$. -% It must be $\|u\|_{q'} = \|\widetilde{s}\|_{q'} \forall q' \leq p'$ (because $p' < q$ and $u <_q \widetilde{s}$), -% and $p' \in Sub(u)$ (otherwise $\|u\|_{p'} = \infty \neq \|\widetilde{s}\|_{p'}$). -% % -% Now, by lemma \ref{lemma_subtrees} we have $\exists r', w' : u|_{p'}, \widetilde{s}|_{p'} \in PT(r', w')$, -% therefore $s \lessdot_{p'.p''} u$ implies $s|_{p'} \lessdot_{p''} \widetilde{u}|_{p'}$. -% But $p' \in Sub(u)$ and $p'$ is not itself a prefix of another position in $Sub(u)$, -% therefore $\widetilde{u}|_{p'}$ is $\lessdot$-minimal by construction of $u$. -% Contradiction. + $\Leftarrow$. + We have $\Phi_{h}(s) < \Phi_{h}(t) \; \forall h$. + Suppose that $\nexists p: s <_p t$. + By lemma \ref{lemma_pe_order_antisymm} either $s = t$ + (in which case $\Phi_{h}(s) = \Phi_{h}(t)$, contradiction) + or $t <_q s$ for some $q$ + (in which case $\Phi_{h}(t) < \Phi_{h}(s)$ by lemma \ref{lemma_pe_less}, contradiction). + Therefore $\exists p: s <_p t$. $\square$ -- 2.40.0