From: Teodor Sigaev <teodor@sigaev.ru>
Date: Mon, 27 Jun 2016 17:41:00 +0000 (+0300)
Subject: Make exact distance match for FTS phrase operator
X-Git-Tag: REL9_6_BETA3~101
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=028350f619f7688e0453fcd2c4b25abe9ba30fa7;p=postgresql

Make exact distance match for FTS phrase operator

The phrase operator now requires an exact distance between lexemes instead
of a less-than-or-equal distance.

Per discussion c19fcfec308e6ccd952cdde9e648b505@mail.gmail.com
---

diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml
index 9028bedd1b..72bef9f49e 100644
--- a/doc/src/sgml/textsearch.sgml
+++ b/doc/src/sgml/textsearch.sgml
@@ -346,10 +346,10 @@ SELECT to_tsvector('error is not fatal') @@ to_tsquery('fatal &lt;-&gt; error');
 
     There is a more general version of the FOLLOWED BY operator having the
     form <literal>&lt;<replaceable>N</>&gt;</literal>,
-    where <replaceable>N</> is an integer standing for the greatest distance
+    where <replaceable>N</> is an integer standing for the exact distance
     allowed between the matching lexemes.  <literal>&lt;1&gt;</literal> is
     the same as <literal>&lt;-&gt;</>, while <literal>&lt;2&gt;</literal>
-    allows one other lexeme to optionally appear between the matches, and so
+    allows exactly one other lexeme to appear between the matches, and so
     on.  The <literal>phraseto_tsquery</> function makes use of this
     operator to construct a <literal>tsquery</> that can match a multi-word
     phrase when some of the words are stop words.  For example:
@@ -1529,7 +1529,7 @@ SELECT to_tsquery('fat') &lt;-&gt; to_tsquery('cat | rat');
       <para>
        Returns a query that searches for a match to the first given query
-       followed by a match to the second given query at a distance of at
-       most <replaceable>distance</replaceable> lexemes, using
+       followed by a match to the second given query at a distance of exactly
+       <replaceable>distance</replaceable> lexemes, using
        the <literal>&lt;<replaceable>N</>&gt;</literal>
        <type>tsquery</> operator.  For example:
 
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c
index 6117ba9b3e..04718829a0 100644
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -1375,6 +1375,7 @@ TS_phrase_execute(QueryItem *curitem,
 		ExecPhraseData Ldata = {0, false, NULL},
 					Rdata = {0, false, NULL};
 		WordEntryPos *Lpos,
+				   *LposStart,
 				   *Rpos,
 				   *pos_iter = NULL;
 
@@ -1416,52 +1417,60 @@ TS_phrase_execute(QueryItem *curitem,
 			pos_iter = data->pos;
 		}
 
-		Lpos = Ldata.pos;
-		Rpos = Rdata.pos;
-
 		/*
 		 * Find matches by distance, WEP_GETPOS() is needed because
 		 * ExecPhraseData->data can point to the tsvector's WordEntryPosVector
 		 */
 
+		Rpos = Rdata.pos;
+		LposStart = Ldata.pos;
 		while (Rpos < Rdata.pos + Rdata.npos)
 		{
+			/*
+			 * We need to check all possible distances, so reset Lpos
+			 * to a position guaranteed not to be satisfied yet.
+			 */
+			Lpos = LposStart;
 			while (Lpos < Ldata.pos + Ldata.npos)
 			{
-				if (WEP_GETPOS(*Lpos) <= WEP_GETPOS(*Rpos))
+				if (WEP_GETPOS(*Rpos) - WEP_GETPOS(*Lpos) ==
+					curitem->qoperator.distance)
 				{
-					/*
-					 * Lpos is behind the Rpos, so we have to check the
-					 * distance condition
-					 */
-					if (WEP_GETPOS(*Rpos) - WEP_GETPOS(*Lpos) <= curitem->qoperator.distance)
+					/* MATCH! */
+					if (data)
 					{
-						/* MATCH! */
-						if (data)
-						{
-							*pos_iter = WEP_GETPOS(*Rpos);
-							pos_iter++;
-
-							break;		/* We need to build a unique result
-										 * array, so go to the next Rpos */
-						}
-						else
-						{
-							/*
-							 * We are in the root of the phrase tree and hence
-							 * we don't have to store the resulting positions
-							 */
-							return true;
-						}
+						/* Store position for upper phrase operator */
+						*pos_iter = WEP_GETPOS(*Rpos);
+						pos_iter++;
+
+						/*
+						 * Advance the left start position to the next one,
+						 * because the current one cannot satisfy the distance
+						 * for any other right position
+						 */
+						LposStart = Lpos + 1;
+						break;
+					}
+					else
+					{
+						/*
+						 * We are in the root of the phrase tree and hence
+						 * we don't have to store the resulting positions
+						 */
+						return true;
 					}
+
 				}
-				else
+				else if (WEP_GETPOS(*Rpos) <= WEP_GETPOS(*Lpos) ||
+						 WEP_GETPOS(*Rpos) - WEP_GETPOS(*Lpos) <
+							curitem->qoperator.distance)
 				{
 					/*
-					 * Go to the next Rpos, because Lpos is ahead of the
-					 * current Rpos
+					 * Go to the next Rpos, because Lpos is ahead of Rpos or
+					 * closer than the distance required by the current operator
 					 */
 					break;
+
 				}
 
 				Lpos++;
diff --git a/src/test/regress/expected/tstypes.out b/src/test/regress/expected/tstypes.out
index 64d6de6050..781be70736 100644
--- a/src/test/regress/expected/tstypes.out
+++ b/src/test/regress/expected/tstypes.out
@@ -665,10 +665,10 @@ SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <-> 2' AS "true";
  t
 (1 row)
 
-SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <2> 2' AS "true";
- true 
-------
- t
+SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <2> 2' AS "false";
+ false 
+-------
+ f
 (1 row)
 
 SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <-> 3' AS "false";
@@ -683,6 +683,12 @@ SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <2> 3' AS "true";
  t
 (1 row)
 
+SELECT to_tsvector('simple', '1 2 1 2') @@ '1 <3> 2' AS "true";
+ true 
+------
+ t
+(1 row)
+
 SELECT to_tsvector('simple', '1 2 11 3') @@ '1 <-> 3' AS "false";
  false 
 -------
@@ -897,7 +903,7 @@ SELECT ts_rank_cd(' a:1 sa:2A sb:2D g'::tsvector, 'a <-> s:*');
 SELECT ts_rank_cd(' a:1 sa:2A sb:2D g'::tsvector, 'a <-> s:* <-> sa:A');
  ts_rank_cd 
 ------------
-  0.0714286
+          0
 (1 row)
 
 SELECT ts_rank_cd(' a:1 sa:2A sb:2D g'::tsvector, 'a <-> s:* <-> sa:B');
@@ -924,10 +930,10 @@ SELECT 'a:1 b:2'::tsvector @@ 'a <1> b'::tsquery AS "true";
  t
 (1 row)
 
-SELECT 'a:1 b:2'::tsvector @@ 'a <2> b'::tsquery AS "true";
- true 
-------
- t
+SELECT 'a:1 b:2'::tsvector @@ 'a <2> b'::tsquery AS "false";
+ false 
+-------
+ f
 (1 row)
 
 SELECT 'a:1 b:3'::tsvector @@ 'a <-> b'::tsquery AS "false";
@@ -954,10 +960,10 @@ SELECT 'a:1 b:3'::tsvector @@ 'a <2> b'::tsquery AS "true";
  t
 (1 row)
 
-SELECT 'a:1 b:3'::tsvector @@ 'a <3> b'::tsquery AS "true";
- true 
-------
- t
+SELECT 'a:1 b:3'::tsvector @@ 'a <3> b'::tsquery AS "false";
+ false 
+-------
+ f
 (1 row)
 
 -- tsvector editing operations
diff --git a/src/test/regress/sql/tstypes.sql b/src/test/regress/sql/tstypes.sql
index 738ec824b9..abcf1504ce 100644
--- a/src/test/regress/sql/tstypes.sql
+++ b/src/test/regress/sql/tstypes.sql
@@ -130,9 +130,10 @@ SELECT 'supeznova supernova'::tsvector @@ 'super:*'::tsquery AS "true";
 
 --phrase search
 SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <-> 2' AS "true";
-SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <2> 2' AS "true";
+SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <2> 2' AS "false";
 SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <-> 3' AS "false";
 SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <2> 3' AS "true";
+SELECT to_tsvector('simple', '1 2 1 2') @@ '1 <3> 2' AS "true";
 
 SELECT to_tsvector('simple', '1 2 11 3') @@ '1 <-> 3' AS "false";
 SELECT to_tsvector('simple', '1 2 11 3') @@ '1:* <-> 3' AS "true";
@@ -180,12 +181,12 @@ SELECT ts_rank_cd(' a:1 sa:2A sb:2D g'::tsvector, 'a <-> s:* <-> sa:B');
 SELECT 'a:1 b:2'::tsvector @@ 'a <-> b'::tsquery AS "true";
 SELECT 'a:1 b:2'::tsvector @@ 'a <0> b'::tsquery AS "false";
 SELECT 'a:1 b:2'::tsvector @@ 'a <1> b'::tsquery AS "true";
-SELECT 'a:1 b:2'::tsvector @@ 'a <2> b'::tsquery AS "true";
+SELECT 'a:1 b:2'::tsvector @@ 'a <2> b'::tsquery AS "false";
 SELECT 'a:1 b:3'::tsvector @@ 'a <-> b'::tsquery AS "false";
 SELECT 'a:1 b:3'::tsvector @@ 'a <0> b'::tsquery AS "false";
 SELECT 'a:1 b:3'::tsvector @@ 'a <1> b'::tsquery AS "false";
 SELECT 'a:1 b:3'::tsvector @@ 'a <2> b'::tsquery AS "true";
-SELECT 'a:1 b:3'::tsvector @@ 'a <3> b'::tsquery AS "true";
+SELECT 'a:1 b:3'::tsvector @@ 'a <3> b'::tsquery AS "false";
 
 -- tsvector editing operations