Fix strange behavior (and possible crashes) in full text phrase search.

author Tom Lane <tgl@sss.pgh.pa.us>

Wed, 21 Dec 2016 20:18:25 +0000 (15:18 -0500)

committer Tom Lane <tgl@sss.pgh.pa.us>

Wed, 21 Dec 2016 20:18:40 +0000 (15:18 -0500)
author Tom Lane <tgl@sss.pgh.pa.us>
Wed, 21 Dec 2016 20:18:25 +0000 (15:18 -0500)
committer Tom Lane <tgl@sss.pgh.pa.us>
Wed, 21 Dec 2016 20:18:40 +0000 (15:18 -0500)
diff --git a/doc/src/sgml/datatype.sgml b/doc/src/sgml/datatype.sgml

index 67d0c349e0c6a558ccc8ba2a7de6d2552ff4c832..464ce83d30e0b991616ba57dc2dda19db1d6a5ee 100644 (file)
--- a/doc/src/sgml/datatype.sgml
+++ b/doc/src/sgml/datatype.sgml
@@ -3959,15 +3959,7 @@ SELECT 'fat &amp; rat &amp; ! cat'::tsquery;
          tsquery         
  ------------------------
   'fat' &amp; 'rat' &amp; !'cat'
-
-SELECT '(fat | rat) &lt;-&gt; cat'::tsquery;
-              tsquery
------------------------------------
- 'fat' &lt;-&gt; 'cat' | 'rat' &lt;-&gt; 'cat'
  </programlisting>
-
-     The last example demonstrates that <type>tsquery</type> sometimes
-     rearranges nested operators into a logically equivalent formulation.
      </para>
  
      <para>
diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml

index 2da75955d0b5557e82d8783697346c204c60a52e..67e4901c921d8cf885beeb56e3b1fc6567a90615 100644 (file)
--- a/doc/src/sgml/textsearch.sgml
+++ b/doc/src/sgml/textsearch.sgml
@@ -264,7 +264,7 @@ SELECT 'fat &amp; cow'::tsquery @@ 'a fat cat sat on a mat and ate a fat rat'::t
      text, any more than a <type>tsvector</type> is.  A <type>tsquery</type>
      contains search terms, which must be already-normalized lexemes, and
      may combine multiple terms using AND, OR, NOT, and FOLLOWED BY operators.
-    (For details see <xref linkend="datatype-tsquery">.)  There are
+    (For syntax details see <xref linkend="datatype-tsquery">.)  There are
      functions <function>to_tsquery</>, <function>plainto_tsquery</>,
      and <function>phraseto_tsquery</>
      that are helpful in converting user-written text into a proper
@@ -323,6 +323,8 @@ text @@ text
      at least one of its arguments must appear, while the <literal>!</> (NOT)
      operator specifies that its argument must <emphasis>not</> appear in
      order to have a match.
+    For example, the query <literal>fat &amp; ! rat</> matches documents that
+    contain <literal>fat</> but not <literal>rat</>.
     </para>
  
     <para>
@@ -377,6 +379,28 @@ SELECT phraseto_tsquery('the cats ate the rats');
      then <literal>&amp;</literal>, then <literal>&lt;-&gt;</literal>,
      and <literal>!</literal> most tightly.
     </para>
+
+   <para>
+    It's worth noticing that the AND/OR/NOT operators mean something subtly
+    different when they are within the arguments of a FOLLOWED BY operator
+    than when they are not, because within FOLLOWED BY the exact position of
+    the match is significant.  For example, normally <literal>!x</> matches
+    only documents that do not contain <literal>x</> anywhere.
+    But <literal>!x &lt;-&gt; y</> matches <literal>y</> if it is not
+    immediately after an <literal>x</>; an occurrence of <literal>x</>
+    elsewhere in the document does not prevent a match.  Another example is
+    that <literal>x &amp; y</> normally only requires that <literal>x</>
+    and <literal>y</> both appear somewhere in the document, but
+    <literal>(x &amp; y) &lt;-&gt; z</> requires <literal>x</>
+    and <literal>y</> to match at the same place, immediately before
+    a <literal>z</>.  Thus this query behaves differently from
+    <literal>x &lt;-&gt; z &amp; y &lt;-&gt; z</>, which will match a
+    document containing two separate sequences <literal>x z</> and
+    <literal>y z</>.  (This specific query is useless as written,
+    since <literal>x</> and <literal>y</> could not match at the same place;
+    but with more complex situations such as prefix-match patterns, a query
+    of this form could be useful.)
+   </para>
    </sect2>
  
    <sect2 id="textsearch-intro-configurations">
diff --git a/src/backend/utils/adt/tsginidx.c b/src/backend/utils/adt/tsginidx.c

index efc111e379c6dfd62bb6aff3c91227fdf27c5a70..3e0a44459acd301bb0dfbced5568944fb34d5e73 100644 (file)
--- a/src/backend/utils/adt/tsginidx.c
+++ b/src/backend/utils/adt/tsginidx.c
@@ -212,7 +212,7 @@ checkcondition_gin(void *checkval, QueryOperand *val, ExecPhraseData *data)
   * Evaluate tsquery boolean expression using ternary logic.
   */
  static GinTernaryValue
-TS_execute_ternary(GinChkVal *gcv, QueryItem *curitem)
+TS_execute_ternary(GinChkVal *gcv, QueryItem *curitem, bool in_phrase)
  {
         GinTernaryValue val1,
                                 val2,
@@ -230,7 +230,10 @@ TS_execute_ternary(GinChkVal *gcv, QueryItem *curitem)
         switch (curitem->qoperator.oper)
         {
                 case OP_NOT:
-                       result = TS_execute_ternary(gcv, curitem + 1);
+                       /* In phrase search, always return MAYBE since we lack positions */
+                       if (in_phrase)
+                               return GIN_MAYBE;
+                       result = TS_execute_ternary(gcv, curitem + 1, in_phrase);
                         if (result == GIN_MAYBE)
                                 return result;
                         return !result;
@@ -238,17 +241,21 @@ TS_execute_ternary(GinChkVal *gcv, QueryItem *curitem)
                 case OP_PHRASE:
  
                         /*
-                        * GIN doesn't contain any information about positions, treat
+                        * GIN doesn't contain any information about positions, so treat
                          * OP_PHRASE as OP_AND with recheck requirement
                          */
-                       *gcv->need_recheck = true;
+                       *(gcv->need_recheck) = true;
+                       /* Pass down in_phrase == true in case there's a NOT below */
+                       in_phrase = true;
+
                         /* FALL THRU */
  
                 case OP_AND:
-                       val1 = TS_execute_ternary(gcv, curitem + curitem->qoperator.left);
+                       val1 = TS_execute_ternary(gcv, curitem + curitem->qoperator.left,
+                                                                         in_phrase);
                         if (val1 == GIN_FALSE)
                                 return GIN_FALSE;
-                       val2 = TS_execute_ternary(gcv, curitem + 1);
+                       val2 = TS_execute_ternary(gcv, curitem + 1, in_phrase);
                         if (val2 == GIN_FALSE)
                                 return GIN_FALSE;
                         if (val1 == GIN_TRUE && val2 == GIN_TRUE)
@@ -257,10 +264,11 @@ TS_execute_ternary(GinChkVal *gcv, QueryItem *curitem)
                                 return GIN_MAYBE;
  
                 case OP_OR:
-                       val1 = TS_execute_ternary(gcv, curitem + curitem->qoperator.left);
+                       val1 = TS_execute_ternary(gcv, curitem + curitem->qoperator.left,
+                                                                         in_phrase);
                         if (val1 == GIN_TRUE)
                                 return GIN_TRUE;
-                       val2 = TS_execute_ternary(gcv, curitem + 1);
+                       val2 = TS_execute_ternary(gcv, curitem + 1, in_phrase);
                         if (val2 == GIN_TRUE)
                                 return GIN_TRUE;
                         if (val1 == GIN_FALSE && val2 == GIN_FALSE)
@@ -307,7 +315,7 @@ gin_tsquery_consistent(PG_FUNCTION_ARGS)
  
                 res = TS_execute(GETQUERY(query),
                                                  &gcv,
-                                                TS_EXEC_CALC_NOT | TS_EXEC_PHRASE_AS_AND,
+                                                TS_EXEC_CALC_NOT | TS_EXEC_PHRASE_NO_POS,
                                                  checkcondition_gin);
         }
  
@@ -343,7 +351,7 @@ gin_tsquery_triconsistent(PG_FUNCTION_ARGS)
                 gcv.map_item_operand = (int *) (extra_data[0]);
                 gcv.need_recheck = &recheck;
  
-               res = TS_execute_ternary(&gcv, GETQUERY(query));
+               res = TS_execute_ternary(&gcv, GETQUERY(query), false);
  
                 if (res == GIN_TRUE && recheck)
                         res = GIN_MAYBE;
diff --git a/src/backend/utils/adt/tsgistidx.c b/src/backend/utils/adt/tsgistidx.c

index 6cdfb13f6d12221e309bc536a1ba71235141f75b..a4c2bb9cec40d7d4d26c5b4a2097892d872e73e0 100644 (file)
--- a/src/backend/utils/adt/tsgistidx.c
+++ b/src/backend/utils/adt/tsgistidx.c
@@ -359,12 +359,11 @@ gtsvector_consistent(PG_FUNCTION_ARGS)
                 if (ISALLTRUE(key))
                         PG_RETURN_BOOL(true);
  
-               PG_RETURN_BOOL(TS_execute(
-                                                                 GETQUERY(query),
+               /* since signature is lossy, cannot specify CALC_NOT here */
+               PG_RETURN_BOOL(TS_execute(GETQUERY(query),
                                                                   (void *) GETSIGN(key),
-                                                                 TS_EXEC_PHRASE_AS_AND,
-                                                                 checkcondition_bit
-                                                                 ));
+                                                                 TS_EXEC_PHRASE_NO_POS,
+                                                                 checkcondition_bit));
         }
         else
         {                                                       /* only leaf pages */
@@ -372,12 +371,10 @@ gtsvector_consistent(PG_FUNCTION_ARGS)
  
                 chkval.arrb = GETARR(key);
                 chkval.arre = chkval.arrb + ARRNELEM(key);
-               PG_RETURN_BOOL(TS_execute(
-                                                                 GETQUERY(query),
+               PG_RETURN_BOOL(TS_execute(GETQUERY(query),
                                                                   (void *) &chkval,
-                                                                 TS_EXEC_PHRASE_AS_AND | TS_EXEC_CALC_NOT,
-                                                                 checkcondition_arr
-                                                                 ));
+                                                                 TS_EXEC_PHRASE_NO_POS | TS_EXEC_CALC_NOT,
+                                                                 checkcondition_arr));
         }
  }
  
diff --git a/src/backend/utils/adt/tsquery.c b/src/backend/utils/adt/tsquery.c

index 3d11a1c2080b89396a3bc0955eb0f6a4b8ee1298..f0bd52877f335753688926258c3edc4136f08523 100644 (file)
--- a/src/backend/utils/adt/tsquery.c
+++ b/src/backend/utils/adt/tsquery.c
@@ -557,13 +557,11 @@ findoprnd_recurse(QueryItem *ptr, uint32 *pos, int nnodes, bool *needcleanup)
                                    curitem->oper == OP_OR ||
                                    curitem->oper == OP_PHRASE);
  
-                       if (curitem->oper == OP_PHRASE)
-                               *needcleanup = true;    /* push OP_PHRASE down later */
-
                         (*pos)++;
  
                         /* process RIGHT argument */
                         findoprnd_recurse(ptr, pos, nnodes, needcleanup);
+
                         curitem->left = *pos - tmp; /* set LEFT arg's offset */
  
                         /* process LEFT argument */
@@ -574,8 +572,9 @@ findoprnd_recurse(QueryItem *ptr, uint32 *pos, int nnodes, bool *needcleanup)
  
  
  /*
- * Fills in the left-fields previously left unfilled. The input
- * QueryItems must be in polish (prefix) notation.
+ * Fill in the left-fields previously left unfilled.
+ * The input QueryItems must be in polish (prefix) notation.
+ * Also, set *needcleanup to true if there are any QI_VALSTOP nodes.
   */
  static void
  findoprnd(QueryItem *ptr, int size, bool *needcleanup)
@@ -687,15 +686,17 @@ parse_tsquery(char *buf,
         memcpy((void *) GETOPERAND(query), (void *) state.op, state.sumlen);
         pfree(state.op);
  
-       /* Set left operand pointers for every operator. */
+       /*
+        * Set left operand pointers for every operator.  While we're at it,
+        * detect whether there are any QI_VALSTOP nodes.
+        */
         findoprnd(ptr, query->size, &needcleanup);
  
         /*
-        * QI_VALSTOP nodes should be cleaned and OP_PHRASE should be pushed
-        * down
+        * If there are QI_VALSTOP nodes, delete them and simplify the tree.
          */
         if (needcleanup)
-               return cleanup_fakeval_and_phrase(query);
+               query = cleanup_tsquery_stopwords(query);
  
         return query;
  }
@@ -1088,6 +1089,9 @@ tsqueryrecv(PG_FUNCTION_ARGS)
          */
         findoprnd(item, size, &needcleanup);
  
+       /* Can't have found any QI_VALSTOP nodes */
+       Assert(!needcleanup);
+
         /* Copy operands to output struct */
         for (i = 0; i < size; i++)
         {
@@ -1105,9 +1109,6 @@ tsqueryrecv(PG_FUNCTION_ARGS)
  
         SET_VARSIZE(query, len + datalen);
  
-       if (needcleanup)
-               PG_RETURN_TSQUERY(cleanup_fakeval_and_phrase(query));
-
         PG_RETURN_TSQUERY(query);
  }
  
diff --git a/src/backend/utils/adt/tsquery_cleanup.c b/src/backend/utils/adt/tsquery_cleanup.c

index 330664da6391d5adc686a545d423648f1d8ae904..c10c7ef0aa5ffffdcdd8d8392ade11eee5ec68ed 100644 (file)
--- a/src/backend/utils/adt/tsquery_cleanup.c
+++ b/src/backend/utils/adt/tsquery_cleanup.c
@@ -25,19 +25,6 @@ typedef struct NODE
         QueryItem  *valnode;
  } NODE;
  
-/*
- * To simplify walking on query tree and pushing down of phrase operator
- * we define some fake priority here: phrase operator has highest priority
- * of any other operators (and we believe here that OP_PHRASE is a highest
- * code of operations) and value node has ever highest priority.
- * Priority values of other operations don't matter until they are less than
- * phrase operator and value node.
- */
-#define VALUE_PRIORITY                 (OP_COUNT + 1)
-#define NODE_PRIORITY(x) \
-       ( ((x)->valnode->qoperator.type == QI_OPR) ? \
-               (x)->valnode->qoperator.oper : VALUE_PRIORITY )
-
  /*
   * make query tree from plain view of query
   */
@@ -368,227 +355,6 @@ clean_stopword_intree(NODE *node, int *ladd, int *radd)
         return node;
  }
  
-static NODE *
-copyNODE(NODE *node)
-{
-       NODE       *cnode = palloc(sizeof(NODE));
-
-       /* since this function recurses, it could be driven to stack overflow. */
-       check_stack_depth();
-
-       cnode->valnode = palloc(sizeof(QueryItem));
-       *(cnode->valnode) = *(node->valnode);
-
-       if (node->valnode->type == QI_OPR)
-       {
-               cnode->right = copyNODE(node->right);
-               if (node->valnode->qoperator.oper != OP_NOT)
-                       cnode->left = copyNODE(node->left);
-       }
-
-       return cnode;
-}
-
-static NODE *
-makeNODE(int8 op, NODE *left, NODE *right)
-{
-       NODE       *node = palloc(sizeof(NODE));
-
-       /* zeroing allocation to prevent difference in unused bytes */
-       node->valnode = palloc0(sizeof(QueryItem));
-
-       node->valnode->qoperator.type = QI_OPR;
-       node->valnode->qoperator.oper = op;
-
-       node->left = left;
-       node->right = right;
-
-       return node;
-}
-
-/*
- * Move operation with high priority to the leaves. This guarantees
- * that the phrase operator will be near the bottom of the tree.
- * An idea behind is do not store position of lexemes during execution
- * of ordinary operations (AND, OR, NOT) because it could be expensive.
- * Actual transformation will be performed only on subtrees under the
- * <-> (<n>) operation since it's needed solely for the phrase operator.
- *
- * Rules:
- *       a  <->  (b | c)       =>      (a <-> b)  |   (a <-> c)
- *      (a | b)  <->    c         =>   (a <-> c)  |   (b <-> c)
- *       a  <->        !b         =>           a         &  !(a <-> b)
- *      !a  <->         b         =>           b         &  !(a <-> b)
- *
- * Warnings for readers:
- *               a <-> b          !=      b <-> a
- *
- *       a <n> (b <n> c)       !=       (a <n> b) <n> c since the phrase lengths are:
- *                      n                                      2n-1
- */
-static NODE *
-normalize_phrase_tree(NODE *node)
-{
-       /* there should be no stop words at this point */
-       Assert(node->valnode->type != QI_VALSTOP);
-
-       if (node->valnode->type == QI_VAL)
-               return node;
-
-       /* since this function recurses, it could be driven to stack overflow. */
-       check_stack_depth();
-
-       Assert(node->valnode->type == QI_OPR);
-
-       if (node->valnode->qoperator.oper == OP_NOT)
-       {
-               NODE       *orignode = node;
-
-               /* eliminate NOT sequence */
-               while (node->valnode->type == QI_OPR &&
-               node->valnode->qoperator.oper == node->right->valnode->qoperator.oper)
-               {
-                       node = node->right->right;
-               }
-
-               if (orignode != node)
-                       /* current node isn't checked yet */
-                       node = normalize_phrase_tree(node);
-               else
-                       node->right = normalize_phrase_tree(node->right);
-       }
-       else if (node->valnode->qoperator.oper == OP_PHRASE)
-       {
-               int16           distance;
-               NODE       *X;
-
-               node->left = normalize_phrase_tree(node->left);
-               node->right = normalize_phrase_tree(node->right);
-
-               /*
-                * if subtree contains only nodes with higher "priority" then we are
-                * done. See comment near NODE_PRIORITY()
-                */
-               if (NODE_PRIORITY(node) <= NODE_PRIORITY(node->right) &&
-                       NODE_PRIORITY(node) <= NODE_PRIORITY(node->left))
-                       return node;
-
-               /*
-                * We can't swap left-right and works only with left child because of
-                * a <-> b      !=      b <-> a
-                */
-
-               distance = node->valnode->qoperator.distance;
-
-               if (node->right->valnode->type == QI_OPR)
-               {
-                       switch (node->right->valnode->qoperator.oper)
-                       {
-                               case OP_AND:
-                                       /* a <-> (b & c)  =>  (a <-> b) & (a <-> c) */
-                                       node = makeNODE(OP_AND,
-                                                                       makeNODE(OP_PHRASE,
-                                                                                        node->left,
-                                                                                        node->right->left),
-                                                                       makeNODE(OP_PHRASE,
-                                                                                        copyNODE(node->left),
-                                                                                        node->right->right));
-                                       node->left->valnode->qoperator.distance =
-                                               node->right->valnode->qoperator.distance = distance;
-                                       break;
-                               case OP_OR:
-                                       /* a <-> (b | c)  =>  (a <-> b) | (a <-> c) */
-                                       node = makeNODE(OP_OR,
-                                                                       makeNODE(OP_PHRASE,
-                                                                                        node->left,
-                                                                                        node->right->left),
-                                                                       makeNODE(OP_PHRASE,
-                                                                                        copyNODE(node->left),
-                                                                                        node->right->right));
-                                       node->left->valnode->qoperator.distance =
-                                               node->right->valnode->qoperator.distance = distance;
-                                       break;
-                               case OP_NOT:
-                                       /* a <-> !b  =>  a & !(a <-> b) */
-                                       X = node->right;
-                                       node->right = node->right->right;
-                                       X->right = node;
-                                       node = makeNODE(OP_AND,
-                                                                       copyNODE(node->left),
-                                                                       X);
-                                       break;
-                               case OP_PHRASE:
-                                       /* no-op */
-                                       break;
-                               default:
-                                       elog(ERROR, "Wrong type of tsquery node: %d",
-                                                node->right->valnode->qoperator.oper);
-                       }
-               }
-
-               if (node->left->valnode->type == QI_OPR &&
-                       node->valnode->qoperator.oper == OP_PHRASE)
-               {
-                       /*
-                        * if the node is still OP_PHRASE, check the left subtree,
-                        * otherwise the whole node will be transformed later.
-                        */
-                       switch (node->left->valnode->qoperator.oper)
-                       {
-                               case OP_AND:
-                                       /* (a & b) <-> c  =>  (a <-> c) & (b <-> c) */
-                                       node = makeNODE(OP_AND,
-                                                                       makeNODE(OP_PHRASE,
-                                                                                        node->left->left,
-                                                                                        node->right),
-                                                                       makeNODE(OP_PHRASE,
-                                                                                        node->left->right,
-                                                                                        copyNODE(node->right)));
-                                       node->left->valnode->qoperator.distance =
-                                               node->right->valnode->qoperator.distance = distance;
-                                       break;
-                               case OP_OR:
-                                       /* (a | b) <-> c  =>  (a <-> c) | (b <-> c) */
-                                       node = makeNODE(OP_OR,
-                                                                       makeNODE(OP_PHRASE,
-                                                                                        node->left->left,
-                                                                                        node->right),
-                                                                       makeNODE(OP_PHRASE,
-                                                                                        node->left->right,
-                                                                                        copyNODE(node->right)));
-                                       node->left->valnode->qoperator.distance =
-                                               node->right->valnode->qoperator.distance = distance;
-                                       break;
-                               case OP_NOT:
-                                       /* !a <-> b  =>  b & !(a <-> b) */
-                                       X = node->left;
-                                       node->left = node->left->right;
-                                       X->right = node;
-                                       node = makeNODE(OP_AND,
-                                                                       X,
-                                                                       copyNODE(node->right));
-                                       break;
-                               case OP_PHRASE:
-                                       /* no-op */
-                                       break;
-                               default:
-                                       elog(ERROR, "Wrong type of tsquery node: %d",
-                                                node->left->valnode->qoperator.oper);
-                       }
-               }
-
-               /* continue transformation */
-               node = normalize_phrase_tree(node);
-       }
-       else    /* AND or OR */
-       {
-               node->left = normalize_phrase_tree(node->left);
-               node->right = normalize_phrase_tree(node->right);
-       }
-
-       return node;
-}
-
  /*
   * Number of elements in query tree
   */
@@ -613,8 +379,11 @@ calcstrlen(NODE *node)
         return size;
  }
  
+/*
+ * Remove QI_VALSTOP (stopword) nodes from TSQuery.
+ */
  TSQuery
-cleanup_fakeval_and_phrase(TSQuery in)
+cleanup_tsquery_stopwords(TSQuery in)
  {
         int32           len,
                                 lenstr,
@@ -642,9 +411,6 @@ cleanup_fakeval_and_phrase(TSQuery in)
                 return out;
         }
  
-       /* push OP_PHRASE nodes down */
-       root = normalize_phrase_tree(root);
-
         /*
          * Build TSQuery from plain view
          */
diff --git a/src/backend/utils/adt/tsquery_op.c b/src/backend/utils/adt/tsquery_op.c

index a574b4b2573d602ea91b05256449e836a1dc0060..8f90ce99e0d1f264021b3b75c5750a591aa2999e 100644 (file)
--- a/src/backend/utils/adt/tsquery_op.c
+++ b/src/backend/utils/adt/tsquery_op.c
@@ -104,7 +104,7 @@ tsquery_or(PG_FUNCTION_ARGS)
         PG_FREE_IF_COPY(a, 0);
         PG_FREE_IF_COPY(b, 1);
  
-       PG_RETURN_POINTER(query);
+       PG_RETURN_TSQUERY(query);
  }
  
  Datum
@@ -140,7 +140,7 @@ tsquery_phrase_distance(PG_FUNCTION_ARGS)
         PG_FREE_IF_COPY(a, 0);
         PG_FREE_IF_COPY(b, 1);
  
-       PG_RETURN_POINTER(cleanup_fakeval_and_phrase(query));
+       PG_RETURN_TSQUERY(query);
  }
  
  Datum
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c

index 2145688f90b3a8bc05fcb1b148d86a4ef8bfc7ce..7af4a3bfc18478900eac32c0e0a612250531326a 100644 (file)
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -11,9 +11,10 @@
   *
   *-------------------------------------------------------------------------
   */
-
  #include "postgres.h"
  
+#include <limits.h>
+
  #include "access/htup_details.h"
  #include "catalog/namespace.h"
  #include "catalog/pg_type.h"
@@ -1404,148 +1405,395 @@ checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data)
         return res;
  }
  
+/*
+ * Compute output position list for a tsquery operator in phrase mode.
+ *
+ * Merge the position lists in Ldata and Rdata as specified by "emit",
+ * returning the result list into *data.  The input position lists must be
+ * sorted and unique, and the output will be as well.
+ *
+ * data: pointer to initially-all-zeroes output struct, or NULL
+ * Ldata, Rdata: input position lists
+ * emit: bitmask of TSPO_XXX flags
+ * Loffset: offset to be added to Ldata positions before comparing/outputting
+ * Roffset: offset to be added to Rdata positions before comparing/outputting
+ * max_npos: maximum possible required size of output position array
+ *
+ * Loffset and Roffset should not be negative, else we risk trying to output
+ * negative positions, which won't fit into WordEntryPos.
+ *
+ * Returns true if any positions were emitted to *data; or if data is NULL,
+ * returns true if any positions would have been emitted.
+ */
+#define TSPO_L_ONLY            0x01    /* emit positions appearing only in L */
+#define TSPO_R_ONLY            0x02    /* emit positions appearing only in R */
+#define TSPO_BOTH              0x04    /* emit positions appearing in both L&R */
+
+static bool
+TS_phrase_output(ExecPhraseData *data,
+                                ExecPhraseData *Ldata,
+                                ExecPhraseData *Rdata,
+                                int emit,
+                                int Loffset,
+                                int Roffset,
+                                int max_npos)
+{
+       int                     Lindex,
+                               Rindex;
+
+       /* Loop until both inputs are exhausted */
+       Lindex = Rindex = 0;
+       while (Lindex < Ldata->npos || Rindex < Rdata->npos)
+       {
+               int                     Lpos,
+                                       Rpos;
+               int                     output_pos = 0;
+
+               /*
+                * Fetch current values to compare.  WEP_GETPOS() is needed because
+                * ExecPhraseData->data can point to a tsvector's WordEntryPosVector.
+                */
+               if (Lindex < Ldata->npos)
+                       Lpos = WEP_GETPOS(Ldata->pos[Lindex]) + Loffset;
+               else
+               {
+                       /* L array exhausted, so we're done if R_ONLY isn't set */
+                       if (!(emit & TSPO_R_ONLY))
+                               break;
+                       Lpos = INT_MAX;
+               }
+               if (Rindex < Rdata->npos)
+                       Rpos = WEP_GETPOS(Rdata->pos[Rindex]) + Roffset;
+               else
+               {
+                       /* R array exhausted, so we're done if L_ONLY isn't set */
+                       if (!(emit & TSPO_L_ONLY))
+                               break;
+                       Rpos = INT_MAX;
+               }
+
+               /* Merge-join the two input lists */
+               if (Lpos < Rpos)
+               {
+                       /* Lpos is not matched in Rdata, should we output it? */
+                       if (emit & TSPO_L_ONLY)
+                               output_pos = Lpos;
+                       Lindex++;
+               }
+               else if (Lpos == Rpos)
+               {
+                       /* Lpos and Rpos match ... should we output it? */
+                       if (emit & TSPO_BOTH)
+                               output_pos = Rpos;
+                       Lindex++;
+                       Rindex++;
+               }
+               else    /* Lpos > Rpos */
+               {
+                       /* Rpos is not matched in Ldata, should we output it? */
+                       if (emit & TSPO_R_ONLY)
+                               output_pos = Rpos;
+                       Rindex++;
+               }
+
+               if (output_pos > 0)
+               {
+                       if (data)
+                       {
+                               /* Store position, first allocating output array if needed */
+                               if (data->pos == NULL)
+                               {
+                                       data->pos = (WordEntryPos *)
+                                               palloc(max_npos * sizeof(WordEntryPos));
+                                       data->allocated = true;
+                               }
+                               data->pos[data->npos++] = output_pos;
+                       }
+                       else
+                       {
+                               /*
+                                * Exact positions not needed, so return true as soon as we
+                                * know there is at least one.
+                                */
+                               return true;
+                       }
+               }
+       }
+
+       if (data && data->npos > 0)
+       {
+               /* Let's assert we didn't overrun the array */
+               Assert(data->npos <= max_npos);
+               return true;
+       }
+       return false;
+}
+
  /*
   * Execute tsquery at or below an OP_PHRASE operator.
   *
- * This handles the recursion at levels where we need to care about
- * match locations.  In addition to the same arguments used for TS_execute,
- * the caller may pass a preinitialized-to-zeroes ExecPhraseData struct to
- * be filled with lexeme match positions on success.  data == NULL if no
- * match data need be returned.  (In practice, outside callers pass NULL,
- * and only the internal recursion cases pass a data pointer.)
+ * This handles tsquery execution at recursion levels where we need to care
+ * about match locations.
+ *
+ * In addition to the same arguments used for TS_execute, the caller may pass
+ * a preinitialized-to-zeroes ExecPhraseData struct, to be filled with lexeme
+ * match position info on success.  data == NULL if no position data need be
+ * returned.  (In practice, outside callers pass NULL, and only the internal
+ * recursion cases pass a data pointer.)
+ * Note: the function assumes data != NULL for operators other than OP_PHRASE.
+ * This is OK because an outside call always starts from an OP_PHRASE node.
+ *
+ * The detailed semantics of the match data, given that the function returned
+ * "true" (successful match, or possible match), are:
+ *
+ * npos > 0, negate = false:
+ *      query is matched at specified position(s) (and only those positions)
+ * npos > 0, negate = true:
+ *      query is matched at all positions *except* specified position(s)
+ * npos = 0, negate = false:
+ *      query is possibly matched, matching position(s) are unknown
+ *      (this should only be returned when TS_EXEC_PHRASE_NO_POS flag is set)
+ * npos = 0, negate = true:
+ *      query is matched at all positions
+ *
+ * Successful matches also return a "width" value which is the match width in
+ * lexemes, less one.  Hence, "width" is zero for simple one-lexeme matches,
+ * and is the sum of the phrase operator distances for phrase matches.  Note
+ * that when width > 0, the listed positions represent the ends of matches not
+ * the starts.  (This unintuitive rule is needed to avoid possibly generating
+ * negative positions, which wouldn't fit into the WordEntryPos arrays.)
+ *
+ * When the function returns "false" (no match), it must return npos = 0,
+ * negate = false (which is the state initialized by the caller); but the
+ * "width" output in such cases is undefined.
   */
  static bool
  TS_phrase_execute(QueryItem *curitem, void *arg, uint32 flags,
-                                 ExecPhraseData *data,
-                                 TSExecuteCallback chkcond)
+                                 TSExecuteCallback chkcond,
+                                 ExecPhraseData *data)
  {
+       ExecPhraseData Ldata,
+                               Rdata;
+       bool            lmatch,
+                               rmatch;
+       int                     Loffset,
+                               Roffset,
+                               maxwidth;
+
         /* since this function recurses, it could be driven to stack overflow */
         check_stack_depth();
  
         if (curitem->type == QI_VAL)
-       {
                 return chkcond(arg, (QueryOperand *) curitem, data);
-       }
-       else
+
+       switch (curitem->qoperator.oper)
         {
-               ExecPhraseData Ldata = {0, false, NULL},
-                                       Rdata = {0, false, NULL};
-               WordEntryPos *Lpos,
-                                  *LposStart,
-                                  *Rpos,
-                                  *pos_iter = NULL;
+               case OP_NOT:
  
-               Assert(curitem->qoperator.oper == OP_PHRASE);
+                       /*
+                        * Because a "true" result with no specific positions is taken as
+                        * uncertain, we need no special care here for !TS_EXEC_CALC_NOT.
+                        * If it's a false positive, the right things happen anyway.
+                        *
+                        * Also, we need not touch data->width, since a NOT operation does
+                        * not change the match width.
+                        */
+                       if (TS_phrase_execute(curitem + 1, arg, flags, chkcond, data))
+                       {
+                               if (data->npos > 0)
+                               {
+                                       /* we have some positions, invert negate flag */
+                                       data->negate = !data->negate;
+                                       return true;
+                               }
+                               else if (data->negate)
+                               {
+                                       /* change "match everywhere" to "match nowhere" */
+                                       data->negate = false;
+                                       return false;
+                               }
+                               /* match positions are, and remain, uncertain */
+                               return true;
+                       }
+                       else
+                       {
+                               /* change "match nowhere" to "match everywhere" */
+                               Assert(data->npos == 0 && !data->negate);
+                               data->negate = true;
+                               return true;
+                       }
  
-               if (!TS_phrase_execute(curitem + curitem->qoperator.left,
-                                                          arg, flags, &Ldata, chkcond))
-                       return false;
+               case OP_PHRASE:
+               case OP_AND:
+                       memset(&Ldata, 0, sizeof(Ldata));
+                       memset(&Rdata, 0, sizeof(Rdata));
  
-               if (!TS_phrase_execute(curitem + 1, arg, flags, &Rdata, chkcond))
-                       return false;
+                       if (!TS_phrase_execute(curitem + curitem->qoperator.left,
+                                                                  arg, flags, chkcond, &Ldata))
+                               return false;
  
-               /*
-                * If either operand has no position information, then we normally
-                * return false.  But if TS_EXEC_PHRASE_AS_AND flag is set then we
-                * return true, treating OP_PHRASE as if it were OP_AND.
-                */
-               if (Ldata.npos == 0 || Rdata.npos == 0)
-                       return (flags & TS_EXEC_PHRASE_AS_AND) ? true : false;
+                       if (!TS_phrase_execute(curitem + 1,
+                                                                  arg, flags, chkcond, &Rdata))
+                               return false;
  
-               /*
-                * Prepare output position array if needed.
-                */
-               if (data)
-               {
                         /*
-                        * We can recycle the righthand operand's result array if it was
-                        * palloc'd, else must allocate our own.  The number of matches
-                        * couldn't be more than the smaller of the two operands' matches.
+                        * If either operand has no position information, then we can't
+                        * return position data, only a "possible match" result. "Possible
+                        * match" answers are only wanted when TS_EXEC_PHRASE_NO_POS flag
+                        * is set, otherwise return false.
                          */
-                       if (!Rdata.allocated)
-                               data->pos = palloc(sizeof(WordEntryPos) * Min(Ldata.npos, Rdata.npos));
-                       else
-                               data->pos = Rdata.pos;
+                       if ((Ldata.npos == 0 && !Ldata.negate) ||
+                               (Rdata.npos == 0 && !Rdata.negate))
+                               return (flags & TS_EXEC_PHRASE_NO_POS) ? true : false;
  
-                       data->allocated = true;
-                       data->npos = 0;
-                       pos_iter = data->pos;
-               }
+                       if (curitem->qoperator.oper == OP_PHRASE)
+                       {
+                               /*
+                                * Compute Loffset and Roffset suitable for phrase match, and
+                                * compute overall width of whole phrase match.
+                                */
+                               Loffset = curitem->qoperator.distance + Rdata.width;
+                               Roffset = 0;
+                               if (data)
+                                       data->width = curitem->qoperator.distance +
+                                               Ldata.width + Rdata.width;
+                       }
+                       else
+                       {
+                               /*
+                                * For OP_AND, set output width and alignment like OP_OR (see
+                                * comment below)
+                                */
+                               maxwidth = Max(Ldata.width, Rdata.width);
+                               Loffset = maxwidth - Ldata.width;
+                               Roffset = maxwidth - Rdata.width;
+                               if (data)
+                                       data->width = maxwidth;
+                       }
  
-               /*
-                * Find matches by distance.  WEP_GETPOS() is needed because
-                * ExecPhraseData->data can point to a tsvector's WordEntryPosVector.
-                *
-                * Note that the output positions are those of the matching RIGHT
-                * operands.
-                */
-               Rpos = Rdata.pos;
-               LposStart = Ldata.pos;
-               while (Rpos < Rdata.pos + Rdata.npos)
-               {
-                       /*
-                        * We need to check all possible distances, so reset Lpos to
-                        * guaranteed not yet satisfied position.
-                        */
-                       Lpos = LposStart;
-                       while (Lpos < Ldata.pos + Ldata.npos)
+                       if (Ldata.negate && Rdata.negate)
                         {
-                               if (WEP_GETPOS(*Rpos) - WEP_GETPOS(*Lpos) ==
-                                       curitem->qoperator.distance)
-                               {
-                                       /* MATCH! */
-                                       if (data)
-                                       {
-                                               /* Store position for upper phrase operator */
-                                               *pos_iter = WEP_GETPOS(*Rpos);
-                                               pos_iter++;
-
-                                               /*
-                                                * Set left start position to next, because current
-                                                * one could not satisfy distance for any other right
-                                                * position
-                                                */
-                                               LposStart = Lpos + 1;
-                                               break;
-                                       }
-                                       else
-                                       {
-                                               /*
-                                                * We are at the root of the phrase tree and hence we
-                                                * don't have to identify all the match positions.
-                                                * Just report success.
-                                                */
-                                               return true;
-                                       }
+                               /* !L & !R: treat as !(L | R) */
+                               (void) TS_phrase_output(data, &Ldata, &Rdata,
+                                                                               TSPO_BOTH | TSPO_L_ONLY | TSPO_R_ONLY,
+                                                                               Loffset, Roffset,
+                                                                               Ldata.npos + Rdata.npos);
+                               if (data)
+                                       data->negate = true;
+                               return true;
+                       }
+                       else if (Ldata.negate)
+                       {
+                               /* !L & R */
+                               return TS_phrase_output(data, &Ldata, &Rdata,
+                                                                               TSPO_R_ONLY,
+                                                                               Loffset, Roffset,
+                                                                               Rdata.npos);
+                       }
+                       else if (Rdata.negate)
+                       {
+                               /* L & !R */
+                               return TS_phrase_output(data, &Ldata, &Rdata,
+                                                                               TSPO_L_ONLY,
+                                                                               Loffset, Roffset,
+                                                                               Ldata.npos);
+                       }
+                       else
+                       {
+                               /* straight AND */
+                               return TS_phrase_output(data, &Ldata, &Rdata,
+                                                                               TSPO_BOTH,
+                                                                               Loffset, Roffset,
+                                                                               Min(Ldata.npos, Rdata.npos));
+                       }
  
-                               }
-                               else if (WEP_GETPOS(*Rpos) <= WEP_GETPOS(*Lpos) ||
-                                                WEP_GETPOS(*Rpos) - WEP_GETPOS(*Lpos) <
-                                                curitem->qoperator.distance)
-                               {
-                                       /*
-                                        * Go to the next Rpos, because Lpos is ahead or on less
-                                        * distance than required by current operator
-                                        */
-                                       break;
+               case OP_OR:
+                       memset(&Ldata, 0, sizeof(Ldata));
+                       memset(&Rdata, 0, sizeof(Rdata));
  
-                               }
+                       lmatch = TS_phrase_execute(curitem + curitem->qoperator.left,
+                                                                          arg, flags, chkcond, &Ldata);
+                       rmatch = TS_phrase_execute(curitem + 1,
+                                                                          arg, flags, chkcond, &Rdata);
  
-                               Lpos++;
-                       }
+                       if (!lmatch && !rmatch)
+                               return false;
  
-                       Rpos++;
-               }
+                       /*
+                        * If a valid operand has no position information, then we can't
+                        * return position data, only a "possible match" result. "Possible
+                        * match" answers are only wanted when TS_EXEC_PHRASE_NO_POS flag
+                        * is set, otherwise return false.
+                        */
+                       if ((lmatch && Ldata.npos == 0 && !Ldata.negate) ||
+                               (rmatch && Rdata.npos == 0 && !Rdata.negate))
+                               return (flags & TS_EXEC_PHRASE_NO_POS) ? true : false;
  
-               if (data)
-               {
-                       data->npos = pos_iter - data->pos;
+                       /*
+                        * Cope with undefined output width from failed submatch.  (This
+                        * takes less code than trying to ensure that all failure returns
+                        * set data->width to zero.)
+                        */
+                       if (!lmatch)
+                               Ldata.width = 0;
+                       if (!rmatch)
+                               Rdata.width = 0;
  
-                       if (data->npos > 0)
+                       /*
+                        * For OP_AND and OP_OR, report the width of the wider of the two
+                        * inputs, and align the narrower input's positions to the right
+                        * end of that width.  This rule deals at least somewhat
+                        * reasonably with cases like "x <-> (y | z <-> q)".
+                        */
+                       maxwidth = Max(Ldata.width, Rdata.width);
+                       Loffset = maxwidth - Ldata.width;
+                       Roffset = maxwidth - Rdata.width;
+                       data->width = maxwidth;
+
+                       if (Ldata.negate && Rdata.negate)
+                       {
+                               /* !L | !R: treat as !(L & R) */
+                               (void) TS_phrase_output(data, &Ldata, &Rdata,
+                                                                               TSPO_BOTH,
+                                                                               Loffset, Roffset,
+                                                                               Min(Ldata.npos, Rdata.npos));
+                               data->negate = true;
                                 return true;
-               }
+                       }
+                       else if (Ldata.negate)
+                       {
+                               /* !L | R: treat as !(L & !R) */
+                               (void) TS_phrase_output(data, &Ldata, &Rdata,
+                                                                               TSPO_L_ONLY,
+                                                                               Loffset, Roffset,
+                                                                               Ldata.npos);
+                               data->negate = true;
+                               return true;
+                       }
+                       else if (Rdata.negate)
+                       {
+                               /* L | !R: treat as !(!L & R) */
+                               (void) TS_phrase_output(data, &Ldata, &Rdata,
+                                                                               TSPO_R_ONLY,
+                                                                               Loffset, Roffset,
+                                                                               Rdata.npos);
+                               data->negate = true;
+                               return true;
+                       }
+                       else
+                       {
+                               /* straight OR */
+                               return TS_phrase_output(data, &Ldata, &Rdata,
+                                                                               TSPO_BOTH | TSPO_L_ONLY | TSPO_R_ONLY,
+                                                                               Loffset, Roffset,
+                                                                               Ldata.npos + Rdata.npos);
+                       }
+
+               default:
+                       elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper);
         }
  
+       /* not reachable, but keep compiler quiet */
         return false;
  }
  
@@ -1594,12 +1842,7 @@ TS_execute(QueryItem *curitem, void *arg, uint32 flags,
                                 return TS_execute(curitem + 1, arg, flags, chkcond);
  
                 case OP_PHRASE:
-
-                       /*
-                        * do not check TS_EXEC_PHRASE_AS_AND here because chkcond() could
-                        * do something more if it's called from TS_phrase_execute()
-                        */
-                       return TS_phrase_execute(curitem, arg, flags, NULL, chkcond);
+                       return TS_phrase_execute(curitem, arg, flags, chkcond, NULL);
  
                 default:
                         elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper);
diff --git a/src/include/tsearch/ts_utils.h b/src/include/tsearch/ts_utils.h

index 1fbd9838984581aa2368bef3adec3795c2db1186..d74853b65854eaa6da4ee5e11086366bcec5d572 100644 (file)
--- a/src/include/tsearch/ts_utils.h
+++ b/src/include/tsearch/ts_utils.h
@@ -113,8 +113,8 @@ extern text *generateHeadline(HeadlineParsedText *prs);
   * struct ExecPhraseData is passed to a TSExecuteCallback function if we need
   * lexeme position data (because of a phrase-match operator in the tsquery).
   * The callback should fill in position data when it returns true (success).
- * If it cannot return position data, it may ignore its "data" argument, but
- * then the caller of TS_execute() must pass the TS_EXEC_PHRASE_AS_AND flag
+ * If it cannot return position data, it may leave "data" unchanged, but
+ * then the caller of TS_execute() must pass the TS_EXEC_PHRASE_NO_POS flag
   * and must arrange for a later recheck with position data available.
   *
   * The reported lexeme positions must be sorted and unique.  Callers must only
@@ -123,13 +123,21 @@ extern text *generateHeadline(HeadlineParsedText *prs);
   * portion of a tsvector value.  If "allocated" is true then the pos array
   * is palloc'd workspace and caller may free it when done.
   *
+ * "negate" means that the pos array contains positions where the query does
+ * not match, rather than positions where it does.  "width" is positive when
+ * the match is wider than one lexeme.  Neither of these fields normally need
+ * to be touched by TSExecuteCallback functions; they are used for
+ * phrase-search processing within TS_execute.
+ *
   * All fields of the ExecPhraseData struct are initially zeroed by caller.
   */
  typedef struct ExecPhraseData
  {
         int                     npos;                   /* number of positions reported */
         bool            allocated;              /* pos points to palloc'd data? */
+       bool            negate;                 /* positions are where query is NOT matched */
         WordEntryPos *pos;                      /* ordered, non-duplicate lexeme positions */
+       int                     width;                  /* width of match in lexemes, less 1 */
  } ExecPhraseData;
  
  /*
@@ -139,7 +147,9 @@ typedef struct ExecPhraseData
   * val: lexeme to test for presence of
   * data: to be filled with lexeme positions; NULL if position data not needed
   *
- * Return TRUE if lexeme is present in data, else FALSE
+ * Return TRUE if lexeme is present in data, else FALSE.  If data is not
+ * NULL, it should be filled with lexeme positions, but function can leave
+ * it as zeroes if position data is not available.
   */
  typedef bool (*TSExecuteCallback) (void *arg, QueryOperand *val,
                                                                                            ExecPhraseData *data);
@@ -151,15 +161,20 @@ typedef bool (*TSExecuteCallback) (void *arg, QueryOperand *val,
  /*
   * If TS_EXEC_CALC_NOT is not set, then NOT expressions are automatically
   * evaluated to be true.  Useful in cases where NOT cannot be accurately
- * computed (GiST) or it isn't important (ranking).
+ * computed (GiST) or it isn't important (ranking).  From TS_execute's
+ * perspective, !CALC_NOT means that the TSExecuteCallback function might
+ * return false-positive indications of a lexeme's presence.
   */
  #define TS_EXEC_CALC_NOT               (0x01)
  /*
- * Treat OP_PHRASE as OP_AND.  Used when positional information is not
- * accessible, like in consistent methods of GIN/GiST indexes; rechecking
- * must occur later.
+ * If TS_EXEC_PHRASE_NO_POS is set, allow OP_PHRASE to be executed lossily
+ * in the absence of position information: a TRUE result indicates that the
+ * phrase might be present.  Without this flag, OP_PHRASE always returns
+ * false if lexeme position information is not available.
   */
-#define TS_EXEC_PHRASE_AS_AND  (0x02)
+#define TS_EXEC_PHRASE_NO_POS  (0x02)
+/* Obsolete spelling of TS_EXEC_PHRASE_NO_POS: */
+#define TS_EXEC_PHRASE_AS_AND  TS_EXEC_PHRASE_NO_POS
  
  extern bool TS_execute(QueryItem *curitem, void *arg, uint32 flags,
                    TSExecuteCallback chkcond);
@@ -228,7 +243,7 @@ extern Datum gin_tsquery_consistent_oldsig(PG_FUNCTION_ARGS);
   * TSQuery Utilities
   */
  extern QueryItem *clean_NOT(QueryItem *ptr, int32 *len);
-extern TSQuery cleanup_fakeval_and_phrase(TSQuery in);
+extern TSQuery cleanup_tsquery_stopwords(TSQuery in);
  
  typedef struct QTNode
  {
diff --git a/src/test/regress/expected/tsdicts.out b/src/test/regress/expected/tsdicts.out

index c55591a6787d1434f07a92acd46123a50c7b6234..8ed64d3c68e07107fbe048e114b1c48a6f21c037 100644 (file)
--- a/src/test/regress/expected/tsdicts.out
+++ b/src/test/regress/expected/tsdicts.out
@@ -470,15 +470,15 @@ SELECT to_tsquery('hunspell_tst', 'footballyklubber:b & rebookings:A & sky');
  (1 row)
  
  SELECT to_tsquery('hunspell_tst', 'footballyklubber:b <-> sky');
-                           to_tsquery                            
------------------------------------------------------------------
- 'foot':B <-> 'sky' & 'ball':B <-> 'sky' & 'klubber':B <-> 'sky'
+                   to_tsquery                    
+-------------------------------------------------
+ ( 'foot':B & 'ball':B & 'klubber':B ) <-> 'sky'
  (1 row)
  
  SELECT phraseto_tsquery('hunspell_tst', 'footballyklubber sky');
-                     phraseto_tsquery                      
------------------------------------------------------------
- 'foot' <-> 'sky' & 'ball' <-> 'sky' & 'klubber' <-> 'sky'
+             phraseto_tsquery              
+-------------------------------------------
+ ( 'foot' & 'ball' & 'klubber' ) <-> 'sky'
  (1 row)
  
  -- Test ispell dictionary with hunspell affix with FLAG long in configuration
diff --git a/src/test/regress/expected/tsearch.out b/src/test/regress/expected/tsearch.out

index cf3beb35a994aaeff06af0eaf7218d67cc47b132..0681d43358bf446be82ea4c1883184c88ae0162b 100644 (file)
--- a/src/test/regress/expected/tsearch.out
+++ b/src/test/regress/expected/tsearch.out
@@ -556,15 +556,15 @@ SELECT plainto_tsquery('english', 'foo bar') && 'asd | fg';
  
  -- Check stop word deletion, a and s are stop-words
  SELECT to_tsquery('english', '!(a & !b) & c');
- to_tsquery 
-------------
- 'b' & 'c'
+ to_tsquery  
+-------------
+ !!'b' & 'c'
  (1 row)
  
  SELECT to_tsquery('english', '!(a & !b)');
   to_tsquery 
  ------------
- 'b'
+ !!'b'
  (1 row)
  
  SELECT to_tsquery('english', '(1 <-> 2) <-> a');
@@ -1240,15 +1240,15 @@ SELECT ts_rewrite('1 & (2 <2> 3)', 'SELECT keyword, sample FROM test_tsquery'::t
  (1 row)
  
  SELECT ts_rewrite('5 <-> (1 & (2 <-> 3))', 'SELECT keyword, sample FROM test_tsquery'::text );
-              ts_rewrite               
----------------------------------------
- '5' <-> '1' & '5' <-> ( '2' <-> '3' )
+       ts_rewrite        
+-------------------------
+ '5' <-> ( '2' <-> '4' )
  (1 row)
  
  SELECT ts_rewrite('5 <-> (6 | 8)', 'SELECT keyword, sample FROM test_tsquery'::text );
-        ts_rewrite         
----------------------------
- '5' <-> '7' | '5' <-> '8'
+      ts_rewrite       
+-----------------------
+ '5' <-> ( '6' | '8' )
  (1 row)
  
  -- Check empty substitution
@@ -1386,6 +1386,26 @@ SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_t
   'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | 'big' & 'appl' | 'new' & 'york' )
  (1 row)
  
+SELECT ts_rewrite(tsquery_phrase('foo', 'foo'), 'foo', 'bar | baz');
+               ts_rewrite                
+-----------------------------------------
+ ( 'bar' | 'baz' ) <-> ( 'bar' | 'baz' )
+(1 row)
+
+SELECT to_tsvector('foo bar') @@
+  ts_rewrite(tsquery_phrase('foo', 'foo'), 'foo', 'bar | baz');
+ ?column? 
+----------
+ f
+(1 row)
+
+SELECT to_tsvector('bar baz') @@
+  ts_rewrite(tsquery_phrase('foo', 'foo'), 'foo', 'bar | baz');
+ ?column? 
+----------
+ t
+(1 row)
+
  RESET enable_seqscan;
  --test GUC
  SET default_text_search_config=simple;
diff --git a/src/test/regress/expected/tstypes.out b/src/test/regress/expected/tstypes.out

index 8d9290cbac122d00222c97304f2f6de3c42438b7..dcce82fdc4f99c1abd70ef6b488d06a22d32c0c6 100644 (file)
--- a/src/test/regress/expected/tstypes.out
+++ b/src/test/regress/expected/tstypes.out
@@ -366,133 +366,6 @@ SELECT '!!a & !!b'::tsquery;
   !!'a' & !!'b'
  (1 row)
  
--- phrase transformation
-SELECT 'a <-> (b|c)'::tsquery;
-          tsquery          
----------------------------
- 'a' <-> 'b' | 'a' <-> 'c'
-(1 row)
-
-SELECT '(a|b) <-> c'::tsquery;
-          tsquery          
----------------------------
- 'a' <-> 'c' | 'b' <-> 'c'
-(1 row)
-
-SELECT '(a|b) <-> (d|c)'::tsquery;
-                        tsquery                        
--------------------------------------------------------
- 'a' <-> 'd' | 'b' <-> 'd' | 'a' <-> 'c' | 'b' <-> 'c'
-(1 row)
-
-SELECT 'a <-> (b&c)'::tsquery;
-          tsquery          
----------------------------
- 'a' <-> 'b' & 'a' <-> 'c'
-(1 row)
-
-SELECT '(a&b) <-> c'::tsquery;
-          tsquery          
----------------------------
- 'a' <-> 'c' & 'b' <-> 'c'
-(1 row)
-
-SELECT '(a&b) <-> (d&c)'::tsquery;
-                        tsquery                        
--------------------------------------------------------
- 'a' <-> 'd' & 'b' <-> 'd' & 'a' <-> 'c' & 'b' <-> 'c'
-(1 row)
-
-SELECT 'a <-> !b'::tsquery;
-        tsquery         
-------------------------
- 'a' & !( 'a' <-> 'b' )
-(1 row)
-
-SELECT '!a <-> b'::tsquery;
-        tsquery         
-------------------------
- !( 'a' <-> 'b' ) & 'b'
-(1 row)
-
-SELECT '!a <-> !b'::tsquery;
-              tsquery               
-------------------------------------
- !'a' & !( !( 'a' <-> 'b' ) & 'b' )
-(1 row)
-
-SELECT 'a <-> !(b&c)'::tsquery;
-               tsquery                
---------------------------------------
- 'a' & !( 'a' <-> 'b' & 'a' <-> 'c' )
-(1 row)
-
-SELECT 'a <-> !(b|c)'::tsquery;
-               tsquery                
---------------------------------------
- 'a' & !( 'a' <-> 'b' | 'a' <-> 'c' )
-(1 row)
-
-SELECT  '!(a&b) <-> c'::tsquery;
-               tsquery                
---------------------------------------
- !( 'a' <-> 'c' & 'b' <-> 'c' ) & 'c'
-(1 row)
-
-SELECT  '!(a|b) <-> c'::tsquery;
-               tsquery                
---------------------------------------
- !( 'a' <-> 'c' | 'b' <-> 'c' ) & 'c'
-(1 row)
-
-SELECT  '(!a|b) <-> c'::tsquery;
-               tsquery                
---------------------------------------
- !( 'a' <-> 'c' ) & 'c' | 'b' <-> 'c'
-(1 row)
-
-SELECT  '(!a&b) <-> c'::tsquery;
-               tsquery                
---------------------------------------
- !( 'a' <-> 'c' ) & 'c' & 'b' <-> 'c'
-(1 row)
-
-SELECT  'c <-> (!a|b)'::tsquery;
-               tsquery                
---------------------------------------
- 'c' & !( 'c' <-> 'a' ) | 'c' <-> 'b'
-(1 row)
-
-SELECT  'c <-> (!a&b)'::tsquery;
-               tsquery                
---------------------------------------
- 'c' & !( 'c' <-> 'a' ) & 'c' <-> 'b'
-(1 row)
-
-SELECT  '(a|b) <-> !c'::tsquery;
-                    tsquery                     
-------------------------------------------------
- ( 'a' | 'b' ) & !( 'a' <-> 'c' | 'b' <-> 'c' )
-(1 row)
-
-SELECT  '(a&b) <-> !c'::tsquery;
-                  tsquery                   
---------------------------------------------
- 'a' & 'b' & !( 'a' <-> 'c' & 'b' <-> 'c' )
-(1 row)
-
-SELECT  '!c <-> (a|b)'::tsquery;
-                     tsquery                     
--------------------------------------------------
- !( 'c' <-> 'a' ) & 'a' | !( 'c' <-> 'b' ) & 'b'
-(1 row)
-
-SELECT  '!c <-> (a&b)'::tsquery;
-                     tsquery                     
--------------------------------------------------
- !( 'c' <-> 'a' ) & 'a' & !( 'c' <-> 'b' ) & 'b'
-(1 row)
-
  --comparisons
  SELECT 'a' < 'b & c'::tsquery as "true";
   true 
@@ -568,33 +441,33 @@ SELECT 'foo & bar'::tsquery && 'asd | fg';
  (1 row)
  
  SELECT 'a' <-> 'b & d'::tsquery;
-         ?column?          
----------------------------
- 'a' <-> 'b' & 'a' <-> 'd'
+       ?column?        
+-----------------------
+ 'a' <-> ( 'b' & 'd' )
  (1 row)
  
  SELECT 'a & g' <-> 'b & d'::tsquery;
-                       ?column?                        
--------------------------------------------------------
- 'a' <-> 'b' & 'g' <-> 'b' & 'a' <-> 'd' & 'g' <-> 'd'
+            ?column?             
+---------------------------------
+ ( 'a' & 'g' ) <-> ( 'b' & 'd' )
  (1 row)
  
  SELECT 'a & g' <-> 'b | d'::tsquery;
-                       ?column?                        
--------------------------------------------------------
- 'a' <-> 'b' & 'g' <-> 'b' | 'a' <-> 'd' & 'g' <-> 'd'
+            ?column?             
+---------------------------------
+ ( 'a' & 'g' ) <-> ( 'b' | 'd' )
  (1 row)
  
  SELECT 'a & g' <-> 'b <-> d'::tsquery;
-                     ?column?                      
----------------------------------------------------
- 'a' <-> ( 'b' <-> 'd' ) & 'g' <-> ( 'b' <-> 'd' )
+             ?column?              
+-----------------------------------
+ ( 'a' & 'g' ) <-> ( 'b' <-> 'd' )
  (1 row)
  
  SELECT tsquery_phrase('a <3> g', 'b & d', 10);
-               tsquery_phrase                
----------------------------------------------
- 'a' <3> 'g' <10> 'b' & 'a' <3> 'g' <10> 'd'
+         tsquery_phrase         
+--------------------------------
+ 'a' <3> 'g' <10> ( 'b' & 'd' )
  (1 row)
  
  -- tsvector-tsquery operations
@@ -749,25 +622,152 @@ SELECT to_tsvector('simple', '1 2 3 4') @@ '(1 <-> 2) <-> 3' AS "true";
   t
  (1 row)
  
-SELECT to_tsvector('simple', '1 2 3 4') @@ '1 <-> (2 <-> 3)' AS "false";
+SELECT to_tsvector('simple', '1 2 3 4') @@ '1 <-> (2 <-> 3)' AS "true";
+ true 
+------
+ t
+(1 row)
+
+SELECT to_tsvector('simple', '1 2 3 4') @@ '1 <2> (2 <-> 3)' AS "false";
   false 
  -------
   f
  (1 row)
  
-SELECT to_tsvector('simple', '1 2 3 4') @@ '1 <2> (2 <-> 3)' AS "true";
+SELECT to_tsvector('simple', '1 2 1 2 3 4') @@ '(1 <-> 2) <-> 3' AS "true";
   true 
  ------
   t
  (1 row)
  
-SELECT to_tsvector('simple', '1 2 1 2 3 4') @@ '(1 <-> 2) <-> 3' AS "true";
+SELECT to_tsvector('simple', '1 2 1 2 3 4') @@ '1 <-> 2 <-> 3' AS "true";
   true 
  ------
   t
  (1 row)
  
-SELECT to_tsvector('simple', '1 2 1 2 3 4') @@ '1 <-> 2 <-> 3' AS "true";
+-- without position data, phrase search does not match
+SELECT strip(to_tsvector('simple', '1 2 3 4')) @@ '1 <-> 2 <-> 3' AS "false";
+ false 
+-------
+ f
+(1 row)
+
+select to_tsvector('simple', 'q x q y') @@ 'q <-> (x & y)' AS "false";
+ false 
+-------
+ f
+(1 row)
+
+select to_tsvector('simple', 'q x') @@ 'q <-> (x | y <-> z)' AS "true";
+ true 
+------
+ t
+(1 row)
+
+select to_tsvector('simple', 'q y') @@ 'q <-> (x | y <-> z)' AS "false";
+ false 
+-------
+ f
+(1 row)
+
+select to_tsvector('simple', 'q y z') @@ 'q <-> (x | y <-> z)' AS "true";
+ true 
+------
+ t
+(1 row)
+
+select to_tsvector('simple', 'q y x') @@ 'q <-> (x | y <-> z)' AS "false";
+ false 
+-------
+ f
+(1 row)
+
+select to_tsvector('simple', 'q x y') @@ 'q <-> (x | y <-> z)' AS "true";
+ true 
+------
+ t
+(1 row)
+
+select to_tsvector('simple', 'q x') @@ '(x | y <-> z) <-> q' AS "false";
+ false 
+-------
+ f
+(1 row)
+
+select to_tsvector('simple', 'x q') @@ '(x | y <-> z) <-> q' AS "true";
+ true 
+------
+ t
+(1 row)
+
+select to_tsvector('simple', 'x y q') @@ '(x | y <-> z) <-> q' AS "false";
+ false 
+-------
+ f
+(1 row)
+
+select to_tsvector('simple', 'x y z') @@ '(x | y <-> z) <-> q' AS "false";
+ false 
+-------
+ f
+(1 row)
+
+select to_tsvector('simple', 'x y z q') @@ '(x | y <-> z) <-> q' AS "true";
+ true 
+------
+ t
+(1 row)
+
+select to_tsvector('simple', 'y z q') @@ '(x | y <-> z) <-> q' AS "true";
+ true 
+------
+ t
+(1 row)
+
+select to_tsvector('simple', 'y y q') @@ '(x | y <-> z) <-> q' AS "false";
+ false 
+-------
+ f
+(1 row)
+
+select to_tsvector('simple', 'y y q') @@ '(!x | y <-> z) <-> q' AS "true";
+ true 
+------
+ t
+(1 row)
+
+select to_tsvector('simple', 'x y q') @@ '(!x | y <-> z) <-> q' AS "true";
+ true 
+------
+ t
+(1 row)
+
+select to_tsvector('simple', 'y y q') @@ '(x | y <-> !z) <-> q' AS "true";
+ true 
+------
+ t
+(1 row)
+
+select to_tsvector('simple', 'x q') @@ '(x | y <-> !z) <-> q' AS "true";
+ true 
+------
+ t
+(1 row)
+
+select to_tsvector('simple', 'x q') @@ '(!x | y <-> z) <-> q' AS "false";
+ false 
+-------
+ f
+(1 row)
+
+select to_tsvector('simple', 'z q') @@ '(!x | y <-> z) <-> q' AS "true";
+ true 
+------
+ t
+(1 row)
+
+select to_tsvector('simple', 'x y q y') @@ '!x <-> y' AS "true";
   true 
  ------
   t
@@ -1002,6 +1002,12 @@ SELECT 'a:1 b:3'::tsvector @@ 'a <3> b'::tsquery AS "false";
   f
  (1 row)
  
+SELECT 'a:1 b:3'::tsvector @@ 'a <0> a:*'::tsquery AS "true";
+ true 
+------
+ t
+(1 row)
+
  -- tsvector editing operations
  SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector);
       strip     
diff --git a/src/test/regress/sql/tsearch.sql b/src/test/regress/sql/tsearch.sql

index de43860c7044f4e3a1b27b43ee967034033bbb16..1255f6954db95498751f08b0b6856fc417af171f 100644 (file)
--- a/src/test/regress/sql/tsearch.sql
+++ b/src/test/regress/sql/tsearch.sql
@@ -447,6 +447,12 @@ SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_t
  SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'moscow & hotel') AS query;
  SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'bar &  new & qq & foo & york') AS query;
  
+SELECT ts_rewrite(tsquery_phrase('foo', 'foo'), 'foo', 'bar | baz');
+SELECT to_tsvector('foo bar') @@
+  ts_rewrite(tsquery_phrase('foo', 'foo'), 'foo', 'bar | baz');
+SELECT to_tsvector('bar baz') @@
+  ts_rewrite(tsquery_phrase('foo', 'foo'), 'foo', 'bar | baz');
+
  RESET enable_seqscan;
  
  --test GUC
diff --git a/src/test/regress/sql/tstypes.sql b/src/test/regress/sql/tstypes.sql

index 9ea93a2993845a71ee452e8a0e18674f7ea78b15..77436ce04e06f88866b4c9855f36559b776dc6b8 100644 (file)
--- a/src/test/regress/sql/tstypes.sql
+++ b/src/test/regress/sql/tstypes.sql
@@ -64,34 +64,6 @@ SELECT 'a & !!b'::tsquery;
  SELECT '!!a & b'::tsquery;
  SELECT '!!a & !!b'::tsquery;
  
--- phrase transformation
-SELECT 'a <-> (b|c)'::tsquery;
-SELECT '(a|b) <-> c'::tsquery;
-SELECT '(a|b) <-> (d|c)'::tsquery;
-
-SELECT 'a <-> (b&c)'::tsquery;
-SELECT '(a&b) <-> c'::tsquery;
-SELECT '(a&b) <-> (d&c)'::tsquery;
-
-SELECT 'a <-> !b'::tsquery;
-SELECT '!a <-> b'::tsquery;
-SELECT '!a <-> !b'::tsquery;
-
-SELECT 'a <-> !(b&c)'::tsquery;
-SELECT 'a <-> !(b|c)'::tsquery;
-SELECT  '!(a&b) <-> c'::tsquery;
-SELECT  '!(a|b) <-> c'::tsquery;
-
-SELECT  '(!a|b) <-> c'::tsquery;
-SELECT  '(!a&b) <-> c'::tsquery;
-SELECT  'c <-> (!a|b)'::tsquery;
-SELECT  'c <-> (!a&b)'::tsquery;
-
-SELECT  '(a|b) <-> !c'::tsquery;
-SELECT  '(a&b) <-> !c'::tsquery;
-SELECT  '!c <-> (a|b)'::tsquery;
-SELECT  '!c <-> (a&b)'::tsquery;
-
  --comparisons
  SELECT 'a' < 'b & c'::tsquery as "true";
  SELECT 'a' > 'b & c'::tsquery as "false";
@@ -146,10 +118,33 @@ SELECT to_tsvector('simple', '1 2 11 3') @@ '1:* <-> 3' AS "true";
  
  SELECT to_tsvector('simple', '1 2 3 4') @@ '1 <-> 2 <-> 3' AS "true";
  SELECT to_tsvector('simple', '1 2 3 4') @@ '(1 <-> 2) <-> 3' AS "true";
-SELECT to_tsvector('simple', '1 2 3 4') @@ '1 <-> (2 <-> 3)' AS "false";
-SELECT to_tsvector('simple', '1 2 3 4') @@ '1 <2> (2 <-> 3)' AS "true";
+SELECT to_tsvector('simple', '1 2 3 4') @@ '1 <-> (2 <-> 3)' AS "true";
+SELECT to_tsvector('simple', '1 2 3 4') @@ '1 <2> (2 <-> 3)' AS "false";
  SELECT to_tsvector('simple', '1 2 1 2 3 4') @@ '(1 <-> 2) <-> 3' AS "true";
  SELECT to_tsvector('simple', '1 2 1 2 3 4') @@ '1 <-> 2 <-> 3' AS "true";
+-- without position data, phrase search does not match
+SELECT strip(to_tsvector('simple', '1 2 3 4')) @@ '1 <-> 2 <-> 3' AS "false";
+
+select to_tsvector('simple', 'q x q y') @@ 'q <-> (x & y)' AS "false";
+select to_tsvector('simple', 'q x') @@ 'q <-> (x | y <-> z)' AS "true";
+select to_tsvector('simple', 'q y') @@ 'q <-> (x | y <-> z)' AS "false";
+select to_tsvector('simple', 'q y z') @@ 'q <-> (x | y <-> z)' AS "true";
+select to_tsvector('simple', 'q y x') @@ 'q <-> (x | y <-> z)' AS "false";
+select to_tsvector('simple', 'q x y') @@ 'q <-> (x | y <-> z)' AS "true";
+select to_tsvector('simple', 'q x') @@ '(x | y <-> z) <-> q' AS "false";
+select to_tsvector('simple', 'x q') @@ '(x | y <-> z) <-> q' AS "true";
+select to_tsvector('simple', 'x y q') @@ '(x | y <-> z) <-> q' AS "false";
+select to_tsvector('simple', 'x y z') @@ '(x | y <-> z) <-> q' AS "false";
+select to_tsvector('simple', 'x y z q') @@ '(x | y <-> z) <-> q' AS "true";
+select to_tsvector('simple', 'y z q') @@ '(x | y <-> z) <-> q' AS "true";
+select to_tsvector('simple', 'y y q') @@ '(x | y <-> z) <-> q' AS "false";
+select to_tsvector('simple', 'y y q') @@ '(!x | y <-> z) <-> q' AS "true";
+select to_tsvector('simple', 'x y q') @@ '(!x | y <-> z) <-> q' AS "true";
+select to_tsvector('simple', 'y y q') @@ '(x | y <-> !z) <-> q' AS "true";
+select to_tsvector('simple', 'x q') @@ '(x | y <-> !z) <-> q' AS "true";
+select to_tsvector('simple', 'x q') @@ '(!x | y <-> z) <-> q' AS "false";
+select to_tsvector('simple', 'z q') @@ '(!x | y <-> z) <-> q' AS "true";
+select to_tsvector('simple', 'x y q y') @@ '!x <-> y' AS "true";
  
  --ranking
  SELECT ts_rank(' a:1 s:2C d g'::tsvector, 'a | s');
@@ -193,6 +188,7 @@ SELECT 'a:1 b:3'::tsvector @@ 'a <0> b'::tsquery AS "false";
  SELECT 'a:1 b:3'::tsvector @@ 'a <1> b'::tsquery AS "false";
  SELECT 'a:1 b:3'::tsvector @@ 'a <2> b'::tsquery AS "true";
  SELECT 'a:1 b:3'::tsvector @@ 'a <3> b'::tsquery AS "false";
+SELECT 'a:1 b:3'::tsvector @@ 'a <0> a:*'::tsquery AS "true";
  
  -- tsvector editing operations
author	Tom Lane <tgl@sss.pgh.pa.us>
	Wed, 21 Dec 2016 20:18:25 +0000 (15:18 -0500)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Wed, 21 Dec 2016 20:18:40 +0000 (15:18 -0500)
doc/src/sgml/datatype.sgml		patch \| blob \| history
doc/src/sgml/textsearch.sgml		patch \| blob \| history
src/backend/utils/adt/tsginidx.c		patch \| blob \| history
src/backend/utils/adt/tsgistidx.c		patch \| blob \| history
src/backend/utils/adt/tsquery.c		patch \| blob \| history
src/backend/utils/adt/tsquery_cleanup.c		patch \| blob \| history
src/backend/utils/adt/tsquery_op.c		patch \| blob \| history
src/backend/utils/adt/tsvector_op.c		patch \| blob \| history
src/include/tsearch/ts_utils.h		patch \| blob \| history
src/test/regress/expected/tsdicts.out		patch \| blob \| history
src/test/regress/expected/tsearch.out		patch \| blob \| history
src/test/regress/expected/tstypes.out		patch \| blob \| history
src/test/regress/sql/tsearch.sql		patch \| blob \| history
src/test/regress/sql/tstypes.sql		patch \| blob \| history