Fix ts_delete(tsvector, text[]) to cope with duplicate array entries.

author Tom Lane <tgl@sss.pgh.pa.us>

Fri, 5 Aug 2016 19:14:08 +0000 (15:14 -0400)

committer Tom Lane <tgl@sss.pgh.pa.us>

Fri, 5 Aug 2016 19:14:19 +0000 (15:14 -0400)
author Tom Lane <tgl@sss.pgh.pa.us>
Fri, 5 Aug 2016 19:14:08 +0000 (15:14 -0400)
committer Tom Lane <tgl@sss.pgh.pa.us>
Fri, 5 Aug 2016 19:14:19 +0000 (15:14 -0400)
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c

index 63ae4330ef1fbf2c12cd5a998bb91dfdfee6d5aa..29cc687643c1746d5a8b270ee1cbaa3ca548ae67 100644 (file)
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -317,7 +317,7 @@ tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
  
                 if (nulls[i])
                         ereport(ERROR,
-                                       (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                       (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
                                          errmsg("lexeme array may not contain nulls")));
  
                 lex = VARDATA(dlexemes[i]);
@@ -430,7 +430,7 @@ compareint(const void *va, const void *vb)
  /*
   * Internal routine to delete lexemes from TSVector by array of offsets.
   *
- * int *indices_to_delete -- array of lexeme offsets to delete
+ * int *indices_to_delete -- array of lexeme offsets to delete (modified here!)
   * int indices_count -- size of that array
   *
   * Returns new TSVector without given lexemes along with their positions
@@ -445,35 +445,51 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
                            *arrout;
         char       *data = STRPTR(tsv),
                            *dataout;
-       int                     i,
-                               j,
-                               k,
-                               curoff;
+       int                     i,                              /* index in arrin */
+                               j,                              /* index in arrout */
+                               k,                              /* index in indices_to_delete */
+                               curoff;                 /* index in dataout area */
  
         /*
-        * Here we overestimates tsout size, since we don't know exact size
-        * occupied by positions and weights. We will set exact size later after a
-        * pass through TSVector.
+        * Sort the filter array to simplify membership checks below.  Also, get
+        * rid of any duplicate entries, so that we can assume that indices_count
+        * is exactly equal to the number of lexemes that will be removed.
          */
-       tsout = (TSVector) palloc0(VARSIZE(tsv));
-       arrout = ARRPTR(tsout);
-       tsout->size = tsv->size - indices_count;
-
-       /* Sort our filter array to simplify membership check later. */
         if (indices_count > 1)
+       {
+               int                     kp;
+
                 qsort(indices_to_delete, indices_count, sizeof(int), compareint);
+               kp = 0;
+               for (k = 1; k < indices_count; k++)
+               {
+                       if (indices_to_delete[k] != indices_to_delete[kp])
+                               indices_to_delete[++kp] = indices_to_delete[k];
+               }
+               indices_count = ++kp;
+       }
  
         /*
-        * Copy tsv to tsout skipping lexemes that enlisted in indices_to_delete.
+        * Here we overestimate tsout size, since we don't know how much space is
+        * used by the deleted lexeme(s).  We will set exact size below.
          */
-       curoff = 0;
+       tsout = (TSVector) palloc0(VARSIZE(tsv));
+
+       /* This count must be correct because STRPTR(tsout) relies on it. */
+       tsout->size = tsv->size - indices_count;
+
+       /*
+        * Copy tsv to tsout, skipping lexemes listed in indices_to_delete.
+        */
+       arrout = ARRPTR(tsout);
         dataout = STRPTR(tsout);
+       curoff = 0;
         for (i = j = k = 0; i < tsv->size; i++)
         {
                 /*
-                * Here we should check whether current i is present in
-                * indices_to_delete or not. Since indices_to_delete is already sorted
-                * we can advance it index only when we have match.
+                * If current i is present in indices_to_delete, skip this lexeme.
+                * Since indices_to_delete is already sorted, we only need to check
+                * the current (k'th) entry.
                  */
                 if (k < indices_count && i == indices_to_delete[k])
                 {
@@ -481,7 +497,7 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
                         continue;
                 }
  
-               /* Copy lexeme, it's positions and weights */
+               /* Copy lexeme and its positions and weights */
                 memcpy(dataout + curoff, data + arrin[i].pos, arrin[i].len);
                 arrout[j].haspos = arrin[i].haspos;
                 arrout[j].len = arrin[i].len;
@@ -489,8 +505,8 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
                 curoff += arrin[i].len;
                 if (arrin[i].haspos)
                 {
-                       int                     len = POSDATALEN(tsv, arrin + i) * sizeof(WordEntryPos) +
-                       sizeof(uint16);
+                       int                     len = POSDATALEN(tsv, arrin + i) * sizeof(WordEntryPos)
+                       + sizeof(uint16);
  
                         curoff = SHORTALIGN(curoff);
                         memcpy(dataout + curoff,
@@ -503,10 +519,9 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
         }
  
         /*
-        * After the pass through TSVector k should equals exactly to
-        * indices_count. If it isn't then the caller provided us with indices
-        * outside of [0, tsv->size) range and estimation of tsout's size is
-        * wrong.
+        * k should now be exactly equal to indices_count. If it isn't then the
+        * caller provided us with indices outside of [0, tsv->size) range and
+        * estimation of tsout's size is wrong.
          */
         Assert(k == indices_count);
  
@@ -560,7 +575,7 @@ tsvector_delete_arr(PG_FUNCTION_ARGS)
  
         /*
          * In typical use case array of lexemes to delete is relatively small. So
-        * here we optimizing things for that scenario: iterate through lexarr
+        * here we optimize things for that scenario: iterate through lexarr
          * performing binary search of each lexeme from lexarr in tsvector.
          */
         skip_indices = palloc0(nlex * sizeof(int));
@@ -572,10 +587,10 @@ tsvector_delete_arr(PG_FUNCTION_ARGS)
  
                 if (nulls[i])
                         ereport(ERROR,
-                                       (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                       (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
                                          errmsg("lexeme array may not contain nulls")));
  
-               lex = VARDATA(dlexemes[i]);
+               lex = VARDATA_ANY(dlexemes[i]);
                 lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
                 lex_pos = tsvector_bsearch(tsin, lex, lex_len);
  
@@ -738,7 +753,7 @@ array_to_tsvector(PG_FUNCTION_ARGS)
         {
                 if (nulls[i])
                         ereport(ERROR,
-                                       (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                       (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
                                          errmsg("lexeme array may not contain nulls")));
  
                 datalen += VARSIZE_ANY_EXHDR(dlexemes[i]);
@@ -797,7 +812,7 @@ tsvector_filter(PG_FUNCTION_ARGS)
  
                 if (nulls[i])
                         ereport(ERROR,
-                                       (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                       (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
                                          errmsg("weight array may not contain nulls")));
  
                 char_weight = DatumGetChar(dweights[i]);
diff --git a/src/test/regress/expected/tstypes.out b/src/test/regress/expected/tstypes.out

index 886ea747f17733f6b38e15bbd48bbf43b55fa316..73f43c5ff027364d1b8ed7ad093245f2ba32147c 100644 (file)
--- a/src/test/regress/expected/tstypes.out
+++ b/src/test/regress/expected/tstypes.out
@@ -1087,6 +1087,12 @@ SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceshi
   'base' 'hidden' 'strike'
  (1 row)
  
+SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel','rebel']);
+        ts_delete         
+--------------------------
+ 'base' 'hidden' 'strike'
+(1 row)
+
  SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', NULL]);
  ERROR:  lexeme array may not contain nulls
  SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
diff --git a/src/test/regress/sql/tstypes.sql b/src/test/regress/sql/tstypes.sql

index 724234d94d25daae8dc7708f2c010791696579c1..f0c06ba5f5a6e1c37ca51521bb5ad7690579da11 100644 (file)
--- a/src/test/regress/sql/tstypes.sql
+++ b/src/test/regress/sql/tstypes.sql
@@ -212,6 +212,7 @@ SELECT ts_delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3':
  SELECT ts_delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceshi','rebel']);
  SELECT ts_delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','leya','rebel']);
  SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel']);
+SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel','rebel']);
  SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', NULL]);
  
  SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
author	Tom Lane <tgl@sss.pgh.pa.us>
	Fri, 5 Aug 2016 19:14:08 +0000 (15:14 -0400)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Fri, 5 Aug 2016 19:14:19 +0000 (15:14 -0400)
src/backend/utils/adt/tsvector_op.c		patch \| blob \| history
src/test/regress/expected/tstypes.out		patch \| blob \| history
src/test/regress/sql/tstypes.sql		patch \| blob \| history