From: Tom Lane Date: Fri, 5 Aug 2016 19:14:08 +0000 (-0400) Subject: Fix ts_delete(tsvector, text[]) to cope with duplicate array entries. X-Git-Tag: REL9_6_BETA4~30 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=c50d192ce33c10fa06411306f8644b4f47ce9a06;p=postgresql Fix ts_delete(tsvector, text[]) to cope with duplicate array entries. Such cases either failed an Assert, or produced a corrupt tsvector in non-Assert builds, as reported by Andreas Seltenreich. The reason is that tsvector_delete_by_indices() just assumed that its input array had no duplicates. Fix by explicitly de-duping. In passing, improve some comments, and fix a number of tests for null values to use ERRCODE_NULL_VALUE_NOT_ALLOWED not ERRCODE_INVALID_PARAMETER_VALUE. Discussion: <87invhoj6e.fsf@credativ.de> --- diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c index 63ae4330ef..29cc687643 100644 --- a/src/backend/utils/adt/tsvector_op.c +++ b/src/backend/utils/adt/tsvector_op.c @@ -317,7 +317,7 @@ tsvector_setweight_by_filter(PG_FUNCTION_ARGS) if (nulls[i]) ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), errmsg("lexeme array may not contain nulls"))); lex = VARDATA(dlexemes[i]); @@ -430,7 +430,7 @@ compareint(const void *va, const void *vb) /* * Internal routine to delete lexemes from TSVector by array of offsets. * - * int *indices_to_delete -- array of lexeme offsets to delete + * int *indices_to_delete -- array of lexeme offsets to delete (modified here!) * int indices_count -- size of that array * * Returns new TSVector without given lexemes along with their positions @@ -445,35 +445,51 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete, *arrout; char *data = STRPTR(tsv), *dataout; - int i, - j, - k, - curoff; + int i, /* index in arrin */ + j, /* index in arrout */ + k, /* index in indices_to_delete */ + curoff; /* index in dataout area */ /* - * Here we overestimates tsout size, since we don't know exact size - * occupied by positions and weights. We will set exact size later after a - * pass through TSVector. + * Sort the filter array to simplify membership checks below. Also, get + * rid of any duplicate entries, so that we can assume that indices_count + * is exactly equal to the number of lexemes that will be removed. */ - tsout = (TSVector) palloc0(VARSIZE(tsv)); - arrout = ARRPTR(tsout); - tsout->size = tsv->size - indices_count; - - /* Sort our filter array to simplify membership check later. */ if (indices_count > 1) + { + int kp; + qsort(indices_to_delete, indices_count, sizeof(int), compareint); + kp = 0; + for (k = 1; k < indices_count; k++) + { + if (indices_to_delete[k] != indices_to_delete[kp]) + indices_to_delete[++kp] = indices_to_delete[k]; + } + indices_count = ++kp; + } /* - * Copy tsv to tsout skipping lexemes that enlisted in indices_to_delete. + * Here we overestimate tsout size, since we don't know how much space is + * used by the deleted lexeme(s). We will set exact size below. */ - curoff = 0; + tsout = (TSVector) palloc0(VARSIZE(tsv)); + + /* This count must be correct because STRPTR(tsout) relies on it. */ + tsout->size = tsv->size - indices_count; + + /* + * Copy tsv to tsout, skipping lexemes listed in indices_to_delete. + */ + arrout = ARRPTR(tsout); dataout = STRPTR(tsout); + curoff = 0; for (i = j = k = 0; i < tsv->size; i++) { /* - * Here we should check whether current i is present in - * indices_to_delete or not. Since indices_to_delete is already sorted - * we can advance it index only when we have match. + * If current i is present in indices_to_delete, skip this lexeme. + * Since indices_to_delete is already sorted, we only need to check + * the current (k'th) entry. */ if (k < indices_count && i == indices_to_delete[k]) { @@ -481,7 +497,7 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete, continue; } - /* Copy lexeme, it's positions and weights */ + /* Copy lexeme and its positions and weights */ memcpy(dataout + curoff, data + arrin[i].pos, arrin[i].len); arrout[j].haspos = arrin[i].haspos; arrout[j].len = arrin[i].len; @@ -489,8 +505,8 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete, curoff += arrin[i].len; if (arrin[i].haspos) { - int len = POSDATALEN(tsv, arrin + i) * sizeof(WordEntryPos) + - sizeof(uint16); + int len = POSDATALEN(tsv, arrin + i) * sizeof(WordEntryPos) + + sizeof(uint16); curoff = SHORTALIGN(curoff); memcpy(dataout + curoff, @@ -503,10 +519,9 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete, } /* - * After the pass through TSVector k should equals exactly to - * indices_count. If it isn't then the caller provided us with indices - * outside of [0, tsv->size) range and estimation of tsout's size is - * wrong. + * k should now be exactly equal to indices_count. If it isn't then the + * caller provided us with indices outside of [0, tsv->size) range and + * estimation of tsout's size is wrong. */ Assert(k == indices_count); @@ -560,7 +575,7 @@ tsvector_delete_arr(PG_FUNCTION_ARGS) /* * In typical use case array of lexemes to delete is relatively small. So - * here we optimizing things for that scenario: iterate through lexarr + * here we optimize things for that scenario: iterate through lexarr * performing binary search of each lexeme from lexarr in tsvector. */ skip_indices = palloc0(nlex * sizeof(int)); @@ -572,10 +587,10 @@ tsvector_delete_arr(PG_FUNCTION_ARGS) if (nulls[i]) ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), errmsg("lexeme array may not contain nulls"))); - lex = VARDATA(dlexemes[i]); + lex = VARDATA_ANY(dlexemes[i]); lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]); lex_pos = tsvector_bsearch(tsin, lex, lex_len); @@ -738,7 +753,7 @@ array_to_tsvector(PG_FUNCTION_ARGS) { if (nulls[i]) ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), errmsg("lexeme array may not contain nulls"))); datalen += VARSIZE_ANY_EXHDR(dlexemes[i]); @@ -797,7 +812,7 @@ tsvector_filter(PG_FUNCTION_ARGS) if (nulls[i]) ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), errmsg("weight array may not contain nulls"))); char_weight = DatumGetChar(dweights[i]); diff --git a/src/test/regress/expected/tstypes.out b/src/test/regress/expected/tstypes.out index 886ea747f1..73f43c5ff0 100644 --- a/src/test/regress/expected/tstypes.out +++ b/src/test/regress/expected/tstypes.out @@ -1087,6 +1087,12 @@ SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceshi 'base' 'hidden' 'strike' (1 row) +SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel','rebel']); + ts_delete +-------------------------- + 'base' 'hidden' 'strike' +(1 row) + SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', NULL]); ERROR: lexeme array may not contain nulls SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector); diff --git a/src/test/regress/sql/tstypes.sql b/src/test/regress/sql/tstypes.sql index 724234d94d..f0c06ba5f5 100644 --- a/src/test/regress/sql/tstypes.sql +++ b/src/test/regress/sql/tstypes.sql @@ -212,6 +212,7 @@ SELECT ts_delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3': SELECT ts_delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceshi','rebel']); SELECT ts_delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','leya','rebel']); SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel']); +SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel','rebel']); SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', NULL]); SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);