From: Tom Lane Date: Fri, 5 Aug 2016 20:09:06 +0000 (-0400) Subject: Make array_to_tsvector() sort and de-duplicate the given strings. X-Git-Tag: REL9_6_BETA4~29 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=f10eab73df2b94c860dea4a906c54e3c903f42e2;p=postgresql Make array_to_tsvector() sort and de-duplicate the given strings. This is required for the result to be a legal tsvector value. Noted while fooling with Andreas Seltenreich's ts_delete() crash. Discussion: <87invhoj6e.fsf@credativ.de> --- diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 971e642276..783033403a 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -9294,7 +9294,7 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple tsvector convert array of lexemes to tsvector array_to_tsvector('{fat,cat,rat}'::text[]) - 'fat' 'cat' 'rat' + 'cat' 'fat' 'rat' diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c index 29cc687643..ad5a254c57 100644 --- a/src/backend/utils/adt/tsvector_op.c +++ b/src/backend/utils/adt/tsvector_op.c @@ -416,17 +416,34 @@ tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len) return -1; } +/* + * qsort comparator functions + */ + static int -compareint(const void *va, const void *vb) +compare_int(const void *va, const void *vb) { - int32 a = *((const int32 *) va); - int32 b = *((const int32 *) vb); + int a = *((const int *) va); + int b = *((const int *) vb); if (a == b) return 0; return (a > b) ? 1 : -1; } +static int +compare_text_lexemes(const void *va, const void *vb) +{ + Datum a = *((const Datum *) va); + Datum b = *((const Datum *) vb); + char *alex = VARDATA_ANY(a); + int alex_len = VARSIZE_ANY_EXHDR(a); + char *blex = VARDATA_ANY(b); + int blex_len = VARSIZE_ANY_EXHDR(b); + + return tsCompareString(alex, alex_len, blex, blex_len, false); +} + /* * Internal routine to delete lexemes from TSVector by array of offsets. * @@ -459,7 +476,7 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete, { int kp; - qsort(indices_to_delete, indices_count, sizeof(int), compareint); + qsort(indices_to_delete, indices_count, sizeof(int), compare_int); kp = 0; for (k = 1; k < indices_count; k++) { @@ -743,32 +760,50 @@ array_to_tsvector(PG_FUNCTION_ARGS) bool *nulls; int nitems, i, + j, tslen, datalen = 0; char *cur; deconstruct_array(v, TEXTOID, -1, false, 'i', &dlexemes, &nulls, &nitems); + /* Reject nulls (maybe we should just ignore them, instead?) */ for (i = 0; i < nitems; i++) { if (nulls[i]) ereport(ERROR, (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), errmsg("lexeme array may not contain nulls"))); + } - datalen += VARSIZE_ANY_EXHDR(dlexemes[i]); + /* Sort and de-dup, because this is required for a valid tsvector. */ + if (nitems > 1) + { + qsort(dlexemes, nitems, sizeof(Datum), compare_text_lexemes); + j = 0; + for (i = 1; i < nitems; i++) + { + if (compare_text_lexemes(&dlexemes[j], &dlexemes[i]) < 0) + dlexemes[++j] = dlexemes[i]; + } + nitems = ++j; } + /* Calculate space needed for surviving lexemes. */ + for (i = 0; i < nitems; i++) + datalen += VARSIZE_ANY_EXHDR(dlexemes[i]); tslen = CALCDATASIZE(nitems, datalen); + + /* Allocate and fill tsvector. */ tsout = (TSVector) palloc0(tslen); SET_VARSIZE(tsout, tslen); tsout->size = nitems; + arrout = ARRPTR(tsout); cur = STRPTR(tsout); - for (i = 0; i < nitems; i++) { - char *lex = VARDATA(dlexemes[i]); + char *lex = VARDATA_ANY(dlexemes[i]); int lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]); memcpy(cur, lex, lex_len); diff --git a/src/test/regress/expected/tstypes.out b/src/test/regress/expected/tstypes.out index 73f43c5ff0..8d9290cbac 100644 --- a/src/test/regress/expected/tstypes.out +++ b/src/test/regress/expected/tstypes.out @@ -1165,6 +1165,13 @@ SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship','strike']); SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship', NULL]); ERROR: lexeme array may not contain nulls +-- array_to_tsvector must sort and de-dup +SELECT array_to_tsvector(ARRAY['foo','bar','baz','bar']); + array_to_tsvector +------------------- + 'bar' 'baz' 'foo' +(1 row) + SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c'); setweight ---------------------------------------------------------- diff --git a/src/test/regress/sql/tstypes.sql b/src/test/regress/sql/tstypes.sql index f0c06ba5f5..9ea93a2993 100644 --- a/src/test/regress/sql/tstypes.sql +++ b/src/test/regress/sql/tstypes.sql @@ -226,6 +226,8 @@ SELECT tsvector_to_array('base hidden rebel spaceship strike'::tsvector); SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship','strike']); SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship', NULL]); +-- array_to_tsvector must sort and de-dup +SELECT array_to_tsvector(ARRAY['foo','bar','baz','bar']); SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c'); SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c');