From bb140506df605fab58f48926ee1db1f80bdafb59 Mon Sep 17 00:00:00 2001 From: Teodor Sigaev Date: Thu, 7 Apr 2016 18:44:18 +0300 Subject: [PATCH] Phrase full text search. Patch introduces a new text search operator (<-> or <N>) into tsquery. The on-disk and binary in/out formats of tsquery are backward compatible. It has two side effects: - the sort order of tsquery changes, so users who have a btree index over tsquery should reindex it - fewer parentheses appear in tsquery output, so tsquery becomes more readable Authors: Teodor Sigaev, Oleg Bartunov, Dmitry Ivanov Reviewers: Alexander Korotkov, Artur Zakirov --- contrib/tsearch2/expected/tsearch2.out | 56 ++-- doc/src/sgml/datatype.sgml | 9 +- doc/src/sgml/func.sgml | 39 +++ doc/src/sgml/textsearch.sgml | 182 ++++++++++- src/backend/tsearch/to_tsany.c | 187 +++++------ src/backend/tsearch/ts_parse.c | 15 +- src/backend/tsearch/ts_selfuncs.c | 3 +- src/backend/tsearch/wparser_def.c | 31 +- src/backend/utils/adt/tsginidx.c | 57 ++-- src/backend/utils/adt/tsgistidx.c | 4 +- src/backend/utils/adt/tsquery.c | 311 ++++++++++++++----- src/backend/utils/adt/tsquery_cleanup.c | 362 ++++++++++++++++++++-- src/backend/utils/adt/tsquery_op.c | 54 +++- src/backend/utils/adt/tsquery_util.c | 11 +- src/backend/utils/adt/tsrank.c | 263 +++++++++++----- src/backend/utils/adt/tsvector.c | 2 +- src/backend/utils/adt/tsvector_op.c | 326 +++++++++++++++++-- src/backend/utils/adt/tsvector_parser.c | 10 +- src/include/catalog/catversion.h | 2 +- src/include/catalog/pg_operator.h | 3 + src/include/catalog/pg_proc.h | 7 + src/include/tsearch/ts_public.h | 22 +- src/include/tsearch/ts_type.h | 30 +- src/include/tsearch/ts_utils.h | 15 +- src/test/regress/expected/tsdicts.out | 36 ++- src/test/regress/expected/tsearch.out | 395 ++++++++++++++++++++++-- src/test/regress/expected/tstypes.out | 369 +++++++++++++++++++++- src/test/regress/sql/tsdicts.sql | 3 + src/test/regress/sql/tsearch.sql | 101 ++++++ src/test/regress/sql/tstypes.sql | 75 ++++- 30 files
changed, 2536 insertions(+), 444 deletions(-) diff --git a/contrib/tsearch2/expected/tsearch2.out b/contrib/tsearch2/expected/tsearch2.out index 972f764c14..97379e7185 100644 --- a/contrib/tsearch2/expected/tsearch2.out +++ b/contrib/tsearch2/expected/tsearch2.out @@ -278,15 +278,15 @@ SELECT '(!1|2)&3'::tsquery; (1 row) SELECT '1|(2|(4|(5|6)))'::tsquery; - tsquery ------------------------------------------ - '1' | ( '2' | ( '4' | ( '5' | '6' ) ) ) + tsquery +----------------------------- + '1' | '2' | '4' | '5' | '6' (1 row) SELECT '1|2|4|5|6'::tsquery; - tsquery ------------------------------------------ - ( ( ( '1' | '2' ) | '4' ) | '5' ) | '6' + tsquery +----------------------------- + '1' | '2' | '4' | '5' | '6' (1 row) SELECT '1&(2&(4&(5&6)))'::tsquery; @@ -340,7 +340,7 @@ select 'a' > 'b & c'::tsquery; select 'a | f' < 'b & c'::tsquery; ?column? ---------- - t + f (1 row) select 'a | ff' < 'b & c'::tsquery; @@ -443,9 +443,9 @@ select count(*) from test_tsquery where keyword > 'new & york'; set enable_seqscan=on; select rewrite('foo & bar & qq & new & york', 'new & york'::tsquery, 'big & apple | nyc | new & york & city'); - rewrite ----------------------------------------------------------------------------------- - 'foo' & 'bar' & 'qq' & ( 'city' & 'new' & 'york' | ( 'nyc' | 'big' & 'apple' ) ) + rewrite +------------------------------------------------------------------------------ + 'foo' & 'bar' & 'qq' & ( 'nyc' | 'big' & 'apple' | 'city' & 'new' & 'york' ) (1 row) select rewrite('moscow', 'select keyword, sample from test_tsquery'::text ); @@ -461,9 +461,9 @@ select rewrite('moscow & hotel', 'select keyword, sample from test_tsquery'::tex (1 row) select rewrite('bar & new & qq & foo & york', 'select keyword, sample from test_tsquery'::text ); - rewrite -------------------------------------------------------------------------------------- - 'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | ( 'big' & 'appl' | 'new' & 'york' ) ) + rewrite 
+--------------------------------------------------------------------------------- + ( 'nyc' | 'big' & 'appl' | 'new' & 'york' ) & 'citi' & 'foo' & ( 'bar' | 'qq' ) (1 row) select rewrite( ARRAY['moscow', keyword, sample] ) from test_tsquery; @@ -479,9 +479,9 @@ select rewrite( ARRAY['moscow & hotel', keyword, sample] ) from test_tsquery; (1 row) select rewrite( ARRAY['bar & new & qq & foo & york', keyword, sample] ) from test_tsquery; - rewrite -------------------------------------------------------------------------------------- - 'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | ( 'big' & 'appl' | 'new' & 'york' ) ) + rewrite +--------------------------------------------------------------------------------- + ( 'nyc' | 'big' & 'appl' | 'new' & 'york' ) & 'citi' & 'foo' & ( 'bar' | 'qq' ) (1 row) select keyword from test_tsquery where keyword @> 'new'; @@ -520,9 +520,9 @@ select rewrite( ARRAY[query, keyword, sample] ) from test_tsquery, to_tsquery('e (1 row) select rewrite( ARRAY[query, keyword, sample] ) from test_tsquery, to_tsquery('english', 'bar & new & qq & foo & york') as query where keyword <@ query; - rewrite -------------------------------------------------------------------------------------- - 'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | ( 'big' & 'appl' | 'new' & 'york' ) ) + rewrite +--------------------------------------------------------------------------------- + ( 'nyc' | 'big' & 'appl' | 'new' & 'york' ) & 'citi' & 'foo' & ( 'bar' | 'qq' ) (1 row) select rewrite( ARRAY[query, keyword, sample] ) from test_tsquery, to_tsquery('english', 'moscow') as query where query @> keyword; @@ -538,9 +538,9 @@ select rewrite( ARRAY[query, keyword, sample] ) from test_tsquery, to_tsquery('e (1 row) select rewrite( ARRAY[query, keyword, sample] ) from test_tsquery, to_tsquery('english', 'bar & new & qq & foo & york') as query where query @> keyword; - rewrite -------------------------------------------------------------------------------------- - 'citi' & 'foo' 
& ( 'bar' | 'qq' ) & ( 'nyc' | ( 'big' & 'appl' | 'new' & 'york' ) ) + rewrite +--------------------------------------------------------------------------------- + ( 'nyc' | 'big' & 'appl' | 'new' & 'york' ) & 'citi' & 'foo' & ( 'bar' | 'qq' ) (1 row) create index qq on test_tsquery using gist (keyword gist_tp_tsquery_ops); @@ -581,9 +581,9 @@ select rewrite( ARRAY[query, keyword, sample] ) from test_tsquery, to_tsquery('e (1 row) select rewrite( ARRAY[query, keyword, sample] ) from test_tsquery, to_tsquery('english', 'bar & new & qq & foo & york') as query where keyword <@ query; - rewrite -------------------------------------------------------------------------------------- - 'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | ( 'big' & 'appl' | 'new' & 'york' ) ) + rewrite +--------------------------------------------------------------------------------- + ( 'nyc' | 'big' & 'appl' | 'new' & 'york' ) & 'citi' & 'foo' & ( 'bar' | 'qq' ) (1 row) select rewrite( ARRAY[query, keyword, sample] ) from test_tsquery, to_tsquery('english', 'moscow') as query where query @> keyword; @@ -599,9 +599,9 @@ select rewrite( ARRAY[query, keyword, sample] ) from test_tsquery, to_tsquery('e (1 row) select rewrite( ARRAY[query, keyword, sample] ) from test_tsquery, to_tsquery('english', 'bar & new & qq & foo & york') as query where query @> keyword; - rewrite -------------------------------------------------------------------------------------- - 'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | ( 'big' & 'appl' | 'new' & 'york' ) ) + rewrite +--------------------------------------------------------------------------------- + ( 'nyc' | 'big' & 'appl' | 'new' & 'york' ) & 'citi' & 'foo' & ( 'bar' | 'qq' ) (1 row) set enable_seqscan='on'; diff --git a/doc/src/sgml/datatype.sgml b/doc/src/sgml/datatype.sgml index 7c3ef92cd2..0b60c61d48 100644 --- a/doc/src/sgml/datatype.sgml +++ b/doc/src/sgml/datatype.sgml @@ -3924,8 +3924,9 @@ SELECT to_tsvector('english', 'The Fat Rats'); A tsquery value 
stores lexemes that are to be searched for, and combines them honoring the Boolean operators - & (AND), | (OR), and - ! (NOT). Parentheses can be used to enforce grouping + & (AND), | (OR), + ! (NOT) and <-> (FOLLOWED BY) phrase search + operator. Parentheses can be used to enforce grouping of the operators: @@ -3946,8 +3947,8 @@ SELECT 'fat & rat & ! cat'::tsquery; In the absence of parentheses, ! (NOT) binds most tightly, - and & (AND) binds more tightly than - | (OR). + and & (AND) and <-> (FOLLOWED BY) + both bind more tightly than | (OR). diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 15b6b4eb3d..9b0778baa9 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -9127,6 +9127,12 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple !! 'cat'::tsquery !'cat' + + <-> + tsquery followed by tsquery + to_tsquery('fat') <-> to_tsquery('rat') + 'fat' <-> 'rat' + @> tsquery contains another ? @@ -9219,6 +9225,18 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple plainto_tsquery('english', 'The Fat Rats') 'fat' & 'rat' + + + + phraseto_tsquery + + phraseto_tsquery( config regconfig , query text) + + tsquery + produce tsquery ignoring punctuation + phraseto_tsquery('english', 'The Fat Rats') + 'fat' <-> 'rat' + @@ -9421,6 +9439,27 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple SELECT ts_rewrite('a & b'::tsquery, 'SELECT t,s FROM aliases') 'b' & ( 'foo' | 'bar' ) + + + + tsquery_phrase + + tsquery_phrase(query1 tsquery, query2 tsquery) + + tsquery + implementation of <-> (FOLLOWED BY) operator + tsquery_phrase(to_tsquery('fat'), to_tsquery('cat')) + 'fat' <-> 'cat' + + + + tsquery_phrase(query1 tsquery, query2 tsquery, distance integer) + + tsquery + phrase-concatenate with distance + tsquery_phrase(to_tsquery('fat'), to_tsquery('cat'), 10) + 'fat' <10> 'cat' + diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml index 
ea3abc9e15..930c8f0a5d 100644 --- a/doc/src/sgml/textsearch.sgml +++ b/doc/src/sgml/textsearch.sgml @@ -263,9 +263,10 @@ SELECT 'fat & cow'::tsquery @@ 'a fat cat sat on a mat and ate a fat rat'::t As the above example suggests, a tsquery is not just raw text, any more than a tsvector is. A tsquery contains search terms, which must be already-normalized lexemes, and - may combine multiple terms using AND, OR, and NOT operators. + may combine multiple terms using AND, OR, NOT and FOLLOWED BY operators. (For details see .) There are - functions to_tsquery and plainto_tsquery + functions to_tsquery, plainto_tsquery + and phraseto_tsquery that are helpful in converting user-written text into a proper tsquery, for example by normalizing words appearing in the text. Similarly, to_tsvector is used to parse and @@ -293,6 +294,35 @@ SELECT 'fat cats ate fat rats'::tsvector @@ to_tsquery('fat & rat'); already normalized, so rats does not match rat. + + Phrase search is made possible with the help of the <-> + (FOLLOWED BY) operator, which enforces lexeme order. This allows you + to discard strings not containing the desired phrase, for example: + + +SELECT q @@ to_tsquery('fatal <-> error') +FROM unnest(array[to_tsvector('fatal error'), + to_tsvector('error is not fatal')]) AS q; + ?column? +---------- + t + f + + + A more generic version of the FOLLOWED BY operator takes form of + <N>, where N stands for the greatest allowed distance + between the specified lexemes. The phraseto_tsquery + function makes use of this behavior in order to construct a + tsquery capable of matching the provided phrase: + + +SELECT phraseto_tsquery('cat ate some rats'); + phraseto_tsquery +------------------------------- + ( 'cat' <-> 'ate' ) <2> 'rat' + + + The @@ operator also supports text input, allowing explicit conversion of a text @@ -709,11 +739,14 @@ UPDATE tt SET ti = PostgreSQL provides the - functions to_tsquery and - plainto_tsquery for converting a query to - the tsquery data type. 
to_tsquery - offers access to more features than plainto_tsquery, - but is less forgiving about its input. + functions to_tsquery, + plainto_tsquery and + phraseto_tsquery + for converting a query to the tsquery data type. + to_tsquery offers access to more features + than both plainto_tsquery and + phraseto_tsquery, but is less forgiving + about its input. @@ -728,7 +761,8 @@ to_tsquery( config to_tsquery creates a tsquery value from querytext, which must consist of single tokens separated by the Boolean operators & (AND), - | (OR) and ! (NOT). These operators + | (OR), ! (NOT), and also the + <-> (FOLLOWED BY) phrase search operator. These operators can be grouped using parentheses. In other words, the input to to_tsquery must already follow the general rules for tsquery input, as described in Note that plainto_tsquery cannot - recognize Boolean operators, weight labels, or prefix-match labels - in its input: + recognize Boolean and phrase search operators, weight labels, + or prefix-match labels in its input: SELECT plainto_tsquery('english', 'The Fat & Rats:C'); @@ -827,6 +861,57 @@ SELECT plainto_tsquery('english', 'The Fat & Rats:C'); Here, all the input punctuation was discarded as being space symbols. + + phraseto_tsquery + + + +phraseto_tsquery( config regconfig, querytext text) returns tsquery + + + + phraseto_tsquery behaves much like + plainto_tsquery, with the exception + that it utilizes the <-> (FOLLOWED BY) phrase search + operator instead of the & (AND) Boolean operator. + This is particularly useful when searching for exact lexeme sequences, + since the phrase search operator helps to maintain lexeme order. 
+ + + + Example: + + +SELECT phraseto_tsquery('english', 'The Fat Rats'); + phraseto_tsquery +------------------ + 'fat' <-> 'rat' + + + Just like the plainto_tsquery, the + phraseto_tsquery function cannot + recognize Boolean and phrase search operators, weight labels, + or prefix-match labels in its input: + + +SELECT phraseto_tsquery('english', 'The Fat & Rats:C'); + phraseto_tsquery +----------------------------- + ( 'fat' <-> 'rat' ) <-> 'c' + + + It is possible to specify the configuration to be used to parse the document, + for example, we could create a new one using the hunspell dictionary + (namely 'eng_hunspell') in order to match phrases with different word forms: + + +SELECT phraseto_tsquery('eng_hunspell', 'developer of the building which collapsed'); + phraseto_tsquery +-------------------------------------------------------------------------------------------- + ( 'developer' <3> 'building' ) <2> 'collapse' | ( 'developer' <3> 'build' ) <2> 'collapse' + + + @@ -1387,6 +1472,81 @@ FROM (SELECT id, body, q, ts_rank_cd(ti, q) AS rank + + + + tsquery <-> tsquery + + + + + Returns the phrase-concatenation of the two given queries. + + +SELECT to_tsquery('fat') <-> to_tsquery('cat | rat'); + ?column? +----------------------------------- + 'fat' <-> 'cat' | 'fat' <-> 'rat' + + + + + + + + + + + tsquery_phrase + + + tsquery_phrase(query1 tsquery, query2 tsquery [, distance integer ]) returns tsquery + + + + + Returns the distanced phrase-concatenation of the two given queries. + This function lies in the implementation of the <-> operator. + + +SELECT tsquery_phrase(to_tsquery('fat'), to_tsquery('cat'), 10); + tsquery_phrase +------------------ + 'fat' <10> 'cat' + + + + + + + + + + + setweight + + + setweight(query tsquery, weight "char") returns tsquery + + + + + setweight returns a copy of the input query in which every + position has been labeled with the given weight(s), either + A, B, C, + D or their combination. 
These labels are retained when + queries are concatenated, allowing words from different parts of a document + to be weighted differently by ranking functions. + + + + Note that weight labels apply to positions, not + lexemes. If the input query has been stripped of + positions then setweight does nothing. + + + + @@ -2428,7 +2588,7 @@ more sample word(s) : more indexed word(s) Specific stop words recognized by the subdictionary cannot be - specified; instead use ? to mark the location where any + specified; instead use <-> to mark the location where any stop word can appear. For example, assuming that a and the are stop words according to the subdictionary: diff --git a/src/backend/tsearch/to_tsany.c b/src/backend/tsearch/to_tsany.c index aa77ec0728..3f69d74702 100644 --- a/src/backend/tsearch/to_tsany.c +++ b/src/backend/tsearch/to_tsany.c @@ -18,6 +18,13 @@ #include "utils/builtins.h" +typedef struct MorphOpaque +{ + Oid cfg_id; + int qoperator; /* query operator */ +} MorphOpaque; + + Datum get_current_ts_config(PG_FUNCTION_ARGS) { @@ -262,60 +269,81 @@ to_tsvector(PG_FUNCTION_ARGS) * to the stack. * * All words belonging to the same variant are pushed as an ANDed list, - * and different variants are ORred together. + * and different variants are ORed together. 
*/ static void pushval_morph(Datum opaque, TSQueryParserState state, char *strval, int lenval, int16 weight, bool prefix) { - int32 count = 0; - ParsedText prs; - uint32 variant, - pos, - cntvar = 0, - cntpos = 0, - cnt = 0; - Oid cfg_id = DatumGetObjectId(opaque); /* the input is actually - * an Oid, not a pointer */ + int32 count = 0; + ParsedText prs; + uint32 variant, + pos = 0, + cntvar = 0, + cntpos = 0, + cnt = 0; + MorphOpaque *data = (MorphOpaque *) DatumGetPointer(opaque); prs.lenwords = 4; prs.curwords = 0; prs.pos = 0; prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords); - parsetext(cfg_id, &prs, strval, lenval); + parsetext(data->cfg_id, &prs, strval, lenval); if (prs.curwords > 0) { - while (count < prs.curwords) { - pos = prs.words[count].pos.pos; + /* + * Were any stop words removed? If so, fill empty positions + * with placeholders linked by an appropriate operator. + */ + if (pos > 0 && pos + 1 < prs.words[count].pos.pos) + { + while (pos + 1 < prs.words[count].pos.pos) + { + /* put placeholders for each missing stop word */ + pushStop(state); + if (cntpos) + pushOperator(state, data->qoperator, 1); + cntpos++; + pos++; + } + } + + pos = prs.words[count].pos.pos; /* save current word's position */ + + /* Go through all variants obtained from this token */ cntvar = 0; while (count < prs.curwords && pos == prs.words[count].pos.pos) { variant = prs.words[count].nvariant; + /* Push all words belonging to the same variant */ cnt = 0; - while (count < prs.curwords && pos == prs.words[count].pos.pos && variant == prs.words[count].nvariant) + while (count < prs.curwords && + pos == prs.words[count].pos.pos && + variant == prs.words[count].nvariant) { - - pushValue(state, prs.words[count].word, prs.words[count].len, weight, - ((prs.words[count].flags & TSL_PREFIX) || prefix) ? 
true : false); + pushValue(state, + prs.words[count].word, + prs.words[count].len, + weight, + ((prs.words[count].flags & TSL_PREFIX) || prefix)); pfree(prs.words[count].word); if (cnt) - pushOperator(state, OP_AND); + pushOperator(state, OP_AND, 0); cnt++; count++; } if (cntvar) - pushOperator(state, OP_OR); + pushOperator(state, OP_OR, 0); cntvar++; } if (cntpos) - pushOperator(state, OP_AND); - + pushOperator(state, data->qoperator, 1); /* distance may be useful */ cntpos++; } @@ -329,44 +357,18 @@ pushval_morph(Datum opaque, TSQueryParserState state, char *strval, int lenval, Datum to_tsquery_byid(PG_FUNCTION_ARGS) { - Oid cfgid = PG_GETARG_OID(0); - text *in = PG_GETARG_TEXT_P(1); - TSQuery query; - QueryItem *res; - int32 len; - - query = parse_tsquery(text_to_cstring(in), pushval_morph, ObjectIdGetDatum(cfgid), false); - - if (query->size == 0) - PG_RETURN_TSQUERY(query); - - /* clean out any stopword placeholders from the tree */ - res = clean_fakeval(GETQUERY(query), &len); - if (!res) - { - SET_VARSIZE(query, HDRSIZETQ); - query->size = 0; - PG_RETURN_POINTER(query); - } - memcpy((void *) GETQUERY(query), (void *) res, len * sizeof(QueryItem)); + text *in = PG_GETARG_TEXT_P(1); + TSQuery query; + MorphOpaque data; - /* - * Removing the stopword placeholders might've resulted in fewer - * QueryItems. If so, move the operands up accordingly. 
- */ - if (len != query->size) - { - char *oldoperand = GETOPERAND(query); - int32 lenoperand = VARSIZE(query) - (oldoperand - (char *) query); + data.cfg_id = PG_GETARG_OID(0); + data.qoperator = OP_AND; - Assert(len < query->size); - - query->size = len; - memmove((void *) GETOPERAND(query), oldoperand, VARSIZE(query) - (oldoperand - (char *) query)); - SET_VARSIZE(query, COMPUTESIZE(len, lenoperand)); - } + query = parse_tsquery(text_to_cstring(in), + pushval_morph, + PointerGetDatum(&data), + false); - pfree(res); PG_RETURN_TSQUERY(query); } @@ -385,55 +387,60 @@ to_tsquery(PG_FUNCTION_ARGS) Datum plainto_tsquery_byid(PG_FUNCTION_ARGS) { - Oid cfgid = PG_GETARG_OID(0); - text *in = PG_GETARG_TEXT_P(1); - TSQuery query; - QueryItem *res; - int32 len; + text *in = PG_GETARG_TEXT_P(1); + TSQuery query; + MorphOpaque data; - query = parse_tsquery(text_to_cstring(in), pushval_morph, ObjectIdGetDatum(cfgid), true); + data.cfg_id = PG_GETARG_OID(0); + data.qoperator = OP_AND; - if (query->size == 0) - PG_RETURN_TSQUERY(query); + query = parse_tsquery(text_to_cstring(in), + pushval_morph, + PointerGetDatum(&data), + true); - /* clean out any stopword placeholders from the tree */ - res = clean_fakeval(GETQUERY(query), &len); - if (!res) - { - SET_VARSIZE(query, HDRSIZETQ); - query->size = 0; - PG_RETURN_POINTER(query); - } - memcpy((void *) GETQUERY(query), (void *) res, len * sizeof(QueryItem)); + PG_RETURN_POINTER(query); +} - /* - * Removing the stopword placeholders might've resulted in fewer - * QueryItems. If so, move the operands up accordingly. 
- */ - if (len != query->size) - { - char *oldoperand = GETOPERAND(query); - int32 lenoperand = VARSIZE(query) - (oldoperand - (char *) query); +Datum +plainto_tsquery(PG_FUNCTION_ARGS) +{ + text *in = PG_GETARG_TEXT_P(0); + Oid cfgId; + + cfgId = getTSCurrentConfig(true); + PG_RETURN_DATUM(DirectFunctionCall2(plainto_tsquery_byid, + ObjectIdGetDatum(cfgId), + PointerGetDatum(in))); +} - Assert(len < query->size); - query->size = len; - memmove((void *) GETOPERAND(query), oldoperand, lenoperand); - SET_VARSIZE(query, COMPUTESIZE(len, lenoperand)); - } +Datum +phraseto_tsquery_byid(PG_FUNCTION_ARGS) +{ + text *in = PG_GETARG_TEXT_P(1); + TSQuery query; + MorphOpaque data; - pfree(res); - PG_RETURN_POINTER(query); + data.cfg_id = PG_GETARG_OID(0); + data.qoperator = OP_PHRASE; + + query = parse_tsquery(text_to_cstring(in), + pushval_morph, + PointerGetDatum(&data), + true); + + PG_RETURN_TSQUERY(query); } Datum -plainto_tsquery(PG_FUNCTION_ARGS) +phraseto_tsquery(PG_FUNCTION_ARGS) { text *in = PG_GETARG_TEXT_P(0); Oid cfgId; cfgId = getTSCurrentConfig(true); - PG_RETURN_DATUM(DirectFunctionCall2(plainto_tsquery_byid, + PG_RETURN_DATUM(DirectFunctionCall2(phraseto_tsquery_byid, ObjectIdGetDatum(cfgId), PointerGetDatum(in))); } diff --git a/src/backend/tsearch/ts_parse.c b/src/backend/tsearch/ts_parse.c index 64cf906a5a..f0e4269e84 100644 --- a/src/backend/tsearch/ts_parse.c +++ b/src/backend/tsearch/ts_parse.c @@ -454,7 +454,7 @@ hladdword(HeadlineParsedText *prs, char *buf, int buflen, int type) } static void -hlfinditem(HeadlineParsedText *prs, TSQuery query, char *buf, int buflen) +hlfinditem(HeadlineParsedText *prs, TSQuery query, int32 pos, char *buf, int buflen) { int i; QueryItem *item = GETQUERY(query); @@ -467,6 +467,7 @@ hlfinditem(HeadlineParsedText *prs, TSQuery query, char *buf, int buflen) } word = &(prs->words[prs->curwords - 1]); + word->pos = LIMITPOS(pos); for (i = 0; i < query->size; i++) { if (item->type == QI_VAL && @@ -492,17 +493,20 @@ 
addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme { ParsedLex *tmplexs; TSLexeme *ptr; + int32 savedpos; while (lexs) { - if (lexs->type > 0) hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type); ptr = norms; + savedpos = prs->vectorpos; while (ptr && ptr->lexeme) { - hlfinditem(prs, query, ptr->lexeme, strlen(ptr->lexeme)); + if (ptr->flags & TSL_ADDPOS) + savedpos++; + hlfinditem(prs, query, savedpos, ptr->lexeme, strlen(ptr->lexeme)); ptr++; } @@ -516,6 +520,8 @@ addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme ptr = norms; while (ptr->lexeme) { + if (ptr->flags & TSL_ADDPOS) + prs->vectorpos++; pfree(ptr->lexeme); ptr++; } @@ -575,7 +581,10 @@ hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int bu do { if ((norms = LexizeExec(&ldata, &lexs)) != NULL) + { + prs->vectorpos++; addHLParsedLex(prs, query, lexs, norms); + } else addHLParsedLex(prs, query, lexs, NULL); } while (norms); diff --git a/src/backend/tsearch/ts_selfuncs.c b/src/backend/tsearch/ts_selfuncs.c index 7462888b5c..c4118f1db2 100644 --- a/src/backend/tsearch/ts_selfuncs.c +++ b/src/backend/tsearch/ts_selfuncs.c @@ -261,7 +261,7 @@ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem, /* * Traverse the tsquery in preorder, calculating selectivity as: * - * selec(left_oper) * selec(right_oper) in AND nodes, + * selec(left_oper) * selec(right_oper) in AND & PHRASE nodes, * * selec(left_oper) + selec(right_oper) - * selec(left_oper) * selec(right_oper) in OR nodes, @@ -400,6 +400,7 @@ tsquery_opr_selec(QueryItem *item, char *operand, lookup, length, minfreq); break; + case OP_PHRASE: case OP_AND: s1 = tsquery_opr_selec(item + 1, operand, lookup, length, minfreq); diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c index 4a28ce7545..2faa15ebd4 100644 --- a/src/backend/tsearch/wparser_def.c +++ b/src/backend/tsearch/wparser_def.c @@ -2030,15 +2030,36 @@ typedef struct } 
hlCheck; static bool -checkcondition_HL(void *checkval, QueryOperand *val) +checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data) { int i; + hlCheck *checkval = (hlCheck *) opaque; - for (i = 0; i < ((hlCheck *) checkval)->len; i++) + for (i = 0; i < checkval->len; i++) { - if (((hlCheck *) checkval)->words[i].item == val) - return true; + if (checkval->words[i].item == val) + { + /* don't need to find all positions */ + if (!data) + return true; + + if (!data->pos) + { + data->pos = palloc(sizeof(WordEntryPos) * checkval->len); + data->allocated = true; + data->npos = 1; + data->pos[0] = checkval->words[i].pos; + } + else if (data->pos[data->npos - 1] < checkval->words[i].pos) + { + data->pos[data->npos++] = checkval->words[i].pos; + } + } } + + if (data && data->npos > 0) + return true; + return false; } @@ -2400,7 +2421,7 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight, if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)) { - /* best already finded, so try one more cover */ + /* best already found, so try one more cover */ p++; continue; } diff --git a/src/backend/utils/adt/tsginidx.c b/src/backend/utils/adt/tsginidx.c index fef594700a..fc0686ee66 100644 --- a/src/backend/utils/adt/tsginidx.c +++ b/src/backend/utils/adt/tsginidx.c @@ -179,14 +179,16 @@ typedef struct } GinChkVal; static GinTernaryValue -checkcondition_gin(void *checkval, QueryOperand *val) +checkcondition_gin_internal(GinChkVal *gcv, QueryOperand *val, ExecPhraseData *data) { - GinChkVal *gcv = (GinChkVal *) checkval; int j; - /* if any val requiring a weight is used, set recheck flag */ - if (val->weight != 0) - *(gcv->need_recheck) = true; + /* + * if any val requiring a weight is used or caller + * needs position information then set recheck flag + */ + if (val->weight != 0 || data != NULL) + *gcv->need_recheck = true; /* convert item's number to corresponding entry's (operand's) number */ j = 
gcv->map_item_operand[((QueryItem *) val) - gcv->first_item]; @@ -195,16 +197,22 @@ checkcondition_gin(void *checkval, QueryOperand *val) return gcv->check[j]; } +/* + * Wrapper of check condition function for TS_execute. + */ +static bool +checkcondition_gin(void *checkval, QueryOperand *val, ExecPhraseData *data) +{ + return checkcondition_gin_internal((GinChkVal *) checkval, + val, + data) != GIN_FALSE; +} + /* * Evaluate tsquery boolean expression using ternary logic. - * - * chkcond is a callback function used to evaluate each VAL node in the query. - * checkval can be used to pass information to the callback. TS_execute doesn't - * do anything with it. */ static GinTernaryValue -TS_execute_ternary(QueryItem *curitem, void *checkval, - GinTernaryValue (*chkcond) (void *checkval, QueryOperand *val)) +TS_execute_ternary(GinChkVal *gcv, QueryItem *curitem) { GinTernaryValue val1, val2, @@ -214,22 +222,30 @@ TS_execute_ternary(QueryItem *curitem, void *checkval, check_stack_depth(); if (curitem->type == QI_VAL) - return chkcond(checkval, (QueryOperand *) curitem); + return checkcondition_gin_internal(gcv, + (QueryOperand *) curitem, + NULL /* don't have any position info */); switch (curitem->qoperator.oper) { case OP_NOT: - result = TS_execute_ternary(curitem + 1, checkval, chkcond); + result = TS_execute_ternary(gcv, curitem + 1); if (result == GIN_MAYBE) return result; return !result; + case OP_PHRASE: + /* + * GIN doesn't contain any information about positions, + * treat OP_PHRASE as OP_AND with recheck requirement + */ + *gcv->need_recheck = true; + case OP_AND: - val1 = TS_execute_ternary(curitem + curitem->qoperator.left, - checkval, chkcond); + val1 = TS_execute_ternary(gcv, curitem + curitem->qoperator.left); if (val1 == GIN_FALSE) return GIN_FALSE; - val2 = TS_execute_ternary(curitem + 1, checkval, chkcond); + val2 = TS_execute_ternary(gcv, curitem + 1); if (val2 == GIN_FALSE) return GIN_FALSE; if (val1 == GIN_TRUE && val2 == GIN_TRUE) @@ -238,11 
+254,10 @@ TS_execute_ternary(QueryItem *curitem, void *checkval, return GIN_MAYBE; case OP_OR: - val1 = TS_execute_ternary(curitem + curitem->qoperator.left, - checkval, chkcond); + val1 = TS_execute_ternary(gcv, curitem + curitem->qoperator.left); if (val1 == GIN_TRUE) return GIN_TRUE; - val2 = TS_execute_ternary(curitem + 1, checkval, chkcond); + val2 = TS_execute_ternary(gcv, curitem + 1); if (val2 == GIN_TRUE) return GIN_TRUE; if (val1 == GIN_FALSE && val2 == GIN_FALSE) @@ -327,9 +342,7 @@ gin_tsquery_triconsistent(PG_FUNCTION_ARGS) gcv.map_item_operand = (int *) (extra_data[0]); gcv.need_recheck = &recheck; - res = TS_execute_ternary(GETQUERY(query), - &gcv, - checkcondition_gin); + res = TS_execute_ternary(&gcv, GETQUERY(query)); if (res == GIN_TRUE && recheck) res = GIN_MAYBE; diff --git a/src/backend/utils/adt/tsgistidx.c b/src/backend/utils/adt/tsgistidx.c index 0100cf4f37..cdd5d43fce 100644 --- a/src/backend/utils/adt/tsgistidx.c +++ b/src/backend/utils/adt/tsgistidx.c @@ -298,7 +298,7 @@ typedef struct * is there value 'val' in array or not ? */ static bool -checkcondition_arr(void *checkval, QueryOperand *val) +checkcondition_arr(void *checkval, QueryOperand *val, ExecPhraseData *data) { int32 *StopLow = ((CHKVAL *) checkval)->arrb; int32 *StopHigh = ((CHKVAL *) checkval)->arre; @@ -327,7 +327,7 @@ checkcondition_arr(void *checkval, QueryOperand *val) } static bool -checkcondition_bit(void *checkval, QueryOperand *val) +checkcondition_bit(void *checkval, QueryOperand *val, ExecPhraseData *data) { /* * we are not able to find a prefix in signature tree diff --git a/src/backend/utils/adt/tsquery.c b/src/backend/utils/adt/tsquery.c index 0732060678..257b5d3345 100644 --- a/src/backend/utils/adt/tsquery.c +++ b/src/backend/utils/adt/tsquery.c @@ -56,7 +56,7 @@ struct TSQueryParserStateData /* * subroutine to parse the modifiers (weight and prefix flag currently) - * part, like ':1AB' of a query. + * part, like ':AB*' of a query. 
*/ static char * get_modifiers(char *buf, int16 *weight, bool *prefix) @@ -100,6 +100,100 @@ get_modifiers(char *buf, int16 *weight, bool *prefix) return buf; } +/* + * Parse phrase operator. The operator + * may take the following forms: + * + * a <X> b (distance is no greater than X) + * a <-> b (default distance = 1) + * + * The buffer should begin with '<' char + * + * On success, stores the distance in *distance and returns a pointer just + * past the operator. If the operator is malformed, or the input ends before + * parsing finishes, stores -1 in *distance and returns buf unchanged. + */ +static char * +parse_phrase_operator(char *buf, int16 *distance) +{ + enum + { + PHRASE_OPEN = 0, + PHRASE_DIST, + PHRASE_CLOSE, + PHRASE_ERR, + PHRASE_FINISH + } state = PHRASE_OPEN; + + char *ptr = buf; + char *endptr; + long l = 1; + + while (*ptr) + { + switch(state) + { + case PHRASE_OPEN: + Assert(t_iseq(ptr, '<')); + state = PHRASE_DIST; + ptr++; + break; + + case PHRASE_DIST: + if (t_iseq(ptr, '-')) + { + state = PHRASE_CLOSE; + ptr++; + break; + } + else if (!t_isdigit(ptr)) + { + state = PHRASE_ERR; + break; + } + + /* strtol reports overflow only through errno, so clear any stale value */ + errno = 0; + l = strtol(ptr, &endptr, 10); + if (ptr == endptr) + state = PHRASE_ERR; + else if (errno == ERANGE || l > MAXENTRYPOS) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("distance in phrase operator should not be greater than %d", + MAXENTRYPOS))); + else + { + state = PHRASE_CLOSE; + ptr = endptr; + } + break; + + case PHRASE_CLOSE: + if (t_iseq(ptr, '>')) + { + state = PHRASE_FINISH; + ptr++; + } + else + state = PHRASE_ERR; + break; + + case PHRASE_FINISH: + *distance = (int16) l; + return ptr; + + case PHRASE_ERR: + default: + goto err; + } + } + + err: + *distance = -1; + return buf; +} + /* * token types for parsing */ @@ -116,8 +204,10 @@ typedef enum /* * get token from query string * - * *operator is filled in with OP_* when return values is PT_OPR + * *operator is filled in with OP_* when return values is PT_OPR, + * but *weight could contain a distance value in case of phrase operator.
* *strval, *lenval and *weight are filled in when return value is PT_VAL + * */ static ts_tokentype gettoken_query(TSQueryParserState state, @@ -185,13 +275,23 @@ gettoken_query(TSQueryParserState state, (state->buf)++; return PT_OPR; } - if (t_iseq(state->buf, '|')) + else if (t_iseq(state->buf, '|')) { state->state = WAITOPERAND; *operator = OP_OR; (state->buf)++; return PT_OPR; } + else if (t_iseq(state->buf, '<')) + { + state->state = WAITOPERAND; + *operator = OP_PHRASE; + /* weight var is used as storage for distance */ + state->buf = parse_phrase_operator(state->buf, weight); + if (*weight < 0) + return PT_ERR; + return PT_OPR; + } else if (t_iseq(state->buf, ')')) { (state->buf)++; @@ -223,15 +323,16 @@ gettoken_query(TSQueryParserState state, * Push an operator to state->polstr */ void -pushOperator(TSQueryParserState state, int8 oper) +pushOperator(TSQueryParserState state, int8 oper, int16 distance) { QueryOperator *tmp; - Assert(oper == OP_NOT || oper == OP_AND || oper == OP_OR); + Assert(oper == OP_NOT || oper == OP_AND || oper == OP_OR || oper == OP_PHRASE); tmp = (QueryOperator *) palloc0(sizeof(QueryOperator)); tmp->type = QI_OPR; tmp->oper = oper; + tmp->distance = (oper == OP_PHRASE) ? 
distance : 0; /* left is filled in later with findoprnd */ state->polstr = lcons(tmp, state->polstr); @@ -330,14 +431,18 @@ makepol(TSQueryParserState state, PushFunction pushval, Datum opaque) { - int8 operator = 0; - ts_tokentype type; - int lenval = 0; - char *strval = NULL; - int8 opstack[STACKDEPTH]; - int lenstack = 0; - int16 weight = 0; - bool prefix; + int8 operator = 0; + ts_tokentype type; + int lenval = 0; + char *strval = NULL; + struct + { + int8 op; + int16 distance; + } opstack[STACKDEPTH]; + int lenstack = 0; + int16 weight = 0; + bool prefix; /* since this function recurses, it could be driven to stack overflow */ check_stack_depth(); @@ -348,39 +453,48 @@ makepol(TSQueryParserState state, { case PT_VAL: pushval(opaque, state, strval, lenval, weight, prefix); - while (lenstack && (opstack[lenstack - 1] == OP_AND || - opstack[lenstack - 1] == OP_NOT)) + while (lenstack && (opstack[lenstack - 1].op == OP_AND || + opstack[lenstack - 1].op == OP_PHRASE || + opstack[lenstack - 1].op == OP_NOT)) { lenstack--; - pushOperator(state, opstack[lenstack]); + pushOperator(state, + opstack[lenstack].op, + opstack[lenstack].distance); } break; case PT_OPR: if (lenstack && operator == OP_OR) - pushOperator(state, OP_OR); + pushOperator(state, OP_OR, 0); else { if (lenstack == STACKDEPTH) /* internal error */ elog(ERROR, "tsquery stack too small"); - opstack[lenstack] = operator; + opstack[lenstack].op = operator; + opstack[lenstack].distance = weight; lenstack++; } break; case PT_OPEN: makepol(state, pushval, opaque); - while (lenstack && (opstack[lenstack - 1] == OP_AND || - opstack[lenstack - 1] == OP_NOT)) + while (lenstack && (opstack[lenstack - 1].op == OP_AND || + opstack[lenstack - 1].op == OP_PHRASE || + opstack[lenstack - 1].op == OP_NOT)) { lenstack--; - pushOperator(state, opstack[lenstack]); + pushOperator(state, + opstack[lenstack].op, + opstack[lenstack].distance); } break; case PT_CLOSE: while (lenstack) { lenstack--; - pushOperator(state, 
opstack[lenstack]); + pushOperator(state, + opstack[lenstack].op, + opstack[lenstack].distance); }; return; case PT_ERR: @@ -394,12 +508,14 @@ makepol(TSQueryParserState state, while (lenstack) { lenstack--; - pushOperator(state, opstack[lenstack]); + pushOperator(state, + opstack[lenstack].op, + opstack[lenstack].distance); } } static void -findoprnd_recurse(QueryItem *ptr, uint32 *pos, int nnodes) +findoprnd_recurse(QueryItem *ptr, uint32 *pos, int nnodes, bool *needcleanup) { /* since this function recurses, it could be driven to stack overflow. */ check_stack_depth(); @@ -407,10 +523,13 @@ findoprnd_recurse(QueryItem *ptr, uint32 *pos, int nnodes) if (*pos >= nnodes) elog(ERROR, "malformed tsquery: operand not found"); - if (ptr[*pos].type == QI_VAL || - ptr[*pos].type == QI_VALSTOP) /* need to handle VALSTOP here, they - * haven't been cleaned away yet. */ + if (ptr[*pos].type == QI_VAL) + { + (*pos)++; + } + else if (ptr[*pos].type == QI_VALSTOP) { + *needcleanup = true; /* we'll have to remove stop words */ (*pos)++; } else @@ -419,21 +538,32 @@ findoprnd_recurse(QueryItem *ptr, uint32 *pos, int nnodes) if (ptr[*pos].qoperator.oper == OP_NOT) { - ptr[*pos].qoperator.left = 1; + ptr[*pos].qoperator.left = 1; /* fixed offset */ (*pos)++; - findoprnd_recurse(ptr, pos, nnodes); + + /* process the only argument */ + findoprnd_recurse(ptr, pos, nnodes, needcleanup); } else { - QueryOperator *curitem = &ptr[*pos].qoperator; - int tmp = *pos; + QueryOperator *curitem = &ptr[*pos].qoperator; + int tmp = *pos; /* save current position */ + + Assert(curitem->oper == OP_AND || + curitem->oper == OP_OR || + curitem->oper == OP_PHRASE); - Assert(curitem->oper == OP_AND || curitem->oper == OP_OR); + if (curitem->oper == OP_PHRASE) + *needcleanup = true; /* push OP_PHRASE down later */ (*pos)++; - findoprnd_recurse(ptr, pos, nnodes); - curitem->left = *pos - tmp; - findoprnd_recurse(ptr, pos, nnodes); + + /* process RIGHT argument */ + findoprnd_recurse(ptr, pos, nnodes, 
needcleanup); + curitem->left = *pos - tmp; /* set LEFT arg's offset */ + + /* process LEFT argument */ + findoprnd_recurse(ptr, pos, nnodes, needcleanup); } } } @@ -444,12 +574,13 @@ findoprnd_recurse(QueryItem *ptr, uint32 *pos, int nnodes) * QueryItems must be in polish (prefix) notation. */ static void -findoprnd(QueryItem *ptr, int size) +findoprnd(QueryItem *ptr, int size, bool *needcleanup) { uint32 pos; + *needcleanup = false; pos = 0; - findoprnd_recurse(ptr, &pos, size); + findoprnd_recurse(ptr, &pos, size, needcleanup); if (pos != size) elog(ERROR, "malformed tsquery: extra nodes"); @@ -466,9 +597,6 @@ findoprnd(QueryItem *ptr, int size) * * opaque is passed on to pushval as is, pushval can use it to store its * private state. - * - * The returned query might contain QI_STOPVAL nodes. The caller is responsible - * for cleaning them up (with clean_fakeval) */ TSQuery parse_tsquery(char *buf, @@ -482,6 +610,7 @@ parse_tsquery(char *buf, int commonlen; QueryItem *ptr; ListCell *cell; + bool needcleanup; /* init state */ state.buffer = buf; @@ -531,7 +660,7 @@ parse_tsquery(char *buf, i = 0; foreach(cell, state.polstr) { - QueryItem *item = (QueryItem *) lfirst(cell); + QueryItem *item = (QueryItem *) lfirst(cell); switch (item->type) { @@ -555,7 +684,14 @@ parse_tsquery(char *buf, pfree(state.op); /* Set left operand pointers for every operator. */ - findoprnd(ptr, query->size); + findoprnd(ptr, query->size, &needcleanup); + + /* + * QI_VALSTOP nodes should be cleaned + * and OP_PHRASE should be pushed down + */ + if (needcleanup) + return cleanup_fakeval_and_phrase(query); return query; } @@ -600,12 +736,15 @@ while( ( (inf)->cur - (inf)->buf ) + (addsize) + 1 >= (inf)->buflen ) \ (inf)->cur = (inf)->buf + len; \ } +#define PRINT_PRIORITY(x) \ + ( (QO_PRIORITY(x) == OP_NOT) ?
OP_NOT_PHRASE : QO_PRIORITY(x) ) + /* - * recursive walk on tree and print it in - * infix (human-readable) view + * recursively traverse the tree and + * print it in infix (human-readable) form */ static void -infix(INFIX *in, bool first) +infix(INFIX *in, int parentPriority) { /* since this function recurses, it could be driven to stack overflow. */ check_stack_depth(); @@ -674,24 +813,22 @@ infix(INFIX *in, bool first) } else if (in->curpol->qoperator.oper == OP_NOT) { - bool isopr = false; + int priority = PRINT_PRIORITY(in->curpol); - RESIZEBUF(in, 1); - *(in->cur) = '!'; - in->cur++; - *(in->cur) = '\0'; - in->curpol++; - - if (in->curpol->type == QI_OPR) + if (priority < parentPriority) { - isopr = true; RESIZEBUF(in, 2); sprintf(in->cur, "( "); in->cur = strchr(in->cur, '\0'); } + RESIZEBUF(in, 1); + *(in->cur) = '!'; + in->cur++; + *(in->cur) = '\0'; + in->curpol++; - infix(in, isopr); - if (isopr) + infix(in, priority); + if (priority < parentPriority) { RESIZEBUF(in, 2); sprintf(in->cur, " )"); @@ -701,11 +838,18 @@ infix(INFIX *in, bool first) else { int8 op = in->curpol->qoperator.oper; + int priority = PRINT_PRIORITY(in->curpol); + int16 distance = in->curpol->qoperator.distance; INFIX nrm; + bool needParenthesis = false; in->curpol++; - if (op == OP_OR && !first) + if (priority < parentPriority || + (op == OP_PHRASE && + (priority == parentPriority || /* phrases are not commutative! 
*/ + parentPriority == OP_PRIORITY(OP_AND)))) { + needParenthesis = true; RESIZEBUF(in, 2); sprintf(in->cur, "( "); in->cur = strchr(in->cur, '\0'); @@ -717,14 +861,14 @@ infix(INFIX *in, bool first) nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen); /* get right operand */ - infix(&nrm, false); + infix(&nrm, priority); /* get & print left operand */ in->curpol = nrm.curpol; - infix(in, false); + infix(in, priority); /* print operator & right operand */ - RESIZEBUF(in, 3 + (nrm.cur - nrm.buf)); + RESIZEBUF(in, 3 + (2 + 10 /* distance */) + (nrm.cur - nrm.buf)); switch (op) { case OP_OR: @@ -733,6 +877,12 @@ infix(INFIX *in, bool first) case OP_AND: sprintf(in->cur, " & %s", nrm.buf); break; + case OP_PHRASE: + if (distance != 1) + sprintf(in->cur, " <%d> %s", distance, nrm.buf); + else + sprintf(in->cur, " <-> %s", nrm.buf); + break; default: /* OP_NOT is handled in above if-branch */ elog(ERROR, "unrecognized operator type: %d", op); @@ -740,7 +890,7 @@ infix(INFIX *in, bool first) in->cur = strchr(in->cur, '\0'); pfree(nrm.buf); - if (op == OP_OR && !first) + if (needParenthesis) { RESIZEBUF(in, 2); sprintf(in->cur, " )"); @@ -749,7 +899,6 @@ infix(INFIX *in, bool first) } } - Datum tsqueryout(PG_FUNCTION_ARGS) { @@ -768,7 +917,7 @@ tsqueryout(PG_FUNCTION_ARGS) nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen); *(nrm.cur) = '\0'; nrm.op = GETOPERAND(query); - infix(&nrm, true); + infix(&nrm, -1 /* lowest priority */); PG_FREE_IF_COPY(query, 0); PG_RETURN_CSTRING(nrm.buf); @@ -789,7 +938,8 @@ tsqueryout(PG_FUNCTION_ARGS) * * For each operator: * uint8 type, QI_OPR - * uint8 operator, one of OP_AND, OP_OR, OP_NOT. + * uint8 operator, one of OP_AND, OP_PHRASE, OP_OR, OP_NOT.
+ * uint16 distance (only for OP_PHRASE) */ Datum tsquerysend(PG_FUNCTION_ARGS) @@ -815,6 +965,9 @@ tsquerysend(PG_FUNCTION_ARGS) break; case QI_OPR: pq_sendint(&buf, item->qoperator.oper, sizeof(item->qoperator.oper)); + if (item->qoperator.oper == OP_PHRASE) + pq_sendint(&buf, item->qoperator.distance, + sizeof(item->qoperator.distance)); break; default: elog(ERROR, "unrecognized tsquery node type: %d", item->type); @@ -830,15 +983,16 @@ tsquerysend(PG_FUNCTION_ARGS) Datum tsqueryrecv(PG_FUNCTION_ARGS) { - StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); - TSQuery query; - int i, - len; - QueryItem *item; - int datalen; - char *ptr; - uint32 size; - const char **operands; + StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); + TSQuery query; + int i, + len; + QueryItem *item; + int datalen; + char *ptr; + uint32 size; + const char **operands; + bool needcleanup; size = pq_getmsgint(buf, sizeof(uint32)); if (size > (MaxAllocSize / sizeof(QueryItem))) @@ -907,13 +1061,15 @@ tsqueryrecv(PG_FUNCTION_ARGS) int8 oper; oper = (int8) pq_getmsgint(buf, sizeof(int8)); - if (oper != OP_NOT && oper != OP_OR && oper != OP_AND) + if (oper != OP_NOT && oper != OP_OR && oper != OP_AND && oper != OP_PHRASE) elog(ERROR, "invalid tsquery: unrecognized operator type %d", (int) oper); if (i == size - 1) elog(ERROR, "invalid pointer to right operand"); item->qoperator.oper = oper; + if (oper == OP_PHRASE) + item->qoperator.distance = (int16) pq_getmsgint(buf, sizeof(int16)); } else elog(ERROR, "unrecognized tsquery node type: %d", item->type); @@ -930,7 +1086,7 @@ tsqueryrecv(PG_FUNCTION_ARGS) * Fill in the left-pointers. Checks that the tree is well-formed as a * side-effect. 
*/ - findoprnd(item, size); + findoprnd(item, size, &needcleanup); /* Copy operands to output struct */ for (i = 0; i < size; i++) @@ -949,7 +1105,10 @@ tsqueryrecv(PG_FUNCTION_ARGS) SET_VARSIZE(query, len + datalen); - PG_RETURN_TSVECTOR(query); + if (needcleanup) + PG_RETURN_TSQUERY(cleanup_fakeval_and_phrase(query)); + + PG_RETURN_TSQUERY(query); } /* diff --git a/src/backend/utils/adt/tsquery_cleanup.c b/src/backend/utils/adt/tsquery_cleanup.c index 333789be3c..126795504a 100644 --- a/src/backend/utils/adt/tsquery_cleanup.c +++ b/src/backend/utils/adt/tsquery_cleanup.c @@ -25,6 +25,12 @@ typedef struct NODE QueryItem *valnode; } NODE; +/* Non-operator nodes have fake (but highest) priority */ +#define NODE_PRIORITY(x) \ + ( ((x)->valnode->qoperator.type == QI_OPR) ? \ + QO_PRIORITY((x)->valnode) : \ + TOP_PRIORITY ) + /* * make query tree from plain view of query */ @@ -160,7 +166,8 @@ clean_NOT_intree(NODE *node) { NODE *res = node; - Assert(node->valnode->qoperator.oper == OP_AND); + Assert(node->valnode->qoperator.oper == OP_AND || + node->valnode->qoperator.oper == OP_PHRASE); node->left = clean_NOT_intree(node->left); node->right = clean_NOT_intree(node->right); @@ -212,18 +219,20 @@ clean_NOT(QueryItem *ptr, int *len) #define V_STOP 3 /* the expression is a stop word */ /* - * Clean query tree from values which is always in - * text (stopword) + * Remove QI_VALSTOP (stopword nodes) from query tree. */ static NODE * -clean_fakeval_intree(NODE *node, char *result) +clean_fakeval_intree(NODE *node, char *result, int *adddistance) { - char lresult = V_UNKNOWN, - rresult = V_UNKNOWN; + char lresult = V_UNKNOWN, + rresult = V_UNKNOWN; /* since this function recurses, it could be driven to stack overflow. 
*/ check_stack_depth(); + if (adddistance) + *adddistance = 0; + if (node->valnode->type == QI_VAL) return node; else if (node->valnode->type == QI_VALSTOP) @@ -237,7 +246,7 @@ clean_fakeval_intree(NODE *node, char *result) if (node->valnode->qoperator.oper == OP_NOT) { - node->right = clean_fakeval_intree(node->right, &rresult); + node->right = clean_fakeval_intree(node->right, &rresult, NULL); if (!node->right) { *result = V_STOP; @@ -247,13 +256,30 @@ clean_fakeval_intree(NODE *node, char *result) } else { - NODE *res = node; + NODE *res = node; + int ndistance, ldistance = 0, rdistance = 0; + + ndistance = (node->valnode->qoperator.oper == OP_PHRASE) ? + node->valnode->qoperator.distance : + 0; - node->left = clean_fakeval_intree(node->left, &lresult); - node->right = clean_fakeval_intree(node->right, &rresult); + node->left = clean_fakeval_intree(node->left, + &lresult, + ndistance ? &ldistance : NULL); + + node->right = clean_fakeval_intree(node->right, + &rresult, + ndistance ? &rdistance : NULL); + + /* + * ndistance, ldistance and rdistance are greater than zero + * if their corresponding nodes are OP_PHRASE + */ if (lresult == V_STOP && rresult == V_STOP) { + if (adddistance && ndistance) + *adddistance = ldistance + ndistance + rdistance; freetree(node); *result = V_STOP; return NULL; @@ -261,33 +287,333 @@ clean_fakeval_intree(NODE *node, char *result) else if (lresult == V_STOP) { res = node->right; + /* + * propagate distance from current node to the + * right upper subtree. + */ + if (adddistance && ndistance) + *adddistance = rdistance; pfree(node); } else if (rresult == V_STOP) { res = node->left; + /* + * propagate distance from current node to the upper tree. 
+ */ + if (adddistance && ndistance) + *adddistance = ndistance + ldistance; pfree(node); } + else if (ndistance) + { + node->valnode->qoperator.distance += ldistance; + if (adddistance) + *adddistance = 0; + } + else if (adddistance) + { + *adddistance = 0; + } + return res; } return node; } -QueryItem * -clean_fakeval(QueryItem *ptr, int *len) +static NODE * +copyNODE(NODE *node) { - NODE *root = maketree(ptr); + NODE *cnode = palloc(sizeof(NODE)); + + /* since this function recurses, it could be driven to stack overflow. */ + check_stack_depth(); + + cnode->valnode = palloc(sizeof(QueryItem)); + *(cnode->valnode) = *(node->valnode); + + if (node->valnode->type == QI_OPR) + { + cnode->right = copyNODE(node->right); + if (node->valnode->qoperator.oper != OP_NOT) + cnode->left = copyNODE(node->left); + } + + return cnode; +} + +static NODE * +makeNODE(int8 op, NODE *left, NODE *right) +{ + NODE *node = palloc(sizeof(NODE)); + + node->valnode = palloc(sizeof(QueryItem)); + + node->valnode->qoperator.type = QI_OPR; + node->valnode->qoperator.oper = op; + + node->left = left; + node->right = right; + + return node; +} + +/* + * Move operation with high priority to the leaves. This guarantees + * that the phrase operator will be near the bottom of the tree. + * An idea behind is do not store position of lexemes during execution + * of ordinary operations (AND, OR, NOT) because it could be expensive. + * Actual transformation will be performed only on subtrees under the + * <-> () operation since it's needed solely for the phrase operator. 
+ * + * Rules: + * a <-> (b | c) => (a <-> b) | (a <-> c) + * (a | b) <-> c => (a <-> c) | (b <-> c) + * a <-> !b => a & !(a <-> b) + * !a <-> b => b & !(a <-> b) + * + * Warnings for readers: + * a <-> b != b <-> a + * + * a (b c) != (a b) c since the phrase lengths are: + * n 2n-1 + */ +static NODE * +normalize_phrase_tree(NODE *node) +{ + /* there should be no stop words at this point */ + Assert(node->valnode->type != QI_VALSTOP); + + if (node->valnode->type == QI_VAL) + return node; + + /* since this function recurses, it could be driven to stack overflow. */ + check_stack_depth(); + + Assert(node->valnode->type == QI_OPR); + + if (node->valnode->qoperator.oper == OP_NOT) + { + /* eliminate NOT sequence */ + while (node->valnode->type == QI_OPR && + node->valnode->qoperator.oper == node->right->valnode->qoperator.oper) + { + node = node->right->right; + } + + node->right = normalize_phrase_tree(node->right); + } + else if (node->valnode->qoperator.oper == OP_PHRASE) + { + int16 distance; + NODE *X; + + node->left = normalize_phrase_tree(node->left); + node->right = normalize_phrase_tree(node->right); + + if (NODE_PRIORITY(node) <= NODE_PRIORITY(node->right) && + NODE_PRIORITY(node) <= NODE_PRIORITY(node->left)) + return node; + + /* + * We can't swap left-right and works only with left child + * because of a <-> b != b <-> a + */ + + distance = node->valnode->qoperator.distance; + + if (node->right->valnode->type == QI_OPR) + { + switch (node->right->valnode->qoperator.oper) + { + case OP_AND: + /* a <-> (b & c) => (a <-> b) & (a <-> c) */ + node = makeNODE(OP_AND, + makeNODE(OP_PHRASE, + node->left, + node->right->left), + makeNODE(OP_PHRASE, + copyNODE(node->left), + node->right->right)); + node->left->valnode->qoperator.distance = + node->right->valnode->qoperator.distance = distance; + break; + case OP_OR: + /* a <-> (b | c) => (a <-> b) | (a <-> c) */ + node = makeNODE(OP_OR, + makeNODE(OP_PHRASE, + node->left, + node->right->left), + makeNODE(OP_PHRASE, 
+ copyNODE(node->left), + node->right->right)); + node->left->valnode->qoperator.distance = + node->right->valnode->qoperator.distance = distance; + break; + case OP_NOT: + /* a <-> !b => a & !(a <-> b) */ + X = node->right; + node->right = node->right->right; + X->right = node; + node = makeNODE(OP_AND, + copyNODE(node->left), + X); + break; + case OP_PHRASE: + /* no-op */ + break; + default: + elog(ERROR,"Wrong type of tsquery node: %d", + node->right->valnode->qoperator.oper); + } + } + + if (node->left->valnode->type == QI_OPR && + node->valnode->qoperator.oper == OP_PHRASE) + { + /* + * if the node is still OP_PHRASE, check the left subtree, + * otherwise the whole node will be transformed later. + */ + switch(node->left->valnode->qoperator.oper) + { + case OP_AND: + /* (a & b) <-> c => (a <-> c) & (b <-> c) */ + node = makeNODE(OP_AND, + makeNODE(OP_PHRASE, + node->left->left, + node->right), + makeNODE(OP_PHRASE, + node->left->right, + copyNODE(node->right))); + node->left->valnode->qoperator.distance = + node->right->valnode->qoperator.distance = distance; + break; + case OP_OR: + /* (a | b) <-> c => (a <-> c) | (b <-> c) */ + node = makeNODE(OP_OR, + makeNODE(OP_PHRASE, + node->left->left, + node->right), + makeNODE(OP_PHRASE, + node->left->right, + copyNODE(node->right))); + node->left->valnode->qoperator.distance = + node->right->valnode->qoperator.distance = distance; + break; + case OP_NOT: + /* !a <-> b => b & !(a <-> b) */ + X = node->left; + node->left = node->left->right; + X->right = node; + node = makeNODE(OP_AND, + X, + copyNODE(node->right)); + break; + case OP_PHRASE: + /* no-op */ + break; + default: + elog(ERROR,"Wrong type of tsquery node: %d", + node->left->valnode->qoperator.oper); + } + } + + /* continue transformation */ + node = normalize_phrase_tree(node); + } + else /* AND or OR */ + { + node->left = normalize_phrase_tree(node->left); + node->right = normalize_phrase_tree(node->right); + } + + return node; +} + +/* + * Number of 
elements in query tree + */ +static int32 +calcstrlen(NODE *node) +{ + int32 size = 0; + + if (node->valnode->type == QI_VAL) + { + size = node->valnode->qoperand.length + 1; + } + else + { + Assert(node->valnode->type == QI_OPR); + + size = calcstrlen(node->right); + if (node->valnode->qoperator.oper != OP_NOT) + size += calcstrlen(node->left); + } + + return size; +} + +TSQuery +cleanup_fakeval_and_phrase(TSQuery in) +{ + int32 len, + lenstr, + commonlen, + i; + NODE *root; char result = V_UNKNOWN; - NODE *resroot; + TSQuery out; + QueryItem *items; + char *operands; - resroot = clean_fakeval_intree(root, &result); + if (in->size == 0) + return in; + + /* eliminate stop words */ + root = clean_fakeval_intree(maketree(GETQUERY(in)), &result, NULL); if (result != V_UNKNOWN) { ereport(NOTICE, (errmsg("text-search query contains only stop words or doesn't contain lexemes, ignored"))); - *len = 0; - return NULL; + out = palloc(HDRSIZETQ); + out->size = 0; + SET_VARSIZE(out, HDRSIZETQ); + return out; + } + + /* push OP_PHRASE nodes down */ + root = normalize_phrase_tree(root); + + /* + * Build TSQuery from plain view + */ + + lenstr = calcstrlen(root); + items = plaintree(root, &len); + commonlen = COMPUTESIZE(len, lenstr); + + out = palloc(commonlen); + SET_VARSIZE(out, commonlen); + out->size = len; + + memcpy(GETQUERY(out), items, len * sizeof(QueryItem)); + + items = GETQUERY(out); + operands = GETOPERAND(out); + for (i = 0; i < out->size; i++) + { + QueryOperand *op = (QueryOperand *) &items[i]; + + if (op->type != QI_VAL) + continue; + + memcpy(operands, GETOPERAND(in) + op->distance, op->length); + operands[op->length] = '\0'; + op->distance = operands - GETOPERAND(out); + operands += op->length + 1; } - return plaintree(resroot, len); + return out; } diff --git a/src/backend/utils/adt/tsquery_op.c b/src/backend/utils/adt/tsquery_op.c index 9cdf1fe10b..30d3faf3e2 100644 --- a/src/backend/utils/adt/tsquery_op.c +++ b/src/backend/utils/adt/tsquery_op.c @@ -27,7 
+27,7 @@ tsquery_numnode(PG_FUNCTION_ARGS) } static QTNode * -join_tsqueries(TSQuery a, TSQuery b, int8 operator) +join_tsqueries(TSQuery a, TSQuery b, int8 operator, uint16 distance) { QTNode *res = (QTNode *) palloc0(sizeof(QTNode)); @@ -36,6 +36,8 @@ join_tsqueries(TSQuery a, TSQuery b, int8 operator) res->valnode = (QueryItem *) palloc0(sizeof(QueryItem)); res->valnode->type = QI_OPR; res->valnode->qoperator.oper = operator; + if (operator == OP_PHRASE) + res->valnode->qoperator.distance = distance; res->child = (QTNode **) palloc0(sizeof(QTNode *) * 2); res->child[0] = QT2QTN(GETQUERY(b), GETOPERAND(b)); @@ -64,7 +66,7 @@ tsquery_and(PG_FUNCTION_ARGS) PG_RETURN_POINTER(a); } - res = join_tsqueries(a, b, OP_AND); + res = join_tsqueries(a, b, OP_AND, 0); query = QTN2QT(res); @@ -94,7 +96,7 @@ tsquery_or(PG_FUNCTION_ARGS) PG_RETURN_POINTER(a); } - res = join_tsqueries(a, b, OP_OR); + res = join_tsqueries(a, b, OP_OR, 0); query = QTN2QT(res); @@ -105,6 +107,52 @@ tsquery_or(PG_FUNCTION_ARGS) PG_RETURN_POINTER(query); } +Datum +tsquery_phrase_distance(PG_FUNCTION_ARGS) +{ + TSQuery a = PG_GETARG_TSQUERY_COPY(0); + TSQuery b = PG_GETARG_TSQUERY_COPY(1); + QTNode *res; + TSQuery query; + int32 distance = PG_GETARG_INT32(2); + + if (distance < 0 || distance > MAXENTRYPOS) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("distance in phrase operator should be non-negative and less than %d", + MAXENTRYPOS))); + if (a->size == 0) + { + PG_FREE_IF_COPY(a, 1); + PG_RETURN_POINTER(b); + } + else if (b->size == 0) + { + PG_FREE_IF_COPY(b, 1); + PG_RETURN_POINTER(a); + } + + res = join_tsqueries(a, b, OP_PHRASE, (uint16) distance); + + query = QTN2QT(res); + + QTNFree(res); + PG_FREE_IF_COPY(a, 0); + PG_FREE_IF_COPY(b, 1); + + PG_RETURN_POINTER(cleanup_fakeval_and_phrase(query)); +} + +Datum +tsquery_phrase(PG_FUNCTION_ARGS) +{ + PG_RETURN_POINTER(DirectFunctionCall3( + tsquery_phrase_distance, + PG_GETARG_DATUM(0), + PG_GETARG_DATUM(1), + 
Int32GetDatum(1))); +} + Datum tsquery_not(PG_FUNCTION_ARGS) { diff --git a/src/backend/utils/adt/tsquery_util.c b/src/backend/utils/adt/tsquery_util.c index fe26ad52dd..0f338aa653 100644 --- a/src/backend/utils/adt/tsquery_util.c +++ b/src/backend/utils/adt/tsquery_util.c @@ -110,6 +110,10 @@ QTNodeCompare(QTNode *an, QTNode *bn) if ((res = QTNodeCompare(an->child[i], bn->child[i])) != 0) return res; } + + if (ao->oper == OP_PHRASE && ao->distance != bo->distance) + return (ao->distance > bo->distance) ? -1 : 1; + return 0; } else if (an->valnode->type == QI_VAL) @@ -150,7 +154,7 @@ QTNSort(QTNode *in) for (i = 0; i < in->nchild; i++) QTNSort(in->child[i]); - if (in->nchild > 1) + if (in->nchild > 1 && in->valnode->qoperator.oper != OP_PHRASE) qsort((void *) in->child, in->nchild, sizeof(QTNode *), cmpQTN); } @@ -190,7 +194,10 @@ QTNTernary(QTNode *in) { QTNode *cc = in->child[i]; - if (cc->valnode->type == QI_OPR && in->valnode->qoperator.oper == cc->valnode->qoperator.oper) + /* OP_Phrase isn't associative */ + if (cc->valnode->type == QI_OPR && + in->valnode->qoperator.oper == cc->valnode->qoperator.oper && + in->valnode->qoperator.oper != OP_PHRASE) { int oldnchild = in->nchild; diff --git a/src/backend/utils/adt/tsrank.c b/src/backend/utils/adt/tsrank.c index 53f678a3bf..ab47b763ee 100644 --- a/src/backend/utils/adt/tsrank.c +++ b/src/backend/utils/adt/tsrank.c @@ -364,8 +364,10 @@ calc_rank(const float *w, TSVector t, TSQuery q, int32 method) return 0.0; /* XXX: What about NOT? */ - res = (item->type == QI_OPR && item->qoperator.oper == OP_AND) ? - calc_rank_and(w, t, q) : calc_rank_or(w, t, q); + res = (item->type == QI_OPR && (item->qoperator.oper == OP_AND || + item->qoperator.oper == OP_PHRASE)) ? 
+ calc_rank_and(w, t, q) : + calc_rank_or(w, t, q); if (res < 0) res = 1e-20f; @@ -496,10 +498,17 @@ ts_rank_tt(PG_FUNCTION_ARGS) typedef struct { - QueryItem **item; - int16 nitem; - uint8 wclass; - int32 pos; + union { + struct { /* compiled doc representation */ + QueryItem **items; + int16 nitem; + } query; + struct { /* struct is used for preparing doc representation */ + QueryItem *item; + WordEntry *entry; + } map; + } data; + WordEntryPos pos; } DocRepresentation; static int @@ -508,26 +517,59 @@ compareDocR(const void *va, const void *vb) const DocRepresentation *a = (const DocRepresentation *) va; const DocRepresentation *b = (const DocRepresentation *) vb; - if (a->pos == b->pos) - return 0; - return (a->pos > b->pos) ? 1 : -1; + if (WEP_GETPOS(a->pos) == WEP_GETPOS(b->pos)) + { + if (WEP_GETWEIGHT(a->pos) == WEP_GETWEIGHT(b->pos)) + { + if (a->data.map.entry == b->data.map.entry) + return 0; + + return (a->data.map.entry > b->data.map.entry) ? 1 : -1; + } + + return (WEP_GETWEIGHT(a->pos) > WEP_GETWEIGHT(b->pos)) ? 1 : -1; + } + + return (WEP_GETPOS(a->pos) > WEP_GETPOS(b->pos)) ? 
1 : -1; } +#define MAXQROPOS MAXENTRYPOS +typedef struct +{ + bool operandexists; + bool reverseinsert; /* indicates insert order, + true means descending order */ + uint32 npos; + WordEntryPos pos[MAXQROPOS]; +} QueryRepresentationOperand; + typedef struct { - TSQuery query; - bool *operandexist; + TSQuery query; + QueryRepresentationOperand *operandData; } QueryRepresentation; -#define QR_GET_OPERAND_EXISTS(q, v) ( (q)->operandexist[ ((QueryItem*)(v)) - GETQUERY((q)->query) ] ) -#define QR_SET_OPERAND_EXISTS(q, v) QR_GET_OPERAND_EXISTS(q,v) = true +#define QR_GET_OPERAND_DATA(q, v) \ + ( (q)->operandData + (((QueryItem*)(v)) - GETQUERY((q)->query)) ) static bool -checkcondition_QueryOperand(void *checkval, QueryOperand *val) +checkcondition_QueryOperand(void *checkval, QueryOperand *val, ExecPhraseData *data) { - QueryRepresentation *qr = (QueryRepresentation *) checkval; + QueryRepresentation *qr = (QueryRepresentation *) checkval; + QueryRepresentationOperand *opData = QR_GET_OPERAND_DATA(qr, val); - return QR_GET_OPERAND_EXISTS(qr, val); + if (!opData->operandexists) + return false; + + if (data) + { + data->npos = opData->npos; + data->pos = opData->pos; + if (opData->reverseinsert) + data->pos += MAXQROPOS - opData->npos; + } + + return true; } typedef struct @@ -539,14 +581,65 @@ typedef struct DocRepresentation *end; } CoverExt; +static void +resetQueryRepresentation(QueryRepresentation *qr, bool reverseinsert) +{ + int i; + + for(i = 0; i < qr->query->size; i++) + { + qr->operandData[i].operandexists = false; + qr->operandData[i].reverseinsert = reverseinsert; + qr->operandData[i].npos = 0; + } +} + +static void +fillQueryRepresentationData(QueryRepresentation *qr, DocRepresentation *entry) +{ + int i; + int lastPos; + QueryRepresentationOperand *opData; + + for (i = 0; i < entry->data.query.nitem; i++) + { + if (entry->data.query.items[i]->type != QI_VAL) + continue; + + opData = QR_GET_OPERAND_DATA(qr, entry->data.query.items[i]); + + 
opData->operandexists = true; + + if (opData->npos == 0) + { + lastPos = (opData->reverseinsert) ? (MAXQROPOS - 1) : 0; + opData->pos[lastPos] = entry->pos; + opData->npos++; + continue; + } + + lastPos = opData->reverseinsert ? + (MAXQROPOS - opData->npos) : + (opData->npos - 1); + + if (WEP_GETPOS(opData->pos[lastPos]) != WEP_GETPOS(entry->pos)) + { + lastPos = opData->reverseinsert ? + (MAXQROPOS - 1 - opData->npos) : + (opData->npos); + + opData->pos[lastPos] = entry->pos; + opData->npos++; + } + } +} static bool Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, CoverExt *ext) { - DocRepresentation *ptr; - int lastpos = ext->pos; - int i; - bool found = false; + DocRepresentation *ptr; + int lastpos = ext->pos; + bool found = false; /* * since this function recurses, it could be driven to stack overflow. @@ -554,7 +647,7 @@ Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, CoverExt *ext) */ check_stack_depth(); - memset(qr->operandexist, 0, sizeof(bool) * qr->query->size); + resetQueryRepresentation(qr, false); ext->p = INT_MAX; ext->q = 0; @@ -563,16 +656,13 @@ Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, CoverExt *ext) /* find upper bound of cover from current position, move up */ while (ptr - doc < len) { - for (i = 0; i < ptr->nitem; i++) - { - if (ptr->item[i]->type == QI_VAL) - QR_SET_OPERAND_EXISTS(qr, ptr->item[i]); - } + fillQueryRepresentationData(qr, ptr); + if (TS_execute(GETQUERY(qr->query), (void *) qr, false, checkcondition_QueryOperand)) { - if (ptr->pos > ext->q) + if (WEP_GETPOS(ptr->pos) > ext->q) { - ext->q = ptr->pos; + ext->q = WEP_GETPOS(ptr->pos); ext->end = ptr; lastpos = ptr - doc; found = true; @@ -585,22 +675,24 @@ Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, CoverExt *ext) if (!found) return false; - memset(qr->operandexist, 0, sizeof(bool) * qr->query->size); + resetQueryRepresentation(qr, true); ptr = doc + lastpos; /* find lower bound of cover from found upper 
bound, move down */ while (ptr >= doc + ext->pos) { - for (i = 0; i < ptr->nitem; i++) - if (ptr->item[i]->type == QI_VAL) - QR_SET_OPERAND_EXISTS(qr, ptr->item[i]); + /* + * we scan doc from right to left, so pos info in reverse order! + */ + fillQueryRepresentationData(qr, ptr); + if (TS_execute(GETQUERY(qr->query), (void *) qr, true, checkcondition_QueryOperand)) { - if (ptr->pos < ext->p) + if (WEP_GETPOS(ptr->pos) < ext->p) { ext->begin = ptr; - ext->p = ptr->pos; + ext->p = WEP_GETPOS(ptr->pos); } break; } @@ -628,18 +720,20 @@ get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen) WordEntry *entry, *firstentry; WordEntryPos *post; - int32 dimt, + int32 dimt, /* number of 'post' items */ j, i, nitem; int len = qr->query->size * 4, cur = 0; DocRepresentation *doc; - char *operand; doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len); - operand = GETOPERAND(qr->query); + /* + * Iterate through query to make DocRepresentaion for words and it's entries + * satisfied by query + */ for (i = 0; i < qr->query->size; i++) { QueryOperand *curoperand; @@ -649,13 +743,11 @@ get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen) curoperand = &item[i].qoperand; - if (QR_GET_OPERAND_EXISTS(qr, &item[i])) - continue; - firstentry = entry = find_wordentry(txt, qr->query, curoperand, &nitem); if (!entry) continue; + /* iterations over entries in tsvector */ while (entry - firstentry < nitem) { if (entry->haspos) @@ -676,53 +768,67 @@ get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen) doc = (DocRepresentation *) repalloc(doc, sizeof(DocRepresentation) * len); } + /* iterations over entry's positions */ for (j = 0; j < dimt; j++) { - if (j == 0) - { - int k; - - doc[cur].nitem = 0; - doc[cur].item = (QueryItem **) palloc(sizeof(QueryItem *) * qr->query->size); - - for (k = 0; k < qr->query->size; k++) - { - QueryOperand *kptr = &item[k].qoperand; - QueryOperand *iptr = &item[i].qoperand; - - if (k == i || - (item[k].type == QI_VAL && - 
compareQueryOperand(&kptr, &iptr, operand) == 0)) - { - /* - * if k == i, we've already checked above that - * it's type == Q_VAL - */ - doc[cur].item[doc[cur].nitem] = item + k; - doc[cur].nitem++; - QR_SET_OPERAND_EXISTS(qr, item + k); - } - } - } - else + if (curoperand->weight == 0 || + curoperand->weight & (1 << WEP_GETWEIGHT(post[j]))) { - doc[cur].nitem = doc[cur - 1].nitem; - doc[cur].item = doc[cur - 1].item; + doc[cur].pos = post[j]; + doc[cur].data.map.entry = entry; + doc[cur].data.map.item = (QueryItem *) curoperand; + cur++; } - doc[cur].pos = WEP_GETPOS(post[j]); - doc[cur].wclass = WEP_GETWEIGHT(post[j]); - cur++; } entry++; } } - *doclen = cur; - if (cur > 0) { + DocRepresentation *rptr = doc + 1, + *wptr = doc, + storage; + + /* + * Sort representation in ascending order by pos and entry + */ qsort((void *) doc, cur, sizeof(DocRepresentation), compareDocR); + + /* + * Join QueryItem per WordEntry and it's position + */ + storage.pos = doc->pos; + storage.data.query.items = palloc(sizeof(QueryItem *) * qr->query->size); + storage.data.query.items[0] = doc->data.map.item; + storage.data.query.nitem = 1; + + while (rptr - doc < cur) + { + if (rptr->pos == (rptr-1)->pos && + rptr->data.map.entry == (rptr-1)->data.map.entry) + { + storage.data.query.items[storage.data.query.nitem] = rptr->data.map.item; + storage.data.query.nitem++; + } + else + { + *wptr = storage; + wptr++; + storage.pos = rptr->pos; + storage.data.query.items = palloc(sizeof(QueryItem *) * qr->query->size); + storage.data.query.items[0] = rptr->data.map.item; + storage.data.query.nitem = 1; + } + + rptr++; + } + + *wptr = storage; + wptr++; + + *doclen = wptr - doc; return doc; } @@ -758,12 +864,13 @@ calc_rank_cd(const float4 *arrdata, TSVector txt, TSQuery query, int method) } qr.query = query; - qr.operandexist = (bool *) palloc0(sizeof(bool) * query->size); + qr.operandData = (QueryRepresentationOperand *) + palloc0(sizeof(QueryRepresentationOperand) * query->size); doc = 
get_docrep(txt, &qr, &doclen); if (!doc) { - pfree(qr.operandexist); + pfree(qr.operandData); return 0.0; } @@ -777,7 +884,7 @@ calc_rank_cd(const float4 *arrdata, TSVector txt, TSQuery query, int method) while (ptr <= ext.end) { - InvSum += invws[ptr->wclass]; + InvSum += invws[WEP_GETWEIGHT(ptr->pos)]; ptr++; } @@ -827,7 +934,7 @@ calc_rank_cd(const float4 *arrdata, TSVector txt, TSQuery query, int method) pfree(doc); - pfree(qr.operandexist); + pfree(qr.operandData); return (float4) Wdoc; } diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c index 12043bf3f5..2a26c46551 100644 --- a/src/backend/utils/adt/tsvector.c +++ b/src/backend/utils/adt/tsvector.c @@ -28,7 +28,7 @@ typedef struct /* Compare two WordEntryPos values for qsort */ -static int +int comparePos(const void *a, const void *b) { int apos = WEP_GETPOS(*(const WordEntryPos *) a); diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c index f6d3fb5d7b..e363f2a023 100644 --- a/src/backend/utils/adt/tsvector_op.c +++ b/src/backend/utils/adt/tsvector_op.c @@ -1121,35 +1121,124 @@ tsCompareString(char *a, int lena, char *b, int lenb, bool prefix) } /* - * check weight info + * Check weight info or/and fill 'data' with the required positions */ static bool -checkclass_str(CHKVAL *chkval, WordEntry *val, QueryOperand *item) +checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val, + ExecPhraseData *data) { - WordEntryPosVector *posvec; - WordEntryPos *ptr; - uint16 len; + bool result = false; - posvec = (WordEntryPosVector *) - (chkval->values + SHORTALIGN(val->pos + val->len)); + if (entry->haspos && (val->weight || data)) + { + WordEntryPosVector *posvec; - len = posvec->npos; - ptr = posvec->pos; + /* + * We can't use the _POSVECPTR macro here because the pointer to the + * tsvector's lexeme storage is already contained in chkval->values. 
+ */ + posvec = (WordEntryPosVector *) + (chkval->values + SHORTALIGN(entry->pos + entry->len)); - while (len--) + if (val->weight && data) + { + WordEntryPos *posvec_iter = posvec->pos; + WordEntryPos *dptr; + + /* + * Filter position information by weights + */ + dptr = data->pos = palloc(sizeof(WordEntryPos) * posvec->npos); + data->allocated = true; + + /* Is there a position with a matching weight? */ + while (posvec_iter < posvec->pos + posvec->npos) + { + /* If true, append this position to the data->pos */ + if (val->weight & (1 << WEP_GETWEIGHT(*posvec_iter))) + { + *dptr = WEP_GETPOS(*posvec_iter); + dptr++; + } + + posvec_iter++; + } + + data->npos = dptr - data->pos; + + if (data->npos > 0) + result = true; + } + else if (val->weight) + { + WordEntryPos *posvec_iter = posvec->pos; + + /* Is there a position with a matching weight? */ + while (posvec_iter < posvec->pos + posvec->npos) + { + if (val->weight & (1 << WEP_GETWEIGHT(*posvec_iter))) + { + result = true; + break; /* no need to go further */ + } + + posvec_iter++; + } + } + else /* data != NULL */ + { + data->npos = posvec->npos; + data->pos = posvec->pos; + data->allocated = false; + result = true; + } + } + else { - if (item->weight & (1 << WEP_GETWEIGHT(*ptr))) - return true; - ptr++; + result = true; } - return false; + + return result; +} + +/* + * Removes duplicate pos entries. We can't use uniquePos() from + * tsvector.c because array might be longer than MAXENTRYPOS + * + * Returns new length. + */ +static int +uniqueLongPos(WordEntryPos *pos, int npos) +{ + WordEntryPos *pos_iter, + *result; + + if (npos <= 1) + return npos; + + qsort((void *) pos, npos, sizeof(WordEntryPos), comparePos); + + result = pos; + pos_iter = pos + 1; + while (pos_iter < pos + npos) + { + if (WEP_GETPOS(*pos_iter) != WEP_GETPOS(*result)) + { + result++; + *result = WEP_GETPOS(*pos_iter); + } + + pos_iter++; + } + + return result + 1 - pos; } /* * is there value 'val' in array or not ? 
*/ static bool -checkcondition_str(void *checkval, QueryOperand *val) +checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data) { CHKVAL *chkval = (CHKVAL *) checkval; WordEntry *StopLow = chkval->arrb; @@ -1162,14 +1251,16 @@ checkcondition_str(void *checkval, QueryOperand *val) while (StopLow < StopHigh) { StopMiddle = StopLow + (StopHigh - StopLow) / 2; - difference = tsCompareString(chkval->operand + val->distance, val->length, - chkval->values + StopMiddle->pos, StopMiddle->len, + difference = tsCompareString(chkval->operand + val->distance, + val->length, + chkval->values + StopMiddle->pos, + StopMiddle->len, false); if (difference == 0) { - res = (val->weight && StopMiddle->haspos) ? - checkclass_str(chkval, StopMiddle, val) : true; + /* Check weight info & fill 'data' with positions */ + res = checkclass_str(chkval, StopMiddle, val, data); break; } else if (difference > 0) @@ -1178,30 +1269,199 @@ checkcondition_str(void *checkval, QueryOperand *val) StopHigh = StopMiddle; } - if (!res && val->prefix) + if ((!res || data) && val->prefix) { + WordEntryPos *allpos = NULL; + int npos = 0, + totalpos = 0; /* * there was a failed exact search, so we should scan further to find - * a prefix match. + * a prefix match. We also need to do so if caller needs position info */ if (StopLow >= StopHigh) StopMiddle = StopHigh; - while (res == false && StopMiddle < chkval->arre && - tsCompareString(chkval->operand + val->distance, val->length, - chkval->values + StopMiddle->pos, StopMiddle->len, + while ((!res || data) && StopMiddle < chkval->arre && + tsCompareString(chkval->operand + val->distance, + val->length, + chkval->values + StopMiddle->pos, + StopMiddle->len, true) == 0) { - res = (val->weight && StopMiddle->haspos) ? 
- checkclass_str(chkval, StopMiddle, val) : true; + if (data) + { + /* + * We need to join position information + */ + res = checkclass_str(chkval, StopMiddle, val, data); + + if (res) + { + while (npos + data->npos >= totalpos) + { + if (totalpos == 0) + { + totalpos = 256; + allpos = palloc(sizeof(WordEntryPos) * totalpos); + } + else + { + totalpos *= 2; + allpos = repalloc(allpos, sizeof(WordEntryPos) * totalpos); + } + } + + memcpy(allpos + npos, data->pos, sizeof(WordEntryPos) * data->npos); + npos += data->npos; + } + } + else + { + res = checkclass_str(chkval, StopMiddle, val, NULL); + } StopMiddle++; } + + if (res && data) + { + /* Sort and make unique array of found positions */ + data->pos = allpos; + data->npos = uniqueLongPos(allpos, npos); + data->allocated = true; + } } return res; } +/* + * Check for phrase condition. Fallback to the AND operation + * if there is no positional information. + */ +static bool +TS_phrase_execute(QueryItem *curitem, + void *checkval, bool calcnot, ExecPhraseData *data, + bool (*chkcond) (void *, QueryOperand *, ExecPhraseData *)) +{ + /* since this function recurses, it could be driven to stack overflow */ + check_stack_depth(); + + if (curitem->type == QI_VAL) + { + return chkcond(checkval, (QueryOperand *) curitem, data); + } + else + { + ExecPhraseData Ldata = {0, false, NULL}, + Rdata = {0, false, NULL}; + WordEntryPos *Lpos, + *Rpos, + *pos_iter = NULL; + + Assert(curitem->qoperator.oper == OP_PHRASE); + + if (!TS_phrase_execute(curitem + curitem->qoperator.left, + checkval, calcnot, &Ldata, chkcond)) + return false; + + if (!TS_phrase_execute(curitem + 1, checkval, calcnot, &Rdata, chkcond)) + return false; + + /* + * if at least one of the operands has no position + * information, fallback to AND operation. + */ + if (Ldata.npos == 0 || Rdata.npos == 0) + return true; + + /* + * Result of the operation is a list of the + * corresponding positions of RIGHT operand. 
+ */ + if (data) + { + if (!Rdata.allocated) + /* + * OP_PHRASE is based on the OP_AND, so the number of resulting + * positions could not be greater than the total amount of operands. + */ + data->pos = palloc(sizeof(WordEntryPos) * Min(Ldata.npos, Rdata.npos)); + else + data->pos = Rdata.pos; + + data->allocated = true; + data->npos = 0; + pos_iter = data->pos; + } + + Lpos = Ldata.pos; + Rpos = Rdata.pos; + + /* + * Find matches by distance, WEP_GETPOS() is needed because + * ExecPhraseData->pos can point to the tsvector's WordEntryPosVector + */ + + while (Rpos < Rdata.pos + Rdata.npos) + { + while (Lpos < Ldata.pos + Ldata.npos) + { + if (WEP_GETPOS(*Lpos) <= WEP_GETPOS(*Rpos)) + { + /* + * Lpos is behind the Rpos, so we have to check the + * distance condition + */ + if (WEP_GETPOS(*Rpos) - WEP_GETPOS(*Lpos) <= curitem->qoperator.distance) + { + /* MATCH! */ + if (data) + { + *pos_iter = WEP_GETPOS(*Rpos); + pos_iter++; + + break; /* We need to build a unique result + * array, so go to the next Rpos */ + } + else + { + /* + * We are in the root of the phrase tree and hence + * we don't have to store the resulting positions + */ + return true; + } + } + } + else + { + /* + * Go to the next Rpos, because Lpos + * is ahead of the current Rpos + */ + break; + } + + Lpos++; + } + + Rpos++; + } + + if (data) + { + data->npos = pos_iter - data->pos; + + if (data->npos > 0) + return true; + } + } + + return false; +} + + /* * Evaluate tsquery boolean expression. * @@ -1210,16 +1470,19 @@ checkcondition_str(void *checkval, QueryOperand *val) * do anything with it. * if calcnot is false, NOT expressions are always evaluated to be true. This * is used in ranking. + * It assumes that ordinary operators are always closer to root than phrase + * operator, so TS_execute() need not take care of lexeme's position at all.
*/ bool TS_execute(QueryItem *curitem, void *checkval, bool calcnot, - bool (*chkcond) (void *checkval, QueryOperand *val)) + bool (*chkcond) (void *checkval, QueryOperand *val, ExecPhraseData *data)) { /* since this function recurses, it could be driven to stack overflow */ check_stack_depth(); if (curitem->type == QI_VAL) - return chkcond(checkval, (QueryOperand *) curitem); + return chkcond(checkval, (QueryOperand *) curitem, + NULL /* we don't need position info */); switch (curitem->qoperator.oper) { @@ -1241,6 +1504,9 @@ TS_execute(QueryItem *curitem, void *checkval, bool calcnot, else return TS_execute(curitem + 1, checkval, calcnot, chkcond); + case OP_PHRASE: + return TS_phrase_execute(curitem, checkval, calcnot, NULL, chkcond); + default: elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper); } @@ -1277,6 +1543,10 @@ tsquery_requires_match(QueryItem *curitem) */ return false; + case OP_PHRASE: + /* + * Treat OP_PHRASE as OP_AND here + */ case OP_AND: /* If either side requires a match, we're good */ if (tsquery_requires_match(curitem + curitem->qoperator.left)) diff --git a/src/backend/utils/adt/tsvector_parser.c b/src/backend/utils/adt/tsvector_parser.c index e2817082db..d99405824d 100644 --- a/src/backend/utils/adt/tsvector_parser.c +++ b/src/backend/utils/adt/tsvector_parser.c @@ -89,7 +89,15 @@ do { \ } \ } while (0) -#define ISOPERATOR(x) ( pg_mblen(x)==1 && ( *(x)=='!' || *(x)=='&' || *(x)=='|' || *(x)=='(' || *(x)==')' ) ) +/* phrase operator begins with '<' */ +#define ISOPERATOR(x) \ + ( pg_mblen(x) == 1 && ( *(x) == '!' 
|| \ + *(x) == '&' || \ + *(x) == '|' || \ + *(x) == '(' || \ + *(x) == ')' || \ + *(x) == '<' \ + ) ) /* Fills gettoken_tsvector's output parameters, and returns true */ #define RETURN_TOKEN \ diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 0edc6cbafe..6d254ba133 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 201604062 +#define CATALOG_VERSION_NO 201604071 #endif diff --git a/src/include/catalog/pg_operator.h b/src/include/catalog/pg_operator.h index b3daff28e3..a5e4a02ebc 100644 --- a/src/include/catalog/pg_operator.h +++ b/src/include/catalog/pg_operator.h @@ -1675,6 +1675,9 @@ DATA(insert OID = 3680 ( "&&" PGNSP PGUID b f f 3615 3615 3615 0 0 tsque DESCR("AND-concatenate"); DATA(insert OID = 3681 ( "||" PGNSP PGUID b f f 3615 3615 3615 0 0 tsquery_or - - )); DESCR("OR-concatenate"); +/* <-> operation calls tsquery_phrase, but function is polymorphic. So, point to OID of the tsquery_phrase */ +DATA(insert OID = 5005 ( "<->" PGNSP PGUID b f f 3615 3615 3615 0 0 5003 - - )); +DESCR("phrase-concatenate"); DATA(insert OID = 3682 ( "!!" 
PGNSP PGUID l f f 0 3615 3615 0 0 tsquery_not - - )); DESCR("NOT tsquery"); DATA(insert OID = 3693 ( "@>" PGNSP PGUID b f f 3615 3615 16 3694 0 tsq_mcontains contsel contjoinsel )); diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index d7dbc73928..c351594be4 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -4607,6 +4607,9 @@ DESCR("less-equal-greater"); DATA(insert OID = 3669 ( tsquery_and PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3615 "3615 3615" _null_ _null_ _null_ _null_ _null_ tsquery_and _null_ _null_ _null_ )); DATA(insert OID = 3670 ( tsquery_or PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3615 "3615 3615" _null_ _null_ _null_ _null_ _null_ tsquery_or _null_ _null_ _null_ )); +DATA(insert OID = 5003 ( tsquery_phrase PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3615 "3615 3615" _null_ _null_ _null_ _null_ _null_ tsquery_phrase _null_ _null_ _null_ )); +DATA(insert OID = 5004 ( tsquery_phrase PGNSP PGUID 12 1 0 0 0 f f f f t f i s 3 0 3615 "3615 3615 23" _null_ _null_ _null_ _null_ _null_ tsquery_phrase_distance _null_ _null_ _null_ )); +DESCR("phrase-concatenate with distance"); DATA(insert OID = 3671 ( tsquery_not PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 3615 "3615" _null_ _null_ _null_ _null_ _null_ tsquery_not _null_ _null_ _null_ )); DATA(insert OID = 3691 ( tsq_mcontains PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 16 "3615 3615" _null_ _null_ _null_ _null_ _null_ tsq_mcontains _null_ _null_ _null_ )); @@ -4726,12 +4729,16 @@ DATA(insert OID = 3746 ( to_tsquery PGNSP PGUID 12 100 0 0 0 f f f f t f i s 2 DESCR("make tsquery"); DATA(insert OID = 3747 ( plainto_tsquery PGNSP PGUID 12 100 0 0 0 f f f f t f i s 2 0 3615 "3734 25" _null_ _null_ _null_ _null_ _null_ plainto_tsquery_byid _null_ _null_ _null_ )); DESCR("transform to tsquery"); +DATA(insert OID = 5006 ( phraseto_tsquery PGNSP PGUID 12 100 0 0 0 f f f f t f i s 2 0 3615 "3734 25" _null_ _null_ _null_ _null_ _null_ phraseto_tsquery_byid 
_null_ _null_ _null_ )); +DESCR("transform to tsquery"); DATA(insert OID = 3749 ( to_tsvector PGNSP PGUID 12 100 0 0 0 f f f f t f s s 1 0 3614 "25" _null_ _null_ _null_ _null_ _null_ to_tsvector _null_ _null_ _null_ )); DESCR("transform to tsvector"); DATA(insert OID = 3750 ( to_tsquery PGNSP PGUID 12 100 0 0 0 f f f f t f s s 1 0 3615 "25" _null_ _null_ _null_ _null_ _null_ to_tsquery _null_ _null_ _null_ )); DESCR("make tsquery"); DATA(insert OID = 3751 ( plainto_tsquery PGNSP PGUID 12 100 0 0 0 f f f f t f s s 1 0 3615 "25" _null_ _null_ _null_ _null_ _null_ plainto_tsquery _null_ _null_ _null_ )); DESCR("transform to tsquery"); +DATA(insert OID = 5001 ( phraseto_tsquery PGNSP PGUID 12 100 0 0 0 f f f f t f s s 1 0 3615 "25" _null_ _null_ _null_ _null_ _null_ phraseto_tsquery _null_ _null_ _null_ )); +DESCR("transform to tsquery"); DATA(insert OID = 3752 ( tsvector_update_trigger PGNSP PGUID 12 1 0 0 0 f f f f f f v s 0 0 2279 "" _null_ _null_ _null_ _null_ _null_ tsvector_update_trigger_byid _null_ _null_ _null_ )); DESCR("trigger for automatic update of tsvector column"); diff --git a/src/include/tsearch/ts_public.h b/src/include/tsearch/ts_public.h index 6f7a891ae8..9364eee438 100644 --- a/src/include/tsearch/ts_public.h +++ b/src/include/tsearch/ts_public.h @@ -34,16 +34,17 @@ typedef struct */ typedef struct { - uint32 selected:1, - in:1, - replace:1, - repeated:1, - skip:1, - unused:3, - type:8, - len:16; - char *word; - QueryOperand *item; + uint32 selected: 1, + in: 1, + replace: 1, + repeated: 1, + skip: 1, + unused: 3, + type: 8, + len: 16; + WordEntryPos pos; + char *word; + QueryOperand *item; } HeadlineWordEntry; typedef struct @@ -51,6 +52,7 @@ typedef struct HeadlineWordEntry *words; int32 lenwords; int32 curwords; + int32 vectorpos; /* positions a-la tsvector */ char *startsel; char *stopsel; char *fragdelim; diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h index bc99524dc0..5f4e596193 100644 --- 
a/src/include/tsearch/ts_type.h +++ b/src/include/tsearch/ts_type.h @@ -49,6 +49,8 @@ typedef struct #define MAXSTRLEN ( (1<<11) - 1) #define MAXSTRPOS ( (1<<20) - 1) +extern int comparePos(const void *a, const void *b); + /* * Equivalent to * typedef struct { @@ -213,15 +215,33 @@ typedef struct } QueryOperand; -/* Legal values for QueryOperator.operator */ -#define OP_NOT 1 -#define OP_AND 2 -#define OP_OR 3 +/* + * Legal values for QueryOperator.operator. + * They should be ordered by priority! We assume that phrase + * has highest priority, but this agreement is only + * for query transformation! This is needed to simplify + * the algorithm of query transformation. + */ +#define OP_OR 1 +#define OP_AND 2 +#define OP_NOT 3 +#define OP_PHRASE 4 +#define OP_NOT_PHRASE 5 /* + * OP_PHRASE negation operations must have greater + * priority in order to force infix() to surround + * the whole OP_PHRASE expression with parentheses. + */ + +#define TOP_PRIORITY 6 /* highest priority for val nodes */ + +#define OP_PRIORITY(x) (x) +#define QO_PRIORITY(x) OP_PRIORITY(((QueryOperator *) (x))->oper) typedef struct { QueryItemType type; int8 oper; /* see above */ + int16 distance; /* distance between args for OP_PHRASE */ uint32 left; /* pointer to left operand.
Right operand is * item + 1, left operand is placed * item+item->left */ @@ -304,6 +324,8 @@ extern Datum tsquery_numnode(PG_FUNCTION_ARGS); extern Datum tsquery_and(PG_FUNCTION_ARGS); extern Datum tsquery_or(PG_FUNCTION_ARGS); +extern Datum tsquery_phrase(PG_FUNCTION_ARGS); +extern Datum tsquery_phrase_distance(PG_FUNCTION_ARGS); extern Datum tsquery_not(PG_FUNCTION_ARGS); extern Datum tsquery_rewrite(PG_FUNCTION_ARGS); diff --git a/src/include/tsearch/ts_utils.h b/src/include/tsearch/ts_utils.h index 88533a6423..855bbfecd6 100644 --- a/src/include/tsearch/ts_utils.h +++ b/src/include/tsearch/ts_utils.h @@ -55,7 +55,7 @@ extern TSQuery parse_tsquery(char *buf, extern void pushValue(TSQueryParserState state, char *strval, int lenval, int16 weight, bool prefix); extern void pushStop(TSQueryParserState state); -extern void pushOperator(TSQueryParserState state, int8 oper); +extern void pushOperator(TSQueryParserState state, int8 oper, int16 distance); /* * parse plain text and lexize words @@ -104,8 +104,15 @@ extern text *generateHeadline(HeadlineParsedText *prs); /* * Common check function for tsvector @@ tsquery */ +typedef struct ExecPhraseData +{ + int npos; + bool allocated; + WordEntryPos *pos; +} ExecPhraseData; + extern bool TS_execute(QueryItem *curitem, void *checkval, bool calcnot, - bool (*chkcond) (void *checkval, QueryOperand *val)); + bool (*chkcond) (void *, QueryOperand *, ExecPhraseData *)); extern bool tsquery_requires_match(QueryItem *curitem); /* @@ -120,6 +127,8 @@ extern Datum to_tsquery_byid(PG_FUNCTION_ARGS); extern Datum to_tsquery(PG_FUNCTION_ARGS); extern Datum plainto_tsquery_byid(PG_FUNCTION_ARGS); extern Datum plainto_tsquery(PG_FUNCTION_ARGS); +extern Datum phraseto_tsquery_byid(PG_FUNCTION_ARGS); +extern Datum phraseto_tsquery(PG_FUNCTION_ARGS); /* * GiST support function @@ -169,7 +178,7 @@ extern Datum gin_tsquery_consistent_oldsig(PG_FUNCTION_ARGS); * TSQuery Utilities */ extern QueryItem *clean_NOT(QueryItem *ptr, int32 *len); 
-extern QueryItem *clean_fakeval(QueryItem *ptr, int32 *len); +extern TSQuery cleanup_fakeval_and_phrase(TSQuery in); typedef struct QTNode { diff --git a/src/test/regress/expected/tsdicts.out b/src/test/regress/expected/tsdicts.out index ef86295f88..5ddbe80234 100644 --- a/src/test/regress/expected/tsdicts.out +++ b/src/test/regress/expected/tsdicts.out @@ -434,9 +434,9 @@ SELECT to_tsvector('ispell_tst', 'Booking the skies after rebookings for footbal (1 row) SELECT to_tsquery('ispell_tst', 'footballklubber'); - to_tsquery ------------------------------------------------------------------------------- - ( 'footballklubber' | 'foot' & 'ball' & 'klubber' ) | 'football' & 'klubber' + to_tsquery +-------------------------------------------------------------------------- + 'footballklubber' | 'foot' & 'ball' & 'klubber' | 'football' & 'klubber' (1 row) SELECT to_tsquery('ispell_tst', 'footballyklubber:b & rebookings:A & sky'); @@ -458,9 +458,9 @@ SELECT to_tsvector('hunspell_tst', 'Booking the skies after rebookings for footb (1 row) SELECT to_tsquery('hunspell_tst', 'footballklubber'); - to_tsquery ------------------------------------------------------------------------------- - ( 'footballklubber' | 'foot' & 'ball' & 'klubber' ) | 'football' & 'klubber' + to_tsquery +-------------------------------------------------------------------------- + 'footballklubber' | 'foot' & 'ball' & 'klubber' | 'football' & 'klubber' (1 row) SELECT to_tsquery('hunspell_tst', 'footballyklubber:b & rebookings:A & sky'); @@ -469,6 +469,18 @@ SELECT to_tsquery('hunspell_tst', 'footballyklubber:b & rebookings:A & sky'); 'foot':B & 'ball':B & 'klubber':B & ( 'booking':A | 'book':A ) & 'sky' (1 row) +SELECT to_tsquery('hunspell_tst', 'footballyklubber:b <-> sky'); + to_tsquery +----------------------------------------------------------------------------- + ( 'foot':B <-> 'sky' ) & ( 'ball':B <-> 'sky' ) & ( 'klubber':B <-> 'sky' ) +(1 row) + +SELECT phraseto_tsquery('hunspell_tst', 
'footballyklubber sky'); + phraseto_tsquery +----------------------------------------------------------------------- + ( 'foot' <-> 'sky' ) & ( 'ball' <-> 'sky' ) & ( 'klubber' <-> 'sky' ) +(1 row) + -- Test ispell dictionary with hunspell affix with FLAG long in configuration ALTER TEXT SEARCH CONFIGURATION hunspell_tst ALTER MAPPING REPLACE hunspell WITH hunspell_long; @@ -479,9 +491,9 @@ SELECT to_tsvector('hunspell_tst', 'Booking the skies after rebookings for footb (1 row) SELECT to_tsquery('hunspell_tst', 'footballklubber'); - to_tsquery ------------------------------------------------------------------------------- - ( 'footballklubber' | 'foot' & 'ball' & 'klubber' ) | 'football' & 'klubber' + to_tsquery +-------------------------------------------------------------------------- + 'footballklubber' | 'foot' & 'ball' & 'klubber' | 'football' & 'klubber' (1 row) SELECT to_tsquery('hunspell_tst', 'footballyklubber:b & rebookings:A & sky'); @@ -500,9 +512,9 @@ SELECT to_tsvector('hunspell_tst', 'Booking the skies after rebookings for footb (1 row) SELECT to_tsquery('hunspell_tst', 'footballklubber'); - to_tsquery ------------------------------------------------------------------------------- - ( 'footballklubber' | 'foot' & 'ball' & 'klubber' ) | 'football' & 'klubber' + to_tsquery +-------------------------------------------------------------------------- + 'footballklubber' | 'foot' & 'ball' & 'klubber' | 'football' & 'klubber' (1 row) SELECT to_tsquery('hunspell_tst', 'footballyklubber:b & rebookings:A & sky'); diff --git a/src/test/regress/expected/tsearch.out b/src/test/regress/expected/tsearch.out index 3811250fb7..558f00cc4e 100644 --- a/src/test/regress/expected/tsearch.out +++ b/src/test/regress/expected/tsearch.out @@ -554,6 +554,235 @@ SELECT plainto_tsquery('english', 'foo bar') && 'asd | fg'; 'foo' & 'bar' & ( 'asd' | 'fg' ) (1 row) +-- Check stop word deletion, a and s are stop-words +SELECT to_tsquery('english', '(1 <-> 2) <-> a'); + to_tsquery 
+------------- + '1' <-> '2' +(1 row) + +SELECT to_tsquery('english', '(1 <-> a) <-> 2'); + to_tsquery +------------- + '1' <2> '2' +(1 row) + +SELECT to_tsquery('english', '(a <-> 1) <-> 2'); + to_tsquery +------------- + '1' <-> '2' +(1 row) + +SELECT to_tsquery('english', 'a <-> (1 <-> 2)'); + to_tsquery +------------- + '1' <-> '2' +(1 row) + +SELECT to_tsquery('english', '1 <-> (a <-> 2)'); + to_tsquery +------------- + '1' <-> '2' +(1 row) + +SELECT to_tsquery('english', '1 <-> (2 <-> a)'); + to_tsquery +------------- + '1' <-> '2' +(1 row) + +SELECT to_tsquery('english', '(1 <-> 2) <3> a'); + to_tsquery +------------- + '1' <-> '2' +(1 row) + +SELECT to_tsquery('english', '(1 <-> a) <3> 2'); + to_tsquery +------------- + '1' <4> '2' +(1 row) + +SELECT to_tsquery('english', '(a <-> 1) <3> 2'); + to_tsquery +------------- + '1' <3> '2' +(1 row) + +SELECT to_tsquery('english', 'a <3> (1 <-> 2)'); + to_tsquery +------------- + '1' <-> '2' +(1 row) + +SELECT to_tsquery('english', '1 <3> (a <-> 2)'); + to_tsquery +------------- + '1' <3> '2' +(1 row) + +SELECT to_tsquery('english', '1 <3> (2 <-> a)'); + to_tsquery +------------- + '1' <3> '2' +(1 row) + +SELECT to_tsquery('english', '(1 <3> 2) <-> a'); + to_tsquery +------------- + '1' <3> '2' +(1 row) + +SELECT to_tsquery('english', '(1 <3> a) <-> 2'); + to_tsquery +------------- + '1' <4> '2' +(1 row) + +SELECT to_tsquery('english', '(a <3> 1) <-> 2'); + to_tsquery +------------- + '1' <-> '2' +(1 row) + +SELECT to_tsquery('english', 'a <-> (1 <3> 2)'); + to_tsquery +------------- + '1' <3> '2' +(1 row) + +SELECT to_tsquery('english', '1 <-> (a <3> 2)'); + to_tsquery +------------- + '1' <-> '2' +(1 row) + +SELECT to_tsquery('english', '1 <-> (2 <3> a)'); + to_tsquery +------------- + '1' <-> '2' +(1 row) + +SELECT to_tsquery('english', '((a <-> 1) <-> 2) <-> s'); + to_tsquery +------------- + '1' <-> '2' +(1 row) + +SELECT to_tsquery('english', '(2 <-> (a <-> 1)) <-> s'); + to_tsquery +------------- + '2' <-> 
'1' +(1 row) + +SELECT to_tsquery('english', '((1 <-> a) <-> 2) <-> s'); + to_tsquery +------------- + '1' <2> '2' +(1 row) + +SELECT to_tsquery('english', '(2 <-> (1 <-> a)) <-> s'); + to_tsquery +------------- + '2' <-> '1' +(1 row) + +SELECT to_tsquery('english', 's <-> ((a <-> 1) <-> 2)'); + to_tsquery +------------- + '1' <-> '2' +(1 row) + +SELECT to_tsquery('english', 's <-> (2 <-> (a <-> 1))'); + to_tsquery +------------- + '2' <-> '1' +(1 row) + +SELECT to_tsquery('english', 's <-> ((1 <-> a) <-> 2)'); + to_tsquery +------------- + '1' <2> '2' +(1 row) + +SELECT to_tsquery('english', 's <-> (2 <-> (1 <-> a))'); + to_tsquery +------------- + '2' <-> '1' +(1 row) + +SELECT to_tsquery('english', '((a <-> 1) <-> s) <-> 2'); + to_tsquery +------------- + '1' <2> '2' +(1 row) + +SELECT to_tsquery('english', '(s <-> (a <-> 1)) <-> 2'); + to_tsquery +------------- + '1' <-> '2' +(1 row) + +SELECT to_tsquery('english', '((1 <-> a) <-> s) <-> 2'); + to_tsquery +------------- + '1' <3> '2' +(1 row) + +SELECT to_tsquery('english', '(s <-> (1 <-> a)) <-> 2'); + to_tsquery +------------- + '1' <2> '2' +(1 row) + +SELECT to_tsquery('english', '2 <-> ((a <-> 1) <-> s)'); + to_tsquery +------------- + '2' <-> '1' +(1 row) + +SELECT to_tsquery('english', '2 <-> (s <-> (a <-> 1))'); + to_tsquery +------------- + '2' <-> '1' +(1 row) + +SELECT to_tsquery('english', '2 <-> ((1 <-> a) <-> s)'); + to_tsquery +------------- + '2' <-> '1' +(1 row) + +SELECT to_tsquery('english', '2 <-> (s <-> (1 <-> a))'); + to_tsquery +------------- + '2' <-> '1' +(1 row) + +SELECT to_tsquery('foo <-> (a <-> (the <-> bar))'); + to_tsquery +----------------- + 'foo' <-> 'bar' +(1 row) + +SELECT to_tsquery('((foo <-> a) <-> the) <-> bar'); + to_tsquery +----------------- + 'foo' <3> 'bar' +(1 row) + +SELECT to_tsquery('foo <-> a <-> the <-> bar'); + to_tsquery +----------------- + 'foo' <3> 'bar' +(1 row) + +SELECT phraseto_tsquery('PostgreSQL can be extended by the user in many ways'); + 
phraseto_tsquery +----------------------------------------------------------------------- + ( ( ( 'postgresql' <3> 'extend' ) <3> 'user' ) <2> 'mani' ) <-> 'way' +(1 row) + SELECT ts_rank_cd(to_tsvector('english', ' Day after day, day after day, We stuck, nor breath nor motion, @@ -602,6 +831,22 @@ S. T. Coleridge (1772-1834) 0.1 (1 row) +SELECT ts_rank_cd(to_tsvector('english', ' +Day after day, day after day, + We stuck, nor breath nor motion, +As idle as a painted Ship + Upon a painted Ocean. +Water, water, every where + And all the boards did shrink; +Water, water, every where, + Nor any drop to drink. +S. T. Coleridge (1772-1834) +'), to_tsquery('english', 'painted <-> Ship')); + ts_rank_cd +------------ + 0.1 +(1 row) + SELECT ts_rank_cd(strip(to_tsvector('both stripped')), to_tsquery('both & stripped')); ts_rank_cd @@ -674,6 +919,44 @@ S. T. Coleridge (1772-1834) Water, water, every where (1 row) +SELECT ts_headline('english', ' +Day after day, day after day, + We stuck, nor breath nor motion, +As idle as a painted Ship + Upon a painted Ocean. +Water, water, every where + And all the boards did shrink; +Water, water, every where, + Nor any drop to drink. +S. T. Coleridge (1772-1834) +', phraseto_tsquery('english', 'painted Ocean')); + ts_headline +---------------------------------- + painted Ocean. + + Water, water, every where + + And all the boards did shrink;+ + Water, water, every +(1 row) + +SELECT ts_headline('english', ' +Day after day, day after day, + We stuck, nor breath nor motion, +As idle as a painted Ship + Upon a painted Ocean. +Water, water, every where + And all the boards did shrink; +Water, water, every where, + Nor any drop to drink. +S. T. Coleridge (1772-1834) +', phraseto_tsquery('english', 'idle as a painted Ship')); + ts_headline +--------------------------------------------- + idle as a painted Ship+ + Upon a painted Ocean. 
+ + Water, water, every where + + And all the boards +(1 row) + SELECT ts_headline('english', ' @@ -703,6 +986,24 @@ to_tsquery('english', 'sea&foo'), 'HighlightAll=true'); (1 row) +SELECT ts_headline('simple', '1 2 3 1 3'::text, '1 <-> 3', 'MaxWords=2, MinWords=1'); + ts_headline +------------------- + 1 3 +(1 row) + +SELECT ts_headline('simple', '1 2 3 1 3'::text, '1 & 3', 'MaxWords=4, MinWords=1'); + ts_headline +------------------------------ + 1 2 3 1 +(1 row) + +SELECT ts_headline('simple', '1 2 3 1 3'::text, '1 <-> 3', 'MaxWords=4, MinWords=1'); + ts_headline +------------------- + 1 3 +(1 row) + --Check if headline fragments work SELECT ts_headline('english', ' Day after day, day after day, @@ -805,13 +1106,13 @@ UPDATE test_tsquery SET sample = to_tsquery('english', txtsample::text); SELECT COUNT(*) FROM test_tsquery WHERE keyword < 'new & york'; count ------- - 1 + 2 (1 row) SELECT COUNT(*) FROM test_tsquery WHERE keyword <= 'new & york'; count ------- - 2 + 3 (1 row) SELECT COUNT(*) FROM test_tsquery WHERE keyword = 'new & york'; @@ -823,13 +1124,13 @@ SELECT COUNT(*) FROM test_tsquery WHERE keyword = 'new & york'; SELECT COUNT(*) FROM test_tsquery WHERE keyword >= 'new & york'; count ------- - 3 + 4 (1 row) SELECT COUNT(*) FROM test_tsquery WHERE keyword > 'new & york'; count ------- - 2 + 3 (1 row) CREATE UNIQUE INDEX bt_tsq ON test_tsquery (keyword); @@ -837,13 +1138,13 @@ SET enable_seqscan=OFF; SELECT COUNT(*) FROM test_tsquery WHERE keyword < 'new & york'; count ------- - 1 + 2 (1 row) SELECT COUNT(*) FROM test_tsquery WHERE keyword <= 'new & york'; count ------- - 2 + 3 (1 row) SELECT COUNT(*) FROM test_tsquery WHERE keyword = 'new & york'; @@ -855,20 +1156,20 @@ SELECT COUNT(*) FROM test_tsquery WHERE keyword = 'new & york'; SELECT COUNT(*) FROM test_tsquery WHERE keyword >= 'new & york'; count ------- - 3 + 4 (1 row) SELECT COUNT(*) FROM test_tsquery WHERE keyword > 'new & york'; count ------- - 2 + 3 (1 row) RESET enable_seqscan; SELECT 
ts_rewrite('foo & bar & qq & new & york', 'new & york'::tsquery, 'big & apple | nyc | new & york & city'); - ts_rewrite ----------------------------------------------------------------------------------- - 'foo' & 'bar' & 'qq' & ( 'city' & 'new' & 'york' | ( 'nyc' | 'big' & 'apple' ) ) + ts_rewrite +------------------------------------------------------------------------------ + 'foo' & 'bar' & 'qq' & ( 'nyc' | 'big' & 'apple' | 'city' & 'new' & 'york' ) (1 row) SELECT ts_rewrite('moscow', 'SELECT keyword, sample FROM test_tsquery'::text ); @@ -884,9 +1185,9 @@ SELECT ts_rewrite('moscow & hotel', 'SELECT keyword, sample FROM test_tsquery':: (1 row) SELECT ts_rewrite('bar & new & qq & foo & york', 'SELECT keyword, sample FROM test_tsquery'::text ); - ts_rewrite -------------------------------------------------------------------------------------- - 'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | ( 'big' & 'appl' | 'new' & 'york' ) ) + ts_rewrite +--------------------------------------------------------------------------------- + ( 'nyc' | 'big' & 'appl' | 'new' & 'york' ) & 'citi' & 'foo' & ( 'bar' | 'qq' ) (1 row) SELECT ts_rewrite( 'moscow', 'SELECT keyword, sample FROM test_tsquery'); @@ -902,9 +1203,33 @@ SELECT ts_rewrite( 'moscow & hotel', 'SELECT keyword, sample FROM test_tsquery') (1 row) SELECT ts_rewrite( 'bar & new & qq & foo & york', 'SELECT keyword, sample FROM test_tsquery'); - ts_rewrite -------------------------------------------------------------------------------------- - 'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | ( 'big' & 'appl' | 'new' & 'york' ) ) + ts_rewrite +--------------------------------------------------------------------------------- + ( 'nyc' | 'big' & 'appl' | 'new' & 'york' ) & 'citi' & 'foo' & ( 'bar' | 'qq' ) +(1 row) + +SELECT ts_rewrite('1 & (2 <-> 3)', 'SELECT keyword, sample FROM test_tsquery'::text ); + ts_rewrite +------------- + '2' <-> '4' +(1 row) + +SELECT ts_rewrite('1 & (2 <2> 3)', 'SELECT keyword, sample FROM 
test_tsquery'::text ); + ts_rewrite +----------------------- + '1' & ( '2' <2> '3' ) +(1 row) + +SELECT ts_rewrite('5 <-> (1 & (2 <-> 3))', 'SELECT keyword, sample FROM test_tsquery'::text ); + ts_rewrite +----------------------------------------------- + ( '5' <-> '1' ) & ( '5' <-> ( '2' <-> '3' ) ) +(1 row) + +SELECT ts_rewrite('5 <-> (6 | 8)', 'SELECT keyword, sample FROM test_tsquery'::text ); + ts_rewrite +--------------------------- + '5' <-> '7' | '5' <-> '8' (1 row) SELECT keyword FROM test_tsquery WHERE keyword @> 'new'; @@ -943,9 +1268,9 @@ SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_t (1 row) SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'bar & new & qq & foo & york') AS query; - ts_rewrite -------------------------------------------------------------------------------------- - 'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | ( 'big' & 'appl' | 'new' & 'york' ) ) + ts_rewrite +--------------------------------------------------------------------------------- + ( 'nyc' | 'big' & 'appl' | 'new' & 'york' ) & 'citi' & 'foo' & ( 'bar' | 'qq' ) (1 row) SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'moscow') AS query; @@ -961,9 +1286,9 @@ SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_t (1 row) SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'bar & new & qq & foo & york') AS query; - ts_rewrite -------------------------------------------------------------------------------------- - 'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | ( 'big' & 'appl' | 'new' & 'york' ) ) + ts_rewrite +--------------------------------------------------------------------------------- + ( 'nyc' | 'big' & 'appl' | 'new' & 'york' ) & 'citi' & 'foo' & ( 'bar' | 'qq' ) (1 row) CREATE INDEX qq ON test_tsquery USING gist (keyword tsquery_ops); @@ -1004,9 +1329,9 @@ SELECT 
ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_t (1 row) SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'bar & new & qq & foo & york') AS query; - ts_rewrite -------------------------------------------------------------------------------------- - 'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | ( 'big' & 'appl' | 'new' & 'york' ) ) + ts_rewrite +--------------------------------------------------------------------------------- + ( 'nyc' | 'big' & 'appl' | 'new' & 'york' ) & 'citi' & 'foo' & ( 'bar' | 'qq' ) (1 row) SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'moscow') AS query; @@ -1022,9 +1347,9 @@ SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_t (1 row) SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'bar & new & qq & foo & york') AS query; - ts_rewrite -------------------------------------------------------------------------------------- - 'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | ( 'big' & 'appl' | 'new' & 'york' ) ) + ts_rewrite +--------------------------------------------------------------------------------- + ( 'nyc' | 'big' & 'appl' | 'new' & 'york' ) & 'citi' & 'foo' & ( 'bar' | 'qq' ) (1 row) RESET enable_seqscan; @@ -1132,3 +1457,15 @@ select * from pendtest where 'ipi:*'::tsquery @@ ts; ---- (0 rows) +--check OP_PHRASE on index +create temp table phrase_index_test(fts tsvector); +insert into phrase_index_test values('A fat cat has just eaten a rat.'); +create index phrase_index_test_idx on phrase_index_test using gin(fts); +set enable_seqscan = off; +select * from phrase_index_test where fts @@ phraseto_tsquery('fat cat'); + fts +------------------------------------------------- + 'A' 'a' 'cat' 'eaten' 'fat' 'has' 'just' 'rat.' 
+(1 row) + +set enable_seqscan = on; diff --git a/src/test/regress/expected/tstypes.out b/src/test/regress/expected/tstypes.out index a386a46361..c904c1c705 100644 --- a/src/test/regress/expected/tstypes.out +++ b/src/test/regress/expected/tstypes.out @@ -277,15 +277,15 @@ SELECT '(!1|2)&3'::tsquery; (1 row) SELECT '1|(2|(4|(5|6)))'::tsquery; - tsquery ------------------------------------------ - '1' | ( '2' | ( '4' | ( '5' | '6' ) ) ) + tsquery +----------------------------- + '1' | '2' | '4' | '5' | '6' (1 row) SELECT '1|2|4|5|6'::tsquery; - tsquery ------------------------------------------ - ( ( ( '1' | '2' ) | '4' ) | '5' ) | '6' + tsquery +----------------------------- + '1' | '2' | '4' | '5' | '6' (1 row) SELECT '1&(2&(4&(5&6)))'::tsquery; @@ -325,11 +325,139 @@ SELECT $$'\\as'$$::tsquery; (1 row) SELECT 'a:* & nbb:*ac | doo:a* | goo'::tsquery; + tsquery +-------------------------------------- + 'a':* & 'nbb':*AC | 'doo':*A | 'goo' +(1 row) + +-- phrase transformation +SELECT 'a <-> (b|c)'::tsquery; + tsquery +--------------------------- + 'a' <-> 'b' | 'a' <-> 'c' +(1 row) + +SELECT '(a|b) <-> c'::tsquery; + tsquery +--------------------------- + 'a' <-> 'c' | 'b' <-> 'c' +(1 row) + +SELECT '(a|b) <-> (d|c)'::tsquery; + tsquery +------------------------------------------------------- + 'a' <-> 'd' | 'b' <-> 'd' | 'a' <-> 'c' | 'b' <-> 'c' +(1 row) + +SELECT 'a <-> (b&c)'::tsquery; + tsquery +----------------------------------- + ( 'a' <-> 'b' ) & ( 'a' <-> 'c' ) +(1 row) + +SELECT '(a&b) <-> c'::tsquery; + tsquery +----------------------------------- + ( 'a' <-> 'c' ) & ( 'b' <-> 'c' ) +(1 row) + +SELECT '(a&b) <-> (d&c)'::tsquery; + tsquery +----------------------------------------------------------------------- + ( 'a' <-> 'd' ) & ( 'b' <-> 'd' ) & ( 'a' <-> 'c' ) & ( 'b' <-> 'c' ) +(1 row) + +SELECT 'a <-> !b'::tsquery; + tsquery +------------------------ + 'a' & !( 'a' <-> 'b' ) +(1 row) + +SELECT '!a <-> b'::tsquery; + tsquery +------------------------ 
+ !( 'a' <-> 'b' ) & 'b' +(1 row) + +SELECT '!a <-> !b'::tsquery; + tsquery +------------------------------------ + !'a' & !( !( 'a' <-> 'b' ) & 'b' ) +(1 row) + +SELECT 'a <-> !(b&c)'::tsquery; + tsquery +---------------------------------------------- + 'a' & !( ( 'a' <-> 'b' ) & ( 'a' <-> 'c' ) ) +(1 row) + +SELECT 'a <-> !(b|c)'::tsquery; + tsquery +-------------------------------------- + 'a' & !( 'a' <-> 'b' | 'a' <-> 'c' ) +(1 row) + +SELECT '!(a&b) <-> c'::tsquery; + tsquery +---------------------------------------------- + !( ( 'a' <-> 'c' ) & ( 'b' <-> 'c' ) ) & 'c' +(1 row) + +SELECT '!(a|b) <-> c'::tsquery; + tsquery +-------------------------------------- + !( 'a' <-> 'c' | 'b' <-> 'c' ) & 'c' +(1 row) + +SELECT '(!a|b) <-> c'::tsquery; + tsquery +-------------------------------------- + !( 'a' <-> 'c' ) & 'c' | 'b' <-> 'c' +(1 row) + +SELECT '(!a&b) <-> c'::tsquery; tsquery ------------------------------------------ - ( 'a':* & 'nbb':*AC | 'doo':*A ) | 'goo' + !( 'a' <-> 'c' ) & 'c' & ( 'b' <-> 'c' ) +(1 row) + +SELECT 'c <-> (!a|b)'::tsquery; + tsquery +-------------------------------------- + 'c' & !( 'c' <-> 'a' ) | 'c' <-> 'b' +(1 row) + +SELECT 'c <-> (!a&b)'::tsquery; + tsquery +------------------------------------------ + 'c' & !( 'c' <-> 'a' ) & ( 'c' <-> 'b' ) +(1 row) + +SELECT '(a|b) <-> !c'::tsquery; + tsquery +------------------------------------------------ + ( 'a' | 'b' ) & !( 'a' <-> 'c' | 'b' <-> 'c' ) +(1 row) + +SELECT '(a&b) <-> !c'::tsquery; + tsquery +---------------------------------------------------- + 'a' & 'b' & !( ( 'a' <-> 'c' ) & ( 'b' <-> 'c' ) ) +(1 row) + +SELECT '!c <-> (a|b)'::tsquery; + tsquery +------------------------------------------------- + !( 'c' <-> 'a' ) & 'a' | !( 'c' <-> 'b' ) & 'b' +(1 row) + +SELECT '!c <-> (a&b)'::tsquery; + tsquery +------------------------------------------------- + !( 'c' <-> 'a' ) & 'a' & !( 'c' <-> 'b' ) & 'b' (1 row) +--comparisons SELECT 'a' < 'b & c'::tsquery as "true"; true 
------ @@ -342,10 +470,10 @@ SELECT 'a' > 'b & c'::tsquery as "false"; f (1 row) -SELECT 'a | f' < 'b & c'::tsquery as "true"; - true ------- - t +SELECT 'a | f' < 'b & c'::tsquery as "false"; + false +------- + f (1 row) SELECT 'a | ff' < 'b & c'::tsquery as "false"; @@ -360,6 +488,7 @@ SELECT 'a | f | g' < 'b & c'::tsquery as "false"; f (1 row) +--concatenation SELECT numnode( 'new'::tsquery ); numnode --------- @@ -402,6 +531,36 @@ SELECT 'foo & bar'::tsquery && 'asd | fg'; 'foo' & 'bar' & ( 'asd' | 'fg' ) (1 row) +SELECT 'a' <-> 'b & d'::tsquery; + ?column? +----------------------------------- + ( 'a' <-> 'b' ) & ( 'a' <-> 'd' ) +(1 row) + +SELECT 'a & g' <-> 'b & d'::tsquery; + ?column? +----------------------------------------------------------------------- + ( 'a' <-> 'b' ) & ( 'g' <-> 'b' ) & ( 'a' <-> 'd' ) & ( 'g' <-> 'd' ) +(1 row) + +SELECT 'a & g' <-> 'b | d'::tsquery; + ?column? +----------------------------------------------------------------------- + ( 'a' <-> 'b' ) & ( 'g' <-> 'b' ) | ( 'a' <-> 'd' ) & ( 'g' <-> 'd' ) +(1 row) + +SELECT 'a & g' <-> 'b <-> d'::tsquery; + ?column? 
+----------------------------------------------------------- + ( 'a' <-> ( 'b' <-> 'd' ) ) & ( 'g' <-> ( 'b' <-> 'd' ) ) +(1 row) + +SELECT tsquery_phrase('a <3> g', 'b & d', 10); + tsquery_phrase +------------------------------------------------------------- + ( ( 'a' <3> 'g' ) <10> 'b' ) & ( ( 'a' <3> 'g' ) <10> 'd' ) +(1 row) + -- tsvector-tsquery operations SELECT 'a b:89 ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca' as "true"; true @@ -499,6 +658,80 @@ SELECT 'supeznova supernova'::tsvector @@ 'super:*'::tsquery AS "true"; t (1 row) +--phrase search +SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <-> 2' AS "true"; + true +------ + t +(1 row) + +SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <2> 2' AS "true"; + true +------ + t +(1 row) + +SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <-> 3' AS "false"; + false +------- + f +(1 row) + +SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <2> 3' AS "true"; + true +------ + t +(1 row) + +SELECT to_tsvector('simple', '1 2 11 3') @@ '1 <-> 3' AS "false"; + false +------- + f +(1 row) + +SELECT to_tsvector('simple', '1 2 11 3') @@ '1:* <-> 3' AS "true"; + true +------ + t +(1 row) + +SELECT to_tsvector('simple', '1 2 3 4') @@ '1 <-> 2 <-> 3' AS "true"; + true +------ + t +(1 row) + +SELECT to_tsvector('simple', '1 2 3 4') @@ '(1 <-> 2) <-> 3' AS "true"; + true +------ + t +(1 row) + +SELECT to_tsvector('simple', '1 2 3 4') @@ '1 <-> (2 <-> 3)' AS "false"; + false +------- + f +(1 row) + +SELECT to_tsvector('simple', '1 2 3 4') @@ '1 <2> (2 <-> 3)' AS "true"; + true +------ + t +(1 row) + +SELECT to_tsvector('simple', '1 2 1 2 3 4') @@ '(1 <-> 2) <-> 3' AS "true"; + true +------ + t +(1 row) + +SELECT to_tsvector('simple', '1 2 1 2 3 4') @@ '1 <-> 2 <-> 3' AS "true"; + true +------ + t +(1 row) + +--ranking SELECT ts_rank(' a:1 s:2C d g'::tsvector, 'a | s'); ts_rank ----------- @@ -613,6 +846,120 @@ SELECT ts_rank_cd(' a:1 s:2 d g'::tsvector, 'a & s'); 0.1 (1 row) +SELECT ts_rank_cd(' a:1 s:2A d g'::tsvector, 'a <-> s'); + 
ts_rank_cd +------------ + 0.181818 +(1 row) + +SELECT ts_rank_cd(' a:1 s:2C d g'::tsvector, 'a <-> s'); + ts_rank_cd +------------ + 0.133333 +(1 row) + +SELECT ts_rank_cd(' a:1 s:2 d g'::tsvector, 'a <-> s'); + ts_rank_cd +------------ + 0.1 +(1 row) + +SELECT ts_rank_cd(' a:1 s:2 d:2A g'::tsvector, 'a <-> s'); + ts_rank_cd +------------ + 0.1 +(1 row) + +SELECT ts_rank_cd(' a:1 s:2,3A d:2A g'::tsvector, 'a <2> s:A'); + ts_rank_cd +------------ + 0.0909091 +(1 row) + +SELECT ts_rank_cd(' a:1 b:2 s:3A d:2A g'::tsvector, 'a <2> s:A'); + ts_rank_cd +------------ + 0.0909091 +(1 row) + +SELECT ts_rank_cd(' a:1 sa:2D sb:2A g'::tsvector, 'a <-> s:*'); + ts_rank_cd +------------ + 0.1 +(1 row) + +SELECT ts_rank_cd(' a:1 sa:2A sb:2D g'::tsvector, 'a <-> s:*'); + ts_rank_cd +------------ + 0.1 +(1 row) + +SELECT ts_rank_cd(' a:1 sa:2A sb:2D g'::tsvector, 'a <-> s:* <-> sa:A'); + ts_rank_cd +------------ + 0.0714286 +(1 row) + +SELECT ts_rank_cd(' a:1 sa:2A sb:2D g'::tsvector, 'a <-> s:* <-> sa:B'); + ts_rank_cd +------------ + 0 +(1 row) + +SELECT 'a:1 b:2'::tsvector @@ 'a <-> b'::tsquery AS "true"; + true +------ + t +(1 row) + +SELECT 'a:1 b:2'::tsvector @@ 'a <0> b'::tsquery AS "false"; + false +------- + f +(1 row) + +SELECT 'a:1 b:2'::tsvector @@ 'a <1> b'::tsquery AS "true"; + true +------ + t +(1 row) + +SELECT 'a:1 b:2'::tsvector @@ 'a <2> b'::tsquery AS "true"; + true +------ + t +(1 row) + +SELECT 'a:1 b:3'::tsvector @@ 'a <-> b'::tsquery AS "false"; + false +------- + f +(1 row) + +SELECT 'a:1 b:3'::tsvector @@ 'a <0> b'::tsquery AS "false"; + false +------- + f +(1 row) + +SELECT 'a:1 b:3'::tsvector @@ 'a <1> b'::tsquery AS "false"; + false +------- + f +(1 row) + +SELECT 'a:1 b:3'::tsvector @@ 'a <2> b'::tsquery AS "true"; + true +------ + t +(1 row) + +SELECT 'a:1 b:3'::tsvector @@ 'a <3> b'::tsquery AS "true"; + true +------ + t +(1 row) + -- tsvector editing operations SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector); strip diff 
--git a/src/test/regress/sql/tsdicts.sql b/src/test/regress/sql/tsdicts.sql index d13ce2e378..4d0419e35a 100644 --- a/src/test/regress/sql/tsdicts.sql +++ b/src/test/regress/sql/tsdicts.sql @@ -142,6 +142,9 @@ SELECT to_tsvector('hunspell_tst', 'Booking the skies after rebookings for footb SELECT to_tsquery('hunspell_tst', 'footballklubber'); SELECT to_tsquery('hunspell_tst', 'footballyklubber:b & rebookings:A & sky'); +SELECT to_tsquery('hunspell_tst', 'footballyklubber:b <-> sky'); +SELECT phraseto_tsquery('hunspell_tst', 'footballyklubber sky'); + -- Test ispell dictionary with hunspell affix with FLAG long in configuration ALTER TEXT SEARCH CONFIGURATION hunspell_tst ALTER MAPPING REPLACE hunspell WITH hunspell_long; diff --git a/src/test/regress/sql/tsearch.sql b/src/test/regress/sql/tsearch.sql index 405278fb16..ccd152591a 100644 --- a/src/test/regress/sql/tsearch.sql +++ b/src/test/regress/sql/tsearch.sql @@ -129,6 +129,52 @@ SELECT plainto_tsquery('english', 'foo bar') || plainto_tsquery('english', 'asd SELECT plainto_tsquery('english', 'foo bar') || !!plainto_tsquery('english', 'asd fg'); SELECT plainto_tsquery('english', 'foo bar') && 'asd | fg'; +-- Check stop word deletion, a and s are stop-words +SELECT to_tsquery('english', '(1 <-> 2) <-> a'); +SELECT to_tsquery('english', '(1 <-> a) <-> 2'); +SELECT to_tsquery('english', '(a <-> 1) <-> 2'); +SELECT to_tsquery('english', 'a <-> (1 <-> 2)'); +SELECT to_tsquery('english', '1 <-> (a <-> 2)'); +SELECT to_tsquery('english', '1 <-> (2 <-> a)'); + +SELECT to_tsquery('english', '(1 <-> 2) <3> a'); +SELECT to_tsquery('english', '(1 <-> a) <3> 2'); +SELECT to_tsquery('english', '(a <-> 1) <3> 2'); +SELECT to_tsquery('english', 'a <3> (1 <-> 2)'); +SELECT to_tsquery('english', '1 <3> (a <-> 2)'); +SELECT to_tsquery('english', '1 <3> (2 <-> a)'); + +SELECT to_tsquery('english', '(1 <3> 2) <-> a'); +SELECT to_tsquery('english', '(1 <3> a) <-> 2'); +SELECT to_tsquery('english', '(a <3> 1) <-> 2'); +SELECT 
to_tsquery('english', 'a <-> (1 <3> 2)'); +SELECT to_tsquery('english', '1 <-> (a <3> 2)'); +SELECT to_tsquery('english', '1 <-> (2 <3> a)'); + +SELECT to_tsquery('english', '((a <-> 1) <-> 2) <-> s'); +SELECT to_tsquery('english', '(2 <-> (a <-> 1)) <-> s'); +SELECT to_tsquery('english', '((1 <-> a) <-> 2) <-> s'); +SELECT to_tsquery('english', '(2 <-> (1 <-> a)) <-> s'); +SELECT to_tsquery('english', 's <-> ((a <-> 1) <-> 2)'); +SELECT to_tsquery('english', 's <-> (2 <-> (a <-> 1))'); +SELECT to_tsquery('english', 's <-> ((1 <-> a) <-> 2)'); +SELECT to_tsquery('english', 's <-> (2 <-> (1 <-> a))'); + +SELECT to_tsquery('english', '((a <-> 1) <-> s) <-> 2'); +SELECT to_tsquery('english', '(s <-> (a <-> 1)) <-> 2'); +SELECT to_tsquery('english', '((1 <-> a) <-> s) <-> 2'); +SELECT to_tsquery('english', '(s <-> (1 <-> a)) <-> 2'); +SELECT to_tsquery('english', '2 <-> ((a <-> 1) <-> s)'); +SELECT to_tsquery('english', '2 <-> (s <-> (a <-> 1))'); +SELECT to_tsquery('english', '2 <-> ((1 <-> a) <-> s)'); +SELECT to_tsquery('english', '2 <-> (s <-> (1 <-> a))'); + +SELECT to_tsquery('foo <-> (a <-> (the <-> bar))'); +SELECT to_tsquery('((foo <-> a) <-> the) <-> bar'); +SELECT to_tsquery('foo <-> a <-> the <-> bar'); +SELECT phraseto_tsquery('PostgreSQL can be extended by the user in many ways'); + + SELECT ts_rank_cd(to_tsvector('english', ' Day after day, day after day, We stuck, nor breath nor motion, @@ -165,6 +211,18 @@ Water, water, every where, S. T. Coleridge (1772-1834) '), to_tsquery('english', 'ocean')); +SELECT ts_rank_cd(to_tsvector('english', ' +Day after day, day after day, + We stuck, nor breath nor motion, +As idle as a painted Ship + Upon a painted Ocean. +Water, water, every where + And all the boards did shrink; +Water, water, every where, + Nor any drop to drink. +S. T. 
Coleridge (1772-1834) +'), to_tsquery('english', 'painted <-> Ship')); + SELECT ts_rank_cd(strip(to_tsvector('both stripped')), to_tsquery('both & stripped')); @@ -208,6 +266,30 @@ Water, water, every where, S. T. Coleridge (1772-1834) ', to_tsquery('english', 'ocean')); +SELECT ts_headline('english', ' +Day after day, day after day, + We stuck, nor breath nor motion, +As idle as a painted Ship + Upon a painted Ocean. +Water, water, every where + And all the boards did shrink; +Water, water, every where, + Nor any drop to drink. +S. T. Coleridge (1772-1834) +', phraseto_tsquery('english', 'painted Ocean')); + +SELECT ts_headline('english', ' +Day after day, day after day, + We stuck, nor breath nor motion, +As idle as a painted Ship + Upon a painted Ocean. +Water, water, every where + And all the boards did shrink; +Water, water, every where, + Nor any drop to drink. +S. T. Coleridge (1772-1834) +', phraseto_tsquery('english', 'idle as a painted Ship')); + SELECT ts_headline('english', ' @@ -222,6 +304,10 @@ ff-bg ', to_tsquery('english', 'sea&foo'), 'HighlightAll=true'); +SELECT ts_headline('simple', '1 2 3 1 3'::text, '1 <-> 3', 'MaxWords=2, MinWords=1'); +SELECT ts_headline('simple', '1 2 3 1 3'::text, '1 & 3', 'MaxWords=4, MinWords=1'); +SELECT ts_headline('simple', '1 2 3 1 3'::text, '1 <-> 3', 'MaxWords=4, MinWords=1'); + --Check if headline fragments work SELECT ts_headline('english', ' Day after day, day after day, @@ -283,6 +369,8 @@ CREATE TABLE test_tsquery (txtkeyword TEXT, txtsample TEXT); Moscow moskva | moscow 'Sanct Peter' Peterburg | peter | 'Sanct Peterburg' 'foo bar qq' foo & (bar | qq) & city +1 & (2 <-> 3) 2 <-> 4 +5 <-> 6 5 <-> 7 \. 
\set ECHO all @@ -320,6 +408,11 @@ SELECT ts_rewrite( 'moscow', 'SELECT keyword, sample FROM test_tsquery'); SELECT ts_rewrite( 'moscow & hotel', 'SELECT keyword, sample FROM test_tsquery'); SELECT ts_rewrite( 'bar & new & qq & foo & york', 'SELECT keyword, sample FROM test_tsquery'); +SELECT ts_rewrite('1 & (2 <-> 3)', 'SELECT keyword, sample FROM test_tsquery'::text ); +SELECT ts_rewrite('1 & (2 <2> 3)', 'SELECT keyword, sample FROM test_tsquery'::text ); +SELECT ts_rewrite('5 <-> (1 & (2 <-> 3))', 'SELECT keyword, sample FROM test_tsquery'::text ); +SELECT ts_rewrite('5 <-> (6 | 8)', 'SELECT keyword, sample FROM test_tsquery'::text ); + SELECT keyword FROM test_tsquery WHERE keyword @> 'new'; SELECT keyword FROM test_tsquery WHERE keyword @> 'moscow'; @@ -386,3 +479,11 @@ select * from pendtest where 'ipsa:*'::tsquery @@ ts; select * from pendtest where 'ips:*'::tsquery @@ ts; select * from pendtest where 'ipt:*'::tsquery @@ ts; select * from pendtest where 'ipi:*'::tsquery @@ ts; + +--check OP_PHRASE on index +create temp table phrase_index_test(fts tsvector); +insert into phrase_index_test values('A fat cat has just eaten a rat.'); +create index phrase_index_test_idx on phrase_index_test using gin(fts); +set enable_seqscan = off; +select * from phrase_index_test where fts @@ phraseto_tsquery('fat cat'); +set enable_seqscan = on; diff --git a/src/test/regress/sql/tstypes.sql b/src/test/regress/sql/tstypes.sql index 38b7f65c25..ecc71c85e6 100644 --- a/src/test/regress/sql/tstypes.sql +++ b/src/test/regress/sql/tstypes.sql @@ -58,12 +58,42 @@ SELECT E'1&(''2''&('' 4''&(\\|5 | ''6 \\'' !|&'')))'::tsquery; SELECT $$'\\as'$$::tsquery; SELECT 'a:* & nbb:*ac | doo:a* | goo'::tsquery; +-- phrase transformation +SELECT 'a <-> (b|c)'::tsquery; +SELECT '(a|b) <-> c'::tsquery; +SELECT '(a|b) <-> (d|c)'::tsquery; + +SELECT 'a <-> (b&c)'::tsquery; +SELECT '(a&b) <-> c'::tsquery; +SELECT '(a&b) <-> (d&c)'::tsquery; + +SELECT 'a <-> !b'::tsquery; +SELECT '!a <-> b'::tsquery; 
+SELECT '!a <-> !b'::tsquery; + +SELECT 'a <-> !(b&c)'::tsquery; +SELECT 'a <-> !(b|c)'::tsquery; +SELECT '!(a&b) <-> c'::tsquery; +SELECT '!(a|b) <-> c'::tsquery; + +SELECT '(!a|b) <-> c'::tsquery; +SELECT '(!a&b) <-> c'::tsquery; +SELECT 'c <-> (!a|b)'::tsquery; +SELECT 'c <-> (!a&b)'::tsquery; + +SELECT '(a|b) <-> !c'::tsquery; +SELECT '(a&b) <-> !c'::tsquery; +SELECT '!c <-> (a|b)'::tsquery; +SELECT '!c <-> (a&b)'::tsquery; + +--comparisons SELECT 'a' < 'b & c'::tsquery as "true"; SELECT 'a' > 'b & c'::tsquery as "false"; -SELECT 'a | f' < 'b & c'::tsquery as "true"; +SELECT 'a | f' < 'b & c'::tsquery as "false"; SELECT 'a | ff' < 'b & c'::tsquery as "false"; SELECT 'a | f | g' < 'b & c'::tsquery as "false"; +--concatenation SELECT numnode( 'new'::tsquery ); SELECT numnode( 'new & york'::tsquery ); SELECT numnode( 'new & york | qwery'::tsquery ); @@ -72,6 +102,11 @@ SELECT 'foo & bar'::tsquery && 'asd'; SELECT 'foo & bar'::tsquery || 'asd & fg'; SELECT 'foo & bar'::tsquery || !!'asd & fg'::tsquery; SELECT 'foo & bar'::tsquery && 'asd | fg'; +SELECT 'a' <-> 'b & d'::tsquery; +SELECT 'a & g' <-> 'b & d'::tsquery; +SELECT 'a & g' <-> 'b | d'::tsquery; +SELECT 'a & g' <-> 'b <-> d'::tsquery; +SELECT tsquery_phrase('a <3> g', 'b & d', 10); -- tsvector-tsquery operations @@ -93,6 +128,23 @@ SELECT 'supernova'::tsvector @@ 'super:*'::tsquery AS "true"; SELECT 'supeanova supernova'::tsvector @@ 'super:*'::tsquery AS "true"; SELECT 'supeznova supernova'::tsvector @@ 'super:*'::tsquery AS "true"; +--phrase search +SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <-> 2' AS "true"; +SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <2> 2' AS "true"; +SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <-> 3' AS "false"; +SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <2> 3' AS "true"; + +SELECT to_tsvector('simple', '1 2 11 3') @@ '1 <-> 3' AS "false"; +SELECT to_tsvector('simple', '1 2 11 3') @@ '1:* <-> 3' AS "true"; + +SELECT to_tsvector('simple', '1 2 3 4') @@ '1 <-> 2 <-> 3' AS 
"true"; +SELECT to_tsvector('simple', '1 2 3 4') @@ '(1 <-> 2) <-> 3' AS "true"; +SELECT to_tsvector('simple', '1 2 3 4') @@ '1 <-> (2 <-> 3)' AS "false"; +SELECT to_tsvector('simple', '1 2 3 4') @@ '1 <2> (2 <-> 3)' AS "true"; +SELECT to_tsvector('simple', '1 2 1 2 3 4') @@ '(1 <-> 2) <-> 3' AS "true"; +SELECT to_tsvector('simple', '1 2 1 2 3 4') @@ '1 <-> 2 <-> 3' AS "true"; + +--ranking SELECT ts_rank(' a:1 s:2C d g'::tsvector, 'a | s'); SELECT ts_rank(' a:1 sa:2C d g'::tsvector, 'a | s'); SELECT ts_rank(' a:1 sa:2C d g'::tsvector, 'a | s:*'); @@ -114,6 +166,27 @@ SELECT ts_rank_cd(' a:1 s:2C d g'::tsvector, 'a & s'); SELECT ts_rank_cd(' a:1 s:2B d g'::tsvector, 'a & s'); SELECT ts_rank_cd(' a:1 s:2 d g'::tsvector, 'a & s'); +SELECT ts_rank_cd(' a:1 s:2A d g'::tsvector, 'a <-> s'); +SELECT ts_rank_cd(' a:1 s:2C d g'::tsvector, 'a <-> s'); +SELECT ts_rank_cd(' a:1 s:2 d g'::tsvector, 'a <-> s'); +SELECT ts_rank_cd(' a:1 s:2 d:2A g'::tsvector, 'a <-> s'); +SELECT ts_rank_cd(' a:1 s:2,3A d:2A g'::tsvector, 'a <2> s:A'); +SELECT ts_rank_cd(' a:1 b:2 s:3A d:2A g'::tsvector, 'a <2> s:A'); +SELECT ts_rank_cd(' a:1 sa:2D sb:2A g'::tsvector, 'a <-> s:*'); +SELECT ts_rank_cd(' a:1 sa:2A sb:2D g'::tsvector, 'a <-> s:*'); +SELECT ts_rank_cd(' a:1 sa:2A sb:2D g'::tsvector, 'a <-> s:* <-> sa:A'); +SELECT ts_rank_cd(' a:1 sa:2A sb:2D g'::tsvector, 'a <-> s:* <-> sa:B'); + +SELECT 'a:1 b:2'::tsvector @@ 'a <-> b'::tsquery AS "true"; +SELECT 'a:1 b:2'::tsvector @@ 'a <0> b'::tsquery AS "false"; +SELECT 'a:1 b:2'::tsvector @@ 'a <1> b'::tsquery AS "true"; +SELECT 'a:1 b:2'::tsvector @@ 'a <2> b'::tsquery AS "true"; +SELECT 'a:1 b:3'::tsvector @@ 'a <-> b'::tsquery AS "false"; +SELECT 'a:1 b:3'::tsvector @@ 'a <0> b'::tsquery AS "false"; +SELECT 'a:1 b:3'::tsvector @@ 'a <1> b'::tsquery AS "false"; +SELECT 'a:1 b:3'::tsvector @@ 'a <2> b'::tsquery AS "true"; +SELECT 'a:1 b:3'::tsvector @@ 'a <3> b'::tsquery AS "true"; + -- tsvector editing operations SELECT strip('w:12B w:13* 
w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector); -- 2.40.0