From 028350f619f7688e0453fcd2c4b25abe9ba30fa7 Mon Sep 17 00:00:00 2001 From: Teodor Sigaev Date: Mon, 27 Jun 2016 20:41:00 +0300 Subject: [PATCH] Make exact distance match for FTS phrase operator Phrase operator now requires exact distance between lexemes instead of less-or-equal. Per discussion c19fcfec308e6ccd952cdde9e648b505@mail.gmail.com --- doc/src/sgml/textsearch.sgml | 6 +-- src/backend/utils/adt/tsvector_op.c | 67 +++++++++++++++------------ src/test/regress/expected/tstypes.out | 32 +++++++------ src/test/regress/sql/tstypes.sql | 7 +-- 4 files changed, 64 insertions(+), 48 deletions(-) diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml index 9028bedd1b..72bef9f49e 100644 --- a/doc/src/sgml/textsearch.sgml +++ b/doc/src/sgml/textsearch.sgml @@ -346,10 +346,10 @@ SELECT to_tsvector('error is not fatal') @@ to_tsquery('fatal <-> error'); There is a more general version of the FOLLOWED BY operator having the form <N>, - where N is an integer standing for the greatest distance + where N is an integer standing for the exact distance allowed between the matching lexemes. <1> is the same as <->, while <2> - allows one other lexeme to optionally appear between the matches, and so + allows one other lexeme to appear between the matches, and so on. The phraseto_tsquery function makes use of this operator to construct a tsquery that can match a multi-word phrase when some of the words are stop words. For example: @@ -1529,7 +1529,7 @@ SELECT to_tsquery('fat') <-> to_tsquery('cat | rat'); Returns a query that searches for a match to the first given query followed by a match to the second given query at a distance of at - most distance lexemes, using + distance lexemes, using the <N> tsquery operator. 
For example: diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c index 6117ba9b3e..04718829a0 100644 --- a/src/backend/utils/adt/tsvector_op.c +++ b/src/backend/utils/adt/tsvector_op.c @@ -1375,6 +1375,7 @@ TS_phrase_execute(QueryItem *curitem, ExecPhraseData Ldata = {0, false, NULL}, Rdata = {0, false, NULL}; WordEntryPos *Lpos, + *LposStart, *Rpos, *pos_iter = NULL; @@ -1416,52 +1417,60 @@ TS_phrase_execute(QueryItem *curitem, pos_iter = data->pos; } - Lpos = Ldata.pos; - Rpos = Rdata.pos; - /* * Find matches by distance, WEP_GETPOS() is needed because * ExecPhraseData->data can point to the tsvector's WordEntryPosVector */ + Rpos = Rdata.pos; + LposStart = Ldata.pos; while (Rpos < Rdata.pos + Rdata.npos) { + /* + * We need to check all possible distances, so reset Lpos + * to a guaranteed not yet satisfied position. + */ + Lpos = LposStart; while (Lpos < Ldata.pos + Ldata.npos) { - if (WEP_GETPOS(*Lpos) <= WEP_GETPOS(*Rpos)) + if (WEP_GETPOS(*Rpos) - WEP_GETPOS(*Lpos) == + curitem->qoperator.distance) { - /* - * Lpos is behind the Rpos, so we have to check the - * distance condition - */ - if (WEP_GETPOS(*Rpos) - WEP_GETPOS(*Lpos) <= curitem->qoperator.distance) + /* MATCH! */ + if (data) { - /* MATCH! 
*/ - if (data) - { - *pos_iter = WEP_GETPOS(*Rpos); - pos_iter++; - - break; /* We need to build a unique result - * array, so go to the next Rpos */ - } - else - { - /* - * We are in the root of the phrase tree and hence - * we don't have to store the resulting positions - */ - return true; - } + /* Store position for upper phrase operator */ + *pos_iter = WEP_GETPOS(*Rpos); + pos_iter++; + + /* + * Set left start position to next, because current one + * could not satisfy distance for any other right + * position + */ + LposStart = Lpos + 1; + break; + } + else + { + /* + * We are in the root of the phrase tree and hence + * we don't have to store the resulting positions + */ + return true; } + } - else + else if (WEP_GETPOS(*Rpos) <= WEP_GETPOS(*Lpos) || + WEP_GETPOS(*Rpos) - WEP_GETPOS(*Lpos) < + curitem->qoperator.distance) { /* - * Go to the next Rpos, because Lpos is ahead of the - * current Rpos + * Go to the next Rpos, because Lpos is ahead or on less + * distance than required by current operator */ break; + } Lpos++; diff --git a/src/test/regress/expected/tstypes.out b/src/test/regress/expected/tstypes.out index 64d6de6050..781be70736 100644 --- a/src/test/regress/expected/tstypes.out +++ b/src/test/regress/expected/tstypes.out @@ -665,10 +665,10 @@ SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <-> 2' AS "true"; t (1 row) -SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <2> 2' AS "true"; - true ------- - t +SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <2> 2' AS "false"; + false +------- + f (1 row) SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <-> 3' AS "false"; @@ -683,6 +683,12 @@ SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <2> 3' AS "true"; t (1 row) +SELECT to_tsvector('simple', '1 2 1 2') @@ '1 <3> 2' AS "true"; + true +------ + t +(1 row) + SELECT to_tsvector('simple', '1 2 11 3') @@ '1 <-> 3' AS "false"; false ------- @@ -897,7 +903,7 @@ SELECT ts_rank_cd(' a:1 sa:2A sb:2D g'::tsvector, 'a <-> s:*'); SELECT ts_rank_cd(' a:1 sa:2A sb:2D 
g'::tsvector, 'a <-> s:* <-> sa:A'); ts_rank_cd ------------ - 0.0714286 + 0 (1 row) SELECT ts_rank_cd(' a:1 sa:2A sb:2D g'::tsvector, 'a <-> s:* <-> sa:B'); @@ -924,10 +930,10 @@ SELECT 'a:1 b:2'::tsvector @@ 'a <1> b'::tsquery AS "true"; t (1 row) -SELECT 'a:1 b:2'::tsvector @@ 'a <2> b'::tsquery AS "true"; - true ------- - t +SELECT 'a:1 b:2'::tsvector @@ 'a <2> b'::tsquery AS "false"; + false +------- + f (1 row) SELECT 'a:1 b:3'::tsvector @@ 'a <-> b'::tsquery AS "false"; @@ -954,10 +960,10 @@ SELECT 'a:1 b:3'::tsvector @@ 'a <2> b'::tsquery AS "true"; t (1 row) -SELECT 'a:1 b:3'::tsvector @@ 'a <3> b'::tsquery AS "true"; - true ------- - t +SELECT 'a:1 b:3'::tsvector @@ 'a <3> b'::tsquery AS "false"; + false +------- + f (1 row) -- tsvector editing operations diff --git a/src/test/regress/sql/tstypes.sql b/src/test/regress/sql/tstypes.sql index 738ec824b9..abcf1504ce 100644 --- a/src/test/regress/sql/tstypes.sql +++ b/src/test/regress/sql/tstypes.sql @@ -130,9 +130,10 @@ SELECT 'supeznova supernova'::tsvector @@ 'super:*'::tsquery AS "true"; --phrase search SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <-> 2' AS "true"; -SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <2> 2' AS "true"; +SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <2> 2' AS "false"; SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <-> 3' AS "false"; SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <2> 3' AS "true"; +SELECT to_tsvector('simple', '1 2 1 2') @@ '1 <3> 2' AS "true"; SELECT to_tsvector('simple', '1 2 11 3') @@ '1 <-> 3' AS "false"; SELECT to_tsvector('simple', '1 2 11 3') @@ '1:* <-> 3' AS "true"; @@ -180,12 +181,12 @@ SELECT ts_rank_cd(' a:1 sa:2A sb:2D g'::tsvector, 'a <-> s:* <-> sa:B'); SELECT 'a:1 b:2'::tsvector @@ 'a <-> b'::tsquery AS "true"; SELECT 'a:1 b:2'::tsvector @@ 'a <0> b'::tsquery AS "false"; SELECT 'a:1 b:2'::tsvector @@ 'a <1> b'::tsquery AS "true"; -SELECT 'a:1 b:2'::tsvector @@ 'a <2> b'::tsquery AS "true"; +SELECT 'a:1 b:2'::tsvector @@ 'a <2> b'::tsquery AS 
"false"; SELECT 'a:1 b:3'::tsvector @@ 'a <-> b'::tsquery AS "false"; SELECT 'a:1 b:3'::tsvector @@ 'a <0> b'::tsquery AS "false"; SELECT 'a:1 b:3'::tsvector @@ 'a <1> b'::tsquery AS "false"; SELECT 'a:1 b:3'::tsvector @@ 'a <2> b'::tsquery AS "true"; -SELECT 'a:1 b:3'::tsvector @@ 'a <3> b'::tsquery AS "true"; +SELECT 'a:1 b:3'::tsvector @@ 'a <3> b'::tsquery AS "false"; -- tsvector editing operations -- 2.40.0