From 1664ae1978bf0f5ee940dc2fc8313e6400a7e7da Mon Sep 17 00:00:00 2001 From: Teodor Sigaev Date: Thu, 5 Apr 2018 19:55:11 +0300 Subject: [PATCH] Add websearch_to_tsquery Error-tolerant conversion function with web-like syntax for search query, it simplifies constraining search engine with close to habitual interface for users. Bump catalog version Authors: Victor Drobny, Dmitry Ivanov with editorization by me Reviewed by: Aleksander Alekseev, Tomas Vondra, Thomas Munro, Aleksandr Parfenov Discussion: https://www.postgresql.org/message-id/flat/fe931111ff7e9ad79196486ada79e268@postgrespro.ru --- doc/src/sgml/func.sgml | 12 + doc/src/sgml/textsearch.sgml | 92 +++++- src/backend/tsearch/to_tsany.c | 38 ++- src/backend/utils/adt/tsquery.c | 390 +++++++++++++++++----- src/backend/utils/adt/tsvector.c | 2 +- src/backend/utils/adt/tsvector_parser.c | 36 +- src/include/catalog/catversion.h | 2 +- src/include/catalog/pg_proc.h | 4 + src/include/tsearch/ts_utils.h | 26 +- src/test/regress/expected/tsearch.out | 423 ++++++++++++++++++++++++ src/test/regress/sql/tsearch.sql | 94 ++++++ 11 files changed, 1000 insertions(+), 119 deletions(-) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 9a1efc14cf..122f034f17 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -9630,6 +9630,18 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple phraseto_tsquery('english', 'The Fat Rats') 'fat' <-> 'rat' + + + + websearch_to_tsquery + + websearch_to_tsquery( config regconfig , query text) + + tsquery + produce tsquery from a web search style query + websearch_to_tsquery('english', '"fat rat" or rat') + 'fat' <-> 'rat' | 'rat' + diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml index 610b7bf033..19f58511c8 100644 --- a/doc/src/sgml/textsearch.sgml +++ b/doc/src/sgml/textsearch.sgml @@ -797,13 +797,16 @@ UPDATE tt SET ti = PostgreSQL provides the functions to_tsquery, - plainto_tsquery, and - phraseto_tsquery 
+ plainto_tsquery, + phraseto_tsquery and + websearch_to_tsquery for converting a query to the tsquery data type. to_tsquery offers access to more features than either plainto_tsquery or - phraseto_tsquery, but it is less forgiving - about its input. + phraseto_tsquery, but it is less forgiving about its + input. websearch_to_tsquery is a simplified version + of to_tsquery with an alternative syntax, similar + to the one used by web search engines. @@ -962,6 +965,87 @@ SELECT phraseto_tsquery('english', 'The Fat & Rats:C'); + +websearch_to_tsquery( config regconfig, querytext text) returns tsquery + + + + websearch_to_tsquery creates a tsquery + value from querytext using an alternative + syntax in which simple unformatted text is a valid query. + Unlike plainto_tsquery + and phraseto_tsquery, it also recognizes certain + operators. Moreover, this function should never raise syntax errors, + which makes it possible to use raw user-supplied input for search. + The following syntax is supported: + + + + unquoted text: text not inside quote marks will be + converted to terms separated by & operators, as + if processed by + plainto_tsquery. + + + + + "quoted text": text inside quote marks will be + converted to terms separated by <-> + operators, as if processed by phraseto_tsquery. + + + + + OR: logical or will be converted to + the | operator. + + + + + -: the logical not operator, converted to the + ! operator. 
+ + + + + + Examples: + + select websearch_to_tsquery('english', 'The fat rats'); + websearch_to_tsquery + ----------------- + 'fat' & 'rat' + (1 row) + + + select websearch_to_tsquery('english', '"supernovae stars" -crab'); + websearch_to_tsquery + ---------------------------------- + 'supernova' <-> 'star' & !'crab' + (1 row) + + + select websearch_to_tsquery('english', '"sad cat" or "fat rat"'); + websearch_to_tsquery + ----------------------------------- + 'sad' <-> 'cat' | 'fat' <-> 'rat' + (1 row) + + + select websearch_to_tsquery('english', 'signal -"segmentation fault"'); + websearch_to_tsquery + --------------------------------------- + 'signal' & !( 'segment' <-> 'fault' ) + (1 row) + + + select websearch_to_tsquery('english', '""" )( dummy \\ query <->'); + websearch_to_tsquery + ---------------------- + 'dummi' & 'queri' + (1 row) + + diff --git a/src/backend/tsearch/to_tsany.c b/src/backend/tsearch/to_tsany.c index ea5947a3a8..6055fb6b4e 100644 --- a/src/backend/tsearch/to_tsany.c +++ b/src/backend/tsearch/to_tsany.c @@ -490,7 +490,7 @@ to_tsquery_byid(PG_FUNCTION_ARGS) query = parse_tsquery(text_to_cstring(in), pushval_morph, PointerGetDatum(&data), - false); + 0); PG_RETURN_TSQUERY(query); } @@ -520,7 +520,7 @@ plainto_tsquery_byid(PG_FUNCTION_ARGS) query = parse_tsquery(text_to_cstring(in), pushval_morph, PointerGetDatum(&data), - true); + P_TSQ_PLAIN); PG_RETURN_POINTER(query); } @@ -551,7 +551,7 @@ phraseto_tsquery_byid(PG_FUNCTION_ARGS) query = parse_tsquery(text_to_cstring(in), pushval_morph, PointerGetDatum(&data), - true); + P_TSQ_PLAIN); PG_RETURN_TSQUERY(query); } @@ -567,3 +567,35 @@ phraseto_tsquery(PG_FUNCTION_ARGS) ObjectIdGetDatum(cfgId), PointerGetDatum(in))); } + +Datum +websearch_to_tsquery_byid(PG_FUNCTION_ARGS) +{ + text *in = PG_GETARG_TEXT_PP(1); + MorphOpaque data; + TSQuery query = NULL; + + data.cfg_id = PG_GETARG_OID(0); + + data.qoperator = OP_AND; + + query = parse_tsquery(text_to_cstring(in), + pushval_morph, + 
PointerGetDatum(&data), + P_TSQ_WEB); + + PG_RETURN_TSQUERY(query); +} + +Datum +websearch_to_tsquery(PG_FUNCTION_ARGS) +{ + text *in = PG_GETARG_TEXT_PP(0); + Oid cfgId; + + cfgId = getTSCurrentConfig(true); + PG_RETURN_DATUM(DirectFunctionCall2(websearch_to_tsquery_byid, + ObjectIdGetDatum(cfgId), + PointerGetDatum(in))); + +} diff --git a/src/backend/utils/adt/tsquery.c b/src/backend/utils/adt/tsquery.c index 1ccbf79030..793c0e5dd1 100644 --- a/src/backend/utils/adt/tsquery.c +++ b/src/backend/utils/adt/tsquery.c @@ -32,14 +32,53 @@ const int tsearch_op_priority[OP_COUNT] = 3 /* OP_PHRASE */ }; +/* + * parser's states + */ +typedef enum +{ + WAITOPERAND = 1, + WAITOPERATOR = 2, + WAITFIRSTOPERAND = 3 +} ts_parserstate; + +/* + * token types for parsing + */ +typedef enum +{ + PT_END = 0, + PT_ERR = 1, + PT_VAL = 2, + PT_OPR = 3, + PT_OPEN = 4, + PT_CLOSE = 5 +} ts_tokentype; + +/* + * get token from query string + * + * *operator is filled in with OP_* when return value is PT_OPR, + * but *weight could contain a distance value in case of phrase operator. 
+ * *strval, *lenval and *weight are filled in when return value is PT_VAL + * + */ +typedef ts_tokentype (*ts_tokenizer)(TSQueryParserState state, int8 *operator, + int *lenval, char **strval, + int16 *weight, bool *prefix); + struct TSQueryParserStateData { - /* State for gettoken_query */ + /* Tokenizer used for parsing tsquery */ + ts_tokenizer gettoken; + + /* State of tokenizer function */ char *buffer; /* entire string we are scanning */ char *buf; /* current scan point */ - int state; int count; /* nesting count, incremented by (, * decremented by ) */ + bool in_quotes; /* phrase in quotes "" */ + ts_parserstate state; /* polish (prefix) notation in list, filled in by push* functions */ List *polstr; @@ -57,12 +96,6 @@ struct TSQueryParserStateData TSVectorParseState valstate; }; -/* parser's states */ -#define WAITOPERAND 1 -#define WAITOPERATOR 2 -#define WAITFIRSTOPERAND 3 -#define WAITSINGLEOPERAND 4 - /* * subroutine to parse the modifiers (weight and prefix flag currently) * part, like ':AB*' of a query. 
@@ -118,18 +151,17 @@ get_modifiers(char *buf, int16 *weight, bool *prefix) * * The buffer should begin with '<' char */ -static char * -parse_phrase_operator(char *buf, int16 *distance) +static bool +parse_phrase_operator(TSQueryParserState pstate, int16 *distance) { enum { PHRASE_OPEN = 0, PHRASE_DIST, PHRASE_CLOSE, - PHRASE_ERR, PHRASE_FINISH } state = PHRASE_OPEN; - char *ptr = buf; + char *ptr = pstate->buf; char *endptr; long l = 1; /* default distance */ @@ -138,9 +170,13 @@ parse_phrase_operator(char *buf, int16 *distance) switch (state) { case PHRASE_OPEN: - Assert(t_iseq(ptr, '<')); - state = PHRASE_DIST; - ptr++; + if (t_iseq(ptr, '<')) + { + state = PHRASE_DIST; + ptr++; + } + else + return false; break; case PHRASE_DIST: @@ -148,18 +184,16 @@ parse_phrase_operator(char *buf, int16 *distance) { state = PHRASE_CLOSE; ptr++; - break; + continue; } + if (!t_isdigit(ptr)) - { - state = PHRASE_ERR; - break; - } + return false; errno = 0; l = strtol(ptr, &endptr, 10); if (ptr == endptr) - state = PHRASE_ERR; + return false; else if (errno == ERANGE || l < 0 || l > MAXENTRYPOS) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), @@ -179,54 +213,77 @@ parse_phrase_operator(char *buf, int16 *distance) ptr++; } else - state = PHRASE_ERR; + return false; break; case PHRASE_FINISH: *distance = (int16) l; - return ptr; - - case PHRASE_ERR: - default: - goto err; + pstate->buf = ptr; + return true; } } -err: - *distance = -1; - return buf; + return false; } /* - * token types for parsing + * Parse OR operator used in websearch_to_tsquery(), returns true if we + * believe that "OR" literal could be an operator OR */ -typedef enum +static bool +parse_or_operator(TSQueryParserState pstate) { - PT_END = 0, - PT_ERR = 1, - PT_VAL = 2, - PT_OPR = 3, - PT_OPEN = 4, - PT_CLOSE = 5 -} ts_tokentype; + char *ptr = pstate->buf; + + if (pstate->in_quotes) + return false; + + /* it should begin with "OR" literal */ + if (pg_strncasecmp(ptr, "or", 2) != 0) + return false; + + 
ptr += 2; + + /* + * it shouldn't be a part of any word, and somewhere later there should be + * some operand + */ + if (*ptr == '\0') /* no operand */ + return false; + + /* it shouldn't be a part of any word */ + if (t_iseq(ptr, '-') || t_iseq(ptr, '_') || t_isalpha(ptr) || t_isdigit(ptr)) + return false; + + for(;;) + { + ptr += pg_mblen(ptr); + + if (*ptr == '\0') /* got end of string without operand */ + return false; + + /* + * Suppose, we found an operand, but it could be an incorrect operand. So + * we still treat the OR literal as an operator with a possibly incorrect + * operand and will not search for it as a lexeme + */ + if (!t_isspace(ptr)) + break; + } + + pstate->buf += 2; + return true; +} -/* - * get token from query string - * - * *operator is filled in with OP_* when return values is PT_OPR, - * but *weight could contain a distance value in case of phrase operator. - * *strval, *lenval and *weight are filled in when return value is PT_VAL - * - */ static ts_tokentype -gettoken_query(TSQueryParserState state, - int8 *operator, - int *lenval, char **strval, int16 *weight, bool *prefix) +gettoken_query_standard(TSQueryParserState state, int8 *operator, + int *lenval, char **strval, + int16 *weight, bool *prefix) { *weight = 0; *prefix = false; - while (1) + while (true) { switch (state->state) { @@ -234,17 +291,16 @@ gettoken_query(TSQueryParserState state, case WAITOPERAND: if (t_iseq(state->buf, '!')) { - (state->buf)++; /* can safely ++, t_iseq guarantee that - * pg_mblen()==1 */ - *operator = OP_NOT; + state->buf++; state->state = WAITOPERAND; + *operator = OP_NOT; return PT_OPR; } else if (t_iseq(state->buf, '(')) { - state->count++; - (state->buf)++; + state->buf++; state->state = WAITOPERAND; + state->count++; return PT_OPEN; } else if (t_iseq(state->buf, ':')) @@ -256,19 +312,19 @@ gettoken_query(TSQueryParserState state, } else if (!t_isspace(state->buf)) { - /* - * We rely on the tsvector parser to parse the value for - * us - */ + /* We rely on the tsvector 
parser to parse the value for us */ reset_tsvector_parser(state->valstate, state->buf); - if (gettoken_tsvector(state->valstate, strval, lenval, NULL, NULL, &state->buf)) + if (gettoken_tsvector(state->valstate, strval, lenval, + NULL, NULL, &state->buf)) { state->buf = get_modifiers(state->buf, weight, prefix); state->state = WAITOPERATOR; return PT_VAL; } else if (state->state == WAITFIRSTOPERAND) + { return PT_END; + } else ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), @@ -276,58 +332,206 @@ gettoken_query(TSQueryParserState state, state->buffer))); } break; + case WAITOPERATOR: if (t_iseq(state->buf, '&')) { + state->buf++; state->state = WAITOPERAND; *operator = OP_AND; - (state->buf)++; return PT_OPR; } else if (t_iseq(state->buf, '|')) { + state->buf++; state->state = WAITOPERAND; *operator = OP_OR; - (state->buf)++; return PT_OPR; } - else if (t_iseq(state->buf, '<')) + else if (parse_phrase_operator(state, weight)) { + /* weight var is used as storage for distance */ state->state = WAITOPERAND; *operator = OP_PHRASE; - /* weight var is used as storage for distance */ - state->buf = parse_phrase_operator(state->buf, weight); - if (*weight < 0) - return PT_ERR; return PT_OPR; } else if (t_iseq(state->buf, ')')) { - (state->buf)++; + state->buf++; state->count--; return (state->count < 0) ? PT_ERR : PT_CLOSE; } - else if (*(state->buf) == '\0') + else if (*state->buf == '\0') + { return (state->count) ? 
PT_ERR : PT_END; + } else if (!t_isspace(state->buf)) + { return PT_ERR; + } + break; + } + + state->buf += pg_mblen(state->buf); + } +} + +static ts_tokentype +gettoken_query_websearch(TSQueryParserState state, int8 *operator, + int *lenval, char **strval, + int16 *weight, bool *prefix) +{ + *weight = 0; + *prefix = false; + + while (true) + { + switch (state->state) + { + case WAITFIRSTOPERAND: + case WAITOPERAND: + if (t_iseq(state->buf, '-')) + { + state->buf++; + state->state = WAITOPERAND; + + if (state->in_quotes) + continue; + + *operator = OP_NOT; + return PT_OPR; + } + else if (t_iseq(state->buf, '"')) + { + state->buf++; + + if (!state->in_quotes) + { + state->state = WAITOPERAND; + + if (strchr(state->buf, '"')) + { + /* quoted text should be ordered <-> */ + state->in_quotes = true; + return PT_OPEN; + } + + /* web search tolerates missing quotes */ + continue; + } + else + { + /* we have to provide an operand */ + state->in_quotes = false; + state->state = WAITOPERATOR; + pushStop(state); + return PT_CLOSE; + } + } + else if (ISOPERATOR(state->buf)) + { + /* or else gettoken_tsvector() will raise an error */ + state->buf++; + state->state = WAITOPERAND; + continue; + } + else if (!t_isspace(state->buf)) + { + /* We rely on the tsvector parser to parse the value for us */ + reset_tsvector_parser(state->valstate, state->buf); + if (gettoken_tsvector(state->valstate, strval, lenval, + NULL, NULL, &state->buf)) + { + state->state = WAITOPERATOR; + return PT_VAL; + } + else if (state->state == WAITFIRSTOPERAND) + { + return PT_END; + } + else + { + /* finally, we have to provide an operand */ + pushStop(state); + return PT_END; + } + } break; - case WAITSINGLEOPERAND: - if (*(state->buf) == '\0') + + case WAITOPERATOR: + if (t_iseq(state->buf, '"')) + { + if (!state->in_quotes) + { + /* + * put implicit AND after an operand + * and handle this quote in WAITOPERAND + */ + state->state = WAITOPERAND; + *operator = OP_AND; + return PT_OPR; + } + else + { + 
state->buf++; + + /* just close quotes */ + state->in_quotes = false; + return PT_CLOSE; + } + } + else if (parse_or_operator(state)) + { + state->state = WAITOPERAND; + *operator = OP_OR; + return PT_OPR; + } + else if (*state->buf == '\0') + { return PT_END; - *strval = state->buf; - *lenval = strlen(state->buf); - state->buf += strlen(state->buf); - state->count++; - return PT_VAL; - default: - return PT_ERR; + } + else if (!t_isspace(state->buf)) + { + if (state->in_quotes) + { + /* put implicit <-> after an operand */ + *operator = OP_PHRASE; + *weight = 1; + } + else + { + /* put implicit AND after an operand */ + *operator = OP_AND; + } + + state->state = WAITOPERAND; + return PT_OPR; + } break; } + state->buf += pg_mblen(state->buf); } } +static ts_tokentype +gettoken_query_plain(TSQueryParserState state, int8 *operator, + int *lenval, char **strval, + int16 *weight, bool *prefix) +{ + *weight = 0; + *prefix = false; + + if (*state->buf == '\0') + return PT_END; + + *strval = state->buf; + *lenval = strlen(state->buf); + state->buf += *lenval; + state->count++; + return PT_VAL; +} + /* * Push an operator to state->polstr */ @@ -489,7 +693,9 @@ makepol(TSQueryParserState state, /* since this function recurses, it could be driven to stack overflow */ check_stack_depth(); - while ((type = gettoken_query(state, &operator, &lenval, &strval, &weight, &prefix)) != PT_END) + while ((type = state->gettoken(state, &operator, + &lenval, &strval, + &weight, &prefix)) != PT_END) { switch (type) { @@ -605,7 +811,7 @@ TSQuery parse_tsquery(char *buf, PushFunction pushval, Datum opaque, - bool isplain) + int flags) { struct TSQueryParserStateData state; int i; @@ -614,16 +820,32 @@ parse_tsquery(char *buf, QueryItem *ptr; ListCell *cell; bool needcleanup; + int tsv_flags = P_TSV_OPR_IS_DELIM | P_TSV_IS_TSQUERY; + + /* plain should not be used with web */ + Assert((flags & (P_TSQ_PLAIN | P_TSQ_WEB)) != (P_TSQ_PLAIN | P_TSQ_WEB)); + + /* select suitable tokenizer */ + if 
(flags & P_TSQ_PLAIN) + state.gettoken = gettoken_query_plain; + else if (flags & P_TSQ_WEB) + { + state.gettoken = gettoken_query_websearch; + tsv_flags |= P_TSV_IS_WEB; + } + else + state.gettoken = gettoken_query_standard; /* init state */ state.buffer = buf; state.buf = buf; - state.state = (isplain) ? WAITSINGLEOPERAND : WAITFIRSTOPERAND; state.count = 0; + state.in_quotes = false; + state.state = WAITFIRSTOPERAND; state.polstr = NIL; /* init value parser's state */ - state.valstate = init_tsvector_parser(state.buffer, true, true); + state.valstate = init_tsvector_parser(state.buffer, tsv_flags); /* init list of operand */ state.sumlen = 0; @@ -716,7 +938,7 @@ tsqueryin(PG_FUNCTION_ARGS) { char *in = PG_GETARG_CSTRING(0); - PG_RETURN_TSQUERY(parse_tsquery(in, pushval_asis, PointerGetDatum(NULL), false)); + PG_RETURN_TSQUERY(parse_tsquery(in, pushval_asis, PointerGetDatum(NULL), 0)); } /* diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c index 64e02ef434..7a27bd12a3 100644 --- a/src/backend/utils/adt/tsvector.c +++ b/src/backend/utils/adt/tsvector.c @@ -200,7 +200,7 @@ tsvectorin(PG_FUNCTION_ARGS) char *cur; int buflen = 256; /* allocated size of tmpbuf */ - state = init_tsvector_parser(buf, false, false); + state = init_tsvector_parser(buf, 0); arrlen = 64; arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen); diff --git a/src/backend/utils/adt/tsvector_parser.c b/src/backend/utils/adt/tsvector_parser.c index 7367ba6a40..fed411a842 100644 --- a/src/backend/utils/adt/tsvector_parser.c +++ b/src/backend/utils/adt/tsvector_parser.c @@ -33,6 +33,7 @@ struct TSVectorParseStateData int eml; /* max bytes per character */ bool oprisdelim; /* treat ! | * ( ) as delimiters? */ bool is_tsquery; /* say "tsquery" not "tsvector" in errors? */ + bool is_web; /* we're in websearch_to_tsquery() */ }; @@ -42,7 +43,7 @@ struct TSVectorParseStateData * ! 
| & ( ) */ TSVectorParseState -init_tsvector_parser(char *input, bool oprisdelim, bool is_tsquery) +init_tsvector_parser(char *input, int flags) { TSVectorParseState state; @@ -52,8 +53,9 @@ init_tsvector_parser(char *input, bool oprisdelim, bool is_tsquery) state->len = 32; state->word = (char *) palloc(state->len); state->eml = pg_database_encoding_max_length(); - state->oprisdelim = oprisdelim; - state->is_tsquery = is_tsquery; + state->oprisdelim = (flags & P_TSV_OPR_IS_DELIM) != 0; + state->is_tsquery = (flags & P_TSV_IS_TSQUERY) != 0; + state->is_web = (flags & P_TSV_IS_WEB) != 0; return state; } @@ -89,16 +91,6 @@ do { \ } \ } while (0) -/* phrase operator begins with '<' */ -#define ISOPERATOR(x) \ - ( pg_mblen(x) == 1 && ( *(x) == '!' || \ - *(x) == '&' || \ - *(x) == '|' || \ - *(x) == '(' || \ - *(x) == ')' || \ - *(x) == '<' \ - ) ) - /* Fills gettoken_tsvector's output parameters, and returns true */ #define RETURN_TOKEN \ do { \ @@ -183,14 +175,15 @@ gettoken_tsvector(TSVectorParseState state, { if (*(state->prsbuf) == '\0') return false; - else if (t_iseq(state->prsbuf, '\'')) + else if (!state->is_web && t_iseq(state->prsbuf, '\'')) statecode = WAITENDCMPLX; - else if (t_iseq(state->prsbuf, '\\')) + else if (!state->is_web && t_iseq(state->prsbuf, '\\')) { statecode = WAITNEXTCHAR; oldstate = WAITENDWORD; } - else if (state->oprisdelim && ISOPERATOR(state->prsbuf)) + else if ((state->oprisdelim && ISOPERATOR(state->prsbuf)) || + (state->is_web && t_iseq(state->prsbuf, '"'))) PRSSYNTAXERROR; else if (!t_isspace(state->prsbuf)) { @@ -217,13 +210,14 @@ gettoken_tsvector(TSVectorParseState state, } else if (statecode == WAITENDWORD) { - if (t_iseq(state->prsbuf, '\\')) + if (!state->is_web && t_iseq(state->prsbuf, '\\')) { statecode = WAITNEXTCHAR; oldstate = WAITENDWORD; } else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' || - (state->oprisdelim && ISOPERATOR(state->prsbuf))) + (state->oprisdelim && ISOPERATOR(state->prsbuf)) || + 
(state->is_web && t_iseq(state->prsbuf, '"'))) { RESIZEPRSBUF; if (curpos == state->word) @@ -250,11 +244,11 @@ gettoken_tsvector(TSVectorParseState state, } else if (statecode == WAITENDCMPLX) { - if (t_iseq(state->prsbuf, '\'')) + if (!state->is_web && t_iseq(state->prsbuf, '\'')) { statecode = WAITCHARCMPLX; } - else if (t_iseq(state->prsbuf, '\\')) + else if (!state->is_web && t_iseq(state->prsbuf, '\\')) { statecode = WAITNEXTCHAR; oldstate = WAITENDCMPLX; @@ -270,7 +264,7 @@ gettoken_tsvector(TSVectorParseState state, } else if (statecode == WAITCHARCMPLX) { - if (t_iseq(state->prsbuf, '\'')) + if (!state->is_web && t_iseq(state->prsbuf, '\'')) { RESIZEPRSBUF; COPYCHAR(curpos, state->prsbuf); diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 5d55890b9d..5f63efc355 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 201804031 +#define CATALOG_VERSION_NO 201804051 #endif diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 9bf20c059b..edf212fcf0 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -4971,6 +4971,8 @@ DATA(insert OID = 3747 ( plainto_tsquery PGNSP PGUID 12 100 0 0 0 f f f t f i s DESCR("transform to tsquery"); DATA(insert OID = 5006 ( phraseto_tsquery PGNSP PGUID 12 100 0 0 0 f f f t f i s 2 0 3615 "3734 25" _null_ _null_ _null_ _null_ _null_ phraseto_tsquery_byid _null_ _null_ _null_ )); DESCR("transform to tsquery"); +DATA(insert OID = 8889 ( websearch_to_tsquery PGNSP PGUID 12 100 0 0 0 f f f t f i s 2 0 3615 "3734 25" _null_ _null_ _null_ _null_ _null_ websearch_to_tsquery_byid _null_ _null_ _null_ )); +DESCR("transform to tsquery"); DATA(insert OID = 3749 ( to_tsvector PGNSP PGUID 12 100 0 0 0 f f f t f s s 1 0 3614 "25" _null_ _null_ _null_ _null_ _null_ to_tsvector _null_ _null_ _null_ )); DESCR("transform to tsvector"); DATA(insert OID = 3750 ( 
to_tsquery PGNSP PGUID 12 100 0 0 0 f f f t f s s 1 0 3615 "25" _null_ _null_ _null_ _null_ _null_ to_tsquery _null_ _null_ _null_ )); @@ -4979,6 +4981,8 @@ DATA(insert OID = 3751 ( plainto_tsquery PGNSP PGUID 12 100 0 0 0 f f f t f s s DESCR("transform to tsquery"); DATA(insert OID = 5001 ( phraseto_tsquery PGNSP PGUID 12 100 0 0 0 f f f t f s s 1 0 3615 "25" _null_ _null_ _null_ _null_ _null_ phraseto_tsquery _null_ _null_ _null_ )); DESCR("transform to tsquery"); +DATA(insert OID = 8890 ( websearch_to_tsquery PGNSP PGUID 12 100 0 0 0 f f f t f s s 1 0 3615 "25" _null_ _null_ _null_ _null_ _null_ websearch_to_tsquery _null_ _null_ _null_ )); +DESCR("transform to tsquery"); DATA(insert OID = 4209 ( to_tsvector PGNSP PGUID 12 100 0 0 0 f f f t f s s 1 0 3614 "3802" _null_ _null_ _null_ _null_ _null_ jsonb_to_tsvector _null_ _null_ _null_ )); DESCR("transform jsonb to tsvector"); DATA(insert OID = 4210 ( to_tsvector PGNSP PGUID 12 100 0 0 0 f f f t f s s 1 0 3614 "114" _null_ _null_ _null_ _null_ _null_ json_to_tsvector _null_ _null_ _null_ )); diff --git a/src/include/tsearch/ts_utils.h b/src/include/tsearch/ts_utils.h index f8ddce5ecb..73e969fe9c 100644 --- a/src/include/tsearch/ts_utils.h +++ b/src/include/tsearch/ts_utils.h @@ -25,9 +25,11 @@ struct TSVectorParseStateData; /* opaque struct in tsvector_parser.c */ typedef struct TSVectorParseStateData *TSVectorParseState; -extern TSVectorParseState init_tsvector_parser(char *input, - bool oprisdelim, - bool is_tsquery); +#define P_TSV_OPR_IS_DELIM (1 << 0) +#define P_TSV_IS_TSQUERY (1 << 1) +#define P_TSV_IS_WEB (1 << 2) + +extern TSVectorParseState init_tsvector_parser(char *input, int flags); extern void reset_tsvector_parser(TSVectorParseState state, char *input); extern bool gettoken_tsvector(TSVectorParseState state, char **token, int *len, @@ -35,6 +37,16 @@ extern bool gettoken_tsvector(TSVectorParseState state, char **endptr); extern void close_tsvector_parser(TSVectorParseState state); +/* phrase 
operator begins with '<' */ +#define ISOPERATOR(x) \ + ( pg_mblen(x) == 1 && ( *(x) == '!' || \ + *(x) == '&' || \ + *(x) == '|' || \ + *(x) == '(' || \ + *(x) == ')' || \ + *(x) == '<' \ + ) ) + /* parse_tsquery */ struct TSQueryParserStateData; /* private in backend/utils/adt/tsquery.c */ @@ -46,9 +58,13 @@ typedef void (*PushFunction) (Datum opaque, TSQueryParserState state, * QueryOperand struct */ bool prefix); +#define P_TSQ_PLAIN (1 << 0) +#define P_TSQ_WEB (1 << 1) + extern TSQuery parse_tsquery(char *buf, - PushFunction pushval, - Datum opaque, bool isplain); + PushFunction pushval, + Datum opaque, + int flags); /* Functions for use by PushFunction implementations */ extern void pushValue(TSQueryParserState state, diff --git a/src/test/regress/expected/tsearch.out b/src/test/regress/expected/tsearch.out index d63fb12f1d..c38237c8a4 100644 --- a/src/test/regress/expected/tsearch.out +++ b/src/test/regress/expected/tsearch.out @@ -1672,3 +1672,426 @@ select * from phrase_index_test where fts @@ phraseto_tsquery('english', 'fat ca (1 row) set enable_seqscan = on; +-- test websearch_to_tsquery function +select websearch_to_tsquery('simple', 'I have a fat:*ABCD cat'); + websearch_to_tsquery +--------------------------------------------- + 'i' & 'have' & 'a' & 'fat' & 'abcd' & 'cat' +(1 row) + +select websearch_to_tsquery('simple', 'orange:**AABBCCDD'); + websearch_to_tsquery +----------------------- + 'orange' & 'aabbccdd' +(1 row) + +select websearch_to_tsquery('simple', 'fat:A!cat:B|rat:C<'); + websearch_to_tsquery +----------------------------------------- + 'fat' & 'a' & 'cat' & 'b' & 'rat' & 'c' +(1 row) + +select websearch_to_tsquery('simple', 'fat:A : cat:B'); + websearch_to_tsquery +--------------------------- + 'fat' & 'a' & 'cat' & 'b' +(1 row) + +select websearch_to_tsquery('simple', 'fat*rat'); + websearch_to_tsquery +---------------------- + 'fat' & 'rat' +(1 row) + +select websearch_to_tsquery('simple', 'fat-rat'); + websearch_to_tsquery 
+--------------------------- + 'fat-rat' & 'fat' & 'rat' +(1 row) + +select websearch_to_tsquery('simple', 'fat_rat'); + websearch_to_tsquery +---------------------- + 'fat' & 'rat' +(1 row) + +-- weights are completely ignored +select websearch_to_tsquery('simple', 'abc : def'); + websearch_to_tsquery +---------------------- + 'abc' & 'def' +(1 row) + +select websearch_to_tsquery('simple', 'abc:def'); + websearch_to_tsquery +---------------------- + 'abc' & 'def' +(1 row) + +select websearch_to_tsquery('simple', 'a:::b'); + websearch_to_tsquery +---------------------- + 'a' & 'b' +(1 row) + +select websearch_to_tsquery('simple', 'abc:d'); + websearch_to_tsquery +---------------------- + 'abc' & 'd' +(1 row) + +select websearch_to_tsquery('simple', ':'); +NOTICE: text-search query contains only stop words or doesn't contain lexemes, ignored + websearch_to_tsquery +---------------------- + +(1 row) + +-- these operators are ignored +select websearch_to_tsquery('simple', 'abc & def'); + websearch_to_tsquery +---------------------- + 'abc' & 'def' +(1 row) + +select websearch_to_tsquery('simple', 'abc | def'); + websearch_to_tsquery +---------------------- + 'abc' & 'def' +(1 row) + +select websearch_to_tsquery('simple', 'abc <-> def'); + websearch_to_tsquery +---------------------- + 'abc' & 'def' +(1 row) + +select websearch_to_tsquery('simple', 'abc (pg or class)'); + websearch_to_tsquery +------------------------ + 'abc' & 'pg' | 'class' +(1 row) + +-- NOT is ignored in quotes +select websearch_to_tsquery('english', 'My brand new smartphone'); + websearch_to_tsquery +------------------------------- + 'brand' & 'new' & 'smartphon' +(1 row) + +select websearch_to_tsquery('english', 'My brand "new smartphone"'); + websearch_to_tsquery +--------------------------------- + 'brand' & 'new' <-> 'smartphon' +(1 row) + +select websearch_to_tsquery('english', 'My brand "new -smartphone"'); + websearch_to_tsquery +--------------------------------- + 'brand' & 'new' <-> 
'smartphon' +(1 row) + +-- test OR operator +select websearch_to_tsquery('simple', 'cat or rat'); + websearch_to_tsquery +---------------------- + 'cat' | 'rat' +(1 row) + +select websearch_to_tsquery('simple', 'cat OR rat'); + websearch_to_tsquery +---------------------- + 'cat' | 'rat' +(1 row) + +select websearch_to_tsquery('simple', 'cat "OR" rat'); + websearch_to_tsquery +---------------------- + 'cat' & 'or' & 'rat' +(1 row) + +select websearch_to_tsquery('simple', 'cat OR'); + websearch_to_tsquery +---------------------- + 'cat' & 'or' +(1 row) + +select websearch_to_tsquery('simple', 'OR rat'); + websearch_to_tsquery +---------------------- + 'or' & 'rat' +(1 row) + +select websearch_to_tsquery('simple', '"fat cat OR rat"'); + websearch_to_tsquery +------------------------------------ + 'fat' <-> 'cat' <-> 'or' <-> 'rat' +(1 row) + +select websearch_to_tsquery('simple', 'fat (cat OR rat'); + websearch_to_tsquery +----------------------- + 'fat' & 'cat' | 'rat' +(1 row) + +select websearch_to_tsquery('simple', 'or OR or'); + websearch_to_tsquery +---------------------- + 'or' | 'or' +(1 row) + +-- OR is an operator here ... 
+select websearch_to_tsquery('simple', '"fat cat"or"fat rat"'); + websearch_to_tsquery +----------------------------------- + 'fat' <-> 'cat' | 'fat' <-> 'rat' +(1 row) + +select websearch_to_tsquery('simple', 'fat or(rat'); + websearch_to_tsquery +---------------------- + 'fat' | 'rat' +(1 row) + +select websearch_to_tsquery('simple', 'fat or)rat'); + websearch_to_tsquery +---------------------- + 'fat' | 'rat' +(1 row) + +select websearch_to_tsquery('simple', 'fat or&rat'); + websearch_to_tsquery +---------------------- + 'fat' | 'rat' +(1 row) + +select websearch_to_tsquery('simple', 'fat or|rat'); + websearch_to_tsquery +---------------------- + 'fat' | 'rat' +(1 row) + +select websearch_to_tsquery('simple', 'fat or!rat'); + websearch_to_tsquery +---------------------- + 'fat' | 'rat' +(1 row) + +select websearch_to_tsquery('simple', 'fat orrat'); + websearch_to_tsquery +---------------------- + 'fat' | 'rat' +(1 row) + +select websearch_to_tsquery('simple', 'fat or '); + websearch_to_tsquery +---------------------- + 'fat' & 'or' +(1 row) + +-- ... 
but not here +select websearch_to_tsquery('simple', 'abc orange'); + websearch_to_tsquery +---------------------- + 'abc' & 'orange' +(1 row) + +select websearch_to_tsquery('simple', 'abc orтест'); + websearch_to_tsquery +---------------------- + 'abc' & 'orтест' +(1 row) + +select websearch_to_tsquery('simple', 'abc OR1234'); + websearch_to_tsquery +---------------------- + 'abc' & 'or1234' +(1 row) + +select websearch_to_tsquery('simple', 'abc or-abc'); + websearch_to_tsquery +--------------------------------- + 'abc' & 'or-abc' & 'or' & 'abc' +(1 row) + +select websearch_to_tsquery('simple', 'abc OR_abc'); + websearch_to_tsquery +---------------------- + 'abc' & 'or' & 'abc' +(1 row) + +-- test quotes +select websearch_to_tsquery('english', '"pg_class pg'); + websearch_to_tsquery +----------------------- + 'pg' & 'class' & 'pg' +(1 row) + +select websearch_to_tsquery('english', 'pg_class pg"'); + websearch_to_tsquery +----------------------- + 'pg' & 'class' & 'pg' +(1 row) + +select websearch_to_tsquery('english', '"pg_class pg"'); + websearch_to_tsquery +----------------------------- + ( 'pg' & 'class' ) <-> 'pg' +(1 row) + +select websearch_to_tsquery('english', 'abc "pg_class pg"'); + websearch_to_tsquery +------------------------------------- + 'abc' & ( 'pg' & 'class' ) <-> 'pg' +(1 row) + +select websearch_to_tsquery('english', '"pg_class pg" def'); + websearch_to_tsquery +------------------------------------- + ( 'pg' & 'class' ) <-> 'pg' & 'def' +(1 row) + +select websearch_to_tsquery('english', 'abc "pg pg_class pg" def'); + websearch_to_tsquery +------------------------------------------------------ + 'abc' & 'pg' <-> ( 'pg' & 'class' ) <-> 'pg' & 'def' +(1 row) + +select websearch_to_tsquery('english', ' or "pg pg_class pg" or '); + websearch_to_tsquery +-------------------------------------- + 'pg' <-> ( 'pg' & 'class' ) <-> 'pg' +(1 row) + +select websearch_to_tsquery('english', '""pg pg_class pg""'); + websearch_to_tsquery 
+------------------------------ + 'pg' & 'pg' & 'class' & 'pg' +(1 row) + +select websearch_to_tsquery('english', 'abc """"" def'); + websearch_to_tsquery +---------------------- + 'abc' & 'def' +(1 row) + +select websearch_to_tsquery('english', 'cat -"fat rat"'); + websearch_to_tsquery +------------------------------ + 'cat' & !( 'fat' <-> 'rat' ) +(1 row) + +select websearch_to_tsquery('english', 'cat -"fat rat" cheese'); + websearch_to_tsquery +---------------------------------------- + 'cat' & !( 'fat' <-> 'rat' ) & 'chees' +(1 row) + +select websearch_to_tsquery('english', 'abc "def -"'); + websearch_to_tsquery +---------------------- + 'abc' & 'def' +(1 row) + +select websearch_to_tsquery('english', 'abc "def :"'); + websearch_to_tsquery +---------------------- + 'abc' & 'def' +(1 row) + +select websearch_to_tsquery('english', '"A fat cat" has just eaten a -rat.'); + websearch_to_tsquery +------------------------------------ + 'fat' <-> 'cat' & 'eaten' & !'rat' +(1 row) + +select websearch_to_tsquery('english', '"A fat cat" has just eaten OR !rat.'); + websearch_to_tsquery +----------------------------------- + 'fat' <-> 'cat' & 'eaten' | 'rat' +(1 row) + +select websearch_to_tsquery('english', '"A fat cat" has just (+eaten OR -rat)'); + websearch_to_tsquery +------------------------------------ + 'fat' <-> 'cat' & 'eaten' | !'rat' +(1 row) + +select websearch_to_tsquery('english', 'this is ----fine'); + websearch_to_tsquery +---------------------- + !!!!'fine' +(1 row) + +select websearch_to_tsquery('english', '(()) )))) this ||| is && -fine, "dear friend" OR good'); + websearch_to_tsquery +---------------------------------------- + !'fine' & 'dear' <-> 'friend' | 'good' +(1 row) + +select websearch_to_tsquery('english', 'an old <-> cat " is fine &&& too'); + websearch_to_tsquery +------------------------ + 'old' & 'cat' & 'fine' +(1 row) + +select websearch_to_tsquery('english', '"A the" OR just on'); +NOTICE: text-search query contains only stop words or 
doesn't contain lexemes, ignored + websearch_to_tsquery +---------------------- + +(1 row) + +select websearch_to_tsquery('english', '"a fat cat" ate a rat'); + websearch_to_tsquery +--------------------------------- + 'fat' <-> 'cat' & 'ate' & 'rat' +(1 row) + +select to_tsvector('english', 'A fat cat ate a rat') @@ + websearch_to_tsquery('english', '"a fat cat" ate a rat'); + ?column? +---------- + t +(1 row) + +select to_tsvector('english', 'A fat grey cat ate a rat') @@ + websearch_to_tsquery('english', '"a fat cat" ate a rat'); + ?column? +---------- + f +(1 row) + +-- cases handled by gettoken_tsvector() +select websearch_to_tsquery(''''); +NOTICE: text-search query contains only stop words or doesn't contain lexemes, ignored + websearch_to_tsquery +---------------------- + +(1 row) + +select websearch_to_tsquery('''abc''''def'''); + websearch_to_tsquery +---------------------- + 'abc' & 'def' +(1 row) + +select websearch_to_tsquery('\abc'); + websearch_to_tsquery +---------------------- + 'abc' +(1 row) + +select websearch_to_tsquery('\'); +NOTICE: text-search query contains only stop words or doesn't contain lexemes, ignored + websearch_to_tsquery +---------------------- + +(1 row) + diff --git a/src/test/regress/sql/tsearch.sql b/src/test/regress/sql/tsearch.sql index 1c8520b3e9..1768541f21 100644 --- a/src/test/regress/sql/tsearch.sql +++ b/src/test/regress/sql/tsearch.sql @@ -539,3 +539,97 @@ create index phrase_index_test_idx on phrase_index_test using gin(fts); set enable_seqscan = off; select * from phrase_index_test where fts @@ phraseto_tsquery('english', 'fat cat'); set enable_seqscan = on; + +-- test websearch_to_tsquery function +select websearch_to_tsquery('simple', 'I have a fat:*ABCD cat'); +select websearch_to_tsquery('simple', 'orange:**AABBCCDD'); +select websearch_to_tsquery('simple', 'fat:A!cat:B|rat:C<'); +select websearch_to_tsquery('simple', 'fat:A : cat:B'); + +select websearch_to_tsquery('simple', 'fat*rat'); +select 
websearch_to_tsquery('simple', 'fat-rat'); +select websearch_to_tsquery('simple', 'fat_rat'); + +-- weights are completely ignored +select websearch_to_tsquery('simple', 'abc : def'); +select websearch_to_tsquery('simple', 'abc:def'); +select websearch_to_tsquery('simple', 'a:::b'); +select websearch_to_tsquery('simple', 'abc:d'); +select websearch_to_tsquery('simple', ':'); + +-- these operators are ignored +select websearch_to_tsquery('simple', 'abc & def'); +select websearch_to_tsquery('simple', 'abc | def'); +select websearch_to_tsquery('simple', 'abc <-> def'); +select websearch_to_tsquery('simple', 'abc (pg or class)'); + +-- NOT is ignored in quotes +select websearch_to_tsquery('english', 'My brand new smartphone'); +select websearch_to_tsquery('english', 'My brand "new smartphone"'); +select websearch_to_tsquery('english', 'My brand "new -smartphone"'); + +-- test OR operator +select websearch_to_tsquery('simple', 'cat or rat'); +select websearch_to_tsquery('simple', 'cat OR rat'); +select websearch_to_tsquery('simple', 'cat "OR" rat'); +select websearch_to_tsquery('simple', 'cat OR'); +select websearch_to_tsquery('simple', 'OR rat'); +select websearch_to_tsquery('simple', '"fat cat OR rat"'); +select websearch_to_tsquery('simple', 'fat (cat OR rat'); +select websearch_to_tsquery('simple', 'or OR or'); + +-- OR is an operator here ... +select websearch_to_tsquery('simple', '"fat cat"or"fat rat"'); +select websearch_to_tsquery('simple', 'fat or(rat'); +select websearch_to_tsquery('simple', 'fat or)rat'); +select websearch_to_tsquery('simple', 'fat or&rat'); +select websearch_to_tsquery('simple', 'fat or|rat'); +select websearch_to_tsquery('simple', 'fat or!rat'); +select websearch_to_tsquery('simple', 'fat or<rat'); +select websearch_to_tsquery('simple', 'fat or>rat'); +select websearch_to_tsquery('simple', 'fat or '); + +-- ... 
but not here +select websearch_to_tsquery('simple', 'abc orange'); +select websearch_to_tsquery('simple', 'abc orтест'); +select websearch_to_tsquery('simple', 'abc OR1234'); +select websearch_to_tsquery('simple', 'abc or-abc'); +select websearch_to_tsquery('simple', 'abc OR_abc'); + +-- test quotes +select websearch_to_tsquery('english', '"pg_class pg'); +select websearch_to_tsquery('english', 'pg_class pg"'); +select websearch_to_tsquery('english', '"pg_class pg"'); +select websearch_to_tsquery('english', 'abc "pg_class pg"'); +select websearch_to_tsquery('english', '"pg_class pg" def'); +select websearch_to_tsquery('english', 'abc "pg pg_class pg" def'); +select websearch_to_tsquery('english', ' or "pg pg_class pg" or '); +select websearch_to_tsquery('english', '""pg pg_class pg""'); +select websearch_to_tsquery('english', 'abc """"" def'); +select websearch_to_tsquery('english', 'cat -"fat rat"'); +select websearch_to_tsquery('english', 'cat -"fat rat" cheese'); +select websearch_to_tsquery('english', 'abc "def -"'); +select websearch_to_tsquery('english', 'abc "def :"'); + +select websearch_to_tsquery('english', '"A fat cat" has just eaten a -rat.'); +select websearch_to_tsquery('english', '"A fat cat" has just eaten OR !rat.'); +select websearch_to_tsquery('english', '"A fat cat" has just (+eaten OR -rat)'); + +select websearch_to_tsquery('english', 'this is ----fine'); +select websearch_to_tsquery('english', '(()) )))) this ||| is && -fine, "dear friend" OR good'); +select websearch_to_tsquery('english', 'an old <-> cat " is fine &&& too'); + +select websearch_to_tsquery('english', '"A the" OR just on'); +select websearch_to_tsquery('english', '"a fat cat" ate a rat'); + +select to_tsvector('english', 'A fat cat ate a rat') @@ + websearch_to_tsquery('english', '"a fat cat" ate a rat'); + +select to_tsvector('english', 'A fat grey cat ate a rat') @@ + websearch_to_tsquery('english', '"a fat cat" ate a rat'); + +-- cases handled by gettoken_tsvector() +select 
websearch_to_tsquery(''''); +select websearch_to_tsquery('''abc''''def'''); +select websearch_to_tsquery('\abc'); +select websearch_to_tsquery('\'); -- 2.40.0