From: Teodor Sigaev Date: Fri, 14 Aug 2009 14:53:20 +0000 (+0000) Subject: Add prefix support for synonym dictionary X-Git-Tag: REL8_5_ALPHA1~24 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=abd8c94ff98091a5134d905258c2835e72b2dbd8;p=postgresql Add prefix support for synonym dictionary --- diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml index 547c0153ac..ed78c1d10a 100644 --- a/doc/src/sgml/textsearch.sgml +++ b/doc/src/sgml/textsearch.sgml @@ -1,4 +1,4 @@ - + Full Text Search @@ -2288,6 +2288,63 @@ SELECT * FROM ts_debug('english', 'Paris'); asciiword | Word, all ASCII | Paris | {my_synonym,english_stem} | my_synonym | {paris} + + + An asterisk (*) at the end of a definition word indicates + that the definition word is a prefix, and the to_tsquery() + function will transform that definition to the prefix search format (see + the section on parsing queries). + Notice that the asterisk is ignored in to_tsvector(). + + + + Contents of $SHAREDIR/tsearch_data/synonym_sample.syn: + + +postgres pgsql +postgresql pgsql +postgre pgsql +gogle googl +indices index* + + + + Results: + + +=# create text search dictionary syn( template=synonym,synonyms='synonym_sample'); +=# select ts_lexize('syn','indices'); + ts_lexize +----------- + {index} +(1 row) + +=# create text search configuration tst ( copy=simple); +=# alter text search configuration tst alter mapping for asciiword with syn; +=# select to_tsquery('tst','indices'); + to_tsquery +------------ + 'index':* +(1 row) + +=# select 'indexes are very useful'::tsvector; + tsvector +--------------------------------- + 'are' 'indexes' 'useful' 'very' +(1 row) + +=# select 'indexes are very useful'::tsvector @@ to_tsquery('tst','indices'); + ?column? 
+---------- + t +(1 row) + +=# select to_tsvector('tst','indices'); + to_tsvector +------------- + 'index':1 +(1 row) + The only parameter required by the synonym template is diff --git a/src/backend/tsearch/dict_synonym.c b/src/backend/tsearch/dict_synonym.c index a9d094880d..13ecfd0eed 100644 --- a/src/backend/tsearch/dict_synonym.c +++ b/src/backend/tsearch/dict_synonym.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/tsearch/dict_synonym.c,v 1.10 2009/01/01 17:23:48 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/tsearch/dict_synonym.c,v 1.11 2009/08/14 14:53:20 teodor Exp $ * *------------------------------------------------------------------------- */ @@ -23,6 +23,8 @@ typedef struct { char *in; char *out; + int outlen; + uint16 flags; } Syn; typedef struct @@ -36,11 +38,14 @@ typedef struct * Finds the next whitespace-delimited word within the 'in' string. * Returns a pointer to the first character of the word, and a pointer * to the next byte after the last character in the word (in *end). + * Character '*' at the end of word will not be treated as word + * character if flags is not null. 
*/ static char * -findwrd(char *in, char **end) +findwrd(char *in, char **end, uint16 *flags) { char *start; + char *lastchar; /* Skip leading spaces */ while (*in && t_isspace(in)) @@ -53,13 +58,27 @@ findwrd(char *in, char **end) return NULL; } - start = in; + lastchar = start = in; /* Find end of word */ while (*in && !t_isspace(in)) + { + lastchar = in; in += pg_mblen(in); + } + + if ( in - lastchar == 1 && t_iseq(lastchar, '*') && flags ) + { + *flags = TSL_PREFIX; + *end = lastchar; + } + else + { + if (flags) + *flags = 0; + *end = in; + } - *end = in; return start; } @@ -84,6 +103,7 @@ dsynonym_init(PG_FUNCTION_ARGS) *end = NULL; int cur = 0; char *line = NULL; + uint16 flags = 0; foreach(l, dictoptions) { @@ -117,7 +137,7 @@ dsynonym_init(PG_FUNCTION_ARGS) while ((line = tsearch_readline(&trst)) != NULL) { - starti = findwrd(line, &end); + starti = findwrd(line, &end, NULL); if (!starti) { /* Empty line */ @@ -130,7 +150,7 @@ dsynonym_init(PG_FUNCTION_ARGS) } *end = '\0'; - starto = findwrd(end + 1, &end); + starto = findwrd(end + 1, &end, &flags); if (!starto) { /* A line with only one word (+whitespace). Ignore silently. 
*/ @@ -168,6 +188,9 @@ dsynonym_init(PG_FUNCTION_ARGS) d->syn[cur].out = lowerstr(starto); } + d->syn[cur].outlen = strlen(starto); + d->syn[cur].flags = flags; + cur++; skipline: @@ -212,7 +235,8 @@ dsynonym_lexize(PG_FUNCTION_ARGS) PG_RETURN_POINTER(NULL); res = palloc0(sizeof(TSLexeme) * 2); - res[0].lexeme = pstrdup(found->out); + res[0].lexeme = pnstrdup(found->out, found->outlen); + res[0].flags = found->flags; PG_RETURN_POINTER(res); } diff --git a/src/backend/tsearch/synonym_sample.syn b/src/backend/tsearch/synonym_sample.syn index 4e2eaeec0c..3ecbcf901c 100644 --- a/src/backend/tsearch/synonym_sample.syn +++ b/src/backend/tsearch/synonym_sample.syn @@ -2,3 +2,4 @@ postgres pgsql postgresql pgsql postgre pgsql gogle googl +indices index* diff --git a/src/test/regress/expected/tsdicts.out b/src/test/regress/expected/tsdicts.out index 3ae6a671da..aba67fcab7 100644 --- a/src/test/regress/expected/tsdicts.out +++ b/src/test/regress/expected/tsdicts.out @@ -208,6 +208,12 @@ SELECT ts_lexize('synonym', 'Gogle'); {googl} (1 row) +SELECT ts_lexize('synonym', 'indices'); + ts_lexize +----------- + {index} +(1 row) + -- Create and simple test thesaurus dictionary -- More tests in configuration checks because ts_lexize() -- cannot pass more than one word to thesaurus. 
@@ -290,6 +296,18 @@ SELECT to_tsvector('synonym_tst', 'Most common mistake is to write Gogle instead 'common':2 'googl':7,10 'instead':8 'mistak':3 'write':6 (1 row) +SELECT to_tsvector('synonym_tst', 'Indexes or indices - Which is right plural form of index?'); + to_tsvector +---------------------------------------------- + 'form':8 'index':1,3,10 'plural':7 'right':6 +(1 row) + +SELECT to_tsquery('synonym_tst', 'Index & indices'); + to_tsquery +--------------------- + 'index' & 'index':* +(1 row) + -- test thesaurus in configuration -- see thesaurus_sample.ths to understand 'odd' resulting tsvector CREATE TEXT SEARCH CONFIGURATION thesaurus_tst ( diff --git a/src/test/regress/sql/tsdicts.sql b/src/test/regress/sql/tsdicts.sql index f36e63a311..000f6eb2e7 100644 --- a/src/test/regress/sql/tsdicts.sql +++ b/src/test/regress/sql/tsdicts.sql @@ -56,6 +56,7 @@ CREATE TEXT SEARCH DICTIONARY synonym ( SELECT ts_lexize('synonym', 'PoStGrEs'); SELECT ts_lexize('synonym', 'Gogle'); +SELECT ts_lexize('synonym', 'indices'); -- Create and simple test thesaurus dictionary -- More tests in configuration checks because ts_lexize() @@ -104,6 +105,8 @@ ALTER TEXT SEARCH CONFIGURATION synonym_tst ALTER MAPPING FOR SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre'); SELECT to_tsvector('synonym_tst', 'Most common mistake is to write Gogle instead of Google'); +SELECT to_tsvector('synonym_tst', 'Indexes or indices - Which is right plural form of index?'); +SELECT to_tsquery('synonym_tst', 'Index & indices'); -- test thesaurus in configuration -- see thesaurus_sample.ths to understand 'odd' resulting tsvector