From 25bd9ce31baec836b5ad6bbd63ece54af0c73a3e Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 5 Aug 2009 18:06:49 +0000 Subject: [PATCH] Add matchorig, matchsynonyms, and keepsynonyms options to contrib/dict_xsyn. Sergey Karpov --- contrib/dict_xsyn/dict_xsyn.c | 112 +++++++++++-------- contrib/dict_xsyn/expected/dict_xsyn.out | 130 ++++++++++++++++++++++- contrib/dict_xsyn/sql/dict_xsyn.sql | 41 ++++++- doc/src/sgml/dict-xsyn.sgml | 51 ++++++++- 4 files changed, 282 insertions(+), 52 deletions(-) diff --git a/contrib/dict_xsyn/dict_xsyn.c b/contrib/dict_xsyn/dict_xsyn.c index 511ef271e3..b53670cb23 100644 --- a/contrib/dict_xsyn/dict_xsyn.c +++ b/contrib/dict_xsyn/dict_xsyn.c @@ -6,7 +6,7 @@ * Copyright (c) 2007-2009, PostgreSQL Global Development Group * * IDENTIFICATION - * $PostgreSQL: pgsql/contrib/dict_xsyn/dict_xsyn.c,v 1.6 2009/01/01 17:23:32 momjian Exp $ + * $PostgreSQL: pgsql/contrib/dict_xsyn/dict_xsyn.c,v 1.7 2009/08/05 18:06:49 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -33,7 +33,10 @@ typedef struct int len; Syn *syn; + bool matchorig; bool keeporig; + bool matchsynonyms; + bool keepsynonyms; } DictSyn; @@ -88,7 +91,8 @@ read_dictionary(DictSyn *d, char *filename) { char *value; char *key; - char *end = NULL; + char *pos; + char *end; if (*line == '\0') continue; @@ -96,26 +100,36 @@ read_dictionary(DictSyn *d, char *filename) value = lowerstr(line); pfree(line); - key = find_word(value, &end); - if (!key) + pos = value; + while ((key = find_word(pos, &end)) != NULL) { - pfree(value); - continue; - } + /* Enlarge syn structure if full */ + if (cur == d->len) + { + d->len = (d->len > 0) ? 2 * d->len : 16; + if (d->syn) + d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len); + else + d->syn = (Syn *) palloc(sizeof(Syn) * d->len); + } - if (cur == d->len) - { - d->len = (d->len > 0) ? 
2 * d->len : 16; - if (d->syn) - d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len); - else - d->syn = (Syn *) palloc(sizeof(Syn) * d->len); - } + /* Save first word only if we will match it */ + if (pos != value || d->matchorig) + { + d->syn[cur].key = pnstrdup(key, end - key); + d->syn[cur].value = pstrdup(value); - d->syn[cur].key = pnstrdup(key, end - key); - d->syn[cur].value = value; + cur++; + } + + pos = end; - cur++; + /* Don't bother scanning synonyms if we will not match them */ + if (!d->matchsynonyms) + break; + } + + pfree(value); } tsearch_readline_end(&trst); @@ -133,23 +147,40 @@ dxsyn_init(PG_FUNCTION_ARGS) List *dictoptions = (List *) PG_GETARG_POINTER(0); DictSyn *d; ListCell *l; + char *filename = NULL; d = (DictSyn *) palloc0(sizeof(DictSyn)); d->len = 0; d->syn = NULL; + d->matchorig = true; d->keeporig = true; + d->matchsynonyms = false; + d->keepsynonyms = true; foreach(l, dictoptions) { DefElem *defel = (DefElem *) lfirst(l); - if (pg_strcasecmp(defel->defname, "KEEPORIG") == 0) + if (pg_strcasecmp(defel->defname, "MATCHORIG") == 0) + { + d->matchorig = defGetBoolean(defel); + } + else if (pg_strcasecmp(defel->defname, "KEEPORIG") == 0) { d->keeporig = defGetBoolean(defel); } + else if (pg_strcasecmp(defel->defname, "MATCHSYNONYMS") == 0) + { + d->matchsynonyms = defGetBoolean(defel); + } + else if (pg_strcasecmp(defel->defname, "KEEPSYNONYMS") == 0) + { + d->keepsynonyms = defGetBoolean(defel); + } else if (pg_strcasecmp(defel->defname, "RULES") == 0) { - read_dictionary(d, defGetString(defel)); + /* we can't read the rules before parsing all options! 
*/ + filename = defGetString(defel); } else { @@ -160,6 +191,9 @@ dxsyn_init(PG_FUNCTION_ARGS) } } + if (filename) + read_dictionary(d, filename); + PG_RETURN_POINTER(d); } @@ -194,41 +228,33 @@ dxsyn_lexize(PG_FUNCTION_ARGS) /* Parse string of synonyms and return array of words */ { - char *value = pstrdup(found->value); - int value_length = strlen(value); - char *pos = value; + char *value = found->value; + char *syn; + char *pos; + char *end; int nsyns = 0; - bool is_first = true; - res = palloc(0); + res = palloc(sizeof(TSLexeme)); - while (pos < value + value_length) + pos = value; + while ((syn = find_word(pos, &end)) != NULL) { - char *end; - char *syn = find_word(pos, &end); - - if (!syn) - break; - *end = '\0'; - res = repalloc(res, sizeof(TSLexeme) * (nsyns + 2)); - res[nsyns].lexeme = NULL; - /* first word is added to result only if KEEPORIG flag is set */ - if (d->keeporig || !is_first) + /* The first word is output only if keeporig=true */ + if (pos != value || d->keeporig) { - res[nsyns].lexeme = pstrdup(syn); - res[nsyns + 1].lexeme = NULL; - + res[nsyns].lexeme = pnstrdup(syn, end - syn); nsyns++; } - is_first = false; + pos = end; - pos = end + 1; + /* Stop if we are not to output the synonyms */ + if (!d->keepsynonyms) + break; } - - pfree(value); + res[nsyns].lexeme = NULL; } PG_RETURN_POINTER(res); diff --git a/contrib/dict_xsyn/expected/dict_xsyn.out b/contrib/dict_xsyn/expected/dict_xsyn.out index 99071ea8c7..d91697a97e 100644 --- a/contrib/dict_xsyn/expected/dict_xsyn.out +++ b/contrib/dict_xsyn/expected/dict_xsyn.out @@ -5,10 +5,76 @@ SET client_min_messages = warning; \set ECHO none RESET client_min_messages; ---configuration -ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false); +-- default configuration - match first word and return it among with all synonyms +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false); --lexize SELECT ts_lexize('xsyn', 
'supernova'); + ts_lexize +-------------------------- + {supernova,sn,sne,1987a} +(1 row) + +SELECT ts_lexize('xsyn', 'sn'); + ts_lexize +----------- + +(1 row) + +SELECT ts_lexize('xsyn', 'grb'); + ts_lexize +----------- + +(1 row) + +-- the same, but return only synonyms +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false); +SELECT ts_lexize('xsyn', 'supernova'); + ts_lexize +---------------- + {sn,sne,1987a} +(1 row) + +SELECT ts_lexize('xsyn', 'sn'); + ts_lexize +----------- + +(1 row) + +SELECT ts_lexize('xsyn', 'grb'); + ts_lexize +----------- + +(1 row) + +-- match any word and return all words +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true); +SELECT ts_lexize('xsyn', 'supernova'); + ts_lexize +-------------------------- + {supernova,sn,sne,1987a} +(1 row) + +SELECT ts_lexize('xsyn', 'sn'); + ts_lexize +-------------------------- + {supernova,sn,sne,1987a} +(1 row) + +SELECT ts_lexize('xsyn', 'grb'); + ts_lexize +----------- + +(1 row) + +-- match any word and return all words except first one +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true); +SELECT ts_lexize('xsyn', 'supernova'); + ts_lexize +---------------- + {sn,sne,1987a} +(1 row) + +SELECT ts_lexize('xsyn', 'sn'); ts_lexize ---------------- {sn,sne,1987a} @@ -20,3 +86,63 @@ SELECT ts_lexize('xsyn', 'grb'); (1 row) +-- match any synonym but not first word, and return first word instead +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=true); +SELECT ts_lexize('xsyn', 'supernova'); + ts_lexize +----------- + +(1 row) + +SELECT ts_lexize('xsyn', 'sn'); + ts_lexize +------------- + {supernova} +(1 row) + +SELECT ts_lexize('xsyn', 'grb'); + ts_lexize +----------- + +(1 row) + +-- do not match or return 
anything +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=false); +SELECT ts_lexize('xsyn', 'supernova'); + ts_lexize +----------- + +(1 row) + +SELECT ts_lexize('xsyn', 'sn'); + ts_lexize +----------- + +(1 row) + +SELECT ts_lexize('xsyn', 'grb'); + ts_lexize +----------- + +(1 row) + +-- match any word but return nothing +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=false, MATCHSYNONYMS=true); +SELECT ts_lexize('xsyn', 'supernova'); + ts_lexize +----------- + {} +(1 row) + +SELECT ts_lexize('xsyn', 'sn'); + ts_lexize +----------- + {} +(1 row) + +SELECT ts_lexize('xsyn', 'grb'); + ts_lexize +----------- + +(1 row) + diff --git a/contrib/dict_xsyn/sql/dict_xsyn.sql b/contrib/dict_xsyn/sql/dict_xsyn.sql index 17f6df9cf3..9db0851700 100644 --- a/contrib/dict_xsyn/sql/dict_xsyn.sql +++ b/contrib/dict_xsyn/sql/dict_xsyn.sql @@ -8,9 +8,46 @@ SET client_min_messages = warning; \set ECHO all RESET client_min_messages; ---configuration -ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false); +-- default configuration - match first word and return it among with all synonyms +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false); --lexize SELECT ts_lexize('xsyn', 'supernova'); +SELECT ts_lexize('xsyn', 'sn'); +SELECT ts_lexize('xsyn', 'grb'); + +-- the same, but return only synonyms +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false); +SELECT ts_lexize('xsyn', 'supernova'); +SELECT ts_lexize('xsyn', 'sn'); +SELECT ts_lexize('xsyn', 'grb'); + +-- match any word and return all words +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true); +SELECT ts_lexize('xsyn', 'supernova'); +SELECT ts_lexize('xsyn', 'sn'); 
+SELECT ts_lexize('xsyn', 'grb'); + +-- match any word and return all words except first one +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true); +SELECT ts_lexize('xsyn', 'supernova'); +SELECT ts_lexize('xsyn', 'sn'); +SELECT ts_lexize('xsyn', 'grb'); + +-- match any synonym but not first word, and return first word instead +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=true); +SELECT ts_lexize('xsyn', 'supernova'); +SELECT ts_lexize('xsyn', 'sn'); +SELECT ts_lexize('xsyn', 'grb'); + +-- do not match or return anything +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=false); +SELECT ts_lexize('xsyn', 'supernova'); +SELECT ts_lexize('xsyn', 'sn'); +SELECT ts_lexize('xsyn', 'grb'); + +-- match any word but return nothing +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=false, MATCHSYNONYMS=true); +SELECT ts_lexize('xsyn', 'supernova'); +SELECT ts_lexize('xsyn', 'sn'); SELECT ts_lexize('xsyn', 'grb'); diff --git a/doc/src/sgml/dict-xsyn.sgml b/doc/src/sgml/dict-xsyn.sgml index 00d8a84faf..7df6959d3d 100644 --- a/doc/src/sgml/dict-xsyn.sgml +++ b/doc/src/sgml/dict-xsyn.sgml @@ -1,4 +1,4 @@ - + dict_xsyn @@ -23,9 +23,26 @@ - keeporig controls whether the original word is included (if - true), or only its synonyms (if false). Default - is true. + matchorig controls whether the original word is accepted by + the dictionary. Default is true. + + + + + matchsynonyms controls whether the synonyms are + accepted by the dictionary. Default is false. + + + + + keeporig controls whether the original word is included in + the dictionary's output. Default is true. + + + + + keepsynonyms controls whether the synonyms are included in + the dictionary's output. Default is true. 
@@ -87,13 +104,37 @@ ALTER TEXT SEARCH DICTIONARY To test the dictionary, you can try +mydb=# SELECT ts_lexize('xsyn', 'word'); + ts_lexize +----------------------- + {syn1,syn2,syn3} + +mydb=# ALTER TEXT SEARCH DICTIONARY xsyn (RULES='my_rules', KEEPORIG=true); +ALTER TEXT SEARCH DICTIONARY + mydb=# SELECT ts_lexize('xsyn', 'word'); ts_lexize ----------------------- {word,syn1,syn2,syn3} + +mydb=# ALTER TEXT SEARCH DICTIONARY xsyn (RULES='my_rules', KEEPORIG=false, MATCHSYNONYMS=true); +ALTER TEXT SEARCH DICTIONARY + +mydb=# SELECT ts_lexize('xsyn', 'syn1'); + ts_lexize +----------------------- + {syn1,syn2,syn3} + +mydb=# ALTER TEXT SEARCH DICTIONARY xsyn (RULES='my_rules', KEEPORIG=true, MATCHORIG=false, KEEPSYNONYMS=false); +ALTER TEXT SEARCH DICTIONARY + +mydb=# SELECT ts_lexize('xsyn', 'syn1'); + ts_lexize +----------------------- + {word} - but real-world usage will involve including it in a text search + Real-world usage will involve including it in a text search configuration as described in . That might look like this: -- 2.40.0