From: Tom Lane Date: Mon, 15 Oct 2007 21:36:50 +0000 (+0000) Subject: Add sample text search dictionary templates and parsers, to replace the X-Git-Tag: REL8_3_BETA2~77 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=5fcb079858bb392e87067b5526e9df950db38024;p=postgresql Add sample text search dictionary templates and parsers, to replace the hard-to-maintain textual examples currently in the SGML docs. From Sergey Karpov. --- diff --git a/contrib/Makefile b/contrib/Makefile index 3f49645036..0bd9e65b05 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -1,4 +1,4 @@ -# $PostgreSQL: pgsql/contrib/Makefile,v 1.80 2007/10/13 22:59:43 tgl Exp $ +# $PostgreSQL: pgsql/contrib/Makefile,v 1.81 2007/10/15 21:36:49 tgl Exp $ subdir = contrib top_builddir = .. @@ -10,6 +10,8 @@ WANTED_DIRS = \ chkpass \ cube \ dblink \ + dict_int \ + dict_xsyn \ earthdistance \ fuzzystrmatch \ hstore \ @@ -31,6 +33,7 @@ WANTED_DIRS = \ seg \ spi \ tablefunc \ + test_parser \ vacuumlo ifeq ($(with_openssl),yes) diff --git a/contrib/README b/contrib/README index 5b2167ec97..ac15b85a3e 100644 --- a/contrib/README +++ b/contrib/README @@ -1,4 +1,3 @@ - The PostgreSQL contrib tree --------------------------- @@ -29,8 +28,8 @@ adminpack - by Dave Page btree_gist - - Support for emulating BTREE indexing in GiST - by Oleg Bartunov and Teodor Sigaev + Support for emulating BTREE indexing in GiST + by Oleg Bartunov and Teodor Sigaev chkpass - An auto-encrypted password datatype @@ -44,8 +43,16 @@ dblink - Allows remote query execution by Joe Conway +dict_int - + Text search dictionary template for integers + by Sergey Karpov + +dict_xsyn - + Text search dictionary template for extended synonym processing + by Sergey Karpov + earthdistance - - Operator for computing earth distance for two points + Operator for computing earth distance between two points by Hal Snyder fuzzystrmatch - @@ -53,8 +60,8 @@ fuzzystrmatch - by Joe Conway , Joel Burton hstore - - Hstore - module for storing 
(key,value) pairs - by Oleg Bartunov and Teodor Sigaev + Module for storing (key, value) pairs + by Oleg Bartunov and Teodor Sigaev intagg - Integer aggregator @@ -92,6 +99,10 @@ pg_freespacemap - Displays the contents of the free space map (FSM) by Mark Kirkwood +pg_standby - + Sample archive_command for warm standby operation + by Simon Riggs + pg_trgm - Functions for determining the similarity of text based on trigram matching. @@ -110,7 +121,7 @@ pgrowlocks - by Tatsuo Ishii pgstattuple - - A function to return statistics about "dead" tuples and free + Functions to return statistics about "dead" tuples and free space within a table by Tatsuo Ishii @@ -126,12 +137,16 @@ sslinfo - by Victor Wagner start-scripts - - Scripts for starting the server at boot time. + Scripts for starting the server at boot time on various platforms. tablefunc - Examples of functions returning tables by Joe Conway +test_parser - + Sample text search parser + by Sergey Karpov + tsearch2 - Full-text-index support using GiST by Teodor Sigaev and Oleg Bartunov diff --git a/contrib/dict_int/Makefile b/contrib/dict_int/Makefile new file mode 100644 index 0000000000..4e03a69a6e --- /dev/null +++ b/contrib/dict_int/Makefile @@ -0,0 +1,19 @@ +# $PostgreSQL: pgsql/contrib/dict_int/Makefile,v 1.1 2007/10/15 21:36:50 tgl Exp $ + +MODULE_big = dict_int +OBJS = dict_int.o +DATA_built = dict_int.sql +DATA = uninstall_dict_int.sql +DOCS = README.dict_int +REGRESS = dict_int + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/dict_int +top_builddir = ../.. 
+include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/dict_int/README.dict_int b/contrib/dict_int/README.dict_int new file mode 100644 index 0000000000..5883c1c2f5 --- /dev/null +++ b/contrib/dict_int/README.dict_int @@ -0,0 +1,41 @@ +Dictionary for integers +======================= + +The motivation for this example dictionary is to control the indexing of +integers (signed and unsigned), and, consequently, to minimize the number of +unique words which greatly affect the performance of searching. + +* Configuration + +The dictionary accepts two options: + + - The MAXLEN parameter specifies the maximum length (number of digits) + allowed in an integer word. The default value is 6. + + - The REJECTLONG parameter specifies if an overlength integer should be + truncated or ignored. If REJECTLONG=FALSE (default), the dictionary returns + the first MAXLEN digits of the integer. If REJECTLONG=TRUE, the + dictionary treats an overlength integer as a stop word, so that it will + not be indexed. + +* Usage + +1. Compile and install + +2. Load dictionary + + psql mydb < dict_int.sql + +3. Test it + + mydb# select ts_lexize('intdict', '12345678'); + ts_lexize + ----------- + {123456} + +4. Change its options as you wish + + mydb# ALTER TEXT SEARCH DICTIONARY intdict (MAXLEN = 4, REJECTLONG = true); + ALTER TEXT SEARCH DICTIONARY + +That's all. 
diff --git a/contrib/dict_int/dict_int.c b/contrib/dict_int/dict_int.c
new file mode 100644
index 0000000000..85d45491cc
--- /dev/null
+++ b/contrib/dict_int/dict_int.c
@@ -0,0 +1,126 @@
+/*-------------------------------------------------------------------------
+ *
+ * dict_int.c
+ *	  Text search dictionary for integers
+ *
+ * Copyright (c) 2007, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL: pgsql/contrib/dict_int/dict_int.c,v 1.1 2007/10/15 21:36:50 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "commands/defrem.h"
+#include "fmgr.h"
+#include "tsearch/ts_public.h"
+
+PG_MODULE_MAGIC;
+
+
+/* Per-dictionary settings, filled in by dintdict_init */
+typedef struct {
+	int		maxlen;			/* longest token length passed through unchanged */
+	bool	rejectlong;		/* true: drop overlength tokens; false: trim them */
+} DictInt;
+
+
+PG_FUNCTION_INFO_V1(dintdict_init);
+Datum dintdict_init(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(dintdict_lexize);
+Datum dintdict_lexize(PG_FUNCTION_ARGS);
+
+/*
+ * dintdict_init
+ *	  Build a DictInt from the dictionary's option list.
+ *
+ * Recognized options are MAXLEN (default 6) and REJECTLONG (default false);
+ * any other option name raises an error.  MAXLEN must be at least 1 because
+ * dintdict_lexize uses it as an index into its copy of the token.
+ */
+Datum
+dintdict_init(PG_FUNCTION_ARGS)
+{
+	List	   *dictoptions = (List *) PG_GETARG_POINTER(0);
+	DictInt    *d;
+	ListCell   *l;
+
+	d = (DictInt *) palloc0(sizeof(DictInt));
+	d->maxlen = 6;
+	d->rejectlong = false;
+
+	foreach(l, dictoptions)
+	{
+		DefElem    *defel = (DefElem *) lfirst(l);
+
+		if (pg_strcasecmp(defel->defname, "MAXLEN") == 0)
+		{
+			d->maxlen = atoi(defGetString(defel));
+
+			/* atoi yields 0 for non-numeric text, so this rejects that too */
+			if (d->maxlen < 1)
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("maxlen value has to be >= 1")));
+		}
+		else if (pg_strcasecmp(defel->defname, "REJECTLONG") == 0)
+		{
+			d->rejectlong = defGetBoolean(defel);
+		}
+		else
+		{
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("unrecognized intdict parameter: \"%s\"",
+							defel->defname)));
+		}
+	}
+
+	PG_RETURN_POINTER(d);
+}
+
+/*
+ * dintdict_lexize
+ *	  Turn one token into a lexeme array of at most one entry.
+ *
+ * Arguments are the DictInt built by dintdict_init, a pointer to the token
+ * text, and the token length.  The result is a palloc'd TSLexeme array
+ * ended by an entry whose lexeme is NULL.  A token longer than maxlen is
+ * either cut to its first maxlen bytes or, if rejectlong is set, reported
+ * as an empty array.
+ */
+Datum
+dintdict_lexize(PG_FUNCTION_ARGS)
+{
+	DictInt    *d = (DictInt *) PG_GETARG_POINTER(0);
+	char	   *in = (char *) PG_GETARG_POINTER(1);
+	int			len = PG_GETARG_INT32(2);
+	char	   *txt = pnstrdup(in, len);
+	/* palloc0 so that no TSLexeme field is handed back uninitialized */
+	TSLexeme   *res = palloc0(sizeof(TSLexeme) * 2);
+
+	res[1].lexeme = NULL;
+	if (len > d->maxlen)
+	{
+		if (d->rejectlong)
+		{
+			/* reject by returning void array */
+			pfree(txt);
+			res[0].lexeme = NULL;
+		}
+		else
+		{
+			/* trim integer */
+			txt[d->maxlen] = '\0';
+			res[0].lexeme = txt;
+		}
+	}
+	else
+	{
+		res[0].lexeme = txt;
+	}
+
+	PG_RETURN_POINTER(res);
+}
diff --git a/contrib/dict_int/dict_int.sql.in b/contrib/dict_int/dict_int.sql.in
new file mode 100644
index 0000000000..0bd97a83e2
--- /dev/null
+++ b/contrib/dict_int/dict_int.sql.in
@@ -0,0 +1,29 @@
+-- $PostgreSQL: pgsql/contrib/dict_int/dict_int.sql.in,v 1.1 2007/10/15 21:36:50 tgl Exp $
+
+-- Adjust this setting to control where the objects get created.
+SET search_path = public;
+
+BEGIN;
+
+CREATE FUNCTION dintdict_init(internal)
+        RETURNS internal
+        AS 'MODULE_PATHNAME'
+        LANGUAGE C STRICT;
+
+CREATE FUNCTION dintdict_lexize(internal, internal, internal, internal)
+        RETURNS internal
+        AS 'MODULE_PATHNAME'
+        LANGUAGE C STRICT;
+
+CREATE TEXT SEARCH TEMPLATE intdict_template (
+        LEXIZE = dintdict_lexize,
+        INIT = dintdict_init
+);
+
+CREATE TEXT SEARCH DICTIONARY intdict (
+        TEMPLATE = intdict_template
+);
+
+COMMENT ON TEXT SEARCH DICTIONARY intdict IS 'dictionary for integers';
+
+END;
diff --git a/contrib/dict_int/expected/dict_int.out b/contrib/dict_int/expected/dict_int.out
new file mode 100644
index 0000000000..7feb493e15
--- /dev/null
+++ b/contrib/dict_int/expected/dict_int.out
@@ -0,0 +1,308 @@
+--
+-- first, define the datatype. Turn off echoing so that expected file
+-- does not depend on contents of this file.
+-- +SET client_min_messages = warning; +\set ECHO none +RESET client_min_messages; +--lexize +select ts_lexize('intdict', '511673'); + ts_lexize +----------- + {511673} +(1 row) + +select ts_lexize('intdict', '129'); + ts_lexize +----------- + {129} +(1 row) + +select ts_lexize('intdict', '40865854'); + ts_lexize +----------- + {408658} +(1 row) + +select ts_lexize('intdict', '952'); + ts_lexize +----------- + {952} +(1 row) + +select ts_lexize('intdict', '654980341'); + ts_lexize +----------- + {654980} +(1 row) + +select ts_lexize('intdict', '09810106'); + ts_lexize +----------- + {098101} +(1 row) + +select ts_lexize('intdict', '14262713'); + ts_lexize +----------- + {142627} +(1 row) + +select ts_lexize('intdict', '6532082986'); + ts_lexize +----------- + {653208} +(1 row) + +select ts_lexize('intdict', '0150061'); + ts_lexize +----------- + {015006} +(1 row) + +select ts_lexize('intdict', '7778'); + ts_lexize +----------- + {7778} +(1 row) + +select ts_lexize('intdict', '9547'); + ts_lexize +----------- + {9547} +(1 row) + +select ts_lexize('intdict', '753395478'); + ts_lexize +----------- + {753395} +(1 row) + +select ts_lexize('intdict', '647652'); + ts_lexize +----------- + {647652} +(1 row) + +select ts_lexize('intdict', '6988655574'); + ts_lexize +----------- + {698865} +(1 row) + +select ts_lexize('intdict', '1279'); + ts_lexize +----------- + {1279} +(1 row) + +select ts_lexize('intdict', '1266645909'); + ts_lexize +----------- + {126664} +(1 row) + +select ts_lexize('intdict', '7594193969'); + ts_lexize +----------- + {759419} +(1 row) + +select ts_lexize('intdict', '16928207'); + ts_lexize +----------- + {169282} +(1 row) + +select ts_lexize('intdict', '196850350328'); + ts_lexize +----------- + {196850} +(1 row) + +select ts_lexize('intdict', '22026985592'); + ts_lexize +----------- + {220269} +(1 row) + +select ts_lexize('intdict', '2063765'); + ts_lexize +----------- + {206376} +(1 row) + +select ts_lexize('intdict', '242387310'); + ts_lexize 
+----------- + {242387} +(1 row) + +select ts_lexize('intdict', '93595'); + ts_lexize +----------- + {93595} +(1 row) + +select ts_lexize('intdict', '9374'); + ts_lexize +----------- + {9374} +(1 row) + +select ts_lexize('intdict', '996969'); + ts_lexize +----------- + {996969} +(1 row) + +select ts_lexize('intdict', '353595982'); + ts_lexize +----------- + {353595} +(1 row) + +select ts_lexize('intdict', '925860'); + ts_lexize +----------- + {925860} +(1 row) + +select ts_lexize('intdict', '11848378337'); + ts_lexize +----------- + {118483} +(1 row) + +select ts_lexize('intdict', '333'); + ts_lexize +----------- + {333} +(1 row) + +select ts_lexize('intdict', '799287416765'); + ts_lexize +----------- + {799287} +(1 row) + +select ts_lexize('intdict', '745939'); + ts_lexize +----------- + {745939} +(1 row) + +select ts_lexize('intdict', '67601305734'); + ts_lexize +----------- + {676013} +(1 row) + +select ts_lexize('intdict', '3361113'); + ts_lexize +----------- + {336111} +(1 row) + +select ts_lexize('intdict', '9033778607'); + ts_lexize +----------- + {903377} +(1 row) + +select ts_lexize('intdict', '7507648'); + ts_lexize +----------- + {750764} +(1 row) + +select ts_lexize('intdict', '1166'); + ts_lexize +----------- + {1166} +(1 row) + +select ts_lexize('intdict', '9360498'); + ts_lexize +----------- + {936049} +(1 row) + +select ts_lexize('intdict', '917795'); + ts_lexize +----------- + {917795} +(1 row) + +select ts_lexize('intdict', '9387894'); + ts_lexize +----------- + {938789} +(1 row) + +select ts_lexize('intdict', '42764329'); + ts_lexize +----------- + {427643} +(1 row) + +select ts_lexize('intdict', '564062'); + ts_lexize +----------- + {564062} +(1 row) + +select ts_lexize('intdict', '5413377'); + ts_lexize +----------- + {541337} +(1 row) + +select ts_lexize('intdict', '060965'); + ts_lexize +----------- + {060965} +(1 row) + +select ts_lexize('intdict', '08273593'); + ts_lexize +----------- + {082735} +(1 row) + +select ts_lexize('intdict', 
'593556010144'); + ts_lexize +----------- + {593556} +(1 row) + +select ts_lexize('intdict', '17988843352'); + ts_lexize +----------- + {179888} +(1 row) + +select ts_lexize('intdict', '252281774'); + ts_lexize +----------- + {252281} +(1 row) + +select ts_lexize('intdict', '313425'); + ts_lexize +----------- + {313425} +(1 row) + +select ts_lexize('intdict', '641439323669'); + ts_lexize +----------- + {641439} +(1 row) + +select ts_lexize('intdict', '314532610153'); + ts_lexize +----------- + {314532} +(1 row) + diff --git a/contrib/dict_int/sql/dict_int.sql b/contrib/dict_int/sql/dict_int.sql new file mode 100644 index 0000000000..3a335f8f3d --- /dev/null +++ b/contrib/dict_int/sql/dict_int.sql @@ -0,0 +1,61 @@ +-- +-- first, define the datatype. Turn off echoing so that expected file +-- does not depend on contents of this file. +-- +SET client_min_messages = warning; +\set ECHO none +\i dict_int.sql +\set ECHO all +RESET client_min_messages; + +--lexize +select ts_lexize('intdict', '511673'); +select ts_lexize('intdict', '129'); +select ts_lexize('intdict', '40865854'); +select ts_lexize('intdict', '952'); +select ts_lexize('intdict', '654980341'); +select ts_lexize('intdict', '09810106'); +select ts_lexize('intdict', '14262713'); +select ts_lexize('intdict', '6532082986'); +select ts_lexize('intdict', '0150061'); +select ts_lexize('intdict', '7778'); +select ts_lexize('intdict', '9547'); +select ts_lexize('intdict', '753395478'); +select ts_lexize('intdict', '647652'); +select ts_lexize('intdict', '6988655574'); +select ts_lexize('intdict', '1279'); +select ts_lexize('intdict', '1266645909'); +select ts_lexize('intdict', '7594193969'); +select ts_lexize('intdict', '16928207'); +select ts_lexize('intdict', '196850350328'); +select ts_lexize('intdict', '22026985592'); +select ts_lexize('intdict', '2063765'); +select ts_lexize('intdict', '242387310'); +select ts_lexize('intdict', '93595'); +select ts_lexize('intdict', '9374'); +select ts_lexize('intdict', 
'996969'); +select ts_lexize('intdict', '353595982'); +select ts_lexize('intdict', '925860'); +select ts_lexize('intdict', '11848378337'); +select ts_lexize('intdict', '333'); +select ts_lexize('intdict', '799287416765'); +select ts_lexize('intdict', '745939'); +select ts_lexize('intdict', '67601305734'); +select ts_lexize('intdict', '3361113'); +select ts_lexize('intdict', '9033778607'); +select ts_lexize('intdict', '7507648'); +select ts_lexize('intdict', '1166'); +select ts_lexize('intdict', '9360498'); +select ts_lexize('intdict', '917795'); +select ts_lexize('intdict', '9387894'); +select ts_lexize('intdict', '42764329'); +select ts_lexize('intdict', '564062'); +select ts_lexize('intdict', '5413377'); +select ts_lexize('intdict', '060965'); +select ts_lexize('intdict', '08273593'); +select ts_lexize('intdict', '593556010144'); +select ts_lexize('intdict', '17988843352'); +select ts_lexize('intdict', '252281774'); +select ts_lexize('intdict', '313425'); +select ts_lexize('intdict', '641439323669'); +select ts_lexize('intdict', '314532610153'); diff --git a/contrib/dict_int/uninstall_dict_int.sql b/contrib/dict_int/uninstall_dict_int.sql new file mode 100644 index 0000000000..0323ab298e --- /dev/null +++ b/contrib/dict_int/uninstall_dict_int.sql @@ -0,0 +1,9 @@ +SET search_path = public; + +DROP TEXT SEARCH DICTIONARY intdict; + +DROP TEXT SEARCH TEMPLATE intdict_template; + +DROP FUNCTION dintdict_init(internal); + +DROP FUNCTION dintdict_lexize(internal,internal,internal,internal); diff --git a/contrib/dict_xsyn/Makefile b/contrib/dict_xsyn/Makefile new file mode 100644 index 0000000000..563f039e46 --- /dev/null +++ b/contrib/dict_xsyn/Makefile @@ -0,0 +1,38 @@ +# $PostgreSQL: pgsql/contrib/dict_xsyn/Makefile,v 1.1 2007/10/15 21:36:50 tgl Exp $ + +MODULE_big = dict_xsyn +OBJS = dict_xsyn.o +DATA_built = dict_xsyn.sql +DATA = uninstall_dict_xsyn.sql +DOCS = README.dict_xsyn +REGRESS = dict_xsyn + +DICTDIR = tsearch_data +DICTFILES = xsyn_sample.rules + +ifdef 
USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/dict_xsyn +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif + +install: install-data + +.PHONY: install-data +install-data: $(DICTFILES) + for i in $(DICTFILES); \ + do $(INSTALL_DATA) $(srcdir)/$$i '$(DESTDIR)$(datadir)/$(DICTDIR)/'$$i; \ + done + +uninstall: uninstall-data + +.PHONY: uninstall-data +uninstall-data: + for i in $(DICTFILES); \ + do rm -rf '$(DESTDIR)$(datadir)/$(DICTDIR)/'$$i ; \ + done diff --git a/contrib/dict_xsyn/README.dict_xsyn b/contrib/dict_xsyn/README.dict_xsyn new file mode 100644 index 0000000000..9565eefefb --- /dev/null +++ b/contrib/dict_xsyn/README.dict_xsyn @@ -0,0 +1,52 @@ +Extended Synonym dictionary +=========================== + +This is a simple synonym dictionary. It replaces words with groups of their +synonyms, and so makes it possible to search for a word using any of its +synonyms. + +* Configuration + +It accepts the following options: + + - KEEPORIG controls whether the original word is included, or only its + synonyms. Default is 'true'. + + - RULES is the base name of the file containing the list of synonyms. + This file must be in $(prefix)/share/tsearch_data/, and its name must + end in ".rules" (which is not included in the RULES parameter). + +The rules file has the following format: + + - Each line represents a group of synonyms for a single word, which is + given first on the line. Synonyms are separated by whitespace: + + word syn1 syn2 syn3 + + - Sharp ('#') sign is a comment delimiter. It may appear at any position + inside the line. The rest of the line will be skipped. + +Look at xsyn_sample.rules, which is installed in $(prefix)/share/tsearch_data/, +for an example. + +* Usage + +1. Compile and install + +2. Load dictionary + + psql mydb < dict_xsyn.sql + +3. 
Test it
+
+  mydb=# SELECT ts_lexize('xsyn','word');
+        ts_lexize
+  ----------------
+   {word,syn1,syn2,syn3}
+
+4. Change the dictionary options as you wish
+
+  mydb# ALTER TEXT SEARCH DICTIONARY xsyn (KEEPORIG=false);
+  ALTER TEXT SEARCH DICTIONARY
+
+That's all.
diff --git a/contrib/dict_xsyn/dict_xsyn.c b/contrib/dict_xsyn/dict_xsyn.c
new file mode 100644
index 0000000000..1cd53a26bd
--- /dev/null
+++ b/contrib/dict_xsyn/dict_xsyn.c
@@ -0,0 +1,275 @@
+/*-------------------------------------------------------------------------
+ *
+ * dict_xsyn.c
+ *	  Extended synonym dictionary
+ *
+ * Copyright (c) 2007, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL: pgsql/contrib/dict_xsyn/dict_xsyn.c,v 1.1 2007/10/15 21:36:50 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <ctype.h>
+
+#include "commands/defrem.h"
+#include "fmgr.h"
+#include "storage/fd.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_utils.h"
+
+PG_MODULE_MAGIC;
+
+/* One rule read from the rules file */
+typedef struct
+{
+	char	   *key;			/* Word */
+	char	   *value;			/* Unparsed list of synonyms, including the word itself */
+} Syn;
+
+/* Per-dictionary state, built by dxsyn_init */
+typedef struct
+{
+	int			len;			/* entries in syn[]; allocated size while loading */
+	Syn		   *syn;			/* rules, sorted by key once loaded */
+
+	bool		keeporig;		/* include the looked-up word in the result? */
+} DictSyn;
+
+
+PG_FUNCTION_INFO_V1(dxsyn_init);
+Datum dxsyn_init(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(dxsyn_lexize);
+Datum dxsyn_lexize(PG_FUNCTION_ARGS);
+
+/*
+ * Locate the next whitespace-delimited word in "in".
+ *
+ * Returns a pointer to the start of the word and sets *end to the first
+ * character after it.  Returns NULL, with *end set to NULL, when only
+ * whitespace remains or the next word starts with '#'.
+ */
+static char *
+find_word(char *in, char **end)
+{
+	char	   *start;
+
+	*end = NULL;
+	while (*in && t_isspace(in))
+		in += pg_mblen(in);
+
+	if (!*in || *in == '#')
+		return NULL;
+	start = in;
+
+	while (*in && !t_isspace(in))
+		in += pg_mblen(in);
+
+	*end = in;
+
+	return start;
+}
+
+/* qsort/bsearch comparator: order Syn entries by key using strcmp */
+static int
+compare_syn(const void *a, const void *b)
+{
+	return strcmp(((Syn *) a)->key, ((Syn *) b)->key);
+}
+
+/*
+ * Load the synonym rules file named by "filename" into d->syn.
+ *
+ * Each line is lower-cased and kept whole as the entry's value; its first
+ * word becomes the key.  Empty lines, and lines whose first word is missing
+ * or starts with '#', are skipped.  Entries end up sorted by key so that
+ * dxsyn_lexize can bsearch them.  Errors out if the file cannot be opened.
+ */
+static void
+read_dictionary(DictSyn *d, char *filename)
+{
+	char	   *real_filename = get_tsearch_config_filename(filename, "rules");
+	FILE	   *fin;
+	char	   *line;
+	int			cur = 0;
+
+	if ((fin = AllocateFile(real_filename, "r")) == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_CONFIG_FILE_ERROR),
+				 errmsg("could not open synonym file \"%s\": %m",
+						real_filename)));
+
+	while ((line = t_readline(fin)) != NULL)
+	{
+		char	   *value;
+		char	   *key;
+		char	   *end = NULL;
+
+		if (*line == '\0')
+			continue;
+
+		value = lowerstr(line);
+		pfree(line);
+
+		key = find_word(value, &end);
+		if (!key)
+		{
+			pfree(value);
+			continue;
+		}
+
+		/* array is full: start at 16 slots, then double */
+		if (cur == d->len)
+		{
+			d->len = (d->len > 0) ? 2 * d->len : 16;
+			if (d->syn)
+				d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len);
+			else
+				d->syn = (Syn *) palloc(sizeof(Syn) * d->len);
+		}
+
+		d->syn[cur].key = pnstrdup(key, end - key);
+		d->syn[cur].value = value;
+
+		cur++;
+	}
+
+	FreeFile(fin);
+
+	d->len = cur;
+	if (cur > 1)
+		qsort(d->syn, d->len, sizeof(Syn), compare_syn);
+
+	pfree(real_filename);
+}
+
+/*
+ * dxsyn_init
+ *	  Build a DictSyn from the dictionary's option list.
+ *
+ * Recognized options are KEEPORIG (default true) and RULES, which loads the
+ * named rules file; any other option name raises an error.
+ */
+Datum
+dxsyn_init(PG_FUNCTION_ARGS)
+{
+	List	   *dictoptions = (List *) PG_GETARG_POINTER(0);
+	DictSyn    *d;
+	ListCell   *l;
+
+	d = (DictSyn *) palloc0(sizeof(DictSyn));
+	d->len = 0;
+	d->syn = NULL;
+	d->keeporig = true;
+
+	foreach(l, dictoptions)
+	{
+		DefElem    *defel = (DefElem *) lfirst(l);
+
+		if (pg_strcasecmp(defel->defname, "KEEPORIG") == 0)
+		{
+			d->keeporig = defGetBoolean(defel);
+		}
+		else if (pg_strcasecmp(defel->defname, "RULES") == 0)
+		{
+			read_dictionary(d, defGetString(defel));
+		}
+		else
+		{
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("unrecognized xsyn parameter: \"%s\"",
+							defel->defname)));
+		}
+	}
+
+	PG_RETURN_POINTER(d);
+}
+
+/*
+ * dxsyn_lexize
+ *	  Look a token up in the synonym table and return its synonyms.
+ *
+ * Arguments are the DictSyn built by dxsyn_init, a pointer to the token
+ * text, and the token length.  Returns NULL when the token is empty, no
+ * rules are loaded, or the lower-cased token is not a key.  Otherwise
+ * returns a palloc'd TSLexeme array, ended by an entry whose lexeme is
+ * NULL, holding the words of the matching rule; the rule's first word is
+ * included only when keeporig is set.
+ */
+Datum
+dxsyn_lexize(PG_FUNCTION_ARGS)
+{
+	DictSyn    *d = (DictSyn *) PG_GETARG_POINTER(0);
+	char	   *in = (char *) PG_GETARG_POINTER(1);
+	int			length = PG_GETARG_INT32(2);
+	Syn			word;
+	Syn		   *found;
+	TSLexeme   *res = NULL;
+
+	if (!length || d->len == 0)
+		PG_RETURN_POINTER(NULL);
+
+	/* Create search pattern */
+	{
+		char	   *temp = pnstrdup(in, length);
+
+		word.key = lowerstr(temp);
+		pfree(temp);
+		word.value = NULL;
+	}
+
+	/* Look for matching syn */
+	found = (Syn *) bsearch(&word, d->syn, d->len, sizeof(Syn), compare_syn);
+	pfree(word.key);
+
+	if (!found)
+		PG_RETURN_POINTER(NULL);
+
+	/* Parse string of synonyms and return array of words */
+	{
+		char	   *value = pstrdup(found->value);
+		int			value_length = strlen(value);
+		char	   *pos = value;
+		int			nsyns = 0;
+		bool		is_first = true;
+
+		res = palloc(0);
+
+		while (pos < value + value_length)
+		{
+			char	   *end;
+			char	   *next;
+			char	   *syn = find_word(pos, &end);
+
+			if (!syn)
+				break;
+
+			/* step over the whole delimiter character before clobbering it */
+			next = *end ? end + pg_mblen(end) : end;
+			*end = '\0';
+
+			/* grow by one slot; zero the new entry and the terminator */
+			res = repalloc(res, sizeof(TSLexeme) * (nsyns + 2));
+			memset(&res[nsyns], 0, sizeof(TSLexeme) * 2);
+
+			/* first word is added to result only if KEEPORIG flag is set */
+			if (d->keeporig || !is_first)
+			{
+				res[nsyns].lexeme = pstrdup(syn);
+				nsyns++;
+			}
+
+			is_first = false;
+
+			pos = next;
+		}
+
+		pfree(value);
+	}
+
+	PG_RETURN_POINTER(res);
+}
diff --git a/contrib/dict_xsyn/dict_xsyn.sql.in b/contrib/dict_xsyn/dict_xsyn.sql.in
new file mode 100644
index 0000000000..0e5755e5b1
--- /dev/null
+++ b/contrib/dict_xsyn/dict_xsyn.sql.in
@@ -0,0 +1,29 @@
+-- $PostgreSQL: pgsql/contrib/dict_xsyn/dict_xsyn.sql.in,v 1.1 2007/10/15 21:36:50 tgl Exp $
+
+-- Adjust this setting to control where the objects get created.
+SET search_path = public; + +BEGIN; + +CREATE FUNCTION dxsyn_init(internal) + RETURNS internal + AS 'MODULE_PATHNAME' + LANGUAGE C STRICT; + +CREATE FUNCTION dxsyn_lexize(internal, internal, internal, internal) + RETURNS internal + AS 'MODULE_PATHNAME' + LANGUAGE C STRICT; + +CREATE TEXT SEARCH TEMPLATE xsyn_template ( + LEXIZE = dxsyn_lexize, + INIT = dxsyn_init +); + +CREATE TEXT SEARCH DICTIONARY xsyn ( + TEMPLATE = xsyn_template +); + +COMMENT ON TEXT SEARCH DICTIONARY xsyn IS 'eXtended synonym dictionary'; + +END; diff --git a/contrib/dict_xsyn/expected/dict_xsyn.out b/contrib/dict_xsyn/expected/dict_xsyn.out new file mode 100644 index 0000000000..99071ea8c7 --- /dev/null +++ b/contrib/dict_xsyn/expected/dict_xsyn.out @@ -0,0 +1,22 @@ +-- +-- first, define the datatype. Turn off echoing so that expected file +-- does not depend on contents of this file. +-- +SET client_min_messages = warning; +\set ECHO none +RESET client_min_messages; +--configuration +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false); +--lexize +SELECT ts_lexize('xsyn', 'supernova'); + ts_lexize +---------------- + {sn,sne,1987a} +(1 row) + +SELECT ts_lexize('xsyn', 'grb'); + ts_lexize +----------- + +(1 row) + diff --git a/contrib/dict_xsyn/sql/dict_xsyn.sql b/contrib/dict_xsyn/sql/dict_xsyn.sql new file mode 100644 index 0000000000..17f6df9cf3 --- /dev/null +++ b/contrib/dict_xsyn/sql/dict_xsyn.sql @@ -0,0 +1,16 @@ +-- +-- first, define the datatype. Turn off echoing so that expected file +-- does not depend on contents of this file. 
+-- +SET client_min_messages = warning; +\set ECHO none +\i dict_xsyn.sql +\set ECHO all +RESET client_min_messages; + +--configuration +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false); + +--lexize +SELECT ts_lexize('xsyn', 'supernova'); +SELECT ts_lexize('xsyn', 'grb'); diff --git a/contrib/dict_xsyn/uninstall_dict_xsyn.sql b/contrib/dict_xsyn/uninstall_dict_xsyn.sql new file mode 100644 index 0000000000..7b7acea0d1 --- /dev/null +++ b/contrib/dict_xsyn/uninstall_dict_xsyn.sql @@ -0,0 +1,9 @@ +SET search_path = public; + +DROP TEXT SEARCH DICTIONARY xsyn; + +DROP TEXT SEARCH TEMPLATE xsyn_template; + +DROP FUNCTION dxsyn_init(internal); + +DROP FUNCTION dxsyn_lexize(internal,internal,internal,internal); diff --git a/contrib/dict_xsyn/xsyn_sample.rules b/contrib/dict_xsyn/xsyn_sample.rules new file mode 100644 index 0000000000..203bec793a --- /dev/null +++ b/contrib/dict_xsyn/xsyn_sample.rules @@ -0,0 +1,6 @@ +# Sample rules file for eXtended Synonym (xsyn) dictionary +# format is as follows: +# +# word synonym1 synonym2 ... +# +supernova sn sne 1987a diff --git a/contrib/test_parser/Makefile b/contrib/test_parser/Makefile new file mode 100644 index 0000000000..1267a6e062 --- /dev/null +++ b/contrib/test_parser/Makefile @@ -0,0 +1,19 @@ +# $PostgreSQL: pgsql/contrib/test_parser/Makefile,v 1.1 2007/10/15 21:36:50 tgl Exp $ + +MODULE_big = test_parser +OBJS = test_parser.o +DATA_built = test_parser.sql +DATA = uninstall_test_parser.sql +DOCS = README.test_parser +REGRESS = test_parser + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/test_parser +top_builddir = ../.. 
+include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/test_parser/README.test_parser b/contrib/test_parser/README.test_parser new file mode 100644 index 0000000000..d8ca90a5df --- /dev/null +++ b/contrib/test_parser/README.test_parser @@ -0,0 +1,52 @@ +Example parser +============== + +This is an example of a custom parser for full text search. + +It recognizes space-delimited words and returns only two token types: + + - 3, word, Word + + - 12, blank, Space symbols + +The token numbers have been chosen to keep compatibility with the default +ts_headline() function, since we do not want to implement our own version. + +* Configuration + +The parser has no user-configurable parameters. + +* Usage + +1. Compile and install + +2. Load parser + + psql mydb < test_parser.sql + +3. Test it + + mydb# SELECT * FROM ts_parse('testparser','That''s my first own parser'); + tokid | token + -------+-------- + 3 | That's + 12 | + 3 | my + 12 | + 3 | first + 12 | + 3 | own + 12 | + 3 | parser + + mydb# SELECT to_tsvector('testcfg','That''s my first own parser'); + to_tsvector + ------------------------------------------------- + 'my':2 'own':4 'first':3 'parser':5 'that''s':1 + + mydb# SELECT ts_headline('testcfg','Supernovae stars are the brightest phenomena in galaxies', to_tsquery('testcfg', 'star')); + headline + ----------------------------------------------------------------- + Supernovae stars are the brightest phenomena in galaxies + +That's all. diff --git a/contrib/test_parser/expected/test_parser.out b/contrib/test_parser/expected/test_parser.out new file mode 100644 index 0000000000..ec4e3b2bb4 --- /dev/null +++ b/contrib/test_parser/expected/test_parser.out @@ -0,0 +1,50 @@ +-- +-- first, define the parser. Turn off echoing so that expected file +-- does not depend on contents of this file. 
+-- +SET client_min_messages = warning; +\set ECHO none +RESET client_min_messages; +-- make test configuration using parser +CREATE TEXT SEARCH CONFIGURATION testcfg (PARSER = testparser); +ALTER TEXT SEARCH CONFIGURATION testcfg ADD MAPPING FOR word WITH simple; +-- ts_parse +SELECT * FROM ts_parse('testparser', 'That''s simple parser can''t parse urls like http://some.url/here/'); + tokid | token +-------+----------------------- + 3 | That's + 12 | + 3 | simple + 12 | + 3 | parser + 12 | + 3 | can't + 12 | + 3 | parse + 12 | + 3 | urls + 12 | + 3 | like + 12 | + 3 | http://some.url/here/ +(15 rows) + +SELECT to_tsvector('testcfg','That''s my first own parser'); + to_tsvector +------------------------------------------------- + 'my':2 'own':4 'first':3 'parser':5 'that''s':1 +(1 row) + +SELECT to_tsquery('testcfg', 'star'); + to_tsquery +------------ + 'star' +(1 row) + +SELECT ts_headline('testcfg','Supernovae stars are the brightest phenomena in galaxies', + to_tsquery('testcfg', 'stars')); + ts_headline +----------------------------------------------------------------- + Supernovae stars are the brightest phenomena in galaxies +(1 row) + diff --git a/contrib/test_parser/sql/test_parser.sql b/contrib/test_parser/sql/test_parser.sql new file mode 100644 index 0000000000..f43d4c7e09 --- /dev/null +++ b/contrib/test_parser/sql/test_parser.sql @@ -0,0 +1,26 @@ +-- +-- first, define the parser. Turn off echoing so that expected file +-- does not depend on contents of this file. 
+--
+SET client_min_messages = warning;
+\set ECHO none
+\i test_parser.sql
+\set ECHO all
+RESET client_min_messages;
+
+-- make test configuration using parser
+
+CREATE TEXT SEARCH CONFIGURATION testcfg (PARSER = testparser);
+
+ALTER TEXT SEARCH CONFIGURATION testcfg ADD MAPPING FOR word WITH simple;
+
+-- ts_parse
+
+SELECT * FROM ts_parse('testparser', 'That''s simple parser can''t parse urls like http://some.url/here/');
+
+SELECT to_tsvector('testcfg','That''s my first own parser');
+
+SELECT to_tsquery('testcfg', 'star');
+
+SELECT ts_headline('testcfg','Supernovae stars are the brightest phenomena in galaxies',
+       to_tsquery('testcfg', 'stars'));
diff --git a/contrib/test_parser/test_parser.c b/contrib/test_parser/test_parser.c
new file mode 100644
index 0000000000..728bf4098f
--- /dev/null
+++ b/contrib/test_parser/test_parser.c
@@ -0,0 +1,151 @@
+/*-------------------------------------------------------------------------
+ *
+ * test_parser.c
+ *	  Simple example of a text search parser
+ *
+ * Copyright (c) 2007, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL: pgsql/contrib/test_parser/test_parser.c,v 1.1 2007/10/15 21:36:50 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "fmgr.h"
+
+PG_MODULE_MAGIC;
+
+
+/*
+ * types
+ */
+
+/* self-defined type */
+typedef struct {
+	char *	buffer;		/* text to parse */
+	int		len;		/* length of the text in buffer */
+	int		pos;		/* position of the parser */
+} ParserState;
+
+/* copy-paste from wparser.h of tsearch2 */
+typedef struct {
+	int		lexid;
+	char   *alias;
+	char   *descr;
+} LexDescr;
+
+/*
+ * prototypes
+ */
+PG_FUNCTION_INFO_V1(testprs_start);
+Datum testprs_start(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(testprs_getlexeme);
+Datum testprs_getlexeme(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(testprs_end);
+Datum testprs_end(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(testprs_lextype);
+Datum testprs_lextype(PG_FUNCTION_ARGS);
+
+/*
+ * functions
+ */
+
+/*
+ * testprs_start
+ *	  Create the parser state for one input text.
+ *
+ * Arguments are a pointer to the text and its length.  The text is not
+ * copied; the state only remembers the pointer.
+ */
+Datum testprs_start(PG_FUNCTION_ARGS)
+{
+	ParserState *pst = (ParserState *) palloc0(sizeof(ParserState));
+	pst->buffer = (char *) PG_GETARG_POINTER(0);
+	pst->len = PG_GETARG_INT32(1);
+	pst->pos = 0;
+
+	PG_RETURN_POINTER(pst);
+}
+
+/*
+ * testprs_getlexeme
+ *	  Return the next token: a run of spaces or a run of non-spaces.
+ *
+ * Arguments are the ParserState, a char ** that receives the token start,
+ * and an int * that receives the token length.  The result is the token
+ * type: 12 for a run of spaces, 3 for a word, or 0 once the end of the
+ * text has been reached.
+ */
+Datum testprs_getlexeme(PG_FUNCTION_ARGS)
+{
+	ParserState *pst = (ParserState *) PG_GETARG_POINTER(0);
+	char	  **t = (char **) PG_GETARG_POINTER(1);
+	int		   *tlen = (int *) PG_GETARG_POINTER(2);
+	int			type;
+
+	*tlen = pst->pos;
+	*t = pst->buffer + pst->pos;
+
+	/* test the position before reading, so buffer[len] is never touched */
+	if (pst->pos < pst->len &&
+		(pst->buffer)[pst->pos] == ' ')
+	{
+		/* blank type */
+		type = 12;
+		/* go to the next non-white-space character */
+		while (pst->pos < pst->len &&
+			   (pst->buffer)[pst->pos] == ' ')
+			(pst->pos)++;
+	} else {
+		/* word type */
+		type = 3;
+		/* go to the next white-space character */
+		while (pst->pos < pst->len &&
+			   (pst->buffer)[pst->pos] != ' ')
+			(pst->pos)++;
+	}
+
+	*tlen = pst->pos - *tlen;
+
+	/* we are finished if (*tlen == 0) */
+	if (*tlen == 0)
+		type = 0;
+
+	PG_RETURN_INT32(type);
+}
+
+/* testprs_end: free the ParserState allocated by testprs_start */
+Datum testprs_end(PG_FUNCTION_ARGS)
+{
+	ParserState *pst = (ParserState *) PG_GETARG_POINTER(0);
+	pfree(pst);
+	PG_RETURN_VOID();
+}
+
+/* testprs_lextype: return the token type list, ended by a lexid of 0 */
+Datum testprs_lextype(PG_FUNCTION_ARGS)
+{
+	/*
+	 * Remarks:
+	 * - we have to return the blanks for headline reason
+	 * - we use the same lexids like Teodor in the default
+	 *   word parser; in this way we can reuse the headline
+	 *   function of the default word parser.
+	 */
+	LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (2+1));
+
+	/* there are only two types in this parser */
+	descr[0].lexid = 3;
+	descr[0].alias = pstrdup("word");
+	descr[0].descr = pstrdup("Word");
+	descr[1].lexid = 12;
+	descr[1].alias = pstrdup("blank");
+	descr[1].descr = pstrdup("Space symbols");
+	descr[2].lexid = 0;
+
+	PG_RETURN_POINTER(descr);
+}
diff --git a/contrib/test_parser/test_parser.sql.in b/contrib/test_parser/test_parser.sql.in
new file mode 100644
index 0000000000..cb5c9a2b03
--- /dev/null
+++ b/contrib/test_parser/test_parser.sql.in
@@ -0,0 +1,36 @@
+-- $PostgreSQL: pgsql/contrib/test_parser/test_parser.sql.in,v 1.1 2007/10/15 21:36:50 tgl Exp $
+
+-- Adjust this setting to control where the objects get created.
+SET search_path = public;
+
+BEGIN;
+
+CREATE FUNCTION testprs_start(internal, int4)
+    RETURNS internal
+    AS 'MODULE_PATHNAME'
+    LANGUAGE C STRICT;
+
+CREATE FUNCTION testprs_getlexeme(internal, internal, internal)
+    RETURNS internal
+    AS 'MODULE_PATHNAME'
+    LANGUAGE C STRICT;
+
+CREATE FUNCTION testprs_end(internal)
+    RETURNS void
+    AS 'MODULE_PATHNAME'
+    LANGUAGE C STRICT;
+
+CREATE FUNCTION testprs_lextype(internal)
+    RETURNS internal
+    AS 'MODULE_PATHNAME'
+    LANGUAGE C STRICT;
+
+CREATE TEXT SEARCH PARSER testparser (
+    START = testprs_start,
+    GETTOKEN = testprs_getlexeme,
+    END = testprs_end,
+    HEADLINE = pg_catalog.prsd_headline,
+    LEXTYPES = testprs_lextype
+);
+
+END;
diff --git a/contrib/test_parser/uninstall_test_parser.sql b/contrib/test_parser/uninstall_test_parser.sql
new file mode 100644
index 0000000000..d194677d6b
--- /dev/null
+++ b/contrib/test_parser/uninstall_test_parser.sql
@@ -0,0 +1,11 @@
+SET search_path = public;
+
+DROP TEXT SEARCH PARSER testparser;
+
+DROP FUNCTION testprs_start(internal, int4);
+
+DROP FUNCTION testprs_getlexeme(internal, internal, internal);
+
+DROP FUNCTION testprs_end(internal);
+
+DROP FUNCTION testprs_lextype(internal);