]> granicus.if.org Git - postgresql/commitdiff
Add sample text search dictionary templates and parsers, to replace the
authorTom Lane <tgl@sss.pgh.pa.us>
Mon, 15 Oct 2007 21:36:50 +0000 (21:36 +0000)
committerTom Lane <tgl@sss.pgh.pa.us>
Mon, 15 Oct 2007 21:36:50 +0000 (21:36 +0000)
hard-to-maintain textual examples currently in the SGML docs.  From
Sergey Karpov.

24 files changed:
contrib/Makefile
contrib/README
contrib/dict_int/Makefile [new file with mode: 0644]
contrib/dict_int/README.dict_int [new file with mode: 0644]
contrib/dict_int/dict_int.c [new file with mode: 0644]
contrib/dict_int/dict_int.sql.in [new file with mode: 0644]
contrib/dict_int/expected/dict_int.out [new file with mode: 0644]
contrib/dict_int/sql/dict_int.sql [new file with mode: 0644]
contrib/dict_int/uninstall_dict_int.sql [new file with mode: 0644]
contrib/dict_xsyn/Makefile [new file with mode: 0644]
contrib/dict_xsyn/README.dict_xsyn [new file with mode: 0644]
contrib/dict_xsyn/dict_xsyn.c [new file with mode: 0644]
contrib/dict_xsyn/dict_xsyn.sql.in [new file with mode: 0644]
contrib/dict_xsyn/expected/dict_xsyn.out [new file with mode: 0644]
contrib/dict_xsyn/sql/dict_xsyn.sql [new file with mode: 0644]
contrib/dict_xsyn/uninstall_dict_xsyn.sql [new file with mode: 0644]
contrib/dict_xsyn/xsyn_sample.rules [new file with mode: 0644]
contrib/test_parser/Makefile [new file with mode: 0644]
contrib/test_parser/README.test_parser [new file with mode: 0644]
contrib/test_parser/expected/test_parser.out [new file with mode: 0644]
contrib/test_parser/sql/test_parser.sql [new file with mode: 0644]
contrib/test_parser/test_parser.c [new file with mode: 0644]
contrib/test_parser/test_parser.sql.in [new file with mode: 0644]
contrib/test_parser/uninstall_test_parser.sql [new file with mode: 0644]

index 3f49645036d5497462e796e2951a56485f138d84..0bd9e65b05a1b6dcb92f201cdc45bb8520362740 100644 (file)
@@ -1,4 +1,4 @@
-# $PostgreSQL: pgsql/contrib/Makefile,v 1.80 2007/10/13 22:59:43 tgl Exp $
+# $PostgreSQL: pgsql/contrib/Makefile,v 1.81 2007/10/15 21:36:49 tgl Exp $
 
 subdir = contrib
 top_builddir = ..
@@ -10,6 +10,8 @@ WANTED_DIRS = \
                chkpass         \
                cube            \
                dblink          \
+               dict_int        \
+               dict_xsyn       \
                earthdistance   \
                fuzzystrmatch   \
                hstore          \
@@ -31,6 +33,7 @@ WANTED_DIRS = \
                seg             \
                spi             \
                tablefunc       \
+               test_parser     \
                vacuumlo
 
 ifeq ($(with_openssl),yes)
index 5b2167ec97edb00b270e328f2448d6775367df07..ac15b85a3e8c31e234ba2be671b844365f138335 100644 (file)
@@ -1,4 +1,3 @@
-
 The PostgreSQL contrib tree
 ---------------------------
 
@@ -29,8 +28,8 @@ adminpack -
        by Dave Page <dpage@vale-housing.co.uk>
 
 btree_gist -
-      Support for emulating BTREE indexing in GiST
-      by Oleg Bartunov <oleg@sai.msu.su> and Teodor Sigaev <teodor@sigaev.ru>
+       Support for emulating BTREE indexing in GiST
+       by Oleg Bartunov <oleg@sai.msu.su> and Teodor Sigaev <teodor@sigaev.ru>
 
 chkpass -
        An auto-encrypted password datatype
@@ -44,8 +43,16 @@ dblink -
        Allows remote query execution
        by Joe Conway <mail@joeconway.com>
 
+dict_int -
+       Text search dictionary template for integers
+       by Sergey Karpov <karpov@sao.ru>
+
+dict_xsyn -
+       Text search dictionary template for extended synonym processing
+       by Sergey Karpov <karpov@sao.ru>
+
 earthdistance -
-       Operator for computing earth distance for two points
+       Operator for computing earth distance between two points
        by Hal Snyder <hal@vailsys.com>
 
 fuzzystrmatch -
@@ -53,8 +60,8 @@ fuzzystrmatch -
        by Joe Conway <mail@joeconway.com>, Joel Burton <jburton@scw.org>
 
 hstore -
-       Hstore - module for storing (key,value) pairs
-    by Oleg Bartunov <oleg@sai.msu.su> and Teodor Sigaev <teodor@sigaev.ru>
+       Module for storing (key, value) pairs
+       by Oleg Bartunov <oleg@sai.msu.su> and Teodor Sigaev <teodor@sigaev.ru>
 
 intagg -
        Integer aggregator
@@ -92,6 +99,10 @@ pg_freespacemap -
        Displays the contents of the free space map (FSM)
        by Mark Kirkwood <markir@paradise.net.nz>
 
+pg_standby -
+       Sample archive_command for warm standby operation
+       by Simon Riggs <simon@2ndquadrant.com>
+
 pg_trgm -
        Functions for determining the similarity of text based on trigram
        matching.
@@ -110,7 +121,7 @@ pgrowlocks -
        by Tatsuo Ishii <ishii@sraoss.co.jp>
 
 pgstattuple -
-       A function to return statistics about "dead" tuples and free
+       Functions to return statistics about "dead" tuples and free
        space within a table
        by Tatsuo Ishii <ishii@sraoss.co.jp>
 
@@ -126,12 +137,16 @@ sslinfo -
        by Victor Wagner <vitus@cryptocom.ru>
 
 start-scripts - 
-       Scripts for starting the server at boot time.
+       Scripts for starting the server at boot time on various platforms.
 
 tablefunc -
        Examples of functions returning tables
        by Joe Conway <mail@joeconway.com>
 
+test_parser -
+       Sample text search parser
+       by Sergey Karpov <karpov@sao.ru>
+
 tsearch2 -
        Full-text-index support using GiST
        by Teodor Sigaev <teodor@sigaev.ru> and Oleg Bartunov
diff --git a/contrib/dict_int/Makefile b/contrib/dict_int/Makefile
new file mode 100644 (file)
index 0000000..4e03a69
--- /dev/null
@@ -0,0 +1,19 @@
+# $PostgreSQL: pgsql/contrib/dict_int/Makefile,v 1.1 2007/10/15 21:36:50 tgl Exp $
+
+MODULE_big = dict_int
+OBJS = dict_int.o
+DATA_built = dict_int.sql
+DATA = uninstall_dict_int.sql
+DOCS = README.dict_int
+REGRESS = dict_int
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/dict_int
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/contrib/dict_int/README.dict_int b/contrib/dict_int/README.dict_int
new file mode 100644 (file)
index 0000000..5883c1c
--- /dev/null
@@ -0,0 +1,41 @@
+Dictionary for integers
+=======================
+
+The motivation for this example dictionary is to control the indexing of
+integers (signed and unsigned), and, consequently, to minimize the number of
+unique words, which greatly affects the performance of searching.
+
+* Configuration
+
+The dictionary accepts two options: 
+
+  - The MAXLEN parameter specifies the maximum length (number of digits)
+    allowed in an integer word.  The default value is 6.
+
+  - The REJECTLONG parameter specifies if an overlength integer should be
+    truncated or ignored. If REJECTLONG=FALSE (default), the dictionary returns
+    the first MAXLEN digits of the integer. If REJECTLONG=TRUE, the
+    dictionary treats an overlength integer as a stop word, so that it will
+    not be indexed.
+
+* Usage
+
+1. Compile and install
+
+2. Load dictionary
+
+   psql mydb < dict_int.sql
+
+3. Test it
+   mydb# select ts_lexize('intdict', '12345678');
+    ts_lexize
+   -----------
+    {123456}
+
+4. Change its options as you wish
+
+   mydb# ALTER TEXT SEARCH DICTIONARY intdict (MAXLEN = 4, REJECTLONG = true);
+   ALTER TEXT SEARCH DICTIONARY
+
+That's all.
diff --git a/contrib/dict_int/dict_int.c b/contrib/dict_int/dict_int.c
new file mode 100644 (file)
index 0000000..85d4549
--- /dev/null
@@ -0,0 +1,99 @@
+/*-------------------------------------------------------------------------
+ *
+ * dict_int.c
+ *       Text search dictionary for integers
+ *
+ * Copyright (c) 2007, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *       $PostgreSQL: pgsql/contrib/dict_int/dict_int.c,v 1.1 2007/10/15 21:36:50 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "commands/defrem.h"
+#include "fmgr.h"
+#include "tsearch/ts_public.h"
+
+PG_MODULE_MAGIC;
+
+
+/* Per-dictionary state built by dintdict_init and read by dintdict_lexize. */
+typedef struct {
+       int     maxlen;         /* MAXLEN option: longest token length kept (default 6) */
+       bool    rejectlong;     /* REJECTLONG option: drop overlength tokens instead of trimming them */
+} DictInt;
+
+
+PG_FUNCTION_INFO_V1(dintdict_init);
+Datum dintdict_init(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(dintdict_lexize);
+Datum dintdict_lexize(PG_FUNCTION_ARGS);
+
+/*
+ * dintdict_init - build the dictionary state from its option list.
+ *
+ * Argument 0 is a List of DefElem options.  Recognized options (matched
+ * case-insensitively) are:
+ *     MAXLEN          maximum token length to keep, must be >= 1 (default 6)
+ *     REJECTLONG      boolean; if true, overlength tokens are rejected
+ *                     instead of being trimmed (default false)
+ *
+ * Returns a palloc'd DictInt.  Raises ERROR for an unknown option or for a
+ * MAXLEN smaller than 1.
+ */
+Datum
+dintdict_init(PG_FUNCTION_ARGS)
+{
+       List            *dictoptions = (List *) PG_GETARG_POINTER(0);
+       DictInt         *d;
+       ListCell        *l;
+
+       d = (DictInt *) palloc0(sizeof(DictInt));
+       d->maxlen = 6;
+       d->rejectlong = false;
+
+       foreach(l, dictoptions)
+       {
+               DefElem *defel = (DefElem *) lfirst(l);
+
+               if (pg_strcasecmp(defel->defname, "MAXLEN") == 0)
+               {
+                       d->maxlen = atoi(defGetString(defel));
+
+                       /*
+                        * dintdict_lexize stores a terminator at txt[maxlen]; a
+                        * negative value would write before the buffer and zero
+                        * would produce empty lexemes, so refuse both here.
+                        */
+                       if (d->maxlen < 1)
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                                errmsg("maxlen value has to be >= 1")));
+               }
+               else if (pg_strcasecmp(defel->defname, "REJECTLONG") == 0)
+               {
+                       d->rejectlong = defGetBoolean(defel);
+               }
+               else
+               {
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                        errmsg("unrecognized intdict parameter: \"%s\"",
+                                                       defel->defname)));
+               }
+       }
+
+       PG_RETURN_POINTER(d);
+}
+
+/*
+ * dintdict_lexize - lexize callback of the integer dictionary.
+ *
+ * Argument 0 is the DictInt built by dintdict_init, argument 1 points to the
+ * input token and argument 2 is its length; only that many bytes of the
+ * token are read.
+ *
+ * Returns a palloc'd TSLexeme array terminated by an entry whose lexeme is
+ * NULL:
+ *   - token no longer than maxlen:    one entry holding a copy of the token
+ *   - longer token, rejectlong false: one entry holding its first maxlen bytes
+ *   - longer token, rejectlong true:  no entries (only the terminator)
+ */
+Datum
+dintdict_lexize(PG_FUNCTION_ARGS)
+{
+       DictInt    *d = (DictInt *) PG_GETARG_POINTER(0);
+       char       *in = (char *) PG_GETARG_POINTER(1);
+       int         len = PG_GETARG_INT32(2);
+       char       *txt = pnstrdup(in, len);
+       TSLexeme   *res;
+
+       /*
+        * Zero-fill the result so that every field this function does not set
+        * explicitly has a defined value instead of leftover memory contents.
+        */
+       res = (TSLexeme *) palloc0(sizeof(TSLexeme) * 2);
+       res[1].lexeme = NULL;
+
+       if (len > d->maxlen)
+       {
+               if (d->rejectlong)
+               {
+                       /* reject by returning void array */
+                       pfree(txt);
+                       res[0].lexeme = NULL;
+               }
+               else
+               {
+                       /* trim integer */
+                       txt[d->maxlen] = '\0';
+                       res[0].lexeme = txt;
+               }
+       }
+       else
+       {
+               res[0].lexeme = txt;
+       }
+
+       PG_RETURN_POINTER(res);
+}
diff --git a/contrib/dict_int/dict_int.sql.in b/contrib/dict_int/dict_int.sql.in
new file mode 100644 (file)
index 0000000..0bd97a8
--- /dev/null
@@ -0,0 +1,29 @@
+-- $PostgreSQL: pgsql/contrib/dict_int/dict_int.sql.in,v 1.1 2007/10/15 21:36:50 tgl Exp $
+
+-- Adjust this setting to control where the objects get created.
+SET search_path = public;
+
+BEGIN;
+
+CREATE FUNCTION dintdict_init(internal)
+        RETURNS internal
+        AS 'MODULE_PATHNAME'
+        LANGUAGE C STRICT;
+
+CREATE FUNCTION dintdict_lexize(internal, internal, internal, internal)
+        RETURNS internal
+        AS 'MODULE_PATHNAME'
+        LANGUAGE C STRICT;
+
+CREATE TEXT SEARCH TEMPLATE intdict_template (
+        LEXIZE = dintdict_lexize,
+       INIT   = dintdict_init
+);
+
+CREATE TEXT SEARCH DICTIONARY intdict (
+       TEMPLATE = intdict_template
+);
+
+COMMENT ON TEXT SEARCH DICTIONARY intdict IS 'dictionary for integers';
+
+END;
diff --git a/contrib/dict_int/expected/dict_int.out b/contrib/dict_int/expected/dict_int.out
new file mode 100644 (file)
index 0000000..7feb493
--- /dev/null
@@ -0,0 +1,308 @@
+--
+-- first, define the datatype.  Turn off echoing so that expected file
+-- does not depend on contents of this file.
+--
+SET client_min_messages = warning;
+\set ECHO none
+RESET client_min_messages;
+--lexize
+select ts_lexize('intdict', '511673');
+ ts_lexize 
+-----------
+ {511673}
+(1 row)
+
+select ts_lexize('intdict', '129');
+ ts_lexize 
+-----------
+ {129}
+(1 row)
+
+select ts_lexize('intdict', '40865854');
+ ts_lexize 
+-----------
+ {408658}
+(1 row)
+
+select ts_lexize('intdict', '952');
+ ts_lexize 
+-----------
+ {952}
+(1 row)
+
+select ts_lexize('intdict', '654980341');
+ ts_lexize 
+-----------
+ {654980}
+(1 row)
+
+select ts_lexize('intdict', '09810106');
+ ts_lexize 
+-----------
+ {098101}
+(1 row)
+
+select ts_lexize('intdict', '14262713');
+ ts_lexize 
+-----------
+ {142627}
+(1 row)
+
+select ts_lexize('intdict', '6532082986');
+ ts_lexize 
+-----------
+ {653208}
+(1 row)
+
+select ts_lexize('intdict', '0150061');
+ ts_lexize 
+-----------
+ {015006}
+(1 row)
+
+select ts_lexize('intdict', '7778');
+ ts_lexize 
+-----------
+ {7778}
+(1 row)
+
+select ts_lexize('intdict', '9547');
+ ts_lexize 
+-----------
+ {9547}
+(1 row)
+
+select ts_lexize('intdict', '753395478');
+ ts_lexize 
+-----------
+ {753395}
+(1 row)
+
+select ts_lexize('intdict', '647652');
+ ts_lexize 
+-----------
+ {647652}
+(1 row)
+
+select ts_lexize('intdict', '6988655574');
+ ts_lexize 
+-----------
+ {698865}
+(1 row)
+
+select ts_lexize('intdict', '1279');
+ ts_lexize 
+-----------
+ {1279}
+(1 row)
+
+select ts_lexize('intdict', '1266645909');
+ ts_lexize 
+-----------
+ {126664}
+(1 row)
+
+select ts_lexize('intdict', '7594193969');
+ ts_lexize 
+-----------
+ {759419}
+(1 row)
+
+select ts_lexize('intdict', '16928207');
+ ts_lexize 
+-----------
+ {169282}
+(1 row)
+
+select ts_lexize('intdict', '196850350328');
+ ts_lexize 
+-----------
+ {196850}
+(1 row)
+
+select ts_lexize('intdict', '22026985592');
+ ts_lexize 
+-----------
+ {220269}
+(1 row)
+
+select ts_lexize('intdict', '2063765');
+ ts_lexize 
+-----------
+ {206376}
+(1 row)
+
+select ts_lexize('intdict', '242387310');
+ ts_lexize 
+-----------
+ {242387}
+(1 row)
+
+select ts_lexize('intdict', '93595');
+ ts_lexize 
+-----------
+ {93595}
+(1 row)
+
+select ts_lexize('intdict', '9374');
+ ts_lexize 
+-----------
+ {9374}
+(1 row)
+
+select ts_lexize('intdict', '996969');
+ ts_lexize 
+-----------
+ {996969}
+(1 row)
+
+select ts_lexize('intdict', '353595982');
+ ts_lexize 
+-----------
+ {353595}
+(1 row)
+
+select ts_lexize('intdict', '925860');
+ ts_lexize 
+-----------
+ {925860}
+(1 row)
+
+select ts_lexize('intdict', '11848378337');
+ ts_lexize 
+-----------
+ {118483}
+(1 row)
+
+select ts_lexize('intdict', '333');
+ ts_lexize 
+-----------
+ {333}
+(1 row)
+
+select ts_lexize('intdict', '799287416765');
+ ts_lexize 
+-----------
+ {799287}
+(1 row)
+
+select ts_lexize('intdict', '745939');
+ ts_lexize 
+-----------
+ {745939}
+(1 row)
+
+select ts_lexize('intdict', '67601305734');
+ ts_lexize 
+-----------
+ {676013}
+(1 row)
+
+select ts_lexize('intdict', '3361113');
+ ts_lexize 
+-----------
+ {336111}
+(1 row)
+
+select ts_lexize('intdict', '9033778607');
+ ts_lexize 
+-----------
+ {903377}
+(1 row)
+
+select ts_lexize('intdict', '7507648');
+ ts_lexize 
+-----------
+ {750764}
+(1 row)
+
+select ts_lexize('intdict', '1166');
+ ts_lexize 
+-----------
+ {1166}
+(1 row)
+
+select ts_lexize('intdict', '9360498');
+ ts_lexize 
+-----------
+ {936049}
+(1 row)
+
+select ts_lexize('intdict', '917795');
+ ts_lexize 
+-----------
+ {917795}
+(1 row)
+
+select ts_lexize('intdict', '9387894');
+ ts_lexize 
+-----------
+ {938789}
+(1 row)
+
+select ts_lexize('intdict', '42764329');
+ ts_lexize 
+-----------
+ {427643}
+(1 row)
+
+select ts_lexize('intdict', '564062');
+ ts_lexize 
+-----------
+ {564062}
+(1 row)
+
+select ts_lexize('intdict', '5413377');
+ ts_lexize 
+-----------
+ {541337}
+(1 row)
+
+select ts_lexize('intdict', '060965');
+ ts_lexize 
+-----------
+ {060965}
+(1 row)
+
+select ts_lexize('intdict', '08273593');
+ ts_lexize 
+-----------
+ {082735}
+(1 row)
+
+select ts_lexize('intdict', '593556010144');
+ ts_lexize 
+-----------
+ {593556}
+(1 row)
+
+select ts_lexize('intdict', '17988843352');
+ ts_lexize 
+-----------
+ {179888}
+(1 row)
+
+select ts_lexize('intdict', '252281774');
+ ts_lexize 
+-----------
+ {252281}
+(1 row)
+
+select ts_lexize('intdict', '313425');
+ ts_lexize 
+-----------
+ {313425}
+(1 row)
+
+select ts_lexize('intdict', '641439323669');
+ ts_lexize 
+-----------
+ {641439}
+(1 row)
+
+select ts_lexize('intdict', '314532610153');
+ ts_lexize 
+-----------
+ {314532}
+(1 row)
+
diff --git a/contrib/dict_int/sql/dict_int.sql b/contrib/dict_int/sql/dict_int.sql
new file mode 100644 (file)
index 0000000..3a335f8
--- /dev/null
@@ -0,0 +1,61 @@
+--
+-- first, define the datatype.  Turn off echoing so that expected file
+-- does not depend on contents of this file.
+--
+SET client_min_messages = warning;
+\set ECHO none
+\i dict_int.sql
+\set ECHO all
+RESET client_min_messages;
+
+--lexize
+select ts_lexize('intdict', '511673');
+select ts_lexize('intdict', '129');
+select ts_lexize('intdict', '40865854');
+select ts_lexize('intdict', '952');
+select ts_lexize('intdict', '654980341');
+select ts_lexize('intdict', '09810106');
+select ts_lexize('intdict', '14262713');
+select ts_lexize('intdict', '6532082986');
+select ts_lexize('intdict', '0150061');
+select ts_lexize('intdict', '7778');
+select ts_lexize('intdict', '9547');
+select ts_lexize('intdict', '753395478');
+select ts_lexize('intdict', '647652');
+select ts_lexize('intdict', '6988655574');
+select ts_lexize('intdict', '1279');
+select ts_lexize('intdict', '1266645909');
+select ts_lexize('intdict', '7594193969');
+select ts_lexize('intdict', '16928207');
+select ts_lexize('intdict', '196850350328');
+select ts_lexize('intdict', '22026985592');
+select ts_lexize('intdict', '2063765');
+select ts_lexize('intdict', '242387310');
+select ts_lexize('intdict', '93595');
+select ts_lexize('intdict', '9374');
+select ts_lexize('intdict', '996969');
+select ts_lexize('intdict', '353595982');
+select ts_lexize('intdict', '925860');
+select ts_lexize('intdict', '11848378337');
+select ts_lexize('intdict', '333');
+select ts_lexize('intdict', '799287416765');
+select ts_lexize('intdict', '745939');
+select ts_lexize('intdict', '67601305734');
+select ts_lexize('intdict', '3361113');
+select ts_lexize('intdict', '9033778607');
+select ts_lexize('intdict', '7507648');
+select ts_lexize('intdict', '1166');
+select ts_lexize('intdict', '9360498');
+select ts_lexize('intdict', '917795');
+select ts_lexize('intdict', '9387894');
+select ts_lexize('intdict', '42764329');
+select ts_lexize('intdict', '564062');
+select ts_lexize('intdict', '5413377');
+select ts_lexize('intdict', '060965');
+select ts_lexize('intdict', '08273593');
+select ts_lexize('intdict', '593556010144');
+select ts_lexize('intdict', '17988843352');
+select ts_lexize('intdict', '252281774');
+select ts_lexize('intdict', '313425');
+select ts_lexize('intdict', '641439323669');
+select ts_lexize('intdict', '314532610153');
diff --git a/contrib/dict_int/uninstall_dict_int.sql b/contrib/dict_int/uninstall_dict_int.sql
new file mode 100644 (file)
index 0000000..0323ab2
--- /dev/null
@@ -0,0 +1,9 @@
+SET search_path = public;
+
+DROP TEXT SEARCH DICTIONARY intdict;
+
+DROP TEXT SEARCH TEMPLATE intdict_template;
+
+DROP FUNCTION dintdict_init(internal);
+
+DROP FUNCTION dintdict_lexize(internal,internal,internal,internal);
diff --git a/contrib/dict_xsyn/Makefile b/contrib/dict_xsyn/Makefile
new file mode 100644 (file)
index 0000000..563f039
--- /dev/null
@@ -0,0 +1,38 @@
+# $PostgreSQL: pgsql/contrib/dict_xsyn/Makefile,v 1.1 2007/10/15 21:36:50 tgl Exp $
+
+MODULE_big = dict_xsyn
+OBJS = dict_xsyn.o
+DATA_built = dict_xsyn.sql
+DATA = uninstall_dict_xsyn.sql
+DOCS = README.dict_xsyn
+REGRESS = dict_xsyn
+
+DICTDIR = tsearch_data
+DICTFILES = xsyn_sample.rules
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/dict_xsyn
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
+
+install: install-data
+
+.PHONY: install-data
+install-data: $(DICTFILES)
+       for i in $(DICTFILES); \
+               do $(INSTALL_DATA) $(srcdir)/$$i '$(DESTDIR)$(datadir)/$(DICTDIR)/'$$i; \
+       done
+
+uninstall: uninstall-data
+
+.PHONY: uninstall-data
+uninstall-data:
+       for i in $(DICTFILES); \
+               do rm -rf '$(DESTDIR)$(datadir)/$(DICTDIR)/'$$i ; \
+       done
diff --git a/contrib/dict_xsyn/README.dict_xsyn b/contrib/dict_xsyn/README.dict_xsyn
new file mode 100644 (file)
index 0000000..9565eef
--- /dev/null
@@ -0,0 +1,52 @@
+Extended Synonym dictionary
+===========================
+
+This is a simple synonym dictionary. It replaces words with groups of their
+synonyms, and so makes it possible to search for a word using any of its
+synonyms.
+
+* Configuration
+
+It accepts the following options:
+ - KEEPORIG controls whether the original word is included, or only its
+   synonyms. Default is 'true'.
+
+ - RULES is the base name of the file containing the list of synonyms.
+   This file must be in $(prefix)/share/tsearch_data/, and its name must
+   end in ".rules" (which is not included in the RULES parameter).
+
+The rules file has the following format:
+
+ - Each line represents a group of synonyms for a single word, which is
+   given first on the line. Synonyms are separated by whitespace:
+   
+   word syn1 syn2 syn3
+
+ - Sharp ('#') sign is a comment delimiter. It may appear at any position
+   inside the line.  The rest of the line will be skipped.
+
+Look at xsyn_sample.rules, which is installed in $(prefix)/share/tsearch_data/,
+for an example.
+
+* Usage
+
+1. Compile and install
+
+2. Load dictionary
+
+   psql mydb < dict_xsyn.sql
+
+3. Test it
+   mydb=# SELECT ts_lexize('xsyn','word');
+   ts_lexize
+   ----------------
+   {word,syn1,syn2,syn3}
+
+4. Change the dictionary options as you wish
+
+   mydb# ALTER TEXT SEARCH DICTIONARY xsyn (KEEPORIG=false);
+   ALTER TEXT SEARCH DICTIONARY
+
+That's all.
diff --git a/contrib/dict_xsyn/dict_xsyn.c b/contrib/dict_xsyn/dict_xsyn.c
new file mode 100644 (file)
index 0000000..1cd53a2
--- /dev/null
@@ -0,0 +1,235 @@
+/*-------------------------------------------------------------------------
+ *
+ * dict_xsyn.c
+ *       Extended synonym dictionary
+ *
+ * Copyright (c) 2007, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *       $PostgreSQL: pgsql/contrib/dict_xsyn/dict_xsyn.c,v 1.1 2007/10/15 21:36:50 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <ctype.h>
+
+#include "commands/defrem.h"
+#include "fmgr.h"
+#include "storage/fd.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_utils.h"
+
+PG_MODULE_MAGIC;
+
+/* One rule of the synonym file, as stored by read_dictionary. */
+typedef struct
+{
+       char *key; /* Word */
+       char *value; /* Unparsed list of synonyms, including the word itself */
+}      Syn;
+
+/* Per-dictionary state built by dxsyn_init and read by dxsyn_lexize. */
+typedef struct
+{
+       /*
+        * Number of entries in syn.  While read_dictionary is loading a file this
+        * temporarily holds the allocated capacity of syn instead.
+        */
+       int len;
+       /* Rules, sorted by key once loading is finished; NULL until allocated */
+       Syn *syn;
+
+       /* KEEPORIG option: include the looked-up word itself in the result */
+       bool keeporig;
+}      DictSyn;
+
+
+PG_FUNCTION_INFO_V1(dxsyn_init);
+Datum dxsyn_init(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(dxsyn_lexize);
+Datum dxsyn_lexize(PG_FUNCTION_ARGS);
+
+/*
+ * Locate the next whitespace-delimited word in "in".
+ *
+ * Returns a pointer to the first character of the word and stores in *end a
+ * pointer to the character just after it.  If only whitespace remains, or
+ * the next word begins with '#', returns NULL and sets *end to NULL.  The
+ * string is walked one multibyte character at a time and is not modified.
+ */
+static char *
+find_word(char *in, char **end)
+{
+       char *word;
+
+       *end = NULL;
+
+       /* step over leading whitespace */
+       for (; *in != '\0' && t_isspace(in); in += pg_mblen(in))
+               continue;
+
+       /* nothing left, or the rest of the line is a comment */
+       if (*in == '\0' || *in == '#')
+               return NULL;
+
+       /* remember the start, then advance to the end of the word */
+       for (word = in; *in != '\0' && !t_isspace(in); in += pg_mblen(in))
+               continue;
+
+       *end = in;
+       return word;
+}
+
+/*
+ * qsort/bsearch comparator: orders two Syn entries by strcmp() of their keys.
+ */
+static int
+compare_syn(const void *a, const void *b)
+{
+       const Syn *lhs = (const Syn *) a;
+       const Syn *rhs = (const Syn *) b;
+
+       return strcmp(lhs->key, rhs->key);
+}
+
+/*
+ * Load the rules file named by "filename" (extension "rules") into d.
+ *
+ * Each line is passed through lowerstr(); its first word becomes the key and
+ * the whole converted line is kept as the value.  Lines on which find_word
+ * finds no word are skipped.  On return d->syn holds the entries sorted by
+ * key and d->len is their count.  Raises ERROR if the file cannot be opened.
+ */
+static void
+read_dictionary(DictSyn *d, char *filename)
+{
+       char *path = get_tsearch_config_filename(filename, "rules");
+       FILE *fp = AllocateFile(path, "r");
+       char *raw;
+       int nused = 0;
+
+       if (fp == NULL)
+               ereport(ERROR,
+                               (errcode(ERRCODE_CONFIG_FILE_ERROR),
+                                errmsg("could not open synonym file \"%s\": %m",
+                                               path)));
+
+       for (raw = t_readline(fp); raw != NULL; raw = t_readline(fp))
+       {
+               char *lowered;
+               char *word;
+               char *word_end = NULL;
+
+               if (*raw == '\0')
+                       continue;
+
+               lowered = lowerstr(raw);
+               pfree(raw);
+
+               word = find_word(lowered, &word_end);
+               if (word == NULL)
+               {
+                       /* blank or comment-only line */
+                       pfree(lowered);
+                       continue;
+               }
+
+               /* out of room: start at 16 slots, then double */
+               if (nused == d->len)
+               {
+                       if (d->len > 0)
+                               d->len *= 2;
+                       else
+                               d->len = 16;
+
+                       if (d->syn == NULL)
+                               d->syn = (Syn *) palloc(sizeof(Syn) * d->len);
+                       else
+                               d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len);
+               }
+
+               d->syn[nused].key = pnstrdup(word, word_end - word);
+               d->syn[nused].value = lowered;
+               nused++;
+       }
+
+       FreeFile(fp);
+
+       /* from here on len is the entry count, not the capacity */
+       d->len = nused;
+       if (nused > 1)
+               qsort(d->syn, d->len, sizeof(Syn), compare_syn);
+
+       pfree(path);
+}
+
+/*
+ * dxsyn_init - build the dictionary state from its option list.
+ *
+ * Argument 0 is a List of DefElem options.  Recognized options (matched
+ * case-insensitively) are:
+ *     KEEPORIG        boolean, stored in keeporig (default true)
+ *     RULES           base name of the rules file, loaded via read_dictionary
+ *
+ * Returns a palloc'd DictSyn.  Raises ERROR for an unknown option.
+ */
+Datum
+dxsyn_init(PG_FUNCTION_ARGS)
+{
+       List *options = (List *) PG_GETARG_POINTER(0);
+       DictSyn *dict = (DictSyn *) palloc0(sizeof(DictSyn));
+       ListCell *cell;
+
+       /* defaults: no rules loaded, original word kept */
+       dict->len = 0;
+       dict->syn = NULL;
+       dict->keeporig = true;
+
+       foreach(cell, options)
+       {
+               DefElem *opt = (DefElem *) lfirst(cell);
+               const char *name = opt->defname;
+
+               if (pg_strcasecmp(name, "KEEPORIG") == 0)
+                       dict->keeporig = defGetBoolean(opt);
+               else if (pg_strcasecmp(name, "RULES") == 0)
+                       read_dictionary(dict, defGetString(opt));
+               else
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                        errmsg("unrecognized xsyn parameter: \"%s\"",
+                                                       name)));
+       }
+
+       PG_RETURN_POINTER(dict);
+}
+
+/*
+ * dxsyn_lexize - lexize callback of the extended synonym dictionary.
+ *
+ * Argument 0 is the DictSyn built by dxsyn_init, argument 1 points to the
+ * input token and argument 2 is its length; only that many bytes are read.
+ *
+ * The token is passed through lowerstr() and looked up among the sorted rule
+ * keys.  If the token is empty, no rules are loaded, or there is no match,
+ * NULL is returned.  Otherwise the matching rule line is split into words
+ * and returned as a palloc'd TSLexeme array terminated by an entry whose
+ * lexeme is NULL.  The first word of the line (the key itself) is included
+ * only when keeporig is set.
+ */
+Datum
+dxsyn_lexize(PG_FUNCTION_ARGS)
+{
+       DictSyn *d = (DictSyn *) PG_GETARG_POINTER(0);
+       char *in = (char *) PG_GETARG_POINTER(1);
+       int length = PG_GETARG_INT32(2);
+       Syn word;
+       Syn *found;
+       TSLexeme *res;
+       char *temp;
+       char *value;
+       char *value_end;
+       char *pos;
+       int nsyns = 0;
+       bool is_first = true;
+
+       if (!length || d->len == 0)
+               PG_RETURN_POINTER(NULL);
+
+       /* Create search pattern */
+       temp = pnstrdup(in, length);
+       word.key = lowerstr(temp);
+       pfree(temp);
+       word.value = NULL;
+
+       /* Look for matching syn */
+       found = (Syn *) bsearch(&word, d->syn, d->len, sizeof(Syn), compare_syn);
+       pfree(word.key);
+
+       if (!found)
+               PG_RETURN_POINTER(NULL);
+
+       /* Work on a private copy: each word is NUL-terminated in place below */
+       value = pstrdup(found->value);
+       value_end = value + strlen(value);
+       pos = value;
+
+       /* Start with just the terminating entry, fully zeroed */
+       res = (TSLexeme *) palloc0(sizeof(TSLexeme));
+
+       /* Parse string of synonyms and return array of words */
+       while (pos < value_end)
+       {
+               char *end;
+               char *syn = find_word(pos, &end);
+
+               if (!syn)
+                       break;
+               *end = '\0';
+
+               /* first word is added to result only if KEEPORIG flag is set */
+               if (d->keeporig || !is_first)
+               {
+                       res = (TSLexeme *) repalloc(res, sizeof(TSLexeme) * (nsyns + 2));
+
+                       /*
+                        * Zero the new entry and the new terminator so that fields
+                        * other than lexeme are never handed back uninitialized.
+                        */
+                       memset(&res[nsyns], 0, sizeof(TSLexeme) * 2);
+                       res[nsyns].lexeme = pstrdup(syn);
+
+                       nsyns++;
+               }
+
+               is_first = false;
+
+               pos = end + 1;
+       }
+
+       pfree(value);
+
+       PG_RETURN_POINTER(res);
+}
diff --git a/contrib/dict_xsyn/dict_xsyn.sql.in b/contrib/dict_xsyn/dict_xsyn.sql.in
new file mode 100644 (file)
index 0000000..0e5755e
--- /dev/null
@@ -0,0 +1,29 @@
+-- $PostgreSQL: pgsql/contrib/dict_xsyn/dict_xsyn.sql.in,v 1.1 2007/10/15 21:36:50 tgl Exp $
+
+-- Adjust this setting to control where the objects get created.
+SET search_path = public;
+
+BEGIN;
+
+CREATE FUNCTION dxsyn_init(internal)
+        RETURNS internal
+        AS 'MODULE_PATHNAME'
+        LANGUAGE C STRICT;
+
+CREATE FUNCTION dxsyn_lexize(internal, internal, internal, internal)
+        RETURNS internal
+        AS 'MODULE_PATHNAME'
+        LANGUAGE C STRICT;
+
+CREATE TEXT SEARCH TEMPLATE xsyn_template (
+        LEXIZE = dxsyn_lexize,
+       INIT   = dxsyn_init
+);
+
+CREATE TEXT SEARCH DICTIONARY xsyn (
+       TEMPLATE = xsyn_template
+);
+
+COMMENT ON TEXT SEARCH DICTIONARY xsyn IS 'eXtended synonym dictionary';
+
+END;
diff --git a/contrib/dict_xsyn/expected/dict_xsyn.out b/contrib/dict_xsyn/expected/dict_xsyn.out
new file mode 100644 (file)
index 0000000..99071ea
--- /dev/null
@@ -0,0 +1,22 @@
+--
+-- first, define the datatype.  Turn off echoing so that expected file
+-- does not depend on contents of this file.
+--
+SET client_min_messages = warning;
+\set ECHO none
+RESET client_min_messages;
+--configuration
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false);
+--lexize
+SELECT ts_lexize('xsyn', 'supernova');
+   ts_lexize    
+----------------
+ {sn,sne,1987a}
+(1 row)
+
+SELECT ts_lexize('xsyn', 'grb');
+ ts_lexize 
+-----------
+(1 row)
+
diff --git a/contrib/dict_xsyn/sql/dict_xsyn.sql b/contrib/dict_xsyn/sql/dict_xsyn.sql
new file mode 100644 (file)
index 0000000..17f6df9
--- /dev/null
@@ -0,0 +1,16 @@
+--
+-- first, define the datatype.  Turn off echoing so that expected file
+-- does not depend on contents of this file.
+--
+SET client_min_messages = warning;
+\set ECHO none
+\i dict_xsyn.sql
+\set ECHO all
+RESET client_min_messages;
+
+--configuration
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false);
+
+--lexize
+SELECT ts_lexize('xsyn', 'supernova');
+SELECT ts_lexize('xsyn', 'grb');
diff --git a/contrib/dict_xsyn/uninstall_dict_xsyn.sql b/contrib/dict_xsyn/uninstall_dict_xsyn.sql
new file mode 100644 (file)
index 0000000..7b7acea
--- /dev/null
@@ -0,0 +1,9 @@
+SET search_path = public;
+
+DROP TEXT SEARCH DICTIONARY xsyn;
+
+DROP TEXT SEARCH TEMPLATE xsyn_template;
+
+DROP FUNCTION dxsyn_init(internal);
+
+DROP FUNCTION dxsyn_lexize(internal,internal,internal,internal);
diff --git a/contrib/dict_xsyn/xsyn_sample.rules b/contrib/dict_xsyn/xsyn_sample.rules
new file mode 100644 (file)
index 0000000..203bec7
--- /dev/null
@@ -0,0 +1,6 @@
+# Sample rules file for eXtended Synonym (xsyn) dictionary
+# format is as follows:
+#
+# word synonym1 synonym2 ...
+#
+supernova sn sne 1987a
diff --git a/contrib/test_parser/Makefile b/contrib/test_parser/Makefile
new file mode 100644 (file)
index 0000000..1267a6e
--- /dev/null
@@ -0,0 +1,19 @@
+# $PostgreSQL: pgsql/contrib/test_parser/Makefile,v 1.1 2007/10/15 21:36:50 tgl Exp $
+
+MODULE_big = test_parser
+OBJS = test_parser.o
+DATA_built = test_parser.sql
+DATA = uninstall_test_parser.sql
+DOCS = README.test_parser
+REGRESS = test_parser
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/test_parser
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/contrib/test_parser/README.test_parser b/contrib/test_parser/README.test_parser
new file mode 100644 (file)
index 0000000..d8ca90a
--- /dev/null
@@ -0,0 +1,52 @@
+Example parser
+==============
+
+This is an example of a custom parser for full text search.
+
+It recognizes space-delimited words and returns only two token types:
+
+ - 3,  word,  Word
+
+ - 12, blank, Space symbols
+
+The token numbers have been chosen to keep compatibility with the default
+ts_headline() function, since we do not want to implement our own version.
+
+* Configuration
+
+The parser has no user-configurable parameters.
+
+* Usage
+
+1. Compile and install
+
+2. Load the parser
+
+   psql mydb < test_parser.sql
+
+3. Test it
+
+   mydb# SELECT * FROM ts_parse('testparser','That''s my first own parser');
+    tokid | token
+   -------+--------
+        3 | That's
+       12 |
+        3 | my
+       12 |
+        3 | first
+       12 |
+        3 | own
+       12 |
+        3 | parser
+
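+   The remaining examples use a configuration built on this parser:
+
+   mydb# CREATE TEXT SEARCH CONFIGURATION testcfg (PARSER = testparser);
+   mydb# ALTER TEXT SEARCH CONFIGURATION testcfg ADD MAPPING FOR word WITH simple;
+
+   mydb# SELECT to_tsvector('testcfg','That''s my first own parser');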
+   to_tsvector
+   -------------------------------------------------
+   'my':2 'own':4 'first':3 'parser':5 'that''s':1
+   
+   mydb# SELECT ts_headline('testcfg','Supernovae stars are the brightest phenomena in galaxies', to_tsquery('testcfg', 'stars'));
+   ts_headline
+   -----------------------------------------------------------------
+   Supernovae <b>stars</b> are the brightest phenomena in galaxies
+   
+That's all.
diff --git a/contrib/test_parser/expected/test_parser.out b/contrib/test_parser/expected/test_parser.out
new file mode 100644 (file)
index 0000000..ec4e3b2
--- /dev/null
@@ -0,0 +1,50 @@
+--
+-- first, define the parser.  Turn off echoing so that expected file
+-- does not depend on contents of this file.
+--
+SET client_min_messages = warning;
+\set ECHO none
+RESET client_min_messages;
+-- make test configuration using parser
+CREATE TEXT SEARCH CONFIGURATION testcfg (PARSER = testparser);
+ALTER TEXT SEARCH CONFIGURATION testcfg ADD MAPPING FOR word WITH simple;
+-- ts_parse
+SELECT * FROM ts_parse('testparser', 'That''s simple parser can''t parse urls like http://some.url/here/');
+ tokid |         token         
+-------+-----------------------
+     3 | That's
+    12 |  
+     3 | simple
+    12 |  
+     3 | parser
+    12 |  
+     3 | can't
+    12 |  
+     3 | parse
+    12 |  
+     3 | urls
+    12 |  
+     3 | like
+    12 |  
+     3 | http://some.url/here/
+(15 rows)
+
+SELECT to_tsvector('testcfg','That''s my first own parser');
+                   to_tsvector                   
+-------------------------------------------------
+ 'my':2 'own':4 'first':3 'parser':5 'that''s':1
+(1 row)
+
+SELECT to_tsquery('testcfg', 'star');
+ to_tsquery 
+------------
+ 'star'
+(1 row)
+
+SELECT ts_headline('testcfg','Supernovae stars are the brightest phenomena in galaxies', 
+       to_tsquery('testcfg', 'stars'));
+                           ts_headline                           
+-----------------------------------------------------------------
+ Supernovae <b>stars</b> are the brightest phenomena in galaxies
+(1 row)
+
diff --git a/contrib/test_parser/sql/test_parser.sql b/contrib/test_parser/sql/test_parser.sql
new file mode 100644 (file)
index 0000000..f43d4c7
--- /dev/null
@@ -0,0 +1,26 @@
+--
+-- first, define the parser.  Turn off echoing so that expected file
+-- does not depend on contents of this file.
+--
+SET client_min_messages = warning;
+\set ECHO none
+\i test_parser.sql
+\set ECHO all
+RESET client_min_messages;
+
+-- make test configuration using parser
+
+CREATE TEXT SEARCH CONFIGURATION testcfg (PARSER = testparser);
+
+ALTER TEXT SEARCH CONFIGURATION testcfg ADD MAPPING FOR word WITH simple;
+
+-- ts_parse
+
+SELECT * FROM ts_parse('testparser', 'That''s simple parser can''t parse urls like http://some.url/here/');
+
+SELECT to_tsvector('testcfg','That''s my first own parser');
+
+SELECT to_tsquery('testcfg', 'star');
+
+SELECT ts_headline('testcfg','Supernovae stars are the brightest phenomena in galaxies', 
+       to_tsquery('testcfg', 'stars'));
diff --git a/contrib/test_parser/test_parser.c b/contrib/test_parser/test_parser.c
new file mode 100644 (file)
index 0000000..728bf40
--- /dev/null
@@ -0,0 +1,130 @@
+/*-------------------------------------------------------------------------
+ *
+ * test_parser.c
+ *       Simple example of a text search parser
+ *
+ * Copyright (c) 2007, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *       $PostgreSQL: pgsql/contrib/test_parser/test_parser.c,v 1.1 2007/10/15 21:36:50 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "fmgr.h"
+
+PG_MODULE_MAGIC;
+
+
+/*
+ * types
+ */
+
+/*
+ * Self-defined type: the parser state.  It is allocated by testprs_start,
+ * advanced by testprs_getlexeme and freed by testprs_end.  buffer is the
+ * pointer handed to testprs_start; the text itself is not copied.
+ */
+typedef struct {
+       char *  buffer; /* text to parse */
+       int             len;    /* length of the text in buffer */
+       int             pos;    /* position of the parser */
+} ParserState;
+
+/*
+ * Description of one token type, as returned by testprs_lextype.
+ * Copy-paste from wparser.h of tsearch2.
+ * NOTE(review): presumably the layout must stay identical to that
+ * definition -- confirm before changing any field.
+ */
+typedef struct {
+       int             lexid;  /* token type number; 0 marks the end of the array */
+       char    *alias;         /* short name of the token type, e.g. "word" */
+       char    *descr;         /* description of the token type, e.g. "Word" */
+} LexDescr;
+
+/*
+ * prototypes
+ *
+ * The four entry points of the parser.  test_parser.sql.in declares each of
+ * them with CREATE FUNCTION and wires them into CREATE TEXT SEARCH PARSER as
+ * START, GETTOKEN, END and LEXTYPES respectively.
+ */
+PG_FUNCTION_INFO_V1(testprs_start);
+Datum testprs_start(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(testprs_getlexeme);
+Datum testprs_getlexeme(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(testprs_end);
+Datum testprs_end(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(testprs_lextype);
+Datum testprs_lextype(PG_FUNCTION_ARGS);
+
+/*
+ * functions
+ */
+/*
+ * testprs_start
+ *       Set up the state for parsing a new text.
+ *
+ * Argument 0 is the text to parse and argument 1 is its length.  The text
+ * is referenced, not copied.  Returns a newly allocated ParserState that is
+ * positioned at the beginning of the text.
+ */
+Datum testprs_start(PG_FUNCTION_ARGS)
+{
+       char       *input = (char *) PG_GETARG_POINTER(0);
+       int         inputlen = PG_GETARG_INT32(1);
+       ParserState *state;
+
+       state = (ParserState *) palloc0(sizeof(ParserState));
+       state->buffer = input;
+       state->len = inputlen;
+       state->pos = 0;         /* nothing consumed yet */
+
+       PG_RETURN_POINTER(state);
+}
+
+/*
+ * testprs_getlexeme
+ *       Return the next token of the text being parsed.
+ *
+ * Argument 0 is the ParserState built by testprs_start.  Argument 1 is an
+ * output pointer that receives the start of the token inside the buffer,
+ * and argument 2 is an output int that receives the token length.
+ *
+ * Returns 12 for a run of spaces, 3 for a run of non-space characters, and
+ * 0 (with a zero length) once the whole text has been consumed.
+ *
+ * The text comes with an explicit length, so pos is always compared with
+ * len BEFORE the byte at that position is examined; no byte at or beyond
+ * len is ever read.
+ */
+Datum testprs_getlexeme(PG_FUNCTION_ARGS)
+{
+       ParserState *pst   = (ParserState *) PG_GETARG_POINTER(0);
+       char            **t        = (char **) PG_GETARG_POINTER(1);
+       int                     *tlen  = (int *) PG_GETARG_POINTER(2);
+       int                     startpos = pst->pos;
+       int                     type;
+
+       *t = pst->buffer + pst->pos;
+
+       if (pst->pos < pst->len &&
+               (pst->buffer)[pst->pos] == ' ')
+       {
+               /* blank type */
+               type = 12;
+               /* go to the next non-space character, or the end of the text */
+               while (pst->pos < pst->len &&
+                          (pst->buffer)[pst->pos] == ' ')
+                       (pst->pos)++;
+       }
+       else
+       {
+               /* word type */
+               type = 3;
+               /* go to the next space character, or the end of the text */
+               while (pst->pos < pst->len &&
+                          (pst->buffer)[pst->pos] != ' ')
+                       (pst->pos)++;
+       }
+
+       *tlen = pst->pos - startpos;
+
+       /* nothing consumed means the whole text has been parsed */
+       if (*tlen == 0)
+               type = 0;
+
+       PG_RETURN_INT32(type);
+}
+
+/*
+ * testprs_end
+ *       Release the ParserState (argument 0) allocated by testprs_start.
+ *       The parsed text itself is not freed here.
+ */
+Datum testprs_end(PG_FUNCTION_ARGS)
+{
+       ParserState *state = (ParserState *) PG_GETARG_POINTER(0);
+
+       pfree(state);
+
+       PG_RETURN_VOID();
+}
+
+/*
+ * testprs_lextype
+ *       Describe the token types this parser can return.
+ *
+ * Returns a palloc'd LexDescr array ended by an entry whose lexid is 0.
+ *
+ * Remarks:
+ * - blanks are reported as a token type of their own because the headline
+ *   function needs them;
+ * - the numbers 3 and 12 are the lexids the default word parser uses for
+ *   these types, so that its headline function can be reused.
+ */
+Datum testprs_lextype(PG_FUNCTION_ARGS)
+{
+       /* two real token types plus the terminating entry */
+       LexDescr   *types = (LexDescr *) palloc((2 + 1) * sizeof(LexDescr));
+       LexDescr   *entry = types;
+
+       /* words */
+       entry->lexid = 3;
+       entry->alias = pstrdup("word");
+       entry->descr = pstrdup("Word");
+       entry++;
+
+       /* blanks */
+       entry->lexid = 12;
+       entry->alias = pstrdup("blank");
+       entry->descr = pstrdup("Space symbols");
+       entry++;
+
+       /* terminator: only lexid is set */
+       entry->lexid = 0;
+
+       PG_RETURN_POINTER(types);
+}
diff --git a/contrib/test_parser/test_parser.sql.in b/contrib/test_parser/test_parser.sql.in
new file mode 100644 (file)
index 0000000..cb5c9a2
--- /dev/null
@@ -0,0 +1,36 @@
+-- $PostgreSQL: pgsql/contrib/test_parser/test_parser.sql.in,v 1.1 2007/10/15 21:36:50 tgl Exp $
+
+-- Adjust this setting to control where the objects get created.
+SET search_path = public;
+
+-- The four support functions and the parser are created in one transaction.
+BEGIN;
+
+-- START function: receives the text to parse and its length, returns the
+-- parser state.
+CREATE FUNCTION testprs_start(internal, int4)
+    RETURNS internal
+    AS 'MODULE_PATHNAME'
+    LANGUAGE C STRICT;
+
+-- GETTOKEN function: receives the parser state plus two output pointers
+-- (token start and token length); the C code returns the token type number,
+-- 0 when the text is exhausted.
+CREATE FUNCTION testprs_getlexeme(internal, internal, internal)
+    RETURNS internal
+    AS 'MODULE_PATHNAME'
+    LANGUAGE C STRICT;
+
+-- END function: frees the parser state.
+CREATE FUNCTION testprs_end(internal)
+    RETURNS void
+    AS 'MODULE_PATHNAME'
+    LANGUAGE C STRICT;
+
+-- LEXTYPES function: returns the list of token types (3 = word, 12 = blank).
+CREATE FUNCTION testprs_lextype(internal)
+    RETURNS internal
+    AS 'MODULE_PATHNAME'
+    LANGUAGE C STRICT;
+
+-- The parser itself.  HEADLINE uses pg_catalog.prsd_headline rather than a
+-- function of this module; the token numbers were chosen to match it.
+CREATE TEXT SEARCH PARSER testparser (
+    START    = testprs_start,
+    GETTOKEN = testprs_getlexeme,
+    END      = testprs_end,
+    HEADLINE = pg_catalog.prsd_headline,
+    LEXTYPES = testprs_lextype
+);
+
+END;
diff --git a/contrib/test_parser/uninstall_test_parser.sql b/contrib/test_parser/uninstall_test_parser.sql
new file mode 100644 (file)
index 0000000..d194677
--- /dev/null
@@ -0,0 +1,11 @@
+-- Remove the objects created by test_parser.sql from schema public.
+SET search_path = public;
+
+-- The parser goes first because it references the functions below.
+DROP TEXT SEARCH PARSER testparser;
+
+DROP FUNCTION testprs_start(internal, int4);
+
+DROP FUNCTION testprs_getlexeme(internal, internal, internal);
+
+DROP FUNCTION testprs_end(internal);
+
+DROP FUNCTION testprs_lextype(internal);