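Add an Accept parameter to "simple" dictionaries. The default of true keeps the previous behavior (a non-stop-word is returned as its lower-cased lexeme); setting it to false makes the dictionary report non-stop-words as unrecognized (NULL), so they are passed on to the next dictionary in the list.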

author Tom Lane <tgl@sss.pgh.pa.us>

Wed, 14 Nov 2007 18:36:37 +0000 (18:36 +0000)

committer Tom Lane <tgl@sss.pgh.pa.us>

Wed, 14 Nov 2007 18:36:37 +0000 (18:36 +0000)
author Tom Lane <tgl@sss.pgh.pa.us>
Wed, 14 Nov 2007 18:36:37 +0000 (18:36 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Wed, 14 Nov 2007 18:36:37 +0000 (18:36 +0000)
diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml

index 0ba401c2a437474771e0c1504f0a336d107f3a17..31753791cda03182ea02ef0f832d2333607dffa5 100644 (file)
--- a/doc/src/sgml/textsearch.sgml
+++ b/doc/src/sgml/textsearch.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.32 2007/11/14 03:26:24 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.33 2007/11/14 18:36:37 tgl Exp $ -->
  
  <chapter id="textsearch">
   <title id="textsearch-title">Full Text Search</title>
@@ -2093,9 +2093,11 @@ SELECT ts_rank_cd (to_tsvector('english','list stop words'), to_tsquery('list &a
     <para>
      The <literal>simple</> dictionary template operates by converting the
      input token to lower case and checking it against a file of stop words.
-    If it is found in the file then <literal>NULL</> is returned, causing
+    If it is found in the file then an empty array is returned, causing
      the token to be discarded.  If not, the lower-cased form of the word
-    is returned as the normalized lexeme.
+    is returned as the normalized lexeme.  Alternatively, the dictionary
+    can be configured to report non-stop-words as unrecognized, allowing
+    them to be passed on to the next dictionary in the list.
     </para>
  
     <para>
@@ -2138,6 +2140,35 @@ SELECT ts_lexize('public.simple_dict','The');
  </programlisting>
     </para>
  
+   <para>
+    We can also choose to return <literal>NULL</>, instead of the lower-cased
+    word, if it is not found in the stop words file.  This behavior is
+    selected by setting the dictionary's <literal>Accept</> parameter to
+    <literal>false</>.  Continuing the example:
+
+<programlisting>
+ALTER TEXT SEARCH DICTIONARY public.simple_dict ( Accept = false );
+
+SELECT ts_lexize('public.simple_dict','YeS');
+ ts_lexize
+-----------
+
+
+SELECT ts_lexize('public.simple_dict','The');
+ ts_lexize
+-----------
+ {}
+</programlisting>
+   </para>
+
+   <para>
+    With the default setting of <literal>Accept</> = <literal>true</>,
+    it is only useful to place a <literal>simple</> dictionary at the end
+    of a list of dictionaries, since it will never pass on any token to
+    a following dictionary.  Conversely, <literal>Accept</> = <literal>false</>
+    is only useful when there is at least one following dictionary.
+   </para>
+
     <caution>
      <para>
       Most types of dictionaries rely on configuration files, such as files of
diff --git a/src/backend/tsearch/dict_simple.c b/src/backend/tsearch/dict_simple.c

index aea2c0963b150e46819ee88ff563c54b7f6f8d9c..8248d3987d64dc19cb2ebb545d4bfdc1afa95121 100644 (file)
--- a/src/backend/tsearch/dict_simple.c
+++ b/src/backend/tsearch/dict_simple.c
@@ -7,7 +7,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/tsearch/dict_simple.c,v 1.3 2007/08/25 00:03:59 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/tsearch/dict_simple.c,v 1.4 2007/11/14 18:36:37 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -23,6 +23,7 @@
  typedef struct
  {
         StopList        stoplist;
+       bool            accept;
  } DictSimple;
  
  
@@ -31,9 +32,12 @@ dsimple_init(PG_FUNCTION_ARGS)
  {
         List       *dictoptions = (List *) PG_GETARG_POINTER(0);
         DictSimple *d = (DictSimple *) palloc0(sizeof(DictSimple));
-       bool            stoploaded = false;
+       bool            stoploaded = false,
+                               acceptloaded = false;
         ListCell   *l;
  
+       d->accept = true;                       /* default */
+
         foreach(l, dictoptions)
         {
                 DefElem    *defel = (DefElem *) lfirst(l);
@@ -47,6 +51,15 @@ dsimple_init(PG_FUNCTION_ARGS)
                         readstoplist(defGetString(defel), &d->stoplist, lowerstr);
                         stoploaded = true;
                 }
+               else if (pg_strcasecmp("Accept", defel->defname) == 0)
+               {
+                       if (acceptloaded)
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                                errmsg("multiple Accept parameters")));
+                       d->accept = defGetBoolean(defel);
+                       acceptloaded = true;
+               }
                 else
                 {
                         ereport(ERROR,
@@ -66,14 +79,28 @@ dsimple_lexize(PG_FUNCTION_ARGS)
         char       *in = (char *) PG_GETARG_POINTER(1);
         int32      len = PG_GETARG_INT32(2);
         char       *txt;
-       TSLexeme   *res = palloc0(sizeof(TSLexeme) * 2);
+       TSLexeme   *res;
  
         txt = lowerstr_with_len(in, len);
  
         if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
+       {
+               /* reject as stopword */
                 pfree(txt);
-       else
+               res = palloc0(sizeof(TSLexeme) * 2);
+               PG_RETURN_POINTER(res);
+       }
+       else if (d->accept)
+       {
+               /* accept */
+               res = palloc0(sizeof(TSLexeme) * 2);
                 res[0].lexeme = txt;
-
-       PG_RETURN_POINTER(res);
+               PG_RETURN_POINTER(res);
+       }
+       else
+       {
+               /* report as unrecognized */
+               pfree(txt);
+               PG_RETURN_POINTER(NULL);
+       }
  }
author	Tom Lane <tgl@sss.pgh.pa.us>
	Wed, 14 Nov 2007 18:36:37 +0000 (18:36 +0000)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Wed, 14 Nov 2007 18:36:37 +0000 (18:36 +0000)
doc/src/sgml/textsearch.sgml		patch | blob | history
src/backend/tsearch/dict_simple.c		patch | blob | history