Have text search thesaurus files use "?" for stop words.

author Bruce Momjian <bruce@momjian.us>

Sat, 10 Nov 2007 15:39:34 +0000 (15:39 +0000)

committer Bruce Momjian <bruce@momjian.us>

Sat, 10 Nov 2007 15:39:34 +0000 (15:39 +0000)
author Bruce Momjian <bruce@momjian.us>
Sat, 10 Nov 2007 15:39:34 +0000 (15:39 +0000)
committer Bruce Momjian <bruce@momjian.us>
Sat, 10 Nov 2007 15:39:34 +0000 (15:39 +0000)
diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml

index 26fdad0c6ff071f34607f07361dca00ce8135e3a..e556c6dd78a8bdf5b7ae6f573d596e8983453024 100644 (file)
--- a/doc/src/sgml/textsearch.sgml
+++ b/doc/src/sgml/textsearch.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.30 2007/11/05 15:55:53 mha Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.31 2007/11/10 15:39:34 momjian Exp $ -->
  
  <chapter id="textsearch">
   <title id="textsearch-title">Full Text Search</title>
@@ -2258,20 +2258,17 @@ more sample word(s) : more indexed word(s)
     </para>
  
     <para>
-    Stop words recognized by the subdictionary are replaced by a <quote>stop
-    word placeholder</quote> to record their position. To illustrate this,
-    consider these phrases:
+    Thesaurus entries cannot name specific stop words recognized by the
+    subdictionary; instead, use <literal>?</> to mark each position where
+    any stop word can appear.  For example, assuming that <literal>a</> and
+    <literal>the</> are stop words according to the subdictionary:
  
  <programlisting>
-a one the two : swsw
-the one a two : swsw2
+? one ? two : swsw
  </programlisting>
  
-    Assuming that <literal>a</> and <literal>the</> are stop words according
-    to the subdictionary, these two phrases are identical to the thesaurus:
-    they both look like <replaceable>stopword</> <literal>one</>
-    <replaceable>stopword</> <literal>two</>.  Input matching this pattern
-    will be replaced by <literal>swsw2</>, according to the tie-breaking rule.
+    matches <literal>a one the two</> and <literal>the one a two</>;
+    both would be replaced by <literal>swsw</>.
     </para>
  
     <para>
@@ -3576,6 +3573,12 @@ Parser: "pg_catalog.default"
      </para>
     </listitem>
  
+   <listitem>
+    <para>
+     Thesaurus files now use <literal>?</> for stop words.
+    </para>
+   </listitem>
+
     <listitem>
      <para>
       What else?
diff --git a/src/backend/tsearch/dict_thesaurus.c b/src/backend/tsearch/dict_thesaurus.c

index 7a0ae4afd3e40d6d0a43734ac802c7cd9f8815e9..31564a789935c13edf0ae81feae7a6738f753266 100644 (file)
--- a/src/backend/tsearch/dict_thesaurus.c
+++ b/src/backend/tsearch/dict_thesaurus.c
@@ -7,7 +7,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/tsearch/dict_thesaurus.c,v 1.5 2007/11/09 01:32:22 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/tsearch/dict_thesaurus.c,v 1.6 2007/11/10 15:39:34 momjian Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -412,47 +412,48 @@ compileTheLexeme(DictThesaurus * d)
         {
                 TSLexeme   *ptr;
  
-               ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
-                                                                          PointerGetDatum(d->subdict->dictData),
-                                                                                 PointerGetDatum(d->wrds[i].lexeme),
-                                                                       Int32GetDatum(strlen(d->wrds[i].lexeme)),
-                                                                                                        PointerGetDatum(NULL)));
-
-               if (!ptr)
-                       elog(ERROR, "thesaurus word-sample \"%s\" isn't recognized by subdictionary (rule %d)",
-                                d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1);
-               else if (!(ptr->lexeme))
-               {
-                       elog(NOTICE, "thesaurus word-sample \"%s\" is recognized as stop-word, assign any stop-word (rule %d)",
-                                d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1);
-
+               if (strcmp(d->wrds[i].lexeme, "?") == 0)        /* Is stop word marker? */
                         newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
-               }
                 else
                 {
-                       while (ptr->lexeme)
+                       ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
+                                                                                  PointerGetDatum(d->subdict->dictData),
+                                                                                         PointerGetDatum(d->wrds[i].lexeme),
+                                                                               Int32GetDatum(strlen(d->wrds[i].lexeme)),
+                                                                                                                PointerGetDatum(NULL)));
+       
+                       if (!ptr)
+                               elog(ERROR, "thesaurus word-sample \"%s\" isn't recognized by subdictionary (rule %d)",
+                                        d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1);
+                       else if (!(ptr->lexeme))
+                               elog(ERROR, "thesaurus word-sample \"%s\" is recognized as stop-word, use \"?\" for stop words instead (rule %d)",
+                                        d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1);
+                       else
                         {
-                               TSLexeme   *remptr = ptr + 1;
-                               int                     tnvar = 1;
-                               int                     curvar = ptr->nvariant;
-
-                               /* compute n words in one variant */
-                               while (remptr->lexeme)
+                               while (ptr->lexeme)
                                 {
-                                       if (remptr->nvariant != (remptr - 1)->nvariant)
-                                               break;
-                                       tnvar++;
-                                       remptr++;
-                               }
-
-                               remptr = ptr;
-                               while (remptr->lexeme && remptr->nvariant == curvar)
-                               {
-                                       newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
-                                       remptr++;
+                                       TSLexeme   *remptr = ptr + 1;
+                                       int                     tnvar = 1;
+                                       int                     curvar = ptr->nvariant;
+       
+                                       /* compute n words in one variant */
+                                       while (remptr->lexeme)
+                                       {
+                                               if (remptr->nvariant != (remptr - 1)->nvariant)
+                                                       break;
+                                               tnvar++;
+                                               remptr++;
+                                       }
+       
+                                       remptr = ptr;
+                                       while (remptr->lexeme && remptr->nvariant == curvar)
+                                       {
+                                               newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
+                                               remptr++;
+                                       }
+       
+                                       ptr = remptr;
                                 }
-
-                               ptr = remptr;
                         }
                 }
  
diff --git a/src/backend/tsearch/thesaurus_sample.ths b/src/backend/tsearch/thesaurus_sample.ths

index 77a32a75d056242c0816b7aa0a515c065c4a7ede..0b4857ec33403b59ad3ebb2a42388cae7d7886c1 100644 (file)
--- a/src/backend/tsearch/thesaurus_sample.ths
+++ b/src/backend/tsearch/thesaurus_sample.ths
@@ -14,4 +14,5 @@ two : *2
  supernovae stars : *sn
  supernovae : *sn
  booking tickets : order invitation cards
-# booking the tickets : order invitation Cards
+booking ? tickets : order invitation Cards
+
diff --git a/src/test/regress/expected/tsdicts.out b/src/test/regress/expected/tsdicts.out

index 3520baceac71007d5da591a0d2b66679f0291de6..4b8929361a85e09e6fc30f4bddce7a0612b06f07 100644 (file)
--- a/src/test/regress/expected/tsdicts.out
+++ b/src/test/regress/expected/tsdicts.out
@@ -311,8 +311,8 @@ SELECT to_tsvector('thesaurus_tst', 'Supernovae star is very new star and usuall
  (1 row)
  
  SELECT to_tsvector('thesaurus_tst', 'Booking tickets is looking like a booking a tickets');
-                             to_tsvector                             
----------------------------------------------------------------------
- 'book':8 'card':3 'like':6 'look':5 'invit':2 'order':1 'ticket':10
+                      to_tsvector                      
+-------------------------------------------------------
+ 'card':3,10 'like':6 'look':5 'invit':2,9 'order':1,8
  (1 row)
author	Bruce Momjian <bruce@momjian.us>
	Sat, 10 Nov 2007 15:39:34 +0000 (15:39 +0000)
committer	Bruce Momjian <bruce@momjian.us>
	Sat, 10 Nov 2007 15:39:34 +0000 (15:39 +0000)
doc/src/sgml/textsearch.sgml		patch | blob | history
src/backend/tsearch/dict_thesaurus.c		patch | blob | history
src/backend/tsearch/thesaurus_sample.ths		patch | blob | history
src/test/regress/expected/tsdicts.out		patch | blob | history