-<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.30 2007/11/05 15:55:53 mha Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.31 2007/11/10 15:39:34 momjian Exp $ -->
<chapter id="textsearch">
<title id="textsearch-title">Full Text Search</title>
</para>
<para>
- Stop words recognized by the subdictionary are replaced by a <quote>stop
- word placeholder</quote> to record their position. To illustrate this,
- consider these phrases:
+ Specific stop words recognized by the subdictionary cannot be written
+ in a thesaurus file; instead, use <literal>?</> to mark each position
+ where any stop word can appear. For example, assuming that <literal>a</>
+ and <literal>the</> are stop words according to the subdictionary:
<programlisting>
-a one the two : swsw
-the one a two : swsw2
+? one ? two : swsw
</programlisting>
- Assuming that <literal>a</> and <literal>the</> are stop words according
- to the subdictionary, these two phrases are identical to the thesaurus:
- they both look like <replaceable>stopword</> <literal>one</>
- <replaceable>stopword</> <literal>two</>. Input matching this pattern
- will be replaced by <literal>swsw2</>, according to the tie-breaking rule.
+ matches <literal>a one the two</> and <literal>the one a two</>;
+ both would be replaced by <literal>swsw</>.
</para>
<para>
</para>
</listitem>
+ <listitem>
+ <para>
+ Thesaurus files now use <literal>?</> to mark stop-word positions;
+ writing an actual stop word in a sample phrase is reported as an error.
+ </para>
+ </listitem>
+
<listitem>
<para>
What else?
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/tsearch/dict_thesaurus.c,v 1.5 2007/11/09 01:32:22 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/tsearch/dict_thesaurus.c,v 1.6 2007/11/10 15:39:34 momjian Exp $
*
*-------------------------------------------------------------------------
*/
{
TSLexeme *ptr;
- ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
- PointerGetDatum(d->subdict->dictData),
- PointerGetDatum(d->wrds[i].lexeme),
- Int32GetDatum(strlen(d->wrds[i].lexeme)),
- PointerGetDatum(NULL)));
-
- if (!ptr)
- elog(ERROR, "thesaurus word-sample \"%s\" isn't recognized by subdictionary (rule %d)",
- d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1);
- else if (!(ptr->lexeme))
- {
- elog(NOTICE, "thesaurus word-sample \"%s\" is recognized as stop-word, assign any stop-word (rule %d)",
- d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1);
-
+ if (strcmp(d->wrds[i].lexeme, "?") == 0) /* Is stop word marker? */
newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
- }
else
{
- while (ptr->lexeme)
+ ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
+ PointerGetDatum(d->subdict->dictData),
+ PointerGetDatum(d->wrds[i].lexeme),
+ Int32GetDatum(strlen(d->wrds[i].lexeme)),
+ PointerGetDatum(NULL)));
+
+ if (!ptr)
+ elog(ERROR, "thesaurus word-sample \"%s\" isn't recognized by subdictionary (rule %d)",
+ d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1);
+ else if (!(ptr->lexeme))
+ elog(ERROR, "thesaurus word-sample \"%s\" is recognized as stop-word, use \"?\" for stop words instead (rule %d)",
+ d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1);
+ else
{
- TSLexeme *remptr = ptr + 1;
- int tnvar = 1;
- int curvar = ptr->nvariant;
-
- /* compute n words in one variant */
- while (remptr->lexeme)
+ while (ptr->lexeme)
{
- if (remptr->nvariant != (remptr - 1)->nvariant)
- break;
- tnvar++;
- remptr++;
- }
-
- remptr = ptr;
- while (remptr->lexeme && remptr->nvariant == curvar)
- {
- newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
- remptr++;
+ TSLexeme *remptr = ptr + 1;
+ int tnvar = 1;
+ int curvar = ptr->nvariant;
+
+ /* compute n words in one variant */
+ while (remptr->lexeme)
+ {
+ if (remptr->nvariant != (remptr - 1)->nvariant)
+ break;
+ tnvar++;
+ remptr++;
+ }
+
+ remptr = ptr;
+ while (remptr->lexeme && remptr->nvariant == curvar)
+ {
+ newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
+ remptr++;
+ }
+
+ ptr = remptr;
}
-
- ptr = remptr;
}
}