-<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.32 2007/11/14 03:26:24 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.33 2007/11/14 18:36:37 tgl Exp $ -->
<chapter id="textsearch">
<title id="textsearch-title">Full Text Search</title>
<para>
The <literal>simple</> dictionary template operates by converting the
input token to lower case and checking it against a file of stop words.
- If it is found in the file then <literal>NULL</> is returned, causing
+ If it is found in the file then an empty array is returned, causing
the token to be discarded. If not, the lower-cased form of the word
- is returned as the normalized lexeme.
+ is returned as the normalized lexeme. Alternatively, the dictionary
+ can be configured to report non-stop-words as unrecognized, allowing
+ them to be passed on to the next dictionary in the list.
</para>
<para>
</programlisting>
</para>
+ <para>
+ We can also choose to return <literal>NULL</>, instead of the lower-cased
+ word, if the word is not found in the stop words file. This behavior is
+ selected by setting the dictionary's <literal>Accept</> parameter to
+ <literal>false</>. Continuing the example:
+
+<programlisting>
+ALTER TEXT SEARCH DICTIONARY public.simple_dict ( Accept = false );
+
+SELECT ts_lexize('public.simple_dict','YeS');
+ ts_lexize
+-----------
+
+
+SELECT ts_lexize('public.simple_dict','The');
+ ts_lexize
+-----------
+ {}
+</programlisting>
+ </para>
+
+ <para>
+ With the default setting of <literal>Accept</> = <literal>true</>,
+ it is only useful to place a <literal>simple</> dictionary at the end
+ of a list of dictionaries, since such a dictionary will never pass on
+ any token to a following dictionary. Conversely,
+ <literal>Accept</> = <literal>false</>
+ is only useful when there is at least one following dictionary.
+ </para>
+
<caution>
<para>
Most types of dictionaries rely on configuration files, such as files of
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/tsearch/dict_simple.c,v 1.3 2007/08/25 00:03:59 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/tsearch/dict_simple.c,v 1.4 2007/11/14 18:36:37 tgl Exp $
*
*-------------------------------------------------------------------------
*/
typedef struct
{
StopList stoplist;
+ bool accept; /* true (default): return non-stop-words as lexemes; false: return NULL for them */
} DictSimple;
{
List *dictoptions = (List *) PG_GETARG_POINTER(0);
DictSimple *d = (DictSimple *) palloc0(sizeof(DictSimple));
- bool stoploaded = false;
+ bool stoploaded = false,
+ acceptloaded = false;
ListCell *l;
+ d->accept = true; /* default */
+
foreach(l, dictoptions)
{
DefElem *defel = (DefElem *) lfirst(l);
readstoplist(defGetString(defel), &d->stoplist, lowerstr);
stoploaded = true;
}
+ else if (pg_strcasecmp("Accept", defel->defname) == 0)
+ {
+ if (acceptloaded)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("multiple Accept parameters")));
+ d->accept = defGetBoolean(defel);
+ acceptloaded = true;
+ }
else
{
ereport(ERROR,
char *in = (char *) PG_GETARG_POINTER(1);
int32 len = PG_GETARG_INT32(2);
char *txt;
- TSLexeme *res = palloc0(sizeof(TSLexeme) * 2);
+ TSLexeme *res;
txt = lowerstr_with_len(in, len);
if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
+ {
+ /* reject as stopword */
pfree(txt);
- else
+ res = palloc0(sizeof(TSLexeme) * 2);
+ PG_RETURN_POINTER(res);
+ }
+ else if (d->accept)
+ {
+ /* accept */
+ res = palloc0(sizeof(TSLexeme) * 2);
res[0].lexeme = txt;
-
- PG_RETURN_POINTER(res);
+ PG_RETURN_POINTER(res);
+ }
+ else
+ {
+ /* report as unrecognized */
+ pfree(txt);
+ PG_RETURN_POINTER(NULL);
+ }
}