From 9389ac8928866eb4ab19b2f3892531e798e34f24 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 25 Aug 2010 21:42:55 +0000 Subject: [PATCH] Document filtering dictionaries in textsearch.sgml. While at it, copy-edit the description of prefix-match marker support in synonym dictionaries, and clarify the description of the default unaccent dictionary a bit more. --- doc/src/sgml/textsearch.sgml | 126 ++++++++++++++++++++--------------- doc/src/sgml/unaccent.sgml | 8 ++- 2 files changed, 79 insertions(+), 55 deletions(-) diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml index fb7f205091..60fac102df 100644 --- a/doc/src/sgml/textsearch.sgml +++ b/doc/src/sgml/textsearch.sgml @@ -1,4 +1,4 @@ - + Full Text Search @@ -112,7 +112,7 @@ as a sorted array of normalized lexemes. Along with the lexemes it is often desirable to store positional information to use for proximity ranking, so that a document that - contains a more dense region of query words is + contains a more dense region of query words is assigned a higher rank than one with scattered query words. @@ -1151,13 +1151,13 @@ MaxFragments=0, FragmentDelimiter=" ... " SELECT ts_headline('english', 'The most common type of search -is to find all documents containing given query terms +is to find all documents containing given query terms and return them in order of their similarity to the query.', to_tsquery('query & similarity')); ts_headline ------------------------------------------------------------ - containing given <b>query</b> terms + containing given <b>query</b> terms and return them in order of their <b>similarity</b> to the <b>query</b>. @@ -1166,7 +1166,7 @@ SELECT ts_headline('english', is to find all documents containing given query terms and return them in order of their similarity to the query.', - to_tsquery('query & similarity'), + to_tsquery('query & similarity'), 'StartSel = <, StopSel = >'); ts_headline ------------------------------------------------------- @@ -2064,6 +2064,14 @@ SELECT alias, description, token FROM ts_debug('http://example.com/stuff/index.h (notice that one token can produce more than one lexeme) + + + a single lexeme with the TSL_FILTER flag set, to replace + the original token with a new token to be passed to subsequent + dictionaries (a dictionary that does this is called a + filtering dictionary) + + an empty array if the dictionary knows the token, but it is a stop word @@ -2096,6 +2104,13 @@ SELECT alias, description, token FROM ts_debug('http://example.com/stuff/index.h until some dictionary recognizes it as a known word. If it is identified as a stop word, or if no dictionary recognizes the token, it will be discarded and not indexed or searched for. + Normally, the first dictionary that returns a non-NULL + output determines the result, and any remaining dictionaries are not + consulted; but a filtering dictionary can replace the given word + with a modified word, which is then passed to subsequent dictionaries. + + + The general rule for configuring a list of dictionaries is to place first the most narrow, most specific dictionary, then the more general dictionaries, finishing with a very general dictionary, like @@ -2112,6 +2127,16 @@ ALTER TEXT SEARCH CONFIGURATION astro_en + + A filtering dictionary can be placed anywhere in the list, except at the + end where it'd be useless. Filtering dictionaries are useful to partially + normalize words to simplify the task of later dictionaries. 
For example, + a filtering dictionary could be used to remove accents from accented + letters, as is done by the + contrib/unaccent + extension module. + + Stop Words @@ -2184,7 +2209,7 @@ CREATE TEXT SEARCH DICTIONARY public.simple_dict ( Here, english is the base name of a file of stop words. The file's full name will be $SHAREDIR/tsearch_data/english.stop, - where $SHAREDIR means the + where $SHAREDIR means the PostgreSQL installation's shared-data directory, often /usr/local/share/postgresql (use pg_config --sharedir to determine it if you're not sure). @@ -2295,17 +2320,39 @@ SELECT * FROM ts_debug('english', 'Paris'); asciiword | Word, all ASCII | Paris | {my_synonym,english_stem} | my_synonym | {paris} - + - An asterisk (*) at the end of definition word indicates - that definition word is a prefix, and to_tsquery() - function will transform that definition to the prefix search format (see - ). - Notice that it is ignored in to_tsvector(). + The only parameter required by the synonym template is + SYNONYMS, which is the base name of its configuration file + — my_synonyms in the above example. + The file's full name will be + $SHAREDIR/tsearch_data/my_synonyms.syn + (where $SHAREDIR means the + PostgreSQL installation's shared-data directory). + The file format is just one line + per word to be substituted, with the word followed by its synonym, + separated by white space. Blank lines and trailing spaces are ignored. + + + + The synonym template also has an optional parameter + CaseSensitive, which defaults to false. When + CaseSensitive is false, words in the synonym file + are folded to lower case, as are input tokens. When it is + true, words and tokens are not folded to lower case, + but are compared as-is. - Contents of $SHAREDIR/tsearch_data/synonym_sample.syn: + An asterisk (*) can be placed at the end of a synonym + in the configuration file. This indicates that the synonym is a prefix. + The asterisk is ignored when the entry is used in + to_tsvector(), but when it is used in + to_tsquery(), the result will be a query item with + the prefix match marker (see + ). + For example, suppose we have these entries in + $SHAREDIR/tsearch_data/synonym_sample.syn: postgres pgsql postgresql pgsql @@ -2313,67 +2360,42 @@ postgre pgsql gogle googl indices index* - - - - Results: + Then we will get these results: -=# CREATE TEXT SEARCH DICTIONARY syn (template=synonym, synonyms='synonym_sample'); -=# SELECT ts_lexize('syn','indices'); +mydb=# CREATE TEXT SEARCH DICTIONARY syn (template=synonym, synonyms='synonym_sample'); +mydb=# SELECT ts_lexize('syn','indices'); ts_lexize ----------- {index} (1 row) -=# CREATE TEXT SEARCH CONFIGURATION tst (copy=simple); -=# ALTER TEXT SEARCH CONFIGURATION tst ALTER MAPPING FOR asciiword WITH syn; -=# SELECT to_tsquery('tst','indices'); +mydb=# CREATE TEXT SEARCH CONFIGURATION tst (copy=simple); +mydb=# ALTER TEXT SEARCH CONFIGURATION tst ALTER MAPPING FOR asciiword WITH syn; +mydb=# SELECT to_tsvector('tst','indices'); + to_tsvector +------------- + 'index':1 +(1 row) + +mydb=# SELECT to_tsquery('tst','indices'); to_tsquery ------------ 'index':* (1 row) -=# SELECT 'indexes are very useful'::tsvector; +mydb=# SELECT 'indexes are very useful'::tsvector; tsvector --------------------------------- 'are' 'indexes' 'useful' 'very' (1 row) -=# SELECT 'indexes are very useful'::tsvector @@ to_tsquery('tst','indices'); +mydb=# SELECT 'indexes are very useful'::tsvector @@ to_tsquery('tst','indices'); ?column? 
---------- t (1 row) - -=# SELECT to_tsvector('tst','indices'); - to_tsvector -------------- - 'index':1 -(1 row) - - - The only parameter required by the synonym template is - SYNONYMS, which is the base name of its configuration file - — my_synonyms in the above example. - The file's full name will be - $SHAREDIR/tsearch_data/my_synonyms.syn - (where $SHAREDIR means the - PostgreSQL installation's shared-data directory). - The file format is just one line - per word to be substituted, with the word followed by its synonym, - separated by white space. Blank lines and trailing spaces are ignored. - - - - The synonym template also has an optional parameter - CaseSensitive, which defaults to false. When - CaseSensitive is false, words in the synonym file - are folded to lower case, as are input tokens. When it is - true, words and tokens are not folded to lower case, - but are compared as-is. - diff --git a/doc/src/sgml/unaccent.sgml b/doc/src/sgml/unaccent.sgml index 6c73c3f298..135fcdb6dc 100644 --- a/doc/src/sgml/unaccent.sgml +++ b/doc/src/sgml/unaccent.sgml @@ -1,4 +1,4 @@ - + unaccent @@ -75,8 +75,10 @@ Running the installation script unaccent.sql creates a text search template unaccent and a dictionary unaccent - based on it, with default parameters. You can alter the - parameters, for example + based on it. The unaccent dictionary has the default + parameter setting RULES='unaccent', which makes it immediately + usable with the standard unaccent.rules file. + If you wish, you can alter the parameter, for example mydb=# ALTER TEXT SEARCH DICTIONARY unaccent (RULES='my_rules'); -- 2.40.0
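
A minimal sketch of the filtering behaviour documented above, assuming the contrib/unaccent module is installed (so that the unaccent template and dictionary exist); the configuration name fr_demo is invented for illustration, and the expected output assumes the standard unaccent.rules file and the built-in french stemmer:

-- Copy the built-in french configuration and put the filtering
-- unaccent dictionary ahead of the stemmer, so that french_stem sees
-- the unaccented word rather than the original token.
CREATE TEXT SEARCH CONFIGURATION fr_demo ( COPY = french );

ALTER TEXT SEARCH CONFIGURATION fr_demo
    ALTER MAPPING FOR hword, hword_part, word
    WITH unaccent, french_stem;

-- The filtering dictionary on its own just rewrites the token ...
SELECT ts_lexize('unaccent', 'Hôtels');
--  ts_lexize
-- -----------
--  {Hotels}

-- ... while the configuration as a whole also stems the rewritten word.
SELECT to_tsvector('fr_demo', 'Hôtels de la Mer');
--      to_tsvector
-- -------------------
--  'hotel':1 'mer':4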
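
The CaseSensitive parameter described above can be exercised in the same way as the prefix-marker example; this sketch reuses the synonym_sample.syn file shown earlier, with an invented dictionary name syn_cs, and the expected results assume that no case folding happens when the parameter is true:

-- Assumes $SHAREDIR/tsearch_data/synonym_sample.syn exists as shown above.
CREATE TEXT SEARCH DICTIONARY syn_cs (
    TEMPLATE = synonym,
    SYNONYMS = 'synonym_sample',
    CaseSensitive = true
);

-- With CaseSensitive = true, neither the file entries nor the input token
-- are folded to lower case, so only an exact-case match is recognized.
SELECT ts_lexize('syn_cs', 'indices');   -- expected: {index}
SELECT ts_lexize('syn_cs', 'Indices');   -- expected: no match (NULL)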