From 4b21d1f09b96a1a76a72ebc752daeeff77555801 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 15 Oct 2007 21:39:57 +0000 Subject: [PATCH] Remove obsolete examples of add-on parsers and dictionary templates; these are more easily and usefully maintained as contrib modules. Various other wordsmithing, markup improvement, etc. --- doc/src/sgml/textsearch.sgml | 1197 ++++++++++------------------------ 1 file changed, 336 insertions(+), 861 deletions(-) diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml index 8fbffe07c0..b4bad63b9d 100644 --- a/doc/src/sgml/textsearch.sgml +++ b/doc/src/sgml/textsearch.sgml @@ -1,4 +1,4 @@ - + Full Text Search @@ -15,12 +15,13 @@ Introduction - Full Text Searching (or just text search) allows - identifying documents that satisfy a query, and - optionally sorting them by relevance to the query. The most common search + Full Text Searching (or just text search) provides + the capability to identify documents that satisfy a + query, and optionally to sort them by relevance to + the query. The most common type of search is to find all documents containing given query terms and return them in order of their similarity to the - query. Notions of query and + query. Notions of query and similarity are very flexible and depend on the specific application. The simplest search considers query as a set of words and similarity as the frequency of query @@ -32,7 +33,7 @@ Textual search operators have existed in databases for years. PostgreSQL has - ~,~*, LIKE, + ~, ~*, LIKE, and ILIKE operators for textual datatypes, but they lack many essential properties required by modern information systems: @@ -43,11 +44,11 @@ There is no linguistic support, even for English. Regular expressions are not sufficient because they cannot easily handle derived words, e.g., satisfies and satisfy. You might - miss documents which contain satisfies, although you + miss documents that contain satisfies, although you probably would like to find them when searching for satisfy. It is possible to use OR - to search any of them, but it is tedious and error-prone - (some words can have several thousand derivatives). + to search for any of them, but this is tedious and + error-prone (some words can have several thousand derivatives). @@ -75,7 +76,7 @@ Parsing documents into lexemes. It is - useful to identify various classes of lexemes, e.g. digits, words, + useful to identify various classes of lexemes, e.g. numbers, words, complex words, email addresses, so that they can be processed differently. In principle lexeme classes depend on the specific application but for an ordinary search it is useful to have a predefined @@ -105,7 +106,7 @@ searching. For example, each document can be represented as a sorted array of normalized lexemes. Along with the lexemes it is desirable to store positional information to use for proximity - ranking, so that a document which contains a more + ranking, so that a document that contains a more dense region of query words is assigned a higher rank than one with scattered query words. @@ -146,7 +147,7 @@ Map different variations of a word to a canonical form using - snowball stemmer rules. + Snowball stemmer rules. @@ -174,7 +175,7 @@ system; for example, a magazine article or email message. The text search engine must be able to parse documents and store associations of lexemes (key words) with their parent document. Later, these associations are - used to search for documents which contain query words. 
+ used to search for documents that contain query words. @@ -199,8 +200,7 @@ WHERE mid = did AND mid = 12; Actually, in the previous example queries, COALESCE - - should be used to prevent a simgle NULL attribute from + should be used to prevent a single NULL attribute from causing a NULL result for the whole document. @@ -276,23 +276,73 @@ SELECT 'fat & cow'::tsquery @@ 'a fat cat sat on a mat and ate a fat rat'::t text search functionality includes the ability to do many more things: skip indexing certain words (stop words), process synonyms, and use sophisticated parsing, e.g. parse based on more than just white space. - This functionality is controlled by configurations. - Fortunately, PostgreSQL comes with predefined - configurations for many languages. (psql's \dF - shows all predefined configurations.) + This functionality is controlled by text search + configurations. PostgreSQL comes with predefined + configurations for many languages, and you can easily create your own + configurations. (psql's \dF command + shows all available configurations.) - During installation an appropriate configuration was selected and - was set accordingly + During installation an appropriate configuration is selected and + is set accordingly in postgresql.conf. If you are using the same text search configuration for the entire cluster you can use the value in - postgresql.conf. If using different configurations - throughout the cluster but - the same text search configuration for any one database, - use ALTER DATABASE ... SET. If not, you must set - default_text_search_config in each session. Many functions - also take an optional configuration name. + postgresql.conf. To use different configurations + throughout the cluster but the same configuration within any one database, + use ALTER DATABASE ... SET. Otherwise, you can set + default_text_search_config in each session. + Many functions also take an optional configuration name. + + + + To make it easier to build custom text search configurations, a + configuration is built up from simpler database objects. + PostgreSQL's text search facility provides + four types of configuration-related database objects: + + + + + + Text search parsers break documents into lexemes + and classify each lexeme (for example, as words or numbers). + + + + + + Text search dictionaries convert lexemes to normalized + form and reject stop words. + + + + + + Text search templates provide the functions underlying + dictionaries. (A dictionary simply specifies a template and a set + of parameters for the template.) + + + + + + Text search configurations specify a parser and a set + of dictionaries to use to normalize the lexemes produced by the parser. + + + + + + Text search parsers and templates are built from low-level C functions; + therefore it requires C programming ability to develop new ones, and + superuser privileges to install one into a database. (There are examples + of add-on parsers and templates in the contrib/ area of the + PostgreSQL distribution.) Since dictionaries and + configurations just parameterize and connect together some underlying + parsers and templates, no special privilege is needed to create a new + dictionary or configuration. Examples of creating custom dictionaries and + configurations appear later in this chapter. @@ -312,35 +362,43 @@ SELECT 'fat & cow'::tsquery @@ 'a fat cat sat on a mat and ate a fat rat'::t Searching a Table - It is possible to do full text table search with no index. 
A simple query
- to find all title entries that contain the word
- friend is:
+ It is possible to do full text search with no index. A simple query
+ to print the title of each row that contains the word
+ friend in its body field is:

SELECT title
FROM pgweb
-WHERE to_tsvector('english', body) @@ to_tsquery('friend')
+WHERE to_tsvector('english', body) @@ to_tsquery('english', 'friend')

- The query above uses the english configuration set by .
+ The query above specifies that the english configuration
+ is to be used to parse and normalize the strings. Alternatively we
+ could omit the configuration parameters:
+
+SELECT title
+FROM pgweb
+WHERE to_tsvector(body) @@ to_tsquery('friend')
+
+ This query will use the configuration set by .

A more complex query is to
- select the ten most recent documents which contain create and
- table in the title or body:
+ select the ten most recent documents that contain create and
+ table in the title or body:

SELECT title
FROM pgweb
-WHERE to_tsvector('english', title || body) @@ to_tsquery('create & table')
+WHERE to_tsvector(title || body) @@ to_tsquery('create & table')
ORDER BY dlm DESC LIMIT 10;

- dlm is the last-modified date so we
- used ORDER BY dlm LIMIT 10 to get the ten most recent
- matches. For clarity we omitted the coalesce function
- which prevents the unwanted effect of NULL
- concatenation.
+ dlm is the last-modified date so we
+ used ORDER BY dlm LIMIT 10 to get the ten most recent
+ matches. For clarity we omitted the COALESCE function
+ which would be needed to search rows that contain NULL
+ in one of the two fields.

@@ -349,15 +407,15 @@ ORDER BY dlm DESC LIMIT 10;

Creating Indexes

- We can create a GIN () index to speed up the search:
+ We can create a GIN index () to speed up the search:

CREATE INDEX pgweb_idx ON pgweb USING gin(to_tsvector('english', body));

- Notice that the 2-argument version of to_tsvector is
- used. Only text search functions which specify a configuration name can
+ Notice that the 2-argument version of to_tsvector is
+ used. Only text search functions that specify a configuration name can
be used in expression indexes (). This is
because the index contents must be unaffected by . If they were affected, the

@@ -371,15 +429,15 @@ CREATE INDEX pgweb_idx ON pgweb USING gin(to_tsvector('english', body));

Because the two-argument version of to_tsvector was
used in the index above, only a query reference that uses the 2-argument
version of to_tsvector with the same configuration
- name will use that index, i.e. WHERE 'a & b' @@
- to_svector('english', body) will use the index, but WHERE
- 'a & b' @@ to_svector(body)) and WHERE 'a & b' @@
- body::tsvector will not. This guarantees that an index will be used
- only with the same configuration used to create the index rows.
+ name will use that index. That is, WHERE
+ to_tsvector('english', body) @@ 'a & b' can use the index,
+ but WHERE to_tsvector(body) @@ 'a & b' cannot.
+ This ensures that an index will be used only with the same configuration
+ used to create the index entries.

- It is possible to setup more complex expression indexes where the
+ It is possible to set up more complex expression indexes where the
configuration name is specified by another column, e.g.:

CREATE INDEX pgweb_idx ON pgweb USING gin(to_tsvector(config_name, body));

where config_name is a column in the pgweb
table. This allows mixed configurations in the same index while
- recording which configuration was used for each index row.
+ recording which configuration was used for each index entry. Again, + queries that are to use the index must be phrased to match, e.g. + WHERE to_tsvector(config_name, body) @@ 'a & b'. @@ -400,7 +460,7 @@ CREATE INDEX pgweb_idx ON pgweb USING gin(to_tsvector('english', title || body)) - A more complex case is to create a separate tsvector column + Another approach is to create a separate tsvector column to hold the output of to_tsvector(). This example is a concatenation of title and body, with ranking information. We assign different labels to them to encode @@ -409,7 +469,7 @@ CREATE INDEX pgweb_idx ON pgweb USING gin(to_tsvector('english', title || body)) ALTER TABLE pgweb ADD COLUMN textsearch_index tsvector; UPDATE pgweb SET textsearch_index = - setweight(to_tsvector('english', coalesce(title,'')), 'A') || ' ' || + setweight(to_tsvector('english', coalesce(title,'')), 'A') || setweight(to_tsvector('english', coalesce(body,'')),'D'); @@ -419,7 +479,7 @@ UPDATE pgweb SET textsearch_index = CREATE INDEX textsearch_idx ON pgweb USING gin(textsearch_index); - After vacuuming, we are ready to perform a fast full text search: + Now we are ready to perform a fast full text search: SELECT ts_rank_cd(textsearch_index, q) AS rank, title @@ -427,13 +487,30 @@ FROM pgweb, to_tsquery('create & table') q WHERE q @@ textsearch_index ORDER BY rank DESC LIMIT 10; + - It is necessary to create a trigger to keep the new tsvector + + When using a separate column to store the tsvector + representation, + it is necessary to create a trigger to keep the tsvector column current anytime title or body changes. - Keep in mind that, just like with expression indexes, it is important to - specify the configuration name when creating text search data types - inside triggers so the column's contents are not affected by changes to - default_text_search_config. + A predefined trigger function tsvector_update_trigger + is available for this, or you can write your own. + Keep in mind that, just as with expression indexes, it is important to + specify the configuration name when creating tsvector values + inside triggers, so that the column's contents are not affected by changes + to default_text_search_config. + + + + The main advantage of this approach over an expression index is that + it is not necessary to explicitly specify the text search configuration + in queries in order to make use of the index. As in the example above, + the query can depend on default_text_search_config. + Another advantage is that searches will be faster, since + it will not be necessary to redo the to_tsvector calls + to verify index matches. (This is more important when using a GiST + index than a GIN index; see .) @@ -447,8 +524,8 @@ ORDER BY rank DESC LIMIT 10; To implement full text searching there must be a function to create a tsvector from a document and a tsquery from a user query. Also, we need to return results in some order, i.e., we need - a function which compares documents with respect to their relevance to - the tsquery. Full text searching in + a function that compares documents with respect to their relevance to + the tsquery. PostgreSQL provides support for all of these functions. @@ -462,7 +539,7 @@ ORDER BY rank DESC LIMIT 10; - Full text searching in PostgreSQL provides + PostgreSQL provides the function to_tsvector, which converts a document to the tsvector data type. 
More details are available in , but for now consider a simple example: @@ -497,20 +574,20 @@ SELECT to_tsvector('english', 'a fat cat sat on a mat - it ate a fat rats'); frequently and have little informational value. In our example these are a, on, and it. The punctuation sign - was also ignored because its - type (Space symbols) is not indexed. All information - about the parser, dictionaries and what types of lexemes to index is - documented in the full text configuration section (Space symbols) is not indexed. The choice of + parser, dictionaries and what types of lexemes to index is determined by + the selected text search configuration (). It is possible to have - several different configurations in the same database, and many predefined - system configurations are available for different languages. In our example + many different configurations in the same database, and predefined + configurations are available for various languages. In our example we used the default configuration english for the English language. As another example, below is the output from the ts_debug - function ( ), which shows all details - of the full text machinery: + function (), which shows all details + of the text search parsing machinery: SELECT * FROM ts_debug('english','a fat cat sat on a mat - it ate a fat rats'); @@ -545,8 +622,9 @@ SELECT * FROM ts_debug('english','a fat cat sat on a mat - it ate a fat rats'); - Function setweight() is used to label - tsvector. The typical usage of this is to mark out the + The function setweight() is used to label the entries + of a tsvector with a given weight. The typical + usage of this is to mark entries coming from different parts of a document, perhaps by importance. Later, this can be used for ranking of search results in addition to positional information (distance between query terms). If no ranking is required, positional @@ -555,18 +633,24 @@ SELECT * FROM ts_debug('english','a fat cat sat on a mat - it ate a fat rats'); - Because to_tsvector(NULL) can - return NULL, it is recommended to use - coalesce. Here is the safe method for creating a - tsvector from a structured document: + Because to_tsvector(NULL) will + return NULL, it is recommended to use + coalesce whenever a field might be null. + Here is the recommended method for creating + a tsvector from a structured document: -UPDATE tt SET ti= - setweight(to_tsvector(coalesce(title,'')), 'A') || ' ' || - setweight(to_tsvector(coalesce(keyword,'')), 'B') || ' ' || - setweight(to_tsvector(coalesce(abstract,'')), 'C') || ' ' || +UPDATE tt SET ti = + setweight(to_tsvector(coalesce(title,'')), 'A') || + setweight(to_tsvector(coalesce(keyword,'')), 'B') || + setweight(to_tsvector(coalesce(abstract,'')), 'C') || setweight(to_tsvector(coalesce(body,'')), 'D'); + + Here we have used setweight() to label the source + of each lexeme in the finished tsvector, and then merged + the labeled tsvector values using the concatenation + operator ||. @@ -588,10 +672,10 @@ UPDATE tt SET ti= - Parses the given document and returns a series - of records, one for each token produced by parsing. Each record includes - a tokid giving its type and a token - which gives its content: + Parses the given document and returns a + series of records, one for each token produced by parsing. 
Each record + includes a tokid giving its type and a + token which gives its content: SELECT * FROM ts_parse('default','123 - a number'); @@ -622,10 +706,10 @@ SELECT * FROM ts_parse('default','123 - a number'); Returns a table which describes each kind of token the - parser might produce as output. For each token + parser can recognize. For each token type the table gives the tokid which the - parser uses to label each - token of that type, the alias which + parser uses to label a + token of that type, the alias which names the token type, and a short description: @@ -672,13 +756,13 @@ SELECT * FROM ts_token_type('default'); Ranking attempts to measure how relevant documents are to a particular query by inspecting the number of times each search word appears in the - document, and whether different search terms occur near each other. Full - text searching provides two predefined ranking functions which attempt to - produce a measure of how a document is relevant to the query. In spite - of that, the concept of relevancy is vague and very application-specific. - These functions try to take into account lexical, proximity, and structural - information. Different applications might require additional information - for ranking, e.g. document modification time. + document, and whether different search terms occur near each other. + PostgreSQL provides two predefined ranking + functions, which take into account lexical, + proximity, and structural information. However, the concept of + relevancy is vague and very application-specific. Different applications + might require additional information for ranking, e.g. document + modification time. @@ -702,7 +786,7 @@ SELECT * FROM ts_token_type('default'); - ts_rank( weights float4[], vector TSVECTOR, query TSQUERY, normalization int4 ) returns float4 + ts_rank( weights float4[], vector tsvector, query tsquery , normalization int4 ) returns float4 @@ -738,7 +822,7 @@ SELECT * FROM ts_token_type('default'); - ts_rank_cd( weights float4[], vector TSVECTOR, query TSQUERY, normalization int4 ) returns float4 + ts_rank_cd( weights float4[], vector tsvector, query tsquery , normalization int4 ) returns float4 @@ -747,7 +831,7 @@ SELECT * FROM ts_token_type('default'); This function computes the cover density ranking for the given document vector and query, as described in Clarke, Cormack, and Tudhope's "Relevance Ranking for One to Three Term Queries" in the - "Information Processing and Management", 1999. + journal "Information Processing and Management", 1999. @@ -763,8 +847,9 @@ SELECT * FROM ts_token_type('default'); than a thousand-word document with five instances. Both ranking functions take an integer normalization option that specifies whether a document's length should impact its rank. The integer - option controls several behaviors which is done using bit-wise fields and - | (for example, 2|4): + option controls several behaviors, so it is a bit mask: you can specify + one or more behaviors using + | (for example, 2|4). @@ -803,7 +888,7 @@ SELECT * FROM ts_token_type('default'); - It is important to note that ranking functions do not use any global + It is important to note that the ranking functions do not use any global information so it is impossible to produce a fair normalization to 1% or 100%, as sometimes required. However, a simple technique like rank/(rank+1) can be applied. 
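For example, here is a sketch of that normalization applied to the pgweb example from earlier in this chapter (assuming the textsearch_index column created there):

SELECT title,
       ts_rank_cd(textsearch_index, q) /
       (ts_rank_cd(textsearch_index, q) + 1) AS rank
FROM pgweb, to_tsquery('create & table') q
WHERE q @@ textsearch_index
ORDER BY rank DESC LIMIT 10;
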
Of course, this is just @@ -866,8 +951,8 @@ ORDER BY rnk DESC LIMIT 10; Ranking can be expensive since it requires consulting the tsvector of all documents, which can be I/O bound and therefore slow. Unfortunately, it is almost impossible to avoid since full - text searching in a database should work without indexes . Moreover an index can be lossy (a GiST + text searching in a database should work without indexes. Moreover an index can be lossy (a GiST index, for example) so it must check documents to avoid false hits. @@ -889,9 +974,9 @@ ORDER BY rnk DESC LIMIT 10; To present search results it is ideal to show a part of each document and how it is related to the query. Usually, search engines show fragments of - the document with marked search terms. PostgreSQL full - text searching provides the function headline that - implements such functionality. + the document with marked search terms. PostgreSQL + provides a function headline that + implements this functionality. @@ -900,18 +985,18 @@ ORDER BY rnk DESC LIMIT 10; - ts_headline( config_name text, document text, query TSQUERY, options text ) returns text + ts_headline( config_name text, document text, query tsquery , options text ) returns text - The ts_headline function accepts a document along with - a query, and returns one or more ellipsis-separated excerpts from the - document in which terms from the query are highlighted. The configuration - used to parse the document can be specified by its - config_name; if none is specified, the current - configuration is used. + The ts_headline function accepts a document along + with a query, and returns one or more ellipsis-separated excerpts from + the document in which terms from the query are highlighted. The + configuration to be used to parse the document can be specified by its + config_name; if none is specified, the + default_text_search_config configuration is used. @@ -921,7 +1006,8 @@ ORDER BY rnk DESC LIMIT 10; If an options string is specified it should - consist of a comma-separated list of one or more 'option=value' pairs. + consist of a comma-separated list of one or more + option=value pairs. The available options are: @@ -934,21 +1020,21 @@ ORDER BY rnk DESC LIMIT 10; - MaxWords, MinWords: limit the shortest and - longest headlines to output + MaxWords, MinWords: these numbers + determine the longest and shortest headlines to output. - ShortWord: this prevents your headline from beginning - or ending with a word which has this many characters or less. The default + ShortWord: the minimum length of a word that begins + or ends a headline. The default value of three eliminates the English articles. HighlightAll: boolean flag; if - true the whole document will be highlighted + true the whole document will be highlighted. @@ -972,16 +1058,16 @@ SELECT ts_headline('a b c', 'c'::tsquery); SELECT ts_headline('a b c', 'c'::tsquery, 'StartSel=<,StopSel=>'); ts_headline ------------- - a b <c> + a b <c> headline uses the original document, not tsvector, so it can be slow and should be used with care. - A typical mistake is to call headline() for + A typical mistake is to call headline for every matching document when only ten documents are - shown. SQL subselects can help here; below is an + to be shown. SQL subselects can help; here is an example: @@ -992,12 +1078,6 @@ ORDER BY rank DESC LIMIT 10) AS foo; - - Note that the cascade dropping of the parser function - causes dropping of the ts_headline used in the full text search - configuration config_name. 
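To illustrate the options string further, here is a sketch that combines several of the options described above (the sample sentence is invented for this example):

SELECT ts_headline('english',
       'The quick brown fox jumps over the lazy dog',
       to_tsquery('english', 'fox'),
       'StartSel=<em>, StopSel=</em>, MaxWords=10, MinWords=5');
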
- - @@ -1051,7 +1131,7 @@ ORDER BY rank DESC LIMIT 10) AS foo; - Colour names are substituted by their hexadecimal values, e.g., + Color names are substituted by their hexadecimal values, e.g., red, green, blue, magenta -> FF0000, 00FF00, 0000FF, FF00FF @@ -1068,7 +1148,7 @@ ORDER BY rank DESC LIMIT 10) AS foo; - A dictionary is a program which accepts lexemes as + A dictionary is a program that accepts lexemes as input and returns: @@ -1078,7 +1158,7 @@ ORDER BY rank DESC LIMIT 10) AS foo; - a void array if the dictionary knows the lexeme, but it is a stop word + an empty array if the dictionary knows the lexeme, but it is a stop word @@ -1090,30 +1170,31 @@ ORDER BY rank DESC LIMIT 10) AS foo; - Full text searching provides predefined dictionaries for many languages, - and SQL commands to manipulate them. There are also - several predefined template dictionaries that can be used to create new - dictionaries by overriding their default parameters. Besides this, it is - possible to develop custom dictionaries using an API; - see the dictionary for integers () as an example. + PostgreSQL provides predefined dictionaries for + many languages. There are also several predefined templates that can be + used to create new dictionaries with custom parameters. If no existing + dictionary template is suitable, it is possible to create new ones; see the + contrib/ area of the PostgreSQL distribution + for examples. - The ALTER TEXT SEARCH CONFIGURATION ADD - MAPPING command binds specific types of lexemes and a set of - dictionaries to process them. (Mappings can also be specified as part of - configuration creation.) Lexemes are processed by a stack of dictionaries - until some dictionary identifies it as a known word or it turns out to be - a stop word. If no dictionary recognizes a lexeme, it will be discarded - and not indexed. A general rule for configuring a stack of dictionaries + A text search configuration binds a parser together with a set of + dictionaries to process the parser's output lexemes. For each token + type that the parser can return, a separate stack of dictionaries is + specified by the configuration. When a lexeme of that type is found + by the parser, each dictionary in the stack is consulted in turn, + until some dictionary recognizes it as a known word. If it is identified + as a stop word, or if no dictionary recognizes the lexeme, it will be + discarded and not indexed or searched for. + The general rule for configuring a stack of dictionaries is to place first the most narrow, most specific dictionary, then the more - general dictionaries and finish it with a very general dictionary, like - the snowball stemmer or simple, which + general dictionaries, finishing with a very general dictionary, like + a Snowball stemmer or simple, which recognizes everything. For example, for an astronomy-specific search (astro_en configuration) one could bind lword (latin word) with a synonym dictionary of astronomical - terms, a general English dictionary and a snowball English + terms, a general English dictionary and a Snowball English stemmer: @@ -1122,27 +1203,11 @@ ALTER TEXT SEARCH CONFIGURATION astro_en - - Function ts_lexize can be used to test dictionaries, - for example: - - -SELECT ts_lexize('english_stem', 'stars'); - ts_lexize ------------ - {star} -(1 row) - - - Also, the ts_debug function () is helpful for testing. 
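Spelled out in full, the astro_en mapping sketched above might be created like this (a sketch: astrosyn is the hypothetical astronomy synonym dictionary, while english_ispell and english_stem are assumed to be set up as described elsewhere in this chapter):

ALTER TEXT SEARCH CONFIGURATION astro_en
    ADD MAPPING FOR lword
    WITH astrosyn, english_ispell, english_stem;
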
- -

Stop Words

- Stop words are words which are very common, appear in almost every
+ Stop words are words that are very common, appear in almost every
document, and have no discrimination value. Therefore, they can be ignored
in the context of full text searching. For example, every English text
contains words like a and the, so it is

@@ -1156,7 +1221,7 @@ SELECT to_tsvector('english','in the list of stop words');
'list':3 'stop':5 'word':6

- The gaps between positions 1-3 and 3-5 are because of stop words, so ranks
+ The missing positions 1,2,4 are because of stop words. Ranks
calculated for documents with and without stop words are quite different:

@@ -1176,9 +1241,9 @@ SELECT ts_rank_cd ('{1,1,1,1}', to_tsvector('english','list stop words'), to_tsq

It is up to the specific dictionary how it treats stop words. For example,
ispell dictionaries first normalize words and then
- look at the list of stop words, while stemmers
+ look at the list of stop words, while Snowball stemmers
first check the list of stop words. The reason for the different
- behaviour is an attempt to decrease possible noise.
+ behavior is an attempt to decrease noise.

@@ -1224,7 +1289,7 @@ SELECT ts_lexize('public.simple_dict','The');

Synonym Dictionary

- This dictionary template is used to create dictionaries which replace a
+ This dictionary template is used to create dictionaries that replace a
word with a synonym. Phrases are not supported (use the thesaurus
dictionary () for that). A synonym
dictionary can be used to overcome linguistic problems, for example, to

@@ -1260,7 +1325,7 @@ SELECT * FROM ts_debug('english','Paris');

A thesaurus dictionary (sometimes abbreviated as TZ) is
- a collection of words which includes information about the relationships
+ a collection of words that includes information about the relationships
of words and phrases, i.e., broader terms (BT), narrower
terms (NT), preferred terms, non-preferred terms, related
terms, etc.

@@ -1321,15 +1386,14 @@ the one a two : swsw2

- As any normal dictionary, it can be assigned to the specific lexeme types.
Since a thesaurus dictionary has the capability to recognize phrases it
must remember its state and interact with the parser. A thesaurus dictionary
uses these assignments to check if it should handle the next word or stop
- accumulation. The thesaurus dictionary compiler must be configured
+ accumulation. The thesaurus dictionary must be configured
carefully. For example, if the thesaurus dictionary is assigned to handle
only the lword lexeme, then a thesaurus dictionary
definition like ' one 7' will not work since lexeme type
- digit is not assigned to the thesaurus dictionary.
+ uint is not assigned to the thesaurus dictionary.

@@ -1506,8 +1570,8 @@ SELECT ts_lexize('english_ispell','banked');

To create an ispell dictionary one should use the built-in
- ispell dictionary and specify several
- parameters.
+ ispell template and specify several
+ parameters:

@@ -1618,9 +1682,10 @@ CREATE TEXT SEARCH DICTIONARY english_stem (

- Returns an array of lexemes if the input lexeme
- is known to the dictionary dictname, or a void
- array if the lexeme is known to the dictionary but it is a stop word, or
+ Returns an array of lexemes if the input
+ lexeme is known to the dictionary
+ dict_name, or an empty array if the lexeme
+ is known to the dictionary but it is a stop word, or
NULL if it is an unknown word.

@@ -1668,20 +1733,25 @@ SELECT plainto_tsquery('supernovae stars');

+
+ Also, the ts_debug function () is helpful for testing dictionaries.
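For instance, ts_lexize makes the different outcomes easy to see with the built-in english_stem dictionary (output shown as psql would display it):

SELECT ts_lexize('english_stem', 'the');
 ts_lexize
-----------
 {}

SELECT ts_lexize('english_stem', 'supernovaes');
  ts_lexize
-------------
 {supernova}

The empty array shows that the stop word the was recognized and discarded, while supernovaes was normalized to supernova.
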
+ + Configuration Example - A full text configuration specifies all options necessary to transform a + A text search configuration specifies all options necessary to transform a document into a tsvector: the parser breaks text into tokens, and the dictionaries transform each token into a lexeme. Every call to to_tsvector() and to_tsquery() needs a configuration to perform its processing. To facilitate management - of full text searching objects, a set of SQL commands - is available, and there are several psql commands which display information - about full text searching objects (). + of text search objects, a set of SQL commands + is available, and there are several psql commands that display information + about text search objects (). @@ -1695,14 +1765,14 @@ SELECT plainto_tsquery('supernovae stars'); - Several predefined text searching configurations are available in the + Several predefined text search configurations are available in the pg_catalog schema. If you need a custom configuration - you can create a new text searching configuration and modify it using SQL + you can create a new text search configuration and modify it using SQL commands. - New text searching objects are created in the current schema by default + New text search objects are created in the current schema by default (usually the public schema), but a schema-qualified name can be used to create objects in the specified schema. @@ -1734,7 +1804,7 @@ postgresql pg CREATE TEXT SEARCH DICTIONARY pg_dict ( - TEMPLATE = synonym + TEMPLATE = synonym, SYNONYMS = pg_dict ); @@ -1778,32 +1848,13 @@ ALTER TEXT SEARCH CONFIGURATION pg Now, we can test our configuration: +COMMIT; + SELECT * FROM ts_debug('public.pg', ' PostgreSQL, the highly scalable, SQL compliant, open source object-relational database management system, is now undergoing beta testing of the next -version of our software: PostgreSQL 8.3. +version of our software. '); - - COMMIT; - - - - - With the dictionaries and mappings set up, suppose we have a table - pgweb which contains 11239 documents from the - PostgreSQL web site. Only relevant columns - are shown: - - -=> \d pgweb - Table "public.pgweb" - Column | Type | Modifiers ------------+-------------------+----------- - tid | integer | not null - path | character varying | not null - body | character varying | - title | character varying | - dlm | date | @@ -1842,7 +1893,7 @@ SHOW default_text_search_config; - There are two kinds of indexes which can be used to speed up full text + There are two kinds of indexes that can be used to speed up full text operators (). Note that indexes are not mandatory for full text searching. @@ -1952,8 +2003,8 @@ EXPLAIN SELECT * FROM apod WHERE textsearch @@ to_tsquery('supernovae'); Actually, this is not the whole story. GiST indexes have an optimization - for storing small tsvectors (< TOAST_INDEX_TARGET - bytes, 512 bytes). On leaf pages small tsvectors are stored unchanged, + for storing small tsvectors (under TOAST_INDEX_TARGET + bytes, 512 bytes by default). On leaf pages small tsvectors are stored unchanged, while longer ones are represented by their signatures, which introduces some lossiness. Unfortunately, the existing index API does not allow for a return value to say whether it found an exact value (tsvector) or whether @@ -1973,7 +2024,7 @@ EXPLAIN SELECT * FROM apod WHERE textsearch @@ to_tsquery('supernovae'); not accessed. However, label information is not stored in the index, so if the query involves label weights it must access the heap. 
Therefore, a special full text search operator @@@
- was created which forces the use of the heap to get information about
+ was created that forces the use of the heap to get information about
labels. GiST indexes are lossy so it always reads the heap and there is no
need for a special operator. In the example below,
fulltext_idx is a GIN index:

- Another example — the PostgreSQL mailing list
- archives contained 910,989 unique words with 57,491,343 lexemes in 461,020
- messages.

<application>psql</> Support

- Information about full text searching objects can be obtained
+ Information about text search configuration objects can be obtained
in psql using a set of commands:

\dF{d,p,t}+ PATTERN

@@ -2093,8 +2102,8 @@ EXPLAIN SELECT * FROM apod WHERE textsearch @@@ to_tsquery('supernovae:a');

The optional parameter PATTERN should be the name of
- a text searching object, optionally schema-qualified. If
- PATTERN is not specified then information about all
+ a text search object, optionally schema-qualified. If
+ PATTERN is omitted then information about all
visible objects will be displayed. PATTERN can be a
regular expression and can provide separate patterns
for the schema and object names. The following examples illustrate this:

@@ -2115,16 +2124,18 @@ EXPLAIN SELECT * FROM apod WHERE textsearch @@@ to_tsquery('supernovae:a');
fulltext | fulltext_cfg | public | fulltext_cfg |

+
+ The available commands are:
+

- \dF[+] [PATTERN]
+ \dF+ PATTERN

- List text searching configurations (add + for more detail).
+ List text search configurations (add + for more detail).

@@ -2166,7 +2177,7 @@ Parser: "pg_catalog.default"

- \dFd[+] [PATTERN]
+ \dFd+ PATTERN

List text search dictionaries (add + for more detail).

@@ -2201,7 +2212,7 @@ Parser: "pg_catalog.default"

- \dFp[+] [PATTERN]
+ \dFp+ PATTERN

List text search parsers (add + for more detail).

@@ -2258,7 +2269,7 @@ Parser: "pg_catalog.default"

- \dFt[+] [PATTERN]
+ \dFt+ PATTERN

List text search templates (add + for more detail).

@@ -2284,16 +2295,59 @@ Parser: "pg_catalog.default"

+
+ Limitations
+
+ The current limitations of PostgreSQL's
+ text search features are:
+
+ The length of each lexeme must be less than 2K bytes
+
+ The length of a tsvector (lexemes + positions) must be less than 1 megabyte
+
+ The number of lexemes must be less than 2^64
+
+ Positional information must be greater than 0 and less than 16,383
+
+ No more than 256 positions per lexeme
+
+ The number of nodes (lexemes + operations) in tsquery must be less than 32,768
+
+
+ For comparison, the PostgreSQL 8.1 documentation
+ contained 10,441 unique words, a total of 335,420 words, and the most frequent
+ word postgresql was mentioned 6,127 times in 655 documents.
+
+
+ Another example — the PostgreSQL mailing list
+ archives contained 910,989 unique words with 57,491,343 lexemes in 461,020
+ messages.
+
+

Debugging

- Function ts_debug allows easy testing of your full text searching
- configuration.
+ The function ts_debug allows easy testing of a
+ text search configuration.
- ts_debug(config_name, document TEXT) returns SETOF ts_debug + ts_debug( config_name, document text) returns SETOF ts_debug @@ -2304,7 +2358,7 @@ Parser: "pg_catalog.default" - ts_debug's result type is defined as: + ts_debug's result type is defined as: CREATE TYPE ts_debug AS ( @@ -2320,8 +2374,7 @@ CREATE TYPE ts_debug AS ( For a demonstration of how function ts_debug works we first create a public.english configuration and - ispell dictionary for the English language. You can skip the test step and - play with the standard english configuration. + ispell dictionary for the English language: @@ -2340,24 +2393,25 @@ ALTER TEXT SEARCH CONFIGURATION public.english SELECT * FROM ts_debug('public.english','The Brightest supernovaes'); - Alias | Description | Token | Dictionaries | Lexized token --------+---------------+-------------+---------------------------------------+--------------------------------- + Alias | Description | Token | Dictionaries | Lexized token +-------+---------------+-------------+-------------------------------------------------+------------------------------------- lword | Latin word | The | {public.english_ispell,pg_catalog.english_stem} | public.english_ispell: {} - blank | Space symbols | | | + blank | Space symbols | | | lword | Latin word | Brightest | {public.english_ispell,pg_catalog.english_stem} | public.english_ispell: {bright} - blank | Space symbols | | | + blank | Space symbols | | | lword | Latin word | supernovaes | {public.english_ispell,pg_catalog.english_stem} | pg_catalog.english_stem: {supernova} (5 rows) - In this example, the word Brightest was recognized by a - parser as a Latin word (alias lword) - and came through the dictionaries public.english_ispell and - pg_catalog.english_stem. It was recognized by + In this example, the word Brightest was recognized by the + parser as a Latin word (alias lword). + For this token type the dictionary stack is + public.english_ispell and + pg_catalog.english_stem. The word was recognized by public.english_ispell, which reduced it to the noun bright. The word supernovaes is unknown - by the public.english_ispell dictionary so it was passed to + to the public.english_ispell dictionary so it was passed to the next dictionary, and, fortunately, was recognized (in fact, public.english_stem is a stemming dictionary and recognizes everything; that is why it was placed at the end of the dictionary stack). @@ -2375,7 +2429,7 @@ SELECT * FROM ts_debug('public.english','The Brightest supernovaes'); SELECT "Alias", "Token", "Lexized token" FROM ts_debug('public.english','The Brightest supernovaes'); Alias | Token | Lexized token --------+-------------+--------------------------------- +-------+-------------+-------------------------------------- lword | The | public.english_ispell: {} blank | | lword | Brightest | public.english_ispell: {bright} @@ -2387,583 +2441,4 @@ FROM ts_debug('public.english','The Brightest supernovaes'); - - Example of Creating a Rule-Based Dictionary - - - The motivation for this example dictionary is to control the indexing of - integers (signed and unsigned), and, consequently, to minimize the number - of unique words which greatly affects to performance of searching. - - - - The dictionary accepts two options: - - - - - The MAXLEN parameter specifies the maximum length of the - number considered as a 'good' integer. The default value is 6. - - - - - - The REJECTLONG parameter specifies if a 'long' integer - should be indexed or treated as a stop word. 
If - REJECTLONG=FALSE (default), - the dictionary returns the prefixed part of the integer with length - MAXLEN. If - REJECTLONG=TRUE, the dictionary - considers a long integer as a stop word. - - - - - - - - - A similar idea can be applied to the indexing of decimal numbers, for - example, in the DecDict dictionary. The dictionary - accepts two options: the MAXLENFRAC parameter specifies - the maximum length of the fractional part considered as a 'good' decimal. - The default value is 3. The REJECTLONG parameter - controls whether a decimal number with a 'long' fractional part should be indexed - or treated as a stop word. If - REJECTLONG=FALSE (default), - the dictionary returns the decimal number with the length of its fraction part - truncated to MAXLEN. If - REJECTLONG=TRUE, the dictionary - considers the number as a stop word. Notice that - REJECTLONG=FALSE allows the indexing - of 'shortened' numbers and search results will contain documents with - shortened numbers. - - - - Examples: - - -SELECT ts_lexize('intdict', 11234567890); - ts_lexize ------------ - {112345} - - - - - Now, we want to ignore long integers: - - - -ALTER TEXT SEARCH DICTIONARY intdict ( - MAXLEN = 6, REJECTLONG = TRUE -); - -SELECT ts_lexize('intdict', 11234567890); - ts_lexize ------------ - {} - - - - - Create contrib/dict_intdict directory with files - dict_tmpl.c, Makefile, dict_intdict.sql.in: - - -$ make && make install -$ psql DBNAME < dict_intdict.sql - - - - - This is a dict_tmpl.c file: - - - -#include "postgres.h" -#include "utils/builtins.h" -#include "fmgr.h" - -#ifdef PG_MODULE_MAGIC -PG_MODULE_MAGIC; -#endif - -#include "tsearch/ts_locale.h" -#include "tsearch/ts_public.h" -#include "tsearch/ts_utils.h" - -typedef struct { - int maxlen; - bool rejectlong; -} DictInt; - - -PG_FUNCTION_INFO_V1(dinit_intdict); -Datum dinit_intdict(PG_FUNCTION_ARGS); - -Datum -dinit_intdict(PG_FUNCTION_ARGS) { - DictInt *d = (DictInt*)malloc( sizeof(DictInt) ); - Map *cfg, *pcfg; - text *in; - - if (!d) - elog(ERROR, "No memory"); - memset(d, 0, sizeof(DictInt)); - - /* Your INIT code */ - /* defaults */ - d->maxlen = 6; - d->rejectlong = false; - - if (PG_ARGISNULL(0) || PG_GETARG_POINTER(0) == NULL) /* no options */ - PG_RETURN_POINTER(d); - - in = PG_GETARG_TEXT_P(0); - parse_keyvalpairs(in, &cfg); - PG_FREE_IF_COPY(in, 0); - pcfg=cfg; - - while (pcfg->key) - { - if (strcasecmp("MAXLEN", pcfg->key) == 0) - d->maxlen=atoi(pcfg->value); - else if ( strcasecmp("REJECTLONG", pcfg->key) == 0) - { - if ( strcasecmp("true", pcfg->value) == 0 ) - d->rejectlong=true; - else if ( strcasecmp("false", pcfg->value) == 0) - d->rejectlong=false; - else - elog(ERROR,"Unknown value: %s => %s", pcfg->key, pcfg->value); - } - else - elog(ERROR,"Unknown option: %s => %s", pcfg->key, pcfg->value); - - pfree(pcfg->key); - pfree(pcfg->value); - pcfg++; - } - pfree(cfg); - - PG_RETURN_POINTER(d); - } - -PG_FUNCTION_INFO_V1(dlexize_intdict); -Datum dlexize_intdict(PG_FUNCTION_ARGS); -Datum -dlexize_intdict(PG_FUNCTION_ARGS) -{ - DictInt *d = (DictInt*)PG_GETARG_POINTER(0); - char *in = (char*)PG_GETARG_POINTER(1); - char *txt = pnstrdup(in, PG_GETARG_INT32(2)); - TSLexeme *res = palloc(sizeof(TSLexeme) * 2); - - /* Your INIT dictionary code */ - res[1].lexeme = NULL; - - if (PG_GETARG_INT32(2) > d->maxlen) - { - if (d->rejectlong) - { /* stop, return void array */ - pfree(txt); - res[0].lexeme = NULL; - } - else - { /* cut integer */ - txt[d->maxlen] = '\0'; - res[0].lexeme = txt; - } - } - else - res[0].lexeme = txt; - - 
PG_RETURN_POINTER(res); -} - - - - This is the Makefile: - - -subdir = contrib/dict_intdict -top_builddir = ../.. -include $(top_builddir)/src/Makefile.global - -MODULE_big = dict_intdict -OBJS = dict_tmpl.o -DATA_built = dict_intdict.sql -DOCS = - -include $(top_srcdir)/contrib/contrib-global.mk - - - - - This is a dict_intdict.sql.in: - - -SET default_text_search_config = 'english'; - -BEGIN; - -CREATE OR REPLACE FUNCTION dinit_intdict(internal) - RETURNS internal - AS 'MODULE_PATHNAME' - LANGUAGE 'C'; - -CREATE OR REPLACE FUNCTION dlexize_intdict(internal,internal,internal,internal) - RETURNS internal - AS 'MODULE_PATHNAME' - LANGUAGE 'C' - WITH (isstrict); - -CREATE TEXT SEARCH TEMPLATE intdict_template ( - LEXIZE = dlexize_intdict, INIT = dinit_intdict -); - -CREATE TEXT SEARCH DICTIONARY intdict ( - TEMPLATE = intdict_template, - MAXLEN = 6, REJECTLONG = false -); - -COMMENT ON TEXT SEARCH DICTIONARY intdict IS 'Dictionary for Integers'; - -END; - - - - - - - Example of Creating a Parser - - - SQL command CREATE TEXT SEARCH PARSER creates - a parser for full text searching. In our example we will implement - a simple parser which recognizes space-delimited words and - has only two types (3, word, Word; 12, blank, Space symbols). Identifiers - were chosen to keep compatibility with the default headline() function - since we do not implement our own version. - - - - To implement a parser one needs to create a minimum of four functions. - - - - - - - - START = start_function - - - - - Initialize the parser. Arguments are a pointer to the parsed text and its - length. - - - Returns a pointer to the internal structure of a parser. Note that it should - be malloced or palloced in the - TopMemoryContext. We name it ParserState. - - - - - - - - GETTOKEN = gettoken_function - - - - - Returns the next token. - Arguments are ParserState *, char **, int *. - - - This procedure will be called as long as the procedure returns token type zero. - - - - - - - - END = end_function, - - - - - This void function will be called after parsing is finished to free - allocated resources in this procedure (ParserState). The argument - is ParserState *. - - - - - - - - LEXTYPES = lextypes_function - - - - - Returns an array containing the id, alias, and the description of the tokens - in the parser. See LexDescr in src/include/utils/ts_public.h. - - - - - - - - Below is the source code of our test parser, organized as a contrib module. - - - - Testing: - - -SELECT * FROM ts_parse('testparser','That''s my first own parser'); - tokid | token --------+-------- - 3 | That's - 12 | - 3 | my - 12 | - 3 | first - 12 | - 3 | own - 12 | - 3 | parser - -SELECT to_tsvector('testcfg','That''s my first own parser'); - to_tsvector -------------------------------------------------- - 'my':2 'own':4 'first':3 'parser':5 'that''s':1 - -SELECT ts_headline('testcfg','Supernovae stars are the brightest phenomena in galaxies', to_tsquery('testcfg', 'star')); - headline ------------------------------------------------------------------ - Supernovae <b>stars</b> are the brightest phenomena in galaxies - - - - - - This test parser is an example adopted from a tutorial by Valli, parser - HOWTO. 
- - - - To compile the example just do: - - -$ make -$ make install -$ psql regression < test_parser.sql - - - - - This is a test_parser.c: - - - -#ifdef PG_MODULE_MAGIC -PG_MODULE_MAGIC; -#endif - -/* - * types - */ - -/* self-defined type */ -typedef struct { - char * buffer; /* text to parse */ - int len; /* length of the text in buffer */ - int pos; /* position of the parser */ -} ParserState; - -/* copy-paste from wparser.h of tsearch2 */ -typedef struct { - int lexid; - char *alias; - char *descr; -} LexDescr; - -/* - * prototypes - */ -PG_FUNCTION_INFO_V1(testprs_start); -Datum testprs_start(PG_FUNCTION_ARGS); - -PG_FUNCTION_INFO_V1(testprs_getlexeme); -Datum testprs_getlexeme(PG_FUNCTION_ARGS); - -PG_FUNCTION_INFO_V1(testprs_end); -Datum testprs_end(PG_FUNCTION_ARGS); - -PG_FUNCTION_INFO_V1(testprs_lextype); -Datum testprs_lextype(PG_FUNCTION_ARGS); - -/* - * functions - */ -Datum testprs_start(PG_FUNCTION_ARGS) -{ - ParserState *pst = (ParserState *) palloc(sizeof(ParserState)); - pst->buffer = (char *) PG_GETARG_POINTER(0); - pst->len = PG_GETARG_INT32(1); - pst->pos = 0; - - PG_RETURN_POINTER(pst); -} - -Datum testprs_getlexeme(PG_FUNCTION_ARGS) -{ - ParserState *pst = (ParserState *) PG_GETARG_POINTER(0); - char **t = (char **) PG_GETARG_POINTER(1); - int *tlen = (int *) PG_GETARG_POINTER(2); - int type; - - *tlen = pst->pos; - *t = pst->buffer + pst->pos; - - if ((pst->buffer)[pst->pos] == ' ') - { - /* blank type */ - type = 12; - /* go to the next non-white-space character */ - while ((pst->buffer)[pst->pos] == ' ' && - pst->pos < pst->len) - (pst->pos)++; - } else { - /* word type */ - type = 3; - /* go to the next white-space character */ - while ((pst->buffer)[pst->pos] != ' ' && - pst->pos < pst->len) - (pst->pos)++; - } - - *tlen = pst->pos - *tlen; - - /* we are finished if (*tlen == 0) */ - if (*tlen == 0) - type=0; - - PG_RETURN_INT32(type); -} - -Datum testprs_end(PG_FUNCTION_ARGS) -{ - ParserState *pst = (ParserState *) PG_GETARG_POINTER(0); - pfree(pst); - PG_RETURN_VOID(); -} - -Datum testprs_lextype(PG_FUNCTION_ARGS) -{ - /* - Remarks: - - we have to return the blanks for headline reason - - we use the same lexids like Teodor in the default - word parser; in this way we can reuse the headline - function of the default word parser. - */ - LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (2+1)); - - /* there are only two types in this parser */ - descr[0].lexid = 3; - descr[0].alias = pstrdup("word"); - descr[0].descr = pstrdup("Word"); - descr[1].lexid = 12; - descr[1].alias = pstrdup("blank"); - descr[1].descr = pstrdup("Space symbols"); - descr[2].lexid = 0; - - PG_RETURN_POINTER(descr); -} - - - - This is a Makefile - - -override CPPFLAGS := -I. $(CPPFLAGS) - -MODULE_big = test_parser -OBJS = test_parser.o - -DATA_built = test_parser.sql -DATA = -DOCS = README.test_parser -REGRESS = test_parser - - -ifdef USE_PGXS -PGXS := $(shell pg_config --pgxs) -include $(PGXS) -else -subdir = contrib/test_parser -top_builddir = ../.. 
-include $(top_builddir)/src/Makefile.global -include $(top_srcdir)/contrib/contrib-global.mk -endif - - - This is a test_parser.sql.in: - - -SET default_text_search_config = 'english'; - -BEGIN; - -CREATE FUNCTION testprs_start(internal,int4) - RETURNS internal - AS 'MODULE_PATHNAME' - LANGUAGE 'C' with (isstrict); - -CREATE FUNCTION testprs_getlexeme(internal,internal,internal) - RETURNS internal - AS 'MODULE_PATHNAME' - LANGUAGE 'C' with (isstrict); - -CREATE FUNCTION testprs_end(internal) - RETURNS void - AS 'MODULE_PATHNAME' - LANGUAGE 'C' with (isstrict); - -CREATE FUNCTION testprs_lextype(internal) - RETURNS internal - AS 'MODULE_PATHNAME' - LANGUAGE 'C' with (isstrict); - - -CREATE TEXT SEARCH PARSER testparser ( - START = testprs_start, - GETTOKEN = testprs_getlexeme, - END = testprs_end, - LEXTYPES = testprs_lextype -); - -CREATE TEXT SEARCH CONFIGURATION testcfg (PARSER = testparser); -ALTER TEXT SEARCH CONFIGURATION testcfg ADD MAPPING FOR word WITH simple; - -END; - - - - - - -- 2.40.0