From: Bruce Momjian Date: Wed, 29 Aug 2007 02:37:04 +0000 (+0000) Subject: Properly indent SGML in textsearch.sgml. X-Git-Tag: REL8_3_BETA1~268 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=f145de27c3e80975226f85c5413d2de33a0f5d35;p=postgresql Properly indent SGML in textsearch.sgml. --- diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml index ee2812acca..afa4415d93 100644 --- a/doc/src/sgml/textsearch.sgml +++ b/doc/src/sgml/textsearch.sgml @@ -1,148 +1,173 @@ -Full Text Search - - - -Introduction - - -Full Text Searching (or just text search) allows -identifying documents that satisfy a query, and -optionally sorting them by relevance to the query. The most common search -is to find all documents containing given query terms -and return them in order of their similarity to the -query. Notions of query and -similarity are very flexible and depend on the specific -application. The simplest search considers query as a -set of words and similarity as the frequency of query -words in the document. Full text indexing can be done inside the -database or outside. Doing indexing inside the database allows easy access -to document metadata to assist in indexing and display. - - - -Textual search operators have existed in databases for years. -PostgreSQL has -~,~*, LIKE, -ILIKE operators for textual datatypes, but they lack -many essential properties required by modern information systems: - - - - -There is no linguistic support, even for English. Regular expressions are -not sufficient because they cannot easily handle derived words, -e.g., satisfies and satisfy. You might -miss documents which contain satisfies, although you -probably would like to find them when searching for -satisfy. It is possible to use OR -to search any of them, but it is tedious and error-prone -(some words can have several thousand derivatives). - - - -They provide no ordering (ranking) of search results, which makes them -ineffective when thousands of matching documents are found. - - - -They tend to be slow because they process all documents for every search and -there is no index support. - - - - - - -Full text indexing allows documents to be preprocessed -and an index saved for later rapid searching. Preprocessing includes: - - - -Parsing documents into lexemes. It is -useful to identify various lexemes, e.g. digits, words, complex words, -email addresses, so they can be processed differently. In principle -lexemes depend on the specific application but for an ordinary search it -is useful to have a predefined list of lexemes. - - - -Dictionaries allow the conversion of lexemes into -a normalized form so it is not necessary to enter -search words in a specific form. - - - -Store preprocessed documents -optimized for searching. For example, represent each document as a sorted array -of lexemes. Along with lexemes it is desirable to store positional -information to use for proximity ranking, so that a -document which contains a more "dense" region of query words is assigned -a higher rank than one with scattered query words. - - - - - -Dictionaries allow fine-grained control over how lexemes are created. With -dictionaries you can: - - -Define "stop words" that should not be indexed. - - - -Map synonyms to a single word using ispell. - - -Map phrases to a single word using a thesaurus. - - -Map different variations of a word to a canonical form using -an ispell dictionary. - - -Map different variations of a word to a canonical form using -snowball stemmer rules. 
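For example, a stemming dictionary maps all of these derived forms onto one lexeme. A minimal sketch (assuming the english configuration; the exact lexeme spelling depends on the snowball rules in use, and the output is shown as expected rather than verified):

SELECT to_tsvector('english', 'satisfies satisfying satisfy');
   to_tsvector
------------------
 'satisfi':1,2,3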
- - - - - - -A data type (), tsvector -is provided, for storing preprocessed documents, -along with a type tsquery for representing textual -queries. Also, a full text search operator @@ is defined -for these data types (). Full text -searches can be accelerated using indexes (). - - - - -What Is a <firstterm>Document</firstterm>? - - -document - - - -A document can be a simple text file stored in the file system. The full -text indexing engine can parse text files and store associations of lexemes -(words) with their parent document. Later, these associations are used to -search for documents which contain query words. In this case, the database -can be used to store the full text index and for executing searches, and -some unique identifier can be used to retrieve the document from the file -system. - - - -A document can also be any textual database attribute or a combination -(concatenation), which in turn can be stored in various tables or obtained -dynamically. In other words, a document can be constructed from different -parts for indexing and it might not exist as a whole. For example: + + Full Text Search + + + + Introduction + + + Full Text Searching (or just text search) allows + identifying documents that satisfy a query, and + optionally sorting them by relevance to the query. The most common search + is to find all documents containing given query terms + and return them in order of their similarity to the + query. Notions of query and + similarity are very flexible and depend on the specific + application. The simplest search considers query as a + set of words and similarity as the frequency of query + words in the document. Full text indexing can be done inside the + database or outside. Doing indexing inside the database allows easy access + to document metadata to assist in indexing and display. + + + + Textual search operators have existed in databases for years. + PostgreSQL has + ~,~*, LIKE, + ILIKE operators for textual datatypes, but they lack + many essential properties required by modern information systems: + + + + + + There is no linguistic support, even for English. Regular expressions are + not sufficient because they cannot easily handle derived words, + e.g., satisfies and satisfy. You might + miss documents which contain satisfies, although you + probably would like to find them when searching for + satisfy. It is possible to use OR + to search any of them, but it is tedious and error-prone + (some words can have several thousand derivatives). + + + + + + They provide no ordering (ranking) of search results, which makes them + ineffective when thousands of matching documents are found. + + + + + + They tend to be slow because they process all documents for every search and + there is no index support. + + + + + + Full text indexing allows documents to be preprocessed + and an index saved for later rapid searching. Preprocessing includes: + + + + + + Parsing documents into lexemes. It is + useful to identify various lexemes, e.g. digits, words, complex words, + email addresses, so they can be processed differently. In principle + lexemes depend on the specific application but for an ordinary search it + is useful to have a predefined list of lexemes. + + + + + + Dictionaries allow the conversion of lexemes into + a normalized form so it is not necessary to enter + search words in a specific form. + + + + + + Store preprocessed documents + optimized for searching. For example, represent each document as a sorted array + of lexemes. 
Along with lexemes it is desirable to store positional + information to use for proximity ranking, so that a + document which contains a more "dense" region of query words is assigned + a higher rank than one with scattered query words. + + + + + + Dictionaries allow fine-grained control over how lexemes are created. With + dictionaries you can: + + + + + + Define "stop words" that should not be indexed. + + + + + + Map synonyms to a single word using ispell. + + + + + + Map phrases to a single word using a thesaurus. + + + + + + Map different variations of a word to a canonical form using + an ispell dictionary. + + + + + + Map different variations of a word to a canonical form using + snowball stemmer rules. + + + + + + A data type (), tsvector + is provided, for storing preprocessed documents, + along with a type tsquery for representing textual + queries. Also, a full text search operator @@ is defined + for these data types (). Full text + searches can be accelerated using indexes (). + + + + + What Is a <firstterm>Document</firstterm>? + + + document + + + + A document can be a simple text file stored in the file system. The full + text indexing engine can parse text files and store associations of lexemes + (words) with their parent document. Later, these associations are used to + search for documents which contain query words. In this case, the database + can be used to store the full text index and for executing searches, and + some unique identifier can be used to retrieve the document from the file + system. + + + + A document can also be any textual database attribute or a combination + (concatenation), which in turn can be stored in various tables or obtained + dynamically. In other words, a document can be constructed from different + parts for indexing and it might not exist as a whole. For example: + SELECT title || ' ' || author || ' ' || abstract || ' ' || body AS document FROM messages @@ -152,39 +177,38 @@ SELECT m.title || ' ' || m.author || ' ' || m.abstract || ' ' || d.body AS docum FROM messages m, docs d WHERE mid = did AND mid = 12; - - - - -Actually, in the previous example queries, COALESCE - -should be used to prevent a NULL attribute from causing -a NULL result. - - - + - -Data Types + + + Actually, in the previous example queries, COALESCE + + should be used to prevent a NULL attribute from causing + a NULL result. + + + - + + Data Types + - -tsvector - + + tsvector + + + tsvector + - -tsvector - + + tsvector is a data type that represents a document and is + optimized for full text searching. In the simplest case, + tsvector is a sorted list of lexemes, so even without indexes + full text searches perform better than standard ~ and + LIKE operations: - -tsvector is a data type that represents a document and is -optimized for full text searching. 
In the simplest case, -tsvector is a sorted list of lexemes, so even without indexes -full text searches perform better than standard ~ and -LIKE operations: SELECT 'a fat cat sat on a mat and ate a fat rat'::tsvector; tsvector @@ -192,7 +216,7 @@ SELECT 'a fat cat sat on a mat and ate a fat rat'::tsvector; 'a' 'on' 'and' 'ate' 'cat' 'fat' 'mat' 'rat' 'sat' -Notice, that space is also a lexeme: + Notice, that space is also a lexeme: SELECT 'space '' '' is a lexeme'::tsvector; @@ -201,59 +225,62 @@ SELECT 'space '' '' is a lexeme'::tsvector; 'a' 'is' ' ' 'space' 'lexeme' -Each lexeme, optionally, can have positional information which is used for -proximity ranking: + Each lexeme, optionally, can have positional information which is used for + proximity ranking: + SELECT 'a:1 fat:2 cat:3 sat:4 on:5 a:6 mat:7 and:8 ate:9 a:10 fat:11 rat:12'::tsvector; - tsvector + tsvector ------------------------------------------------------------------------------- 'a':1,6,10 'on':5 'and':8 'ate':9 'cat':3 'fat':2,11 'mat':7 'rat':12 'sat':4 -Each lexeme position also can be labeled as A, -B, C, D, -where D is the default. These labels can be used to group -lexemes into different importance or -rankings, for example to reflect document structure. -Actual values can be assigned at search time and used during the calculation -of the document rank. This is very useful for controlling search results. - - -The concatenation operator, e.g. tsvector || tsvector, -can "construct" a document from several parts. The order is important if -tsvector contains positional information. Of course, -it is also possible to build a document using different tables: + Each lexeme position also can be labeled as A, + B, C, D, + where D is the default. These labels can be used to group + lexemes into different importance or + rankings, for example to reflect document structure. + Actual values can be assigned at search time and used during the calculation + of the document rank. This is very useful for controlling search results. + + + + The concatenation operator, e.g. tsvector || tsvector, + can "construct" a document from several parts. The order is important if + tsvector contains positional information. Of course, + it is also possible to build a document using different tables: SELECT 'fat:1 cat:2'::tsvector || 'fat:1 rat:2'::tsvector; ?column? --------------------------- 'cat':2 'fat':1,3 'rat':4 + SELECT 'fat:1 rat:2'::tsvector || 'fat:1 cat:2'::tsvector; ?column? --------------------------- 'cat':4 'fat':1,3 'rat':2 - + - + - + - -tsquery - + + tsquery + - -tsquery - + + tsquery + - -tsquery is a data type for textual queries which supports -the boolean operators & (AND), | (OR), -and parentheses. A tsquery consists of lexemes -(optionally labeled by letters) with boolean operators in between: + + tsquery is a data type for textual queries which supports + the boolean operators & (AND), | (OR), + and parentheses. A tsquery consists of lexemes + (optionally labeled by letters) with boolean operators in between: SELECT 'fat & cat'::tsquery; @@ -261,17 +288,19 @@ SELECT 'fat & cat'::tsquery; --------------- 'fat' & 'cat' SELECT 'fat:ab & cat'::tsquery; - tsquery + tsquery ------------------ 'fat':AB & 'cat' -Labels can be used to restrict the search region, which allows the -development of different search engines using the same full text index. 
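As a rough illustration of such a restriction (a sketch with expected output; the @@ match operator is described below), a query term labeled A matches only lexemes that carry the A label:

SELECT 'fat:2A cat:3'::tsvector @@ 'fat:A'::tsquery;
 ?column?
----------
 t

SELECT 'fat:2A cat:3'::tsvector @@ 'cat:A'::tsquery;
 ?column?
----------
 f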
- - -tsqueries can be concatenated using && (AND) -and || (OR) operators: + Labels can be used to restrict the search region, which allows the + development of different search engines using the same full text index. + + + + tsqueries can be concatenated using && (AND) + and || (OR) operators: + SELECT 'a & b'::tsquery && 'c | d'::tsquery; ?column? @@ -283,515 +312,537 @@ SELECT 'a & b'::tsquery || 'c|d'::tsquery; --------------------------- 'a' & 'b' | ( 'c' | 'd' ) - - - - - + + + + - -Performing Searches + + + + Performing Searches + + + Full text searching in PostgreSQL is based on + the operator @@, which tests whether a tsvector + (document) matches a tsquery (query). Also, this operator + supports text input, allowing explicit conversion of a text + string to tsvector to be skipped. The variants available + are: - -Full text searching in PostgreSQL is based on -the operator @@, which tests whether a tsvector -(document) matches a tsquery (query). Also, this operator -supports text input, allowing explicit conversion of a text -string to tsvector to be skipped. The variants available -are: tsvector @@ tsquery tsquery @@ tsvector text @@ tsquery text @@ text - + + + + The match operator @@ returns true if + the tsvector matches the tsquery. It doesn't + matter which data type is written first: - -The match operator @@ returns true if -the tsvector matches the tsquery. It doesn't -matter which data type is written first: SELECT 'cat & rat'::tsquery @@ 'a fat cat sat on a mat and ate a fat rat'::tsvector; ?column? ---------- t + SELECT 'fat & cow'::tsquery @@ 'a fat cat sat on a mat and ate a fat rat'::tsvector; ?column? ---------- f - - - -The form text @@ tsquery -is equivalent to to_tsvector(x) @@ y. -The form text @@ text -is equivalent to to_tsvector(x) @@ plainto_tsquery(y). - - - -Configurations - - -configurations - - - -The above are all simple text search examples. As mentioned before, full -text search functionality includes the ability to do many more things: -skip indexing certain words (stop words), process synonyms, and use -sophisticated parsing, e.g. parse based on more than just white space. -This functionality is controlled by configurations. -Fortunately, PostgreSQL comes with predefined -configurations for many languages. (psql's \dF -shows all predefined configurations.) During installation an appropriate -configuration was selected and was set accordingly. If you -need to change it, see . - - - - - - -Tables and Indexes - - -The previous section described how to perform full text searches using -constant strings. This section shows how to search table data, optionally -using indexes. - - - -Searching a Table - - -It is possible to do full text table search with no index. A simple query -to find all title entries that contain the word -friend is: + + + + The form text @@ tsquery + is equivalent to to_tsvector(x) @@ y. + The form text @@ text + is equivalent to to_tsvector(x) @@ plainto_tsquery(y). + + + + Configurations + + + configurations + + + + The above are all simple text search examples. As mentioned before, full + text search functionality includes the ability to do many more things: + skip indexing certain words (stop words), process synonyms, and use + sophisticated parsing, e.g. parse based on more than just white space. + This functionality is controlled by configurations. + Fortunately, PostgreSQL comes with predefined + configurations for many languages. (psql's \dF + shows all predefined configurations.) 
During installation an appropriate + configuration was selected and was set accordingly. If you + need to change it, see . + + + + + + + Tables and Indexes + + + The previous section described how to perform full text searches using + constant strings. This section shows how to search table data, optionally + using indexes. + + + + Searching a Table + + + It is possible to do full text table search with no index. A simple query + to find all title entries that contain the word + friend is: + SELECT title FROM pgweb WHERE to_tsvector('english', body) @@ to_tsquery('friend') - + + + + The query above uses the english the configuration set by . A more complex query is to + select the ten most recent documents which contain create and + table in the title or body: - -A more complex query is to select the ten most recent documents which -contain create and table in the title -or body: SELECT title FROM pgweb WHERE to_tsvector('english', title || body) @@ to_tsquery('create & table') ORDER BY dlm DESC LIMIT 10; -dlm is the last-modified date so we -used ORDER BY dlm LIMIT 10 to get the ten most recent -matches. For clarity we omitted the coalesce function -which prevents the unwanted effect of NULL -concatenation. - - + dlm is the last-modified date so we + used ORDER BY dlm LIMIT 10 to get the ten most recent + matches. For clarity we omitted the coalesce function + which prevents the unwanted effect of NULL + concatenation. + - -Creating Indexes + + + + Creating Indexes + + + We can create a GIN () index to speed up the search: - -We can create a GIN () index to speed up the search: CREATE INDEX pgweb_idx ON pgweb USING gin(to_tsvector('english', body)); -Notice that the 2-argument version of to_tsvector is -used. Only text search functions which specify a configuration name can -be used in expression indexes (). -This is because the index contents must be unaffected by -. -If they were affected, the index -contents might be inconsistent because different entries could contain -tsvectors that were created with different text search -configurations, and there would be no way to guess which was which. -It would be impossible to dump and restore such an index correctly. - - - -Because the two-argument version of to_tsvector was -used in the index above, only a query reference that uses the 2-argument -version of to_tsvector with the same configuration -name will use that index, i.e. WHERE 'a & b' @@ -to_svector('english', body) will use the index, but WHERE -'a & b' @@ to_svector(body)) and WHERE 'a & b' @@ -body::tsvector will not. This guarantees that an index will be used -only with the same configuration used to create the index rows. - - - -It is possible to setup more complex expression indexes where the -configuration name is specified by another column, e.g.: + + Notice that the 2-argument version of to_tsvector is + used. Only text search functions which specify a configuration name can + be used in expression indexes (). + This is because the index contents must be unaffected by . If they were affected, the + index contents might be inconsistent because different entries could + contain tsvectors that were created with different text search + configurations, and there would be no way to guess which was which. It + would be impossible to dump and restore such an index correctly. 
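For instance, a query that spells out the same two-argument expression can make use of that index (a sketch reusing the pgweb table and body column from the examples above):

SELECT title
FROM pgweb
WHERE to_tsvector('english', body) @@ to_tsquery('english', 'create & table');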
+ + + + Because the two-argument version of to_tsvector was + used in the index above, only a query reference that uses the 2-argument + version of to_tsvector with the same configuration + name will use that index, i.e. WHERE 'a & b' @@ + to_svector('english', body) will use the index, but WHERE + 'a & b' @@ to_svector(body)) and WHERE 'a & b' @@ + body::tsvector will not. This guarantees that an index will be used + only with the same configuration used to create the index rows. + + + + It is possible to setup more complex expression indexes where the + configuration name is specified by another column, e.g.: + CREATE INDEX pgweb_idx ON pgweb USING gin(to_tsvector(config_name, body)); -where config_name is a column in the pgweb -table. This allows mixed configurations in the same index while -recording which configuration was used for each index row. - - -Indexes can even concatenate columns: + where config_name is a column in the pgweb + table. This allows mixed configurations in the same index while + recording which configuration was used for each index row. + + + + Indexes can even concatenate columns: + CREATE INDEX pgweb_idx ON pgweb USING gin(to_tsvector('english', title || body)); - + + + + A more complex case is to create a separate tsvector column + to hold the output of to_tsvector(). This example is a + concatenation of title and body, + with ranking information. We assign different labels to them to encode + information about the origin of each word: - -A more complex case is to create a separate tsvector column -to hold the output of to_tsvector(). This example is a -concatenation of title and body, -with ranking information. We assign different labels to them to encode -information about the origin of each word: ALTER TABLE pgweb ADD COLUMN textsearch_index tsvector; UPDATE pgweb SET textsearch_index = setweight(to_tsvector('english', coalesce(title,'')), 'A') || ' ' || setweight(to_tsvector('english', coalesce(body,'')),'D'); -Then we create a GIN index to speed up the search: + + Then we create a GIN index to speed up the search: + CREATE INDEX textsearch_idx ON pgweb USING gin(textsearch_index); -After vacuuming, we are ready to perform a fast full text search: + + After vacuuming, we are ready to perform a fast full text search: + SELECT ts_rank_cd(textsearch_index, q) AS rank, title FROM pgweb, to_tsquery('create & table') q WHERE q @@ textsearch_index ORDER BY rank DESC LIMIT 10; -It is necessary to create a trigger to keep the new tsvector -column current anytime title or body changes. -Keep in mind that, just like with expression indexes, it is important to -specify the configuration name when creating text search data types -inside triggers so the column's contents are not affected by changes to -default_text_search_config. - - + It is necessary to create a trigger to keep the new tsvector + column current anytime title or body changes. + Keep in mind that, just like with expression indexes, it is important to + specify the configuration name when creating text search data types + inside triggers so the column's contents are not affected by changes to + default_text_search_config. + + + - + - -Operators and Functions + + Operators and Functions - -This section outlines all the functions and operators that are available -for full text searching. - + + This section outlines all the functions and operators that are available + for full text searching. + - -Full text search vectors and queries both use lexemes, but for different -purposes. 
A tsvector represents the lexemes (tokens) parsed -out of a document, with an optional position. A tsquery -specifies a boolean condition using lexemes. - + + Full text search vectors and queries both use lexemes, but for different + purposes. A tsvector represents the lexemes (tokens) parsed + out of a document, with an optional position. A tsquery + specifies a boolean condition using lexemes. + - -All of the following functions that accept a configuration argument can -use a textual configuration name to select a configuration. If the option -is omitted the configuration specified by -default_text_search_config is used. For more information on -configuration, see . - + + All of the following functions that accept a configuration argument can + use a textual configuration name to select a configuration. If the option + is omitted the configuration specified by + default_text_search_config is used. For more information on + configuration, see . + - -Search + + Search -The operator @@ is used to perform full text -searches: + The operator @@ is used to perform full text + searches: + - + - + - -TSVECTOR @@ TSQUERY - + + TSVECTOR @@ TSQUERY + - - - -TSVECTOR @@ TSQUERY -TSQUERY @@ TSVECTOR - - + + + + TSVECTOR @@ TSQUERY + TSQUERY @@ TSVECTOR + + + + + + Returns true if TSQUERY is contained + in TSVECTOR, and false if not: - - -Returns true if TSQUERY is contained -in TSVECTOR, and false if not: SELECT 'a fat cat sat on a mat and ate a fat rat'::tsvector @@ 'cat & rat'::tsquery; ?column? - ---------- - t +---------- + t + SELECT 'a fat cat sat on a mat and ate a fat rat'::tsvector @@ 'fat & cow'::tsquery; ?column? - ---------- - f +---------- + f - + - - + + - + - -TEXT @@ TSQUERY - + + TEXT @@ TSQUERY + - - -text @@ tsquery - - + + + text @@ tsquery + + + + + + Returns true if TSQUERY is contained + in TEXT, and false if not: - - -Returns true if TSQUERY is contained -in TEXT, and false if not: SELECT 'a fat cat sat on a mat and ate a fat rat'::text @@ 'cat & rat'::tsquery; ?column? ---------- t + SELECT 'a fat cat sat on a mat and ate a fat rat'::text @@ 'cat & cow'::tsquery; ?column? ---------- f - + + + - - + - + + TEXT @@ TEXT + - -TEXT @@ TEXT - + + + + text @@ text + + - - - -text @@ text - - + + + Returns true if the right + argument (the query) is contained in the left argument, and + false otherwise: - - -Returns true if the right -argument (the query) is contained in the left argument, and -false otherwise: SELECT 'a fat cat sat on a mat and ate a fat rat' @@ 'cat rat'; ?column? ---------- t + SELECT 'a fat cat sat on a mat and ate a fat rat' @@ 'cat cow'; ?column? ---------- f - - - - - - - - - -For index support of full text operators consult . - - - + + + + - -tsvector - - - - - - -to_tsvector - - - - -to_tsvector(config_name, document TEXT) returns TSVECTOR - - - - - -Parses a document into tokens, reduces the tokens to lexemes, and returns a -tsvector which lists the lexemes together with their positions in the document -in lexicographic order. - - - - - - - - -strip - - - - -strip(vector TSVECTOR) returns TSVECTOR - - - - - -Returns a vector which lists the same lexemes as the given vector, but -which lacks any information about where in the document each lexeme -appeared. While the returned vector is useless for relevance ranking it -will usually be much smaller. 
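A minimal sketch of strip (expected output):

SELECT strip('fat:2,4 cat:3 rat:5A'::tsvector);
       strip
-------------------
 'cat' 'fat' 'rat'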
- - - - - - - - -setweight - - - - -setweight(vector TSVECTOR, letter) returns TSVECTOR - - - - - -This function returns a copy of the input vector in which every location -has been labeled with either the letter A, -B, or C, or the default label -D (which is the default for new vectors -and as such is usually not displayed). These labels are retained -when vectors are concatenated, allowing words from different parts of a -document to be weighted differently by ranking functions. - - - - - - - - - -tsvector concatenation - - - - -vector1 || vector2 -tsvector_concat(vector1 TSVECTOR, vector2 TSVECTOR) returns TSVECTOR - - - - - -Returns a vector which combines the lexemes and positional information of -the two vectors given as arguments. Positional weight labels (described -in the previous paragraph) are retained during the concatenation. This -has at least two uses. First, if some sections of your document need to be -parsed with different configurations than others, you can parse them -separately and then concatenate the resulting vectors. Second, you can -weigh words from one section of your document differently than the others -by parsing the sections into separate vectors and assigning each vector -a different position label with the setweight() -function. You can then concatenate them into a single vector and provide -a weights argument to the ts_rank() function that assigns -different weights to positions with different labels. - - - - - - - -length(tsvector) - - - - -length(vector TSVECTOR) returns INT4 - - - - - -Returns the number of lexemes stored in the vector. - - - - - - - -text::tsvector - - - - -text::TSVECTOR returns TSVECTOR - - - - - -Directly casting text to a tsvector allows you -to directly inject lexemes into a vector with whatever positions and -positional weights you choose to specify. The text should be formatted to -match the way a vector is displayed by SELECT. - - - - - - - -trigger -for updating a derived tsvector column - - - - -tsvector_update_trigger(tsvector_column_name, config_name, text_column_name , ... ) -tsvector_update_trigger_column(tsvector_column_name, config_column_name, text_column_name , ... ) - - - - - -Two built-in trigger functions are available to automatically update a -tsvector column from one or more textual columns. An example -of their use is: + + For index support of full text operators consult . + + + + + + + + tsvector + + + + + + + to_tsvector + + + + + to_tsvector(config_name, document TEXT) returns TSVECTOR + + + + + + Parses a document into tokens, reduces the tokens to lexemes, and returns a + tsvector which lists the lexemes together with their positions in the document + in lexicographic order. + + + + + + + + + strip + + + + + strip(vector TSVECTOR) returns TSVECTOR + + + + + + Returns a vector which lists the same lexemes as the given vector, but + which lacks any information about where in the document each lexeme + appeared. While the returned vector is useless for relevance ranking it + will usually be much smaller. + + + + + + + + + setweight + + + + + setweight(vector TSVECTOR, letter) returns TSVECTOR + + + + + + This function returns a copy of the input vector in which every location + has been labeled with either the letter A, + B, or C, or the default label + D (which is the default for new vectors + and as such is usually not displayed). These labels are retained + when vectors are concatenated, allowing words from different parts of a + document to be weighted differently by ranking functions. 
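For example, every position in the vector receives the given label (a sketch; expected output):

SELECT setweight('fat:2,4 cat:3'::tsvector, 'A');
       setweight
-----------------------
 'cat':3A 'fat':2A,4A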
+ + + + + + + + tsvector concatenation + + + + + vector1 || vector2 + tsvector_concat(vector1 TSVECTOR, vector2 TSVECTOR) returns TSVECTOR + + + + + + Returns a vector which combines the lexemes and positional information of + the two vectors given as arguments. Positional weight labels (described + in the previous paragraph) are retained during the concatenation. This + has at least two uses. First, if some sections of your document need to be + parsed with different configurations than others, you can parse them + separately and then concatenate the resulting vectors. Second, you can + weigh words from one section of your document differently than the others + by parsing the sections into separate vectors and assigning each vector + a different position label with the setweight() + function. You can then concatenate them into a single vector and provide + a weights argument to the ts_rank() function that assigns + different weights to positions with different labels. + + + + + + + + length(tsvector) + + + + + length(vector TSVECTOR) returns INT4 + + + + + + Returns the number of lexemes stored in the vector. + + + + + + + + text::tsvector + + + + + text::TSVECTOR returns TSVECTOR + + + + + + Directly casting text to a tsvector allows you + to directly inject lexemes into a vector with whatever positions and + positional weights you choose to specify. The text should be formatted to + match the way a vector is displayed by SELECT. + + + + + + + + + trigger + for updating a derived tsvector column + + + + + tsvector_update_trigger(tsvector_column_name, config_name, text_column_name , ... ) + tsvector_update_trigger_column(tsvector_column_name, config_column_name, text_column_name , ... ) + + + + + + Two built-in trigger functions are available to automatically update a + tsvector column from one or more textual columns. An example + of their use is: CREATE TABLE tblMessages ( @@ -804,50 +855,52 @@ ON tblMessages FOR EACH ROW EXECUTE PROCEDURE tsvector_update_trigger(tsv, 'pg_catalog.english', strMessage); -Having created this trigger, any change in strMessage -will be automatically reflected into tsv. - - - -Both triggers require you to specify the text search configuration to -be used to perform the conversion. For -tsvector_update_trigger, the configuration name is simply -given as the second trigger argument. It must be schema-qualified as -shown above, so that the trigger behavior will not change with changes -in search_path. For -tsvector_update_trigger_column, the second trigger argument -is the name of another table column, which must be of type -regconfig. This allows a per-row selection of configuration -to be made. - - - - - - - -ts_stat - - - - -ts_stat(sqlquery text , weights text ) returns SETOF statinfo - - - - - -Here statinfo is a type, defined as: + Having created this trigger, any change in strMessage + will be automatically reflected into tsv. + + + + Both triggers require you to specify the text search configuration to + be used to perform the conversion. For + tsvector_update_trigger, the configuration name is simply + given as the second trigger argument. It must be schema-qualified as + shown above, so that the trigger behavior will not change with changes + in search_path. For + tsvector_update_trigger_column, the second trigger argument + is the name of another table column, which must be of type + regconfig. This allows a per-row selection of configuration + to be made. 
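A sketch of the per-row form, assuming a hypothetical regconfig column named cfg is added to the tblMessages example above (trigger name also hypothetical):

ALTER TABLE tblMessages ADD COLUMN cfg regconfig;

CREATE TRIGGER tsvectorupdate_per_row BEFORE UPDATE OR INSERT
ON tblMessages FOR EACH ROW EXECUTE PROCEDURE
tsvector_update_trigger_column(tsv, cfg, strMessage);

Each row's cfg value then selects the configuration used to rebuild tsv for that row.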
+ + + + + + + + ts_stat + + + + + ts_stat(sqlquery text , weights text ) returns SETOF statinfo + + + + + + Here statinfo is a type, defined as: + CREATE TYPE statinfo AS (word text, ndoc integer, nentry integer); -and sqlquery is a text value containing a SQL query -which returns a single tsvector column. ts_stat -executes the query and returns statistics about the resulting -tsvector data, i.e., the number of documents, ndoc, -and the total number of words in the collection, nentry. It is -useful for checking your configuration and to find stop word candidates. For -example, to find the ten most frequent words: + + and sqlquery is a text value containing a SQL query + which returns a single tsvector column. ts_stat + executes the query and returns statistics about the resulting + tsvector data, i.e., the number of documents, ndoc, + and the total number of words in the collection, nentry. It is + useful for checking your configuration and to find stop word candidates. For + example, to find the ten most frequent words: SELECT * FROM ts_stat('SELECT vector from apod') @@ -855,8 +908,8 @@ ORDER BY ndoc DESC, nentry DESC, word LIMIT 10; -Optionally, one can specify weights to obtain -statistics about words with a specific weight: + Optionally, one can specify weights to obtain + statistics about words with a specific weight: SELECT * FROM ts_stat('SELECT vector FROM apod','a') @@ -864,387 +917,394 @@ ORDER BY ndoc DESC, nentry DESC, word LIMIT 10; - - - + + + + - - -Btree operations for tsvector - + + Btree operations for tsvector + - - -TSVECTOR < TSVECTOR -TSVECTOR <= TSVECTOR -TSVECTOR = TSVECTOR -TSVECTOR >= TSVECTOR -TSVECTOR > TSVECTOR - - + + + TSVECTOR < TSVECTOR + TSVECTOR <= TSVECTOR + TSVECTOR = TSVECTOR + TSVECTOR >= TSVECTOR + TSVECTOR > TSVECTOR + + - - -All btree operations are defined for the tsvector type. -tsvectors are compared with each other using -lexicographical ordering. - - - - + + + All btree operations are defined for the tsvector type. + tsvectors are compared with each other using + lexicographical ordering. + + + + - + + - + + tsquery - -tsquery + - + - + + to_tsquery + - -to_tsquery - + + + to_tsquery(config_name, querytext text) returns TSQUERY + + - - -to_tsquery(config_name, querytext text) returns TSQUERY - - + + + Accepts querytext, which should consist of single tokens + separated by the boolean operators & (and), | + (or) and ! (not), which can be grouped using parentheses. + In other words, to_tsquery expects already parsed text. + Each token is reduced to a lexeme using the specified or current configuration. + A weight class can be assigned to each lexeme entry to restrict the search region + (see setweight for an explanation). For example: - - -Accepts querytext, which should consist of single tokens -separated by the boolean operators & (and), | -(or) and ! (not), which can be grouped using parentheses. -In other words, to_tsquery expects already parsed text. -Each token is reduced to a lexeme using the specified or current configuration. -A weight class can be assigned to each lexeme entry to restrict the search region -(see setweight for an explanation). For example: 'fat:a & rats' -The to_tsquery function can also accept a text -string. In this case querytext should -be quoted. This may be useful, for example, to use with a thesaurus -dictionary. In the example below, a thesaurus contains rule supernovae -stars : sn: + + The to_tsquery function can also accept a text + string. In this case querytext should + be quoted. 
This may be useful, for example, to use with a thesaurus + dictionary. In the example below, a thesaurus contains rule supernovae + stars : sn: + SELECT to_tsquery('''supernovae stars'' & !crab'); - to_tsquery ----------------- + to_tsquery +--------------- 'sn' & !'crab' -Without quotes to_tsquery will generate a syntax error. - - - + Without quotes to_tsquery will generate a syntax error. + + + + - + - -plainto_tsquery - + + plainto_tsquery + - - -plainto_tsquery(config_name, querytext text) returns TSQUERY - - + + + plainto_tsquery(config_name, querytext text) returns TSQUERY + + - - -Transforms unformatted text querytext to tsquery. -It is the same as to_tsquery but accepts text -without quotes and will call the parser to break it into tokens. -plainto_tsquery assumes the & boolean -operator between words and does not recognize weight classes. - - - + + + Transforms unformatted text querytext to tsquery. + It is the same as to_tsquery but accepts text + without quotes and will call the parser to break it into tokens. + plainto_tsquery assumes the & boolean + operator between words and does not recognize weight classes. + + + - + - -querytree - + + querytree + - - -querytree(query TSQUERY) returns TEXT - - + + + querytree(query TSQUERY) returns TEXT + + + + + + This returns the query used for searching an index. It can be used to test + for an empty query. The SELECT below returns NULL, + which corresponds to an empty query since GIN indexes do not support queries with negation + + (a full index scan is inefficient): - - -This returns the query used for searching an index. It can be used to test -for an empty query. The SELECT below returns NULL, -which corresponds to an empty query since GIN indexes do not support queries with negation - -(a full index scan is inefficient): SELECT querytree(to_tsquery('!defined')); querytree ----------- - - - + + + + - + + text::tsquery casting + - -text::tsquery casting - + + + text::TSQUERY returns TSQUERY + + - - -text::TSQUERY returns TSQUERY - - + + + Directly casting text to a tsquery + allows you to directly inject lexemes into a query using whatever positions + and positional weight flags you choose to specify. The text should be + formatted to match the way a vector is displayed by + SELECT. + + + + - - -Directly casting text to a tsquery -allows you to directly inject lexemes into a query using whatever positions -and positional weight flags you choose to specify. The text should be -formatted to match the way a vector is displayed by -SELECT. - - - - + - + + numnode + - -numnode - + + + numnode(query TSQUERY) returns INTEGER + + - - -numnode(query TSQUERY) returns INTEGER - - + + + This returns the number of nodes in a query tree. This function can be + used to determine if query is meaningful + (returns > 0), or contains only stop words (returns 0): - - -This returns the number of nodes in a query tree. 
This function can be -used to determine if query is meaningful -(returns > 0), or contains only stop words (returns 0): SELECT numnode(plainto_tsquery('the any')); -NOTICE: query contains only stopword(s) or does not contain lexeme(s), -ignored +NOTICE: query contains only stopword(s) or does not contain lexeme(s), ignored numnode --------- 0 + SELECT numnode(plainto_tsquery('the table')); numnode --------- 1 + SELECT numnode(plainto_tsquery('long table')); numnode --------- 3 - - - - - - - -TSQUERY && TSQUERY - - - - -TSQUERY && TSQUERY returns TSQUERY - - - - - -Returns AND-ed TSQUERY - - - - - - - -TSQUERY || TSQUERY - - - - -TSQUERY || TSQUERY returns TSQUERY - - - - - -Returns OR-ed TSQUERY - - - - - - - -!! TSQUERY - - - - -!! TSQUERY returns TSQUERY - - - - - -negation of TSQUERY - - - - - - - -Btree operations for tsquery - - - - -TSQUERY < TSQUERY -TSQUERY <= TSQUERY -TSQUERY = TSQUERY -TSQUERY >= TSQUERY -TSQUERY > TSQUERY - - - - - -All btree operations are defined for the tsquery type. -tsqueries are compared to each other using lexicographical -ordering. - - - - - - - -Query Rewriting - - -Query rewriting is a set of functions and operators for the -tsquery data type. It allows control at search -query time without reindexing (the opposite of the -thesaurus). For example, you can expand the search using synonyms -(new york, big apple, nyc, -gotham) or narrow the search to direct the user to some hot -topic. - - - -The ts_rewrite() function changes the original query by -replacing part of the query with some other string of type tsquery, -as defined by the rewrite rule. Arguments to ts_rewrite() -can be names of columns of type tsquery. - + + + + + + + + TSQUERY && TSQUERY + + + + + TSQUERY && TSQUERY returns TSQUERY + + + + + + Returns AND-ed TSQUERY + + + + + + + + TSQUERY || TSQUERY + + + + + TSQUERY || TSQUERY returns TSQUERY + + + + + + Returns OR-ed TSQUERY + + + + + + + + !! TSQUERY + + + + + !! TSQUERY returns TSQUERY + + + + + + negation of TSQUERY + + + + + + + + Btree operations for tsquery + + + + + TSQUERY < TSQUERY + TSQUERY <= TSQUERY + TSQUERY = TSQUERY + TSQUERY >= TSQUERY + TSQUERY > TSQUERY + + + + + + All btree operations are defined for the tsquery type. + tsqueries are compared to each other using lexicographical + ordering. + + + + + + + + Query Rewriting + + + Query rewriting is a set of functions and operators for the + tsquery data type. It allows control at search + query time without reindexing (the opposite of the + thesaurus). For example, you can expand the search using synonyms + (new york, big apple, nyc, + gotham) or narrow the search to direct the user to some hot + topic. + + + + The ts_rewrite() function changes the original query by + replacing part of the query with some other string of type tsquery, + as defined by the rewrite rule. Arguments to ts_rewrite() + can be names of columns of type tsquery. 
+ CREATE TABLE aliases (t TSQUERY PRIMARY KEY, s TSQUERY); INSERT INTO aliases VALUES('a', 'c'); - - + - -ts_rewrite - + - - -ts_rewrite (query TSQUERY, target TSQUERY, sample TSQUERY) returns TSQUERY - - + + ts_rewrite + - - + + + ts_rewrite (query TSQUERY, target TSQUERY, sample TSQUERY) returns TSQUERY + + + + + SELECT ts_rewrite('a & b'::tsquery, 'a'::tsquery, 'c'::tsquery); - ts_rewrite - ----------- - 'b' & 'c' + ts_rewrite +------------ + 'b' & 'c' - - - + + + - + - - -ts_rewrite(ARRAY[query TSQUERY, target TSQUERY, sample TSQUERY]) returns TSQUERY - - + + + ts_rewrite(ARRAY[query TSQUERY, target TSQUERY, sample TSQUERY]) returns TSQUERY + + - - + + SELECT ts_rewrite(ARRAY['a & b'::tsquery, t,s]) FROM aliases; - ts_rewrite - ----------- - 'b' & 'c' + ts_rewrite +------------ + 'b' & 'c' - - - + + + - + - - -ts_rewrite (query TSQUERY,'SELECT target ,sample FROM test'::text) returns TSQUERY - - + + + ts_rewrite (query TSQUERY,'SELECT target ,sample FROM test'::text) returns TSQUERY + + - - + + SELECT ts_rewrite('a & b'::tsquery, 'SELECT t,s FROM aliases'); - ts_rewrite - ----------- - 'b' & 'c' + ts_rewrite +------------ + 'b' & 'c' - - - - + + + - -What if there are several instances of rewriting? For example, query -'a & b' can be rewritten as -'b & c' and 'cc'. + + + + What if there are several instances of rewriting? For example, query + 'a & b' can be rewritten as + 'b & c' and 'cc'. SELECT * FROM aliases; @@ -1254,191 +1314,203 @@ SELECT * FROM aliases; 'x' | 'z' 'a' & 'b' | 'cc' -This ambiguity can be resolved by specifying a sort order: + + This ambiguity can be resolved by specifying a sort order: + SELECT ts_rewrite('a & b', 'SELECT t, s FROM aliases ORDER BY t DESC'); ts_rewrite ---------- + --------- 'cc' + SELECT ts_rewrite('a & b', 'SELECT t, s FROM aliases ORDER BY t ASC'); ts_rewrite ------------ +-------------- 'b' & 'c' - + + + + Let's consider a real-life astronomical example. We'll expand query + supernovae using table-driven rewriting rules: - -Let's consider a real-life astronomical example. We'll expand query -supernovae using table-driven rewriting rules: CREATE TABLE aliases (t tsquery primary key, s tsquery); INSERT INTO aliases VALUES(to_tsquery('supernovae'), to_tsquery('supernovae|sn')); + SELECT ts_rewrite(to_tsquery('supernovae'), 'SELECT * FROM aliases') && to_tsquery('crab'); - ?column? ---------------------------------- - ( 'supernova' | 'sn' ) & 'crab' + ?column? +------------------------------- +( 'supernova' | 'sn' ) & 'crab' -Notice, that we can change the rewriting rule online: + + Notice, that we can change the rewriting rule online: + UPDATE aliases SET s=to_tsquery('supernovae|sn & !nebulae') WHERE t=to_tsquery('supernovae'); SELECT ts_rewrite(to_tsquery('supernovae'), 'SELECT * FROM aliases') && to_tsquery('crab'); - ?column? ---------------------------------------------- - ( 'supernova' | 'sn' & !'nebula' ) & 'crab' + ?column? +----------------------------------------------- + 'supernova' | 'sn' & !'nebula' ) & 'crab' - - + + - -Operators For tsquery + + Operators For tsquery + + + Rewriting can be slow for many rewriting rules since it checks every rule + for a possible hit. To filter out obvious non-candidate rules there are containment + operators for the tsquery type. In the example below, we select only those + rules which might contain the original query: - -Rewriting can be slow for many rewriting rules since it checks every rule -for a possible hit. 
To filter out obvious non-candidate rules there are containment -operators for the tsquery type. In the example below, we select only those -rules which might contain the original query: SELECT ts_rewrite(ARRAY['a & b'::tsquery, t,s]) FROM aliases WHERE 'a & b' @> t; - ts_rewrite ------------ + ts_rewrite +------------ 'b' & 'c' - + + + + Two operators are defined for tsquery: + - -Two operators are defined for tsquery: - + + + - - + + TSQUERY @> TSQUERY + - -TSQUERY @> TSQUERY - + + + TSQUERY @> TSQUERY + + - - -TSQUERY @> TSQUERY - - + + + Returns true if the right argument might be contained in left argument. + + + - - -Returns true if the right argument might be contained in left argument. - - - + - + + tsquery <@ tsquery + - -tsquery <@ tsquery - + + + TSQUERY <@ TSQUERY + + - - -TSQUERY <@ TSQUERY - - + + + Returns true if the left argument might be contained in right argument. + + + - - -Returns true if the left argument might be contained in right argument. - - - - + - + - -Index For tsquery + + Index For tsquery - -To speed up operators <@ and @> for -tsquery one can use a GiST index with -a tsquery_ops opclass: + + To speed up operators <@ and @> for + tsquery one can use a GiST index with + a tsquery_ops opclass: CREATE INDEX t_idx ON aliases USING gist (t tsquery_ops); - + - + - + - + - -Additional Controls + + Additional Controls - -To implement full text searching there must be a function to create a -tsvector from a document and a tsquery from a -user query. Also, we need to return results in some order, i.e., we need -a function which compares documents with respect to their relevance to -the tsquery. Full text searching in -PostgreSQL provides support for all of these -functions. - + + To implement full text searching there must be a function to create a + tsvector from a document and a tsquery from a + user query. Also, we need to return results in some order, i.e., we need + a function which compares documents with respect to their relevance to + the tsquery. Full text searching in + PostgreSQL provides support for all of these + functions. + - -Parsing + + Parsing + + + Full text searching in PostgreSQL provides + function to_tsvector, which converts a document to + the tsvector data type. More details are available in , but for now consider a simple example: - -Full text searching in PostgreSQL provides -function to_tsvector, which converts a document to -the tsvector data type. More details are available in , but for now consider a simple example: SELECT to_tsvector('english', 'a fat cat sat on a mat - it ate a fat rats'); - to_tsvector + to_tsvector ----------------------------------------------------- 'ate':9 'cat':3 'fat':2,11 'mat':7 'rat':12 'sat':4 - - - -In the example above we see that the resulting tsvector does not -contain the words a, on, or -it, the word rats became -rat, and the punctuation sign - was -ignored. - - - -The to_tsvector function internally calls a parser -which breaks the document (a fat cat sat on a mat - it ate a -fat rats) into words and corresponding types. The default parser -recognizes 23 types. Each word, depending on its type, passes through a -group of dictionaries (). At the -end of this step we obtain lexemes. For example, -rats became rat because one of the -dictionaries recognized that the word rats is a plural -form of rat. Some words are treated as "stop words" -() and ignored since they occur too -frequently and have little informational value. In our example these are -a, on, and it. 
-The punctuation sign - was also ignored because its -type (Space symbols) is not indexed. All information -about the parser, dictionaries and what types of lexemes to index is -documented in the full text configuration section (). It is possible to have -several different configurations in the same database, and many predefined -system configurations are available for different languages. In our example -we used the default configuration english for the -English language. - - - -As another example, below is the output from the ts_debug -function ( ), which shows all details -of the full text machinery: + + + + In the example above we see that the resulting tsvector does not + contain the words a, on, or + it, the word rats became + rat, and the punctuation sign - was + ignored. + + + + The to_tsvector function internally calls a parser + which breaks the document (a fat cat sat on a mat - it ate a + fat rats) into words and corresponding types. The default parser + recognizes 23 types. Each word, depending on its type, passes through a + group of dictionaries (). At the + end of this step we obtain lexemes. For example, + rats became rat because one of the + dictionaries recognized that the word rats is a plural + form of rat. Some words are treated as "stop words" + () and ignored since they occur too + frequently and have little informational value. In our example these are + a, on, and it. + The punctuation sign - was also ignored because its + type (Space symbols) is not indexed. All information + about the parser, dictionaries and what types of lexemes to index is + documented in the full text configuration section (). It is possible to have + several different configurations in the same database, and many predefined + system configurations are available for different languages. In our example + we used the default configuration english for the + English language. + + + + As another example, below is the output from the ts_debug + function ( ), which shows all details + of the full text machinery: + SELECT * FROM ts_debug('english','a fat cat sat on a mat - it ate a fat rats'); Alias | Description | Token | Dictionaries | Lexized token @@ -1467,25 +1539,26 @@ SELECT * FROM ts_debug('english','a fat cat sat on a mat - it ate a fat rats'); lword | Latin word | fat | {english} | english: {fat} blank | Space symbols | | | lword | Latin word | rats | {english} | english: {rat} -(24 rows) - - - - -Function setweight() is used to label -tsvector. The typical usage of this is to mark out the -different parts of a document, perhaps by importance. Later, this can be -used for ranking of search results in addition to positional information -(distance between query terms). If no ranking is required, positional -information can be removed from tsvector using the -strip() function to save space. - - - -Because to_tsvector(NULL) can -return NULL, it is recommended to use -coalesce. Here is the safe method for creating a -tsvector from a structured document: + (24 rows) + + + + + Function setweight() is used to label + tsvector. The typical usage of this is to mark out the + different parts of a document, perhaps by importance. Later, this can be + used for ranking of search results in addition to positional information + (distance between query terms). If no ranking is required, positional + information can be removed from tsvector using the + strip() function to save space. + + + + Because to_tsvector(NULL) can + return NULL, it is recommended to use + coalesce. 
Here is the safe method for creating a + tsvector from a structured document: + UPDATE tt SET ti= setweight(to_tsvector(coalesce(title,'')), 'A') || ' ' || @@ -1493,31 +1566,32 @@ UPDATE tt SET ti= setweight(to_tsvector(coalesce(abstract,'')), 'C') || ' ' || setweight(to_tsvector(coalesce(body,'')), 'D'); - + + + + The following functions allow manual parsing control: - -The following functions allow manual parsing control: + - + - + + parse + - -parse - + + + ts_parse(parser, document TEXT) returns SETOF tokenout + + - - -ts_parse(parser, document TEXT) returns SETOF tokenout - - + + + Parses the given document and returns a series + of records, one for each token produced by parsing. Each record includes + a tokid giving its type and a token + which gives its content: - - -Parses the given document and returns a series -of records, one for each token produced by parsing. Each record includes -a tokid giving its type and a token -which gives its content: SELECT * FROM ts_parse('default','123 - a number'); tokid | token @@ -1529,29 +1603,30 @@ SELECT * FROM ts_parse('default','123 - a number'); 12 | 1 | number - - - - - - -ts_token_type - - - - -ts_token_type(parser ) returns SETOF tokentype - - - - - -Returns a table which describes each kind of token the -parser might produce as output. For each token -type the table gives the tokid which the -parser uses to label each -token of that type, the alias which -names the token type, and a short description: + + + + + + + ts_token_type + + + + + ts_token_type(parser ) returns SETOF tokentype + + + + + + Returns a table which describes each kind of token the + parser might produce as output. For each token + type the table gives the tokid which the + parser uses to label each + token of that type, the alias which + names the token type, and a short description: + SELECT * FROM ts_token_type('default'); tokid | alias | description @@ -1581,146 +1656,163 @@ SELECT * FROM ts_token_type('default'); 23 | entity | HTML Entity - - - + + + - - + + - + - -Ranking Search Results + + Ranking Search Results - -Ranking attempts to measure how relevant documents are to a particular -query by inspecting the number of times each search word appears in the -document, and whether different search terms occur near each other. Full -text searching provides two predefined ranking functions which attempt to -produce a measure of how a document is relevant to the query. In spite -of that, the concept of relevancy is vague and very application-specific. -These functions try to take into account lexical, proximity, and structural -information. Different applications might require additional information -for ranking, e.g. document modification time. - + + Ranking attempts to measure how relevant documents are to a particular + query by inspecting the number of times each search word appears in the + document, and whether different search terms occur near each other. Full + text searching provides two predefined ranking functions which attempt to + produce a measure of how a document is relevant to the query. In spite + of that, the concept of relevancy is vague and very application-specific. + These functions try to take into account lexical, proximity, and structural + information. Different applications might require additional information + for ranking, e.g. document modification time. + - -The lexical part of ranking reflects how often the query terms appear in -the document, how close the document query terms are, and in what part of -the document they occur. 
Note that ranking functions that use positional -information will only work on unstripped tsvectors because stripped -tsvectors lack positional information. - + + The lexical part of ranking reflects how often the query terms appear in + the document, how close the document query terms are, and in what part of + the document they occur. Note that ranking functions that use positional + information will only work on unstripped tsvectors because stripped + tsvectors lack positional information. + - -The two ranking functions currently available are: + + The two ranking functions currently available are: - + - + - -ts_rank - + + ts_rank + - - -ts_rank( weights float4[], vector TSVECTOR, query TSQUERY, normalization int4 ) returns float4 - - + + + ts_rank( weights float4[], vector TSVECTOR, query TSQUERY, normalization int4 ) returns float4 + + + + + + This ranking function offers the ability to weigh word instances more + heavily depending on how you have classified them. The weights specify + how heavily to weigh each category of word: - - -This ranking function offers the ability to weigh word instances more -heavily depending on how you have classified them. The weights specify -how heavily to weigh each category of word: {D-weight, C-weight, B-weight, A-weight} - -If no weights are provided, -then these defaults are used: + + + If no weights are provided, + then these defaults are used: + {0.1, 0.2, 0.4, 1.0} -Often weights are used to mark words from special areas of the document, -like the title or an initial abstract, and make them more or less important -than words in the document body. - - - - - - - -ts_rank_cd - - - - -ts_rank_cd( weights float4[], vector TSVECTOR, query TSQUERY, normalization int4 ) returns float4 - - - - - -This function computes the cover density ranking for -the given document vector and query, as described in Clarke, Cormack, and -Tudhope's "Relevance Ranking for One to Three Term Queries" in the -"Information Processing and Management", 1999. - - - - - - - - - -Since a longer document has a greater chance of containing a query term -it is reasonable to take into account document size, i.e. a hundred-word -document with five instances of a search word is probably more relevant -than a thousand-word document with five instances. Both ranking functions -take an integer normalization option that -specifies whether a document's length should impact its rank. The integer -option controls several behaviors which is done using bit-wise fields and -| (for example, 2|4): - - - -0 (the default) ignores the document length - - -1 divides the rank by 1 + the logarithm of the document length - - -2 divides the rank by the length itself - - - -4 divides the rank by the mean harmonic distance between extents - - -8 divides the rank by the number of unique words in document - - -16 divides the rank by 1 + logarithm of the number of unique words in document - - - - - - -It is important to note that ranking functions do not use any global -information so it is impossible to produce a fair normalization to 1% or -100%, as sometimes required. However, a simple technique like -rank/(rank+1) can be applied. Of course, this is just -a cosmetic change, i.e., the ordering of the search results will not change. 
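For example, several flags can be combined to penalize long documents (a sketch that assumes the apod table and its textsearch column used in the examples below):

-- divide the rank by the document length (2) and by the mean
-- harmonic distance between extents (4)
SELECT title, ts_rank_cd('{0.1, 0.2, 0.4, 1.0}', textsearch, query, 2|4) AS rnk
FROM apod, to_tsquery('neutrino') query
WHERE query @@ textsearch
ORDER BY rnk DESC LIMIT 5;
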
- - - -Several examples are shown below; note that the second example uses -normalized ranking: + + Often weights are used to mark words from special areas of the document, + like the title or an initial abstract, and make them more or less important + than words in the document body. + + + + + + + + ts_rank_cd + + + + + ts_rank_cd( weights float4[], vector TSVECTOR, query TSQUERY, normalization int4 ) returns float4 + + + + + + This function computes the cover density ranking for + the given document vector and query, as described in Clarke, Cormack, and + Tudhope's "Relevance Ranking for One to Three Term Queries" in the + "Information Processing and Management", 1999. + + + + + + + + + + Since a longer document has a greater chance of containing a query term + it is reasonable to take into account document size, i.e. a hundred-word + document with five instances of a search word is probably more relevant + than a thousand-word document with five instances. Both ranking functions + take an integer normalization option that + specifies whether a document's length should impact its rank. The integer + option controls several behaviors which is done using bit-wise fields and + | (for example, 2|4): + + + + + 0 (the default) ignores the document length + + + + + 1 divides the rank by 1 + the logarithm of the document length + + + + + 2 divides the rank by the length itself + + + + + + 4 divides the rank by the mean harmonic distance between extents + + + + + 8 divides the rank by the number of unique words in document + + + + + 16 divides the rank by 1 + logarithm of the number of unique words in document + + + + + + + + It is important to note that ranking functions do not use any global + information so it is impossible to produce a fair normalization to 1% or + 100%, as sometimes required. However, a simple technique like + rank/(rank+1) can be applied. Of course, this is just + a cosmetic change, i.e., the ordering of the search results will not change. + + + + Several examples are shown below; note that the second example uses + normalized ranking: + SELECT title, ts_rank_cd('{0.1, 0.2, 0.4, 1.0}',textsearch, query) AS rnk FROM apod, to_tsquery('neutrino|(dark & matter)') query @@ -1757,252 +1849,283 @@ ORDER BY rnk DESC LIMIT 10; Ice Fishing for Cosmic Neutrinos | 0.615384618911517 Weak Lensing Distorts the Universe | 0.450010798361481 - - - -The first argument in ts_rank_cd ('{0.1, 0.2, -0.4, 1.0}') is an optional parameter which specifies the -weights for labels D, C, -B, and A used in function -setweight. These default values show that lexemes -labeled as A are ten times more important than ones -that are labeled with D. - - - -Ranking can be expensive since it requires consulting the -tsvector of all documents, which can be I/O bound and -therefore slow. Unfortunately, it is almost impossible to avoid since full -text searching in a database should work without indexes . Moreover an index can be lossy (a GiST -index, for example) so it must check documents to avoid false hits. - - - -Note that the ranking functions above are only examples. You can write -your own ranking functions and/or combine additional factors to fit your -specific needs. - - - - - - -Highlighting Results - - -headline - - - -To present search results it is ideal to show a part of each document and -how it is related to the query. Usually, search engines show fragments of -the document with marked search terms. PostgreSQL full -text searching provides the function headline that -implements such functionality. 
- - - - - - - - -ts_headline( config_name text, document text, query TSQUERY, options text ) returns text - - - - - -The ts_headline function accepts a document along with -a query, and returns one or more ellipsis-separated excerpts from the -document in which terms from the query are highlighted. The configuration -used to parse the document can be specified by its -config_name; if none is specified, the current -configuration is used. - - - - - - - - -If an options string is specified it should -consist of a comma-separated list of one or more 'option=value' pairs. -The available options are: - - - -StartSel, StopSel: the strings with which -query words appearing in the document should be delimited to distinguish -them from other excerpted words. - - -MaxWords, MinWords: limit the shortest and -longest headlines to output - - -ShortWord: this prevents your headline from beginning -or ending with a word which has this many characters or less. The default -value of three eliminates the English articles. - - -HighlightAll: boolean flag; if -true the whole document will be highlighted - - - -Any unspecified options receive these defaults: + + + + The first argument in ts_rank_cd ('{0.1, 0.2, + 0.4, 1.0}') is an optional parameter which specifies the + weights for labels D, C, + B, and A used in function + setweight. These default values show that lexemes + labeled as A are ten times more important than ones + that are labeled with D. + + + + Ranking can be expensive since it requires consulting the + tsvector of all documents, which can be I/O bound and + therefore slow. Unfortunately, it is almost impossible to avoid since full + text searching in a database should work without indexes . Moreover an index can be lossy (a GiST + index, for example) so it must check documents to avoid false hits. + + + + Note that the ranking functions above are only examples. You can write + your own ranking functions and/or combine additional factors to fit your + specific needs. + + + + + + + Highlighting Results + + + headline + + + + To present search results it is ideal to show a part of each document and + how it is related to the query. Usually, search engines show fragments of + the document with marked search terms. PostgreSQL full + text searching provides the function headline that + implements such functionality. + + + + + + + + + ts_headline( config_name text, document text, query TSQUERY, options text ) returns text + + + + + + The ts_headline function accepts a document along with + a query, and returns one or more ellipsis-separated excerpts from the + document in which terms from the query are highlighted. The configuration + used to parse the document can be specified by its + config_name; if none is specified, the current + configuration is used. + + + + + + + + + If an options string is specified it should + consist of a comma-separated list of one or more 'option=value' pairs. + The available options are: + + + + + StartSel, StopSel: the strings with which + query words appearing in the document should be delimited to distinguish + them from other excerpted words. + + + + + MaxWords, MinWords: limit the shortest and + longest headlines to output + + + + + ShortWord: this prevents your headline from beginning + or ending with a word which has this many characters or less. The default + value of three eliminates the English articles. 
+ + + + + HighlightAll: boolean flag; if + true the whole document will be highlighted + + + + + Any unspecified options receive these defaults: + StartSel=<b>, StopSel=</b>, MaxWords=35, MinWords=15, ShortWord=3, HighlightAll=FALSE - + - -For example: + + For example: SELECT ts_headline('a b c', 'c'::tsquery); headline -------------- a b <b>c</b> + SELECT ts_headline('a b c', 'c'::tsquery, 'StartSel=<,StopSel=>'); ts_headline ------------- a b <c> - + - -headline uses the original document, not -tsvector, so it can be slow and should be used with care. -A typical mistake is to call headline() for -every matching document when only ten documents are -shown. SQL subselects can help here; below is an -example: + + headline uses the original document, not + tsvector, so it can be slow and should be used with care. + A typical mistake is to call headline() for + every matching document when only ten documents are + shown. SQL subselects can help here; below is an + example: SELECT id,ts_headline(body,q), rank FROM (SELECT id,body,q, ts_rank_cd (ti,q) AS rank FROM apod, to_tsquery('stars') q - WHERE ti @@ q - ORDER BY rank DESC LIMIT 10) AS foo; - - - - -Note that the cascade dropping of the parser function -causes dropping of the ts_headline used in the full text search -configuration config_name. - - - - - - - -Dictionaries - - -Dictionaries are used to eliminate words that should not be considered in a -search (stop words), and to normalize words so -that different derived forms of the same word will match. Aside from -improving search quality, normalization and removal of stop words reduce the -size of the tsvector representation of a document, thereby -improving performance. Normalization does not always have linguistic meaning -and usually depends on application semantics. - - - -Some examples of normalization: - - - - - Linguistic - ispell dictionaries try to reduce input words to a -normalized form; stemmer dictionaries remove word endings - - - Identical URL locations are identified and canonicalized: - - - -http://www.pgsql.ru/db/mw/index.html - - -http://www.pgsql.ru/db/mw/ - - -http://www.pgsql.ru/db/../db/mw/index.html - - - - - -Colour names are substituted by their hexadecimal values, e.g., -red, green, blue, magenta -> FF0000, 00FF00, 0000FF, FF00FF - - -Remove some numeric fractional digits to reduce the range of possible -numbers, so 3.14159265359, -3.1415926, 3.14 will be the same -after normalization if only two digits are kept after the decimal point. - - - - - - -A dictionary is a program which accepts lexemes as -input and returns: - - -an array of lexemes if the input lexeme is known to the dictionary - - -a void array if the dictionary knows the lexeme, but it is a stop word - - -NULL if the dictionary does not recognize the input lexeme - - - - - -Full text searching provides predefined dictionaries for many languages, -and SQL commands to manipulate them. There are also -several predefined template dictionaries that can be used to create new -dictionaries by overriding their default parameters. Besides this, it is -possible to develop custom dictionaries using an API; -see the dictionary for integers () as an example. - - - -The ALTER TEXT SEARCH CONFIGURATION ADD -MAPPING command binds specific types of lexemes and a set of -dictionaries to process them. (Mappings can also be specified as part of -configuration creation.) Lexemes are processed by a stack of dictionaries -until some dictionary identifies it as a known word or it turns out to be -a stop word. 
If no dictionary recognizes a lexeme, it will be discarded -and not indexed. A general rule for configuring a stack of dictionaries -is to place first the most narrow, most specific dictionary, then the more -general dictionaries and finish it with a very general dictionary, like -the snowball stemmer or simple, which -recognizes everything. For example, for an astronomy-specific search -(astro_en configuration) one could bind -lword (latin word) with a synonym dictionary of astronomical -terms, a general English dictionary and a snowball English -stemmer: +WHERE ti @@ q +ORDER BY rank DESC LIMIT 10) AS foo; + + + + + Note that the cascade dropping of the parser function + causes dropping of the ts_headline used in the full text search + configuration config_name. + + + + + + + + Dictionaries + + + Dictionaries are used to eliminate words that should not be considered in a + search (stop words), and to normalize words so + that different derived forms of the same word will match. Aside from + improving search quality, normalization and removal of stop words reduce the + size of the tsvector representation of a document, thereby + improving performance. Normalization does not always have linguistic meaning + and usually depends on application semantics. + + + + Some examples of normalization: + + + + + + Linguistic - ispell dictionaries try to reduce input words to a + normalized form; stemmer dictionaries remove word endings + + + + + Identical URL locations are identified and canonicalized: + + + + + http://www.pgsql.ru/db/mw/index.html + + + + + http://www.pgsql.ru/db/mw/ + + + + + http://www.pgsql.ru/db/../db/mw/index.html + + + + + + + + Colour names are substituted by their hexadecimal values, e.g., + red, green, blue, magenta -> FF0000, 00FF00, 0000FF, FF00FF + + + + + Remove some numeric fractional digits to reduce the range of possible + numbers, so 3.14159265359, + 3.1415926, 3.14 will be the same + after normalization if only two digits are kept after the decimal point. + + + + + + + + A dictionary is a program which accepts lexemes as + input and returns: + + + + an array of lexemes if the input lexeme is known to the dictionary + + + + + a void array if the dictionary knows the lexeme, but it is a stop word + + + + + NULL if the dictionary does not recognize the input lexeme + + + + + + + Full text searching provides predefined dictionaries for many languages, + and SQL commands to manipulate them. There are also + several predefined template dictionaries that can be used to create new + dictionaries by overriding their default parameters. Besides this, it is + possible to develop custom dictionaries using an API; + see the dictionary for integers () as an example. + + + + The ALTER TEXT SEARCH CONFIGURATION ADD + MAPPING command binds specific types of lexemes and a set of + dictionaries to process them. (Mappings can also be specified as part of + configuration creation.) Lexemes are processed by a stack of dictionaries + until some dictionary identifies it as a known word or it turns out to be + a stop word. If no dictionary recognizes a lexeme, it will be discarded + and not indexed. A general rule for configuring a stack of dictionaries + is to place first the most narrow, most specific dictionary, then the more + general dictionaries and finish it with a very general dictionary, like + the snowball stemmer or simple, which + recognizes everything. 
For example, for an astronomy-specific search + (astro_en configuration) one could bind + lword (latin word) with a synonym dictionary of astronomical + terms, a general English dictionary and a snowball English + stemmer: + ALTER TEXT SEARCH CONFIGURATION astro_en ADD MAPPING FOR lword WITH astrosyn, english_ispell, english_stem; - + + + + Function ts_lexize can be used to test dictionaries, + for example: - -Function ts_lexize can be used to test dictionaries, -for example: SELECT ts_lexize('english_stem', 'stars'); ts_lexize @@ -2010,27 +2133,32 @@ SELECT ts_lexize('english_stem', 'stars'); {star} (1 row) -Also, the ts_debug function () -can be used for this. - - -Stop Words - -Stop words are words which are very common, appear in almost -every document, and have no discrimination value. Therefore, they can be ignored -in the context of full text searching. For example, every English text contains -words like a although it is useless to store them in an index. -However, stop words do affect the positions in tsvector, -which in turn, do affect ranking: + Also, the ts_debug function () + can be used for this. + + + + Stop Words + + + Stop words are words which are very common, appear in almost + every document, and have no discrimination value. Therefore, they can be ignored + in the context of full text searching. For example, every English text contains + words like a although it is useless to store them in an index. + However, stop words do affect the positions in tsvector, + which in turn, do affect ranking: + SELECT to_tsvector('english','in the list of stop words'); to_tsvector ---------------------------- 'list':3 'stop':5 'word':6 -The gaps between positions 1-3 and 3-5 are because of stop words, so ranks -calculated for documents with and without stop words are quite different: + + The gaps between positions 1-3 and 3-5 are because of stop words, so ranks + calculated for documents with and without stop words are quite different: + SELECT ts_rank_cd ('{1,1,1,1}', to_tsvector('english','in the list of stop words'), to_tsquery('list & stop')); ts_rank_cd @@ -2043,63 +2171,68 @@ SELECT ts_rank_cd ('{1,1,1,1}', to_tsvector('english','list stop words'), to_tsq 1 - + + + + It is up to the specific dictionary how it treats stop words. For example, + ispell dictionaries first normalize words and then + look at the list of stop words, while stemmers + first check the list of stop words. The reason for the different + behaviour is an attempt to decrease possible noise. + - -It is up to the specific dictionary how it treats stop words. For example, -ispell dictionaries first normalize words and then -look at the list of stop words, while stemmers -first check the list of stop words. The reason for the different -behaviour is an attempt to decrease possible noise. - + + Here is an example of a dictionary that returns the input word as lowercase + or NULL if it is a stop word; it also specifies the name + of a file of stop words. It uses the simple dictionary as + a template: - -Here is an example of a dictionary that returns the input word as lowercase -or NULL if it is a stop word; it also specifies the name -of a file of stop words. 
It uses the simple dictionary as -a template: CREATE TEXT SEARCH DICTIONARY public.simple_dict ( TEMPLATE = pg_catalog.simple, STOPWORDS = english ); -Now we can test our dictionary: + + Now we can test our dictionary: + SELECT ts_lexize('public.simple_dict','YeS'); ts_lexize ----------- {yes} + SELECT ts_lexize('public.simple_dict','The'); ts_lexize ----------- {} - + - - -Most types of dictionaries rely on configuration files, such as files of stop -words. These files must be stored in UTF-8 encoding. They will -be translated to the actual database encoding, if that is different, when they -are read into the server. - - + + + Most types of dictionaries rely on configuration files, such as files of stop + words. These files must be stored in UTF-8 encoding. They will + be translated to the actual database encoding, if that is different, when they + are read into the server. + + - + - -Synonym Dictionary + + Synonym Dictionary + + + This dictionary template is used to create dictionaries which replace a + word with a synonym. Phrases are not supported (use the thesaurus + dictionary () for that). A synonym + dictionary can be used to overcome linguistic problems, for example, to + prevent an English stemmer dictionary from reducing the word 'Paris' to + 'pari'. It is enough to have a Paris paris line in the + synonym dictionary and put it before the english_stem dictionary: - -This dictionary template is used to create dictionaries which replace a -word with a synonym. Phrases are not supported (use the thesaurus -dictionary () for that). A synonym -dictionary can be used to overcome linguistic problems, for example, to -prevent an English stemmer dictionary from reducing the word 'Paris' to -'pari'. It is enough to have a Paris paris line in the -synonym dictionary and put it before the english_stem dictionary: SELECT * FROM ts_debug('english','Paris'); Alias | Description | Token | Dictionaries | Lexized token @@ -2119,90 +2252,95 @@ SELECT * FROM ts_debug('english','Paris'); lword | Latin word | Paris | {synonym,english_stem} | synonym: {paris} (1 row) - - - - - -Thesaurus Dictionary - - -A thesaurus dictionary (sometimes abbreviated as TZ) is -a collection of words which includes information about the relationships -of words and phrases, i.e., broader terms (BT), narrower -terms (NT), preferred terms, non-preferred terms, related -terms, etc. - - -Basically a thesaurus dictionary replaces all non-preferred terms by one -preferred term and, optionally, preserves them for indexing. Thesauruses -are used during indexing so any change in the thesaurus requires -reindexing. The current implementation of the thesaurus -dictionary is an extension of the synonym dictionary with added -phrase support. A thesaurus dictionary requires -a configuration file of the following format: + + + + + + Thesaurus Dictionary + + + A thesaurus dictionary (sometimes abbreviated as TZ) is + a collection of words which includes information about the relationships + of words and phrases, i.e., broader terms (BT), narrower + terms (NT), preferred terms, non-preferred terms, related + terms, etc. + + + + Basically a thesaurus dictionary replaces all non-preferred terms by one + preferred term and, optionally, preserves them for indexing. Thesauruses + are used during indexing so any change in the thesaurus requires + reindexing. The current implementation of the thesaurus + dictionary is an extension of the synonym dictionary with added + phrase support. 
A thesaurus dictionary requires + a configuration file of the following format: + # this is a comment sample word(s) : indexed word(s) more sample word(s) : more indexed word(s) ... -where the colon (:) symbol acts as a delimiter between a -a phrase and its replacement. - - - -A thesaurus dictionary uses a subdictionary (which -is defined in the dictionary's configuration) to normalize the input text -before checking for phrase matches. It is only possible to select one -subdictionary. An error is reported if the subdictionary fails to -recognize a word. In that case, you should remove the use of the word or teach -the subdictionary about it. Use an asterisk (*) at the -beginning of an indexed word to skip the subdictionary. It is still required -that sample words are known. - - - -The thesaurus dictionary looks for the longest match. - - - -Stop words recognized by the subdictionary are replaced by a 'stop word -placeholder' to record their position. To break possible ties the thesaurus -uses the last definition. To illustrate this, consider a thesaurus (with -a simple subdictionary) with pattern -swsw, where s designates any stop word and -w, any known word: + + where the colon (:) symbol acts as a delimiter between a + a phrase and its replacement. + + + + A thesaurus dictionary uses a subdictionary (which + is defined in the dictionary's configuration) to normalize the input text + before checking for phrase matches. It is only possible to select one + subdictionary. An error is reported if the subdictionary fails to + recognize a word. In that case, you should remove the use of the word or teach + the subdictionary about it. Use an asterisk (*) at the + beginning of an indexed word to skip the subdictionary. It is still required + that sample words are known. + + + + The thesaurus dictionary looks for the longest match. + + + + Stop words recognized by the subdictionary are replaced by a 'stop word + placeholder' to record their position. To break possible ties the thesaurus + uses the last definition. To illustrate this, consider a thesaurus (with + a simple subdictionary) with pattern + swsw, where s designates any stop word and + w, any known word: + a one the two : swsw the one a two : swsw2 -Words a and the are stop words defined in the -configuration of a subdictionary. The thesaurus considers the -one the two and that one then two as equal -and will use definition swsw2. - - -As any normal dictionary, it can be assigned to the specific lexeme types. -Since a thesaurus dictionary has the capability to recognize phrases it -must remember its state and interact with the parser. A thesaurus dictionary -uses these assignments to check if it should handle the next word or stop -accumulation. The thesaurus dictionary compiler must be configured -carefully. For example, if the thesaurus dictionary is assigned to handle -only the lword lexeme, then a thesaurus dictionary -definition like ' one 7' will not work since lexeme type -digit is not assigned to the thesaurus dictionary. - + Words a and the are stop words defined in the + configuration of a subdictionary. The thesaurus considers the + one the two and that one then two as equal + and will use definition swsw2. + - + + As any normal dictionary, it can be assigned to the specific lexeme types. + Since a thesaurus dictionary has the capability to recognize phrases it + must remember its state and interact with the parser. 
A thesaurus dictionary + uses these assignments to check if it should handle the next word or stop + accumulation. The thesaurus dictionary compiler must be configured + carefully. For example, if the thesaurus dictionary is assigned to handle + only the lword lexeme, then a thesaurus dictionary + definition like ' one 7' will not work since lexeme type + digit is not assigned to the thesaurus dictionary. + - -Thesaurus Configuration + - -To define a new thesaurus dictionary one can use the thesaurus template. -For example: + + Thesaurus Configuration + + + To define a new thesaurus dictionary one can use the thesaurus template. + For example: CREATE TEXT SEARCH DICTIONARY thesaurus_simple ( @@ -2211,88 +2349,106 @@ CREATE TEXT SEARCH DICTIONARY thesaurus_simple ( Dictionary = pg_catalog.english_stem ); -Here: - - -thesaurus_simple is the thesaurus dictionary name - - -mythesaurus is the base name of the thesaurus file -(its full name will be $SHAREDIR/tsearch_data/mythesaurus.ths, -where $SHAREDIR means the installation shared-data directory, -often /usr/local/share). - - -pg_catalog.english_stem is the dictionary (Snowball -English stemmer) to use for thesaurus normalization. Notice that the -english_stem dictionary has its own configuration (for example, -stop words), which is not shown here. - - - -Now it is possible to bind the thesaurus dictionary thesaurus_simple -and selected tokens, for example: + + Here: + + + + thesaurus_simple is the thesaurus dictionary name + + + + + mythesaurus is the base name of the thesaurus file + (its full name will be $SHAREDIR/tsearch_data/mythesaurus.ths, + where $SHAREDIR means the installation shared-data directory, + often /usr/local/share). + + + + + pg_catalog.english_stem is the dictionary (Snowball + English stemmer) to use for thesaurus normalization. Notice that the + english_stem dictionary has its own configuration (for example, + stop words), which is not shown here. + + + + + Now it is possible to bind the thesaurus dictionary thesaurus_simple + and selected tokens, for example: ALTER TEXT SEARCH CONFIGURATION russian ADD MAPPING FOR lword, lhword, lpart_hword WITH thesaurus_simple; - + + + - + + Thesaurus Example - -Thesaurus Example + + Consider a simple astronomical thesaurus thesaurus_astro, + which contains some astronomical word combinations: - -Consider a simple astronomical thesaurus thesaurus_astro, -which contains some astronomical word combinations: supernovae stars : sn crab nebulae : crab -Below we create a dictionary and bind some token types with -an astronomical thesaurus and english stemmer: + + Below we create a dictionary and bind some token types with + an astronomical thesaurus and english stemmer: + CREATE TEXT SEARCH DICTIONARY thesaurus_astro ( TEMPLATE = thesaurus, DictFile = thesaurus_astro, Dictionary = english_stem ); + ALTER TEXT SEARCH CONFIGURATION russian ADD MAPPING FOR lword, lhword, lpart_hword WITH thesaurus_astro, english_stem; -Now we can see how it works. Note that ts_lexize cannot -be used for testing the thesaurus (see description of -ts_lexize), but we can use -plainto_tsquery and to_tsvector -which accept text arguments, not lexemes: + + Now we can see how it works. 
Note that ts_lexize cannot + be used for testing the thesaurus (see description of + ts_lexize), but we can use + plainto_tsquery and to_tsvector + which accept text arguments, not lexemes: SELECT plainto_tsquery('supernova star'); plainto_tsquery ----------------- 'sn' + SELECT to_tsvector('supernova star'); to_tsvector ------------- 'sn':1 -In principle, one can use to_tsquery if you quote -the argument: + + In principle, one can use to_tsquery if you quote + the argument: + SELECT to_tsquery('''supernova star'''); to_tsquery ------------ 'sn' -Notice that supernova star matches supernovae -stars in thesaurus_astro because we specified the -english_stem stemmer in the thesaurus definition. - - -To keep an original phrase in full text indexing just add it to the right part -of the definition: + + Notice that supernova star matches supernovae + stars in thesaurus_astro because we specified the + english_stem stemmer in the thesaurus definition. + + + + To keep an original phrase in full text indexing just add it to the right part + of the definition: + supernovae stars : sn supernovae stars @@ -2301,56 +2457,60 @@ SELECT plainto_tsquery('supernova star'); ----------------------------- 'sn' & 'supernova' & 'star' - - - - - -Ispell Dictionary - - -The Ispell template dictionary for full text allows the -creation of morphological dictionaries based on Ispell, which -supports a large number of languages. This dictionary tries to change an -input word to its normalized form. Also, more modern spelling dictionaries -are supported - MySpell (OO < 2.0.1) -and Hunspell -(OO >= 2.0.2). A large list of dictionaries is available on the OpenOffice -Wiki. - - - -The Ispell dictionary allows searches without bothering -about different linguistic forms of a word. For example, a search on -bank would return hits of all declensions and -conjugations of the search term bank, e.g. -banking, banked, banks, -banks', and bank's. + + + + + + Ispell Dictionary + + + The Ispell template dictionary for full text allows the + creation of morphological dictionaries based on Ispell, which + supports a large number of languages. This dictionary tries to change an + input word to its normalized form. Also, more modern spelling dictionaries + are supported - MySpell (OO < 2.0.1) + and Hunspell + (OO >= 2.0.2). A large list of dictionaries is available on the OpenOffice + Wiki. + + + + The Ispell dictionary allows searches without bothering + about different linguistic forms of a word. For example, a search on + bank would return hits of all declensions and + conjugations of the search term bank, e.g. + banking, banked, banks, + banks', and bank's. + SELECT ts_lexize('english_ispell','banking'); ts_lexize ----------- {bank} + SELECT ts_lexize('english_ispell','bank''s'); ts_lexize ----------- {bank} + SELECT ts_lexize('english_ispell','banked'); ts_lexize ----------- {bank} - + + + + To create an ispell dictionary one should use the built-in + ispell dictionary and specify several + parameters. + - -To create an ispell dictionary one should use the built-in -ispell dictionary and specify several -parameters. - CREATE TEXT SEARCH DICTIONARY english_ispell ( TEMPLATE = ispell, @@ -2359,200 +2519,216 @@ CREATE TEXT SEARCH DICTIONARY english_ispell ( StopWords = english ); - -Here, DictFile, AffFile, and StopWords -specify the names of the dictionary, affixes, and stop-words files. 
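Once defined, such a dictionary is usually placed in front of a stemmer in a configuration mapping, for example (a sketch assuming a configuration named english that maps the lword token type):

ALTER TEXT SEARCH CONFIGURATION english
    ALTER MAPPING FOR lword WITH english_ispell, english_stem;
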
- - - -Ispell dictionaries usually recognize a restricted set of words so they -should be used in conjunction with another broader dictionary; for -example, a stemming dictionary, which recognizes everything. - - - -Ispell dictionaries support splitting compound words based on an -ispell dictionary. This is a nice feature and full text searching -in PostgreSQL supports it. -Notice that the affix file should specify a special flag using the -compoundwords controlled statement that marks dictionary -words that can participate in compound formation: + + + Here, DictFile, AffFile, and StopWords + specify the names of the dictionary, affixes, and stop-words files. + + + + Ispell dictionaries usually recognize a restricted set of words so they + should be used in conjunction with another broader dictionary; for + example, a stemming dictionary, which recognizes everything. + + + + Ispell dictionaries support splitting compound words based on an + ispell dictionary. This is a nice feature and full text searching + in PostgreSQL supports it. + Notice that the affix file should specify a special flag using the + compoundwords controlled statement that marks dictionary + words that can participate in compound formation: + compoundwords controlled z -Several examples for the Norwegian language: + + Several examples for the Norwegian language: + SELECT ts_lexize('norwegian_ispell','overbuljongterningpakkmesterassistent'); - {over,buljong,terning,pakk,mester,assistent} + {over,buljong,terning,pakk,mester,assistent} SELECT ts_lexize('norwegian_ispell','sjokoladefabrikk'); - {sjokoladefabrikk,sjokolade,fabrikk} - - - - - -MySpell does not support compound words. -Hunspell has sophisticated support for compound words. At -present, full text searching implements only the basic compound word -operations of Hunspell. - - - - - - -<application>Snowball</> Stemming Dictionary - - -The Snowball dictionary template is based on the project -of Martin Porter, inventor of the popular Porter's stemming algorithm -for the English language and now supported in many languages (see the Snowball site for more -information). The Snowball project supplies a large number of stemmers for -many languages. A Snowball dictionary requires a language parameter to -identify which stemmer to use, and optionally can specify a stopword file name. -For example, there is a built-in definition equivalent to + {sjokoladefabrikk,sjokolade,fabrikk} + + + + + + MySpell does not support compound words. + Hunspell has sophisticated support for compound words. At + present, full text searching implements only the basic compound word + operations of Hunspell. + + + + + + + <application>Snowball</> Stemming Dictionary + + + The Snowball dictionary template is based on the project + of Martin Porter, inventor of the popular Porter's stemming algorithm + for the English language and now supported in many languages (see the Snowball site for more + information). The Snowball project supplies a large number of stemmers for + many languages. A Snowball dictionary requires a language parameter to + identify which stemmer to use, and optionally can specify a stopword file name. + For example, there is a built-in definition equivalent to + CREATE TEXT SEARCH DICTIONARY english_stem ( TEMPLATE = snowball, Language = english, StopWords = english ); - + + + + The Snowball dictionary recognizes everything, so it is best + to place it at the end of the dictionary stack. 
It it useless to have it + before any other dictionary because a lexeme will never pass through it to + the next dictionary. + - -The Snowball dictionary recognizes everything, so it is best -to place it at the end of the dictionary stack. It it useless to have it -before any other dictionary because a lexeme will never pass through it to -the next dictionary. - + - + + Dictionary Testing - -Dictionary Testing + + The ts_lexize function facilitates dictionary testing: - -The ts_lexize function facilitates dictionary testing: + - - + - -ts_lexize - + + ts_lexize + - - -ts_lexize(dict_name text, lexeme text) returns text[] - - + + + ts_lexize(dict_name text, lexeme text) returns text[] + + + + + + Returns an array of lexemes if the input lexeme + is known to the dictionary dictname, or a void + array if the lexeme is known to the dictionary but it is a stop word, or + NULL if it is an unknown word. + - - -Returns an array of lexemes if the input lexeme -is known to the dictionary dictname, or a void -array if the lexeme is known to the dictionary but it is a stop word, or -NULL if it is an unknown word. - SELECT ts_lexize('english_stem', 'stars'); ts_lexize ----------- {star} + SELECT ts_lexize('english_stem', 'a'); ts_lexize ----------- {} - - + + + + + - - + + + The ts_lexize function expects a + lexeme, not text. Below is an example: - - -The ts_lexize function expects a -lexeme, not text. Below is an example: SELECT ts_lexize('thesaurus_astro','supernovae stars') is null; ?column? ---------- t -The thesaurus dictionary thesaurus_astro does know -supernovae stars, but ts_lexize fails since it -does not parse the input text and considers it as a single lexeme. Use -plainto_tsquery and to_tsvector to test thesaurus -dictionaries: + + The thesaurus dictionary thesaurus_astro does know + supernovae stars, but ts_lexize fails since it + does not parse the input text and considers it as a single lexeme. Use + plainto_tsquery and to_tsvector to test thesaurus + dictionaries: + SELECT plainto_tsquery('supernovae stars'); plainto_tsquery ----------------- 'sn' - - - - - - -Configuration Example - - -A full text configuration specifies all options necessary to transform a -document into a tsvector: the parser breaks text into tokens, -and the dictionaries transform each token into a lexeme. Every call to -to_tsvector() and to_tsquery() -needs a configuration to perform its processing. To facilitate management -of full text searching objects, a set of SQL commands -is available, and there are several psql commands which display information -about full text searching objects (). - - - -The configuration parameter - -specifies the name of the current default configuration, which is the -one used by text search functions when an explicit configuration -parameter is omitted. -It can be set in postgresql.conf, or set for an -individual session using the SET command. - - - -Several predefined text searching configurations are available in the -pg_catalog schema. If you need a custom configuration -you can create a new text searching configuration and modify it using SQL -commands. - -New text searching objects are created in the current schema by default -(usually the public schema), but a schema-qualified -name can be used to create objects in the specified schema. - - - -As an example, we will create a configuration -pg which starts as a duplicate of the -english configuration. 
To be safe, we do this in a transaction: + + + + + + + Configuration Example + + + A full text configuration specifies all options necessary to transform a + document into a tsvector: the parser breaks text into tokens, + and the dictionaries transform each token into a lexeme. Every call to + to_tsvector() and to_tsquery() + needs a configuration to perform its processing. To facilitate management + of full text searching objects, a set of SQL commands + is available, and there are several psql commands which display information + about full text searching objects (). + + + + The configuration parameter + + specifies the name of the current default configuration, which is the + one used by text search functions when an explicit configuration + parameter is omitted. + It can be set in postgresql.conf, or set for an + individual session using the SET command. + + + + Several predefined text searching configurations are available in the + pg_catalog schema. If you need a custom configuration + you can create a new text searching configuration and modify it using SQL + commands. + + + + New text searching objects are created in the current schema by default + (usually the public schema), but a schema-qualified + name can be used to create objects in the specified schema. + + + + As an example, we will create a configuration + pg which starts as a duplicate of the + english configuration. To be safe, we do this in a transaction: + BEGIN; CREATE TEXT SEARCH CONFIGURATION public.pg ( COPY = english ); - + + + + We will use a PostgreSQL-specific synonym list + and store it in share/tsearch_data/pg_dict.syn. + The file contents look like: - -We will use a PostgreSQL-specific synonym list -and store it in share/tsearch_data/pg_dict.syn. -The file contents look like: - + postgres pg pgsql pg postgresql pg -We define the dictionary like this: + We define the dictionary like this: + CREATE TEXT SEARCH DICTIONARY pg_dict ( TEMPLATE = synonym @@ -2560,11 +2736,11 @@ CREATE TEXT SEARCH DICTIONARY pg_dict ( ); - + - -Then register the ispell dictionary -english_ispell using the ispell template: + + Then register the ispell dictionary + english_ispell using the ispell template: CREATE TEXT SEARCH DICTIONARY english_ispell ( @@ -2574,29 +2750,30 @@ CREATE TEXT SEARCH DICTIONARY english_ispell ( StopWords = english ); - + - -Now modify mappings for Latin words for configuration pg: + + Now modify mappings for Latin words for configuration pg: ALTER TEXT SEARCH CONFIGURATION pg ALTER MAPPING FOR lword, lhword, lpart_hword WITH pg_dict, english_ispell, english_stem; - + - -We do not index or search some tokens: + + We do not index or search some tokens: ALTER TEXT SEARCH CONFIGURATION pg DROP MAPPING FOR email, url, sfloat, uri, float; - + + + + Now, we can test our configuration: - -Now, we can test our configuration: SELECT * FROM ts_debug('public.pg', ' PostgreSQL, the highly scalable, SQL compliant, open source object-relational @@ -2604,15 +2781,16 @@ database management system, is now undergoing beta testing of the next version of our software: PostgreSQL 8.3. '); -COMMIT; + COMMIT; - + + + + With the dictionaries and mappings set up, suppose we have a table + pgweb which contains 11239 documents from the + PostgreSQL web site. Only relevant columns + are shown: - -With the dictionaries and mappings set up, suppose we have a table -pgweb which contains 11239 documents from the -PostgreSQL web site. 
Only relevant columns -are shown: => \d pgweb Table "public.pgweb" @@ -2624,18 +2802,18 @@ are shown: title | character varying | dlm | date | - + + + + The next step is to set the session to use the new configuration, which was + created in the public schema: - -The next step is to set the session to use the new configuration, which was -created in the public schema: => \dF -postgres=# \dF public.* -List of fulltext configurations - Schema | Name | Description ---------+------+------------- - public | pg | + List of fulltext configurations + Schema | Name | Description +---------+------+------------- + public | pg | SET default_text_search_config = 'public.pg'; SET @@ -2645,41 +2823,43 @@ SHOW default_text_search_config; ---------------------------- public.pg - + + + - + + Managing Multiple Configurations - -Managing Multiple Configurations + + If you are using the same text search configuration for the entire cluster + just set the value in postgresql.conf. If using a single + text search configuration for an entire database, use ALTER + DATABASE ... SET. + - -If you are using the same text search configuration for the entire cluster -just set the value in postgresql.conf. If using a single -text search configuration for an entire database, use ALTER -DATABASE ... SET. - + + However, if you need to use several text search configurations in the same + database you must be careful to reference the proper text search + configuration. This can be done by either setting + default_text_search_config in each session or supplying the + configuration name in every function call, e.g. to_tsquery('french', + 'friend'), to_tsvector('english', col). If you are using an expression + index you must embed the configuration name into the expression index, e.g.: - -However, if you need to use several text search configurations in the same -database you must be careful to reference the proper text search -configuration. This can be done by either setting -default_text_search_config in each session or supplying the -configuration name in every function call, e.g. to_tsquery('french', -'friend'), to_tsvector('english', col). If you are using an expression -index you must embed the configuration name into the expression index, e.g.: CREATE INDEX pgweb_idx ON pgweb USING gin(to_tsvector('french', title || body)); -And for an expression index, specify the configuration name in the -WHERE clause as well so the expression index will be used. - - + And for an expression index, specify the configuration name in the + WHERE clause as well so the expression index will be used. + + + - + - -GiST and GIN Index Types + + GiST and GIN Index Types index @@ -2687,67 +2867,66 @@ And for an expression index, specify the configuration name in the - -There are two kinds of indexes which can be used to speed up full text -operators (). -Note that indexes are not mandatory for full text searching. + + There are two kinds of indexes which can be used to speed up full text + operators (). + Note that indexes are not mandatory for full text searching. - + - + - -index -GIST, for text searching - + + index + GIST, for text searching + - - -CREATE INDEX name ON table USING gist(column); - - + + + CREATE INDEX name ON table USING gist(column); + + - - -Creates a GiST (Generalized Search Tree)-based index. -The column can be of tsvector or -tsquery type. - + + + Creates a GiST (Generalized Search Tree)-based index. + The column can be of tsvector or + tsquery type. 
+ + + - - + - + + index + GIN + - -index -GIN - + + + CREATE INDEX name ON table USING gin(column); + + - - -CREATE INDEX name ON table USING gin(column); - - + + + Creates a GIN (Generalized Inverted Index)-based index. + The column must be of tsvector type. + + + - - -Creates a GIN (Generalized Inverted Index)-based index. -The column must be of tsvector type. - + + - - + + A GiST index is lossy, meaning it is necessary + to check the actual table row to eliminate false matches. + PostgreSQL does this automatically; for + example, in the query plan below, the Filter: + line indicates the index output will be rechecked: - - - - -A GiST index is lossy, meaning it is necessary -to check the actual table row to eliminate false matches. -PostgreSQL does this automatically; for -example, in the query plan below, the Filter: -line indicates the index output will be rechecked: EXPLAIN SELECT * FROM apod WHERE textsearch @@ to_tsquery('supernovae'); QUERY PLAN @@ -2756,62 +2935,64 @@ EXPLAIN SELECT * FROM apod WHERE textsearch @@ to_tsquery('supernovae'); Index Cond: (textsearch @@ '''supernova'''::tsquery) Filter: (textsearch @@ '''supernova'''::tsquery) -GiST index lossiness happens because each document is represented by a -fixed-length signature. The signature is generated by hashing (crc32) each -word into a random bit in an n-bit string and all words combine to produce -an n-bit document signature. Because of hashing there is a chance that -some words hash to the same position and could result in a false hit. -Signatures calculated for each document in a collection are stored in an -RD-tree (Russian Doll tree), invented by Hellerstein, -which is an adaptation of R-tree for sets. In our case -the transitive containment relation is realized by -superimposed coding (Knuth, 1973) of signatures, i.e., a parent is the -result of 'OR'-ing the bit-strings of all children. This is a second -factor of lossiness. It is clear that parents tend to be full of -1s (degenerates) and become quite useless because of the -limited selectivity. Searching is performed as a bit comparison of a -signature representing the query and an RD-tree entry. -If all 1s of both signatures are in the same position we -say that this branch probably matches the query, but if there is even one -discrepancy we can definitely reject this branch. - - - -Lossiness causes serious performance degradation since random access of -heap records is slow and limits the usefulness of GiST -indexes. The likelihood of false hits depends on several factors, like -the number of unique words, so using dictionaries to reduce this number -is recommended. - - - -Actually, this is not the whole story. GiST indexes have an optimization -for storing small tsvectors (< TOAST_INDEX_TARGET -bytes, 512 bytes). On leaf pages small tsvectors are stored unchanged, -while longer ones are represented by their signatures, which introduces -some lossiness. Unfortunately, the existing index API does not allow for -a return value to say whether it found an exact value (tsvector) or whether -the result needs to be checked. This is why the GiST index is -currently marked as lossy. We hope to improve this in the future. - - - -GIN indexes are not lossy but their performance depends logarithmically on -the number of unique words. - - - -There is one side-effect of the non-lossiness of a GIN index when using -query labels/weights, like 'supernovae:a'. A GIN index -has all the information necessary to determine a match, so the heap is -not accessed. 
However, label information is not stored in the index, -so if the query involves label weights it must access -the heap. Therefore, a special full text search operator @@@ -was created which forces the use of the heap to get information about -labels. GiST indexes are lossy so it always reads the heap and there is -no need for a special operator. In the example below, -fulltext_idx is a GIN index: + + GiST index lossiness happens because each document is represented by a + fixed-length signature. The signature is generated by hashing (crc32) each + word into a random bit in an n-bit string and all words combine to produce + an n-bit document signature. Because of hashing there is a chance that + some words hash to the same position and could result in a false hit. + Signatures calculated for each document in a collection are stored in an + RD-tree (Russian Doll tree), invented by Hellerstein, + which is an adaptation of R-tree for sets. In our case + the transitive containment relation is realized by + superimposed coding (Knuth, 1973) of signatures, i.e., a parent is the + result of 'OR'-ing the bit-strings of all children. This is a second + factor of lossiness. It is clear that parents tend to be full of + 1s (degenerates) and become quite useless because of the + limited selectivity. Searching is performed as a bit comparison of a + signature representing the query and an RD-tree entry. + If all 1s of both signatures are in the same position we + say that this branch probably matches the query, but if there is even one + discrepancy we can definitely reject this branch. + + + + Lossiness causes serious performance degradation since random access of + heap records is slow and limits the usefulness of GiST + indexes. The likelihood of false hits depends on several factors, like + the number of unique words, so using dictionaries to reduce this number + is recommended. + + + + Actually, this is not the whole story. GiST indexes have an optimization + for storing small tsvectors (< TOAST_INDEX_TARGET + bytes, 512 bytes). On leaf pages small tsvectors are stored unchanged, + while longer ones are represented by their signatures, which introduces + some lossiness. Unfortunately, the existing index API does not allow for + a return value to say whether it found an exact value (tsvector) or whether + the result needs to be checked. This is why the GiST index is + currently marked as lossy. We hope to improve this in the future. + + + + GIN indexes are not lossy but their performance depends logarithmically on + the number of unique words. + + + + There is one side-effect of the non-lossiness of a GIN index when using + query labels/weights, like 'supernovae:a'. A GIN index + has all the information necessary to determine a match, so the heap is + not accessed. However, label information is not stored in the index, + so if the query involves label weights it must access + the heap. Therefore, a special full text search operator @@@ + was created which forces the use of the heap to get information about + labels. GiST indexes are lossy so it always reads the heap and there is + no need for a special operator. 
In the example below, + fulltext_idx is a GIN index: + EXPLAIN SELECT * FROM apod WHERE textsearch @@@ to_tsquery('supernovae:a'); QUERY PLAN @@ -2821,95 +3002,117 @@ EXPLAIN SELECT * FROM apod WHERE textsearch @@@ to_tsquery('supernovae:a'); Filter: (textsearch @@@ '''supernova'':A'::tsquery) - - - -In choosing which index type to use, GiST or GIN, consider these differences: - - -GiN index lookups are three times faster than GiST - - -GiN indexes take three times longer to build than GiST - - -GiN is about ten times slower to update than GiST - - -GiN indexes are two-to-three times larger than GiST - - - - - -In summary, GIN indexes are best for static data because -the indexes are faster for lookups. For dynamic data, GiST indexes are -faster to update. Specifically, GiST indexes are very -good for dynamic data and fast if the number of unique words (lexemes) is -under 100,000, while GIN handles +100,000 lexemes better -but is slower to update. - - - -Partitioning of big collections and the proper use of GiST and GIN indexes -allows the implementation of very fast searches with online update. -Partitioning can be done at the database level using table inheritance -and constraint_exclusion, or distributing documents over -servers and collecting search results using the contrib/dblink -extension module. The latter is possible because ranking functions use -only local information. - - - - - -Limitations - - -The current limitations of Full Text Searching are: - -The length of each lexeme must be less than 2K bytes -The length of a tsvector (lexemes + positions) must be less than 1 megabyte -The number of lexemes must be less than 264 -Positional information must be non-negative and less than 16,383 -No more than 256 positions per lexeme -The number of nodes (lexemes + operations) in tsquery must be less than 32,768 - - - - -For comparison, the PostgreSQL 8.1 documentation -contained 10,441 unique words, a total of 335,420 words, and the most frequent -word postgresql was mentioned 6,127 times in 655 documents. - - - - -Another example — the PostgreSQL mailing list -archives contained 910,989 unique words with 57,491,343 lexemes in 461,020 -messages. - - - - - -<application>psql</> Support - - -Information about full text searching objects can be obtained -in psql using a set of commands: - -\dF{,d,p}+ PATTERN - -An optional + produces more details. - - -The optional parameter PATTERN should be the name of -a full text searching object, optionally schema-qualified. If -PATTERN is not specified then information about all -visible objects will be displayed. PATTERN can be a -regular expression and can apply separately to schema -names and object names. The following examples illustrate this: + + + + In choosing which index type to use, GiST or GIN, consider these differences: + + + + GiN index lookups are three times faster than GiST + + + + + GiN indexes take three times longer to build than GiST + + + + + GiN is about ten times slower to update than GiST + + + + + GiN indexes are two-to-three times larger than GiST + + + + + + + In summary, GIN indexes are best for static data because + the indexes are faster for lookups. For dynamic data, GiST indexes are + faster to update. Specifically, GiST indexes are very + good for dynamic data and fast if the number of unique words (lexemes) is + under 100,000, while GIN handles +100,000 lexemes better + but is slower to update. 
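For example, either kind of index could be built on the textsearch column of the apod table used in the earlier examples (a sketch; in practice one would normally pick just one of them):

-- mostly static collection: GIN gives faster lookups
CREATE INDEX apod_textsearch_gin ON apod USING gin(textsearch);

-- frequently updated collection: GiST is cheaper to update
CREATE INDEX apod_textsearch_gist ON apod USING gist(textsearch);
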
+ + + + Partitioning of big collections and the proper use of GiST and GIN indexes + allows the implementation of very fast searches with online update. + Partitioning can be done at the database level using table inheritance + and constraint_exclusion, or distributing documents over + servers and collecting search results using the contrib/dblink + extension module. The latter is possible because ranking functions use + only local information. + + + + + + Limitations + + + The current limitations of Full Text Searching are: + + + The length of each lexeme must be less than 2K bytes + + + The length of a tsvector (lexemes + positions) must be less than 1 megabyte + + + The number of lexemes must be less than 264 + + + Positional information must be non-negative and less than 16,383 + + + No more than 256 positions per lexeme + + + The number of nodes (lexemes + operations) in tsquery must be less than 32,768 + + + + + + For comparison, the PostgreSQL 8.1 documentation + contained 10,441 unique words, a total of 335,420 words, and the most frequent + word postgresql was mentioned 6,127 times in 655 documents. + + + + + Another example — the PostgreSQL mailing list + archives contained 910,989 unique words with 57,491,343 lexemes in 461,020 + messages. + + + + + + <application>psql</> Support + + + Information about full text searching objects can be obtained + in psql using a set of commands: + + \dF{,d,p}+ PATTERN + + An optional + produces more details. + + + + The optional parameter PATTERN should be the name of + a full text searching object, optionally schema-qualified. If + PATTERN is not specified then information about all + visible objects will be displayed. PATTERN can be a + regular expression and can apply separately to schema + names and object names. The following examples illustrate this: + => \dF *fulltext* List of fulltext configurations @@ -2926,23 +3129,24 @@ names and object names. The following examples illustrate this: fulltext | fulltext_cfg | public | fulltext_cfg | - + - + -\dF[+] [PATTERN] + \dF[+] [PATTERN] - List full text searching configurations (add "+" for more detail) + List full text searching configurations (add "+" for more detail) By default (without PATTERN), information about all visible full text configurations will be displayed. - + + => \dF russian List of fulltext configurations @@ -2951,8 +3155,8 @@ names and object names. The following examples illustrate this: pg_catalog | russian | default configuration for Russian => \dF+ russian -Configuration "pg_catalog.russian" -Parser name: "pg_catalog.default" + Configuration "pg_catalog.russian" + Parser name: "pg_catalog.default" Token | Dictionaries --------------+------------------------- email | pg_catalog.simple @@ -2975,21 +3179,22 @@ Parser name: "pg_catalog.default" version | pg_catalog.simple word | pg_catalog.russian_stem - + -\dFd[+] [PATTERN] + \dFd[+] [PATTERN] - List full text dictionaries (add "+" for more detail). + List full text dictionaries (add "+" for more detail). By default (without PATTERN), information about all visible dictionaries will be displayed. 
- + + => \dFd List of fulltext dictionaries @@ -3012,29 +3217,29 @@ Parser name: "pg_catalog.default" pg_catalog | swedish | Snowball stemmer for swedish language pg_catalog | turkish | Snowball stemmer for turkish language - + -\dFp[+] [PATTERN] + \dFp[+] [PATTERN] - List full text parsers (add "+" for more detail) + List full text parsers (add "+" for more detail) By default (without PATTERN), information about all visible full text parsers will be displayed. - + -=> \dFp + => \dFp List of fulltext parsers Schema | Name | Description ------------+---------+--------------------- pg_catalog | default | default word parser -(1 row) + (1 row) => \dFp+ Fulltext parser "pg_catalog.default" Method | Function | Description @@ -3073,35 +3278,36 @@ Parser name: "pg_catalog.default" word | Word (23 rows) - + - - + - -Debugging + + Debugging - -Function ts_debug allows easy testing of your full text searching -configuration. - + + Function ts_debug allows easy testing of your full text searching + configuration. + - -ts_debug(config_name, document TEXT) returns SETOF ts_debug - + + ts_debug(config_name, document TEXT) returns SETOF ts_debug + + + + ts_debug displays information about every token of + document as produced by the + parser and processed by the configured dictionaries using the configuration + specified by config_name. + + + + ts_debug type defined as: - -ts_debug displays information about every token of -document as produced by the -parser and processed by the configured dictionaries using the configuration -specified by config_name. - - -ts_debug type defined as: CREATE TYPE ts_debug AS ( "Alias" text, @@ -3111,14 +3317,15 @@ CREATE TYPE ts_debug AS ( "Lexized token" text ); - + + + + For a demonstration of how function ts_debug works we + first create a public.english configuration and + ispell dictionary for the English language. You can skip the test step and + play with the standard english configuration. + - -For a demonstration of how function ts_debug works we -first create a public.english configuration and -ispell dictionary for the English language. You can skip the test step and -play with the standard english configuration. - CREATE TEXT SEARCH CONFIGURATION public.english ( COPY = pg_catalog.english ); @@ -3130,7 +3337,7 @@ CREATE TEXT SEARCH DICTIONARY english_ispell ( ); ALTER TEXT SEARCH CONFIGURATION public.english - ALTER MAPPING FOR lword WITH english_ispell, english_stem; + ALTER MAPPING FOR lword WITH english_ispell, english_stem; @@ -3144,26 +3351,28 @@ SELECT * FROM ts_debug('public.english','The Brightest supernovaes'); lword | Latin word | supernovaes | {public.english_ispell,pg_catalog.english_stem} | pg_catalog.english_stem: {supernova} (5 rows) - -In this example, the word Brightest was recognized by a -parser as a Latin word (alias lword) -and came through the dictionaries public.english_ispell and -pg_catalog.english_stem. It was recognized by -public.english_ispell, which reduced it to the noun -bright. The word supernovaes is unknown -by the public.english_ispell dictionary so it was passed to -the next dictionary, and, fortunately, was recognized (in fact, -public.english_stem is a stemming dictionary and recognizes -everything; that is why it was placed at the end of the dictionary stack). - - - -The word The was recognized by public.english_ispell -dictionary as a stop word () and will not be indexed. 
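
   To see the net effect on what actually gets indexed (a quick check, assuming
   the public.english configuration created above), run to_tsvector on the same
   phrase; the output should look roughly like this:

SELECT to_tsvector('public.english', 'The Brightest supernovaes');
       to_tsvector
--------------------------
 'bright':2 'supernova':3

   The stop word The still occupies a position but contributes no lexeme, which
   is why bright appears at position 2.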
- - - -You can always explicitly specify which columns you want to see: + + + In this example, the word Brightest was recognized by a + parser as a Latin word (alias lword) + and came through the dictionaries public.english_ispell and + pg_catalog.english_stem. It was recognized by + public.english_ispell, which reduced it to the noun + bright. The word supernovaes is unknown + by the public.english_ispell dictionary so it was passed to + the next dictionary, and, fortunately, was recognized (in fact, + public.english_stem is a stemming dictionary and recognizes + everything; that is why it was placed at the end of the dictionary stack). + + + + The word The was recognized by public.english_ispell + dictionary as a stop word () and will not be indexed. + + + + You can always explicitly specify which columns you want to see: + SELECT "Alias", "Token", "Lexized token" FROM ts_debug('public.english','The Brightest supernovaes'); @@ -3176,96 +3385,104 @@ FROM ts_debug('public.english','The Brightest supernovaes'); lword | supernovaes | pg_catalog.english_stem: {supernova} (5 rows) - - - - - -Example of Creating a Rule-Based Dictionary - - -The motivation for this example dictionary is to control the indexing of -integers (signed and unsigned), and, consequently, to minimize the number -of unique words which greatly affects to performance of searching. - - - -The dictionary accepts two options: - - - -The MAXLEN parameter specifies the maximum length of the -number considered as a 'good' integer. The default value is 6. - - - -The REJECTLONG parameter specifies if a 'long' integer -should be indexed or treated as a stop word. If -REJECTLONG=FALSE (default), -the dictionary returns the prefixed part of the integer with length -MAXLEN. If -REJECTLONG=TRUE, the dictionary -considers a long integer as a stop word. - - - - - - - -A similar idea can be applied to the indexing of decimal numbers, for -example, in the DecDict dictionary. The dictionary -accepts two options: the MAXLENFRAC parameter specifies -the maximum length of the fractional part considered as a 'good' decimal. -The default value is 3. The REJECTLONG parameter -controls whether a decimal number with a 'long' fractional part should be indexed -or treated as a stop word. If -REJECTLONG=FALSE (default), -the dictionary returns the decimal number with the length of its fraction part -truncated to MAXLEN. If -REJECTLONG=TRUE, the dictionary -considers the number as a stop word. Notice that -REJECTLONG=FALSE allows the indexing -of 'shortened' numbers and search results will contain documents with -shortened numbers. - - - - -Examples: + + + + + + Example of Creating a Rule-Based Dictionary + + + The motivation for this example dictionary is to control the indexing of + integers (signed and unsigned), and, consequently, to minimize the number + of unique words which greatly affects to performance of searching. + + + + The dictionary accepts two options: + + + + + The MAXLEN parameter specifies the maximum length of the + number considered as a 'good' integer. The default value is 6. + + + + + + The REJECTLONG parameter specifies if a 'long' integer + should be indexed or treated as a stop word. If + REJECTLONG=FALSE (default), + the dictionary returns the prefixed part of the integer with length + MAXLEN. If + REJECTLONG=TRUE, the dictionary + considers a long integer as a stop word. + + + + + + + + + A similar idea can be applied to the indexing of decimal numbers, for + example, in the DecDict dictionary. 
The dictionary + accepts two options: the MAXLENFRAC parameter specifies + the maximum length of the fractional part considered as a 'good' decimal. + The default value is 3. The REJECTLONG parameter + controls whether a decimal number with a 'long' fractional part should be indexed + or treated as a stop word. If + REJECTLONG=FALSE (default), + the dictionary returns the decimal number with the length of its fraction part + truncated to MAXLEN. If + REJECTLONG=TRUE, the dictionary + considers the number as a stop word. Notice that + REJECTLONG=FALSE allows the indexing + of 'shortened' numbers and search results will contain documents with + shortened numbers. + + + + Examples: + SELECT ts_lexize('intdict', 11234567890); ts_lexize ----------- {112345} - - -Now, we want to ignore long integers: + + + + Now, we want to ignore long integers: + ALTER TEXT SEARCH DICTIONARY intdict ( MAXLEN = 6, REJECTLONG = TRUE ); + SELECT ts_lexize('intdict', 11234567890); ts_lexize ----------- {} - + + + + Create contrib/dict_intdict directory with files + dict_tmpl.c, Makefile, dict_intdict.sql.in: - -Create contrib/dict_intdict directory with files -dict_tmpl.c, Makefile, dict_intdict.sql.in: -make && make install -psql DBNAME < dict_intdict.sql +$ make && make install +$ psql DBNAME < dict_intdict.sql - + - -This is a dict_tmpl.c file: - + + This is a dict_tmpl.c file: + #include "postgres.h" @@ -3280,61 +3497,61 @@ PG_MODULE_MAGIC; #include "utils/ts_public.h" #include "utils/ts_utils.h" - typedef struct { - int maxlen; - bool rejectlong; - } DictInt; +typedef struct { + int maxlen; + bool rejectlong; +} DictInt; - PG_FUNCTION_INFO_V1(dinit_intdict); - Datum dinit_intdict(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(dinit_intdict); +Datum dinit_intdict(PG_FUNCTION_ARGS); - Datum - dinit_intdict(PG_FUNCTION_ARGS) { - DictInt *d = (DictInt*)malloc( sizeof(DictInt) ); - Map *cfg, *pcfg; - text *in; +Datum +dinit_intdict(PG_FUNCTION_ARGS) { + DictInt *d = (DictInt*)malloc( sizeof(DictInt) ); + Map *cfg, *pcfg; + text *in; - if (!d) - elog(ERROR, "No memory"); - memset(d, 0, sizeof(DictInt)); + if (!d) + elog(ERROR, "No memory"); + memset(d, 0, sizeof(DictInt)); - /* Your INIT code */ -/* defaults */ - d->maxlen = 6; - d->rejectlong = false; + /* Your INIT code */ + /* defaults */ + d->maxlen = 6; + d->rejectlong = false; - if ( PG_ARGISNULL(0) || PG_GETARG_POINTER(0) == NULL ) /* no options */ - PG_RETURN_POINTER(d); + if (PG_ARGISNULL(0) || PG_GETARG_POINTER(0) == NULL) /* no options */ + PG_RETURN_POINTER(d); - in = PG_GETARG_TEXT_P(0); - parse_keyvalpairs(in, &cfg); - PG_FREE_IF_COPY(in, 0); - pcfg=cfg; + in = PG_GETARG_TEXT_P(0); + parse_keyvalpairs(in, &cfg); + PG_FREE_IF_COPY(in, 0); + pcfg=cfg; - while (pcfg->key) + while (pcfg->key) + { + if (strcasecmp("MAXLEN", pcfg->key) == 0) + d->maxlen=atoi(pcfg->value); + else if ( strcasecmp("REJECTLONG", pcfg->key) == 0) { - if (strcasecmp("MAXLEN", pcfg->key) == 0) - d->maxlen=atoi(pcfg->value); - else if ( strcasecmp("REJECTLONG", pcfg->key) == 0) - { - if ( strcasecmp("true", pcfg->value) == 0 ) - d->rejectlong=true; - else if ( strcasecmp("false", pcfg->value) == 0) - d->rejectlong=false; - else - elog(ERROR,"Unknown value: %s => %s", pcfg->key, pcfg->value); - } - else - elog(ERROR,"Unknown option: %s => %s", pcfg->key, pcfg->value); - - pfree(pcfg->key); - pfree(pcfg->value); - pcfg++; + if ( strcasecmp("true", pcfg->value) == 0 ) + d->rejectlong=true; + else if ( strcasecmp("false", pcfg->value) == 0) + d->rejectlong=false; + else + elog(ERROR,"Unknown 
value: %s => %s", pcfg->key, pcfg->value); } - pfree(cfg); + else + elog(ERROR,"Unknown option: %s => %s", pcfg->key, pcfg->value); - PG_RETURN_POINTER(d); + pfree(pcfg->key); + pfree(pcfg->value); + pcfg++; + } + pfree(cfg); + + PG_RETURN_POINTER(d); } PG_FUNCTION_INFO_V1(dlexize_intdict); @@ -3342,36 +3559,37 @@ Datum dlexize_intdict(PG_FUNCTION_ARGS); Datum dlexize_intdict(PG_FUNCTION_ARGS) { - DictInt *d = (DictInt*)PG_GETARG_POINTER(0); - char *in = (char*)PG_GETARG_POINTER(1); - char *txt = pnstrdup(in, PG_GETARG_INT32(2)); - TSLexeme *res = palloc(sizeof(TSLexeme) * 2); + DictInt *d = (DictInt*)PG_GETARG_POINTER(0); + char *in = (char*)PG_GETARG_POINTER(1); + char *txt = pnstrdup(in, PG_GETARG_INT32(2)); + TSLexeme *res = palloc(sizeof(TSLexeme) * 2); - /* Your INIT dictionary code */ - res[1].lexeme = NULL; + /* Your INIT dictionary code */ + res[1].lexeme = NULL; - if (PG_GETARG_INT32(2) > d->maxlen) - { - if (d->rejectlong) - { /* stop, return void array */ - pfree(txt); - res[0].lexeme = NULL; - } - else - { /* cut integer */ - txt[d->maxlen] = '\0'; - res[0].lexeme = txt; - } + if (PG_GETARG_INT32(2) > d->maxlen) + { + if (d->rejectlong) + { /* stop, return void array */ + pfree(txt); + res[0].lexeme = NULL; } else - res[0].lexeme = txt; + { /* cut integer */ + txt[d->maxlen] = '\0'; + res[0].lexeme = txt; + } + } + else + res[0].lexeme = txt; - PG_RETURN_POINTER(res); + PG_RETURN_POINTER(res); } - -This is the Makefile: + + This is the Makefile: + subdir = contrib/dict_intdict top_builddir = ../.. @@ -3384,134 +3602,136 @@ DOCS = include $(top_srcdir)/contrib/contrib-global.mk - + + + + This is a dict_intdict.sql.in: - -This is a dict_intdict.sql.in: SET default_text_search_config = 'english'; BEGIN; CREATE OR REPLACE FUNCTION dinit_intdict(internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE 'C'; + RETURNS internal + AS 'MODULE_PATHNAME' + LANGUAGE 'C'; CREATE OR REPLACE FUNCTION dlexize_intdict(internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE 'C' -WITH (isstrict); + RETURNS internal + AS 'MODULE_PATHNAME' + LANGUAGE 'C' + WITH (isstrict); CREATE TEXT SEARCH TEMPLATE intdict_template ( LEXIZE = dlexize_intdict, INIT = dinit_intdict ); CREATE TEXT SEARCH DICTIONARY intdict ( - TEMPLATE = intdict_template, - MAXLEN = 6, REJECTLONG = false + TEMPLATE = intdict_template, + MAXLEN = 6, REJECTLONG = false ); COMMENT ON TEXT SEARCH DICTIONARY intdict IS 'Dictionary for Integers'; END; - - - - - -Example of Creating a Parser - - -SQL command CREATE TEXT SEARCH PARSER creates -a parser for full text searching. In our example we will implement -a simple parser which recognizes space-delimited words and -has only two types (3, word, Word; 12, blank, Space symbols). Identifiers -were chosen to keep compatibility with the default headline() function -since we do not implement our own version. - - - -To implement a parser one needs to create a minimum of four functions. - - - - - - - -START = start_function - - - - -Initialize the parser. Arguments are a pointer to the parsed text and its -length. - - -Returns a pointer to the internal structure of a parser. Note that it should -be malloced or palloced in the -TopMemoryContext. We name it ParserState. - - - - - - - -GETTOKEN = gettoken_function - - - - -Returns the next token. -Arguments are ParserState *, char **, int *. - - -This procedure will be called as long as the procedure returns token type zero. 
- - - - - - - -END = end_function, - - - - -This void function will be called after parsing is finished to free -allocated resources in this procedure (ParserState). The argument -is ParserState *. - - - - - - - -LEXTYPES = lextypes_function - - - - -Returns an array containing the id, alias, and the description of the tokens -in the parser. See LexDescr in src/include/utils/ts_public.h. - - - - - - - -Below is the source code of our test parser, organized as a contrib module. - - - -Testing: + + + + + + Example of Creating a Parser + + + SQL command CREATE TEXT SEARCH PARSER creates + a parser for full text searching. In our example we will implement + a simple parser which recognizes space-delimited words and + has only two types (3, word, Word; 12, blank, Space symbols). Identifiers + were chosen to keep compatibility with the default headline() function + since we do not implement our own version. + + + + To implement a parser one needs to create a minimum of four functions. + + + + + + + + START = start_function + + + + + Initialize the parser. Arguments are a pointer to the parsed text and its + length. + + + Returns a pointer to the internal structure of a parser. Note that it should + be malloced or palloced in the + TopMemoryContext. We name it ParserState. + + + + + + + + GETTOKEN = gettoken_function + + + + + Returns the next token. + Arguments are ParserState *, char **, int *. + + + This procedure will be called as long as the procedure returns token type zero. + + + + + + + + END = end_function, + + + + + This void function will be called after parsing is finished to free + allocated resources in this procedure (ParserState). The argument + is ParserState *. + + + + + + + + LEXTYPES = lextypes_function + + + + + Returns an array containing the id, alias, and the description of the tokens + in the parser. See LexDescr in src/include/utils/ts_public.h. + + + + + + + + Below is the source code of our test parser, organized as a contrib module. + + + + Testing: + SELECT * FROM ts_parse('testparser','That''s my first own parser'); tokid | token @@ -3525,35 +3745,39 @@ SELECT * FROM ts_parse('testparser','That''s my first own parser'); 3 | own 12 | 3 | parser + SELECT to_tsvector('testcfg','That''s my first own parser'); to_tsvector ------------------------------------------------- 'my':2 'own':4 'first':3 'parser':5 'that''s':1 + SELECT ts_headline('testcfg','Supernovae stars are the brightest phenomena in galaxies', to_tsquery('testcfg', 'star')); headline ----------------------------------------------------------------- Supernovae <b>stars</b> are the brightest phenomena in galaxies - + - -This test parser is an example adopted from a tutorial by Valli, parser -HOWTO. - + + This test parser is an example adopted from a tutorial by Valli, parser + HOWTO. 
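
   Once the module is compiled and installed as shown below, the token types
   exposed by the parser's lextypes function can be checked with
   ts_token_type; the result should match the two types described above:

SELECT * FROM ts_token_type('testparser');
 tokid | alias |  description
-------+-------+---------------
     3 | word  | Word
    12 | blank | Space symbols
(2 rows)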
+ + + + To compile the example just do: - -To compile the example just do: -make -make install -psql regression < test_parser.sql +$ make +$ make install +$ psql regression < test_parser.sql - + + + + This is a test_parser.c: - -This is a test_parser.c: #ifdef PG_MODULE_MAGIC @@ -3630,7 +3854,7 @@ Datum testprs_getlexeme(PG_FUNCTION_ARGS) /* go to the next white-space character */ while ((pst->buffer)[pst->pos] != ' ' && pst->pos < pst->len) - (pst->pos)++; + (pst->pos)++; } *tlen = pst->pos - *tlen; @@ -3641,6 +3865,7 @@ Datum testprs_getlexeme(PG_FUNCTION_ARGS) PG_RETURN_INT32(type); } + Datum testprs_end(PG_FUNCTION_ARGS) { ParserState *pst = (ParserState *) PG_GETARG_POINTER(0); @@ -3673,7 +3898,7 @@ Datum testprs_lextype(PG_FUNCTION_ARGS) -This is a Makefile + This is a Makefile override CPPFLAGS := -I. $(CPPFLAGS) @@ -3698,7 +3923,7 @@ include $(top_srcdir)/contrib/contrib-global.mk endif -This is a test_parser.sql.in: + This is a test_parser.sql.in: SET default_text_search_config = 'english'; @@ -3706,41 +3931,41 @@ SET default_text_search_config = 'english'; BEGIN; CREATE FUNCTION testprs_start(internal,int4) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE 'C' with (isstrict); + RETURNS internal + AS 'MODULE_PATHNAME' + LANGUAGE 'C' with (isstrict); CREATE FUNCTION testprs_getlexeme(internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE 'C' with (isstrict); + RETURNS internal + AS 'MODULE_PATHNAME' + LANGUAGE 'C' with (isstrict); CREATE FUNCTION testprs_end(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE 'C' with (isstrict); + RETURNS void + AS 'MODULE_PATHNAME' + LANGUAGE 'C' with (isstrict); CREATE FUNCTION testprs_lextype(internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE 'C' with (isstrict); + RETURNS internal + AS 'MODULE_PATHNAME' + LANGUAGE 'C' with (isstrict); CREATE TEXT SEARCH PARSER testparser ( - START = testprs_start, - GETTOKEN = testprs_getlexeme, - END = testprs_end, - LEXTYPES = testprs_lextype -; + START = testprs_start, + GETTOKEN = testprs_getlexeme, + END = testprs_end, + LEXTYPES = testprs_lextype +); -CREATE TEXT SEARCH CONFIGURATION testcfg ( PARSER = testparser ); +CREATE TEXT SEARCH CONFIGURATION testcfg (PARSER = testparser); ALTER TEXT SEARCH CONFIGURATION testcfg ADD MAPPING FOR word WITH simple; END; - + - +