Trim trailing whitespace

[postgresql] / doc / src / sgml / pgtrgm.sgml
diff --git a/doc/src/sgml/pgtrgm.sgml b/doc/src/sgml/pgtrgm.sgml

index 1b75f0292b0fec9d90c1e2082f81b24f98b1b5b5..775a7b8be7933166d1e644265d0dadbbfae19d81 100644 (file)
--- a/doc/src/sgml/pgtrgm.sgml
+++ b/doc/src/sgml/pgtrgm.sgml
@@ -1,6 +1,6 @@
  <!-- doc/src/sgml/pgtrgm.sgml -->
  
-<sect1 id="pgtrgm">
+<sect1 id="pgtrgm" xreflabel="pg_trgm">
   <title>pg_trgm</title>
  
   <indexterm zone="pgtrgm">
@@ -9,7 +9,8 @@
  
   <para>
    The <filename>pg_trgm</filename> module provides functions and operators
-  for determining the similarity of text based on trigram matching, as
+  for determining the similarity of
+  alphanumeric text based on trigram matching, as
    well as index operator classes that support fast searching for similar
    strings.
   </para>
@@ -27,7 +28,9 @@
  
    <note>
     <para>
-    A string is considered to have two spaces
+    <filename>pg_trgm</filename> ignores non-word characters
+    (non-alphanumerics) when extracting trigrams from a string.
+    Each word is considered to have two spaces
      prefixed and one space suffixed when determining the set
      of trigrams contained in the string.
      For example, the set of trigrams in the string
@@ -36,6 +39,16 @@
      <quote><literal> ca</literal></quote>,
      <quote><literal>cat</literal></quote>, and
      <quote><literal>at </literal></quote>.
+    The set of trigrams in the string
+    <quote><literal>foo|bar</literal></quote> is
+    <quote><literal>  f</literal></quote>,
+    <quote><literal> fo</literal></quote>,
+    <quote><literal>foo</literal></quote>,
+    <quote><literal>oo </literal></quote>,
+    <quote><literal>  b</literal></quote>,
+    <quote><literal> ba</literal></quote>,
+    <quote><literal>bar</literal></quote>, and
+    <quote><literal>ar </literal></quote>.
     </para>
    </note>
   </sect2>
@@ -43,8 +56,14 @@
   <sect2>
    <title>Functions and Operators</title>
  
+  <para>
+   The functions provided by the <filename>pg_trgm</filename> module
+   are shown in <xref linkend="pgtrgm-func-table">, the operators
+   in <xref linkend="pgtrgm-op-table">.
+  </para>
+
    <table id="pgtrgm-func-table">
-   <title><filename>pg_trgm</filename> functions</title>
+   <title><filename>pg_trgm</filename> Functions</title>
     <tgroup cols="3">
      <thead>
       <row>
@@ -56,7 +75,7 @@
  
      <tbody>
       <row>
-      <entry><function>similarity(text, text)</function></entry>
+      <entry><function>similarity(text, text)</function><indexterm><primary>similarity</primary></indexterm></entry>
        <entry><type>real</type></entry>
        <entry>
         Returns a number that indicates how similar the two arguments are.
@@ -66,7 +85,7 @@
        </entry>
       </row>
       <row>
-      <entry><function>show_trgm(text)</function></entry>
+      <entry><function>show_trgm(text)</function><indexterm><primary>show_trgm</primary></indexterm></entry>
        <entry><type>text[]</type></entry>
        <entry>
         Returns an array of all the trigrams in the given string.
@@ -74,22 +93,38 @@
        </entry>
       </row>
       <row>
-      <entry><function>show_limit()</function></entry>
+      <entry>
+       <function>word_similarity(text, text)</function>
+       <indexterm><primary>word_similarity</primary></indexterm>
+      </entry>
+      <entry><type>real</type></entry>
+      <entry>
+       Returns a number that indicates how similar the first string
+       to the most similar word of the second string. The function searches in
+       the second string a most similar word not a most similar substring.  The
+       range of the result is zero (indicating that the two strings are
+       completely dissimilar) to one (indicating that the first string is
+       identical to one of the words of the second string).
+      </entry>
+     </row>
+     <row>
+      <entry><function>show_limit()</function><indexterm><primary>show_limit</primary></indexterm></entry>
        <entry><type>real</type></entry>
        <entry>
         Returns the current similarity threshold used by the <literal>%</>
         operator.  This sets the minimum similarity between
         two words for them to be considered similar enough to
-       be misspellings of each other, for example.
+       be misspellings of each other, for example
+       (<emphasis>deprecated</emphasis>).
        </entry>
       </row>
       <row>
-      <entry><function>set_limit(real)</function></entry>
+      <entry><function>set_limit(real)</function><indexterm><primary>set_limit</primary></indexterm></entry>
        <entry><type>real</type></entry>
        <entry>
         Sets the current similarity threshold that is used by the <literal>%</>
         operator.  The threshold must be between 0 and 1 (default is 0.3).
-       Returns the same value passed in.
+       Returns the same value passed in (<emphasis>deprecated</emphasis>).
        </entry>
       </row>
      </tbody>
@@ -97,7 +132,7 @@
    </table>
  
    <table id="pgtrgm-op-table">
-   <title><filename>pg_trgm</filename> operators</title>
+   <title><filename>pg_trgm</filename> Operators</title>
     <tgroup cols="3">
      <thead>
       <row>
@@ -114,7 +149,24 @@
        <entry>
         Returns <literal>true</> if its arguments have a similarity that is
         greater than the current similarity threshold set by
-       <function>set_limit</>.
+       <varname>pg_trgm.similarity_threshold</>.
+      </entry>
+     </row>
+     <row>
+      <entry><type>text</> <literal>&lt;%</literal> <type>text</></entry>
+      <entry><type>boolean</type></entry>
+      <entry>
+       Returns <literal>true</> if its first argument has the similar word in
+       the second argument and they have a similarity that is greater than the
+       current word similarity threshold set by
+       <varname>pg_trgm.word_similarity_threshold</> parameter.
+      </entry>
+     </row>
+     <row>
+      <entry><type>text</> <literal>%&gt;</literal> <type>text</></entry>
+      <entry><type>boolean</type></entry>
+      <entry>
+       Commutator of the <literal>&lt;%</> operator.
        </entry>
       </row>
       <row>
@@ -125,11 +177,68 @@
         one minus the <function>similarity()</> value.
        </entry>
       </row>
+     <row>
+      <entry>
+       <type>text</> <literal>&lt;&lt;-&gt;</literal> <type>text</>
+      </entry>
+      <entry><type>real</type></entry>
+      <entry>
+       Returns the <quote>distance</> between the arguments, that is
+       one minus the <function>word_similarity()</> value.
+      </entry>
+     </row>
+     <row>
+      <entry>
+       <type>text</> <literal>&lt;-&gt;&gt;</literal> <type>text</>
+      </entry>
+      <entry><type>real</type></entry>
+      <entry>
+       Commutator of the <literal>&lt;&lt;-&gt;</> operator.
+      </entry>
+     </row>
      </tbody>
     </tgroup>
    </table>
   </sect2>
  
+ <sect2>
+  <title>GUC Parameters</title>
+
+  <variablelist>
+   <varlistentry id="guc-pgtrgm-similarity-threshold" xreflabel="pg_trgm.similarity_threshold">
+    <term>
+     <varname>pg_trgm.similarity_threshold</> (<type>real</type>)
+     <indexterm>
+      <primary><varname>pg_trgm.similarity_threshold</> configuration parameter</primary>
+     </indexterm>
+    </term>
+    <listitem>
+     <para>
+      Sets the current similarity threshold that is used by the <literal>%</>
+      operator.  The threshold must be between 0 and 1 (default is 0.3).
+     </para>
+    </listitem>
+   </varlistentry>
+    <varlistentry id="guc-pgtrgm-word-similarity-threshold" xreflabel="pg_trgm.word_similarity_threshold">
+     <term>
+      <varname>pg_trgm.word_similarity_threshold</> (<type>real</type>)
+      <indexterm>
+       <primary>
+        <varname>pg_trgm.word_similarity_threshold</> configuration parameter
+       </primary>
+      </indexterm>
+     </term>
+     <listitem>
+      <para>
+       Sets the current word similarity threshold that is used by
+       <literal>&lt;%</> and <literal>%&gt;</> operators.  The threshold
+       must be between 0 and 1 (default is 0.6).
+      </para>
+     </listitem>
+    </varlistentry>
+  </variablelist>
+ </sect2>
+
   <sect2>
    <title>Index Support</title>
  
@@ -137,8 +246,11 @@
     The <filename>pg_trgm</filename> module provides GiST and GIN index
     operator classes that allow you to create an index over a text column for
     the purpose of very fast similarity searches.  These index types support
-   the above-described similarity operators (and no other operators, so you may
-   want a regular B-tree index too).
+   the above-described similarity operators, and additionally support
+   trigram-based index searches for <literal>LIKE</>, <literal>ILIKE</>,
+   <literal>~</> and <literal>~*</> queries.  (These indexes do not
+   support equality nor simple comparison operators, so you may need a
+   regular B-tree index too.)
    </para>
  
    <para>
@@ -146,11 +258,11 @@
  
  <programlisting>
  CREATE TABLE test_trgm (t text);
-CREATE INDEX trgm_idx ON test_trgm USING gist (t gist_trgm_ops);
+CREATE INDEX trgm_idx ON test_trgm USING GIST (t gist_trgm_ops);
  </programlisting>
  or
  <programlisting>
-CREATE INDEX trgm_idx ON test_trgm USING gin (t gin_trgm_ops);
+CREATE INDEX trgm_idx ON test_trgm USING GIN (t gin_trgm_ops);
  </programlisting>
    </para>
  
@@ -181,12 +293,68 @@ SELECT t, t &lt;-&gt; '<replaceable>word</>' AS dist
     a small number of the closest matches is wanted.
    </para>
  
+  <para>
+   Also you can use an index on the <structfield>t</> column for word
+   similarity.  For example:
+<programlisting>
+SELECT t, word_similarity('<replaceable>word</>', t) AS sml
+  FROM test_trgm
+  WHERE '<replaceable>word</>' &lt;% t
+  ORDER BY sml DESC, t;
+</programlisting>
+   This will return all values in the text column that have a word
+   which sufficiently similar to <replaceable>word</>, sorted from best
+   match to worst.  The index will be used to make this a fast operation
+   even over very large data sets.
+  </para>
+
+  <para>
+   A variant of the above query is
+<programlisting>
+SELECT t, '<replaceable>word</>' &lt;&lt;-&gt; t AS dist
+  FROM test_trgm
+  ORDER BY dist LIMIT 10;
+</programlisting>
+   This can be implemented quite efficiently by GiST indexes, but not
+   by GIN indexes.
+  </para>
+
+
+  <para>
+   Beginning in <productname>PostgreSQL</> 9.1, these index types also support
+   index searches for <literal>LIKE</> and <literal>ILIKE</>, for example
+<programlisting>
+SELECT * FROM test_trgm WHERE t LIKE '%foo%bar';
+</programlisting>
+   The index search works by extracting trigrams from the search string
+   and then looking these up in the index.  The more trigrams in the search
+   string, the more effective the index search is.  Unlike B-tree based
+   searches, the search string need not be left-anchored.
+  </para>
+
+  <para>
+   Beginning in <productname>PostgreSQL</> 9.3, these index types also support
+   index searches for regular-expression matches
+   (<literal>~</> and <literal>~*</> operators), for example
+<programlisting>
+SELECT * FROM test_trgm WHERE t ~ '(foo|bar)';
+</programlisting>
+   The index search works by extracting trigrams from the regular expression
+   and then looking these up in the index.  The more trigrams that can be
+   extracted from the regular expression, the more effective the index search
+   is.  Unlike B-tree based searches, the search string need not be
+   left-anchored.
+  </para>
+
+  <para>
+   For both <literal>LIKE</> and regular-expression searches, keep in mind
+   that a pattern with no extractable trigrams will degenerate to a full-index
+   scan.
+  </para>
+
    <para>
     The choice between GiST and GIN indexing depends on the relative
     performance characteristics of GiST and GIN, which are discussed elsewhere.
-   As a rule of thumb, a GIN index is faster to search than a GiST index, but
-   slower to build or update; so GIN is better suited for static data and GiST
-   for often-updated data.
    </para>
   </sect2>
  
@@ -220,7 +388,7 @@ CREATE TABLE words AS SELECT word FROM
     Next, create a trigram index on the word column:
  
  <programlisting>
-CREATE INDEX words_idx ON words USING gin(word gin_trgm_ops);
+CREATE INDEX words_idx ON words USING GIN (word gin_trgm_ops);
  </programlisting>
  
     Now, a <command>SELECT</command> query similar to the previous example can
@@ -261,6 +429,9 @@ CREATE INDEX words_idx ON words USING gin(word gin_trgm_ops);
    <para>
     Teodor Sigaev <email>teodor@sigaev.ru</email>, Moscow, Delta-Soft Ltd.,Russia
    </para>
+  <para>
+   Alexander Korotkov <email>a.korotkov@postgrespro.ru</email>, Moscow, Postgres Professional, Russia
+  </para>
    <para>
     Documentation: Christopher Kings-Lynne
    </para>