Trim trailing whitespace

[postgresql] / doc / src / sgml / pgtrgm.sgml
diff --git a/doc/src/sgml/pgtrgm.sgml b/doc/src/sgml/pgtrgm.sgml

index f66439523a5c3dc47f5f5e7dea86b04e5224ae77..775a7b8be7933166d1e644265d0dadbbfae19d81 100644 (file)
--- a/doc/src/sgml/pgtrgm.sgml
+++ b/doc/src/sgml/pgtrgm.sgml
@@ -92,6 +92,21 @@
         (In practice this is seldom useful except for debugging.)
        </entry>
       </row>
+     <row>
+      <entry>
+       <function>word_similarity(text, text)</function>
+       <indexterm><primary>word_similarity</primary></indexterm>
+      </entry>
+      <entry><type>real</type></entry>
+      <entry>
+       Returns a number that indicates how similar the first string
+       to the most similar word of the second string. The function searches in
+       the second string a most similar word not a most similar substring.  The
+       range of the result is zero (indicating that the two strings are
+       completely dissimilar) to one (indicating that the first string is
+       identical to one of the words of the second string).
+      </entry>
+     </row>
       <row>
        <entry><function>show_limit()</function><indexterm><primary>show_limit</primary></indexterm></entry>
        <entry><type>real</type></entry>
@@ -99,7 +114,8 @@
         Returns the current similarity threshold used by the <literal>%</>
         operator.  This sets the minimum similarity between
         two words for them to be considered similar enough to
-       be misspellings of each other, for example.
+       be misspellings of each other, for example
+       (<emphasis>deprecated</emphasis>).
        </entry>
       </row>
       <row>
@@ -108,7 +124,7 @@
        <entry>
         Sets the current similarity threshold that is used by the <literal>%</>
         operator.  The threshold must be between 0 and 1 (default is 0.3).
-       Returns the same value passed in.
+       Returns the same value passed in (<emphasis>deprecated</emphasis>).
        </entry>
       </row>
      </tbody>
@@ -133,7 +149,24 @@
        <entry>
         Returns <literal>true</> if its arguments have a similarity that is
         greater than the current similarity threshold set by
-       <function>set_limit</>.
+       <varname>pg_trgm.similarity_threshold</>.
+      </entry>
+     </row>
+     <row>
+      <entry><type>text</> <literal>&lt;%</literal> <type>text</></entry>
+      <entry><type>boolean</type></entry>
+      <entry>
+       Returns <literal>true</> if its first argument has the similar word in
+       the second argument and they have a similarity that is greater than the
+       current word similarity threshold set by
+       <varname>pg_trgm.word_similarity_threshold</> parameter.
+      </entry>
+     </row>
+     <row>
+      <entry><type>text</> <literal>%&gt;</literal> <type>text</></entry>
+      <entry><type>boolean</type></entry>
+      <entry>
+       Commutator of the <literal>&lt;%</> operator.
        </entry>
       </row>
       <row>
@@ -144,11 +177,68 @@
         one minus the <function>similarity()</> value.
        </entry>
       </row>
+     <row>
+      <entry>
+       <type>text</> <literal>&lt;&lt;-&gt;</literal> <type>text</>
+      </entry>
+      <entry><type>real</type></entry>
+      <entry>
+       Returns the <quote>distance</> between the arguments, that is
+       one minus the <function>word_similarity()</> value.
+      </entry>
+     </row>
+     <row>
+      <entry>
+       <type>text</> <literal>&lt;-&gt;&gt;</literal> <type>text</>
+      </entry>
+      <entry><type>real</type></entry>
+      <entry>
+       Commutator of the <literal>&lt;&lt;-&gt;</> operator.
+      </entry>
+     </row>
      </tbody>
     </tgroup>
    </table>
   </sect2>
  
+ <sect2>
+  <title>GUC Parameters</title>
+
+  <variablelist>
+   <varlistentry id="guc-pgtrgm-similarity-threshold" xreflabel="pg_trgm.similarity_threshold">
+    <term>
+     <varname>pg_trgm.similarity_threshold</> (<type>real</type>)
+     <indexterm>
+      <primary><varname>pg_trgm.similarity_threshold</> configuration parameter</primary>
+     </indexterm>
+    </term>
+    <listitem>
+     <para>
+      Sets the current similarity threshold that is used by the <literal>%</>
+      operator.  The threshold must be between 0 and 1 (default is 0.3).
+     </para>
+    </listitem>
+   </varlistentry>
+    <varlistentry id="guc-pgtrgm-word-similarity-threshold" xreflabel="pg_trgm.word_similarity_threshold">
+     <term>
+      <varname>pg_trgm.word_similarity_threshold</> (<type>real</type>)
+      <indexterm>
+       <primary>
+        <varname>pg_trgm.word_similarity_threshold</> configuration parameter
+       </primary>
+      </indexterm>
+     </term>
+     <listitem>
+      <para>
+       Sets the current word similarity threshold that is used by
+       <literal>&lt;%</> and <literal>%&gt;</> operators.  The threshold
+       must be between 0 and 1 (default is 0.6).
+      </para>
+     </listitem>
+    </varlistentry>
+  </variablelist>
+ </sect2>
+
   <sect2>
    <title>Index Support</title>
  
@@ -168,11 +258,11 @@
  
  <programlisting>
  CREATE TABLE test_trgm (t text);
-CREATE INDEX trgm_idx ON test_trgm USING gist (t gist_trgm_ops);
+CREATE INDEX trgm_idx ON test_trgm USING GIST (t gist_trgm_ops);
  </programlisting>
  or
  <programlisting>
-CREATE INDEX trgm_idx ON test_trgm USING gin (t gin_trgm_ops);
+CREATE INDEX trgm_idx ON test_trgm USING GIN (t gin_trgm_ops);
  </programlisting>
    </para>
  
@@ -203,6 +293,33 @@ SELECT t, t &lt;-&gt; '<replaceable>word</>' AS dist
     a small number of the closest matches is wanted.
    </para>
  
+  <para>
+   Also you can use an index on the <structfield>t</> column for word
+   similarity.  For example:
+<programlisting>
+SELECT t, word_similarity('<replaceable>word</>', t) AS sml
+  FROM test_trgm
+  WHERE '<replaceable>word</>' &lt;% t
+  ORDER BY sml DESC, t;
+</programlisting>
+   This will return all values in the text column that have a word
+   which sufficiently similar to <replaceable>word</>, sorted from best
+   match to worst.  The index will be used to make this a fast operation
+   even over very large data sets.
+  </para>
+
+  <para>
+   A variant of the above query is
+<programlisting>
+SELECT t, '<replaceable>word</>' &lt;&lt;-&gt; t AS dist
+  FROM test_trgm
+  ORDER BY dist LIMIT 10;
+</programlisting>
+   This can be implemented quite efficiently by GiST indexes, but not
+   by GIN indexes.
+  </para>
+
+
    <para>
     Beginning in <productname>PostgreSQL</> 9.1, these index types also support
     index searches for <literal>LIKE</> and <literal>ILIKE</>, for example
@@ -238,9 +355,6 @@ SELECT * FROM test_trgm WHERE t ~ '(foo|bar)';
    <para>
     The choice between GiST and GIN indexing depends on the relative
     performance characteristics of GiST and GIN, which are discussed elsewhere.
-   As a rule of thumb, a GIN index is faster to search than a GiST index, but
-   slower to build or update; so GIN is better suited for static data and GiST
-   for often-updated data.
    </para>
   </sect2>
  
@@ -274,7 +388,7 @@ CREATE TABLE words AS SELECT word FROM
     Next, create a trigram index on the word column:
  
  <programlisting>
-CREATE INDEX words_idx ON words USING gin(word gin_trgm_ops);
+CREATE INDEX words_idx ON words USING GIN (word gin_trgm_ops);
  </programlisting>
  
     Now, a <command>SELECT</command> query similar to the previous example can
@@ -315,6 +429,9 @@ CREATE INDEX words_idx ON words USING gin(word gin_trgm_ops);
    <para>
     Teodor Sigaev <email>teodor@sigaev.ru</email>, Moscow, Delta-Soft Ltd.,Russia
    </para>
+  <para>
+   Alexander Korotkov <email>a.korotkov@postgrespro.ru</email>, Moscow, Postgres Professional, Russia
+  </para>
    <para>
     Documentation: Christopher Kings-Lynne
    </para>