GIN documentation and slightly improving GiST docs.

author Teodor Sigaev <teodor@sigaev.ru>

Thu, 14 Sep 2006 11:16:27 +0000 (11:16 +0000)

committer Teodor Sigaev <teodor@sigaev.ru>

Thu, 14 Sep 2006 11:16:27 +0000 (11:16 +0000)
author Teodor Sigaev <teodor@sigaev.ru>
Thu, 14 Sep 2006 11:16:27 +0000 (11:16 +0000)
committer Teodor Sigaev <teodor@sigaev.ru>
Thu, 14 Sep 2006 11:16:27 +0000 (11:16 +0000)
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml

index 2dcde4c14d1e87811237e567c5af61bf501b14cf..12f01d6470e9d853ceb9dabea23051595274fc51 100644 (file)
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/config.sgml,v 1.85 2006/09/08 15:55:52 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/config.sgml,v 1.86 2006/09/14 11:16:27 teodor Exp $ -->
  
  <chapter Id="runtime-config">
    <title>Server Configuration</title>
@@ -2172,7 +2172,20 @@ SELECT * FROM parent WHERE key = 2400;
         </para>
        </listitem>
       </varlistentry>
-     
+
+        <varlistentry id="guc-gin-fuzzy-search-limit" xreflabel="gin_fuzzy_search_limit">
+         <term><varname>gin_fuzzy_search_limit</varname> (<type>integer</type>)</term>
+         <indexterm>
+         <primary><varname>gin_fuzzy_search_limit</> configuration parameter</primary>
+         </indexterm>
+         <listitem>
+          <para>
+               Soft upper limit on the size of the set returned by a GIN index. For more
+               information see <xref linkend="gin-tips">.
+          </para>
+         </listitem>
+        </varlistentry>
+         
       </variablelist>
      </sect2>
     </sect1>
diff --git a/doc/src/sgml/filelist.sgml b/doc/src/sgml/filelist.sgml

index 23af846a67a8f94ab489e4c090cb60f3d1a010ea..7a8e37dd197eff4e6c03586957bb6d10263b14ef 100644 (file)
--- a/doc/src/sgml/filelist.sgml
+++ b/doc/src/sgml/filelist.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/filelist.sgml,v 1.46 2006/09/05 03:09:56 momjian Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/filelist.sgml,v 1.47 2006/09/14 11:16:27 teodor Exp $ -->
  
  <!entity history    SYSTEM "history.sgml">
  <!entity info       SYSTEM "info.sgml">
@@ -78,6 +78,7 @@
  <!entity catalogs   SYSTEM "catalogs.sgml">
  <!entity geqo       SYSTEM "geqo.sgml">
  <!entity gist       SYSTEM "gist.sgml">
+<!entity gin        SYSTEM "gin.sgml">
  <!entity planstats    SYSTEM "planstats.sgml">
  <!entity indexam    SYSTEM "indexam.sgml">
  <!entity nls        SYSTEM "nls.sgml">
diff --git a/doc/src/sgml/geqo.sgml b/doc/src/sgml/geqo.sgml

index e8de838a9c9675e4d28cd7ddd3a4035a4e556ac7..448b1be542c977c7308615a1fca40cdc89291c71 100644 (file)
--- a/doc/src/sgml/geqo.sgml
+++ b/doc/src/sgml/geqo.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/geqo.sgml,v 1.36 2006/03/10 19:10:48 momjian Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/geqo.sgml,v 1.37 2006/09/14 11:16:27 teodor Exp $ -->
  
   <chapter id="geqo">
    <chapterinfo>
@@ -49,8 +49,8 @@
      methods</firstterm> (e.g., nested loop, hash join, merge join in
      <productname>PostgreSQL</productname>) to process individual joins
      and a diversity of <firstterm>indexes</firstterm> (e.g.,
-    B-tree, hash, GiST in <productname>PostgreSQL</productname>) as access
-    paths for relations.
+    B-tree, hash, GiST and GIN in <productname>PostgreSQL</productname>) as 
+       access paths for relations.
     </para>
  
     <para>
diff --git a/doc/src/sgml/gin.sgml b/doc/src/sgml/gin.sgml

new file mode 100644 (file)

index 0000000..e261b0d
--- /dev/null
+++ b/doc/src/sgml/gin.sgml
@@ -0,0 +1,231 @@
+<!-- $PostgreSQL: pgsql/doc/src/sgml/gin.sgml,v 2.1 2006/09/14 11:16:27 teodor Exp $ -->
+
+<chapter id="GIN">
+<title>GIN Indexes</title>
+
+   <indexterm>
+    <primary>index</primary>
+    <secondary>GIN</secondary>
+   </indexterm>
+
+<sect1 id="gin-intro">
+ <title>Introduction</title>
+
+ <para>
+   <acronym>GIN</acronym> stands for Generalized Inverted Index.  It is
+   an index structure storing a set of (key, posting list) pairs, where
+   'posting list' is a set of rows in which the key occurs. The
+   row may contain many keys.
+ </para>
+
+ <para>
+   It is generalized in the sense that a <acronym>GIN</acronym> index
+   does not need to be aware of the operation that it accelerates.
+   Instead, it uses custom strategies defined for particular data types.
+ </para>
+
+ <para>
+  One advantage of <acronym>GIN</acronym> is that it allows the development
+  of custom data types with the appropriate access methods, by
+  an expert in the domain of the data type, rather than a database expert.
+  This is much the same advantage as using <acronym>GiST</acronym>.
+ </para>
+
+  <para>
+   The <acronym>GIN</acronym>
+    implementation in <productname>PostgreSQL</productname> is primarily
+    maintained by Teodor Sigaev and Oleg Bartunov, and there is more
+    information on their
+    <ulink url="http://www.sai.msu.su/~megera/oddmuse/index.cgi/Gin">website</ulink>.
+  </para>
+
+</sect1>
+
+<sect1 id="gin-extensibility">
+ <title>Extensibility</title>
+
+ <para>
+   The <acronym>GIN</acronym> interface has a high level of abstraction,
+   requiring the access method implementer to only implement the semantics of
+   the data type being accessed.  The <acronym>GIN</acronym> layer itself
+   takes care of concurrency, logging and searching the tree structure.
+ </para>
+
+ <para>
+   All it takes to get a <acronym>GIN</acronym> access method working
+   is to implement four user-defined methods, which define the behavior of
+   keys in the tree. In short, <acronym>GIN</acronym> combines extensibility
+   along with generality, code reuse, and a clean interface.
+ </para>
+
+</sect1>
+
+<sect1 id="gin-implementation">
+ <title>Implementation</title>
+
+ <para>
+  Internally, <acronym>GIN</acronym> consists of a B-tree index constructed 
+  over keys, where each key is an element of the indexed value 
+  (element of array, for example) and where each tuple in a leaf page is 
+  either a pointer to a B-tree over heap pointers (PT, posting tree), or a 
+  list of heap pointers (PL, posting list) if the tuple is small enough.
+ </para>
+
+ <para>
+   There are four methods that an index operator class for
+   <acronym>GIN</acronym> must provide (prototypes are in pseudocode):
+ </para>
+
+ <variablelist>
+    <varlistentry>
+     <term>int compare( Datum a, Datum b )</term>
+     <listitem>
+      <para>
+          Compares keys (not indexed values!) and returns an integer less than 
+          zero, zero, or greater than zero, indicating whether the first key is 
+          less than, equal to, or greater than the second.
+      </para>
+     </listitem>
+    </varlistentry>
+
+    <varlistentry>
+     <term>Datum* extractValue(Datum inputValue, uint32 *nkeys)</term>
+     <listitem>
+      <para>
+          Returns an array of keys of the value to be indexed; nkeys should
+          be set to the number of returned keys.
+      </para>
+     </listitem>
+    </varlistentry>
+
+    <varlistentry>
+     <term>Datum* extractQuery(Datum query, uint32 nkeys, 
+               StrategyNumber n)</term>
+     <listitem>
+      <para>
+          Returns an array of keys of the query to be executed. n contains
+          the strategy number of the operation (see <xref linkend="xindex-strategies">).
+          Depending on n, query may be of a different type.
+      </para>
+     </listitem>
+    </varlistentry>
+
+    <varlistentry>
+     <term>bool consistent( bool check[], StrategyNumber n, Datum query)</term>
+     <listitem>
+      <para>
+          Returns TRUE if the indexed value satisfies the query qualifier with
+          strategy n (or may satisfy it, in case of a RECHECK mark in the operator
+          class). Each element of the check array is TRUE if the indexed value has
+          a corresponding key in the query: if (check[i] == TRUE) the i-th key of
+          the query is present in the indexed value.
+      </para>
+     </listitem>
+    </varlistentry>
+
+  </variablelist>
+
+</sect1>
+
+<sect1 id="gin-tips">
+<title>GIN tips and tricks</title>
+
+ <variablelist>
+  <varlistentry>
+   <term>Create vs insert</term>
+   <listitem>
+       <para>
+        In most cases, insertion into a <acronym>GIN</acronym> index is slow because
+        many GIN keys may be inserted for each table row. So, when loading data
+        in bulk it may be useful to drop the index and recreate it
+        after the data is loaded into the table.
+       </para>
+   </listitem>
+  </varlistentry>
+
+  <varlistentry>
+   <term>gin_fuzzy_search_limit</term>
+   <listitem>
+       <para>
+        The primary goal of developing <acronym>GIN</acronym> indexes was
+        to support highly scalable full-text search in
+        <productname>PostgreSQL</productname> and there are often situations when 
+        a full-text search returns a very large set of results.  Since reading 
+        tuples from the disk and sorting them could take a lot of time, this is 
+        unacceptable for production.  (Note that the index search itself is very 
+        fast.) 
+    </para>
+       <para>
+        Such queries usually contain very frequent words, so the results are not 
+        very helpful. To facilitate execution of such queries 
+        <acronym>GIN</acronym> has a configurable  soft upper limit of the size 
+        of the returned set, determined by the 
+        <varname>gin_fuzzy_search_limit</varname> GUC variable.  It is set to 0 by
+        default (no limit).
+       </para>
+       <para>
+        If a non-zero search limit is set, then the returned set is a subset of 
+        the whole result set, chosen at random.
+       </para>
+       <para>
+        "Soft" means that the actual number of returned results could slightly 
+        differ from the specified limit, depending on the query and the quality 
+        of the system's random number generator.
+       </para>
+   </listitem>
+  </varlistentry>
+ </variablelist>
+
+</sect1>
+
+<sect1 id="gin-limit">
+ <title>Limitations</title>
+
+ <para>
+  <acronym>GIN</acronym> doesn't support full index scans because they would
+  be extremely inefficient: since there are many keys per value,
+  each heap pointer would be returned several times.
+ </para>
+
+ <para>
+  When extractQuery returns zero keys, <acronym>GIN</acronym> will
+  emit an error: the meaning of an empty query differs between operator
+  classes and strategies (for example, any array contains the empty array,
+  but no array overlaps the empty one), so <acronym>GIN</acronym> cannot
+  determine a reasonable answer.
+ </para>
+
+ <para>
+  <acronym>GIN</acronym> searches keys only by equality matching.  This may 
+  be improved in future.
+ </para>
+</sect1>
+<sect1 id="gin-examples">
+ <title>Examples</title>
+
+ <para>
+  The <productname>PostgreSQL</productname> source distribution includes
+  <acronym>GIN</acronym> classes for one-dimensional arrays of all internal 
+  types.  The following
+  <filename>contrib</> modules also contain <acronym>GIN</acronym>
+  operator classes: 
+ </para>
+ 
+ <variablelist>
+  <varlistentry>
+   <term>intarray</term>
+   <listitem>
+    <para>Enhanced support for int4[]</para>
+   </listitem>
+  </varlistentry>
+
+  <varlistentry>
+   <term>tsearch2</term>
+   <listitem>
+    <para>Support for inverted text indexing.  This is much faster for very
+     large, mostly-static sets of documents.
+    </para>
+   </listitem>
+  </varlistentry>
+ </variablelist>
+</sect1>
+
+</chapter>
diff --git a/doc/src/sgml/indices.sgml b/doc/src/sgml/indices.sgml

index d060d044de08c40b04048c88488b1e7d0d9fcd59..ff011da53ca1a41dcd2b7244fb2a5ae616dffb06 100644 (file)
--- a/doc/src/sgml/indices.sgml
+++ b/doc/src/sgml/indices.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/indices.sgml,v 1.61 2006/09/13 23:42:26 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/indices.sgml,v 1.62 2006/09/14 11:16:27 teodor Exp $ -->
  
  <chapter id="indexes">
   <title id="indexes-title">Indexes</title>
@@ -116,7 +116,7 @@ CREATE INDEX test1_id_index ON test1 (id);
  
    <para>
     <productname>PostgreSQL</productname> provides several index types:
-   B-tree, Hash, and GiST.  Each index type uses a different
+   B-tree, Hash, GIN and GiST.  Each index type uses a different
     algorithm that is best suited to different types of queries.
     By default, the <command>CREATE INDEX</command> command will create a
     B-tree index, which fits the most common situations.
@@ -238,6 +238,37 @@ CREATE INDEX <replaceable>name</replaceable> ON <replaceable>table</replaceable>
     classes are available in the <literal>contrib</> collection or as separate
     projects.  For more information see <xref linkend="GiST">.
    </para>
+  <para>
+   <indexterm>
+    <primary>index</primary>
+    <secondary>GIN</secondary>
+   </indexterm>
+   <indexterm>
+    <primary>GIN</primary>
+    <see>index</see>
+   </indexterm>
+   GIN is an inverted index, and it is useful for values that have more
+   than one key, such as arrays. Like GiST, GIN can support
+   many different user-defined indexing strategies and the particular 
+   operators with which a GIN index can be used vary depending on the 
+   indexing strategy.  
+   As an example, the standard distribution of
+   <productname>PostgreSQL</productname> includes GIN operator classes
+   for one-dimensional arrays, which support indexed
+   queries using these operators:
+
+   <simplelist>
+    <member><literal>&lt;@</literal></member>
+    <member><literal>@&gt;</literal></member>
+    <member><literal>=</literal></member>
+    <member><literal>&amp;&amp;</literal></member>
+   </simplelist>
+
+   (See <xref linkend="functions-array"> for the meaning of
+   these operators.)
+   Other GIN operator classes are available in the <literal>contrib</>
+   tsearch2 and intarray modules. For more information see <xref linkend="GIN">.
+  </para>
   </sect1>
  
  
diff --git a/doc/src/sgml/mvcc.sgml b/doc/src/sgml/mvcc.sgml

index eba07a80dad034063f6418bf03b937e4b6066a62..baee0d85a2eaedc56158e82668a54a3a734b85c4 100644 (file)
--- a/doc/src/sgml/mvcc.sgml
+++ b/doc/src/sgml/mvcc.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/mvcc.sgml,v 2.58 2006/09/03 01:59:09 momjian Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/mvcc.sgml,v 2.59 2006/09/14 11:16:27 teodor Exp $ -->
  
   <chapter id="mvcc">
    <title>Concurrency Control</title>
@@ -987,6 +987,20 @@ UPDATE accounts SET balance = balance - 100.00 WHERE acctnum = 22222;
         </para>
        </listitem>
       </varlistentry>
+
+     <varlistentry>
+      <term>
+       <acronym>GIN</acronym> indexes
+      </term>
+      <listitem>
+       <para>
+               Short-term share/exclusive page-level locks are used for 
+               read/write access. Locks are released immediately after each
+               index row is fetched or inserted. However, note that a GIN index
+               usually requires several inserts per table row.
+       </para>
+      </listitem>
+     </varlistentry>
      </variablelist>
     </para>
  
@@ -995,7 +1009,7 @@ UPDATE accounts SET balance = balance - 100.00 WHERE acctnum = 22222;
      applications; since they also have more features than hash
      indexes, they are the recommended index type for concurrent
      applications that need to index scalar data. When dealing with
-    non-scalar data, B-trees are not useful, and GiST indexes should
+    non-scalar data, B-trees are not useful, and GiST or GIN indexes should
      be used instead.
     </para>
    </sect1>
diff --git a/doc/src/sgml/ref/create_opclass.sgml b/doc/src/sgml/ref/create_opclass.sgml

index ed7b77b65652e15db9c2bd327a9d3826f244dae7..50742980bcd20c5f775daff01f0f73993672a608 100644 (file)
--- a/doc/src/sgml/ref/create_opclass.sgml
+++ b/doc/src/sgml/ref/create_opclass.sgml
@@ -1,5 +1,5 @@
  <!--
-$PostgreSQL: pgsql/doc/src/sgml/ref/create_opclass.sgml,v 1.15 2006/09/10 17:36:52 tgl Exp $
+$PostgreSQL: pgsql/doc/src/sgml/ref/create_opclass.sgml,v 1.16 2006/09/14 11:16:27 teodor Exp $
  PostgreSQL documentation
  -->
  
@@ -192,7 +192,7 @@ CREATE OPERATOR CLASS <replaceable class="parameter">name</replaceable> [ DEFAUL
       <para>
        The data type actually stored in the index.  Normally this is
        the same as the column data type, but some index methods
-      (only GiST at this writing) allow it to be different.  The
+      (GIN and GiST for now) allow it to be different.  The
        <literal>STORAGE</> clause must be omitted unless the index
        method allows a different type to be used.
       </para>
diff --git a/doc/src/sgml/xindex.sgml b/doc/src/sgml/xindex.sgml

index c5c34087bed19301bc0ba8e8171ff0a2b14c3ba4..3d4ef9e2bdb408880a3196fa2e258c8b23c38676 100644 (file)
--- a/doc/src/sgml/xindex.sgml
+++ b/doc/src/sgml/xindex.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/xindex.sgml,v 1.45 2006/09/05 03:09:56 momjian Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/xindex.sgml,v 1.46 2006/09/14 11:16:27 teodor Exp $ -->
  
  <sect1 id="xindex">
   <title>Interfacing Extensions To Indexes</title>
@@ -242,6 +242,44 @@
      </tgroup>
     </table>
  
+  <para>
+   GIN indexes are similar to GiST indexes in flexibility: they don't have a
+   fixed set of strategies. Instead, the <quote>consistency</> support routine
+   interprets the strategy numbers according to the operator class
+   definition. As an example, the strategies of the operator class over arrays
+   are shown in <xref linkend="xindex-gin-array-strat-table">.
+  </para>
+
+   <table tocentry="1" id="xindex-gin-array-strat-table">
+    <title>GIN Array Strategies</title>
+    <tgroup cols="2">
+     <thead>
+      <row>
+       <entry>Operation</entry>
+       <entry>Strategy Number</entry>
+      </row>
+     </thead>
+     <tbody>
+      <row>
+       <entry>overlap</entry>
+       <entry>1</entry>
+      </row>
+      <row>
+       <entry>contains</entry>
+       <entry>2</entry>
+      </row>
+      <row>
+       <entry>is contained by</entry>
+       <entry>3</entry>
+      </row>
+      <row>
+       <entry>equal</entry>
+       <entry>4</entry>
+      </row>
+     </tbody>
+    </tgroup>
+   </table>
+
    <para>
     Note that all strategy operators return Boolean values.  In
     practice, all operators defined as index method strategies must
@@ -349,37 +387,84 @@
       </thead>
       <tbody>
        <row>
-       <entry>consistent</entry>
+       <entry>consistent - determine whether key satisfies the 
+               query qualifier</entry>
         <entry>1</entry>
        </row>
        <row>
-       <entry>union</entry>
+       <entry>union - compute union of a set of given keys</entry>
         <entry>2</entry>
        </row>
        <row>
-       <entry>compress</entry>
+       <entry>compress - computes a compressed representation of a key or value
+               to be indexed</entry>
         <entry>3</entry>
        </row>
        <row>
-       <entry>decompress</entry>
+       <entry>decompress - computes a decompressed representation of a 
+          compressed key </entry>
         <entry>4</entry>
        </row>
        <row>
-       <entry>penalty</entry>
+       <entry>penalty - compute penalty for inserting new key into subtree 
+          with given subtree's key</entry>
         <entry>5</entry>
        </row>
        <row>
-       <entry>picksplit</entry>
+       <entry>picksplit - determine which entries of a page are to be moved
+          to the new page and compute the union keys for resulting pages </entry>
         <entry>6</entry>
        </row>
        <row>
-       <entry>equal</entry>
+       <entry>equal - compare two keys and return true if they are equal 
+               </entry>
         <entry>7</entry>
        </row>
       </tbody>
      </tgroup>
     </table>
  
+  <para>
+   GIN indexes require four support functions,
+   shown in <xref linkend="xindex-gin-support-table">.
+  </para>
+
+   <table tocentry="1" id="xindex-gin-support-table">
+    <title>GIN Support Functions</title>
+    <tgroup cols="2">
+     <thead>
+      <row>
+       <entry>Function</entry>
+       <entry>Support Number</entry>
+      </row>
+     </thead>
+     <tbody>
+      <row>
+       <entry>
+   compare - Compare two keys and return an integer less than zero, zero, or
+   greater than zero, indicating whether the first key is less than, equal to,
+   or greater than the second.
+          </entry>
+       <entry>1</entry>
+      </row>
+      <row>
+       <entry>extractValue - extract keys from value to be indexed</entry>
+       <entry>2</entry>
+      </row>
+      <row>
+       <entry>extractQuery - extract keys from query</entry>
+       <entry>3</entry>
+      </row>
+      <row>
+       <entry>consistent - determine whether value matches the
+                          query</entry>
+       <entry>4</entry>
+      </row>
+     </tbody>
+    </tgroup>
+   </table>
+
    <para>
     Unlike strategy operators, support functions return whichever data
     type the particular index method expects; for example in the case
author	Teodor Sigaev <teodor@sigaev.ru>
	Thu, 14 Sep 2006 11:16:27 +0000 (11:16 +0000)
committer	Teodor Sigaev <teodor@sigaev.ru>
	Thu, 14 Sep 2006 11:16:27 +0000 (11:16 +0000)
doc/src/sgml/config.sgml		patch \| blob \| history
doc/src/sgml/filelist.sgml		patch \| blob \| history
doc/src/sgml/geqo.sgml		patch \| blob \| history
doc/src/sgml/gin.sgml	[new file with mode: 0644]	patch \| blob
doc/src/sgml/indices.sgml		patch \| blob \| history
doc/src/sgml/mvcc.sgml		patch \| blob \| history
doc/src/sgml/ref/create_opclass.sgml		patch \| blob \| history
doc/src/sgml/xindex.sgml		patch \| blob \| history