granicus.if.org Git - postgresql/commitdiff
Add KNNGIST support to contrib/pg_trgm.
author Tom Lane <tgl@sss.pgh.pa.us>
Sat, 4 Dec 2010 05:16:21 +0000 (00:16 -0500)
committer Tom Lane <tgl@sss.pgh.pa.us>
Sat, 4 Dec 2010 05:16:21 +0000 (00:16 -0500)
Teodor Sigaev, with some revision by Tom

contrib/pg_trgm/expected/pg_trgm.out
contrib/pg_trgm/pg_trgm.sql.in
contrib/pg_trgm/sql/pg_trgm.sql
contrib/pg_trgm/trgm.h
contrib/pg_trgm/trgm_gin.c
contrib/pg_trgm/trgm_gist.c
contrib/pg_trgm/trgm_op.c
contrib/pg_trgm/uninstall_pg_trgm.sql
doc/src/sgml/pgtrgm.sgml

index 98385347295d70f6b618d6ff40f0af733d54c17c..0532a78e4ae3c71f9873d8ddcd46ba8172375005 100644 (file)
@@ -1187,6 +1187,13 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
  qwertyu0988 | 0.333333
 (1 row)
 
+select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
+ ?column? |      t      
+----------+-------------
+ 0.411765 | qwertyu0988
+      0.5 | qwertyu0987
+(2 rows)
+
 create index trgm_idx on test_trgm using gist (t gist_trgm_ops);
 set enable_seqscan=off;
 select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
@@ -2315,6 +2322,22 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
  qwertyu0988 | 0.333333
 (1 row)
 
+explain (costs off)
+select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
+                    QUERY PLAN                     
+---------------------------------------------------
+ Limit
+   ->  Index Scan using trgm_idx on test_trgm
+         Order By: (t <-> 'q0987wertyu0988'::text)
+(3 rows)
+
+select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
+ ?column? |      t      
+----------+-------------
+ 0.411765 | qwertyu0988
+      0.5 | qwertyu0987
+(2 rows)
+
 drop index trgm_idx;
 create index trgm_idx on test_trgm using gin (t gin_trgm_ops);
 set enable_seqscan=off;
index cce6cd9872f140ac26000e4009d5800ab7328a1b..3e116e8306f3234157cceb559412ab71068d2410 100644 (file)
@@ -26,7 +26,7 @@ LANGUAGE C STRICT IMMUTABLE;
 CREATE OR REPLACE FUNCTION similarity_op(text,text)
 RETURNS bool
 AS 'MODULE_PATHNAME'
-LANGUAGE C STRICT STABLE;
+LANGUAGE C STRICT STABLE;  -- stable because depends on trgm_limit
 
 CREATE OPERATOR % (
         LEFTARG = text,
@@ -37,6 +37,18 @@ CREATE OPERATOR % (
         JOIN = contjoinsel
 );
 
+CREATE OR REPLACE FUNCTION similarity_dist(text,text)
+RETURNS float4
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT IMMUTABLE;  -- implements the <-> distance operator created below
+
+CREATE OPERATOR <-> (
+        LEFTARG = text,
+        RIGHTARG = text,
+        PROCEDURE = similarity_dist,
+        COMMUTATOR = '<->'  -- the operator is declared as its own commutator
+);
+
 -- gist key
 CREATE OR REPLACE FUNCTION gtrgm_in(cstring)
 RETURNS gtrgm
@@ -60,6 +72,11 @@ RETURNS bool
 AS 'MODULE_PATHNAME'
 LANGUAGE C IMMUTABLE STRICT;
 
+CREATE OR REPLACE FUNCTION gtrgm_distance(internal,text,int,oid)
+RETURNS float8
+AS 'MODULE_PATHNAME'
+LANGUAGE C IMMUTABLE STRICT;  -- registered as support FUNCTION 8 of gist_trgm_ops
+
 CREATE OR REPLACE FUNCTION gtrgm_compress(internal)
 RETURNS internal
 AS 'MODULE_PATHNAME'
@@ -95,6 +112,7 @@ CREATE OPERATOR CLASS gist_trgm_ops
 FOR TYPE text USING gist
 AS
         OPERATOR        1       % (text, text),
+        OPERATOR        2       <-> (text, text) FOR ORDER BY pg_catalog.float_ops,
         FUNCTION        1       gtrgm_consistent (internal, text, int, oid, internal),
         FUNCTION        2       gtrgm_union (bytea, internal),
         FUNCTION        3       gtrgm_compress (internal),
@@ -102,6 +120,7 @@ AS
         FUNCTION        5       gtrgm_penalty (internal, internal, internal),
         FUNCTION        6       gtrgm_picksplit (internal, internal),
         FUNCTION        7       gtrgm_same (gtrgm, gtrgm, internal),
+        FUNCTION        8       gtrgm_distance (internal, text, int, oid),
         STORAGE         gtrgm;
 
 -- support functions for gin
index 20d86b8a0d28057f491ecd60e682a69e9ae0c699..5e5539c00577026f014a36ff67192a19484931d4 100644 (file)
@@ -26,6 +26,7 @@ CREATE TABLE test_trgm(t text);
 select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
 select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
 select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
+select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
 
 create index trgm_idx on test_trgm using gist (t gist_trgm_ops);
 set enable_seqscan=off;
@@ -33,6 +34,9 @@ set enable_seqscan=off;
 select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
 select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
 select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
+explain (costs off)
+select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
+select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
 
 drop index trgm_idx;
 create index trgm_idx on test_trgm using gin (t gin_trgm_ops);
index 85826733f55c58cfc244e60136cc7054c97c517d..1cc812554f7d2b574ac0f7c3617ce5f563fa38fa 100644 (file)
@@ -4,12 +4,10 @@
 #ifndef __TRGM_H__
 #define __TRGM_H__
 
-#include "postgres.h"
-
 #include "access/gist.h"
 #include "access/itup.h"
-#include "utils/builtins.h"
 #include "storage/bufpage.h"
+#include "utils/builtins.h"
 
 /* options */
 #define LPADDING               2
 #define IGNORECASE
 #define DIVUNION
 
+/* operator strategy numbers */
+#define        SimilarityStrategyNumber        1
+#define        DistanceStrategyNumber          2
+
 
 typedef char trgm[3];
 
@@ -89,4 +91,4 @@ extern float4 trgm_limit;
 TRGM      *generate_trgm(char *str, int slen);
 float4         cnt_sml(TRGM *trg1, TRGM *trg2);
 
-#endif
+#endif /* __TRGM_H__ */
index 3ce0b2deb55974a8bc7d869a8fe98aa96218d38a..a5a94ca6755a693532d4846e6a56082578ce03c1 100644 (file)
@@ -1,6 +1,8 @@
 /*
  * contrib/pg_trgm/trgm_gin.c
  */
+#include "postgres.h"
+
 #include "trgm.h"
 
 #include "access/gin.h"
@@ -10,6 +12,7 @@
 #include "utils/array.h"
 #include "utils/builtins.h"
 
+
 PG_FUNCTION_INFO_V1(gin_extract_trgm);
 Datum          gin_extract_trgm(PG_FUNCTION_ARGS);
 
index 567b2f878ff67218011884c2b3f720a537082889..d9f3d40c17969ca7f59a22a30840ddcf4e0b4463 100644 (file)
@@ -1,15 +1,19 @@
 /*
  * contrib/pg_trgm/trgm_gist.c
  */
+#include "postgres.h"
+
 #include "trgm.h"
 
 #include "access/gist.h"
 #include "access/itup.h"
+#include "access/skey.h"
 #include "access/tuptoaster.h"
 #include "storage/bufpage.h"
 #include "utils/array.h"
 #include "utils/builtins.h"
 
+
 PG_FUNCTION_INFO_V1(gtrgm_in);
 Datum          gtrgm_in(PG_FUNCTION_ARGS);
 
@@ -25,6 +29,9 @@ Datum         gtrgm_decompress(PG_FUNCTION_ARGS);
 PG_FUNCTION_INFO_V1(gtrgm_consistent);
 Datum          gtrgm_consistent(PG_FUNCTION_ARGS);
 
+PG_FUNCTION_INFO_V1(gtrgm_distance);
+Datum          gtrgm_distance(PG_FUNCTION_ARGS);
+
 PG_FUNCTION_INFO_V1(gtrgm_union);
 Datum          gtrgm_union(PG_FUNCTION_ARGS);
 
@@ -159,18 +166,35 @@ gtrgm_decompress(PG_FUNCTION_ARGS)
        }
 }
 
+static int4                                            /* sum of the signature bits hit by the trigrams of qtrg */
+cnt_sml_sign_common(TRGM *qtrg, BITVECP sign)
+{
+       int4            count = 0;
+       int4            k,
+                               len = ARRNELEM(qtrg);
+       trgm       *ptr = GETARR(qtrg);
+       int4            tmp = 0;                        /* NOTE(review): presumably zeroed because CPTRGM fills only the trigram bytes -- confirm */
+
+       for (k = 0; k < len; k++)
+       {
+               CPTRGM(((char *) &tmp), ptr + k);       /* load the k-th query trigram into tmp */
+               count += GETBIT(sign, HASHVAL(tmp));    /* add the signature bit selected by HASHVAL(tmp) */
+       }
+
+       return count;
+}
+
 Datum
 gtrgm_consistent(PG_FUNCTION_ARGS)
 {
        GISTENTRY  *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
        text       *query = PG_GETARG_TEXT_P(1);
-
-       /* StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); */
+       StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
        /* Oid          subtype = PG_GETARG_OID(3); */
        bool       *recheck = (bool *) PG_GETARG_POINTER(4);
        TRGM       *key = (TRGM *) DatumGetPointer(entry->key);
        TRGM       *qtrg;
-       bool            res = false;
+       bool            res;
        char       *cache = (char *) fcinfo->flinfo->fn_extra;
 
        /* All cases served by this function are exact */
@@ -193,39 +217,95 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
 
        qtrg = (TRGM *) (cache + MAXALIGN(VARSIZE(query)));
 
-       if (GIST_LEAF(entry))
-       {                                                       /* all leafs contains orig trgm */
-               float4          tmpsml = cnt_sml(key, qtrg);
+       switch (strategy)
+       {
+               case SimilarityStrategyNumber:
+                       if (GIST_LEAF(entry))
+                       {                                                       /* all leafs contains orig trgm */
+                               float4      tmpsml = cnt_sml(key, qtrg);
 
-               /* strange bug at freebsd 5.2.1 and gcc 3.3.3 */
-               res = (*(int *) &tmpsml == *(int *) &trgm_limit || tmpsml > trgm_limit) ? true : false;
+                               /* strange bug at freebsd 5.2.1 and gcc 3.3.3 */
+                               res = (*(int *) &tmpsml == *(int *) &trgm_limit || tmpsml > trgm_limit) ? true : false;
+                       }
+                       else if (ISALLTRUE(key))
+                       {                                                       /* non-leaf contains signature */
+                               res = true;
+                       }
+                       else
+                       {                                                       /* non-leaf contains signature */
+                               int4 count = cnt_sml_sign_common(qtrg, GETSIGN(key));
+                               int4 len = ARRNELEM(qtrg);
+
+                               if (len == 0)
+                                       res = false;
+                               else
+                                       res = (((((float8) count) / ((float8) len))) >= trgm_limit) ? true : false;
+                       }
+                       break;
+               default:
+                       elog(ERROR, "unrecognized strategy number: %d", strategy);
+                       res = false;            /* keep compiler quiet */
+                       break;
        }
-       else if (ISALLTRUE(key))
-       {                                                       /* non-leaf contains signature */
-               res = true;
+
+       PG_RETURN_BOOL(res);
+}
+
+Datum                                                  /* GiST distance function backing ORDER BY with the <-> operator */
+gtrgm_distance(PG_FUNCTION_ARGS)
+{
+       GISTENTRY  *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
+       text       *query = PG_GETARG_TEXT_P(1);
+       StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
+       /* Oid          subtype = PG_GETARG_OID(3); */
+       TRGM       *key = (TRGM *) DatumGetPointer(entry->key);
+       TRGM       *qtrg;
+       float8          res;
+       char       *cache = (char *) fcinfo->flinfo->fn_extra;
+
+       if (cache == NULL || VARSIZE(cache) != VARSIZE(query) || memcmp(cache, query, VARSIZE(query)) != 0)
+       {                                                       /* no cache yet, or the query text differs: rebuild it */
+               qtrg = generate_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ);
+
+               if (cache)
+                       pfree(cache);
+
+               fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
+                                                                  MAXALIGN(VARSIZE(query)) + VARSIZE(qtrg));
+               cache = (char *) fcinfo->flinfo->fn_extra;
+
+               memcpy(cache, query, VARSIZE(query));   /* cache layout: query text first ... */
+               memcpy(cache + MAXALIGN(VARSIZE(query)), qtrg, VARSIZE(qtrg));  /* ... then its trigrams at the MAXALIGN'd offset */
        }
-       else
-       {                                                       /* non-leaf contains signature */
-               int4            count = 0;
-               int4            k,
-                                       len = ARRNELEM(qtrg);
-               trgm       *ptr = GETARR(qtrg);
-               BITVECP         sign = GETSIGN(key);
-               int4            tmp = 0;
 
-               for (k = 0; k < len; k++)
-               {
-                       CPTRGM(((char *) &tmp), ptr + k);
-                       count += GETBIT(sign, HASHVAL(tmp));
-               }
-#ifdef DIVUNION
-               res = (len == count) ? true : ((((((float4) count) / ((float4) (len - count)))) >= trgm_limit) ? true : false);
-#else
-               res = (len == 0) ? false : ((((((float4) count) / ((float4) len))) >= trgm_limit) ? true : false);
-#endif
+       qtrg = (TRGM *) (cache + MAXALIGN(VARSIZE(query)));
+
+       switch (strategy)
+       {
+               case DistanceStrategyNumber:
+                       if (GIST_LEAF(entry))
+                       {                                                       /* every leaf holds the original trigrams */
+                               res = 1.0 - cnt_sml(key, qtrg);
+                       }
+                       else if (ISALLTRUE(key))
+                       {                                                       /* non-leaf key is an all-true signature */
+                               res = 0.0;
+                       }
+                       else
+                       {                                                       /* non-leaf contains signature */
+                               int4 count = cnt_sml_sign_common(qtrg, GETSIGN(key));
+                               int4 len = ARRNELEM(qtrg);
+
+                               res = (len == 0) ? -1.0 : 1.0 - ((float8) count) / ((float8) len);
+                       }
+                       break;
+               default:
+                       elog(ERROR, "unrecognized strategy number: %d", strategy);
+                       res = 0;                        /* keep compiler quiet */
+                       break;
        }
 
-       PG_RETURN_BOOL(res);
+       PG_RETURN_FLOAT8(res);
 }
 
 static int4
index e15c826e1897571e1bbf138f9d29c49e5903cdf2..b97e9912baddbe6dc9bbe5acdc9f6d286ac041ce 100644 (file)
@@ -1,11 +1,16 @@
 /*
  * contrib/pg_trgm/trgm_op.c
  */
-#include "trgm.h"
+#include "postgres.h"
+
 #include <ctype.h>
-#include "utils/array.h"
+
+#include "trgm.h"
+
 #include "catalog/pg_type.h"
 #include "tsearch/ts_locale.h"
+#include "utils/array.h"
+
 
 PG_MODULE_MAGIC;
 
@@ -359,16 +364,25 @@ similarity(PG_FUNCTION_ARGS)
        PG_RETURN_FLOAT4(res);
 }
 
+PG_FUNCTION_INFO_V1(similarity_dist);
+Datum          similarity_dist(PG_FUNCTION_ARGS);
+Datum                                                  /* returns one minus similarity() of the two arguments */
+similarity_dist(PG_FUNCTION_ARGS)
+{
+       float4          res = DatumGetFloat4(DirectFunctionCall2(similarity,
+                                                                                                                PG_GETARG_DATUM(0),
+                                                                                                                PG_GETARG_DATUM(1)));
+       PG_RETURN_FLOAT4(1.0 - res);    /* distance = 1 - similarity */
+}
+
 PG_FUNCTION_INFO_V1(similarity_op);
 Datum          similarity_op(PG_FUNCTION_ARGS);
 Datum
 similarity_op(PG_FUNCTION_ARGS)
 {
-       float4          res = DatumGetFloat4(DirectFunctionCall2(
-                                                                                                                similarity,
+       float4          res = DatumGetFloat4(DirectFunctionCall2(similarity,
                                                                                                                 PG_GETARG_DATUM(0),
-                                                                                                                PG_GETARG_DATUM(1)
-                                                                                                                ));
+                                                                                                                PG_GETARG_DATUM(1)));
 
        PG_RETURN_BOOL(res >= trgm_limit);      /* true when the similarity reaches trgm_limit */
 }
index 6706dd133e1fdaba917ca99a00e29584342a142a..bc8f1fa983dcea388040092c5e65bfad5f87fa81 100644 (file)
@@ -19,6 +19,8 @@ DROP FUNCTION gtrgm_compress(internal);
 
 DROP FUNCTION gtrgm_consistent(internal,text,int,oid,internal);
 
+DROP FUNCTION gtrgm_distance(internal,text,int,oid);
+
 DROP TYPE gtrgm CASCADE;
 
 DROP OPERATOR CLASS gin_trgm_ops USING gin;
@@ -33,6 +35,10 @@ DROP OPERATOR % (text, text);
 
 DROP FUNCTION similarity_op(text,text);
 
+DROP OPERATOR <-> (text, text);
+
+DROP FUNCTION similarity_dist(text,text);
+
 DROP FUNCTION similarity(text,text);
 
 DROP FUNCTION show_trgm(text);
index 376ab85823b2c946d204416eb2862f662556dd50..1b75f0292b0fec9d90c1e2082f81b24f98b1b5b5 100644 (file)
        <function>set_limit</>.
       </entry>
      </row>
+     <row>
+      <entry><type>text</> <literal>&lt;-&gt;</literal> <type>text</></entry>
+      <entry><type>real</type></entry>
+      <entry>
+       Returns the <quote>distance</> between the arguments, that is
+       one minus the <function>similarity()</> value.
+      </entry>
+     </row>
     </tbody>
    </tgroup>
   </table>
    The <filename>pg_trgm</filename> module provides GiST and GIN index
    operator classes that allow you to create an index over a text column for
    the purpose of very fast similarity searches.  These index types support
-   the <literal>%</> similarity operator (and no other operators, so you may
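+   the <literal>%</> similarity operator, and the GiST operator class also
+   supports ordering by the <literal>&lt;-&gt;</> distance operator (no other
+   operators are supported, so you may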
    want a regular B-tree index too).
   </para>
 
@@ -161,6 +169,18 @@ SELECT t, similarity(t, '<replaceable>word</>') AS sml
    sets.
   </para>
 
+  <para>
+   A variant of the above query is
+<programlisting>
+SELECT t, t &lt;-&gt; '<replaceable>word</>' AS dist
+  FROM test_trgm
+  ORDER BY dist LIMIT 10;
+</programlisting>
+   This can be implemented quite efficiently by GiST indexes, but not
+   by GIN indexes.  It will usually beat the first formulation when only
+   a small number of the closest matches is wanted.
+  </para>
+
   <para>
    The choice between GiST and GIN indexing depends on the relative
    performance characteristics of GiST and GIN, which are discussed elsewhere.