]> granicus.if.org Git - postgresql/commitdiff
Support LIKE and ILIKE index searches via contrib/pg_trgm indexes.
authorTom Lane <tgl@sss.pgh.pa.us>
Tue, 1 Feb 2011 02:33:55 +0000 (21:33 -0500)
committerTom Lane <tgl@sss.pgh.pa.us>
Tue, 1 Feb 2011 02:34:49 +0000 (21:34 -0500)
Unlike Btree-based LIKE optimization, this works for non-left-anchored
search patterns.  The effectiveness of the search depends on how many
trigrams can be extracted from the pattern.  (The worst case, with no
trigrams, degrades to a full-table scan, so this isn't a panacea.  But
it can be very useful.)

Alexander Korotkov, reviewed by Jan Urbanski

contrib/pg_trgm/expected/pg_trgm.out
contrib/pg_trgm/pg_trgm.sql.in
contrib/pg_trgm/sql/pg_trgm.sql
contrib/pg_trgm/trgm.h
contrib/pg_trgm/trgm_gin.c
contrib/pg_trgm/trgm_gist.c
contrib/pg_trgm/trgm_op.c
contrib/pg_trgm/uninstall_pg_trgm.sql
doc/src/sgml/pgtrgm.sgml

index 0532a78e4ae3c71f9873d8ddcd46ba8172375005..6e73af2a32f2fc086f687aab8c8cf52e775ea14b 100644 (file)
@@ -3467,3 +3467,93 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
  qwertyu0988 | 0.333333
 (1 row)
 
+create table test2(t text);
+insert into test2 values ('abcdef');
+insert into test2 values ('quark');
+create index test2_idx_gin on test2 using gin (t gin_trgm_ops);
+set enable_seqscan=off;
+explain (costs off)
+  select * from test2 where t like '%BCD%';
+                QUERY PLAN                
+------------------------------------------
+ Bitmap Heap Scan on test2
+   Recheck Cond: (t ~~ '%BCD%'::text)
+   ->  Bitmap Index Scan on test2_idx_gin
+         Index Cond: (t ~~ '%BCD%'::text)
+(4 rows)
+
+explain (costs off)
+  select * from test2 where t ilike '%BCD%';
+                QUERY PLAN                 
+-------------------------------------------
+ Bitmap Heap Scan on test2
+   Recheck Cond: (t ~~* '%BCD%'::text)
+   ->  Bitmap Index Scan on test2_idx_gin
+         Index Cond: (t ~~* '%BCD%'::text)
+(4 rows)
+
+select * from test2 where t like '%BCD%';
+ t 
+---
+(0 rows)
+
+select * from test2 where t like '%bcd%';
+   t    
+--------
+ abcdef
+(1 row)
+
+select * from test2 where t ilike '%BCD%';
+   t    
+--------
+ abcdef
+(1 row)
+
+select * from test2 where t ilike 'qua%';
+   t   
+-------
+ quark
+(1 row)
+
+drop index test2_idx_gin;
+create index test2_idx_gist on test2 using gist (t gist_trgm_ops);
+set enable_seqscan=off;
+explain (costs off)
+  select * from test2 where t like '%BCD%';
+                QUERY PLAN                
+------------------------------------------
+ Index Scan using test2_idx_gist on test2
+   Index Cond: (t ~~ '%BCD%'::text)
+(2 rows)
+
+explain (costs off)
+  select * from test2 where t ilike '%BCD%';
+                QUERY PLAN                
+------------------------------------------
+ Index Scan using test2_idx_gist on test2
+   Index Cond: (t ~~* '%BCD%'::text)
+(2 rows)
+
+select * from test2 where t like '%BCD%';
+ t 
+---
+(0 rows)
+
+select * from test2 where t like '%bcd%';
+   t    
+--------
+ abcdef
+(1 row)
+
+select * from test2 where t ilike '%BCD%';
+   t    
+--------
+ abcdef
+(1 row)
+
+select * from test2 where t ilike 'qua%';
+   t   
+-------
+ quark
+(1 row)
+
index 3e116e8306f3234157cceb559412ab71068d2410..12a8c21071b5377293dace8c09a8b3a3630b1248 100644 (file)
@@ -113,6 +113,8 @@ FOR TYPE text USING gist
 AS
         OPERATOR        1       % (text, text),
         OPERATOR        2       <-> (text, text) FOR ORDER BY pg_catalog.float_ops,
+        OPERATOR        3       pg_catalog.~~ (text, text),
+        OPERATOR        4       pg_catalog.~~* (text, text),
         FUNCTION        1       gtrgm_consistent (internal, text, int, oid, internal),
         FUNCTION        2       gtrgm_union (bytea, internal),
         FUNCTION        3       gtrgm_compress (internal),
@@ -124,17 +126,17 @@ AS
         STORAGE         gtrgm;
 
 -- support functions for gin
-CREATE OR REPLACE FUNCTION gin_extract_trgm(text, internal)
+CREATE OR REPLACE FUNCTION gin_extract_value_trgm(text, internal)
 RETURNS internal
 AS 'MODULE_PATHNAME'
 LANGUAGE C IMMUTABLE STRICT;
 
-CREATE OR REPLACE FUNCTION gin_extract_trgm(text, internal, int2, internal, internal)
+CREATE OR REPLACE FUNCTION gin_extract_query_trgm(text, internal, int2, internal, internal, internal, internal)
 RETURNS internal
 AS 'MODULE_PATHNAME'
 LANGUAGE C IMMUTABLE STRICT;
 
-CREATE OR REPLACE FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal)
+CREATE OR REPLACE FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal, internal, internal)
 RETURNS bool
 AS 'MODULE_PATHNAME'
 LANGUAGE C IMMUTABLE STRICT;
@@ -144,8 +146,10 @@ CREATE OPERATOR CLASS gin_trgm_ops
 FOR TYPE text USING gin
 AS
         OPERATOR        1       % (text, text),
+        OPERATOR        3       pg_catalog.~~ (text, text),
+        OPERATOR        4       pg_catalog.~~* (text, text),
         FUNCTION        1       btint4cmp (int4, int4),
-        FUNCTION        2       gin_extract_trgm (text, internal),
-        FUNCTION        3       gin_extract_trgm (text, internal, int2, internal, internal),
-        FUNCTION        4       gin_trgm_consistent (internal, int2, text, int4, internal, internal),
+        FUNCTION        2       gin_extract_value_trgm (text, internal),
+        FUNCTION        3       gin_extract_query_trgm (text, internal, int2, internal, internal, internal, internal),
+        FUNCTION        4       gin_trgm_consistent (internal, int2, text, int4, internal, internal, internal, internal),
         STORAGE         int4;
index 5e5539c00577026f014a36ff67192a19484931d4..b8209344c3f0ed5e50e3ae27095e2023b33f3148 100644 (file)
@@ -45,3 +45,28 @@ set enable_seqscan=off;
 select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
 select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
 select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
+
+create table test2(t text);
+insert into test2 values ('abcdef');
+insert into test2 values ('quark');
+create index test2_idx_gin on test2 using gin (t gin_trgm_ops);
+set enable_seqscan=off;
+explain (costs off)
+  select * from test2 where t like '%BCD%';
+explain (costs off)
+  select * from test2 where t ilike '%BCD%';
+select * from test2 where t like '%BCD%';
+select * from test2 where t like '%bcd%';
+select * from test2 where t ilike '%BCD%';
+select * from test2 where t ilike 'qua%';
+drop index test2_idx_gin;
+create index test2_idx_gist on test2 using gist (t gist_trgm_ops);
+set enable_seqscan=off;
+explain (costs off)
+  select * from test2 where t like '%BCD%';
+explain (costs off)
+  select * from test2 where t ilike '%BCD%';
+select * from test2 where t like '%BCD%';
+select * from test2 where t like '%bcd%';
+select * from test2 where t ilike '%BCD%';
+select * from test2 where t ilike 'qua%';
index 1cc812554f7d2b574ac0f7c3617ce5f563fa38fa..f3644fcce76271f1713791184ad9a072145fb1d2 100644 (file)
 #define LPADDING               2
 #define RPADDING               1
 #define KEEPONLYALNUM
+/*
+ * Caution: IGNORECASE macro means that trigrams are case-insensitive.
+ * If this macro is disabled, the ~~* operator must be removed from the
+ * operator classes, because we can't handle case-insensitive wildcard search
+ * with case-sensitive trigrams.  Failure to do this will result in "cannot
+ * handle ~~* with case-sensitive trigrams" errors.
+ */
 #define IGNORECASE
 #define DIVUNION
 
 /* operator strategy numbers */
-#define        SimilarityStrategyNumber        1
-#define        DistanceStrategyNumber          2
+#define SimilarityStrategyNumber       1
+#define DistanceStrategyNumber         2
+#define LikeStrategyNumber                     3
+#define ILikeStrategyNumber                    4
 
 
 typedef char trgm[3];
@@ -40,7 +49,10 @@ uint32               trgm2int(trgm *ptr);
 #else
 #define ISPRINTABLECHAR(a)     ( isascii( *(unsigned char*)(a) ) && isprint( *(unsigned char*)(a) ) )
 #endif
-#define ISPRINTABLETRGM(t)     ( ISPRINTABLECHAR( ((char*)t) ) && ISPRINTABLECHAR( ((char*)t)+1 ) && ISPRINTABLECHAR( ((char*)t)+2 ) )
+#define ISPRINTABLETRGM(t)     ( ISPRINTABLECHAR( ((char*)(t)) ) && ISPRINTABLECHAR( ((char*)(t))+1 ) && ISPRINTABLECHAR( ((char*)(t))+2 ) )
+
+#define ISESCAPECHAR(x) (*(x) == '\\') /* Wildcard escape character */
+#define ISWILDCARDCHAR(x) (*(x) == '_' || *(x) == '%')  /* Wildcard meta-character */
 
 typedef struct
 {
@@ -65,7 +77,7 @@ typedef char *BITVECP;
                        for(i=0;i<SIGLEN;i++)
 
 #define GETBYTE(x,i) ( *( (BITVECP)(x) + (int)( (i) / BITBYTE ) ) )
-#define GETBITBYTE(x,i) ( ((char)(x)) >> i & 0x01 )
+#define GETBITBYTE(x,i) ( (((char)(x)) >> (i)) & 0x01 )
 #define CLRBIT(x,i)   GETBYTE(x,i) &= ~( 0x01 << ( (i) % BITBYTE ) )
 #define SETBIT(x,i)   GETBYTE(x,i) |=  ( 0x01 << ( (i) % BITBYTE ) )
 #define GETBIT(x,i) ( (GETBYTE(x,i) >> ( (i) % BITBYTE )) & 0x01 )
@@ -89,6 +101,8 @@ typedef char *BITVECP;
 extern float4 trgm_limit;
 
 TRGM      *generate_trgm(char *str, int slen);
+TRGM      *generate_wildcard_trgm(const char *str, int slen);
 float4         cnt_sml(TRGM *trg1, TRGM *trg2);
+bool           trgm_contained_by(TRGM *trg1, TRGM *trg2);
 
 #endif /* __TRGM_H__ */
index 2d3e35e23308fffb2053e26378ffb826f7df5fa6..cdaa72ea6110467173af8bd62e11e5928a930474 100644 (file)
@@ -7,6 +7,7 @@
 
 #include "access/gin.h"
 #include "access/itup.h"
+#include "access/skey.h"
 #include "access/tuptoaster.h"
 #include "storage/bufpage.h"
 #include "utils/array.h"
 PG_FUNCTION_INFO_V1(gin_extract_trgm);
 Datum          gin_extract_trgm(PG_FUNCTION_ARGS);
 
+PG_FUNCTION_INFO_V1(gin_extract_value_trgm);
+Datum          gin_extract_value_trgm(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(gin_extract_query_trgm);
+Datum          gin_extract_query_trgm(PG_FUNCTION_ARGS);
+
 PG_FUNCTION_INFO_V1(gin_trgm_consistent);
 Datum          gin_trgm_consistent(PG_FUNCTION_ARGS);
 
 /*
- * This function is used as both extractValue and extractQuery
+ * This function can only be called if a pre-9.1 version of the GIN operator
+ * class definition is present in the catalogs (probably as a consequence
+ * of upgrade-in-place).  Complain.
  */
 Datum
 gin_extract_trgm(PG_FUNCTION_ARGS)
+{
+       ereport(ERROR,
+                       (errmsg("GIN operator class for pg_trgm is out of date"),
+                        errhint("Please drop and re-create the pg_trgm catalog entries.")));
+       PG_RETURN_NULL();
+}
+
+Datum
+gin_extract_value_trgm(PG_FUNCTION_ARGS)
 {
        text       *val = (text *) PG_GETARG_TEXT_P(0);
        int32      *nentries = (int32 *) PG_GETARG_POINTER(1);
@@ -57,34 +75,124 @@ gin_extract_trgm(PG_FUNCTION_ARGS)
        PG_RETURN_POINTER(entries);
 }
 
+Datum
+gin_extract_query_trgm(PG_FUNCTION_ARGS)
+{
+       text       *val = (text *) PG_GETARG_TEXT_P(0);
+       int32      *nentries = (int32 *) PG_GETARG_POINTER(1);
+       StrategyNumber strategy = PG_GETARG_UINT16(2);
+       /* bool   **pmatch = (bool **) PG_GETARG_POINTER(3); */
+       /* Pointer    *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
+       /* bool   **nullFlags = (bool **) PG_GETARG_POINTER(5); */
+       int32      *searchMode = (int32 *) PG_GETARG_POINTER(6);
+       Datum      *entries = NULL;
+       TRGM       *trg;
+       int32           trglen;
+       trgm       *ptr;
+       int32           i;
+
+       switch (strategy)
+       {
+               case SimilarityStrategyNumber:
+                       trg = generate_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
+                       break;
+               case ILikeStrategyNumber:
+#ifndef IGNORECASE
+                       elog(ERROR, "cannot handle ~~* with case-sensitive trigrams");
+#endif
+                       /* FALL THRU */
+               case LikeStrategyNumber:
+                       /*
+                        * For wildcard search we extract all the trigrams that every
+                        * potentially-matching string must include.
+                        */
+                       trg = generate_wildcard_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
+                       break;
+               default:
+                       elog(ERROR, "unrecognized strategy number: %d", strategy);
+                       trg = NULL;             /* keep compiler quiet */
+                       break;
+       }
+
+       trglen = ARRNELEM(trg);
+       *nentries = trglen;
+
+       if (trglen > 0)
+       {
+               entries = (Datum *) palloc(sizeof(Datum) * trglen);
+               ptr = GETARR(trg);
+               for (i = 0; i < trglen; i++)
+               {
+                       int32   item = trgm2int(ptr);
+
+                       entries[i] = Int32GetDatum(item);
+                       ptr++;
+               }
+       }
+
+       /*
+        * If no trigram was extracted then we have to scan all the index.
+        */
+       if (trglen == 0)
+               *searchMode = GIN_SEARCH_MODE_ALL;
+
+       PG_RETURN_POINTER(entries);
+}
+
 Datum
 gin_trgm_consistent(PG_FUNCTION_ARGS)
 {
        bool       *check = (bool *) PG_GETARG_POINTER(0);
-       /* StrategyNumber strategy = PG_GETARG_UINT16(1); */
+       StrategyNumber strategy = PG_GETARG_UINT16(1);
        /* text    *query = PG_GETARG_TEXT_P(2); */
        int32           nkeys = PG_GETARG_INT32(3);
        /* Pointer    *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
        bool       *recheck = (bool *) PG_GETARG_POINTER(5);
-       bool            res = FALSE;
+       bool            res;
        int32           i,
-                               ntrue = 0;
+                               ntrue;
 
        /* All cases served by this function are inexact */
        *recheck = true;
 
-       /* Count the matches */
-       for (i = 0; i < nkeys; i++)
+       switch (strategy)
        {
-               if (check[i])
-                       ntrue++;
-       }
-
+               case SimilarityStrategyNumber:
+                       /* Count the matches */
+                       ntrue = 0;
+                       for (i = 0; i < nkeys; i++)
+                       {
+                               if (check[i])
+                                       ntrue++;
+                       }
 #ifdef DIVUNION
-       res = (nkeys == ntrue) ? true : ((((((float4) ntrue) / ((float4) (nkeys - ntrue)))) >= trgm_limit) ? true : false);
+                       res = (nkeys == ntrue) ? true : ((((((float4) ntrue) / ((float4) (nkeys - ntrue)))) >= trgm_limit) ? true : false);
 #else
-       res = (nkeys == 0) ? false : ((((((float4) ntrue) / ((float4) nkeys))) >= trgm_limit) ? true : false);
+                       res = (nkeys == 0) ? false : ((((((float4) ntrue) / ((float4) nkeys))) >= trgm_limit) ? true : false);
 #endif
+                       break;
+               case ILikeStrategyNumber:
+#ifndef IGNORECASE
+                       elog(ERROR, "cannot handle ~~* with case-sensitive trigrams");
+#endif
+                       /* FALL THRU */
+               case LikeStrategyNumber:
+                       /* Check if all extracted trigrams are presented. */
+                       res = true;
+                       for (i = 0; i < nkeys; i++)
+                       {
+                               if (!check[i])
+                               {
+                                       res = false;
+                                       break;
+                               }
+                       }
+                       break;
+               default:
+                       elog(ERROR, "unrecognized strategy number: %d", strategy);
+                       res = false;            /* keep compiler quiet */
+                       break;
+       }
 
        PG_RETURN_BOOL(res);
 }
index d9f3d40c17969ca7f59a22a30840ddcf4e0b4463..d83265c11c38fcb1c37e17babe4d37f5dfd8da8e 100644 (file)
@@ -195,31 +195,61 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
        TRGM       *key = (TRGM *) DatumGetPointer(entry->key);
        TRGM       *qtrg;
        bool            res;
-       char       *cache = (char *) fcinfo->flinfo->fn_extra;
-
-       /* All cases served by this function are exact */
-       *recheck = false;
-
-       if (cache == NULL || VARSIZE(cache) != VARSIZE(query) || memcmp(cache, query, VARSIZE(query)) != 0)
+       char       *cache = (char *) fcinfo->flinfo->fn_extra,
+                      *cacheContents = cache + MAXALIGN(sizeof(StrategyNumber));
+
+       /*
+        * Store both the strategy number and extracted trigrams in cache, because
+        * trigram extraction is relatively CPU-expensive.  We must include
+        * strategy number because trigram extraction depends on strategy.
+        */
+       if (cache == NULL || strategy != *((StrategyNumber *) cache) ||
+               VARSIZE(cacheContents) != VARSIZE(query) ||
+               memcmp(cacheContents, query, VARSIZE(query)) != 0)
        {
-               qtrg = generate_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ);
+               switch (strategy)
+               {
+                       case SimilarityStrategyNumber:
+                               qtrg = generate_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ);
+                               break;
+                       case ILikeStrategyNumber:
+#ifndef IGNORECASE
+                               elog(ERROR, "cannot handle ~~* with case-sensitive trigrams");
+#endif
+                               /* FALL THRU */
+                       case LikeStrategyNumber:
+                               qtrg = generate_wildcard_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ);
+                               break;
+                       default:
+                               elog(ERROR, "unrecognized strategy number: %d", strategy);
+                               qtrg = NULL;            /* keep compiler quiet */
+                               break;
+               }
 
                if (cache)
                        pfree(cache);
 
-               fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
-                                                                  MAXALIGN(VARSIZE(query)) + VARSIZE(qtrg));
+               fcinfo->flinfo->fn_extra =
+                       MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
+                                                          MAXALIGN(sizeof(StrategyNumber)) +
+                                                          MAXALIGN(VARSIZE(query)) +
+                                                          VARSIZE(qtrg));
                cache = (char *) fcinfo->flinfo->fn_extra;
+               cacheContents = cache + MAXALIGN(sizeof(StrategyNumber));
 
-               memcpy(cache, query, VARSIZE(query));
-               memcpy(cache + MAXALIGN(VARSIZE(query)), qtrg, VARSIZE(qtrg));
+               *((StrategyNumber *) cache) = strategy;
+               memcpy(cacheContents, query, VARSIZE(query));
+               memcpy(cacheContents + MAXALIGN(VARSIZE(query)), qtrg, VARSIZE(qtrg));
        }
 
-       qtrg = (TRGM *) (cache + MAXALIGN(VARSIZE(query)));
+       qtrg = (TRGM *) (cacheContents + MAXALIGN(VARSIZE(query)));
 
        switch (strategy)
        {
                case SimilarityStrategyNumber:
+                       /* Similarity search is exact */
+                       *recheck = false;
+
                        if (GIST_LEAF(entry))
                        {                                                       /* all leafs contains orig trgm */
                                float4      tmpsml = cnt_sml(key, qtrg);
@@ -242,6 +272,47 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
                                        res = (((((float8) count) / ((float8) len))) >= trgm_limit) ? true : false;
                        }
                        break;
+               case ILikeStrategyNumber:
+#ifndef IGNORECASE
+                       elog(ERROR, "cannot handle ~~* with case-sensitive trigrams");
+#endif
+                       /* FALL THRU */
+               case LikeStrategyNumber:
+                       /* Wildcard search is inexact */
+                       *recheck = true;
+
+                       /*
+                        * Check if all the extracted trigrams can be present in child
+                        * nodes.
+                        */
+                       if (GIST_LEAF(entry))
+                       {                                                       /* all leafs contains orig trgm */
+                               res = trgm_contained_by(qtrg, key);
+                       }
+                       else if (ISALLTRUE(key))
+                       {                                                       /* non-leaf contains signature */
+                               res = true;
+                       }
+                       else
+                       {                                                       /* non-leaf contains signature */
+                               int32   k,
+                                               tmp = 0,
+                                               len = ARRNELEM(qtrg);
+                               trgm *ptr = GETARR(qtrg);
+                               BITVECP sign = GETSIGN(key);
+
+                               res = true;
+                               for (k = 0; k < len; k++)
+                               {
+                                       CPTRGM(((char *) &tmp), ptr + k);
+                                       if (!GETBIT(sign, HASHVAL(tmp)))
+                                       {
+                                               res = false;
+                                               break;
+                                       }
+                               }
+                       }
+                       break;
                default:
                        elog(ERROR, "unrecognized strategy number: %d", strategy);
                        res = false;            /* keep compiler quiet */
index b97e9912baddbe6dc9bbe5acdc9f6d286ac041ce..52f9172f6d52d3e307341b1a5446082315ce8e27 100644 (file)
@@ -18,6 +18,23 @@ float4               trgm_limit = 0.3f;
 
 PG_FUNCTION_INFO_V1(set_limit);
 Datum          set_limit(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(show_limit);
+Datum          show_limit(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(show_trgm);
+Datum          show_trgm(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(similarity);
+Datum          similarity(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(similarity_dist);
+Datum          similarity_dist(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(similarity_op);
+Datum          similarity_op(PG_FUNCTION_ARGS);
+
+
 Datum
 set_limit(PG_FUNCTION_ARGS)
 {
@@ -29,8 +46,6 @@ set_limit(PG_FUNCTION_ARGS)
        PG_RETURN_FLOAT4(trgm_limit);
 }
 
-PG_FUNCTION_INFO_V1(show_limit);
-Datum          show_limit(PG_FUNCTION_ARGS);
 Datum
 show_limit(PG_FUNCTION_ARGS)
 {
@@ -120,7 +135,7 @@ cnt_trigram(trgm *tptr, char *str, int bytelen)
 #endif
 
 /*
- * Adds trigramm from words (already padded).
+ * Adds trigrams from words (already padded).
  */
 static trgm *
 make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
@@ -236,6 +251,225 @@ generate_trgm(char *str, int slen)
        return trg;
 }
 
+/*
+ * Extract the next non-wildcard part of a search string, ie, a word bounded
+ * by '_' or '%' meta-characters, non-word characters or string end.
+ *
+ * str: source string, of length lenstr bytes (need not be null-terminated)
+ * buf: where to return the substring (must be long enough)
+ * *bytelen: receives byte length of the found substring
+ * *charlen: receives character length of the found substring
+ *
+ * Returns pointer to end+1 of the found substring in the source string.
+ * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
+ *
+ * If the found word is bounded by non-word characters or string boundaries
+ * then this function will include corresponding padding spaces into buf.
+ */
+static const char *
+get_wildcard_part(const char *str, int lenstr,
+                                 char *buf, int *bytelen, int *charlen)
+{
+       const char *beginword = str;
+       const char *endword;
+       char       *s = buf;
+       bool        in_wildcard_meta = false;
+       bool        in_escape = false;
+       int         clen;
+
+       /*
+        * Find the first word character remembering whether last character was
+        * wildcard meta-character.
+        */
+       while (beginword - str < lenstr)
+       {
+               if (in_escape)
+               {
+                       in_escape = false;
+                       in_wildcard_meta = false;
+                       if (iswordchr(beginword))
+                               break;
+               }
+               else
+               {
+                       if (ISESCAPECHAR(beginword))
+                               in_escape = true;
+                       else if (ISWILDCARDCHAR(beginword))
+                               in_wildcard_meta = true;
+                       else if (iswordchr(beginword))
+                               break;
+                       else
+                               in_wildcard_meta = false;
+               }
+               beginword += pg_mblen(beginword);
+       }
+
+       /*
+        * Handle string end.
+        */
+       if (beginword - str >= lenstr)
+               return NULL;
+
+       /*
+        * Add left padding spaces if last character wasn't wildcard
+        * meta-character.
+        */
+       *charlen = 0;
+       if (!in_wildcard_meta)
+       {
+               if (LPADDING > 0)
+               {
+                       *s++ = ' ';
+                       (*charlen)++;
+                       if (LPADDING > 1)
+                       {
+                               *s++ = ' ';
+                               (*charlen)++;
+                       }
+               }
+       }
+
+       /*
+        * Copy data into buf until wildcard meta-character, non-word character or
+        * string boundary.  Strip escapes during copy.
+        */
+       endword = beginword;
+       in_wildcard_meta = false;
+       in_escape = false;
+       while (endword - str < lenstr)
+       {
+               clen = pg_mblen(endword);
+               if (in_escape)
+               {
+                       in_escape = false;
+                       in_wildcard_meta = false;
+                       if (iswordchr(endword))
+                       {
+                               memcpy(s, endword, clen);
+                               (*charlen)++;
+                               s += clen;
+                       }
+                       else
+                               break;
+               }
+               else
+               {
+                       if (ISESCAPECHAR(endword))
+                               in_escape = true;
+                       else if (ISWILDCARDCHAR(endword))
+                       {
+                               in_wildcard_meta = true;
+                               break;
+                       }
+                       else if (iswordchr(endword))
+                       {
+                               memcpy(s, endword, clen);
+                               (*charlen)++;
+                               s += clen;
+                       }
+                       else
+                       {
+                               in_wildcard_meta = false;
+                               break;
+                       }
+               }
+               endword += clen;
+       }
+
+       /*
+        * Add right padding spaces if last character wasn't wildcard
+        * meta-character.
+        */
+       if (!in_wildcard_meta)
+       {
+               if (RPADDING > 0)
+               {
+                       *s++ = ' ';
+                       (*charlen)++;
+                       if (RPADDING > 1)
+                       {
+                               *s++ = ' ';
+                               (*charlen)++;
+                       }
+               }
+       }
+
+       *bytelen = s - buf;
+       return endword;
+}
+
+/*
+ * Generates trigrams for wildcard search string.
+ *
+ * Returns array of trigrams that must occur in any string that matches the
+ * wildcard string.  For example, given pattern "a%bcd%" the trigrams
+ * " a", "bcd" would be extracted.
+ */
+TRGM *
+generate_wildcard_trgm(const char *str, int slen)
+{
+       TRGM       *trg;
+       char       *buf,
+                      *buf2;
+       trgm       *tptr;
+       int                     len,
+                               charlen,
+                               bytelen;
+       const char *eword;
+
+       trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
+       trg->flag = ARRKEY;
+       SET_VARSIZE(trg, TRGMHDRSIZE);
+
+       if (slen + LPADDING + RPADDING < 3 || slen == 0)
+               return trg;
+
+       tptr = GETARR(trg);
+
+       buf = palloc(sizeof(char) * (slen + 4));
+
+       /*
+        * Extract trigrams from each substring extracted by get_wildcard_part.
+        */
+       eword = str;
+       while ((eword = get_wildcard_part(eword, slen - (eword - str),
+                                                                         buf, &bytelen, &charlen)) != NULL)
+       {
+#ifdef IGNORECASE
+               buf2 = lowerstr_with_len(buf, bytelen);
+               bytelen = strlen(buf2);
+#else
+               buf2 = buf;
+#endif
+
+               /*
+                * count trigrams
+                */
+               tptr = make_trigrams(tptr, buf2, bytelen, charlen);
+#ifdef IGNORECASE
+               pfree(buf2);
+#endif
+       }
+
+       pfree(buf);
+
+       if ((len = tptr - GETARR(trg)) == 0)
+               return trg;
+
+       /*
+        * Make trigrams unique.
+        */
+       if (len > 0)
+       {
+               qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
+               len = unique_array(GETARR(trg), len);
+       }
+
+       SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
+
+       return trg;
+}
+
 uint32
 trgm2int(trgm *ptr)
 {
@@ -250,8 +484,6 @@ trgm2int(trgm *ptr)
        return val;
 }
 
-PG_FUNCTION_INFO_V1(show_trgm);
-Datum          show_trgm(PG_FUNCTION_ARGS);
 Datum
 show_trgm(PG_FUNCTION_ARGS)
 {
@@ -340,8 +572,44 @@ cnt_sml(TRGM *trg1, TRGM *trg2)
 
 }
 
-PG_FUNCTION_INFO_V1(similarity);
-Datum          similarity(PG_FUNCTION_ARGS);
+/*
+ * Returns whether trg2 contains all trigrams in trg1.
+ * This relies on the trigram arrays being sorted.
+ */
+bool
+trgm_contained_by(TRGM *trg1, TRGM *trg2)
+{
+       trgm       *ptr1,
+                          *ptr2;
+       int                     len1,
+                               len2;
+
+       ptr1 = GETARR(trg1);
+       ptr2 = GETARR(trg2);
+
+       len1 = ARRNELEM(trg1);
+       len2 = ARRNELEM(trg2);
+
+       while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
+       {
+               int                     res = CMPTRGM(ptr1, ptr2);
+
+               if (res < 0)
+                       return false;
+               else if (res > 0)
+                       ptr2++;
+               else
+               {
+                       ptr1++;
+                       ptr2++;
+               }
+       }
+       if (ptr1 - GETARR(trg1) < len1)
+               return false;
+       else
+               return true;
+}
+
 Datum
 similarity(PG_FUNCTION_ARGS)
 {
@@ -364,8 +632,6 @@ similarity(PG_FUNCTION_ARGS)
        PG_RETURN_FLOAT4(res);
 }
 
-PG_FUNCTION_INFO_V1(similarity_dist);
-Datum          similarity_dist(PG_FUNCTION_ARGS);
 Datum
 similarity_dist(PG_FUNCTION_ARGS)
 {
@@ -375,8 +641,6 @@ similarity_dist(PG_FUNCTION_ARGS)
        PG_RETURN_FLOAT4(1.0 - res);
 }
 
-PG_FUNCTION_INFO_V1(similarity_op);
-Datum          similarity_op(PG_FUNCTION_ARGS);
 Datum
 similarity_op(PG_FUNCTION_ARGS)
 {
index bc8f1fa983dcea388040092c5e65bfad5f87fa81..961e40ca4875a4317d68a4503449fba09c74c937 100644 (file)
@@ -25,11 +25,11 @@ DROP TYPE gtrgm CASCADE;
 
 DROP OPERATOR CLASS gin_trgm_ops USING gin;
 
-DROP FUNCTION gin_extract_trgm(text, internal);
+DROP FUNCTION gin_extract_value_trgm(text, internal);
 
-DROP FUNCTION gin_extract_trgm(text, internal, int2, internal, internal);
+DROP FUNCTION gin_extract_query_trgm(text, internal, int2, internal, internal, internal, internal);
 
-DROP FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal);
+DROP FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal, internal, internal);
 
 DROP OPERATOR % (text, text);
 
index 27a23067205f8a74c496d71061599cfb58584a3d..9210899ec2402de92ca21314b327452bda48e9e0 100644 (file)
    The <filename>pg_trgm</filename> module provides GiST and GIN index
    operator classes that allow you to create an index over a text column for
    the purpose of very fast similarity searches.  These index types support
-   the above-described similarity operators (and no other operators, so you may
-   want a regular B-tree index too).
+   the above-described similarity operators, and additionally support
+   trigram-based index searches for <literal>LIKE</> and <literal>ILIKE</>
+   queries.  (These indexes do not support equality nor simple comparison
+   operators, so you may need a regular B-tree index too.)
   </para>
 
   <para>
@@ -181,6 +183,18 @@ SELECT t, t &lt;-&gt; '<replaceable>word</>' AS dist
    a small number of the closest matches is wanted.
   </para>
 
+  <para>
+   Beginning in <productname>PostgreSQL</> 9.1, these index types also support
+   index searches for <literal>LIKE</> and <literal>ILIKE</>, for example
+<programlisting>
+SELECT * FROM test_trgm WHERE t LIKE '%foo%bar';
+</programlisting>
+   The index search works by extracting trigrams from the search string
+   and then looking these up in the index.  The more trigrams in the search
+   string, the more effective the index search is.  Unlike B-tree based
+   searches, the search string need not be left-anchored.
+  </para>
+
   <para>
    The choice between GiST and GIN indexing depends on the relative
    performance characteristics of GiST and GIN, which are discussed elsewhere.