]> granicus.if.org Git - postgresql/commitdiff
Make contrib/pg_trgm also support regex searches with GiST indexes.
authorTom Lane <tgl@sss.pgh.pa.us>
Wed, 10 Apr 2013 17:30:14 +0000 (13:30 -0400)
committerTom Lane <tgl@sss.pgh.pa.us>
Wed, 10 Apr 2013 17:31:02 +0000 (13:31 -0400)
This wasn't addressed in the original patch, but it doesn't take very
much additional code to cover the case, so let's get it done.

Since pg_trgm 1.1 hasn't been released yet, I just changed the definition
of what's in it, rather than inventing a 1.2.

contrib/pg_trgm/expected/pg_trgm.out
contrib/pg_trgm/pg_trgm--1.0--1.1.sql
contrib/pg_trgm/pg_trgm--1.1.sql
contrib/pg_trgm/sql/pg_trgm.sql
contrib/pg_trgm/trgm.h
contrib/pg_trgm/trgm_gin.c
contrib/pg_trgm/trgm_gist.c
contrib/pg_trgm/trgm_op.c
contrib/pg_trgm/trgm_regexp.c
doc/src/sgml/pgtrgm.sgml

index 0ba44fa6a052ab84a4f70ef07a6dab511abc79ac..13b1fde1b8ba058bb69b8d8e7b2ad9e090ca6be6 100644 (file)
@@ -3706,3 +3706,135 @@ select * from test2 where t ilike 'qua%';
  quark
 (1 row)
 
+select * from test2 where t like '%z foo bar%';
+      t      
+-------------
+   z foo bar
+(1 row)
+
+select * from test2 where t like '  z foo%';
+      t      
+-------------
+   z foo bar
+(1 row)
+
+explain (costs off)
+  select * from test2 where t ~ '[abc]{3}';
+                QUERY PLAN                
+------------------------------------------
+ Index Scan using test2_idx_gist on test2
+   Index Cond: (t ~ '[abc]{3}'::text)
+(2 rows)
+
+explain (costs off)
+  select * from test2 where t ~* 'DEF';
+                QUERY PLAN                
+------------------------------------------
+ Index Scan using test2_idx_gist on test2
+   Index Cond: (t ~* 'DEF'::text)
+(2 rows)
+
+select * from test2 where t ~ '[abc]{3}';
+   t    
+--------
+ abcdef
+(1 row)
+
+select * from test2 where t ~ 'a[bc]+d';
+   t    
+--------
+ abcdef
+(1 row)
+
+select * from test2 where t ~ '(abc)*$';
+      t      
+-------------
+ abcdef
+ quark
+   z foo bar
+(3 rows)
+
+select * from test2 where t ~* 'DEF';
+   t    
+--------
+ abcdef
+(1 row)
+
+select * from test2 where t ~ 'dEf';
+ t 
+---
+(0 rows)
+
+select * from test2 where t ~* '^q';
+   t   
+-------
+ quark
+(1 row)
+
+select * from test2 where t ~* '[abc]{3}[def]{3}';
+   t    
+--------
+ abcdef
+(1 row)
+
+select * from test2 where t ~* 'ab[a-z]{3}';
+   t    
+--------
+ abcdef
+(1 row)
+
+select * from test2 where t ~* '(^| )qua';
+   t   
+-------
+ quark
+(1 row)
+
+select * from test2 where t ~ 'q.*rk$';
+   t   
+-------
+ quark
+(1 row)
+
+select * from test2 where t ~ 'q';
+   t   
+-------
+ quark
+(1 row)
+
+select * from test2 where t ~ '[a-z]{3}';
+      t      
+-------------
+ abcdef
+ quark
+   z foo bar
+(3 rows)
+
+select * from test2 where t ~* '(a{10}|b{10}|c{10}){10}';
+ t 
+---
+(0 rows)
+
+select * from test2 where t ~ 'z foo bar';
+      t      
+-------------
+   z foo bar
+(1 row)
+
+select * from test2 where t ~ ' z foo bar';
+      t      
+-------------
+   z foo bar
+(1 row)
+
+select * from test2 where t ~ '  z foo bar';
+      t      
+-------------
+   z foo bar
+(1 row)
+
+select * from test2 where t ~ '  z foo';
+      t      
+-------------
+   z foo bar
+(1 row)
+
index 449f840084b34d99f3a688b38a28dc2f3ab27a06..65bbb1cfde6329a3347dae7db63136701083e68e 100644 (file)
@@ -3,6 +3,10 @@
 -- complain if script is sourced in psql, rather than via CREATE EXTENSION
 \echo Use "ALTER EXTENSION pg_trgm UPDATE TO '1.1'" to load this file. \quit
 
+ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
+        OPERATOR        5       pg_catalog.~ (text, text),
+        OPERATOR        6       pg_catalog.~* (text, text);
+
 ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD
         OPERATOR        5       pg_catalog.~ (text, text),
         OPERATOR        6       pg_catalog.~* (text, text);
index 5d28339738e69b492ef4bd6bd7e9b88e22037e91..1fff7af2c480fe7b201ba524e9559fa4d179172c 100644 (file)
@@ -132,6 +132,12 @@ ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
         OPERATOR        4       pg_catalog.~~* (text, text),
         FUNCTION        8 (text, text)  gtrgm_distance (internal, text, int, oid);
 
+-- Add operators that are new in 9.3.
+
+ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
+        OPERATOR        5       pg_catalog.~ (text, text),
+        OPERATOR        6       pg_catalog.~* (text, text);
+
 -- support functions for gin
 CREATE FUNCTION gin_extract_value_trgm(text, internal)
 RETURNS internal
index 37a4c247056dcdaa3746904591a16e2376c29e4a..7b02d988187a06dcaca078f8d3105ba705eca849 100644 (file)
@@ -90,3 +90,26 @@ select * from test2 where t like '%bcd%';
 select * from test2 where t like E'%\\bcd%';
 select * from test2 where t ilike '%BCD%';
 select * from test2 where t ilike 'qua%';
+select * from test2 where t like '%z foo bar%';
+select * from test2 where t like '  z foo%';
+explain (costs off)
+  select * from test2 where t ~ '[abc]{3}';
+explain (costs off)
+  select * from test2 where t ~* 'DEF';
+select * from test2 where t ~ '[abc]{3}';
+select * from test2 where t ~ 'a[bc]+d';
+select * from test2 where t ~ '(abc)*$';
+select * from test2 where t ~* 'DEF';
+select * from test2 where t ~ 'dEf';
+select * from test2 where t ~* '^q';
+select * from test2 where t ~* '[abc]{3}[def]{3}';
+select * from test2 where t ~* 'ab[a-z]{3}';
+select * from test2 where t ~* '(^| )qua';
+select * from test2 where t ~ 'q.*rk$';
+select * from test2 where t ~ 'q';
+select * from test2 where t ~ '[a-z]{3}';
+select * from test2 where t ~* '(a{10}|b{10}|c{10}){10}';
+select * from test2 where t ~ 'z foo bar';
+select * from test2 where t ~ ' z foo bar';
+select * from test2 where t ~ '  z foo bar';
+select * from test2 where t ~ '  z foo';
index 15e7bebb0011c328fc8215f0b8e160a6e8657b7a..ed649b8dccd57cf4e34b39a3d9b1146d19478c20 100644 (file)
@@ -113,8 +113,9 @@ extern TRGM *generate_trgm(char *str, int slen);
 extern TRGM *generate_wildcard_trgm(const char *str, int slen);
 extern float4 cnt_sml(TRGM *trg1, TRGM *trg2);
 extern bool trgm_contained_by(TRGM *trg1, TRGM *trg2);
-extern TRGM *createTrgmNFA(text *text_re, TrgmPackedGraph **graph,
-                         Oid collation);
+extern bool *trgm_presence_map(TRGM *query, TRGM *key);
+extern TRGM *createTrgmNFA(text *text_re, Oid collation,
+                         TrgmPackedGraph **graph, MemoryContext rcontext);
 extern bool trigramsMatchGraph(TrgmPackedGraph *graph, bool *check);
 
 #endif   /* __TRGM_H__ */
index e828571563105b5621c9330db236e6d84581e5bd..1fbbd9ca35ca4498ebecc7c3f30b52494ea55e9a 100644 (file)
@@ -115,7 +115,8 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS)
 #endif
                        /* FALL THRU */
                case RegExpStrategyNumber:
-                       trg = createTrgmNFA(val, &graph, PG_GET_COLLATION());
+                       trg = createTrgmNFA(val, PG_GET_COLLATION(),
+                                                               &graph, CurrentMemoryContext);
                        if (trg && ARRNELEM(trg) > 0)
                        {
                                /*
index 605d7ea3569a5a40733cce6e486bb39fd2c5d865..178f073755b11803781d22d8406c84ab196d03bd 100644 (file)
@@ -8,6 +8,25 @@
 #include "access/skey.h"
 
 
+typedef struct
+{
+       /* most recent inputs to gtrgm_consistent */
+       StrategyNumber strategy;
+       text       *query;
+       /* extracted trigrams for query */
+       TRGM       *trigrams;
+       /* if a regex operator, the extracted graph */
+       TrgmPackedGraph *graph;
+
+       /*
+        * The "query" and "trigrams" are stored in the same palloc block as this
+        * cache struct, at MAXALIGN'ed offsets.  The graph however isn't.
+        */
+} gtrgm_consistent_cache;
+
+#define GETENTRY(vec,pos) ((TRGM *) DatumGetPointer((vec)->vector[(pos)].key))
+
+
 PG_FUNCTION_INFO_V1(gtrgm_in);
 Datum          gtrgm_in(PG_FUNCTION_ARGS);
 
@@ -38,8 +57,6 @@ Datum         gtrgm_penalty(PG_FUNCTION_ARGS);
 PG_FUNCTION_INFO_V1(gtrgm_picksplit);
 Datum          gtrgm_picksplit(PG_FUNCTION_ARGS);
 
-#define GETENTRY(vec,pos) ((TRGM *) DatumGetPointer((vec)->vector[(pos)].key))
-
 /* Number of one-bits in an unsigned byte */
 static const uint8 number_of_ones[256] = {
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
@@ -191,24 +208,30 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
        TRGM       *qtrg;
        bool            res;
        Size            querysize = VARSIZE(query);
-       char       *cache = (char *) fcinfo->flinfo->fn_extra,
-                          *cachedQuery = cache + MAXALIGN(sizeof(StrategyNumber));
+       gtrgm_consistent_cache *cache;
 
        /*
-        * Store both the strategy number and extracted trigrams in cache, because
-        * trigram extraction is relatively CPU-expensive.      We must include
-        * strategy number because trigram extraction depends on strategy.
+        * We keep the extracted trigrams in cache, because trigram extraction is
+        * relatively CPU-expensive.  When trying to reuse a cached value, check
+        * strategy number not just query itself, because trigram extraction
+        * depends on strategy.
         *
-        * The cached structure contains the strategy number, then the input query
-        * (starting at a MAXALIGN boundary), then the TRGM value (also starting
-        * at a MAXALIGN boundary).
+        * The cached structure is a single palloc chunk containing the
+        * gtrgm_consistent_cache header, then the input query (starting at a
+        * MAXALIGN boundary), then the TRGM value (also starting at a MAXALIGN
+        * boundary).  However we don't try to include the regex graph (if any) in
+        * that struct.  (XXX currently, this approach can leak regex graphs
+        * across index rescans.  Not clear if that's worth fixing.)
         */
+       cache = (gtrgm_consistent_cache *) fcinfo->flinfo->fn_extra;
        if (cache == NULL ||
-               strategy != *((StrategyNumber *) cache) ||
-               VARSIZE(cachedQuery) != querysize ||
-               memcmp(cachedQuery, query, querysize) != 0)
+               cache->strategy != strategy ||
+               VARSIZE(cache->query) != querysize ||
+               memcmp((char *) cache->query, (char *) query, querysize) != 0)
        {
-               char       *newcache;
+               gtrgm_consistent_cache *newcache;
+               TrgmPackedGraph *graph = NULL;
+               Size            qtrgsize;
 
                switch (strategy)
                {
@@ -225,28 +248,58 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
                                qtrg = generate_wildcard_trgm(VARDATA(query),
                                                                                          querysize - VARHDRSZ);
                                break;
+                       case RegExpICaseStrategyNumber:
+#ifndef IGNORECASE
+                               elog(ERROR, "cannot handle ~* with case-sensitive trigrams");
+#endif
+                               /* FALL THRU */
+                       case RegExpStrategyNumber:
+                               qtrg = createTrgmNFA(query, PG_GET_COLLATION(),
+                                                                        &graph, fcinfo->flinfo->fn_mcxt);
+                               /* just in case an empty array is returned ... */
+                               if (qtrg && ARRNELEM(qtrg) <= 0)
+                               {
+                                       pfree(qtrg);
+                                       qtrg = NULL;
+                               }
+                               break;
                        default:
                                elog(ERROR, "unrecognized strategy number: %d", strategy);
                                qtrg = NULL;    /* keep compiler quiet */
                                break;
                }
 
-               newcache = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
-                                                                         MAXALIGN(sizeof(StrategyNumber)) +
-                                                                         MAXALIGN(querysize) +
-                                                                         VARSIZE(qtrg));
-               cachedQuery = newcache + MAXALIGN(sizeof(StrategyNumber));
+               qtrgsize = qtrg ? VARSIZE(qtrg) : 0;
 
-               *((StrategyNumber *) newcache) = strategy;
-               memcpy(cachedQuery, query, querysize);
-               memcpy(cachedQuery + MAXALIGN(querysize), qtrg, VARSIZE(qtrg));
+               newcache = (gtrgm_consistent_cache *)
+                       MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
+                                                          MAXALIGN(sizeof(gtrgm_consistent_cache)) +
+                                                          MAXALIGN(querysize) +
+                                                          qtrgsize);
+
+               newcache->strategy = strategy;
+               newcache->query = (text *)
+                       ((char *) newcache + MAXALIGN(sizeof(gtrgm_consistent_cache)));
+               memcpy((char *) newcache->query, (char *) query, querysize);
+               if (qtrg)
+               {
+                       newcache->trigrams = (TRGM *)
+                               ((char *) newcache->query + MAXALIGN(querysize));
+                       memcpy((char *) newcache->trigrams, (char *) qtrg, qtrgsize);
+                       /* release qtrg in case it was made in fn_mcxt */
+                       pfree(qtrg);
+               }
+               else
+                       newcache->trigrams = NULL;
+               newcache->graph = graph;
 
                if (cache)
                        pfree(cache);
-               fcinfo->flinfo->fn_extra = newcache;
+               fcinfo->flinfo->fn_extra = (void *) newcache;
+               cache = newcache;
        }
 
-       qtrg = (TRGM *) (cachedQuery + MAXALIGN(querysize));
+       qtrg = cache->trigrams;
 
        switch (strategy)
        {
@@ -317,6 +370,57 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
                                }
                        }
                        break;
+               case RegExpICaseStrategyNumber:
+#ifndef IGNORECASE
+                       elog(ERROR, "cannot handle ~* with case-sensitive trigrams");
+#endif
+                       /* FALL THRU */
+               case RegExpStrategyNumber:
+                       /* Regexp search is inexact */
+                       *recheck = true;
+
+                       /* Check regex match as much as we can with available info */
+                       if (qtrg)
+                       {
+                               if (GIST_LEAF(entry))
+                               {                               /* all leafs contains orig trgm */
+                                       bool       *check;
+
+                                       check = trgm_presence_map(qtrg, key);
+                                       res = trigramsMatchGraph(cache->graph, check);
+                                       pfree(check);
+                               }
+                               else if (ISALLTRUE(key))
+                               {                               /* non-leaf contains signature */
+                                       res = true;
+                               }
+                               else
+                               {                               /* non-leaf contains signature */
+                                       int32           k,
+                                                               tmp = 0,
+                                                               len = ARRNELEM(qtrg);
+                                       trgm       *ptr = GETARR(qtrg);
+                                       BITVECP         sign = GETSIGN(key);
+
+                                       /* descend only if at least one trigram is present */
+                                       res = false;
+                                       for (k = 0; k < len; k++)
+                                       {
+                                               CPTRGM(((char *) &tmp), ptr + k);
+                                               if (GETBIT(sign, HASHVAL(tmp)))
+                                               {
+                                                       res = true;
+                                                       break;
+                                               }
+                                       }
+                               }
+                       }
+                       else
+                       {
+                               /* trigram-free query must be rechecked everywhere */
+                               res = true;
+                       }
+                       break;
                default:
                        elog(ERROR, "unrecognized strategy number: %d", strategy);
                        res = false;            /* keep compiler quiet */
index 49e94f57a84abfc9604a3fbcd26d602d71ff540e..76e470c77855fd7b6bbd85299edef5907ff83fc7 100644 (file)
@@ -616,6 +616,50 @@ trgm_contained_by(TRGM *trg1, TRGM *trg2)
                return true;
 }
 
+/*
+ * Return a palloc'd boolean array showing, for each trigram in "query",
+ * whether it is present in the trigram array "key".
+ * This relies on the "key" array being sorted, but "query" need not be.
+ */
+bool *
+trgm_presence_map(TRGM *query, TRGM *key)
+{
+       bool       *result;
+       trgm       *ptrq = GETARR(query),
+                          *ptrk = GETARR(key);
+       int                     lenq = ARRNELEM(query),
+                               lenk = ARRNELEM(key),
+                               i;
+
+       result = (bool *) palloc0(lenq * sizeof(bool));
+
+       /* for each query trigram, do a binary search in the key array */
+       for (i = 0; i < lenq; i++)
+       {
+               int                     lo = 0;
+               int                     hi = lenk;
+
+               while (lo < hi)
+               {
+                       int                     mid = (lo + hi) / 2;
+                       int                     res = CMPTRGM(ptrq, ptrk + mid);
+
+                       if (res < 0)
+                               hi = mid;
+                       else if (res > 0)
+                               lo = mid + 1;
+                       else
+                       {
+                               result[i] = true;
+                               break;
+                       }
+               }
+               ptrq++;
+       }
+
+       return result;
+}
+
 Datum
 similarity(PG_FUNCTION_ARGS)
 {
index 72e9b208fe5a54d8d39b3d43cb7c05d83c7c5cde..772fc44b3c4d8cffcb24d154e27fc1a6f89580ed 100644 (file)
@@ -476,10 +476,13 @@ static void printTrgmPackedGraph(TrgmPackedGraph *packedGraph, TRGM *trigrams);
  *
  * Returns an array of trigrams required by the regular expression, or NULL if
  * the regular expression was too complex to analyze.  In addition, a packed
- * graph representation of the regex is returned into *graph.
+ * graph representation of the regex is returned into *graph.  The results
+ * must be allocated in rcontext (which might or might not be the current
+ * context).
  */
 TRGM *
-createTrgmNFA(text *text_re, TrgmPackedGraph **graph, Oid collation)
+createTrgmNFA(text *text_re, Oid collation,
+                         TrgmPackedGraph **graph, MemoryContext rcontext)
 {
        TRGM       *trg;
        regex_t         regex;
@@ -488,10 +491,9 @@ createTrgmNFA(text *text_re, TrgmPackedGraph **graph, Oid collation)
 
        /*
         * This processing generates a great deal of cruft, which we'd like to
-        * clean up before returning (since this function is normally called in a
+        * clean up before returning (since this function may be called in a
         * query-lifespan memory context).      Make a temp context we can work in so
-        * that cleanup is easy.  Note that the returned data structures must be
-        * allocated in caller's context, however.
+        * that cleanup is easy.
         */
        tmpcontext = AllocSetContextCreate(CurrentMemoryContext,
                                                                           "createTrgmNFA temporary context",
@@ -516,7 +518,7 @@ createTrgmNFA(text *text_re, TrgmPackedGraph **graph, Oid collation)
         */
        PG_TRY();
        {
-               trg = createTrgmNFAInternal(&regex, graph, oldcontext);
+               trg = createTrgmNFAInternal(&regex, graph, rcontext);
        }
        PG_CATCH();
        {
index 4572750f4d76995fee4b36f3a7f1f3af4bdaef4e..9039f03e8b93a0aa2bba5ccae4477470edbc45bd 100644 (file)
@@ -216,8 +216,8 @@ SELECT * FROM test_trgm WHERE t LIKE '%foo%bar';
   </para>
 
   <para>
-   Beginning in <productname>PostgreSQL</> 9.3, <filename>pg_trgm</filename>
-   GIN indexes also support index searches for regular-expression matches
+   Beginning in <productname>PostgreSQL</> 9.3, these index types also support
+   index searches for regular-expression matches
    (<literal>~</> and <literal>~*</> operators), for example
 <programlisting>
 SELECT * FROM test_trgm WHERE t ~ '(foo|bar)';