From: Tom Lane Date: Tue, 1 Feb 2011 02:33:55 +0000 (-0500) Subject: Support LIKE and ILIKE index searches via contrib/pg_trgm indexes. X-Git-Tag: REL9_1_ALPHA4~313 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=6e2f3ae8842392c46ccc91a9ce4bba92296890cb;p=postgresql Support LIKE and ILIKE index searches via contrib/pg_trgm indexes. Unlike Btree-based LIKE optimization, this works for non-left-anchored search patterns. The effectiveness of the search depends on how many trigrams can be extracted from the pattern. (The worst case, with no trigrams, degrades to a full-table scan, so this isn't a panacea. But it can be very useful.) Alexander Korotkov, reviewed by Jan Urbanski --- diff --git a/contrib/pg_trgm/expected/pg_trgm.out b/contrib/pg_trgm/expected/pg_trgm.out index 0532a78e4a..6e73af2a32 100644 --- a/contrib/pg_trgm/expected/pg_trgm.out +++ b/contrib/pg_trgm/expected/pg_trgm.out @@ -3467,3 +3467,93 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198 qwertyu0988 | 0.333333 (1 row) +create table test2(t text); +insert into test2 values ('abcdef'); +insert into test2 values ('quark'); +create index test2_idx_gin on test2 using gin (t gin_trgm_ops); +set enable_seqscan=off; +explain (costs off) + select * from test2 where t like '%BCD%'; + QUERY PLAN +------------------------------------------ + Bitmap Heap Scan on test2 + Recheck Cond: (t ~~ '%BCD%'::text) + -> Bitmap Index Scan on test2_idx_gin + Index Cond: (t ~~ '%BCD%'::text) +(4 rows) + +explain (costs off) + select * from test2 where t ilike '%BCD%'; + QUERY PLAN +------------------------------------------- + Bitmap Heap Scan on test2 + Recheck Cond: (t ~~* '%BCD%'::text) + -> Bitmap Index Scan on test2_idx_gin + Index Cond: (t ~~* '%BCD%'::text) +(4 rows) + +select * from test2 where t like '%BCD%'; + t +--- +(0 rows) + +select * from test2 where t like '%bcd%'; + t +-------- + abcdef +(1 row) + +select * from test2 where t ilike '%BCD%'; + t +-------- + abcdef +(1 row) + +select * from test2 where t ilike 'qua%'; + t +------- + quark +(1 row) + +drop index test2_idx_gin; +create index test2_idx_gist on test2 using gist (t gist_trgm_ops); +set enable_seqscan=off; +explain (costs off) + select * from test2 where t like '%BCD%'; + QUERY PLAN +------------------------------------------ + Index Scan using test2_idx_gist on test2 + Index Cond: (t ~~ '%BCD%'::text) +(2 rows) + +explain (costs off) + select * from test2 where t ilike '%BCD%'; + QUERY PLAN +------------------------------------------ + Index Scan using test2_idx_gist on test2 + Index Cond: (t ~~* '%BCD%'::text) +(2 rows) + +select * from test2 where t like '%BCD%'; + t +--- +(0 rows) + +select * from test2 where t like '%bcd%'; + t +-------- + abcdef +(1 row) + +select * from test2 where t ilike '%BCD%'; + t +-------- + abcdef +(1 row) + +select * from test2 where t ilike 'qua%'; + t +------- + quark +(1 row) + diff --git a/contrib/pg_trgm/pg_trgm.sql.in b/contrib/pg_trgm/pg_trgm.sql.in index 3e116e8306..12a8c21071 100644 --- a/contrib/pg_trgm/pg_trgm.sql.in +++ b/contrib/pg_trgm/pg_trgm.sql.in @@ -113,6 +113,8 @@ FOR TYPE text USING gist AS OPERATOR 1 % (text, text), OPERATOR 2 <-> (text, text) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 3 pg_catalog.~~ (text, text), + OPERATOR 4 pg_catalog.~~* (text, text), FUNCTION 1 gtrgm_consistent (internal, text, int, oid, internal), FUNCTION 2 gtrgm_union (bytea, internal), FUNCTION 3 gtrgm_compress (internal), @@ -124,17 +126,17 @@ AS STORAGE gtrgm; -- support functions for gin -CREATE OR REPLACE FUNCTION gin_extract_trgm(text, internal) +CREATE OR REPLACE FUNCTION gin_extract_value_trgm(text, internal) RETURNS internal AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT; -CREATE OR REPLACE FUNCTION gin_extract_trgm(text, internal, int2, internal, internal) +CREATE OR REPLACE FUNCTION gin_extract_query_trgm(text, internal, int2, internal, internal, internal, internal) RETURNS internal AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT; -CREATE OR REPLACE FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal) +CREATE OR REPLACE FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal, internal, internal) RETURNS bool AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT; @@ -144,8 +146,10 @@ CREATE OPERATOR CLASS gin_trgm_ops FOR TYPE text USING gin AS OPERATOR 1 % (text, text), + OPERATOR 3 pg_catalog.~~ (text, text), + OPERATOR 4 pg_catalog.~~* (text, text), FUNCTION 1 btint4cmp (int4, int4), - FUNCTION 2 gin_extract_trgm (text, internal), - FUNCTION 3 gin_extract_trgm (text, internal, int2, internal, internal), - FUNCTION 4 gin_trgm_consistent (internal, int2, text, int4, internal, internal), + FUNCTION 2 gin_extract_value_trgm (text, internal), + FUNCTION 3 gin_extract_query_trgm (text, internal, int2, internal, internal, internal, internal), + FUNCTION 4 gin_trgm_consistent (internal, int2, text, int4, internal, internal, internal, internal), STORAGE int4; diff --git a/contrib/pg_trgm/sql/pg_trgm.sql b/contrib/pg_trgm/sql/pg_trgm.sql index 5e5539c005..b8209344c3 100644 --- a/contrib/pg_trgm/sql/pg_trgm.sql +++ b/contrib/pg_trgm/sql/pg_trgm.sql @@ -45,3 +45,28 @@ set enable_seqscan=off; select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t; select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t; select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t; + +create table test2(t text); +insert into test2 values ('abcdef'); +insert into test2 values ('quark'); +create index test2_idx_gin on test2 using gin (t gin_trgm_ops); +set enable_seqscan=off; +explain (costs off) + select * from test2 where t like '%BCD%'; +explain (costs off) + select * from test2 where t ilike '%BCD%'; +select * from test2 where t like '%BCD%'; +select * from test2 where t like '%bcd%'; +select * from test2 where t ilike '%BCD%'; +select * from test2 where t ilike 'qua%'; +drop index test2_idx_gin; +create index test2_idx_gist on test2 using gist (t gist_trgm_ops); +set enable_seqscan=off; +explain (costs off) + select * from test2 where t like '%BCD%'; +explain (costs off) + select * from test2 where t ilike '%BCD%'; +select * from test2 where t like '%BCD%'; +select * from test2 where t like '%bcd%'; +select * from test2 where t ilike '%BCD%'; +select * from test2 where t ilike 'qua%'; diff --git a/contrib/pg_trgm/trgm.h b/contrib/pg_trgm/trgm.h index 1cc812554f..f3644fcce7 100644 --- a/contrib/pg_trgm/trgm.h +++ b/contrib/pg_trgm/trgm.h @@ -13,12 +13,21 @@ #define LPADDING 2 #define RPADDING 1 #define KEEPONLYALNUM +/* + * Caution: IGNORECASE macro means that trigrams are case-insensitive. + * If this macro is disabled, the ~~* operator must be removed from the + * operator classes, because we can't handle case-insensitive wildcard search + * with case-sensitive trigrams. Failure to do this will result in "cannot + * handle ~~* with case-sensitive trigrams" errors. + */ #define IGNORECASE #define DIVUNION /* operator strategy numbers */ -#define SimilarityStrategyNumber 1 -#define DistanceStrategyNumber 2 +#define SimilarityStrategyNumber 1 +#define DistanceStrategyNumber 2 +#define LikeStrategyNumber 3 +#define ILikeStrategyNumber 4 typedef char trgm[3]; @@ -40,7 +49,10 @@ uint32 trgm2int(trgm *ptr); #else #define ISPRINTABLECHAR(a) ( isascii( *(unsigned char*)(a) ) && isprint( *(unsigned char*)(a) ) ) #endif -#define ISPRINTABLETRGM(t) ( ISPRINTABLECHAR( ((char*)t) ) && ISPRINTABLECHAR( ((char*)t)+1 ) && ISPRINTABLECHAR( ((char*)t)+2 ) ) +#define ISPRINTABLETRGM(t) ( ISPRINTABLECHAR( ((char*)(t)) ) && ISPRINTABLECHAR( ((char*)(t))+1 ) && ISPRINTABLECHAR( ((char*)(t))+2 ) ) + +#define ISESCAPECHAR(x) (*(x) == '\\') /* Wildcard escape character */ +#define ISWILDCARDCHAR(x) (*(x) == '_' || *(x) == '%') /* Wildcard meta-character */ typedef struct { @@ -65,7 +77,7 @@ typedef char *BITVECP; for(i=0;i> i & 0x01 ) +#define GETBITBYTE(x,i) ( (((char)(x)) >> (i)) & 0x01 ) #define CLRBIT(x,i) GETBYTE(x,i) &= ~( 0x01 << ( (i) % BITBYTE ) ) #define SETBIT(x,i) GETBYTE(x,i) |= ( 0x01 << ( (i) % BITBYTE ) ) #define GETBIT(x,i) ( (GETBYTE(x,i) >> ( (i) % BITBYTE )) & 0x01 ) @@ -89,6 +101,8 @@ typedef char *BITVECP; extern float4 trgm_limit; TRGM *generate_trgm(char *str, int slen); +TRGM *generate_wildcard_trgm(const char *str, int slen); float4 cnt_sml(TRGM *trg1, TRGM *trg2); +bool trgm_contained_by(TRGM *trg1, TRGM *trg2); #endif /* __TRGM_H__ */ diff --git a/contrib/pg_trgm/trgm_gin.c b/contrib/pg_trgm/trgm_gin.c index 2d3e35e233..cdaa72ea61 100644 --- a/contrib/pg_trgm/trgm_gin.c +++ b/contrib/pg_trgm/trgm_gin.c @@ -7,6 +7,7 @@ #include "access/gin.h" #include "access/itup.h" +#include "access/skey.h" #include "access/tuptoaster.h" #include "storage/bufpage.h" #include "utils/array.h" @@ -16,14 +17,31 @@ PG_FUNCTION_INFO_V1(gin_extract_trgm); Datum gin_extract_trgm(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(gin_extract_value_trgm); +Datum gin_extract_value_trgm(PG_FUNCTION_ARGS); + +PG_FUNCTION_INFO_V1(gin_extract_query_trgm); +Datum gin_extract_query_trgm(PG_FUNCTION_ARGS); + PG_FUNCTION_INFO_V1(gin_trgm_consistent); Datum gin_trgm_consistent(PG_FUNCTION_ARGS); /* - * This function is used as both extractValue and extractQuery + * This function can only be called if a pre-9.1 version of the GIN operator + * class definition is present in the catalogs (probably as a consequence + * of upgrade-in-place). Complain. */ Datum gin_extract_trgm(PG_FUNCTION_ARGS) +{ + ereport(ERROR, + (errmsg("GIN operator class for pg_trgm is out of date"), + errhint("Please drop and re-create the pg_trgm catalog entries."))); + PG_RETURN_NULL(); +} + +Datum +gin_extract_value_trgm(PG_FUNCTION_ARGS) { text *val = (text *) PG_GETARG_TEXT_P(0); int32 *nentries = (int32 *) PG_GETARG_POINTER(1); @@ -57,34 +75,124 @@ gin_extract_trgm(PG_FUNCTION_ARGS) PG_RETURN_POINTER(entries); } +Datum +gin_extract_query_trgm(PG_FUNCTION_ARGS) +{ + text *val = (text *) PG_GETARG_TEXT_P(0); + int32 *nentries = (int32 *) PG_GETARG_POINTER(1); + StrategyNumber strategy = PG_GETARG_UINT16(2); + /* bool **pmatch = (bool **) PG_GETARG_POINTER(3); */ + /* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */ + /* bool **nullFlags = (bool **) PG_GETARG_POINTER(5); */ + int32 *searchMode = (int32 *) PG_GETARG_POINTER(6); + Datum *entries = NULL; + TRGM *trg; + int32 trglen; + trgm *ptr; + int32 i; + + switch (strategy) + { + case SimilarityStrategyNumber: + trg = generate_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ); + break; + case ILikeStrategyNumber: +#ifndef IGNORECASE + elog(ERROR, "cannot handle ~~* with case-sensitive trigrams"); +#endif + /* FALL THRU */ + case LikeStrategyNumber: + /* + * For wildcard search we extract all the trigrams that every + * potentially-matching string must include. + */ + trg = generate_wildcard_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ); + break; + default: + elog(ERROR, "unrecognized strategy number: %d", strategy); + trg = NULL; /* keep compiler quiet */ + break; + } + + trglen = ARRNELEM(trg); + *nentries = trglen; + + if (trglen > 0) + { + entries = (Datum *) palloc(sizeof(Datum) * trglen); + ptr = GETARR(trg); + for (i = 0; i < trglen; i++) + { + int32 item = trgm2int(ptr); + + entries[i] = Int32GetDatum(item); + ptr++; + } + } + + /* + * If no trigram was extracted then we have to scan all the index. + */ + if (trglen == 0) + *searchMode = GIN_SEARCH_MODE_ALL; + + PG_RETURN_POINTER(entries); +} + Datum gin_trgm_consistent(PG_FUNCTION_ARGS) { bool *check = (bool *) PG_GETARG_POINTER(0); - /* StrategyNumber strategy = PG_GETARG_UINT16(1); */ + StrategyNumber strategy = PG_GETARG_UINT16(1); /* text *query = PG_GETARG_TEXT_P(2); */ int32 nkeys = PG_GETARG_INT32(3); /* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */ bool *recheck = (bool *) PG_GETARG_POINTER(5); - bool res = FALSE; + bool res; int32 i, - ntrue = 0; + ntrue; /* All cases served by this function are inexact */ *recheck = true; - /* Count the matches */ - for (i = 0; i < nkeys; i++) + switch (strategy) { - if (check[i]) - ntrue++; - } - + case SimilarityStrategyNumber: + /* Count the matches */ + ntrue = 0; + for (i = 0; i < nkeys; i++) + { + if (check[i]) + ntrue++; + } #ifdef DIVUNION - res = (nkeys == ntrue) ? true : ((((((float4) ntrue) / ((float4) (nkeys - ntrue)))) >= trgm_limit) ? true : false); + res = (nkeys == ntrue) ? true : ((((((float4) ntrue) / ((float4) (nkeys - ntrue)))) >= trgm_limit) ? true : false); #else - res = (nkeys == 0) ? false : ((((((float4) ntrue) / ((float4) nkeys))) >= trgm_limit) ? true : false); + res = (nkeys == 0) ? false : ((((((float4) ntrue) / ((float4) nkeys))) >= trgm_limit) ? true : false); #endif + break; + case ILikeStrategyNumber: +#ifndef IGNORECASE + elog(ERROR, "cannot handle ~~* with case-sensitive trigrams"); +#endif + /* FALL THRU */ + case LikeStrategyNumber: + /* Check if all extracted trigrams are presented. */ + res = true; + for (i = 0; i < nkeys; i++) + { + if (!check[i]) + { + res = false; + break; + } + } + break; + default: + elog(ERROR, "unrecognized strategy number: %d", strategy); + res = false; /* keep compiler quiet */ + break; + } PG_RETURN_BOOL(res); } diff --git a/contrib/pg_trgm/trgm_gist.c b/contrib/pg_trgm/trgm_gist.c index d9f3d40c17..d83265c11c 100644 --- a/contrib/pg_trgm/trgm_gist.c +++ b/contrib/pg_trgm/trgm_gist.c @@ -195,31 +195,61 @@ gtrgm_consistent(PG_FUNCTION_ARGS) TRGM *key = (TRGM *) DatumGetPointer(entry->key); TRGM *qtrg; bool res; - char *cache = (char *) fcinfo->flinfo->fn_extra; - - /* All cases served by this function are exact */ - *recheck = false; - - if (cache == NULL || VARSIZE(cache) != VARSIZE(query) || memcmp(cache, query, VARSIZE(query)) != 0) + char *cache = (char *) fcinfo->flinfo->fn_extra, + *cacheContents = cache + MAXALIGN(sizeof(StrategyNumber)); + + /* + * Store both the strategy number and extracted trigrams in cache, because + * trigram extraction is relatively CPU-expensive. We must include + * strategy number because trigram extraction depends on strategy. + */ + if (cache == NULL || strategy != *((StrategyNumber *) cache) || + VARSIZE(cacheContents) != VARSIZE(query) || + memcmp(cacheContents, query, VARSIZE(query)) != 0) { - qtrg = generate_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ); + switch (strategy) + { + case SimilarityStrategyNumber: + qtrg = generate_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ); + break; + case ILikeStrategyNumber: +#ifndef IGNORECASE + elog(ERROR, "cannot handle ~~* with case-sensitive trigrams"); +#endif + /* FALL THRU */ + case LikeStrategyNumber: + qtrg = generate_wildcard_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ); + break; + default: + elog(ERROR, "unrecognized strategy number: %d", strategy); + qtrg = NULL; /* keep compiler quiet */ + break; + } if (cache) pfree(cache); - fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, - MAXALIGN(VARSIZE(query)) + VARSIZE(qtrg)); + fcinfo->flinfo->fn_extra = + MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, + MAXALIGN(sizeof(StrategyNumber)) + + MAXALIGN(VARSIZE(query)) + + VARSIZE(qtrg)); cache = (char *) fcinfo->flinfo->fn_extra; + cacheContents = cache + MAXALIGN(sizeof(StrategyNumber)); - memcpy(cache, query, VARSIZE(query)); - memcpy(cache + MAXALIGN(VARSIZE(query)), qtrg, VARSIZE(qtrg)); + *((StrategyNumber *) cache) = strategy; + memcpy(cacheContents, query, VARSIZE(query)); + memcpy(cacheContents + MAXALIGN(VARSIZE(query)), qtrg, VARSIZE(qtrg)); } - qtrg = (TRGM *) (cache + MAXALIGN(VARSIZE(query))); + qtrg = (TRGM *) (cacheContents + MAXALIGN(VARSIZE(query))); switch (strategy) { case SimilarityStrategyNumber: + /* Similarity search is exact */ + *recheck = false; + if (GIST_LEAF(entry)) { /* all leafs contains orig trgm */ float4 tmpsml = cnt_sml(key, qtrg); @@ -242,6 +272,47 @@ gtrgm_consistent(PG_FUNCTION_ARGS) res = (((((float8) count) / ((float8) len))) >= trgm_limit) ? true : false; } break; + case ILikeStrategyNumber: +#ifndef IGNORECASE + elog(ERROR, "cannot handle ~~* with case-sensitive trigrams"); +#endif + /* FALL THRU */ + case LikeStrategyNumber: + /* Wildcard search is inexact */ + *recheck = true; + + /* + * Check if all the extracted trigrams can be present in child + * nodes. + */ + if (GIST_LEAF(entry)) + { /* all leafs contains orig trgm */ + res = trgm_contained_by(qtrg, key); + } + else if (ISALLTRUE(key)) + { /* non-leaf contains signature */ + res = true; + } + else + { /* non-leaf contains signature */ + int32 k, + tmp = 0, + len = ARRNELEM(qtrg); + trgm *ptr = GETARR(qtrg); + BITVECP sign = GETSIGN(key); + + res = true; + for (k = 0; k < len; k++) + { + CPTRGM(((char *) &tmp), ptr + k); + if (!GETBIT(sign, HASHVAL(tmp))) + { + res = false; + break; + } + } + } + break; default: elog(ERROR, "unrecognized strategy number: %d", strategy); res = false; /* keep compiler quiet */ diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c index b97e9912ba..52f9172f6d 100644 --- a/contrib/pg_trgm/trgm_op.c +++ b/contrib/pg_trgm/trgm_op.c @@ -18,6 +18,23 @@ float4 trgm_limit = 0.3f; PG_FUNCTION_INFO_V1(set_limit); Datum set_limit(PG_FUNCTION_ARGS); + +PG_FUNCTION_INFO_V1(show_limit); +Datum show_limit(PG_FUNCTION_ARGS); + +PG_FUNCTION_INFO_V1(show_trgm); +Datum show_trgm(PG_FUNCTION_ARGS); + +PG_FUNCTION_INFO_V1(similarity); +Datum similarity(PG_FUNCTION_ARGS); + +PG_FUNCTION_INFO_V1(similarity_dist); +Datum similarity_dist(PG_FUNCTION_ARGS); + +PG_FUNCTION_INFO_V1(similarity_op); +Datum similarity_op(PG_FUNCTION_ARGS); + + Datum set_limit(PG_FUNCTION_ARGS) { @@ -29,8 +46,6 @@ set_limit(PG_FUNCTION_ARGS) PG_RETURN_FLOAT4(trgm_limit); } -PG_FUNCTION_INFO_V1(show_limit); -Datum show_limit(PG_FUNCTION_ARGS); Datum show_limit(PG_FUNCTION_ARGS) { @@ -120,7 +135,7 @@ cnt_trigram(trgm *tptr, char *str, int bytelen) #endif /* - * Adds trigramm from words (already padded). + * Adds trigrams from words (already padded). */ static trgm * make_trigrams(trgm *tptr, char *str, int bytelen, int charlen) @@ -236,6 +251,225 @@ generate_trgm(char *str, int slen) return trg; } +/* + * Extract the next non-wildcard part of a search string, ie, a word bounded + * by '_' or '%' meta-characters, non-word characters or string end. + * + * str: source string, of length lenstr bytes (need not be null-terminated) + * buf: where to return the substring (must be long enough) + * *bytelen: receives byte length of the found substring + * *charlen: receives character length of the found substring + * + * Returns pointer to end+1 of the found substring in the source string. + * Returns NULL if no word found (in which case buf, bytelen, charlen not set) + * + * If the found word is bounded by non-word characters or string boundaries + * then this function will include corresponding padding spaces into buf. + */ +static const char * +get_wildcard_part(const char *str, int lenstr, + char *buf, int *bytelen, int *charlen) +{ + const char *beginword = str; + const char *endword; + char *s = buf; + bool in_wildcard_meta = false; + bool in_escape = false; + int clen; + + /* + * Find the first word character remembering whether last character was + * wildcard meta-character. + */ + while (beginword - str < lenstr) + { + if (in_escape) + { + in_escape = false; + in_wildcard_meta = false; + if (iswordchr(beginword)) + break; + } + else + { + if (ISESCAPECHAR(beginword)) + in_escape = true; + else if (ISWILDCARDCHAR(beginword)) + in_wildcard_meta = true; + else if (iswordchr(beginword)) + break; + else + in_wildcard_meta = false; + } + beginword += pg_mblen(beginword); + } + + /* + * Handle string end. + */ + if (beginword - str >= lenstr) + return NULL; + + /* + * Add left padding spaces if last character wasn't wildcard + * meta-character. + */ + *charlen = 0; + if (!in_wildcard_meta) + { + if (LPADDING > 0) + { + *s++ = ' '; + (*charlen)++; + if (LPADDING > 1) + { + *s++ = ' '; + (*charlen)++; + } + } + } + + /* + * Copy data into buf until wildcard meta-character, non-word character or + * string boundary. Strip escapes during copy. + */ + endword = beginword; + in_wildcard_meta = false; + in_escape = false; + while (endword - str < lenstr) + { + clen = pg_mblen(endword); + if (in_escape) + { + in_escape = false; + in_wildcard_meta = false; + if (iswordchr(endword)) + { + memcpy(s, endword, clen); + (*charlen)++; + s += clen; + } + else + break; + } + else + { + if (ISESCAPECHAR(endword)) + in_escape = true; + else if (ISWILDCARDCHAR(endword)) + { + in_wildcard_meta = true; + break; + } + else if (iswordchr(endword)) + { + memcpy(s, endword, clen); + (*charlen)++; + s += clen; + } + else + { + in_wildcard_meta = false; + break; + } + } + endword += clen; + } + + /* + * Add right padding spaces if last character wasn't wildcard + * meta-character. + */ + if (!in_wildcard_meta) + { + if (RPADDING > 0) + { + *s++ = ' '; + (*charlen)++; + if (RPADDING > 1) + { + *s++ = ' '; + (*charlen)++; + } + } + } + + *bytelen = s - buf; + return endword; +} + +/* + * Generates trigrams for wildcard search string. + * + * Returns array of trigrams that must occur in any string that matches the + * wildcard string. For example, given pattern "a%bcd%" the trigrams + * " a", "bcd" would be extracted. + */ +TRGM * +generate_wildcard_trgm(const char *str, int slen) +{ + TRGM *trg; + char *buf, + *buf2; + trgm *tptr; + int len, + charlen, + bytelen; + const char *eword; + + trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3); + trg->flag = ARRKEY; + SET_VARSIZE(trg, TRGMHDRSIZE); + + if (slen + LPADDING + RPADDING < 3 || slen == 0) + return trg; + + tptr = GETARR(trg); + + buf = palloc(sizeof(char) * (slen + 4)); + + /* + * Extract trigrams from each substring extracted by get_wildcard_part. + */ + eword = str; + while ((eword = get_wildcard_part(eword, slen - (eword - str), + buf, &bytelen, &charlen)) != NULL) + { +#ifdef IGNORECASE + buf2 = lowerstr_with_len(buf, bytelen); + bytelen = strlen(buf2); +#else + buf2 = buf; +#endif + + /* + * count trigrams + */ + tptr = make_trigrams(tptr, buf2, bytelen, charlen); +#ifdef IGNORECASE + pfree(buf2); +#endif + } + + pfree(buf); + + if ((len = tptr - GETARR(trg)) == 0) + return trg; + + /* + * Make trigrams unique. + */ + if (len > 0) + { + qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm); + len = unique_array(GETARR(trg), len); + } + + SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len)); + + return trg; +} + uint32 trgm2int(trgm *ptr) { @@ -250,8 +484,6 @@ trgm2int(trgm *ptr) return val; } -PG_FUNCTION_INFO_V1(show_trgm); -Datum show_trgm(PG_FUNCTION_ARGS); Datum show_trgm(PG_FUNCTION_ARGS) { @@ -340,8 +572,44 @@ cnt_sml(TRGM *trg1, TRGM *trg2) } -PG_FUNCTION_INFO_V1(similarity); -Datum similarity(PG_FUNCTION_ARGS); +/* + * Returns whether trg2 contains all trigrams in trg1. + * This relies on the trigram arrays being sorted. + */ +bool +trgm_contained_by(TRGM *trg1, TRGM *trg2) +{ + trgm *ptr1, + *ptr2; + int len1, + len2; + + ptr1 = GETARR(trg1); + ptr2 = GETARR(trg2); + + len1 = ARRNELEM(trg1); + len2 = ARRNELEM(trg2); + + while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2) + { + int res = CMPTRGM(ptr1, ptr2); + + if (res < 0) + return false; + else if (res > 0) + ptr2++; + else + { + ptr1++; + ptr2++; + } + } + if (ptr1 - GETARR(trg1) < len1) + return false; + else + return true; +} + Datum similarity(PG_FUNCTION_ARGS) { @@ -364,8 +632,6 @@ similarity(PG_FUNCTION_ARGS) PG_RETURN_FLOAT4(res); } -PG_FUNCTION_INFO_V1(similarity_dist); -Datum similarity_dist(PG_FUNCTION_ARGS); Datum similarity_dist(PG_FUNCTION_ARGS) { @@ -375,8 +641,6 @@ similarity_dist(PG_FUNCTION_ARGS) PG_RETURN_FLOAT4(1.0 - res); } -PG_FUNCTION_INFO_V1(similarity_op); -Datum similarity_op(PG_FUNCTION_ARGS); Datum similarity_op(PG_FUNCTION_ARGS) { diff --git a/contrib/pg_trgm/uninstall_pg_trgm.sql b/contrib/pg_trgm/uninstall_pg_trgm.sql index bc8f1fa983..961e40ca48 100644 --- a/contrib/pg_trgm/uninstall_pg_trgm.sql +++ b/contrib/pg_trgm/uninstall_pg_trgm.sql @@ -25,11 +25,11 @@ DROP TYPE gtrgm CASCADE; DROP OPERATOR CLASS gin_trgm_ops USING gin; -DROP FUNCTION gin_extract_trgm(text, internal); +DROP FUNCTION gin_extract_value_trgm(text, internal); -DROP FUNCTION gin_extract_trgm(text, internal, int2, internal, internal); +DROP FUNCTION gin_extract_query_trgm(text, internal, int2, internal, internal, internal, internal); -DROP FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal); +DROP FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal, internal, internal); DROP OPERATOR % (text, text); diff --git a/doc/src/sgml/pgtrgm.sgml b/doc/src/sgml/pgtrgm.sgml index 27a2306720..9210899ec2 100644 --- a/doc/src/sgml/pgtrgm.sgml +++ b/doc/src/sgml/pgtrgm.sgml @@ -137,8 +137,10 @@ The pg_trgm module provides GiST and GIN index operator classes that allow you to create an index over a text column for the purpose of very fast similarity searches. These index types support - the above-described similarity operators (and no other operators, so you may - want a regular B-tree index too). + the above-described similarity operators, and additionally support + trigram-based index searches for LIKE and ILIKE + queries. (These indexes do not support equality nor simple comparison + operators, so you may need a regular B-tree index too.) @@ -181,6 +183,18 @@ SELECT t, t <-> 'word' AS dist a small number of the closest matches is wanted. + + Beginning in PostgreSQL 9.1, these index types also support + index searches for LIKE and ILIKE, for example + +SELECT * FROM test_trgm WHERE t LIKE '%foo%bar'; + + The index search works by extracting trigrams from the search string + and then looking these up in the index. The more trigrams in the search + string, the more effective the index search is. Unlike B-tree based + searches, the search string need not be left-anchored. + + The choice between GiST and GIN indexing depends on the relative performance characteristics of GiST and GIN, which are discussed elsewhere.