From dbde97cdde4afbd0a175880446e301f1bfb8994c Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 28 May 2010 17:35:23 +0000 Subject: [PATCH] Rewrite LIKE's %-followed-by-_ optimization so it really works (this time for sure ;-)). It now also optimizes more cases, such as %_%_. Improve comments too. Per bug #5478. In passing, also rename the TCHAR macro to GETCHAR, because pgindent is messing with the formatting of the former (apparently it now thinks TCHAR is a typedef name). Back-patch to 8.3, where the bug was introduced. --- src/backend/utils/adt/like_match.c | 132 +++++++++++++------------- src/test/regress/expected/strings.out | 8 +- src/test/regress/sql/strings.sql | 4 +- 3 files changed, 74 insertions(+), 70 deletions(-) diff --git a/src/backend/utils/adt/like_match.c b/src/backend/utils/adt/like_match.c index 41acc01d60..b67ba020cf 100644 --- a/src/backend/utils/adt/like_match.c +++ b/src/backend/utils/adt/like_match.c @@ -14,12 +14,12 @@ * NextChar * MatchText - to name of function wanted * do_like_escape - name of function if wanted - needs CHAREQ and CopyAdvChar - * MATCH_LOWER - define iff using to_lower on text chars + * MATCH_LOWER - define for case (4), using to_lower on single-byte chars * * Copyright (c) 1996-2010, PostgreSQL Global Development Group * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/like_match.c,v 1.27 2010/01/02 16:57:54 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/like_match.c,v 1.28 2010/05/28 17:35:23 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -70,9 +70,9 @@ */ #ifdef MATCH_LOWER -#define TCHAR(t) ((char) tolower((unsigned char) (t))) +#define GETCHAR(t) ((char) tolower((unsigned char) (t))) #else -#define TCHAR(t) (t) +#define GETCHAR(t) (t) #endif static int @@ -101,85 +101,80 @@ MatchText(char *t, int tlen, char *p, int plen) ereport(ERROR, (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE), errmsg("LIKE pattern must not end with escape character"))); - if (TCHAR (*p) != TCHAR (*t)) + if (GETCHAR(*p) != GETCHAR(*t)) return LIKE_FALSE; } else if (*p == '%') { + char firstpat; + /* - * % processing is essentially a search for a match for what - * follows the %, plus a recursive match of the remainder. We - * succeed if and only if both conditions are met. + * % processing is essentially a search for a text position at + * which the remainder of the text matches the remainder of the + * pattern, using a recursive call to check each potential match. + * + * If there are wildcards immediately following the %, we can skip + * over them first, using the idea that any sequence of N _'s and + * one or more %'s is equivalent to N _'s and one % (ie, it will + * match any sequence of at least N text characters). In this + * way we will always run the recursive search loop using a + * pattern fragment that begins with a literal character-to-match, + * thereby not recursing more than we have to. */ + NextByte(p, plen); - /* %% is the same as % according to the SQL standard */ - /* Advance past all %'s */ - while (plen > 0 && *p == '%') - NextByte(p, plen); - /* Trailing percent matches everything. */ + while (plen > 0) + { + if (*p == '%') + NextByte(p, plen); + else if (*p == '_') + { + /* If not enough text left to match the pattern, ABORT */ + if (tlen <= 0) + return LIKE_ABORT; + NextChar(t, tlen); + NextByte(p, plen); + } + else + break; /* Reached a non-wildcard pattern char */ + } + + /* + * If we're at end of pattern, match: we have a trailing % which + * matches any remaining text string. + */ if (plen <= 0) return LIKE_TRUE; /* * Otherwise, scan for a text position at which we can match the - * rest of the pattern. + * rest of the pattern. The first remaining pattern char is known + * to be a regular or escaped literal character, so we can compare + * the first pattern byte to each text byte to avoid recursing + * more than we have to. This fact also guarantees that we don't + * have to consider a match to the zero-length substring at the + * end of the text. */ - if (*p == '_') + if (*p == '\\') { - /* %_ is the same as _% - avoid matching _ repeatedly */ + if (plen < 2) + return LIKE_FALSE; /* XXX should throw error */ + firstpat = GETCHAR(p[1]); + } + else + firstpat = GETCHAR(*p); - do - { - NextChar(t, tlen); - NextByte(p, plen); - } while (tlen > 0 && plen > 0 && *p == '_'); - - /* - * If we are at the end of the pattern, succeed: % followed by - * n _'s matches any string of at least n characters, and we - * have now found there are at least n characters. - */ - if (plen <= 0) - return LIKE_TRUE; - - /* Look for a place that matches the rest of the pattern */ - while (tlen > 0) + while (tlen > 0) + { + if (GETCHAR(*t) == firstpat) { int matched = MatchText(t, tlen, p, plen); if (matched != LIKE_FALSE) - return matched; /* TRUE or ABORT */ - - NextChar(t, tlen); + return matched; /* TRUE or ABORT */ } - } - else - { - char firstpat = TCHAR (*p); - if (*p == '\\') - { - if (plen < 2) - return LIKE_FALSE; - firstpat = TCHAR (p[1]); - } - - while (tlen > 0) - { - /* - * Optimization to prevent most recursion: don't recurse - * unless first pattern byte matches first text byte. - */ - if (TCHAR (*t) == firstpat) - { - int matched = MatchText(t, tlen, p, plen); - - if (matched != LIKE_FALSE) - return matched; /* TRUE or ABORT */ - } - - NextChar(t, tlen); - } + NextChar(t, tlen); } /* @@ -195,7 +190,7 @@ MatchText(char *t, int tlen, char *p, int plen) NextByte(p, plen); continue; } - else if (TCHAR (*p) != TCHAR (*t)) + else if (GETCHAR(*p) != GETCHAR(*t)) { /* non-wildcard pattern char fails to match text char */ return LIKE_FALSE; @@ -220,11 +215,12 @@ MatchText(char *t, int tlen, char *p, int plen) if (tlen > 0) return LIKE_FALSE; /* end of pattern, but not of text */ - /* End of text string. Do we have matching pattern remaining? */ - while (plen > 0 && *p == '%') /* allow multiple %'s at end of - * pattern */ + /* + * End of text, but perhaps not of pattern. Match iff the remaining + * pattern can match a zero-length string, ie, it's zero or more %'s. + */ + while (plen > 0 && *p == '%') NextByte(p, plen); - if (plen <= 0) return LIKE_TRUE; @@ -348,7 +344,7 @@ do_like_escape(text *pat, text *esc) #undef do_like_escape #endif -#undef TCHAR +#undef GETCHAR #ifdef MATCH_LOWER #undef MATCH_LOWER diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index 42c34a5522..1bd6772ddd 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -996,7 +996,7 @@ SELECT 'Hawkeye' NOT ILIKE 'h%' AS "false"; (1 row) -- --- test %/_ combination cases, cf bug #4821 +-- test %/_ combination cases, cf bugs #4821 and #5478 -- SELECT 'foo' LIKE '_%' as t, 'f' LIKE '_%' as t, '' LIKE '_%' as f; t | t | f @@ -1022,6 +1022,12 @@ SELECT 'foo' LIKE '%__' as t, 'foo' LIKE '%___' as t, 'foo' LIKE '%____' as f; t | t | f (1 row) +SELECT 'jack' LIKE '%____%' AS t; + t +--- + t +(1 row) + -- -- test implicit type conversion -- diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index 8c35be8136..6ef446308b 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -301,7 +301,7 @@ SELECT 'Hawkeye' ILIKE 'h%' AS "true"; SELECT 'Hawkeye' NOT ILIKE 'h%' AS "false"; -- --- test %/_ combination cases, cf bug #4821 +-- test %/_ combination cases, cf bugs #4821 and #5478 -- SELECT 'foo' LIKE '_%' as t, 'f' LIKE '_%' as t, '' LIKE '_%' as f; @@ -310,6 +310,8 @@ SELECT 'foo' LIKE '%_' as t, 'f' LIKE '%_' as t, '' LIKE '%_' as f; SELECT 'foo' LIKE '__%' as t, 'foo' LIKE '___%' as t, 'foo' LIKE '____%' as f; SELECT 'foo' LIKE '%__' as t, 'foo' LIKE '%___' as t, 'foo' LIKE '%____' as f; +SELECT 'jack' LIKE '%____%' AS t; + -- -- test implicit type conversion -- 2.40.0