From dbde97cdde4afbd0a175880446e301f1bfb8994c Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Fri, 28 May 2010 17:35:23 +0000
Subject: [PATCH] Rewrite LIKE's %-followed-by-_ optimization so it really
 works (this time for sure ;-)).  It now also optimizes more cases, such as
 %_%_.  Improve comments too.  Per bug #5478.

In passing, also rename the TCHAR macro to GETCHAR, because pgindent is
messing with the formatting of the former (apparently it now thinks TCHAR
is a typedef name).

Back-patch to 8.3, where the bug was introduced.
---
 src/backend/utils/adt/like_match.c    | 132 +++++++++++++-------------
 src/test/regress/expected/strings.out |   8 +-
 src/test/regress/sql/strings.sql      |   4 +-
 3 files changed, 74 insertions(+), 70 deletions(-)

diff --git a/src/backend/utils/adt/like_match.c b/src/backend/utils/adt/like_match.c
index 41acc01d60..b67ba020cf 100644
--- a/src/backend/utils/adt/like_match.c
+++ b/src/backend/utils/adt/like_match.c
@@ -14,12 +14,12 @@
  * NextChar
  * MatchText - to name of function wanted
  * do_like_escape - name of function if wanted - needs CHAREQ and CopyAdvChar
- * MATCH_LOWER - define iff using to_lower on text chars
+ * MATCH_LOWER - define for case (4), using to_lower on single-byte chars
  *
  * Copyright (c) 1996-2010, PostgreSQL Global Development Group
  *
  * IDENTIFICATION
- *	$PostgreSQL: pgsql/src/backend/utils/adt/like_match.c,v 1.27 2010/01/02 16:57:54 momjian Exp $
+ *	$PostgreSQL: pgsql/src/backend/utils/adt/like_match.c,v 1.28 2010/05/28 17:35:23 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -70,9 +70,9 @@
  */
 
 #ifdef MATCH_LOWER
-#define TCHAR(t) ((char) tolower((unsigned char) (t)))
+#define GETCHAR(t) ((char) tolower((unsigned char) (t)))
 #else
-#define TCHAR(t) (t)
+#define GETCHAR(t) (t)
 #endif
 
 static int
@@ -101,85 +101,80 @@ MatchText(char *t, int tlen, char *p, int plen)
 				ereport(ERROR,
 						(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
 				 errmsg("LIKE pattern must not end with escape character")));
-			if (TCHAR (*p) != TCHAR (*t))
+			if (GETCHAR(*p) != GETCHAR(*t))
 				return LIKE_FALSE;
 		}
 		else if (*p == '%')
 		{
+			char		firstpat;
+
 			/*
-			 * % processing is essentially a search for a match for what
-			 * follows the %, plus a recursive match of the remainder. We
-			 * succeed if and only if both conditions are met.
+			 * % processing is essentially a search for a text position at
+			 * which the remainder of the text matches the remainder of the
+			 * pattern, using a recursive call to check each potential match.
+			 *
+			 * If there are wildcards immediately following the %, we can skip
+			 * over them first, using the idea that any sequence of N _'s and
+			 * one or more %'s is equivalent to N _'s and one % (ie, it will
+			 * match any sequence of at least N text characters).  In this
+			 * way we will always run the recursive search loop using a
+			 * pattern fragment that begins with a literal character-to-match,
+			 * thereby not recursing more than we have to.
 			 */
+			NextByte(p, plen);
 
-			/* %% is the same as % according to the SQL standard */
-			/* Advance past all %'s */
-			while (plen > 0 && *p == '%')
-				NextByte(p, plen);
-			/* Trailing percent matches everything. */
+			while (plen > 0)
+			{
+				if (*p == '%')
+					NextByte(p, plen);
+				else if (*p == '_')
+				{
+					/* If not enough text left to match the pattern, ABORT */
+					if (tlen <= 0)
+						return LIKE_ABORT;
+					NextChar(t, tlen);
+					NextByte(p, plen);
+				}
+				else
+					break;		/* Reached a non-wildcard pattern char */
+			}
+
+			/*
+			 * If we're at end of pattern, match: we have a trailing % which
+			 * matches any remaining text string.
+			 */
 			if (plen <= 0)
 				return LIKE_TRUE;
 
 			/*
 			 * Otherwise, scan for a text position at which we can match the
-			 * rest of the pattern.
+			 * rest of the pattern.  The first remaining pattern char is known
+			 * to be a regular or escaped literal character, so we can compare
+			 * the first pattern byte to each text byte to avoid recursing
+			 * more than we have to.  This fact also guarantees that we don't
+			 * have to consider a match to the zero-length substring at the
+			 * end of the text.
 			 */
-			if (*p == '_')
+			if (*p == '\\')
 			{
-				/* %_ is the same as _% - avoid matching _ repeatedly */
+				if (plen < 2)
+					return LIKE_FALSE; /* XXX should throw error */
+				firstpat = GETCHAR(p[1]);
+			}
+			else
+				firstpat = GETCHAR(*p);
 
-				do
-				{
-					NextChar(t, tlen);
-					NextByte(p, plen);
-				} while (tlen > 0 && plen > 0 && *p == '_');
-
-				/*
-				 * If we are at the end of the pattern, succeed: % followed by
-				 * n _'s matches any string of at least n characters, and we
-				 * have now found there are at least n characters.
-				 */
-				if (plen <= 0)
-					return LIKE_TRUE;
-
-				/* Look for a place that matches the rest of the pattern */
-				while (tlen > 0)
+			while (tlen > 0)
+			{
+				if (GETCHAR(*t) == firstpat)
 				{
 					int			matched = MatchText(t, tlen, p, plen);
 
 					if (matched != LIKE_FALSE)
-						return matched; /* TRUE or ABORT */
-
-					NextChar(t, tlen);
+						return matched;		/* TRUE or ABORT */
 				}
-			}
-			else
-			{
-				char		firstpat = TCHAR (*p);
 
-				if (*p == '\\')
-				{
-					if (plen < 2)
-						return LIKE_FALSE;
-					firstpat = TCHAR (p[1]);
-				}
-
-				while (tlen > 0)
-				{
-					/*
-					 * Optimization to prevent most recursion: don't recurse
-					 * unless first pattern byte matches first text byte.
-					 */
-					if (TCHAR (*t) == firstpat)
-					{
-						int			matched = MatchText(t, tlen, p, plen);
-
-						if (matched != LIKE_FALSE)
-							return matched;		/* TRUE or ABORT */
-					}
-
-					NextChar(t, tlen);
-				}
+				NextChar(t, tlen);
 			}
 
 			/*
@@ -195,7 +190,7 @@ MatchText(char *t, int tlen, char *p, int plen)
 			NextByte(p, plen);
 			continue;
 		}
-		else if (TCHAR (*p) != TCHAR (*t))
+		else if (GETCHAR(*p) != GETCHAR(*t))
 		{
 			/* non-wildcard pattern char fails to match text char */
 			return LIKE_FALSE;
@@ -220,11 +215,12 @@ MatchText(char *t, int tlen, char *p, int plen)
 	if (tlen > 0)
 		return LIKE_FALSE;		/* end of pattern, but not of text */
 
-	/* End of text string.	Do we have matching pattern remaining? */
-	while (plen > 0 && *p == '%')		/* allow multiple %'s at end of
-										 * pattern */
+	/*
+	 * End of text, but perhaps not of pattern.  Match iff the remaining
+	 * pattern can match a zero-length string, ie, it's zero or more %'s.
+	 */
+	while (plen > 0 && *p == '%')
 		NextByte(p, plen);
-
 	if (plen <= 0)
 		return LIKE_TRUE;
 
@@ -348,7 +344,7 @@ do_like_escape(text *pat, text *esc)
 #undef do_like_escape
 #endif
 
-#undef TCHAR
+#undef GETCHAR
 
 #ifdef MATCH_LOWER
 #undef MATCH_LOWER
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
index 42c34a5522..1bd6772ddd 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -996,7 +996,7 @@ SELECT 'Hawkeye' NOT ILIKE 'h%' AS "false";
 (1 row)
 
 --
--- test %/_ combination cases, cf bug #4821
+-- test %/_ combination cases, cf bugs #4821 and #5478
 --
 SELECT 'foo' LIKE '_%' as t, 'f' LIKE '_%' as t, '' LIKE '_%' as f;
  t | t | f 
@@ -1022,6 +1022,12 @@ SELECT 'foo' LIKE '%__' as t, 'foo' LIKE '%___' as t, 'foo' LIKE '%____' as f;
  t | t | f
 (1 row)
 
+SELECT 'jack' LIKE '%____%' AS t;
+ t 
+---
+ t
+(1 row)
+
 --
 -- test implicit type conversion
 --
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index 8c35be8136..6ef446308b 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -301,7 +301,7 @@ SELECT 'Hawkeye' ILIKE 'h%' AS "true";
 SELECT 'Hawkeye' NOT ILIKE 'h%' AS "false";
 
 --
--- test %/_ combination cases, cf bug #4821
+-- test %/_ combination cases, cf bugs #4821 and #5478
 --
 
 SELECT 'foo' LIKE '_%' as t, 'f' LIKE '_%' as t, '' LIKE '_%' as f;
@@ -310,6 +310,8 @@ SELECT 'foo' LIKE '%_' as t, 'f' LIKE '%_' as t, '' LIKE '%_' as f;
 SELECT 'foo' LIKE '__%' as t, 'foo' LIKE '___%' as t, 'foo' LIKE '____%' as f;
 SELECT 'foo' LIKE '%__' as t, 'foo' LIKE '%___' as t, 'foo' LIKE '%____' as f;
 
+SELECT 'jack' LIKE '%____%' AS t;
+
 
 --
 -- test implicit type conversion
-- 
2.50.1