From fc2660fc253ce871b2f26e7b6dce133092357cb0 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Sun, 24 May 2009 18:10:38 +0000
Subject: [PATCH] Fix LIKE's special-case code for % followed by _.  I'm not
 entirely sure that this case is worth a special code path, but a special code
 path that gets the boundary condition wrong is definitely no good.  Per bug
 #4821 from Andrew Gierth.

In passing, clean up some minor code formatting issues (excess parentheses
and blank lines in odd places).

Back-patch to 8.3, where the bug was introduced.
---
 src/backend/utils/adt/like_match.c    | 120 +++++++++++++-------------
 src/test/regress/expected/strings.out |  27 ++++++
 src/test/regress/sql/strings.sql      |  11 +++
 3 files changed, 96 insertions(+), 62 deletions(-)
diff --git a/src/backend/utils/adt/like_match.c b/src/backend/utils/adt/like_match.c
index 95550e0530..d72113b66b 100644
--- a/src/backend/utils/adt/like_match.c
+++ b/src/backend/utils/adt/like_match.c
@@ -9,7 +9,7 @@
  * (UTF8 is a special case because we can use a much more efficient version
  * of NextChar than can be used for general multi-byte encodings.)
  *
- * Before the inclusion, we need to define following macros:
+ * Before the inclusion, we need to define the following macros:
  *
  * NextChar
  * MatchText - to name of function wanted
@@ -19,47 +19,46 @@
  * Copyright (c) 1996-2009, PostgreSQL Global Development Group
  *
  * IDENTIFICATION
- *	$PostgreSQL: pgsql/src/backend/utils/adt/like_match.c,v 1.24 2009/01/01 17:23:49 momjian Exp $
+ *	$PostgreSQL: pgsql/src/backend/utils/adt/like_match.c,v 1.25 2009/05/24 18:10:37 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 
 /*
-**	Originally written by Rich $alz, mirror!rs, Wed Nov 26 19:03:17 EST 1986.
-**	Rich $alz is now <rsalz@bbn.com>.
-**	Special thanks to Lars Mathiesen <thorinn@diku.dk> for the LABORT code.
-**
-**	This code was shamelessly stolen from the "pql" code by myself and
-**	slightly modified :)
-**
-**	All references to the word "star" were replaced by "percent"
-**	All references to the word "wild" were replaced by "like"
-**
-**	All the nice shell RE matching stuff was replaced by just "_" and "%"
-**
-**	As I don't have a copy of the SQL standard handy I wasn't sure whether
-**	to leave in the '\' escape character handling.
-**
-**	Keith Parks. <keith@mtcc.demon.co.uk>
-**
-**	SQL92 lets you specify the escape character by saying
-**	LIKE <pattern> ESCAPE <escape character>. We are a small operation
-**	so we force you to use '\'. - ay 7/95
-**
-**	Now we have the like_escape() function that converts patterns with
-**	any specified escape character (or none at all) to the internal
-**	default escape character, which is still '\'. - tgl 9/2000
-**
-** The code is rewritten to avoid requiring null-terminated strings,
-** which in turn allows us to leave out some memcpy() operations.
-** This code should be faster and take less memory, but no promises...
-** - thomas 2000-08-06
-**
-*/
+ *	Originally written by Rich $alz, mirror!rs, Wed Nov 26 19:03:17 EST 1986.
+ *	Rich $alz is now <rsalz@bbn.com>.
+ *	Special thanks to Lars Mathiesen <thorinn@diku.dk> for the LABORT code.
+ *
+ *	This code was shamelessly stolen from the "pql" code by myself and
+ *	slightly modified :)
+ *
+ *	All references to the word "star" were replaced by "percent"
+ *	All references to the word "wild" were replaced by "like"
+ *
+ *	All the nice shell RE matching stuff was replaced by just "_" and "%"
+ *
+ *	As I don't have a copy of the SQL standard handy I wasn't sure whether
+ *	to leave in the '\' escape character handling.
+ *
+ *	Keith Parks. <keith@mtcc.demon.co.uk>
+ *
+ *	SQL92 lets you specify the escape character by saying
+ *	LIKE <pattern> ESCAPE <escape character>. We are a small operation
+ *	so we force you to use '\'. - ay 7/95
+ *
+ *	Now we have the like_escape() function that converts patterns with
+ *	any specified escape character (or none at all) to the internal
+ *	default escape character, which is still '\'. - tgl 9/2000
+ *
+ * The code is rewritten to avoid requiring null-terminated strings,
+ * which in turn allows us to leave out some memcpy() operations.
+ * This code should be faster and take less memory, but no promises...
+ * - thomas 2000-08-06
+ */
 
 
 /*--------------------
- *	Match text and p, return LIKE_TRUE, LIKE_FALSE, or LIKE_ABORT.
+ *	Match text and pattern, return LIKE_TRUE, LIKE_FALSE, or LIKE_ABORT.
  *
  *	LIKE_TRUE: they match
  *	LIKE_FALSE: they don't match
@@ -80,19 +79,18 @@ static int
 MatchText(char *t, int tlen, char *p, int plen)
 {
 	/* Fast path for match-everything pattern */
-	if ((plen == 1) && (*p == '%'))
+	if (plen == 1 && *p == '%')
 		return LIKE_TRUE;
 
 	/*
 	 * In this loop, we advance by char when matching wildcards (and thus on
 	 * recursive entry to this function we are properly char-synced). On other
 	 * occasions it is safe to advance by byte, as the text and pattern will
-	 * be in lockstep. This allows us to perform all comparisons  between the
+	 * be in lockstep. This allows us to perform all comparisons between the
 	 * text and pattern on a byte by byte basis, even for multi-byte
 	 * encodings.
 	 */
-
-	while ((tlen > 0) && (plen > 0))
+	while (tlen > 0 && plen > 0)
 	{
 		if (*p == '\\')
 		{
@@ -116,7 +114,7 @@ MatchText(char *t, int tlen, char *p, int plen)
 
 			/* %% is the same as % according to the SQL standard */
 			/* Advance past all %'s */
-			while ((plen > 0) && (*p == '%'))
+			while (plen > 0 && *p == '%')
 				NextByte(p, plen);
 			/* Trailing percent matches everything. */
 			if (plen <= 0)
@@ -127,22 +125,24 @@ MatchText(char *t, int tlen, char *p, int plen)
 			 * rest of the pattern.
 			 */
 			if (*p == '_')
-
 			{
 				/* %_ is the same as _% - avoid matching _ repeatedly */
 
-				NextChar(t, tlen);
-				NextByte(p, plen);
-
-				if (tlen <= 0)
-				{
-					return (plen <= 0) ? LIKE_TRUE : LIKE_ABORT;
-				}
-				else if (plen <= 0)
+				do
 				{
-					return LIKE_FALSE;
-				}
-
+					NextChar(t, tlen);
+					NextByte(p, plen);
+				} while (tlen > 0 && plen > 0 && *p == '_');
+
+				/*
+				 * If we are at the end of the pattern, succeed: % followed
+				 * by n _'s matches any string of at least n characters, and
+				 * we have now found there are at least n characters.
+				 */
+				if (plen <= 0)
+					return LIKE_TRUE;
+
+				/* Look for a place that matches the rest of the pattern */
 				while (tlen > 0)
 				{
 					int			matched = MatchText(t, tlen, p, plen);
@@ -155,7 +155,6 @@ MatchText(char *t, int tlen, char *p, int plen)
 			}
 			else
 			{
-
 				char		firstpat = TCHAR(*p);
 
 				if (*p == '\\')
@@ -180,7 +179,6 @@ MatchText(char *t, int tlen, char *p, int plen)
 					}
 
 					NextChar(t, tlen);
-
 				}
 			}
 
@@ -192,20 +190,20 @@ MatchText(char *t, int tlen, char *p, int plen)
 		}
 		else if (*p == '_')
 		{
+			/* _ matches any single character, and we know there is one */
 			NextChar(t, tlen);
 			NextByte(p, plen);
 			continue;
 		}
-		else if (TCHAR(*t) != TCHAR(*p))
+		else if (TCHAR(*p) != TCHAR(*t))
 		{
-			/*
-			 * Not the single-character wildcard and no explicit match? Then
-			 * time to quit...
-			 */
+			/* non-wildcard pattern char fails to match text char */
 			return LIKE_FALSE;
 		}
 
 		/*
+		 * Pattern and text match, so advance.
+		 *
 		 * It is safe to use NextByte instead of NextChar here, even for
 		 * multi-byte character sets, because we are not following immediately
 		 * after a wildcard character. If we are in the middle of a multibyte
@@ -222,9 +220,8 @@ MatchText(char *t, int tlen, char *p, int plen)
 	if (tlen > 0)
 		return LIKE_FALSE;		/* end of pattern, but not of text */
 
-	/* End of input string.  Do we have matching pattern remaining? */
-	while ((plen > 0) && (*p == '%'))	/* allow multiple %'s at end of
-										 * pattern */
+	/* End of text string.  Do we have matching pattern remaining? */
+	while (plen > 0 && *p == '%')	/* allow multiple %'s at end of pattern */
 		NextByte(p, plen);
 
 	if (plen <= 0)
@@ -354,5 +351,4 @@ do_like_escape(text *pat, text *esc)
 
 #ifdef MATCH_LOWER
 #undef MATCH_LOWER
-
 #endif
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
index be8eb919fa..1241a2ace6 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -902,6 +902,33 @@ SELECT 'Hawkeye' NOT ILIKE 'h%' AS "false";
  f
 (1 row)
 
+--
+-- test %/_ combination cases, cf bug #4821
+--
+SELECT 'foo' LIKE '_%' as t, 'f' LIKE '_%' as t, '' LIKE '_%' as f;
+ t | t | f 
+---+---+---
+ t | t | f
+(1 row)
+
+SELECT 'foo' LIKE '%_' as t, 'f' LIKE '%_' as t, '' LIKE '%_' as f;
+ t | t | f 
+---+---+---
+ t | t | f
+(1 row)
+
+SELECT 'foo' LIKE '__%' as t, 'foo' LIKE '___%' as t, 'foo' LIKE '____%' as f;
+ t | t | f 
+---+---+---
+ t | t | f
+(1 row)
+
+SELECT 'foo' LIKE '%__' as t, 'foo' LIKE '%___' as t, 'foo' LIKE '%____' as f;
+ t | t | f 
+---+---+---
+ t | t | f
+(1 row)
+
 --
 -- test implicit type conversion
 --
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index a28c75ac04..681a0e1e62 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -279,6 +279,17 @@ SELECT 'hawkeye' NOT ILIKE 'H%Eye' AS "false";
 SELECT 'Hawkeye' ILIKE 'h%' AS "true";
 SELECT 'Hawkeye' NOT ILIKE 'h%' AS "false";
 
+--
+-- test %/_ combination cases, cf bug #4821
+--
+
+SELECT 'foo' LIKE '_%' as t, 'f' LIKE '_%' as t, '' LIKE '_%' as f;
+SELECT 'foo' LIKE '%_' as t, 'f' LIKE '%_' as t, '' LIKE '%_' as f;
+
+SELECT 'foo' LIKE '__%' as t, 'foo' LIKE '___%' as t, 'foo' LIKE '____%' as f;
+SELECT 'foo' LIKE '%__' as t, 'foo' LIKE '%___' as t, 'foo' LIKE '%____' as f;
+
+
 --
 -- test implicit type conversion
 --
-- 
2.40.0