Rewrite LIKE's %-followed-by-_ optimization so it really works (this time...)

author Tom Lane <tgl@sss.pgh.pa.us>

Fri, 28 May 2010 17:35:36 +0000 (17:35 +0000)

committer Tom Lane <tgl@sss.pgh.pa.us>

Fri, 28 May 2010 17:35:36 +0000 (17:35 +0000)
author Tom Lane <tgl@sss.pgh.pa.us>
Fri, 28 May 2010 17:35:36 +0000 (17:35 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Fri, 28 May 2010 17:35:36 +0000 (17:35 +0000)
diff --git a/src/backend/utils/adt/like_match.c b/src/backend/utils/adt/like_match.c

index 287356309fbadd4672dd5b97c0f556e5362b738c..90b58525b0606874b17c0f02ea3e9152a0ec18c3 100644 (file)
--- a/src/backend/utils/adt/like_match.c
+++ b/src/backend/utils/adt/like_match.c
@@ -1,25 +1,25 @@
  /*-------------------------------------------------------------------------
   *
   * like_match.c
- *       like expression handling internal code.
+ *       LIKE pattern matching internal code.
   *
- * This file is included by like.c four times, to provide natching code for
- * single-byte encodings, UTF8, and for other multi-byte encodings,
- * and case insensitive matches for single byte encodings.
- * UTF8 is a special case because we can use a much more efficient version
- * of NextChar than can be used for other multi-byte encodings.
+ * This file is included by like.c four times, to provide matching code for
+ * (1) single-byte encodings, (2) UTF8, (3) other multi-byte encodings,
+ * and (4) case insensitive matches in single byte encodings.
+ * (UTF8 is a special case because we can use a much more efficient version
+ * of NextChar than can be used for general multi-byte encodings.)
   *
   * Before the inclusion, we need to define the following macros:
   *
   * NextChar
   * MatchText - to name of function wanted
   * do_like_escape - name of function if wanted - needs CHAREQ and CopyAdvChar
- * MATCH_LOWER - define iff using to_lower on text chars
+ * MATCH_LOWER - define for case (4), using to_lower on single-byte chars
   *
   * Copyright (c) 1996-2008, PostgreSQL Global Development Group
   *
   * IDENTIFICATION
- *     $PostgreSQL: pgsql/src/backend/utils/adt/like_match.c,v 1.20.2.3 2009/05/24 18:10:47 tgl Exp $
+ *     $PostgreSQL: pgsql/src/backend/utils/adt/like_match.c,v 1.20.2.4 2010/05/28 17:35:36 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -70,9 +70,9 @@
   */
  
  #ifdef MATCH_LOWER
-#define TCHAR(t) ((char) tolower((unsigned char) (t)))
+#define GETCHAR(t) ((char) tolower((unsigned char) (t)))
  #else
-#define TCHAR(t) (t)
+#define GETCHAR(t) (t)
  #endif
  
  static int
@@ -94,87 +94,82 @@ MatchText(char *t, int tlen, char *p, int plen)
         {
                 if (*p == '\\')
                 {
-                       /* Next byte must match literally, whatever it is */
+                       /* Next pattern byte must match literally, whatever it is */
                         NextByte(p, plen);
-                       if ((plen <= 0) || TCHAR(*p) != TCHAR(*t))
+                       if (plen <= 0 || GETCHAR(*p) != GETCHAR(*t))
                                 return LIKE_FALSE;
                 }
                 else if (*p == '%')
                 {
+                       char            firstpat;
+
                         /*
-                        * % processing is essentially a search for a match for what
-                        * follows the %, plus a recursive match of the remainder. We
-                        * succeed if and only if both conditions are met.
+                        * % processing is essentially a search for a text position at
+                        * which the remainder of the text matches the remainder of the
+                        * pattern, using a recursive call to check each potential match.
+                        *
+                        * If there are wildcards immediately following the %, we can skip
+                        * over them first, using the idea that any sequence of N _'s and
+                        * one or more %'s is equivalent to N _'s and one % (ie, it will
+                        * match any sequence of at least N text characters).  In this
+                        * way we will always run the recursive search loop using a
+                        * pattern fragment that begins with a literal character-to-match,
+                        * thereby not recursing more than we have to.
                          */
+                       NextByte(p, plen);
+
+                       while (plen > 0)
+                       {
+                               if (*p == '%')
+                                       NextByte(p, plen);
+                               else if (*p == '_')
+                               {
+                                       /* If not enough text left to match the pattern, ABORT */
+                                       if (tlen <= 0)
+                                               return LIKE_ABORT;
+                                       NextChar(t, tlen);
+                                       NextByte(p, plen);
+                               }
+                               else
+                                       break;          /* Reached a non-wildcard pattern char */
+                       }
  
-                       /* %% is the same as % according to the SQL standard */
-                       /* Advance past all %'s */
-                       while (plen > 0 && *p == '%')
-                               NextByte(p, plen);
-                       /* Trailing percent matches everything. */
+                       /*
+                        * If we're at end of pattern, match: we have a trailing % which
+                        * matches any remaining text string.
+                        */
                         if (plen <= 0)
                                 return LIKE_TRUE;
  
                         /*
                          * Otherwise, scan for a text position at which we can match the
-                        * rest of the pattern.
+                        * rest of the pattern.  The first remaining pattern char is known
+                        * to be a regular or escaped literal character, so we can compare
+                        * the first pattern byte to each text byte to avoid recursing
+                        * more than we have to.  This fact also guarantees that we don't
+                        * have to consider a match to the zero-length substring at the
+                        * end of the text.
                          */
-                       if (*p == '_')
+                       if (*p == '\\')
                         {
-                               /* %_ is the same as _% - avoid matching _ repeatedly */
+                               if (plen < 2)
+                                       return LIKE_FALSE; /* XXX should throw error */
+                               firstpat = GETCHAR(p[1]);
+                       }
+                       else
+                               firstpat = GETCHAR(*p);
  
-                               do
-                               {
-                                       NextChar(t, tlen);
-                                       NextByte(p, plen);
-                               } while (tlen > 0 && plen > 0 && *p == '_');
-
-                               /*
-                                * If we are at the end of the pattern, succeed: % followed
-                                * by n _'s matches any string of at least n characters, and
-                                * we have now found there are at least n characters.
-                                */
-                               if (plen <= 0)
-                                       return LIKE_TRUE;
-
-                               /* Look for a place that matches the rest of the pattern */
-                               while (tlen > 0)
+                       while (tlen > 0)
+                       {
+                               if (GETCHAR(*t) == firstpat)
                                 {
                                         int                     matched = MatchText(t, tlen, p, plen);
  
                                         if (matched != LIKE_FALSE)
-                                               return matched; /* TRUE or ABORT */
-
-                                       NextChar(t, tlen);
-                               }
-                       }
-                       else
-                       {
-                               char            firstpat = TCHAR(*p);
-
-                               if (*p == '\\')
-                               {
-                                       if (plen < 2)
-                                               return LIKE_FALSE;
-                                       firstpat = TCHAR(p[1]);
+                                               return matched;         /* TRUE or ABORT */
                                 }
  
-                               while (tlen > 0)
-                               {
-                                       /*
-                                        * Optimization to prevent most recursion: don't recurse
-                                        * unless first pattern byte matches first text byte.
-                                        */
-                                       if (TCHAR(*t) == firstpat)
-                                       {
-                                               int                     matched = MatchText(t, tlen, p, plen);
-
-                                               if (matched != LIKE_FALSE)
-                                                       return matched;         /* TRUE or ABORT */
-                                       }
-
-                                       NextChar(t, tlen);
-                               }
+                               NextChar(t, tlen);
                         }
  
                         /*
@@ -190,7 +185,7 @@ MatchText(char *t, int tlen, char *p, int plen)
                         NextByte(p, plen);
                         continue;
                 }
-               else if (TCHAR(*p) != TCHAR(*t))
+               else if (GETCHAR(*p) != GETCHAR(*t))
                 {
                         /* non-wildcard pattern char fails to match text char */
                         return LIKE_FALSE;
@@ -215,10 +210,12 @@ MatchText(char *t, int tlen, char *p, int plen)
         if (tlen > 0)
                 return LIKE_FALSE;              /* end of pattern, but not of text */
  
-       /* End of text string.  Do we have matching pattern remaining? */
-       while (plen > 0 && *p == '%')   /* allow multiple %'s at end of pattern */
+       /*
+        * End of text, but perhaps not of pattern.  Match iff the remaining
+        * pattern can match a zero-length string, ie, it's zero or more %'s.
+        */
+       while (plen > 0 && *p == '%')
                 NextByte(p, plen);
-
         if (plen <= 0)
                 return LIKE_TRUE;
  
@@ -342,8 +339,9 @@ do_like_escape(text *pat, text *esc)
  #undef do_like_escape
  #endif
  
-#undef TCHAR
+#undef GETCHAR
  
  #ifdef MATCH_LOWER
  #undef MATCH_LOWER
+
  #endif
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out

index ce37780a3aef47e8c09c29a9ee5042f78d15be38..abde8e921bd4d1998dd1c571d9a0b7635890117a 100644 (file)
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -827,7 +827,7 @@ SELECT 'Hawkeye' NOT ILIKE 'h%' AS "false";
  (1 row)
  
  --
--- test %/_ combination cases, cf bug #4821
+-- test %/_ combination cases, cf bugs #4821 and #5478
  --
  SELECT 'foo' LIKE '_%' as t, 'f' LIKE '_%' as t, '' LIKE '_%' as f;
   t | t | f 
@@ -853,6 +853,12 @@ SELECT 'foo' LIKE '%__' as t, 'foo' LIKE '%___' as t, 'foo' LIKE '%____' as f;
   t | t | f
  (1 row)
  
+SELECT 'jack' LIKE '%____%' AS t;
+ t 
+---
+ t
+(1 row)
+
  --
  -- test implicit type conversion
  --
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql

index 80344da57dc2a9e0eb799261e89269a8b6cf8559..0a51287fb75d6a2def54a3ea007aec49f01ea468 100644 (file)
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -253,7 +253,7 @@ SELECT 'Hawkeye' ILIKE 'h%' AS "true";
  SELECT 'Hawkeye' NOT ILIKE 'h%' AS "false";
  
  --
--- test %/_ combination cases, cf bug #4821
+-- test %/_ combination cases, cf bugs #4821 and #5478
  --
  
  SELECT 'foo' LIKE '_%' as t, 'f' LIKE '_%' as t, '' LIKE '_%' as f;
@@ -262,6 +262,8 @@ SELECT 'foo' LIKE '%_' as t, 'f' LIKE '%_' as t, '' LIKE '%_' as f;
  SELECT 'foo' LIKE '__%' as t, 'foo' LIKE '___%' as t, 'foo' LIKE '____%' as f;
  SELECT 'foo' LIKE '%__' as t, 'foo' LIKE '%___' as t, 'foo' LIKE '%____' as f;
  
+SELECT 'jack' LIKE '%____%' AS t;
+
  
  --
  -- test implicit type conversion
author	Tom Lane <tgl@sss.pgh.pa.us>
	Fri, 28 May 2010 17:35:36 +0000 (17:35 +0000)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Fri, 28 May 2010 17:35:36 +0000 (17:35 +0000)
src/backend/utils/adt/like_match.c		patch | blob | history
src/test/regress/expected/strings.out		patch | blob | history
src/test/regress/sql/strings.sql		patch | blob | history