Improve efficiency of LIKE/ILIKE code, especially for multi-byte charsets.

author Andrew Dunstan <andrew@dunslane.net>

Sat, 2 Jun 2007 02:03:42 +0000 (02:03 +0000)

committer Andrew Dunstan <andrew@dunslane.net>

Sat, 2 Jun 2007 02:03:42 +0000 (02:03 +0000)
author Andrew Dunstan <andrew@dunslane.net>
Sat, 2 Jun 2007 02:03:42 +0000 (02:03 +0000)
committer Andrew Dunstan <andrew@dunslane.net>
Sat, 2 Jun 2007 02:03:42 +0000 (02:03 +0000)
diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c

index 46f223b38ffd1519b941ddd4d1254d5a6532eab4..de5d7e7c8599344588a15ab3d7425745892d4b31 100644 (file)
--- a/src/backend/utils/adt/like.c
+++ b/src/backend/utils/adt/like.c
@@ -11,7 +11,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *     $PostgreSQL: pgsql/src/backend/utils/adt/like.c,v 1.68 2007/02/27 23:48:08 tgl Exp $
+ *     $PostgreSQL: pgsql/src/backend/utils/adt/like.c,v 1.69 2007/06/02 02:03:42 adunstan Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -28,21 +28,23 @@
  #define LIKE_ABORT                                             (-1)
  
  
-static int     MatchText(char *t, int tlen, char *p, int plen);
-static int     MatchTextIC(char *t, int tlen, char *p, int plen);
-static int     MatchBytea(char *t, int tlen, char *p, int plen);
-static text *do_like_escape(text *, text *);
+static int     SB_MatchText(char *t, int tlen, char *p, int plen);
+static text *SB_do_like_escape(text *, text *);
  
-static int     MBMatchText(char *t, int tlen, char *p, int plen);
-static int     MBMatchTextIC(char *t, int tlen, char *p, int plen);
+static int     MB_MatchText(char *t, int tlen, char *p, int plen);
  static text *MB_do_like_escape(text *, text *);
  
+static int     UTF8_MatchText(char *t, int tlen, char *p, int plen);
+
+static int     GenericMatchText(char *s, int slen, char* p, int plen);
+static int     Generic_Text_IC_like(text *str, text *pat);
+
  /*--------------------
   * Support routine for MatchText. Compares given multibyte streams
   * as wide characters. If they match, returns 1 otherwise returns 0.
   *--------------------
   */
-static int
+static inline int
  wchareq(char *p1, char *p2)
  {
         int                     p1_len;
@@ -72,15 +74,12 @@ wchareq(char *p1, char *p2)
   * of getting a single character transformed to the system's wchar_t format.
   * So now, we just downcase the strings using lower() and apply regular LIKE
   * comparison. This should be revisited when we install better locale support.
- *
- * Note that MBMatchText and MBMatchTextIC do exactly the same thing now.
- * Is it worth refactoring to avoid duplicated code?  They might become
- * different again in the future.
   */
  
+#define NextByte(p, plen)      ((p)++, (plen)--)
+
  /* Set up to compile like_match.c for multibyte characters */
-#define CHAREQ(p1, p2) wchareq(p1, p2)
-#define ICHAREQ(p1, p2) wchareq(p1, p2)
+#define CHAREQ(p1, p2) wchareq((p1), (p2))
  #define NextChar(p, plen) \
         do { int __l = pg_mblen(p); (p) +=__l; (plen) -=__l; } while (0)
  #define CopyAdvChar(dst, src, srclen) \
@@ -90,33 +89,59 @@ wchareq(char *p1, char *p2)
                          *(dst)++ = *(src)++; \
            } while (0)
  
-#define MatchText      MBMatchText
-#define MatchTextIC MBMatchTextIC
+#define MatchText      MB_MatchText
  #define do_like_escape MB_do_like_escape
  
  #include "like_match.c"
  
-#undef CHAREQ
-#undef ICHAREQ
-#undef NextChar
-#undef CopyAdvChar
-#undef MatchText
-#undef MatchTextIC
-#undef do_like_escape
-
  /* Set up to compile like_match.c for single-byte characters */
  #define CHAREQ(p1, p2) (*(p1) == *(p2))
-#define ICHAREQ(p1, p2) (tolower((unsigned char) *(p1)) == tolower((unsigned char) *(p2)))
-#define NextChar(p, plen) ((p)++, (plen)--)
+#define NextChar(p, plen) NextByte((p), (plen))
  #define CopyAdvChar(dst, src, srclen) (*(dst)++ = *(src)++, (srclen)--)
  
+#define MatchText      SB_MatchText
+#define do_like_escape SB_do_like_escape
+
+#include "like_match.c"
+
+
+/* Set up to compile like_match.c for UTF8 encoding, using fast NextChar */
+
+#define NextChar(p, plen) \
+       do { (p)++; (plen)--; } while ((plen) > 0 && (*(p) & 0xC0) == 0x80 ) 
+#define MatchText      UTF8_MatchText
+
  #include "like_match.c"
  
-/* And some support for BYTEA */
-#define BYTEA_CHAREQ(p1, p2) (*(p1) == *(p2))
-#define BYTEA_NextChar(p, plen) ((p)++, (plen)--)
-#define BYTEA_CopyAdvChar(dst, src, srclen) (*(dst)++ = *(src)++, (srclen)--)
+static inline int
+GenericMatchText(char *s, int slen, char* p, int plen)
+{
+       if (pg_database_encoding_max_length() == 1)
+               return SB_MatchText(s, slen, p, plen);
+       else if (GetDatabaseEncoding() == PG_UTF8)
+               return UTF8_MatchText(s, slen, p, plen);
+       else
+               return MB_MatchText(s, slen, p, plen);
+}
+
+static inline int
+Generic_Text_IC_like(text *str, text *pat)
+{
+       char       *s,
+                          *p;
+       int                     slen,
+                               plen;
+
+       /* Force inputs to lower case to achieve case insensitivity */
+       str = DatumGetTextP(DirectFunctionCall1(lower, PointerGetDatum(str)));
+       pat = DatumGetTextP(DirectFunctionCall1(lower, PointerGetDatum(pat)));
+       s = VARDATA(str);
+       slen = (VARSIZE(str) - VARHDRSZ);
+       p = VARDATA(pat);
+       plen = (VARSIZE(pat) - VARHDRSZ);
  
+       return GenericMatchText(s, slen, p, plen);
+}
  
  /*
   *     interface routines called by the function manager
@@ -138,10 +163,7 @@ namelike(PG_FUNCTION_ARGS)
         p = VARDATA(pat);
         plen = (VARSIZE(pat) - VARHDRSZ);
  
-       if (pg_database_encoding_max_length() == 1)
-               result = (MatchText(s, slen, p, plen) == LIKE_TRUE);
-       else
-               result = (MBMatchText(s, slen, p, plen) == LIKE_TRUE);
+       result = (GenericMatchText(s, slen, p, plen) == LIKE_TRUE);
  
         PG_RETURN_BOOL(result);
  }
@@ -162,10 +184,7 @@ namenlike(PG_FUNCTION_ARGS)
         p = VARDATA(pat);
         plen = (VARSIZE(pat) - VARHDRSZ);
  
-       if (pg_database_encoding_max_length() == 1)
-               result = (MatchText(s, slen, p, plen) != LIKE_TRUE);
-       else
-               result = (MBMatchText(s, slen, p, plen) != LIKE_TRUE);
+       result = (GenericMatchText(s, slen, p, plen) != LIKE_TRUE);
  
         PG_RETURN_BOOL(result);
  }
@@ -186,10 +205,7 @@ textlike(PG_FUNCTION_ARGS)
         p = VARDATA(pat);
         plen = (VARSIZE(pat) - VARHDRSZ);
  
-       if (pg_database_encoding_max_length() == 1)
-               result = (MatchText(s, slen, p, plen) == LIKE_TRUE);
-       else
-               result = (MBMatchText(s, slen, p, plen) == LIKE_TRUE);
+       result = (GenericMatchText(s, slen, p, plen) == LIKE_TRUE);
  
         PG_RETURN_BOOL(result);
  }
@@ -210,10 +226,7 @@ textnlike(PG_FUNCTION_ARGS)
         p = VARDATA(pat);
         plen = (VARSIZE(pat) - VARHDRSZ);
  
-       if (pg_database_encoding_max_length() == 1)
-               result = (MatchText(s, slen, p, plen) != LIKE_TRUE);
-       else
-               result = (MBMatchText(s, slen, p, plen) != LIKE_TRUE);
+       result = (GenericMatchText(s, slen, p, plen) != LIKE_TRUE);
  
         PG_RETURN_BOOL(result);
  }
@@ -234,7 +247,7 @@ bytealike(PG_FUNCTION_ARGS)
         p = VARDATA(pat);
         plen = (VARSIZE(pat) - VARHDRSZ);
  
-       result = (MatchBytea(s, slen, p, plen) == LIKE_TRUE);
+       result = (SB_MatchText(s, slen, p, plen) == LIKE_TRUE);
  
         PG_RETURN_BOOL(result);
  }
@@ -255,7 +268,7 @@ byteanlike(PG_FUNCTION_ARGS)
         p = VARDATA(pat);
         plen = (VARSIZE(pat) - VARHDRSZ);
  
-       result = (MatchBytea(s, slen, p, plen) != LIKE_TRUE);
+       result = (SB_MatchText(s, slen, p, plen) != LIKE_TRUE);
  
         PG_RETURN_BOOL(result);
  }
@@ -270,37 +283,11 @@ nameiclike(PG_FUNCTION_ARGS)
         Name            str = PG_GETARG_NAME(0);
         text       *pat = PG_GETARG_TEXT_P(1);
         bool            result;
-       char       *s,
-                          *p;
-       int                     slen,
-                               plen;
-
-       if (pg_database_encoding_max_length() == 1)
-       {
-               s = NameStr(*str);
-               slen = strlen(s);
-               p = VARDATA(pat);
-               plen = (VARSIZE(pat) - VARHDRSZ);
-               result = (MatchTextIC(s, slen, p, plen) == LIKE_TRUE);
-       }
-       else
-       {
-               /* Force inputs to lower case to achieve case insensitivity */
-               text       *strtext;
+       text       *strtext;
  
-               strtext = DatumGetTextP(DirectFunctionCall1(name_text,
+       strtext = DatumGetTextP(DirectFunctionCall1(name_text,
                                                                                                         NameGetDatum(str)));
-               strtext = DatumGetTextP(DirectFunctionCall1(lower,
-                                                                                                 PointerGetDatum(strtext)));
-               pat = DatumGetTextP(DirectFunctionCall1(lower,
-                                                                                               PointerGetDatum(pat)));
-
-               s = VARDATA(strtext);
-               slen = (VARSIZE(strtext) - VARHDRSZ);
-               p = VARDATA(pat);
-               plen = (VARSIZE(pat) - VARHDRSZ);
-               result = (MBMatchTextIC(s, slen, p, plen) == LIKE_TRUE);
-       }
+       result = (Generic_Text_IC_like(strtext, pat) == LIKE_TRUE);
  
         PG_RETURN_BOOL(result);
  }
@@ -311,37 +298,11 @@ nameicnlike(PG_FUNCTION_ARGS)
         Name            str = PG_GETARG_NAME(0);
         text       *pat = PG_GETARG_TEXT_P(1);
         bool            result;
-       char       *s,
-                          *p;
-       int                     slen,
-                               plen;
-
-       if (pg_database_encoding_max_length() == 1)
-       {
-               s = NameStr(*str);
-               slen = strlen(s);
-               p = VARDATA(pat);
-               plen = (VARSIZE(pat) - VARHDRSZ);
-               result = (MatchTextIC(s, slen, p, plen) != LIKE_TRUE);
-       }
-       else
-       {
-               /* Force inputs to lower case to achieve case insensitivity */
-               text       *strtext;
+       text       *strtext;
  
-               strtext = DatumGetTextP(DirectFunctionCall1(name_text,
+       strtext = DatumGetTextP(DirectFunctionCall1(name_text,
                                                                                                         NameGetDatum(str)));
-               strtext = DatumGetTextP(DirectFunctionCall1(lower,
-                                                                                                 PointerGetDatum(strtext)));
-               pat = DatumGetTextP(DirectFunctionCall1(lower,
-                                                                                               PointerGetDatum(pat)));
-
-               s = VARDATA(strtext);
-               slen = (VARSIZE(strtext) - VARHDRSZ);
-               p = VARDATA(pat);
-               plen = (VARSIZE(pat) - VARHDRSZ);
-               result = (MBMatchTextIC(s, slen, p, plen) != LIKE_TRUE);
-       }
+       result = (Generic_Text_IC_like(strtext, pat) != LIKE_TRUE);
  
         PG_RETURN_BOOL(result);
  }
@@ -352,32 +313,8 @@ texticlike(PG_FUNCTION_ARGS)
         text       *str = PG_GETARG_TEXT_P(0);
         text       *pat = PG_GETARG_TEXT_P(1);
         bool            result;
-       char       *s,
-                          *p;
-       int                     slen,
-                               plen;
  
-       if (pg_database_encoding_max_length() == 1)
-       {
-               s = VARDATA(str);
-               slen = (VARSIZE(str) - VARHDRSZ);
-               p = VARDATA(pat);
-               plen = (VARSIZE(pat) - VARHDRSZ);
-               result = (MatchTextIC(s, slen, p, plen) == LIKE_TRUE);
-       }
-       else
-       {
-               /* Force inputs to lower case to achieve case insensitivity */
-               str = DatumGetTextP(DirectFunctionCall1(lower,
-                                                                                               PointerGetDatum(str)));
-               pat = DatumGetTextP(DirectFunctionCall1(lower,
-                                                                                               PointerGetDatum(pat)));
-               s = VARDATA(str);
-               slen = (VARSIZE(str) - VARHDRSZ);
-               p = VARDATA(pat);
-               plen = (VARSIZE(pat) - VARHDRSZ);
-               result = (MBMatchTextIC(s, slen, p, plen) == LIKE_TRUE);
-       }
+       result = (Generic_Text_IC_like(str, pat) == LIKE_TRUE);
  
         PG_RETURN_BOOL(result);
  }
@@ -388,32 +325,8 @@ texticnlike(PG_FUNCTION_ARGS)
         text       *str = PG_GETARG_TEXT_P(0);
         text       *pat = PG_GETARG_TEXT_P(1);
         bool            result;
-       char       *s,
-                          *p;
-       int                     slen,
-                               plen;
  
-       if (pg_database_encoding_max_length() == 1)
-       {
-               s = VARDATA(str);
-               slen = (VARSIZE(str) - VARHDRSZ);
-               p = VARDATA(pat);
-               plen = (VARSIZE(pat) - VARHDRSZ);
-               result = (MatchTextIC(s, slen, p, plen) != LIKE_TRUE);
-       }
-       else
-       {
-               /* Force inputs to lower case to achieve case insensitivity */
-               str = DatumGetTextP(DirectFunctionCall1(lower,
-                                                                                               PointerGetDatum(str)));
-               pat = DatumGetTextP(DirectFunctionCall1(lower,
-                                                                                               PointerGetDatum(pat)));
-               s = VARDATA(str);
-               slen = (VARSIZE(str) - VARHDRSZ);
-               p = VARDATA(pat);
-               plen = (VARSIZE(pat) - VARHDRSZ);
-               result = (MBMatchTextIC(s, slen, p, plen) != LIKE_TRUE);
-       }
+       result = (Generic_Text_IC_like(str, pat) != LIKE_TRUE);
  
         PG_RETURN_BOOL(result);
  }
@@ -430,7 +343,7 @@ like_escape(PG_FUNCTION_ARGS)
         text       *result;
  
         if (pg_database_encoding_max_length() == 1)
-               result = do_like_escape(pat, esc);
+               result = SB_do_like_escape(pat, esc);
         else
                 result = MB_do_like_escape(pat, esc);
  
@@ -446,179 +359,8 @@ like_escape_bytea(PG_FUNCTION_ARGS)
  {
         bytea      *pat = PG_GETARG_BYTEA_P(0);
         bytea      *esc = PG_GETARG_BYTEA_P(1);
-       bytea      *result;
-       char       *p,
-                          *e,
-                          *r;
-       int                     plen,
-                               elen;
-       bool            afterescape;
-
-       p = VARDATA(pat);
-       plen = (VARSIZE(pat) - VARHDRSZ);
-       e = VARDATA(esc);
-       elen = (VARSIZE(esc) - VARHDRSZ);
-
-       /*
-        * Worst-case pattern growth is 2x --- unlikely, but it's hardly worth
-        * trying to calculate the size more accurately than that.
-        */
-       result = (text *) palloc(plen * 2 + VARHDRSZ);
-       r = VARDATA(result);
-
-       if (elen == 0)
-       {
-               /*
-                * No escape character is wanted.  Double any backslashes in the
-                * pattern to make them act like ordinary characters.
-                */
-               while (plen > 0)
-               {
-                       if (*p == '\\')
-                               *r++ = '\\';
-                       BYTEA_CopyAdvChar(r, p, plen);
-               }
-       }
-       else
-       {
-               /*
-                * The specified escape must be only a single character.
-                */
-               BYTEA_NextChar(e, elen);
-               if (elen != 0)
-                       ereport(ERROR,
-                                       (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
-                                        errmsg("invalid escape string"),
-                                 errhint("Escape string must be empty or one character.")));
-
-               e = VARDATA(esc);
-
-               /*
-                * If specified escape is '\', just copy the pattern as-is.
-                */
-               if (*e == '\\')
-               {
-                       memcpy(result, pat, VARSIZE(pat));
-                       PG_RETURN_BYTEA_P(result);
-               }
-
-               /*
-                * Otherwise, convert occurrences of the specified escape character to
-                * '\', and double occurrences of '\' --- unless they immediately
-                * follow an escape character!
-                */
-               afterescape = false;
-               while (plen > 0)
-               {
-                       if (BYTEA_CHAREQ(p, e) && !afterescape)
-                       {
-                               *r++ = '\\';
-                               BYTEA_NextChar(p, plen);
-                               afterescape = true;
-                       }
-                       else if (*p == '\\')
-                       {
-                               *r++ = '\\';
-                               if (!afterescape)
-                                       *r++ = '\\';
-                               BYTEA_NextChar(p, plen);
-                               afterescape = false;
-                       }
-                       else
-                       {
-                               BYTEA_CopyAdvChar(r, p, plen);
-                               afterescape = false;
-                       }
-               }
-       }
-
-       SET_VARSIZE(result, r - ((char *) result));
+       bytea      *result = SB_do_like_escape((text *)pat, (text *)esc);
  
-       PG_RETURN_BYTEA_P(result);
+       PG_RETURN_BYTEA_P((bytea *)result);
  }
  
-/*
- * Same as above, but specifically for bytea (binary) datatype
- */
-static int
-MatchBytea(char *t, int tlen, char *p, int plen)
-{
-       /* Fast path for match-everything pattern */
-       if ((plen == 1) && (*p == '%'))
-               return LIKE_TRUE;
-
-       while ((tlen > 0) && (plen > 0))
-       {
-               if (*p == '\\')
-               {
-                       /* Next pattern char must match literally, whatever it is */
-                       BYTEA_NextChar(p, plen);
-                       if ((plen <= 0) || !BYTEA_CHAREQ(t, p))
-                               return LIKE_FALSE;
-               }
-               else if (*p == '%')
-               {
-                       /* %% is the same as % according to the SQL standard */
-                       /* Advance past all %'s */
-                       while ((plen > 0) && (*p == '%'))
-                               BYTEA_NextChar(p, plen);
-                       /* Trailing percent matches everything. */
-                       if (plen <= 0)
-                               return LIKE_TRUE;
-
-                       /*
-                        * Otherwise, scan for a text position at which we can match the
-                        * rest of the pattern.
-                        */
-                       while (tlen > 0)
-                       {
-                               /*
-                                * Optimization to prevent most recursion: don't recurse
-                                * unless first pattern char might match this text char.
-                                */
-                               if (BYTEA_CHAREQ(t, p) || (*p == '\\') || (*p == '_'))
-                               {
-                                       int                     matched = MatchBytea(t, tlen, p, plen);
-
-                                       if (matched != LIKE_FALSE)
-                                               return matched; /* TRUE or ABORT */
-                               }
-
-                               BYTEA_NextChar(t, tlen);
-                       }
-
-                       /*
-                        * End of text with no match, so no point in trying later places
-                        * to start matching this pattern.
-                        */
-                       return LIKE_ABORT;
-               }
-               else if ((*p != '_') && !BYTEA_CHAREQ(t, p))
-               {
-                       /*
-                        * Not the single-character wildcard and no explicit match? Then
-                        * time to quit...
-                        */
-                       return LIKE_FALSE;
-               }
-
-               BYTEA_NextChar(t, tlen);
-               BYTEA_NextChar(p, plen);
-       }
-
-       if (tlen > 0)
-               return LIKE_FALSE;              /* end of pattern, but not of text */
-
-       /* End of input string.  Do we have matching pattern remaining? */
-       while ((plen > 0) && (*p == '%'))       /* allow multiple %'s at end of
-                                                                                * pattern */
-               BYTEA_NextChar(p, plen);
-       if (plen <= 0)
-               return LIKE_TRUE;
-
-       /*
-        * End of text with no match, so no point in trying later places to start
-        * matching this pattern.
-        */
-       return LIKE_ABORT;
-}      /* MatchBytea() */
diff --git a/src/backend/utils/adt/like_match.c b/src/backend/utils/adt/like_match.c

index 22e2705fb368f7f51cb0b81806d5d53f67c9488b..62f8bc40a15e815af80bb668dca629b5ff0f61ea 100644 (file)
--- a/src/backend/utils/adt/like_match.c
+++ b/src/backend/utils/adt/like_match.c
@@ -3,23 +3,21 @@
   * like_match.c
   *       like expression handling internal code.
   *
- * This file is included by like.c *twice*, to provide an optimization
- * for single-byte encodings.
+ * This file is included by like.c three times, to provide matching code for
+ * single-byte encodings, UTF8, and for other multi-byte encodings.
+ * UTF8 is a special case because we can use a much more efficient version
+ * of NextChar than can be used for other multi-byte encodings.
   *
   * Before the inclusion, we need to define following macros:
   *
- * CHAREQ
- * ICHAREQ
- * NextChar
- * CopyAdvChar
- * MatchText (MBMatchText)
- * MatchTextIC (MBMatchTextIC)
- * do_like_escape (MB_do_like_escape)
+ * NextChar 
+ * MatchText - name of function wanted
+ * do_like_escape - name of function if wanted - needs CHAREQ and CopyAdvChar
   *
   * Copyright (c) 1996-2007, PostgreSQL Global Development Group
   *
   * IDENTIFICATION
- *     $PostgreSQL: pgsql/src/backend/utils/adt/like_match.c,v 1.15 2007/02/27 23:48:08 tgl Exp $
+ *     $PostgreSQL: pgsql/src/backend/utils/adt/like_match.c,v 1.16 2007/06/02 02:03:42 adunstan Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -77,21 +75,36 @@ MatchText(char *t, int tlen, char *p, int plen)
         if ((plen == 1) && (*p == '%'))
                 return LIKE_TRUE;
  
+       /*
+        * In this loop, we advance by char when matching wildcards (and thus
+        * on recursive entry to this function we are properly char-synced). On
+        * other occasions it is safe to advance by byte, as the text and pattern
+        * will be in lockstep. This allows us to perform all comparisons between
+        * the text and pattern on a byte by byte basis, even for multi-byte
+        * encodings.
+        */
+
         while ((tlen > 0) && (plen > 0))
         {
                 if (*p == '\\')
                 {
-                       /* Next pattern char must match literally, whatever it is */
-                       NextChar(p, plen);
-                       if ((plen <= 0) || !CHAREQ(t, p))
+                       /* Next byte must match literally, whatever it is */
+                       NextByte(p, plen);
+                       if ((plen <= 0) || *p != *t )
                                 return LIKE_FALSE;
                 }
                 else if (*p == '%')
                 {
+                       /*
+                        * % processing is essentially a search for a match for what 
+                        * follows the %, plus a recursive match of the remainder.
+                        * We succeed if and only if both conditions are met.
+                        */
+
                         /* %% is the same as % according to the SQL standard */
                         /* Advance past all %'s */
                         while ((plen > 0) && (*p == '%'))
-                               NextChar(p, plen);
+                               NextByte(p, plen);
                         /* Trailing percent matches everything. */
                         if (plen <= 0)
                                 return LIKE_TRUE;
@@ -100,107 +113,62 @@ MatchText(char *t, int tlen, char *p, int plen)
                          * Otherwise, scan for a text position at which we can match the
                          * rest of the pattern.
                          */
-                       while (tlen > 0)
-                       {
-                               /*
-                                * Optimization to prevent most recursion: don't recurse
-                                * unless first pattern char might match this text char.
-                                */
-                               if (CHAREQ(t, p) || (*p == '\\') || (*p == '_'))
-                               {
-                                       int                     matched = MatchText(t, tlen, p, plen);
+                       if (*p == '_')
  
-                                       if (matched != LIKE_FALSE)
-                                               return matched; /* TRUE or ABORT */
-                               }
+                       {
+                               /* %_ is the same as _% - avoid matching _ repeatedly */
  
                                 NextChar(t, tlen);
-                       }
+                               NextByte(p, plen);
  
-                       /*
-                        * End of text with no match, so no point in trying later places
-                        * to start matching this pattern.
-                        */
-                       return LIKE_ABORT;
-               }
-               else if ((*p != '_') && !CHAREQ(t, p))
-               {
-                       /*
-                        * Not the single-character wildcard and no explicit match? Then
-                        * time to quit...
-                        */
-                       return LIKE_FALSE;
-               }
-
-               NextChar(t, tlen);
-               NextChar(p, plen);
-       }
-
-       if (tlen > 0)
-               return LIKE_FALSE;              /* end of pattern, but not of text */
+                               if (tlen <= 0)
+                               {
+                                       return (plen <= 0) ? LIKE_TRUE : LIKE_ABORT;
+                               }
+                               else if (plen <= 0)
+                               {
+                                       return LIKE_FALSE;
+                               }
  
-       /* End of input string.  Do we have matching pattern remaining? */
-       while ((plen > 0) && (*p == '%'))       /* allow multiple %'s at end of
-                                                                                * pattern */
-               NextChar(p, plen);
-       if (plen <= 0)
-               return LIKE_TRUE;
+                               while (tlen > 0)
+                               {
+                                       int                     matched = MatchText(t, tlen, p, plen);
+                                               
+                                       if (matched != LIKE_FALSE)
+                                                       return matched; /* TRUE or ABORT */
  
-       /*
-        * End of text with no match, so no point in trying later places to start
-        * matching this pattern.
-        */
-       return LIKE_ABORT;
-}      /* MatchText() */
+                                       NextChar(t, tlen);
+                               }
+                       }
+                       else
+                       {
  
-/*
- * Same as above, but ignore case
- */
-static int
-MatchTextIC(char *t, int tlen, char *p, int plen)
-{
-       /* Fast path for match-everything pattern */
-       if ((plen == 1) && (*p == '%'))
-               return LIKE_TRUE;
+                               char firstpat = *p ;
  
-       while ((tlen > 0) && (plen > 0))
-       {
-               if (*p == '\\')
-               {
-                       /* Next pattern char must match literally, whatever it is */
-                       NextChar(p, plen);
-                       if ((plen <= 0) || !ICHAREQ(t, p))
-                               return LIKE_FALSE;
-               }
-               else if (*p == '%')
-               {
-                       /* %% is the same as % according to the SQL standard */
-                       /* Advance past all %'s */
-                       while ((plen > 0) && (*p == '%'))
-                               NextChar(p, plen);
-                       /* Trailing percent matches everything. */
-                       if (plen <= 0)
-                               return LIKE_TRUE;
+                               if (*p == '\\')
+                               {
+                                       if (plen < 2)
+                                               return LIKE_FALSE;
+                                       firstpat = p[1];
+                               }
  
-                       /*
-                        * Otherwise, scan for a text position at which we can match the
-                        * rest of the pattern.
-                        */
-                       while (tlen > 0)
-                       {
-                               /*
-                                * Optimization to prevent most recursion: don't recurse
-                                * unless first pattern char might match this text char.
-                                */
-                               if (ICHAREQ(t, p) || (*p == '\\') || (*p == '_'))
+                               while (tlen > 0)
                                 {
-                                       int                     matched = MatchTextIC(t, tlen, p, plen);
+                                       /*
+                                        * Optimization to prevent most recursion: don't recurse
+                                        * unless first pattern byte matches first text byte.
+                                        */
+                                       if (*t == firstpat)
+                                       {
+                                               int                     matched = MatchText(t, tlen, p, plen);
+                                               
+                                               if (matched != LIKE_FALSE)
+                                                       return matched; /* TRUE or ABORT */
+                                       }
+
+                                       NextChar(t, tlen);
  
-                                       if (matched != LIKE_FALSE)
-                                               return matched; /* TRUE or ABORT */
                                 }
-
-                               NextChar(t, tlen);
                         }
  
                         /*
@@ -209,7 +177,13 @@ MatchTextIC(char *t, int tlen, char *p, int plen)
                          */
                         return LIKE_ABORT;
                 }
-               else if ((*p != '_') && !ICHAREQ(t, p))
+               else if (*p == '_')
+               {
+                       NextChar(t, tlen);
+                       NextByte(p, plen);
+                       continue;
+               }
+               else if (*t != *p)
                 {
                         /*
                          * Not the single-character wildcard and no explicit match? Then
@@ -217,9 +191,20 @@ MatchTextIC(char *t, int tlen, char *p, int plen)
                          */
                         return LIKE_FALSE;
                 }
-
-               NextChar(t, tlen);
-               NextChar(p, plen);
+               /*
+                * It is safe to use NextByte instead of NextChar here, even for
+                * multi-byte character sets, because we are not following 
+                * immediately after a wildcard character.
+                * If we are in the middle of a multibyte character, we must 
+                * already have matched at least one byte of the character from 
+                * both text and pattern; so we cannot get out-of-sync
+                * on character boundaries.  And we know that no backend-legal 
+                * encoding allows ASCII characters such as '%' to appear as 
+                * non-first bytes of characters, so we won't mistakenly detect 
+                * a new wildcard.
+                */
+               NextByte(t, tlen);
+               NextByte(p, plen);
         }
  
         if (tlen > 0)
@@ -228,7 +213,8 @@ MatchTextIC(char *t, int tlen, char *p, int plen)
         /* End of input string.  Do we have matching pattern remaining? */
         while ((plen > 0) && (*p == '%'))       /* allow multiple %'s at end of
                                                                                  * pattern */
-               NextChar(p, plen);
+               NextByte(p, plen);
+
         if (plen <= 0)
                 return LIKE_TRUE;
  
@@ -237,12 +223,14 @@ MatchTextIC(char *t, int tlen, char *p, int plen)
          * matching this pattern.
          */
         return LIKE_ABORT;
-}      /* MatchTextIC() */
+}      /* MatchText() */
  
  /*
   * like_escape() --- given a pattern and an ESCAPE string,
   * convert the pattern to use Postgres' standard backslash escape convention.
   */
+#ifdef do_like_escape
+
  static text *
  do_like_escape(text *pat, text *esc)
  {
@@ -336,3 +324,17 @@ do_like_escape(text *pat, text *esc)
  
         return result;
  }
+#endif /* do_like_escape */
+
+#ifdef CHAREQ
+#undef CHAREQ
+#endif
+
+#undef NextChar
+#undef CopyAdvChar
+#undef MatchText
+
+#ifdef do_like_escape
+#undef do_like_escape
+#endif
+
author	Andrew Dunstan <andrew@dunslane.net>
	Sat, 2 Jun 2007 02:03:42 +0000 (02:03 +0000)
committer	Andrew Dunstan <andrew@dunslane.net>
	Sat, 2 Jun 2007 02:03:42 +0000 (02:03 +0000)
src/backend/utils/adt/like.c		patch | blob | history
src/backend/utils/adt/like_match.c		patch | blob | history