granicus.if.org Git - postgresql/commitdiff
Allow multibyte characters as escape in SIMILAR TO and SUBSTRING.
author Jeff Davis <jdavis@postgresql.org>
Thu, 28 Aug 2014 04:07:36 +0000 (21:07 -0700)
committer Jeff Davis <jdavis@postgresql.org>
Thu, 28 Aug 2014 04:07:36 +0000 (21:07 -0700)
Previously, only a single-byte character was allowed as an
escape. This patch allows it to be a multi-byte character, though it
still must be a single character.

Reviewed by Heikki Linnakangas and Tom Lane.

src/backend/utils/adt/regexp.c

index caf45ef85f9717e989a19a21c1241c894b55d8ef..50b33f6b364528f6c70906b1ba299e18f60b992b 100644 (file)
@@ -688,11 +688,16 @@ similar_escape(PG_FUNCTION_ARGS)
                elen = VARSIZE_ANY_EXHDR(esc_text);
                if (elen == 0)
                        e = NULL;                       /* no escape character */
-               else if (elen != 1)
-                       ereport(ERROR,
-                                       (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
-                                        errmsg("invalid escape string"),
-                                 errhint("Escape string must be empty or one character.")));
+               else
+               {
+                       int                     escape_mblen = pg_mbstrlen_with_len(e, elen);
+
+                       if (escape_mblen > 1)
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
+                                                errmsg("invalid escape string"),
+                                                errhint("Escape string must be empty or one character.")));
+               }
        }
 
        /*----------
@@ -724,6 +729,54 @@ similar_escape(PG_FUNCTION_ARGS)
        {
                char            pchar = *p;
 
+               /*
+                * If both the escape character and the current character from the
+                * pattern are multi-byte, we need to take the slow path.
+                *
+                * But if one of them is single-byte, we can process the pattern one
+                * byte at a time, ignoring multi-byte characters.  (This works
+                * because all server-encodings have the property that a valid
+                * multi-byte character representation cannot contain the
+                * representation of a valid single-byte character.)
+                */
+
+               if (elen > 1)
+               {
+                       int mblen = pg_mblen(p);
+                       if (mblen > 1)
+                       {
+                               /* slow, multi-byte path */
+                               if (afterescape)
+                               {
+                                       *r++ = '\\';
+                                       memcpy(r, p, mblen);
+                                       r += mblen;
+                                       afterescape = false;
+                               }
+                               else if (e && elen == mblen && memcmp(e, p, mblen) == 0)
+                               {
+                                       /* SQL99 escape character; do not send to output */
+                                       afterescape = true;
+                               }
+                               else
+                               {
+                                       /*
+                                        * We know it's a multi-byte character, so we don't need
+                                        * to do all the comparisons to single-byte characters
+                                        * that we do below.
+                                        */
+                                       memcpy(r, p, mblen);
+                                       r += mblen;
+                               }
+
+                               p += mblen;
+                               plen -= mblen;
+
+                               continue;
+                       }
+               }
+
+               /* fast path */
                if (afterescape)
                {
                        if (pchar == '"' && !incharclass)       /* for SUBSTRING patterns */