From d6061d2f31187795790e7e63d5d9730f9d78e7ea Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Wed, 3 Jan 2007 22:39:26 +0000
Subject: [PATCH] Fix regex_fixed_prefix() to cope reasonably well with regex
 patterns of the form '^(foo)$'.  Before, these could never be optimized into
 indexscans. The recent changes to make psql and pg_dump generate such
 patterns (for \d commands and -t and related switches, respectively)
 therefore represented a big performance hit for people with large pg_class
 catalogs, as seen in recent gripe from Erik Jones.  While at it, be more
 paranoid about case-sensitivity checking in multibyte encodings, and fix some
 other corner cases in which a regex might be interpreted too liberally.

---
 src/backend/utils/adt/regexp.c   |  11 ++-
 src/backend/utils/adt/selfuncs.c | 137 +++++++++++++++++++++----------
 src/include/utils/builtins.h     |   3 +-
 3 files changed, 105 insertions(+), 46 deletions(-)

diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c
index d249772064..a477d83320 100644
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.66 2006/10/04 00:29:59 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.67 2007/01/03 22:39:26 tgl Exp $
  *
  *		Alistair Crooks added the code for the regex caching
  *		agc - cached the regular expressions used - there's a good chance
@@ -624,3 +624,12 @@ similar_escape(PG_FUNCTION_ARGS)
 
 	PG_RETURN_TEXT_P(result);
 }
+
+/*
+ * report whether regex_flavor is currently BASIC
+ */
+bool
+regex_flavor_is_basic(void)
+{
+	return (regex_flavor == REG_BASIC);
+}
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 73e82cd5e0..0f934ef9d7 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -15,7 +15,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.216 2006/12/23 00:43:11 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.217 2007/01/03 22:39:26 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -3805,7 +3805,10 @@ get_variable_maximum(PlannerInfo *root, VariableStatData *vardata,
  * These routines support analysis of LIKE and regular-expression patterns
  * by the planner/optimizer.  It's important that they agree with the
  * regular-expression code in backend/regex/ and the LIKE code in
- * backend/utils/adt/like.c.
+ * backend/utils/adt/like.c.  Also, the computation of the fixed prefix
+ * must be conservative: if we report a string longer than the true fixed
+ * prefix, the query may produce actually wrong answers, rather than just
+ * getting a bad selectivity estimate!
  *
  * Note that the prefix-analysis functions are called from
  * backend/optimizer/path/indxpath.c as well as from routines in this file.
@@ -3837,6 +3840,7 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive,
 	Oid			typeid = patt_const->consttype;
 	int			pos,
 				match_pos;
+	bool		is_multibyte = (pg_database_encoding_max_length() > 1);
 
 	/* the right-hand const is type text or bytea */
 	Assert(typeid == BYTEAOID || typeid == TEXTOID);
@@ -3880,11 +3884,16 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive,
 		}
 
 		/*
-		 * XXX I suspect isalpha() is not an adequately locale-sensitive test
-		 * for characters that can vary under case folding?
+		 * XXX In multibyte character sets, we can't trust isalpha, so assume
+		 * any multibyte char is potentially case-varying.
 		 */
-		if (case_insensitive && isalpha((unsigned char) patt[pos]))
-			break;
+		if (case_insensitive)
+		{
+			if (is_multibyte && (unsigned char) patt[pos] >= 0x80)
+				break;
+			if (isalpha((unsigned char) patt[pos]))
+				break;
+		}
 
 		/*
 		 * NOTE: this code used to think that %% meant a literal %, but
@@ -3929,11 +3938,13 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
 	int			pos,
 				match_pos,
 				prev_pos,
-				prev_match_pos,
-				paren_depth;
+				prev_match_pos;
+	bool		have_leading_paren;
 	char	   *patt;
 	char	   *rest;
 	Oid			typeid = patt_const->consttype;
+	bool		is_basic = regex_flavor_is_basic();
+	bool		is_multibyte = (pg_database_encoding_max_length() > 1);
 
 	/*
 	 * Should be unnecessary, there are no bytea regex operators defined. As
@@ -3948,8 +3959,19 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
 	/* the right-hand const is type text for all of these */
 	patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
 
+	/*
+	 * Check for ARE director prefix.  It's worth our trouble to recognize
+	 * this because similar_escape() uses it.
+	 */
+	pos = 0;
+	if (strncmp(patt, "***:", 4) == 0)
+	{
+		pos = 4;
+		is_basic = false;
+	}
+
 	/* Pattern must be anchored left */
-	if (patt[0] != '^')
+	if (patt[pos] != '^')
 	{
 		rest = patt;
 
@@ -3958,72 +3980,86 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
 
 		return Pattern_Prefix_None;
 	}
+	pos++;
 
 	/*
-	 * If unquoted | is present at paren level 0 in pattern, then there are
-	 * multiple alternatives for the start of the string.
+	 * If '|' is present in pattern, then there may be multiple alternatives
+	 * for the start of the string.  (There are cases where this isn't so,
+	 * for instance if the '|' is inside parens, but detecting that reliably
+	 * is too hard.)
 	 */
-	paren_depth = 0;
-	for (pos = 1; patt[pos]; pos++)
+	if (strchr(patt + pos, '|') != NULL)
 	{
-		if (patt[pos] == '|' && paren_depth == 0)
-		{
-			rest = patt;
+		rest = patt;
 
-			*prefix_const = NULL;
-			*rest_const = string_to_const(rest, typeid);
+		*prefix_const = NULL;
+		*rest_const = string_to_const(rest, typeid);
 
-			return Pattern_Prefix_None;
-		}
-		else if (patt[pos] == '(')
-			paren_depth++;
-		else if (patt[pos] == ')' && paren_depth > 0)
-			paren_depth--;
-		else if (patt[pos] == '\\')
-		{
-			/* backslash quotes the next character */
-			pos++;
-			if (patt[pos] == '\0')
-				break;
-		}
+		return Pattern_Prefix_None;
 	}
 
 	/* OK, allocate space for pattern */
 	match = palloc(strlen(patt) + 1);
 	prev_match_pos = match_pos = 0;
 
-	/* note start at pos 1 to skip leading ^ */
-	for (prev_pos = pos = 1; patt[pos];)
+	/*
+	 * We special-case the syntax '^(...)$' because psql uses it.  But beware:
+	 * in BRE mode these parentheses are just ordinary characters.  Also,
+	 * sequences beginning "(?" are not what they seem, unless they're "(?:".
+	 * (We should recognize that, too, because of similar_escape().)
+	 *
+	 * Note: it's a bit bogus to be depending on the current regex_flavor
+	 * setting here, because the setting could change before the pattern is
+	 * used.  We minimize the risk by trusting the flavor as little as we can,
+	 * but perhaps it would be a good idea to get rid of the "basic" setting.
+	 */
+	have_leading_paren = false;
+	if (patt[pos] == '(' && !is_basic &&
+		(patt[pos + 1] != '?' || patt[pos + 2] == ':'))
+	{
+		have_leading_paren = true;
+		pos += (patt[pos + 1] != '?' ? 1 : 3);
+	}
+
+	/* Scan remainder of pattern */
+	prev_pos = pos;
+	while (patt[pos])
 	{
 		int			len;
 
 		/*
 		 * Check for characters that indicate multiple possible matches here.
-		 * XXX I suspect isalpha() is not an adequately locale-sensitive test
-		 * for characters that can vary under case folding?
+		 * Also, drop out at ')' or '$' so the termination test works right.
 		 */
 		if (patt[pos] == '.' ||
 			patt[pos] == '(' ||
+			patt[pos] == ')' ||
 			patt[pos] == '[' ||
-			patt[pos] == '$' ||
-			(case_insensitive && isalpha((unsigned char) patt[pos])))
+			patt[pos] == '^' ||
+			patt[pos] == '$')
 			break;
 
 		/*
-		 * In AREs, backslash followed by alphanumeric is an escape, not a
-		 * quoted character.  Must treat it as having multiple possible
-		 * matches.
+		 * XXX In multibyte character sets, we can't trust isalpha, so assume
+		 * any multibyte char is potentially case-varying.
 		 */
-		if (patt[pos] == '\\' && isalnum((unsigned char) patt[pos + 1]))
-			break;
+		if (case_insensitive)
+		{
+			if (is_multibyte && (unsigned char) patt[pos] >= 0x80)
+				break;
+			if (isalpha((unsigned char) patt[pos]))
+				break;
+		}
 
 		/*
 		 * Check for quantifiers.  Except for +, this means the preceding
 		 * character is optional, so we must remove it from the prefix too!
+		 * Note: in BREs, \{ is a quantifier.
 		 */
 		if (patt[pos] == '*' ||
 			patt[pos] == '?' ||
-			patt[pos] == '{')
+			patt[pos] == '{' ||
+			(patt[pos] == '\\' && patt[pos + 1] == '{'))
 		{
 			match_pos = prev_match_pos;
 			pos = prev_pos;
@@ -4034,9 +4070,19 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
 			pos = prev_pos;
 			break;
 		}
+
+		/*
+		 * Normally, backslash quotes the next character.  But in AREs,
+		 * backslash followed by alphanumeric is an escape, not a quoted
+		 * character.  Must treat it as having multiple possible matches.
+		 * In BREs, \( is a parenthesis, so don't trust that either.
+		 * Note: since only ASCII alphanumerics are escapes, we don't have
+		 * to be paranoid about multibyte here.
+		 */
 		if (patt[pos] == '\\')
 		{
-			/* backslash quotes the next character */
+			if (isalnum((unsigned char) patt[pos + 1]) || patt[pos + 1] == '(')
+				break;
 			pos++;
 			if (patt[pos] == '\0')
 				break;
@@ -4054,6 +4100,9 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
 	match[match_pos] = '\0';
 	rest = &patt[pos];
 
+	if (have_leading_paren && patt[pos] == ')')
+		pos++;
+
 	if (patt[pos] == '$' && patt[pos + 1] == '\0')
 	{
 		rest = &patt[pos + 1];
diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h
index 56e63d7f73..170fa28ca8 100644
--- a/src/include/utils/builtins.h
+++ b/src/include/utils/builtins.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/utils/builtins.h,v 1.283 2006/12/30 21:21:55 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/utils/builtins.h,v 1.284 2007/01/03 22:39:26 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -477,6 +477,7 @@ extern Datum textregexsubstr(PG_FUNCTION_ARGS);
 extern Datum textregexreplace_noopt(PG_FUNCTION_ARGS);
 extern Datum textregexreplace(PG_FUNCTION_ARGS);
 extern Datum similar_escape(PG_FUNCTION_ARGS);
+extern bool regex_flavor_is_basic(void);
 
 /* regproc.c */
 extern Datum regprocin(PG_FUNCTION_ARGS);
-- 
2.40.0