From 8d96a06d43378ee8e509eb0d5d5dc3958fe7980f Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 3 Jan 2007 22:39:57 +0000 Subject: [PATCH] Fix regex_fixed_prefix() to cope reasonably well with regex patterns of the form '^(foo)$'. Before, these could never be optimized into indexscans. The recent changes to make psql and pg_dump generate such patterns (for \d commands and -t and related switches, respectively) therefore represented a big performance hit for people with large pg_class catalogs, as seen in recent gripe from Erik Jones. While at it, be more paranoid about case-sensitivity checking in multibyte encodings, and fix some other corner cases in which a regex might be interpreted too liberally. --- src/backend/utils/adt/regexp.c | 11 ++- src/backend/utils/adt/selfuncs.c | 145 ++++++++++++++++++++----------- src/include/utils/builtins.h | 3 +- 3 files changed, 108 insertions(+), 51 deletions(-) diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index cfd946e630..37a82eed4f 100644 --- a/src/backend/utils/adt/regexp.c +++ b/src/backend/utils/adt/regexp.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/utils/adt/regexp.c,v 1.49.4.3 2006/04/13 18:01:53 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/utils/adt/regexp.c,v 1.49.4.4 2007/01/03 22:39:56 tgl Exp $ * * Alistair Crooks added the code for the regex caching * agc - cached the regular expressions used - there's a good chance @@ -556,3 +556,12 @@ similar_escape(PG_FUNCTION_ARGS) PG_RETURN_TEXT_P(result); } + +/* + * report whether regex_flavor is currently BASIC + */ +bool +regex_flavor_is_basic(void) +{ + return (regex_flavor == REG_BASIC); +} diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 680a8d53fd..aa3d9f8f32 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -15,7 +15,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.147.2.4 2004/12/02 02:45:24 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.147.2.5 2007/01/03 22:39:56 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -3087,7 +3087,10 @@ get_join_vars(List *args, Var **var1, Var **var2) * These routines support analysis of LIKE and regular-expression patterns * by the planner/optimizer. It's important that they agree with the * regular-expression code in backend/regex/ and the LIKE code in - * backend/utils/adt/like.c. + * backend/utils/adt/like.c. Also, the computation of the fixed prefix + * must be conservative: if we report a string longer than the true fixed + * prefix, the query may produce actually wrong answers, rather than just + * getting a bad selectivity estimate! * * Note that the prefix-analysis functions are called from * backend/optimizer/path/indxpath.c as well as from routines in this file. @@ -3119,6 +3122,7 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid typeid = patt_const->consttype; int pos, match_pos; + bool is_multibyte = (pg_database_encoding_max_length() > 1); /* the right-hand const is type text or bytea */ Assert(typeid == BYTEAOID || typeid == TEXTOID); @@ -3168,11 +3172,16 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, } /* - * XXX I suspect isalpha() is not an adequately locale-sensitive - * test for characters that can vary under case folding? + * XXX In multibyte character sets, we can't trust isalpha, so assume + * any multibyte char is potentially case-varying. */ - if (case_insensitive && isalpha((unsigned char) patt[pos])) - break; + if (case_insensitive) + { + if (is_multibyte && (unsigned char) patt[pos] >= 0x80) + break; + if (isalpha((unsigned char) patt[pos])) + break; + } /* * NOTE: this code used to think that %% meant a literal %, but @@ -3219,11 +3228,13 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive, int pos, match_pos, prev_pos, - prev_match_pos, - paren_depth; + prev_match_pos; + bool have_leading_paren; char *patt; char *rest; Oid typeid = patt_const->consttype; + bool is_basic = regex_flavor_is_basic(); + bool is_multibyte = (pg_database_encoding_max_length() > 1); /* * Should be unnecessary, there are no bytea regex operators defined. @@ -3238,8 +3249,19 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive, /* the right-hand const is type text for all of these */ patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue)); + /* + * Check for ARE director prefix. It's worth our trouble to recognize + * this because similar_escape() uses it. + */ + pos = 0; + if (strncmp(patt, "***:", 4) == 0) + { + pos = 4; + is_basic = false; + } + /* Pattern must be anchored left */ - if (patt[0] != '^') + if (patt[pos] != '^') { rest = patt; @@ -3248,74 +3270,86 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive, return Pattern_Prefix_None; } + pos++; /* - * If unquoted | is present at paren level 0 in pattern, then there - * are multiple alternatives for the start of the string. + * If '|' is present in pattern, then there may be multiple alternatives + * for the start of the string. (There are cases where this isn't so, + * for instance if the '|' is inside parens, but detecting that reliably + * is too hard.) */ - paren_depth = 0; - for (pos = 1; patt[pos]; pos++) + if (strchr(patt + pos, '|') != NULL) { - if (patt[pos] == '|' && paren_depth == 0) - { - rest = patt; + rest = patt; - *prefix_const = NULL; - *rest_const = string_to_const(rest, typeid); + *prefix_const = NULL; + *rest_const = string_to_const(rest, typeid); - return Pattern_Prefix_None; - } - else if (patt[pos] == '(') - paren_depth++; - else if (patt[pos] == ')' && paren_depth > 0) - paren_depth--; - else if (patt[pos] == '\\') - { - /* backslash quotes the next character */ - pos++; - if (patt[pos] == '\0') - break; - } + return Pattern_Prefix_None; } /* OK, allocate space for pattern */ match = palloc(strlen(patt) + 1); prev_match_pos = match_pos = 0; - /* note start at pos 1 to skip leading ^ */ - for (prev_pos = pos = 1; patt[pos]; ) + /* + * We special-case the syntax '^(...)$' because psql uses it. But beware: + * in BRE mode these parentheses are just ordinary characters. Also, + * sequences beginning "(?" are not what they seem, unless they're "(?:". + * (We should recognize that, too, because of similar_escape().) + * + * Note: it's a bit bogus to be depending on the current regex_flavor + * setting here, because the setting could change before the pattern is + * used. We minimize the risk by trusting the flavor as little as we can, + * but perhaps it would be a good idea to get rid of the "basic" setting. + */ + have_leading_paren = false; + if (patt[pos] == '(' && !is_basic && + (patt[pos + 1] != '?' || patt[pos + 2] == ':')) { - int len; + have_leading_paren = true; + pos += (patt[pos + 1] != '?' ? 1 : 3); + } + + /* Scan remainder of pattern */ + prev_pos = pos; + while (patt[pos]) + { + int len; /* - * Check for characters that indicate multiple possible matches - * here. XXX I suspect isalpha() is not an adequately - * locale-sensitive test for characters that can vary under case - * folding? + * Check for characters that indicate multiple possible matches here. + * Also, drop out at ')' or '$' so the termination test works right. */ if (patt[pos] == '.' || patt[pos] == '(' || + patt[pos] == ')' || patt[pos] == '[' || - patt[pos] == '$' || - (case_insensitive && isalpha((unsigned char) patt[pos]))) + patt[pos] == '^' || + patt[pos] == '$') break; /* - * In AREs, backslash followed by alphanumeric is an escape, not - * a quoted character. Must treat it as having multiple possible - * matches. + * XXX In multibyte character sets, we can't trust isalpha, so assume + * any multibyte char is potentially case-varying. */ - if (patt[pos] == '\\' && isalnum((unsigned char) patt[pos + 1])) - break; + if (case_insensitive) + { + if (is_multibyte && (unsigned char) patt[pos] >= 0x80) + break; + if (isalpha((unsigned char) patt[pos])) + break; + } /* * Check for quantifiers. Except for +, this means the preceding - * character is optional, so we must remove it from the prefix - * too! + * character is optional, so we must remove it from the prefix too! + * Note: in BREs, \{ is a quantifier. */ if (patt[pos] == '*' || patt[pos] == '?' || - patt[pos] == '{') + patt[pos] == '{' || + (patt[pos] == '\\' && patt[pos + 1] == '{')) { match_pos = prev_match_pos; pos = prev_pos; @@ -3326,9 +3360,19 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive, pos = prev_pos; break; } + + /* + * Normally, backslash quotes the next character. But in AREs, + * backslash followed by alphanumeric is an escape, not a quoted + * character. Must treat it as having multiple possible matches. + * In BREs, \( is a parenthesis, so don't trust that either. + * Note: since only ASCII alphanumerics are escapes, we don't have + * to be paranoid about multibyte here. + */ if (patt[pos] == '\\') { - /* backslash quotes the next character */ + if (isalnum((unsigned char) patt[pos + 1]) || patt[pos + 1] == '(') + break; pos++; if (patt[pos] == '\0') break; @@ -3346,6 +3390,9 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive, match[match_pos] = '\0'; rest = &patt[pos]; + if (have_leading_paren && patt[pos] == ')') + pos++; + if (patt[pos] == '$' && patt[pos + 1] == '\0') { rest = &patt[pos + 1]; diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index 397064db3b..a1bb39f7a8 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $Id: builtins.h,v 1.228.2.1 2003/12/02 12:40:08 momjian Exp $ + * $Id: builtins.h,v 1.228.2.2 2007/01/03 22:39:57 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -411,6 +411,7 @@ extern Datum textregexsubstr(PG_FUNCTION_ARGS); extern Datum similar_escape(PG_FUNCTION_ARGS); extern const char *assign_regex_flavor(const char *value, bool doit, bool interactive); +extern bool regex_flavor_is_basic(void); /* regproc.c */ extern Datum regprocin(PG_FUNCTION_ARGS); -- 2.40.0