From: Tom Lane Date: Wed, 19 Mar 2008 02:41:00 +0000 (+0000) Subject: Fix regexp substring matching (substring(string from pattern)) for the corner X-Git-Tag: REL8_1_12~25 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=c249e9ed304b361f78bc9cc77bbf04e3b123df3b;p=postgresql Fix regexp substring matching (substring(string from pattern)) for the corner case where there is a match to the pattern overall but the user has specified a parenthesized subexpression and that subexpression hasn't got a match. An example is substring('foo' from 'foo(bar)?'). This should return NULL, since (bar) isn't matched, but it was mistakenly returning the whole-pattern match instead (ie, 'foo'). Per bug #4044 from Rui Martins. This has been broken since the beginning; patch in all supported versions. The old behavior was sufficiently inconsistent that it's impossible to believe anyone is depending on it. --- diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index 5c7ad349d4..25df843ba6 100644 --- a/src/backend/utils/adt/regexp.c +++ b/src/backend/utils/adt/regexp.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.60.2.3 2007/01/03 22:39:42 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.60.2.4 2008/03/19 02:41:00 tgl Exp $ * * Alistair Crooks added the code for the regex caching * agc - cached the regular expressions used - there's a good chance @@ -192,36 +192,31 @@ RE_compile_and_cache(text *text_re, int cflags) } /* - * RE_compile_and_execute - compile and execute a RE + * RE_execute - execute a RE * * Returns TRUE on match, FALSE on no match * - * text_re --- the pattern, expressed as an *untoasted* TEXT object + * re --- the compiled pattern as returned by RE_compile_and_cache * dat --- the data to match against (need not be null-terminated) * dat_len --- the length of the data string - * cflags --- compile options for the pattern * nmatch, pmatch --- optional return area for match details * - * Both pattern and data are given in the database encoding. We internally + * Data is given in the database encoding. We internally * convert to array of pg_wchar which is what Spencer's regex package wants. */ static bool -RE_compile_and_execute(text *text_re, char *dat, int dat_len, - int cflags, int nmatch, regmatch_t *pmatch) +RE_execute(regex_t *re, char *dat, int dat_len, + int nmatch, regmatch_t *pmatch) { pg_wchar *data; size_t data_len; int regexec_result; - regex_t *re; char errMsg[100]; /* Convert data string to wide characters */ data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar)); data_len = pg_mb2wchar_with_len(dat, data, dat_len); - /* Compile RE */ - re = RE_compile_and_cache(text_re, cflags); - /* Perform RE match and return result */ regexec_result = pg_regexec(re, data, @@ -246,6 +241,33 @@ RE_compile_and_execute(text *text_re, char *dat, int dat_len, return (regexec_result == REG_OKAY); } +/* + * RE_compile_and_execute - compile and execute a RE + * + * Returns TRUE on match, FALSE on no match + * + * text_re --- the pattern, expressed as an *untoasted* TEXT object + * dat --- the data to match against (need not be null-terminated) + * dat_len --- the length of the data string + * cflags --- compile options for the pattern + * nmatch, pmatch --- optional return area for match details + * + * Both pattern and data are given in the database encoding. We internally + * convert to array of pg_wchar which is what Spencer's regex package wants. + */ +static bool +RE_compile_and_execute(text *text_re, char *dat, int dat_len, + int cflags, int nmatch, regmatch_t *pmatch) +{ + regex_t *re; + + /* Compile RE */ + re = RE_compile_and_cache(text_re, cflags); + + /* Perform RE match and return result */ + return RE_execute(re, dat, dat_len, nmatch, pmatch); +} + /* * assign_regex_flavor - GUC hook to validate and set REGEX_FLAVOR @@ -400,8 +422,13 @@ textregexsubstr(PG_FUNCTION_ARGS) { text *s = PG_GETARG_TEXT_P(0); text *p = PG_GETARG_TEXT_P(1); - bool match; + regex_t *re; regmatch_t pmatch[2]; + int so, + eo; + + /* Compile RE */ + re = RE_compile_and_cache(p, regex_flavor); /* * We pass two regmatch_t structs to get info about the overall match and @@ -409,34 +436,37 @@ textregexsubstr(PG_FUNCTION_ARGS) * is a parenthesized subexpression, we return what it matched; else * return what the whole regexp matched. */ - match = RE_compile_and_execute(p, - VARDATA(s), - VARSIZE(s) - VARHDRSZ, - regex_flavor, - 2, pmatch); - - /* match? then return the substring matching the pattern */ - if (match) - { - int so, - eo; + if (!RE_execute(re, + VARDATA(s), VARSIZE(s) - VARHDRSZ, + 2, pmatch)) + PG_RETURN_NULL(); /* definitely no match */ + if (re->re_nsub > 0) + { + /* has parenthesized subexpressions, use the first one */ so = pmatch[1].rm_so; eo = pmatch[1].rm_eo; - if (so < 0 || eo < 0) - { - /* no parenthesized subexpression */ - so = pmatch[0].rm_so; - eo = pmatch[0].rm_eo; - } - - return DirectFunctionCall3(text_substr, - PointerGetDatum(s), - Int32GetDatum(so + 1), - Int32GetDatum(eo - so)); } + else + { + /* no parenthesized subexpression, use whole match */ + so = pmatch[0].rm_so; + eo = pmatch[0].rm_eo; + } + + /* + * It is possible to have a match to the whole pattern but no match + * for a subexpression; for example 'foo(bar)?' is considered to match + * 'foo' but there is no subexpression match. So this extra test for + * match failure is not redundant. + */ + if (so < 0 || eo < 0) + PG_RETURN_NULL(); - PG_RETURN_NULL(); + return DirectFunctionCall3(text_substr, + PointerGetDatum(s), + Int32GetDatum(so + 1), + Int32GetDatum(eo - so)); } /*