]> granicus.if.org Git - postgresql/commitdiff
Avoid quadratic slowdown in regexp match/split functions.
authorAndrew Gierth <rhodiumtoad@postgresql.org>
Tue, 28 Aug 2018 08:52:25 +0000 (09:52 +0100)
committerAndrew Gierth <rhodiumtoad@postgresql.org>
Tue, 28 Aug 2018 11:17:33 +0000 (12:17 +0100)
regexp_matches, regexp_split_to_table and regexp_split_to_array all
work by compiling a list of match positions as character offsets (NOT
byte positions) in the source string.

Formerly, they then used text_substr to extract the matched text; but
in a multi-byte encoding, that counts the characters in the string,
and the characters needed to reach the starting byte position, on
every call. Accordingly, the performance degraded as the product of
the input string length and the number of match positions, such that
splitting a string of a few hundred kbytes could take many minutes.

Repair by keeping the wide-character copy of the input string
available (only in the case where encoding_max_length is not 1) after
performing the match operation, and extracting substrings from that
instead. This reduces the complexity to being linear in the number of
result bytes, discounting the actual regexp match itself (which is not
affected by this patch).

In passing, remove cleanup using retail pfree() which was obsoleted by
commit ff428cded (Feb 2008) which made cleanup of SRF multi-call
contexts automatic. Also increase (to ~134 million) the maximum number
of matches and provide an error message when it is reached.

Backpatch all the way because this has been wrong forever.

Analysis and patch by me; review by Kaiting Chen.

Discussion: https://postgr.es/m/87pnyn55qh.fsf@news-spur.riddles.org.uk

see also https://postgr.es/m/87lg996g4r.fsf@news-spur.riddles.org.uk

src/backend/utils/adt/regexp.c

index 5025a449fb3f160152a4bfc208e97779ea98bd8f..d8b692123421b5e46e2b5c48f6e832b64a5def9e 100644 (file)
@@ -35,6 +35,7 @@
 #include "regex/regex.h"
 #include "utils/array.h"
 #include "utils/builtins.h"
+#include "utils/memutils.h"
 #include "utils/varlena.h"
 
 #define PG_GETARG_TEXT_PP_IF_EXISTS(_n) \
@@ -61,6 +62,9 @@ typedef struct regexp_matches_ctx
        /* workspace for build_regexp_match_result() */
        Datum      *elems;                      /* has npatterns elements */
        bool       *nulls;                      /* has npatterns elements */
+       pg_wchar   *wide_str;           /* wide-char version of original string */
+       char       *conv_buf;           /* conversion buffer */
+       int                     conv_bufsiz;    /* size thereof */
 } regexp_matches_ctx;
 
 /*
@@ -111,8 +115,8 @@ static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
                                         pg_re_flags *flags,
                                         Oid collation,
                                         bool use_subpatterns,
-                                        bool ignore_degenerate);
-static void cleanup_regexp_matches(regexp_matches_ctx *matchctx);
+                                        bool ignore_degenerate,
+                                        bool fetching_unmatched);
 static ArrayType *build_regexp_match_result(regexp_matches_ctx *matchctx);
 static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);
 
@@ -863,7 +867,7 @@ regexp_match(PG_FUNCTION_ARGS)
                                 errhint("Use the regexp_matches function instead.")));
 
        matchctx = setup_regexp_matches(orig_str, pattern, &re_flags,
-                                                                       PG_GET_COLLATION(), true, false);
+                                                                       PG_GET_COLLATION(), true, false, false);
 
        if (matchctx->nmatches == 0)
                PG_RETURN_NULL();
@@ -911,7 +915,7 @@ regexp_matches(PG_FUNCTION_ARGS)
                matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
                                                                                &re_flags,
                                                                                PG_GET_COLLATION(),
-                                                                               true, false);
+                                                                               true, false, false);
 
                /* Pre-create workspace that build_regexp_match_result needs */
                matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
@@ -933,9 +937,6 @@ regexp_matches(PG_FUNCTION_ARGS)
                SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
        }
 
-       /* release space in multi-call ctx to avoid intraquery memory leak */
-       cleanup_regexp_matches(matchctx);
-
        SRF_RETURN_DONE(funcctx);
 }
 
@@ -954,17 +955,24 @@ regexp_matches_no_flags(PG_FUNCTION_ARGS)
  * all the matching in one swoop.  The returned regexp_matches_ctx contains
  * the locations of all the substrings matching the pattern.
  *
- * The two bool parameters have only two patterns (one for matching, one for
+ * The three bool parameters have only two patterns (one for matching, one for
  * splitting) but it seems clearer to distinguish the functionality this way
- * than to key it all off one "is_split" flag.
+ * than to key it all off one "is_split" flag. We don't currently assume that
+ * fetching_unmatched is exclusive of fetching the matched text too; if it's
+ * set, the conversion buffer is large enough to fetch any single matched or
+ * unmatched string, but not any larger substring. (In practice, when splitting
+ * the matches are usually small anyway, and it didn't seem worth complicating
+ * the code further.)
  */
 static regexp_matches_ctx *
 setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
                                         Oid collation,
                                         bool use_subpatterns,
-                                        bool ignore_degenerate)
+                                        bool ignore_degenerate,
+                                        bool fetching_unmatched)
 {
        regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx));
+       int                     eml = pg_database_encoding_max_length();
        int                     orig_len;
        pg_wchar   *wide_str;
        int                     wide_len;
@@ -975,6 +983,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
        int                     array_idx;
        int                     prev_match_end;
        int                     start_search;
+       int                     maxlen = 0;             /* largest fetch length in characters */
 
        /* save original string --- we'll extract result substrings from it */
        matchctx->orig_str = orig_str;
@@ -1003,8 +1012,13 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
        /* temporary output space for RE package */
        pmatch = palloc(sizeof(regmatch_t) * pmatch_len);
 
-       /* the real output space (grown dynamically if needed) */
-       array_len = re_flags->glob ? 256 : 32;
+       /*
+        * the real output space (grown dynamically if needed)
+        *
+        * use values 2^n-1, not 2^n, so that we hit the limit at 2^28-1 rather
+        * than at 2^27
+        */
+       array_len = re_flags->glob ? 255 : 31;
        matchctx->match_locs = (int *) palloc(sizeof(int) * array_len);
        array_idx = 0;
 
@@ -1024,9 +1038,13 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
                         pmatch[0].rm_eo > prev_match_end))
                {
                        /* enlarge output space if needed */
-                       while (array_idx + matchctx->npatterns * 2 > array_len)
+                       while (array_idx + matchctx->npatterns * 2 + 1 > array_len)
                        {
-                               array_len *= 2;
+                               array_len += array_len + 1;             /* 2^n-1 => 2^(n+1)-1 */
+                               if (array_len > MaxAllocSize/sizeof(int))
+                                       ereport(ERROR,
+                                                       (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+                                                        errmsg("too many regular expression matches")));
                                matchctx->match_locs = (int *) repalloc(matchctx->match_locs,
                                                                                                                sizeof(int) * array_len);
                        }
@@ -1038,16 +1056,33 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
 
                                for (i = 1; i <= matchctx->npatterns; i++)
                                {
-                                       matchctx->match_locs[array_idx++] = pmatch[i].rm_so;
-                                       matchctx->match_locs[array_idx++] = pmatch[i].rm_eo;
+                                       int             so = pmatch[i].rm_so;
+                                       int             eo = pmatch[i].rm_eo;
+                                       matchctx->match_locs[array_idx++] = so;
+                                       matchctx->match_locs[array_idx++] = eo;
+                                       if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
+                                               maxlen = (eo - so);
                                }
                        }
                        else
                        {
-                               matchctx->match_locs[array_idx++] = pmatch[0].rm_so;
-                               matchctx->match_locs[array_idx++] = pmatch[0].rm_eo;
+                               int             so = pmatch[0].rm_so;
+                               int             eo = pmatch[0].rm_eo;
+                               matchctx->match_locs[array_idx++] = so;
+                               matchctx->match_locs[array_idx++] = eo;
+                               if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
+                                       maxlen = (eo - so);
                        }
                        matchctx->nmatches++;
+
+                       /*
+                        * check length of unmatched portion between end of previous match
+                        * and start of current one
+                        */
+                       if (fetching_unmatched &&
+                               pmatch[0].rm_so >= 0 &&
+                               (pmatch[0].rm_so - prev_match_end) > maxlen)
+                               maxlen = (pmatch[0].rm_so - prev_match_end);
                }
                prev_match_end = pmatch[0].rm_eo;
 
@@ -1068,34 +1103,67 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
                        break;
        }
 
+       /*
+        * check length of unmatched portion between end of last match and end of
+        * input string
+        */
+       if (fetching_unmatched &&
+               (wide_len - prev_match_end) > maxlen)
+               maxlen = (wide_len - prev_match_end);
+
+       /*
+        * Keep a note of the end position of the string for the benefit of
+        * splitting code.
+        */
+       matchctx->match_locs[array_idx] = wide_len;
+
+       if (eml > 1)
+       {
+               int64           maxsiz = eml * (int64) maxlen;
+               int                     conv_bufsiz;
+
+               /*
+                * Make the conversion buffer large enough for any substring of
+                * interest.
+                *
+                * Worst case: assume we need the maximum size (maxlen*eml), but take
+                * advantage of the fact that the original string length in bytes is an
+                * upper bound on the byte length of any fetched substring (and we know
+                * that len+1 is safe to allocate because the varlena header is longer
+                * than 1 byte).
+                */
+               if (maxsiz > orig_len)
+                       conv_bufsiz = orig_len + 1;
+               else
+                       conv_bufsiz = maxsiz + 1;       /* safe since maxsiz < 2^30 */
+
+               matchctx->conv_buf = palloc(conv_bufsiz);
+               matchctx->conv_bufsiz = conv_bufsiz;
+               matchctx->wide_str = wide_str;
+       }
+       else
+       {
+               /* No need to keep the wide string if we're in a single-byte charset. */
+               pfree(wide_str);
+               matchctx->wide_str = NULL;
+               matchctx->conv_buf = NULL;
+               matchctx->conv_bufsiz = 0;
+       }
+
        /* Clean up temp storage */
-       pfree(wide_str);
        pfree(pmatch);
 
        return matchctx;
 }
 
-/*
- * cleanup_regexp_matches - release memory of a regexp_matches_ctx
- */
-static void
-cleanup_regexp_matches(regexp_matches_ctx *matchctx)
-{
-       pfree(matchctx->orig_str);
-       pfree(matchctx->match_locs);
-       if (matchctx->elems)
-               pfree(matchctx->elems);
-       if (matchctx->nulls)
-               pfree(matchctx->nulls);
-       pfree(matchctx);
-}
-
 /*
  * build_regexp_match_result - build output array for current match
  */
 static ArrayType *
 build_regexp_match_result(regexp_matches_ctx *matchctx)
 {
+       char       *buf = matchctx->conv_buf;
+       int                     bufsiz PG_USED_FOR_ASSERTS_ONLY = matchctx->conv_bufsiz;
        Datum      *elems = matchctx->elems;
        bool       *nulls = matchctx->nulls;
        int                     dims[1];
@@ -1115,6 +1183,15 @@ build_regexp_match_result(regexp_matches_ctx *matchctx)
                        elems[i] = (Datum) 0;
                        nulls[i] = true;
                }
+               else if (buf)
+               {
+                       int             len = pg_wchar2mb_with_len(matchctx->wide_str + so,
+                                                                                          buf,
+                                                                                          eo - so);
+                       Assert(len < bufsiz);
+                       elems[i] = PointerGetDatum(cstring_to_text_with_len(buf, len));
+                       nulls[i] = false;
+               }
                else
                {
                        elems[i] = DirectFunctionCall3(text_substr,
@@ -1168,7 +1245,7 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
                splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
                                                                                &re_flags,
                                                                                PG_GET_COLLATION(),
-                                                                               false, true);
+                                                                               false, true, true);
 
                MemoryContextSwitchTo(oldcontext);
                funcctx->user_fctx = (void *) splitctx;
@@ -1185,9 +1262,6 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
                SRF_RETURN_NEXT(funcctx, result);
        }
 
-       /* release space in multi-call ctx to avoid intraquery memory leak */
-       cleanup_regexp_matches(splitctx);
-
        SRF_RETURN_DONE(funcctx);
 }
 
@@ -1224,7 +1298,7 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
                                                                        PG_GETARG_TEXT_PP(1),
                                                                        &re_flags,
                                                                        PG_GET_COLLATION(),
-                                                                       false, true);
+                                                                       false, true, true);
 
        while (splitctx->next_match <= splitctx->nmatches)
        {
@@ -1236,12 +1310,6 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
                splitctx->next_match++;
        }
 
-       /*
-        * We don't call cleanup_regexp_matches here; it would try to pfree the
-        * input string, which we didn't copy.  The space is not in a long-lived
-        * memory context anyway.
-        */
-
        PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate, CurrentMemoryContext));
 }
 
@@ -1261,6 +1329,7 @@ regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
 static Datum
 build_regexp_split_result(regexp_matches_ctx *splitctx)
 {
+       char       *buf = splitctx->conv_buf;
        int                     startpos;
        int                     endpos;
 
@@ -1271,22 +1340,29 @@ build_regexp_split_result(regexp_matches_ctx *splitctx)
        if (startpos < 0)
                elog(ERROR, "invalid match ending position");
 
-       if (splitctx->next_match < splitctx->nmatches)
+       if (buf)
        {
+               int             bufsiz PG_USED_FOR_ASSERTS_ONLY = splitctx->conv_bufsiz;
+               int             len;
+
                endpos = splitctx->match_locs[splitctx->next_match * 2];
                if (endpos < startpos)
                        elog(ERROR, "invalid match starting position");
-               return DirectFunctionCall3(text_substr,
-                                                                  PointerGetDatum(splitctx->orig_str),
-                                                                  Int32GetDatum(startpos + 1),
-                                                                  Int32GetDatum(endpos - startpos));
+               len = pg_wchar2mb_with_len(splitctx->wide_str + startpos,
+                                                                  buf,
+                                                                  endpos-startpos);
+               Assert(len < bufsiz);
+               return PointerGetDatum(cstring_to_text_with_len(buf, len));
        }
        else
        {
-               /* no more matches, return rest of string */
-               return DirectFunctionCall2(text_substr_no_len,
+               endpos = splitctx->match_locs[splitctx->next_match * 2];
+               if (endpos < startpos)
+                       elog(ERROR, "invalid match starting position");
+               return DirectFunctionCall3(text_substr,
                                                                   PointerGetDatum(splitctx->orig_str),
-                                                                  Int32GetDatum(startpos + 1));
+                                                                  Int32GetDatum(startpos + 1),
+                                                                  Int32GetDatum(endpos - startpos));
        }
 }