Avoid quadratic slowdown in regexp match/split functions.

author Andrew Gierth <rhodiumtoad@postgresql.org>

Tue, 28 Aug 2018 08:52:25 +0000 (09:52 +0100)

committer Andrew Gierth <rhodiumtoad@postgresql.org>

Tue, 28 Aug 2018 11:17:33 +0000 (12:17 +0100)
author Andrew Gierth <rhodiumtoad@postgresql.org>
Tue, 28 Aug 2018 08:52:25 +0000 (09:52 +0100)
committer Andrew Gierth <rhodiumtoad@postgresql.org>
Tue, 28 Aug 2018 11:17:33 +0000 (12:17 +0100)
diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c

index 5025a449fb3f160152a4bfc208e97779ea98bd8f..d8b692123421b5e46e2b5c48f6e832b64a5def9e 100644 (file)
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -35,6 +35,7 @@
  #include "regex/regex.h"
  #include "utils/array.h"
  #include "utils/builtins.h"
+#include "utils/memutils.h"
  #include "utils/varlena.h"
  
  #define PG_GETARG_TEXT_PP_IF_EXISTS(_n) \
@@ -61,6 +62,9 @@ typedef struct regexp_matches_ctx
         /* workspace for build_regexp_match_result() */
         Datum      *elems;                      /* has npatterns elements */
         bool       *nulls;                      /* has npatterns elements */
+       pg_wchar   *wide_str;           /* wide-char version of original string */
+       char       *conv_buf;           /* conversion buffer */
+       int                     conv_bufsiz;    /* size thereof */
  } regexp_matches_ctx;
  
  /*
@@ -111,8 +115,8 @@ static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
                                          pg_re_flags *flags,
                                          Oid collation,
                                          bool use_subpatterns,
-                                        bool ignore_degenerate);
-static void cleanup_regexp_matches(regexp_matches_ctx *matchctx);
+                                        bool ignore_degenerate,
+                                        bool fetching_unmatched);
  static ArrayType *build_regexp_match_result(regexp_matches_ctx *matchctx);
  static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);
  
@@ -863,7 +867,7 @@ regexp_match(PG_FUNCTION_ARGS)
                                  errhint("Use the regexp_matches function instead.")));
  
         matchctx = setup_regexp_matches(orig_str, pattern, &re_flags,
-                                                                       PG_GET_COLLATION(), true, false);
+                                                                       PG_GET_COLLATION(), true, false, false);
  
         if (matchctx->nmatches == 0)
                 PG_RETURN_NULL();
@@ -911,7 +915,7 @@ regexp_matches(PG_FUNCTION_ARGS)
                 matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
                                                                                 &re_flags,
                                                                                 PG_GET_COLLATION(),
-                                                                               true, false);
+                                                                               true, false, false);
  
                 /* Pre-create workspace that build_regexp_match_result needs */
                 matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
@@ -933,9 +937,6 @@ regexp_matches(PG_FUNCTION_ARGS)
                 SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
         }
  
-       /* release space in multi-call ctx to avoid intraquery memory leak */
-       cleanup_regexp_matches(matchctx);
-
         SRF_RETURN_DONE(funcctx);
  }
  
@@ -954,17 +955,24 @@ regexp_matches_no_flags(PG_FUNCTION_ARGS)
   * all the matching in one swoop.  The returned regexp_matches_ctx contains
   * the locations of all the substrings matching the pattern.
   *
- * The two bool parameters have only two patterns (one for matching, one for
+ * The three bool parameters have only two patterns (one for matching, one for
   * splitting) but it seems clearer to distinguish the functionality this way
- * than to key it all off one "is_split" flag.
+ * than to key it all off one "is_split" flag. We don't currently assume that
+ * fetching_unmatched is exclusive of fetching the matched text too; if it's
+ * set, the conversion buffer is large enough to fetch any single matched or
+ * unmatched string, but not any larger substring. (In practice, when splitting
+ * the matches are usually small anyway, and it didn't seem worth complicating
+ * the code further.)
   */
  static regexp_matches_ctx *
  setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
                                          Oid collation,
                                          bool use_subpatterns,
-                                        bool ignore_degenerate)
+                                        bool ignore_degenerate,
+                                        bool fetching_unmatched)
  {
         regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx));
+       int                     eml = pg_database_encoding_max_length();
         int                     orig_len;
         pg_wchar   *wide_str;
         int                     wide_len;
@@ -975,6 +983,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
         int                     array_idx;
         int                     prev_match_end;
         int                     start_search;
+       int                     maxlen = 0;             /* largest fetch length in characters */
  
         /* save original string --- we'll extract result substrings from it */
         matchctx->orig_str = orig_str;
@@ -1003,8 +1012,13 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
         /* temporary output space for RE package */
         pmatch = palloc(sizeof(regmatch_t) * pmatch_len);
  
-       /* the real output space (grown dynamically if needed) */
-       array_len = re_flags->glob ? 256 : 32;
+       /*
+        * the real output space (grown dynamically if needed)
+        *
+        * use values 2^n-1, not 2^n, so that we hit the limit at 2^28-1 rather
+        * than at 2^27
+        */
+       array_len = re_flags->glob ? 255 : 31;
         matchctx->match_locs = (int *) palloc(sizeof(int) * array_len);
         array_idx = 0;
  
@@ -1024,9 +1038,13 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
                          pmatch[0].rm_eo > prev_match_end))
                 {
                         /* enlarge output space if needed */
-                       while (array_idx + matchctx->npatterns * 2 > array_len)
+                       while (array_idx + matchctx->npatterns * 2 + 1 > array_len)
                         {
-                               array_len *= 2;
+                               array_len += array_len + 1;             /* 2^n-1 => 2^(n+1)-1 */
+                               if (array_len > MaxAllocSize/sizeof(int))
+                                       ereport(ERROR,
+                                                       (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+                                                        errmsg("too many regular expression matches")));
                                 matchctx->match_locs = (int *) repalloc(matchctx->match_locs,
                                                                                                                 sizeof(int) * array_len);
                         }
@@ -1038,16 +1056,33 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
  
                                 for (i = 1; i <= matchctx->npatterns; i++)
                                 {
-                                       matchctx->match_locs[array_idx++] = pmatch[i].rm_so;
-                                       matchctx->match_locs[array_idx++] = pmatch[i].rm_eo;
+                                       int             so = pmatch[i].rm_so;
+                                       int             eo = pmatch[i].rm_eo;
+                                       matchctx->match_locs[array_idx++] = so;
+                                       matchctx->match_locs[array_idx++] = eo;
+                                       if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
+                                               maxlen = (eo - so);
                                 }
                         }
                         else
                         {
-                               matchctx->match_locs[array_idx++] = pmatch[0].rm_so;
-                               matchctx->match_locs[array_idx++] = pmatch[0].rm_eo;
+                               int             so = pmatch[0].rm_so;
+                               int             eo = pmatch[0].rm_eo;
+                               matchctx->match_locs[array_idx++] = so;
+                               matchctx->match_locs[array_idx++] = eo;
+                               if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
+                                       maxlen = (eo - so);
                         }
                         matchctx->nmatches++;
+
+                       /*
+                        * check length of unmatched portion between end of previous match
+                        * and start of current one
+                        */
+                       if (fetching_unmatched &&
+                               pmatch[0].rm_so >= 0 &&
+                               (pmatch[0].rm_so - prev_match_end) > maxlen)
+                               maxlen = (pmatch[0].rm_so - prev_match_end);
                 }
                 prev_match_end = pmatch[0].rm_eo;
  
@@ -1068,34 +1103,67 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
                         break;
         }
  
+       /*
+        * check length of unmatched portion between end of last match and end of
+        * input string
+        */
+       if (fetching_unmatched &&
+               (wide_len - prev_match_end) > maxlen)
+               maxlen = (wide_len - prev_match_end);
+
+       /*
+        * Keep a note of the end position of the string for the benefit of
+        * splitting code.
+        */
+       matchctx->match_locs[array_idx] = wide_len;
+
+       if (eml > 1)
+       {
+               int64           maxsiz = eml * (int64) maxlen;
+               int                     conv_bufsiz;
+
+               /*
+                * Make the conversion buffer large enough for any substring of
+                * interest.
+                *
+                * Worst case: assume we need the maximum size (maxlen*eml), but take
+                * advantage of the fact that the original string length in bytes is an
+                * upper bound on the byte length of any fetched substring (and we know
+                * that len+1 is safe to allocate because the varlena header is longer
+                * than 1 byte).
+                */
+               if (maxsiz > orig_len)
+                       conv_bufsiz = orig_len + 1;
+               else
+                       conv_bufsiz = maxsiz + 1;       /* safe since maxsiz < 2^30 */
+
+               matchctx->conv_buf = palloc(conv_bufsiz);
+               matchctx->conv_bufsiz = conv_bufsiz;
+               matchctx->wide_str = wide_str;
+       }
+       else
+       {
+               /* No need to keep the wide string if we're in a single-byte charset. */
+               pfree(wide_str);
+               matchctx->wide_str = NULL;
+               matchctx->conv_buf = NULL;
+               matchctx->conv_bufsiz = 0;
+       }
+
         /* Clean up temp storage */
-       pfree(wide_str);
         pfree(pmatch);
  
         return matchctx;
  }
  
-/*
- * cleanup_regexp_matches - release memory of a regexp_matches_ctx
- */
-static void
-cleanup_regexp_matches(regexp_matches_ctx *matchctx)
-{
-       pfree(matchctx->orig_str);
-       pfree(matchctx->match_locs);
-       if (matchctx->elems)
-               pfree(matchctx->elems);
-       if (matchctx->nulls)
-               pfree(matchctx->nulls);
-       pfree(matchctx);
-}
-
  /*
   * build_regexp_match_result - build output array for current match
   */
  static ArrayType *
  build_regexp_match_result(regexp_matches_ctx *matchctx)
  {
+       char       *buf = matchctx->conv_buf;
+       int                     bufsiz PG_USED_FOR_ASSERTS_ONLY = matchctx->conv_bufsiz;
         Datum      *elems = matchctx->elems;
         bool       *nulls = matchctx->nulls;
         int                     dims[1];
@@ -1115,6 +1183,15 @@ build_regexp_match_result(regexp_matches_ctx *matchctx)
                         elems[i] = (Datum) 0;
                         nulls[i] = true;
                 }
+               else if (buf)
+               {
+                       int             len = pg_wchar2mb_with_len(matchctx->wide_str + so,
+                                                                                          buf,
+                                                                                          eo - so);
+                       Assert(len < bufsiz);
+                       elems[i] = PointerGetDatum(cstring_to_text_with_len(buf, len));
+                       nulls[i] = false;
+               }
                 else
                 {
                         elems[i] = DirectFunctionCall3(text_substr,
@@ -1168,7 +1245,7 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
                 splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
                                                                                 &re_flags,
                                                                                 PG_GET_COLLATION(),
-                                                                               false, true);
+                                                                               false, true, true);
  
                 MemoryContextSwitchTo(oldcontext);
                 funcctx->user_fctx = (void *) splitctx;
@@ -1185,9 +1262,6 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
                 SRF_RETURN_NEXT(funcctx, result);
         }
  
-       /* release space in multi-call ctx to avoid intraquery memory leak */
-       cleanup_regexp_matches(splitctx);
-
         SRF_RETURN_DONE(funcctx);
  }
  
@@ -1224,7 +1298,7 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
                                                                         PG_GETARG_TEXT_PP(1),
                                                                         &re_flags,
                                                                         PG_GET_COLLATION(),
-                                                                       false, true);
+                                                                       false, true, true);
  
         while (splitctx->next_match <= splitctx->nmatches)
         {
@@ -1236,12 +1310,6 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
                 splitctx->next_match++;
         }
  
-       /*
-        * We don't call cleanup_regexp_matches here; it would try to pfree the
-        * input string, which we didn't copy.  The space is not in a long-lived
-        * memory context anyway.
-        */
-
         PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate, CurrentMemoryContext));
  }
  
@@ -1261,6 +1329,7 @@ regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
  static Datum
  build_regexp_split_result(regexp_matches_ctx *splitctx)
  {
+       char       *buf = splitctx->conv_buf;
         int                     startpos;
         int                     endpos;
  
@@ -1271,22 +1340,29 @@ build_regexp_split_result(regexp_matches_ctx *splitctx)
         if (startpos < 0)
                 elog(ERROR, "invalid match ending position");
  
-       if (splitctx->next_match < splitctx->nmatches)
+       if (buf)
         {
+               int             bufsiz PG_USED_FOR_ASSERTS_ONLY = splitctx->conv_bufsiz;
+               int             len;
+
                 endpos = splitctx->match_locs[splitctx->next_match * 2];
                 if (endpos < startpos)
                         elog(ERROR, "invalid match starting position");
-               return DirectFunctionCall3(text_substr,
-                                                                  PointerGetDatum(splitctx->orig_str),
-                                                                  Int32GetDatum(startpos + 1),
-                                                                  Int32GetDatum(endpos - startpos));
+               len = pg_wchar2mb_with_len(splitctx->wide_str + startpos,
+                                                                  buf,
+                                                                  endpos-startpos);
+               Assert(len < bufsiz);
+               return PointerGetDatum(cstring_to_text_with_len(buf, len));
         }
         else
         {
-               /* no more matches, return rest of string */
-               return DirectFunctionCall2(text_substr_no_len,
+               endpos = splitctx->match_locs[splitctx->next_match * 2];
+               if (endpos < startpos)
+                       elog(ERROR, "invalid match starting position");
+               return DirectFunctionCall3(text_substr,
                                                                    PointerGetDatum(splitctx->orig_str),
-                                                                  Int32GetDatum(startpos + 1));
+                                                                  Int32GetDatum(startpos + 1),
+                                                                  Int32GetDatum(endpos - startpos));
         }
  }
author	Andrew Gierth <rhodiumtoad@postgresql.org>
	Tue, 28 Aug 2018 08:52:25 +0000 (09:52 +0100)
committer	Andrew Gierth <rhodiumtoad@postgresql.org>
	Tue, 28 Aug 2018 11:17:33 +0000 (12:17 +0100)