granicus.if.org Git - postgresql/commitdiff
Repair bug in regexp split performance improvements.
author: Andrew Gierth <rhodiumtoad@postgresql.org>
Wed, 12 Sep 2018 18:31:06 +0000 (19:31 +0100)
committer: Andrew Gierth <rhodiumtoad@postgresql.org>
Wed, 12 Sep 2018 18:45:13 +0000 (19:45 +0100)
Commit c8ea87e4b introduced a temporary conversion buffer for
substrings extracted during regexp splits. Unfortunately the code that
sized it was failing to ignore the effects of ignored degenerate
regexp matches, so for regexp_split_* calls it could under-size the
buffer in such cases.

Fix, and add some regression test cases (though those will only catch
the bug if run in a multibyte encoding).

Backpatch to 9.3, as the faulty code was also backpatched that far.

Thanks to the PostGIS project, Regina Obe and Paul Ramsey for the
report (via IRC) and assistance in analysis. Patch by me.

src/backend/utils/adt/regexp.c
src/test/regress/expected/strings.out
src/test/regress/sql/strings.sql

index e3a852769f6b9643fdfad45679f59098e097dca5..7463c0dd2fff8f87efb8635f0eddc83b2706ba5d 100644 (file)
@@ -936,6 +936,7 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
        int                     array_len;
        int                     array_idx;
        int                     prev_match_end;
+       int                     prev_valid_match_end;
        int                     start_search;
        int                     maxlen = 0;             /* largest fetch length in characters */
 
@@ -991,6 +992,7 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
 
        /* search for the pattern, perhaps repeatedly */
        prev_match_end = 0;
+       prev_valid_match_end = 0;
        start_search = 0;
        while (RE_wchar_execute(cpattern, wide_str, wide_len, start_search,
                                                        pmatch_len, pmatch))
@@ -1043,13 +1045,15 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
                        matchctx->nmatches++;
 
                        /*
-                        * check length of unmatched portion between end of previous match
-                        * and start of current one
+                        * check length of unmatched portion between end of previous valid
+                        * (nondegenerate, or degenerate but not ignored) match and start
+                        * of current one
                         */
                        if (fetching_unmatched &&
                                pmatch[0].rm_so >= 0 &&
-                               (pmatch[0].rm_so - prev_match_end) > maxlen)
-                               maxlen = (pmatch[0].rm_so - prev_match_end);
+                               (pmatch[0].rm_so - prev_valid_match_end) > maxlen)
+                               maxlen = (pmatch[0].rm_so - prev_valid_match_end);
+                       prev_valid_match_end = pmatch[0].rm_eo;
                }
                prev_match_end = pmatch[0].rm_eo;
 
@@ -1075,8 +1079,8 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
         * input string
         */
        if (fetching_unmatched &&
-               (wide_len - prev_match_end) > maxlen)
-               maxlen = (wide_len - prev_match_end);
+               (wide_len - prev_valid_match_end) > maxlen)
+               maxlen = (wide_len - prev_valid_match_end);
 
        /*
         * Keep a note of the end position of the string for the benefit of
index 19708c32fdd30b9566050b55ef82843b72989923..8e7470d1cbbfd9173e3303ec6f06a81b01e2ef69 100644 (file)
@@ -674,6 +674,24 @@ SELECT regexp_split_to_array('123456','.');
  {"","","","","","",""}
 (1 row)
 
+SELECT regexp_split_to_array('123456','');
+ regexp_split_to_array 
+-----------------------
+ {1,2,3,4,5,6}
+(1 row)
+
+SELECT regexp_split_to_array('123456','(?:)');
+ regexp_split_to_array 
+-----------------------
+ {1,2,3,4,5,6}
+(1 row)
+
+SELECT regexp_split_to_array('1','');
+ regexp_split_to_array 
+-----------------------
+ {1}
+(1 row)
+
 -- errors
 SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'zippy') AS foo;
 ERROR:  invalid regexp option: "z"
index f9cfaeb44ac2f55d8f5c859a0951aaba9e1dd381..5e39458bd2234d5e4b13016115bca137f01831db 100644 (file)
@@ -188,6 +188,9 @@ SELECT regexp_split_to_array('the quick brown fox jumps over the lazy dog', 'nom
 SELECT regexp_split_to_array('123456','1');
 SELECT regexp_split_to_array('123456','6');
 SELECT regexp_split_to_array('123456','.');
+SELECT regexp_split_to_array('123456','');
+SELECT regexp_split_to_array('123456','(?:)');
+SELECT regexp_split_to_array('1','');
 -- errors
 SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'zippy') AS foo;
 SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'iz');