]> granicus.if.org Git - postgresql/commitdiff
Code review for regexp_matches/regexp_split patch. Refactor to avoid assuming
authorTom Lane <tgl@sss.pgh.pa.us>
Sat, 11 Aug 2007 03:56:24 +0000 (03:56 +0000)
committerTom Lane <tgl@sss.pgh.pa.us>
Sat, 11 Aug 2007 03:56:24 +0000 (03:56 +0000)
that cached compiled patterns will still be there when the function is next
called.  Clean up looping logic, thereby fixing bug identified by Pavel
Stehule.  Share setup code between the two functions, add some comments, and
avoid risky mixing of int and size_t variables.  Clean up the documentation a
tad, and accept all the flag characters mentioned in table 9-19 rather than
just a subset.

doc/src/sgml/func.sgml
src/backend/utils/adt/regexp.c
src/test/regress/expected/strings.out
src/test/regress/sql/strings.sql

index 5d5a86a0c7fa066b2ddc1c9b141f3da155fb285a..4e5f8f1148aba2467323be7a14b91dba2e978642 100644 (file)
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.383 2007/07/18 03:12:42 momjian Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.384 2007/08/11 03:56:24 tgl Exp $ -->
 
  <chapter id="functions">
   <title>Functions and Operators</title>
        <entry><literal><function>regexp_matches</function>(<parameter>string</parameter> <type>text</type>, <parameter>pattern</parameter> <type>text</type> [, <parameter>flags</parameter> <type>text</type>])</literal></entry>
        <entry><type>setof text[]</type></entry>
        <entry>
-        Return all capture groups resulting from matching POSIX regular
+        Return all captured substrings resulting from matching a POSIX regular
         expression against the <parameter>string</parameter>. See
         <xref linkend="functions-posix-regexp"> for more information.
        </entry>
        <entry><literal><function>regexp_replace</function>(<parameter>string</parameter> <type>text</type>, <parameter>pattern</parameter> <type>text</type>, <parameter>replacement</parameter> <type>text</type> [, <parameter>flags</parameter> <type>text</type>])</literal></entry>
        <entry><type>text</type></entry>
        <entry>
-        Replace substring matching POSIX regular expression. See
+        Replace substring(s) matching a POSIX regular expression. See
         <xref linkend="functions-posix-regexp"> for more information.
        </entry>
        <entry><literal>regexp_replace('Thomas', '.[mN]a.', 'M')</literal></entry>
        <entry><literal><function>regexp_split_to_array</function>(<parameter>string</parameter> <type>text</type>, <parameter>pattern</parameter> <type>text</type> [, <parameter>flags</parameter> <type>text</type> ])</literal></entry>
        <entry><type>text[]</type></entry>
        <entry>
-        Split <parameter>string</parameter> using POSIX regular expression as
+        Split <parameter>string</parameter> using POSIX regular expression as
         the delimiter.  See <xref linkend="functions-posix-regexp"> for more
         information.
        </entry>
        <entry><literal><function>regexp_split_to_table</function>(<parameter>string</parameter> <type>text</type>, <parameter>pattern</parameter> <type>text</type> [, <parameter>flags</parameter> <type>text</type>])</literal></entry>
        <entry><type>setof text</type></entry>
        <entry>
-        Split <parameter>string</parameter> using POSIX regular expression as
+        Split <parameter>string</parameter> using POSIX regular expression as
         the delimiter.  See <xref linkend="functions-posix-regexp"> for more
         information.
        </entry>
@@ -2856,11 +2856,9 @@ cast(-44 as bit(12))           <lineannotation>111111010100</lineannotation>
     <acronym>SQL</acronym> <function>LIKE</function> operator, the
     more recent <function>SIMILAR TO</function> operator (added in
     SQL:1999), and <acronym>POSIX</acronym>-style regular
-    expressions.
-    Additionally, a pattern matching function,
-    <function>substring</function>, is available, using either
-    <function>SIMILAR TO</function>-style or POSIX-style regular
-    expressions.
+    expressions.  Aside from the basic <quote>does this string match
+    this pattern?</> operators, functions are available to extract
+    or replace matching substrings and to split a string at the matches.
    </para>
 
    <tip>
@@ -3186,15 +3184,20 @@ substring('foobar' from '#"o_b#"%' for '#')    <lineannotation>NULL</lineannotat
      end of the string.
     </para>
 
-   <para>
-    Some examples:
+    <para>
+     Some examples:
 <programlisting>
 'abc' ~ 'abc'    <lineannotation>true</lineannotation>
 'abc' ~ '^a'     <lineannotation>true</lineannotation>
 'abc' ~ '(b|d)'  <lineannotation>true</lineannotation>
 'abc' ~ '^(b|c)' <lineannotation>false</lineannotation>
 </programlisting>
-   </para>
+    </para>
+
+    <para>
+     The <acronym>POSIX</acronym> pattern language is described in much
+     greater detail below.
+    </para>
 
     <para>
      The <function>substring</> function with two parameters,
@@ -3246,9 +3249,7 @@ substring('foobar' from 'o(.)b')   <lineannotation>o</lineannotation>
      function's behavior.  Flag <literal>i</> specifies case-insensitive
      matching, while flag <literal>g</> specifies replacement of each matching
      substring rather than only the first one.  Other supported flags are
-     <literal>m</>, <literal>n</>, <literal>p</>, <literal>w</> and
-     <literal>x</>, whose meanings correspond to those shown in
-     <xref linkend="posix-embedded-options-table">.
+     described in <xref linkend="posix-embedded-options-table">.
     </para>
 
    <para>
@@ -3264,23 +3265,25 @@ regexp_replace('foobarbaz', 'b(..)', E'X\\1Y', 'g')
    </para>
 
     <para>
-     The <function>regexp_matches</> function returns all of the capture
-     groups resulting from matching a POSIX regular expression pattern.
+     The <function>regexp_matches</> function returns all of the captured
+     substrings resulting from matching a POSIX regular expression pattern.
      It has the syntax
      <function>regexp_matches</function>(<replaceable>string</>, <replaceable>pattern</>
      <optional>, <replaceable>flags</> </optional>).
-     If there is no match to the <replaceable>pattern</>, the function returns no rows.
-     If there is a match, the function returns the contents of all of the capture groups
-     in a text array, or if there were no capture groups in the pattern, it returns the
-     contents of the entire match as a single-element text array.
+     If there is no match to the <replaceable>pattern</>, the function returns
+     no rows.  If there is a match, the function returns a text array whose
+     <replaceable>n</>'th element is the substring matching the
+     <replaceable>n</>'th parenthesized subexpression of the pattern
+     (not counting <quote>non-capturing</> parentheses; see below for
+     details).  If the pattern does not contain any parenthesized
+     subexpressions, then the result is a single-element text array containing
+     the substring matching the whole pattern.
      The <replaceable>flags</> parameter is an optional text
      string containing zero or more single-letter flags that change the
-     function's behavior.  Flag <literal>i</> specifies case-insensitive
-     matching, while flag <literal>g</> causes the return of each matching
-     substring rather than only the first one.  Other supported
-     flags are <literal>m</>, <literal>n</>, <literal>p</>, <literal>w</> and
-     <literal>x</>, whose meanings are described in
-     <xref linkend="posix-embedded-options-table">.
+     function's behavior.  Flag <literal>g</> causes the function to find
+     each match in the string, not only the first one, and return a row for
+     each such match.  Other supported
+     flags are described in <xref linkend="posix-embedded-options-table">.
     </para>
 
    <para>
@@ -3319,16 +3322,14 @@ SELECT regexp_matches('foobarbequebaz', 'barbeque');
      returns the text from the end of the last match to the end of the string.
      The <replaceable>flags</> parameter is an optional text string containing
      zero or more single-letter flags that change the function's behavior.
-     <function>regexp_split_to_table</function> supports the flags <literal>i</>,
-     <literal>m</>, <literal>n</>, <literal>p</>, <literal>w</> and
-     <literal>x</>, whose meanings are described in
+     <function>regexp_split_to_table</function> supports the flags described in
      <xref linkend="posix-embedded-options-table">.
     </para>
 
     <para>
      The <function>regexp_split_to_array</> function behaves the same as
      <function>regexp_split_to_table</>, except that <function>regexp_split_to_array</>
-     returns its results as a <type>text[]</>.  It has the syntax
+     returns its result as an array of <type>text</>.  It has the syntax
      <function>regexp_split_to_array</function>(<replaceable>string</>, <replaceable>pattern</>
      <optional>, <replaceable>flags</> </optional>).
      The parameters are the same as for <function>regexp_split_to_table</>.
index 7f140ddfdcb1a4261d4724a16cd5cd438c1d67db..05c00deaf98040297d49795aac0754f82a86b201 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.71 2007/03/28 22:59:37 neilc Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.72 2007/08/11 03:56:24 tgl Exp $
  *
  *             Alistair Crooks added the code for the regex caching
  *             agc - cached the regular expressions used - there's a good chance
  */
 #include "postgres.h"
 
-#include "access/heapam.h"
 #include "catalog/pg_type.h"
 #include "funcapi.h"
 #include "regex/regex.h"
 #include "utils/builtins.h"
 #include "utils/guc.h"
-#include "utils/lsyscache.h"
+
+#define PG_GETARG_TEXT_P_IF_EXISTS(_n) \
+       (PG_NARGS() > (_n) ? PG_GETARG_TEXT_P(_n) : NULL)
 
 
 /* GUC-settable flavor parameter */
 static int     regex_flavor = REG_ADVANCED;
 
 
+/* all the options of interest for regex functions */
+typedef struct pg_re_flags
+{
+       int                     cflags;                 /* compile flags for Spencer's regex code */
+       bool            glob;                   /* do it globally (for each occurrence) */
+} pg_re_flags;
+
+/* cross-call state for regexp_matches(), also regexp_split() */
+typedef struct regexp_matches_ctx
+{
+       text       *orig_str;           /* data string in original TEXT form */
+       int                     nmatches;               /* number of places where pattern matched */
+       int                     npatterns;              /* number of capturing subpatterns */
+       /* We store start char index and end+1 char index for each match */
+       /* so the number of entries in match_locs is nmatches * npatterns * 2 */
+       int                *match_locs;         /* 0-based character indexes */
+       int                     next_match;             /* 0-based index of next match to process */
+       /* workspace for build_regexp_matches_result() */
+       Datum      *elems;                      /* has npatterns elements */
+       bool       *nulls;                      /* has npatterns elements */
+} regexp_matches_ctx;
+
 /*
  * We cache precompiled regular expressions using a "self organizing list"
  * structure, in which recently-used items tend to be near the front.
@@ -79,48 +102,18 @@ typedef struct cached_re_str
        regex_t         cre_re;                 /* the compiled regular expression */
 } cached_re_str;
 
-typedef struct re_comp_flags
-{
-       int                       cflags;
-       bool              glob;
-} re_comp_flags;
-
-typedef struct regexp_matches_ctx
-{
-       text             *orig_str;
-       size_t            orig_len;
-       pg_wchar         *wide_str;
-       size_t            wide_len;
-       regex_t          *cpattern;
-       regmatch_t       *pmatch;
-       size_t            offset;
-
-       re_comp_flags flags;
-} regexp_matches_ctx;
-
-typedef struct regexp_split_ctx
-{
-       text             *orig_str;
-       size_t            orig_len;
-       pg_wchar         *wide_str;
-       size_t            wide_len;
-       regex_t          *cpattern;
-       regmatch_t        match;
-       size_t            offset;
-       re_comp_flags flags;
-} regexp_split_ctx;
-
-
 static int     num_res = 0;            /* # of cached re's */
 static cached_re_str re_array[MAX_CACHED_RES]; /* cached re's */
 
-static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
-                                                                                               text *flags);
-static ArrayType *perform_regexp_matches(regexp_matches_ctx *matchctx);
 
-static regexp_split_ctx *setup_regexp_split(text *str, text *pattern,
-                                                                                       text *flags);
-static Datum get_next_split(regexp_split_ctx *splitctx);
+/* Local functions */
+static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
+                                                                                               text *flags,
+                                                                                               bool force_glob,
+                                                                                               bool use_subpatterns,
+                                                                                               bool ignore_degenerate);
+static ArrayType *build_regexp_matches_result(regexp_matches_ctx *matchctx);
+static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);
 
 
 /*
@@ -139,7 +132,7 @@ RE_compile_and_cache(text *text_re, int cflags)
 {
        int                     text_re_len = VARSIZE(text_re);
        pg_wchar   *pattern;
-       size_t          pattern_len;
+       int                     pattern_len;
        int                     i;
        int                     regcomp_result;
        cached_re_str re_temp;
@@ -235,7 +228,7 @@ RE_compile_and_cache(text *text_re, int cflags)
 }
 
 /*
- * RE_wchar_execute - execute a RE
+ * RE_wchar_execute - execute a RE on pg_wchar data
  *
  * Returns TRUE on match, FALSE on no match
  *
@@ -250,7 +243,7 @@ RE_compile_and_cache(text *text_re, int cflags)
  */
 static bool
 RE_wchar_execute(regex_t *re, pg_wchar *data, int data_len,
-                                size_t start_search, int nmatch, regmatch_t *pmatch)
+                                int start_search, int nmatch, regmatch_t *pmatch)
 {
        int                     regexec_result;
        char            errMsg[100];
@@ -295,7 +288,7 @@ RE_execute(regex_t *re, char *dat, int dat_len,
                   int nmatch, regmatch_t *pmatch)
 {
        pg_wchar   *data;
-       size_t          data_len;
+       int                     data_len;
        bool            match;
 
        /* Convert data string to wide characters */
@@ -304,6 +297,7 @@ RE_execute(regex_t *re, char *dat, int dat_len,
 
        /* Perform RE match and return result */
        match = RE_wchar_execute(re, data, data_len, 0, nmatch, pmatch);
+
        pfree(data);
        return match;
 }
@@ -334,17 +328,28 @@ RE_compile_and_execute(text *text_re, char *dat, int dat_len,
        return RE_execute(re, dat, dat_len, nmatch, pmatch);
 }
 
+
+/*
+ * parse_re_flags - parse the options argument of regexp_matches and friends
+ *
+ *     flags --- output argument, filled with desired options
+ *     opts --- *untoasted* TEXT object, or NULL for defaults
+ *
+ * This accepts all the options allowed by any of the callers; callers that
+ * don't want some have to reject them after the fact.
+ */
 static void
-parse_re_comp_flags(re_comp_flags *flags, text *opts)
+parse_re_flags(pg_re_flags *flags, text *opts)
 {
-       MemSet(flags, 0, sizeof(re_comp_flags));
+       /* regex_flavor is always folded into the compile flags */
        flags->cflags = regex_flavor;
+       flags->glob = false;
 
        if (opts)
        {
-               char  *opt_p = VARDATA(opts);
-               size_t opt_len = VARSIZE(opts) - VARHDRSZ;
-               int i;
+               char   *opt_p = VARDATA(opts);
+               int             opt_len = VARSIZE(opts) - VARHDRSZ;
+               int             i;
 
                for (i = 0; i < opt_len; i++)
                {
@@ -353,28 +358,49 @@ parse_re_comp_flags(re_comp_flags *flags, text *opts)
                                case 'g':
                                        flags->glob = true;
                                        break;
-                               case 'i':
+                               case 'b':       /* BREs (but why???) */
+                                       flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED | REG_QUOTE);
+                                       break;
+                               case 'c':       /* case sensitive */
+                                       flags->cflags &= ~REG_ICASE;
+                                       break;
+                               case 'e':       /* plain EREs */
+                                       flags->cflags |= REG_EXTENDED;
+                                       flags->cflags &= ~(REG_ADVANCED | REG_QUOTE);
+                                       break;
+                               case 'i':       /* case insensitive */
                                        flags->cflags |= REG_ICASE;
                                        break;
-                               case 'm':
-                               case 'n':
+                               case 'm':       /* Perloid synonym for n */
+                               case 'n':       /* \n affects ^ $ . [^ */
                                        flags->cflags |= REG_NEWLINE;
                                        break;
-                               case 'p':
+                               case 'p':       /* ~Perl, \n affects . [^ */
                                        flags->cflags |= REG_NLSTOP;
                                        flags->cflags &= ~REG_NLANCH;
                                        break;
-                               case 'w':
+                               case 'q':       /* literal string */
+                                       flags->cflags |= REG_QUOTE;
+                                       flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED);
+                                       break;
+                               case 's':       /* single line, \n ordinary */
+                                       flags->cflags &= ~REG_NEWLINE;
+                                       break;
+                               case 't':       /* tight syntax */
+                                       flags->cflags &= ~REG_EXPANDED;
+                                       break;
+                               case 'w':       /* weird, \n affects ^ $ only */
                                        flags->cflags &= ~REG_NLSTOP;
                                        flags->cflags |= REG_NLANCH;
                                        break;
-                               case 'x':
+                               case 'x':       /* expanded syntax */
                                        flags->cflags |= REG_EXPANDED;
                                        break;
                                default:
                                        ereport(ERROR,
                                                        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-                                                        errmsg("invalid regexp option: %c", opt_p[i])));
+                                                        errmsg("invalid regexp option: \"%c\"",
+                                                                       opt_p[i])));
                                        break;
                        }
                }
@@ -409,6 +435,16 @@ assign_regex_flavor(const char *value, bool doit, GucSource source)
 }
 
 
+/*
+ * report whether regex_flavor is currently BASIC
+ */
+bool
+regex_flavor_is_basic(void)
+{
+       return (regex_flavor == REG_BASIC);
+}
+
+
 /*
  *     interface routines called by the function manager
  */
@@ -605,16 +641,17 @@ textregexreplace(PG_FUNCTION_ARGS)
        text       *r = PG_GETARG_TEXT_P(2);
        text       *opt = PG_GETARG_TEXT_P(3);
        regex_t    *re;
-       re_comp_flags flags;
+       pg_re_flags flags;
 
-       parse_re_comp_flags(&flags, opt);
+       parse_re_flags(&flags, opt);
 
        re = RE_compile_and_cache(p, flags.cflags);
 
        PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, flags.glob));
 }
 
-/* similar_escape()
+/*
+ * similar_escape()
  * Convert a SQL99 regexp pattern to POSIX style, so it can be used by
  * our regexp engine.
  */
@@ -735,185 +772,255 @@ similar_escape(PG_FUNCTION_ARGS)
        PG_RETURN_TEXT_P(result);
 }
 
-#define PG_GETARG_TEXT_P_IF_EXISTS(_n) \
-       (PG_NARGS() > _n ? PG_GETARG_TEXT_P(_n) : NULL)
-
+/*
+ * regexp_matches()
+ *             Return a table of matches of a pattern within a string.
+ */
 Datum
 regexp_matches(PG_FUNCTION_ARGS)
 {
        FuncCallContext         *funcctx;
-       MemoryContext            oldcontext;
        regexp_matches_ctx      *matchctx;
 
        if (SRF_IS_FIRSTCALL())
        {
                text *pattern = PG_GETARG_TEXT_P(1);
                text *flags   = PG_GETARG_TEXT_P_IF_EXISTS(2);
+               MemoryContext            oldcontext;
 
                funcctx = SRF_FIRSTCALL_INIT();
                oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
 
                /* be sure to copy the input string into the multi-call ctx */
                matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
-                                                                               flags);
+                                                                               flags, false, true, false);
+
+               /* Pre-create workspace that build_regexp_matches_result needs */
+               matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
+               matchctx->nulls = (bool *) palloc(sizeof(bool) * matchctx->npatterns);
 
                MemoryContextSwitchTo(oldcontext);
                funcctx->user_fctx = (void *) matchctx;
-
-               /*
-                * Avoid run-away function by making sure we never iterate
-                * more than the length of the text + 1 (the number of matches
-                * an empty pattern will make is length + 1)
-                */
-               if (matchctx->flags.glob)
-                       funcctx->max_calls = matchctx->wide_len + 1;
-               else
-                       funcctx->max_calls = 0;
        }
 
        funcctx = SRF_PERCALL_SETUP();
        matchctx = (regexp_matches_ctx *) funcctx->user_fctx;
 
-       if (funcctx->call_cntr > funcctx->max_calls)
-       {
-               /*
-                * If max_calls == 0, then we are doing a non-global match, we
-                * should stop now, no problem.  Otherwise, if we exceed
-                * max_calls something really wonky is going on, since it is
-                * returning more matches than there are characters in the
-                * string, which should not happen
-                */
-               if (funcctx->max_calls != 0)
-                       elog(ERROR, "set returning match function terminated after iterating %d times",
-                                funcctx->call_cntr);
-
-               SRF_RETURN_DONE(funcctx);
-       }
-
-       if (matchctx->offset < matchctx->wide_len)
+       if (matchctx->next_match < matchctx->nmatches)
        {
                ArrayType *result_ary;
 
-               if (matchctx->pmatch[0].rm_so == matchctx->pmatch[0].rm_eo)
-                       matchctx->offset++;
-
-               result_ary = perform_regexp_matches(matchctx);
-               if (result_ary != NULL)
-               {
-                       matchctx->offset = matchctx->pmatch[0].rm_eo;
-                       SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
-               }
-               /* else fall through and return done */
+               result_ary = build_regexp_matches_result(matchctx);
+               matchctx->next_match++;
+               SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
        }
 
        SRF_RETURN_DONE(funcctx);
 }
 
+/* This is separate to keep the opr_sanity regression test from complaining */
 Datum
 regexp_matches_no_flags(PG_FUNCTION_ARGS)
 {
        return regexp_matches(fcinfo);
 }
 
+/*
+ * setup_regexp_matches --- do the initial matching for regexp_matches()
+ *             or regexp_split()
+ *
+ * To avoid having to re-find the compiled pattern on each call, we do
+ * all the matching in one swoop.  The returned regexp_matches_ctx contains
+ * the locations of all the substrings matching the pattern.
+ *
+ * The three bool parameters have only two patterns (one for each caller)
+ * but it seems clearer to distinguish the functionality this way than to
+ * key it all off one "is_split" flag.
+ */
 static regexp_matches_ctx *
-setup_regexp_matches(text *orig_str, text *pattern, text *flags)
+setup_regexp_matches(text *orig_str, text *pattern, text *flags,
+                                        bool force_glob, bool use_subpatterns,
+                                        bool ignore_degenerate)
 {
-       regexp_matches_ctx      *matchctx = palloc(sizeof(regexp_matches_ctx));
-
+       regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx));
+       int                     orig_len;
+       pg_wchar   *wide_str;
+       int                     wide_len;
+       pg_re_flags     re_flags;
+       regex_t    *cpattern;
+       regmatch_t *pmatch;
+       int                     pmatch_len;
+       int                     array_len;
+       int                     array_idx;
+       int                     prev_match_end;
+       int                     start_search;
+
+       /* save original string --- we'll extract result substrings from it */
        matchctx->orig_str = orig_str;
-       matchctx->orig_len = VARSIZE(matchctx->orig_str) - VARHDRSZ;
-
-       parse_re_comp_flags(&matchctx->flags, flags);
 
-       matchctx->cpattern = RE_compile_and_cache(pattern, matchctx->flags.cflags);
-       matchctx->pmatch = palloc(sizeof(regmatch_t) * (matchctx->cpattern->re_nsub + 1));
-       matchctx->offset = 0;
+       /* convert string to pg_wchar form for matching */
+       orig_len = VARSIZE(orig_str) - VARHDRSZ;
+       wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1));
+       wide_len = pg_mb2wchar_with_len(VARDATA(orig_str), wide_str, orig_len);
 
-       matchctx->wide_str = palloc(sizeof(pg_wchar) * (matchctx->orig_len + 1));
-       matchctx->wide_len = pg_mb2wchar_with_len(VARDATA(matchctx->orig_str),
-                                                                                         matchctx->wide_str, matchctx->orig_len);
-
-       matchctx->pmatch[0].rm_so = -1;
-       /* both < 0 but not equal */
-       matchctx->pmatch[0].rm_eo = -2;
+       /* determine options */
+       parse_re_flags(&re_flags, flags);
+       if (force_glob)
+       {
+               /* user mustn't specify 'g' for regexp_split */
+               if (re_flags.glob)
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                        errmsg("regexp_split does not support the global option")));
+               /* but we find all the matches anyway */
+               re_flags.glob = true;
+       }
 
-       return matchctx;
-}
+       /* set up the compiled pattern */
+       cpattern = RE_compile_and_cache(pattern, re_flags.cflags);
 
-static ArrayType *
-perform_regexp_matches(regexp_matches_ctx *matchctx)
-{
-       Datum           *elems;
-       bool            *nulls;
-       Datum            fullmatch;             /* used to avoid a palloc if no matches */
-       int              ndims = 1;
-       int              dims[1];
-       int          lbs[1] = {1};
-
-       if (RE_wchar_execute(matchctx->cpattern,
-                                                matchctx->wide_str,
-                                                matchctx->wide_len,
-                                                matchctx->offset,
-                                                matchctx->cpattern->re_nsub + 1,
-                                                matchctx->pmatch) == false)
-               return NULL;
-
-       if (matchctx->cpattern->re_nsub > 0)
+       /* do we want to remember subpatterns? */
+       if (use_subpatterns && cpattern->re_nsub > 0)
        {
-               int i;
+               matchctx->npatterns = cpattern->re_nsub;
+               pmatch_len = cpattern->re_nsub + 1;
+       }
+       else
+       {
+               use_subpatterns = false;
+               matchctx->npatterns = 1;
+               pmatch_len = 1;
+       }
 
-               elems = palloc(sizeof(Datum) * matchctx->cpattern->re_nsub);
-               nulls = palloc(sizeof(bool) * matchctx->cpattern->re_nsub);
-               dims[0] = matchctx->cpattern->re_nsub;
+       /* temporary output space for RE package */
+       pmatch = palloc(sizeof(regmatch_t) * pmatch_len);
 
-               for (i = 0; i < matchctx->cpattern->re_nsub; i++)
+       /* the real output space (grown dynamically if needed) */
+       array_len = re_flags.glob ? 256 : 32;
+       matchctx->match_locs = (int *) palloc(sizeof(int) * array_len);
+       array_idx = 0;
+
+       /* search for the pattern, perhaps repeatedly */
+       prev_match_end = 0;
+       start_search = 0;
+       while (RE_wchar_execute(cpattern, wide_str, wide_len, start_search,
+                                                       pmatch_len, pmatch))
+       {
+               /*
+                * If requested, ignore degenerate matches, which are zero-length
+                * matches occurring at the start or end of a string or just after
+                * a previous match.
+                */
+               if (!ignore_degenerate ||
+                       (pmatch[0].rm_so < wide_len &&
+                        pmatch[0].rm_eo > prev_match_end))
                {
-                       int so = matchctx->pmatch[i + 1].rm_so;
-                       int     eo = matchctx->pmatch[i + 1].rm_eo;
+                       /* enlarge output space if needed */
+                       while (array_idx + matchctx->npatterns * 2 > array_len)
+                       {
+                               array_len *= 2;
+                               matchctx->match_locs = (int *) repalloc(matchctx->match_locs,
+                                                                                                               sizeof(int) * array_len);
+                       }
 
-                       if (so < 0 || eo < 0)
+                       /* save this match's locations */
+                       if (use_subpatterns)
                        {
-                               elems[i] = 0;
-                               nulls[i] = true;
+                               int i;
+
+                               for (i = 1; i <= matchctx->npatterns; i++)
+                               {
+                                       matchctx->match_locs[array_idx++] = pmatch[i].rm_so;
+                                       matchctx->match_locs[array_idx++] = pmatch[i].rm_eo;
+                               }
                        }
                        else
                        {
-                               elems[i] = DirectFunctionCall3(text_substr,
-                                                                                          PointerGetDatum(matchctx->orig_str),
-                                                                                          Int32GetDatum(so + 1),
-                                                                                          Int32GetDatum(eo - so));
-                               nulls[i] = false;
+                               matchctx->match_locs[array_idx++] = pmatch[0].rm_so;
+                               matchctx->match_locs[array_idx++] = pmatch[0].rm_eo;
                        }
+                       matchctx->nmatches++;
                }
+               prev_match_end = pmatch[0].rm_eo;
+
+               /* if not glob, stop after one match */
+               if (!re_flags.glob)
+                       break;
+
+               /*
+                * Advance search position.  Normally we start just after the end
+                * of the previous match, but always advance at least one character
+                * (the special case can occur if the pattern matches zero characters
+                * just after the prior match or at the end of the string).
+                */
+               if (start_search < pmatch[0].rm_eo)
+                       start_search = pmatch[0].rm_eo;
+               else
+                       start_search++;
+               if (start_search > wide_len)
+                       break;
        }
-       else
-       {
-               int so = matchctx->pmatch[0].rm_so;
-               int     eo = matchctx->pmatch[0].rm_eo;
 
-               if (so < 0 || eo < 0)
-                       elog(ERROR, "regexp code said it had a match, but did not return it");
+       /* Clean up temp storage */
+       pfree(wide_str);
+       pfree(pmatch);
 
-               fullmatch = DirectFunctionCall3(text_substr,
-                                                                               PointerGetDatum(matchctx->orig_str),
-                                                                               Int32GetDatum(so + 1),
-                                                                               Int32GetDatum(eo - so));
+       return matchctx;
+}
+
+/*
+ * build_regexp_matches_result - build output array for current match
+ */
+static ArrayType *
+build_regexp_matches_result(regexp_matches_ctx *matchctx)
+{
+       Datum      *elems = matchctx->elems;
+       bool       *nulls = matchctx->nulls;
+       int             dims[1];
+       int         lbs[1];
+       int                     loc;
+       int                     i;
 
-               elems = &fullmatch;
-               nulls = NULL;
-               dims[0] = 1;
+       /* Extract matching substrings from the original string */
+       loc = matchctx->next_match * matchctx->npatterns * 2;
+       for (i = 0; i < matchctx->npatterns; i++)
+       {
+               int     so = matchctx->match_locs[loc++];
+               int     eo = matchctx->match_locs[loc++];
+
+               if (so < 0 || eo < 0)
+               {
+                       elems[i] = (Datum) 0;
+                       nulls[i] = true;
+               }
+               else
+               {
+                       elems[i] = DirectFunctionCall3(text_substr,
+                                                                                  PointerGetDatum(matchctx->orig_str),
+                                                                                  Int32GetDatum(so + 1),
+                                                                                  Int32GetDatum(eo - so));
+                       nulls[i] = false;
+               }
        }
 
+       /* And form an array */
+       dims[0] = matchctx->npatterns;
+       lbs[0] = 1;
        /* XXX: this hardcodes assumptions about the text type */
-       return construct_md_array(elems, nulls, ndims, dims, lbs,
+       return construct_md_array(elems, nulls, 1, dims, lbs,
                                                          TEXTOID, -1, false, 'i');
 }
 
+/*
+ * regexp_split_to_table()
+ *             Split the string at matches of the pattern, returning the
+ *             split-out substrings as a table.
+ */
 Datum
 regexp_split_to_table(PG_FUNCTION_ARGS)
 {
        FuncCallContext  *funcctx;
-       regexp_split_ctx *splitctx;
+       regexp_matches_ctx *splitctx;
 
        if (SRF_IS_FIRSTCALL())
        {
@@ -924,168 +1031,102 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
                funcctx = SRF_FIRSTCALL_INIT();
                oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
 
-               splitctx = setup_regexp_split(PG_GETARG_TEXT_P_COPY(0), pattern, flags);
+               /* be sure to copy the input string into the multi-call ctx */
+               splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
+                                                                               flags, true, false, true);
 
                MemoryContextSwitchTo(oldcontext);
                funcctx->user_fctx = (void *) splitctx;
-
-               /*
-                * Avoid run-away function by making sure we never iterate
-                * more than the length of the text
-                */
-               funcctx->max_calls = splitctx->wide_len;
        }
 
        funcctx = SRF_PERCALL_SETUP();
-       splitctx = (regexp_split_ctx *) funcctx->user_fctx;
+       splitctx = (regexp_matches_ctx *) funcctx->user_fctx;
 
-       if (funcctx->call_cntr > funcctx->max_calls)
+       if (splitctx->next_match <= splitctx->nmatches)
        {
-               /*
-                * If we exceed wide_len something really wonky is going on,
-                * since it is returning more matches than there are
-                * characters in the string, which should not happen
-                */
-               elog(ERROR, "set returning split function terminated after iterating %d times",
-                        funcctx->call_cntr);
+               Datum result = build_regexp_split_result(splitctx);
+
+               splitctx->next_match++;
+               SRF_RETURN_NEXT(funcctx, result);
        }
 
-       if (splitctx->offset < splitctx->wide_len)
-               SRF_RETURN_NEXT(funcctx, get_next_split(splitctx));
-       else
-               SRF_RETURN_DONE(funcctx);
+       SRF_RETURN_DONE(funcctx);
 }
 
+/* This is separate to keep the opr_sanity regression test from complaining */
 Datum regexp_split_to_table_no_flags(PG_FUNCTION_ARGS)
 {
        return regexp_split_to_table(fcinfo);
 }
 
+/*
+ * regexp_split_to_array()
+ *             Split the string at matches of the pattern, returning the
+ *             split-out substrings as an array.
+ */
 Datum regexp_split_to_array(PG_FUNCTION_ARGS)
 {
        ArrayBuildState         *astate = NULL;
-       regexp_split_ctx        *splitctx;
-       int                              nitems;
+       regexp_matches_ctx      *splitctx;
 
-       splitctx = setup_regexp_split(PG_GETARG_TEXT_P(0),
-                                                                 PG_GETARG_TEXT_P(1),
-                                                                 PG_GETARG_TEXT_P_IF_EXISTS(2));
+       splitctx = setup_regexp_matches(PG_GETARG_TEXT_P(0),
+                                                                       PG_GETARG_TEXT_P(1),
+                                                                       PG_GETARG_TEXT_P_IF_EXISTS(2),
+                                                                       true, false, true);
 
-       for (nitems = 0; splitctx->offset < splitctx->wide_len; nitems++)
+       while (splitctx->next_match <= splitctx->nmatches)
        {
-               if (nitems > splitctx->wide_len)
-                       elog(ERROR, "split function terminated after iterating %d times",
-                                nitems);
-
                astate = accumArrayResult(astate,
-                                                                 get_next_split(splitctx),
+                                                                 build_regexp_split_result(splitctx),
                                                                  false,
                                                                  TEXTOID,
                                                                  CurrentMemoryContext);
+               splitctx->next_match++;
        }
 
        PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate, CurrentMemoryContext));
 }
 
+/* This is separate to keep the opr_sanity regression test from complaining */
 Datum regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
 {
        return regexp_split_to_array(fcinfo);
 }
 
-static regexp_split_ctx *
-setup_regexp_split(text *str, text *pattern, text *flags)
-{
-       regexp_split_ctx *splitctx = palloc(sizeof(regexp_split_ctx));
-
-       splitctx->orig_str = str;
-       splitctx->orig_len = VARSIZE(splitctx->orig_str) - VARHDRSZ;
-
-       parse_re_comp_flags(&splitctx->flags, flags);
-       if (splitctx->flags.glob)
-               ereport(ERROR,
-                               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-                                errmsg("regexp_split does not support the global option")));
-
-       splitctx->cpattern = RE_compile_and_cache(pattern, splitctx->flags.cflags);
-
-       splitctx->wide_str = palloc(sizeof(pg_wchar) * (splitctx->orig_len + 1));
-       splitctx->wide_len = pg_mb2wchar_with_len(VARDATA(splitctx->orig_str),
-                                                                                         splitctx->wide_str,
-                                                                                         splitctx->orig_len);
-
-       splitctx->offset = 0;
-
-       splitctx->match.rm_so = -1;
-       /* both < 0 but not equal */
-       splitctx->match.rm_eo = -2;
-
-       return splitctx;
-}
-
+/*
+ * build_regexp_split_result - build output string for current match
+ *
+ * We return the string between the current match and the previous one,
+ * or the string after the last match when next_match == nmatches.
+ */
 static Datum
-get_next_split(regexp_split_ctx *splitctx)
+build_regexp_split_result(regexp_matches_ctx *splitctx)
 {
-       regmatch_t *pmatch = &(splitctx->match);
-
-       for (;;)
-       {
-               Datum result;
-               int       startpos = splitctx->offset + 1;
-
-               /*
-                * If the last match was zero-length, we need to push the
-                * offset forward to avoid matching the same place forever
-                */
-               if (pmatch->rm_so == pmatch->rm_eo)
-                       splitctx->offset++;
-
-               if (RE_wchar_execute(splitctx->cpattern,
-                                                        splitctx->wide_str,
-                                                        splitctx->wide_len,
-                                                        splitctx->offset,
-                                                        1,
-                                                        pmatch))
-               {
-                       int length = splitctx->match.rm_so - startpos + 1;
-
-                       /*
-                        * If we are trying to match at the beginning of the string and
-                        * we got a zero-length match, or if we just matched where we
-                        * left off last time, go around the loop again and increment
-                        * the offset.  If we have incremented the offset already and
-                        * it matched at the new offset, that's ok
-                        */
-                       if (length == 0)
-                               continue;
+       int             startpos;
+       int             endpos;
 
-                       result = DirectFunctionCall3(text_substr,
-                                                                                PointerGetDatum(splitctx->orig_str),
-                                                                                Int32GetDatum(startpos),
-                                                                                Int32GetDatum(length));
-
-                       /* set the offset to the end of this match for next time */
-                       splitctx->offset = pmatch->rm_eo;
-
-                       return result;
-               }
+       if (splitctx->next_match > 0)
+               startpos = splitctx->match_locs[splitctx->next_match * 2 - 1];
+       else
+               startpos = 0;
+       if (startpos < 0)
+               elog(ERROR, "invalid match ending position");
 
+       if (splitctx->next_match < splitctx->nmatches)
+       {
+               endpos = splitctx->match_locs[splitctx->next_match * 2];
+               if (endpos < startpos)
+                       elog(ERROR, "invalid match starting position");
+               return DirectFunctionCall3(text_substr,
+                                                                  PointerGetDatum(splitctx->orig_str),
+                                                                  Int32GetDatum(startpos + 1),
+                                                                  Int32GetDatum(endpos - startpos));
+       }
+       else
+       {
                /* no more matches, return rest of string */
-               result = DirectFunctionCall2(text_substr_no_len,
-                                                                        PointerGetDatum(splitctx->orig_str),
-                                                                        Int32GetDatum(startpos));
-
-               /* so we know we're done next time through */
-               splitctx->offset = splitctx->wide_len;
-
-               return result;
+               return DirectFunctionCall2(text_substr_no_len,
+                                                                  PointerGetDatum(splitctx->orig_str),
+                                                                  Int32GetDatum(startpos + 1));
        }
 }
-
-/*
- * report whether regex_flavor is currently BASIC
- */
-bool
-regex_flavor_is_basic(void)
-{
-       return (regex_flavor == REG_BASIC);
-}
index 31ab2bb546379cc28db93956e1cd2a656512df7c..dac28eb10c3c3d05ba86c83aa89ed94d9d9b3af2 100644 (file)
@@ -219,7 +219,7 @@ SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'gi');
 
 -- invalid regexp option
 SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'z');
-ERROR:  invalid regexp option: z
+ERROR:  invalid regexp option: "z"
 -- set so we can tell NULL from empty string
 \pset null '\\N'
 -- return all matches from regexp
@@ -272,8 +272,8 @@ SELECT regexp_matches('foobarbequebaz', $re$barbeque$re$);
 (1 row)
 
 -- give me errors
-SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$, 'zipper');
-ERROR:  invalid regexp option: z
+SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$, 'gz');
+ERROR:  invalid regexp option: "z"
 SELECT regexp_matches('foobarbequebaz', $re$(barbeque$re$);
 ERROR:  invalid regular expression: parentheses () not balanced
 SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque){2,1}$re$);
@@ -431,11 +431,30 @@ SELECT regexp_split_to_array('the quick brown fox jumped over the lazy dog', 'no
  {"the quick brown fox jumped over the lazy dog"}
 (1 row)
 
+-- some corner cases
+SELECT regexp_split_to_array('123456','1');
+ regexp_split_to_array 
+-----------------------
+ {"",23456}
+(1 row)
+
+SELECT regexp_split_to_array('123456','6');
+ regexp_split_to_array 
+-----------------------
+ {12345,""}
+(1 row)
+
+SELECT regexp_split_to_array('123456','.');
+ regexp_split_to_array  
+------------------------
+ {"","","","","","",""}
+(1 row)
+
 -- errors
 SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'zippy') AS foo;
-ERROR:  invalid regexp option: z
-SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'zippy');
-ERROR:  invalid regexp option: z
+ERROR:  invalid regexp option: "z"
+SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'iz');
+ERROR:  invalid regexp option: "z"
 -- global option meaningless for regexp_split
 SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'g') AS foo;
 ERROR:  regexp_split does not support the global option
index 43b35ab667ffbc4d03a627e28e4f8e1845622a3d..9ab6c218728ac6b222edacdb14836184a5efd58d 100644 (file)
@@ -111,7 +111,7 @@ SELECT regexp_matches('foobarbequebaz', $re$(bar)(.+)?(beque)$re$);
 SELECT regexp_matches('foobarbequebaz', $re$barbeque$re$);
 
 -- give me errors
-SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$, 'zipper');
+SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$, 'gz');
 SELECT regexp_matches('foobarbequebaz', $re$(barbeque$re$);
 SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque){2,1}$re$);
 
@@ -129,9 +129,13 @@ SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e'
 -- no match of pattern
 SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumped over the lazy dog', 'nomatch') AS foo;
 SELECT regexp_split_to_array('the quick brown fox jumped over the lazy dog', 'nomatch');
+-- some corner cases
+SELECT regexp_split_to_array('123456','1');
+SELECT regexp_split_to_array('123456','6');
+SELECT regexp_split_to_array('123456','.');
 -- errors
 SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'zippy') AS foo;
-SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'zippy');
+SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'iz');
 -- global option meaningless for regexp_split
 SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'g') AS foo;
 SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'g');