Code review for regexp_matches/regexp_split patch. Refactor to avoid assuming

author Tom Lane <tgl@sss.pgh.pa.us>

Sat, 11 Aug 2007 03:56:24 +0000 (03:56 +0000)

committer Tom Lane <tgl@sss.pgh.pa.us>

Sat, 11 Aug 2007 03:56:24 +0000 (03:56 +0000)
author Tom Lane <tgl@sss.pgh.pa.us>
Sat, 11 Aug 2007 03:56:24 +0000 (03:56 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Sat, 11 Aug 2007 03:56:24 +0000 (03:56 +0000)
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml

index 5d5a86a0c7fa066b2ddc1c9b141f3da155fb285a..4e5f8f1148aba2467323be7a14b91dba2e978642 100644 (file)
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.383 2007/07/18 03:12:42 momjian Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.384 2007/08/11 03:56:24 tgl Exp $ -->
  
   <chapter id="functions">
    <title>Functions and Operators</title>
@@ -1499,7 +1499,7 @@
         <entry><literal><function>regexp_matches</function>(<parameter>string</parameter> <type>text</type>, <parameter>pattern</parameter> <type>text</type> [, <parameter>flags</parameter> <type>text</type>])</literal></entry>
         <entry><type>setof text[]</type></entry>
         <entry>
-        Return all capture groups resulting from matching POSIX regular
+        Return all captured substrings resulting from matching a POSIX regular
          expression against the <parameter>string</parameter>. See
          <xref linkend="functions-posix-regexp"> for more information.
         </entry>
@@ -1511,7 +1511,7 @@
         <entry><literal><function>regexp_replace</function>(<parameter>string</parameter> <type>text</type>, <parameter>pattern</parameter> <type>text</type>, <parameter>replacement</parameter> <type>text</type> [, <parameter>flags</parameter> <type>text</type>])</literal></entry>
         <entry><type>text</type></entry>
         <entry>
-        Replace substring matching POSIX regular expression. See
+        Replace substring(s) matching a POSIX regular expression. See
          <xref linkend="functions-posix-regexp"> for more information.
         </entry>
         <entry><literal>regexp_replace('Thomas', '.[mN]a.', 'M')</literal></entry>
@@ -1522,7 +1522,7 @@
         <entry><literal><function>regexp_split_to_array</function>(<parameter>string</parameter> <type>text</type>, <parameter>pattern</parameter> <type>text</type> [, <parameter>flags</parameter> <type>text</type> ])</literal></entry>
         <entry><type>text[]</type></entry>
         <entry>
-        Split <parameter>string</parameter> using POSIX regular expression as
+        Split <parameter>string</parameter> using a POSIX regular expression as
          the delimiter.  See <xref linkend="functions-posix-regexp"> for more
          information.
         </entry>
@@ -1534,7 +1534,7 @@
         <entry><literal><function>regexp_split_to_table</function>(<parameter>string</parameter> <type>text</type>, <parameter>pattern</parameter> <type>text</type> [, <parameter>flags</parameter> <type>text</type>])</literal></entry>
         <entry><type>setof text</type></entry>
         <entry>
-        Split <parameter>string</parameter> using POSIX regular expression as
+        Split <parameter>string</parameter> using a POSIX regular expression as
          the delimiter.  See <xref linkend="functions-posix-regexp"> for more
          information.
         </entry>
@@ -2856,11 +2856,9 @@ cast(-44 as bit(12))           <lineannotation>111111010100</lineannotation>
      <acronym>SQL</acronym> <function>LIKE</function> operator, the
      more recent <function>SIMILAR TO</function> operator (added in
      SQL:1999), and <acronym>POSIX</acronym>-style regular
-    expressions.
-    Additionally, a pattern matching function,
-    <function>substring</function>, is available, using either
-    <function>SIMILAR TO</function>-style or POSIX-style regular
-    expressions.
+    expressions.  Aside from the basic <quote>does this string match
+    this pattern?</> operators, functions are available to extract
+    or replace matching substrings and to split a string at the matches.
     </para>
  
     <tip>
@@ -3186,15 +3184,20 @@ substring('foobar' from '#"o_b#"%' for '#')    <lineannotation>NULL</lineannotat
       end of the string.
      </para>
  
-   <para>
-    Some examples:
+    <para>
+     Some examples:
  <programlisting>
  'abc' ~ 'abc'    <lineannotation>true</lineannotation>
  'abc' ~ '^a'     <lineannotation>true</lineannotation>
  'abc' ~ '(b|d)'  <lineannotation>true</lineannotation>
  'abc' ~ '^(b|c)' <lineannotation>false</lineannotation>
  </programlisting>
-   </para>
+    </para>
+
+    <para>
+     The <acronym>POSIX</acronym> pattern language is described in much
+     greater detail below.
+    </para>
  
      <para>
       The <function>substring</> function with two parameters,
@@ -3246,9 +3249,7 @@ substring('foobar' from 'o(.)b')   <lineannotation>o</lineannotation>
       function's behavior.  Flag <literal>i</> specifies case-insensitive
       matching, while flag <literal>g</> specifies replacement of each matching
       substring rather than only the first one.  Other supported flags are
-     <literal>m</>, <literal>n</>, <literal>p</>, <literal>w</> and
-     <literal>x</>, whose meanings correspond to those shown in
-     <xref linkend="posix-embedded-options-table">.
+     described in <xref linkend="posix-embedded-options-table">.
      </para>
  
     <para>
@@ -3264,23 +3265,25 @@ regexp_replace('foobarbaz', 'b(..)', E'X\\1Y', 'g')
     </para>
  
      <para>
-     The <function>regexp_matches</> function returns all of the capture
-     groups resulting from matching a POSIX regular expression pattern.
+     The <function>regexp_matches</> function returns all of the captured
+     substrings resulting from matching a POSIX regular expression pattern.
       It has the syntax
       <function>regexp_matches</function>(<replaceable>string</>, <replaceable>pattern</>
       <optional>, <replaceable>flags</> </optional>).
-     If there is no match to the <replaceable>pattern</>, the function returns no rows.
-     If there is a match, the function returns the contents of all of the capture groups
-     in a text array, or if there were no capture groups in the pattern, it returns the
-     contents of the entire match as a single-element text array.
+     If there is no match to the <replaceable>pattern</>, the function returns
+     no rows.  If there is a match, the function returns a text array whose
+     <replaceable>n</>'th element is the substring matching the
+     <replaceable>n</>'th parenthesized subexpression of the pattern
+     (not counting <quote>non-capturing</> parentheses; see below for
+     details).  If the pattern does not contain any parenthesized
+     subexpressions, then the result is a single-element text array containing
+     the substring matching the whole pattern.
       The <replaceable>flags</> parameter is an optional text
       string containing zero or more single-letter flags that change the
-     function's behavior.  Flag <literal>i</> specifies case-insensitive
-     matching, while flag <literal>g</> causes the return of each matching
-     substring rather than only the first one.  Other supported
-     flags are <literal>m</>, <literal>n</>, <literal>p</>, <literal>w</> and
-     <literal>x</>, whose meanings are described in
-     <xref linkend="posix-embedded-options-table">.
+     function's behavior.  Flag <literal>g</> causes the function to find
+     each match in the string, not only the first one, and return a row for
+     each such match.  Other supported
+     flags are described in <xref linkend="posix-embedded-options-table">.
      </para>
  
     <para>
@@ -3319,16 +3322,14 @@ SELECT regexp_matches('foobarbequebaz', 'barbeque');
       returns the text from the end of the last match to the end of the string.
       The <replaceable>flags</> parameter is an optional text string containing
       zero or more single-letter flags that change the function's behavior.
-     <function>regexp_split_to_table</function> supports the flags <literal>i</>,
-     <literal>m</>, <literal>n</>, <literal>p</>, <literal>w</> and
-     <literal>x</>, whose meanings are described in
+     <function>regexp_split_to_table</function> supports the flags described in
       <xref linkend="posix-embedded-options-table">.
      </para>
  
      <para>
       The <function>regexp_split_to_array</> function behaves the same as
       <function>regexp_split_to_table</>, except that <function>regexp_split_to_array</>
-     returns its results as a <type>text[]</>.  It has the syntax
+     returns its result as an array of <type>text</>.  It has the syntax
       <function>regexp_split_to_array</function>(<replaceable>string</>, <replaceable>pattern</>
       <optional>, <replaceable>flags</> </optional>).
       The parameters are the same as for <function>regexp_split_to_table</>.
diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c

index 7f140ddfdcb1a4261d4724a16cd5cd438c1d67db..05c00deaf98040297d49795aac0754f82a86b201 100644 (file)
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -8,7 +8,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.71 2007/03/28 22:59:37 neilc Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.72 2007/08/11 03:56:24 tgl Exp $
   *
   *             Alistair Crooks added the code for the regex caching
   *             agc - cached the regular expressions used - there's a good chance
@@ -29,19 +29,42 @@
   */
  #include "postgres.h"
  
-#include "access/heapam.h"
  #include "catalog/pg_type.h"
  #include "funcapi.h"
  #include "regex/regex.h"
  #include "utils/builtins.h"
  #include "utils/guc.h"
-#include "utils/lsyscache.h"
+
+#define PG_GETARG_TEXT_P_IF_EXISTS(_n) \
+       (PG_NARGS() > (_n) ? PG_GETARG_TEXT_P(_n) : NULL)
  
  
  /* GUC-settable flavor parameter */
  static int     regex_flavor = REG_ADVANCED;
  
  
+/* all the options of interest for regex functions */
+typedef struct pg_re_flags
+{
+       int                     cflags;                 /* compile flags for Spencer's regex code */
+       bool            glob;                   /* do it globally (for each occurrence) */
+} pg_re_flags;
+
+/* cross-call state for regexp_matches(), also regexp_split() */
+typedef struct regexp_matches_ctx
+{
+       text       *orig_str;           /* data string in original TEXT form */
+       int                     nmatches;               /* number of places where pattern matched */
+       int                     npatterns;              /* number of capturing subpatterns */
+       /* We store start char index and end+1 char index for each match */
+       /* so the number of entries in match_locs is nmatches * npatterns * 2 */
+       int                *match_locs;         /* 0-based character indexes */
+       int                     next_match;             /* 0-based index of next match to process */
+       /* workspace for build_regexp_matches_result() */
+       Datum      *elems;                      /* has npatterns elements */
+       bool       *nulls;                      /* has npatterns elements */
+} regexp_matches_ctx;
+
  /*
   * We cache precompiled regular expressions using a "self organizing list"
   * structure, in which recently-used items tend to be near the front.
@@ -79,48 +102,18 @@ typedef struct cached_re_str
         regex_t         cre_re;                 /* the compiled regular expression */
  } cached_re_str;
  
-typedef struct re_comp_flags
-{
-       int                       cflags;
-       bool              glob;
-} re_comp_flags;
-
-typedef struct regexp_matches_ctx
-{
-       text             *orig_str;
-       size_t            orig_len;
-       pg_wchar         *wide_str;
-       size_t            wide_len;
-       regex_t          *cpattern;
-       regmatch_t       *pmatch;
-       size_t            offset;
-
-       re_comp_flags flags;
-} regexp_matches_ctx;
-
-typedef struct regexp_split_ctx
-{
-       text             *orig_str;
-       size_t            orig_len;
-       pg_wchar         *wide_str;
-       size_t            wide_len;
-       regex_t          *cpattern;
-       regmatch_t        match;
-       size_t            offset;
-       re_comp_flags flags;
-} regexp_split_ctx;
-
-
  static int     num_res = 0;            /* # of cached re's */
  static cached_re_str re_array[MAX_CACHED_RES]; /* cached re's */
  
-static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
-                                                                                               text *flags);
-static ArrayType *perform_regexp_matches(regexp_matches_ctx *matchctx);
  
-static regexp_split_ctx *setup_regexp_split(text *str, text *pattern,
-                                                                                       text *flags);
-static Datum get_next_split(regexp_split_ctx *splitctx);
+/* Local functions */
+static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
+                                                                                               text *flags,
+                                                                                               bool force_glob,
+                                                                                               bool use_subpatterns,
+                                                                                               bool ignore_degenerate);
+static ArrayType *build_regexp_matches_result(regexp_matches_ctx *matchctx);
+static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);
  
  
  /*
@@ -139,7 +132,7 @@ RE_compile_and_cache(text *text_re, int cflags)
  {
         int                     text_re_len = VARSIZE(text_re);
         pg_wchar   *pattern;
-       size_t          pattern_len;
+       int                     pattern_len;
         int                     i;
         int                     regcomp_result;
         cached_re_str re_temp;
@@ -235,7 +228,7 @@ RE_compile_and_cache(text *text_re, int cflags)
  }
  
  /*
- * RE_wchar_execute - execute a RE
+ * RE_wchar_execute - execute a RE on pg_wchar data
   *
   * Returns TRUE on match, FALSE on no match
   *
@@ -250,7 +243,7 @@ RE_compile_and_cache(text *text_re, int cflags)
   */
  static bool
  RE_wchar_execute(regex_t *re, pg_wchar *data, int data_len,
-                                size_t start_search, int nmatch, regmatch_t *pmatch)
+                                int start_search, int nmatch, regmatch_t *pmatch)
  {
         int                     regexec_result;
         char            errMsg[100];
@@ -295,7 +288,7 @@ RE_execute(regex_t *re, char *dat, int dat_len,
                    int nmatch, regmatch_t *pmatch)
  {
         pg_wchar   *data;
-       size_t          data_len;
+       int                     data_len;
         bool            match;
  
         /* Convert data string to wide characters */
@@ -304,6 +297,7 @@ RE_execute(regex_t *re, char *dat, int dat_len,
  
         /* Perform RE match and return result */
         match = RE_wchar_execute(re, data, data_len, 0, nmatch, pmatch);
+
         pfree(data);
         return match;
  }
@@ -334,17 +328,28 @@ RE_compile_and_execute(text *text_re, char *dat, int dat_len,
         return RE_execute(re, dat, dat_len, nmatch, pmatch);
  }
  
+
+/*
+ * parse_re_flags - parse the options argument of regexp_matches and friends
+ *
+ *     flags --- output argument, filled with desired options
+ *     opts --- *untoasted* TEXT object, or NULL for defaults
+ *
+ * This accepts all the options allowed by any of the callers; callers that
+ * don't want some have to reject them after the fact.
+ */
  static void
-parse_re_comp_flags(re_comp_flags *flags, text *opts)
+parse_re_flags(pg_re_flags *flags, text *opts)
  {
-       MemSet(flags, 0, sizeof(re_comp_flags));
+       /* regex_flavor is always folded into the compile flags */
         flags->cflags = regex_flavor;
+       flags->glob = false;
  
         if (opts)
         {
-               char  *opt_p = VARDATA(opts);
-               size_t opt_len = VARSIZE(opts) - VARHDRSZ;
-               int i;
+               char   *opt_p = VARDATA(opts);
+               int             opt_len = VARSIZE(opts) - VARHDRSZ;
+               int             i;
  
                 for (i = 0; i < opt_len; i++)
                 {
@@ -353,28 +358,49 @@ parse_re_comp_flags(re_comp_flags *flags, text *opts)
                                 case 'g':
                                         flags->glob = true;
                                         break;
-                               case 'i':
+                               case 'b':       /* BREs (but why???) */
+                                       flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED | REG_QUOTE);
+                                       break;
+                               case 'c':       /* case sensitive */
+                                       flags->cflags &= ~REG_ICASE;
+                                       break;
+                               case 'e':       /* plain EREs */
+                                       flags->cflags |= REG_EXTENDED;
+                                       flags->cflags &= ~(REG_ADVANCED | REG_QUOTE);
+                                       break;
+                               case 'i':       /* case insensitive */
                                         flags->cflags |= REG_ICASE;
                                         break;
-                               case 'm':
-                               case 'n':
+                               case 'm':       /* Perloid synonym for n */
+                               case 'n':       /* \n affects ^ $ . [^ */
                                         flags->cflags |= REG_NEWLINE;
                                         break;
-                               case 'p':
+                               case 'p':       /* ~Perl, \n affects . [^ */
                                         flags->cflags |= REG_NLSTOP;
                                         flags->cflags &= ~REG_NLANCH;
                                         break;
-                               case 'w':
+                               case 'q':       /* literal string */
+                                       flags->cflags |= REG_QUOTE;
+                                       flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED);
+                                       break;
+                               case 's':       /* single line, \n ordinary */
+                                       flags->cflags &= ~REG_NEWLINE;
+                                       break;
+                               case 't':       /* tight syntax */
+                                       flags->cflags &= ~REG_EXPANDED;
+                                       break;
+                               case 'w':       /* weird, \n affects ^ $ only */
                                         flags->cflags &= ~REG_NLSTOP;
                                         flags->cflags |= REG_NLANCH;
                                         break;
-                               case 'x':
+                               case 'x':       /* expanded syntax */
                                         flags->cflags |= REG_EXPANDED;
                                         break;
                                 default:
                                         ereport(ERROR,
                                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-                                                        errmsg("invalid regexp option: %c", opt_p[i])));
+                                                        errmsg("invalid regexp option: \"%c\"",
+                                                                       opt_p[i])));
                                         break;
                         }
                 }
@@ -409,6 +435,16 @@ assign_regex_flavor(const char *value, bool doit, GucSource source)
  }
  
  
+/*
+ * report whether regex_flavor is currently BASIC
+ */
+bool
+regex_flavor_is_basic(void)
+{
+       return (regex_flavor == REG_BASIC);
+}
+
+
  /*
   *     interface routines called by the function manager
   */
@@ -605,16 +641,17 @@ textregexreplace(PG_FUNCTION_ARGS)
         text       *r = PG_GETARG_TEXT_P(2);
         text       *opt = PG_GETARG_TEXT_P(3);
         regex_t    *re;
-       re_comp_flags flags;
+       pg_re_flags flags;
  
-       parse_re_comp_flags(&flags, opt);
+       parse_re_flags(&flags, opt);
  
         re = RE_compile_and_cache(p, flags.cflags);
  
         PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, flags.glob));
  }
  
-/* similar_escape()
+/*
+ * similar_escape()
   * Convert a SQL99 regexp pattern to POSIX style, so it can be used by
   * our regexp engine.
   */
@@ -735,185 +772,255 @@ similar_escape(PG_FUNCTION_ARGS)
         PG_RETURN_TEXT_P(result);
  }
  
-#define PG_GETARG_TEXT_P_IF_EXISTS(_n) \
-       (PG_NARGS() > _n ? PG_GETARG_TEXT_P(_n) : NULL)
-
+/*
+ * regexp_matches()
+ *             Return a table of matches of a pattern within a string.
+ */
  Datum
  regexp_matches(PG_FUNCTION_ARGS)
  {
         FuncCallContext         *funcctx;
-       MemoryContext            oldcontext;
         regexp_matches_ctx      *matchctx;
  
         if (SRF_IS_FIRSTCALL())
         {
                 text *pattern = PG_GETARG_TEXT_P(1);
                 text *flags   = PG_GETARG_TEXT_P_IF_EXISTS(2);
+               MemoryContext            oldcontext;
  
                 funcctx = SRF_FIRSTCALL_INIT();
                 oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
  
                 /* be sure to copy the input string into the multi-call ctx */
                 matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
-                                                                               flags);
+                                                                               flags, false, true, false);
+
+               /* Pre-create workspace that build_regexp_matches_result needs */
+               matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
+               matchctx->nulls = (bool *) palloc(sizeof(bool) * matchctx->npatterns);
  
                 MemoryContextSwitchTo(oldcontext);
                 funcctx->user_fctx = (void *) matchctx;
-
-               /*
-                * Avoid run-away function by making sure we never iterate
-                * more than the length of the text + 1 (the number of matches
-                * an empty pattern will make is length + 1)
-                */
-               if (matchctx->flags.glob)
-                       funcctx->max_calls = matchctx->wide_len + 1;
-               else
-                       funcctx->max_calls = 0;
         }
  
         funcctx = SRF_PERCALL_SETUP();
         matchctx = (regexp_matches_ctx *) funcctx->user_fctx;
  
-       if (funcctx->call_cntr > funcctx->max_calls)
-       {
-               /*
-                * If max_calls == 0, then we are doing a non-global match, we
-                * should stop now, no problem.  Otherwise, if we exceed
-                * max_calls something really wonky is going on, since it is
-                * returning more matches than there are characters in the
-                * string, which should not happen
-                */
-               if (funcctx->max_calls != 0)
-                       elog(ERROR, "set returning match function terminated after iterating %d times",
-                                funcctx->call_cntr);
-
-               SRF_RETURN_DONE(funcctx);
-       }
-
-       if (matchctx->offset < matchctx->wide_len)
+       if (matchctx->next_match < matchctx->nmatches)
         {
                 ArrayType *result_ary;
  
-               if (matchctx->pmatch[0].rm_so == matchctx->pmatch[0].rm_eo)
-                       matchctx->offset++;
-
-               result_ary = perform_regexp_matches(matchctx);
-               if (result_ary != NULL)
-               {
-                       matchctx->offset = matchctx->pmatch[0].rm_eo;
-                       SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
-               }
-               /* else fall through and return done */
+               result_ary = build_regexp_matches_result(matchctx);
+               matchctx->next_match++;
+               SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
         }
  
         SRF_RETURN_DONE(funcctx);
  }
  
+/* This is separate to keep the opr_sanity regression test from complaining */
  Datum
  regexp_matches_no_flags(PG_FUNCTION_ARGS)
  {
         return regexp_matches(fcinfo);
  }
  
+/*
+ * setup_regexp_matches --- do the initial matching for regexp_matches()
+ *             or regexp_split()
+ *
+ * To avoid having to re-find the compiled pattern on each call, we do
+ * all the matching in one swoop.  The returned regexp_matches_ctx contains
+ * the locations of all the substrings matching the pattern.
+ *
+ * The three bool parameters have only two patterns (one for each caller)
+ * but it seems clearer to distinguish the functionality this way than to
+ * key it all off one "is_split" flag.
+ */
  static regexp_matches_ctx *
-setup_regexp_matches(text *orig_str, text *pattern, text *flags)
+setup_regexp_matches(text *orig_str, text *pattern, text *flags,
+                                        bool force_glob, bool use_subpatterns,
+                                        bool ignore_degenerate)
  {
-       regexp_matches_ctx      *matchctx = palloc(sizeof(regexp_matches_ctx));
-
+       regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx));
+       int                     orig_len;
+       pg_wchar   *wide_str;
+       int                     wide_len;
+       pg_re_flags     re_flags;
+       regex_t    *cpattern;
+       regmatch_t *pmatch;
+       int                     pmatch_len;
+       int                     array_len;
+       int                     array_idx;
+       int                     prev_match_end;
+       int                     start_search;
+
+       /* save original string --- we'll extract result substrings from it */
         matchctx->orig_str = orig_str;
-       matchctx->orig_len = VARSIZE(matchctx->orig_str) - VARHDRSZ;
-
-       parse_re_comp_flags(&matchctx->flags, flags);
  
-       matchctx->cpattern = RE_compile_and_cache(pattern, matchctx->flags.cflags);
-       matchctx->pmatch = palloc(sizeof(regmatch_t) * (matchctx->cpattern->re_nsub + 1));
-       matchctx->offset = 0;
+       /* convert string to pg_wchar form for matching */
+       orig_len = VARSIZE(orig_str) - VARHDRSZ;
+       wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1));
+       wide_len = pg_mb2wchar_with_len(VARDATA(orig_str), wide_str, orig_len);
  
-       matchctx->wide_str = palloc(sizeof(pg_wchar) * (matchctx->orig_len + 1));
-       matchctx->wide_len = pg_mb2wchar_with_len(VARDATA(matchctx->orig_str),
-                                                                                         matchctx->wide_str, matchctx->orig_len);
-
-       matchctx->pmatch[0].rm_so = -1;
-       /* both < 0 but not equal */
-       matchctx->pmatch[0].rm_eo = -2;
+       /* determine options */
+       parse_re_flags(&re_flags, flags);
+       if (force_glob)
+       {
+               /* user mustn't specify 'g' for regexp_split */
+               if (re_flags.glob)
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                        errmsg("regexp_split does not support the global option")));
+               /* but we find all the matches anyway */
+               re_flags.glob = true;
+       }
  
-       return matchctx;
-}
+       /* set up the compiled pattern */
+       cpattern = RE_compile_and_cache(pattern, re_flags.cflags);
  
-static ArrayType *
-perform_regexp_matches(regexp_matches_ctx *matchctx)
-{
-       Datum           *elems;
-       bool            *nulls;
-       Datum            fullmatch;             /* used to avoid a palloc if no matches */
-       int              ndims = 1;
-       int              dims[1];
-       int          lbs[1] = {1};
-
-       if (RE_wchar_execute(matchctx->cpattern,
-                                                matchctx->wide_str,
-                                                matchctx->wide_len,
-                                                matchctx->offset,
-                                                matchctx->cpattern->re_nsub + 1,
-                                                matchctx->pmatch) == false)
-               return NULL;
-
-       if (matchctx->cpattern->re_nsub > 0)
+       /* do we want to remember subpatterns? */
+       if (use_subpatterns && cpattern->re_nsub > 0)
         {
-               int i;
+               matchctx->npatterns = cpattern->re_nsub;
+               pmatch_len = cpattern->re_nsub + 1;
+       }
+       else
+       {
+               use_subpatterns = false;
+               matchctx->npatterns = 1;
+               pmatch_len = 1;
+       }
  
-               elems = palloc(sizeof(Datum) * matchctx->cpattern->re_nsub);
-               nulls = palloc(sizeof(bool) * matchctx->cpattern->re_nsub);
-               dims[0] = matchctx->cpattern->re_nsub;
+       /* temporary output space for RE package */
+       pmatch = palloc(sizeof(regmatch_t) * pmatch_len);
  
-               for (i = 0; i < matchctx->cpattern->re_nsub; i++)
+       /* the real output space (grown dynamically if needed) */
+       array_len = re_flags.glob ? 256 : 32;
+       matchctx->match_locs = (int *) palloc(sizeof(int) * array_len);
+       array_idx = 0;
+
+       /* search for the pattern, perhaps repeatedly */
+       prev_match_end = 0;
+       start_search = 0;
+       while (RE_wchar_execute(cpattern, wide_str, wide_len, start_search,
+                                                       pmatch_len, pmatch))
+       {
+               /*
+                * If requested, ignore degenerate matches, which are zero-length
+                * matches occurring at the start or end of a string or just after
+                * a previous match.
+                */
+               if (!ignore_degenerate ||
+                       (pmatch[0].rm_so < wide_len &&
+                        pmatch[0].rm_eo > prev_match_end))
                 {
-                       int so = matchctx->pmatch[i + 1].rm_so;
-                       int     eo = matchctx->pmatch[i + 1].rm_eo;
+                       /* enlarge output space if needed */
+                       while (array_idx + matchctx->npatterns * 2 > array_len)
+                       {
+                               array_len *= 2;
+                               matchctx->match_locs = (int *) repalloc(matchctx->match_locs,
+                                                                                                               sizeof(int) * array_len);
+                       }
  
-                       if (so < 0 || eo < 0)
+                       /* save this match's locations */
+                       if (use_subpatterns)
                         {
-                               elems[i] = 0;
-                               nulls[i] = true;
+                               int i;
+
+                               for (i = 1; i <= matchctx->npatterns; i++)
+                               {
+                                       matchctx->match_locs[array_idx++] = pmatch[i].rm_so;
+                                       matchctx->match_locs[array_idx++] = pmatch[i].rm_eo;
+                               }
                         }
                         else
                         {
-                               elems[i] = DirectFunctionCall3(text_substr,
-                                                                                          PointerGetDatum(matchctx->orig_str),
-                                                                                          Int32GetDatum(so + 1),
-                                                                                          Int32GetDatum(eo - so));
-                               nulls[i] = false;
+                               matchctx->match_locs[array_idx++] = pmatch[0].rm_so;
+                               matchctx->match_locs[array_idx++] = pmatch[0].rm_eo;
                         }
+                       matchctx->nmatches++;
                 }
+               prev_match_end = pmatch[0].rm_eo;
+
+               /* if not glob, stop after one match */
+               if (!re_flags.glob)
+                       break;
+
+               /*
+                * Advance search position.  Normally we start just after the end
+                * of the previous match, but always advance at least one character
+                * (the special case can occur if the pattern matches zero characters
+                * just after the prior match or at the end of the string).
+                */
+               if (start_search < pmatch[0].rm_eo)
+                       start_search = pmatch[0].rm_eo;
+               else
+                       start_search++;
+               if (start_search > wide_len)
+                       break;
         }
-       else
-       {
-               int so = matchctx->pmatch[0].rm_so;
-               int     eo = matchctx->pmatch[0].rm_eo;
  
-               if (so < 0 || eo < 0)
-                       elog(ERROR, "regexp code said it had a match, but did not return it");
+       /* Clean up temp storage */
+       pfree(wide_str);
+       pfree(pmatch);
  
-               fullmatch = DirectFunctionCall3(text_substr,
-                                                                               PointerGetDatum(matchctx->orig_str),
-                                                                               Int32GetDatum(so + 1),
-                                                                               Int32GetDatum(eo - so));
+       return matchctx;
+}
+
+/*
+ * build_regexp_matches_result - build output array for current match
+ */
+static ArrayType *
+build_regexp_matches_result(regexp_matches_ctx *matchctx)
+{
+       Datum      *elems = matchctx->elems;
+       bool       *nulls = matchctx->nulls;
+       int             dims[1];
+       int         lbs[1];
+       int                     loc;
+       int                     i;
  
-               elems = &fullmatch;
-               nulls = NULL;
-               dims[0] = 1;
+       /* Extract matching substrings from the original string */
+       loc = matchctx->next_match * matchctx->npatterns * 2;
+       for (i = 0; i < matchctx->npatterns; i++)
+       {
+               int     so = matchctx->match_locs[loc++];
+               int     eo = matchctx->match_locs[loc++];
+
+               if (so < 0 || eo < 0)
+               {
+                       elems[i] = (Datum) 0;
+                       nulls[i] = true;
+               }
+               else
+               {
+                       elems[i] = DirectFunctionCall3(text_substr,
+                                                                                  PointerGetDatum(matchctx->orig_str),
+                                                                                  Int32GetDatum(so + 1),
+                                                                                  Int32GetDatum(eo - so));
+                       nulls[i] = false;
+               }
         }
  
+       /* And form an array */
+       dims[0] = matchctx->npatterns;
+       lbs[0] = 1;
         /* XXX: this hardcodes assumptions about the text type */
-       return construct_md_array(elems, nulls, ndims, dims, lbs,
+       return construct_md_array(elems, nulls, 1, dims, lbs,
                                                           TEXTOID, -1, false, 'i');
  }
  
+/*
+ * regexp_split_to_table()
+ *             Split the string at matches of the pattern, returning the
+ *             split-out substrings as a table.
+ */
  Datum
  regexp_split_to_table(PG_FUNCTION_ARGS)
  {
         FuncCallContext  *funcctx;
-       regexp_split_ctx *splitctx;
+       regexp_matches_ctx *splitctx;
  
         if (SRF_IS_FIRSTCALL())
         {
@@ -924,168 +1031,102 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
                 funcctx = SRF_FIRSTCALL_INIT();
                 oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
  
-               splitctx = setup_regexp_split(PG_GETARG_TEXT_P_COPY(0), pattern, flags);
+               /* be sure to copy the input string into the multi-call ctx */
+               splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
+                                                                               flags, true, false, true);
  
                 MemoryContextSwitchTo(oldcontext);
                 funcctx->user_fctx = (void *) splitctx;
-
-               /*
-                * Avoid run-away function by making sure we never iterate
-                * more than the length of the text
-                */
-               funcctx->max_calls = splitctx->wide_len;
         }
  
         funcctx = SRF_PERCALL_SETUP();
-       splitctx = (regexp_split_ctx *) funcctx->user_fctx;
+       splitctx = (regexp_matches_ctx *) funcctx->user_fctx;
  
-       if (funcctx->call_cntr > funcctx->max_calls)
+       if (splitctx->next_match <= splitctx->nmatches)
         {
-               /*
-                * If we exceed wide_len something really wonky is going on,
-                * since it is returning more matches than there are
-                * characters in the string, which should not happen
-                */
-               elog(ERROR, "set returning split function terminated after iterating %d times",
-                        funcctx->call_cntr);
+               Datum result = build_regexp_split_result(splitctx);
+
+               splitctx->next_match++;
+               SRF_RETURN_NEXT(funcctx, result);
         }
  
-       if (splitctx->offset < splitctx->wide_len)
-               SRF_RETURN_NEXT(funcctx, get_next_split(splitctx));
-       else
-               SRF_RETURN_DONE(funcctx);
+       SRF_RETURN_DONE(funcctx);
  }
  
+/* This is separate to keep the opr_sanity regression test from complaining */
  Datum regexp_split_to_table_no_flags(PG_FUNCTION_ARGS)
  {
         return regexp_split_to_table(fcinfo);
  }
  
+/*
+ * regexp_split_to_array()
+ *             Split the string at matches of the pattern, returning the
+ *             split-out substrings as an array.
+ */
  Datum regexp_split_to_array(PG_FUNCTION_ARGS)
  {
         ArrayBuildState         *astate = NULL;
-       regexp_split_ctx        *splitctx;
-       int                              nitems;
+       regexp_matches_ctx      *splitctx;
  
-       splitctx = setup_regexp_split(PG_GETARG_TEXT_P(0),
-                                                                 PG_GETARG_TEXT_P(1),
-                                                                 PG_GETARG_TEXT_P_IF_EXISTS(2));
+       splitctx = setup_regexp_matches(PG_GETARG_TEXT_P(0),
+                                                                       PG_GETARG_TEXT_P(1),
+                                                                       PG_GETARG_TEXT_P_IF_EXISTS(2),
+                                                                       true, false, true);
  
-       for (nitems = 0; splitctx->offset < splitctx->wide_len; nitems++)
+       while (splitctx->next_match <= splitctx->nmatches)
         {
-               if (nitems > splitctx->wide_len)
-                       elog(ERROR, "split function terminated after iterating %d times",
-                                nitems);
-
                 astate = accumArrayResult(astate,
-                                                                 get_next_split(splitctx),
+                                                                 build_regexp_split_result(splitctx),
                                                                   false,
                                                                   TEXTOID,
                                                                   CurrentMemoryContext);
+               splitctx->next_match++;
         }
  
         PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate, CurrentMemoryContext));
  }
  
+/* This is separate to keep the opr_sanity regression test from complaining */
  Datum regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
  {
         return regexp_split_to_array(fcinfo);
  }
  
-static regexp_split_ctx *
-setup_regexp_split(text *str, text *pattern, text *flags)
-{
-       regexp_split_ctx *splitctx = palloc(sizeof(regexp_split_ctx));
-
-       splitctx->orig_str = str;
-       splitctx->orig_len = VARSIZE(splitctx->orig_str) - VARHDRSZ;
-
-       parse_re_comp_flags(&splitctx->flags, flags);
-       if (splitctx->flags.glob)
-               ereport(ERROR,
-                               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-                                errmsg("regexp_split does not support the global option")));
-
-       splitctx->cpattern = RE_compile_and_cache(pattern, splitctx->flags.cflags);
-
-       splitctx->wide_str = palloc(sizeof(pg_wchar) * (splitctx->orig_len + 1));
-       splitctx->wide_len = pg_mb2wchar_with_len(VARDATA(splitctx->orig_str),
-                                                                                         splitctx->wide_str,
-                                                                                         splitctx->orig_len);
-
-       splitctx->offset = 0;
-
-       splitctx->match.rm_so = -1;
-       /* both < 0 but not equal */
-       splitctx->match.rm_eo = -2;
-
-       return splitctx;
-}
-
+/*
+ * build_regexp_split_result - build output string for current match
+ *
+ * We return the string between the current match and the previous one,
+ * or the string after the last match when next_match == nmatches.
+ */
  static Datum
-get_next_split(regexp_split_ctx *splitctx)
+build_regexp_split_result(regexp_matches_ctx *splitctx)
  {
-       regmatch_t *pmatch = &(splitctx->match);
-
-       for (;;)
-       {
-               Datum result;
-               int       startpos = splitctx->offset + 1;
-
-               /*
-                * If the last match was zero-length, we need to push the
-                * offset forward to avoid matching the same place forever
-                */
-               if (pmatch->rm_so == pmatch->rm_eo)
-                       splitctx->offset++;
-
-               if (RE_wchar_execute(splitctx->cpattern,
-                                                        splitctx->wide_str,
-                                                        splitctx->wide_len,
-                                                        splitctx->offset,
-                                                        1,
-                                                        pmatch))
-               {
-                       int length = splitctx->match.rm_so - startpos + 1;
-
-                       /*
-                        * If we are trying to match at the beginning of the string and
-                        * we got a zero-length match, or if we just matched where we
-                        * left off last time, go around the loop again and increment
-                        * the offset.  If we have incremented the offset already and
-                        * it matched at the new offset, that's ok
-                        */
-                       if (length == 0)
-                               continue;
+       int             startpos;
+       int             endpos;
  
-                       result = DirectFunctionCall3(text_substr,
-                                                                                PointerGetDatum(splitctx->orig_str),
-                                                                                Int32GetDatum(startpos),
-                                                                                Int32GetDatum(length));
-
-                       /* set the offset to the end of this match for next time */
-                       splitctx->offset = pmatch->rm_eo;
-
-                       return result;
-               }
+       if (splitctx->next_match > 0)
+               startpos = splitctx->match_locs[splitctx->next_match * 2 - 1];
+       else
+               startpos = 0;
+       if (startpos < 0)
+               elog(ERROR, "invalid match ending position");
  
+       if (splitctx->next_match < splitctx->nmatches)
+       {
+               endpos = splitctx->match_locs[splitctx->next_match * 2];
+               if (endpos < startpos)
+                       elog(ERROR, "invalid match starting position");
+               return DirectFunctionCall3(text_substr,
+                                                                  PointerGetDatum(splitctx->orig_str),
+                                                                  Int32GetDatum(startpos + 1),
+                                                                  Int32GetDatum(endpos - startpos));
+       }
+       else
+       {
                 /* no more matches, return rest of string */
-               result = DirectFunctionCall2(text_substr_no_len,
-                                                                        PointerGetDatum(splitctx->orig_str),
-                                                                        Int32GetDatum(startpos));
-
-               /* so we know we're done next time through */
-               splitctx->offset = splitctx->wide_len;
-
-               return result;
+               return DirectFunctionCall2(text_substr_no_len,
+                                                                  PointerGetDatum(splitctx->orig_str),
+                                                                  Int32GetDatum(startpos + 1));
         }
  }
-
-/*
- * report whether regex_flavor is currently BASIC
- */
-bool
-regex_flavor_is_basic(void)
-{
-       return (regex_flavor == REG_BASIC);
-}
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out

index 31ab2bb546379cc28db93956e1cd2a656512df7c..dac28eb10c3c3d05ba86c83aa89ed94d9d9b3af2 100644 (file)
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -219,7 +219,7 @@ SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'gi');
  
  -- invalid regexp option
  SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'z');
-ERROR:  invalid regexp option: z
+ERROR:  invalid regexp option: "z"
  -- set so we can tell NULL from empty string
  \pset null '\\N'
  -- return all matches from regexp
@@ -272,8 +272,8 @@ SELECT regexp_matches('foobarbequebaz', $re$barbeque$re$);
  (1 row)
  
  -- give me errors
-SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$, 'zipper');
-ERROR:  invalid regexp option: z
+SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$, 'gz');
+ERROR:  invalid regexp option: "z"
  SELECT regexp_matches('foobarbequebaz', $re$(barbeque$re$);
  ERROR:  invalid regular expression: parentheses () not balanced
  SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque){2,1}$re$);
@@ -431,11 +431,30 @@ SELECT regexp_split_to_array('the quick brown fox jumped over the lazy dog', 'no
   {"the quick brown fox jumped over the lazy dog"}
  (1 row)
  
+-- some corner cases
+SELECT regexp_split_to_array('123456','1');
+ regexp_split_to_array 
+-----------------------
+ {"",23456}
+(1 row)
+
+SELECT regexp_split_to_array('123456','6');
+ regexp_split_to_array 
+-----------------------
+ {12345,""}
+(1 row)
+
+SELECT regexp_split_to_array('123456','.');
+ regexp_split_to_array  
+------------------------
+ {"","","","","","",""}
+(1 row)
+
  -- errors
  SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'zippy') AS foo;
-ERROR:  invalid regexp option: z
-SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'zippy');
-ERROR:  invalid regexp option: z
+ERROR:  invalid regexp option: "z"
+SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'iz');
+ERROR:  invalid regexp option: "z"
  -- global option meaningless for regexp_split
  SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'g') AS foo;
  ERROR:  regexp_split does not support the global option
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql

index 43b35ab667ffbc4d03a627e28e4f8e1845622a3d..9ab6c218728ac6b222edacdb14836184a5efd58d 100644 (file)
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -111,7 +111,7 @@ SELECT regexp_matches('foobarbequebaz', $re$(bar)(.+)?(beque)$re$);
  SELECT regexp_matches('foobarbequebaz', $re$barbeque$re$);
  
  -- give me errors
-SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$, 'zipper');
+SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$, 'gz');
  SELECT regexp_matches('foobarbequebaz', $re$(barbeque$re$);
  SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque){2,1}$re$);
  
@@ -129,9 +129,13 @@ SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e'
  -- no match of pattern
  SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumped over the lazy dog', 'nomatch') AS foo;
  SELECT regexp_split_to_array('the quick brown fox jumped over the lazy dog', 'nomatch');
+-- some corner cases
+SELECT regexp_split_to_array('123456','1');
+SELECT regexp_split_to_array('123456','6');
+SELECT regexp_split_to_array('123456','.');
  -- errors
  SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'zippy') AS foo;
-SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'zippy');
+SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'iz');
  -- global option meaningless for regexp_split
  SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'g') AS foo;
  SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'g');
author	Tom Lane <tgl@sss.pgh.pa.us>
	Sat, 11 Aug 2007 03:56:24 +0000 (03:56 +0000)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Sat, 11 Aug 2007 03:56:24 +0000 (03:56 +0000)
doc/src/sgml/func.sgml		patch \| blob \| history
src/backend/utils/adt/regexp.c		patch \| blob \| history
src/test/regress/expected/strings.out		patch \| blob \| history
src/test/regress/sql/strings.sql		patch \| blob \| history