Code review for regexp_replace patch. Improve documentation and comments,

author Tom Lane <tgl@sss.pgh.pa.us>

Tue, 18 Oct 2005 20:38:58 +0000 (20:38 +0000)

committer Tom Lane <tgl@sss.pgh.pa.us>

Tue, 18 Oct 2005 20:38:58 +0000 (20:38 +0000)
author Tom Lane <tgl@sss.pgh.pa.us>
Tue, 18 Oct 2005 20:38:58 +0000 (20:38 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Tue, 18 Oct 2005 20:38:58 +0000 (20:38 +0000)
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml

index a641db7ee7481c785bc329c25d2e162b56b30ece..4ef1696c8c90cba8ce5a3e0ad5d0c37f0a0c1ec6 100644 (file)
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -1,5 +1,5 @@
  <!--
-$PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.287 2005/10/02 23:50:06 tgl Exp $
+$PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.288 2005/10/18 20:38:57 tgl Exp $
  PostgreSQL documentation
  -->
  
@@ -1193,9 +1193,6 @@ PostgreSQL documentation
     <indexterm>
      <primary>quote_literal</primary>
     </indexterm>
-   <indexterm>
-    <primary>regexp_replace</primary>
-   </indexterm>
     <indexterm>
      <primary>repeat</primary>
     </indexterm>
@@ -1419,26 +1416,6 @@ PostgreSQL documentation
         <entry><literal>'O''Reilly'</literal></entry>
        </row>
  
-      <row>
-       <entry><literal><function>regexp_replace</function>(<parameter>source</parameter> <type>text</type>,
-       <parameter>pattern</parameter> <type>text</type>,
-       <parameter>replacement</parameter> <type>text</type>
-       <optional>, <parameter>flags</parameter> <type>text</type></optional>)</literal></entry>
-       <entry><type>text</type></entry>
-       <entry>Replace string that matches the regular expression
-        <parameter>pattern</parameter> in <parameter>source</parameter> to
-        <parameter>replacement</parameter>.
-        <parameter>replacement</parameter> can use <literal>\1</>-<literal>\9</> and <literal>\&amp;</>.
-        <literal>\1</>-<literal>\9</> is a back reference to the n'th subexpression, and
-        <literal>\&amp;</> is the entire matched string.
-        <parameter>flags</parameter> can use <literal>g</>(global) and <literal>i</>(ignore case).
-        When flags is not specified, case sensitive matching is used, and it replaces
-        only the instance.
-       </entry>
-       <entry><literal>regexp_replace('1112223333', '(\\d{3})(\\d{3})(\\d{4})', '(\\1) \\2-\\3')</literal></entry>
-       <entry><literal>(111) 222-3333</literal></entry>
-      </row>
-
        <row>
         <entry><literal><function>repeat</function>(<parameter>string</parameter> <type>text</type>, <parameter>number</parameter> <type>int</type>)</literal></entry>
         <entry><type>text</type></entry>
@@ -2821,10 +2798,12 @@ cast(-44 as bit(12))           <lineannotation>111111010100</lineannotation>
     <indexterm>
      <primary>SIMILAR TO</primary>
     </indexterm>
-
     <indexterm>
      <primary>substring</primary>
     </indexterm>
+   <indexterm>
+    <primary>regexp_replace</primary>
+   </indexterm>
  
  <synopsis>
  <replaceable>string</replaceable> SIMILAR TO <replaceable>pattern</replaceable> <optional>ESCAPE <replaceable>escape-character</replaceable></optional>
@@ -3002,7 +2981,7 @@ substring('foobar' from '#"o_b#"%' for '#')    <lineannotation>NULL</lineannotat
      <para>
       A regular expression is a character sequence that is an
       abbreviated definition of a set of strings (a <firstterm>regular
-      set</firstterm>).  A string is said to match a regular expression
+     set</firstterm>).  A string is said to match a regular expression
       if it is a member of the regular set described by the regular
       expression.  As with <function>LIKE</function>, pattern characters
       match string characters exactly unless they are special characters
@@ -3027,7 +3006,8 @@ substring('foobar' from '#"o_b#"%' for '#')    <lineannotation>NULL</lineannotat
      <para>
       The <function>substring</> function with two parameters,
       <function>substring(<replaceable>string</replaceable> from
-     <replaceable>pattern</replaceable>)</function>, provides extraction of a substring
+     <replaceable>pattern</replaceable>)</function>, provides extraction of a
+     substring
       that matches a POSIX regular expression pattern.  It returns null if
       there is no match, otherwise the portion of the text that matched the
       pattern.  But if the pattern contains any parentheses, the portion
@@ -3048,6 +3028,45 @@ substring('foobar' from 'o(.)b')   <lineannotation>o</lineannotation>
  </programlisting>
     </para>
  
+    <para>
+     The <function>regexp_replace</> function provides substitution of
+     new text for substrings that match POSIX regular expression patterns.
+     It has the syntax
+     <function>regexp_replace</function>(<replaceable>source</>,
+     <replaceable>pattern</>, <replaceable>replacement</>
+     <optional>, <replaceable>flags</> </optional>).
+     The <replaceable>source</> string is returned unchanged if
+     there is no match to the <replaceable>pattern</>.  If there is a
+     match, the <replaceable>source</> string is returned with the
+     <replaceable>replacement</> string substituted for the matching
+     substring.  The <replaceable>replacement</> string can contain
+     <literal>\</><replaceable>n</>, where <replaceable>n</> is <literal>1</>
+     through <literal>9</>, to indicate that the source substring matching the
+     <replaceable>n</>'th parenthesized subexpression of the pattern should be
+     inserted, and it can contain <literal>\&amp;</> to indicate that the
+     substring matching the entire pattern should be inserted.  Write
+     <literal>\\</> if you need to put a literal backslash in the replacement
+     text.  (As always, remember to double backslashes written in literal
+     constant strings.)
+     The <replaceable>flags</> parameter is an optional text
+     string containing zero or more single-letter flags that change the
+     function's behavior.  Flag <literal>i</> specifies case-insensitive
+     matching, while flag <literal>g</> specifies replacement of each matching
+     substring rather than only the first one.
+    </para>
+
+   <para>
+    Some examples:
+<programlisting>
+regexp_replace('foobarbaz', 'b..', 'X')
+                                   <lineannotation>fooXbaz</lineannotation>
+regexp_replace('foobarbaz', 'b..', 'X', 'g')
+                                   <lineannotation>fooXX</lineannotation>
+regexp_replace('foobarbaz', 'b(..)', 'X\\1Y', 'g')
+                                   <lineannotation>fooXarYXazY</lineannotation>
+</programlisting>
+   </para>
+
     <para>
      <productname>PostgreSQL</productname>'s regular expressions are implemented
      using a package written by Henry Spencer.  Much of
diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c

index a872762c3c204077bc12b0b5abfb2434aedb1416..ce04ce77e67902746504860bca5d40b93261babd 100644 (file)
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -8,7 +8,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.59 2005/10/15 02:49:29 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.60 2005/10/18 20:38:58 tgl Exp $
   *
   *             Alistair Crooks added the code for the regex caching
   *             agc - cached the regular expressions used - there's a good chance
@@ -83,7 +83,7 @@ static cached_re_str re_array[MAX_CACHED_RES];        /* cached re's */
  /*
   * RE_compile_and_cache - compile a RE, caching if possible
   *
- * Returns regex_t
+ * Returns regex_t *
   *
   *     text_re --- the pattern, expressed as an *untoasted* TEXT object
   *     cflags --- compile options for the pattern
@@ -91,7 +91,7 @@ static cached_re_str re_array[MAX_CACHED_RES];        /* cached re's */
   * Pattern is given in the database encoding.  We internally convert to
   * array of pg_wchar which is what Spencer's regex package wants.
   */
-static regex_t
+static regex_t *
  RE_compile_and_cache(text *text_re, int cflags)
  {
         int                     text_re_len = VARSIZE(text_re);
@@ -123,7 +123,7 @@ RE_compile_and_cache(text *text_re, int cflags)
                                 re_array[0] = re_temp;
                         }
  
-                       return re_array[0].cre_re;
+                       return &re_array[0].cre_re;
                 }
         }
  
@@ -188,7 +188,7 @@ RE_compile_and_cache(text *text_re, int cflags)
         re_array[0] = re_temp;
         num_res++;
  
-       return re_array[0].cre_re;
+       return &re_array[0].cre_re;
  }
  
  /*
@@ -212,7 +212,7 @@ RE_compile_and_execute(text *text_re, char *dat, int dat_len,
         pg_wchar   *data;
         size_t          data_len;
         int                     regexec_result;
-       regex_t         re;
+       regex_t    *re;
         char            errMsg[100];
  
         /* Convert data string to wide characters */
@@ -223,7 +223,7 @@ RE_compile_and_execute(text *text_re, char *dat, int dat_len,
         re = RE_compile_and_cache(text_re, cflags);
  
         /* Perform RE match and return result */
-       regexec_result = pg_regexec(&re_array[0].cre_re,
+       regexec_result = pg_regexec(re,
                                                                 data,
                                                                 data_len,
                                                                 0,
@@ -237,8 +237,7 @@ RE_compile_and_execute(text *text_re, char *dat, int dat_len,
         if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
         {
                 /* re failed??? */
-               pg_regerror(regexec_result, &re_array[0].cre_re,
-                                       errMsg, sizeof(errMsg));
+               pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
                 ereport(ERROR,
                                 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
                                  errmsg("regular expression failed: %s", errMsg)));
@@ -442,10 +441,10 @@ textregexsubstr(PG_FUNCTION_ARGS)
  
  /*
   * textregexreplace_noopt()
- *             Return a replace string matched by a regular expression.
- *             This function is a version that doesn't specify the option of
- *             textregexreplace. This is case sensitive, replace the first
- *             instance only.
+ *             Return a string matched by a regular expression, with replacement.
+ *
+ * This version doesn't have an option argument: we default to case
+ * sensitive match, replace the first instance only.
   */
  Datum
  textregexreplace_noopt(PG_FUNCTION_ARGS)
@@ -453,20 +452,16 @@ textregexreplace_noopt(PG_FUNCTION_ARGS)
         text       *s = PG_GETARG_TEXT_P(0);
         text       *p = PG_GETARG_TEXT_P(1);
         text       *r = PG_GETARG_TEXT_P(2);
-       regex_t         re;
+       regex_t    *re;
  
         re = RE_compile_and_cache(p, regex_flavor);
  
-       return DirectFunctionCall4(replace_text_regexp,
-                                                          PointerGetDatum(s),
-                                                          PointerGetDatum(&re),
-                                                          PointerGetDatum(r),
-                                                          BoolGetDatum(false));
+       PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, false));
  }
  
  /*
   * textregexreplace()
- *             Return a replace string matched by a regular expression.
+ *             Return a string matched by a regular expression, with replacement.
   */
  Datum
  textregexreplace(PG_FUNCTION_ARGS)
@@ -478,9 +473,9 @@ textregexreplace(PG_FUNCTION_ARGS)
         char       *opt_p = VARDATA(opt);
         int                     opt_len = (VARSIZE(opt) - VARHDRSZ);
         int                     i;
-       bool global = false;
+       bool            glob = false;
         bool            ignorecase = false;
-       regex_t         re;
+       regex_t    *re;
  
         /* parse options */
         for (i = 0; i < opt_len; i++)
@@ -491,8 +486,7 @@ textregexreplace(PG_FUNCTION_ARGS)
                                 ignorecase = true;
                                 break;
                         case 'g':
-                               global = true;
-
+                               glob = true;
                                 break;
                         default:
                                 ereport(ERROR,
@@ -508,11 +502,7 @@ textregexreplace(PG_FUNCTION_ARGS)
         else
                 re = RE_compile_and_cache(p, regex_flavor);
  
-       return DirectFunctionCall4(replace_text_regexp,
-                                                          PointerGetDatum(s),
-                                                          PointerGetDatum(&re),
-                                                          PointerGetDatum(r),
-                                                          BoolGetDatum(global));
+       PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, glob));
  }
  
  /* similar_escape()
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c

index 5fc3128360c837f92d52ea59bbc6217e6d839a66..3f67403ba44a59bde951668b25d451119f7707bc 100644 (file)
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -8,7 +8,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.137 2005/10/17 16:24:19 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.138 2005/10/18 20:38:58 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -24,11 +24,11 @@
  #include "mb/pg_wchar.h"
  #include "miscadmin.h"
  #include "parser/scansup.h"
+#include "regex/regex.h"
  #include "utils/array.h"
  #include "utils/builtins.h"
  #include "utils/lsyscache.h"
  #include "utils/pg_locale.h"
-#include "regex/regex.h"
  
  
  typedef struct varlena unknown;
@@ -2066,7 +2066,8 @@ replace_text(PG_FUNCTION_ARGS)
  
  /*
   * check_replace_text_has_escape_char
- * check whether replace_text has escape char.
+ *
+ * check whether replace_text contains escape char.
   */
  static bool
  check_replace_text_has_escape_char(const text *replace_text)
@@ -2077,14 +2078,18 @@ check_replace_text_has_escape_char(const text *replace_text)
         if (pg_database_encoding_max_length() == 1)
         {
                 for (; p < p_end; p++)
+               {
                         if (*p == '\\')
                                 return true;
+               }
         }
         else
         {
                 for (; p < p_end; p += pg_mblen(p))
+               {
                         if (*p == '\\')
                                 return true;
+               }
         }
  
         return false;
@@ -2092,7 +2097,9 @@ check_replace_text_has_escape_char(const text *replace_text)
  
  /*
   * appendStringInfoRegexpSubstr
- * append string by using back references of regexp.
+ *
+ * Append replace_text to str, substituting regexp back references for
+ * \n escapes.
   */
  static void
  appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
@@ -2100,50 +2107,41 @@ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
  {
         const char *p = VARDATA(replace_text);
         const char *p_end = p + (VARSIZE(replace_text) - VARHDRSZ);
-
         int                     eml = pg_database_encoding_max_length();
  
-       int                     substr_start = 1;
-       int                     ch_cnt;
-
-       int                     so;
-       int                     eo;
-
-       while (1)
+       for (;;)
         {
-               /* Find escape char. */
-               ch_cnt = 0;
+               const char *chunk_start = p;
+               int                     so;
+               int                     eo;
+
+               /* Find next escape char. */
                 if (eml == 1)
                 {
                         for (; p < p_end && *p != '\\'; p++)
-                               ch_cnt++;
+                               /* nothing */ ;
                 }
                 else
                 {
                         for (; p < p_end && *p != '\\'; p += pg_mblen(p))
-                               ch_cnt++;
+                               /* nothing */ ;
                 }
  
-               /*
-                * Copy the text when there is a text in the left of escape char or
-                * escape char is not found.
-                */
-               if (ch_cnt)
-               {
-                       text       *append_text = text_substring(PointerGetDatum(replace_text),
-                                                                                               substr_start, ch_cnt, false);
-
-                       appendStringInfoText(str, append_text);
-                       pfree(append_text);
-               }
-               substr_start += ch_cnt + 1;
+               /* Copy the text we just scanned over, if any. */
+               if (p > chunk_start)
+                       appendBinaryStringInfo(str, chunk_start, p - chunk_start);
  
-               if (p >= p_end)                 /* When escape char is not found. */
+               /* Done if at end of string, else advance over escape char. */
+               if (p >= p_end)
                         break;
-
-               /* See the next character of escape char. */
                 p++;
-               so = eo = -1;
+
+               if (p >= p_end)
+               {
+                       /* Escape at very end of input.  Treat same as unexpected char */
+                       appendStringInfoChar(str, '\\');
+                       break;
+               }
  
                 if (*p >= '1' && *p <= '9')
                 {
@@ -2153,7 +2151,6 @@ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
                         so = pmatch[idx].rm_so;
                         eo = pmatch[idx].rm_eo;
                         p++;
-                       substr_start++;
                 }
                 else if (*p == '&')
                 {
@@ -2161,15 +2158,36 @@ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
                         so = pmatch[0].rm_so;
                         eo = pmatch[0].rm_eo;
                         p++;
-                       substr_start++;
+               }
+               else if (*p == '\\')
+               {
+                       /* \\ means transfer one \ to output. */
+                       appendStringInfoChar(str, '\\');
+                       p++;
+                       continue;
+               }
+               else
+               {
+                       /*
+                        * If escape char is not followed by any expected char,
+                        * just treat it as ordinary data to copy.  (XXX would it be
+                        * better to throw an error?)
+                        */
+                       appendStringInfoChar(str, '\\');
+                       continue;
                 }
  
                 if (so != -1 && eo != -1)
                 {
-                       /* Copy the text that is back reference of regexp. */
-                       text       *append_text = text_substring(PointerGetDatum(src_text),
-                                                                                                  so + 1, (eo - so), false);
+                       /*
+                        * Copy the text that is back reference of regexp.  Because so and
+                        * eo are counted in characters not bytes, it's easiest to use
+                        * text_substring to pull out the correct chunk of text.
+                        */
+                       text       *append_text;
  
+                       append_text = text_substring(PointerGetDatum(src_text),
+                                                                                so + 1, (eo - so), false);
                         appendStringInfoText(str, append_text);
                         pfree(append_text);
                 }
@@ -2180,17 +2198,19 @@ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
  
  /*
   * replace_text_regexp
+ *
   * replace text that matches to regexp in src_text to replace_text.
+ *
+ * Note: to avoid having to include regex.h in builtins.h, we declare
+ * the regexp argument as void *, but really it's regex_t *.
   */
-Datum
-replace_text_regexp(PG_FUNCTION_ARGS)
+text *
+replace_text_regexp(text *src_text, void *regexp,
+                                       text *replace_text, bool glob)
  {
         text       *ret_text;
-       text       *src_text = PG_GETARG_TEXT_P(0);
+       regex_t    *re = (regex_t *) regexp;
         int                     src_text_len = VARSIZE(src_text) - VARHDRSZ;
-       regex_t    *re = (regex_t *) PG_GETARG_POINTER(1);
-       text       *replace_text = PG_GETARG_TEXT_P(2);
-       bool global = PG_GETARG_BOOL(3);
         StringInfo      str = makeStringInfo();
         int                     regexec_result;
         regmatch_t      pmatch[REGEXP_REPLACE_BACKREF_CNT];
@@ -2233,14 +2253,18 @@ replace_text_regexp(PG_FUNCTION_ARGS)
                         break;
  
                 /*
-                * Copy the text when there is a text in the left of matched position.
+                * Copy the text to the left of the match position.  Because we
+                * are working with character not byte indexes, it's easiest to
+                * use text_substring to pull out the needed data.
                  */
                 if (pmatch[0].rm_so - data_pos > 0)
                 {
-                       text       *left_text = text_substring(PointerGetDatum(src_text),
-                                                                                                  data_pos + 1,
-                                                                                 pmatch[0].rm_so - data_pos, false);
+                       text       *left_text;
  
+                       left_text = text_substring(PointerGetDatum(src_text),
+                                                                          data_pos + 1,
+                                                                          pmatch[0].rm_so - data_pos,
+                                                                          false);
                         appendStringInfoText(str, left_text);
                         pfree(left_text);
                 }
@@ -2259,7 +2283,7 @@ replace_text_regexp(PG_FUNCTION_ARGS)
                 /*
                  * When global option is off, replace the first instance only.
                  */
-               if (!global)
+               if (!glob)
                         break;
  
                 /*
@@ -2270,14 +2294,14 @@ replace_text_regexp(PG_FUNCTION_ARGS)
         }
  
         /*
-        * Copy the text when there is a text at the right of last matched or
-        * regexp is not matched.
+        * Copy the text to the right of the last match.
          */
         if (data_pos < data_len)
         {
-               text       *right_text = text_substring(PointerGetDatum(src_text),
-                                                                                               data_pos + 1, -1, true);
+               text       *right_text;
  
+               right_text = text_substring(PointerGetDatum(src_text),
+                                                                       data_pos + 1, -1, true);
                 appendStringInfoText(str, right_text);
                 pfree(right_text);
         }
@@ -2287,7 +2311,7 @@ replace_text_regexp(PG_FUNCTION_ARGS)
         pfree(str);
         pfree(data);
  
-       PG_RETURN_TEXT_P(ret_text);
+       return ret_text;
  }
  
  /*
diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h

index 469d993d04dc0bb452a9299d852bf035f1d0ca9e..1ec358f3849de9d556887127bc926658bc77be61 100644 (file)
--- a/src/include/utils/builtins.h
+++ b/src/include/utils/builtins.h
@@ -7,7 +7,7 @@
   * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $PostgreSQL: pgsql/src/include/utils/builtins.h,v 1.266 2005/10/15 02:49:46 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/utils/builtins.h,v 1.267 2005/10/18 20:38:58 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -593,7 +593,8 @@ extern List *textToQualifiedNameList(text *textval);
  extern bool SplitIdentifierString(char *rawstring, char separator,
                                           List **namelist);
  extern Datum replace_text(PG_FUNCTION_ARGS);
-extern Datum replace_text_regexp(PG_FUNCTION_ARGS);
+extern text *replace_text_regexp(text *src_text, void *regexp,
+                                                                text *replace_text, bool glob);
  extern Datum split_text(PG_FUNCTION_ARGS);
  extern Datum text_to_array(PG_FUNCTION_ARGS);
  extern Datum array_to_text(PG_FUNCTION_ARGS);
author	Tom Lane <tgl@sss.pgh.pa.us>
	Tue, 18 Oct 2005 20:38:58 +0000 (20:38 +0000)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Tue, 18 Oct 2005 20:38:58 +0000 (20:38 +0000)
doc/src/sgml/func.sgml		patch \| blob \| history
src/backend/utils/adt/regexp.c		patch \| blob \| history
src/backend/utils/adt/varlena.c		patch \| blob \| history
src/include/utils/builtins.h		patch \| blob \| history