Fix regex back-references that are directly quantified with *.

author Tom Lane <tgl@sss.pgh.pa.us>

Mon, 20 Feb 2012 05:52:33 +0000 (00:52 -0500)

committer Tom Lane <tgl@sss.pgh.pa.us>

Mon, 20 Feb 2012 05:52:33 +0000 (00:52 -0500)
author Tom Lane <tgl@sss.pgh.pa.us>
Mon, 20 Feb 2012 05:52:33 +0000 (00:52 -0500)
committer Tom Lane <tgl@sss.pgh.pa.us>
Mon, 20 Feb 2012 05:52:33 +0000 (00:52 -0500)
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c

index 4f9da5b0468d53b997ddb52eb0b5a66a17696318..6b80140e90940b4a348c342090e26fdd0bc82c8f 100644 (file)
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -1088,8 +1088,12 @@ parseqatom(struct vars * v,
                 NOERR();
         }
  
-       /* it's quantifier time; first, turn x{0,...} into x{1,...}|empty */
-       if (m == 0)
+       /*
+        * It's quantifier time.  If the atom is just a BACKREF, we'll let it deal
+        * with quantifiers internally.  Otherwise, the first step is to turn
+        * x{0,...} into x{1,...}|empty
+        */
+       if (m == 0 && atomtype != BACKREF)
         {
                 EMPTYARC(s2, atom->end);        /* the bypass */
                 assert(PREF(qprefer) != 0);
diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c

index f8e31f8f4ade89d9a4bec2ab8287b4689c1d082b..224da5064b69b9577856b21793bd9a92afb03377 100644 (file)
--- a/src/backend/regex/regexec.c
+++ b/src/backend/regex/regexec.c
@@ -720,7 +720,7 @@ cdissect(struct vars * v,
                 case '|':                               /* alternation */
                         assert(t->left != NULL);
                         return caltdissect(v, t, begin, end);
-               case 'b':                               /* back ref -- shouldn't be calling us! */
+               case 'b':                               /* back reference */
                         assert(t->left == NULL && t->right == NULL);
                         return cbrdissect(v, t, begin, end);
                 case '.':                               /* concatenation */
@@ -962,12 +962,12 @@ cbrdissect(struct vars * v,
                    chr *begin,                  /* beginning of relevant substring */
                    chr *end)                    /* end of same */
  {
-       int                     i;
         int                     n = t->subno;
-       size_t          len;
-       chr                *paren;
+       size_t          numreps;
+       size_t          tlen;
+       size_t          brlen;
+       chr                *brstring;
         chr                *p;
-       chr                *stop;
         int                     min = t->min;
         int                     max = t->max;
  
@@ -978,46 +978,65 @@ cbrdissect(struct vars * v,
  
         MDEBUG(("cbackref n%d %d{%d-%d}\n", t->retry, n, min, max));
  
+       /* get the backreferenced string */
         if (v->pmatch[n].rm_so == -1)
                 return REG_NOMATCH;
-       paren = v->start + v->pmatch[n].rm_so;
-       len = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
+       brstring = v->start + v->pmatch[n].rm_so;
+       brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
  
         /* no room to maneuver -- retries are pointless */
         if (v->mem[t->retry])
                 return REG_NOMATCH;
         v->mem[t->retry] = 1;
  
-       /* special-case zero-length string */
-       if (len == 0)
+       /* special cases for zero-length strings */
+       if (brlen == 0)
+       {
+               /*
+                * matches only if target is zero length, but any number of
+                * repetitions can be considered to be present
+                */
+               if (begin == end && min <= max)
+               {
+                       MDEBUG(("cbackref matched trivially\n"));
+                       return REG_OKAY;
+               }
+               return REG_NOMATCH;
+       }
+       if (begin == end)
         {
-               if (begin == end)
+               /* matches only if zero repetitions are okay */
+               if (min == 0)
+               {
+                       MDEBUG(("cbackref matched trivially\n"));
                         return REG_OKAY;
+               }
                 return REG_NOMATCH;
         }
  
-       /* and too-short string */
-       assert(end >= begin);
-       if ((size_t) (end - begin) < len)
+       /*
+        * check target length to see if it could possibly be an allowed number of
+        * repetitions of brstring
+        */
+       assert(end > begin);
+       tlen = end - begin;
+       if (tlen % brlen != 0)
+               return REG_NOMATCH;
+       numreps = tlen / brlen;
+       if (numreps < min || (numreps > max && max != INFINITY))
                 return REG_NOMATCH;
-       stop = end - len;
  
-       /* count occurrences */
-       i = 0;
-       for (p = begin; p <= stop && (i < max || max == INFINITY); p += len)
+       /* okay, compare the actual string contents */
+       p = begin;
+       while (numreps-- > 0)
         {
-               if ((*v->g->compare) (paren, p, len) != 0)
-                       break;
-               i++;
+               if ((*v->g->compare) (brstring, p, brlen) != 0)
+                       return REG_NOMATCH;
+               p += brlen;
         }
-       MDEBUG(("cbackref found %d\n", i));
  
-       /* and sort it out */
-       if (p != end)                           /* didn't consume all of it */
-               return REG_NOMATCH;
-       if (min <= i && (i <= max || max == INFINITY))
-               return REG_OKAY;
-       return REG_NOMATCH;                     /* out of range */
+       MDEBUG(("cbackref matched\n"));
+       return REG_OKAY;
  }
  
  /*
diff --git a/src/test/regress/expected/regex.out b/src/test/regress/expected/regex.out

new file mode 100644 (file)

index 0000000..5694908
--- /dev/null
+++ b/src/test/regress/expected/regex.out
@@ -0,0 +1,36 @@
+--
+-- Regular expression tests
+--
+-- Don't want to have to double backslashes in regexes
+set standard_conforming_strings = on;
+-- Test simple quantified backrefs
+select 'bbbbb' ~ '^([bc])\1*$' as t;
+ t 
+---
+ t
+(1 row)
+
+select 'ccc' ~ '^([bc])\1*$' as t;
+ t 
+---
+ t
+(1 row)
+
+select 'xxx' ~ '^([bc])\1*$' as f;
+ f 
+---
+ f
+(1 row)
+
+select 'bbc' ~ '^([bc])\1*$' as f;
+ f 
+---
+ f
+(1 row)
+
+select 'b' ~ '^([bc])\1*$' as t;
+ t 
+---
+ t
+(1 row)
+
diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule

index 862f5b20077a66d80aa0009522d310f875e93487..8852e0a40fc5ca9d0123bdda955e3a04fc71ce0a 100644 (file)
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@@ -30,7 +30,7 @@ test: point lseg box path polygon circle date time timetz timestamp timestamptz
  # geometry depends on point, lseg, box, path, polygon and circle
  # horology depends on interval, timetz, timestamp, timestamptz, reltime and abstime
  # ----------
-test: geometry horology oidjoins type_sanity opr_sanity
+test: geometry horology regex oidjoins type_sanity opr_sanity
  
  # ----------
  # These four each depend on the previous one
diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule

index 142fc9cf0d1a177fe5881e6065ab343f88ade78d..0bc5df7fe73f59b4868ca881247487eadc83107d 100644 (file)
--- a/src/test/regress/serial_schedule
+++ b/src/test/regress/serial_schedule
@@ -42,6 +42,7 @@ test: tstypes
  test: comments
  test: geometry
  test: horology
+test: regex
  test: oidjoins
  test: type_sanity
  test: opr_sanity
diff --git a/src/test/regress/sql/regex.sql b/src/test/regress/sql/regex.sql

new file mode 100644 (file)

index 0000000..242a81e
--- /dev/null
+++ b/src/test/regress/sql/regex.sql
@@ -0,0 +1,13 @@
+--
+-- Regular expression tests
+--
+
+-- Don't want to have to double backslashes in regexes
+set standard_conforming_strings = on;
+
+-- Test simple quantified backrefs
+select 'bbbbb' ~ '^([bc])\1*$' as t;
+select 'ccc' ~ '^([bc])\1*$' as t;
+select 'xxx' ~ '^([bc])\1*$' as f;
+select 'bbc' ~ '^([bc])\1*$' as f;
+select 'b' ~ '^([bc])\1*$' as t;
author	Tom Lane <tgl@sss.pgh.pa.us>
	Mon, 20 Feb 2012 05:52:33 +0000 (00:52 -0500)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Mon, 20 Feb 2012 05:52:33 +0000 (00:52 -0500)
src/backend/regex/regcomp.c		patch | blob | history
src/backend/regex/regexec.c		patch | blob | history
src/test/regress/expected/regex.out	[new file with mode: 0644]	patch | blob
src/test/regress/parallel_schedule		patch | blob | history
src/test/regress/serial_schedule		patch | blob | history
src/test/regress/sql/regex.sql	[new file with mode: 0644]	patch | blob