Fix the general case of quantified regex back-references.

author Tom Lane <tgl@sss.pgh.pa.us>

Fri, 24 Feb 2012 06:40:18 +0000 (01:40 -0500)

committer Tom Lane <tgl@sss.pgh.pa.us>

Fri, 24 Feb 2012 06:41:03 +0000 (01:41 -0500)
author Tom Lane <tgl@sss.pgh.pa.us>
Fri, 24 Feb 2012 06:40:18 +0000 (01:40 -0500)
committer Tom Lane <tgl@sss.pgh.pa.us>
Fri, 24 Feb 2012 06:41:03 +0000 (01:41 -0500)
diff --git a/src/backend/regex/README b/src/backend/regex/README

index 3fd58c000119a24d61dff594baec28a27a4437f0..89ba6a62ea2f70bfda729f03efedba4b9b6fce9b 100644 (file)
--- a/src/backend/regex/README
+++ b/src/backend/regex/README
@@ -102,15 +102,15 @@ consists of a tree of sub-expressions ("subre"s).  Leaf tree nodes are
  either plain regular expressions (which are executed as DFAs in the manner
  described above) or back-references (which try to match the input to some
  previous substring).  Non-leaf nodes are capture nodes (which save the
-location of the substring currently matching their child node) or
-concatenation or alternation nodes.  At execution time, the executor
-recursively scans the tree.  At concatenation or alternation nodes,
-it considers each possible alternative way of matching the input string,
-ie each place where the string could be split for a concatenation, or each
-child node for an alternation.  It tries the next alternative if the match
-fails according to the child nodes.  This is exactly the sort of
-backtracking search done by a traditional NFA regex engine.  If there are
-many tree levels it can get very slow.
+location of the substring currently matching their child node),
+concatenation, alternation, or iteration nodes.  At execution time, the
+executor recursively scans the tree.  At concatenation, alternation, or
+iteration nodes, it considers each possible alternative way of matching the
+input string, that is each place where the string could be split for a
+concatenation or iteration, or each child node for an alternation.  It
+tries the next alternative if the match fails according to the child nodes.
+This is exactly the sort of backtracking search done by a traditional NFA
+regex engine.  If there are many tree levels it can get very slow.
  
  But all is not lost: we can still be smarter than the average pure NFA
  engine.  To do this, each subre node has an associated DFA, which
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c

index 6b80140e90940b4a348c342090e26fdd0bc82c8f..b84d0c3af55f04873bc61c8b843af3b99c865b4a 100644 (file)
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -1036,11 +1036,17 @@ parseqatom(struct vars * v,
         /*----------
          * Prepare a general-purpose state skeleton.
          *
-        *        ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp]
-        *       /                                                                                        /
-        * [lp] ----> [s2] ----bypass---------------------
+        * In the no-backrefs case, we want this:
          *
-        * where bypass is an empty, and prefix is some repetitions of atom
+        * [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp]
+        *
+        * where prefix is some repetitions of atom.  In the general case we need
+        *
+        * [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp]
+        *
+        * where the iterator wraps around [begin] ---atom---> [end]
+        *
+        * We make the s state here for both cases; s2 is made below if needed
          *----------
          */
         s = newstate(v->nfa);           /* first, new endpoints for the atom */
@@ -1051,11 +1057,9 @@ parseqatom(struct vars * v,
         NOERR();
         atom->begin = s;
         atom->end = s2;
-       s = newstate(v->nfa);           /* and spots for prefix and bypass */
-       s2 = newstate(v->nfa);
+       s = newstate(v->nfa);           /* set up starting state */
         NOERR();
         EMPTYARC(lp, s);
-       EMPTYARC(lp, s2);
         NOERR();
  
         /* break remaining subRE into x{...} and what follows */
@@ -1089,28 +1093,9 @@ parseqatom(struct vars * v,
         }
  
         /*
-        * It's quantifier time.  If the atom is just a BACKREF, we'll let it deal
-        * with quantifiers internally.  Otherwise, the first step is to turn
-        * x{0,...} into x{1,...}|empty
+        * It's quantifier time.  If the atom is just a backref, we'll let it deal
+        * with quantifiers internally.
          */
-       if (m == 0 && atomtype != BACKREF)
-       {
-               EMPTYARC(s2, atom->end);        /* the bypass */
-               assert(PREF(qprefer) != 0);
-               f = COMBINE(qprefer, atom->flags);
-               t = subre(v, '|', f, lp, atom->end);
-               NOERR();
-               t->left = atom;
-               t->right = subre(v, '|', PREF(f), s2, atom->end);
-               NOERR();
-               t->right->left = subre(v, '=', 0, s2, atom->end);
-               NOERR();
-               *atomp = t;
-               atomp = &t->left;
-               m = 1;
-       }
-
-       /* deal with the rest of the quantifier */
         if (atomtype == BACKREF)
         {
                 /* special case:  backrefs have internal quantifiers */
@@ -1120,17 +1105,25 @@ parseqatom(struct vars * v,
                 atom->min = (short) m;
                 atom->max = (short) n;
                 atom->flags |= COMBINE(qprefer, atom->flags);
+               /* rest of branch can be strung starting from atom->end */
+               s2 = atom->end;
         }
         else if (m == 1 && n == 1)
         {
                 /* no/vacuous quantifier:  done */
                 EMPTYARC(s, atom->begin);               /* empty prefix */
+               /* rest of branch can be strung starting from atom->end */
+               s2 = atom->end;
         }
-       else
+       else if (m > 0 && !(atom->flags & BACKR))
         {
                 /*
-                * Turn x{m,n} into x{m-1,n-1}x, with capturing parens in only the
-                * second x
+                * If there's no backrefs involved, we can turn x{m,n} into
+                * x{m-1,n-1}x, with capturing parens in only the second x.  This
+                * is valid because we only care about capturing matches from the
+                * final iteration of the quantifier.  It's a win because we can
+                * implement the backref-free left side as a plain DFA node, since
+                * we don't really care where its submatches are.
                  */
                 dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
                 assert(m >= 1 && m != INFINITY && n >= 1);
@@ -1142,16 +1135,36 @@ parseqatom(struct vars * v,
                 NOERR();
                 t->right = atom;
                 *atomp = t;
+               /* rest of branch can be strung starting from atom->end */
+               s2 = atom->end;
+       }
+       else
+       {
+               /* general case: need an iteration node */
+               s2 = newstate(v->nfa);
+               NOERR();
+               moveouts(v->nfa, atom->end, s2);
+               NOERR();
+               dupnfa(v->nfa, atom->begin, atom->end, s, s2);
+               repeat(v, s, s2, m, n);
+               f = COMBINE(qprefer, atom->flags);
+               t = subre(v, '*', f, s, s2);
+               NOERR();
+               t->min = (short) m;
+               t->max = (short) n;
+               t->left = atom;
+               *atomp = t;
+               /* rest of branch is to be strung from iteration's end state */
         }
  
         /* and finally, look after that postponed recursion */
         t = top->right;
         if (!(SEE('|') || SEE(stopper) || SEE(EOS)))
-               t->right = parsebranch(v, stopper, type, atom->end, rp, 1);
+               t->right = parsebranch(v, stopper, type, s2, rp, 1);
         else
         {
-               EMPTYARC(atom->end, rp);
-               t->right = subre(v, '=', 0, atom->end, rp);
+               EMPTYARC(s2, rp);
+               t->right = subre(v, '=', 0, s2, rp);
         }
         assert(SEE('|') || SEE(stopper) || SEE(EOS));
         t->flags |= COMBINE(t->flags, t->right->flags);
@@ -1214,6 +1227,9 @@ scannum(struct vars * v)
  /*
   * repeat - replicate subNFA for quantifiers
   *
+ * The sub-NFA strung from lp to rp is modified to represent m to n
+ * repetitions of its initial contents.
+ *
   * The duplication sequences used here are chosen carefully so that any
   * pointers starting out pointing into the subexpression end up pointing into
   * the last occurrence.  (Note that it may not be strung between the same
@@ -1229,7 +1245,7 @@ repeat(struct vars * v,
            int n)
  {
  #define  SOME   2
-#define  INF 3
+#define  INF    3
  #define  PAIR(x, y)  ((x)*4 + (y))
  #define  REDUCE(x)      ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) )
         const int       rm = REDUCE(m);
@@ -1603,7 +1619,7 @@ subre(struct vars * v,
                 v->treechain = ret;
         }
  
-       assert(strchr("|.b(=", op) != NULL);
+       assert(strchr("=b|.*(", op) != NULL);
  
         ret->op = op;
         ret->flags = flags;
diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c

index 224da5064b69b9577856b21793bd9a92afb03377..ea16e39a6eddfc6267e7c8fc2977afab7cb17e49 100644 (file)
--- a/src/backend/regex/regexec.c
+++ b/src/backend/regex/regexec.c
@@ -140,11 +140,15 @@ static void subset(struct vars *, struct subre *, chr *, chr *);
  static int     dissect(struct vars *, struct subre *, chr *, chr *);
  static int     condissect(struct vars *, struct subre *, chr *, chr *);
  static int     altdissect(struct vars *, struct subre *, chr *, chr *);
+static int     iterdissect(struct vars *, struct subre *, chr *, chr *);
+static int     reviterdissect(struct vars *, struct subre *, chr *, chr *);
  static int     cdissect(struct vars *, struct subre *, chr *, chr *);
  static int     ccondissect(struct vars *, struct subre *, chr *, chr *);
  static int     crevdissect(struct vars *, struct subre *, chr *, chr *);
  static int     cbrdissect(struct vars *, struct subre *, chr *, chr *);
  static int     caltdissect(struct vars *, struct subre *, chr *, chr *);
+static int     citerdissect(struct vars *, struct subre *, chr *, chr *);
+static int     creviterdissect(struct vars *, struct subre *, chr *, chr *);
  
  /* === rege_dfa.c === */
  static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *);
@@ -563,14 +567,17 @@ dissect(struct vars * v,
                 case '=':                               /* terminal node */
                         assert(t->left == NULL && t->right == NULL);
                         return REG_OKAY;        /* no action, parent did the work */
-               case '|':                               /* alternation */
-                       assert(t->left != NULL);
-                       return altdissect(v, t, begin, end);
                 case 'b':                               /* back ref -- shouldn't be calling us! */
                         return REG_ASSERT;
                 case '.':                               /* concatenation */
                         assert(t->left != NULL && t->right != NULL);
                         return condissect(v, t, begin, end);
+               case '|':                               /* alternation */
+                       assert(t->left != NULL);
+                       return altdissect(v, t, begin, end);
+               case '*':                               /* iteration */
+                       assert(t->left != NULL);
+                       return iterdissect(v, t, begin, end);
                 case '(':                               /* capturing */
                         assert(t->left != NULL && t->right == NULL);
                         assert(t->subno > 0);
@@ -696,6 +703,375 @@ altdissect(struct vars * v,
         return REG_ASSERT;                      /* none of them matched?!? */
  }
  
+/*
+ * iterdissect - iteration subexpression matches (uncomplicated)
+ */
+static int                                             /* regexec return code */
+iterdissect(struct vars * v,
+                       struct subre * t,
+                       chr *begin,                     /* beginning of relevant substring */
+                       chr *end)                       /* end of same */
+{
+       struct dfa *d;
+       chr               **endpts;
+       chr                *limit;
+       int                     min_matches;
+       size_t          max_matches;
+       int                     nverified;
+       int                     k;
+       int                     i;
+       int                     er;
+
+       assert(t->op == '*');
+       assert(t->left != NULL && t->left->cnfa.nstates > 0);
+       assert(begin <= end);
+
+       if (t->left->flags & SHORTER)           /* reverse scan */
+               return reviterdissect(v, t, begin, end);
+
+       /*
+        * If zero matches are allowed, and target string is empty, just declare
+        * victory.  OTOH, if target string isn't empty, zero matches can't work
+        * so we pretend the min is 1.
+        */
+       min_matches = t->min;
+       if (min_matches <= 0)
+       {
+               if (begin == end)
+                       return REG_OKAY;
+               min_matches = 1;
+       }
+
+       /*
+        * We need workspace to track the endpoints of each sub-match.  Normally
+        * we consider only nonzero-length sub-matches, so there can be at most
+        * end-begin of them.  However, if min is larger than that, we will also
+        * consider zero-length sub-matches in order to find enough matches.
+        *
+        * For convenience, endpts[0] contains the "begin" pointer and we store
+        * sub-match endpoints in endpts[1..max_matches].
+        */
+       max_matches = end - begin;
+       if (max_matches > t->max && t->max != INFINITY)
+               max_matches = t->max;
+       if (max_matches < min_matches)
+               max_matches = min_matches;
+       endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
+       if (endpts == NULL)
+               return REG_ESPACE;
+       endpts[0] = begin;
+
+       d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
+       if (ISERR())
+       {
+               FREE(endpts);
+               return v->err;
+       }
+       MDEBUG(("iter %d\n", t->retry));
+
+       /*
+        * Our strategy is to first find a set of sub-match endpoints that are
+        * valid according to the child node's DFA, and then recursively dissect
+        * each sub-match to confirm validity.  If any validity check fails,
+        * backtrack the last sub-match and try again.  And, when we next try for
+        * a validity check, we need not recheck any successfully verified
+        * sub-matches that we didn't move the endpoints of.  nverified remembers
+        * how many sub-matches are currently known okay.
+        */
+
+       /* initialize to consider first sub-match */
+       nverified = 0;
+       k = 1;
+       limit = end;
+
+       /* iterate until satisfaction or failure */
+       while (k > 0)
+       {
+               /* try to find an endpoint for the k'th sub-match */
+               endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL);
+               if (endpts[k] == NULL)
+               {
+                       /* no match possible, so see if we can shorten previous one */
+                       k--;
+                       goto backtrack;
+               }
+               MDEBUG(("%d: working endpoint %d: %ld\n",
+                               t->retry, k, LOFF(endpts[k])));
+
+               /* k'th sub-match can no longer be considered verified */
+               if (nverified >= k)
+                       nverified = k - 1;
+
+               if (endpts[k] != end)
+               {
+                       /* haven't reached end yet, try another iteration if allowed */
+                       if (k >= max_matches)
+                       {
+                               /* must try to shorten some previous match */
+                               k--;
+                               goto backtrack;
+                       }
+
+                       /* reject zero-length match unless necessary to achieve min */
+                       if (endpts[k] == endpts[k - 1] &&
+                               (k >= min_matches || min_matches - k < end - endpts[k]))
+                               goto backtrack;
+
+                       k++;
+                       limit = end;
+                       continue;
+               }
+
+               /*
+                * We've identified a way to divide the string into k sub-matches
+                * that works so far as the child DFA can tell.  If k is an allowed
+                * number of matches, start the slow part: recurse to verify each
+                * sub-match.  We always have k <= max_matches, needn't check that.
+                */
+               if (k < min_matches)
+                       goto backtrack;
+
+               MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k));
+
+               for (i = nverified + 1; i <= k; i++)
+               {
+                       er = dissect(v, t->left, endpts[i - 1], endpts[i]);
+                       if (er == REG_OKAY)
+                       {
+                               nverified = i;
+                               continue;
+                       }
+                       if (er == REG_NOMATCH)
+                               break;
+                       /* oops, something failed */
+                       freedfa(d);
+                       FREE(endpts);
+                       return er;
+               }
+
+               if (i > k)
+               {
+                       /* satisfaction */
+                       MDEBUG(("%d successful\n", t->retry));
+                       freedfa(d);
+                       FREE(endpts);
+                       return REG_OKAY;
+               }
+
+               /* match failed to verify, so backtrack */
+
+backtrack:
+               /*
+                * Must consider shorter versions of the current sub-match.  However,
+                * we'll only ask for a zero-length match if necessary.
+                */
+               while (k > 0)
+               {
+                       chr        *prev_end = endpts[k - 1];
+
+                       if (endpts[k] > prev_end)
+                       {
+                               limit = endpts[k] - 1;
+                               if (limit > prev_end ||
+                                       (k < min_matches && min_matches - k >= end - prev_end))
+                               {
+                                       /* break out of backtrack loop, continue the outer one */
+                                       break;
+                               }
+                       }
+                       /* can't shorten k'th sub-match any more, consider previous one */
+                       k--;
+               }
+       }
+
+       /* all possibilities exhausted - shouldn't happen in uncomplicated mode */
+       MDEBUG(("%d failed\n", t->retry));
+       freedfa(d);
+       FREE(endpts);
+       return REG_ASSERT;
+}
+
+/*
+ * reviterdissect - shortest-first iteration subexpression matches
+ */
+static int                                             /* regexec return code */
+reviterdissect(struct vars * v,
+                          struct subre * t,
+                          chr *begin,          /* beginning of relevant substring */
+                          chr *end)            /* end of same */
+{
+       struct dfa *d;
+       chr               **endpts;
+       chr                *limit;
+       int                     min_matches;
+       size_t          max_matches;
+       int                     nverified;
+       int                     k;
+       int                     i;
+       int                     er;
+
+       assert(t->op == '*');
+       assert(t->left != NULL && t->left->cnfa.nstates > 0);
+       assert(t->left->flags & SHORTER);
+       assert(begin <= end);
+
+       /*
+        * If zero matches are allowed, and target string is empty, just declare
+        * victory.  OTOH, if target string isn't empty, zero matches can't work
+        * so we pretend the min is 1.
+        */
+       min_matches = t->min;
+       if (min_matches <= 0)
+       {
+               if (begin == end)
+                       return REG_OKAY;
+               min_matches = 1;
+       }
+
+       /*
+        * We need workspace to track the endpoints of each sub-match.  Normally
+        * we consider only nonzero-length sub-matches, so there can be at most
+        * end-begin of them.  However, if min is larger than that, we will also
+        * consider zero-length sub-matches in order to find enough matches.
+        *
+        * For convenience, endpts[0] contains the "begin" pointer and we store
+        * sub-match endpoints in endpts[1..max_matches].
+        */
+       max_matches = end - begin;
+       if (max_matches > t->max && t->max != INFINITY)
+               max_matches = t->max;
+       if (max_matches < min_matches)
+               max_matches = min_matches;
+       endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
+       if (endpts == NULL)
+               return REG_ESPACE;
+       endpts[0] = begin;
+
+       d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
+       if (ISERR())
+       {
+               FREE(endpts);
+               return v->err;
+       }
+       MDEBUG(("reviter %d\n", t->retry));
+
+       /*
+        * Our strategy is to first find a set of sub-match endpoints that are
+        * valid according to the child node's DFA, and then recursively dissect
+        * each sub-match to confirm validity.  If any validity check fails,
+        * backtrack the last sub-match and try again.  And, when we next try for
+        * a validity check, we need not recheck any successfully verified
+        * sub-matches that we didn't move the endpoints of.  nverified remembers
+        * how many sub-matches are currently known okay.
+        */
+
+       /* initialize to consider first sub-match */
+       nverified = 0;
+       k = 1;
+       limit = begin;
+
+       /* iterate until satisfaction or failure */
+       while (k > 0)
+       {
+               /* disallow zero-length match unless necessary to achieve min */
+               if (limit == endpts[k - 1] &&
+                       limit != end &&
+                       (k >= min_matches || min_matches - k < end - limit))
+                       limit++;
+
+               /* try to find an endpoint for the k'th sub-match */
+               endpts[k] = shortest(v, d, endpts[k - 1], limit, end,
+                                                        (chr **) NULL, (int *) NULL);
+               if (endpts[k] == NULL)
+               {
+                       /* no match possible, so see if we can lengthen previous one */
+                       k--;
+                       goto backtrack;
+               }
+               MDEBUG(("%d: working endpoint %d: %ld\n",
+                               t->retry, k, LOFF(endpts[k])));
+
+               /* k'th sub-match can no longer be considered verified */
+               if (nverified >= k)
+                       nverified = k - 1;
+
+               if (endpts[k] != end)
+               {
+                       /* haven't reached end yet, try another iteration if allowed */
+                       if (k >= max_matches)
+                       {
+                               /* must try to lengthen some previous match */
+                               k--;
+                               goto backtrack;
+                       }
+
+                       k++;
+                       limit = endpts[k - 1];
+                       continue;
+               }
+
+               /*
+                * We've identified a way to divide the string into k sub-matches
+                * that works so far as the child DFA can tell.  If k is an allowed
+                * number of matches, start the slow part: recurse to verify each
+                * sub-match.  We always have k <= max_matches, needn't check that.
+                */
+               if (k < min_matches)
+                       goto backtrack;
+
+               MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k));
+
+               for (i = nverified + 1; i <= k; i++)
+               {
+                       er = dissect(v, t->left, endpts[i - 1], endpts[i]);
+                       if (er == REG_OKAY)
+                       {
+                               nverified = i;
+                               continue;
+                       }
+                       if (er == REG_NOMATCH)
+                               break;
+                       /* oops, something failed */
+                       freedfa(d);
+                       FREE(endpts);
+                       return er;
+               }
+
+               if (i > k)
+               {
+                       /* satisfaction */
+                       MDEBUG(("%d successful\n", t->retry));
+                       freedfa(d);
+                       FREE(endpts);
+                       return REG_OKAY;
+               }
+
+               /* match failed to verify, so backtrack */
+
+backtrack:
+               /*
+                * Must consider longer versions of the current sub-match.
+                */
+               while (k > 0)
+               {
+                       if (endpts[k] < end)
+                       {
+                               limit = endpts[k] + 1;
+                               /* break out of backtrack loop, continue the outer one */
+                               break;
+                       }
+                       /* can't lengthen k'th sub-match any more, consider previous one */
+                       k--;
+               }
+       }
+
+       /* all possibilities exhausted - shouldn't happen in uncomplicated mode */
+       MDEBUG(("%d failed\n", t->retry));
+       freedfa(d);
+       FREE(endpts);
+       return REG_ASSERT;
+}
+
  /*
   * cdissect - determine subexpression matches (with complications)
   * The retry memory stores the offset of the trial midpoint from begin,
@@ -717,15 +1093,18 @@ cdissect(struct vars * v,
                 case '=':                               /* terminal node */
                         assert(t->left == NULL && t->right == NULL);
                         return REG_OKAY;        /* no action, parent did the work */
-               case '|':                               /* alternation */
-                       assert(t->left != NULL);
-                       return caltdissect(v, t, begin, end);
                 case 'b':                               /* back reference */
                         assert(t->left == NULL && t->right == NULL);
                         return cbrdissect(v, t, begin, end);
                 case '.':                               /* concatenation */
                         assert(t->left != NULL && t->right != NULL);
                         return ccondissect(v, t, begin, end);
+               case '|':                               /* alternation */
+                       assert(t->left != NULL);
+                       return caltdissect(v, t, begin, end);
+               case '*':                               /* iteration */
+                       assert(t->left != NULL);
+                       return citerdissect(v, t, begin, end);
                 case '(':                               /* capturing */
                         assert(t->left != NULL && t->right == NULL);
                         assert(t->subno > 0);
@@ -847,7 +1226,7 @@ ccondissect(struct vars * v,
  }
  
  /*
- * crevdissect - determine backref shortest-first subexpression matches
+ * crevdissect - shortest-first concatenation subexpression matches
   * The retry memory stores the offset of the trial midpoint from begin,
   * plus 1 so that 0 uniquely means "clean slate".
   */
@@ -1088,6 +1467,377 @@ caltdissect(struct vars * v,
         return caltdissect(v, t->right, begin, end);
  }
  
+/*
+ * citerdissect - iteration subexpression matches (with complications)
+ */
+static int                                             /* regexec return code */
+citerdissect(struct vars * v,
+                        struct subre * t,
+                        chr *begin,            /* beginning of relevant substring */
+                        chr *end)                      /* end of same */
+{
+       struct dfa *d;
+       chr               **endpts;
+       chr                *limit;
+       int                     min_matches;
+       size_t          max_matches;
+       int                     nverified;
+       int                     k;
+       int                     i;
+       int                     er;
+
+       assert(t->op == '*');
+       assert(t->left != NULL && t->left->cnfa.nstates > 0);
+       assert(begin <= end);
+
+       if (t->left->flags & SHORTER)           /* reverse scan */
+               return creviterdissect(v, t, begin, end);
+
+       /*
+        * If zero matches are allowed, and target string is empty, just declare
+        * victory.  OTOH, if target string isn't empty, zero matches can't work
+        * so we pretend the min is 1.
+        */
+       min_matches = t->min;
+       if (min_matches <= 0)
+       {
+               if (begin == end)
+                       return REG_OKAY;
+               min_matches = 1;
+       }
+
+       /*
+        * We need workspace to track the endpoints of each sub-match.  Normally
+        * we consider only nonzero-length sub-matches, so there can be at most
+        * end-begin of them.  However, if min is larger than that, we will also
+        * consider zero-length sub-matches in order to find enough matches.
+        *
+        * For convenience, endpts[0] contains the "begin" pointer and we store
+        * sub-match endpoints in endpts[1..max_matches].
+        */
+       max_matches = end - begin;
+       if (max_matches > t->max && t->max != INFINITY)
+               max_matches = t->max;
+       if (max_matches < min_matches)
+               max_matches = min_matches;
+       endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
+       if (endpts == NULL)
+               return REG_ESPACE;
+       endpts[0] = begin;
+
+       d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
+       if (ISERR())
+       {
+               FREE(endpts);
+               return v->err;
+       }
+       MDEBUG(("citer %d\n", t->retry));
+
+       /*
+        * Our strategy is to first find a set of sub-match endpoints that are
+        * valid according to the child node's DFA, and then recursively dissect
+        * each sub-match to confirm validity.  If any validity check fails,
+        * backtrack the last sub-match and try again.  And, when we next try for
+        * a validity check, we need not recheck any successfully verified
+        * sub-matches that we didn't move the endpoints of.  nverified remembers
+        * how many sub-matches are currently known okay.
+        */
+
+       /* initialize to consider first sub-match */
+       nverified = 0;
+       k = 1;
+       limit = end;
+
+       /* iterate until satisfaction or failure */
+       while (k > 0)
+       {
+               /* try to find an endpoint for the k'th sub-match */
+               endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL);
+               if (endpts[k] == NULL)
+               {
+                       /* no match possible, so see if we can shorten previous one */
+                       k--;
+                       goto backtrack;
+               }
+               MDEBUG(("%d: working endpoint %d: %ld\n",
+                               t->retry, k, LOFF(endpts[k])));
+
+               /* k'th sub-match can no longer be considered verified */
+               if (nverified >= k)
+                       nverified = k - 1;
+
+               if (endpts[k] != end)
+               {
+                       /* haven't reached end yet, try another iteration if allowed */
+                       if (k >= max_matches)
+                       {
+                               /* must try to shorten some previous match */
+                               k--;
+                               goto backtrack;
+                       }
+
+                       /* reject zero-length match unless necessary to achieve min */
+                       if (endpts[k] == endpts[k - 1] &&
+                               (k >= min_matches || min_matches - k < end - endpts[k]))
+                               goto backtrack;
+
+                       k++;
+                       limit = end;
+                       continue;
+               }
+
+               /*
+                * We've identified a way to divide the string into k sub-matches
+                * that works so far as the child DFA can tell.  If k is an allowed
+                * number of matches, start the slow part: recurse to verify each
+                * sub-match.  We always have k <= max_matches, needn't check that.
+                */
+               if (k < min_matches)
+                       goto backtrack;
+
+               MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k));
+
+               for (i = nverified + 1; i <= k; i++)
+               {
+                       zapmem(v, t->left);
+                       er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
+                       if (er == REG_OKAY)
+                       {
+                               nverified = i;
+                               continue;
+                       }
+                       if (er == REG_NOMATCH)
+                               break;
+                       /* oops, something failed */
+                       freedfa(d);
+                       FREE(endpts);
+                       return er;
+               }
+
+               if (i > k)
+               {
+                       /* satisfaction */
+                       MDEBUG(("%d successful\n", t->retry));
+                       freedfa(d);
+                       FREE(endpts);
+                       return REG_OKAY;
+               }
+
+               /* match failed to verify, so backtrack */
+
+backtrack:
+               /*
+                * Must consider shorter versions of the current sub-match.  However,
+                * we'll only ask for a zero-length match if necessary.
+                */
+               while (k > 0)
+               {
+                       chr        *prev_end = endpts[k - 1];
+
+                       if (endpts[k] > prev_end)
+                       {
+                               limit = endpts[k] - 1;
+                               if (limit > prev_end ||
+                                       (k < min_matches && min_matches - k >= end - prev_end))
+                               {
+                                       /* break out of backtrack loop, continue the outer one */
+                                       break;
+                               }
+                       }
+                       /* can't shorten k'th sub-match any more, consider previous one */
+                       k--;
+               }
+       }
+
+       /* all possibilities exhausted */
+       MDEBUG(("%d failed\n", t->retry));
+       freedfa(d);
+       FREE(endpts);
+       return REG_NOMATCH;
+}
+
+/*
+ * creviterdissect - shortest-first iteration subexpression matches
+ */
+static int                                             /* regexec return code */
+creviterdissect(struct vars * v,
+                               struct subre * t,
+                               chr *begin,             /* beginning of relevant substring */
+                               chr *end)               /* end of same */
+{
+       struct dfa *d;
+       chr               **endpts;
+       chr                *limit;
+       int                     min_matches;
+       size_t          max_matches;
+       int                     nverified;
+       int                     k;
+       int                     i;
+       int                     er;
+
+       assert(t->op == '*');
+       assert(t->left != NULL && t->left->cnfa.nstates > 0);
+       assert(t->left->flags & SHORTER);
+       assert(begin <= end);
+
+       /*
+        * If zero matches are allowed, and target string is empty, just declare
+        * victory.  OTOH, if target string isn't empty, zero matches can't work
+        * so we pretend the min is 1.
+        */
+       min_matches = t->min;
+       if (min_matches <= 0)
+       {
+               if (begin == end)
+                       return REG_OKAY;
+               min_matches = 1;
+       }
+
+       /*
+        * We need workspace to track the endpoints of each sub-match.  Normally
+        * we consider only nonzero-length sub-matches, so there can be at most
+        * end-begin of them.  However, if min is larger than that, we will also
+        * consider zero-length sub-matches in order to find enough matches.
+        *
+        * For convenience, endpts[0] contains the "begin" pointer and we store
+        * sub-match endpoints in endpts[1..max_matches].
+        */
+       max_matches = end - begin;
+       if (max_matches > t->max && t->max != INFINITY)
+               max_matches = t->max;
+       if (max_matches < min_matches)
+               max_matches = min_matches;
+       endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
+       if (endpts == NULL)
+               return REG_ESPACE;
+       endpts[0] = begin;
+
+       d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
+       if (ISERR())
+       {
+               FREE(endpts);
+               return v->err;
+       }
+       MDEBUG(("creviter %d\n", t->retry));
+
+       /*
+        * Our strategy is to first find a set of sub-match endpoints that are
+        * valid according to the child node's DFA, and then recursively dissect
+        * each sub-match to confirm validity.  If any validity check fails,
+        * backtrack the last sub-match and try again.  And, when we next try for
+        * a validity check, we need not recheck any successfully verified
+        * sub-matches that we didn't move the endpoints of.  nverified remembers
+        * how many sub-matches are currently known okay.
+        */
+
+       /* initialize to consider first sub-match */
+       nverified = 0;
+       k = 1;
+       limit = begin;
+
+       /* iterate until satisfaction or failure */
+       while (k > 0)
+       {
+               /* disallow zero-length match unless necessary to achieve min */
+               if (limit == endpts[k - 1] &&
+                       limit != end &&
+                       (k >= min_matches || min_matches - k < end - limit))
+                       limit++;
+
+               /* try to find an endpoint for the k'th sub-match */
+               endpts[k] = shortest(v, d, endpts[k - 1], limit, end,
+                                                        (chr **) NULL, (int *) NULL);
+               if (endpts[k] == NULL)
+               {
+                       /* no match possible, so see if we can lengthen previous one */
+                       k--;
+                       goto backtrack;
+               }
+               MDEBUG(("%d: working endpoint %d: %ld\n",
+                               t->retry, k, LOFF(endpts[k])));
+
+               /* k'th sub-match can no longer be considered verified */
+               if (nverified >= k)
+                       nverified = k - 1;
+
+               if (endpts[k] != end)
+               {
+                       /* haven't reached end yet, try another iteration if allowed */
+                       if (k >= max_matches)
+                       {
+                               /* must try to lengthen some previous match */
+                               k--;
+                               goto backtrack;
+                       }
+
+                       k++;
+                       limit = endpts[k - 1];
+                       continue;
+               }
+
+               /*
+                * We've identified a way to divide the string into k sub-matches
+                * that works so far as the child DFA can tell.  If k is an allowed
+                * number of matches, start the slow part: recurse to verify each
+                * sub-match.  We always have k <= max_matches, needn't check that.
+                */
+               if (k < min_matches)
+                       goto backtrack;
+
+               MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k));
+
+               for (i = nverified + 1; i <= k; i++)
+               {
+                       zapmem(v, t->left);
+                       er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
+                       if (er == REG_OKAY)
+                       {
+                               nverified = i;
+                               continue;
+                       }
+                       if (er == REG_NOMATCH)
+                               break;
+                       /* oops, something failed */
+                       freedfa(d);
+                       FREE(endpts);
+                       return er;
+               }
+
+               if (i > k)
+               {
+                       /* satisfaction */
+                       MDEBUG(("%d successful\n", t->retry));
+                       freedfa(d);
+                       FREE(endpts);
+                       return REG_OKAY;
+               }
+
+               /* match failed to verify, so backtrack */
+
+backtrack:
+               /*
+                * Must consider longer versions of the current sub-match.
+                */
+               while (k > 0)
+               {
+                       if (endpts[k] < end)
+                       {
+                               limit = endpts[k] + 1;
+                               /* break out of backtrack loop, continue the outer one */
+                               break;
+                       }
+                       /* can't lengthen k'th sub-match any more, consider previous one */
+                       k--;
+               }
+       }
+
+       /* all possibilities exhausted */
+       MDEBUG(("%d failed\n", t->retry));
+       freedfa(d);
+       FREE(endpts);
+       return REG_NOMATCH;
+}
+
  
  
  #include "rege_dfa.c"
diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h

index fb6789b560f3899b7c77feef4e0e91ac1f2d9c5d..d420ea8316e18f2ff009af5c618027cf3fae8256 100644 (file)
--- a/src/include/regex/regguts.h
+++ b/src/include/regex/regguts.h
@@ -372,10 +372,28 @@ struct cnfa
  
  /*
   * subexpression tree
+ *
+ * "op" is one of:
+ *             '='  plain regex without interesting substructure (implemented as DFA)
+ *             'b'  back-reference (has no substructure either)
+ *             '('  capture node: captures the match of its single child
+ *             '.'  concatenation: matches a match for left, then a match for right
+ *             '|'  alternation: matches a match for left or a match for right
+ *             '*'  iteration: matches some number of matches of its single child
+ *
+ * Note: the right child of an alternation must be another alternation or
+ * NULL; hence, an N-way branch requires N alternation nodes, not N-1 as you
+ * might expect.  This could stand to be changed.  Actually I'd rather see
+ * a single alternation node with N children, but that will take revising
+ * the representation of struct subre.
+ *
+ * Note: when a backref is directly quantified, we stick the min/max counts
+ * into the backref rather than plastering an iteration node on top.  This is
+ * for efficiency: there is no need to search for possible division points.
   */
  struct subre
  {
-       char            op;                             /* '|', '.' (concat), 'b' (backref), '(', '=' */
+       char            op;                             /* see type codes above */
         char            flags;
  #define  LONGER  01                            /* prefers longer match */
  #define  SHORTER 02                            /* prefers shorter match */
@@ -393,8 +411,8 @@ struct subre
  #define  COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2))
         short           retry;                  /* index into retry memory */
         int                     subno;                  /* subexpression number (for 'b' and '(') */
-       short           min;                    /* min repetitions, for backref only */
-       short           max;                    /* max repetitions, for backref only */
+       short           min;                    /* min repetitions for iteration or backref */
+       short           max;                    /* max repetitions for iteration or backref */
         struct subre *left;                     /* left child, if any (also freelist chain) */
         struct subre *right;            /* right child, if any */
         struct state *begin;            /* outarcs from here... */
diff --git a/src/test/regress/expected/regex.out b/src/test/regress/expected/regex.out

index 5694908163af856f78940a9e31d75a370b4bfe09..4acc4a47a03bca1ad6303765c519f414fdbf5c42 100644 (file)
--- a/src/test/regress/expected/regex.out
+++ b/src/test/regress/expected/regex.out
@@ -34,3 +34,40 @@ select 'b' ~ '^([bc])\1*$' as t;
   t
  (1 row)
  
+-- Test quantified backref within a larger expression
+select 'abc abc abc' ~ '^(\w+)( \1)+$' as t;
+ t 
+---
+ t
+(1 row)
+
+select 'abc abd abc' ~ '^(\w+)( \1)+$' as f;
+ f 
+---
+ f
+(1 row)
+
+select 'abc abc abd' ~ '^(\w+)( \1)+$' as f;
+ f 
+---
+ f
+(1 row)
+
+select 'abc abc abc' ~ '^(.+)( \1)+$' as t;
+ t 
+---
+ t
+(1 row)
+
+select 'abc abd abc' ~ '^(.+)( \1)+$' as f;
+ f 
+---
+ f
+(1 row)
+
+select 'abc abc abd' ~ '^(.+)( \1)+$' as f;
+ f 
+---
+ f
+(1 row)
+
diff --git a/src/test/regress/sql/regex.sql b/src/test/regress/sql/regex.sql

index 242a81ef3298a68d79cf4af971e17935f3630964..b5315a3df6ddda340d2bf6f27137aba1210267f1 100644 (file)
--- a/src/test/regress/sql/regex.sql
+++ b/src/test/regress/sql/regex.sql
@@ -11,3 +11,11 @@ select 'ccc' ~ '^([bc])\1*$' as t;
  select 'xxx' ~ '^([bc])\1*$' as f;
  select 'bbc' ~ '^([bc])\1*$' as f;
  select 'b' ~ '^([bc])\1*$' as t;
+
+-- Test quantified backref within a larger expression
+select 'abc abc abc' ~ '^(\w+)( \1)+$' as t;
+select 'abc abd abc' ~ '^(\w+)( \1)+$' as f;
+select 'abc abc abd' ~ '^(\w+)( \1)+$' as f;
+select 'abc abc abc' ~ '^(.+)( \1)+$' as t;
+select 'abc abd abc' ~ '^(.+)( \1)+$' as f;
+select 'abc abc abd' ~ '^(.+)( \1)+$' as f;
author	Tom Lane <tgl@sss.pgh.pa.us>
	Fri, 24 Feb 2012 06:40:18 +0000 (01:40 -0500)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Fri, 24 Feb 2012 06:41:03 +0000 (01:41 -0500)
src/backend/regex/README		patch \| blob \| history
src/backend/regex/regcomp.c		patch \| blob \| history
src/backend/regex/regexec.c		patch \| blob \| history
src/include/regex/regguts.h		patch \| blob \| history
src/test/regress/expected/regex.out		patch \| blob \| history
src/test/regress/sql/regex.sql		patch \| blob \| history