/*----------
* Prepare a general-purpose state skeleton.
*
- * ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp]
- * / /
- * [lp] ----> [s2] ----bypass---------------------
+ * In the no-backrefs case, we want this:
*
- * where bypass is an empty, and prefix is some repetitions of atom
+ * [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp]
+ *
+ * where prefix is some repetitions of atom. In the general case we need
+ *
+ * [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp]
+ *
+ * where the iterator wraps around [begin] ---atom---> [end]
+ *
+ * We make the s state here for both cases; s2 is made below if needed
*----------
*/
s = newstate(v->nfa); /* first, new endpoints for the atom */
NOERR();
atom->begin = s;
atom->end = s2;
- s = newstate(v->nfa); /* and spots for prefix and bypass */
- s2 = newstate(v->nfa);
+ s = newstate(v->nfa); /* set up starting state */
NOERR();
EMPTYARC(lp, s);
- EMPTYARC(lp, s2);
NOERR();
/* break remaining subRE into x{...} and what follows */
}
/*
- * It's quantifier time. If the atom is just a BACKREF, we'll let it deal
- * with quantifiers internally. Otherwise, the first step is to turn
- * x{0,...} into x{1,...}|empty
+ * It's quantifier time. If the atom is just a backref, we'll let it deal
+ * with quantifiers internally.
*/
- if (m == 0 && atomtype != BACKREF)
- {
- EMPTYARC(s2, atom->end); /* the bypass */
- assert(PREF(qprefer) != 0);
- f = COMBINE(qprefer, atom->flags);
- t = subre(v, '|', f, lp, atom->end);
- NOERR();
- t->left = atom;
- t->right = subre(v, '|', PREF(f), s2, atom->end);
- NOERR();
- t->right->left = subre(v, '=', 0, s2, atom->end);
- NOERR();
- *atomp = t;
- atomp = &t->left;
- m = 1;
- }
-
- /* deal with the rest of the quantifier */
if (atomtype == BACKREF)
{
/* special case: backrefs have internal quantifiers */
atom->min = (short) m;
atom->max = (short) n;
atom->flags |= COMBINE(qprefer, atom->flags);
+ /* rest of branch can be strung starting from atom->end */
+ s2 = atom->end;
}
else if (m == 1 && n == 1)
{
/* no/vacuous quantifier: done */
EMPTYARC(s, atom->begin); /* empty prefix */
+ /* rest of branch can be strung starting from atom->end */
+ s2 = atom->end;
}
- else
+ else if (m > 0 && !(atom->flags & BACKR))
{
/*
- * Turn x{m,n} into x{m-1,n-1}x, with capturing parens in only the
- * second x
+ * If there's no backrefs involved, we can turn x{m,n} into
+ * x{m-1,n-1}x, with capturing parens in only the second x. This
+ * is valid because we only care about capturing matches from the
+ * final iteration of the quantifier. It's a win because we can
+ * implement the backref-free left side as a plain DFA node, since
+ * we don't really care where its submatches are.
*/
dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
assert(m >= 1 && m != INFINITY && n >= 1);
NOERR();
t->right = atom;
*atomp = t;
+ /* rest of branch can be strung starting from atom->end */
+ s2 = atom->end;
+ }
+ else
+ {
+ /* general case: need an iteration node */
+ s2 = newstate(v->nfa);
+ NOERR();
+ moveouts(v->nfa, atom->end, s2);
+ NOERR();
+ dupnfa(v->nfa, atom->begin, atom->end, s, s2);
+ repeat(v, s, s2, m, n);
+ f = COMBINE(qprefer, atom->flags);
+ t = subre(v, '*', f, s, s2);
+ NOERR();
+ t->min = (short) m;
+ t->max = (short) n;
+ t->left = atom;
+ *atomp = t;
+ /* rest of branch is to be strung from iteration's end state */
}
/* and finally, look after that postponed recursion */
t = top->right;
if (!(SEE('|') || SEE(stopper) || SEE(EOS)))
- t->right = parsebranch(v, stopper, type, atom->end, rp, 1);
+ t->right = parsebranch(v, stopper, type, s2, rp, 1);
else
{
- EMPTYARC(atom->end, rp);
- t->right = subre(v, '=', 0, atom->end, rp);
+ EMPTYARC(s2, rp);
+ t->right = subre(v, '=', 0, s2, rp);
}
assert(SEE('|') || SEE(stopper) || SEE(EOS));
t->flags |= COMBINE(t->flags, t->right->flags);
/*
* repeat - replicate subNFA for quantifiers
*
+ * The sub-NFA strung from lp to rp is modified to represent m to n
+ * repetitions of its initial contents.
+ *
* The duplication sequences used here are chosen carefully so that any
* pointers starting out pointing into the subexpression end up pointing into
* the last occurrence. (Note that it may not be strung between the same
int n)
{
#define SOME 2
-#define INF 3
+#define INF 3
#define PAIR(x, y) ((x)*4 + (y))
#define REDUCE(x) ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) )
const int rm = REDUCE(m);
v->treechain = ret;
}
- assert(strchr("|.b(=", op) != NULL);
+ assert(strchr("=b|.*(", op) != NULL);
ret->op = op;
ret->flags = flags;
static int dissect(struct vars *, struct subre *, chr *, chr *);
static int condissect(struct vars *, struct subre *, chr *, chr *);
static int altdissect(struct vars *, struct subre *, chr *, chr *);
+static int iterdissect(struct vars *, struct subre *, chr *, chr *);
+static int reviterdissect(struct vars *, struct subre *, chr *, chr *);
static int cdissect(struct vars *, struct subre *, chr *, chr *);
static int ccondissect(struct vars *, struct subre *, chr *, chr *);
static int crevdissect(struct vars *, struct subre *, chr *, chr *);
static int cbrdissect(struct vars *, struct subre *, chr *, chr *);
static int caltdissect(struct vars *, struct subre *, chr *, chr *);
+static int citerdissect(struct vars *, struct subre *, chr *, chr *);
+static int creviterdissect(struct vars *, struct subre *, chr *, chr *);
/* === rege_dfa.c === */
static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *);
case '=': /* terminal node */
assert(t->left == NULL && t->right == NULL);
return REG_OKAY; /* no action, parent did the work */
- case '|': /* alternation */
- assert(t->left != NULL);
- return altdissect(v, t, begin, end);
case 'b': /* back ref -- shouldn't be calling us! */
return REG_ASSERT;
case '.': /* concatenation */
assert(t->left != NULL && t->right != NULL);
return condissect(v, t, begin, end);
+ case '|': /* alternation */
+ assert(t->left != NULL);
+ return altdissect(v, t, begin, end);
+ case '*': /* iteration */
+ assert(t->left != NULL);
+ return iterdissect(v, t, begin, end);
case '(': /* capturing */
assert(t->left != NULL && t->right == NULL);
assert(t->subno > 0);
return REG_ASSERT; /* none of them matched?!? */
}
+/*
+ * iterdissect - iteration subexpression matches (uncomplicated)
+ */
+static int /* regexec return code */
+iterdissect(struct vars * v,
+ struct subre * t,
+ chr *begin, /* beginning of relevant substring */
+ chr *end) /* end of same */
+{
+ struct dfa *d;
+ chr **endpts;
+ chr *limit;
+ int min_matches;
+ size_t max_matches;
+ int nverified;
+ int k;
+ int i;
+ int er;
+
+ assert(t->op == '*');
+ assert(t->left != NULL && t->left->cnfa.nstates > 0);
+ assert(begin <= end);
+
+ if (t->left->flags & SHORTER) /* reverse scan */
+ return reviterdissect(v, t, begin, end);
+
+ /*
+ * If zero matches are allowed, and target string is empty, just declare
+ * victory. OTOH, if target string isn't empty, zero matches can't work
+ * so we pretend the min is 1.
+ */
+ min_matches = t->min;
+ if (min_matches <= 0)
+ {
+ if (begin == end)
+ return REG_OKAY;
+ min_matches = 1;
+ }
+
+ /*
+ * We need workspace to track the endpoints of each sub-match. Normally
+ * we consider only nonzero-length sub-matches, so there can be at most
+ * end-begin of them. However, if min is larger than that, we will also
+ * consider zero-length sub-matches in order to find enough matches.
+ *
+ * For convenience, endpts[0] contains the "begin" pointer and we store
+ * sub-match endpoints in endpts[1..max_matches].
+ */
+ max_matches = end - begin;
+ if (max_matches > t->max && t->max != INFINITY)
+ max_matches = t->max;
+ if (max_matches < min_matches)
+ max_matches = min_matches;
+ endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
+ if (endpts == NULL)
+ return REG_ESPACE;
+ endpts[0] = begin;
+
+ d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
+ if (ISERR())
+ {
+ FREE(endpts);
+ return v->err;
+ }
+ MDEBUG(("iter %d\n", t->retry));
+
+ /*
+ * Our strategy is to first find a set of sub-match endpoints that are
+ * valid according to the child node's DFA, and then recursively dissect
+ * each sub-match to confirm validity. If any validity check fails,
+ * backtrack the last sub-match and try again. And, when we next try for
+ * a validity check, we need not recheck any successfully verified
+ * sub-matches that we didn't move the endpoints of. nverified remembers
+ * how many sub-matches are currently known okay.
+ */
+
+ /* initialize to consider first sub-match */
+ nverified = 0;
+ k = 1;
+ limit = end;
+
+ /* iterate until satisfaction or failure */
+ while (k > 0)
+ {
+ /* try to find an endpoint for the k'th sub-match */
+ endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL);
+ if (endpts[k] == NULL)
+ {
+ /* no match possible, so see if we can shorten previous one */
+ k--;
+ goto backtrack;
+ }
+ MDEBUG(("%d: working endpoint %d: %ld\n",
+ t->retry, k, LOFF(endpts[k])));
+
+ /* k'th sub-match can no longer be considered verified */
+ if (nverified >= k)
+ nverified = k - 1;
+
+ if (endpts[k] != end)
+ {
+ /* haven't reached end yet, try another iteration if allowed */
+ if (k >= max_matches)
+ {
+ /* must try to shorten some previous match */
+ k--;
+ goto backtrack;
+ }
+
+ /* reject zero-length match unless necessary to achieve min */
+ if (endpts[k] == endpts[k - 1] &&
+ (k >= min_matches || min_matches - k < end - endpts[k]))
+ goto backtrack;
+
+ k++;
+ limit = end;
+ continue;
+ }
+
+ /*
+ * We've identified a way to divide the string into k sub-matches
+ * that works so far as the child DFA can tell. If k is an allowed
+ * number of matches, start the slow part: recurse to verify each
+ * sub-match. We always have k <= max_matches, needn't check that.
+ */
+ if (k < min_matches)
+ goto backtrack;
+
+ MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k));
+
+ for (i = nverified + 1; i <= k; i++)
+ {
+ er = dissect(v, t->left, endpts[i - 1], endpts[i]);
+ if (er == REG_OKAY)
+ {
+ nverified = i;
+ continue;
+ }
+ if (er == REG_NOMATCH)
+ break;
+ /* oops, something failed */
+ freedfa(d);
+ FREE(endpts);
+ return er;
+ }
+
+ if (i > k)
+ {
+ /* satisfaction */
+ MDEBUG(("%d successful\n", t->retry));
+ freedfa(d);
+ FREE(endpts);
+ return REG_OKAY;
+ }
+
+ /* match failed to verify, so backtrack */
+
+backtrack:
+ /*
+ * Must consider shorter versions of the current sub-match. However,
+ * we'll only ask for a zero-length match if necessary.
+ */
+ while (k > 0)
+ {
+ chr *prev_end = endpts[k - 1];
+
+ if (endpts[k] > prev_end)
+ {
+ limit = endpts[k] - 1;
+ if (limit > prev_end ||
+ (k < min_matches && min_matches - k >= end - prev_end))
+ {
+ /* break out of backtrack loop, continue the outer one */
+ break;
+ }
+ }
+ /* can't shorten k'th sub-match any more, consider previous one */
+ k--;
+ }
+ }
+
+ /* all possibilities exhausted - shouldn't happen in uncomplicated mode */
+ MDEBUG(("%d failed\n", t->retry));
+ freedfa(d);
+ FREE(endpts);
+ return REG_ASSERT;
+}
+
+/*
+ * reviterdissect - shortest-first iteration subexpression matches
+ */
+static int /* regexec return code */
+reviterdissect(struct vars * v,
+ struct subre * t,
+ chr *begin, /* beginning of relevant substring */
+ chr *end) /* end of same */
+{
+ struct dfa *d;
+ chr **endpts;
+ chr *limit;
+ int min_matches;
+ size_t max_matches;
+ int nverified;
+ int k;
+ int i;
+ int er;
+
+ assert(t->op == '*');
+ assert(t->left != NULL && t->left->cnfa.nstates > 0);
+ assert(t->left->flags & SHORTER);
+ assert(begin <= end);
+
+ /*
+ * If zero matches are allowed, and target string is empty, just declare
+ * victory. OTOH, if target string isn't empty, zero matches can't work
+ * so we pretend the min is 1.
+ */
+ min_matches = t->min;
+ if (min_matches <= 0)
+ {
+ if (begin == end)
+ return REG_OKAY;
+ min_matches = 1;
+ }
+
+ /*
+ * We need workspace to track the endpoints of each sub-match. Normally
+ * we consider only nonzero-length sub-matches, so there can be at most
+ * end-begin of them. However, if min is larger than that, we will also
+ * consider zero-length sub-matches in order to find enough matches.
+ *
+ * For convenience, endpts[0] contains the "begin" pointer and we store
+ * sub-match endpoints in endpts[1..max_matches].
+ */
+ max_matches = end - begin;
+ if (max_matches > t->max && t->max != INFINITY)
+ max_matches = t->max;
+ if (max_matches < min_matches)
+ max_matches = min_matches;
+ endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
+ if (endpts == NULL)
+ return REG_ESPACE;
+ endpts[0] = begin;
+
+ d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
+ if (ISERR())
+ {
+ FREE(endpts);
+ return v->err;
+ }
+ MDEBUG(("reviter %d\n", t->retry));
+
+ /*
+ * Our strategy is to first find a set of sub-match endpoints that are
+ * valid according to the child node's DFA, and then recursively dissect
+ * each sub-match to confirm validity. If any validity check fails,
+ * backtrack the last sub-match and try again. And, when we next try for
+ * a validity check, we need not recheck any successfully verified
+ * sub-matches that we didn't move the endpoints of. nverified remembers
+ * how many sub-matches are currently known okay.
+ */
+
+ /* initialize to consider first sub-match */
+ nverified = 0;
+ k = 1;
+ limit = begin;
+
+ /* iterate until satisfaction or failure */
+ while (k > 0)
+ {
+ /* disallow zero-length match unless necessary to achieve min */
+ if (limit == endpts[k - 1] &&
+ limit != end &&
+ (k >= min_matches || min_matches - k < end - limit))
+ limit++;
+
+ /* try to find an endpoint for the k'th sub-match */
+ endpts[k] = shortest(v, d, endpts[k - 1], limit, end,
+ (chr **) NULL, (int *) NULL);
+ if (endpts[k] == NULL)
+ {
+ /* no match possible, so see if we can lengthen previous one */
+ k--;
+ goto backtrack;
+ }
+ MDEBUG(("%d: working endpoint %d: %ld\n",
+ t->retry, k, LOFF(endpts[k])));
+
+ /* k'th sub-match can no longer be considered verified */
+ if (nverified >= k)
+ nverified = k - 1;
+
+ if (endpts[k] != end)
+ {
+ /* haven't reached end yet, try another iteration if allowed */
+ if (k >= max_matches)
+ {
+ /* must try to lengthen some previous match */
+ k--;
+ goto backtrack;
+ }
+
+ k++;
+ limit = endpts[k - 1];
+ continue;
+ }
+
+ /*
+ * We've identified a way to divide the string into k sub-matches
+ * that works so far as the child DFA can tell. If k is an allowed
+ * number of matches, start the slow part: recurse to verify each
+ * sub-match. We always have k <= max_matches, needn't check that.
+ */
+ if (k < min_matches)
+ goto backtrack;
+
+ MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k));
+
+ for (i = nverified + 1; i <= k; i++)
+ {
+ er = dissect(v, t->left, endpts[i - 1], endpts[i]);
+ if (er == REG_OKAY)
+ {
+ nverified = i;
+ continue;
+ }
+ if (er == REG_NOMATCH)
+ break;
+ /* oops, something failed */
+ freedfa(d);
+ FREE(endpts);
+ return er;
+ }
+
+ if (i > k)
+ {
+ /* satisfaction */
+ MDEBUG(("%d successful\n", t->retry));
+ freedfa(d);
+ FREE(endpts);
+ return REG_OKAY;
+ }
+
+ /* match failed to verify, so backtrack */
+
+backtrack:
+ /*
+ * Must consider longer versions of the current sub-match.
+ */
+ while (k > 0)
+ {
+ if (endpts[k] < end)
+ {
+ limit = endpts[k] + 1;
+ /* break out of backtrack loop, continue the outer one */
+ break;
+ }
+ /* can't lengthen k'th sub-match any more, consider previous one */
+ k--;
+ }
+ }
+
+ /* all possibilities exhausted - shouldn't happen in uncomplicated mode */
+ MDEBUG(("%d failed\n", t->retry));
+ freedfa(d);
+ FREE(endpts);
+ return REG_ASSERT;
+}
+
/*
* cdissect - determine subexpression matches (with complications)
* The retry memory stores the offset of the trial midpoint from begin,
case '=': /* terminal node */
assert(t->left == NULL && t->right == NULL);
return REG_OKAY; /* no action, parent did the work */
- case '|': /* alternation */
- assert(t->left != NULL);
- return caltdissect(v, t, begin, end);
case 'b': /* back reference */
assert(t->left == NULL && t->right == NULL);
return cbrdissect(v, t, begin, end);
case '.': /* concatenation */
assert(t->left != NULL && t->right != NULL);
return ccondissect(v, t, begin, end);
+ case '|': /* alternation */
+ assert(t->left != NULL);
+ return caltdissect(v, t, begin, end);
+ case '*': /* iteration */
+ assert(t->left != NULL);
+ return citerdissect(v, t, begin, end);
case '(': /* capturing */
assert(t->left != NULL && t->right == NULL);
assert(t->subno > 0);
}
/*
- * crevdissect - determine backref shortest-first subexpression matches
+ * crevdissect - shortest-first concatenation subexpression matches
* The retry memory stores the offset of the trial midpoint from begin,
* plus 1 so that 0 uniquely means "clean slate".
*/
return caltdissect(v, t->right, begin, end);
}
+/*
+ * citerdissect - iteration subexpression matches (with complications)
+ */
+static int /* regexec return code */
+citerdissect(struct vars * v,
+ struct subre * t,
+ chr *begin, /* beginning of relevant substring */
+ chr *end) /* end of same */
+{
+ struct dfa *d;
+ chr **endpts;
+ chr *limit;
+ int min_matches;
+ size_t max_matches;
+ int nverified;
+ int k;
+ int i;
+ int er;
+
+ assert(t->op == '*');
+ assert(t->left != NULL && t->left->cnfa.nstates > 0);
+ assert(begin <= end);
+
+ if (t->left->flags & SHORTER) /* reverse scan */
+ return creviterdissect(v, t, begin, end);
+
+ /*
+ * If zero matches are allowed, and target string is empty, just declare
+ * victory. OTOH, if target string isn't empty, zero matches can't work
+ * so we pretend the min is 1.
+ */
+ min_matches = t->min;
+ if (min_matches <= 0)
+ {
+ if (begin == end)
+ return REG_OKAY;
+ min_matches = 1;
+ }
+
+ /*
+ * We need workspace to track the endpoints of each sub-match. Normally
+ * we consider only nonzero-length sub-matches, so there can be at most
+ * end-begin of them. However, if min is larger than that, we will also
+ * consider zero-length sub-matches in order to find enough matches.
+ *
+ * For convenience, endpts[0] contains the "begin" pointer and we store
+ * sub-match endpoints in endpts[1..max_matches].
+ */
+ max_matches = end - begin;
+ if (max_matches > t->max && t->max != INFINITY)
+ max_matches = t->max;
+ if (max_matches < min_matches)
+ max_matches = min_matches;
+ endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
+ if (endpts == NULL)
+ return REG_ESPACE;
+ endpts[0] = begin;
+
+ d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
+ if (ISERR())
+ {
+ FREE(endpts);
+ return v->err;
+ }
+ MDEBUG(("citer %d\n", t->retry));
+
+ /*
+ * Our strategy is to first find a set of sub-match endpoints that are
+ * valid according to the child node's DFA, and then recursively dissect
+ * each sub-match to confirm validity. If any validity check fails,
+ * backtrack the last sub-match and try again. And, when we next try for
+ * a validity check, we need not recheck any successfully verified
+ * sub-matches that we didn't move the endpoints of. nverified remembers
+ * how many sub-matches are currently known okay.
+ */
+
+ /* initialize to consider first sub-match */
+ nverified = 0;
+ k = 1;
+ limit = end;
+
+ /* iterate until satisfaction or failure */
+ while (k > 0)
+ {
+ /* try to find an endpoint for the k'th sub-match */
+ endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL);
+ if (endpts[k] == NULL)
+ {
+ /* no match possible, so see if we can shorten previous one */
+ k--;
+ goto backtrack;
+ }
+ MDEBUG(("%d: working endpoint %d: %ld\n",
+ t->retry, k, LOFF(endpts[k])));
+
+ /* k'th sub-match can no longer be considered verified */
+ if (nverified >= k)
+ nverified = k - 1;
+
+ if (endpts[k] != end)
+ {
+ /* haven't reached end yet, try another iteration if allowed */
+ if (k >= max_matches)
+ {
+ /* must try to shorten some previous match */
+ k--;
+ goto backtrack;
+ }
+
+ /* reject zero-length match unless necessary to achieve min */
+ if (endpts[k] == endpts[k - 1] &&
+ (k >= min_matches || min_matches - k < end - endpts[k]))
+ goto backtrack;
+
+ k++;
+ limit = end;
+ continue;
+ }
+
+ /*
+ * We've identified a way to divide the string into k sub-matches
+ * that works so far as the child DFA can tell. If k is an allowed
+ * number of matches, start the slow part: recurse to verify each
+ * sub-match. We always have k <= max_matches, needn't check that.
+ */
+ if (k < min_matches)
+ goto backtrack;
+
+ MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k));
+
+ for (i = nverified + 1; i <= k; i++)
+ {
+ zapmem(v, t->left);
+ er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
+ if (er == REG_OKAY)
+ {
+ nverified = i;
+ continue;
+ }
+ if (er == REG_NOMATCH)
+ break;
+ /* oops, something failed */
+ freedfa(d);
+ FREE(endpts);
+ return er;
+ }
+
+ if (i > k)
+ {
+ /* satisfaction */
+ MDEBUG(("%d successful\n", t->retry));
+ freedfa(d);
+ FREE(endpts);
+ return REG_OKAY;
+ }
+
+ /* match failed to verify, so backtrack */
+
+backtrack:
+ /*
+ * Must consider shorter versions of the current sub-match. However,
+ * we'll only ask for a zero-length match if necessary.
+ */
+ while (k > 0)
+ {
+ chr *prev_end = endpts[k - 1];
+
+ if (endpts[k] > prev_end)
+ {
+ limit = endpts[k] - 1;
+ if (limit > prev_end ||
+ (k < min_matches && min_matches - k >= end - prev_end))
+ {
+ /* break out of backtrack loop, continue the outer one */
+ break;
+ }
+ }
+ /* can't shorten k'th sub-match any more, consider previous one */
+ k--;
+ }
+ }
+
+ /* all possibilities exhausted */
+ MDEBUG(("%d failed\n", t->retry));
+ freedfa(d);
+ FREE(endpts);
+ return REG_NOMATCH;
+}
+
+/*
+ * creviterdissect - shortest-first iteration subexpression matches
+ */
+static int /* regexec return code */
+creviterdissect(struct vars * v,
+ struct subre * t,
+ chr *begin, /* beginning of relevant substring */
+ chr *end) /* end of same */
+{
+ struct dfa *d;
+ chr **endpts;
+ chr *limit;
+ int min_matches;
+ size_t max_matches;
+ int nverified;
+ int k;
+ int i;
+ int er;
+
+ assert(t->op == '*');
+ assert(t->left != NULL && t->left->cnfa.nstates > 0);
+ assert(t->left->flags & SHORTER);
+ assert(begin <= end);
+
+ /*
+ * If zero matches are allowed, and target string is empty, just declare
+ * victory. OTOH, if target string isn't empty, zero matches can't work
+ * so we pretend the min is 1.
+ */
+ min_matches = t->min;
+ if (min_matches <= 0)
+ {
+ if (begin == end)
+ return REG_OKAY;
+ min_matches = 1;
+ }
+
+ /*
+ * We need workspace to track the endpoints of each sub-match. Normally
+ * we consider only nonzero-length sub-matches, so there can be at most
+ * end-begin of them. However, if min is larger than that, we will also
+ * consider zero-length sub-matches in order to find enough matches.
+ *
+ * For convenience, endpts[0] contains the "begin" pointer and we store
+ * sub-match endpoints in endpts[1..max_matches].
+ */
+ max_matches = end - begin;
+ if (max_matches > t->max && t->max != INFINITY)
+ max_matches = t->max;
+ if (max_matches < min_matches)
+ max_matches = min_matches;
+ endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
+ if (endpts == NULL)
+ return REG_ESPACE;
+ endpts[0] = begin;
+
+ d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
+ if (ISERR())
+ {
+ FREE(endpts);
+ return v->err;
+ }
+ MDEBUG(("creviter %d\n", t->retry));
+
+ /*
+ * Our strategy is to first find a set of sub-match endpoints that are
+ * valid according to the child node's DFA, and then recursively dissect
+ * each sub-match to confirm validity. If any validity check fails,
+ * backtrack the last sub-match and try again. And, when we next try for
+ * a validity check, we need not recheck any successfully verified
+ * sub-matches that we didn't move the endpoints of. nverified remembers
+ * how many sub-matches are currently known okay.
+ */
+
+ /* initialize to consider first sub-match */
+ nverified = 0;
+ k = 1;
+ limit = begin;
+
+ /* iterate until satisfaction or failure */
+ while (k > 0)
+ {
+ /* disallow zero-length match unless necessary to achieve min */
+ if (limit == endpts[k - 1] &&
+ limit != end &&
+ (k >= min_matches || min_matches - k < end - limit))
+ limit++;
+
+ /* try to find an endpoint for the k'th sub-match */
+ endpts[k] = shortest(v, d, endpts[k - 1], limit, end,
+ (chr **) NULL, (int *) NULL);
+ if (endpts[k] == NULL)
+ {
+ /* no match possible, so see if we can lengthen previous one */
+ k--;
+ goto backtrack;
+ }
+ MDEBUG(("%d: working endpoint %d: %ld\n",
+ t->retry, k, LOFF(endpts[k])));
+
+ /* k'th sub-match can no longer be considered verified */
+ if (nverified >= k)
+ nverified = k - 1;
+
+ if (endpts[k] != end)
+ {
+ /* haven't reached end yet, try another iteration if allowed */
+ if (k >= max_matches)
+ {
+ /* must try to lengthen some previous match */
+ k--;
+ goto backtrack;
+ }
+
+ k++;
+ limit = endpts[k - 1];
+ continue;
+ }
+
+ /*
+ * We've identified a way to divide the string into k sub-matches
+ * that works so far as the child DFA can tell. If k is an allowed
+ * number of matches, start the slow part: recurse to verify each
+ * sub-match. We always have k <= max_matches, needn't check that.
+ */
+ if (k < min_matches)
+ goto backtrack;
+
+ MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k));
+
+ for (i = nverified + 1; i <= k; i++)
+ {
+ zapmem(v, t->left);
+ er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
+ if (er == REG_OKAY)
+ {
+ nverified = i;
+ continue;
+ }
+ if (er == REG_NOMATCH)
+ break;
+ /* oops, something failed */
+ freedfa(d);
+ FREE(endpts);
+ return er;
+ }
+
+ if (i > k)
+ {
+ /* satisfaction */
+ MDEBUG(("%d successful\n", t->retry));
+ freedfa(d);
+ FREE(endpts);
+ return REG_OKAY;
+ }
+
+ /* match failed to verify, so backtrack */
+
+backtrack:
+ /*
+ * Must consider longer versions of the current sub-match.
+ */
+ while (k > 0)
+ {
+ if (endpts[k] < end)
+ {
+ limit = endpts[k] + 1;
+ /* break out of backtrack loop, continue the outer one */
+ break;
+ }
+ /* can't lengthen k'th sub-match any more, consider previous one */
+ k--;
+ }
+ }
+
+ /* all possibilities exhausted */
+ MDEBUG(("%d failed\n", t->retry));
+ freedfa(d);
+ FREE(endpts);
+ return REG_NOMATCH;
+}
+
#include "rege_dfa.c"