From 587359479acbbdc95c8e37da40707e37097423f5 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 24 Feb 2012 14:56:35 -0500 Subject: [PATCH] Avoid repeated creation/freeing of per-subre DFAs during regex search. In nested sub-regex trees, lower-level nodes created DFAs and then destroyed them again before exiting, which is a bit dumb considering that the recursive search is likely to call those nodes again later. Instead cache each created DFA until the end of pg_regexec(). This is basically a space for time tradeoff, in that it might increase the maximum memory usage. However, in most regex patterns there are not all that many subre nodes, so not that many DFAs --- and in any case, the peak usage occurs when reaching the bottom recursion level, and except for alternation cases that's going to be the same anyway. --- src/backend/regex/regexec.c | 158 +++++++++++++++--------------------- src/include/regex/regguts.h | 4 +- 2 files changed, 68 insertions(+), 94 deletions(-) diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c index 55f0c18d14..d3e850a869 100644 --- a/src/backend/regex/regexec.c +++ b/src/backend/regex/regexec.c @@ -112,6 +112,7 @@ struct vars chr *search_start; /* search start of string */ chr *stop; /* just past end of string */ int err; /* error code if any (0 none) */ + struct dfa **subdfas; /* per-subre DFAs */ struct smalldfa dfa1; struct smalldfa dfa2; }; @@ -130,6 +131,7 @@ struct vars * forward declarations */ /* === regexec.c === */ +static struct dfa *getsubdfa(struct vars *, struct subre *); static int find(struct vars *, struct cnfa *, struct colormap *); static int cfind(struct vars *, struct cnfa *, struct colormap *); static int cfindloop(struct vars *, struct cnfa *, struct colormap *, struct dfa *, struct dfa *, chr **); @@ -180,11 +182,15 @@ pg_regexec(regex_t *re, register struct vars *v = &var; int st; size_t n; + size_t i; int backref; #define LOCALMAT 20 regmatch_t mat[LOCALMAT]; +#define LOCALDFAS 40 + struct dfa *subdfas[LOCALDFAS]; + /* sanity checks */ if (re == NULL || string == NULL || re->re_magic != REMAGIC) return REG_INVARG; @@ -225,6 +231,20 @@ pg_regexec(regex_t *re, v->search_start = (chr *) string + search_start; v->stop = (chr *) string + len; v->err = 0; + assert(v->g->ntree >= 0); + n = (size_t) v->g->ntree; + if (n <= LOCALDFAS) + v->subdfas = subdfas; + else + v->subdfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *)); + if (v->subdfas == NULL) + { + if (v->pmatch != pmatch && v->pmatch != mat) + FREE(v->pmatch); + return REG_ESPACE; + } + for (i = 0; i < n; i++) + v->subdfas[i] = NULL; /* do it */ assert(v->g->tree != NULL); @@ -244,9 +264,36 @@ pg_regexec(regex_t *re, /* clean up */ if (v->pmatch != pmatch && v->pmatch != mat) FREE(v->pmatch); + for (i = 0; i < n; i++) + { + if (v->subdfas[i] != NULL) + freedfa(v->subdfas[i]); + } + if (v->subdfas != subdfas) + FREE(v->subdfas); + return st; } +/* + * getsubdfa - create or re-fetch the DFA for a subre node + * + * We only need to create the DFA once per overall regex execution. + * The DFA will be freed by the cleanup step in pg_regexec(). + */ +static struct dfa * +getsubdfa(struct vars * v, + struct subre * t) +{ + if (v->subdfas[t->id] == NULL) + { + v->subdfas[t->id] = newdfa(v, &t->cnfa, &v->g->cmap, DOMALLOC); + if (ISERR()) + return NULL; + } + return v->subdfas[t->id]; +} + /* * find - find a match for the main NFA (no-complications case) */ @@ -578,15 +625,10 @@ condissect(struct vars * v, assert(t->left != NULL && t->left->cnfa.nstates > 0); assert(t->right != NULL && t->right->cnfa.nstates > 0); - d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1); + d = getsubdfa(v, t->left); + NOERR(); + d2 = getsubdfa(v, t->right); NOERR(); - d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, &v->dfa2); - if (ISERR()) - { - assert(d2 == NULL); - freedfa(d); - return v->err; - } /* pick a tentative midpoint */ if (shorter) @@ -595,11 +637,7 @@ condissect(struct vars * v, else mid = longest(v, d, begin, end, (int *) NULL); if (mid == NULL) - { - freedfa(d); - freedfa(d2); return REG_ASSERT; - } MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); /* iterate until satisfaction or failure */ @@ -610,8 +648,6 @@ condissect(struct vars * v, { /* all possibilities exhausted! */ MDEBUG(("no midpoint!\n")); - freedfa(d); - freedfa(d2); return REG_ASSERT; } if (shorter) @@ -623,8 +659,6 @@ condissect(struct vars * v, { /* failed to find a new one! */ MDEBUG(("failed midpoint!\n")); - freedfa(d); - freedfa(d2); return REG_ASSERT; } MDEBUG(("new midpoint %ld\n", LOFF(mid))); @@ -632,8 +666,6 @@ condissect(struct vars * v, /* satisfaction */ MDEBUG(("successful\n")); - freedfa(d); - freedfa(d2); i = dissect(v, t->left, begin, mid); if (i != REG_OKAY) return i; @@ -659,16 +691,13 @@ altdissect(struct vars * v, { MDEBUG(("trying %dth\n", i)); assert(t->left != NULL && t->left->cnfa.nstates > 0); - d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1); - if (ISERR()) - return v->err; + d = getsubdfa(v, t->left); + NOERR(); if (longest(v, d, begin, end, (int *) NULL) == end) { MDEBUG(("success\n")); - freedfa(d); return dissect(v, t->left, begin, end); } - freedfa(d); } return REG_ASSERT; /* none of them matched?!? */ } @@ -731,7 +760,7 @@ iterdissect(struct vars * v, return REG_ESPACE; endpts[0] = begin; - d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); + d = getsubdfa(v, t->left); if (ISERR()) { FREE(endpts); @@ -814,7 +843,6 @@ iterdissect(struct vars * v, if (er == REG_NOMATCH) break; /* oops, something failed */ - freedfa(d); FREE(endpts); return er; } @@ -823,7 +851,6 @@ iterdissect(struct vars * v, { /* satisfaction */ MDEBUG(("%d successful\n", t->id)); - freedfa(d); FREE(endpts); return REG_OKAY; } @@ -856,7 +883,6 @@ backtrack: /* all possibilities exhausted - shouldn't happen in uncomplicated mode */ MDEBUG(("%d failed\n", t->id)); - freedfa(d); FREE(endpts); return REG_ASSERT; } @@ -917,7 +943,7 @@ reviterdissect(struct vars * v, return REG_ESPACE; endpts[0] = begin; - d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); + d = getsubdfa(v, t->left); if (ISERR()) { FREE(endpts); @@ -1002,7 +1028,6 @@ reviterdissect(struct vars * v, if (er == REG_NOMATCH) break; /* oops, something failed */ - freedfa(d); FREE(endpts); return er; } @@ -1011,7 +1036,6 @@ reviterdissect(struct vars * v, { /* satisfaction */ MDEBUG(("%d successful\n", t->id)); - freedfa(d); FREE(endpts); return REG_OKAY; } @@ -1037,7 +1061,6 @@ backtrack: /* all possibilities exhausted - shouldn't happen in uncomplicated mode */ MDEBUG(("%d failed\n", t->id)); - freedfa(d); FREE(endpts); return REG_ASSERT; } @@ -1106,25 +1129,16 @@ ccondissect(struct vars * v, if (t->left->flags & SHORTER) /* reverse scan */ return crevdissect(v, t, begin, end); - d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); - if (ISERR()) - return v->err; - d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC); - if (ISERR()) - { - freedfa(d); - return v->err; - } + d = getsubdfa(v, t->left); + NOERR(); + d2 = getsubdfa(v, t->right); + NOERR(); MDEBUG(("cconcat %d\n", t->id)); /* pick a tentative midpoint */ mid = longest(v, d, begin, end, (int *) NULL); if (mid == NULL) - { - freedfa(d); - freedfa(d2); return REG_NOMATCH; - } MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); /* iterate until satisfaction or failure */ @@ -1141,17 +1155,11 @@ ccondissect(struct vars * v, { /* satisfaction */ MDEBUG(("successful\n")); - freedfa(d); - freedfa(d2); return REG_OKAY; } } if (er != REG_OKAY && er != REG_NOMATCH) - { - freedfa(d); - freedfa(d2); return er; - } } /* that midpoint didn't work, find a new one */ @@ -1159,8 +1167,6 @@ ccondissect(struct vars * v, { /* all possibilities exhausted */ MDEBUG(("%d no midpoint\n", t->id)); - freedfa(d); - freedfa(d2); return REG_NOMATCH; } mid = longest(v, d, begin, mid - 1, (int *) NULL); @@ -1168,8 +1174,6 @@ ccondissect(struct vars * v, { /* failed to find a new one */ MDEBUG(("%d failed midpoint\n", t->id)); - freedfa(d); - freedfa(d2); return REG_NOMATCH; } MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid))); @@ -1201,25 +1205,16 @@ crevdissect(struct vars * v, assert(t->left->flags & SHORTER); /* concatenation -- need to split the substring between parts */ - d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); - if (ISERR()) - return v->err; - d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC); - if (ISERR()) - { - freedfa(d); - return v->err; - } + d = getsubdfa(v, t->left); + NOERR(); + d2 = getsubdfa(v, t->right); + NOERR(); MDEBUG(("crev %d\n", t->id)); /* pick a tentative midpoint */ mid = shortest(v, d, begin, begin, end, (chr **) NULL, (int *) NULL); if (mid == NULL) - { - freedfa(d); - freedfa(d2); return REG_NOMATCH; - } MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); /* iterate until satisfaction or failure */ @@ -1236,17 +1231,11 @@ crevdissect(struct vars * v, { /* satisfaction */ MDEBUG(("successful\n")); - freedfa(d); - freedfa(d2); return REG_OKAY; } } if (er != REG_OKAY && er != REG_NOMATCH) - { - freedfa(d); - freedfa(d2); return er; - } } /* that midpoint didn't work, find a new one */ @@ -1254,8 +1243,6 @@ crevdissect(struct vars * v, { /* all possibilities exhausted */ MDEBUG(("%d no midpoint\n", t->id)); - freedfa(d); - freedfa(d2); return REG_NOMATCH; } mid = shortest(v, d, begin, mid + 1, end, (chr **) NULL, (int *) NULL); @@ -1263,8 +1250,6 @@ crevdissect(struct vars * v, { /* failed to find a new one */ MDEBUG(("%d failed midpoint\n", t->id)); - freedfa(d); - freedfa(d2); return REG_NOMATCH; } MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid))); @@ -1377,15 +1362,10 @@ caltdissect(struct vars * v, MDEBUG(("calt n%d\n", t->id)); - d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); - if (ISERR()) - return v->err; + d = getsubdfa(v, t->left); + NOERR(); if (longest(v, d, begin, end, (int *) NULL) != end) - { - freedfa(d); return caltdissect(v, t->right, begin, end); - } - freedfa(d); MDEBUG(("calt matched\n")); er = cdissect(v, t->left, begin, end); @@ -1453,7 +1433,7 @@ citerdissect(struct vars * v, return REG_ESPACE; endpts[0] = begin; - d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); + d = getsubdfa(v, t->left); if (ISERR()) { FREE(endpts); @@ -1537,7 +1517,6 @@ citerdissect(struct vars * v, if (er == REG_NOMATCH) break; /* oops, something failed */ - freedfa(d); FREE(endpts); return er; } @@ -1546,7 +1525,6 @@ citerdissect(struct vars * v, { /* satisfaction */ MDEBUG(("%d successful\n", t->id)); - freedfa(d); FREE(endpts); return REG_OKAY; } @@ -1579,7 +1557,6 @@ backtrack: /* all possibilities exhausted */ MDEBUG(("%d failed\n", t->id)); - freedfa(d); FREE(endpts); return REG_NOMATCH; } @@ -1640,7 +1617,7 @@ creviterdissect(struct vars * v, return REG_ESPACE; endpts[0] = begin; - d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); + d = getsubdfa(v, t->left); if (ISERR()) { FREE(endpts); @@ -1726,7 +1703,6 @@ creviterdissect(struct vars * v, if (er == REG_NOMATCH) break; /* oops, something failed */ - freedfa(d); FREE(endpts); return er; } @@ -1735,7 +1711,6 @@ creviterdissect(struct vars * v, { /* satisfaction */ MDEBUG(("%d successful\n", t->id)); - freedfa(d); FREE(endpts); return REG_OKAY; } @@ -1761,7 +1736,6 @@ backtrack: /* all possibilities exhausted */ MDEBUG(("%d failed\n", t->id)); - freedfa(d); FREE(endpts); return REG_NOMATCH; } diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h index bc5419d98e..65b8d178da 100644 --- a/src/include/regex/regguts.h +++ b/src/include/regex/regguts.h @@ -409,7 +409,7 @@ struct subre #define PREF(f) ((f)&LOCAL) #define PREF2(f1, f2) ((PREF(f1) != 0) ? PREF(f1) : PREF(f2)) #define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2)) - short id; /* ID of subre (1..ntree) */ + short id; /* ID of subre (1..ntree-1) */ int subno; /* subexpression number (for 'b' and '(') */ short min; /* min repetitions for iteration or backref */ short max; /* max repetitions for iteration or backref */ @@ -446,7 +446,7 @@ struct guts size_t nsub; /* copy of re_nsub */ struct subre *tree; struct cnfa search; /* for fast preliminary search */ - int ntree; + int ntree; /* number of subre's, less one */ struct colormap cmap; int FUNCPTR(compare, (const chr *, const chr *, size_t)); struct subre *lacons; /* lookahead-constraint vector */ -- 2.40.0