From a7b61d4f5af37344f8973b2dfce47e2ba2680061 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Thu, 7 Mar 2013 11:51:03 -0500
Subject: [PATCH] Fix infinite-loop risk in fixempties() stage of regex
 compilation.

The previous coding of this function could get into situations where it
would never terminate, because successive passes would re-add EMPTY arcs
that had been removed by the previous pass.  Rewrite the function
completely using a new algorithm that is guaranteed to terminate, and
also seems to be usually faster than the old one.  Per Tcl bugs 3604074
and 3606683.

Tom Lane and Don Porter
---
 src/backend/regex/regc_nfa.c        | 314 ++++++++++++++++++++++------
 src/backend/regex/regcomp.c         |  12 +-
 src/test/regress/expected/regex.out |  20 ++
 src/test/regress/sql/regex.sql      |   7 +
 4 files changed, 280 insertions(+), 73 deletions(-)

diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c
index 085842c92b..05fe8b0808 100644
--- a/src/backend/regex/regc_nfa.c
+++ b/src/backend/regex/regc_nfa.c
@@ -455,6 +455,56 @@ freearc(struct nfa * nfa,
 	from->free = victim;
 }
 
+/*
+ * hasnonemptyout - Does state have a non-EMPTY out arc?
+ */
+static int
+hasnonemptyout(struct state * s)
+{
+	struct arc *a;
+
+	for (a = s->outs; a != NULL; a = a->outchain)
+	{
+		if (a->type != EMPTY)
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * nonemptyouts - count non-EMPTY out arcs of a state
+ */
+static int
+nonemptyouts(struct state * s)
+{
+	int			n = 0;
+	struct arc *a;
+
+	for (a = s->outs; a != NULL; a = a->outchain)
+	{
+		if (a->type != EMPTY)
+			n++;
+	}
+	return n;
+}
+
+/*
+ * nonemptyins - count non-EMPTY in arcs of a state
+ */
+static int
+nonemptyins(struct state * s)
+{
+	int			n = 0;
+	struct arc *a;
+
+	for (a = s->ins; a != NULL; a = a->inchain)
+	{
+		if (a->type != EMPTY)
+			n++;
+	}
+	return n;
+}
+
 /*
  * findarc - find arc, if any, from given source with given type and color
  * If there is more than one such arc, the result is random.
@@ -511,19 +561,25 @@ moveins(struct nfa * nfa,
 }
 
 /*
- * copyins - copy all in arcs of a state to another state
+ * copyins - copy in arcs of a state to another state
+ *
+ * Either all arcs, or only non-empty ones as determined by all value.
  */
 static void
 copyins(struct nfa * nfa,
 		struct state * oldState,
-		struct state * newState)
+		struct state * newState,
+		int all)
 {
 	struct arc *a;
 
 	assert(oldState != newState);
 
 	for (a = oldState->ins; a != NULL; a = a->inchain)
-		cparc(nfa, a, a->from, newState);
+	{
+		if (all || a->type != EMPTY)
+			cparc(nfa, a, a->from, newState);
+	}
 }
 
 /*
@@ -546,19 +602,25 @@ moveouts(struct nfa * nfa,
 }
 
 /*
- * copyouts - copy all out arcs of a state to another state
+ * copyouts - copy out arcs of a state to another state
+ *
+ * Either all arcs, or only non-empty ones as determined by all value.
  */
 static void
 copyouts(struct nfa * nfa,
 		 struct state * oldState,
-		 struct state * newState)
+		 struct state * newState,
+		 int all)
 {
 	struct arc *a;
 
 	assert(oldState != newState);
 
 	for (a = oldState->outs; a != NULL; a = a->outchain)
-		cparc(nfa, a, newState, a->to);
+	{
+		if (all || a->type != EMPTY)
+			cparc(nfa, a, newState, a->to);
+	}
 }
 
 /*
@@ -881,7 +943,7 @@ pull(struct nfa * nfa,
 		if (NISERR())
 			return 0;
 		assert(to != from);		/* con is not an inarc */
-		copyins(nfa, from, s);	/* duplicate inarcs */
+		copyins(nfa, from, s, 1);		/* duplicate inarcs */
 		cparc(nfa, con, s, to); /* move constraint arc */
 		freearc(nfa, con);
 		from = s;
@@ -1027,7 +1089,7 @@ push(struct nfa * nfa,
 		s = newstate(nfa);
 		if (NISERR())
 			return 0;
-		copyouts(nfa, to, s);	/* duplicate outarcs */
+		copyouts(nfa, to, s, 1);	/* duplicate outarcs */
 		cparc(nfa, con, from, s);		/* move constraint */
 		freearc(nfa, con);
 		to = s;
@@ -1134,91 +1196,205 @@ fixempties(struct nfa * nfa,
 		   FILE *f)				/* for debug output; NULL none */
 {
 	struct state *s;
+	struct state *s2;
 	struct state *nexts;
 	struct arc *a;
 	struct arc *nexta;
-	int			progress;
 
-	/* find and eliminate empties until there are no more */
-	do
+	/*
+	 * First, get rid of any states whose sole out-arc is an EMPTY, since
+	 * they're basically just aliases for their successor.  The parsing
+	 * algorithm creates enough of these that it's worth special-casing this.
+	 */
+	for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
 	{
-		progress = 0;
-		for (s = nfa->states; s != NULL && !NISERR() &&
-			 s->no != FREESTATE; s = nexts)
+		nexts = s->next;
+		if (s->flag || s->nouts != 1)
+			continue;
+		a = s->outs;
+		assert(a != NULL && a->outchain == NULL);
+		if (a->type != EMPTY)
+			continue;
+		if (s != a->to)
+			moveins(nfa, s, a->to);
+		dropstate(nfa, s);
+	}
+
+	/*
+	 * Similarly, get rid of any state with a single EMPTY in-arc, by folding
+	 * it into its predecessor.
+	 */
+	for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
+	{
+		nexts = s->next;
+		/* while we're at it, ensure tmp fields are clear for next step */
+		assert(s->tmp == NULL);
+		if (s->flag || s->nins != 1)
+			continue;
+		a = s->ins;
+		assert(a != NULL && a->inchain == NULL);
+		if (a->type != EMPTY)
+			continue;
+		if (s != a->from)
+			moveouts(nfa, s, a->from);
+		dropstate(nfa, s);
+	}
+
+	/*
+	 * For each remaining NFA state, find all other states that are reachable
+	 * from it by a chain of one or more EMPTY arcs.  Then generate new arcs
+	 * that eliminate the need for each such chain.
+	 *
+	 * If we just do this straightforwardly, the algorithm gets slow in
+	 * complex graphs, because the same arcs get copied to all intermediate
+	 * states of an EMPTY chain, and then uselessly pushed repeatedly to the
+	 * chain's final state; we waste a lot of time in newarc's duplicate
+	 * checking.  To improve matters, we decree that any state with only EMPTY
+	 * out-arcs is "doomed" and will not be part of the final NFA. That can be
+	 * ensured by not adding any new out-arcs to such a state. Having ensured
+	 * that, we need not update the state's in-arcs list either; all arcs that
+	 * might have gotten pushed forward to it will just get pushed directly to
+	 * successor states.  This eliminates most of the useless duplicate arcs.
+	 */
+	for (s = nfa->states; s != NULL && !NISERR(); s = s->next)
+	{
+		for (s2 = emptyreachable(s, s); s2 != s && !NISERR(); s2 = nexts)
 		{
-			nexts = s->next;
-			for (a = s->outs; a != NULL && !NISERR(); a = nexta)
-			{
-				nexta = a->outchain;
-				if (a->type == EMPTY && unempty(nfa, a))
-					progress = 1;
-				assert(nexta == NULL || s->no != FREESTATE);
-			}
+			/*
+			 * If s2 is doomed, we decide that (1) we will always push arcs
+			 * forward to it, not pull them back to s; and (2) we can optimize
+			 * away the push-forward, per comment above.  So do nothing.
+			 */
+			if (s2->flag || hasnonemptyout(s2))
+				replaceempty(nfa, s, s2);
+
+			/* Reset the tmp fields as we walk back */
+			nexts = s2->tmp;
+			s2->tmp = NULL;
 		}
-		if (progress && f != NULL)
-			dumpnfa(nfa, f);
-	} while (progress && !NISERR());
+		s->tmp = NULL;
+	}
+
+	if (NISERR())
+		return;
+
+	/*
+	 * Now remove all the EMPTY arcs, since we don't need them anymore.
+	 */
+	for (s = nfa->states; s != NULL; s = s->next)
+	{
+		for (a = s->outs; a != NULL; a = nexta)
+		{
+			nexta = a->outchain;
+			if (a->type == EMPTY)
+				freearc(nfa, a);
+		}
+	}
+
+	/*
+	 * And remove any states that have become useless.	(This cleanup is not
+	 * very thorough, and would be even less so if we tried to combine it with
+	 * the previous step; but cleanup() will take care of anything we miss.)
+	 */
+	for (s = nfa->states; s != NULL; s = nexts)
+	{
+		nexts = s->next;
+		if ((s->nins == 0 || s->nouts == 0) && !s->flag)
+			dropstate(nfa, s);
+	}
+
+	if (f != NULL)
+		dumpnfa(nfa, f);
 }
 
 /*
- * unempty - optimize out an EMPTY arc, if possible
+ * emptyreachable - recursively find all states reachable from s by EMPTY arcs
+ *
+ * The return value is the last such state found.  Its tmp field links back
+ * to the next-to-last such state, and so on back to s, so that all these
+ * states can be located without searching the whole NFA.
  *
- * Actually, as it stands this function always succeeds, but the return
- * value is kept with an eye on possible future changes.
+ * The maximum recursion depth here is equal to the length of the longest
+ * loop-free chain of EMPTY arcs, which is surely no more than the size of
+ * the NFA, and in practice will be a lot less than that.
  */
-static int						/* 0 couldn't, 1 could */
-unempty(struct nfa * nfa,
-		struct arc * a)
+static struct state *
+emptyreachable(struct state * s, struct state * lastfound)
 {
-	struct state *from = a->from;
-	struct state *to = a->to;
-	int			usefrom;		/* work on from, as opposed to to? */
-
-	assert(a->type == EMPTY);
-	assert(from != nfa->pre && to != nfa->post);
+	struct arc *a;
 
-	if (from == to)
-	{							/* vacuous loop */
-		freearc(nfa, a);
-		return 1;
+	s->tmp = lastfound;
+	lastfound = s;
+	for (a = s->outs; a != NULL; a = a->outchain)
+	{
+		if (a->type == EMPTY && a->to->tmp == NULL)
+			lastfound = emptyreachable(a->to, lastfound);
 	}
+	return lastfound;
+}
 
-	/* decide which end to work on */
-	usefrom = 1;				/* default:  attack from */
-	if (from->nouts > to->nins)
-		usefrom = 0;
-	else if (from->nouts == to->nins)
+/*
+ * replaceempty - replace an EMPTY arc chain with some non-empty arcs
+ *
+ * The EMPTY arc(s) should be deleted later, but we can't do it here because
+ * they may still be needed to identify other arc chains during fixempties().
+ */
+static void
+replaceempty(struct nfa * nfa,
+			 struct state * from,
+			 struct state * to)
+{
+	int			fromouts;
+	int			toins;
+
+	assert(from != to);
+
+	/*
+	 * Create replacement arcs that bypass the need for the EMPTY chain.  We
+	 * can do this either by pushing arcs forward (linking directly from
+	 * "from"'s predecessors to "to") or by pulling them back (linking
+	 * directly from "from" to "to"'s successors).  In general, we choose
+	 * whichever way creates greater fan-out or fan-in, so as to improve the
+	 * odds of reducing the other state to zero in-arcs or out-arcs and
+	 * thereby being able to delete it.  However, if "from" is doomed (has no
+	 * non-EMPTY out-arcs), we must keep it so, so always push forward in that
+	 * case.
+	 *
+	 * The fan-out/fan-in comparison should count only non-EMPTY arcs.	If
+	 * "from" is doomed, we can skip counting "to"'s arcs, since we want to
+	 * force taking the copyins path in that case.
+	 */
+	fromouts = nonemptyouts(from);
+	toins = (fromouts == 0) ? 1 : nonemptyins(to);
+
+	if (fromouts > toins)
 	{
-		/* decide on secondary issue:  move/copy fewest arcs */
-		if (from->nins > to->nouts)
-			usefrom = 0;
+		copyouts(nfa, to, from, 0);
+		return;
+	}
+	if (fromouts < toins)
+	{
+		copyins(nfa, from, to, 0);
+		return;
 	}
 
-	freearc(nfa, a);
-	if (usefrom)
+	/*
+	 * fromouts == toins.  Decide on secondary issue: copy fewest arcs.
+	 *
+	 * Doesn't seem to be worth the trouble to exclude empties from these
+	 * comparisons; that takes extra time and doesn't seem to improve the
+	 * resulting graph much.
+	 */
+	if (from->nins > to->nouts)
 	{
-		if (from->nouts == 0)
-		{
-			/* was the state's only outarc */
-			moveins(nfa, from, to);
-			freestate(nfa, from);
-		}
-		else
-			copyins(nfa, from, to);
+		copyouts(nfa, to, from, 0);
+		return;
 	}
 	else
 	{
-		if (to->nins == 0)
-		{
-			/* was the state's only inarc */
-			moveouts(nfa, to, from);
-			freestate(nfa, to);
-		}
-		else
-			copyouts(nfa, to, from);
+		copyins(nfa, from, to, 0);
+		return;
 	}
-
-	return 1;
 }
 
 /*
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index 9b3fe64807..b5988a2fbc 100644
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -122,12 +122,15 @@ static void destroystate(struct nfa *, struct state *);
 static void newarc(struct nfa *, int, pcolor, struct state *, struct state *);
 static struct arc *allocarc(struct nfa *, struct state *);
 static void freearc(struct nfa *, struct arc *);
+static int	hasnonemptyout(struct state *);
+static int	nonemptyouts(struct state *);
+static int	nonemptyins(struct state *);
 static struct arc *findarc(struct state *, int, pcolor);
 static void cparc(struct nfa *, struct arc *, struct state *, struct state *);
 static void moveins(struct nfa *, struct state *, struct state *);
-static void copyins(struct nfa *, struct state *, struct state *);
+static void copyins(struct nfa *, struct state *, struct state *, int);
 static void moveouts(struct nfa *, struct state *, struct state *);
-static void copyouts(struct nfa *, struct state *, struct state *);
+static void copyouts(struct nfa *, struct state *, struct state *, int);
 static void cloneouts(struct nfa *, struct state *, struct state *, struct state *, int);
 static void delsub(struct nfa *, struct state *, struct state *);
 static void deltraverse(struct nfa *, struct state *, struct state *);
@@ -146,7 +149,8 @@ static int	push(struct nfa *, struct arc *);
 #define COMPATIBLE	3			/* compatible but not satisfied yet */
 static int	combine(struct arc *, struct arc *);
 static void fixempties(struct nfa *, FILE *);
-static int	unempty(struct nfa *, struct arc *);
+static struct state *emptyreachable(struct state *, struct state *);
+static void replaceempty(struct nfa *, struct state *, struct state *);
 static void cleanup(struct nfa *);
 static void markreachable(struct nfa *, struct state *, struct state *, struct state *);
 static void markcanreach(struct nfa *, struct state *, struct state *, struct state *);
@@ -583,7 +587,7 @@ makesearch(struct vars * v,
 	for (s = slist; s != NULL; s = s2)
 	{
 		s2 = newstate(nfa);
-		copyouts(nfa, s, s2);
+		copyouts(nfa, s, s2, 1);
 		for (a = s->ins; a != NULL; a = b)
 		{
 			b = a->inchain;
diff --git a/src/test/regress/expected/regex.out b/src/test/regress/expected/regex.out
index 658538fd41..757f2a4028 100644
--- a/src/test/regress/expected/regex.out
+++ b/src/test/regress/expected/regex.out
@@ -153,3 +153,23 @@ explain (costs off) select * from pg_proc where proname ~ '^(abc)?d';
    Filter: (proname ~ '^(abc)?d'::text)
 (2 rows)
 
+-- Test for infinite loop in pullback() (CVE-2007-4772)
+select 'a' ~ '($|^)*';
+ ?column? 
+----------
+ t
+(1 row)
+
+-- Test for infinite loop in fixempties() (Tcl bugs 3604074, 3606683)
+select 'a' ~ '((((((a)*)*)*)*)*)*';
+ ?column? 
+----------
+ t
+(1 row)
+
+select 'a' ~ '((((((a+|)+|)+|)+|)+|)+|)';
+ ?column? 
+----------
+ t
+(1 row)
+
diff --git a/src/test/regress/sql/regex.sql b/src/test/regress/sql/regex.sql
index c29ed05d76..1426562119 100644
--- a/src/test/regress/sql/regex.sql
+++ b/src/test/regress/sql/regex.sql
@@ -34,3 +34,10 @@ explain (costs off) select * from pg_proc where proname ~ '^abc+d';
 explain (costs off) select * from pg_proc where proname ~ '^(abc)(def)';
 explain (costs off) select * from pg_proc where proname ~ '^(abc)$';
 explain (costs off) select * from pg_proc where proname ~ '^(abc)?d';
+
+-- Test for infinite loop in pullback() (CVE-2007-4772)
+select 'a' ~ '($|^)*';
+
+-- Test for infinite loop in fixempties() (Tcl bugs 3604074, 3606683)
+select 'a' ~ '((((((a)*)*)*)*)*)*';
+select 'a' ~ '((((((a+|)+|)+|)+|)+|)+|)';
-- 
2.40.0