Fix infinite-loop risk in fixempties() stage of regex compilation.

author Tom Lane <tgl@sss.pgh.pa.us>

Thu, 7 Mar 2013 16:51:03 +0000 (11:51 -0500)

committer Tom Lane <tgl@sss.pgh.pa.us>

Thu, 7 Mar 2013 16:51:03 +0000 (11:51 -0500)
author Tom Lane <tgl@sss.pgh.pa.us>
Thu, 7 Mar 2013 16:51:03 +0000 (11:51 -0500)
committer Tom Lane <tgl@sss.pgh.pa.us>
Thu, 7 Mar 2013 16:51:03 +0000 (11:51 -0500)
diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c

index 085842c92b70d74ea969488e0cc50d1466f2e53e..05fe8b0808c21a2946a3cef8d896ec74cd298ba6 100644 (file)
--- a/src/backend/regex/regc_nfa.c
+++ b/src/backend/regex/regc_nfa.c
@@ -455,6 +455,56 @@ freearc(struct nfa * nfa,
         from->free = victim;
  }
  
+/*
+ * hasnonemptyout - Does state have a non-EMPTY out arc?
+ */
+static int
+hasnonemptyout(struct state * s)
+{
+       struct arc *a;
+
+       for (a = s->outs; a != NULL; a = a->outchain)
+       {
+               if (a->type != EMPTY)
+                       return 1;
+       }
+       return 0;
+}
+
+/*
+ * nonemptyouts - count non-EMPTY out arcs of a state
+ */
+static int
+nonemptyouts(struct state * s)
+{
+       int                     n = 0;
+       struct arc *a;
+
+       for (a = s->outs; a != NULL; a = a->outchain)
+       {
+               if (a->type != EMPTY)
+                       n++;
+       }
+       return n;
+}
+
+/*
+ * nonemptyins - count non-EMPTY in arcs of a state
+ */
+static int
+nonemptyins(struct state * s)
+{
+       int                     n = 0;
+       struct arc *a;
+
+       for (a = s->ins; a != NULL; a = a->inchain)
+       {
+               if (a->type != EMPTY)
+                       n++;
+       }
+       return n;
+}
+
  /*
   * findarc - find arc, if any, from given source with given type and color
   * If there is more than one such arc, the result is random.
@@ -511,19 +561,25 @@ moveins(struct nfa * nfa,
  }
  
  /*
- * copyins - copy all in arcs of a state to another state
+ * copyins - copy in arcs of a state to another state
+ *
+ * Either all arcs, or only non-empty ones as determined by all value.
   */
  static void
  copyins(struct nfa * nfa,
                 struct state * oldState,
-               struct state * newState)
+               struct state * newState,
+               int all)
  {
         struct arc *a;
  
         assert(oldState != newState);
  
         for (a = oldState->ins; a != NULL; a = a->inchain)
-               cparc(nfa, a, a->from, newState);
+       {
+               if (all || a->type != EMPTY)
+                       cparc(nfa, a, a->from, newState);
+       }
  }
  
  /*
@@ -546,19 +602,25 @@ moveouts(struct nfa * nfa,
  }
  
  /*
- * copyouts - copy all out arcs of a state to another state
+ * copyouts - copy out arcs of a state to another state
+ *
+ * Either all arcs, or only non-empty ones as determined by all value.
   */
  static void
  copyouts(struct nfa * nfa,
                  struct state * oldState,
-                struct state * newState)
+                struct state * newState,
+                int all)
  {
         struct arc *a;
  
         assert(oldState != newState);
  
         for (a = oldState->outs; a != NULL; a = a->outchain)
-               cparc(nfa, a, newState, a->to);
+       {
+               if (all || a->type != EMPTY)
+                       cparc(nfa, a, newState, a->to);
+       }
  }
  
  /*
@@ -881,7 +943,7 @@ pull(struct nfa * nfa,
                 if (NISERR())
                         return 0;
                 assert(to != from);             /* con is not an inarc */
-               copyins(nfa, from, s);  /* duplicate inarcs */
+               copyins(nfa, from, s, 1);               /* duplicate inarcs */
                 cparc(nfa, con, s, to); /* move constraint arc */
                 freearc(nfa, con);
                 from = s;
@@ -1027,7 +1089,7 @@ push(struct nfa * nfa,
                 s = newstate(nfa);
                 if (NISERR())
                         return 0;
-               copyouts(nfa, to, s);   /* duplicate outarcs */
+               copyouts(nfa, to, s, 1);        /* duplicate outarcs */
                 cparc(nfa, con, from, s);               /* move constraint */
                 freearc(nfa, con);
                 to = s;
@@ -1134,91 +1196,205 @@ fixempties(struct nfa * nfa,
                    FILE *f)                             /* for debug output; NULL none */
  {
         struct state *s;
+       struct state *s2;
         struct state *nexts;
         struct arc *a;
         struct arc *nexta;
-       int                     progress;
  
-       /* find and eliminate empties until there are no more */
-       do
+       /*
+        * First, get rid of any states whose sole out-arc is an EMPTY, since
+        * they're basically just aliases for their successor.  The parsing
+        * algorithm creates enough of these that it's worth special-casing this.
+        */
+       for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
         {
-               progress = 0;
-               for (s = nfa->states; s != NULL && !NISERR() &&
-                        s->no != FREESTATE; s = nexts)
+               nexts = s->next;
+               if (s->flag || s->nouts != 1)
+                       continue;
+               a = s->outs;
+               assert(a != NULL && a->outchain == NULL);
+               if (a->type != EMPTY)
+                       continue;
+               if (s != a->to)
+                       moveins(nfa, s, a->to);
+               dropstate(nfa, s);
+       }
+
+       /*
+        * Similarly, get rid of any state with a single EMPTY in-arc, by folding
+        * it into its predecessor.
+        */
+       for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
+       {
+               nexts = s->next;
+               /* while we're at it, ensure tmp fields are clear for next step */
+               assert(s->tmp == NULL);
+               if (s->flag || s->nins != 1)
+                       continue;
+               a = s->ins;
+               assert(a != NULL && a->inchain == NULL);
+               if (a->type != EMPTY)
+                       continue;
+               if (s != a->from)
+                       moveouts(nfa, s, a->from);
+               dropstate(nfa, s);
+       }
+
+       /*
+        * For each remaining NFA state, find all other states that are reachable
+        * from it by a chain of one or more EMPTY arcs.  Then generate new arcs
+        * that eliminate the need for each such chain.
+        *
+        * If we just do this straightforwardly, the algorithm gets slow in
+        * complex graphs, because the same arcs get copied to all intermediate
+        * states of an EMPTY chain, and then uselessly pushed repeatedly to the
+        * chain's final state; we waste a lot of time in newarc's duplicate
+        * checking.  To improve matters, we decree that any state with only EMPTY
+        * out-arcs is "doomed" and will not be part of the final NFA. That can be
+        * ensured by not adding any new out-arcs to such a state. Having ensured
+        * that, we need not update the state's in-arcs list either; all arcs that
+        * might have gotten pushed forward to it will just get pushed directly to
+        * successor states.  This eliminates most of the useless duplicate arcs.
+        */
+       for (s = nfa->states; s != NULL && !NISERR(); s = s->next)
+       {
+               for (s2 = emptyreachable(s, s); s2 != s && !NISERR(); s2 = nexts)
                 {
-                       nexts = s->next;
-                       for (a = s->outs; a != NULL && !NISERR(); a = nexta)
-                       {
-                               nexta = a->outchain;
-                               if (a->type == EMPTY && unempty(nfa, a))
-                                       progress = 1;
-                               assert(nexta == NULL || s->no != FREESTATE);
-                       }
+                       /*
+                        * If s2 is doomed, we decide that (1) we will always push arcs
+                        * forward to it, not pull them back to s; and (2) we can optimize
+                        * away the push-forward, per comment above.  So do nothing.
+                        */
+                       if (s2->flag || hasnonemptyout(s2))
+                               replaceempty(nfa, s, s2);
+
+                       /* Reset the tmp fields as we walk back */
+                       nexts = s2->tmp;
+                       s2->tmp = NULL;
                 }
-               if (progress && f != NULL)
-                       dumpnfa(nfa, f);
-       } while (progress && !NISERR());
+               s->tmp = NULL;
+       }
+
+       if (NISERR())
+               return;
+
+       /*
+        * Now remove all the EMPTY arcs, since we don't need them anymore.
+        */
+       for (s = nfa->states; s != NULL; s = s->next)
+       {
+               for (a = s->outs; a != NULL; a = nexta)
+               {
+                       nexta = a->outchain;
+                       if (a->type == EMPTY)
+                               freearc(nfa, a);
+               }
+       }
+
+       /*
+        * And remove any states that have become useless.      (This cleanup is not
+        * very thorough, and would be even less so if we tried to combine it with
+        * the previous step; but cleanup() will take care of anything we miss.)
+        */
+       for (s = nfa->states; s != NULL; s = nexts)
+       {
+               nexts = s->next;
+               if ((s->nins == 0 || s->nouts == 0) && !s->flag)
+                       dropstate(nfa, s);
+       }
+
+       if (f != NULL)
+               dumpnfa(nfa, f);
  }
  
  /*
- * unempty - optimize out an EMPTY arc, if possible
+ * emptyreachable - recursively find all states reachable from s by EMPTY arcs
+ *
+ * The return value is the last such state found.  Its tmp field links back
+ * to the next-to-last such state, and so on back to s, so that all these
+ * states can be located without searching the whole NFA.
   *
- * Actually, as it stands this function always succeeds, but the return
- * value is kept with an eye on possible future changes.
+ * The maximum recursion depth here is equal to the length of the longest
+ * loop-free chain of EMPTY arcs, which is surely no more than the size of
+ * the NFA, and in practice will be a lot less than that.
   */
-static int                                             /* 0 couldn't, 1 could */
-unempty(struct nfa * nfa,
-               struct arc * a)
+static struct state *
+emptyreachable(struct state * s, struct state * lastfound)
  {
-       struct state *from = a->from;
-       struct state *to = a->to;
-       int                     usefrom;                /* work on from, as opposed to to? */
-
-       assert(a->type == EMPTY);
-       assert(from != nfa->pre && to != nfa->post);
+       struct arc *a;
  
-       if (from == to)
-       {                                                       /* vacuous loop */
-               freearc(nfa, a);
-               return 1;
+       s->tmp = lastfound;
+       lastfound = s;
+       for (a = s->outs; a != NULL; a = a->outchain)
+       {
+               if (a->type == EMPTY && a->to->tmp == NULL)
+                       lastfound = emptyreachable(a->to, lastfound);
         }
+       return lastfound;
+}
  
-       /* decide which end to work on */
-       usefrom = 1;                            /* default:  attack from */
-       if (from->nouts > to->nins)
-               usefrom = 0;
-       else if (from->nouts == to->nins)
+/*
+ * replaceempty - replace an EMPTY arc chain with some non-empty arcs
+ *
+ * The EMPTY arc(s) should be deleted later, but we can't do it here because
+ * they may still be needed to identify other arc chains during fixempties().
+ */
+static void
+replaceempty(struct nfa * nfa,
+                        struct state * from,
+                        struct state * to)
+{
+       int                     fromouts;
+       int                     toins;
+
+       assert(from != to);
+
+       /*
+        * Create replacement arcs that bypass the need for the EMPTY chain.  We
+        * can do this either by pushing arcs forward (linking directly from
+        * "from"'s predecessors to "to") or by pulling them back (linking
+        * directly from "from" to "to"'s successors).  In general, we choose
+        * whichever way creates greater fan-out or fan-in, so as to improve the
+        * odds of reducing the other state to zero in-arcs or out-arcs and
+        * thereby being able to delete it.  However, if "from" is doomed (has no
+        * non-EMPTY out-arcs), we must keep it so, so always push forward in that
+        * case.
+        *
+        * The fan-out/fan-in comparison should count only non-EMPTY arcs.      If
+        * "from" is doomed, we can skip counting "to"'s arcs, since we want to
+        * force taking the copyins path in that case.
+        */
+       fromouts = nonemptyouts(from);
+       toins = (fromouts == 0) ? 1 : nonemptyins(to);
+
+       if (fromouts > toins)
         {
-               /* decide on secondary issue:  move/copy fewest arcs */
-               if (from->nins > to->nouts)
-                       usefrom = 0;
+               copyouts(nfa, to, from, 0);
+               return;
+       }
+       if (fromouts < toins)
+       {
+               copyins(nfa, from, to, 0);
+               return;
         }
  
-       freearc(nfa, a);
-       if (usefrom)
+       /*
+        * fromouts == toins.  Decide on secondary issue: copy fewest arcs.
+        *
+        * Doesn't seem to be worth the trouble to exclude empties from these
+        * comparisons; that takes extra time and doesn't seem to improve the
+        * resulting graph much.
+        */
+       if (from->nins > to->nouts)
         {
-               if (from->nouts == 0)
-               {
-                       /* was the state's only outarc */
-                       moveins(nfa, from, to);
-                       freestate(nfa, from);
-               }
-               else
-                       copyins(nfa, from, to);
+               copyouts(nfa, to, from, 0);
+               return;
         }
         else
         {
-               if (to->nins == 0)
-               {
-                       /* was the state's only inarc */
-                       moveouts(nfa, to, from);
-                       freestate(nfa, to);
-               }
-               else
-                       copyouts(nfa, to, from);
+               copyins(nfa, from, to, 0);
+               return;
         }
-
-       return 1;
  }
  
  /*
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c

index 9b3fe64807e508a5975b03cce2b243adf972ccdb..b5988a2fbc16f67fd067cb338fdbc50a252c6f2d 100644 (file)
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -122,12 +122,15 @@ static void destroystate(struct nfa *, struct state *);
  static void newarc(struct nfa *, int, pcolor, struct state *, struct state *);
  static struct arc *allocarc(struct nfa *, struct state *);
  static void freearc(struct nfa *, struct arc *);
+static int     hasnonemptyout(struct state *);
+static int     nonemptyouts(struct state *);
+static int     nonemptyins(struct state *);
  static struct arc *findarc(struct state *, int, pcolor);
  static void cparc(struct nfa *, struct arc *, struct state *, struct state *);
  static void moveins(struct nfa *, struct state *, struct state *);
-static void copyins(struct nfa *, struct state *, struct state *);
+static void copyins(struct nfa *, struct state *, struct state *, int);
  static void moveouts(struct nfa *, struct state *, struct state *);
-static void copyouts(struct nfa *, struct state *, struct state *);
+static void copyouts(struct nfa *, struct state *, struct state *, int);
  static void cloneouts(struct nfa *, struct state *, struct state *, struct state *, int);
  static void delsub(struct nfa *, struct state *, struct state *);
  static void deltraverse(struct nfa *, struct state *, struct state *);
@@ -146,7 +149,8 @@ static int  push(struct nfa *, struct arc *);
  #define COMPATIBLE     3                       /* compatible but not satisfied yet */
  static int     combine(struct arc *, struct arc *);
  static void fixempties(struct nfa *, FILE *);
-static int     unempty(struct nfa *, struct arc *);
+static struct state *emptyreachable(struct state *, struct state *);
+static void replaceempty(struct nfa *, struct state *, struct state *);
  static void cleanup(struct nfa *);
  static void markreachable(struct nfa *, struct state *, struct state *, struct state *);
  static void markcanreach(struct nfa *, struct state *, struct state *, struct state *);
@@ -583,7 +587,7 @@ makesearch(struct vars * v,
         for (s = slist; s != NULL; s = s2)
         {
                 s2 = newstate(nfa);
-               copyouts(nfa, s, s2);
+               copyouts(nfa, s, s2, 1);
                 for (a = s->ins; a != NULL; a = b)
                 {
                         b = a->inchain;
diff --git a/src/test/regress/expected/regex.out b/src/test/regress/expected/regex.out

index 658538fd4192a7b04062c52fce666897cc94b633..757f2a4028a36849d19160870a2f973a74ff1c17 100644 (file)
--- a/src/test/regress/expected/regex.out
+++ b/src/test/regress/expected/regex.out
@@ -153,3 +153,23 @@ explain (costs off) select * from pg_proc where proname ~ '^(abc)?d';
     Filter: (proname ~ '^(abc)?d'::text)
  (2 rows)
  
+-- Test for infinite loop in pullback() (CVE-2007-4772)
+select 'a' ~ '($|^)*';
+ ?column? 
+----------
+ t
+(1 row)
+
+-- Test for infinite loop in fixempties() (Tcl bugs 3604074, 3606683)
+select 'a' ~ '((((((a)*)*)*)*)*)*';
+ ?column? 
+----------
+ t
+(1 row)
+
+select 'a' ~ '((((((a+|)+|)+|)+|)+|)+|)';
+ ?column? 
+----------
+ t
+(1 row)
+
diff --git a/src/test/regress/sql/regex.sql b/src/test/regress/sql/regex.sql

index c29ed05d7688bade1bd6ab1e4eb91603170290af..1426562119a8e3bbd2cc8cb4d64db0016e2eafec 100644 (file)
--- a/src/test/regress/sql/regex.sql
+++ b/src/test/regress/sql/regex.sql
@@ -34,3 +34,10 @@ explain (costs off) select * from pg_proc where proname ~ '^abc+d';
  explain (costs off) select * from pg_proc where proname ~ '^(abc)(def)';
  explain (costs off) select * from pg_proc where proname ~ '^(abc)$';
  explain (costs off) select * from pg_proc where proname ~ '^(abc)?d';
+
+-- Test for infinite loop in pullback() (CVE-2007-4772)
+select 'a' ~ '($|^)*';
+
+-- Test for infinite loop in fixempties() (Tcl bugs 3604074, 3606683)
+select 'a' ~ '((((((a)*)*)*)*)*)*';
+select 'a' ~ '((((((a+|)+|)+|)+|)+|)+|)';
author	Tom Lane <tgl@sss.pgh.pa.us>
	Thu, 7 Mar 2013 16:51:03 +0000 (11:51 -0500)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Thu, 7 Mar 2013 16:51:03 +0000 (11:51 -0500)
src/backend/regex/regc_nfa.c		patch | blob | history
src/backend/regex/regcomp.c		patch | blob | history
src/test/regress/expected/regex.out		patch | blob | history
src/test/regress/sql/regex.sql		patch | blob | history