Fix infinite-loop risk in fixempties() stage of regex compilation.

The previous coding of this function could get into situations where it
would never terminate, because successive passes would re-add EMPTY arcs
that had been removed by the previous pass.  Rewrite the function
completely using a new algorithm that is guaranteed to terminate, and
also seems to be usually faster than the old one.  Per Tcl bugs 3604074
and 3606683.

Tom Lane and Don Porter
This commit is contained in:
Tom Lane 2013-03-07 11:51:03 -05:00
parent 7ccefe8610
commit a7b61d4f5a
4 changed files with 282 additions and 75 deletions

View File

@ -455,6 +455,56 @@ freearc(struct nfa * nfa,
from->free = victim;
}
/*
* hasnonemptyout - Does state have a non-EMPTY out arc?
*/
static int
hasnonemptyout(struct state * s)
{
struct arc *a;
for (a = s->outs; a != NULL; a = a->outchain)
{
if (a->type != EMPTY)
return 1;
}
return 0;
}
/*
* nonemptyouts - count non-EMPTY out arcs of a state
*/
static int
nonemptyouts(struct state * s)
{
int n = 0;
struct arc *a;
for (a = s->outs; a != NULL; a = a->outchain)
{
if (a->type != EMPTY)
n++;
}
return n;
}
/*
* nonemptyins - count non-EMPTY in arcs of a state
*/
static int
nonemptyins(struct state * s)
{
int n = 0;
struct arc *a;
for (a = s->ins; a != NULL; a = a->inchain)
{
if (a->type != EMPTY)
n++;
}
return n;
}
/*
* findarc - find arc, if any, from given source with given type and color
* If there is more than one such arc, the result is random.
@ -511,19 +561,25 @@ moveins(struct nfa * nfa,
}
/*
* copyins - copy all in arcs of a state to another state
* copyins - copy in arcs of a state to another state
*
* Either all arcs, or only non-empty ones as determined by all value.
*/
static void
copyins(struct nfa * nfa,
struct state * oldState,
struct state * newState)
struct state * newState,
int all)
{
struct arc *a;
assert(oldState != newState);
for (a = oldState->ins; a != NULL; a = a->inchain)
cparc(nfa, a, a->from, newState);
{
if (all || a->type != EMPTY)
cparc(nfa, a, a->from, newState);
}
}
/*
@ -546,19 +602,25 @@ moveouts(struct nfa * nfa,
}
/*
* copyouts - copy all out arcs of a state to another state
* copyouts - copy out arcs of a state to another state
*
* Either all arcs, or only non-empty ones as determined by all value.
*/
static void
copyouts(struct nfa * nfa,
struct state * oldState,
struct state * newState)
struct state * newState,
int all)
{
struct arc *a;
assert(oldState != newState);
for (a = oldState->outs; a != NULL; a = a->outchain)
cparc(nfa, a, newState, a->to);
{
if (all || a->type != EMPTY)
cparc(nfa, a, newState, a->to);
}
}
/*
@ -881,7 +943,7 @@ pull(struct nfa * nfa,
if (NISERR())
return 0;
assert(to != from); /* con is not an inarc */
copyins(nfa, from, s); /* duplicate inarcs */
copyins(nfa, from, s, 1); /* duplicate inarcs */
cparc(nfa, con, s, to); /* move constraint arc */
freearc(nfa, con);
from = s;
@ -1027,7 +1089,7 @@ push(struct nfa * nfa,
s = newstate(nfa);
if (NISERR())
return 0;
copyouts(nfa, to, s); /* duplicate outarcs */
copyouts(nfa, to, s, 1); /* duplicate outarcs */
cparc(nfa, con, from, s); /* move constraint */
freearc(nfa, con);
to = s;
@ -1134,91 +1196,205 @@ fixempties(struct nfa * nfa,
FILE *f) /* for debug output; NULL none */
{
struct state *s;
struct state *s2;
struct state *nexts;
struct arc *a;
struct arc *nexta;
int progress;
/* find and eliminate empties until there are no more */
do
/*
* First, get rid of any states whose sole out-arc is an EMPTY, since
* they're basically just aliases for their successor. The parsing
* algorithm creates enough of these that it's worth special-casing this.
*/
for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
{
progress = 0;
for (s = nfa->states; s != NULL && !NISERR() &&
s->no != FREESTATE; s = nexts)
nexts = s->next;
if (s->flag || s->nouts != 1)
continue;
a = s->outs;
assert(a != NULL && a->outchain == NULL);
if (a->type != EMPTY)
continue;
if (s != a->to)
moveins(nfa, s, a->to);
dropstate(nfa, s);
}
/*
* Similarly, get rid of any state with a single EMPTY in-arc, by folding
* it into its predecessor.
*/
for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
{
nexts = s->next;
/* while we're at it, ensure tmp fields are clear for next step */
assert(s->tmp == NULL);
if (s->flag || s->nins != 1)
continue;
a = s->ins;
assert(a != NULL && a->inchain == NULL);
if (a->type != EMPTY)
continue;
if (s != a->from)
moveouts(nfa, s, a->from);
dropstate(nfa, s);
}
/*
* For each remaining NFA state, find all other states that are reachable
* from it by a chain of one or more EMPTY arcs. Then generate new arcs
* that eliminate the need for each such chain.
*
* If we just do this straightforwardly, the algorithm gets slow in
* complex graphs, because the same arcs get copied to all intermediate
* states of an EMPTY chain, and then uselessly pushed repeatedly to the
* chain's final state; we waste a lot of time in newarc's duplicate
* checking. To improve matters, we decree that any state with only EMPTY
* out-arcs is "doomed" and will not be part of the final NFA. That can be
* ensured by not adding any new out-arcs to such a state. Having ensured
* that, we need not update the state's in-arcs list either; all arcs that
* might have gotten pushed forward to it will just get pushed directly to
* successor states. This eliminates most of the useless duplicate arcs.
*/
for (s = nfa->states; s != NULL && !NISERR(); s = s->next)
{
for (s2 = emptyreachable(s, s); s2 != s && !NISERR(); s2 = nexts)
{
nexts = s->next;
for (a = s->outs; a != NULL && !NISERR(); a = nexta)
{
nexta = a->outchain;
if (a->type == EMPTY && unempty(nfa, a))
progress = 1;
assert(nexta == NULL || s->no != FREESTATE);
}
/*
* If s2 is doomed, we decide that (1) we will always push arcs
* forward to it, not pull them back to s; and (2) we can optimize
* away the push-forward, per comment above. So do nothing.
*/
if (s2->flag || hasnonemptyout(s2))
replaceempty(nfa, s, s2);
/* Reset the tmp fields as we walk back */
nexts = s2->tmp;
s2->tmp = NULL;
}
if (progress && f != NULL)
dumpnfa(nfa, f);
} while (progress && !NISERR());
s->tmp = NULL;
}
if (NISERR())
return;
/*
* Now remove all the EMPTY arcs, since we don't need them anymore.
*/
for (s = nfa->states; s != NULL; s = s->next)
{
for (a = s->outs; a != NULL; a = nexta)
{
nexta = a->outchain;
if (a->type == EMPTY)
freearc(nfa, a);
}
}
/*
* And remove any states that have become useless. (This cleanup is not
* very thorough, and would be even less so if we tried to combine it with
* the previous step; but cleanup() will take care of anything we miss.)
*/
for (s = nfa->states; s != NULL; s = nexts)
{
nexts = s->next;
if ((s->nins == 0 || s->nouts == 0) && !s->flag)
dropstate(nfa, s);
}
if (f != NULL)
dumpnfa(nfa, f);
}
/*
* unempty - optimize out an EMPTY arc, if possible
* emptyreachable - recursively find all states reachable from s by EMPTY arcs
*
* Actually, as it stands this function always succeeds, but the return
* value is kept with an eye on possible future changes.
* The return value is the last such state found. Its tmp field links back
* to the next-to-last such state, and so on back to s, so that all these
* states can be located without searching the whole NFA.
*
* The maximum recursion depth here is equal to the length of the longest
* loop-free chain of EMPTY arcs, which is surely no more than the size of
* the NFA, and in practice will be a lot less than that.
*/
static int /* 0 couldn't, 1 could */
unempty(struct nfa * nfa,
struct arc * a)
static struct state *
emptyreachable(struct state * s, struct state * lastfound)
{
struct state *from = a->from;
struct state *to = a->to;
int usefrom; /* work on from, as opposed to to? */
struct arc *a;
assert(a->type == EMPTY);
assert(from != nfa->pre && to != nfa->post);
s->tmp = lastfound;
lastfound = s;
for (a = s->outs; a != NULL; a = a->outchain)
{
if (a->type == EMPTY && a->to->tmp == NULL)
lastfound = emptyreachable(a->to, lastfound);
}
return lastfound;
}
if (from == to)
{ /* vacuous loop */
freearc(nfa, a);
return 1;
/*
* replaceempty - replace an EMPTY arc chain with some non-empty arcs
*
* The EMPTY arc(s) should be deleted later, but we can't do it here because
* they may still be needed to identify other arc chains during fixempties().
*/
static void
replaceempty(struct nfa * nfa,
struct state * from,
struct state * to)
{
int fromouts;
int toins;
assert(from != to);
/*
* Create replacement arcs that bypass the need for the EMPTY chain. We
* can do this either by pushing arcs forward (linking directly from
* "from"'s predecessors to "to") or by pulling them back (linking
* directly from "from" to "to"'s successors). In general, we choose
* whichever way creates greater fan-out or fan-in, so as to improve the
* odds of reducing the other state to zero in-arcs or out-arcs and
* thereby being able to delete it. However, if "from" is doomed (has no
* non-EMPTY out-arcs), we must keep it so, so always push forward in that
* case.
*
* The fan-out/fan-in comparison should count only non-EMPTY arcs. If
* "from" is doomed, we can skip counting "to"'s arcs, since we want to
* force taking the copyins path in that case.
*/
fromouts = nonemptyouts(from);
toins = (fromouts == 0) ? 1 : nonemptyins(to);
if (fromouts > toins)
{
copyouts(nfa, to, from, 0);
return;
}
if (fromouts < toins)
{
copyins(nfa, from, to, 0);
return;
}
/* decide which end to work on */
usefrom = 1; /* default: attack from */
if (from->nouts > to->nins)
usefrom = 0;
else if (from->nouts == to->nins)
/*
* fromouts == toins. Decide on secondary issue: copy fewest arcs.
*
* Doesn't seem to be worth the trouble to exclude empties from these
* comparisons; that takes extra time and doesn't seem to improve the
* resulting graph much.
*/
if (from->nins > to->nouts)
{
/* decide on secondary issue: move/copy fewest arcs */
if (from->nins > to->nouts)
usefrom = 0;
}
freearc(nfa, a);
if (usefrom)
{
if (from->nouts == 0)
{
/* was the state's only outarc */
moveins(nfa, from, to);
freestate(nfa, from);
}
else
copyins(nfa, from, to);
copyouts(nfa, to, from, 0);
return;
}
else
{
if (to->nins == 0)
{
/* was the state's only inarc */
moveouts(nfa, to, from);
freestate(nfa, to);
}
else
copyouts(nfa, to, from);
copyins(nfa, from, to, 0);
return;
}
return 1;
}
/*

View File

@ -122,12 +122,15 @@ static void destroystate(struct nfa *, struct state *);
static void newarc(struct nfa *, int, pcolor, struct state *, struct state *);
static struct arc *allocarc(struct nfa *, struct state *);
static void freearc(struct nfa *, struct arc *);
static int hasnonemptyout(struct state *);
static int nonemptyouts(struct state *);
static int nonemptyins(struct state *);
static struct arc *findarc(struct state *, int, pcolor);
static void cparc(struct nfa *, struct arc *, struct state *, struct state *);
static void moveins(struct nfa *, struct state *, struct state *);
static void copyins(struct nfa *, struct state *, struct state *);
static void copyins(struct nfa *, struct state *, struct state *, int);
static void moveouts(struct nfa *, struct state *, struct state *);
static void copyouts(struct nfa *, struct state *, struct state *);
static void copyouts(struct nfa *, struct state *, struct state *, int);
static void cloneouts(struct nfa *, struct state *, struct state *, struct state *, int);
static void delsub(struct nfa *, struct state *, struct state *);
static void deltraverse(struct nfa *, struct state *, struct state *);
@ -146,7 +149,8 @@ static int push(struct nfa *, struct arc *);
#define COMPATIBLE 3 /* compatible but not satisfied yet */
static int combine(struct arc *, struct arc *);
static void fixempties(struct nfa *, FILE *);
static int unempty(struct nfa *, struct arc *);
static struct state *emptyreachable(struct state *, struct state *);
static void replaceempty(struct nfa *, struct state *, struct state *);
static void cleanup(struct nfa *);
static void markreachable(struct nfa *, struct state *, struct state *, struct state *);
static void markcanreach(struct nfa *, struct state *, struct state *, struct state *);
@ -583,7 +587,7 @@ makesearch(struct vars * v,
for (s = slist; s != NULL; s = s2)
{
s2 = newstate(nfa);
copyouts(nfa, s, s2);
copyouts(nfa, s, s2, 1);
for (a = s->ins; a != NULL; a = b)
{
b = a->inchain;

View File

@ -153,3 +153,23 @@ explain (costs off) select * from pg_proc where proname ~ '^(abc)?d';
Filter: (proname ~ '^(abc)?d'::text)
(2 rows)
-- Test for infinite loop in pullback() (CVE-2007-4772)
select 'a' ~ '($|^)*';
?column?
----------
t
(1 row)
-- Test for infinite loop in fixempties() (Tcl bugs 3604074, 3606683)
select 'a' ~ '((((((a)*)*)*)*)*)*';
?column?
----------
t
(1 row)
select 'a' ~ '((((((a+|)+|)+|)+|)+|)+|)';
?column?
----------
t
(1 row)

View File

@ -34,3 +34,10 @@ explain (costs off) select * from pg_proc where proname ~ '^abc+d';
explain (costs off) select * from pg_proc where proname ~ '^(abc)(def)';
explain (costs off) select * from pg_proc where proname ~ '^(abc)$';
explain (costs off) select * from pg_proc where proname ~ '^(abc)?d';
-- Test for infinite loop in pullback() (CVE-2007-4772)
select 'a' ~ '($|^)*';
-- Test for infinite loop in fixempties() (Tcl bugs 3604074, 3606683)
select 'a' ~ '((((((a)*)*)*)*)*)*';
select 'a' ~ '((((((a+|)+|)+|)+|)+|)+|)';