Back-patch fix for extraction of fixed prefixes from regular expressions.

Back-patch of commits 628cbb50ba and c6aae3042b. This has been broken since 7.3, so back-patch to all supported branches.
2025-02-17 19:30:00 +08:00 · 2012-07-10 18:00:47 -04:00 · 2012-07-10 18:00:47 -04:00 · dffc6c8a85
commit dffc6c8a85
parent 647ae3cfe3
11 changed files with 429 additions and 196 deletions
--- a/src/backend/regex/Makefile
+++ b/src/backend/regex/Makefile
@ -12,7 +12,7 @@ subdir = src/backend/regex
 top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global

-OBJS = regcomp.o regerror.o regexec.o regfree.o
+OBJS = regcomp.o regerror.o regexec.o regfree.o regprefix.o

 include $(top_srcdir)/src/backend/common.mk

--- a/src/backend/regex/regc_color.c
+++ b/src/backend/regex/regc_color.c
@ -66,8 +66,9 @@ initcm(struct vars * v,
 	cd = cm->cd;				/* cm->cd[WHITE] */
 	cd->sub = NOSUB;
 	cd->arcs = NULL;
-	cd->flags = 0;
+	cd->firstchr = CHR_MIN;
 	cd->nchrs = CHR_MAX - CHR_MIN + 1;
+	cd->flags = 0;

 	/* upper levels of tree */
 	for (t = &cm->tree[0], j = NBYTS - 1; j > 0; t = nextt, j--)
@ -272,6 +273,7 @@ newcolor(struct colormap * cm)
 	cd->nchrs = 0;
 	cd->sub = NOSUB;
 	cd->arcs = NULL;
+	cd->firstchr = CHR_MIN;		/* in case never set otherwise */
 	cd->flags = 0;
 	cd->block = NULL;

@ -371,6 +373,8 @@ subcolor(struct colormap * cm, chr c)
 	if (co == sco)				/* already in an open subcolor */
 		return co;				/* rest is redundant */
 	cm->cd[co].nchrs--;
+	if (cm->cd[sco].nchrs == 0)
+		cm->cd[sco].firstchr = c;
 	cm->cd[sco].nchrs++;
 	setcolor(cm, c, sco);
 	return sco;
@ -438,6 +442,11 @@ subrange(struct vars * v,

 /*
 * subblock - allocate new subcolors for one tree block of chrs, fill in arcs
+ *
+ * Note: subcolors that are created during execution of this function
+ * will not be given a useful value of firstchr; it'll be left as CHR_MIN.
+ * For the current usage of firstchr in pg_regprefix, this does not matter
+ * because such subcolors won't occur in the common prefix of a regex.
 */
 static void
 subblock(struct vars * v,
--- a/src/backend/regex/regc_nfa.c
+++ b/src/backend/regex/regc_nfa.c
@ -1330,14 +1330,16 @@ compact(struct nfa * nfa,
 	for (s = nfa->states; s != NULL; s = s->next)
 	{
 		nstates++;
-		narcs += 1 + s->nouts + 1;
-		/* 1 as a fake for flags, nouts for arcs, 1 as endmarker */
+		narcs += s->nouts + 1;		/* need one extra for endmarker */
 	}

+	cnfa->stflags = (char *) MALLOC(nstates * sizeof(char));
 	cnfa->states = (struct carc **) MALLOC(nstates * sizeof(struct carc *));
 	cnfa->arcs = (struct carc *) MALLOC(narcs * sizeof(struct carc));
-	if (cnfa->states == NULL || cnfa->arcs == NULL)
+	if (cnfa->stflags == NULL || cnfa->states == NULL || cnfa->arcs == NULL)
 	{
+		if (cnfa->stflags != NULL)
+			FREE(cnfa->stflags);
 		if (cnfa->states != NULL)
 			FREE(cnfa->states);
 		if (cnfa->arcs != NULL)
@ -1359,9 +1361,8 @@ compact(struct nfa * nfa,
 	for (s = nfa->states; s != NULL; s = s->next)
 	{
 		assert((size_t) s->no < nstates);
+		cnfa->stflags[s->no] = 0;
 		cnfa->states[s->no] = ca;
-		ca->co = 0;				/* clear and skip flags "arc" */
-		ca++;
 		first = ca;
 		for (a = s->outs; a != NULL; a = a->outchain)
 			switch (a->type)
@ -1392,8 +1393,8 @@ compact(struct nfa * nfa,

 	/* mark no-progress states */
 	for (a = nfa->pre->outs; a != NULL; a = a->outchain)
-		cnfa->states[a->to->no]->co = 1;
-	cnfa->states[nfa->pre->no]->co = 1;
+		cnfa->stflags[a->to->no] = CNFA_NOPROGRESS;
+	cnfa->stflags[nfa->pre->no] = CNFA_NOPROGRESS;
 }

 /*
@ -1433,6 +1434,7 @@ freecnfa(struct cnfa * cnfa)
 {
 	assert(cnfa->nstates != 0); /* not empty already */
 	cnfa->nstates = 0;
+	FREE(cnfa->stflags);
 	FREE(cnfa->states);
 	FREE(cnfa->arcs);
 }
@ -1617,7 +1619,7 @@ dumpcnfa(struct cnfa * cnfa,
 		fprintf(f, ", haslacons");
 	fprintf(f, "\n");
 	for (st = 0; st < cnfa->nstates; st++)
-		dumpcstate(st, cnfa->states[st], cnfa, f);
+		dumpcstate(st, cnfa, f);
 	fflush(f);
 }
 #endif
@ -1629,22 +1631,20 @@ dumpcnfa(struct cnfa * cnfa,
 */
 static void
 dumpcstate(int st,
-		   struct carc * ca,
 		   struct cnfa * cnfa,
 		   FILE *f)
 {
-	int			i;
+	struct carc * ca;
 	int			pos;

-	fprintf(f, "%d%s", st, (ca[0].co) ? ":" : ".");
+	fprintf(f, "%d%s", st, (cnfa->stflags[st] & CNFA_NOPROGRESS) ? ":" : ".");
 	pos = 1;
-	for (i = 1; ca[i].co != COLORLESS; i++)
+	for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
 	{
-		if (ca[i].co < cnfa->ncolors)
-			fprintf(f, "\t[%ld]->%d", (long) ca[i].co, ca[i].to);
+		if (ca->co < cnfa->ncolors)
+			fprintf(f, "\t[%ld]->%d", (long) ca->co, ca->to);
 		else
-			fprintf(f, "\t:%ld:->%d", (long) ca[i].co - cnfa->ncolors,
-					ca[i].to);
+			fprintf(f, "\t:%ld:->%d", (long) (ca->co - cnfa->ncolors), ca->to);
 		if (pos == 5)
 		{
 			fprintf(f, "\n");
@ -1653,7 +1653,7 @@ dumpcstate(int st,
 		else
 			pos++;
 	}
-	if (i == 1 || pos != 1)
+	if (ca == cnfa->states[st] || pos != 1)
 		fprintf(f, "\n");
 	fflush(f);
 }
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@ -162,7 +162,7 @@ static void dumparcs(struct state *, FILE *);
 static int	dumprarcs(struct arc *, struct state *, FILE *, int);
 static void dumparc(struct arc *, struct state *, FILE *);
 static void dumpcnfa(struct cnfa *, FILE *);
-static void dumpcstate(int, struct carc *, struct cnfa *, FILE *);
+static void dumpcstate(int, struct cnfa *, FILE *);
 #endif
 /* === regc_cvec.c === */
 static struct cvec *newcvec(int, int);
--- a/src/backend/regex/rege_dfa.c
+++ b/src/backend/regex/rege_dfa.c
@ -458,14 +458,14 @@ miss(struct vars * v,			/* used only for debug flags */
 	gotstate = 0;
 	for (i = 0; i < d->nstates; i++)
 		if (ISBSET(css->states, i))
-			for (ca = cnfa->states[i] + 1; ca->co != COLORLESS; ca++)
+			for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
 				if (ca->co == co)
 				{
 					BSET(d->work, ca->to);
 					gotstate = 1;
 					if (ca->to == cnfa->post)
 						ispost = 1;
-					if (!cnfa->states[ca->to]->co)
+					if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
 						noprogress = 0;
 					FDEBUG(("%d -> %d\n", i, ca->to));
 				}
@ -476,10 +476,9 @@ miss(struct vars * v,			/* used only for debug flags */
 		dolacons = 0;
 		for (i = 0; i < d->nstates; i++)
 			if (ISBSET(d->work, i))
-				for (ca = cnfa->states[i] + 1; ca->co != COLORLESS;
-					 ca++)
+				for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
 				{
-					if (ca->co <= cnfa->ncolors)
+					if (ca->co < cnfa->ncolors)
 						continue;		/* NOTE CONTINUE */
 					sawlacons = 1;
 					if (ISBSET(d->work, ca->to))
@ -490,7 +489,7 @@ miss(struct vars * v,			/* used only for debug flags */
 					dolacons = 1;
 					if (ca->to == cnfa->post)
 						ispost = 1;
-					if (!cnfa->states[ca->to]->co)
+					if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
 						noprogress = 0;
 					FDEBUG(("%d :> %d\n", i, ca->to));
 				}
--- a/src/backend/regex/regprefix.c
+++ b/src/backend/regex/regprefix.c
@ -0,0 +1,256 @@
+/*-------------------------------------------------------------------------
+ *
+ * regprefix.c
+ *	  Extract a common prefix, if any, from a compiled regex.
+ *
+ *
+ * Portions Copyright (c) 2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1998, 1999 Henry Spencer
+ *
+ * IDENTIFICATION
+ *	  src/backend/regex/regprefix.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "regex/regguts.h"
+
+
+/*
+ * forward declarations
+ */
+static int findprefix(struct cnfa * cnfa, struct colormap * cm,
+					  chr *string, size_t *slength);
+
+
+/*
+ * pg_regprefix - get common prefix for regular expression
+ *
+ * Returns one of:
+ *	REG_NOMATCH: there is no common prefix of strings matching the regex
+ *	REG_PREFIX: there is a common prefix of strings matching the regex
+ *	REG_EXACT: all strings satisfying the regex must match the same string
+ *	or a REG_XXX error code
+ *
+ * For REG_PREFIX and REG_EXACT, *string is set to a malloc'd array holding
+ * the prefix or exact value (*slength chrs, not bytes; no terminator), which
+ * the caller must free.  For any other result, no string is returned.
+ *
+ * This function does not analyze all complex cases (such as lookahead
+ * constraints) exactly.  Therefore it is possible that some strings matching
+ * the reported prefix or exact-match string do not satisfy the regex.  But
+ * it should never be the case that a string satisfying the regex does not
+ * match the reported prefix or exact-match string.
+ */
+int
+pg_regprefix(regex_t *re,
+			 chr **string,
+			 size_t *slength)
+{
+	struct guts *g;
+	struct cnfa *cnfa;
+	int			st;
+
+	/* sanity checks */
+	if (string == NULL || slength == NULL)
+		return REG_INVARG;
+	*string = NULL;				/* initialize for failure cases */
+	*slength = 0;
+	if (re == NULL || re->re_magic != REMAGIC)
+		return REG_INVARG;
+	if (re->re_csize != sizeof(chr))
+		return REG_MIXED;
+
+	/* setup */
+	g = (struct guts *) re->re_guts;
+	if (g->info & REG_UIMPOSSIBLE)
+		return REG_NOMATCH;
+
+	/*
+	 * This implementation considers only the search NFA for the topmost regex
+	 * tree node.  Therefore, constraints such as backrefs are not fully
+	 * applied, which is allowed per the function's API spec.
+	 */
+	assert(g->tree != NULL);
+	cnfa = &g->tree->cnfa;
+
+	/*
+	 * Since a correct NFA should never contain any exit-free loops, it should
+	 * not be possible for our traversal to return to a previously visited
+	 * NFA state.  Hence we need at most nstates chrs in the output string.
+	 */
+	*string = (chr *) MALLOC(cnfa->nstates * sizeof(chr));
+	if (*string == NULL)
+		return REG_ESPACE;
+
+	/* do it */
+	st = findprefix(cnfa, &g->cmap, *string, slength);
+
+	assert(*slength <= cnfa->nstates);
+
+	/* clean up */
+	if (st != REG_PREFIX && st != REG_EXACT)
+	{
+		FREE(*string);
+		*string = NULL;
+		*slength = 0;
+	}
+
+	return st;
+}
+
+/*
+ * findprefix - extract common prefix from cNFA
+ *
+ * Returns REG_NOMATCH, REG_PREFIX or REG_EXACT.  Prefix chrs are stored into
+ * the preallocated array string[] and counted in *slength (preset to zero).
+ */
+static int						/* regprefix return code */
+findprefix(struct cnfa * cnfa,
+		   struct colormap * cm,
+		   chr *string,
+		   size_t *slength)
+{
+	int			st;
+	int			nextst;
+	color		thiscolor;
+	chr			c;
+	struct carc *ca;
+
+	/*
+	 * The "pre" state must have only BOS/BOL outarcs, else pattern isn't
+	 * anchored left.  If we have both BOS and BOL, they must go to the
+	 * same next state.
+	 */
+	st = cnfa->pre;
+	nextst = -1;
+	for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
+	{
+		if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
+		{
+			if (nextst == -1)
+				nextst = ca->to;
+			else if (nextst != ca->to)
+				return REG_NOMATCH;
+		}
+		else
+			return REG_NOMATCH;
+	}
+	if (nextst == -1)
+		return REG_NOMATCH;
+
+	/*
+	 * Scan through successive states, stopping as soon as we find one with
+	 * more than one acceptable transition character (either multiple colors
+	 * on out-arcs, or a color with more than one member chr).
+	 *
+	 * We could find a state with multiple out-arcs that are all labeled with
+	 * the same singleton color; this comes from patterns like "^ab(cde|cxy)".
+	 * In that case we add the chr "c" to the output string but then exit the
+	 * loop with nextst == -1.  This leaves a little bit on the table: if the
+	 * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added
+	 * to the prefix.  But chasing multiple parallel state chains doesn't seem
+	 * worth the trouble.
+	 */
+	do
+	{
+		st = nextst;
+		nextst = -1;
+		thiscolor = COLORLESS;
+		for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
+		{
+			/* We ignore lookahead constraints */
+			if (ca->co >= cnfa->ncolors)
+				continue;
+			/* We can also ignore BOS/BOL arcs */
+			if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
+				continue;
+			/* ... but EOS/EOL arcs terminate the search */
+			if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
+			{
+				thiscolor = COLORLESS;
+				break;
+			}
+			if (thiscolor == COLORLESS)
+			{
+				/* First plain outarc */
+				thiscolor = ca->co;
+				nextst = ca->to;
+			}
+			else if (thiscolor == ca->co)
+			{
+				/* Another plain outarc for same color */
+				nextst = -1;
+			}
+			else
+			{
+				/* More than one plain outarc color terminates the search */
+				thiscolor = COLORLESS;
+				break;
+			}
+		}
+		/* Done if we didn't find exactly one color on plain outarcs */
+		if (thiscolor == COLORLESS)
+			break;
+		/* The color must be a singleton */
+		if (cm->cd[thiscolor].nchrs != 1)
+			break;
+
+		/*
+		 * Identify the color's sole member chr and add it to the prefix
+		 * string.  In general the colormap data structure doesn't provide a
+		 * way to find color member chrs, except by trying GETCOLOR() on each
+		 * possible chr value, which won't do at all.  However, for the cases
+		 * we care about it should be sufficient to test the "firstchr" value,
+		 * that is the first chr ever added to the color.  There are cases
+		 * where this might no longer be a member of the color (so we do need
+		 * to test), but none of them are likely to arise for a character that
+		 * is a member of a common prefix.  If we do hit such a corner case,
+		 * we just fall out without adding anything to the prefix string.
+		 */
+		c = cm->cd[thiscolor].firstchr;
+		if (GETCOLOR(cm, c) != thiscolor)
+			break;
+
+		string[(*slength)++] = c;
+
+		/* Advance to next state, but only if we have a unique next state */
+	} while (nextst != -1);
+
+	/*
+	 * If we ended at a state that only has EOS/EOL outarcs leading to the
+	 * "post" state, then we have an exact-match string.  Note this is true
+	 * even if the string is of zero length.
+	 */
+	nextst = -1;
+	for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
+	{
+		if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
+		{
+			if (nextst == -1)
+				nextst = ca->to;
+			else if (nextst != ca->to)
+			{
+				nextst = -1;
+				break;
+			}
+		}
+		else
+		{
+			nextst = -1;
+			break;
+		}
+	}
+	if (nextst == cnfa->post)
+		return REG_EXACT;
+
+	/*
+	 * Otherwise report a partial prefix if we collected any chrs.  If we
+	 * collected none, say NOMATCH --- the pattern is anchored left, but
+	 * doesn't specify any particular first character.
+	 */
+	if (*slength > 0)
+		return REG_PREFIX;
+
+	return REG_NOMATCH;
+}
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@ -1148,3 +1148,68 @@ build_regexp_split_result(regexp_matches_ctx *splitctx)
 								   Int32GetDatum(startpos + 1));
 	}
 }
+
+/*
+ * regexp_fixed_prefix - extract fixed prefix, if any, for a regexp
+ *
+ * The result is NULL if there is no fixed prefix, else a palloc'd string in
+ * the database encoding; *exact is set TRUE only for an exact match.
+ */
+char *
+regexp_fixed_prefix(text *text_re, bool case_insensitive,
+					bool *exact)
+{
+	char	   *result;
+	regex_t    *re;
+	int			cflags;
+	int			re_result;
+	pg_wchar   *str;
+	size_t		slen;
+	size_t		maxlen;
+	char		errMsg[100];
+
+	*exact = false;				/* default result */
+
+	/* Compile RE */
+	cflags = REG_ADVANCED;
+	if (case_insensitive)
+		cflags |= REG_ICASE;
+
+	re = RE_compile_and_cache(text_re, cflags);
+
+	/* Examine it to see if there's a fixed prefix */
+	re_result = pg_regprefix(re, &str, &slen);
+
+	switch (re_result)
+	{
+		case REG_NOMATCH:
+			return NULL;
+
+		case REG_PREFIX:
+			/* continue with wchar conversion */
+			break;
+
+		case REG_EXACT:
+			*exact = true;
+			/* continue with wchar conversion */
+			break;
+
+		default:
+			/* re failed??? */
+			pg_regerror(re_result, re, errMsg, sizeof(errMsg));
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
+					 errmsg("regular expression failed: %s", errMsg)));
+			break;
+	}
+
+	/* Convert pg_wchar result back to database encoding */
+	maxlen = pg_database_encoding_max_length() * slen + 1;
+	result = (char *) palloc(maxlen);
+	slen = pg_wchar2mb_with_len(str, result, slen);
+	Assert(slen < maxlen);
+
+	free(str);					/* malloc'd by pg_regprefix, so not pfree() */
+
+	return result;
+}
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@ -178,7 +178,8 @@ static Selectivity prefix_selectivity(PlannerInfo *root,
 static Selectivity like_selectivity(const char *patt, int pattlen,
 									bool case_insensitive);
 static Selectivity regex_selectivity(const char *patt, int pattlen,
-									 bool case_insensitive);
+									 bool case_insensitive,
+									 int fixed_prefix_len);
 static Datum string_to_datum(const char *str, Oid datatype);
 static Const *string_to_const(const char *str, Oid datatype);
 static Const *string_to_bytea_const(const char *str, size_t str_len);
@ -4922,16 +4923,9 @@ static Pattern_Prefix_Status
 regex_fixed_prefix(Const *patt_const, bool case_insensitive,
 				   Const **prefix_const, Selectivity *rest_selec)
 {
-	char	   *match;
-	int			pos,
-				match_pos,
-				prev_pos,
-				prev_match_pos;
-	bool		have_leading_paren;
-	char	   *patt;
-	char	   *rest;
 	Oid			typeid = patt_const->consttype;
-	bool		is_multibyte = (pg_database_encoding_max_length() > 1);
+	char	   *prefix;
+	bool		exact;

 	/*
 	 * Should be unnecessary, there are no bytea regex operators defined. As
@ -4943,170 +4937,54 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 		 errmsg("regular-expression matching not supported on type bytea")));

-	/* the right-hand const is type text for all of these */
-	patt = TextDatumGetCString(patt_const->constvalue);
+	/* Use the regexp machinery to extract the prefix, if any */
+	prefix = regexp_fixed_prefix(DatumGetTextPP(patt_const->constvalue),
+								 case_insensitive,
+								 &exact);

-	/*
-	 * Check for ARE director prefix.  It's worth our trouble to recognize
-	 * this because similar_escape() used to use it, and some other code might
-	 * still use it, to force ARE mode.
-	 */
-	pos = 0;
-	if (strncmp(patt, "***:", 4) == 0)
-		pos = 4;
-
-	/* Pattern must be anchored left */
-	if (patt[pos] != '^')
+	if (prefix == NULL)
 	{
 		*prefix_const = NULL;

 		if (rest_selec != NULL)
+		{
+			char   *patt = TextDatumGetCString(patt_const->constvalue);
+
 			*rest_selec = regex_selectivity(patt, strlen(patt),
-											case_insensitive);
-
-		return Pattern_Prefix_None;
-	}
-	pos++;
-
-	/*
-	 * If '|' is present in pattern, then there may be multiple alternatives
-	 * for the start of the string.  (There are cases where this isn't so, for
-	 * instance if the '|' is inside parens, but detecting that reliably is
-	 * too hard.)
-	 */
-	if (strchr(patt + pos, '|') != NULL)
-	{
-		*prefix_const = NULL;
-
-		if (rest_selec != NULL)
-			*rest_selec = regex_selectivity(patt, strlen(patt),
-											case_insensitive);
+											case_insensitive,
+											0);
+			pfree(patt);
+		}

 		return Pattern_Prefix_None;
 	}

-	/* OK, allocate space for pattern */
-	match = palloc(strlen(patt) + 1);
-	prev_match_pos = match_pos = 0;
-
-	/*
-	 * We special-case the syntax '^(...)$' because psql uses it.  But beware:
-	 * sequences beginning "(?" are not what they seem, unless they're "(?:".
-	 * (We must recognize that because of similar_escape().)
-	 */
-	have_leading_paren = false;
-	if (patt[pos] == '(' &&
-		(patt[pos + 1] != '?' || patt[pos + 2] == ':'))
-	{
-		have_leading_paren = true;
-		pos += (patt[pos + 1] != '?' ? 1 : 3);
-	}
-
-	/* Scan remainder of pattern */
-	prev_pos = pos;
-	while (patt[pos])
-	{
-		int			len;
-
-		/*
-		 * Check for characters that indicate multiple possible matches here.
-		 * Also, drop out at ')' or '$' so the termination test works right.
-		 */
-		if (patt[pos] == '.' ||
-			patt[pos] == '(' ||
-			patt[pos] == ')' ||
-			patt[pos] == '[' ||
-			patt[pos] == '^' ||
-			patt[pos] == '$')
-			break;
-
-		/*
-		 * XXX In multibyte character sets, we can't trust isalpha, so assume
-		 * any multibyte char is potentially case-varying.
-		 */
-		if (case_insensitive)
-		{
-			if (is_multibyte && (unsigned char) patt[pos] >= 0x80)
-				break;
-			if (isalpha((unsigned char) patt[pos]))
-				break;
-		}
-
-		/*
-		 * Check for quantifiers.  Except for +, this means the preceding
-		 * character is optional, so we must remove it from the prefix too!
-		 */
-		if (patt[pos] == '*' ||
-			patt[pos] == '?' ||
-			patt[pos] == '{')
-		{
-			match_pos = prev_match_pos;
-			pos = prev_pos;
-			break;
-		}
-		if (patt[pos] == '+')
-		{
-			pos = prev_pos;
-			break;
-		}
-
-		/*
-		 * Normally, backslash quotes the next character.  But in AREs,
-		 * backslash followed by alphanumeric is an escape, not a quoted
-		 * character.  Must treat it as having multiple possible matches.
-		 * Note: since only ASCII alphanumerics are escapes, we don't have to
-		 * be paranoid about multibyte here.
-		 */
-		if (patt[pos] == '\\')
-		{
-			if (isalnum((unsigned char) patt[pos + 1]))
-				break;
-			pos++;
-			if (patt[pos] == '\0')
-				break;
-		}
-		/* save position in case we need to back up on next loop cycle */
-		prev_match_pos = match_pos;
-		prev_pos = pos;
-		/* must use encoding-aware processing here */
-		len = pg_mblen(&patt[pos]);
-		memcpy(&match[match_pos], &patt[pos], len);
-		match_pos += len;
-		pos += len;
-	}
-
-	match[match_pos] = '\0';
-	rest = &patt[pos];
-
-	if (have_leading_paren && patt[pos] == ')')
-		pos++;
-
-	if (patt[pos] == '$' && patt[pos + 1] == '\0')
-	{
-		*prefix_const = string_to_const(match, typeid);
-
-		if (rest_selec != NULL)
-			*rest_selec = 1.0;
-
-		pfree(patt);
-		pfree(match);
-
-		return Pattern_Prefix_Exact;	/* pattern specifies exact match */
-	}
-
-	*prefix_const = string_to_const(match, typeid);
+	*prefix_const = string_to_const(prefix, typeid);

 	if (rest_selec != NULL)
-		*rest_selec = regex_selectivity(rest, strlen(rest),
-										case_insensitive);
+	{
+		if (exact)
+		{
+			/* Exact match, so there's no additional selectivity */
+			*rest_selec = 1.0;
+		}
+		else
+		{
+			char   *patt = TextDatumGetCString(patt_const->constvalue);

-	pfree(patt);
-	pfree(match);
+			*rest_selec = regex_selectivity(patt, strlen(patt),
+											case_insensitive,
+											strlen(prefix));
+			pfree(patt);
+		}
+	}

-	if (match_pos > 0)
+	pfree(prefix);
+
+	if (exact)
+		return Pattern_Prefix_Exact;	/* pattern specifies exact match */
+	else
 		return Pattern_Prefix_Partial;
-
-	return Pattern_Prefix_None;
 }

 Pattern_Prefix_Status
@ -5387,7 +5265,8 @@ regex_selectivity_sub(const char *patt, int pattlen, bool case_insensitive)
 }

 static Selectivity
-regex_selectivity(const char *patt, int pattlen, bool case_insensitive)
+regex_selectivity(const char *patt, int pattlen, bool case_insensitive,
+				  int fixed_prefix_len)
 {
 	Selectivity sel;

@ -5403,9 +5282,14 @@ regex_selectivity(const char *patt, int pattlen, bool case_insensitive)
 		/* no trailing $ */
 		sel = regex_selectivity_sub(patt, pattlen, case_insensitive);
 		sel *= FULL_WILDCARD_SEL;
-		if (sel > 1.0)
-			sel = 1.0;
 	}
+
+	/* If there's a fixed prefix, discount its selectivity */
+	if (fixed_prefix_len > 0)
+		sel /= pow(FIXED_CHAR_SEL, fixed_prefix_len);
+
+	/* Make sure result stays in range */
+	CLAMP_PROBABILITY(sel);
 	return sel;
 }

--- a/src/include/regex/regex.h
+++ b/src/include/regex/regex.h
@ -155,6 +155,9 @@ typedef struct
 /* two specials for debugging and testing */
 #define REG_ATOI	101			/* convert error-code name to number */
 #define REG_ITOA	102			/* convert error-code number to name */
+/* non-error result codes for pg_regprefix */
+#define REG_PREFIX	(-1)		/* identified a common prefix */
+#define REG_EXACT	(-2)		/* identified an exact match */



@ -163,6 +166,7 @@ typedef struct
 */
 extern int	pg_regcomp(regex_t *, const pg_wchar *, size_t, int);
 extern int	pg_regexec(regex_t *, const pg_wchar *, size_t, size_t, rm_detail_t *, size_t, regmatch_t[], int);
+extern int	pg_regprefix(regex_t *, pg_wchar **, size_t *);
 extern void pg_regfree(regex_t *);
 extern size_t pg_regerror(int, const regex_t *, char *, size_t);

--- a/src/include/regex/regguts.h
+++ b/src/include/regex/regguts.h
@ -188,6 +188,7 @@ struct colordesc
 	color		sub;			/* open subcolor (if any); free chain ptr */
 #define  NOSUB	 COLORLESS
 	struct arc *arcs;			/* color chain */
+	chr			firstchr;		/* char first assigned to this color */
 	int			flags;
 #define  FREECOL 01				/* currently free */
 #define  PSEUDO  02				/* pseudocolor, no real chars */
@ -255,15 +256,14 @@ struct state;

 struct arc
 {
-	int			type;
-#define  ARCFREE '\0'
+	int			type;			/* 0 if free, else an NFA arc type code */
 	color		co;
 	struct state *from;			/* where it's from (and contained within) */
 	struct state *to;			/* where it's to */
-	struct arc *outchain;		/* *from's outs chain or free chain */
+	struct arc *outchain;		/* link in *from's outs chain or free chain */
 #define  freechain	 outchain
-	struct arc *inchain;		/* *to's ins chain */
-	struct arc *colorchain;		/* color's arc chain */
+	struct arc *inchain;		/* link in *to's ins chain */
+	struct arc *colorchain;		/* link in color's arc chain */
 	struct arc *colorchainRev;	/* back-link in color's arc chain */
 };

@ -315,24 +315,38 @@ struct nfa

 /*
 * definitions for compacted NFA
+ *
+ * The main space savings in a compacted NFA is from making the arcs as small
+ * as possible.  We store only the transition color and next-state number for
+ * each arc.  The list of out arcs for each state is an array beginning at
+ * cnfa.states[statenumber], and terminated by a dummy carc struct with
+ * co == COLORLESS.
+ *
+ * The non-dummy carc structs are of two types: plain arcs and LACON arcs.
+ * Plain arcs just store the transition color number as "co".  LACON arcs
+ * store the lookahead constraint number plus cnfa.ncolors as "co".  LACON
+ * arcs can be distinguished from plain by testing for co >= cnfa.ncolors.
 */
 struct carc
 {
 	color		co;				/* COLORLESS is list terminator */
-	int			to;				/* state number */
+	int			to;				/* next-state number */
 };

 struct cnfa
 {
 	int			nstates;		/* number of states */
-	int			ncolors;		/* number of colors */
+	int			ncolors;		/* number of colors (max color in use + 1) */
 	int			flags;
-#define  HASLACONS	 01			/* uses lookahead constraints */
+#define  HASLACONS	01			/* uses lookahead constraints */
 	int			pre;			/* setup state number */
 	int			post;			/* teardown state number */
 	color		bos[2];			/* colors, if any, assigned to BOS and BOL */
 	color		eos[2];			/* colors, if any, assigned to EOS and EOL */
+	char	   *stflags;		/* vector of per-state flags bytes */
+#define  CNFA_NOPROGRESS	01	/* flag bit for a no-progress state */
 	struct carc **states;		/* vector of pointers to outarc lists */
+	/* states[n] are pointers into a single malloc'd array of arcs */
 	struct carc *arcs;			/* the area for the lists */
 };

--- a/src/include/utils/builtins.h
+++ b/src/include/utils/builtins.h
@ -530,6 +530,8 @@ extern Datum regexp_split_to_table(PG_FUNCTION_ARGS);
 extern Datum regexp_split_to_table_no_flags(PG_FUNCTION_ARGS);
 extern Datum regexp_split_to_array(PG_FUNCTION_ARGS);
 extern Datum regexp_split_to_array_no_flags(PG_FUNCTION_ARGS);
+extern char *regexp_fixed_prefix(text *text_re, bool case_insensitive,
+								 bool *exact);

 /* regproc.c */
 extern Datum regprocin(PG_FUNCTION_ARGS);