Fix performance issues in replace_text(), replace_text_regexp(), and

text_to_array(): they all had O(N^2) behavior on long input strings in multibyte encodings, because of repeated rescanning of the input text to identify substrings whose positions/lengths were computed in characters instead of bytes. Fix by tracking the current source position as a char pointer as well as a character-count. Also avoid some unnecessary palloc operations. text_to_array() also leaked memory intracall due to failure to pfree temporary strings. Per gripe from Tatsuo Ishii.
2025-01-30 19:00:29 +08:00 · 2006-11-08 19:22:25 +00:00 · 2006-11-08 19:22:25 +00:00 · a5cf12e2ef
commit a5cf12e2ef
parent 9d6f26325f
1 changed files with 129 additions and 54 deletions
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.152 2006/10/07 00:11:53 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.153 2006/11/08 19:22:25 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -21,6 +21,7 @@
 #include "catalog/pg_type.h"
 #include "libpq/md5.h"
 #include "libpq/pqformat.h"
+#include "miscadmin.h"
 #include "parser/scansup.h"
 #include "regex/regex.h"
 #include "utils/builtins.h"
@ -477,6 +478,32 @@ textcat(PG_FUNCTION_ARGS)
 	PG_RETURN_TEXT_P(result);
 }

+/*
+ * charlen_to_bytelen()
+ *	Compute the number of bytes occupied by n characters starting at *p
+ *
+ * It is caller's responsibility that there actually are n characters;
+ * the string need not be null-terminated.
+ */
+static int
+charlen_to_bytelen(const char *p, int n)
+{
+	if (pg_database_encoding_max_length() == 1)
+	{
+		/* Optimization for single-byte encodings */
+		return n;
+	}
+	else
+	{
+		const char *s;
+
+		for (s = p; n > 0; n--)
+			s += pg_mblen(s);
+
+		return s - p;
+	}
+}
+
 /*
 * text_substr()
 * Return a substring starting at the specified position.
@ -534,6 +561,8 @@ text_substr_no_len(PG_FUNCTION_ARGS)
 *	functions.	Note that the argument is passed as a Datum, to indicate that
 *	it may still be in compressed/toasted form.  We can avoid detoasting all
 *	of it in some cases.
+ *
+ *	The result is always a freshly palloc'd datum.
 */
 static text *
 text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
@ -649,11 +678,23 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
 			 */
 			slice_size = (S1 + L1) * eml;
 		}
-		slice = DatumGetTextPSlice(str, slice_start, slice_size);
+
+		/*
+		 * If we're working with an untoasted source, no need to do an
+		 * extra copying step.
+		 */
+		if (VARATT_IS_EXTENDED(str))
+			slice = DatumGetTextPSlice(str, slice_start, slice_size);
+		else
+			slice = (text *) DatumGetPointer(str);

 		/* see if we got back an empty string */
 		if ((VARSIZE(slice) - VARHDRSZ) == 0)
+		{
+			if (slice != (text *) DatumGetPointer(str))
+				pfree(slice);
 			return PG_STR_GET_TEXT("");
+		}

 		/* Now we can get the actual length of the slice in MB characters */
 		slice_strlen = pg_mbstrlen_with_len(VARDATA(slice), VARSIZE(slice) - VARHDRSZ);
@ -663,7 +704,11 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
 		 * says to return a zero-length string.
 		 */
 		if (S1 > slice_strlen)
+		{
+			if (slice != (text *) DatumGetPointer(str))
+				pfree(slice);
 			return PG_STR_GET_TEXT("");
+		}

 		/*
 		 * Adjust L1 and E1 now that we know the slice string length. Again
@ -695,6 +740,9 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
 		VARATT_SIZEP(ret) = VARHDRSZ + (p - s);
 		memcpy(VARDATA(ret), s, (p - s));

+		if (slice != (text *) DatumGetPointer(str))
+			pfree(slice);
+
 		return ret;
 	}
 	else
@ -2076,10 +2124,11 @@ replace_text(PG_FUNCTION_ARGS)
 	int			src_text_len = TEXTLEN(src_text);
 	int			from_sub_text_len = TEXTLEN(from_sub_text);
 	TextPositionState state;
-	text	   *chunk_text;
 	text	   *ret_text;
 	int			start_posn;
 	int			curr_posn;
+	int			chunk_len;
+	char	   *start_ptr;
 	StringInfoData str;

 	if (src_text_len == 0 || from_sub_text_len == 0)
@ -2097,31 +2146,31 @@ replace_text(PG_FUNCTION_ARGS)
 		PG_RETURN_TEXT_P(src_text);
 	}

+	/* start_ptr points to the start_posn'th character of src_text */
+	start_ptr = (char *) VARDATA(src_text);
+
 	initStringInfo(&str);

 	do
 	{
-		chunk_text = text_substring(PointerGetDatum(src_text),
-									start_posn,
-									curr_posn - start_posn,
-									false);
-		appendStringInfoText(&str, chunk_text);
-		pfree(chunk_text);
+		/* copy the data skipped over by last text_position_next() */
+		chunk_len = charlen_to_bytelen(start_ptr, curr_posn - start_posn);
+		appendBinaryStringInfo(&str, start_ptr, chunk_len);

 		appendStringInfoText(&str, to_sub_text);

-		start_posn = curr_posn + from_sub_text_len;
+		start_posn = curr_posn;
+		start_ptr += chunk_len;
+		start_posn += from_sub_text_len;
+		start_ptr += charlen_to_bytelen(start_ptr, from_sub_text_len);
+
 		curr_posn = text_position_next(start_posn, &state);
 	}
 	while (curr_posn > 0);

-	/* copy trailing chunk */
-	chunk_text = text_substring(PointerGetDatum(src_text),
-								start_posn,
-								-1,
-								true);
-	appendStringInfoText(&str, chunk_text);
-	pfree(chunk_text);
+	/* copy trailing data */
+	chunk_len = ((char *) src_text + VARSIZE(src_text)) - start_ptr;
+	appendBinaryStringInfo(&str, start_ptr, chunk_len);

 	text_position_cleanup(&state);

@ -2166,11 +2215,13 @@ check_replace_text_has_escape_char(const text *replace_text)
 * appendStringInfoRegexpSubstr
 *
 * Append replace_text to str, substituting regexp back references for
- * \n escapes.
+ * \n escapes.  start_ptr is the start of the match in the source string,
+ * at logical character position data_pos.
 */
 static void
 appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
-							 regmatch_t *pmatch, text *src_text)
+							 regmatch_t *pmatch,
+							 char *start_ptr, int data_pos)
 {
 	const char *p = VARDATA(replace_text);
 	const char *p_end = p + (VARSIZE(replace_text) - VARHDRSZ);
@ -2247,16 +2298,17 @@ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
 		if (so != -1 && eo != -1)
 		{
 			/*
-			 * Copy the text that is back reference of regexp.	Because so and
-			 * eo are counted in characters not bytes, it's easiest to use
-			 * text_substring to pull out the correct chunk of text.
+			 * Copy the text that is back reference of regexp.	Note so and
+			 * eo are counted in characters not bytes.
 			 */
-			text	   *append_text;
+			char	   *chunk_start;
+			int			chunk_len;

-			append_text = text_substring(PointerGetDatum(src_text),
-										 so + 1, (eo - so), false);
-			appendStringInfoText(str, append_text);
-			pfree(append_text);
+			Assert(so >= data_pos);
+			chunk_start = start_ptr;
+			chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
+			chunk_len = charlen_to_bytelen(chunk_start, eo - so);
+			appendBinaryStringInfo(str, chunk_start, chunk_len);
 		}
 	}
 }
@ -2284,6 +2336,7 @@ replace_text_regexp(text *src_text, void *regexp,
 	size_t		data_len;
 	int			search_start;
 	int			data_pos;
+	char	   *start_ptr;
 	bool		have_escape;

 	initStringInfo(&buf);
@ -2295,10 +2348,17 @@ replace_text_regexp(text *src_text, void *regexp,
 	/* Check whether replace_text has escape char. */
 	have_escape = check_replace_text_has_escape_char(replace_text);

-	for (search_start = data_pos = 0; search_start <= data_len;)
+	/* start_ptr points to the data_pos'th character of src_text */
+	start_ptr = (char *) VARDATA(src_text);
+	data_pos = 0;
+
+	search_start = 0;
+	while (search_start <= data_len)
 	{
 		int			regexec_result;

+		CHECK_FOR_INTERRUPTS();
+
 		regexec_result = pg_regexec(re,
 									data,
 									data_len,
@ -2322,20 +2382,22 @@ replace_text_regexp(text *src_text, void *regexp,
 		}

 		/*
-		 * Copy the text to the left of the match position.  Because we are
-		 * working with character not byte indexes, it's easiest to use
-		 * text_substring to pull out the needed data.
+		 * Copy the text to the left of the match position.  Note we are
+		 * given character not byte indexes.
 		 */
 		if (pmatch[0].rm_so - data_pos > 0)
 		{
-			text	   *left_text;
+			int			chunk_len;

-			left_text = text_substring(PointerGetDatum(src_text),
-									   data_pos + 1,
-									   pmatch[0].rm_so - data_pos,
-									   false);
-			appendStringInfoText(&buf, left_text);
-			pfree(left_text);
+			chunk_len = charlen_to_bytelen(start_ptr,
+										   pmatch[0].rm_so - data_pos);
+			appendBinaryStringInfo(&buf, start_ptr, chunk_len);
+			/*
+			 * Advance start_ptr over that text, to avoid multiple rescans
+			 * of it if the replace_text contains multiple back-references.
+			 */
+			start_ptr += chunk_len;
+			data_pos = pmatch[0].rm_so;
 		}

 		/*
@ -2343,11 +2405,15 @@ replace_text_regexp(text *src_text, void *regexp,
 		 * replace_text has escape characters.
 		 */
 		if (have_escape)
-			appendStringInfoRegexpSubstr(&buf, replace_text, pmatch, src_text);
+			appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
+										 start_ptr, data_pos);
 		else
 			appendStringInfoText(&buf, replace_text);

-		search_start = data_pos = pmatch[0].rm_eo;
+		/* Advance start_ptr and data_pos over the matched text. */
+		start_ptr += charlen_to_bytelen(start_ptr,
+										pmatch[0].rm_eo - data_pos);
+		data_pos = pmatch[0].rm_eo;

 		/*
 		 * When global option is off, replace the first instance only.
@ -2358,6 +2424,7 @@ replace_text_regexp(text *src_text, void *regexp,
 		/*
 		 * Search from next character when the matching text is zero width.
 		 */
+		search_start = data_pos;
 		if (pmatch[0].rm_so == pmatch[0].rm_eo)
 			search_start++;
 	}
@ -2367,12 +2434,10 @@ replace_text_regexp(text *src_text, void *regexp,
 	 */
 	if (data_pos < data_len)
 	{
-		text	   *right_text;
+		int			chunk_len;

-		right_text = text_substring(PointerGetDatum(src_text),
-									data_pos + 1, -1, true);
-		appendStringInfoText(&buf, right_text);
-		pfree(right_text);
+		chunk_len = ((char *) src_text + VARSIZE(src_text)) - start_ptr;
+		appendBinaryStringInfo(&buf, start_ptr, chunk_len);
 	}

 	ret_text = PG_STR_GET_TEXT(buf.data);
@ -2488,6 +2553,8 @@ text_to_array(PG_FUNCTION_ARGS)
 	int			fldnum;
 	int			start_posn;
 	int			end_posn;
+	int			chunk_len;
+	char	   *start_ptr;
 	text	   *result_text;
 	ArrayBuildState *astate = NULL;

@ -2506,6 +2573,9 @@ text_to_array(PG_FUNCTION_ARGS)
 	text_position_setup(inputstring, fldsep, &state);

 	start_posn = 1;
+	/* start_ptr points to the start_posn'th character of inputstring */
+	start_ptr = (char *) VARDATA(inputstring);
+
 	for (fldnum = 1;; fldnum++) /* field number is 1 based */
 	{
 		end_posn = text_position_next(start_posn, &state);
@ -2513,20 +2583,19 @@ text_to_array(PG_FUNCTION_ARGS)
 		if (end_posn == 0)
 		{
 			/* fetch last field */
-			result_text = text_substring(PointerGetDatum(inputstring),
-										 start_posn,
-										 -1,
-										 true);
+			chunk_len = ((char *) inputstring + VARSIZE(inputstring)) - start_ptr;
 		}
 		else
 		{
 			/* fetch non-last field */
-			result_text = text_substring(PointerGetDatum(inputstring),
-										 start_posn,
-										 end_posn - start_posn,
-										 false);
+			chunk_len = charlen_to_bytelen(start_ptr, end_posn - start_posn);
 		}

+		/* must build a temp text datum to pass to accumArrayResult */
+		result_text = (text *) palloc(VARHDRSZ + chunk_len);
+		VARATT_SIZEP(result_text) = VARHDRSZ + chunk_len;
+		memcpy(VARDATA(result_text), start_ptr, chunk_len);
+
 		/* stash away this field */
 		astate = accumArrayResult(astate,
 								  PointerGetDatum(result_text),
@ -2534,9 +2603,15 @@ text_to_array(PG_FUNCTION_ARGS)
 								  TEXTOID,
 								  CurrentMemoryContext);

+		pfree(result_text);
+
 		if (end_posn == 0)
 			break;
-		start_posn = end_posn + fldsep_len;
+
+		start_posn = end_posn;
+		start_ptr += chunk_len;
+		start_posn += fldsep_len;
+		start_ptr += charlen_to_bytelen(start_ptr, fldsep_len);
 	}

 	text_position_cleanup(&state);