mirror of
https://git.postgresql.org/git/postgresql.git
synced 2025-01-30 19:00:29 +08:00
Fix performance issues in replace_text(), replace_text_regexp(), and text_to_array().
All three had O(N^2) behavior on long input strings in multibyte encodings, because they repeatedly rescanned the input text to locate substrings whose positions and lengths were computed in characters rather than bytes. Fix by tracking the current source position as a char pointer as well as a character count. Also avoid some unnecessary palloc operations. text_to_array() also leaked memory within a single call, because it failed to pfree its temporary strings. Per gripe from Tatsuo Ishii.
This commit is contained in:
parent
9d6f26325f
commit
a5cf12e2ef
@ -8,7 +8,7 @@
|
|||||||
*
|
*
|
||||||
*
|
*
|
||||||
* IDENTIFICATION
|
* IDENTIFICATION
|
||||||
* $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.152 2006/10/07 00:11:53 tgl Exp $
|
* $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.153 2006/11/08 19:22:25 tgl Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
@ -21,6 +21,7 @@
|
|||||||
#include "catalog/pg_type.h"
|
#include "catalog/pg_type.h"
|
||||||
#include "libpq/md5.h"
|
#include "libpq/md5.h"
|
||||||
#include "libpq/pqformat.h"
|
#include "libpq/pqformat.h"
|
||||||
|
#include "miscadmin.h"
|
||||||
#include "parser/scansup.h"
|
#include "parser/scansup.h"
|
||||||
#include "regex/regex.h"
|
#include "regex/regex.h"
|
||||||
#include "utils/builtins.h"
|
#include "utils/builtins.h"
|
||||||
@ -477,6 +478,32 @@ textcat(PG_FUNCTION_ARGS)
|
|||||||
PG_RETURN_TEXT_P(result);
|
PG_RETURN_TEXT_P(result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* charlen_to_bytelen()
|
||||||
|
* Compute the number of bytes occupied by n characters starting at p
|
||||||
|
*
|
||||||
|
* It is caller's responsibility that there actually are n characters;
|
||||||
|
* the string need not be null-terminated.
*
* When pg_database_encoding_max_length() is 1 the result is simply n.
* Otherwise the string is walked forward once per character, advancing
* by pg_mblen() at each step, and the distance covered (s - p) is
* returned.  No check is made against the end of the string, which is
* why the caller must guarantee that n characters are present.
|
||||||
|
*/
|
||||||
|
static int
|
||||||
|
charlen_to_bytelen(const char *p, int n)
|
||||||
|
{
|
||||||
|
if (pg_database_encoding_max_length() == 1)
|
||||||
|
{
|
||||||
|
/* Optimization for single-byte encodings: n characters are n bytes */
|
||||||
|
return n;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
const char *s;
|
||||||
|
||||||
|
|
||||||
|
for (s = p; n > 0; n--)		/* one iteration per character to skip */
|
||||||
|
s += pg_mblen(s);
|
||||||
|
|
||||||
|
return s - p;		/* byte distance from p to the scan position */
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* text_substr()
|
* text_substr()
|
||||||
* Return a substring starting at the specified position.
|
* Return a substring starting at the specified position.
|
||||||
@ -534,6 +561,8 @@ text_substr_no_len(PG_FUNCTION_ARGS)
|
|||||||
* functions. Note that the argument is passed as a Datum, to indicate that
|
* functions. Note that the argument is passed as a Datum, to indicate that
|
||||||
* it may still be in compressed/toasted form. We can avoid detoasting all
|
* it may still be in compressed/toasted form. We can avoid detoasting all
|
||||||
* of it in some cases.
|
* of it in some cases.
|
||||||
|
*
|
||||||
|
* The result is always a freshly palloc'd datum.
|
||||||
*/
|
*/
|
||||||
static text *
|
static text *
|
||||||
text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
|
text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
|
||||||
@ -649,11 +678,23 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
|
|||||||
*/
|
*/
|
||||||
slice_size = (S1 + L1) * eml;
|
slice_size = (S1 + L1) * eml;
|
||||||
}
|
}
|
||||||
slice = DatumGetTextPSlice(str, slice_start, slice_size);
|
|
||||||
|
/*
|
||||||
|
* If we're working with an untoasted source, no need to do an
|
||||||
|
* extra copying step.
|
||||||
|
*/
|
||||||
|
if (VARATT_IS_EXTENDED(str))
|
||||||
|
slice = DatumGetTextPSlice(str, slice_start, slice_size);
|
||||||
|
else
|
||||||
|
slice = (text *) DatumGetPointer(str);
|
||||||
|
|
||||||
/* see if we got back an empty string */
|
/* see if we got back an empty string */
|
||||||
if ((VARSIZE(slice) - VARHDRSZ) == 0)
|
if ((VARSIZE(slice) - VARHDRSZ) == 0)
|
||||||
|
{
|
||||||
|
if (slice != (text *) DatumGetPointer(str))
|
||||||
|
pfree(slice);
|
||||||
return PG_STR_GET_TEXT("");
|
return PG_STR_GET_TEXT("");
|
||||||
|
}
|
||||||
|
|
||||||
/* Now we can get the actual length of the slice in MB characters */
|
/* Now we can get the actual length of the slice in MB characters */
|
||||||
slice_strlen = pg_mbstrlen_with_len(VARDATA(slice), VARSIZE(slice) - VARHDRSZ);
|
slice_strlen = pg_mbstrlen_with_len(VARDATA(slice), VARSIZE(slice) - VARHDRSZ);
|
||||||
@ -663,7 +704,11 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
|
|||||||
* says to return a zero-length string.
|
* says to return a zero-length string.
|
||||||
*/
|
*/
|
||||||
if (S1 > slice_strlen)
|
if (S1 > slice_strlen)
|
||||||
|
{
|
||||||
|
if (slice != (text *) DatumGetPointer(str))
|
||||||
|
pfree(slice);
|
||||||
return PG_STR_GET_TEXT("");
|
return PG_STR_GET_TEXT("");
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Adjust L1 and E1 now that we know the slice string length. Again
|
* Adjust L1 and E1 now that we know the slice string length. Again
|
||||||
@ -695,6 +740,9 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
|
|||||||
VARATT_SIZEP(ret) = VARHDRSZ + (p - s);
|
VARATT_SIZEP(ret) = VARHDRSZ + (p - s);
|
||||||
memcpy(VARDATA(ret), s, (p - s));
|
memcpy(VARDATA(ret), s, (p - s));
|
||||||
|
|
||||||
|
if (slice != (text *) DatumGetPointer(str))
|
||||||
|
pfree(slice);
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@ -2076,10 +2124,11 @@ replace_text(PG_FUNCTION_ARGS)
|
|||||||
int src_text_len = TEXTLEN(src_text);
|
int src_text_len = TEXTLEN(src_text);
|
||||||
int from_sub_text_len = TEXTLEN(from_sub_text);
|
int from_sub_text_len = TEXTLEN(from_sub_text);
|
||||||
TextPositionState state;
|
TextPositionState state;
|
||||||
text *chunk_text;
|
|
||||||
text *ret_text;
|
text *ret_text;
|
||||||
int start_posn;
|
int start_posn;
|
||||||
int curr_posn;
|
int curr_posn;
|
||||||
|
int chunk_len;
|
||||||
|
char *start_ptr;
|
||||||
StringInfoData str;
|
StringInfoData str;
|
||||||
|
|
||||||
if (src_text_len == 0 || from_sub_text_len == 0)
|
if (src_text_len == 0 || from_sub_text_len == 0)
|
||||||
@ -2097,31 +2146,31 @@ replace_text(PG_FUNCTION_ARGS)
|
|||||||
PG_RETURN_TEXT_P(src_text);
|
PG_RETURN_TEXT_P(src_text);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* start_ptr points to the start_posn'th character of src_text */
|
||||||
|
start_ptr = (char *) VARDATA(src_text);
|
||||||
|
|
||||||
initStringInfo(&str);
|
initStringInfo(&str);
|
||||||
|
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
chunk_text = text_substring(PointerGetDatum(src_text),
|
/* copy the data skipped over by last text_position_next() */
|
||||||
start_posn,
|
chunk_len = charlen_to_bytelen(start_ptr, curr_posn - start_posn);
|
||||||
curr_posn - start_posn,
|
appendBinaryStringInfo(&str, start_ptr, chunk_len);
|
||||||
false);
|
|
||||||
appendStringInfoText(&str, chunk_text);
|
|
||||||
pfree(chunk_text);
|
|
||||||
|
|
||||||
appendStringInfoText(&str, to_sub_text);
|
appendStringInfoText(&str, to_sub_text);
|
||||||
|
|
||||||
start_posn = curr_posn + from_sub_text_len;
|
start_posn = curr_posn;
|
||||||
|
start_ptr += chunk_len;
|
||||||
|
start_posn += from_sub_text_len;
|
||||||
|
start_ptr += charlen_to_bytelen(start_ptr, from_sub_text_len);
|
||||||
|
|
||||||
curr_posn = text_position_next(start_posn, &state);
|
curr_posn = text_position_next(start_posn, &state);
|
||||||
}
|
}
|
||||||
while (curr_posn > 0);
|
while (curr_posn > 0);
|
||||||
|
|
||||||
/* copy trailing chunk */
|
/* copy trailing data */
|
||||||
chunk_text = text_substring(PointerGetDatum(src_text),
|
chunk_len = ((char *) src_text + VARSIZE(src_text)) - start_ptr;
|
||||||
start_posn,
|
appendBinaryStringInfo(&str, start_ptr, chunk_len);
|
||||||
-1,
|
|
||||||
true);
|
|
||||||
appendStringInfoText(&str, chunk_text);
|
|
||||||
pfree(chunk_text);
|
|
||||||
|
|
||||||
text_position_cleanup(&state);
|
text_position_cleanup(&state);
|
||||||
|
|
||||||
@ -2166,11 +2215,13 @@ check_replace_text_has_escape_char(const text *replace_text)
|
|||||||
* appendStringInfoRegexpSubstr
|
* appendStringInfoRegexpSubstr
|
||||||
*
|
*
|
||||||
* Append replace_text to str, substituting regexp back references for
|
* Append replace_text to str, substituting regexp back references for
|
||||||
* \n escapes.
|
* \n escapes. start_ptr is the start of the match in the source string,
|
||||||
|
* at logical character position data_pos.
|
||||||
*/
|
*/
|
||||||
static void
|
static void
|
||||||
appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
|
appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
|
||||||
regmatch_t *pmatch, text *src_text)
|
regmatch_t *pmatch,
|
||||||
|
char *start_ptr, int data_pos)
|
||||||
{
|
{
|
||||||
const char *p = VARDATA(replace_text);
|
const char *p = VARDATA(replace_text);
|
||||||
const char *p_end = p + (VARSIZE(replace_text) - VARHDRSZ);
|
const char *p_end = p + (VARSIZE(replace_text) - VARHDRSZ);
|
||||||
@ -2247,16 +2298,17 @@ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
|
|||||||
if (so != -1 && eo != -1)
|
if (so != -1 && eo != -1)
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
* Copy the text that is back reference of regexp. Because so and
|
* Copy the text that is back reference of regexp. Note so and
|
||||||
* eo are counted in characters not bytes, it's easiest to use
|
* eo are counted in characters not bytes.
|
||||||
* text_substring to pull out the correct chunk of text.
|
|
||||||
*/
|
*/
|
||||||
text *append_text;
|
char *chunk_start;
|
||||||
|
int chunk_len;
|
||||||
|
|
||||||
append_text = text_substring(PointerGetDatum(src_text),
|
Assert(so >= data_pos);
|
||||||
so + 1, (eo - so), false);
|
chunk_start = start_ptr;
|
||||||
appendStringInfoText(str, append_text);
|
chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
|
||||||
pfree(append_text);
|
chunk_len = charlen_to_bytelen(chunk_start, eo - so);
|
||||||
|
appendBinaryStringInfo(str, chunk_start, chunk_len);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -2284,6 +2336,7 @@ replace_text_regexp(text *src_text, void *regexp,
|
|||||||
size_t data_len;
|
size_t data_len;
|
||||||
int search_start;
|
int search_start;
|
||||||
int data_pos;
|
int data_pos;
|
||||||
|
char *start_ptr;
|
||||||
bool have_escape;
|
bool have_escape;
|
||||||
|
|
||||||
initStringInfo(&buf);
|
initStringInfo(&buf);
|
||||||
@ -2295,10 +2348,17 @@ replace_text_regexp(text *src_text, void *regexp,
|
|||||||
/* Check whether replace_text has escape char. */
|
/* Check whether replace_text has escape char. */
|
||||||
have_escape = check_replace_text_has_escape_char(replace_text);
|
have_escape = check_replace_text_has_escape_char(replace_text);
|
||||||
|
|
||||||
for (search_start = data_pos = 0; search_start <= data_len;)
|
/* start_ptr points to the data_pos'th character of src_text */
|
||||||
|
start_ptr = (char *) VARDATA(src_text);
|
||||||
|
data_pos = 0;
|
||||||
|
|
||||||
|
search_start = 0;
|
||||||
|
while (search_start <= data_len)
|
||||||
{
|
{
|
||||||
int regexec_result;
|
int regexec_result;
|
||||||
|
|
||||||
|
CHECK_FOR_INTERRUPTS();
|
||||||
|
|
||||||
regexec_result = pg_regexec(re,
|
regexec_result = pg_regexec(re,
|
||||||
data,
|
data,
|
||||||
data_len,
|
data_len,
|
||||||
@ -2322,20 +2382,22 @@ replace_text_regexp(text *src_text, void *regexp,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Copy the text to the left of the match position. Because we are
|
* Copy the text to the left of the match position. Note we are
|
||||||
* working with character not byte indexes, it's easiest to use
|
* given character not byte indexes.
|
||||||
* text_substring to pull out the needed data.
|
|
||||||
*/
|
*/
|
||||||
if (pmatch[0].rm_so - data_pos > 0)
|
if (pmatch[0].rm_so - data_pos > 0)
|
||||||
{
|
{
|
||||||
text *left_text;
|
int chunk_len;
|
||||||
|
|
||||||
left_text = text_substring(PointerGetDatum(src_text),
|
chunk_len = charlen_to_bytelen(start_ptr,
|
||||||
data_pos + 1,
|
pmatch[0].rm_so - data_pos);
|
||||||
pmatch[0].rm_so - data_pos,
|
appendBinaryStringInfo(&buf, start_ptr, chunk_len);
|
||||||
false);
|
/*
|
||||||
appendStringInfoText(&buf, left_text);
|
* Advance start_ptr over that text, to avoid multiple rescans
|
||||||
pfree(left_text);
|
* of it if the replace_text contains multiple back-references.
|
||||||
|
*/
|
||||||
|
start_ptr += chunk_len;
|
||||||
|
data_pos = pmatch[0].rm_so;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -2343,11 +2405,15 @@ replace_text_regexp(text *src_text, void *regexp,
|
|||||||
* replace_text has escape characters.
|
* replace_text has escape characters.
|
||||||
*/
|
*/
|
||||||
if (have_escape)
|
if (have_escape)
|
||||||
appendStringInfoRegexpSubstr(&buf, replace_text, pmatch, src_text);
|
appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
|
||||||
|
start_ptr, data_pos);
|
||||||
else
|
else
|
||||||
appendStringInfoText(&buf, replace_text);
|
appendStringInfoText(&buf, replace_text);
|
||||||
|
|
||||||
search_start = data_pos = pmatch[0].rm_eo;
|
/* Advance start_ptr and data_pos over the matched text. */
|
||||||
|
start_ptr += charlen_to_bytelen(start_ptr,
|
||||||
|
pmatch[0].rm_eo - data_pos);
|
||||||
|
data_pos = pmatch[0].rm_eo;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* When global option is off, replace the first instance only.
|
* When global option is off, replace the first instance only.
|
||||||
@ -2358,6 +2424,7 @@ replace_text_regexp(text *src_text, void *regexp,
|
|||||||
/*
|
/*
|
||||||
* Search from next character when the matching text is zero width.
|
* Search from next character when the matching text is zero width.
|
||||||
*/
|
*/
|
||||||
|
search_start = data_pos;
|
||||||
if (pmatch[0].rm_so == pmatch[0].rm_eo)
|
if (pmatch[0].rm_so == pmatch[0].rm_eo)
|
||||||
search_start++;
|
search_start++;
|
||||||
}
|
}
|
||||||
@ -2367,12 +2434,10 @@ replace_text_regexp(text *src_text, void *regexp,
|
|||||||
*/
|
*/
|
||||||
if (data_pos < data_len)
|
if (data_pos < data_len)
|
||||||
{
|
{
|
||||||
text *right_text;
|
int chunk_len;
|
||||||
|
|
||||||
right_text = text_substring(PointerGetDatum(src_text),
|
chunk_len = ((char *) src_text + VARSIZE(src_text)) - start_ptr;
|
||||||
data_pos + 1, -1, true);
|
appendBinaryStringInfo(&buf, start_ptr, chunk_len);
|
||||||
appendStringInfoText(&buf, right_text);
|
|
||||||
pfree(right_text);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ret_text = PG_STR_GET_TEXT(buf.data);
|
ret_text = PG_STR_GET_TEXT(buf.data);
|
||||||
@ -2488,6 +2553,8 @@ text_to_array(PG_FUNCTION_ARGS)
|
|||||||
int fldnum;
|
int fldnum;
|
||||||
int start_posn;
|
int start_posn;
|
||||||
int end_posn;
|
int end_posn;
|
||||||
|
int chunk_len;
|
||||||
|
char *start_ptr;
|
||||||
text *result_text;
|
text *result_text;
|
||||||
ArrayBuildState *astate = NULL;
|
ArrayBuildState *astate = NULL;
|
||||||
|
|
||||||
@ -2506,6 +2573,9 @@ text_to_array(PG_FUNCTION_ARGS)
|
|||||||
text_position_setup(inputstring, fldsep, &state);
|
text_position_setup(inputstring, fldsep, &state);
|
||||||
|
|
||||||
start_posn = 1;
|
start_posn = 1;
|
||||||
|
/* start_ptr points to the start_posn'th character of inputstring */
|
||||||
|
start_ptr = (char *) VARDATA(inputstring);
|
||||||
|
|
||||||
for (fldnum = 1;; fldnum++) /* field number is 1 based */
|
for (fldnum = 1;; fldnum++) /* field number is 1 based */
|
||||||
{
|
{
|
||||||
end_posn = text_position_next(start_posn, &state);
|
end_posn = text_position_next(start_posn, &state);
|
||||||
@ -2513,20 +2583,19 @@ text_to_array(PG_FUNCTION_ARGS)
|
|||||||
if (end_posn == 0)
|
if (end_posn == 0)
|
||||||
{
|
{
|
||||||
/* fetch last field */
|
/* fetch last field */
|
||||||
result_text = text_substring(PointerGetDatum(inputstring),
|
chunk_len = ((char *) inputstring + VARSIZE(inputstring)) - start_ptr;
|
||||||
start_posn,
|
|
||||||
-1,
|
|
||||||
true);
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
/* fetch non-last field */
|
/* fetch non-last field */
|
||||||
result_text = text_substring(PointerGetDatum(inputstring),
|
chunk_len = charlen_to_bytelen(start_ptr, end_posn - start_posn);
|
||||||
start_posn,
|
|
||||||
end_posn - start_posn,
|
|
||||||
false);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* must build a temp text datum to pass to accumArrayResult */
|
||||||
|
result_text = (text *) palloc(VARHDRSZ + chunk_len);
|
||||||
|
VARATT_SIZEP(result_text) = VARHDRSZ + chunk_len;
|
||||||
|
memcpy(VARDATA(result_text), start_ptr, chunk_len);
|
||||||
|
|
||||||
/* stash away this field */
|
/* stash away this field */
|
||||||
astate = accumArrayResult(astate,
|
astate = accumArrayResult(astate,
|
||||||
PointerGetDatum(result_text),
|
PointerGetDatum(result_text),
|
||||||
@ -2534,9 +2603,15 @@ text_to_array(PG_FUNCTION_ARGS)
|
|||||||
TEXTOID,
|
TEXTOID,
|
||||||
CurrentMemoryContext);
|
CurrentMemoryContext);
|
||||||
|
|
||||||
|
pfree(result_text);
|
||||||
|
|
||||||
if (end_posn == 0)
|
if (end_posn == 0)
|
||||||
break;
|
break;
|
||||||
start_posn = end_posn + fldsep_len;
|
|
||||||
|
start_posn = end_posn;
|
||||||
|
start_ptr += chunk_len;
|
||||||
|
start_posn += fldsep_len;
|
||||||
|
start_ptr += charlen_to_bytelen(start_ptr, fldsep_len);
|
||||||
}
|
}
|
||||||
|
|
||||||
text_position_cleanup(&state);
|
text_position_cleanup(&state);
|
||||||
|
Loading…
Reference in New Issue
Block a user