mirror of
https://git.postgresql.org/git/postgresql.git
synced 2025-01-18 18:44:06 +08:00
Fix string_to_array() to correctly handle the case where there are
overlapping possible matches for the separator string, such as string_to_array('123xx456xxx789', 'xx'). Also, revise the logic of replace(), split_part(), and string_to_array() to avoid O(N^2) work from redundant searches and conversions to pg_wchar format when there are N matches to the separator string. Backpatched the full patch as far as 8.0. 7.4 also has the bug, but the code has diverged a lot, so I just went for a quick-and-dirty fix of the bug itself in that branch.
This commit is contained in:
parent
d2cc024be1
commit
704f8ea11c
@ -8,7 +8,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.118.4.2 2006/05/21 20:06:16 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.118.4.3 2006/10/07 00:12:05 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@ -32,6 +32,17 @@
|
||||
|
||||
typedef struct varlena unknown;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
bool use_wchar; /* T if multibyte encoding */
|
||||
char *str1; /* use these if not use_wchar */
|
||||
char *str2; /* note: these point to original texts */
|
||||
pg_wchar *wstr1; /* use these if use_wchar */
|
||||
pg_wchar *wstr2; /* note: these are palloc'd */
|
||||
int len1; /* string lengths in logical characters */
|
||||
int len2;
|
||||
} TextPositionState;
|
||||
|
||||
#define DatumGetUnknownP(X) ((unknown *) PG_DETOAST_DATUM(X))
|
||||
#define DatumGetUnknownPCopy(X) ((unknown *) PG_DETOAST_DATUM_COPY(X))
|
||||
#define PG_GETARG_UNKNOWN_P(n) DatumGetUnknownP(PG_GETARG_DATUM(n))
|
||||
@ -46,27 +57,20 @@ typedef struct varlena unknown;
|
||||
DatumGetTextP(DirectFunctionCall1(textin, CStringGetDatum(str_)))
|
||||
#define TEXTLEN(textp) \
|
||||
text_length(PointerGetDatum(textp))
|
||||
#define TEXTPOS(buf_text, from_sub_text) \
|
||||
text_position(buf_text, from_sub_text, 1)
|
||||
#define TEXTDUP(textp) \
|
||||
DatumGetTextPCopy(PointerGetDatum(textp))
|
||||
#define LEFT(buf_text, from_sub_text) \
|
||||
text_substring(PointerGetDatum(buf_text), \
|
||||
1, \
|
||||
TEXTPOS(buf_text, from_sub_text) - 1, false)
|
||||
#define RIGHT(buf_text, from_sub_text, from_sub_text_len) \
|
||||
text_substring(PointerGetDatum(buf_text), \
|
||||
TEXTPOS(buf_text, from_sub_text) + (from_sub_text_len), \
|
||||
-1, true)
|
||||
|
||||
static int text_cmp(text *arg1, text *arg2);
|
||||
static int32 text_length(Datum str);
|
||||
static int32 text_position(text *t1, text *t2, int matchnum);
|
||||
static int text_position(text *t1, text *t2);
|
||||
static void text_position_setup(text *t1, text *t2, TextPositionState *state);
|
||||
static int text_position_next(int start_pos, TextPositionState *state);
|
||||
static void text_position_cleanup(TextPositionState *state);
|
||||
static text *text_substring(Datum str,
|
||||
int32 start,
|
||||
int32 length,
|
||||
bool length_not_specified);
|
||||
|
||||
static void appendStringInfoText(StringInfo str, const text *t);
|
||||
|
||||
|
||||
/*****************************************************************************
|
||||
* USER I/O ROUTINES *
|
||||
@ -733,7 +737,7 @@ textpos(PG_FUNCTION_ARGS)
|
||||
text *str = PG_GETARG_TEXT_P(0);
|
||||
text *search_str = PG_GETARG_TEXT_P(1);
|
||||
|
||||
PG_RETURN_INT32(text_position(str, search_str, 1));
|
||||
PG_RETURN_INT32((int32) text_position(str, search_str));
|
||||
}
|
||||
|
||||
/*
|
||||
@ -743,7 +747,6 @@ textpos(PG_FUNCTION_ARGS)
|
||||
* Inputs:
|
||||
* t1 - string to be searched
|
||||
* t2 - pattern to match within t1
|
||||
* matchnum - number of the match to be found (1 is the first match)
|
||||
* Result:
|
||||
* Character index of the first matched char, starting from 1,
|
||||
* or 0 if no match.
|
||||
@ -751,46 +754,92 @@ textpos(PG_FUNCTION_ARGS)
|
||||
* This is broken out so it can be called directly by other string processing
|
||||
* functions.
|
||||
*/
|
||||
static int32
|
||||
text_position(text *t1, text *t2, int matchnum)
|
||||
static int
|
||||
text_position(text *t1, text *t2)
|
||||
{
|
||||
int match = 0,
|
||||
pos = 0,
|
||||
p,
|
||||
px,
|
||||
len1,
|
||||
len2;
|
||||
TextPositionState state;
|
||||
int result;
|
||||
|
||||
if (matchnum <= 0)
|
||||
return 0; /* result for 0th match */
|
||||
text_position_setup(t1, t2, &state);
|
||||
result = text_position_next(1, &state);
|
||||
text_position_cleanup(&state);
|
||||
return result;
|
||||
}
|
||||
|
||||
if (VARSIZE(t2) <= VARHDRSZ)
|
||||
return 1; /* result for empty pattern */
|
||||
/*
|
||||
* text_position_setup, text_position_next, text_position_cleanup -
|
||||
* Component steps of text_position()
|
||||
*
|
||||
* These are broken out so that a string can be efficiently searched for
|
||||
* multiple occurrences of the same pattern. text_position_next may be
|
||||
* called multiple times with increasing values of start_pos, which is
|
||||
* the 1-based character position to start the search from. The "state"
|
||||
* variable is normally just a local variable in the caller.
|
||||
*/
|
||||
|
||||
len1 = (VARSIZE(t1) - VARHDRSZ);
|
||||
len2 = (VARSIZE(t2) - VARHDRSZ);
|
||||
static void
|
||||
text_position_setup(text *t1, text *t2, TextPositionState *state)
|
||||
{
|
||||
int len1 = VARSIZE(t1) - VARHDRSZ;
|
||||
int len2 = VARSIZE(t2) - VARHDRSZ;
|
||||
|
||||
if (pg_database_encoding_max_length() == 1)
|
||||
{
|
||||
/* simple case - single byte encoding */
|
||||
char *p1,
|
||||
state->use_wchar = false;
|
||||
state->str1 = VARDATA(t1);
|
||||
state->str2 = VARDATA(t2);
|
||||
state->len1 = len1;
|
||||
state->len2 = len2;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* not as simple - multibyte encoding */
|
||||
pg_wchar *p1,
|
||||
*p2;
|
||||
|
||||
p1 = VARDATA(t1);
|
||||
p2 = VARDATA(t2);
|
||||
p1 = (pg_wchar *) palloc((len1 + 1) * sizeof(pg_wchar));
|
||||
len1 = pg_mb2wchar_with_len(VARDATA(t1), p1, len1);
|
||||
p2 = (pg_wchar *) palloc((len2 + 1) * sizeof(pg_wchar));
|
||||
len2 = pg_mb2wchar_with_len(VARDATA(t2), p2, len2);
|
||||
|
||||
state->use_wchar = true;
|
||||
state->wstr1 = p1;
|
||||
state->wstr2 = p2;
|
||||
state->len1 = len1;
|
||||
state->len2 = len2;
|
||||
}
|
||||
}
|
||||
|
||||
static int
|
||||
text_position_next(int start_pos, TextPositionState *state)
|
||||
{
|
||||
int pos = 0,
|
||||
p,
|
||||
px;
|
||||
|
||||
Assert(start_pos > 0); /* else caller error */
|
||||
|
||||
if (state->len2 <= 0)
|
||||
return start_pos; /* result for empty pattern */
|
||||
|
||||
if (!state->use_wchar)
|
||||
{
|
||||
/* simple case - single byte encoding */
|
||||
char *p1 = state->str1;
|
||||
char *p2 = state->str2;
|
||||
|
||||
/* no use in searching str past point where search_str will fit */
|
||||
px = (len1 - len2);
|
||||
px = (state->len1 - state->len2);
|
||||
|
||||
for (p = 0; p <= px; p++)
|
||||
p1 += start_pos - 1;
|
||||
|
||||
for (p = start_pos - 1; p <= px; p++)
|
||||
{
|
||||
if ((*p1 == *p2) && (strncmp(p1, p2, len2) == 0))
|
||||
if ((*p1 == *p2) && (strncmp(p1, p2, state->len2) == 0))
|
||||
{
|
||||
if (++match == matchnum)
|
||||
{
|
||||
pos = p + 1;
|
||||
break;
|
||||
}
|
||||
pos = p + 1;
|
||||
break;
|
||||
}
|
||||
p1++;
|
||||
}
|
||||
@ -798,41 +847,38 @@ text_position(text *t1, text *t2, int matchnum)
|
||||
else
|
||||
{
|
||||
/* not as simple - multibyte encoding */
|
||||
pg_wchar *p1,
|
||||
*p2,
|
||||
*ps1,
|
||||
*ps2;
|
||||
|
||||
ps1 = p1 = (pg_wchar *) palloc((len1 + 1) * sizeof(pg_wchar));
|
||||
(void) pg_mb2wchar_with_len((unsigned char *) VARDATA(t1), p1, len1);
|
||||
len1 = pg_wchar_strlen(p1);
|
||||
ps2 = p2 = (pg_wchar *) palloc((len2 + 1) * sizeof(pg_wchar));
|
||||
(void) pg_mb2wchar_with_len((unsigned char *) VARDATA(t2), p2, len2);
|
||||
len2 = pg_wchar_strlen(p2);
|
||||
pg_wchar *p1 = state->wstr1;
|
||||
pg_wchar *p2 = state->wstr2;
|
||||
|
||||
/* no use in searching str past point where search_str will fit */
|
||||
px = (len1 - len2);
|
||||
px = (state->len1 - state->len2);
|
||||
|
||||
for (p = 0; p <= px; p++)
|
||||
p1 += start_pos - 1;
|
||||
|
||||
for (p = start_pos - 1; p <= px; p++)
|
||||
{
|
||||
if ((*p1 == *p2) && (pg_wchar_strncmp(p1, p2, len2) == 0))
|
||||
if ((*p1 == *p2) && (pg_wchar_strncmp(p1, p2, state->len2) == 0))
|
||||
{
|
||||
if (++match == matchnum)
|
||||
{
|
||||
pos = p + 1;
|
||||
break;
|
||||
}
|
||||
pos = p + 1;
|
||||
break;
|
||||
}
|
||||
p1++;
|
||||
}
|
||||
|
||||
pfree(ps1);
|
||||
pfree(ps2);
|
||||
}
|
||||
|
||||
return pos;
|
||||
}
|
||||
|
||||
static void
|
||||
text_position_cleanup(TextPositionState *state)
|
||||
{
|
||||
if (state->use_wchar)
|
||||
{
|
||||
pfree(state->wstr1);
|
||||
pfree(state->wstr2);
|
||||
}
|
||||
}
|
||||
|
||||
/* varstr_cmp()
|
||||
* Comparison function for text strings with given lengths.
|
||||
* Includes locale support, but must copy strings to temporary memory
|
||||
@ -1278,6 +1324,7 @@ byteacat(PG_FUNCTION_ARGS)
|
||||
|
||||
#define PG_STR_GET_BYTEA(str_) \
|
||||
DatumGetByteaP(DirectFunctionCall1(byteain, CStringGetDatum(str_)))
|
||||
|
||||
/*
|
||||
* bytea_substr()
|
||||
* Return a substring starting at the specified position.
|
||||
@ -1951,6 +1998,18 @@ byteacmp(PG_FUNCTION_ARGS)
|
||||
PG_RETURN_INT32(cmp);
|
||||
}
|
||||
|
||||
/*
|
||||
* appendStringInfoText
|
||||
*
|
||||
* Append a text to str.
|
||||
* Like appendStringInfoString(str, PG_TEXT_GET_STR(s)) but faster.
|
||||
*/
|
||||
static void
|
||||
appendStringInfoText(StringInfo str, const text *t)
|
||||
{
|
||||
appendBinaryStringInfo(str, VARDATA(t), VARSIZE(t) - VARHDRSZ);
|
||||
}
|
||||
|
||||
/*
|
||||
* replace_text
|
||||
* replace all occurrences of 'old_sub_str' in 'orig_str'
|
||||
@ -1967,40 +2026,58 @@ replace_text(PG_FUNCTION_ARGS)
|
||||
text *to_sub_text = PG_GETARG_TEXT_P(2);
|
||||
int src_text_len = TEXTLEN(src_text);
|
||||
int from_sub_text_len = TEXTLEN(from_sub_text);
|
||||
char *to_sub_str = PG_TEXT_GET_STR(to_sub_text);
|
||||
text *left_text;
|
||||
text *right_text;
|
||||
text *buf_text;
|
||||
TextPositionState state;
|
||||
text *chunk_text;
|
||||
text *ret_text;
|
||||
int start_posn;
|
||||
int curr_posn;
|
||||
StringInfo str = makeStringInfo();
|
||||
StringInfoData str;
|
||||
|
||||
if (src_text_len == 0 || from_sub_text_len == 0)
|
||||
PG_RETURN_TEXT_P(src_text);
|
||||
|
||||
buf_text = TEXTDUP(src_text);
|
||||
curr_posn = TEXTPOS(buf_text, from_sub_text);
|
||||
text_position_setup(src_text, from_sub_text, &state);
|
||||
|
||||
while (curr_posn > 0)
|
||||
start_posn = 1;
|
||||
curr_posn = text_position_next(1, &state);
|
||||
|
||||
/* When the from_sub_text is not found, there is nothing to do. */
|
||||
if (curr_posn == 0)
|
||||
{
|
||||
left_text = LEFT(buf_text, from_sub_text);
|
||||
right_text = RIGHT(buf_text, from_sub_text, from_sub_text_len);
|
||||
|
||||
appendStringInfoString(str, PG_TEXT_GET_STR(left_text));
|
||||
appendStringInfoString(str, to_sub_str);
|
||||
|
||||
pfree(buf_text);
|
||||
pfree(left_text);
|
||||
buf_text = right_text;
|
||||
curr_posn = TEXTPOS(buf_text, from_sub_text);
|
||||
text_position_cleanup(&state);
|
||||
PG_RETURN_TEXT_P(src_text);
|
||||
}
|
||||
|
||||
appendStringInfoString(str, PG_TEXT_GET_STR(buf_text));
|
||||
pfree(buf_text);
|
||||
initStringInfo(&str);
|
||||
|
||||
ret_text = PG_STR_GET_TEXT(str->data);
|
||||
pfree(str->data);
|
||||
pfree(str);
|
||||
do
|
||||
{
|
||||
chunk_text = text_substring(PointerGetDatum(src_text),
|
||||
start_posn,
|
||||
curr_posn - start_posn,
|
||||
false);
|
||||
appendStringInfoText(&str, chunk_text);
|
||||
pfree(chunk_text);
|
||||
|
||||
appendStringInfoText(&str, to_sub_text);
|
||||
|
||||
start_posn = curr_posn + from_sub_text_len;
|
||||
curr_posn = text_position_next(start_posn, &state);
|
||||
}
|
||||
while (curr_posn > 0);
|
||||
|
||||
/* copy trailing chunk */
|
||||
chunk_text = text_substring(PointerGetDatum(src_text),
|
||||
start_posn,
|
||||
-1,
|
||||
true);
|
||||
appendStringInfoText(&str, chunk_text);
|
||||
pfree(chunk_text);
|
||||
|
||||
text_position_cleanup(&state);
|
||||
|
||||
ret_text = PG_STR_GET_TEXT(str.data);
|
||||
pfree(str.data);
|
||||
|
||||
PG_RETURN_TEXT_P(ret_text);
|
||||
}
|
||||
@ -2019,6 +2096,7 @@ split_text(PG_FUNCTION_ARGS)
|
||||
int fldnum = PG_GETARG_INT32(2);
|
||||
int inputstring_len = TEXTLEN(inputstring);
|
||||
int fldsep_len = TEXTLEN(fldsep);
|
||||
TextPositionState state;
|
||||
int start_posn;
|
||||
int end_posn;
|
||||
text *result_text;
|
||||
@ -2043,40 +2121,54 @@ split_text(PG_FUNCTION_ARGS)
|
||||
PG_RETURN_TEXT_P(PG_STR_GET_TEXT(""));
|
||||
}
|
||||
|
||||
start_posn = text_position(inputstring, fldsep, fldnum - 1);
|
||||
end_posn = text_position(inputstring, fldsep, fldnum);
|
||||
text_position_setup(inputstring, fldsep, &state);
|
||||
|
||||
if ((start_posn == 0) && (end_posn == 0)) /* fldsep not found */
|
||||
/* identify bounds of first field */
|
||||
start_posn = 1;
|
||||
end_posn = text_position_next(1, &state);
|
||||
|
||||
/* special case if fldsep not found at all */
|
||||
if (end_posn == 0)
|
||||
{
|
||||
/* if first field, return input string, else empty string */
|
||||
text_position_cleanup(&state);
|
||||
/* if field 1 requested, return input string, else empty string */
|
||||
if (fldnum == 1)
|
||||
PG_RETURN_TEXT_P(inputstring);
|
||||
else
|
||||
PG_RETURN_TEXT_P(PG_STR_GET_TEXT(""));
|
||||
}
|
||||
else if (start_posn == 0)
|
||||
|
||||
while (end_posn > 0 && --fldnum > 0)
|
||||
{
|
||||
/* first field requested */
|
||||
result_text = LEFT(inputstring, fldsep);
|
||||
PG_RETURN_TEXT_P(result_text);
|
||||
/* identify bounds of next field */
|
||||
start_posn = end_posn + fldsep_len;
|
||||
end_posn = text_position_next(start_posn, &state);
|
||||
}
|
||||
else if (end_posn == 0)
|
||||
|
||||
text_position_cleanup(&state);
|
||||
|
||||
if (fldnum > 0)
|
||||
{
|
||||
/* last field requested */
|
||||
result_text = text_substring(PointerGetDatum(inputstring),
|
||||
start_posn + fldsep_len,
|
||||
-1, true);
|
||||
PG_RETURN_TEXT_P(result_text);
|
||||
/* N'th field separator not found */
|
||||
/* if last field requested, return it, else empty string */
|
||||
if (fldnum == 1)
|
||||
result_text = text_substring(PointerGetDatum(inputstring),
|
||||
start_posn,
|
||||
-1,
|
||||
true);
|
||||
else
|
||||
result_text = PG_STR_GET_TEXT("");
|
||||
}
|
||||
else
|
||||
{
|
||||
/* interior field requested */
|
||||
/* non-last field requested */
|
||||
result_text = text_substring(PointerGetDatum(inputstring),
|
||||
start_posn + fldsep_len,
|
||||
end_posn - start_posn - fldsep_len,
|
||||
start_posn,
|
||||
end_posn - start_posn,
|
||||
false);
|
||||
PG_RETURN_TEXT_P(result_text);
|
||||
}
|
||||
|
||||
PG_RETURN_TEXT_P(result_text);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -2092,6 +2184,7 @@ text_to_array(PG_FUNCTION_ARGS)
|
||||
text *fldsep = PG_GETARG_TEXT_P(1);
|
||||
int inputstring_len = TEXTLEN(inputstring);
|
||||
int fldsep_len = TEXTLEN(fldsep);
|
||||
TextPositionState state;
|
||||
int fldnum;
|
||||
int start_posn;
|
||||
int end_posn;
|
||||
@ -2108,66 +2201,48 @@ text_to_array(PG_FUNCTION_ARGS)
|
||||
*/
|
||||
if (fldsep_len < 1)
|
||||
PG_RETURN_ARRAYTYPE_P(create_singleton_array(fcinfo, TEXTOID,
|
||||
CStringGetDatum(inputstring), 1));
|
||||
PointerGetDatum(inputstring), 1));
|
||||
|
||||
/* start with end position holding the initial start position */
|
||||
end_posn = 0;
|
||||
text_position_setup(inputstring, fldsep, &state);
|
||||
|
||||
start_posn = 1;
|
||||
for (fldnum = 1;; fldnum++) /* field number is 1 based */
|
||||
{
|
||||
Datum dvalue;
|
||||
bool disnull = false;
|
||||
end_posn = text_position_next(start_posn, &state);
|
||||
|
||||
start_posn = end_posn;
|
||||
end_posn = text_position(inputstring, fldsep, fldnum);
|
||||
|
||||
if ((start_posn == 0) && (end_posn == 0)) /* fldsep not found */
|
||||
if (end_posn == 0)
|
||||
{
|
||||
if (fldnum == 1)
|
||||
{
|
||||
/*
|
||||
* first element return one element, 1D, array using the
|
||||
* input string
|
||||
*/
|
||||
PG_RETURN_ARRAYTYPE_P(create_singleton_array(fcinfo, TEXTOID,
|
||||
CStringGetDatum(inputstring), 1));
|
||||
}
|
||||
else
|
||||
{
|
||||
/* otherwise create array and exit */
|
||||
PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate,
|
||||
CurrentMemoryContext));
|
||||
}
|
||||
}
|
||||
else if (start_posn == 0)
|
||||
{
|
||||
/* first field requested */
|
||||
result_text = LEFT(inputstring, fldsep);
|
||||
}
|
||||
else if (end_posn == 0)
|
||||
{
|
||||
/* last field requested */
|
||||
/* fetch last field */
|
||||
result_text = text_substring(PointerGetDatum(inputstring),
|
||||
start_posn + fldsep_len,
|
||||
-1, true);
|
||||
start_posn,
|
||||
-1,
|
||||
true);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* interior field requested */
|
||||
/* fetch non-last field */
|
||||
result_text = text_substring(PointerGetDatum(inputstring),
|
||||
start_posn + fldsep_len,
|
||||
end_posn - start_posn - fldsep_len,
|
||||
start_posn,
|
||||
end_posn - start_posn,
|
||||
false);
|
||||
}
|
||||
|
||||
/* stash away current value */
|
||||
dvalue = PointerGetDatum(result_text);
|
||||
astate = accumArrayResult(astate, dvalue,
|
||||
disnull, TEXTOID,
|
||||
/* stash away this field */
|
||||
astate = accumArrayResult(astate,
|
||||
PointerGetDatum(result_text),
|
||||
false,
|
||||
TEXTOID,
|
||||
CurrentMemoryContext);
|
||||
|
||||
if (end_posn == 0)
|
||||
break;
|
||||
start_posn = end_posn + fldsep_len;
|
||||
}
|
||||
|
||||
/* never reached -- keep compiler quiet */
|
||||
PG_RETURN_NULL();
|
||||
text_position_cleanup(&state);
|
||||
|
||||
PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate,
|
||||
CurrentMemoryContext));
|
||||
}
|
||||
|
||||
/*
|
||||
|
Loading…
Reference in New Issue
Block a user