mirror of
https://git.postgresql.org/git/postgresql.git
synced 2024-12-21 08:29:39 +08:00
Add unistr function
This allows decoding a string with Unicode escape sequences. It is similar to Unicode escape strings, but offers some more flexibility. Author: Pavel Stehule <pavel.stehule@gmail.com> Reviewed-by: Asif Rehman <asifr.rehman@gmail.com> Discussion: https://www.postgresql.org/message-id/flat/CAFj8pRA5GnKT+gDVwbVRH2ep451H_myBt+NTz8RkYUARE9+qOQ@mail.gmail.com
This commit is contained in:
parent
ebedd0c78f
commit
f37fec837c
@ -3551,6 +3551,52 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
|
||||
</para></entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry role="func_table_entry"><para role="func_signature">
|
||||
<indexterm>
|
||||
<primary>unistr</primary>
|
||||
</indexterm>
|
||||
<function>unistr</function> ( <type>text</type> )
|
||||
<returnvalue>text</returnvalue>
|
||||
</para>
|
||||
<para>
|
||||
Evaluate escaped Unicode characters in the argument. Unicode characters
|
||||
can be specified as
|
||||
<literal>\<replaceable>XXXX</replaceable></literal> (4 hexadecimal
|
||||
digits), <literal>\+<replaceable>XXXXXX</replaceable></literal> (6
|
||||
hexadecimal digits),
|
||||
<literal>\u<replaceable>XXXX</replaceable></literal> (4 hexadecimal
|
||||
digits), or <literal>\U<replaceable>XXXXXXXX</replaceable></literal>
|
||||
(8 hexadecimal digits). To specify a backslash, write two
|
||||
backslashes. All other characters are taken literally.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
If the server encoding is not UTF-8, the Unicode code point identified
|
||||
by one of these escape sequences is converted to the actual server
|
||||
encoding; an error is reported if that's not possible.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
This function provides a (non-standard) alternative to string
|
||||
constants with Unicode escapes (see <xref
|
||||
linkend="sql-syntax-strings-uescape"/>).
|
||||
</para>
|
||||
|
||||
<para>
|
||||
<literal>unistr('\0441\043B\043E\043D')</literal>
|
||||
<returnvalue>слон</returnvalue>
|
||||
</para>
|
||||
<para>
|
||||
<literal>unistr('d\0061t\+000061')</literal>
|
||||
<returnvalue>data</returnvalue>
|
||||
</para>
|
||||
<para>
|
||||
<literal>unistr('d\u0061t\U00000061')</literal>
|
||||
<returnvalue>data</returnvalue>
|
||||
</para></entry>
|
||||
</row>
|
||||
|
||||
</tbody>
|
||||
</tgroup>
|
||||
</table>
|
||||
|
@ -6380,3 +6380,213 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
|
||||
|
||||
PG_RETURN_BOOL(result);
|
||||
}
|
||||
|
||||
/*
 * isxdigits_n
 *		Report whether the first n bytes at instr are all hexadecimal digits.
 *
 * Exactly n bytes are examined and no string terminator is looked for, so
 * the caller must make sure that many bytes are readable.  Returns true
 * when n is zero.
 */
static bool
isxdigits_n(const char *instr, size_t n)
{
	const char *stop = instr + n;

	while (instr < stop)
	{
		/* isxdigit() needs an unsigned char value, hence the cast */
		if (!isxdigit((unsigned char) *instr))
			return false;
		instr++;
	}

	return true;
}
|
||||
|
||||
static unsigned int
|
||||
hexval(unsigned char c)
|
||||
{
|
||||
if (c >= '0' && c <= '9')
|
||||
return c - '0';
|
||||
if (c >= 'a' && c <= 'f')
|
||||
return c - 'a' + 0xA;
|
||||
if (c >= 'A' && c <= 'F')
|
||||
return c - 'A' + 0xA;
|
||||
elog(ERROR, "invalid hexadecimal digit");
|
||||
return 0; /* not reached */
|
||||
}
|
||||
|
||||
/*
 * hexval_n
 *		Convert the first n hexadecimal digits at instr to a number.
 *
 * The first digit is the most significant one.  Each digit is decoded by
 * hexval(), which raises an error for a non-hexadecimal character.
 */
static unsigned int
hexval_n(const char *instr, size_t n)
{
	unsigned int value = 0;
	size_t		remaining = n;

	while (remaining > 0)
	{
		/* after the decrement, "remaining" digits follow the current one */
		remaining--;
		value += hexval(*instr++) << (4 * remaining);
	}

	return value;
}
|
||||
|
||||
/*
|
||||
* Replaces Unicode escape sequences by Unicode characters
|
||||
*/
|
||||
Datum
|
||||
unistr(PG_FUNCTION_ARGS)
|
||||
{
|
||||
text *input_text = PG_GETARG_TEXT_PP(0);
|
||||
char *instr;
|
||||
int len;
|
||||
StringInfoData str;
|
||||
text *result;
|
||||
pg_wchar pair_first = 0;
|
||||
char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
|
||||
|
||||
instr = VARDATA_ANY(input_text);
|
||||
len = VARSIZE_ANY_EXHDR(input_text);
|
||||
|
||||
initStringInfo(&str);
|
||||
|
||||
while (len > 0)
|
||||
{
|
||||
if (instr[0] == '\\')
|
||||
{
|
||||
if (len >= 2 &&
|
||||
instr[1] == '\\')
|
||||
{
|
||||
if (pair_first)
|
||||
goto invalid_pair;
|
||||
appendStringInfoChar(&str, '\\');
|
||||
instr += 2;
|
||||
len -= 2;
|
||||
}
|
||||
else if ((len >= 5 && isxdigits_n(instr + 1, 4)) ||
|
||||
(len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4)))
|
||||
{
|
||||
pg_wchar unicode;
|
||||
int offset = instr[1] == 'u' ? 2 : 1;
|
||||
|
||||
unicode = hexval_n(instr + offset, 4);
|
||||
|
||||
if (!is_valid_unicode_codepoint(unicode))
|
||||
ereport(ERROR,
|
||||
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
||||
errmsg("invalid Unicode code point: %04X", unicode));
|
||||
|
||||
if (pair_first)
|
||||
{
|
||||
if (is_utf16_surrogate_second(unicode))
|
||||
{
|
||||
unicode = surrogate_pair_to_codepoint(pair_first, unicode);
|
||||
pair_first = 0;
|
||||
}
|
||||
else
|
||||
goto invalid_pair;
|
||||
}
|
||||
else if (is_utf16_surrogate_second(unicode))
|
||||
goto invalid_pair;
|
||||
|
||||
if (is_utf16_surrogate_first(unicode))
|
||||
pair_first = unicode;
|
||||
else
|
||||
{
|
||||
pg_unicode_to_server(unicode, (unsigned char *) cbuf);
|
||||
appendStringInfoString(&str, cbuf);
|
||||
}
|
||||
|
||||
instr += 4 + offset;
|
||||
len -= 4 + offset;
|
||||
}
|
||||
else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))
|
||||
{
|
||||
pg_wchar unicode;
|
||||
|
||||
unicode = hexval_n(instr + 2, 6);
|
||||
|
||||
if (!is_valid_unicode_codepoint(unicode))
|
||||
ereport(ERROR,
|
||||
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
||||
errmsg("invalid Unicode code point: %04X", unicode));
|
||||
|
||||
if (pair_first)
|
||||
{
|
||||
if (is_utf16_surrogate_second(unicode))
|
||||
{
|
||||
unicode = surrogate_pair_to_codepoint(pair_first, unicode);
|
||||
pair_first = 0;
|
||||
}
|
||||
else
|
||||
goto invalid_pair;
|
||||
}
|
||||
else if (is_utf16_surrogate_second(unicode))
|
||||
goto invalid_pair;
|
||||
|
||||
if (is_utf16_surrogate_first(unicode))
|
||||
pair_first = unicode;
|
||||
else
|
||||
{
|
||||
pg_unicode_to_server(unicode, (unsigned char *) cbuf);
|
||||
appendStringInfoString(&str, cbuf);
|
||||
}
|
||||
|
||||
instr += 8;
|
||||
len -= 8;
|
||||
}
|
||||
else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))
|
||||
{
|
||||
pg_wchar unicode;
|
||||
|
||||
unicode = hexval_n(instr + 2, 8);
|
||||
|
||||
if (!is_valid_unicode_codepoint(unicode))
|
||||
ereport(ERROR,
|
||||
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
||||
errmsg("invalid Unicode code point: %04X", unicode));
|
||||
|
||||
if (pair_first)
|
||||
{
|
||||
if (is_utf16_surrogate_second(unicode))
|
||||
{
|
||||
unicode = surrogate_pair_to_codepoint(pair_first, unicode);
|
||||
pair_first = 0;
|
||||
}
|
||||
else
|
||||
goto invalid_pair;
|
||||
}
|
||||
else if (is_utf16_surrogate_second(unicode))
|
||||
goto invalid_pair;
|
||||
|
||||
if (is_utf16_surrogate_first(unicode))
|
||||
pair_first = unicode;
|
||||
else
|
||||
{
|
||||
pg_unicode_to_server(unicode, (unsigned char *) cbuf);
|
||||
appendStringInfoString(&str, cbuf);
|
||||
}
|
||||
|
||||
instr += 10;
|
||||
len -= 10;
|
||||
}
|
||||
else
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_SYNTAX_ERROR),
|
||||
errmsg("invalid Unicode escape"),
|
||||
errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX, or \\UXXXXXXXX.")));
|
||||
}
|
||||
else
|
||||
{
|
||||
if (pair_first)
|
||||
goto invalid_pair;
|
||||
|
||||
appendStringInfoChar(&str, *instr++);
|
||||
len--;
|
||||
}
|
||||
}
|
||||
|
||||
/* unfinished surrogate pair? */
|
||||
if (pair_first)
|
||||
goto invalid_pair;
|
||||
|
||||
result = cstring_to_text_with_len(str.data, str.len);
|
||||
pfree(str.data);
|
||||
|
||||
PG_RETURN_TEXT_P(result);
|
||||
|
||||
invalid_pair:
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_SYNTAX_ERROR),
|
||||
errmsg("invalid Unicode surrogate pair")));
|
||||
}
|
||||
|
@ -53,6 +53,6 @@
|
||||
*/
|
||||
|
||||
/* yyyymmddN */
|
||||
#define CATALOG_VERSION_NO 202103266
|
||||
#define CATALOG_VERSION_NO 202103291
|
||||
|
||||
#endif
|
||||
|
@ -11527,6 +11527,10 @@
|
||||
proname => 'is_normalized', prorettype => 'bool', proargtypes => 'text text',
|
||||
prosrc => 'unicode_is_normalized' },
|
||||
|
||||
{ oid => '9822', descr => 'unescape Unicode characters',
|
||||
proname => 'unistr', prorettype => 'text', proargtypes => 'text',
|
||||
prosrc => 'unistr' },
|
||||
|
||||
{ oid => '4596', descr => 'I/O',
|
||||
proname => 'brin_bloom_summary_in', prorettype => 'pg_brin_bloom_summary',
|
||||
proargtypes => 'cstring', prosrc => 'brin_bloom_summary_in' },
|
||||
|
@ -2234,3 +2234,39 @@ SELECT bit_count('\x1234567890'::bytea);
|
||||
15
|
||||
(1 row)
|
||||
|
||||
SELECT unistr('\0064at\+0000610');
|
||||
unistr
|
||||
--------
|
||||
data0
|
||||
(1 row)
|
||||
|
||||
SELECT unistr('d\u0061t\U000000610');
|
||||
unistr
|
||||
--------
|
||||
data0
|
||||
(1 row)
|
||||
|
||||
SELECT unistr('a\\b');
|
||||
unistr
|
||||
--------
|
||||
a\b
|
||||
(1 row)
|
||||
|
||||
-- errors:
|
||||
SELECT unistr('wrong: \db99');
|
||||
ERROR: invalid Unicode surrogate pair
|
||||
SELECT unistr('wrong: \db99\0061');
|
||||
ERROR: invalid Unicode surrogate pair
|
||||
SELECT unistr('wrong: \+00db99\+000061');
|
||||
ERROR: invalid Unicode surrogate pair
|
||||
SELECT unistr('wrong: \+2FFFFF');
|
||||
ERROR: invalid Unicode code point: 2FFFFF
|
||||
SELECT unistr('wrong: \udb99\u0061');
|
||||
ERROR: invalid Unicode surrogate pair
|
||||
SELECT unistr('wrong: \U0000db99\U00000061');
|
||||
ERROR: invalid Unicode surrogate pair
|
||||
SELECT unistr('wrong: \U002FFFFF');
|
||||
ERROR: invalid Unicode code point: 2FFFFF
|
||||
SELECT unistr('wrong: \xyz');
|
||||
ERROR: invalid Unicode escape
|
||||
HINT: Unicode escapes must be \XXXX, \+XXXXXX, \uXXXX, or \UXXXXXXXX.
|
||||
|
@ -746,3 +746,16 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8)
|
||||
SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape');
|
||||
|
||||
SELECT bit_count('\x1234567890'::bytea);
|
||||
|
||||
SELECT unistr('\0064at\+0000610');
|
||||
SELECT unistr('d\u0061t\U000000610');
|
||||
SELECT unistr('a\\b');
|
||||
-- errors:
|
||||
SELECT unistr('wrong: \db99');
|
||||
SELECT unistr('wrong: \db99\0061');
|
||||
SELECT unistr('wrong: \+00db99\+000061');
|
||||
SELECT unistr('wrong: \+2FFFFF');
|
||||
SELECT unistr('wrong: \udb99\u0061');
|
||||
SELECT unistr('wrong: \U0000db99\U00000061');
|
||||
SELECT unistr('wrong: \U002FFFFF');
|
||||
SELECT unistr('wrong: \xyz');
|
||||
|
Loading…
Reference in New Issue
Block a user