Add unistr function

This allows decoding a string with Unicode escape sequences. It is similar to Unicode escape strings, but offers some more flexibility. Author: Pavel Stehule <pavel.stehule@gmail.com> Reviewed-by: Asif Rehman <asifr.rehman@gmail.com> Discussion: https://www.postgresql.org/message-id/flat/CAFj8pRA5GnKT+gDVwbVRH2ep451H_myBt+NTz8RkYUARE9+qOQ@mail.gmail.com
2024-12-21 08:29:39 +08:00 · 2021-03-28 08:16:15 +02:00 · 2021-03-28 08:16:15 +02:00 · f37fec837c
commit f37fec837c
parent ebedd0c78f
6 changed files with 310 additions and 1 deletions
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@ -3551,6 +3551,52 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
       </para></entry>
      </row>
      <row>
       <entry role="func_table_entry"><para role="func_signature">
        <indexterm>
         <primary>unistr</primary>
        </indexterm>
        <function>unistr</function> ( <type>text</type> )
        <returnvalue>text</returnvalue>
       </para>
       <para>
        Evaluate escaped Unicode characters in the argument.  Unicode characters
        can be specified as
        <literal>\<replaceable>XXXX</replaceable></literal> (4 hexadecimal
        digits), <literal>\+<replaceable>XXXXXX</replaceable></literal> (6
        hexadecimal digits),
        <literal>\u<replaceable>XXXX</replaceable></literal> (4 hexadecimal
        digits), or <literal>\U<replaceable>XXXXXXXX</replaceable></literal>
        (8 hexadecimal digits).  To specify a backslash, write two
        backslashes.  All other characters are taken literally.
       </para>
       <para>
        If the server encoding is not UTF-8, the Unicode code point identified
        by one of these escape sequences is converted to the actual server
        encoding; an error is reported if that's not possible.
       </para>
       <para>
        This function provides a (non-standard) alternative to string
        constants with Unicode escapes (see <xref
        linkend="sql-syntax-strings-uescape"/>).
       </para>
       <para>
        <literal>unistr('\0441\043B\043E\043D')</literal>
        <returnvalue>слон</returnvalue>
       </para>
       <para>
        <literal>unistr('d\0061t\+000061')</literal>
        <returnvalue>data</returnvalue>
       </para>
       <para>
        <literal>unistr('d\u0061t\U00000061')</literal>
        <returnvalue>data</returnvalue>
       </para></entry>
      </row>
     </tbody>
    </tgroup>
   </table>
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@ -6380,3 +6380,213 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
 	PG_RETURN_BOOL(result);
 }
 /*
 * Report whether each of the first n bytes at instr is a hexadecimal digit.
 *
 * Exactly n bytes are examined and no terminator is looked for, so the
 * caller must make sure that many bytes are readable.  Returns true when
 * n is zero.
 */
 static bool
 isxdigits_n(const char *instr, size_t n)
 {
 	const char *stop = instr + n;
 	for (const char *p = instr; p < stop; p++)
 	{
 		/* cast first: isxdigit() must not be handed a negative char value */
 		if (!isxdigit((unsigned char) *p))
 			return false;
 	}
 	return true;
 }
 /*
 * Return the numeric value (0..15) of one hexadecimal digit.
 *
 * Upper- and lower-case letters are both accepted.  Any other byte is
 * reported with elog(ERROR).
 */
 static unsigned int
 hexval(unsigned char c)
 {
 	unsigned int digit;
 	if ('0' <= c && c <= '9')
 		digit = c - '0';
 	else if ('a' <= c && c <= 'f')
 		digit = 0xA + (c - 'a');
 	else if ('A' <= c && c <= 'F')
 		digit = 0xA + (c - 'A');
 	else
 	{
 		elog(ERROR, "invalid hexadecimal digit");
 		digit = 0;				/* not reached */
 	}
 	return digit;
 }
 /*
 * Convert the first n hexadecimal digits at instr to a number.
 *
 * The leftmost digit is the most significant.  Each digit is decoded by
 * hexval(), which raises an error for a byte that is not a hex digit.
 * The callers in this file pass n = 4, 6 or 8.
 */
 static unsigned int
 hexval_n(const char *instr, size_t n)
 {
 	unsigned int value = 0;
 	size_t		remaining = n;
 	while (remaining > 0)
 	{
 		/* "remaining" now counts the digits to the right of this one */
 		remaining--;
 		value += hexval(*instr++) << (4 * remaining);
 	}
 	return value;
 }
 /*
 * Replaces Unicode escape sequences by Unicode characters
 */
 Datum
 unistr(PG_FUNCTION_ARGS)
 {
 	text	   *input_text = PG_GETARG_TEXT_PP(0);
 	char	   *instr;
 	int			len;
 	StringInfoData str;
 	text	   *result;
 	pg_wchar	pair_first = 0;
 	char		cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
 	instr = VARDATA_ANY(input_text);
 	len = VARSIZE_ANY_EXHDR(input_text);
 	initStringInfo(&str);
 	while (len > 0)
 	{
 		if (instr[0] == '\\')
 		{
 			if (len >= 2 &&
 				instr[1] == '\\')
 			{
 				if (pair_first)
 					goto invalid_pair;
 				appendStringInfoChar(&str, '\\');
 				instr += 2;
 				len -= 2;
 			}
 			else if ((len >= 5 && isxdigits_n(instr + 1, 4)) ||
 					 (len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4)))
 			{
 				pg_wchar	unicode;
 				int			offset = instr[1] == 'u' ? 2 : 1;
 				unicode = hexval_n(instr + offset, 4);
 				if (!is_valid_unicode_codepoint(unicode))
 					ereport(ERROR,
 							errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 							errmsg("invalid Unicode code point: %04X", unicode));
 				if (pair_first)
 				{
 					if (is_utf16_surrogate_second(unicode))
 					{
 						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
 						pair_first = 0;
 					}
 					else
 						goto invalid_pair;
 				}
 				else if (is_utf16_surrogate_second(unicode))
 					goto invalid_pair;
 				if (is_utf16_surrogate_first(unicode))
 					pair_first = unicode;
 				else
 				{
 					pg_unicode_to_server(unicode, (unsigned char *) cbuf);
 					appendStringInfoString(&str, cbuf);
 				}
 				instr += 4 + offset;
 				len -= 4 + offset;
 			}
 			else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))
 			{
 				pg_wchar	unicode;
 				unicode = hexval_n(instr + 2, 6);
 				if (!is_valid_unicode_codepoint(unicode))
 					ereport(ERROR,
 							errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 							errmsg("invalid Unicode code point: %04X", unicode));
 				if (pair_first)
 				{
 					if (is_utf16_surrogate_second(unicode))
 					{
 						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
 						pair_first = 0;
 					}
 					else
 						goto invalid_pair;
 				}
 				else if (is_utf16_surrogate_second(unicode))
 					goto invalid_pair;
 				if (is_utf16_surrogate_first(unicode))
 					pair_first = unicode;
 				else
 				{
 					pg_unicode_to_server(unicode, (unsigned char *) cbuf);
 					appendStringInfoString(&str, cbuf);
 				}
 				instr += 8;
 				len -= 8;
 			}
 			else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))
 			{
 				pg_wchar	unicode;
 				unicode = hexval_n(instr + 2, 8);
 				if (!is_valid_unicode_codepoint(unicode))
 					ereport(ERROR,
 							errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 							errmsg("invalid Unicode code point: %04X", unicode));
 				if (pair_first)
 				{
 					if (is_utf16_surrogate_second(unicode))
 					{
 						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
 						pair_first = 0;
 					}
 					else
 						goto invalid_pair;
 				}
 				else if (is_utf16_surrogate_second(unicode))
 					goto invalid_pair;
 				if (is_utf16_surrogate_first(unicode))
 					pair_first = unicode;
 				else
 				{
 					pg_unicode_to_server(unicode, (unsigned char *) cbuf);
 					appendStringInfoString(&str, cbuf);
 				}
 				instr += 10;
 				len -= 10;
 			}
 			else
 				ereport(ERROR,
 						(errcode(ERRCODE_SYNTAX_ERROR),
 						 errmsg("invalid Unicode escape"),
 						 errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX, or \\UXXXXXXXX.")));
 		}
 		else
 		{
 			if (pair_first)
 				goto invalid_pair;
 			appendStringInfoChar(&str, *instr++);
 			len--;
 		}
 	}
 	/* unfinished surrogate pair? */
 	if (pair_first)
 		goto invalid_pair;
 	result = cstring_to_text_with_len(str.data, str.len);
 	pfree(str.data);
 	PG_RETURN_TEXT_P(result);
 invalid_pair:
 	ereport(ERROR,
 			(errcode(ERRCODE_SYNTAX_ERROR),
 			 errmsg("invalid Unicode surrogate pair")));
 }
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@ -53,6 +53,6 @@
 */
 /*							yyyymmddN */
-#define CATALOG_VERSION_NO	202103266
+#define CATALOG_VERSION_NO	202103291
 #endif
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@ -11527,6 +11527,10 @@
  proname => 'is_normalized', prorettype => 'bool', proargtypes => 'text text',
  prosrc => 'unicode_is_normalized' },
 { oid => '9822', descr => 'unescape Unicode characters',
  proname => 'unistr', prorettype => 'text', proargtypes => 'text',
  prosrc => 'unistr' },
 { oid => '4596', descr => 'I/O',
  proname => 'brin_bloom_summary_in', prorettype => 'pg_brin_bloom_summary',
  proargtypes => 'cstring', prosrc => 'brin_bloom_summary_in' },
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@ -2234,3 +2234,39 @@ SELECT bit_count('\x1234567890'::bytea);
        15
 (1 row)
 SELECT unistr('\0064at\+0000610');
 unistr 
 --------
 data0
 (1 row)
 SELECT unistr('d\u0061t\U000000610');
 unistr 
 --------
 data0
 (1 row)
 SELECT unistr('a\\b');
 unistr 
 --------
 a\b
 (1 row)
 -- errors:
 SELECT unistr('wrong: \db99');
 ERROR:  invalid Unicode surrogate pair
 SELECT unistr('wrong: \db99\0061');
 ERROR:  invalid Unicode surrogate pair
 SELECT unistr('wrong: \+00db99\+000061');
 ERROR:  invalid Unicode surrogate pair
 SELECT unistr('wrong: \+2FFFFF');
 ERROR:  invalid Unicode code point: 2FFFFF
 SELECT unistr('wrong: \udb99\u0061');
 ERROR:  invalid Unicode surrogate pair
 SELECT unistr('wrong: \U0000db99\U00000061');
 ERROR:  invalid Unicode surrogate pair
 SELECT unistr('wrong: \U002FFFFF');
 ERROR:  invalid Unicode code point: 2FFFFF
 SELECT unistr('wrong: \xyz');
 ERROR:  invalid Unicode escape
 HINT:  Unicode escapes must be \XXXX, \+XXXXXX, \uXXXX, or \UXXXXXXXX.
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@ -746,3 +746,16 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8)
 SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape');
 SELECT bit_count('\x1234567890'::bytea);
 SELECT unistr('\0064at\+0000610');
 SELECT unistr('d\u0061t\U000000610');
 SELECT unistr('a\\b');
 -- errors:
 SELECT unistr('wrong: \db99');
 SELECT unistr('wrong: \db99\0061');
 SELECT unistr('wrong: \+00db99\+000061');
 SELECT unistr('wrong: \+2FFFFF');
 SELECT unistr('wrong: \udb99\u0061');
 SELECT unistr('wrong: \U0000db99\U00000061');
 SELECT unistr('wrong: \U002FFFFF');
 SELECT unistr('wrong: \xyz');