From f7eeb324e7b9fa948dfd55b55e9b20fa919e7f79 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 16 Sep 2015 14:50:12 -0400 Subject: [PATCH] Fix documentation of regular expression character-entry escapes. The docs claimed that \uhhhh would be interpreted as a Unicode value regardless of the database encoding, but it's never been implemented that way: \uhhhh and \xhhhh actually mean exactly the same thing, namely the character that pg_mb2wchar translates to 0xhhhh. Moreover we were falsely dismissive of the usefulness of Unicode code points above FFFF. Fix that. It's been like this for ages, so back-patch to all supported branches. --- doc/src/sgml/func.sgml | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 0d614024a78..2b8cb99bb97 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -4148,7 +4148,7 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', E'\\s*') AS foo; \e the character whose collating-sequence name is ESC, - or failing that, the character with octal value 033 + or failing that, the character with octal value 033 @@ -4174,15 +4174,17 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', E'\\s*') AS foo; \uwxyz (where wxyz is exactly four hexadecimal digits) - the UTF16 (Unicode, 16-bit) character U+wxyz - in the local byte ordering + the character whose hexadecimal value is + 0xwxyz + \Ustuvwxyz (where stuvwxyz is exactly eight hexadecimal digits) - reserved for a hypothetical Unicode extension to 32 bits + the character whose hexadecimal value is + 0xstuvwxyz @@ -4231,6 +4233,17 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', E'\\s*') AS foo; Octal digits are 0-7. + + Numeric character-entry escapes specifying values outside the ASCII range + (0-127) have meanings dependent on the database encoding. 
When the + encoding is UTF-8, escape values are equivalent to Unicode code points; + for example, \u1234 means the character U+1234. + For other multibyte encodings, character-entry escapes usually just + specify the concatenation of the byte values for the character. If the + escape value does not correspond to any legal character in the database + encoding, no error will be raised, but it will never match any data. + + The character-entry escapes are always taken as ordinary characters. For example, \135 is ] in ASCII, but