Fix encode(...bytea..., 'escape') so that it converts all high-bit-set byte values into \nnn octal escape sequences.

When the database encoding is multibyte this is *necessary* to avoid generating invalidly encoded text. Even in a single-byte encoding, the old behavior seems very hazardous --- consider for example what happens if the text is transferred to another database with a different encoding. Decoding would then yield some other bytea value than what was encoded, which is surely undesirable. Per gripe from Hernan Gonzalez. Backpatch to 8.3, but not further. This is a bit of a judgment call, but I make it on these grounds: pre-8.3 we don't really have much encoding safety anyway because of the convert() function family, and we would also have much higher risk of breaking existing apps that may not be expecting this behavior. 8.3 is still new enough that we can probably get away with making this change in the function's behavior.
2024-12-21 08:29:39 +08:00 · 2008-02-26 02:54:08 +00:00 · 2008-02-26 02:54:08 +00:00 · fd15dba543
commit fd15dba543
parent bc93919be7
1 changed files with 21 additions and 16 deletions
--- a/src/backend/utils/adt/encode.c
+++ b/src/backend/utils/adt/encode.c
@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/encode.c,v 1.20 2008/01/01 19:45:52 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/encode.c,v 1.21 2008/02/26 02:54:08 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -26,7 +26,7 @@ struct pg_encoding
 	unsigned	(*decode) (const char *data, unsigned dlen, char *res);
 };

-static struct pg_encoding *pg_find_encoding(const char *name);
+static const struct pg_encoding *pg_find_encoding(const char *name);

 /*
 * SQL functions.
@ -42,7 +42,7 @@ binary_encode(PG_FUNCTION_ARGS)
 	int			datalen,
 				resultlen,
 				res;
-	struct pg_encoding *enc;
+	const struct pg_encoding *enc;

 	datalen = VARSIZE(data) - VARHDRSZ;

@ -78,7 +78,7 @@ binary_decode(PG_FUNCTION_ARGS)
 	int			datalen,
 				resultlen,
 				res;
-	struct pg_encoding *enc;
+	const struct pg_encoding *enc;

 	datalen = VARSIZE(data) - VARHDRSZ;

@ -348,10 +348,13 @@ b64_dec_len(const char *src, unsigned srclen)
 * Minimally escape bytea to text.
 * De-escape text to bytea.
 *
- * Only two characters are escaped:
- * \0 (null) and \\ (backslash)
+ * We must escape zero bytes and high-bit-set bytes to avoid generating
+ * text that might be invalid in the current encoding, or that might
+ * change to something else if passed through an encoding conversion
+ * (leading to failing to de-escape to the original bytea value).
+ * Also of course backslash itself has to be escaped.
 *
- * De-escapes \\ and any \### octal
+ * De-escaping processes \\ and any \### octal
 */

 #define VAL(CH)			((CH) - '0')
@ -366,16 +369,18 @@ esc_encode(const char *src, unsigned srclen, char *dst)

 	while (src < end)
 	{
-		if (*src == '\0')
+		unsigned char c = (unsigned char) *src;
+
+		if (c == '\0' || IS_HIGHBIT_SET(c))
 		{
 			rp[0] = '\\';
-			rp[1] = '0';
-			rp[2] = '0';
-			rp[3] = '0';
+			rp[1] = DIG(c >> 6);
+			rp[2] = DIG((c >> 3) & 7);
+			rp[3] = DIG(c & 7);
 			rp += 4;
 			len += 4;
 		}
-		else if (*src == '\\')
+		else if (c == '\\')
 		{
 			rp[0] = '\\';
 			rp[1] = '\\';
@ -384,7 +389,7 @@ esc_encode(const char *src, unsigned srclen, char *dst)
 		}
 		else
 		{
-			*rp++ = *src;
+			*rp++ = c;
 			len++;
 		}

@ -450,7 +455,7 @@ esc_enc_len(const char *src, unsigned srclen)

 	while (src < end)
 	{
-		if (*src == '\0')
+		if (*src == '\0' || IS_HIGHBIT_SET(*src))
 			len += 4;
 		else if (*src == '\\')
 			len += 2;
@ -510,7 +515,7 @@ esc_dec_len(const char *src, unsigned srclen)
 * Common
 */

-static struct
+static const struct
 {
 	const char *name;
 	struct pg_encoding enc;
@ -543,7 +548,7 @@ static struct
 	}
 };

-static struct pg_encoding *
+static const struct pg_encoding *
 pg_find_encoding(const char *name)
 {
 	int			i;