From fd15dba543247eb1ce879d22632b9fdb4c230831 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Tue, 26 Feb 2008 02:54:08 +0000
Subject: [PATCH] Fix encode(...bytea..., 'escape') so that it converts all
 high-bit-set byte values into \nnn octal escape sequences.  When the database
 encoding is multibyte this is *necessary* to avoid generating invalidly
 encoded text. Even in a single-byte encoding, the old behavior seems very
 hazardous --- consider for example what happens if the text is transferred to
 another database with a different encoding.  Decoding would then yield some
 other bytea value than what was encoded, which is surely undesirable.  Per
 gripe from Hernan Gonzalez.

Backpatch to 8.3, but not further.  This is a bit of a judgment call, but I
make it on these grounds: pre-8.3 we don't really have much encoding safety
anyway because of the convert() function family, and we would also have much
higher risk of breaking existing apps that may not be expecting this behavior.
8.3 is still new enough that we can probably get away with making this change
in the function's behavior.
---
 src/backend/utils/adt/encode.c | 37 +++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c
index a6a9e23888..cbbc4b69a2 100644
--- a/src/backend/utils/adt/encode.c
+++ b/src/backend/utils/adt/encode.c
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/encode.c,v 1.20 2008/01/01 19:45:52 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/encode.c,v 1.21 2008/02/26 02:54:08 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -26,7 +26,7 @@ struct pg_encoding
 	unsigned	(*decode) (const char *data, unsigned dlen, char *res);
 };
 
-static struct pg_encoding *pg_find_encoding(const char *name);
+static const struct pg_encoding *pg_find_encoding(const char *name);
 
 /*
  * SQL functions.
@@ -42,7 +42,7 @@ binary_encode(PG_FUNCTION_ARGS)
 	int			datalen,
 				resultlen,
 				res;
-	struct pg_encoding *enc;
+	const struct pg_encoding *enc;
 
 	datalen = VARSIZE(data) - VARHDRSZ;
 
@@ -78,7 +78,7 @@ binary_decode(PG_FUNCTION_ARGS)
 	int			datalen,
 				resultlen,
 				res;
-	struct pg_encoding *enc;
+	const struct pg_encoding *enc;
 
 	datalen = VARSIZE(data) - VARHDRSZ;
 
@@ -348,10 +348,13 @@ b64_dec_len(const char *src, unsigned srclen)
  * Minimally escape bytea to text.
  * De-escape text to bytea.
  *
- * Only two characters are escaped:
- * \0 (null) and \\ (backslash)
+ * We must escape zero bytes and high-bit-set bytes to avoid generating
+ * text that might be invalid in the current encoding, or that might
+ * change to something else if passed through an encoding conversion
+ * (leading to failing to de-escape to the original bytea value).
+ * Also of course backslash itself has to be escaped.
  *
- * De-escapes \\ and any \### octal
+ * De-escaping processes \\ and any \### octal escape sequence
  */
 
 #define VAL(CH)			((CH) - '0')
@@ -366,16 +369,18 @@ esc_encode(const char *src, unsigned srclen, char *dst)
 
 	while (src < end)
 	{
-		if (*src == '\0')
+		unsigned char c = (unsigned char) *src;
+
+		if (c == '\0' || IS_HIGHBIT_SET(c))
 		{
 			rp[0] = '\\';
-			rp[1] = '0';
-			rp[2] = '0';
-			rp[3] = '0';
+			rp[1] = DIG(c >> 6);
+			rp[2] = DIG((c >> 3) & 7);
+			rp[3] = DIG(c & 7);
 			rp += 4;
 			len += 4;
 		}
-		else if (*src == '\\')
+		else if (c == '\\')
 		{
 			rp[0] = '\\';
 			rp[1] = '\\';
@@ -384,7 +389,7 @@ esc_encode(const char *src, unsigned srclen, char *dst)
 		}
 		else
 		{
-			*rp++ = *src;
+			*rp++ = c;
 			len++;
 		}
 
@@ -450,7 +455,7 @@ esc_enc_len(const char *src, unsigned srclen)
 
 	while (src < end)
 	{
-		if (*src == '\0')
+		if (*src == '\0' || IS_HIGHBIT_SET(*src))
 			len += 4;
 		else if (*src == '\\')
 			len += 2;
@@ -510,7 +515,7 @@ esc_dec_len(const char *src, unsigned srclen)
  * Common
  */
 
-static struct
+static const struct
 {
 	const char *name;
 	struct pg_encoding enc;
@@ -543,7 +548,7 @@ static struct
 	}
 };
 
-static struct pg_encoding *
+static const struct pg_encoding *
 pg_find_encoding(const char *name)
 {
 	int			i;