From fd15dba543247eb1ce879d22632b9fdb4c230831 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 26 Feb 2008 02:54:08 +0000 Subject: [PATCH] Fix encode(...bytea..., 'escape') so that it converts all high-bit-set byte values into \nnn octal escape sequences. When the database encoding is multibyte this is *necessary* to avoid generating invalidly encoded text. Even in a single-byte encoding, the old behavior seems very hazardous --- consider for example what happens if the text is transferred to another database with a different encoding. Decoding would then yield some other bytea value than what was encoded, which is surely undesirable. Per gripe from Hernan Gonzalez. Backpatch to 8.3, but not further. This is a bit of a judgment call, but I make it on these grounds: pre-8.3 we don't really have much encoding safety anyway because of the convert() function family, and we would also have much higher risk of breaking existing apps that may not be expecting this behavior. 8.3 is still new enough that we can probably get away with making this change in the function's behavior. --- src/backend/utils/adt/encode.c | 37 +++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c index a6a9e23888..cbbc4b69a2 100644 --- a/src/backend/utils/adt/encode.c +++ b/src/backend/utils/adt/encode.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/encode.c,v 1.20 2008/01/01 19:45:52 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/encode.c,v 1.21 2008/02/26 02:54:08 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -26,7 +26,7 @@ struct pg_encoding unsigned (*decode) (const char *data, unsigned dlen, char *res); }; -static struct pg_encoding *pg_find_encoding(const char *name); +static const struct pg_encoding *pg_find_encoding(const char *name); /* * SQL functions. @@ -42,7 +42,7 @@ binary_encode(PG_FUNCTION_ARGS) int datalen, resultlen, res; - struct pg_encoding *enc; + const struct pg_encoding *enc; datalen = VARSIZE(data) - VARHDRSZ; @@ -78,7 +78,7 @@ binary_decode(PG_FUNCTION_ARGS) int datalen, resultlen, res; - struct pg_encoding *enc; + const struct pg_encoding *enc; datalen = VARSIZE(data) - VARHDRSZ; @@ -348,10 +348,13 @@ b64_dec_len(const char *src, unsigned srclen) * Minimally escape bytea to text. * De-escape text to bytea. * - * Only two characters are escaped: - * \0 (null) and \\ (backslash) + * We must escape zero bytes and high-bit-set bytes to avoid generating + * text that might be invalid in the current encoding, or that might + * change to something else if passed through an encoding conversion + * (leading to failing to de-escape to the original bytea value). + * Also of course backslash itself has to be escaped. * - * De-escapes \\ and any \### octal + * De-escaping processes \\ and any \### octal */ #define VAL(CH) ((CH) - '0') @@ -366,16 +369,18 @@ esc_encode(const char *src, unsigned srclen, char *dst) while (src < end) { - if (*src == '\0') + unsigned char c = (unsigned char) *src; + + if (c == '\0' || IS_HIGHBIT_SET(c)) { rp[0] = '\\'; - rp[1] = '0'; - rp[2] = '0'; - rp[3] = '0'; + rp[1] = DIG(c >> 6); + rp[2] = DIG((c >> 3) & 7); + rp[3] = DIG(c & 7); rp += 4; len += 4; } - else if (*src == '\\') + else if (c == '\\') { rp[0] = '\\'; rp[1] = '\\'; @@ -384,7 +389,7 @@ esc_encode(const char *src, unsigned srclen, char *dst) } else { - *rp++ = *src; + *rp++ = c; len++; } @@ -450,7 +455,7 @@ esc_enc_len(const char *src, unsigned srclen) while (src < end) { - if (*src == '\0') + if (*src == '\0' || IS_HIGHBIT_SET(*src)) len += 4; else if (*src == '\\') len += 2; @@ -510,7 +515,7 @@ esc_dec_len(const char *src, unsigned srclen) * Common */ -static struct +static const struct { const char *name; struct pg_encoding enc; @@ -543,7 +548,7 @@ static struct } }; -static struct pg_encoding * +static const struct pg_encoding * pg_find_encoding(const char *name) { int i;