From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Tue, 26 Feb 2008 02:54:08 +0000 (+0000)
Subject: Fix encode(...bytea..., 'escape') so that it converts all high-bit-set byte
X-Git-Tag: REL8_4_BETA1~1955
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=fd15dba543247eb1ce879d22632b9fdb4c230831;p=postgresql

Fix encode(...bytea..., 'escape') so that it converts all high-bit-set byte
values into \nnn octal escape sequences.  When the database encoding is
multibyte this is *necessary* to avoid generating invalidly encoded text.
Even in a single-byte encoding, the old behavior seems very hazardous ---
consider for example what happens if the text is transferred to another
database with a different encoding.  Decoding would then yield some other
bytea value than what was encoded, which is surely undesirable.  Per gripe
from Hernan Gonzalez.

Backpatch to 8.3, but not further.  This is a bit of a judgment call, but I
make it on these grounds: pre-8.3 we don't really have much encoding safety
anyway because of the convert() function family, and backpatching further
would carry a much higher risk of breaking existing apps that may not be
expecting this behavior.  8.3 is still new enough that we can probably get
away with making this change in the function's behavior.
---

diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c
index a6a9e23888..cbbc4b69a2 100644
--- a/src/backend/utils/adt/encode.c
+++ b/src/backend/utils/adt/encode.c
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/encode.c,v 1.20 2008/01/01 19:45:52 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/encode.c,v 1.21 2008/02/26 02:54:08 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -26,7 +26,7 @@ struct pg_encoding
 	unsigned	(*decode) (const char *data, unsigned dlen, char *res);
 };
 
-static struct pg_encoding *pg_find_encoding(const char *name);
+static const struct pg_encoding *pg_find_encoding(const char *name);
 
 /*
  * SQL functions.
@@ -42,7 +42,7 @@ binary_encode(PG_FUNCTION_ARGS)
 	int			datalen,
 				resultlen,
 				res;
-	struct pg_encoding *enc;
+	const struct pg_encoding *enc;
 
 	datalen = VARSIZE(data) - VARHDRSZ;
 
@@ -78,7 +78,7 @@ binary_decode(PG_FUNCTION_ARGS)
 	int			datalen,
 				resultlen,
 				res;
-	struct pg_encoding *enc;
+	const struct pg_encoding *enc;
 
 	datalen = VARSIZE(data) - VARHDRSZ;
 
@@ -348,10 +348,13 @@ b64_dec_len(const char *src, unsigned srclen)
  * Minimally escape bytea to text.
  * De-escape text to bytea.
  *
- * Only two characters are escaped:
- * \0 (null) and \\ (backslash)
+ * We must escape zero bytes and high-bit-set bytes to avoid generating
+ * text that might be invalid in the current encoding, or that might
+ * change to something else if passed through an encoding conversion
+ * (leading to failing to de-escape to the original bytea value).
+ * Also of course backslash itself has to be escaped.
  *
- * De-escapes \\ and any \### octal
+ * De-escaping processes \\ and any \### octal escape sequence.
  */
 
 #define VAL(CH)			((CH) - '0')
@@ -366,16 +369,18 @@ esc_encode(const char *src, unsigned srclen, char *dst)
 
 	while (src < end)
 	{
-		if (*src == '\0')
+		unsigned char c = (unsigned char) *src;
+
+		if (c == '\0' || IS_HIGHBIT_SET(c))
 		{
 			rp[0] = '\\';
-			rp[1] = '0';
-			rp[2] = '0';
-			rp[3] = '0';
+			rp[1] = DIG(c >> 6);
+			rp[2] = DIG((c >> 3) & 7);
+			rp[3] = DIG(c & 7);
 			rp += 4;
 			len += 4;
 		}
-		else if (*src == '\\')
+		else if (c == '\\')
 		{
 			rp[0] = '\\';
 			rp[1] = '\\';
@@ -384,7 +389,7 @@ esc_encode(const char *src, unsigned srclen, char *dst)
 		}
 		else
 		{
-			*rp++ = *src;
+			*rp++ = c;
 			len++;
 		}
 
@@ -450,7 +455,7 @@ esc_enc_len(const char *src, unsigned srclen)
 
 	while (src < end)
 	{
-		if (*src == '\0')
+		if (*src == '\0' || IS_HIGHBIT_SET(*src))
 			len += 4;
 		else if (*src == '\\')
 			len += 2;
@@ -510,7 +515,7 @@ esc_dec_len(const char *src, unsigned srclen)
  * Common
  */
 
-static struct
+static const struct
 {
 	const char *name;
 	struct pg_encoding enc;
@@ -543,7 +548,7 @@ static struct
 	}
 };
 
-static struct pg_encoding *
+static const struct pg_encoding *
 pg_find_encoding(const char *name)
 {
 	int			i;