Make sure chr(int) can't create invalid UTF8 sequences.

author Tom Lane <tgl@sss.pgh.pa.us>
Fri, 16 May 2014 20:51:28 +0000 (16:51 -0400)

committer Tom Lane <tgl@sss.pgh.pa.us>
Fri, 16 May 2014 20:51:28 +0000 (16:51 -0400)

diff --git a/src/backend/utils/adt/oracle_compat.c b/src/backend/utils/adt/oracle_compat.c

index 4dab45caf4f4f8fb5f1594ebf10fbf183d51659a..ba3d5d6e1399fb8f0dd41dcf0cdc515b9b031705 100644 (file)
--- a/src/backend/utils/adt/oracle_compat.c
+++ b/src/backend/utils/adt/oracle_compat.c
@@ -932,10 +932,14 @@ chr                       (PG_FUNCTION_ARGS)
         {
                 /* for Unicode we treat the argument as a code point */
                 int                     bytes;
-               char       *wch;
+               unsigned char *wch;
  
-               /* We only allow valid Unicode code points */
-               if (cvalue > 0x001fffff)
+               /*
+                * We only allow valid Unicode code points; per RFC3629 that stops at
+                * U+10FFFF, even though 4-byte UTF8 sequences can hold values up to
+                * U+1FFFFF.
+                */
+               if (cvalue > 0x0010ffff)
                         ereport(ERROR,
                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                                          errmsg("requested character too large for encoding: %d",
@@ -950,7 +954,7 @@ chr                 (PG_FUNCTION_ARGS)
  
                 result = (text *) palloc(VARHDRSZ + bytes);
                 SET_VARSIZE(result, VARHDRSZ + bytes);
-               wch = VARDATA(result);
+               wch = (unsigned char *) VARDATA(result);
  
                 if (bytes == 2)
                 {
@@ -971,8 +975,17 @@ chr                        (PG_FUNCTION_ARGS)
                         wch[3] = 0x80 | (cvalue & 0x3F);
                 }
  
+               /*
+                * The preceding range check isn't sufficient, because UTF8 excludes
+                * Unicode "surrogate pair" codes.  Make sure what we created is valid
+                * UTF8.
+                */
+               if (!pg_utf8_islegal(wch, bytes))
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+                                        errmsg("requested character not valid for encoding: %d",
+                                                       cvalue)));
         }
-
         else
         {
                 bool            is_mb;
@@ -981,7 +994,6 @@ chr                 (PG_FUNCTION_ARGS)
                  * Error out on arguments that make no sense or that we can't validly
                  * represent in the encoding.
                  */
-
                 if (cvalue == 0)
                         ereport(ERROR,
                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
@@ -995,7 +1007,6 @@ chr                        (PG_FUNCTION_ARGS)
                                          errmsg("requested character too large for encoding: %d",
                                                         cvalue)));
  
-
                 result = (text *) palloc(VARHDRSZ + 1);
                 SET_VARSIZE(result, VARHDRSZ + 1);
                 *VARDATA(result) = (char) cvalue;
author	Tom Lane <tgl@sss.pgh.pa.us>
	Fri, 16 May 2014 20:51:28 +0000 (16:51 -0400)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Fri, 16 May 2014 20:51:28 +0000 (16:51 -0400)