Surrogate pair support for U& string and identifier syntax

author Peter Eisentraut <peter_e@gmx.net>

Mon, 21 Sep 2009 22:22:07 +0000 (22:22 +0000)

committer Peter Eisentraut <peter_e@gmx.net>

Mon, 21 Sep 2009 22:22:07 +0000 (22:22 +0000)
author Peter Eisentraut <peter_e@gmx.net>
Mon, 21 Sep 2009 22:22:07 +0000 (22:22 +0000)
committer Peter Eisentraut <peter_e@gmx.net>
Mon, 21 Sep 2009 22:22:07 +0000 (22:22 +0000)
diff --git a/doc/src/sgml/syntax.sgml b/doc/src/sgml/syntax.sgml

index c2dd31b98d37b591ff64774aeab6ed6d8fd46b8c..c805e2e7141b9572945ddf5ddb87e268e83bb105 100644 (file)
--- a/doc/src/sgml/syntax.sgml
+++ b/doc/src/sgml/syntax.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/syntax.sgml,v 1.134 2009/08/27 20:08:02 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/syntax.sgml,v 1.135 2009/09/21 22:22:07 petere Exp $ -->
  
  <chapter id="sql-syntax">
   <title>SQL Syntax</title>
@@ -238,6 +238,10 @@ U&amp;"d!0061t!+000061" UESCAPE '!'
      The Unicode escape syntax works only when the server encoding is
      UTF8.  When other server encodings are used, only code points in
      the ASCII range (up to <literal>\007F</literal>) can be specified.
+    Both the 4-digit and the 6-digit forms can be used to specify
+    UTF-16 surrogate pairs, which are combined into a single character
+    with a code point larger than <literal>\FFFF</literal> (although the
+    availability of the 6-digit form technically makes this unnecessary).
     </para>
  
     <para>
@@ -497,6 +501,10 @@ U&amp;'d!0061t!+000061' UESCAPE '!'
       UTF8.  When other server encodings are used, only code points in
       the ASCII range (up to <literal>\007F</literal>) can be
       specified.
+     Both the 4-digit and the 6-digit forms can be used to specify
+     UTF-16 surrogate pairs, which are combined into a single character
+     with a code point larger than <literal>\FFFF</literal> (although the
+     availability of the 6-digit form technically makes this unnecessary).
      </para>
  
      <para>
diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l

index a5ed54792b66ae3c0f83915c4a08854bf4487450..d40bd9dd97e3b351de4c4148a9e022033890ca6b 100644 (file)
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@@ -24,7 +24,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.157 2009/07/14 20:24:10 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.158 2009/09/21 22:22:07 petere Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -1097,11 +1097,30 @@ check_unicode_value(pg_wchar c, char *loc, base_yyscan_t yyscanner)
         }
  }
  
+/*
+ * Report whether c lies in the range 0xD800..0xDBFF, i.e. whether it can
+ * serve as the first half of a UTF-16 surrogate pair.
+ */
+static bool
+is_utf16_surrogate_first(pg_wchar c)
+{
+       /* Reject anything below or above the range; what remains is inside. */
+       return !(c < 0xD800 || c > 0xDBFF);
+}
+
+/*
+ * Report whether c lies in the range 0xDC00..0xDFFF, i.e. whether it can
+ * serve as the second half of a UTF-16 surrogate pair.
+ */
+static bool
+is_utf16_surrogate_second(pg_wchar c)
+{
+       /* Reject anything below or above the range; what remains is inside. */
+       return !(c < 0xDC00 || c > 0xDFFF);
+}
+
+/*
+ * Combine the two halves of a UTF-16 surrogate pair into one code point.
+ *
+ * The low 10 bits of "first" are shifted up by 10, the low 10 bits of
+ * "second" are taken as-is, and both are added to 0x10000.  Neither
+ * argument is range-checked here; the other bits are simply masked off.
+ */
+static pg_wchar
+surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
+{
+       pg_wchar upper_bits = (first & 0x3FF) << 10;
+       pg_wchar lower_bits = second & 0x3FF;
+
+       return 0x10000 + upper_bits + lower_bits;
+}
+
  static char *
  litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
  {
         char *new;
         char *litbuf, *in, *out;
+       pg_wchar pair_first = 0;
  
         if (isxdigit(escape)
                 || escape == '+'
@@ -1131,6 +1150,11 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
                 {
                         if (in[1] == escape)
                         {
+                               if (pair_first)
+                               {
+                                       ADVANCE_YYLLOC(in - litbuf + 3);   /* 3 for U&" */
+                                       yyerror("invalid Unicode surrogate pair");
+                               }
                                 *out++ = escape;
                                 in += 2;
                         }
@@ -1138,9 +1162,27 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
                         {
                                 pg_wchar unicode = hexval(in[1]) * 16*16*16 + hexval(in[2]) * 16*16 + hexval(in[3]) * 16 + hexval(in[4]);
                                 check_unicode_value(unicode, in, yyscanner);
-                               unicode_to_utf8(unicode, (unsigned char *) out);
+                               if (pair_first)
+                               {
+                                       if (is_utf16_surrogate_second(unicode))
+                                       {
+                                               unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+                                               pair_first = 0;
+                                       }
+                                       else
+                                       {
+                                               ADVANCE_YYLLOC(in - litbuf + 3);   /* 3 for U&" */
+                                               yyerror("invalid Unicode surrogate pair");
+                                       }
+                               }
+                               if (is_utf16_surrogate_first(unicode))
+                                       pair_first = unicode;
+                               else
+                               {
+                                       unicode_to_utf8(unicode, (unsigned char *) out);
+                                       out += pg_mblen(out);
+                               }
                                 in += 5;
-                               out += pg_mblen(out);
                         }
                         else if (in[1] == '+'
                                          && isxdigit(in[2]) && isxdigit(in[3])
@@ -1150,9 +1192,27 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
                                 pg_wchar unicode = hexval(in[2]) * 16*16*16*16*16 + hexval(in[3]) * 16*16*16*16 + hexval(in[4]) * 16*16*16
                                                                         + hexval(in[5]) * 16*16 + hexval(in[6]) * 16 + hexval(in[7]);
                                 check_unicode_value(unicode, in, yyscanner);
-                               unicode_to_utf8(unicode, (unsigned char *) out);
+                               if (pair_first)
+                               {
+                                       if (is_utf16_surrogate_second(unicode))
+                                       {
+                                               unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+                                               pair_first = 0;
+                                       }
+                                       else
+                                       {
+                                               ADVANCE_YYLLOC(in - litbuf + 3);   /* 3 for U&" */
+                                               yyerror("invalid Unicode surrogate pair");
+                                       }
+                               }
+                               if (is_utf16_surrogate_first(unicode))
+                                       pair_first = unicode;
+                               else
+                               {
+                                       unicode_to_utf8(unicode, (unsigned char *) out);
+                                       out += pg_mblen(out);
+                               }
                                 in += 8;
-                               out += pg_mblen(out);
                         }
                         else
                         {
@@ -1161,7 +1221,14 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
                         }
                 }
                 else
+               {
+                       if (pair_first)
+                       {
+                               ADVANCE_YYLLOC(in - litbuf + 3);   /* 3 for U&" */
+                               yyerror("invalid Unicode surrogate pair");
+                       }
                         *out++ = *in++;
+               }
         }
  
         *out = '\0';
author	Peter Eisentraut <peter_e@gmx.net>
	Mon, 21 Sep 2009 22:22:07 +0000 (22:22 +0000)
committer	Peter Eisentraut <peter_e@gmx.net>
	Mon, 21 Sep 2009 22:22:07 +0000 (22:22 +0000)
doc/src/sgml/syntax.sgml		patch \| blob \| history
src/backend/parser/scan.l		patch \| blob \| history