-<!-- $PostgreSQL: pgsql/doc/src/sgml/syntax.sgml,v 1.135 2009/09/21 22:22:07 petere Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/syntax.sgml,v 1.136 2009/09/22 23:52:53 petere Exp $ -->
<chapter id="sql-syntax">
<title>SQL Syntax</title>
</entry>
<entry>hexadecimal byte value</entry>
</row>
+ <row>
+ <entry>
+ <literal>\u<replaceable>xxxx</replaceable></literal>,
+ <literal>\U<replaceable>xxxxxxxx</replaceable></literal>
+ (<replaceable>x</replaceable> = 0 - 9, A - F)
+ </entry>
+ <entry>16 or 32-bit hexadecimal Unicode character value</entry>
+ </row>
</tbody>
</tgroup>
</table>
</para>
<para>
- It is your responsibility that the byte sequences you create are
+ It is your responsibility that the byte sequences you create,
+ especially when using the octal or hexadecimal escapes, compose
valid characters in the server character set encoding. When the
- server encoding is UTF-8, then the alternative Unicode escape
- syntax, explained in <xref linkend="sql-syntax-strings-uescape">,
- should be used instead. (The alternative would be doing the
- UTF-8 encoding by hand and writing out the bytes, which would be
- very cumbersome.)
+ server encoding is UTF-8, then the Unicode escapes or the
+ alternative Unicode escape syntax, explained
+ in <xref linkend="sql-syntax-strings-uescape">, should be used
+ instead. (The alternative would be doing the UTF-8 encoding by
+ hand and writing out the bytes, which would be very cumbersome.)
+ </para>
+
+ <para>
+ The Unicode escape syntax works fully only when the server
+ encoding is UTF-8. When other server encodings are used, only
+ code points in the ASCII range (up to <literal>\u007F</>) can be
+ specified. Both the 4-digit and the 8-digit form can be used to
+ specify UTF-16 surrogate pairs to compose characters with code
+ points larger than <literal>\uFFFF</literal> (although the
+ availability of the 8-digit form technically makes this
+ unnecessary).
</para>
<caution>
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.158 2009/09/21 22:22:07 petere Exp $
+ * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.159 2009/09/22 23:52:53 petere Exp $
*
*-------------------------------------------------------------------------
*/
static char *litbufdup(base_yyscan_t yyscanner);
static char *litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner);
static unsigned char unescape_single_char(unsigned char c, base_yyscan_t yyscanner);
+static bool is_utf16_surrogate_first(pg_wchar c);
+static bool is_utf16_surrogate_second(pg_wchar c);
+static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second);
#define yyerror(msg) scanner_yyerror(msg, yyscanner)
extern int base_yyget_column(yyscan_t yyscanner);
extern void base_yyset_column(int column_no, yyscan_t yyscanner);
+static void addunicode(pg_wchar c, base_yyscan_t yyscanner); /* handle type matches the definition and the other static helpers */
+
%}
%option reentrant
* <xdolq> $foo$ quoted strings
* <xui> quoted identifier with Unicode escapes
* <xus> quoted string with Unicode escapes
+ * <xeu> Unicode surrogate pair in extended quoted string
*/
%x xb
%x xdolq
%x xui
%x xus
+%x xeu
/*
* In order to make the world safe for Windows and Mac clients as well as
xeescape [\\][^0-7]
xeoctesc [\\][0-7]{1,3}
xehexesc [\\]x[0-9A-Fa-f]{1,2}
+xeunicode [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
+xeunicodebad [\\]([uU])
/* Extended quote
* xqdouble implements embedded quote, ''''
<xe>{xeinside} {
addlit(yytext, yyleng, yyscanner);
}
+<xe>{xeunicode} {
+ pg_wchar c = strtoul(yytext+2, NULL, 16);
+ /*
+  * c is the value of the hex digits that follow the two-character
+  * escape prefix (hence yytext+2, parsed in base 16).
+  */
+ check_escape_warning(yyscanner);
+ /*
+  * A value accepted by is_utf16_surrogate_first() is not emitted on its
+  * own: it is saved in utf16_first_part and the scanner switches to
+  * <xeu>, where the second half of the pair must follow.  A second-half
+  * surrogate with no saved first half is an error.  Every other value
+  * is handed directly to addunicode().
+  */
+ if (is_utf16_surrogate_first(c))
+ {
+ yyextra->utf16_first_part = c;
+ BEGIN(xeu);
+ }
+ else if (is_utf16_surrogate_second(c))
+ yyerror("invalid Unicode surrogate pair");
+ else
+ addunicode(c, yyscanner);
+ }
+<xeu>{xeunicode} {
+ pg_wchar c = strtoul(yytext+2, NULL, 16);
+ /*
+  * In the code shown here, <xeu> is entered only from the <xe> rule
+  * after it saved a first surrogate, so this escape has to supply the
+  * second half of the pair.
+  */
+ if (!is_utf16_surrogate_second(c))
+ yyerror("invalid Unicode surrogate pair");
+ /* NOTE(review): continuing past the check relies on yyerror() not returning -- confirm. */
+ c = surrogate_pair_to_codepoint(yyextra->utf16_first_part, c);
+
+ addunicode(c, yyscanner);
+ /* Pair consumed; go back to the <xe> start condition. */
+ BEGIN(xe);
+ }
+<xeu>. |
+<xeu>\n |
+<xeu><<EOF>> { /* any other character, a newline, or end of input while a second surrogate is pending */ yyerror("invalid Unicode surrogate pair"); }
+
+<xe>{xeunicodebad} { /* backslash plus u or U lacking the 4 or 8 hex digits; the longer xeunicode match wins when they are present */
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
+ errmsg("invalid Unicode escape"),
+ errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."),
+ lexer_errposition()));
+ }
+
<xe>{xeescape} {
if (yytext[1] == '\'')
{
if (ptr)
pfree(ptr);
}
+/*
+ * addunicode
+ *
+ * Validate the Unicode code point c and append it to the literal being
+ * accumulated by the scanner (via addlit()).
+ *
+ * c          code point taken from a Unicode escape
+ * yyscanner  scanner handle; yyextra is reached through it
+ *
+ * Errors are reported through yyerror() when c is zero, when c exceeds
+ * 0x10FFFF, or when c is above 0x7F while GetDatabaseEncoding() is not
+ * PG_UTF8.  For any accepted value above 0x7F, saw_non_ascii is set.
+ *
+ * NOTE(review): the checks below fall through to the following code,
+ * which presumably relies on yyerror() not returning -- confirm.
+ * NOTE(review): unicode_to_utf8() presumably stores the UTF-8 encoding of
+ * c in buf, and pg_mblen() presumably yields the byte length of the
+ * character at buf; neither helper is visible here -- confirm.
+ */
+static void
+addunicode(pg_wchar c, base_yyscan_t yyscanner)
+{
+ char buf[8];
+ /* Zero and anything beyond 0x10FFFF are rejected outright. */
+ if (c == 0 || c > 0x10FFFF)
+ yyerror("invalid Unicode escape value");
+ if (c > 0x7F)
+ {
+ if (GetDatabaseEncoding() != PG_UTF8)
+ yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
+ yyextra->saw_non_ascii = true;
+ }
+ unicode_to_utf8(c, (unsigned char *)buf);
+ addlit(buf, pg_mblen(buf), yyscanner);
+}
+
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/parser/gramparse.h,v 1.47 2009/07/14 20:24:10 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/parser/gramparse.h,v 1.48 2009/09/22 23:52:53 petere Exp $
*
*-------------------------------------------------------------------------
*/
int xcdepth; /* depth of nesting in slash-star comments */
char *dolqstart; /* current $foo$ quote start string */
+ /* first part of UTF16 surrogate pair for Unicode escapes */
+ int32 utf16_first_part;
+
/* state variables for literal-lexing warnings */
bool warn_on_first_escape;
bool saw_non_ascii;