]> granicus.if.org Git - postgresql/blob - src/backend/parser/scan.l
Change internal integer representation of Value node
[postgresql] / src / backend / parser / scan.l
1 %top{
2 /*-------------------------------------------------------------------------
3  *
4  * scan.l
5  *        lexical scanner for PostgreSQL
6  *
7  * NOTE NOTE NOTE:
8  *
9  * The rules in this file must be kept in sync with src/fe_utils/psqlscan.l!
10  *
11  * The rules are designed so that the scanner never has to backtrack,
12  * in the sense that there is always a rule that can match the input
13  * consumed so far (the rule action may internally throw back some input
14  * with yyless(), however).  As explained in the flex manual, this makes
15  * for a useful speed increase --- about a third faster than a plain -CF
16  * lexer, in simple testing.  The extra complexity is mostly in the rules
17  * for handling float numbers and continued string literals.  If you change
18  * the lexical rules, verify that you haven't broken the no-backtrack
19  * property by running flex with the "-b" option and checking that the
20  * resulting "lex.backup" file says that no backing up is needed.  (As of
21  * Postgres 9.2, this check is made automatically by the Makefile.)
22  *
23  *
24  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
25  * Portions Copyright (c) 1994, Regents of the University of California
26  *
27  * IDENTIFICATION
28  *        src/backend/parser/scan.l
29  *
30  *-------------------------------------------------------------------------
31  */
32 #include "postgres.h"
33
34 #include <ctype.h>
35 #include <unistd.h>
36
37 #include "parser/gramparse.h"
38 #include "parser/parser.h"              /* only needed for GUC variables */
39 #include "parser/scansup.h"
40 #include "mb/pg_wchar.h"
41 }
42
43 %{
44
45 /* LCOV_EXCL_START */
46
47 /* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
48 #undef fprintf
49 #define fprintf(file, fmt, msg)  fprintf_to_ereport(fmt, msg)
50
/*
 * fprintf_to_ereport
 *      Target of the fprintf() macro defined just above.
 *
 * fmt is accepted only so that the macro's argument list can be passed
 * straight through; it is not used.  msg is reported through ereport() at
 * ERROR level, always via "%s", so msg itself is never interpreted as a
 * format string.
 */
51 static void
52 fprintf_to_ereport(const char *fmt, const char *msg)
53 {
54         ereport(ERROR, (errmsg_internal("%s", msg)));
55 }
56
57 /*
58  * GUC variables.  This is a DIRECT violation of the warning given at the
59  * head of gram.y, i.e., flex/bison code must not depend on any GUC variables;
60  * as such, changing their values can induce very unintuitive behavior.
61  * But we shall have to live with it until we can remove these variables.
62  */
63 int                     backslash_quote = BACKSLASH_QUOTE_SAFE_ENCODING;
64 bool            escape_string_warning = true;
65 bool            standard_conforming_strings = true;
66
67 /*
68  * Set the type of YYSTYPE.
69  */
70 #define YYSTYPE core_YYSTYPE
71
72 /*
73  * Set the type of yyextra.  All state variables used by the scanner should
74  * be in yyextra, *not* statically allocated.
75  */
76 #define YY_EXTRA_TYPE core_yy_extra_type *
77
78 /*
79  * Each call to yylex must set yylloc to the location of the found token
80  * (expressed as a byte offset from the start of the input text).
81  * When we parse a token that requires multiple lexer rules to process,
82  * this should be done in the first such rule, else yylloc will point
83  * into the middle of the token.
84  */
85 #define SET_YYLLOC()  (*(yylloc) = yytext - yyextra->scanbuf)
86
87 /*
88  * Advance yylloc by the given number of bytes.
89  */
90 #define ADVANCE_YYLLOC(delta)  ( *(yylloc) += (delta) )
91
92 #define startlit()      ( yyextra->literallen = 0 )
93 static void addlit(char *ytext, int yleng, core_yyscan_t yyscanner);
94 static void addlitchar(unsigned char ychar, core_yyscan_t yyscanner);
95 static char *litbufdup(core_yyscan_t yyscanner);
96 static char *litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner);
97 static unsigned char unescape_single_char(unsigned char c, core_yyscan_t yyscanner);
98 static int      process_integer_literal(const char *token, YYSTYPE *lval);
99 static bool is_utf16_surrogate_first(pg_wchar c);
100 static bool is_utf16_surrogate_second(pg_wchar c);
101 static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second);
102 static void addunicode(pg_wchar c, yyscan_t yyscanner);
103 static bool check_uescapechar(unsigned char escape);
104
105 #define yyerror(msg)  scanner_yyerror(msg, yyscanner)
106
107 #define lexer_errposition()  scanner_errposition(*(yylloc), yyscanner)
108
109 static void check_string_escape_warning(unsigned char ychar, core_yyscan_t yyscanner);
110 static void check_escape_warning(core_yyscan_t yyscanner);
111
112 /*
113  * Work around a bug in flex 2.5.35: it emits a couple of functions that
114  * it forgets to emit declarations for.  Since we use -Wmissing-prototypes,
115  * this would cause warnings.  Providing our own declarations should be
116  * harmless even when the bug gets fixed.
117  */
118 extern int      core_yyget_column(yyscan_t yyscanner);
119 extern void core_yyset_column(int column_no, yyscan_t yyscanner);
120
121 %}
122
123 %option reentrant
124 %option bison-bridge
125 %option bison-locations
126 %option 8bit
127 %option never-interactive
128 %option nodefault
129 %option noinput
130 %option nounput
131 %option noyywrap
132 %option noyyalloc
133 %option noyyrealloc
134 %option noyyfree
135 %option warn
136 %option prefix="core_yy"
137
138 /*
139  * OK, here is a short description of lex/flex rules behavior.
140  * The longest pattern which matches an input string is always chosen.
141  * For equal-length patterns, the first occurring in the rules list is chosen.
142  * INITIAL is the starting state, to which all non-conditional rules apply.
143  * Exclusive states change parsing rules while the state is active.  When in
144  * an exclusive state, only those rules defined for that state apply.
145  *
146  * We use exclusive states for quoted strings, extended comments,
147  * and to eliminate parsing troubles for numeric strings.
148  * Exclusive states:
149  *  <xb> bit string literal
150  *  <xc> extended C-style comments
151  *  <xd> delimited identifiers (double-quoted identifiers)
152  *  <xh> hexadecimal numeric string
153  *  <xq> standard quoted strings
154  *  <xe> extended quoted strings (support backslash escape sequences)
155  *  <xdolq> $foo$ quoted strings
156  *  <xui> quoted identifier with Unicode escapes
157  *  <xuiend> end of a quoted identifier with Unicode escapes, UESCAPE can follow
158  *  <xus> quoted string with Unicode escapes
159  *  <xusend> end of a quoted string with Unicode escapes, UESCAPE can follow
160  *  <xeu> Unicode surrogate pair in extended quoted string
161  *
162  * Remember to add an <<EOF>> case whenever you add a new exclusive state!
163  * The default one is probably not the right thing.
164  */
165
166 %x xb
167 %x xc
168 %x xd
169 %x xh
170 %x xe
171 %x xq
172 %x xdolq
173 %x xui
174 %x xuiend
175 %x xus
176 %x xusend
177 %x xeu
178
179 /*
180  * In order to make the world safe for Windows and Mac clients as well as
181  * Unix ones, we accept either \n or \r as a newline.  A DOS-style \r\n
182  * sequence will be seen as two successive newlines, but that doesn't cause
183  * any problems.  Comments that start with -- and extend to the next
184  * newline are treated as equivalent to a single whitespace character.
185  *
186  * NOTE a fine point: if there is no newline following --, we will absorb
187  * everything to the end of the input as a comment.  This is correct.  Older
188  * versions of Postgres failed to recognize -- as a comment if the input
189  * did not end with a newline.
190  *
191  * XXX perhaps \f (formfeed) should be treated as a newline as well?
192  *
193  * XXX if you change the set of whitespace characters, fix scanner_isspace()
194  * to agree, and see also the plpgsql lexer.
195  */
196
197 space                   [ \t\n\r\f]
198 horiz_space             [ \t\f]
199 newline                 [\n\r]
200 non_newline             [^\n\r]
201
202 comment                 ("--"{non_newline}*)
203
204 whitespace              ({space}+|{comment})
205
206 /*
207  * SQL requires at least one newline in the whitespace separating
208  * string literals that are to be concatenated.  Silly, but who are we
209  * to argue?  Note that {whitespace_with_newline} should not have * after
210  * it, whereas {whitespace} should generally have a * after it...
211  */
212
213 special_whitespace              ({space}+|{comment}{newline})
214 horiz_whitespace                ({horiz_space}|{comment})
215 whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*)
216
217 /*
218  * To ensure that {quotecontinue} can be scanned without having to back up
219  * if the full pattern isn't matched, we include trailing whitespace in
220  * {quotestop}.  This matches all cases where {quotecontinue} fails to match,
221  * except for {quote} followed by whitespace and just one "-" (not two,
222  * which would start a {comment}).  To cover that we have {quotefail}.
223  * The actions for {quotestop} and {quotefail} must throw back characters
224  * beyond the quote proper.
225  */
226 quote                   '
227 quotestop               {quote}{whitespace}*
228 quotecontinue   {quote}{whitespace_with_newline}{quote}
229 quotefail               {quote}{whitespace}*"-"
230
231 /* Bit string
232  * It is tempting to scan the string for only those characters
233  * which are allowed. However, this leads to silently swallowed
234  * characters if illegal characters are included in the string.
235  * For example, if xbinside is [01] then B'ABCD' is interpreted
236  * as a zero-length string, and the ABCD' is lost!
237  * Better to pass the string forward and let the input routines
238  * validate the contents.
239  */
240 xbstart                 [bB]{quote}
241 xbinside                [^']*
242
243 /* Hexadecimal number */
244 xhstart                 [xX]{quote}
245 xhinside                [^']*
246
247 /* National character */
248 xnstart                 [nN]{quote}
249
250 /* Quoted string that allows backslash escapes */
251 xestart                 [eE]{quote}
252 xeinside                [^\\']+
253 xeescape                [\\][^0-7]
254 xeoctesc                [\\][0-7]{1,3}
255 xehexesc                [\\]x[0-9A-Fa-f]{1,2}
256 xeunicode               [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
257 xeunicodefail   [\\](u[0-9A-Fa-f]{0,3}|U[0-9A-Fa-f]{0,7})
258
259 /* Extended quote
260  * xqdouble implements embedded quote, ''''
261  */
262 xqstart                 {quote}
263 xqdouble                {quote}{quote}
264 xqinside                [^']+
265
266 /* $foo$ style quotes ("dollar quoting")
267  * The quoted string starts with $foo$ where "foo" is an optional string
268  * in the form of an identifier, except that it may not contain "$",
269  * and extends to the first occurrence of an identical string.
270  * There is *no* processing of the quoted text.
271  *
272  * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim}
273  * fails to match its trailing "$".
274  */
275 dolq_start              [A-Za-z\200-\377_]
276 dolq_cont               [A-Za-z\200-\377_0-9]
277 dolqdelim               \$({dolq_start}{dolq_cont}*)?\$
278 dolqfailed              \${dolq_start}{dolq_cont}*
279 dolqinside              [^$]+
280
281 /* Double quote
282  * Allows embedded spaces and other special characters into identifiers.
283  */
284 dquote                  \"
285 xdstart                 {dquote}
286 xdstop                  {dquote}
287 xddouble                {dquote}{dquote}
288 xdinside                [^"]+
289
290 /* Unicode escapes */
291 uescape                 [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
292 /* error rule to avoid backup */
293 uescapefail             [uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU]
294
295 /* Quoted identifier with Unicode escapes */
296 xuistart                [uU]&{dquote}
297
298 /* Quoted string with Unicode escapes */
299 xusstart                [uU]&{quote}
300
301 /* Optional UESCAPE after a quoted string or identifier with Unicode escapes. */
302 xustop1         {uescapefail}?
303 xustop2         {uescape}
304
305 /* error rule to avoid backup */
306 xufailed                [uU]&
307
308
309 /* C-style comments
310  *
311  * The "extended comment" syntax closely resembles allowable operator syntax.
312  * The tricky part here is to get lex to recognize a string starting with
313  * slash-star as a comment, when interpreting it as an operator would produce
314  * a longer match --- remember lex will prefer a longer match!  Also, if we
315  * have something like plus-slash-star, lex will think this is a 3-character
316  * operator whereas we want to see it as a + operator and a comment start.
317  * The solution is two-fold:
318  * 1. append {op_chars}* to xcstart so that it matches as much text as
319  *    {operator} would. Then the tie-breaker (first matching rule of same
320  *    length) ensures xcstart wins.  We put back the extra stuff with yyless()
321  *    in case it contains a star-slash that should terminate the comment.
322  * 2. In the operator rule, check for slash-star within the operator, and
323  *    if found throw it back with yyless().  This handles the plus-slash-star
324  *    problem.
325  * Dash-dash comments have similar interactions with the operator rule.
326  */
327 xcstart                 \/\*{op_chars}*
328 xcstop                  \*+\/
329 xcinside                [^*/]+
330
331 digit                   [0-9]
332 ident_start             [A-Za-z\200-\377_]
333 ident_cont              [A-Za-z\200-\377_0-9\$]
334
335 identifier              {ident_start}{ident_cont}*
336
337 /* Assorted special-case operators and operator-like tokens */
338 typecast                "::"
339 dot_dot                 \.\.
340 colon_equals    ":="
341 equals_greater  "=>"
342 less_equals             "<="
343 greater_equals  ">="
344 less_greater    "<>"
345 not_equals              "!="
346
347 /*
348  * "self" is the set of chars that should be returned as single-character
349  * tokens.  "op_chars" is the set of chars that can make up "Op" tokens,
350  * which can be one or more characters long (but if a single-char token
351  * appears in the "self" set, it is not to be returned as an Op).  Note
352  * that the sets overlap, but each has some chars that are not in the other.
353  *
354  * If you change either set, adjust the character lists appearing in the
355  * rule for "operator"!
356  */
357 self                    [,()\[\].;\:\+\-\*\/\%\^\<\>\=]
358 op_chars                [\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=]
359 operator                {op_chars}+
360
361 /* We no longer allow unary minus in numbers.
362  * Instead we pass it separately to the parser, where it gets
363  * coerced via doNegate() -- Leon, Aug 20 1999
364  *
365  * {decimalfail} is used because we would like "1..10" to lex as 1, dot_dot, 10.
366  *
367  * {realfail1} and {realfail2} are added to prevent the need for scanner
368  * backup when the {real} rule fails to match completely.
369  */
370
371 integer                 {digit}+
372 decimal                 (({digit}*\.{digit}+)|({digit}+\.{digit}*))
373 decimalfail             {digit}+\.\.
374 real                    ({integer}|{decimal})[Ee][-+]?{digit}+
375 realfail1               ({integer}|{decimal})[Ee]
376 realfail2               ({integer}|{decimal})[Ee][-+]
377
378 param                   \${integer}
379
380 other                   .
381
382 /*
383  * Dollar quoted strings are totally opaque, and no escaping is done on them.
384  * Other quoted strings must allow some special characters such as single-quote
385  *  and newline.
386  * Embedded single-quotes are implemented both in the SQL standard
387  *  style of two adjacent single quotes "''" and in the Postgres/Java style
388  *  of escaped-quote "\'".
389  * Other embedded escaped characters are matched explicitly and the leading
390  *  backslash is dropped from the string.
391  * Note that xcstart must appear before operator, as explained above!
392  *  Also whitespace (comment) must appear before operator.
393  */
394
395 %%
396
397 {whitespace}    {
398                                         /* ignore: runs of whitespace and "--" comments produce no token */
399                                 }
400
401 {xcstart}               {
402                                         /* Set location in case of syntax error in comment */
403                                         SET_YYLLOC();
                                            /* depth 0 means the outermost comment; nested openers raise it */
404                                         yyextra->xcdepth = 0;
405                                         BEGIN(xc);
406                                         /* Put back any characters past slash-star; see above */
407                                         yyless(2);
408                                 }
409
410 <xc>{xcstart}   {
                                            /* slash-star inside a comment opens one more nesting level */
411                                         (yyextra->xcdepth)++;
412                                         /* Put back any characters past slash-star; see above */
413                                         yyless(2);
414                                 }
415
416 <xc>{xcstop}    {
                                            /*
                                             * One or more stars followed by a slash closes the
                                             * innermost open comment.  At depth zero that was the
                                             * outermost one, so go back to normal scanning;
                                             * otherwise just drop one nesting level.
                                             */
417                                         if (yyextra->xcdepth <= 0)
418                                                 BEGIN(INITIAL);
419                                         else
420                                                 (yyextra->xcdepth)--;
421                                 }
422
423 <xc>{xcinside}  {
424                                         /* ignore: comment text containing no star or slash */
425                                 }
426
427 <xc>{op_chars}  {
428                                         /* ignore: a single operator character, e.g. a slash that starts no delimiter */
429                                 }
430
431 <xc>\*+                 {
432                                         /* ignore: stars not followed by a slash (xcstop would be the longer match) */
433                                 }
434
435 <xc><<EOF>>             { yyerror("unterminated /* comment"); }
436
437 {xbstart}               {
438                                         /* Binary bit type.
439                                          * At some point we should simply pass the string
440                                          * forward to the parser and label it there.
441                                          * In the meantime, place a leading "b" on the string
442                                          * to mark it for the input routine as a binary string.
443                                          */
444                                         SET_YYLLOC();
445                                         BEGIN(xb);
                                            /* startlit() zeroes the literal length; then store the marker */
446                                         startlit();
447                                         addlitchar('b', yyscanner);
448                                 }
449 <xb>{quotestop} |
450 <xb>{quotefail} {
                                            /*
                                             * Closing quote of a bit string.  Both patterns may have
                                             * matched text beyond the quote, so keep only the quote
                                             * itself and let the rest be rescanned.
                                             */
451                                         yyless(1);
452                                         BEGIN(INITIAL);
453                                         yylval->str = litbufdup(yyscanner);
454                                         return BCONST;
455                                 }
456 <xh>{xhinside}  |
457 <xb>{xbinside}  {
                                            /* any run of non-quote characters is collected; it is not validated here */
458                                         addlit(yytext, yyleng, yyscanner);
459                                 }
460 <xh>{quotecontinue}     |
461 <xb>{quotecontinue}     {
462                                         /* ignore: quote, whitespace containing a newline, quote -- the literal continues */
463                                 }
464 <xb><<EOF>>             { yyerror("unterminated bit string literal"); }
465
466 {xhstart}               {
467                                         /* Hexadecimal bit type.
468                                          * At some point we should simply pass the string
469                                          * forward to the parser and label it there.
470                                          * In the meantime, place a leading "x" on the string
471                                          * to mark it for the input routine as a hex string.
472                                          */
473                                         SET_YYLLOC();
474                                         BEGIN(xh);
                                            /* startlit() zeroes the literal length; then store the marker */
475                                         startlit();
476                                         addlitchar('x', yyscanner);
477                                 }
478 <xh>{quotestop} |
479 <xh>{quotefail} {
                                            /*
                                             * Closing quote of a hex string.  Both patterns may have
                                             * matched text beyond the quote, so keep only the quote
                                             * itself and let the rest be rescanned.
                                             */
480                                         yyless(1);
481                                         BEGIN(INITIAL);
482                                         yylval->str = litbufdup(yyscanner);
483                                         return XCONST;
484                                 }
485 <xh><<EOF>>             { yyerror("unterminated hexadecimal string literal"); }
486
487 {xnstart}               {
488                                         /* National character.
489                                          * We will pass this along as a normal character string,
490                                          * but preceded with an internally-generated "NCHAR".
491                                          */
492                                         const ScanKeyword *keyword;
493
                                            /*
                                             * The pattern matched the letter plus the quote.  The
                                             * quote is pushed back below, so it is scanned next as
                                             * the start of an ordinary quoted string.
                                             */
494                                         SET_YYLLOC();
495                                         yyless(1);      /* eat only 'n' this time */
496
                                            /* look "nchar" up in the keyword table supplied via yyextra */
497                                         keyword = ScanKeywordLookup("nchar",
498                                                                                                 yyextra->keywords,
499                                                                                                 yyextra->num_keywords);
500                                         if (keyword != NULL)
501                                         {
502                                                 yylval->keyword = keyword->name;
503                                                 return keyword->value;
504                                         }
505                                         else
506                                         {
507                                                 /* If NCHAR isn't a keyword, just return "n" */
508                                                 yylval->str = pstrdup("n");
509                                                 return IDENT;
510                                         }
511                                 }
512
513 {xqstart}               {
                                            /*
                                             * Plain quoted string.  It is scanned as a standard
                                             * string (xq) when standard_conforming_strings is set,
                                             * else as an extended string (xe) in which backslash
                                             * escapes are recognized.  Unlike the E'' rule, this one
                                             * sets warn_on_first_escape.
                                             */
514                                         yyextra->warn_on_first_escape = true;
515                                         yyextra->saw_non_ascii = false;
516                                         SET_YYLLOC();
517                                         if (yyextra->standard_conforming_strings)
518                                                 BEGIN(xq);
519                                         else
520                                                 BEGIN(xe);
521                                         startlit();
522                                 }
523 {xestart}               {
                                            /*
                                             * E'...' string: always scanned in xe, whatever
                                             * standard_conforming_strings says.  Unlike the plain
                                             * quote rule, warn_on_first_escape is cleared here.
                                             */
524                                         yyextra->warn_on_first_escape = false;
525                                         yyextra->saw_non_ascii = false;
526                                         SET_YYLLOC();
527                                         BEGIN(xe);
528                                         startlit();
529                                 }
530 {xusstart}              {
                                            /*
                                             * U&'...' string.  The location is set first so that the
                                             * error below can point at the literal.  Such literals
                                             * are rejected outright when standard_conforming_strings
                                             * is off; otherwise the raw text is collected in xus.
                                             */
531                                         SET_YYLLOC();
532                                         if (!yyextra->standard_conforming_strings)
533                                                 ereport(ERROR,
534                                                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
535                                                                  errmsg("unsafe use of string constant with Unicode escapes"),
536                                                                  errdetail("String constants with Unicode escapes cannot be used when standard_conforming_strings is off."),
537                                                                  lexer_errposition()));
538                                         BEGIN(xus);
539                                         startlit();
540                                 }
541 <xq,xe>{quotestop}      |
542 <xq,xe>{quotefail} {
                                            /* end of the literal: keep only the closing quote, rescan the rest */
543                                         yyless(1);
544                                         BEGIN(INITIAL);
545                                         /*
546                                          * check that the data remains valid if it might have been
547                                          * made invalid by unescaping any chars.
548                                          */
549                                         if (yyextra->saw_non_ascii)
550                                                 pg_verifymbstr(yyextra->literalbuf,
551                                                                            yyextra->literallen,
552                                                                            false);
553                                         yylval->str = litbufdup(yyscanner);
554                                         return SCONST;
555                                 }
556 <xus>{quotestop} |
557 <xus>{quotefail} {
558                                         /* throw back all but the quote */
559                                         yyless(1);
560                                         /*
                                             * xusend state looks for possible UESCAPE; no token is
                                             * returned from here, only from the xusend rules.
                                             */
561                                         BEGIN(xusend);
562                                 }
563 <xusend>{whitespace} {
564                                         /* stay in xusend state over whitespace */
565                                 }
566 <xusend><<EOF>> |
567 <xusend>{other} |
568 <xusend>{xustop1} {
569                                         /* no UESCAPE after the quote, throw back everything */
570                                         yyless(0);
571                                         BEGIN(INITIAL);
                                            /* with no UESCAPE clause, backslash is passed as the escape character */
572                                         yylval->str = litbuf_udeescape('\\', yyscanner);
573                                         return SCONST;
574                                 }
575 <xusend>{xustop2} {
576                                         /* found UESCAPE after the end quote */
577                                         BEGIN(INITIAL);
                                            /*
                                             * The match ends with quote, escape character, quote, so
                                             * yytext[yyleng - 2] is the escape character.  If it is
                                             * rejected, yylloc is moved onto that character before
                                             * the error is raised.
                                             */
578                                         if (!check_uescapechar(yytext[yyleng - 2]))
579                                         {
580                                                 SET_YYLLOC();
581                                                 ADVANCE_YYLLOC(yyleng - 2);
582                                                 yyerror("invalid Unicode escape character");
583                                         }
584                                         yylval->str = litbuf_udeescape(yytext[yyleng - 2],
585                                                                                                    yyscanner);
586                                         return SCONST;
587                                 }
588 <xq,xe,xus>{xqdouble} {
                                            /* two adjacent quotes stand for one quote in the literal */
589                                         addlitchar('\'', yyscanner);
590                                 }
591 <xq,xus>{xqinside}  {
                                            /* run of non-quote characters; backslash is not special in these states */
592                                         addlit(yytext, yyleng, yyscanner);
593                                 }
594 <xe>{xeinside}  {
                                            /* run containing neither quote nor backslash; escapes have their own rules */
595                                         addlit(yytext, yyleng, yyscanner);
596                                 }
597 <xe>{xeunicode} {
                                            /* skip the backslash and the u/U; the hex digits that follow are the value */
598                                         pg_wchar        c = strtoul(yytext + 2, NULL, 16);
599
600                                         check_escape_warning(yyscanner);
601
                                            /*
                                             * A first surrogate is remembered and the xeu state then
                                             * requires its partner.  A second surrogate on its own
                                             * is an error.  Anything else is added directly.
                                             */
602                                         if (is_utf16_surrogate_first(c))
603                                         {
604                                                 yyextra->utf16_first_part = c;
605                                                 BEGIN(xeu);
606                                         }
607                                         else if (is_utf16_surrogate_second(c))
608                                                 yyerror("invalid Unicode surrogate pair");
609                                         else
610                                                 addunicode(c, yyscanner);
611                                 }
612 <xeu>{xeunicode} {
                                            /* in xeu the previous escape was a first surrogate; this must be its partner */
613                                         pg_wchar        c = strtoul(yytext + 2, NULL, 16);
614
615                                         if (!is_utf16_surrogate_second(c))
616                                                 yyerror("invalid Unicode surrogate pair");
617
                                            /* combine the saved first half with this one into a single code point */
618                                         c = surrogate_pair_to_codepoint(yyextra->utf16_first_part, c);
619
620                                         addunicode(c, yyscanner);
621
                                            /* pair complete: resume ordinary extended-string scanning */
622                                         BEGIN(xe);
623                                 }
624 <xeu>.                  { yyerror("invalid Unicode surrogate pair"); }
625 <xeu>\n                 { yyerror("invalid Unicode surrogate pair"); }
626 <xeu><<EOF>>    { yyerror("invalid Unicode surrogate pair"); }
627 <xe,xeu>{xeunicodefail} {
                                            /* backslash-u or backslash-U followed by too few hex digits */
628                                         ereport(ERROR,
629                                                         (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
630                                                          errmsg("invalid Unicode escape"),
631                                                          errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."),
632                                                          lexer_errposition()));
633                                 }
634 <xe>{xeescape}  {
                                            /*
                                             * Backslash followed by one character that is not an
                                             * octal digit; yytext[1] is that character.  Backslash
                                             * followed by a quote is rejected when backslash_quote
                                             * is off, or when it is in "safe encoding" mode and the
                                             * client encoding is a client-only one.
                                             */
635                                         if (yytext[1] == '\'')
636                                         {
637                                                 if (yyextra->backslash_quote == BACKSLASH_QUOTE_OFF ||
638                                                         (yyextra->backslash_quote == BACKSLASH_QUOTE_SAFE_ENCODING &&
639                                                          PG_ENCODING_IS_CLIENT_ONLY(pg_get_client_encoding())))
640                                                         ereport(ERROR,
641                                                                         (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
642                                                                          errmsg("unsafe use of \\' in a string literal"),
643                                                                          errhint("Use '' to write quotes in strings. \\' is insecure in client-only encodings."),
644                                                                          lexer_errposition()));
645                                         }
646                                         check_string_escape_warning(yytext[1], yyscanner);
                                            /* store whatever unescape_single_char() returns for the escaped character */
647                                         addlitchar(unescape_single_char(yytext[1], yyscanner),
648                                                            yyscanner);
649                                 }
650 <xe>{xeoctesc}  {
                                            /*
                                             * Skip the backslash and read one to three octal digits.
                                             * The result is stored in an unsigned char, so only its
                                             * low-order byte is kept.
                                             */
651                                         unsigned char c = strtoul(yytext + 1, NULL, 8);
652
653                                         check_escape_warning(yyscanner);
654                                         addlitchar(c, yyscanner);
                                            /* a NUL or high-bit byte makes the end-of-literal rule re-verify the string */
655                                         if (c == '\0' || IS_HIGHBIT_SET(c))
656                                                 yyextra->saw_non_ascii = true;
657                                 }
658 <xe>{xehexesc}  {
                                            /* skip the backslash and the x, then read one or two hex digits */
659                                         unsigned char c = strtoul(yytext + 2, NULL, 16);
660
661                                         check_escape_warning(yyscanner);
662                                         addlitchar(c, yyscanner);
                                            /* a NUL or high-bit byte makes the end-of-literal rule re-verify the string */
663                                         if (c == '\0' || IS_HIGHBIT_SET(c))
664                                                 yyextra->saw_non_ascii = true;
665                                 }
666 <xq,xe,xus>{quotecontinue} {
667                                         /* ignore: nothing is appended to the literal buffer */
668                                 }
669 <xe>.                   {
670                                         /* This is only needed for \ just before EOF */
                                            /* the single matched character is copied through unchanged */
671                                         addlitchar(yytext[0], yyscanner);
672                                 }
                                    /* end of input while still inside any of the quoted-string states */
673 <xq,xe,xus><<EOF>>              { yyerror("unterminated quoted string"); }
674
675 {dolqdelim}             {
                                            /*
                                             * Opening dollar-quote delimiter: keep a copy of it in
                                             * dolqstart so the closing delimiter can be compared
                                             * against it, then start collecting the literal in xdolq.
                                             */
676                                         SET_YYLLOC();
677                                         yyextra->dolqstart = pstrdup(yytext);
678                                         BEGIN(xdolq);
679                                         startlit();
680                                 }
681 {dolqfailed}    {
682                                         SET_YYLLOC();
683                                         /* throw back all but the initial "$" */
684                                         yyless(1);
685                                         /* and treat it as {other} */
686                                         return yytext[0];
687                                 }
688 <xdolq>{dolqdelim} {
                                            /* a delimiter inside the body ends it only if it equals dolqstart */
689                                         if (strcmp(yytext, yyextra->dolqstart) == 0)
690                                         {
                                                    /* matching delimiter: release the saved copy and emit the string */
691                                                 pfree(yyextra->dolqstart);
692                                                 yyextra->dolqstart = NULL;
693                                                 BEGIN(INITIAL);
694                                                 yylval->str = litbufdup(yyscanner);
695                                                 return SCONST;
696                                         }
697                                         else
698                                         {
699                                                 /*
700                                                  * When we fail to match $...$ to dolqstart, transfer
701                                                  * the $... part to the output, but put back the final
702                                                  * $ for rescanning.  Consider $delim$...$junk$delim$
703                                                  */
704                                                 addlit(yytext, yyleng - 1, yyscanner);
705                                                 yyless(yyleng - 1);
706                                         }
707                                 }
708 <xdolq>{dolqinside} {
                                            /* ordinary body text is appended verbatim */
709                                         addlit(yytext, yyleng, yyscanner);
710                                 }
711 <xdolq>{dolqfailed} {
                                            /* the whole match is appended verbatim as body text */
712                                         addlit(yytext, yyleng, yyscanner);
713                                 }
714 <xdolq>.                {
715                                         /* This is only needed for $ inside the quoted text */
716                                         addlitchar(yytext[0], yyscanner);
717                                 }
                                    /* end of input before a matching closing delimiter was seen */
718 <xdolq><<EOF>>  { yyerror("unterminated dollar-quoted string"); }
719
720 {xdstart}               {
                                            /* start of a delimited identifier: collect its text in state xd */
721                                         SET_YYLLOC();
722                                         BEGIN(xd);
723                                         startlit();
724                                 }
725 {xuistart}              {
                                            /* start of a Unicode-escaped identifier: collect its text in state xui */
726                                         SET_YYLLOC();
727                                         BEGIN(xui);
728                                         startlit();
729                                 }
730 <xd>{xdstop}    {
                                            /*
                                             * End of a delimited identifier.  An empty identifier is
                                             * rejected; one whose length is NAMEDATALEN or more is
                                             * passed to truncate_identifier().
                                             */
731                                         char       *ident;
732
733                                         BEGIN(INITIAL);
734                                         if (yyextra->literallen == 0)
735                                                 yyerror("zero-length delimited identifier");
736                                         ident = litbufdup(yyscanner);
737                                         if (yyextra->literallen >= NAMEDATALEN)
738                                                 truncate_identifier(ident, yyextra->literallen, true);
739                                         yylval->str = ident;
740                                         return IDENT;
741                                 }
742 <xui>{dquote} {
                                            /* keep only the first character of the match; rescan the rest */
743                                         yyless(1);
744                                         /* xuiend state looks for possible UESCAPE */
745                                         BEGIN(xuiend);
746                                 }
747 <xuiend>{whitespace} {
748                                         /* stay in xuiend state over whitespace */
749                                 }
750 <xuiend><<EOF>> |
751 <xuiend>{other} |
752 <xuiend>{xustop1} {
753                                         /* no UESCAPE after the quote, throw back everything */
754                                         char       *ident;
755                                         int                     identlen;
756
757                                         yyless(0);
758
759                                         BEGIN(INITIAL);
760                                         if (yyextra->literallen == 0)
761                                                 yyerror("zero-length delimited identifier");
                                            /* with no UESCAPE clause, backslash is the escape character */
762                                         ident = litbuf_udeescape('\\', yyscanner);
                                            /* truncation is decided on the length after de-escaping */
763                                         identlen = strlen(ident);
764                                         if (identlen >= NAMEDATALEN)
765                                                 truncate_identifier(ident, identlen, true);
766                                         yylval->str = ident;
767                                         return IDENT;
768                                 }
769 <xuiend>{xustop2}       {
770                                         /* found UESCAPE after the end quote */
771                                         char       *ident;
772                                         int                     identlen;
773
774                                         BEGIN(INITIAL);
775                                         if (yyextra->literallen == 0)
776                                                 yyerror("zero-length delimited identifier");
                                            /* the escape character is the second-to-last character of the match */
777                                         if (!check_uescapechar(yytext[yyleng - 2]))
778                                         {
                                                    /* move the error cursor onto the offending escape character */
779                                                 SET_YYLLOC();
780                                                 ADVANCE_YYLLOC(yyleng - 2);
781                                                 yyerror("invalid Unicode escape character");
782                                         }
783                                         ident = litbuf_udeescape(yytext[yyleng - 2], yyscanner);
784                                         identlen = strlen(ident);
785                                         if (identlen >= NAMEDATALEN)
786                                                 truncate_identifier(ident, identlen, true);
787                                         yylval->str = ident;
788                                         return IDENT;
789                                 }
790 <xd,xui>{xddouble}      {
                                            /* this match contributes a single double-quote character */
791                                         addlitchar('"', yyscanner);
792                                 }
793 <xd,xui>{xdinside}      {
                                            /* ordinary identifier text is appended verbatim */
794                                         addlit(yytext, yyleng, yyscanner);
795                                 }
                                    /* end of input before the closing quote of an identifier */
796 <xd,xui><<EOF>>         { yyerror("unterminated quoted identifier"); }
797
798 {xufailed}      {
                                            /*
                                             * Only the first character of the match is kept and it is
                                             * returned as an ordinary identifier; the remainder is
                                             * pushed back for rescanning.
                                             */
799                                         char       *ident;
800
801                                         SET_YYLLOC();
802                                         /* throw back all but the initial u/U */
803                                         yyless(1);
804                                         /* and treat it as {identifier} */
805                                         ident = downcase_truncate_identifier(yytext, yyleng, true);
806                                         yylval->str = ident;
807                                         return IDENT;
808                                 }
809
810 {typecast}              {
                                            /* fixed token: record its location and return the token code */
811                                         SET_YYLLOC();
812                                         return TYPECAST;
813                                 }
814
815 {dot_dot}               {
816                                         SET_YYLLOC();
817                                         return DOT_DOT;
818                                 }
819
820 {colon_equals}  {
821                                         SET_YYLLOC();
822                                         return COLON_EQUALS;
823                                 }
824
825 {equals_greater} {
826                                         SET_YYLLOC();
827                                         return EQUALS_GREATER;
828                                 }
829
830 {less_equals}   {
831                                         SET_YYLLOC();
832                                         return LESS_EQUALS;
833                                 }
834
835 {greater_equals} {
836                                         SET_YYLLOC();
837                                         return GREATER_EQUALS;
838                                 }
839
840 {less_greater}  {
841                                         /* We accept both "<>" and "!=" as meaning NOT_EQUALS */
842                                         SET_YYLLOC();
843                                         return NOT_EQUALS;
844                                 }
845
846 {not_equals}    {
847                                         /* We accept both "<>" and "!=" as meaning NOT_EQUALS */
848                                         SET_YYLLOC();
849                                         return NOT_EQUALS;
850                                 }
851
852 {self}                  {
                                            /* single-character token: the character itself is the token code */
853                                         SET_YYLLOC();
854                                         return yytext[0];
855                                 }
856
857 {operator}              {
                                            /*
                                             * General operator token.  The match may be longer than
                                             * the operator actually returned: nchars is reduced first
                                             * at an embedded comment start and then for trailing
                                             * '+'/'-', and anything beyond nchars is pushed back with
                                             * yyless().
                                             */
858                                         /*
859                                          * Check for embedded slash-star or dash-dash; those
860                                          * are comment starts, so operator must stop there.
861                                          * Note that slash-star or dash-dash at the first
862                                          * character will match a prior rule, not this one.
863                                          */
864                                         int                     nchars = yyleng;
865                                         char       *slashstar = strstr(yytext, "/*");
866                                         char       *dashdash = strstr(yytext, "--");
867
868                                         if (slashstar && dashdash)
869                                         {
870                                                 /* if both appear, take the first one */
871                                                 if (slashstar > dashdash)
872                                                         slashstar = dashdash;
873                                         }
874                                         else if (!slashstar)
875                                                 slashstar = dashdash;
                                            /* slashstar now points at the earliest comment start, or is NULL */
876                                         if (slashstar)
877                                                 nchars = slashstar - yytext;
878
879                                         /*
880                                          * For SQL compatibility, '+' and '-' cannot be the
881                                          * last char of a multi-char operator unless the operator
882                                          * contains chars that are not in SQL operators.
883                                          * The idea is to lex '=-' as two operators, but not
884                                          * to forbid operator names like '?-' that could not be
885                                          * sequences of SQL operators.
886                                          */
887                                         while (nchars > 1 &&
888                                                    (yytext[nchars - 1] == '+' ||
889                                                         yytext[nchars - 1] == '-'))
890                                         {
891                                                 int                     ic;
892
                                                    /* scan backwards over the chars before the trailing +/- */
893                                                 for (ic = nchars - 2; ic >= 0; ic--)
894                                                 {
895                                                         if (strchr("~!@#^&|`?%", yytext[ic]))
896                                                                 break;
897                                                 }
898                                                 if (ic >= 0)
899                                                         break; /* found a char that makes it OK */
900                                                 nchars--; /* else remove the +/-, and check again */
901                                         }
902
903                                         SET_YYLLOC();
904
905                                         if (nchars < yyleng)
906                                         {
907                                                 /* Strip the unwanted chars from the token */
908                                                 yyless(nchars);
909                                                 /*
910                                                  * If what we have left is only one char, and it's
911                                                  * one of the characters matching "self", then
912                                                  * return it as a character token the same way
913                                                  * that the "self" rule would have.
914                                                  */
915                                                 if (nchars == 1 &&
916                                                         strchr(",()[].;:+-*/%^<>=", yytext[0]))
917                                                         return yytext[0];
918                                         }
919
920                                         /*
921                                          * Complain if operator is too long.  Unlike the case
922                                          * for identifiers, we make this an error not a notice-
923                                          * and-truncate, because the odds are we are looking at
924                                          * a syntactic mistake anyway.
925                                          */
926                                         if (nchars >= NAMEDATALEN)
927                                                 yyerror("operator too long");
928
                                            /* the operator name is handed to the parser as a fresh copy */
929                                         yylval->str = pstrdup(yytext);
930                                         return Op;
931                                 }
932
{param}                 {
                                        /*
                                         * Parameter reference: the number is the text after the
                                         * first character of the match (presumably the '$'; the
                                         * {param} definition is outside this view).
                                         *
                                         * atol() does no overflow detection, so an oversized
                                         * number used to be silently converted to some arbitrary
                                         * int.  Parse with strtol() instead and reject anything
                                         * that does not fit in the int stored in yylval->ival,
                                         * using the same range test as process_integer_literal().
                                         */
                                        long            paramno;

                                        /* set the location first so the error cursor is right */
                                        SET_YYLLOC();
                                        errno = 0;
                                        paramno = strtol(yytext + 1, NULL, 10);
                                        if (errno == ERANGE || paramno != (int) paramno)
                                                yyerror("parameter number too large");
                                        yylval->ival = paramno;
                                        return PARAM;
                                }
938
939 {integer}               {
                                            /* process_integer_literal() decides between ICONST and FCONST */
940                                         SET_YYLLOC();
941                                         return process_integer_literal(yytext, yylval);
942                                 }
943 {decimal}               {
                                            /* FCONST carries the literal's text as a string, unconverted */
944                                         SET_YYLLOC();
945                                         yylval->str = pstrdup(yytext);
946                                         return FCONST;
947                                 }
948 {decimalfail}   {
949                                         /* throw back the .., and treat as integer */
950                                         yyless(yyleng - 2);
951                                         SET_YYLLOC();
952                                         return process_integer_literal(yytext, yylval);
953                                 }
954 {real}                  {
                                            /* as for {decimal}: the text itself is the token value */
955                                         SET_YYLLOC();
956                                         yylval->str = pstrdup(yytext);
957                                         return FCONST;
958                                 }
959 {realfail1}             {
960                                         /*
961                                          * throw back the [Ee], and treat as {decimal}.  Note
962                                          * that it is possible the input is actually {integer},
963                                          * but since this case will almost certainly lead to a
964                                          * syntax error anyway, we don't bother to distinguish.
965                                          */
966                                         yyless(yyleng - 1);
967                                         SET_YYLLOC();
968                                         yylval->str = pstrdup(yytext);
969                                         return FCONST;
970                                 }
971 {realfail2}             {
972                                         /* throw back the [Ee][+-], and proceed as above */
973                                         yyless(yyleng - 2);
974                                         SET_YYLLOC();
975                                         yylval->str = pstrdup(yytext);
976                                         return FCONST;
977                                 }
978
979
980 {identifier}    {
                                            /*
                                             * Unquoted word: returned as a keyword token if it is found
                                             * in the scanner's keyword table, otherwise as IDENT.
                                             */
981                                         const ScanKeyword *keyword;
982                                         char       *ident;
983
984                                         SET_YYLLOC();
985
986                                         /* Is it a keyword? */
987                                         keyword = ScanKeywordLookup(yytext,
988                                                                                                 yyextra->keywords,
989                                                                                                 yyextra->num_keywords);
990                                         if (keyword != NULL)
991                                         {
                                                    /* the token value is the table entry's name, not yytext */
992                                                 yylval->keyword = keyword->name;
993                                                 return keyword->value;
994                                         }
995
996                                         /*
997                                          * No.  Convert the identifier to lower case, and truncate
998                                          * if necessary.
999                                          */
1000                                         ident = downcase_truncate_identifier(yytext, yyleng, true);
1001                                         yylval->str = ident;
1002                                         return IDENT;
1003                                 }
1004
1005 {other}                 {
                                             /* the matched character is returned as its own token code */
1006                                         SET_YYLLOC();
1007                                         return yytext[0];
1008                                 }
1009
1010 <<EOF>>                 {
                                             /* record the end-of-input location before terminating the scan */
1011                                         SET_YYLLOC();
1012                                         yyterminate();
1013                                 }
1014
1015 %%
1016
1017 /* LCOV_EXCL_STOP */
1018
1019 /*
1020  * Arrange access to yyextra for subroutines of the main yylex() function.
1021  * We expect each subroutine to have a yyscanner parameter.  Rather than
1022  * use the yyget_xxx functions, which might or might not get inlined by the
1023  * compiler, we cheat just a bit and cast yyscanner to the right type.
1024  */
1025 #undef yyextra
1026 #define yyextra  (((struct yyguts_t *) yyscanner)->yyextra_r)
1027
1028 /* Likewise for yylloc and yyleng: same cast of yyscanner, different member. */
1029 #undef yylloc
1030 #define yylloc  (((struct yyguts_t *) yyscanner)->yylloc_r)
1031 #undef yyleng
1032 #define yyleng  (((struct yyguts_t *) yyscanner)->yyleng_r)
1033
1034
1035 /*
1036  * scanner_errposition
1037  *              Report a lexer or grammar error cursor position, if possible.
1038  *
1039  * This is expected to be used within an ereport() call.  The return value
1040  * is a dummy (always 0, in fact).
1041  *
1042  * Note that this can only be used for messages emitted during raw parsing
1043  * (essentially, scan.l and gram.y), since it requires the yyscanner struct
1044  * to still be available.
1045  */
1046 int
1047 scanner_errposition(int location, core_yyscan_t yyscanner)
1048 {
1049         int                     pos;
1050
1051         if (location < 0)
1052                 return 0;                               /* no-op if location is unknown */
1053
1054         /* Convert byte offset to character number */
1055         pos = pg_mbstrlen_with_len(yyextra->scanbuf, location) + 1;
1056         /* And pass it to the ereport mechanism */
1057         return errposition(pos);
1058 }
1059
1060 /*
1061  * scanner_yyerror
1062  *              Report a lexer or grammar error.
1063  *
1064  * The message's cursor position is whatever YYLLOC was last set to,
1065  * ie, the start of the current token if called within yylex(), or the
1066  * most recently lexed token if called from the grammar.
1067  * This is OK for syntax error messages from the Bison parser, because Bison
1068  * parsers report error as soon as the first unparsable token is reached.
1069  * Beware of using yyerror for other purposes, as the cursor position might
1070  * be misleading!
1071  */
1072 void
1073 scanner_yyerror(const char *message, core_yyscan_t yyscanner)
1074 {
1075         const char *loc = yyextra->scanbuf + *yylloc;
1076
1077         if (*loc == YY_END_OF_BUFFER_CHAR)
1078         {
1079                 ereport(ERROR,
1080                                 (errcode(ERRCODE_SYNTAX_ERROR),
1081                 /* translator: %s is typically the translation of "syntax error" */
1082                                  errmsg("%s at end of input", _(message)),
1083                                  lexer_errposition()));
1084         }
1085         else
1086         {
1087                 ereport(ERROR,
1088                                 (errcode(ERRCODE_SYNTAX_ERROR),
1089                 /* translator: first %s is typically the translation of "syntax error" */
1090                                  errmsg("%s at or near \"%s\"", _(message), loc),
1091                                  lexer_errposition()));
1092         }
1093 }
1094
1095
1096 /*
1097  * Called before any actual parsing is done
1098  */
1099 core_yyscan_t
1100 scanner_init(const char *str,
1101                          core_yy_extra_type *yyext,
1102                          const ScanKeyword *keywords,
1103                          int num_keywords)
1104 {
1105         Size            slen = strlen(str);
1106         yyscan_t        scanner;
1107
1108         if (yylex_init(&scanner) != 0)
1109                 elog(ERROR, "yylex_init() failed: %m");
1110
1111         core_yyset_extra(yyext, scanner);
1112
1113         yyext->keywords = keywords;
1114         yyext->num_keywords = num_keywords;
1115
1116         yyext->backslash_quote = backslash_quote;
1117         yyext->escape_string_warning = escape_string_warning;
1118         yyext->standard_conforming_strings = standard_conforming_strings;
1119
1120         /*
1121          * Make a scan buffer with special termination needed by flex.
1122          */
1123         yyext->scanbuf = (char *) palloc(slen + 2);
1124         yyext->scanbuflen = slen;
1125         memcpy(yyext->scanbuf, str, slen);
1126         yyext->scanbuf[slen] = yyext->scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
1127         yy_scan_buffer(yyext->scanbuf, slen + 2, scanner);
1128
1129         /* initialize literal buffer to a reasonable but expansible size */
1130         yyext->literalalloc = 1024;
1131         yyext->literalbuf = (char *) palloc(yyext->literalalloc);
1132         yyext->literallen = 0;
1133
1134         return scanner;
1135 }
1136
1137
1138 /*
1139  * Called after parsing is done to clean up after scanner_init()
1140  */
1141 void
1142 scanner_finish(core_yyscan_t yyscanner)
1143 {
1144         /*
1145          * We don't bother to call yylex_destroy(), because all it would do is
1146          * pfree a small amount of control storage.  It's cheaper to leak the
1147          * storage until the parsing context is destroyed.  The amount of space
1148          * involved is usually negligible compared to the output parse tree
1149          * anyway.
1150          *
1151          * We do bother to pfree the scanbuf and literal buffer, but only if they
1152          * represent a nontrivial amount of space.  The 8K cutoff is arbitrary.
1153          */
1154         if (yyextra->scanbuflen >= 8192)
1155                 pfree(yyextra->scanbuf);
1156         if (yyextra->literalalloc >= 8192)
1157                 pfree(yyextra->literalbuf);
1158 }
1159
1160
1161 static void
1162 addlit(char *ytext, int yleng, core_yyscan_t yyscanner)
1163 {
1164         /* enlarge buffer if needed */
1165         if ((yyextra->literallen + yleng) >= yyextra->literalalloc)
1166         {
1167                 do
1168                 {
1169                         yyextra->literalalloc *= 2;
1170                 } while ((yyextra->literallen + yleng) >= yyextra->literalalloc);
1171                 yyextra->literalbuf = (char *) repalloc(yyextra->literalbuf,
1172                                                                                                 yyextra->literalalloc);
1173         }
1174         /* append new data */
1175         memcpy(yyextra->literalbuf + yyextra->literallen, ytext, yleng);
1176         yyextra->literallen += yleng;
1177 }
1178
1179
1180 static void
1181 addlitchar(unsigned char ychar, core_yyscan_t yyscanner)
1182 {
1183         /* enlarge buffer if needed */
1184         if ((yyextra->literallen + 1) >= yyextra->literalalloc)
1185         {
1186                 yyextra->literalalloc *= 2;
1187                 yyextra->literalbuf = (char *) repalloc(yyextra->literalbuf,
1188                                                                                                 yyextra->literalalloc);
1189         }
1190         /* append new data */
1191         yyextra->literalbuf[yyextra->literallen] = ychar;
1192         yyextra->literallen += 1;
1193 }
1194
1195
1196 /*
1197  * Create a palloc'd copy of literalbuf, adding a trailing null.
1198  */
1199 static char *
1200 litbufdup(core_yyscan_t yyscanner)
1201 {
1202         int                     llen = yyextra->literallen;
1203         char       *new;
1204
1205         new = palloc(llen + 1);
1206         memcpy(new, yyextra->literalbuf, llen);
1207         new[llen] = '\0';
1208         return new;
1209 }
1210
1211 static int
1212 process_integer_literal(const char *token, YYSTYPE *lval)
1213 {
1214         long            val;
1215         char       *endptr;
1216
1217         errno = 0;
1218         val = strtol(token, &endptr, 10);
1219         if (*endptr != '\0' || errno == ERANGE ||
1220                 /* check for overflow of int */
1221                 val != (int) val)
1222         {
1223                 /* integer too large, treat it as a float */
1224                 lval->str = pstrdup(token);
1225                 return FCONST;
1226         }
1227         lval->ival = val;
1228         return ICONST;
1229 }
1230
1231 static unsigned int
1232 hexval(unsigned char c)
1233 {
1234         if (c >= '0' && c <= '9')
1235                 return c - '0';
1236         if (c >= 'a' && c <= 'f')
1237                 return c - 'a' + 0xA;
1238         if (c >= 'A' && c <= 'F')
1239                 return c - 'A' + 0xA;
1240         elog(ERROR, "invalid hexadecimal digit");
1241         return 0;                                       /* not reached */
1242 }
1243
1244 static void
1245 check_unicode_value(pg_wchar c, char *loc, core_yyscan_t yyscanner)
1246 {
1247         if (GetDatabaseEncoding() == PG_UTF8)
1248                 return;
1249
1250         if (c > 0x7F)
1251         {
1252                 ADVANCE_YYLLOC(loc - yyextra->literalbuf + 3);  /* 3 for U&" */
1253                 yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
1254         }
1255 }
1256
1257 static bool
1258 is_utf16_surrogate_first(pg_wchar c)
1259 {
1260         return (c >= 0xD800 && c <= 0xDBFF);
1261 }
1262
1263 static bool
1264 is_utf16_surrogate_second(pg_wchar c)
1265 {
1266         return (c >= 0xDC00 && c <= 0xDFFF);
1267 }
1268
1269 static pg_wchar
1270 surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
1271 {
1272         return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF);
1273 }
1274
1275 static void
1276 addunicode(pg_wchar c, core_yyscan_t yyscanner)
1277 {
1278         char            buf[8];
1279
1280         if (c == 0 || c > 0x10FFFF)
1281                 yyerror("invalid Unicode escape value");
1282         if (c > 0x7F)
1283         {
1284                 if (GetDatabaseEncoding() != PG_UTF8)
1285                         yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
1286                 yyextra->saw_non_ascii = true;
1287         }
1288         unicode_to_utf8(c, (unsigned char *) buf);
1289         addlit(buf, pg_mblen(buf), yyscanner);
1290 }
1291
/*
 * check_uescapechar
 *              Is 'escape' acceptable as Unicode escape character (UESCAPE syntax)?
 *
 * Hexadecimal digits, '+', single quote, double quote and whitespace are
 * not allowed; everything else is.
 */
static bool
check_uescapechar(unsigned char escape)
{
        bool            forbidden;

        forbidden = isxdigit(escape) ||
                escape == '+' ||
                escape == '\'' ||
                escape == '"' ||
                scanner_isspace(escape);

        return !forbidden;
}
1307
1308 /* like litbufdup, but handle unicode escapes */
1309 static char *
1310 litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner)
1311 {
1312         char       *new;
1313         char       *litbuf,
1314                            *in,
1315                            *out;
1316         pg_wchar        pair_first = 0;
1317
1318         /* Make literalbuf null-terminated to simplify the scanning loop */
1319         litbuf = yyextra->literalbuf;
1320         litbuf[yyextra->literallen] = '\0';
1321
1322         /*
1323          * This relies on the subtle assumption that a UTF-8 expansion cannot be
1324          * longer than its escaped representation.
1325          */
1326         new = palloc(yyextra->literallen + 1);
1327
1328         in = litbuf;
1329         out = new;
1330         while (*in)
1331         {
1332                 if (in[0] == escape)
1333                 {
1334                         if (in[1] == escape)
1335                         {
1336                                 if (pair_first)
1337                                 {
1338                                         ADVANCE_YYLLOC(in - litbuf + 3);        /* 3 for U&" */
1339                                         yyerror("invalid Unicode surrogate pair");
1340                                 }
1341                                 *out++ = escape;
1342                                 in += 2;
1343                         }
1344                         else if (isxdigit((unsigned char) in[1]) &&
1345                                          isxdigit((unsigned char) in[2]) &&
1346                                          isxdigit((unsigned char) in[3]) &&
1347                                          isxdigit((unsigned char) in[4]))
1348                         {
1349                                 pg_wchar        unicode;
1350
1351                                 unicode = (hexval(in[1]) << 12) +
1352                                         (hexval(in[2]) << 8) +
1353                                         (hexval(in[3]) << 4) +
1354                                         hexval(in[4]);
1355                                 check_unicode_value(unicode, in, yyscanner);
1356                                 if (pair_first)
1357                                 {
1358                                         if (is_utf16_surrogate_second(unicode))
1359                                         {
1360                                                 unicode = surrogate_pair_to_codepoint(pair_first, unicode);
1361                                                 pair_first = 0;
1362                                         }
1363                                         else
1364                                         {
1365                                                 ADVANCE_YYLLOC(in - litbuf + 3);                /* 3 for U&" */
1366                                                 yyerror("invalid Unicode surrogate pair");
1367                                         }
1368                                 }
1369                                 else if (is_utf16_surrogate_second(unicode))
1370                                         yyerror("invalid Unicode surrogate pair");
1371
1372                                 if (is_utf16_surrogate_first(unicode))
1373                                         pair_first = unicode;
1374                                 else
1375                                 {
1376                                         unicode_to_utf8(unicode, (unsigned char *) out);
1377                                         out += pg_mblen(out);
1378                                 }
1379                                 in += 5;
1380                         }
1381                         else if (in[1] == '+' &&
1382                                          isxdigit((unsigned char) in[2]) &&
1383                                          isxdigit((unsigned char) in[3]) &&
1384                                          isxdigit((unsigned char) in[4]) &&
1385                                          isxdigit((unsigned char) in[5]) &&
1386                                          isxdigit((unsigned char) in[6]) &&
1387                                          isxdigit((unsigned char) in[7]))
1388                         {
1389                                 pg_wchar        unicode;
1390
1391                                 unicode = (hexval(in[2]) << 20) +
1392                                         (hexval(in[3]) << 16) +
1393                                         (hexval(in[4]) << 12) +
1394                                         (hexval(in[5]) << 8) +
1395                                         (hexval(in[6]) << 4) +
1396                                         hexval(in[7]);
1397                                 check_unicode_value(unicode, in, yyscanner);
1398                                 if (pair_first)
1399                                 {
1400                                         if (is_utf16_surrogate_second(unicode))
1401                                         {
1402                                                 unicode = surrogate_pair_to_codepoint(pair_first, unicode);
1403                                                 pair_first = 0;
1404                                         }
1405                                         else
1406                                         {
1407                                                 ADVANCE_YYLLOC(in - litbuf + 3);                /* 3 for U&" */
1408                                                 yyerror("invalid Unicode surrogate pair");
1409                                         }
1410                                 }
1411                                 else if (is_utf16_surrogate_second(unicode))
1412                                         yyerror("invalid Unicode surrogate pair");
1413
1414                                 if (is_utf16_surrogate_first(unicode))
1415                                         pair_first = unicode;
1416                                 else
1417                                 {
1418                                         unicode_to_utf8(unicode, (unsigned char *) out);
1419                                         out += pg_mblen(out);
1420                                 }
1421                                 in += 8;
1422                         }
1423                         else
1424                         {
1425                                 ADVANCE_YYLLOC(in - litbuf + 3);                /* 3 for U&" */
1426                                 yyerror("invalid Unicode escape value");
1427                         }
1428                 }
1429                 else
1430                 {
1431                         if (pair_first)
1432                         {
1433                                 ADVANCE_YYLLOC(in - litbuf + 3);                /* 3 for U&" */
1434                                 yyerror("invalid Unicode surrogate pair");
1435                         }
1436                         *out++ = *in++;
1437                 }
1438         }
1439
1440         /* unfinished surrogate pair? */
1441         if (pair_first)
1442         {
1443                 ADVANCE_YYLLOC(in - litbuf + 3);                                /* 3 for U&" */
1444                 yyerror("invalid Unicode surrogate pair");
1445         }
1446
1447         *out = '\0';
1448
1449         /*
1450          * We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII
1451          * codes; but it's probably not worth the trouble, since this isn't likely
1452          * to be a performance-critical path.
1453          */
1454         pg_verifymbstr(new, out - new, false);
1455         return new;
1456 }
1457
1458 static unsigned char
1459 unescape_single_char(unsigned char c, core_yyscan_t yyscanner)
1460 {
1461         switch (c)
1462         {
1463                 case 'b':
1464                         return '\b';
1465                 case 'f':
1466                         return '\f';
1467                 case 'n':
1468                         return '\n';
1469                 case 'r':
1470                         return '\r';
1471                 case 't':
1472                         return '\t';
1473                 default:
1474                         /* check for backslash followed by non-7-bit-ASCII */
1475                         if (c == '\0' || IS_HIGHBIT_SET(c))
1476                                 yyextra->saw_non_ascii = true;
1477
1478                         return c;
1479         }
1480 }
1481
1482 static void
1483 check_string_escape_warning(unsigned char ychar, core_yyscan_t yyscanner)
1484 {
1485         if (ychar == '\'')
1486         {
1487                 if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
1488                         ereport(WARNING,
1489                                         (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1490                                          errmsg("nonstandard use of \\' in a string literal"),
1491                                          errhint("Use '' to write quotes in strings, or use the escape string syntax (E'...')."),
1492                                          lexer_errposition()));
1493                 yyextra->warn_on_first_escape = false;  /* warn only once per string */
1494         }
1495         else if (ychar == '\\')
1496         {
1497                 if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
1498                         ereport(WARNING,
1499                                         (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1500                                          errmsg("nonstandard use of \\\\ in a string literal"),
1501                                          errhint("Use the escape string syntax for backslashes, e.g., E'\\\\'."),
1502                                          lexer_errposition()));
1503                 yyextra->warn_on_first_escape = false;  /* warn only once per string */
1504         }
1505         else
1506                 check_escape_warning(yyscanner);
1507 }
1508
1509 static void
1510 check_escape_warning(core_yyscan_t yyscanner)
1511 {
1512         if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
1513                 ereport(WARNING,
1514                                 (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1515                                  errmsg("nonstandard use of escape in a string literal"),
1516                 errhint("Use the escape string syntax for escapes, e.g., E'\\r\\n'."),
1517                                  lexer_errposition()));
1518         yyextra->warn_on_first_escape = false;          /* warn only once per string */
1519 }
1520
1521 /*
1522  * Interface functions to make flex use palloc() instead of malloc().
1523  * It'd be better to make these static, but flex insists otherwise.
1524  */
1525
1526 void *
1527 core_yyalloc(yy_size_t bytes, core_yyscan_t yyscanner)
1528 {
1529         return palloc(bytes);
1530 }
1531
1532 void *
1533 core_yyrealloc(void *ptr, yy_size_t bytes, core_yyscan_t yyscanner)
1534 {
1535         if (ptr)
1536                 return repalloc(ptr, bytes);
1537         else
1538                 return palloc(bytes);
1539 }
1540
1541 void
1542 core_yyfree(void *ptr, core_yyscan_t yyscanner)
1543 {
1544         if (ptr)
1545                 pfree(ptr);
1546 }