]> granicus.if.org Git - postgresql/blob - src/backend/parser/scan.l
Disable the use of Unicode escapes in string constants (U&'') when
[postgresql] / src / backend / parser / scan.l
1 %{
2 /*-------------------------------------------------------------------------
3  *
4  * scan.l
5  *        lexical scanner for PostgreSQL
6  *
7  * NOTE NOTE NOTE:
8  *
9  * The rules in this file must be kept in sync with psql's lexer!!!
10  *
11  * The rules are designed so that the scanner never has to backtrack,
12  * in the sense that there is always a rule that can match the input
13  * consumed so far (the rule action may internally throw back some input
14  * with yyless(), however).  As explained in the flex manual, this makes
15  * for a useful speed increase --- about a third faster than a plain -CF
16  * lexer, in simple testing.  The extra complexity is mostly in the rules
17  * for handling float numbers and continued string literals.  If you change
18  * the lexical rules, verify that you haven't broken the no-backtrack
19  * property by running flex with the "-b" option and checking that the
20  * resulting "lex.backup" file says that no backing up is needed.
21  *
22  *
23  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
24  * Portions Copyright (c) 1994, Regents of the University of California
25  *
26  * IDENTIFICATION
27  *        $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.152 2009/05/05 18:32:17 petere Exp $
28  *
29  *-------------------------------------------------------------------------
30  */
31 #include "postgres.h"
32
33 #include <ctype.h>
34 #include <unistd.h>
35
36 #include "parser/gramparse.h"
37 #include "parser/keywords.h"
38 /* Not needed now that this file is compiled as part of gram.y */
39 /* #include "parser/gram.h" */
40 #include "parser/scansup.h"
41 #include "mb/pg_wchar.h"
42
43
44 /* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
45 #undef fprintf
46 #define fprintf(file, fmt, msg)  ereport(ERROR, (errmsg_internal("%s", msg)))
47
48 static int              xcdepth = 0;    /* depth of nesting in slash-star comments */
49 static char    *dolqstart;      /* current $foo$ quote start string */
50
51 /*
52  * GUC variables.  This is a DIRECT violation of the warning given at the
53  * head of gram.y, ie flex/bison code must not depend on any GUC variables;
54  * as such, changing their values can induce very unintuitive behavior.
55  * But we shall have to live with it as a short-term thing until the switch
56  * to SQL-standard string syntax is complete.
57  */
58 int                             backslash_quote = BACKSLASH_QUOTE_SAFE_ENCODING;
59 bool                    escape_string_warning = true;
60 bool                    standard_conforming_strings = false;
61
62 static bool             warn_on_first_escape;   /* set for plain '...' literals, cleared for E'...' */
63 static bool             saw_non_ascii = false;  /* an octal/hex escape produced NUL or a high-bit byte */
64
65 /*
66  * literalbuf is used to accumulate literal values when multiple rules
67  * are needed to parse a single literal.  Call startlit to reset buffer
68  * to empty, addlit to add text.  Note that the buffer is palloc'd and
69  * starts life afresh on every parse cycle.
70  */
71 static char        *literalbuf;         /* expandable buffer */
72 static int              literallen;             /* actual current length */
73 static int              literalalloc;   /* current allocated buffer size */
74
75 #define startlit()  (literalbuf[0] = '\0', literallen = 0)
76 static void addlit(char *ytext, int yleng);
77 static void addlitchar(unsigned char ychar);
78 static char *litbufdup(void);
79 static char *litbuf_udeescape(unsigned char escape);
80
81 #define lexer_errposition()  scanner_errposition(yylloc)
82
83 static void check_escape_warning(void);
84 static void check_string_escape_warning(unsigned char ychar);
85
86 /*
87  * Each call to yylex must set yylloc to the location of the found token
88  * (expressed as a byte offset from the start of the input text).
89  * When we parse a token that requires multiple lexer rules to process,
90  * this should be done in the first such rule, else yylloc will point
91  * into the middle of the token.
92  */
93 #define SET_YYLLOC()  (yylloc = yytext - scanbuf)
94
95 /* Handles to the buffer that the lexer uses internally */
96 static YY_BUFFER_STATE scanbufhandle;
97 static char *scanbuf;
98
99 static unsigned char unescape_single_char(unsigned char c);
100
101 %}
102
103 %option 8bit
104 %option never-interactive
105 %option nodefault
106 %option noinput
107 %option nounput
108 %option noyywrap
109 %option prefix="base_yy"
110
111 /*
112  * OK, here is a short description of lex/flex rules behavior.
113  * The longest pattern which matches an input string is always chosen.
114  * For equal-length patterns, the first occurring in the rules list is chosen.
115  * INITIAL is the starting state, to which all non-conditional rules apply.
116  * Exclusive states change parsing rules while the state is active.  When in
117  * an exclusive state, only those rules defined for that state apply.
118  *
119  * We use exclusive states for quoted strings, extended comments,
120  * and to eliminate parsing troubles for numeric strings.
121  * Exclusive states:
122  *  <xb> bit string literal
123  *  <xc> extended C-style comments
124  *  <xd> delimited identifiers (double-quoted identifiers)
125  *  <xh> hexadecimal numeric string
126  *  <xq> standard quoted strings
127  *  <xe> extended quoted strings (support backslash escape sequences)
128  *  <xdolq> $foo$ quoted strings
129  *  <xui> quoted identifier with Unicode escapes
130  *  <xus> quoted string with Unicode escapes
131  */
132
133 %x xb
134 %x xc
135 %x xd
136 %x xh
137 %x xe
138 %x xq
139 %x xdolq
140 %x xui
141 %x xus
142
143 /*
144  * In order to make the world safe for Windows and Mac clients as well as
145  * Unix ones, we accept either \n or \r as a newline.  A DOS-style \r\n
146  * sequence will be seen as two successive newlines, but that doesn't cause
147  * any problems.  Comments that start with -- and extend to the next
148  * newline are treated as equivalent to a single whitespace character.
149  *
150  * NOTE a fine point: if there is no newline following --, we will absorb
151  * everything to the end of the input as a comment.  This is correct.  Older
152  * versions of Postgres failed to recognize -- as a comment if the input
153  * did not end with a newline.
154  *
155  * XXX perhaps \f (formfeed) should be treated as a newline as well?
156  *
157  * XXX if you change the set of whitespace characters, fix scanner_isspace()
158  * to agree, and see also the plpgsql lexer.
159  */
160
161 space                   [ \t\n\r\f]
162 horiz_space             [ \t\f]
163 newline                 [\n\r]
164 non_newline             [^\n\r]
165
166 comment                 ("--"{non_newline}*)
167
168 whitespace              ({space}+|{comment})
169
170 /*
171  * SQL requires at least one newline in the whitespace separating
172  * string literals that are to be concatenated.  Silly, but who are we
173  * to argue?  Note that {whitespace_with_newline} should not have * after
174  * it, whereas {whitespace} should generally have a * after it...
175  */
176
177 special_whitespace              ({space}+|{comment}{newline})
178 horiz_whitespace                ({horiz_space}|{comment})
179 whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*)
180
181 /*
182  * To ensure that {quotecontinue} can be scanned without having to back up
183  * if the full pattern isn't matched, we include trailing whitespace in
184  * {quotestop}.  This matches all cases where {quotecontinue} fails to match,
185  * except for {quote} followed by whitespace and just one "-" (not two,
186  * which would start a {comment}).  To cover that we have {quotefail}.
187  * The actions for {quotestop} and {quotefail} must throw back characters
188  * beyond the quote proper.
189  */
190 quote                   '
191 quotestop               {quote}{whitespace}*
192 quotecontinue   {quote}{whitespace_with_newline}{quote}
193 quotefail               {quote}{whitespace}*"-"
194
195 /* Bit string
196  * It is tempting to scan the string for only those characters
197  * which are allowed. However, this leads to silently swallowed
198  * characters if illegal characters are included in the string.
199  * For example, if xbinside is [01] then B'ABCD' is interpreted
200  * as a zero-length string, and the ABCD' is lost!
201  * Better to pass the string forward and let the input routines
202  * validate the contents.
203  */
204 xbstart                 [bB]{quote}
205 xbinside                [^']*
206
207 /* Hexadecimal number */
208 xhstart                 [xX]{quote}
209 xhinside                [^']*
210
211 /* National character */
212 xnstart                 [nN]{quote}
213
214 /* Quoted string that allows backslash escapes */
215 xestart                 [eE]{quote}
216 xeinside                [^\\']+
217 xeescape                [\\][^0-7]
218 xeoctesc                [\\][0-7]{1,3}
219 xehexesc                [\\]x[0-9A-Fa-f]{1,2}
220
221 /* Extended quote
222  * xqdouble implements embedded quote, ''''
223  */
224 xqstart                 {quote}
225 xqdouble                {quote}{quote}
226 xqinside                [^']+
227
228 /* $foo$ style quotes ("dollar quoting")
229  * The quoted string starts with $foo$ where "foo" is an optional string
230  * in the form of an identifier, except that it may not contain "$", 
231  * and extends to the first occurrence of an identical string.  
232  * There is *no* processing of the quoted text.
233  *
234  * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim}
235  * fails to match its trailing "$".
236  */
237 dolq_start              [A-Za-z\200-\377_]
238 dolq_cont               [A-Za-z\200-\377_0-9]
239 dolqdelim               \$({dolq_start}{dolq_cont}*)?\$
240 dolqfailed              \${dolq_start}{dolq_cont}*
241 dolqinside              [^$]+
242
243 /* Double quote
244  * Allows embedded spaces and other special characters into identifiers.
245  */
246 dquote                  \"
247 xdstart                 {dquote}
248 xdstop                  {dquote}
249 xddouble                {dquote}{dquote}
250 xdinside                [^"]+
251
252 /* Unicode escapes */
253 uescape                 [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
254 /* error rule to avoid backup */
255 uescapefail             ("-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU])
256
257 /* Quoted identifier with Unicode escapes */
258 xuistart                [uU]&{dquote}
259 xuistop1                {dquote}{whitespace}*{uescapefail}?
260 xuistop2                {dquote}{whitespace}*{uescape}
261
262 /* Quoted string with Unicode escapes */
263 xusstart                [uU]&{quote}
264 xusstop1                {quote}{whitespace}*{uescapefail}?
265 xusstop2                {quote}{whitespace}*{uescape}
266
267 /* error rule to avoid backup */
268 xufailed                [uU]&
269
270
271 /* C-style comments
272  *
273  * The "extended comment" syntax closely resembles allowable operator syntax.
274  * The tricky part here is to get lex to recognize a string starting with
275  * slash-star as a comment, when interpreting it as an operator would produce
276  * a longer match --- remember lex will prefer a longer match!  Also, if we
277  * have something like plus-slash-star, lex will think this is a 3-character
278  * operator whereas we want to see it as a + operator and a comment start.
279  * The solution is two-fold:
280  * 1. append {op_chars}* to xcstart so that it matches as much text as
281  *    {operator} would. Then the tie-breaker (first matching rule of same
282  *    length) ensures xcstart wins.  We put back the extra stuff with yyless()
283  *    in case it contains a star-slash that should terminate the comment.
284  * 2. In the operator rule, check for slash-star within the operator, and
285  *    if found throw it back with yyless().  This handles the plus-slash-star
286  *    problem.
287  * Dash-dash comments have similar interactions with the operator rule.
288  */
289 xcstart                 \/\*{op_chars}*
290 xcstop                  \*+\/
291 xcinside                [^*/]+
292
293 digit                   [0-9]
294 ident_start             [A-Za-z\200-\377_]
295 ident_cont              [A-Za-z\200-\377_0-9\$]
296
297 identifier              {ident_start}{ident_cont}*
298
299 typecast                "::"
300
301 /*
302  * "self" is the set of chars that should be returned as single-character
303  * tokens.  "op_chars" is the set of chars that can make up "Op" tokens,
304  * which can be one or more characters long (but if a single-char token
305  * appears in the "self" set, it is not to be returned as an Op).  Note
306  * that the sets overlap, but each has some chars that are not in the other.
307  *
308  * If you change either set, adjust the character lists appearing in the
309  * rule for "operator"!
310  */
311 self                    [,()\[\].;\:\+\-\*\/\%\^\<\>\=]
312 op_chars                [\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=]
313 operator                {op_chars}+
314
315 /* We no longer allow unary minus in numbers.
316  * Instead we pass it separately to the parser, where it gets
317  * coerced via doNegate() -- Leon aug 20 1999
318  *
319  * {realfail1} and {realfail2} are added to prevent the need for scanner
320  * backup when the {real} rule fails to match completely.
321  */
322
323 integer                 {digit}+
324 decimal                 (({digit}*\.{digit}+)|({digit}+\.{digit}*))
325 real                    ({integer}|{decimal})[Ee][-+]?{digit}+
326 realfail1               ({integer}|{decimal})[Ee]
327 realfail2               ({integer}|{decimal})[Ee][-+]
328
329 param                   \${integer}
330
331 other                   .
332
333 /*
334  * Dollar quoted strings are totally opaque, and no escaping is done on them.
335  * Other quoted strings must allow some special characters such as single-quote
336  *  and newline.
337  * Embedded single-quotes are implemented both in the SQL standard
338  *  style of two adjacent single quotes "''" and in the Postgres/Java style
339  *  of escaped-quote "\'".
340  * Other embedded escaped characters are matched explicitly and the leading
341  *  backslash is dropped from the string.
342  * Note that xcstart must appear before operator, as explained above!
343  *  Also whitespace (comment) must appear before operator.
344  */
345
346 %%
347
348 {whitespace}    {
349                                         /* ignore */
350                                 }
351
352 {xcstart}               {
353                                         /* Set location in case of syntax error in comment */
354                                         SET_YYLLOC();
355                                         xcdepth = 0;    /* outermost comment: no nesting yet */
356                                         BEGIN(xc);
357                                         /* Put back any characters past slash-star; see above */
358                                         yyless(2);
359                                 }
360
361 <xc>{xcstart}   {
362                                         xcdepth++;      /* opener seen inside a comment: one level deeper */
363                                         /* Put back any characters past slash-star; see above */
364                                         yyless(2);
365                                 }
366
367 <xc>{xcstop}    {
368                                         if (xcdepth <= 0)
369                                                 BEGIN(INITIAL); /* outermost comment closed */
370                                         else
371                                                 xcdepth--;      /* closed one nested level */
372                                 }
373
374 <xc>{xcinside}  {
375                                         /* ignore: comment text containing no star or slash */
376                                 }
377
378 <xc>{op_chars}  {
379                                         /* ignore: lone operator char, e.g. a slash that opens no nested comment */
380                                 }
381
382 <xc>\*+                 {
383                                         /* ignore: star(s) not followed by a slash */
384                                 }
385
386 <xc><<EOF>>             { yyerror("unterminated /* comment"); }
387
388 {xbstart}               {
389                                         /* Binary bit type.
390                                          * At some point we should simply pass the string
391                                          * forward to the parser and label it there.
392                                          * In the meantime, place a leading "b" on the string
393                                          * to mark it for the input routine as a binary string.
394                                          */
395                                         SET_YYLLOC();
396                                         BEGIN(xb);
397                                         startlit();
398                                         addlitchar('b');
399                                 }
400 <xb>{quotestop} |
401 <xb>{quotefail} {
402                                         yyless(1);      /* throw back all but the quote */
403                                         BEGIN(INITIAL);
404                                         yylval.str = litbufdup();
405                                         return BCONST;
406                                 }
407 <xh>{xhinside}  |
408 <xb>{xbinside}  {
409                                         addlit(yytext, yyleng); /* contents are not validated here */
410                                 }
411 <xh>{quotecontinue}     |
412 <xb>{quotecontinue}     {
413                                         /* ignore: the literal continues in the next quoted segment */
414                                 }
415 <xb><<EOF>>             { yyerror("unterminated bit string literal"); }
416
417 {xhstart}               {
418                                         /* Hexadecimal bit type.
419                                          * At some point we should simply pass the string
420                                          * forward to the parser and label it there.
421                                          * In the meantime, place a leading "x" on the string
422                                          * to mark it for the input routine as a hex string.
423                                          */
424                                         SET_YYLLOC();
425                                         BEGIN(xh);
426                                         startlit();
427                                         addlitchar('x');
428                                 }
429 <xh>{quotestop} |
430 <xh>{quotefail} {
431                                         yyless(1);      /* throw back all but the quote */
432                                         BEGIN(INITIAL);
433                                         yylval.str = litbufdup();
434                                         return XCONST;
435                                 }
436 <xh><<EOF>>             { yyerror("unterminated hexadecimal string literal"); }
437
438 {xnstart}               {
439                                         /* National character.
440                                          * We will pass this along as a normal character string,
441                                          * but preceded with an internally-generated "NCHAR".
442                                          */
443                                         const ScanKeyword *keyword;
444
445                                         SET_YYLLOC();
446                                         yyless(1);                              /* eat only 'n' this time */
447                                         /* nchar had better be a keyword! */
448                                         keyword = ScanKeywordLookup("nchar");
449                                         Assert(keyword != NULL);
450                                         yylval.keyword = keyword->name;
451                                         return keyword->value;  /* the quote left in the input is scanned on the next call */
452                                 }
453
454 {xqstart}               {
455                                         warn_on_first_escape = true;
456                                         saw_non_ascii = false;  /* reset per literal */
457                                         SET_YYLLOC();
458                                         if (standard_conforming_strings)
459                                                 BEGIN(xq);      /* standard string: no backslash escape rules apply */
460                                         else
461                                                 BEGIN(xe);      /* legacy behavior: backslash escapes are processed */
462                                         startlit();
463                                 }
464 {xestart}               {
465                                         warn_on_first_escape = false;
466                                         saw_non_ascii = false;  /* reset per literal */
467                                         SET_YYLLOC();
468                                         BEGIN(xe);              /* E'...' always processes backslash escapes */
469                                         startlit();
470                                 }
471 {xusstart}              {
472                                         SET_YYLLOC();   /* set first so the error below can report the literal's position */
473                                         if (!standard_conforming_strings)
474                                                 ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
475                                                                 errmsg("unsafe use of string constant with Unicode escapes"),
476                                                                 errdetail("String constants with Unicode escapes cannot be used when standard_conforming_strings is off."),
477                                                                 lexer_errposition()));
478                                         BEGIN(xus);
479                                         startlit();
480                                 }
481 <xq,xe>{quotestop}      |
482 <xq,xe>{quotefail} {
483                                         yyless(1);      /* throw back all but the quote */
484                                         BEGIN(INITIAL);
485                                         /*
486                                          * check that the data remains valid if it might have been
487                                          * made invalid by unescaping any chars.
488                                          */
489                                         if (saw_non_ascii)
490                                                 pg_verifymbstr(literalbuf, literallen, false);
491                                         yylval.str = litbufdup();
492                                         return SCONST;
493                                 }
494 <xus>{xusstop1} {
495                                         /* throw back all but the quote */
496                                         yyless(1);
497                                         BEGIN(INITIAL);
498                                         yylval.str = litbuf_udeescape('\\');    /* no complete UESCAPE clause: escape char is backslash */
499                                         return SCONST;
500                                 }
501 <xus>{xusstop2} {
502                                         BEGIN(INITIAL);
503                                         yylval.str = litbuf_udeescape(yytext[yyleng-2]);        /* the char between the UESCAPE quotes */
504                                         return SCONST;
505                                 }
506 <xq,xe,xus>{xqdouble} {
507                                         addlitchar('\'');       /* doubled quote stands for one quote */
508                                 }
509 <xq,xus>{xqinside}  {
510                                         addlit(yytext, yyleng);
511                                 }
512 <xe>{xeinside}  {
513                                         addlit(yytext, yyleng);
514                                 }
515 <xe>{xeescape}  {
516                                         if (yytext[1] == '\'')  /* backslash-quote: governed by backslash_quote */
517                                         {
518                                                 if (backslash_quote == BACKSLASH_QUOTE_OFF ||
519                                                         (backslash_quote == BACKSLASH_QUOTE_SAFE_ENCODING &&
520                                                          PG_ENCODING_IS_CLIENT_ONLY(pg_get_client_encoding())))
521                                                         ereport(ERROR,
522                                                                         (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
523                                                                          errmsg("unsafe use of \\' in a string literal"),
524                                                                          errhint("Use '' to write quotes in strings. \\' is insecure in client-only encodings."),
525                                                                          lexer_errposition()));
526                                         }
527                                         check_string_escape_warning(yytext[1]);
528                                         addlitchar(unescape_single_char(yytext[1]));
529                                 }
530 <xe>{xeoctesc}  {
531                                         unsigned char c = strtoul(yytext+1, NULL, 8);   /* digits follow the backslash */
532
533                                         check_escape_warning();
534                                         addlitchar(c);
535                                         if (c == '\0' || IS_HIGHBIT_SET(c))
536                                                 saw_non_ascii = true;   /* literal gets re-verified when it ends */
537                                 }
538 <xe>{xehexesc}  {
539                                         unsigned char c = strtoul(yytext+2, NULL, 16);  /* digits follow the backslash and 'x' */
540
541                                         check_escape_warning();
542                                         addlitchar(c);
543                                         if (c == '\0' || IS_HIGHBIT_SET(c))
544                                                 saw_non_ascii = true;   /* literal gets re-verified when it ends */
545                                 }
546 <xq,xe,xus>{quotecontinue} {
547                                         /* ignore: the literal continues in the next quoted segment */
548                                 }
549 <xe>.                   {
550                                         /* This is only needed for \ just before EOF */
551                                         addlitchar(yytext[0]);
552                                 }
553 <xq,xe,xus><<EOF>>              { yyerror("unterminated quoted string"); }
554
555 {dolqdelim}             {
556                                         SET_YYLLOC();
557                                         dolqstart = pstrdup(yytext);    /* keep the opening delimiter to compare with candidate closers */
558                                         BEGIN(xdolq);
559                                         startlit();
560                                 }
561 {dolqfailed}    {
562                                         SET_YYLLOC();
563                                         /* throw back all but the initial "$" */
564                                         yyless(1);
565                                         /* and treat it as {other} */
566                                         return yytext[0];
567                                 }
568 <xdolq>{dolqdelim} {
569                                         if (strcmp(yytext, dolqstart) == 0)     /* identical delimiter: end of string */
570                                         {
571                                                 pfree(dolqstart);
572                                                 BEGIN(INITIAL);
573                                                 yylval.str = litbufdup();
574                                                 return SCONST;
575                                         }
576                                         else
577                                         {
578                                                 /*
579                                                  * When we fail to match $...$ to dolqstart, transfer
580                                                  * the $... part to the output, but put back the final
581                                                  * $ for rescanning.  Consider $delim$...$junk$delim$
582                                                  */
583                                                 addlit(yytext, yyleng-1);
584                                                 yyless(yyleng-1);
585                                         }
586                                 }
587 <xdolq>{dolqinside} {
588                                         addlit(yytext, yyleng);
589                                 }
590 <xdolq>{dolqfailed} {
591                                         addlit(yytext, yyleng); /* "$name" with no closing "$" is ordinary text */
592                                 }
593 <xdolq>.                {
594                                         /* This is only needed for $ inside the quoted text */
595                                         addlitchar(yytext[0]);
596                                 }
597 <xdolq><<EOF>>  { yyerror("unterminated dollar-quoted string"); }
598
599 {xdstart}               {
600                                         SET_YYLLOC();
601                                         BEGIN(xd);
602                                         startlit();
603                                 }
604 {xuistart}              {
605                                         SET_YYLLOC();
606                                         BEGIN(xui);     /* same body rules as xd; escapes are decoded by the xui stop rules */
607                                         startlit();
608                                 }
609 <xd>{xdstop}    {
610                                         char               *ident;
611
612                                         BEGIN(INITIAL);
613                                         if (literallen == 0)
614                                                 yyerror("zero-length delimited identifier");
615                                         ident = litbufdup();    /* presumably a verbatim copy, so literallen is its length */
616                                         if (literallen >= NAMEDATALEN)
617                                                 truncate_identifier(ident, literallen, true);
618                                         yylval.str = ident;
619                                         return IDENT;
620                                 }
621 <xui>{xuistop1} {
622                                         char               *ident;
623
624                                         BEGIN(INITIAL);
625                                         if (literallen == 0)
626                                                 yyerror("zero-length delimited identifier");
627                                         ident = litbuf_udeescape('\\'); /* no complete UESCAPE clause: escape char is backslash */
628                                         if (strlen(ident) >= NAMEDATALEN)       /* measure the de-escaped name; literallen counts the raw escaped text */
629                                                 truncate_identifier(ident, strlen(ident), true);
630                                         yylval.str = ident;
631                                         /* throw back all but the quote */
632                                         yyless(1);
633                                         return IDENT;
634                                 }
635 <xui>{xuistop2} {
636                                         char               *ident;
637
638                                         BEGIN(INITIAL);
639                                         if (literallen == 0)
640                                                 yyerror("zero-length delimited identifier");
641                                         ident = litbuf_udeescape(yytext[yyleng - 2]);   /* escape char is the one between the UESCAPE quotes */
642                                         if (strlen(ident) >= NAMEDATALEN)       /* measure the de-escaped name; literallen counts the raw escaped text */
643                                                 truncate_identifier(ident, strlen(ident), true);
644                                         yylval.str = ident;
645                                         return IDENT;
646                                 }
647 <xd,xui>{xddouble}      {
648                                         addlitchar('"');        /* doubled double-quote stands for one */
649                                 }
650 <xd,xui>{xdinside}      {
651                                         addlit(yytext, yyleng);
652                                 }
653 <xd,xui><<EOF>>         { yyerror("unterminated quoted identifier"); }
654
{xufailed}	{
					SET_YYLLOC();
					/* keep only the leading u/U; push the rest back for rescanning */
					yyless(1);
					/*
					 * What remains is handled the way {identifier} handles a
					 * non-keyword: downcase it, truncating if necessary.
					 */
					yylval.str = downcase_truncate_identifier(yytext, yyleng, true);
					return IDENT;
				}
666
{typecast}		{
					/* record the token position, then hand back the TYPECAST token */
					SET_YYLLOC();
					return TYPECAST;
				}
671
{self}			{
					/* the token code is the matched character itself */
					SET_YYLLOC();
					return yytext[0];
				}
676
{operator}		{
					/*
					 * An embedded slash-star or dash-dash starts a comment,
					 * so the operator has to end just before whichever of
					 * them occurs first.  (When either one is at the very
					 * start of the text, an earlier rule matches instead of
					 * this one.)
					 */
					int		nchars = yyleng;
					char   *cstart = strstr(yytext, "/*");
					char   *dashes = strstr(yytext, "--");

					if (dashes != NULL && (cstart == NULL || dashes < cstart))
						cstart = dashes;
					if (cstart != NULL)
						nchars = cstart - yytext;

					/*
					 * SQL compatibility: a multi-character operator may not
					 * end in '+' or '-' unless it also contains a character
					 * that never appears in SQL operators.  That way '=-'
					 * lexes as two operators, while names such as '?-',
					 * which cannot be a sequence of SQL operators, remain
					 * legal.
					 */
					while (nchars > 1 &&
						   (yytext[nchars - 1] == '+' ||
							yytext[nchars - 1] == '-'))
					{
						bool	has_nonsql = false;
						int		ic;

						/* look through everything before the trailing sign */
						for (ic = nchars - 2; ic >= 0 && !has_nonsql; ic--)
						{
							if (strchr("~!@#^&|`?%", yytext[ic]))
								has_nonsql = true;
						}
						if (has_nonsql)
							break;		/* the trailing sign is allowed */
						nchars--;		/* drop the sign and test again */
					}

					SET_YYLLOC();

					if (nchars < yyleng)
					{
						/* give the unwanted tail back to the scanner */
						yyless(nchars);
						/*
						 * A single remaining character that the "self" rule
						 * would have matched is returned as a character
						 * token, exactly as that rule would do.
						 */
						if (nchars == 1 &&
							strchr(",()[].;:+-*/%^<>=", yytext[0]))
							return yytext[0];
					}

					/*
					 * Over-long operators are an error rather than a
					 * notice-and-truncate as for identifiers, since we are
					 * most likely looking at a syntactic mistake anyway.
					 */
					if (nchars >= NAMEDATALEN)
						yyerror("operator too long");

					/* "!=" is reported as "<>" for compatibility */
					yylval.str = pstrdup(strcmp(yytext, "!=") == 0 ? "<>" : yytext);
					return Op;
				}
756
{param}			{
					long		val;
					char	   *endptr;

					SET_YYLLOC();
					/*
					 * Convert the digits after the leading character with
					 * strtol() so that overflow is detectable; atol() gives
					 * no error indication, and a value wider than int4
					 * would silently turn into a different parameter number.
					 */
					errno = 0;
					val = strtol(yytext + 1, &endptr, 10);
					if (*endptr != '\0' || errno == ERANGE
#ifdef HAVE_LONG_INT_64
						/* if long > 32 bits, check for overflow of int4 */
						|| val != (long) ((int32) val)
#endif
						)
						yyerror("parameter number too large");
					yylval.ival = val;
					return PARAM;
				}
762
{integer}		{
					long		val;
					char	   *endptr;
					bool		fits_int4;

					SET_YYLLOC();
					errno = 0;
					val = strtol(yytext, &endptr, 10);
					fits_int4 = (*endptr == '\0' && errno != ERANGE);
#ifdef HAVE_LONG_INT_64
					/* long is wider than 32 bits here, so also test the int4 range */
					if (fits_int4 && val != (long) ((int32) val))
						fits_int4 = false;
#endif
					if (!fits_int4)
					{
						/* too large for an integer constant: pass it on as a float */
						yylval.str = pstrdup(yytext);
						return FCONST;
					}
					yylval.ival = val;
					return ICONST;
				}
{decimal}		{
					/* returned as FCONST, carrying a copy of the matched text */
					SET_YYLLOC();
					yylval.str = pstrdup(yytext);
					return FCONST;
				}
{real}			{
					/* returned as FCONST, carrying a copy of the matched text */
					SET_YYLLOC();
					yylval.str = pstrdup(yytext);
					return FCONST;
				}
{realfail1}		{
					/*
					 * Push the trailing [Ee] back and treat the rest as
					 * {decimal}.  The input may really have been an
					 * {integer}, but this situation will almost certainly
					 * end in a syntax error anyway, so the two cases are
					 * not told apart.
					 */
					yyless(yyleng - 1);
					SET_YYLLOC();
					yylval.str = pstrdup(yytext);
					return FCONST;
				}
{realfail2}		{
					/* same as above, except that two characters ([Ee][+-]) go back */
					yyless(yyleng - 2);
					SET_YYLLOC();
					yylval.str = pstrdup(yytext);
					return FCONST;
				}
813
814
{identifier}	{
					const ScanKeyword *kw;

					SET_YYLLOC();

					/* Look the text up in the keyword table first */
					kw = ScanKeywordLookup(yytext);
					if (kw == NULL)
					{
						/*
						 * Not a keyword: it is an ordinary identifier, which
						 * gets folded to lower case and truncated if
						 * necessary.
						 */
						yylval.str = downcase_truncate_identifier(yytext, yyleng, true);
						return IDENT;
					}

					/* Keyword: report its name and return its token value */
					yylval.keyword = kw->name;
					return kw->value;
				}
837
{other}			{
					/* the token code is the matched character itself */
					SET_YYLLOC();
					return yytext[0];
				}
842
<<EOF>>			{
					/* record the end-of-input position before terminating the scan */
					SET_YYLLOC();
					yyterminate();
				}
847
848 %%
849
850 /*
851  * scanner_errposition
852  *              Report a lexer or grammar error cursor position, if possible.
853  *
854  * This is expected to be used within an ereport() call.  The return value
855  * is a dummy (always 0, in fact).
856  *
857  * Note that this can only be used for messages emitted during raw parsing
858  * (essentially, scan.l and gram.y), since it requires scanbuf to still be
859  * valid.
860  */
861 int
862 scanner_errposition(int location)
863 {
864         int             pos;
865
866         Assert(scanbuf != NULL);        /* else called from wrong place */
867         if (location < 0)
868                 return 0;                               /* no-op if location is unknown */
869
870         /* Convert byte offset to character number */
871         pos = pg_mbstrlen_with_len(scanbuf, location) + 1;
872         /* And pass it to the ereport mechanism */
873         return errposition(pos);
874 }
875
876 /*
877  * yyerror
878  *              Report a lexer or grammar error.
879  *
880  * The message's cursor position identifies the most recently lexed token.
881  * This is OK for syntax error messages from the Bison parser, because Bison
882  * parsers report error as soon as the first unparsable token is reached.
883  * Beware of using yyerror for other purposes, as the cursor position might
884  * be misleading!
885  */
886 void
887 yyerror(const char *message)
888 {
889         const char *loc = scanbuf + yylloc;
890
891         if (*loc == YY_END_OF_BUFFER_CHAR)
892         {
893                 ereport(ERROR,
894                                 (errcode(ERRCODE_SYNTAX_ERROR),
895                                  /* translator: %s is typically the translation of "syntax error" */
896                                  errmsg("%s at end of input", _(message)),
897                                  lexer_errposition()));
898         }
899         else
900         {
901                 ereport(ERROR,
902                                 (errcode(ERRCODE_SYNTAX_ERROR),
903                                  /* translator: first %s is typically the translation of "syntax error" */
904                                  errmsg("%s at or near \"%s\"", _(message), loc),
905                                  lexer_errposition()));
906         }
907 }
908
909
910 /*
911  * Called before any actual parsing is done
912  */
913 void
914 scanner_init(const char *str)
915 {
916         Size    slen = strlen(str);
917
918         /*
919          * Might be left over after ereport()
920          */
921         if (YY_CURRENT_BUFFER)
922                 yy_delete_buffer(YY_CURRENT_BUFFER);
923
924         /*
925          * Make a scan buffer with special termination needed by flex.
926          */
927         scanbuf = palloc(slen + 2);
928         memcpy(scanbuf, str, slen);
929         scanbuf[slen] = scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
930         scanbufhandle = yy_scan_buffer(scanbuf, slen + 2);
931
932         /* initialize literal buffer to a reasonable but expansible size */
933         literalalloc = 1024;
934         literalbuf = (char *) palloc(literalalloc);
935         startlit();
936
937         BEGIN(INITIAL);
938 }
939
940
941 /*
942  * Called after parsing is done to clean up after scanner_init()
943  */
944 void
945 scanner_finish(void)
946 {
947         yy_delete_buffer(scanbufhandle);
948         pfree(scanbuf);
949         scanbuf = NULL;
950 }
951
952
953 static void
954 addlit(char *ytext, int yleng)
955 {
956         /* enlarge buffer if needed */
957         if ((literallen+yleng) >= literalalloc)
958         {
959                 do {
960                         literalalloc *= 2;
961                 } while ((literallen+yleng) >= literalalloc);
962                 literalbuf = (char *) repalloc(literalbuf, literalalloc);
963         }
964         /* append new data, add trailing null */
965         memcpy(literalbuf+literallen, ytext, yleng);
966         literallen += yleng;
967         literalbuf[literallen] = '\0';
968 }
969
970
971 static void
972 addlitchar(unsigned char ychar)
973 {
974         /* enlarge buffer if needed */
975         if ((literallen+1) >= literalalloc)
976         {
977                 literalalloc *= 2;
978                 literalbuf = (char *) repalloc(literalbuf, literalalloc);
979         }
980         /* append new data, add trailing null */
981         literalbuf[literallen] = ychar;
982         literallen += 1;
983         literalbuf[literallen] = '\0';
984 }
985
986
987 /*
988  * One might be tempted to write pstrdup(literalbuf) instead of this,
989  * but for long literals this is much faster because the length is
990  * already known.
991  */
992 static char *
993 litbufdup(void)
994 {
995         char *new;
996
997         new = palloc(literallen + 1);
998         memcpy(new, literalbuf, literallen+1);
999         return new;
1000 }
1001
1002 static int
1003 hexval(unsigned char c)
1004 {
1005         if (c >= '0' && c <= '9')
1006                 return c - '0';
1007         if (c >= 'a' && c <= 'f')
1008                 return c - 'a' + 0xA;
1009         if (c >= 'A' && c <= 'F')
1010                 return c - 'A' + 0xA;
1011         elog(ERROR, "invalid hexadecimal digit");
1012         return 0; /* not reached */
1013 }
1014
1015 static void
1016 check_unicode_value(pg_wchar c, char * loc)
1017 {
1018         if (GetDatabaseEncoding() == PG_UTF8)
1019                 return;
1020
1021         if (c > 0x7F)
1022         {
1023                 yylloc += (char *) loc - literalbuf + 3;   /* 3 for U&" */
1024                 yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
1025         }
1026 }
1027
1028 static char *
1029 litbuf_udeescape(unsigned char escape)
1030 {
1031         char *new;
1032         char *in, *out;
1033
1034         if (isxdigit(escape)
1035                 || escape == '+'
1036                 || escape == '\''
1037                 || escape == '"'
1038                 || scanner_isspace(escape))
1039         {
1040                 yylloc += literallen + yyleng + 1;
1041                 yyerror("invalid Unicode escape character");
1042         }
1043
1044         /*
1045          * This relies on the subtle assumption that a UTF-8 expansion
1046          * cannot be longer than its escaped representation.
1047          */
1048         new = palloc(literallen + 1);
1049
1050         in = literalbuf;
1051         out = new;
1052         while (*in)
1053         {
1054                 if (in[0] == escape)
1055                 {
1056                         if (in[1] == escape)
1057                         {
1058                                 *out++ = escape;
1059                                 in += 2;
1060                         }
1061                         else if (isxdigit(in[1]) && isxdigit(in[2]) && isxdigit(in[3]) && isxdigit(in[4]))
1062                         {
1063                                 pg_wchar unicode = hexval(in[1]) * 16*16*16 + hexval(in[2]) * 16*16 + hexval(in[3]) * 16 + hexval(in[4]);
1064                                 check_unicode_value(unicode, in);
1065                                 unicode_to_utf8(unicode, (unsigned char *) out);
1066                                 in += 5;
1067                                 out += pg_mblen(out);
1068                         }
1069                         else if (in[1] == '+'
1070                                          && isxdigit(in[2]) && isxdigit(in[3])
1071                                          && isxdigit(in[4]) && isxdigit(in[5])
1072                                          && isxdigit(in[6]) && isxdigit(in[7]))
1073                         {
1074                                 pg_wchar unicode = hexval(in[2]) * 16*16*16*16*16 + hexval(in[3]) * 16*16*16*16 + hexval(in[4]) * 16*16*16
1075                                                                         + hexval(in[5]) * 16*16 + hexval(in[6]) * 16 + hexval(in[7]);
1076                                 check_unicode_value(unicode, in);
1077                                 unicode_to_utf8(unicode, (unsigned char *) out);
1078                                 in += 8;
1079                                 out += pg_mblen(out);
1080                         }
1081                         else
1082                         {
1083                                 yylloc += in - literalbuf + 3;   /* 3 for U&" */
1084                                 yyerror("invalid Unicode escape value");
1085                         }
1086                 }
1087                 else
1088                         *out++ = *in++;
1089         }
1090
1091         *out = '\0';
1092         /*
1093          * We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII
1094          * codes; but it's probably not worth the trouble, since this isn't
1095          * likely to be a performance-critical path.
1096          */
1097         pg_verifymbstr(new, out - new, false);
1098         return new;
1099 }
1100
1101 static unsigned char
1102 unescape_single_char(unsigned char c)
1103 {
1104         switch (c)
1105         {
1106                 case 'b':
1107                         return '\b';
1108                 case 'f':
1109                         return '\f';
1110                 case 'n':
1111                         return '\n';
1112                 case 'r':
1113                         return '\r';
1114                 case 't':
1115                         return '\t';
1116                 default:
1117                         /* check for backslash followed by non-7-bit-ASCII */
1118                         if (c == '\0' || IS_HIGHBIT_SET(c))
1119                                 saw_non_ascii = true;
1120
1121                         return c;
1122         }
1123 }
1124
1125 static void
1126 check_string_escape_warning(unsigned char ychar)
1127 {
1128         if (ychar == '\'')
1129         {
1130                 if (warn_on_first_escape && escape_string_warning)
1131                         ereport(WARNING,
1132                                         (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1133                                          errmsg("nonstandard use of \\' in a string literal"),
1134                                          errhint("Use '' to write quotes in strings, or use the escape string syntax (E'...')."),
1135                                          lexer_errposition()));
1136                 warn_on_first_escape = false;   /* warn only once per string */
1137         }
1138         else if (ychar == '\\')
1139         {
1140                 if (warn_on_first_escape && escape_string_warning)
1141                         ereport(WARNING,
1142                                         (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1143                                          errmsg("nonstandard use of \\\\ in a string literal"),
1144                                          errhint("Use the escape string syntax for backslashes, e.g., E'\\\\'."),
1145                                          lexer_errposition()));
1146                 warn_on_first_escape = false;   /* warn only once per string */
1147         }
1148         else
1149                 check_escape_warning();
1150 }
1151
1152 static void
1153 check_escape_warning(void)
1154 {
1155         if (warn_on_first_escape && escape_string_warning)
1156                 ereport(WARNING,
1157                                 (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1158                                  errmsg("nonstandard use of escape in a string literal"),
1159                                  errhint("Use the escape string syntax for escapes, e.g., E'\\r\\n'."),
1160                                  lexer_errposition()));
1161         warn_on_first_escape = false;   /* warn only once per string */
1162 }