granicus.if.org Git - postgresql/blob - src/backend/parser/scan.l

   1 %{
   2 /*-------------------------------------------------------------------------
   3  *
   4  * scan.l
   5  *        lexical scanner for PostgreSQL
   6  *
   7  * NOTE NOTE NOTE:
   8  *
   9  * The rules in this file must be kept in sync with psql's lexer!!!
  10  *
  11  * The rules are designed so that the scanner never has to backtrack,
  12  * in the sense that there is always a rule that can match the input
  13  * consumed so far (the rule action may internally throw back some input
  14  * with yyless(), however).  As explained in the flex manual, this makes
  15  * for a useful speed increase --- about a third faster than a plain -CF
  16  * lexer, in simple testing.  The extra complexity is mostly in the rules
  17  * for handling float numbers and continued string literals.  If you change
  18  * the lexical rules, verify that you haven't broken the no-backtrack
  19  * property by running flex with the "-b" option and checking that the
  20  * resulting "lex.backup" file says that no backing up is needed.
  21  *
  22  *
  23  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  24  * Portions Copyright (c) 1994, Regents of the University of California
  25  *
  26  * IDENTIFICATION
  27  *        $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.152 2009/05/05 18:32:17 petere Exp $
  28  *
  29  *-------------------------------------------------------------------------
  30  */
  31 #include "postgres.h"
  32
  33 #include <ctype.h>
  34 #include <unistd.h>
  35
  36 #include "parser/gramparse.h"
  37 #include "parser/keywords.h"
  38 /* Not needed now that this file is compiled as part of gram.y */
  39 /* #include "parser/gram.h" */
  40 #include "parser/scansup.h"
  41 #include "mb/pg_wchar.h"
  42
  43
  44 /* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
  45 #undef fprintf
  46 #define fprintf(file, fmt, msg)  ereport(ERROR, (errmsg_internal("%s", msg)))
  47
  48 static int              xcdepth = 0;    /* depth of nesting in slash-star comments */
  49 static char    *dolqstart;      /* current $foo$ quote start string */
  50
  51 /*
  52  * GUC variables.  This is a DIRECT violation of the warning given at the
  53  * head of gram.y, ie flex/bison code must not depend on any GUC variables;
  54  * as such, changing their values can induce very unintuitive behavior.
  55  * But we shall have to live with it as a short-term thing until the switch
  56  * to SQL-standard string syntax is complete.
  57  */
  58 int                             backslash_quote = BACKSLASH_QUOTE_SAFE_ENCODING;
  59 bool                    escape_string_warning = true;
  60 bool                    standard_conforming_strings = false;
  61
  62 static bool             warn_on_first_escape;
  63 static bool             saw_non_ascii = false;
  64
  65 /*
  66  * literalbuf is used to accumulate literal values when multiple rules
  67  * are needed to parse a single literal.  Call startlit to reset buffer
  68  * to empty, addlit to add text.  Note that the buffer is palloc'd and
  69  * starts life afresh on every parse cycle.
  70  */
  71 static char        *literalbuf;         /* expandable buffer */
  72 static int              literallen;             /* actual current length */
  73 static int              literalalloc;   /* current allocated buffer size */
  74
  75 #define startlit()  (literalbuf[0] = '\0', literallen = 0)
  76 static void addlit(char *ytext, int yleng);
  77 static void addlitchar(unsigned char ychar);
  78 static char *litbufdup(void);
  79 static char *litbuf_udeescape(unsigned char escape);
  80
  81 #define lexer_errposition()  scanner_errposition(yylloc)
  82
  83 static void check_escape_warning(void);
  84 static void check_string_escape_warning(unsigned char ychar);
  85
  86 /*
  87  * Each call to yylex must set yylloc to the location of the found token
  88  * (expressed as a byte offset from the start of the input text).
  89  * When we parse a token that requires multiple lexer rules to process,
  90  * this should be done in the first such rule, else yylloc will point
  91  * into the middle of the token.
  92  */
  93 #define SET_YYLLOC()  (yylloc = yytext - scanbuf)
  94
  95 /* Handles to the buffer that the lexer uses internally */
  96 static YY_BUFFER_STATE scanbufhandle;
  97 static char *scanbuf;
  98
  99 static unsigned char unescape_single_char(unsigned char c);
 100
 101 %}
 102
 103 %option 8bit
 104 %option never-interactive
 105 %option nodefault
 106 %option noinput
 107 %option nounput
 108 %option noyywrap
 109 %option prefix="base_yy"
 110
 111 /*
 112  * OK, here is a short description of lex/flex rules behavior.
 113  * The longest pattern which matches an input string is always chosen.
 114  * For equal-length patterns, the first occurring in the rules list is chosen.
 115  * INITIAL is the starting state, to which all non-conditional rules apply.
 116  * Exclusive states change parsing rules while the state is active.  When in
 117  * an exclusive state, only those rules defined for that state apply.
 118  *
 119  * We use exclusive states for quoted strings, extended comments,
 120  * and to eliminate parsing troubles for numeric strings.
 121  * Exclusive states:
 122  *  <xb> bit string literal
 123  *  <xc> extended C-style comments
 124  *  <xd> delimited identifiers (double-quoted identifiers)
 125  *  <xh> hexadecimal numeric string
 126  *  <xq> standard quoted strings
 127  *  <xe> extended quoted strings (support backslash escape sequences)
 128  *  <xdolq> $foo$ quoted strings
 129  *  <xui> quoted identifier with Unicode escapes
 130  *  <xus> quoted string with Unicode escapes
 131  */
 132
 133 %x xb
 134 %x xc
 135 %x xd
 136 %x xh
 137 %x xe
 138 %x xq
 139 %x xdolq
 140 %x xui
 141 %x xus
 142
 143 /*
 144  * In order to make the world safe for Windows and Mac clients as well as
 145  * Unix ones, we accept either \n or \r as a newline.  A DOS-style \r\n
 146  * sequence will be seen as two successive newlines, but that doesn't cause
 147  * any problems.  Comments that start with -- and extend to the next
 148  * newline are treated as equivalent to a single whitespace character.
 149  *
 150  * NOTE a fine point: if there is no newline following --, we will absorb
 151  * everything to the end of the input as a comment.  This is correct.  Older
 152  * versions of Postgres failed to recognize -- as a comment if the input
 153  * did not end with a newline.
 154  *
 155  * XXX perhaps \f (formfeed) should be treated as a newline as well?
 156  *
 157  * XXX if you change the set of whitespace characters, fix scanner_isspace()
 158  * to agree, and see also the plpgsql lexer.
 159  */
 160
 161 space                   [ \t\n\r\f]
 162 horiz_space             [ \t\f]
 163 newline                 [\n\r]
 164 non_newline             [^\n\r]
 165
 166 comment                 ("--"{non_newline}*)
 167
 168 whitespace              ({space}+|{comment})
 169
 170 /*
 171  * SQL requires at least one newline in the whitespace separating
 172  * string literals that are to be concatenated.  Silly, but who are we
 173  * to argue?  Note that {whitespace_with_newline} should not have * after
 174  * it, whereas {whitespace} should generally have a * after it...
 175  */
 176
 177 special_whitespace              ({space}+|{comment}{newline})
 178 horiz_whitespace                ({horiz_space}|{comment})
 179 whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*)
 180
 181 /*
 182  * To ensure that {quotecontinue} can be scanned without having to back up
 183  * if the full pattern isn't matched, we include trailing whitespace in
 184  * {quotestop}.  This matches all cases where {quotecontinue} fails to match,
 185  * except for {quote} followed by whitespace and just one "-" (not two,
 186  * which would start a {comment}).  To cover that we have {quotefail}.
 187  * The actions for {quotestop} and {quotefail} must throw back characters
 188  * beyond the quote proper.
 189  */
 190 quote                   '
 191 quotestop               {quote}{whitespace}*
 192 quotecontinue   {quote}{whitespace_with_newline}{quote}
 193 quotefail               {quote}{whitespace}*"-"
 194
 195 /* Bit string
 196  * It is tempting to scan the string for only those characters
 197  * which are allowed. However, this leads to silently swallowed
 198  * characters if illegal characters are included in the string.
 199  * For example, if xbinside is [01] then B'ABCD' is interpreted
 200  * as a zero-length string, and the ABCD' is lost!
 201  * Better to pass the string forward and let the input routines
 202  * validate the contents.
 203  */
 204 xbstart                 [bB]{quote}
 205 xbinside                [^']*
 206
 207 /* Hexadecimal number */
 208 xhstart                 [xX]{quote}
 209 xhinside                [^']*
 210
 211 /* National character */
 212 xnstart                 [nN]{quote}
 213
 214 /* Quoted string that allows backslash escapes */
 215 xestart                 [eE]{quote}
 216 xeinside                [^\\']+
 217 xeescape                [\\][^0-7]
 218 xeoctesc                [\\][0-7]{1,3}
 219 xehexesc                [\\]x[0-9A-Fa-f]{1,2}
 220
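 221 /* Standard quoted string (these quote patterns are shared by the other string states)
 222  * xqdouble implements an embedded quote, written as two adjacent quotes, e.g. ''''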
 223  */
 224 xqstart                 {quote}
 225 xqdouble                {quote}{quote}
 226 xqinside                [^']+
 227
 228 /* $foo$ style quotes ("dollar quoting")
 229  * The quoted string starts with $foo$ where "foo" is an optional string
 230  * in the form of an identifier, except that it may not contain "$",
 231  * and extends to the first occurrence of an identical string.
 232  * There is *no* processing of the quoted text.
 233  *
 234  * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim}
 235  * fails to match its trailing "$".
 236  */
 237 dolq_start              [A-Za-z\200-\377_]
 238 dolq_cont               [A-Za-z\200-\377_0-9]
 239 dolqdelim               \$({dolq_start}{dolq_cont}*)?\$
 240 dolqfailed              \${dolq_start}{dolq_cont}*
 241 dolqinside              [^$]+
 242
 243 /* Double quote
 244  * Allows embedded spaces and other special characters into identifiers.
 245  */
 246 dquote                  \"
 247 xdstart                 {dquote}
 248 xdstop                  {dquote}
 249 xddouble                {dquote}{dquote}
 250 xdinside                [^"]+
 251
 252 /* Unicode escapes */
 253 uescape                 [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
 254 /* error rule to avoid backup */
 255 uescapefail             ("-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU])
 256
 257 /* Quoted identifier with Unicode escapes */
 258 xuistart                [uU]&{dquote}
 259 xuistop1                {dquote}{whitespace}*{uescapefail}?
 260 xuistop2                {dquote}{whitespace}*{uescape}
 261
 262 /* Quoted string with Unicode escapes */
 263 xusstart                [uU]&{quote}
 264 xusstop1                {quote}{whitespace}*{uescapefail}?
 265 xusstop2                {quote}{whitespace}*{uescape}
 266
 267 /* error rule to avoid backup */
 268 xufailed                [uU]&
 269
 270
 271 /* C-style comments
 272  *
 273  * The "extended comment" syntax closely resembles allowable operator syntax.
 274  * The tricky part here is to get lex to recognize a string starting with
 275  * slash-star as a comment, when interpreting it as an operator would produce
 276  * a longer match --- remember lex will prefer a longer match!  Also, if we
 277  * have something like plus-slash-star, lex will think this is a 3-character
 278  * operator whereas we want to see it as a + operator and a comment start.
 279  * The solution is two-fold:
 280  * 1. append {op_chars}* to xcstart so that it matches as much text as
 281  *    {operator} would. Then the tie-breaker (first matching rule of same
 282  *    length) ensures xcstart wins.  We put back the extra stuff with yyless()
 283  *    in case it contains a star-slash that should terminate the comment.
 284  * 2. In the operator rule, check for slash-star within the operator, and
 285  *    if found throw it back with yyless().  This handles the plus-slash-star
 286  *    problem.
 287  * Dash-dash comments have similar interactions with the operator rule.
 288  */
 289 xcstart                 \/\*{op_chars}*
 290 xcstop                  \*+\/
 291 xcinside                [^*/]+
 292
 293 digit                   [0-9]
 294 ident_start             [A-Za-z\200-\377_]
 295 ident_cont              [A-Za-z\200-\377_0-9\$]
 296
 297 identifier              {ident_start}{ident_cont}*
 298
 299 typecast                "::"
 300
 301 /*
 302  * "self" is the set of chars that should be returned as single-character
 303  * tokens.  "op_chars" is the set of chars that can make up "Op" tokens,
 304  * which can be one or more characters long (but if a single-char token
 305  * appears in the "self" set, it is not to be returned as an Op).  Note
 306  * that the sets overlap, but each has some chars that are not in the other.
 307  *
 308  * If you change either set, adjust the character lists appearing in the
 309  * rule for "operator"!
 310  */
 311 self                    [,()\[\].;\:\+\-\*\/\%\^\<\>\=]
 312 op_chars                [\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=]
 313 operator                {op_chars}+
 314
 315 /* we no longer allow unary minus in numbers.
 316  * instead we pass it separately to parser. there it gets
 317  * coerced via doNegate() -- Leon aug 20 1999
 318  *
 319  * {realfail1} and {realfail2} are added to prevent the need for scanner
 320  * backup when the {real} rule fails to match completely.
 321  */
 322
 323 integer                 {digit}+
 324 decimal                 (({digit}*\.{digit}+)|({digit}+\.{digit}*))
 325 real                    ({integer}|{decimal})[Ee][-+]?{digit}+
 326 realfail1               ({integer}|{decimal})[Ee]
 327 realfail2               ({integer}|{decimal})[Ee][-+]
 328
 329 param                   \${integer}
 330
 331 other                   .
 332
 333 /*
 334  * Dollar quoted strings are totally opaque, and no escaping is done on them.
 335  * Other quoted strings must allow some special characters such as single-quote
 336  *  and newline.
 337  * Embedded single-quotes are implemented both in the SQL standard
 338  *  style of two adjacent single quotes "''" and in the Postgres/Java style
 339  *  of escaped-quote "\'".
 340  * Other embedded escaped characters are matched explicitly and the leading
 341  *  backslash is dropped from the string.
 342  * Note that xcstart must appear before operator, as explained above!
 343  *  Also whitespace (comment) must appear before operator.
 344  */
 345
 346 %%
 347
 348 {whitespace}    {
 349                                         /* whitespace and "--" comments produce no token */
 350                                 }
 351
 352 {xcstart}               {
                                             /* Outermost comment opener: nesting depth starts over at zero */
 353                                         /* Set location in case of syntax error in comment */
 354                                         SET_YYLLOC();
 355                                         xcdepth = 0;
 356                                         BEGIN(xc);
 357                                         /* Put back any characters past slash-star; see above */
 358                                         yyless(2);
 359                                 }
 360
 361 <xc>{xcstart}   {
                                             /* Opener seen inside a comment: one more level for xcstop to close */
 362                                         xcdepth++;
 363                                         /* Put back any characters past slash-star; see above */
 364                                         yyless(2);
 365                                 }
 366
 367 <xc>{xcstop}    {
                                             /*
                                              * A comment terminator closes one nesting level; only the
                                              * terminator matching the outermost opener leaves <xc>.
                                              */
 368                                         if (xcdepth > 0)
 369                                                 xcdepth--;
 370                                         else
 371                                                 BEGIN(INITIAL);
 372                                 }
 373
 374 <xc>{xcinside}  {
 375                                         /* comment text containing no star or slash: discard */
 376                                 }
 377
 378 <xc>{op_chars}  {
 379                                         /* single operator character that is not part of a delimiter: discard */
 380                                 }
 381
 382 <xc>\*+                 {
 383                                         /* run of stars not followed by a slash (else xcstop is longer): discard */
 384                                 }
 385
 386 <xc><<EOF>>             { yyerror("unterminated /* comment"); }
 387
 388 {xbstart}               {
 389                                         /* Binary bit type.
 390                                          * At some point we should simply pass the string
 391                                          * forward to the parser and label it there.
 392                                          * In the meantime, place a leading "b" on the string
 393                                          * to mark it for the input routine as a binary string.
 394                                          */
 395                                         SET_YYLLOC();
 396                                         BEGIN(xb);
 397                                         startlit();
 398                                         addlitchar('b');
 399                                 }
 400 <xb>{quotestop} |
 401 <xb>{quotefail} {
                                             /* end of literal: consume only the quote, rescan what followed it */
 402                                         yyless(1);
 403                                         BEGIN(INITIAL);
 404                                         yylval.str = litbufdup();
 405                                         return BCONST;
 406                                 }
 407 <xh>{xhinside}  |
 408 <xb>{xbinside}  {
                                             /* accumulate the text unvalidated; the input routines check the contents */
 409                                         addlit(yytext, yyleng);
 410                                 }
 411 <xh>{quotecontinue}     |
 412 <xb>{quotecontinue}     {
 413                                         /* pieces separated by whitespace with a newline are concatenated: skip the joint */
 414                                 }
 415 <xb><<EOF>>             { yyerror("unterminated bit string literal"); }
 416
 417 {xhstart}               {
 418                                         /* Hexadecimal bit type.
 419                                          * At some point we should simply pass the string
 420                                          * forward to the parser and label it there.
 421                                          * In the meantime, place a leading "x" on the string
 422                                          * to mark it for the input routine as a hex string.
 423                                          */
 424                                         SET_YYLLOC();
 425                                         BEGIN(xh);
 426                                         startlit();
 427                                         addlitchar('x');
 428                                 }
 429 <xh>{quotestop} |
 430 <xh>{quotefail} {
                                             /* end of literal: consume only the quote, rescan what followed it */
 431                                         yyless(1);
 432                                         BEGIN(INITIAL);
 433                                         yylval.str = litbufdup();
 434                                         return XCONST;
 435                                 }
 436 <xh><<EOF>>             { yyerror("unterminated hexadecimal string literal"); }
 437
 438 {xnstart}               {
 439                                         /* National character.
 440                                          * We will pass this along as a normal character string,
 441                                          * but preceded with an internally-generated "NCHAR".
 442                                          */
 443                                         const ScanKeyword *keyword;
 444
 445                                         SET_YYLLOC();
 446                                         yyless(1);                              /* eat only 'n' this time */
 447                                         /* nchar had better be a keyword! */
 448                                         keyword = ScanKeywordLookup("nchar");
 449                                         Assert(keyword != NULL);
                                             /* return the keyword's own token; the thrown-back quote is scanned next */
 450                                         yylval.keyword = keyword->name;
 451                                         return keyword->value;
 452                                 }
 453
 454 {xqstart}               {
                                             /*
                                              * Plain '...' literal.  Backslash escapes are recognized
                                              * (state xe) unless standard_conforming_strings is on.
                                              * NOTE(review): warn_on_first_escape is presumably read by the
                                              * check_*escape_warning() helpers, which are outside this view.
                                              */
 455                                         warn_on_first_escape = true;
 456                                         saw_non_ascii = false;
 457                                         SET_YYLLOC();
 458                                         if (standard_conforming_strings)
 459                                                 BEGIN(xq);
 460                                         else
 461                                                 BEGIN(xe);
 462                                         startlit();
 463                                 }
 464 {xestart}               {
                                             /* E'...' literal: escapes were requested explicitly, so always use xe */
 465                                         warn_on_first_escape = false;
 466                                         saw_non_ascii = false;
 467                                         SET_YYLLOC();
 468                                         BEGIN(xe);
 469                                         startlit();
 470                                 }
 471 {xusstart}              {
                                             /*
                                              * String constant with Unicode escapes (U&'...').
                                              * Set the token location before anything else, so that the
                                              * error raised below can report a cursor position.
                                              */
                                             SET_YYLLOC();
 472                                         if (!standard_conforming_strings)
 473                                                 ereport(ERROR,
 474                                                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 475                                                                  errmsg("unsafe use of string constant with Unicode escapes"),
 476                                                                  errdetail("String constants with Unicode escapes cannot be used when standard_conforming_strings is off."),
                                                                      lexer_errposition()));
 478                                         BEGIN(xus);
 479                                         startlit();
 480                                 }
 481 <xq,xe>{quotestop}      |
 482 <xq,xe>{quotefail} {
                                             /* end of literal: consume only the quote, rescan what followed it */
 483                                         yyless(1);
 484                                         BEGIN(INITIAL);
 485                                         /*
 486                                          * check that the data remains valid if it might have been
 487                                          * made invalid by unescaping any chars.
 488                                          */
 489                                         if (saw_non_ascii)
 490                                                 pg_verifymbstr(literalbuf, literallen, false);
 491                                         yylval.str = litbufdup();
 492                                         return SCONST;
 493                                 }
 494 <xus>{xusstop1} {
                                             /* no complete UESCAPE clause follows: backslash is the escape character */
 495                                         /* throw back all but the quote */
 496                                         yyless(1);
 497                                         BEGIN(INITIAL);
 498                                         yylval.str = litbuf_udeescape('\\');
 499                                         return SCONST;
 500                                 }
 501 <xus>{xusstop2} {
                                             /*
                                              * A UESCAPE 'c' clause was matched as part of this token; the
                                              * escape character is the one just before the final quote.
                                              */
 502                                         BEGIN(INITIAL);
 503                                         yylval.str = litbuf_udeescape(yytext[yyleng-2]);
 504                                         return SCONST;
 505                                 }
 506 <xq,xe,xus>{xqdouble} {
                                             /* two adjacent quotes stand for one literal quote */
 507                                         addlitchar('\'');
 508                                 }
 509 <xq,xus>{xqinside}  {
                                             /* ordinary text (backslash included, no escape handling here): copy as-is */
 510                                         addlit(yytext, yyleng);
 511                                 }
 512 <xe>{xeinside}  {
                                             /* text containing neither quote nor backslash: copy as-is */
 513                                         addlit(yytext, yyleng);
 514                                 }
 515 <xe>{xeescape}  {
                                             /*
                                              * Backslash followed by any character other than an octal
                                              * digit.  An escaped quote is rejected outright when the
                                              * backslash_quote setting forbids it.
                                              */
 516                                         if (yytext[1] == '\'')
 517                                         {
 518                                                 if (backslash_quote == BACKSLASH_QUOTE_OFF ||
 519                                                         (backslash_quote == BACKSLASH_QUOTE_SAFE_ENCODING &&
 520                                                          PG_ENCODING_IS_CLIENT_ONLY(pg_get_client_encoding())))
 521                                                         ereport(ERROR,
 522                                                                         (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
 523                                                                          errmsg("unsafe use of \\' in a string literal"),
 524                                                                          errhint("Use '' to write quotes in strings. \\' is insecure in client-only encodings."),
 525                                                                          lexer_errposition()));
 526                                         }
 527                                         check_string_escape_warning(yytext[1]);
                                             /* store the character the escape stands for, not the backslash */
 528                                         addlitchar(unescape_single_char(yytext[1]));
 529                                 }
 530 <xe>{xeoctesc}  {
                                             /* backslash plus 1-3 octal digits; the value is narrowed to one byte */
 531                                         unsigned char c = strtoul(yytext+1, NULL, 8);
 532
 533                                         check_escape_warning();
 534                                         addlitchar(c);
                                             /* a NUL or high-bit byte may invalidate the string; flag it for the end-of-literal check */
 535                                         if (c == '\0' || IS_HIGHBIT_SET(c))
 536                                                 saw_non_ascii = true;
 537                                 }
 538 <xe>{xehexesc}  {
                                             /* backslash, "x", then 1-2 hex digits; yytext+2 skips the "\x" */
 539                                         unsigned char c = strtoul(yytext+2, NULL, 16);
 540
 541                                         check_escape_warning();
 542                                         addlitchar(c);
                                             /* a NUL or high-bit byte may invalidate the string; flag it for the end-of-literal check */
 543                                         if (c == '\0' || IS_HIGHBIT_SET(c))
 544                                                 saw_non_ascii = true;
 545                                 }
 546 <xq,xe,xus>{quotecontinue} {
 547                                         /* pieces separated by whitespace with a newline are concatenated: skip the joint */
 548                                 }
 549 <xe>.                   {
 550                                         /* This is only needed for \ just before EOF */
 551                                         addlitchar(yytext[0]);
 552                                 }
 553 <xq,xe,xus><<EOF>>              { yyerror("unterminated quoted string"); }
 554
 555 {dolqdelim}             {
                                             /* opening $tag$: keep a copy of it to compare with candidate closers */
 556                                         SET_YYLLOC();
 557                                         dolqstart = pstrdup(yytext);
 558                                         BEGIN(xdolq);
 559                                         startlit();
 560                                 }
 561 {dolqfailed}    {
                                             /* "$" plus identifier text but no closing "$": not a dollar quote */
 562                                         SET_YYLLOC();
 563                                         /* throw back all but the initial "$" */
 564                                         yyless(1);
 565                                         /* and treat it as {other} */
 566                                         return yytext[0];
 567                                 }
 568 <xdolq>{dolqdelim} {
                                             /* candidate closer: it ends the literal only if identical to the opener */
 569                                         if (strcmp(yytext, dolqstart) == 0)
 570                                         {
 571                                                 pfree(dolqstart);
 572                                                 BEGIN(INITIAL);
 573                                                 yylval.str = litbufdup();
 574                                                 return SCONST;
 575                                         }
 576                                         else
 577                                         {
 578                                                 /*
 579                                                  * When we fail to match $...$ to dolqstart, transfer
 580                                                  * the $... part to the output, but put back the final
 581                                                  * $ for rescanning.  Consider $delim$...$junk$delim$
 582                                                  */
 583                                                 addlit(yytext, yyleng-1);
 584                                                 yyless(yyleng-1);
 585                                         }
 586                                 }
 587 <xdolq>{dolqinside} {
                                             /* text containing no "$": copy verbatim */
 588                                         addlit(yytext, yyleng);
 589                                 }
 590 <xdolq>{dolqfailed} {
                                             /* "$" plus identifier text with no closing "$": plain content, copy verbatim */
 591                                         addlit(yytext, yyleng);
 592                                 }
 593 <xdolq>.                {
 594                                         /* This is only needed for $ inside the quoted text */
 595                                         addlitchar(yytext[0]);
 596                                 }
 597 <xdolq><<EOF>>  { yyerror("unterminated dollar-quoted string"); }
 598
 599 {xdstart}               {
                                             /* opening double quote of a delimited identifier */
 600                                         SET_YYLLOC();
 601                                         BEGIN(xd);
 602                                         startlit();
 603                                 }
 604 {xuistart}              {
                                             /* opening U&" of a delimited identifier with Unicode escapes */
 605                                         SET_YYLLOC();
 606                                         BEGIN(xui);
 607                                         startlit();
 608                                 }
 609 <xd>{xdstop}    {
 610                                         char               *ident;
 611
                                             /* closing quote: the accumulated text becomes an IDENT token */
 612                                         BEGIN(INITIAL);
                                             /* NOTE(review): relies on yyerror() not returning -- confirm */
 613                                         if (literallen == 0)
 614                                                 yyerror("zero-length delimited identifier");
 615                                         ident = litbufdup();
                                             /* the text is copied unmodified, so literallen is the identifier's length */
 616                                         if (literallen >= NAMEDATALEN)
 617                                                 truncate_identifier(ident, literallen, true);
 618                                         yylval.str = ident;
 619                                         return IDENT;
 620                                 }
 621 <xui>{xuistop1} {
 622                                         char               *ident;
                                             int                 identlen;
 623
                                             /*
                                              * End of a U&"..." identifier with no complete UESCAPE
                                              * clause: backslash is the escape character.
                                              */
 624                                         BEGIN(INITIAL);
 625                                         if (literallen == 0)
 626                                                 yyerror("zero-length delimited identifier");
 627                                         ident = litbuf_udeescape('\\');
                                             /*
                                              * Check the length of the de-escaped identifier.  literallen
                                              * is the length of the raw text, escape sequences included,
                                              * so using it here could report (and apply) a truncation
                                              * to an identifier that actually fits.
                                              */
                                             identlen = strlen(ident);
 628                                         if (identlen >= NAMEDATALEN)
 629                                                 truncate_identifier(ident, identlen, true);
 630                                         yylval.str = ident;
 631                                         /* throw back all but the quote */
 632                                         yyless(1);
 633                                         return IDENT;
 634                                 }
 635 <xui>{xuistop2} {
 636                                         char               *ident;
                                             int                 identlen;
 637
                                             /*
                                              * End of a U&"..." identifier followed by UESCAPE 'c'; the
                                              * escape character is the one just before the final quote.
                                              */
 638                                         BEGIN(INITIAL);
 639                                         if (literallen == 0)
 640                                                 yyerror("zero-length delimited identifier");
 641                                         ident = litbuf_udeescape(yytext[yyleng - 2]);
                                             /*
                                              * Check the length of the de-escaped identifier.  literallen
                                              * is the length of the raw text, escape sequences included,
                                              * so using it here could report (and apply) a truncation
                                              * to an identifier that actually fits.
                                              */
                                             identlen = strlen(ident);
 642                                         if (identlen >= NAMEDATALEN)
 643                                                 truncate_identifier(ident, identlen, true);
 644                                         yylval.str = ident;
 645                                         return IDENT;
 646                                 }
 647 <xd,xui>{xddouble}      {
 648                                         addlitchar('"');
 649                                 }
 650 <xd,xui>{xdinside}      {
 651                                         addlit(yytext, yyleng);
 652                                 }
 653 <xd,xui><<EOF>>         { yyerror("unterminated quoted identifier"); }
 654
 655 {xufailed}      {
 656                                         char               *ident;
 657
 658                                         SET_YYLLOC();
 659                                         /* throw back all but the initial u/U */
 660                                         yyless(1);
 661                                         /* and treat it as {identifier} */
 662                                         ident = downcase_truncate_identifier(yytext, yyleng, true);
 663                                         yylval.str = ident;
 664                                         return IDENT;
 665                                 }
 666
 667 {typecast}              {
 668                                         SET_YYLLOC();
 669                                         return TYPECAST;
 670                                 }
 671
 672 {self}                  {
 673                                         SET_YYLLOC();
 674                                         return yytext[0];
 675                                 }
 676
 677 {operator}              {
 678                                         /*
 679                                          * Check for embedded slash-star or dash-dash; those
 680                                          * are comment starts, so operator must stop there.
 681                                          * Note that slash-star or dash-dash at the first
 682                                          * character will match a prior rule, not this one.
 683                                          */
 684                                         int             nchars = yyleng;
 685                                         char   *slashstar = strstr(yytext, "/*");
 686                                         char   *dashdash = strstr(yytext, "--");
 687
 688                                         if (slashstar && dashdash)
 689                                         {
 690                                                 /* if both appear, take the first one */
 691                                                 if (slashstar > dashdash)
 692                                                         slashstar = dashdash;
 693                                         }
 694                                         else if (!slashstar)
 695                                                 slashstar = dashdash;
 696                                         if (slashstar)
 697                                                 nchars = slashstar - yytext;
 698
 699                                         /*
 700                                          * For SQL compatibility, '+' and '-' cannot be the
 701                                          * last char of a multi-char operator unless the operator
 702                                          * contains chars that are not in SQL operators.
 703                                          * The idea is to lex '=-' as two operators, but not
 704                                          * to forbid operator names like '?-' that could not be
 705                                          * sequences of SQL operators.
 706                                          */
 707                                         while (nchars > 1 &&
 708                                                    (yytext[nchars-1] == '+' ||
 709                                                         yytext[nchars-1] == '-'))
 710                                         {
 711                                                 int             ic;
 712
 713                                                 for (ic = nchars-2; ic >= 0; ic--)
 714                                                 {
 715                                                         if (strchr("~!@#^&|`?%", yytext[ic]))
 716                                                                 break;
 717                                                 }
 718                                                 if (ic >= 0)
 719                                                         break; /* found a char that makes it OK */
 720                                                 nchars--; /* else remove the +/-, and check again */
 721                                         }
 722
 723                                         SET_YYLLOC();
 724
 725                                         if (nchars < yyleng)
 726                                         {
 727                                                 /* Strip the unwanted chars from the token */
 728                                                 yyless(nchars);
 729                                                 /*
 730                                                  * If what we have left is only one char, and it's
 731                                                  * one of the characters matching "self", then
 732                                                  * return it as a character token the same way
 733                                                  * that the "self" rule would have.
 734                                                  */
 735                                                 if (nchars == 1 &&
 736                                                         strchr(",()[].;:+-*/%^<>=", yytext[0]))
 737                                                         return yytext[0];
 738                                         }
 739
 740                                         /*
 741                                          * Complain if operator is too long.  Unlike the case
 742                                          * for identifiers, we make this an error not a notice-
 743                                          * and-truncate, because the odds are we are looking at
 744                                          * a syntactic mistake anyway.
 745                                          */
 746                                         if (nchars >= NAMEDATALEN)
 747                                                 yyerror("operator too long");
 748
 749                                         /* Convert "!=" operator to "<>" for compatibility */
 750                                         if (strcmp(yytext, "!=") == 0)
 751                                                 yylval.str = pstrdup("<>");
 752                                         else
 753                                                 yylval.str = pstrdup(yytext);
 754                                         return Op;
 755                                 }
 756
 757 {param}                 {
 758                                         SET_YYLLOC();
 759                                         yylval.ival = atol(yytext + 1);
 760                                         return PARAM;
 761                                 }
 762
 763 {integer}               {
 764                                         long val;
 765                                         char* endptr;
 766
 767                                         SET_YYLLOC();
 768                                         errno = 0;
 769                                         val = strtol(yytext, &endptr, 10);
 770                                         if (*endptr != '\0' || errno == ERANGE
 771 #ifdef HAVE_LONG_INT_64
 772                                                 /* if long > 32 bits, check for overflow of int4 */
 773                                                 || val != (long) ((int32) val)
 774 #endif
 775                                                 )
 776                                         {
 777                                                 /* integer too large, treat it as a float */
 778                                                 yylval.str = pstrdup(yytext);
 779                                                 return FCONST;
 780                                         }
 781                                         yylval.ival = val;
 782                                         return ICONST;
 783                                 }
 784 {decimal}               {
 785                                         SET_YYLLOC();
 786                                         yylval.str = pstrdup(yytext);
 787                                         return FCONST;
 788                                 }
 789 {real}                  {
 790                                         SET_YYLLOC();
 791                                         yylval.str = pstrdup(yytext);
 792                                         return FCONST;
 793                                 }
 794 {realfail1}             {
 795                                         /*
 796                                          * throw back the [Ee], and treat as {decimal}.  Note
 797                                          * that it is possible the input is actually {integer},
 798                                          * but since this case will almost certainly lead to a
 799                                          * syntax error anyway, we don't bother to distinguish.
 800                                          */
 801                                         yyless(yyleng-1);
 802                                         SET_YYLLOC();
 803                                         yylval.str = pstrdup(yytext);
 804                                         return FCONST;
 805                                 }
 806 {realfail2}             {
 807                                         /* throw back the [Ee][+-], and proceed as above */
 808                                         yyless(yyleng-2);
 809                                         SET_YYLLOC();
 810                                         yylval.str = pstrdup(yytext);
 811                                         return FCONST;
 812                                 }
 813
 814
 815 {identifier}    {
 816                                         const ScanKeyword *keyword;
 817                                         char               *ident;
 818
 819                                         SET_YYLLOC();
 820
 821                                         /* Is it a keyword? */
 822                                         keyword = ScanKeywordLookup(yytext);
 823                                         if (keyword != NULL)
 824                                         {
 825                                                 yylval.keyword = keyword->name;
 826                                                 return keyword->value;
 827                                         }
 828
 829                                         /*
 830                                          * No.  Convert the identifier to lower case, and truncate
 831                                          * if necessary.
 832                                          */
 833                                         ident = downcase_truncate_identifier(yytext, yyleng, true);
 834                                         yylval.str = ident;
 835                                         return IDENT;
 836                                 }
 837
 838 {other}                 {
 839                                         SET_YYLLOC();
 840                                         return yytext[0];
 841                                 }
 842
 843 <<EOF>>                 {
 844                                         SET_YYLLOC();
 845                                         yyterminate();
 846                                 }
 847
 848 %%
 849
 850 /*
 851  * scanner_errposition
 852  *              Report a lexer or grammar error cursor position, if possible.
 853  *
 854  * This is expected to be used within an ereport() call.  The return value
 855  * is a dummy (always 0, in fact).
 856  *
 857  * Note that this can only be used for messages emitted during raw parsing
 858  * (essentially, scan.l and gram.y), since it requires scanbuf to still be
 859  * valid.
 860  */
 861 int
 862 scanner_errposition(int location)
 863 {
 864         int             pos;
 865
 866         Assert(scanbuf != NULL);        /* else called from wrong place */
 867         if (location < 0)
 868                 return 0;                               /* no-op if location is unknown */
 869
 870         /* Convert byte offset to character number */
 871         pos = pg_mbstrlen_with_len(scanbuf, location) + 1;
 872         /* And pass it to the ereport mechanism */
 873         return errposition(pos);
 874 }
 875
 876 /*
 877  * yyerror
 878  *              Report a lexer or grammar error.
 879  *
 880  * The message's cursor position identifies the most recently lexed token.
 881  * This is OK for syntax error messages from the Bison parser, because Bison
 882  * parsers report error as soon as the first unparsable token is reached.
 883  * Beware of using yyerror for other purposes, as the cursor position might
 884  * be misleading!
 885  */
 886 void
 887 yyerror(const char *message)
 888 {
 889         const char *loc = scanbuf + yylloc;
 890
 891         if (*loc == YY_END_OF_BUFFER_CHAR)
 892         {
 893                 ereport(ERROR,
 894                                 (errcode(ERRCODE_SYNTAX_ERROR),
 895                                  /* translator: %s is typically the translation of "syntax error" */
 896                                  errmsg("%s at end of input", _(message)),
 897                                  lexer_errposition()));
 898         }
 899         else
 900         {
 901                 ereport(ERROR,
 902                                 (errcode(ERRCODE_SYNTAX_ERROR),
 903                                  /* translator: first %s is typically the translation of "syntax error" */
 904                                  errmsg("%s at or near \"%s\"", _(message), loc),
 905                                  lexer_errposition()));
 906         }
 907 }
 908
 909
 910 /*
 911  * Called before any actual parsing is done
 912  */
 913 void
 914 scanner_init(const char *str)
 915 {
 916         Size    slen = strlen(str);
 917
 918         /*
 919          * Might be left over after ereport()
 920          */
 921         if (YY_CURRENT_BUFFER)
 922                 yy_delete_buffer(YY_CURRENT_BUFFER);
 923
 924         /*
 925          * Make a scan buffer with special termination needed by flex.
 926          */
 927         scanbuf = palloc(slen + 2);
 928         memcpy(scanbuf, str, slen);
 929         scanbuf[slen] = scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
 930         scanbufhandle = yy_scan_buffer(scanbuf, slen + 2);
 931
 932         /* initialize literal buffer to a reasonable but expansible size */
 933         literalalloc = 1024;
 934         literalbuf = (char *) palloc(literalalloc);
 935         startlit();
 936
 937         BEGIN(INITIAL);
 938 }
 939
 940
 941 /*
 942  * Called after parsing is done to clean up after scanner_init()
 943  */
 944 void
 945 scanner_finish(void)
 946 {
 947         yy_delete_buffer(scanbufhandle);
 948         pfree(scanbuf);
 949         scanbuf = NULL;
 950 }
 951
 952
 953 static void
 954 addlit(char *ytext, int yleng)
 955 {
 956         /* enlarge buffer if needed */
 957         if ((literallen+yleng) >= literalalloc)
 958         {
 959                 do {
 960                         literalalloc *= 2;
 961                 } while ((literallen+yleng) >= literalalloc);
 962                 literalbuf = (char *) repalloc(literalbuf, literalalloc);
 963         }
 964         /* append new data, add trailing null */
 965         memcpy(literalbuf+literallen, ytext, yleng);
 966         literallen += yleng;
 967         literalbuf[literallen] = '\0';
 968 }
 969
 970
 971 static void
 972 addlitchar(unsigned char ychar)
 973 {
 974         /* enlarge buffer if needed */
 975         if ((literallen+1) >= literalalloc)
 976         {
 977                 literalalloc *= 2;
 978                 literalbuf = (char *) repalloc(literalbuf, literalalloc);
 979         }
 980         /* append new data, add trailing null */
 981         literalbuf[literallen] = ychar;
 982         literallen += 1;
 983         literalbuf[literallen] = '\0';
 984 }
 985
 986
 987 /*
 988  * One might be tempted to write pstrdup(literalbuf) instead of this,
 989  * but for long literals this is much faster because the length is
 990  * already known.
 991  */
 992 static char *
 993 litbufdup(void)
 994 {
 995         char *new;
 996
 997         new = palloc(literallen + 1);
 998         memcpy(new, literalbuf, literallen+1);
 999         return new;
1000 }
1001
1002 static int
1003 hexval(unsigned char c)
1004 {
1005         if (c >= '0' && c <= '9')
1006                 return c - '0';
1007         if (c >= 'a' && c <= 'f')
1008                 return c - 'a' + 0xA;
1009         if (c >= 'A' && c <= 'F')
1010                 return c - 'A' + 0xA;
1011         elog(ERROR, "invalid hexadecimal digit");
1012         return 0; /* not reached */
1013 }
1014
1015 static void
1016 check_unicode_value(pg_wchar c, char * loc)
1017 {
1018         if (GetDatabaseEncoding() == PG_UTF8)
1019                 return;
1020
1021         if (c > 0x7F)
1022         {
1023                 yylloc += (char *) loc - literalbuf + 3;   /* 3 for U&" */
1024                 yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
1025         }
1026 }
1027
1028 static char *
1029 litbuf_udeescape(unsigned char escape)
1030 {
1031         char *new;
1032         char *in, *out;
1033
1034         if (isxdigit(escape)
1035                 || escape == '+'
1036                 || escape == '\''
1037                 || escape == '"'
1038                 || scanner_isspace(escape))
1039         {
1040                 yylloc += literallen + yyleng + 1;
1041                 yyerror("invalid Unicode escape character");
1042         }
1043
1044         /*
1045          * This relies on the subtle assumption that a UTF-8 expansion
1046          * cannot be longer than its escaped representation.
1047          */
1048         new = palloc(literallen + 1);
1049
1050         in = literalbuf;
1051         out = new;
1052         while (*in)
1053         {
1054                 if (in[0] == escape)
1055                 {
1056                         if (in[1] == escape)
1057                         {
1058                                 *out++ = escape;
1059                                 in += 2;
1060                         }
1061                         else if (isxdigit(in[1]) && isxdigit(in[2]) && isxdigit(in[3]) && isxdigit(in[4]))
1062                         {
1063                                 pg_wchar unicode = hexval(in[1]) * 16*16*16 + hexval(in[2]) * 16*16 + hexval(in[3]) * 16 + hexval(in[4]);
1064                                 check_unicode_value(unicode, in);
1065                                 unicode_to_utf8(unicode, (unsigned char *) out);
1066                                 in += 5;
1067                                 out += pg_mblen(out);
1068                         }
1069                         else if (in[1] == '+'
1070                                          && isxdigit(in[2]) && isxdigit(in[3])
1071                                          && isxdigit(in[4]) && isxdigit(in[5])
1072                                          && isxdigit(in[6]) && isxdigit(in[7]))
1073                         {
1074                                 pg_wchar unicode = hexval(in[2]) * 16*16*16*16*16 + hexval(in[3]) * 16*16*16*16 + hexval(in[4]) * 16*16*16
1075                                                                         + hexval(in[5]) * 16*16 + hexval(in[6]) * 16 + hexval(in[7]);
1076                                 check_unicode_value(unicode, in);
1077                                 unicode_to_utf8(unicode, (unsigned char *) out);
1078                                 in += 8;
1079                                 out += pg_mblen(out);
1080                         }
1081                         else
1082                         {
1083                                 yylloc += in - literalbuf + 3;   /* 3 for U&" */
1084                                 yyerror("invalid Unicode escape value");
1085                         }
1086                 }
1087                 else
1088                         *out++ = *in++;
1089         }
1090
1091         *out = '\0';
1092         /*
1093          * We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII
1094          * codes; but it's probably not worth the trouble, since this isn't
1095          * likely to be a performance-critical path.
1096          */
1097         pg_verifymbstr(new, out - new, false);
1098         return new;
1099 }
1100
1101 static unsigned char
1102 unescape_single_char(unsigned char c)
1103 {
1104         switch (c)
1105         {
1106                 case 'b':
1107                         return '\b';
1108                 case 'f':
1109                         return '\f';
1110                 case 'n':
1111                         return '\n';
1112                 case 'r':
1113                         return '\r';
1114                 case 't':
1115                         return '\t';
1116                 default:
1117                         /* check for backslash followed by non-7-bit-ASCII */
1118                         if (c == '\0' || IS_HIGHBIT_SET(c))
1119                                 saw_non_ascii = true;
1120
1121                         return c;
1122         }
1123 }
1124
1125 static void
1126 check_string_escape_warning(unsigned char ychar)
1127 {
1128         if (ychar == '\'')
1129         {
1130                 if (warn_on_first_escape && escape_string_warning)
1131                         ereport(WARNING,
1132                                         (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1133                                          errmsg("nonstandard use of \\' in a string literal"),
1134                                          errhint("Use '' to write quotes in strings, or use the escape string syntax (E'...')."),
1135                                          lexer_errposition()));
1136                 warn_on_first_escape = false;   /* warn only once per string */
1137         }
1138         else if (ychar == '\\')
1139         {
1140                 if (warn_on_first_escape && escape_string_warning)
1141                         ereport(WARNING,
1142                                         (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1143                                          errmsg("nonstandard use of \\\\ in a string literal"),
1144                                          errhint("Use the escape string syntax for backslashes, e.g., E'\\\\'."),
1145                                          lexer_errposition()));
1146                 warn_on_first_escape = false;   /* warn only once per string */
1147         }
1148         else
1149                 check_escape_warning();
1150 }
1151
1152 static void
1153 check_escape_warning(void)
1154 {
1155         if (warn_on_first_escape && escape_string_warning)
1156                 ereport(WARNING,
1157                                 (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1158                                  errmsg("nonstandard use of escape in a string literal"),
1159                                  errhint("Use the escape string syntax for escapes, e.g., E'\\r\\n'."),
1160                                  lexer_errposition()));
1161         warn_on_first_escape = false;   /* warn only once per string */
1162 }