granicus.if.org Git - postgresql/blob - src/backend/parser/scan.l

   1 %top{
   2 /*-------------------------------------------------------------------------
   3  *
   4  * scan.l
   5  *        lexical scanner for PostgreSQL
   6  *
   7  * NOTE NOTE NOTE:
   8  *
   9  * The rules in this file must be kept in sync with src/fe_utils/psqlscan.l!
  10  *
  11  * The rules are designed so that the scanner never has to backtrack,
  12  * in the sense that there is always a rule that can match the input
  13  * consumed so far (the rule action may internally throw back some input
  14  * with yyless(), however).  As explained in the flex manual, this makes
  15  * for a useful speed increase --- about a third faster than a plain -CF
  16  * lexer, in simple testing.  The extra complexity is mostly in the rules
  17  * for handling float numbers and continued string literals.  If you change
  18  * the lexical rules, verify that you haven't broken the no-backtrack
  19  * property by running flex with the "-b" option and checking that the
  20  * resulting "lex.backup" file says that no backing up is needed.  (As of
  21  * Postgres 9.2, this check is made automatically by the Makefile.)
  22  *
  23  *
  24  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
  25  * Portions Copyright (c) 1994, Regents of the University of California
  26  *
  27  * IDENTIFICATION
  28  *        src/backend/parser/scan.l
  29  *
  30  *-------------------------------------------------------------------------
  31  */
  32 #include "postgres.h"
  33
  34 #include <ctype.h>
  35 #include <unistd.h>
  36
  37 #include "parser/gramparse.h"
  38 #include "parser/parser.h"              /* only needed for GUC variables */
  39 #include "parser/scansup.h"
  40 #include "mb/pg_wchar.h"
  41 }
  42
  43 %{
  44
  45 /* LCOV_EXCL_START */
  46
  47 /* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error): fprintf(file, fmt, msg) calls after this point are rerouted to fprintf_to_ereport(), which raises an ERROR via ereport() */
  48 #undef fprintf
  49 #define fprintf(file, fmt, msg)  fprintf_to_ereport(fmt, msg)  /* the stream argument is dropped */
  50
  51 static void
  52 fprintf_to_ereport(const char *fmt, const char *msg)
  53 {
  54         ereport(ERROR, (errmsg_internal("%s", msg)));  /* fmt is unused; msg is reported verbatim */
  55 }
  56
  57 /*
  58  * GUC variables.  This is a DIRECT violation of the warning given at the
  59  * head of gram.y, ie flex/bison code must not depend on any GUC variables;
  60  * as such, changing their values can induce very unintuitive behavior.
  61  * But we shall have to live with it until we can remove these variables.
  62  */
  63 int                     backslash_quote = BACKSLASH_QUOTE_SAFE_ENCODING;
  64 bool            escape_string_warning = true;
  65 bool            standard_conforming_strings = true;
  66
  67 /*
  68  * Set the type of YYSTYPE.
  69  */
  70 #define YYSTYPE core_YYSTYPE
  71
  72 /*
  73  * Set the type of yyextra.  All state variables used by the scanner should
  74  * be in yyextra, *not* statically allocated.
  75  */
  76 #define YY_EXTRA_TYPE core_yy_extra_type *
  77
  78 /*
  79  * Each call to yylex must set yylloc to the location of the found token
  80  * (expressed as a byte offset from the start of the input text).
  81  * When we parse a token that requires multiple lexer rules to process,
  82  * this should be done in the first such rule, else yylloc will point
  83  * into the middle of the token.
  84  */
  85 #define SET_YYLLOC()  (*(yylloc) = yytext - yyextra->scanbuf)
  86
  87 /*
  88  * Advance yylloc by the given number of bytes.
  89  */
  90 #define ADVANCE_YYLLOC(delta)  ( *(yylloc) += (delta) )
  91
  92 #define startlit()      ( yyextra->literallen = 0 )  /* start a new literal: reset the accumulated length to zero */
  93 static void addlit(char *ytext, int yleng, core_yyscan_t yyscanner);
  94 static void addlitchar(unsigned char ychar, core_yyscan_t yyscanner);
  95 static char *litbufdup(core_yyscan_t yyscanner);
  96 static char *litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner);
  97 static unsigned char unescape_single_char(unsigned char c, core_yyscan_t yyscanner);
  98 static int      process_integer_literal(const char *token, YYSTYPE *lval);
  99 static bool is_utf16_surrogate_first(pg_wchar c);
 100 static bool is_utf16_surrogate_second(pg_wchar c);
 101 static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second);
 102 static void addunicode(pg_wchar c, yyscan_t yyscanner);  /* NOTE(review): declared with yyscan_t while the sibling prototypes use core_yyscan_t -- confirm the two are interchangeable */
 103 static bool check_uescapechar(unsigned char escape);
 104
 105 #define yyerror(msg)  scanner_yyerror(msg, yyscanner)
 106
 107 #define lexer_errposition()  scanner_errposition(*(yylloc), yyscanner)
 108
 109 static void check_string_escape_warning(unsigned char ychar, core_yyscan_t yyscanner);
 110 static void check_escape_warning(core_yyscan_t yyscanner);
 111
 112 /*
 113  * Work around a bug in flex 2.5.35: it emits a couple of functions that
 114  * it forgets to emit declarations for.  Since we use -Wmissing-prototypes,
 115  * this would cause warnings.  Providing our own declarations should be
 116  * harmless even when the bug gets fixed.
 117  */
 118 extern int      core_yyget_column(yyscan_t yyscanner);
 119 extern void core_yyset_column(int column_no, yyscan_t yyscanner);
 120
 121 %}
 122
 123 %option reentrant
 124 %option bison-bridge
 125 %option bison-locations
 126 %option 8bit
 127 %option never-interactive
 128 %option nodefault
 129 %option noinput
 130 %option nounput
 131 %option noyywrap
 132 %option noyyalloc
 133 %option noyyrealloc
 134 %option noyyfree
 135 %option warn
 136 %option prefix="core_yy"
 137
 138 /*
 139  * OK, here is a short description of lex/flex rules behavior.
 140  * The longest pattern which matches an input string is always chosen.
 141  * For equal-length patterns, the first occurring in the rules list is chosen.
 142  * INITIAL is the starting state, to which all non-conditional rules apply.
 143  * Exclusive states change parsing rules while the state is active.  When in
 144  * an exclusive state, only those rules defined for that state apply.
 145  *
 146  * We use exclusive states for quoted strings, extended comments,
 147  * and to eliminate parsing troubles for numeric strings.
 148  * Exclusive states:
 149  *  <xb> bit string literal
 150  *  <xc> extended C-style comments
 151  *  <xd> delimited identifiers (double-quoted identifiers)
 152  *  <xh> hexadecimal numeric string
 153  *  <xq> standard quoted strings
 154  *  <xe> extended quoted strings (support backslash escape sequences)
 155  *  <xdolq> $foo$ quoted strings
 156  *  <xui> quoted identifier with Unicode escapes
 157  *  <xuiend> end of a quoted identifier with Unicode escapes, UESCAPE can follow
 158  *  <xus> quoted string with Unicode escapes
 159  *  <xusend> end of a quoted string with Unicode escapes, UESCAPE can follow
 160  *  <xeu> Unicode surrogate pair in extended quoted string
 161  *
 162  * Remember to add an <<EOF>> case whenever you add a new exclusive state!
 163  * The default one is probably not the right thing.
 164  */
 165
 166 %x xb
 167 %x xc
 168 %x xd
 169 %x xh
 170 %x xe
 171 %x xq
 172 %x xdolq
 173 %x xui
 174 %x xuiend
 175 %x xus
 176 %x xusend
 177 %x xeu
 178
 179 /*
 180  * In order to make the world safe for Windows and Mac clients as well as
 181  * Unix ones, we accept either \n or \r as a newline.  A DOS-style \r\n
 182  * sequence will be seen as two successive newlines, but that doesn't cause
 183  * any problems.  Comments that start with -- and extend to the next
 184  * newline are treated as equivalent to a single whitespace character.
 185  *
 186  * NOTE a fine point: if there is no newline following --, we will absorb
 187  * everything to the end of the input as a comment.  This is correct.  Older
 188  * versions of Postgres failed to recognize -- as a comment if the input
 189  * did not end with a newline.
 190  *
 191  * XXX perhaps \f (formfeed) should be treated as a newline as well?
 192  *
 193  * XXX if you change the set of whitespace characters, fix scanner_isspace()
 194  * to agree, and see also the plpgsql lexer.
 195  */
 196
 197 space                   [ \t\n\r\f]
 198 horiz_space             [ \t\f]
 199 newline                 [\n\r]
 200 non_newline             [^\n\r]
 201
 202 comment                 ("--"{non_newline}*)
 203
 204 whitespace              ({space}+|{comment})
 205
 206 /*
 207  * SQL requires at least one newline in the whitespace separating
 208  * string literals that are to be concatenated.  Silly, but who are we
 209  * to argue?  Note that {whitespace_with_newline} should not have * after
 210  * it, whereas {whitespace} should generally have a * after it...
 211  */
 212
 213 special_whitespace              ({space}+|{comment}{newline})
 214 horiz_whitespace                ({horiz_space}|{comment})
 215 whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*)
 216
 217 /*
 218  * To ensure that {quotecontinue} can be scanned without having to back up
 219  * if the full pattern isn't matched, we include trailing whitespace in
 220  * {quotestop}.  This matches all cases where {quotecontinue} fails to match,
 221  * except for {quote} followed by whitespace and just one "-" (not two,
 222  * which would start a {comment}).  To cover that we have {quotefail}.
 223  * The actions for {quotestop} and {quotefail} must throw back characters
 224  * beyond the quote proper.
 225  */
 226 quote                   '
 227 quotestop               {quote}{whitespace}*
 228 quotecontinue   {quote}{whitespace_with_newline}{quote}
 229 quotefail               {quote}{whitespace}*"-"
 230
 231 /* Bit string
 232  * It is tempting to scan the string for only those characters
 233  * which are allowed. However, this leads to silently swallowed
 234  * characters if illegal characters are included in the string.
 235  * For example, if xbinside is [01] then B'ABCD' is interpreted
 236  * as a zero-length string, and the ABCD' is lost!
 237  * Better to pass the string forward and let the input routines
 238  * validate the contents.
 239  */
 240 xbstart                 [bB]{quote}
 241 xbinside                [^']*
 242
 243 /* Hexadecimal number */
 244 xhstart                 [xX]{quote}
 245 xhinside                [^']*
 246
 247 /* National character */
 248 xnstart                 [nN]{quote}
 249
 250 /* Quoted string that allows backslash escapes */
 251 xestart                 [eE]{quote}
 252 xeinside                [^\\']+
 253 xeescape                [\\][^0-7]
 254 xeoctesc                [\\][0-7]{1,3}
 255 xehexesc                [\\]x[0-9A-Fa-f]{1,2}
 256 xeunicode               [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
 257 xeunicodefail   [\\](u[0-9A-Fa-f]{0,3}|U[0-9A-Fa-f]{0,7})
 258
 259 /* Extended quote (an ordinary '...' string; xqstart enters <xq> or <xe>)
 260  * xqdouble implements an embedded quote: '' inside a string yields one ', as in ''''
 261  */
 262 xqstart                 {quote}
 263 xqdouble                {quote}{quote}
 264 xqinside                [^']+
 265
 266 /* $foo$ style quotes ("dollar quoting")
 267  * The quoted string starts with $foo$ where "foo" is an optional string
 268  * in the form of an identifier, except that it may not contain "$",
 269  * and extends to the first occurrence of an identical string.
 270  * There is *no* processing of the quoted text.
 271  *
 272  * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim}
 273  * fails to match its trailing "$".
 274  */
 275 dolq_start              [A-Za-z\200-\377_]
 276 dolq_cont               [A-Za-z\200-\377_0-9]
 277 dolqdelim               \$({dolq_start}{dolq_cont}*)?\$
 278 dolqfailed              \${dolq_start}{dolq_cont}*
 279 dolqinside              [^$]+
 280
 281 /* Double quote
 282  * Allows embedded spaces and other special characters into identifiers.
 283  */
 284 dquote                  \"
 285 xdstart                 {dquote}
 286 xdstop                  {dquote}
 287 xddouble                {dquote}{dquote}
 288 xdinside                [^"]+
 289
 290 /* Unicode escapes */
 291 uescape                 [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
 292 /* error rule to avoid backup */
 293 uescapefail             [uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU]
 294
 295 /* Quoted identifier with Unicode escapes */
 296 xuistart                [uU]&{dquote}
 297
 298 /* Quoted string with Unicode escapes */
 299 xusstart                [uU]&{quote}
 300
 301 /* Optional UESCAPE after a quoted string or identifier with Unicode escapes: xustop1 matches an incomplete UESCAPE clause (or a prefix of the keyword), xustop2 a complete one. */
 302 xustop1         {uescapefail}?
 303 xustop2         {uescape}
 304
 305 /* error rule to avoid backup */
 306 xufailed                [uU]&
 307
 308
 309 /* C-style comments
 310  *
 311  * The "extended comment" syntax closely resembles allowable operator syntax.
 312  * The tricky part here is to get lex to recognize a string starting with
 313  * slash-star as a comment, when interpreting it as an operator would produce
 314  * a longer match --- remember lex will prefer a longer match!  Also, if we
 315  * have something like plus-slash-star, lex will think this is a 3-character
 316  * operator whereas we want to see it as a + operator and a comment start.
 317  * The solution is two-fold:
 318  * 1. append {op_chars}* to xcstart so that it matches as much text as
 319  *    {operator} would. Then the tie-breaker (first matching rule of same
 320  *    length) ensures xcstart wins.  We put back the extra stuff with yyless()
 321  *    in case it contains a star-slash that should terminate the comment.
 322  * 2. In the operator rule, check for slash-star within the operator, and
 323  *    if found throw it back with yyless().  This handles the plus-slash-star
 324  *    problem.
 325  * Dash-dash comments have similar interactions with the operator rule.
 326  */
 327 xcstart                 \/\*{op_chars}*
 328 xcstop                  \*+\/
 329 xcinside                [^*/]+
 330
 331 digit                   [0-9]
 332 ident_start             [A-Za-z\200-\377_]
 333 ident_cont              [A-Za-z\200-\377_0-9\$]
 334
 335 identifier              {ident_start}{ident_cont}*
 336
 337 /* Assorted special-case operators and operator-like tokens */
 338 typecast                "::"
 339 dot_dot                 \.\.
 340 colon_equals    ":="
 341 equals_greater  "=>"
 342 less_equals             "<="
 343 greater_equals  ">="
 344 less_greater    "<>"
 345 not_equals              "!="
 346
 347 /*
 348  * "self" is the set of chars that should be returned as single-character
 349  * tokens.  "op_chars" is the set of chars that can make up "Op" tokens,
 350  * which can be one or more characters long (but if a single-char token
 351  * appears in the "self" set, it is not to be returned as an Op).  Note
 352  * that the sets overlap, but each has some chars that are not in the other.
 353  *
 354  * If you change either set, adjust the character lists appearing in the
 355  * rule for "operator"!
 356  */
 357 self                    [,()\[\].;\:\+\-\*\/\%\^\<\>\=]
 358 op_chars                [\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=]
 359 operator                {op_chars}+
 360
 361 /* We no longer allow unary minus in numbers.
 362  * Instead we pass it separately to the parser, where it gets
 363  * coerced via doNegate() -- Leon aug 20 1999
 364  *
 365  * {decimalfail} is used because we would like "1..10" to lex as 1, dot_dot, 10.
 366  *
 367  * {realfail1} and {realfail2} are added to prevent the need for scanner
 368  * backup when the {real} rule fails to match completely.
 369  */
 370
 371 integer                 {digit}+
 372 decimal                 (({digit}*\.{digit}+)|({digit}+\.{digit}*))
 373 decimalfail             {digit}+\.\.
 374 real                    ({integer}|{decimal})[Ee][-+]?{digit}+
 375 realfail1               ({integer}|{decimal})[Ee]
 376 realfail2               ({integer}|{decimal})[Ee][-+]
 377
 378 param                   \${integer}
 379
 380 other                   .
 381
 382 /*
 383  * Dollar quoted strings are totally opaque, and no escaping is done on them.
 384  * Other quoted strings must allow some special characters such as single-quote
 385  *  and newline.
 386  * Embedded single-quotes are implemented both in the SQL standard
 387  *  style of two adjacent single quotes "''" and in the Postgres/Java style
 388  *  of escaped-quote "\'".
 389  * Other embedded escaped characters are matched explicitly and the leading
 390  *  backslash is dropped from the string.
 391  * Note that xcstart must appear before operator, as explained above!
 392  *  Also whitespace (comment) must appear before operator.
 393  */
 394
 395 %%
 396
 397 {whitespace}    {
 398                                         /* ignore: whitespace and "--" comments produce no token */
 399                                 }
 400
 401 {xcstart}               {
 402                                         /* Set location in case of syntax error in comment */
 403                                         SET_YYLLOC();
 404                                         yyextra->xcdepth = 0;  /* outermost comment: no nested comments open yet */
 405                                         BEGIN(xc);
 406                                         /* Put back any characters past slash-star; see above */
 407                                         yyless(2);
 408                                 }
 409
 410 <xc>{xcstart}   {
 411                                         (yyextra->xcdepth)++;  /* a nested comment opened: one level deeper */
 412                                         /* Put back any characters past slash-star; see above */
 413                                         yyless(2);
 414                                 }
 415
 416 <xc>{xcstop}    {
 417                                         if (yyextra->xcdepth <= 0)
 418                                                 BEGIN(INITIAL);  /* depth 0: the outermost comment just ended */
 419                                         else
 420                                                 (yyextra->xcdepth)--;  /* a nested comment ended; still inside the outer one */
 421                                 }
 422
 423 <xc>{xcinside}  {
 424                                         /* ignore: comment text containing neither a star nor a slash */
 425                                 }
 426
 427 <xc>{op_chars}  {
 428                                         /* ignore: a lone operator character inside the comment */
 429                                 }
 430
 431 <xc>\*+                 {
 432                                         /* ignore: stars not followed by a slash (otherwise xcstop, being longer, would match) */
 433                                 }
 434
 435 <xc><<EOF>>             { yyerror("unterminated /* comment"); }
 436
 437 {xbstart}               {
 438                                         /* Binary bit type.
 439                                          * At some point we should simply pass the string
 440                                          * forward to the parser and label it there.
 441                                          * In the meantime, place a leading "b" on the string
 442                                          * to mark it for the input routine as a binary string.
 443                                          */
 444                                         SET_YYLLOC();
 445                                         BEGIN(xb);
 446                                         startlit();
 447                                         addlitchar('b', yyscanner);
 448                                 }
 449 <xb>{quotestop} |
 450 <xb>{quotefail} {
 451                                         yyless(1);  /* keep only the closing quote; anything matched after it is rescanned */
 452                                         BEGIN(INITIAL);
 453                                         yylval->str = litbufdup(yyscanner);
 454                                         return BCONST;
 455                                 }
 456 <xh>{xhinside}  |
 457 <xb>{xbinside}  {
 458                                         addlit(yytext, yyleng, yyscanner);  /* shared by <xh> and <xb>: hand the matched text to addlit() */
 459                                 }
 460 <xh>{quotecontinue}     |
 461 <xb>{quotecontinue}     {
 462                                         /* ignore: quote, whitespace containing a newline, quote -- the literal continues in the same state */
 463                                 }
 464 <xb><<EOF>>             { yyerror("unterminated bit string literal"); }
 465
 466 {xhstart}               {
 467                                         /* Hexadecimal bit type.
 468                                          * At some point we should simply pass the string
 469                                          * forward to the parser and label it there.
 470                                          * In the meantime, place a leading "x" on the string
 471                                          * to mark it for the input routine as a hex string.
 472                                          */
 473                                         SET_YYLLOC();
 474                                         BEGIN(xh);
 475                                         startlit();
 476                                         addlitchar('x', yyscanner);
 477                                 }
 478 <xh>{quotestop} |
 479 <xh>{quotefail} {
 480                                         yyless(1);  /* keep only the closing quote; anything matched after it is rescanned */
 481                                         BEGIN(INITIAL);
 482                                         yylval->str = litbufdup(yyscanner);
 483                                         return XCONST;
 484                                 }
 485 <xh><<EOF>>             { yyerror("unterminated hexadecimal string literal"); }
 486
 487 {xnstart}               {
 488                                         /* National character.
 489                                          * We will pass this along as a normal character string,
 490                                          * but preceded with an internally-generated "NCHAR".
 491                                          */
 492                                         const ScanKeyword *keyword;
 493
 494                                         SET_YYLLOC();
 495                                         yyless(1);      /* eat only 'n' this time; the quote is left in the input to be rescanned */
 496
 497                                         keyword = ScanKeywordLookup("nchar",
 498                                                                                                 yyextra->keywords,
 499                                                                                                 yyextra->num_keywords);
 500                                         if (keyword != NULL)
 501                                         {
 502                                                 yylval->keyword = keyword->name;
 503                                                 return keyword->value;  /* token code comes from the keyword table entry */
 504                                         }
 505                                         else
 506                                         {
 507                                                 /* If NCHAR isn't a keyword, just return "n" as an identifier */
 508                                                 yylval->str = pstrdup("n");
 509                                                 return IDENT;
 510                                         }
 511                                 }
 512
 513 {xqstart}               {
 514                                         yyextra->warn_on_first_escape = true;  /* plain '...' string (the E'...' rule below sets this to false) */
 515                                         yyextra->saw_non_ascii = false;  /* set by the octal/hex escape rules; checked at the closing quote */
 516                                         SET_YYLLOC();
 517                                         if (yyextra->standard_conforming_strings)
 518                                                 BEGIN(xq);  /* backslash is an ordinary character in this state */
 519                                         else
 520                                                 BEGIN(xe);  /* backslash escapes are recognized in this state */
 521                                         startlit();
 522                                 }
 523 {xestart}               {
 524                                         yyextra->warn_on_first_escape = false;  /* explicit E'...' string */
 525                                         yyextra->saw_non_ascii = false;
 526                                         SET_YYLLOC();
 527                                         BEGIN(xe);
 528                                         startlit();
 529                                 }
 530 {xusstart}              {
 531                                         SET_YYLLOC();
 532                                         if (!yyextra->standard_conforming_strings)  /* U&'...' is rejected unless standard_conforming_strings is on */
 533                                                 ereport(ERROR,
 534                                                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 535                                                                  errmsg("unsafe use of string constant with Unicode escapes"),
 536                                                                  errdetail("String constants with Unicode escapes cannot be used when standard_conforming_strings is off."),
 537                                                                  lexer_errposition()));
 538                                         BEGIN(xus);
 539                                         startlit();
 540                                 }
 541 <xq,xe>{quotestop}      |
 542 <xq,xe>{quotefail} {
 543                                         yyless(1);  /* keep only the closing quote; anything matched after it is rescanned */
 544                                         BEGIN(INITIAL);
 545                                         /*
 546                                          * Check that the data remains valid if it might have been
 547                                          * made invalid by unescaping any chars.
 548                                          */
 549                                         if (yyextra->saw_non_ascii)
 550                                                 pg_verifymbstr(yyextra->literalbuf,
 551                                                                            yyextra->literallen,
 552                                                                            false);
 553                                         yylval->str = litbufdup(yyscanner);
 554                                         return SCONST;
 555                                 }
 556 <xus>{quotestop} |
 557 <xus>{quotefail} {
 558                                         /* throw back all but the quote */
 559                                         yyless(1);
 560                                         /* xusend state looks for possible UESCAPE */
 561                                         BEGIN(xusend);
 562                                 }
 563 <xusend>{whitespace} {
 564                                         /* stay in xusend state over whitespace */
 565                                 }
 566 <xusend><<EOF>> |
 567 <xusend>{other} |
 568 <xusend>{xustop1} {
 569                                         /* no UESCAPE after the quote, throw back everything */
 570                                         yyless(0);
 571                                         BEGIN(INITIAL);
 572                                         yylval->str = litbuf_udeescape('\\', yyscanner);  /* backslash is the escape character when no UESCAPE was given */
 573                                         return SCONST;
 574                                 }
 575 <xusend>{xustop2} {
 576                                         /* found UESCAPE after the end quote */
 577                                         BEGIN(INITIAL);
 578                                         if (!check_uescapechar(yytext[yyleng - 2]))  /* yytext[yyleng - 2] is the single character between UESCAPE's quotes */
 579                                         {
 580                                                 SET_YYLLOC();
 581                                                 ADVANCE_YYLLOC(yyleng - 2);  /* move the error position onto the escape character itself */
 582                                                 yyerror("invalid Unicode escape character");
 583                                         }
 584                                         yylval->str = litbuf_udeescape(yytext[yyleng - 2],
 585                                                                                                    yyscanner);
 586                                         return SCONST;
 587                                 }
 588 <xq,xe,xus>{xqdouble} {
 589                                         addlitchar('\'', yyscanner);  /* two adjacent quotes inside a string: pass a single quote to addlitchar() */
 590                                 }
 591 <xq,xus>{xqinside}  {
 592                                         addlit(yytext, yyleng, yyscanner);  /* run of characters other than the quote */
 593                                 }
 594 <xe>{xeinside}  {
 595                                         addlit(yytext, yyleng, yyscanner);  /* run of characters other than the quote and the backslash */
 596                                 }
 597 <xe>{xeunicode} {
 598                                         pg_wchar        c = strtoul(yytext + 2, NULL, 16);  /* hex digits start after the backslash and the u/U */
 599
 600                                         check_escape_warning(yyscanner);
 601
 602                                         if (is_utf16_surrogate_first(c))
 603                                         {
 604                                                 yyextra->utf16_first_part = c;  /* hold the first half until its partner arrives */
 605                                                 BEGIN(xeu);  /* in this state only another Unicode escape is accepted */
 606                                         }
 607                                         else if (is_utf16_surrogate_second(c))  /* second half with no first half before it */
 608                                                 yyerror("invalid Unicode surrogate pair");
 609                                         else
 610                                                 addunicode(c, yyscanner);
 611                                 }
 612 <xeu>{xeunicode} {
 613                                         pg_wchar        c = strtoul(yytext + 2, NULL, 16);  /* hex digits start after the backslash and the u/U */
 614
 615                                         if (!is_utf16_surrogate_second(c))
 616                                                 yyerror("invalid Unicode surrogate pair");
 617
 618                                         c = surrogate_pair_to_codepoint(yyextra->utf16_first_part, c);  /* combine with the first half saved by the <xe> rule */
 619
 620                                         addunicode(c, yyscanner);
 621
 622                                         BEGIN(xe);  /* pair consumed; back to ordinary escape-string scanning */
 623                                 }
 624 <xeu>.                  { yyerror("invalid Unicode surrogate pair"); }
 625 <xeu>\n                 { yyerror("invalid Unicode surrogate pair"); }
 626 <xeu><<EOF>>    { yyerror("invalid Unicode surrogate pair"); }
 627 <xe,xeu>{xeunicodefail} {
 628                                         ereport(ERROR,  /* too few hex digits followed the u/U */
 629                                                         (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
 630                                                          errmsg("invalid Unicode escape"),
 631                                                          errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."),
 632                                                          lexer_errposition()));
 633                                 }
 634 <xe>{xeescape}  {
 635                                         if (yytext[1] == '\'')  /* backslash-quote: rejected when the backslash_quote setting forbids it */
 636                                         {
 637                                                 if (yyextra->backslash_quote == BACKSLASH_QUOTE_OFF ||
 638                                                         (yyextra->backslash_quote == BACKSLASH_QUOTE_SAFE_ENCODING &&
 639                                                          PG_ENCODING_IS_CLIENT_ONLY(pg_get_client_encoding())))
 640                                                         ereport(ERROR,
 641                                                                         (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
 642                                                                          errmsg("unsafe use of \\' in a string literal"),
 643                                                                          errhint("Use '' to write quotes in strings. \\' is insecure in client-only encodings."),
 644                                                                          lexer_errposition()));
 645                                         }
 646                                         check_string_escape_warning(yytext[1], yyscanner);  /* yytext[1] is the character following the backslash */
 647                                         addlitchar(unescape_single_char(yytext[1], yyscanner),
 648                                                            yyscanner);
 649                                 }
 650 <xe>{xeoctesc}  {
 651                                         unsigned char c = strtoul(yytext + 1, NULL, 8);  /* 1-3 octal digits after the backslash, narrowed to one byte */
 652
 653                                         check_escape_warning(yyscanner);
 654                                         addlitchar(c, yyscanner);
 655                                         if (c == '\0' || IS_HIGHBIT_SET(c))
 656                                                 yyextra->saw_non_ascii = true;  /* makes the closing-quote rule run pg_verifymbstr() on the literal */
 657                                 }
 658 <xe>{xehexesc}  {
 659                                         unsigned char c = strtoul(yytext + 2, NULL, 16);  /* 1-2 hex digits after the backslash and the x */
 660
 661                                         check_escape_warning(yyscanner);
 662                                         addlitchar(c, yyscanner);
 663                                         if (c == '\0' || IS_HIGHBIT_SET(c))
 664                                                 yyextra->saw_non_ascii = true;  /* makes the closing-quote rule run pg_verifymbstr() on the literal */
 665                                 }
 666 <xq,xe,xus>{quotecontinue} {
 667                                         /* ignore */
                                         /* nothing is added to the literal buffer for this match */
 668                                 }
 669 <xe>.                   {
 670                                         /* This is only needed for \ just before EOF */
                                         /* the single matched character is copied through unchanged */
 671                                         addlitchar(yytext[0], yyscanner);
 672                                 }
 673 <xq,xe,xus><<EOF>>              { yyerror("unterminated quoted string"); }
 674
 675 {dolqdelim}             {
                                         /*
                                          * Opening delimiter of a dollar-quoted string.  A copy of
                                          * the delimiter text is kept in dolqstart so that the
                                          * closing delimiter can be compared against it.
                                          */
 676                                         SET_YYLLOC();
 677                                         yyextra->dolqstart = pstrdup(yytext);
 678                                         BEGIN(xdolq);
 679                                         startlit();
 680                                 }
 681 {dolqfailed}    {
 682                                         SET_YYLLOC();
 683                                         /* throw back all but the initial "$" */
 684                                         yyless(1);
 685                                         /* and treat it as {other} */
 686                                         return yytext[0];
 687                                 }
 688 <xdolq>{dolqdelim} {
                                         /* only a delimiter identical to the opening one ends the string */
 689                                         if (strcmp(yytext, yyextra->dolqstart) == 0)
 690                                         {
 691                                                 pfree(yyextra->dolqstart);
 692                                                 yyextra->dolqstart = NULL;
 693                                                 BEGIN(INITIAL);
 694                                                 yylval->str = litbufdup(yyscanner);
 695                                                 return SCONST;
 696                                         }
 697                                         else
 698                                         {
 699                                                 /*
 700                                                  * When we fail to match $...$ to dolqstart, transfer
 701                                                  * the $... part to the output, but put back the final
 702                                                  * $ for rescanning.  Consider $delim$...$junk$delim$
 703                                                  */
 704                                                 addlit(yytext, yyleng - 1, yyscanner);
 705                                                 yyless(yyleng - 1);
 706                                         }
 707                                 }
 708 <xdolq>{dolqinside} {
                                         /* ordinary body text is appended to the literal verbatim */
 709                                         addlit(yytext, yyleng, yyscanner);
 710                                 }
 711 <xdolq>{dolqfailed} {
                                         /* not a delimiter after all, so it is kept as literal text */
 712                                         addlit(yytext, yyleng, yyscanner);
 713                                 }
 714 <xdolq>.                {
 715                                         /* This is only needed for $ inside the quoted text */
 716                                         addlitchar(yytext[0], yyscanner);
 717                                 }
 718 <xdolq><<EOF>>  { yyerror("unterminated dollar-quoted string"); }
 719
 720 {xdstart}               {
                                         /* start of a delimited identifier: collect its text in the literal buffer */
 721                                         SET_YYLLOC();
 722                                         BEGIN(xd);
 723                                         startlit();
 724                                 }
 725 {xuistart}              {
                                         /* start of a delimited identifier that is later run through litbuf_udeescape() */
 726                                         SET_YYLLOC();
 727                                         BEGIN(xui);
 728                                         startlit();
 729                                 }
 730 <xd>{xdstop}    {
 731                                         char       *ident;
 732
 733                                         BEGIN(INITIAL);
 734                                         if (yyextra->literallen == 0)
 735                                                 yyerror("zero-length delimited identifier");
 736                                         ident = litbufdup(yyscanner);
                                         /* names of NAMEDATALEN bytes or more go through truncate_identifier() */
 737                                         if (yyextra->literallen >= NAMEDATALEN)
 738                                                 truncate_identifier(ident, yyextra->literallen, true);
 739                                         yylval->str = ident;
 740                                         return IDENT;
 741                                 }
 742 <xui>{dquote} {
                                         /* keep only the first character of the match consumed */
 743                                         yyless(1);
 744                                         /* xuiend state looks for possible UESCAPE */
 745                                         BEGIN(xuiend);
 746                                 }
 747 <xuiend>{whitespace} {
 748                                         /* stay in xuiend state over whitespace */
 749                                 }
 750 <xuiend><<EOF>> |
 751 <xuiend>{other} |
 752 <xuiend>{xustop1} {
 753                                         /* no UESCAPE after the quote, throw back everything */
 754                                         char       *ident;
 755                                         int                     identlen;
 756
 757                                         yyless(0);
 758
 759                                         BEGIN(INITIAL);
 760                                         if (yyextra->literallen == 0)
 761                                                 yyerror("zero-length delimited identifier");
                                         /* backslash is the escape character when none was specified */
 762                                         ident = litbuf_udeescape('\\', yyscanner);
                                         /* length is re-measured on the string litbuf_udeescape() returned */
 763                                         identlen = strlen(ident);
 764                                         if (identlen >= NAMEDATALEN)
 765                                                 truncate_identifier(ident, identlen, true);
 766                                         yylval->str = ident;
 767                                         return IDENT;
 768                                 }
 769 <xuiend>{xustop2}       {
 770                                         /* found UESCAPE after the end quote */
 771                                         char       *ident;
 772                                         int                     identlen;
 773
 774                                         BEGIN(INITIAL);
 775                                         if (yyextra->literallen == 0)
 776                                                 yyerror("zero-length delimited identifier");
                                         /* the next-to-last character of the match is the escape character */
 777                                         if (!check_uescapechar(yytext[yyleng - 2]))
 778                                         {
                                                 /* point the error cursor at the offending escape character */
 779                                                 SET_YYLLOC();
 780                                                 ADVANCE_YYLLOC(yyleng - 2);
 781                                                 yyerror("invalid Unicode escape character");
 782                                         }
 783                                         ident = litbuf_udeescape(yytext[yyleng - 2], yyscanner);
 784                                         identlen = strlen(ident);
 785                                         if (identlen >= NAMEDATALEN)
 786                                                 truncate_identifier(ident, identlen, true);
 787                                         yylval->str = ident;
 788                                         return IDENT;
 789                                 }
 790 <xd,xui>{xddouble}      {
                                         /* this match contributes exactly one double-quote character */
 791                                         addlitchar('"', yyscanner);
 792                                 }
 793 <xd,xui>{xdinside}      {
                                         /* ordinary identifier text is appended verbatim */
 794                                         addlit(yytext, yyleng, yyscanner);
 795                                 }
 796 <xd,xui><<EOF>>         { yyerror("unterminated quoted identifier"); }
 797
 798 {xufailed}      {
 799                                         char       *ident;
 800
 801                                         SET_YYLLOC();
 802                                         /* throw back all but the initial u/U */
 803                                         yyless(1);
 804                                         /* and treat it as {identifier} */
                                         /* after yyless(1) the token text is just that one letter */
 805                                         ident = downcase_truncate_identifier(yytext, yyleng, true);
 806                                         yylval->str = ident;
 807                                         return IDENT;
 808                                 }
 809
 810 {typecast}              {
                                         /* fixed token: record its location and return the token code */
 811                                         SET_YYLLOC();
 812                                         return TYPECAST;
 813                                 }
 814
 815 {dot_dot}               {
 816                                         SET_YYLLOC();
 817                                         return DOT_DOT;
 818                                 }
 819
 820 {colon_equals}  {
 821                                         SET_YYLLOC();
 822                                         return COLON_EQUALS;
 823                                 }
 824
 825 {equals_greater} {
 826                                         SET_YYLLOC();
 827                                         return EQUALS_GREATER;
 828                                 }
 829
 830 {less_equals}   {
 831                                         SET_YYLLOC();
 832                                         return LESS_EQUALS;
 833                                 }
 834
 835 {greater_equals} {
 836                                         SET_YYLLOC();
 837                                         return GREATER_EQUALS;
 838                                 }
 839
 840 {less_greater}  {
 841                                         /* We accept both "<>" and "!=" as meaning NOT_EQUALS */
 842                                         SET_YYLLOC();
 843                                         return NOT_EQUALS;
 844                                 }
 845
 846 {not_equals}    {
 847                                         /* We accept both "<>" and "!=" as meaning NOT_EQUALS */
 848                                         SET_YYLLOC();
 849                                         return NOT_EQUALS;
 850                                 }
 851
 852 {self}                  {
                                         /* single-character token: the character itself is the token code */
 853                                         SET_YYLLOC();
 854                                         return yytext[0];
 855                                 }
 856
 857 {operator}              {
 858                                         /*
 859                                          * Check for embedded slash-star or dash-dash; those
 860                                          * are comment starts, so operator must stop there.
 861                                          * Note that slash-star or dash-dash at the first
 862                                          * character will match a prior rule, not this one.
 863                                          */
 864                                         int                     nchars = yyleng;
 865                                         char       *slashstar = strstr(yytext, "/*");
 866                                         char       *dashdash = strstr(yytext, "--");
 867
                                         /*
                                          * From here on, slashstar is reused to hold the earliest
                                          * comment start of either kind (or NULL if there is none).
                                          */
 868                                         if (slashstar && dashdash)
 869                                         {
 870                                                 /* if both appear, take the first one */
 871                                                 if (slashstar > dashdash)
 872                                                         slashstar = dashdash;
 873                                         }
 874                                         else if (!slashstar)
 875                                                 slashstar = dashdash;
 876                                         if (slashstar)
 877                                                 nchars = slashstar - yytext;
 878
 879                                         /*
 880                                          * For SQL compatibility, '+' and '-' cannot be the
 881                                          * last char of a multi-char operator unless the operator
 882                                          * contains chars that are not in SQL operators.
 883                                          * The idea is to lex '=-' as two operators, but not
 884                                          * to forbid operator names like '?-' that could not be
 885                                          * sequences of SQL operators.
 886                                          */
 887                                         while (nchars > 1 &&
 888                                                    (yytext[nchars - 1] == '+' ||
 889                                                         yytext[nchars - 1] == '-'))
 890                                         {
 891                                                 int                     ic;
 892
                                                 /* scan backwards over the characters before the trailing sign */
 893                                                 for (ic = nchars - 2; ic >= 0; ic--)
 894                                                 {
 895                                                         if (strchr("~!@#^&|`?%", yytext[ic]))
 896                                                                 break;
 897                                                 }
 898                                                 if (ic >= 0)
 899                                                         break; /* found a char that makes it OK */
 900                                                 nchars--; /* else remove the +/-, and check again */
 901                                         }
 902
 903                                         SET_YYLLOC();
 904
 905                                         if (nchars < yyleng)
 906                                         {
 907                                                 /* Strip the unwanted chars from the token */
 908                                                 yyless(nchars);
 909                                                 /*
 910                                                  * If what we have left is only one char, and it's
 911                                                  * one of the characters matching "self", then
 912                                                  * return it as a character token the same way
 913                                                  * that the "self" rule would have.
 914                                                  */
 915                                                 if (nchars == 1 &&
 916                                                         strchr(",()[].;:+-*/%^<>=", yytext[0]))
 917                                                         return yytext[0];
 918                                         }
 919
 920                                         /*
 921                                          * Complain if operator is too long.  Unlike the case
 922                                          * for identifiers, we make this an error not a notice-
 923                                          * and-truncate, because the odds are we are looking at
 924                                          * a syntactic mistake anyway.
 925                                          */
 926                                         if (nchars >= NAMEDATALEN)
 927                                                 yyerror("operator too long");
 928
                                         /* the (possibly shortened) operator text is handed over as a fresh copy */
 929                                         yylval->str = pstrdup(yytext);
 930                                         return Op;
 931                                 }
 932
 933 {param}                 {
 934                                         SET_YYLLOC();
 935                                         yylval->ival = atol(yytext + 1);
 936                                         return PARAM;
 937                                 }
 938
 939 {integer}               {
                                         /* process_integer_literal() picks ICONST or FCONST for the value */
 940                                         SET_YYLLOC();
 941                                         return process_integer_literal(yytext, yylval);
 942                                 }
 943 {decimal}               {
                                         /* the token text is passed along as a string, not converted here */
 944                                         SET_YYLLOC();
 945                                         yylval->str = pstrdup(yytext);
 946                                         return FCONST;
 947                                 }
 948 {decimalfail}   {
 949                                         /* throw back the .., and treat as integer */
 950                                         yyless(yyleng - 2);
 951                                         SET_YYLLOC();
 952                                         return process_integer_literal(yytext, yylval);
 953                                 }
 954 {real}                  {
                                         /* the token text is passed along as a string, not converted here */
 955                                         SET_YYLLOC();
 956                                         yylval->str = pstrdup(yytext);
 957                                         return FCONST;
 958                                 }
 959 {realfail1}             {
 960                                         /*
 961                                          * throw back the [Ee], and treat as {decimal}.  Note
 962                                          * that it is possible the input is actually {integer},
 963                                          * but since this case will almost certainly lead to a
 964                                          * syntax error anyway, we don't bother to distinguish.
 965                                          */
 966                                         yyless(yyleng - 1);
 967                                         SET_YYLLOC();
 968                                         yylval->str = pstrdup(yytext);
 969                                         return FCONST;
 970                                 }
 971 {realfail2}             {
 972                                         /* throw back the [Ee][+-], and proceed as above */
 973                                         yyless(yyleng - 2);
 974                                         SET_YYLLOC();
 975                                         yylval->str = pstrdup(yytext);
 976                                         return FCONST;
 977                                 }
 978
 979
 980 {identifier}    {
 981                                         const ScanKeyword *keyword;
 982                                         char       *ident;
 983
 984                                         SET_YYLLOC();
 985
 986                                         /* Is it a keyword? */
 987                                         keyword = ScanKeywordLookup(yytext,
 988                                                                                                 yyextra->keywords,
 989                                                                                                 yyextra->num_keywords);
 990                                         if (keyword != NULL)
 991                                         {
                                                 /* both the name and the token code come from the keyword table entry */
 992                                                 yylval->keyword = keyword->name;
 993                                                 return keyword->value;
 994                                         }
 995
 996                                         /*
 997                                          * No.  Convert the identifier to lower case, and truncate
 998                                          * if necessary.
 999                                          */
1000                                         ident = downcase_truncate_identifier(yytext, yyleng, true);
1001                                         yylval->str = ident;
1002                                         return IDENT;
1003                                 }
1004
1005 {other}                 {
                                         /* the matched character itself is returned as the token code */
1006                                         SET_YYLLOC();
1007                                         return yytext[0];
1008                                 }
1009
1010 <<EOF>>                 {
                                         /* record where the input ended before terminating the scan */
1011                                         SET_YYLLOC();
1012                                         yyterminate();
1013                                 }
1014
1015 %%
1016
1017 /* LCOV_EXCL_STOP */
1018
1019 /*
1020  * Arrange access to yyextra for subroutines of the main yylex() function.
1021  * We expect each subroutine to have a yyscanner parameter.  Rather than
1022  * use the yyget_xxx functions, which might or might not get inlined by the
1023  * compiler, we cheat just a bit and cast yyscanner to the right type.
      *
      * Because the replacement text mentions "yyscanner" literally, these
      * macros can only be used where a variable of that name is in scope.
1024  */
1025 #undef yyextra
1026 #define yyextra  (((struct yyguts_t *) yyscanner)->yyextra_r)
1027
1028 /* Likewise for a couple of other things we need. */
1029 #undef yylloc
1030 #define yylloc  (((struct yyguts_t *) yyscanner)->yylloc_r)
1031 #undef yyleng
1032 #define yyleng  (((struct yyguts_t *) yyscanner)->yyleng_r)
1033
1034
1035 /*
1036  * scanner_errposition
1037  *              Report a lexer or grammar error cursor position, if possible.
1038  *
1039  * This is expected to be used within an ereport() call.  The return value
1040  * is a dummy (always 0, in fact).
1041  *
1042  * Note that this can only be used for messages emitted during raw parsing
1043  * (essentially, scan.l and gram.y), since it requires the yyscanner struct
1044  * to still be available.
1045  */
1046 int
1047 scanner_errposition(int location, core_yyscan_t yyscanner)
1048 {
1049         int                     pos;
1050
1051         if (location < 0)
1052                 return 0;                               /* no-op if location is unknown */
1053
1054         /* Convert byte offset to character number */
1055         pos = pg_mbstrlen_with_len(yyextra->scanbuf, location) + 1;
1056         /* And pass it to the ereport mechanism */
1057         return errposition(pos);
1058 }
1059
1060 /*
1061  * scanner_yyerror
1062  *              Report a lexer or grammar error.
1063  *
1064  * The message's cursor position is whatever YYLLOC was last set to,
1065  * ie, the start of the current token if called within yylex(), or the
1066  * most recently lexed token if called from the grammar.
1067  * This is OK for syntax error messages from the Bison parser, because Bison
1068  * parsers report error as soon as the first unparsable token is reached.
1069  * Beware of using yyerror for other purposes, as the cursor position might
1070  * be misleading!
1071  */
1072 void
1073 scanner_yyerror(const char *message, core_yyscan_t yyscanner)
1074 {
1075         const char *loc = yyextra->scanbuf + *yylloc;
1076
1077         if (*loc == YY_END_OF_BUFFER_CHAR)
1078         {
1079                 ereport(ERROR,
1080                                 (errcode(ERRCODE_SYNTAX_ERROR),
1081                 /* translator: %s is typically the translation of "syntax error" */
1082                                  errmsg("%s at end of input", _(message)),
1083                                  lexer_errposition()));
1084         }
1085         else
1086         {
1087                 ereport(ERROR,
1088                                 (errcode(ERRCODE_SYNTAX_ERROR),
1089                 /* translator: first %s is typically the translation of "syntax error" */
1090                                  errmsg("%s at or near \"%s\"", _(message), loc),
1091                                  lexer_errposition()));
1092         }
1093 }
1094
1095
1096 /*
1097  * Called before any actual parsing is done
1098  */
1099 core_yyscan_t
1100 scanner_init(const char *str,
1101                          core_yy_extra_type *yyext,
1102                          const ScanKeyword *keywords,
1103                          int num_keywords)
1104 {
1105         Size            slen = strlen(str);
1106         yyscan_t        scanner;
1107
1108         if (yylex_init(&scanner) != 0)
1109                 elog(ERROR, "yylex_init() failed: %m");
1110
1111         core_yyset_extra(yyext, scanner);
1112
1113         yyext->keywords = keywords;
1114         yyext->num_keywords = num_keywords;
1115
1116         yyext->backslash_quote = backslash_quote;
1117         yyext->escape_string_warning = escape_string_warning;
1118         yyext->standard_conforming_strings = standard_conforming_strings;
1119
1120         /*
1121          * Make a scan buffer with special termination needed by flex.
1122          */
1123         yyext->scanbuf = (char *) palloc(slen + 2);
1124         yyext->scanbuflen = slen;
1125         memcpy(yyext->scanbuf, str, slen);
1126         yyext->scanbuf[slen] = yyext->scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
1127         yy_scan_buffer(yyext->scanbuf, slen + 2, scanner);
1128
1129         /* initialize literal buffer to a reasonable but expansible size */
1130         yyext->literalalloc = 1024;
1131         yyext->literalbuf = (char *) palloc(yyext->literalalloc);
1132         yyext->literallen = 0;
1133
1134         return scanner;
1135 }
1136
1137
1138 /*
1139  * Called after parsing is done to clean up after scanner_init()
1140  */
1141 void
1142 scanner_finish(core_yyscan_t yyscanner)
1143 {
1144         /*
1145          * We don't bother to call yylex_destroy(), because all it would do is
1146          * pfree a small amount of control storage.  It's cheaper to leak the
1147          * storage until the parsing context is destroyed.  The amount of space
1148          * involved is usually negligible compared to the output parse tree
1149          * anyway.
1150          *
1151          * We do bother to pfree the scanbuf and literal buffer, but only if they
1152          * represent a nontrivial amount of space.  The 8K cutoff is arbitrary.
1153          */
1154         if (yyextra->scanbuflen >= 8192)
1155                 pfree(yyextra->scanbuf);
1156         if (yyextra->literalalloc >= 8192)
1157                 pfree(yyextra->literalbuf);
1158 }
1159
1160
1161 static void
1162 addlit(char *ytext, int yleng, core_yyscan_t yyscanner)
1163 {
1164         /* enlarge buffer if needed */
1165         if ((yyextra->literallen + yleng) >= yyextra->literalalloc)
1166         {
1167                 do
1168                 {
1169                         yyextra->literalalloc *= 2;
1170                 } while ((yyextra->literallen + yleng) >= yyextra->literalalloc);
1171                 yyextra->literalbuf = (char *) repalloc(yyextra->literalbuf,
1172                                                                                                 yyextra->literalalloc);
1173         }
1174         /* append new data */
1175         memcpy(yyextra->literalbuf + yyextra->literallen, ytext, yleng);
1176         yyextra->literallen += yleng;
1177 }
1178
1179
1180 static void
1181 addlitchar(unsigned char ychar, core_yyscan_t yyscanner)
1182 {
1183         /* enlarge buffer if needed */
1184         if ((yyextra->literallen + 1) >= yyextra->literalalloc)
1185         {
1186                 yyextra->literalalloc *= 2;
1187                 yyextra->literalbuf = (char *) repalloc(yyextra->literalbuf,
1188                                                                                                 yyextra->literalalloc);
1189         }
1190         /* append new data */
1191         yyextra->literalbuf[yyextra->literallen] = ychar;
1192         yyextra->literallen += 1;
1193 }
1194
1195
1196 /*
1197  * Create a palloc'd copy of literalbuf, adding a trailing null.
1198  */
1199 static char *
1200 litbufdup(core_yyscan_t yyscanner)
1201 {
1202         int                     llen = yyextra->literallen;
1203         char       *new;
1204
1205         new = palloc(llen + 1);
1206         memcpy(new, yyextra->literalbuf, llen);
1207         new[llen] = '\0';
1208         return new;
1209 }
1210
1211 static int
1212 process_integer_literal(const char *token, YYSTYPE *lval)
1213 {
1214         long            val;
1215         char       *endptr;
1216
1217         errno = 0;
1218         val = strtol(token, &endptr, 10);
1219         if (*endptr != '\0' || errno == ERANGE ||
1220                 /* check for overflow of int */
1221                 val != (int) val)
1222         {
1223                 /* integer too large, treat it as a float */
1224                 lval->str = pstrdup(token);
1225                 return FCONST;
1226         }
1227         lval->ival = val;
1228         return ICONST;
1229 }
1230
1231 static unsigned int
1232 hexval(unsigned char c)
1233 {
1234         if (c >= '0' && c <= '9')
1235                 return c - '0';
1236         if (c >= 'a' && c <= 'f')
1237                 return c - 'a' + 0xA;
1238         if (c >= 'A' && c <= 'F')
1239                 return c - 'A' + 0xA;
1240         elog(ERROR, "invalid hexadecimal digit");
1241         return 0;                                       /* not reached */
1242 }
1243
1244 static void
1245 check_unicode_value(pg_wchar c, char *loc, core_yyscan_t yyscanner)
1246 {
1247         if (GetDatabaseEncoding() == PG_UTF8)
1248                 return;
1249
1250         if (c > 0x7F)
1251         {
1252                 ADVANCE_YYLLOC(loc - yyextra->literalbuf + 3);  /* 3 for U&" */
1253                 yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
1254         }
1255 }
1256
1257 static bool
1258 is_utf16_surrogate_first(pg_wchar c)
1259 {
1260         return (c >= 0xD800 && c <= 0xDBFF);
1261 }
1262
1263 static bool
1264 is_utf16_surrogate_second(pg_wchar c)
1265 {
1266         return (c >= 0xDC00 && c <= 0xDFFF);
1267 }
1268
1269 static pg_wchar
1270 surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
1271 {
1272         return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF);
1273 }
1274
1275 static void
1276 addunicode(pg_wchar c, core_yyscan_t yyscanner)
1277 {
1278         char            buf[8];
1279
1280         if (c == 0 || c > 0x10FFFF)
1281                 yyerror("invalid Unicode escape value");
1282         if (c > 0x7F)
1283         {
1284                 if (GetDatabaseEncoding() != PG_UTF8)
1285                         yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
1286                 yyextra->saw_non_ascii = true;
1287         }
1288         unicode_to_utf8(c, (unsigned char *) buf);
1289         addlit(buf, pg_mblen(buf), yyscanner);
1290 }
1291
/*
 * check_uescapechar
 *              Is 'escape' acceptable as a Unicode escape character (UESCAPE
 *              syntax)?
 *
 * Hexadecimal digits, '+', single and double quotes, and anything that
 * scanner_isspace() accepts are refused; every other byte is allowed.
 */
static bool
check_uescapechar(unsigned char escape)
{
        bool            forbidden;

        forbidden = isxdigit(escape) ||
                escape == '+' ||
                escape == '\'' ||
                escape == '"' ||
                scanner_isspace(escape);

        return !forbidden;
}
1307
1308 /* like litbufdup, but handle unicode escapes */
1309 static char *
1310 litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner)
1311 {
1312         char       *new;
1313         char       *litbuf,
1314                            *in,
1315                            *out;
1316         pg_wchar        pair_first = 0;
1317
1318         /* Make literalbuf null-terminated to simplify the scanning loop */
1319         litbuf = yyextra->literalbuf;
1320         litbuf[yyextra->literallen] = '\0';
1321
1322         /*
1323          * This relies on the subtle assumption that a UTF-8 expansion cannot be
1324          * longer than its escaped representation.
1325          */
1326         new = palloc(yyextra->literallen + 1);
1327
1328         in = litbuf;
1329         out = new;
1330         while (*in)
1331         {
1332                 if (in[0] == escape)
1333                 {
1334                         if (in[1] == escape)
1335                         {
1336                                 if (pair_first)
1337                                 {
1338                                         ADVANCE_YYLLOC(in - litbuf + 3);        /* 3 for U&" */
1339                                         yyerror("invalid Unicode surrogate pair");
1340                                 }
1341                                 *out++ = escape;
1342                                 in += 2;
1343                         }
1344                         else if (isxdigit((unsigned char) in[1]) &&
1345                                          isxdigit((unsigned char) in[2]) &&
1346                                          isxdigit((unsigned char) in[3]) &&
1347                                          isxdigit((unsigned char) in[4]))
1348                         {
1349                                 pg_wchar        unicode;
1350
1351                                 unicode = (hexval(in[1]) << 12) +
1352                                         (hexval(in[2]) << 8) +
1353                                         (hexval(in[3]) << 4) +
1354                                         hexval(in[4]);
1355                                 check_unicode_value(unicode, in, yyscanner);
1356                                 if (pair_first)
1357                                 {
1358                                         if (is_utf16_surrogate_second(unicode))
1359                                         {
1360                                                 unicode = surrogate_pair_to_codepoint(pair_first, unicode);
1361                                                 pair_first = 0;
1362                                         }
1363                                         else
1364                                         {
1365                                                 ADVANCE_YYLLOC(in - litbuf + 3);                /* 3 for U&" */
1366                                                 yyerror("invalid Unicode surrogate pair");
1367                                         }
1368                                 }
1369                                 else if (is_utf16_surrogate_second(unicode))
1370                                         yyerror("invalid Unicode surrogate pair");
1371
1372                                 if (is_utf16_surrogate_first(unicode))
1373                                         pair_first = unicode;
1374                                 else
1375                                 {
1376                                         unicode_to_utf8(unicode, (unsigned char *) out);
1377                                         out += pg_mblen(out);
1378                                 }
1379                                 in += 5;
1380                         }
1381                         else if (in[1] == '+' &&
1382                                          isxdigit((unsigned char) in[2]) &&
1383                                          isxdigit((unsigned char) in[3]) &&
1384                                          isxdigit((unsigned char) in[4]) &&
1385                                          isxdigit((unsigned char) in[5]) &&
1386                                          isxdigit((unsigned char) in[6]) &&
1387                                          isxdigit((unsigned char) in[7]))
1388                         {
1389                                 pg_wchar        unicode;
1390
1391                                 unicode = (hexval(in[2]) << 20) +
1392                                         (hexval(in[3]) << 16) +
1393                                         (hexval(in[4]) << 12) +
1394                                         (hexval(in[5]) << 8) +
1395                                         (hexval(in[6]) << 4) +
1396                                         hexval(in[7]);
1397                                 check_unicode_value(unicode, in, yyscanner);
1398                                 if (pair_first)
1399                                 {
1400                                         if (is_utf16_surrogate_second(unicode))
1401                                         {
1402                                                 unicode = surrogate_pair_to_codepoint(pair_first, unicode);
1403                                                 pair_first = 0;
1404                                         }
1405                                         else
1406                                         {
1407                                                 ADVANCE_YYLLOC(in - litbuf + 3);                /* 3 for U&" */
1408                                                 yyerror("invalid Unicode surrogate pair");
1409                                         }
1410                                 }
1411                                 else if (is_utf16_surrogate_second(unicode))
1412                                         yyerror("invalid Unicode surrogate pair");
1413
1414                                 if (is_utf16_surrogate_first(unicode))
1415                                         pair_first = unicode;
1416                                 else
1417                                 {
1418                                         unicode_to_utf8(unicode, (unsigned char *) out);
1419                                         out += pg_mblen(out);
1420                                 }
1421                                 in += 8;
1422                         }
1423                         else
1424                         {
1425                                 ADVANCE_YYLLOC(in - litbuf + 3);                /* 3 for U&" */
1426                                 yyerror("invalid Unicode escape value");
1427                         }
1428                 }
1429                 else
1430                 {
1431                         if (pair_first)
1432                         {
1433                                 ADVANCE_YYLLOC(in - litbuf + 3);                /* 3 for U&" */
1434                                 yyerror("invalid Unicode surrogate pair");
1435                         }
1436                         *out++ = *in++;
1437                 }
1438         }
1439
1440         /* unfinished surrogate pair? */
1441         if (pair_first)
1442         {
1443                 ADVANCE_YYLLOC(in - litbuf + 3);                                /* 3 for U&" */
1444                 yyerror("invalid Unicode surrogate pair");
1445         }
1446
1447         *out = '\0';
1448
1449         /*
1450          * We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII
1451          * codes; but it's probably not worth the trouble, since this isn't likely
1452          * to be a performance-critical path.
1453          */
1454         pg_verifymbstr(new, out - new, false);
1455         return new;
1456 }
1457
1458 static unsigned char
1459 unescape_single_char(unsigned char c, core_yyscan_t yyscanner)
1460 {
1461         switch (c)
1462         {
1463                 case 'b':
1464                         return '\b';
1465                 case 'f':
1466                         return '\f';
1467                 case 'n':
1468                         return '\n';
1469                 case 'r':
1470                         return '\r';
1471                 case 't':
1472                         return '\t';
1473                 default:
1474                         /* check for backslash followed by non-7-bit-ASCII */
1475                         if (c == '\0' || IS_HIGHBIT_SET(c))
1476                                 yyextra->saw_non_ascii = true;
1477
1478                         return c;
1479         }
1480 }
1481
1482 static void
1483 check_string_escape_warning(unsigned char ychar, core_yyscan_t yyscanner)
1484 {
1485         if (ychar == '\'')
1486         {
1487                 if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
1488                         ereport(WARNING,
1489                                         (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1490                                          errmsg("nonstandard use of \\' in a string literal"),
1491                                          errhint("Use '' to write quotes in strings, or use the escape string syntax (E'...')."),
1492                                          lexer_errposition()));
1493                 yyextra->warn_on_first_escape = false;  /* warn only once per string */
1494         }
1495         else if (ychar == '\\')
1496         {
1497                 if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
1498                         ereport(WARNING,
1499                                         (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1500                                          errmsg("nonstandard use of \\\\ in a string literal"),
1501                                          errhint("Use the escape string syntax for backslashes, e.g., E'\\\\'."),
1502                                          lexer_errposition()));
1503                 yyextra->warn_on_first_escape = false;  /* warn only once per string */
1504         }
1505         else
1506                 check_escape_warning(yyscanner);
1507 }
1508
1509 static void
1510 check_escape_warning(core_yyscan_t yyscanner)
1511 {
1512         if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
1513                 ereport(WARNING,
1514                                 (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1515                                  errmsg("nonstandard use of escape in a string literal"),
1516                 errhint("Use the escape string syntax for escapes, e.g., E'\\r\\n'."),
1517                                  lexer_errposition()));
1518         yyextra->warn_on_first_escape = false;          /* warn only once per string */
1519 }
1520
1521 /*
1522  * Interface functions to make flex use palloc() instead of malloc().
1523  * It'd be better to make these static, but flex insists otherwise.
1524  */
1525
1526 void *
1527 core_yyalloc(yy_size_t bytes, core_yyscan_t yyscanner)
1528 {
1529         return palloc(bytes);
1530 }
1531
1532 void *
1533 core_yyrealloc(void *ptr, yy_size_t bytes, core_yyscan_t yyscanner)
1534 {
1535         if (ptr)
1536                 return repalloc(ptr, bytes);
1537         else
1538                 return palloc(bytes);
1539 }
1540
1541 void
1542 core_yyfree(void *ptr, core_yyscan_t yyscanner)
1543 {
1544         if (ptr)
1545                 pfree(ptr);
1546 }