2 /*-------------------------------------------------------------------------
5 * lexical scanner for PostgreSQL
9 * The rules in this file must be kept in sync with src/fe_utils/psqlscan.l!
11 * The rules are designed so that the scanner never has to backtrack,
12 * in the sense that there is always a rule that can match the input
13 * consumed so far (the rule action may internally throw back some input
14 * with yyless(), however). As explained in the flex manual, this makes
15 * for a useful speed increase --- about a third faster than a plain -CF
16 * lexer, in simple testing. The extra complexity is mostly in the rules
17 * for handling float numbers and continued string literals. If you change
18 * the lexical rules, verify that you haven't broken the no-backtrack
19 * property by running flex with the "-b" option and checking that the
20 * resulting "lex.backup" file says that no backing up is needed. (As of
21 * Postgres 9.2, this check is made automatically by the Makefile.)
24 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
25 * Portions Copyright (c) 1994, Regents of the University of California
28 * src/backend/parser/scan.l
30 *-------------------------------------------------------------------------
37 #include "parser/gramparse.h"
38 #include "parser/parser.h" /* only needed for GUC variables */
39 #include "parser/scansup.h"
40 #include "mb/pg_wchar.h"
47 /* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
49 #define fprintf(file, fmt, msg) fprintf_to_ereport(fmt, msg)
52 fprintf_to_ereport(const char *fmt, const char *msg)
54 ereport(ERROR, (errmsg_internal("%s", msg)));
58 * GUC variables. This is a DIRECT violation of the warning given at the
59 * head of gram.y, ie flex/bison code must not depend on any GUC variables;
60 * as such, changing their values can induce very unintuitive behavior.
61 * But we shall have to live with it until we can remove these variables.
63 int backslash_quote = BACKSLASH_QUOTE_SAFE_ENCODING;
64 bool escape_string_warning = true;
65 bool standard_conforming_strings = true;
68 * Set the type of YYSTYPE.
70 #define YYSTYPE core_YYSTYPE
73 * Set the type of yyextra. All state variables used by the scanner should
74 * be in yyextra, *not* statically allocated.
76 #define YY_EXTRA_TYPE core_yy_extra_type *
79 * Each call to yylex must set yylloc to the location of the found token
80 * (expressed as a byte offset from the start of the input text).
81 * When we parse a token that requires multiple lexer rules to process,
82 * this should be done in the first such rule, else yylloc will point
83 * into the middle of the token.
85 #define SET_YYLLOC() (*(yylloc) = yytext - yyextra->scanbuf)
88 * Advance yylloc by the given number of bytes.
90 #define ADVANCE_YYLLOC(delta) ( *(yylloc) += (delta) )
92 #define startlit() ( yyextra->literallen = 0 )
93 static void addlit(char *ytext, int yleng, core_yyscan_t yyscanner);
94 static void addlitchar(unsigned char ychar, core_yyscan_t yyscanner);
95 static char *litbufdup(core_yyscan_t yyscanner);
96 static char *litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner);
97 static unsigned char unescape_single_char(unsigned char c, core_yyscan_t yyscanner);
98 static int process_integer_literal(const char *token, YYSTYPE *lval);
99 static bool is_utf16_surrogate_first(pg_wchar c);
100 static bool is_utf16_surrogate_second(pg_wchar c);
101 static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second);
102 static void addunicode(pg_wchar c, yyscan_t yyscanner);
103 static bool check_uescapechar(unsigned char escape);
105 #define yyerror(msg) scanner_yyerror(msg, yyscanner)
107 #define lexer_errposition() scanner_errposition(*(yylloc), yyscanner)
109 static void check_string_escape_warning(unsigned char ychar, core_yyscan_t yyscanner);
110 static void check_escape_warning(core_yyscan_t yyscanner);
113 * Work around a bug in flex 2.5.35: it emits a couple of functions that
114 * it forgets to emit declarations for. Since we use -Wmissing-prototypes,
115 * this would cause warnings. Providing our own declarations should be
116 * harmless even when the bug gets fixed.
118 extern int core_yyget_column(yyscan_t yyscanner);
119 extern void core_yyset_column(int column_no, yyscan_t yyscanner);
125 %option bison-locations
127 %option never-interactive
136 %option prefix="core_yy"
139 * OK, here is a short description of lex/flex rules behavior.
140 * The longest pattern which matches an input string is always chosen.
141 * For equal-length patterns, the first occurring in the rules list is chosen.
142 * INITIAL is the starting state, to which all non-conditional rules apply.
143 * Exclusive states change parsing rules while the state is active. When in
144 * an exclusive state, only those rules defined for that state apply.
146 * We use exclusive states for quoted strings, extended comments,
147 * and to eliminate parsing troubles for numeric strings.
149 * <xb> bit string literal
150 * <xc> extended C-style comments
151 * <xd> delimited identifiers (double-quoted identifiers)
152 * <xh> hexadecimal numeric string
153 * <xq> standard quoted strings
154 * <xe> extended quoted strings (support backslash escape sequences)
155 * <xdolq> $foo$ quoted strings
156 * <xui> quoted identifier with Unicode escapes
157 * <xuiend> end of a quoted identifier with Unicode escapes, UESCAPE can follow
158 * <xus> quoted string with Unicode escapes
159 * <xusend> end of a quoted string with Unicode escapes, UESCAPE can follow
160 * <xeu> Unicode surrogate pair in extended quoted string
162 * Remember to add an <<EOF>> case whenever you add a new exclusive state!
163 * The default one is probably not the right thing.
180 * In order to make the world safe for Windows and Mac clients as well as
181 * Unix ones, we accept either \n or \r as a newline. A DOS-style \r\n
182 * sequence will be seen as two successive newlines, but that doesn't cause
183 * any problems. Comments that start with -- and extend to the next
184 * newline are treated as equivalent to a single whitespace character.
186 * NOTE a fine point: if there is no newline following --, we will absorb
187 * everything to the end of the input as a comment. This is correct. Older
188 * versions of Postgres failed to recognize -- as a comment if the input
189 * did not end with a newline.
191 * XXX perhaps \f (formfeed) should be treated as a newline as well?
193 * XXX if you change the set of whitespace characters, fix scanner_isspace()
194 * to agree, and see also the plpgsql lexer.
202 comment ("--"{non_newline}*)
204 whitespace ({space}+|{comment})
207 * SQL requires at least one newline in the whitespace separating
208 * string literals that are to be concatenated. Silly, but who are we
209 * to argue? Note that {whitespace_with_newline} should not have * after
210 * it, whereas {whitespace} should generally have a * after it...
213 special_whitespace ({space}+|{comment}{newline})
214 horiz_whitespace ({horiz_space}|{comment})
215 whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*)
218 * To ensure that {quotecontinue} can be scanned without having to back up
219 * if the full pattern isn't matched, we include trailing whitespace in
220 * {quotestop}. This matches all cases where {quotecontinue} fails to match,
221 * except for {quote} followed by whitespace and just one "-" (not two,
222 * which would start a {comment}). To cover that we have {quotefail}.
223 * The actions for {quotestop} and {quotefail} must throw back characters
224 * beyond the quote proper.
227 quotestop {quote}{whitespace}*
228 quotecontinue {quote}{whitespace_with_newline}{quote}
229 quotefail {quote}{whitespace}*"-"
232 * It is tempting to scan the string for only those characters
233 * which are allowed. However, this leads to silently swallowed
234 * characters if illegal characters are included in the string.
235 * For example, if xbinside is [01] then B'ABCD' is interpreted
236 * as a zero-length string, and the ABCD' is lost!
237 * Better to pass the string forward and let the input routines
238 * validate the contents.
243 /* Hexadecimal number */
247 /* National character */
250 /* Quoted string that allows backslash escapes */
254 xeoctesc [\\][0-7]{1,3}
255 xehexesc [\\]x[0-9A-Fa-f]{1,2}
256 xeunicode [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
257 xeunicodefail [\\](u[0-9A-Fa-f]{0,3}|U[0-9A-Fa-f]{0,7})
260 * xqdouble implements embedded quote, ''''
263 xqdouble {quote}{quote}
266 /* $foo$ style quotes ("dollar quoting")
267 * The quoted string starts with $foo$ where "foo" is an optional string
268 * in the form of an identifier, except that it may not contain "$",
269 * and extends to the first occurrence of an identical string.
270 * There is *no* processing of the quoted text.
272 * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim}
273 * fails to match its trailing "$".
275 dolq_start [A-Za-z\200-\377_]
276 dolq_cont [A-Za-z\200-\377_0-9]
277 dolqdelim \$({dolq_start}{dolq_cont}*)?\$
278 dolqfailed \${dolq_start}{dolq_cont}*
282 * Allows embedded spaces and other special characters into identifiers.
287 xddouble {dquote}{dquote}
290 /* Unicode escapes */
291 uescape [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
292 /* error rule to avoid backup */
293 uescapefail [uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU]
295 /* Quoted identifier with Unicode escapes */
296 xuistart [uU]&{dquote}
298 /* Quoted string with Unicode escapes */
299 xusstart [uU]&{quote}
301 /* Optional UESCAPE after a quoted string or identifier with Unicode escapes. */
302 xustop1 {uescapefail}?
305 /* error rule to avoid backup */
311 * The "extended comment" syntax closely resembles allowable operator syntax.
312 * The tricky part here is to get lex to recognize a string starting with
313 * slash-star as a comment, when interpreting it as an operator would produce
314 * a longer match --- remember lex will prefer a longer match! Also, if we
315 * have something like plus-slash-star, lex will think this is a 3-character
316 * operator whereas we want to see it as a + operator and a comment start.
317 * The solution is two-fold:
318 * 1. append {op_chars}* to xcstart so that it matches as much text as
319 * {operator} would. Then the tie-breaker (first matching rule of same
320 * length) ensures xcstart wins. We put back the extra stuff with yyless()
321 * in case it contains a star-slash that should terminate the comment.
322 * 2. In the operator rule, check for slash-star within the operator, and
323 * if found throw it back with yyless(). This handles the plus-slash-star problem.
325 * Dash-dash comments have similar interactions with the operator rule.
327 xcstart \/\*{op_chars}*
332 ident_start [A-Za-z\200-\377_]
333 ident_cont [A-Za-z\200-\377_0-9\$]
335 identifier {ident_start}{ident_cont}*
337 /* Assorted special-case operators and operator-like tokens */
348 * "self" is the set of chars that should be returned as single-character
349 * tokens. "op_chars" is the set of chars that can make up "Op" tokens,
350 * which can be one or more characters long (but if a single-char token
351 * appears in the "self" set, it is not to be returned as an Op). Note
352 * that the sets overlap, but each has some chars that are not in the other.
354 * If you change either set, adjust the character lists appearing in the
355 * rule for "operator"!
357 self [,()\[\].;\:\+\-\*\/\%\^\<\>\=]
358 op_chars [\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=]
361 /* we no longer allow unary minus in numbers.
362 * instead we pass it separately to parser. there it gets
363 * coerced via doNegate() -- Leon aug 20 1999
365 * {decimalfail} is used because we would like "1..10" to lex as 1, dot_dot, 10.
367 * {realfail1} and {realfail2} are added to prevent the need for scanner
368 * backup when the {real} rule fails to match completely.
372 decimal (({digit}*\.{digit}+)|({digit}+\.{digit}*))
373 decimalfail {digit}+\.\.
374 real ({integer}|{decimal})[Ee][-+]?{digit}+
375 realfail1 ({integer}|{decimal})[Ee]
376 realfail2 ({integer}|{decimal})[Ee][-+]
383 * Dollar quoted strings are totally opaque, and no escaping is done on them.
384 * Other quoted strings must allow some special characters such as single-quote
386 * Embedded single-quotes are implemented both in the SQL standard
387 * style of two adjacent single quotes "''" and in the Postgres/Java style
388 * of escaped-quote "\'".
389 * Other embedded escaped characters are matched explicitly and the leading
390 * backslash is dropped from the string.
391 * Note that xcstart must appear before operator, as explained above!
392 * Also whitespace (comment) must appear before operator.
402 /* Set location in case of syntax error in comment */
404 yyextra->xcdepth = 0;
406 /* Put back any characters past slash-star; see above */
411 (yyextra->xcdepth)++;
412 /* Put back any characters past slash-star; see above */
417 if (yyextra->xcdepth <= 0)
420 (yyextra->xcdepth)--;
435 <xc><<EOF>> { yyerror("unterminated /* comment"); }
439 * At some point we should simply pass the string
440 * forward to the parser and label it there.
441 * In the meantime, place a leading "b" on the string
442 * to mark it for the input routine as a binary string.
447 addlitchar('b', yyscanner);
453 yylval->str = litbufdup(yyscanner);
458 addlit(yytext, yyleng, yyscanner);
460 <xh>{quotecontinue} |
461 <xb>{quotecontinue} {
464 <xb><<EOF>> { yyerror("unterminated bit string literal"); }
467 /* Hexadecimal bit type.
468 * At some point we should simply pass the string
469 * forward to the parser and label it there.
470 * In the meantime, place a leading "x" on the string
471 * to mark it for the input routine as a hex string.
476 addlitchar('x', yyscanner);
482 yylval->str = litbufdup(yyscanner);
485 <xh><<EOF>> { yyerror("unterminated hexadecimal string literal"); }
488 /* National character.
489 * We will pass this along as a normal character string,
490 * but preceded with an internally-generated "NCHAR".
492 const ScanKeyword *keyword;
495 yyless(1); /* eat only 'n' this time */
497 keyword = ScanKeywordLookup("nchar",
499 yyextra->num_keywords);
502 yylval->keyword = keyword->name;
503 return keyword->value;
507 /* If NCHAR isn't a keyword, just return "n" */
508 yylval->str = pstrdup("n");
514 yyextra->warn_on_first_escape = true;
515 yyextra->saw_non_ascii = false;
517 if (yyextra->standard_conforming_strings)
524 yyextra->warn_on_first_escape = false;
525 yyextra->saw_non_ascii = false;
532 if (!yyextra->standard_conforming_strings)
534 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
535 errmsg("unsafe use of string constant with Unicode escapes"),
536 errdetail("String constants with Unicode escapes cannot be used when standard_conforming_strings is off."),
537 lexer_errposition()));
546 * check that the data remains valid if it might have been
547 * made invalid by unescaping any chars.
549 if (yyextra->saw_non_ascii)
550 pg_verifymbstr(yyextra->literalbuf,
553 yylval->str = litbufdup(yyscanner);
558 /* throw back all but the quote */
560 /* xusend state looks for possible UESCAPE */
563 <xusend>{whitespace} {
564 /* stay in xusend state over whitespace */
569 /* no UESCAPE after the quote, throw back everything */
572 yylval->str = litbuf_udeescape('\\', yyscanner);
576 /* found UESCAPE after the end quote */
578 if (!check_uescapechar(yytext[yyleng - 2]))
581 ADVANCE_YYLLOC(yyleng - 2);
582 yyerror("invalid Unicode escape character");
584 yylval->str = litbuf_udeescape(yytext[yyleng - 2],
588 <xq,xe,xus>{xqdouble} {
589 addlitchar('\'', yyscanner);
592 addlit(yytext, yyleng, yyscanner);
595 addlit(yytext, yyleng, yyscanner);
598 pg_wchar c = strtoul(yytext + 2, NULL, 16);
600 check_escape_warning(yyscanner);
602 if (is_utf16_surrogate_first(c))
604 yyextra->utf16_first_part = c;
607 else if (is_utf16_surrogate_second(c))
608 yyerror("invalid Unicode surrogate pair");
610 addunicode(c, yyscanner);
613 pg_wchar c = strtoul(yytext + 2, NULL, 16);
615 if (!is_utf16_surrogate_second(c))
616 yyerror("invalid Unicode surrogate pair");
618 c = surrogate_pair_to_codepoint(yyextra->utf16_first_part, c);
620 addunicode(c, yyscanner);
624 <xeu>. { yyerror("invalid Unicode surrogate pair"); }
625 <xeu>\n { yyerror("invalid Unicode surrogate pair"); }
626 <xeu><<EOF>> { yyerror("invalid Unicode surrogate pair"); }
627 <xe,xeu>{xeunicodefail} {
629 (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
630 errmsg("invalid Unicode escape"),
631 errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."),
632 lexer_errposition()));
635 if (yytext[1] == '\'')
637 if (yyextra->backslash_quote == BACKSLASH_QUOTE_OFF ||
638 (yyextra->backslash_quote == BACKSLASH_QUOTE_SAFE_ENCODING &&
639 PG_ENCODING_IS_CLIENT_ONLY(pg_get_client_encoding())))
641 (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
642 errmsg("unsafe use of \\' in a string literal"),
643 errhint("Use '' to write quotes in strings. \\' is insecure in client-only encodings."),
644 lexer_errposition()));
646 check_string_escape_warning(yytext[1], yyscanner);
647 addlitchar(unescape_single_char(yytext[1], yyscanner),
651 unsigned char c = strtoul(yytext + 1, NULL, 8);
653 check_escape_warning(yyscanner);
654 addlitchar(c, yyscanner);
655 if (c == '\0' || IS_HIGHBIT_SET(c))
656 yyextra->saw_non_ascii = true;
659 unsigned char c = strtoul(yytext + 2, NULL, 16);
661 check_escape_warning(yyscanner);
662 addlitchar(c, yyscanner);
663 if (c == '\0' || IS_HIGHBIT_SET(c))
664 yyextra->saw_non_ascii = true;
666 <xq,xe,xus>{quotecontinue} {
670 /* This is only needed for \ just before EOF */
671 addlitchar(yytext[0], yyscanner);
673 <xq,xe,xus><<EOF>> { yyerror("unterminated quoted string"); }
677 yyextra->dolqstart = pstrdup(yytext);
683 /* throw back all but the initial "$" */
685 /* and treat it as {other} */
689 if (strcmp(yytext, yyextra->dolqstart) == 0)
691 pfree(yyextra->dolqstart);
692 yyextra->dolqstart = NULL;
694 yylval->str = litbufdup(yyscanner);
700 * When we fail to match $...$ to dolqstart, transfer
701 * the $... part to the output, but put back the final
702 * $ for rescanning. Consider $delim$...$junk$delim$
704 addlit(yytext, yyleng - 1, yyscanner);
708 <xdolq>{dolqinside} {
709 addlit(yytext, yyleng, yyscanner);
711 <xdolq>{dolqfailed} {
712 addlit(yytext, yyleng, yyscanner);
715 /* This is only needed for $ inside the quoted text */
716 addlitchar(yytext[0], yyscanner);
718 <xdolq><<EOF>> { yyerror("unterminated dollar-quoted string"); }
734 if (yyextra->literallen == 0)
735 yyerror("zero-length delimited identifier");
736 ident = litbufdup(yyscanner);
737 if (yyextra->literallen >= NAMEDATALEN)
738 truncate_identifier(ident, yyextra->literallen, true);
744 /* xuiend state looks for possible UESCAPE */
747 <xuiend>{whitespace} {
748 /* stay in xuiend state over whitespace */
753 /* no UESCAPE after the quote, throw back everything */
760 if (yyextra->literallen == 0)
761 yyerror("zero-length delimited identifier");
762 ident = litbuf_udeescape('\\', yyscanner);
763 identlen = strlen(ident);
764 if (identlen >= NAMEDATALEN)
765 truncate_identifier(ident, identlen, true);
770 /* found UESCAPE after the end quote */
775 if (yyextra->literallen == 0)
776 yyerror("zero-length delimited identifier");
777 if (!check_uescapechar(yytext[yyleng - 2]))
780 ADVANCE_YYLLOC(yyleng - 2);
781 yyerror("invalid Unicode escape character");
783 ident = litbuf_udeescape(yytext[yyleng - 2], yyscanner);
784 identlen = strlen(ident);
785 if (identlen >= NAMEDATALEN)
786 truncate_identifier(ident, identlen, true);
791 addlitchar('"', yyscanner);
794 addlit(yytext, yyleng, yyscanner);
796 <xd,xui><<EOF>> { yyerror("unterminated quoted identifier"); }
802 /* throw back all but the initial u/U */
804 /* and treat it as {identifier} */
805 ident = downcase_truncate_identifier(yytext, yyleng, true);
827 return EQUALS_GREATER;
837 return GREATER_EQUALS;
841 /* We accept both "<>" and "!=" as meaning NOT_EQUALS */
847 /* We accept both "<>" and "!=" as meaning NOT_EQUALS */
859 * Check for embedded slash-star or dash-dash; those
860 * are comment starts, so operator must stop there.
861 * Note that slash-star or dash-dash at the first
862 * character will match a prior rule, not this one.
865 char *slashstar = strstr(yytext, "/*");
866 char *dashdash = strstr(yytext, "--");
868 if (slashstar && dashdash)
870 /* if both appear, take the first one */
871 if (slashstar > dashdash)
872 slashstar = dashdash;
875 slashstar = dashdash;
877 nchars = slashstar - yytext;
880 * For SQL compatibility, '+' and '-' cannot be the
881 * last char of a multi-char operator unless the operator
882 * contains chars that are not in SQL operators.
883 * The idea is to lex '=-' as two operators, but not
884 * to forbid operator names like '?-' that could not be
885 * sequences of SQL operators.
888 (yytext[nchars - 1] == '+' ||
889 yytext[nchars - 1] == '-'))
893 for (ic = nchars - 2; ic >= 0; ic--)
895 if (strchr("~!@#^&|`?%", yytext[ic]))
899 break; /* found a char that makes it OK */
900 nchars--; /* else remove the +/-, and check again */
907 /* Strip the unwanted chars from the token */
910 * If what we have left is only one char, and it's
911 * one of the characters matching "self", then
912 * return it as a character token the same way
913 * that the "self" rule would have.
916 strchr(",()[].;:+-*/%^<>=", yytext[0]))
921 * Complain if operator is too long. Unlike the case
922 * for identifiers, we make this an error not a notice-
923 * and-truncate, because the odds are we are looking at
924 * a syntactic mistake anyway.
926 if (nchars >= NAMEDATALEN)
927 yyerror("operator too long");
929 yylval->str = pstrdup(yytext);
935 yylval->ival = atol(yytext + 1);
941 return process_integer_literal(yytext, yylval);
945 yylval->str = pstrdup(yytext);
949 /* throw back the .., and treat as integer */
952 return process_integer_literal(yytext, yylval);
956 yylval->str = pstrdup(yytext);
961 * throw back the [Ee], and treat as {decimal}. Note
962 * that it is possible the input is actually {integer},
963 * but since this case will almost certainly lead to a
964 * syntax error anyway, we don't bother to distinguish.
968 yylval->str = pstrdup(yytext);
972 /* throw back the [Ee][+-], and proceed as above */
975 yylval->str = pstrdup(yytext);
981 const ScanKeyword *keyword;
986 /* Is it a keyword? */
987 keyword = ScanKeywordLookup(yytext,
989 yyextra->num_keywords);
992 yylval->keyword = keyword->name;
993 return keyword->value;
997 * No. Convert the identifier to lower case, and truncate if necessary.
1000 ident = downcase_truncate_identifier(yytext, yyleng, true);
1001 yylval->str = ident;
1017 /* LCOV_EXCL_STOP */
1020 * Arrange access to yyextra for subroutines of the main yylex() function.
1021 * We expect each subroutine to have a yyscanner parameter. Rather than
1022 * use the yyget_xxx functions, which might or might not get inlined by the
1023 * compiler, we cheat just a bit and cast yyscanner to the right type.
1026 #define yyextra (((struct yyguts_t *) yyscanner)->yyextra_r)
1028 /* Likewise for a couple of other things we need. */
1030 #define yylloc (((struct yyguts_t *) yyscanner)->yylloc_r)
1032 #define yyleng (((struct yyguts_t *) yyscanner)->yyleng_r)
1036 * scanner_errposition
1037 * Report a lexer or grammar error cursor position, if possible.
1039 * This is expected to be used within an ereport() call. The return value
1040 * is a dummy (always 0, in fact).
1042 * Note that this can only be used for messages emitted during raw parsing
1043 * (essentially, scan.l and gram.y), since it requires the yyscanner struct
1044 * to still be available.
/*
 * 'location' is a byte offset into yyextra->scanbuf.  It is converted to a
 * character count with pg_mbstrlen_with_len() over the first 'location'
 * bytes, and 1 is added before handing the value to errposition()
 * (presumably because errposition() wants 1-based character positions --
 * confirm against its definition).
 */
1047 scanner_errposition(int location, core_yyscan_t yyscanner)
1052 return 0; /* no-op if location is unknown */
1054 /* Convert byte offset to character number */
1055 pos = pg_mbstrlen_with_len(yyextra->scanbuf, location) + 1;
1056 /* And pass it to the ereport mechanism */
1057 return errposition(pos);
1062 * Report a lexer or grammar error.
1064 * The message's cursor position is whatever YYLLOC was last set to,
1065 * ie, the start of the current token if called within yylex(), or the
1066 * most recently lexed token if called from the grammar.
1067 * This is OK for syntax error messages from the Bison parser, because Bison
1068 * parsers report error as soon as the first unparsable token is reached.
1069 * Beware of using yyerror for other purposes, as the cursor position might be misleading!
/*
 * 'message' is run through _() before being formatted into the report.
 * 'loc' addresses the byte of the scan buffer that *yylloc currently points
 * at.  When that byte is the flex end-of-buffer marker the "at end of input"
 * wording is used; otherwise the input text starting at 'loc' is quoted in
 * the "at or near" message.  Both reports carry ERRCODE_SYNTAX_ERROR and
 * the cursor position from lexer_errposition().
 */
1073 scanner_yyerror(const char *message, core_yyscan_t yyscanner)
1075 const char *loc = yyextra->scanbuf + *yylloc;
/* Is the error position the end of the scan buffer? */
1077 if (*loc == YY_END_OF_BUFFER_CHAR)
1080 (errcode(ERRCODE_SYNTAX_ERROR),
1081 /* translator: %s is typically the translation of "syntax error" */
1082 errmsg("%s at end of input", _(message)),
1083 lexer_errposition()));
1088 (errcode(ERRCODE_SYNTAX_ERROR),
1089 /* translator: first %s is typically the translation of "syntax error" */
1090 errmsg("%s at or near \"%s\"", _(message), loc),
1091 lexer_errposition()));
1097 * Called before any actual parsing is done
/*
 * Set up a scanner over the NUL-terminated string 'str'.
 *
 * The caller-supplied 'yyext' struct is attached to the flex scanner as its
 * yyextra and filled in here: the keyword table and its size, copies of the
 * three GUC variables (backslash_quote, escape_string_warning,
 * standard_conforming_strings), a private copy of the input text, and an
 * empty literal buffer.
 */
1100 scanner_init(const char *str,
1101 core_yy_extra_type *yyext,
1102 const ScanKeyword *keywords,
1105 Size slen = strlen(str);
1108 if (yylex_init(&scanner) != 0)
1109 elog(ERROR, "yylex_init() failed: %m");
1111 core_yyset_extra(yyext, scanner);
1113 yyext->keywords = keywords;
1114 yyext->num_keywords = num_keywords;
/* Copy the current GUC values into yyext; the scanner code reads these copies */
1116 yyext->backslash_quote = backslash_quote;
1117 yyext->escape_string_warning = escape_string_warning;
1118 yyext->standard_conforming_strings = standard_conforming_strings;
1121 * Make a scan buffer with special termination needed by flex.
/* slen + 2: the input text plus two YY_END_OF_BUFFER_CHAR bytes */
1123 yyext->scanbuf = (char *) palloc(slen + 2);
/* scanbuflen records the text length only, not the two terminator bytes */
1124 yyext->scanbuflen = slen;
1125 memcpy(yyext->scanbuf, str, slen);
1126 yyext->scanbuf[slen] = yyext->scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
1127 yy_scan_buffer(yyext->scanbuf, slen + 2, scanner);
1129 /* initialize literal buffer to a reasonable but expansible size */
1130 yyext->literalalloc = 1024;
1131 yyext->literalbuf = (char *) palloc(yyext->literalalloc);
1132 yyext->literallen = 0;
1139 * Called after parsing is done to clean up after scanner_init()
1142 scanner_finish(core_yyscan_t yyscanner)
1145 * We don't bother to call yylex_destroy(), because all it would do is
1146 * pfree a small amount of control storage. It's cheaper to leak the
1147 * storage until the parsing context is destroyed. The amount of space
1148 * involved is usually negligible compared to the output parse tree.
1151 * We do bother to pfree the scanbuf and literal buffer, but only if they
1152 * represent a nontrivial amount of space. The 8K cutoff is arbitrary.
1154 if (yyextra->scanbuflen >= 8192)
1155 pfree(yyextra->scanbuf);
1156 if (yyextra->literalalloc >= 8192)
1157 pfree(yyextra->literalbuf);
/*
 * Append 'yleng' bytes starting at 'ytext' to the literal buffer in yyextra.
 *
 * The buffer is grown by repeated doubling until literallen + yleng is
 * strictly less than literalalloc, so at least one unused byte always
 * remains after the data.  litbuf_udeescape() depends on that spare byte
 * when it stores a terminator at literalbuf[literallen].
 */
1162 addlit(char *ytext, int yleng, core_yyscan_t yyscanner)
1164 /* enlarge buffer if needed */
1165 if ((yyextra->literallen + yleng) >= yyextra->literalalloc)
1169 yyextra->literalalloc *= 2;
1170 } while ((yyextra->literallen + yleng) >= yyextra->literalalloc);
1171 yyextra->literalbuf = (char *) repalloc(yyextra->literalbuf,
1172 yyextra->literalalloc);
1174 /* append new data */
1175 memcpy(yyextra->literalbuf + yyextra->literallen, ytext, yleng);
1176 yyextra->literallen += yleng;
/*
 * Append the single byte 'ychar' to the literal buffer in yyextra.
 *
 * Uses the same ">=" growth test as addlit(), so one unused byte is kept
 * after the data.  The allocation is doubled once here rather than in a
 * loop, since only one byte is being added.
 */
1181 addlitchar(unsigned char ychar, core_yyscan_t yyscanner)
1183 /* enlarge buffer if needed */
1184 if ((yyextra->literallen + 1) >= yyextra->literalalloc)
1186 yyextra->literalalloc *= 2;
1187 yyextra->literalbuf = (char *) repalloc(yyextra->literalbuf,
1188 yyextra->literalalloc);
1190 /* append new data */
1191 yyextra->literalbuf[yyextra->literallen] = ychar;
1192 yyextra->literallen += 1;
1197 * Create a palloc'd copy of literalbuf, adding a trailing null.
1200 litbufdup(core_yyscan_t yyscanner)
1202 int llen = yyextra->literallen;
/* llen + 1: room for the literal bytes plus the trailing null */
1205 new = palloc(llen + 1);
/* copy exactly the accumulated literal; literalbuf itself is left untouched */
1206 memcpy(new, yyextra->literalbuf, llen);
/*
 * Convert the digit string 'token' with strtol() in base 10.
 *
 * If strtol() did not consume the whole token, reported ERANGE, or the
 * value fails the int-overflow check, the token text is instead copied
 * into lval->str so that it can be treated as a float.
 * NOTE(review): the token codes returned for the two outcomes are not
 * visible in this excerpt -- confirm against the full function.
 */
1212 process_integer_literal(const char *token, YYSTYPE *lval)
1218 val = strtol(token, &endptr, 10);
1219 if (*endptr != '\0' || errno == ERANGE ||
1220 /* check for overflow of int */
1223 /* integer too large, treat it as a float */
1224 lval->str = pstrdup(token);
/*
 * Return the numeric value of the hexadecimal digit 'c'.
 *
 * Decimal digits are recognized by the first test; 'a'..'f' and 'A'..'F'
 * both map to 10..15.  Any other byte raises elog(ERROR), so the final
 * return only exists to keep the compiler quiet.
 */
1232 hexval(unsigned char c)
1234 if (c >= '0' && c <= '9')
1236 if (c >= 'a' && c <= 'f')
1237 return c - 'a' + 0xA;
1238 if (c >= 'A' && c <= 'F')
1239 return c - 'A' + 0xA;
1240 elog(ERROR, "invalid hexadecimal digit");
1241 return 0; /* not reached */
/*
 * Validate that code point 'c' may be used under the current database
 * encoding.
 *
 * 'loc' is used as an offset from yyextra->literalbuf, so it must point
 * into that buffer; on failure yylloc is advanced by that offset (plus 3
 * for the opening U&") before the error is raised, moving the error cursor
 * forward to the escape rather than leaving it at the token start.
 * NOTE(review): per the error text the rejected values are those above
 * 007F under a non-UTF8 server encoding; the range test itself is not
 * visible in this excerpt.
 */
1245 check_unicode_value(pg_wchar c, char *loc, core_yyscan_t yyscanner)
1247 if (GetDatabaseEncoding() == PG_UTF8)
1252 ADVANCE_YYLLOC(loc - yyextra->literalbuf + 3); /* 3 for U&" */
1253 yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
/* True when 'c' is in 0xD800..0xDBFF, i.e. usable as the first half of a UTF-16 surrogate pair. */
1258 is_utf16_surrogate_first(pg_wchar c)
1260 return (c >= 0xD800 && c <= 0xDBFF);
/* True when 'c' is in 0xDC00..0xDFFF, i.e. usable as the second half of a UTF-16 surrogate pair. */
1264 is_utf16_surrogate_second(pg_wchar c)
1266 return (c >= 0xDC00 && c <= 0xDFFF);
/*
 * Combine two surrogate halves into one code point: the low 10 bits of
 * 'first' become bits 10..19, the low 10 bits of 'second' become bits 0..9,
 * and 0x10000 is added.  No range checking is done here; the callers in
 * this file test the halves with is_utf16_surrogate_first/second first.
 */
1270 surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
1272 return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF);
/*
 * Append the UTF-8 encoding of code point 'c' to the literal buffer.
 *
 * Zero and values above 0x10FFFF are rejected as invalid escape values.
 * NOTE(review): the condition guarding the encoding check and the
 * saw_non_ascii assignment is not visible in this excerpt; per the error
 * text it applies to code points above 007F -- confirm.
 */
1276 addunicode(pg_wchar c, core_yyscan_t yyscanner)
1280 if (c == 0 || c > 0x10FFFF)
1281 yyerror("invalid Unicode escape value");
1284 if (GetDatabaseEncoding() != PG_UTF8)
1285 yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
1286 yyextra->saw_non_ascii = true;
/* Encode into a local buffer, then append as many bytes as pg_mblen() reports */
1288 unicode_to_utf8(c, (unsigned char *) buf);
1289 addlit(buf, pg_mblen(buf), yyscanner);
1292 /* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */
/*
 * The callers in the scanner rules raise "invalid Unicode escape character"
 * when this returns false.  The visible tests cover hexadecimal digits and
 * whitespace; NOTE(review): further tests between them are not visible in
 * this excerpt.
 */
1294 check_uescapechar(unsigned char escape)
1296 if (isxdigit(escape)
1300 || scanner_isspace(escape))
1308 /* like litbufdup, but handle unicode escapes */
/*
 * Produce a palloc'd copy of the literal buffer with Unicode escape
 * sequences expanded to UTF-8.  'escape' is the escape character in force:
 * callers pass '\\' by default, or the character taken from a UESCAPE
 * clause.  Callers store the result as the token's string value.
 *
 * Escape forms recognized after the escape character:
 *   - the escape character again (a doubled escape; presumably yields one
 *     literal escape character -- the copy itself is not visible here);
 *   - four hexadecimal digits;
 *   - '+' followed by six hexadecimal digits.
 * Anything else after the escape character raises "invalid Unicode escape
 * value".
 *
 * A first-half surrogate is remembered in pair_first and must be completed
 * by a second-half surrogate in the next escape; every violation of that
 * ordering raises "invalid Unicode surrogate pair".
 *
 * Before most errors yylloc is advanced by the offset of the offending
 * input position within the literal buffer (plus 3 for the opening U&"),
 * so the error cursor points at the bad escape rather than the token start.
 */
1310 litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner)
1316 pg_wchar pair_first = 0;
1318 /* Make literalbuf null-terminated to simplify the scanning loop */
1319 litbuf = yyextra->literalbuf;
1320 litbuf[yyextra->literallen] = '\0';
1323 * This relies on the subtle assumption that a UTF-8 expansion cannot be
1324 * longer than its escaped representation.
1326 new = palloc(yyextra->literallen + 1);
1332 if (in[0] == escape)
/* doubled escape character */
1334 if (in[1] == escape)
1338 ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
1339 yyerror("invalid Unicode surrogate pair");
/* escape followed by exactly four hex digits */
1344 else if (isxdigit((unsigned char) in[1]) &&
1345 isxdigit((unsigned char) in[2]) &&
1346 isxdigit((unsigned char) in[3]) &&
1347 isxdigit((unsigned char) in[4]))
1351 unicode = (hexval(in[1]) << 12) +
1352 (hexval(in[2]) << 8) +
1353 (hexval(in[3]) << 4) +
1355 check_unicode_value(unicode, in, yyscanner);
1358 if (is_utf16_surrogate_second(unicode))
1360 unicode = surrogate_pair_to_codepoint(pair_first, unicode);
1365 ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
1366 yyerror("invalid Unicode surrogate pair");
1369 else if (is_utf16_surrogate_second(unicode))
1370 yyerror("invalid Unicode surrogate pair");
/* hold a first-half surrogate until the next escape supplies its partner */
1372 if (is_utf16_surrogate_first(unicode))
1373 pair_first = unicode;
/* write the UTF-8 form at 'out' and step past the bytes just written */
1376 unicode_to_utf8(unicode, (unsigned char *) out);
1377 out += pg_mblen(out);
/* escape, '+', then exactly six hex digits */
1381 else if (in[1] == '+' &&
1382 isxdigit((unsigned char) in[2]) &&
1383 isxdigit((unsigned char) in[3]) &&
1384 isxdigit((unsigned char) in[4]) &&
1385 isxdigit((unsigned char) in[5]) &&
1386 isxdigit((unsigned char) in[6]) &&
1387 isxdigit((unsigned char) in[7]))
1391 unicode = (hexval(in[2]) << 20) +
1392 (hexval(in[3]) << 16) +
1393 (hexval(in[4]) << 12) +
1394 (hexval(in[5]) << 8) +
1395 (hexval(in[6]) << 4) +
1397 check_unicode_value(unicode, in, yyscanner);
1400 if (is_utf16_surrogate_second(unicode))
1402 unicode = surrogate_pair_to_codepoint(pair_first, unicode);
1407 ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
1408 yyerror("invalid Unicode surrogate pair");
1411 else if (is_utf16_surrogate_second(unicode))
1412 yyerror("invalid Unicode surrogate pair");
1414 if (is_utf16_surrogate_first(unicode))
1415 pair_first = unicode;
1418 unicode_to_utf8(unicode, (unsigned char *) out);
1419 out += pg_mblen(out);
1425 ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
1426 yyerror("invalid Unicode escape value");
1433 ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
1434 yyerror("invalid Unicode surrogate pair");
1440 /* unfinished surrogate pair? */
1443 ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
1444 yyerror("invalid Unicode surrogate pair");
1450 * We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII
1451 * codes; but it's probably not worth the trouble, since this isn't likely
1452 * to be a performance-critical path.
1454 pg_verifymbstr(new, out - new, false);
/*
 * Translate the character following a backslash in an extended string.
 * NOTE(review): the mapping of individual escape letters is not visible in
 * this excerpt.  What is visible: a NUL or high-bit-set byte sets
 * yyextra->saw_non_ascii, which the string-end rule consults to decide
 * whether to run pg_verifymbstr() on the finished literal.
 */
1458 static unsigned char
1459 unescape_single_char(unsigned char c, core_yyscan_t yyscanner)
1474 /* check for backslash followed by non-7-bit-ASCII */
1475 if (c == '\0' || IS_HIGHBIT_SET(c))
1476 yyextra->saw_non_ascii = true;
/*
 * Report nonstandard backslash usage for the escaped character 'ychar'.
 *
 * A report is issued only while both yyextra->warn_on_first_escape and
 * yyextra->escape_string_warning are set; warn_on_first_escape is then
 * cleared so that a string literal draws at most one such report.
 * Backslash-quote and backslash-backslash get their own message and hint;
 * the final call hands off to check_escape_warning() for the generic
 * wording.
 */
1483 check_string_escape_warning(unsigned char ychar, core_yyscan_t yyscanner)
1487 if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
1489 (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1490 errmsg("nonstandard use of \\' in a string literal"),
1491 errhint("Use '' to write quotes in strings, or use the escape string syntax (E'...')."),
1492 lexer_errposition()));
1493 yyextra->warn_on_first_escape = false; /* warn only once per string */
/* escaped backslash */
1495 else if (ychar == '\\')
1497 if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
1499 (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1500 errmsg("nonstandard use of \\\\ in a string literal"),
1501 errhint("Use the escape string syntax for backslashes, e.g., E'\\\\'."),
1502 lexer_errposition()));
1503 yyextra->warn_on_first_escape = false; /* warn only once per string */
/* generic escape handling */
1506 check_escape_warning(yyscanner);
/*
 * Generic form of the nonstandard-escape report, used for escapes that have
 * no dedicated message.  It is gated on the same two yyextra flags as
 * check_string_escape_warning(), and likewise clears warn_on_first_escape
 * so the report appears at most once per string literal.
 */
1510 check_escape_warning(core_yyscan_t yyscanner)
1512 if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
1514 (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1515 errmsg("nonstandard use of escape in a string literal"),
1516 errhint("Use the escape string syntax for escapes, e.g., E'\\r\\n'."),
1517 lexer_errposition()));
1518 yyextra->warn_on_first_escape = false; /* warn only once per string */
1522 * Interface functions to make flex use palloc() instead of malloc().
1523 * It'd be better to make these static, but flex insists otherwise.
/* Allocation hook for flex: returns 'bytes' bytes obtained from palloc(). The yyscanner argument is not used in the visible code. */
1527 core_yyalloc(yy_size_t bytes, core_yyscan_t yyscanner)
1529 return palloc(bytes);
/*
 * Reallocation hook for flex: resizes with repalloc(), or falls back to a
 * fresh palloc().
 * NOTE(review): the test choosing between the two is not visible in this
 * excerpt; presumably palloc() is used when 'ptr' is NULL -- confirm.
 */
1533 core_yyrealloc(void *ptr, yy_size_t bytes, core_yyscan_t yyscanner)
1536 return repalloc(ptr, bytes);
1538 return palloc(bytes);
1542 core_yyfree(void *ptr, core_yyscan_t yyscanner)