2 /*-------------------------------------------------------------------------
5 * lexical scanner for PostgreSQL
9 * The rules in this file must be kept in sync with psql's lexer!!!
11 * The rules are designed so that the scanner never has to backtrack,
12 * in the sense that there is always a rule that can match the input
13 * consumed so far (the rule action may internally throw back some input
14 * with yyless(), however). As explained in the flex manual, this makes
15 * for a useful speed increase --- about a third faster than a plain -CF
16 * lexer, in simple testing. The extra complexity is mostly in the rules
17 * for handling float numbers and continued string literals. If you change
18 * the lexical rules, verify that you haven't broken the no-backtrack
19 * property by running flex with the "-b" option and checking that the
20 * resulting "lex.backup" file says that no backing up is needed.
23 * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
24 * Portions Copyright (c) 1994, Regents of the University of California
27 * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.125 2005/06/15 16:28:06 momjian Exp $
29 *-------------------------------------------------------------------------
36 #include "parser/gramparse.h"
37 #include "parser/keywords.h"
38 /* Not needed now that this file is compiled as part of gram.y */
39 /* #include "parser/parse.h" */
40 #include "parser/scansup.h"
41 #include "mb/pg_wchar.h"
44 /* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
46 #define fprintf(file, fmt, msg) ereport(ERROR, (errmsg_internal("%s", msg)))
48 extern YYSTYPE yylval;
50 static int xcdepth = 0; /* depth of nesting in slash-star comments */
51 static char *dolqstart; /* current $foo$ quote start string */
54 * literalbuf is used to accumulate literal values when multiple rules
55 * are needed to parse a single literal. Call startlit() to reset the buffer
56 * to empty, and addlit() to add text. Note that the buffer is palloc'd and
57 * starts life afresh on every parse cycle.
59 static char *literalbuf; /* expandable buffer */
60 static int literallen; /* actual current length */
61 static int literalalloc; /* current allocated buffer size */
63 #define startlit() (literalbuf[0] = '\0', literallen = 0)
64 static void addlit(char *ytext, int yleng);
65 static void addlitchar(unsigned char ychar);
66 static char *litbufdup(void);
69 * When we parse a token that requires multiple lexer rules to process,
70 * we set token_start to point at the true start of the token, for use
71 * by yyerror(). yytext will point at just the text consumed by the last
72 * rule, so it's not very helpful (e.g., it might contain just the last
73 * quote mark of a quoted identifier). But to avoid cluttering every rule
74 * with setting token_start, we allow token_start = NULL to denote that
75 * it's okay to use yytext.
77 static char *token_start;
79 /* Handles to the buffer that the lexer uses internally */
80 static YY_BUFFER_STATE scanbufhandle;
83 unsigned char unescape_single_char(unsigned char c);
88 %option never-interactive
92 %option prefix="base_yy"
95 * OK, here is a short description of lex/flex rules behavior.
96 * The longest pattern which matches an input string is always chosen.
97 * For equal-length patterns, the first occurring in the rules list is chosen.
98 * INITIAL is the starting state, to which all non-conditional rules apply.
99 * Exclusive states change parsing rules while the state is active. When in
100 * an exclusive state, only those rules defined for that state apply.
102 * We use exclusive states for quoted strings, extended comments,
103 * and to eliminate parsing troubles for numeric strings.
105 * <xb> bit string literal
106 * <xc> extended C-style comments
107 * <xd> delimited identifiers (double-quoted identifiers)
108 * <xh> hexadecimal numeric string
109 * <xq> quoted strings
110 * <xdolq> $foo$ quoted strings
121 * In order to make the world safe for Windows and Mac clients as well as
122 * Unix ones, we accept either \n or \r as a newline. A DOS-style \r\n
123 * sequence will be seen as two successive newlines, but that doesn't cause
124 * any problems. Comments that start with -- and extend to the next
125 * newline are treated as equivalent to a single whitespace character.
127 * NOTE a fine point: if there is no newline following --, we will absorb
128 * everything to the end of the input as a comment. This is correct. Older
129 * versions of Postgres failed to recognize -- as a comment if the input
130 * did not end with a newline.
132 * XXX perhaps \f (formfeed) should be treated as a newline as well?
140 comment ("--"{non_newline}*)
142 whitespace ({space}+|{comment})
145 * SQL requires at least one newline in the whitespace separating
146 * string literals that are to be concatenated. Silly, but who are we
147 * to argue? Note that {whitespace_with_newline} should not have * after
148 * it, whereas {whitespace} should generally have a * after it...
151 special_whitespace ({space}+|{comment}{newline})
152 horiz_whitespace ({horiz_space}|{comment})
153 whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*)
156 * To ensure that {quotecontinue} can be scanned without having to back up
157 * if the full pattern isn't matched, we include trailing whitespace in
158 * {quotestop}. This matches all cases where {quotecontinue} fails to match,
159 * except for {quote} followed by whitespace and just one "-" (not two,
160 * which would start a {comment}). To cover that we have {quotefail}.
161 * The actions for {quotestop} and {quotefail} must throw back characters
162 * beyond the quote proper.
165 quotestop {quote}{whitespace}*
166 quotecontinue {quote}{whitespace_with_newline}{quote}
167 quotefail {quote}{whitespace}*"-"
170 * It is tempting to scan the string for only those characters
171 * which are allowed. However, this leads to silently swallowed
172 * characters if illegal characters are included in the string.
173 * For example, if xbinside is [01] then B'ABCD' is interpreted
174 * as a zero-length string, and the ABCD' is lost!
175 * Better to pass the string forward and let the input routines
176 * validate the contents.
181 /* Hexadecimal number */
185 /* National character */
189 * xqdouble implements embedded quote, ''''
192 xqdouble {quote}{quote}
195 xqoctesc [\\][0-7]{1,3}
196 xqhexesc [\\]x[0-9A-Fa-f]{1,2}
198 /* $foo$ style quotes ("dollar quoting")
199 * The quoted string starts with $foo$ where "foo" is an optional string
200 * in the form of an identifier, except that it may not contain "$",
201 * and extends to the first occurrence of an identical string.
202 * There is *no* processing of the quoted text.
204 * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim}
205 * fails to match its trailing "$".
207 dolq_start [A-Za-z\200-\377_]
208 dolq_cont [A-Za-z\200-\377_0-9]
209 dolqdelim \$({dolq_start}{dolq_cont}*)?\$
210 dolqfailed \${dolq_start}{dolq_cont}*
214 * Allows embedded spaces and other special characters in identifiers.
219 xddouble {dquote}{dquote}
224 * The "extended comment" syntax closely resembles allowable operator syntax.
225 * The tricky part here is to get lex to recognize a string starting with
226 * slash-star as a comment, when interpreting it as an operator would produce
227 * a longer match --- remember lex will prefer a longer match! Also, if we
228 * have something like plus-slash-star, lex will think this is a 3-character
229 * operator whereas we want to see it as a + operator and a comment start.
230 * The solution is two-fold:
231 * 1. append {op_chars}* to xcstart so that it matches as much text as
232 * {operator} would. Then the tie-breaker (first matching rule of same
233 * length) ensures xcstart wins. We put back the extra stuff with yyless()
234 * in case it contains a star-slash that should terminate the comment.
235 * 2. In the operator rule, check for slash-star within the operator, and
236 * if found throw it back with yyless(). This handles the plus-slash-star
238 * Dash-dash comments have similar interactions with the operator rule.
240 xcstart \/\*{op_chars}*
245 ident_start [A-Za-z\200-\377_]
246 ident_cont [A-Za-z\200-\377_0-9\$]
248 identifier {ident_start}{ident_cont}*
253 * "self" is the set of chars that should be returned as single-character
254 * tokens. "op_chars" is the set of chars that can make up "Op" tokens,
255 * which can be one or more characters long (but if a single-char token
256 * appears in the "self" set, it is not to be returned as an Op). Note
257 * that the sets overlap, but each has some chars that are not in the other.
259 * If you change either set, adjust the character lists appearing in the
260 * rule for "operator"!
262 self [,()\[\].;\:\+\-\*\/\%\^\<\>\=]
263 op_chars [\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=]
266 /* We no longer allow unary minus in numbers.
267 * Instead we pass it separately to the parser, where it gets
268 * coerced via doNegate() -- Leon Aug 20 1999
270 * {realfail1} and {realfail2} are added to prevent the need for scanner
271 * backup when the {real} rule fails to match completely.
275 decimal (({digit}*\.{digit}+)|({digit}+\.{digit}*))
276 real ({integer}|{decimal})[Ee][-+]?{digit}+
277 realfail1 ({integer}|{decimal})[Ee]
278 realfail2 ({integer}|{decimal})[Ee][-+]
285 * Dollar quoted strings are totally opaque, and no escaping is done on them.
286 * Other quoted strings must allow some special characters such as single-quote
288 * Embedded single-quotes are implemented both in the SQL standard
289 * style of two adjacent single quotes "''" and in the Postgres/Java style
290 * of escaped-quote "\'".
291 * Other embedded escaped characters are matched explicitly and the leading
292 * backslash is dropped from the string.
293 * Note that xcstart must appear before operator, as explained above!
294 * Also whitespace (comment) must appear before operator.
300 /* code to execute during start of each call of yylex() */
309 token_start = yytext;
312 /* Put back any characters past slash-star; see above */
318 /* Put back any characters past slash-star; see above */
326 /* reset token_start for next token */
345 <xc><<EOF>> { yyerror("unterminated /* comment"); }
349 * At some point we should simply pass the string
350 * forward to the parser and label it there.
351 * In the meantime, place a leading "b" on the string
352 * to mark it for the input routine as a binary string.
354 token_start = yytext;
363 yylval.str = litbufdup();
368 addlit(yytext, yyleng);
370 <xh>{quotecontinue} |
371 <xb>{quotecontinue} {
374 <xb><<EOF>> { yyerror("unterminated bit string literal"); }
377 /* Hexadecimal bit type.
378 * At some point we should simply pass the string
379 * forward to the parser and label it there.
380 * In the meantime, place a leading "x" on the string
381 * to mark it for the input routine as a hex string.
383 token_start = yytext;
392 yylval.str = litbufdup();
395 <xh><<EOF>> { yyerror("unterminated hexadecimal string literal"); }
398 /* National character.
399 * We will pass this along as a normal character string,
400 * but preceded by an internally-generated "NCHAR".
402 const ScanKeyword *keyword;
404 yyless(1); /* eat only 'n' this time */
405 /* nchar had better be a keyword! */
406 keyword = ScanKeywordLookup("nchar");
407 Assert(keyword != NULL);
408 yylval.keyword = keyword->name;
409 return keyword->value;
413 token_start = yytext;
421 yylval.str = litbufdup();
428 addlit(yytext, yyleng);
431 addlitchar(unescape_single_char(yytext[1]));
434 unsigned char c = strtoul(yytext+1, NULL, 8);
438 unsigned char c = strtoul(yytext+2, NULL, 16);
441 <xq>{quotecontinue} {
445 /* This is only needed for \ just before EOF */
446 addlitchar(yytext[0]);
448 <xq><<EOF>> { yyerror("unterminated quoted string"); }
451 token_start = yytext;
452 dolqstart = pstrdup(yytext);
457 /* throw back all but the initial "$" */
459 /* and treat it as {other} */
463 if (strcmp(yytext, dolqstart) == 0)
467 yylval.str = litbufdup();
473 * When we fail to match $...$ to dolqstart, transfer
474 * the $... part to the output, but put back the final
475 * $ for rescanning. Consider $delim$...$junk$delim$
477 addlit(yytext, yyleng-1);
481 <xdolq>{dolqinside} {
482 addlit(yytext, yyleng);
484 <xdolq>{dolqfailed} {
485 addlit(yytext, yyleng);
488 /* This is only needed for $ inside the quoted text */
489 addlitchar(yytext[0]);
491 <xdolq><<EOF>> { yyerror("unterminated dollar-quoted string"); }
494 token_start = yytext;
503 yyerror("zero-length delimited identifier");
505 if (literallen >= NAMEDATALEN)
506 truncate_identifier(ident, literallen, true);
514 addlit(yytext, yyleng);
516 <xd><<EOF>> { yyerror("unterminated quoted identifier"); }
528 * Check for embedded slash-star or dash-dash; those
529 * are comment starts, so operator must stop there.
530 * Note that slash-star or dash-dash at the first
531 * character will match a prior rule, not this one.
534 char *slashstar = strstr(yytext, "/*");
535 char *dashdash = strstr(yytext, "--");
537 if (slashstar && dashdash)
539 /* if both appear, take the first one */
540 if (slashstar > dashdash)
541 slashstar = dashdash;
544 slashstar = dashdash;
546 nchars = slashstar - yytext;
549 * For SQL compatibility, '+' and '-' cannot be the
550 * last char of a multi-char operator unless the operator
551 * contains chars that are not in SQL operators.
552 * The idea is to lex '=-' as two operators, but not
553 * to forbid operator names like '?-' that could not be
554 * sequences of SQL operators.
557 (yytext[nchars-1] == '+' ||
558 yytext[nchars-1] == '-'))
562 for (ic = nchars-2; ic >= 0; ic--)
564 if (strchr("~!@#^&|`?%", yytext[ic]))
568 break; /* found a char that makes it OK */
569 nchars--; /* else remove the +/-, and check again */
574 /* Strip the unwanted chars from the token */
577 * If what we have left is only one char, and it's
578 * one of the characters matching "self", then
579 * return it as a character token the same way
580 * that the "self" rule would have.
583 strchr(",()[].;:+-*/%^<>=", yytext[0]))
587 /* Convert "!=" operator to "<>" for compatibility */
588 if (strcmp(yytext, "!=") == 0)
589 yylval.str = pstrdup("<>");
591 yylval.str = pstrdup(yytext);
596 yylval.ival = atol(yytext + 1);
605 val = strtol(yytext, &endptr, 10);
606 if (*endptr != '\0' || errno == ERANGE
607 #ifdef HAVE_LONG_INT_64
608 /* if long > 32 bits, check for overflow of int4 */
609 || val != (long) ((int32) val)
613 /* integer too large, treat it as a float */
614 yylval.str = pstrdup(yytext);
621 yylval.str = pstrdup(yytext);
625 yylval.str = pstrdup(yytext);
630 * throw back the [Ee], and treat as {decimal}. Note
631 * that it is possible the input is actually {integer},
632 * but since this case will almost certainly lead to a
633 * syntax error anyway, we don't bother to distinguish.
636 yylval.str = pstrdup(yytext);
640 /* throw back the [Ee][+-], and proceed as above */
642 yylval.str = pstrdup(yytext);
648 const ScanKeyword *keyword;
651 /* Is it a keyword? */
652 keyword = ScanKeywordLookup(yytext);
655 yylval.keyword = keyword->name;
656 return keyword->value;
660 * No. Convert the identifier to lower case, and truncate
663 ident = downcase_truncate_identifier(yytext, yyleng, true);
675 yyerror(const char *message)
677 const char *loc = token_start ? token_start : yytext;
680 /* in multibyte encodings, return index in characters not bytes */
681 cursorpos = pg_mbstrlen_with_len(scanbuf, loc - scanbuf) + 1;
683 if (*loc == YY_END_OF_BUFFER_CHAR)
686 (errcode(ERRCODE_SYNTAX_ERROR),
687 /* translator: %s is typically "syntax error" */
688 errmsg("%s at end of input", _(message)),
689 errposition(cursorpos)));
694 (errcode(ERRCODE_SYNTAX_ERROR),
695 /* translator: first %s is typically "syntax error" */
696 errmsg("%s at or near \"%s\"", _(message), loc),
697 errposition(cursorpos)));
703 * Called before any actual parsing is done
706 scanner_init(const char *str)
708 Size slen = strlen(str);
711 * Might be left over after ereport()
713 if (YY_CURRENT_BUFFER)
714 yy_delete_buffer(YY_CURRENT_BUFFER);
717 * Make a scan buffer with special termination needed by flex.
719 scanbuf = palloc(slen + 2);
720 memcpy(scanbuf, str, slen);
721 scanbuf[slen] = scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
722 scanbufhandle = yy_scan_buffer(scanbuf, slen + 2);
724 /* initialize literal buffer to a reasonable but expandable size */
726 literalbuf = (char *) palloc(literalalloc);
734 * Called after parsing is done to clean up after scanner_init()
739 yy_delete_buffer(scanbufhandle);
745 addlit(char *ytext, int yleng)
747 /* enlarge buffer if needed */
748 if ((literallen+yleng) >= literalalloc)
752 } while ((literallen+yleng) >= literalalloc);
753 literalbuf = (char *) repalloc(literalbuf, literalalloc);
755 /* append new data, add trailing null */
756 memcpy(literalbuf+literallen, ytext, yleng);
758 literalbuf[literallen] = '\0';
763 addlitchar(unsigned char ychar)
765 /* enlarge buffer if needed */
766 if ((literallen+1) >= literalalloc)
769 literalbuf = (char *) repalloc(literalbuf, literalalloc);
771 /* append new data, add trailing null */
772 literalbuf[literallen] = ychar;
774 literalbuf[literallen] = '\0';
779 * One might be tempted to write pstrdup(literalbuf) instead of this,
780 * but for long literals this is much faster because the length is
788 new = palloc(literallen + 1);
789 memcpy(new, literalbuf, literallen+1);
795 unescape_single_char(unsigned char c)