]> granicus.if.org Git - postgresql/blob - src/pl/plpgsql/src/scan.l
Simplify ParamListInfo data structure to support only numbered parameters,
[postgresql] / src / pl / plpgsql / src / scan.l
1 %{
2 /*-------------------------------------------------------------------------
3  *
4  * scan.l               - Scanner for the PL/pgSQL
5  *                        procedural language
6  *
7  * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  *
11  * IDENTIFICATION
12  *        $PostgreSQL: pgsql/src/pl/plpgsql/src/scan.l,v 1.45 2006/03/09 21:29:38 momjian Exp $
13  *
14  *-------------------------------------------------------------------------
15  */
16
17 #include "plpgsql.h"
18
19 #include "mb/pg_wchar.h"
20
21
22 /* No reason to constrain amount of data slurped */
23 #define YY_READ_BUF_SIZE 16777216
24
25 /* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
26 #undef fprintf
27 #define fprintf(file, fmt, msg)  ereport(ERROR, (errmsg_internal("%s", msg)))
28
29 /* Handles to the buffer that the lexer uses internally */
30 static YY_BUFFER_STATE scanbufhandle;   /* flex buffer created over scanbuf by yy_scan_buffer() */
31 static char *scanbuf;                   /* palloc'd copy of the source with flex's two end markers */
32
33 static const char *scanstr;             /* original input string, cited in syntax error reports */
34
35 static int      scanner_functype;       /* token handed out by the very first yylex() call */
36 static bool     scanner_typereported;   /* has scanner_functype been returned yet? */
37 static int      pushback_token;         /* token stored by plpgsql_push_back_token() */
38 static bool have_pushback_token;        /* is pushback_token valid? */
39 static int      lookahead_token;        /* token read ahead by plpgsql_yylex() after K_RETURN */
40 static bool have_lookahead_token;       /* is lookahead_token valid? */
41 static const char *cur_line_start;      /* position just past the last newline counted so far */
42 static int      cur_line_num;           /* line number that cur_line_start belongs to */
43 static char    *dolqstart;      /* current $foo$ quote start string */
44 static int      dolqlen;                        /* signal to plpgsql_get_string_value: delimiter length of a $foo$ string, 0 for a '...' string */
45
46 bool plpgsql_SpaceScanned = false;      /* set when yylex() skips whitespace or a block comment; cleared on each yylex() entry */
47 %}
48
/*
 * Scanner-wide flex options.  With "nodefault" flex supplies no default
 * rule, so the rules section ends with an explicit catch-all "." rule.
 * "prefix" renames the generated yy* symbols to plpgsql_base_yy*.
 */
49 %option 8bit
50 %option never-interactive
51 %option nodefault
52 %option nounput
53 %option noyywrap
54 %option prefix="plpgsql_base_yy"
55
/* Keyword patterns are written in lower case but match in any case. */
56 %option case-insensitive
57
58
/*
 * Exclusive start conditions, entered while scanning the inside of a
 * '...' string, a C-style comment, and a $foo$...$foo$ string respectively.
 */
59 %x      IN_STRING
60 %x      IN_COMMENT
61 %x      IN_DOLLARQUOTE
62
/*
 * Identifier pieces.  Bytes \200-\377 are accepted as identifier characters
 * (presumably so that non-ASCII names pass through -- confirm against the
 * backend scanner).
 */
63 digit                   [0-9]
64 ident_start             [A-Za-z\200-\377_]
65 ident_cont              [A-Za-z\200-\377_0-9\$]
66
/*
 * One or more adjacent "..." sections, so a doubled double-quote inside a
 * quoted identifier is part of the same token.
 */
67 quoted_ident    (\"[^\"]*\")+
68
69 identifier              ({ident_start}{ident_cont}*|{quoted_ident})
70
/* A dollar sign followed by digits, e.g. $1 */
71 param                   \${digit}+
72
73 space                   [ \t\n\r\f]
74
75 /* $foo$ style quotes ("dollar quoting")
76  * copied straight from the backend SQL parser
77  *
 * dolqdelim matches both the bare $$ delimiter and a tagged $foo$ one;
 * dolqinside matches a run of text containing no dollar sign.
 */
78 dolq_start              [A-Za-z\200-\377_]
79 dolq_cont               [A-Za-z\200-\377_0-9]
80 dolqdelim               \$({dolq_start}{dolq_cont}*)?\$
81 dolqinside              [^$]+
82
83 %%
84     /* ----------
85      * Local variables in scanner to remember where
86      * a string or comment started
     *
     * Everything from here to the first rule is placed by flex at the top
     * of the generated scanner function, so it runs on every call.
87      * ----------
88      */
89     int start_lineno = 0;           /* line on which the open comment or string began */
90         char *start_charpos = NULL; /* yytext at the opening quote or $foo$ delimiter */
91
92     /* ----------
93      * Reset the state when entering the scanner
94      * ----------
95      */
96     BEGIN(INITIAL);
97     plpgsql_SpaceScanned = false;
98
99     /* ----------
100      * On the first call to a new source report the
101      * function's type (T_FUNCTION or T_TRIGGER)
     *
     * scanner_functype and scanner_typereported are set up by
     * plpgsql_scanner_init(); no input is consumed for this token.
102      * ----------
103      */
104         if (!scanner_typereported)
105         {
106                 scanner_typereported = true;
107                 return scanner_functype;
108         }
109
110     /* ----------
111      * The keyword rules
     *
     * Matching is case-insensitive (%option case-insensitive).  These
     * rules must stay ahead of the {identifier} rule: when two rules match
     * the same length of input, flex picks the one listed first.
     *
     * Note that "=" is accepted as a synonym for ":=", and that both
     * "elseif" and "elsif" yield K_ELSIF.  "#option" is recognized only
     * at the beginning of a line.
112      * ----------
113      */
114 :=                              { return K_ASSIGN;                      }
115 =                               { return K_ASSIGN;                      }
116 \.\.                    { return K_DOTDOT;                      }
117 alias                   { return K_ALIAS;                       }
118 begin                   { return K_BEGIN;                       }
119 close                   { return K_CLOSE;                       }
120 constant                { return K_CONSTANT;            }
121 continue                { return K_CONTINUE;            }
122 cursor                  { return K_CURSOR;                      }
123 debug                   { return K_DEBUG;                       }
124 declare                 { return K_DECLARE;                     }
125 default                 { return K_DEFAULT;                     }
126 diagnostics             { return K_DIAGNOSTICS;         }
127 else                    { return K_ELSE;                        }
128 elseif          { return K_ELSIF;           }
129 elsif           { return K_ELSIF;           }
130 end                             { return K_END;                         }
131 exception               { return K_EXCEPTION;           }
132 execute                 { return K_EXECUTE;                     }
133 exit                    { return K_EXIT;                        }
134 fetch                   { return K_FETCH;                       }
135 for                             { return K_FOR;                         }
136 from                    { return K_FROM;                        }
137 get                             { return K_GET;                         }
138 if                              { return K_IF;                          }
139 in                              { return K_IN;                          }
140 info                    { return K_INFO;                        }
141 into                    { return K_INTO;                        }
142 is                              { return K_IS;                          }
143 log                             { return K_LOG;                         }
144 loop                    { return K_LOOP;                        }
145 next                    { return K_NEXT;                        }
146 not                             { return K_NOT;                         }
147 notice                  { return K_NOTICE;                      }
148 null                    { return K_NULL;                        }
149 open                    { return K_OPEN;                        }
150 or                              { return K_OR;                          }
151 perform                 { return K_PERFORM;                     }
152 raise                   { return K_RAISE;                       }
153 rename                  { return K_RENAME;                      }
154 result_oid              { return K_RESULT_OID;          }
155 return                  { return K_RETURN;                      }
156 reverse                 { return K_REVERSE;                     }
157 row_count               { return K_ROW_COUNT;           }
158 select                  { return K_SELECT;                      }
159 then                    { return K_THEN;                        }
160 to                              { return K_TO;                          }
161 type                    { return K_TYPE;                        }
162 warning                 { return K_WARNING;                     }
163 when                    { return K_WHEN;                        }
164 while                   { return K_WHILE;                       }
165
166 ^#option                { return O_OPTION;                      }
167 dump                    { return O_DUMP;                        }
168
169
170     /* ----------
171      * Special word rules
172          *
173          * We set plpgsql_error_lineno in each rule so that errors reported
174          * in the pl_comp.c subroutines will point to the right place.
         *
         * Each rule passes the complete matched text -- including any
         * whitespace around the dots or in front of %TYPE / %ROWTYPE -- to
         * a plpgsql_parse_* routine and returns whatever token code that
         * routine produces.  The {param} rules repeat the {identifier}
         * rules for names whose first component is a $n reference.
175      * ----------
176      */
177 {identifier}                                    {
178         plpgsql_error_lineno = plpgsql_scanner_lineno();
179         return plpgsql_parse_word(yytext); }
180 {identifier}{space}*\.{space}*{identifier}      {
181         plpgsql_error_lineno = plpgsql_scanner_lineno();
182         return plpgsql_parse_dblword(yytext); }
183 {identifier}{space}*\.{space}*{identifier}{space}*\.{space}*{identifier}        {
184         plpgsql_error_lineno = plpgsql_scanner_lineno();
185         return plpgsql_parse_tripword(yytext); }
186 {identifier}{space}*%TYPE               {
187         plpgsql_error_lineno = plpgsql_scanner_lineno();
188         return plpgsql_parse_wordtype(yytext); }
189 {identifier}{space}*\.{space}*{identifier}{space}*%TYPE {
190         plpgsql_error_lineno = plpgsql_scanner_lineno();
191         return plpgsql_parse_dblwordtype(yytext); }
192 {identifier}{space}*\.{space}*{identifier}{space}*\.{space}*{identifier}{space}*%TYPE   {
193         plpgsql_error_lineno = plpgsql_scanner_lineno();
194         return plpgsql_parse_tripwordtype(yytext); }
195 {identifier}{space}*%ROWTYPE    {
196         plpgsql_error_lineno = plpgsql_scanner_lineno();
197         return plpgsql_parse_wordrowtype(yytext); }
198 {identifier}{space}*\.{space}*{identifier}{space}*%ROWTYPE      {
199         plpgsql_error_lineno = plpgsql_scanner_lineno();
200         return plpgsql_parse_dblwordrowtype(yytext); }
201 {param}                                                 {
202         plpgsql_error_lineno = plpgsql_scanner_lineno();
203         return plpgsql_parse_word(yytext); }
204 {param}{space}*\.{space}*{identifier}   {
205         plpgsql_error_lineno = plpgsql_scanner_lineno();
206         return plpgsql_parse_dblword(yytext); }
207 {param}{space}*\.{space}*{identifier}{space}*\.{space}*{identifier}     {
208         plpgsql_error_lineno = plpgsql_scanner_lineno();
209         return plpgsql_parse_tripword(yytext); }
210 {param}{space}*%TYPE                    {
211         plpgsql_error_lineno = plpgsql_scanner_lineno();
212         return plpgsql_parse_wordtype(yytext); }
213 {param}{space}*\.{space}*{identifier}{space}*%TYPE      {
214         plpgsql_error_lineno = plpgsql_scanner_lineno();
215         return plpgsql_parse_dblwordtype(yytext); }
216 {param}{space}*\.{space}*{identifier}{space}*\.{space}*{identifier}{space}*%TYPE        {
217         plpgsql_error_lineno = plpgsql_scanner_lineno();
218         return plpgsql_parse_tripwordtype(yytext); }
219 {param}{space}*%ROWTYPE         {
220         plpgsql_error_lineno = plpgsql_scanner_lineno();
221         return plpgsql_parse_wordrowtype(yytext); }
222 {param}{space}*\.{space}*{identifier}{space}*%ROWTYPE   {
223         plpgsql_error_lineno = plpgsql_scanner_lineno();
224         return plpgsql_parse_dblwordrowtype(yytext); }
225
226 {digit}+                { return T_NUMBER;                      }
227
    /* ----------
     * A double quote followed by any character wins only when the
     * {identifier} rule could not match a complete quoted identifier at
     * this position, i.e. the closing quote is missing.
     * ----------
     */
228 \".                             {
229                                 plpgsql_error_lineno = plpgsql_scanner_lineno();
230                                 ereport(ERROR,
231                                                 (errcode(ERRCODE_DATATYPE_MISMATCH),
232                                                  errmsg("unterminated quoted identifier")));
233                         }
234
235     /* ----------
236      * Ignore whitespace but remember that it was seen
237      * ----------
238      */
239 {space}+                { plpgsql_SpaceScanned = true;          }
240
241     /* ----------
242      * Eat up comments
     *
     * A "--" comment runs to the end of the line and is dropped without
     * touching plpgsql_SpaceScanned.  A C-style comment is skipped in the
     * IN_COMMENT start condition and ends at the first closing delimiter
     * (no nesting); finishing one sets plpgsql_SpaceScanned.  The line on
     * which the comment opened is kept in start_lineno so that an
     * unterminated comment is reported there.
243      * ----------
244      */
245 --[^\r\n]*              ;
246
247 \/\*                    { start_lineno = plpgsql_scanner_lineno();
248                           BEGIN(IN_COMMENT);
249                         }
250 <IN_COMMENT>\*\/        { BEGIN(INITIAL); plpgsql_SpaceScanned = true; }
251 <IN_COMMENT>\n          ;
252 <IN_COMMENT>.           ;
253 <IN_COMMENT><<EOF>>     {
254                                 plpgsql_error_lineno = start_lineno;
255                                 ereport(ERROR,
256                                                 (errcode(ERRCODE_DATATYPE_MISMATCH),
257                                                  errmsg("unterminated comment")));
258                         }
259
260     /* ----------
261      * Collect anything inside of ''s and return one STRING token
262          *
263          * Hacking yytext/yyleng here lets us avoid using yymore(), which is
264          * a win for performance.  It's safe because we know the underlying
265          * input buffer is not changing.
         *
         * The opening rule records where the literal starts (start_charpos)
         * and switches to IN_STRING.  The IN_STRING rules with empty actions
         * just consume input without returning.  When the closing quote is
         * reached, yytext/yyleng are widened to span from the opening quote
         * (or the E prefix) through the closing quote.
266      * ----------
267      */
268 '                       {
269                           start_lineno = plpgsql_scanner_lineno();
270                           start_charpos = yytext;
271                           BEGIN(IN_STRING);
272                         }
273 [eE]'           {
274                           /* for now, treat the same as a regular literal */
275                           start_lineno = plpgsql_scanner_lineno();
276                           start_charpos = yytext;
277                           BEGIN(IN_STRING);
278                         }
279 <IN_STRING>\\.          { /* backslash plus the character it escapes */ }
280 <IN_STRING>\\           { /* can only happen with \ at EOF */ }
281 <IN_STRING>''           { /* doubled quote, does not end the string */ }
282 <IN_STRING>'            {
283                           /* tell plpgsql_get_string_value it's not a dollar quote */
284                           dolqlen = 0;
285                           /* adjust yytext/yyleng to describe whole string token */
286                           yyleng += (yytext - start_charpos);
287                           yytext = start_charpos;
288                           BEGIN(INITIAL);
289                           return T_STRING;
290                         }
291 <IN_STRING>[^'\\]+      { /* run of ordinary characters */ }
292 <IN_STRING><<EOF>>      {
                                /* report the line where the string began */
293                                 plpgsql_error_lineno = start_lineno;
294                                 ereport(ERROR,
295                                                 (errcode(ERRCODE_DATATYPE_MISMATCH),
296                                                  errmsg("unterminated string")));
297                         }
298
    /* ----------
     * Collect a $foo$ ... $foo$ string and return one STRING token
     *
     * The opening delimiter is copied into dolqstart so that a later
     * delimiter can be compared against it.  As with ''-quoted strings,
     * yytext/yyleng are adjusted at the end to span the whole literal.
     * ----------
     */
299 {dolqdelim}             {
300                           start_lineno = plpgsql_scanner_lineno();
301                           start_charpos = yytext;
302                           dolqstart = pstrdup(yytext);
303                           BEGIN(IN_DOLLARQUOTE);
304                         }
305 <IN_DOLLARQUOTE>{dolqdelim} {
306                           if (strcmp(yytext, dolqstart) == 0)
307                           {
                                        /* matching close delimiter: the saved copy is no longer needed */
308                                         pfree(dolqstart);
309                                         /* tell plpgsql_get_string_value it is a dollar quote */
310                                         dolqlen = yyleng;
311                                         /* adjust yytext/yyleng to describe whole string token */
312                                         yyleng += (yytext - start_charpos);
313                                         yytext = start_charpos;
314                                         BEGIN(INITIAL);
315                                         return T_STRING;
316                           }
317                           else
318                           {
319                                         /*
320                                          * When we fail to match $...$ to dolqstart, transfer
321                                          * the $... part to the output, but put back the final
322                                          * $ for rescanning.  Consider $delim$...$junk$delim$
323                                          */
324                                         yyless(yyleng-1);
325                           }
326                         }
327 <IN_DOLLARQUOTE>{dolqinside} { /* run of text without any dollar sign */ }
328 <IN_DOLLARQUOTE>.       { /* needed for $ inside the quoted text */ }
329 <IN_DOLLARQUOTE><<EOF>> { 
                                /* report the line where the string began */
330                                 plpgsql_error_lineno = start_lineno;
331                                 ereport(ERROR,
332                                                 (errcode(ERRCODE_DATATYPE_MISMATCH),
333                                                  errmsg("unterminated dollar-quoted string")));
334                         }
335
336     /* ----------
337      * Any unmatched character is returned as is
     *
     * The character's own code serves as the token value.
338      * ----------
339      */
340 .                       { return yytext[0];                     }
341
342 %%
343
344
345 /*
346  * This is the yylex routine called from outside. It exists to provide
347  * a pushback facility, as well as to allow us to parse syntax that
348  * requires more than one token of lookahead.
349  */
350 int
351 plpgsql_yylex(void)
352 {
353         int cur_token;
354
355         if (have_pushback_token)
356         {
357                 have_pushback_token = false;
358                 cur_token = pushback_token;
359         }
360         else if (have_lookahead_token)
361         {
362                 have_lookahead_token = false;
363                 cur_token = lookahead_token;
364         }
365         else
366                 cur_token = yylex();
367
368         /* Do we need to look ahead for a possible multiword token? */
369         switch (cur_token)
370         {
371                 /* RETURN NEXT must be reduced to a single token */
372                 case K_RETURN:
373                         if (!have_lookahead_token)
374                         {
375                                 lookahead_token = yylex();
376                                 have_lookahead_token = true;
377                         }
378                         if (lookahead_token == K_NEXT)
379                         {
380                                 have_lookahead_token = false;
381                                 cur_token = K_RETURN_NEXT;
382                         }
383                         break;
384
385                 default:
386                         break;
387         }
388
389         return cur_token;
390 }
391
392 /*
393  * Push back a single token to be re-read by next plpgsql_yylex() call.
394  */
395 void
396 plpgsql_push_back_token(int token)
397 {
398         if (have_pushback_token)
399                 elog(ERROR, "cannot push back multiple tokens");
400         pushback_token = token;
401         have_pushback_token = true;
402 }
403
404 /*
405  * Report a syntax error.
406  */
407 void
408 plpgsql_yyerror(const char *message)
409 {
410         const char *loc = yytext;
411         int                     cursorpos;
412
413         plpgsql_error_lineno = plpgsql_scanner_lineno();
414
415         /* in multibyte encodings, return index in characters not bytes */
416         cursorpos = pg_mbstrlen_with_len(scanbuf, loc - scanbuf) + 1;
417
418         if (*loc == YY_END_OF_BUFFER_CHAR)
419         {
420                 ereport(ERROR,
421                                 (errcode(ERRCODE_SYNTAX_ERROR),
422                                  /* translator: %s is typically "syntax error" */
423                                  errmsg("%s at end of input", message),
424                                  internalerrposition(cursorpos),
425                                  internalerrquery(scanstr)));
426         }
427         else
428         {
429                 ereport(ERROR,
430                                 (errcode(ERRCODE_SYNTAX_ERROR),
431                                  /* translator: first %s is typically "syntax error" */
432                                  errmsg("%s at or near \"%s\"", message, loc),
433                                  internalerrposition(cursorpos),
434                                  internalerrquery(scanstr)));
435         }
436 }
437
438 /*
439  * Get the line number at which the current token ends.  This substitutes
440  * for flex's very poorly implemented yylineno facility.
441  *
442  * We assume that flex has written a '\0' over the character following the
443  * current token in scanbuf.  So, we just have to count the '\n' characters
444  * before that.  We optimize this a little by keeping track of the last
445  * '\n' seen so far.
446  */
447 int
448 plpgsql_scanner_lineno(void)
449 {
450         const char *c;
451
452         while ((c = strchr(cur_line_start, '\n')) != NULL)
453         {
454                 cur_line_start = c + 1;
455                 cur_line_num++;
456         }
457         return cur_line_num;
458 }
459
460 /*
461  * Called before any actual parsing is done
462  *
463  * Note: the passed "str" must remain valid until plpgsql_scanner_finish().
464  * Although it is not fed directly to flex, we need the original string
465  * to cite in error messages.
466  */
467 void
468 plpgsql_scanner_init(const char *str, int functype)
469 {
470         Size    slen;
471
472         slen = strlen(str);
473
474         /*
475          * Might be left over after ereport()
476          */
477         if (YY_CURRENT_BUFFER)
478                 yy_delete_buffer(YY_CURRENT_BUFFER);
479
480         /*
481          * Make a scan buffer with special termination needed by flex.
482          */
483         scanbuf = palloc(slen + 2);
484         memcpy(scanbuf, str, slen);
485         scanbuf[slen] = scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
486         scanbufhandle = yy_scan_buffer(scanbuf, slen + 2);
487
488         /* Other setup */
489         scanstr = str;
490
491     scanner_functype = functype;
492     scanner_typereported = false;
493
494         have_pushback_token = false;
495         have_lookahead_token = false;
496
497         cur_line_start = scanbuf;
498         cur_line_num = 1;
499
500         /*----------
501          * Hack: skip any initial newline, so that in the common coding layout
502          *              CREATE FUNCTION ... AS '
503          *                      code body
504          *              ' LANGUAGE plpgsql;
505          * we will think "line 1" is what the programmer thinks of as line 1.
506          *----------
507          */
508     if (*cur_line_start == '\r')
509         cur_line_start++;
510     if (*cur_line_start == '\n')
511         cur_line_start++;
512
513         BEGIN(INITIAL);
514 }
515
516 /*
517  * Called after parsing is done to clean up after plpgsql_scanner_init()
518  */
519 void
520 plpgsql_scanner_finish(void)
521 {
522         yy_delete_buffer(scanbufhandle);
523         pfree(scanbuf);
524 }
525
526 /*
527  * Called after a T_STRING token is read to get the string literal's value
528  * as a palloc'd string.  (We make this a separate call because in many
529  * scenarios there's no need to get the decoded value.)
530  *
531  * Note: we expect the literal to be the most recently lexed token.  This
532  * would not work well if we supported multiple-token pushback or if 
533  * plpgsql_yylex() wanted to read ahead beyond a T_STRING token.
534  */
535 char *
536 plpgsql_get_string_value(void)
537 {
538         char       *result;
539         const char *cp;
540         int                     len;
541
542         if (dolqlen > 0)
543         {
544                 /* Token is a $foo$...$foo$ string */
545                 len = yyleng - 2 * dolqlen;
546                 Assert(len >= 0);
547                 result = (char *) palloc(len + 1);
548                 memcpy(result, yytext + dolqlen, len);
549                 result[len] = '\0';
550         }
551         else if (*yytext == 'E' || *yytext == 'e')
552         {
553                 /* Token is an E'...' string */
554                 result = (char *) palloc(yyleng + 1);   /* more than enough room */
555                 len = 0;
556                 for (cp = yytext + 2; *cp; cp++)
557                 {
558                         if (*cp == '\'')
559                         {
560                                 if (cp[1] == '\'')
561                                         result[len++] = *cp++;
562                                 /* else it must be string end quote */
563                         }
564                         else if (*cp == '\\')
565                         {
566                                 if (cp[1] != '\0')      /* just a paranoid check */
567                                         result[len++] = *(++cp);
568                         }
569                         else
570                                 result[len++] = *cp;
571                 }
572                 result[len] = '\0';
573         }
574         else
575         {
576                 /* Token is a '...' string */
577                 result = (char *) palloc(yyleng + 1);   /* more than enough room */
578                 len = 0;
579                 for (cp = yytext + 1; *cp; cp++)
580                 {
581                         if (*cp == '\'')
582                         {
583                                 if (cp[1] == '\'')
584                                         result[len++] = *cp++;
585                                 /* else it must be string end quote */
586                         }
587                         else if (*cp == '\\')
588                         {
589                                 if (cp[1] != '\0')      /* just a paranoid check */
590                                         result[len++] = *(++cp);
591                         }
592                         else
593                                 result[len++] = *cp;
594                 }
595                 result[len] = '\0';
596         }
597         return result;
598 }