granicus.if.org Git - postgresql/blob - src/pl/plpgsql/src/scan.l

   1 %{
   2 /*-------------------------------------------------------------------------
   3  *
   4  * scan.l               - Scanner for the PL/pgSQL
   5  *                        procedural language
   6  *
   7  * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
   8  * Portions Copyright (c) 1994, Regents of the University of California
   9  *
  10  *
  11  * IDENTIFICATION
  12  *        $PostgreSQL: pgsql/src/pl/plpgsql/src/scan.l,v 1.45 2006/03/09 21:29:38 momjian Exp $
  13  *
  14  *-------------------------------------------------------------------------
  15  */
  16
  17 #include "plpgsql.h"
  18
  19 #include "mb/pg_wchar.h"
  20
  21
/*
 * No reason to constrain amount of data slurped.
 * (The value is 16 * 1024 * 1024 bytes.)
 */
#define YY_READ_BUF_SIZE 16777216

/*
 * Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error).
 *
 * Within this file, every three-argument fprintf(file, fmt, msg) call is
 * rewritten to raise an ERROR that carries "msg"; the "file" and "fmt"
 * arguments are discarded by the macro.
 */
#undef fprintf
#define fprintf(file, fmt, msg)  ereport(ERROR, (errmsg_internal("%s", msg)))

/*
 * Handles to the buffer that the lexer uses internally.
 * scanbuf is the palloc'd copy of the input made by plpgsql_scanner_init();
 * scanbufhandle is what yy_scan_buffer() returned for it.
 */
static YY_BUFFER_STATE scanbufhandle;
static char *scanbuf;

static const char *scanstr;             /* original input string */

/* Token returned by the very first yylex() call for a new source text */
static int      scanner_functype;
/* Has scanner_functype already been returned for the current source? */
static bool     scanner_typereported;
/* One-slot pushback buffer, filled by plpgsql_push_back_token() */
static int      pushback_token;
static bool have_pushback_token;
/* One-slot lookahead buffer, filled by plpgsql_yylex() after K_RETURN */
static int      lookahead_token;
static bool have_lookahead_token;
/* Line-counting state maintained by plpgsql_scanner_lineno() */
static const char *cur_line_start;
static int      cur_line_num;
static char    *dolqstart;      /* current $foo$ quote start string */
/*
 * Signal to plpgsql_get_string_value: length of the $foo$ delimiter when
 * the last T_STRING was dollar-quoted, 0 when it was an ordinary '...'
 * literal.
 */
static int      dolqlen;

/*
 * Set to true when the scanner skipped whitespace or a slash-star comment
 * during the current yylex() call; reset to false on each yylex() entry.
 */
bool plpgsql_SpaceScanned = false;
  47 %}
  48
/*
 * Scanner generation options.  Note in particular:
 *   nodefault         no default (echo) rule is generated, so the rule set
 *                     itself must cover all input
 *   prefix            the generated yy* symbols are renamed to use the
 *                     plpgsql_base_yy prefix
 *   case-insensitive  literal patterns such as the keywords match in any
 *                     letter case
 */
%option 8bit
%option never-interactive
%option nodefault
%option nounput
%option noyywrap
%option prefix="plpgsql_base_yy"

%option case-insensitive


/*
 * Exclusive start conditions used while collecting a multi-part token:
 * the body of a '...' literal, a slash-star comment, and a $foo$...$foo$
 * literal respectively.
 */
%x      IN_STRING
%x      IN_COMMENT
%x      IN_DOLLARQUOTE

/*
 * Identifier characters.  Bytes \200-\377 (high bit set) are accepted
 * anywhere in an identifier; digits and '$' only after the first character.
 */
digit                   [0-9]
ident_start             [A-Za-z\200-\377_]
ident_cont              [A-Za-z\200-\377_0-9\$]

/*
 * One or more directly adjacent "..." segments; this is what lets a
 * doubled quote appear inside a quoted identifier.
 */
quoted_ident    (\"[^\"]*\")+

identifier              ({ident_start}{ident_cont}*|{quoted_ident})

/* Positional parameter reference: a '$' followed by digits */
param                   \${digit}+

space                   [ \t\n\r\f]

/* $foo$ style quotes ("dollar quoting")
 * copied straight from the backend SQL parser
 *
 * dolqdelim matches both the bare "$$" and a tagged "$tag$" delimiter;
 * dolqinside matches a run of text containing no '$'.
 */
dolq_start              [A-Za-z\200-\377_]
dolq_cont               [A-Za-z\200-\377_0-9]
dolqdelim               \$({dolq_start}{dolq_cont}*)?\$
dolqinside              [^$]+
  82
  83 %%
    /* ----------
     * Local variables in scanner to remember where
     * a string or comment started:
     *   start_lineno   line number to report if the construct is unterminated
     *   start_charpos  address of the first character of a string token,
     *                  used to re-point yytext at the whole literal
     * ----------
     */
    int start_lineno = 0;
        char *start_charpos = NULL;

    /* ----------
     * Reset the state when entering the scanner: start in the INITIAL
     * condition and forget any whitespace seen by the previous call
     * ----------
     */
    BEGIN(INITIAL);
    plpgsql_SpaceScanned = false;

    /* ----------
     * On the first call to a new source report the
     * function's type (T_FUNCTION or T_TRIGGER).
     * No input is consumed for this token; the flag is cleared again
     * by plpgsql_scanner_init().
     * ----------
     */
        if (!scanner_typereported)
        {
                scanner_typereported = true;
                return scanner_functype;
        }
 109
    /* ----------
     * The keyword rules
     *
     * The scanner is built case-insensitive, so each keyword matches in
     * any letter case.  These rules are listed ahead of the identifier
     * rules: when two patterns match text of the same length, flex uses
     * the rule that appears first, so a bare keyword wins over
     * {identifier}, while a longer word that merely starts with a keyword
     * is still scanned as an identifier.
     *
     * Note that both ":=" and "=" are returned as K_ASSIGN, and that
     * "elseif" and "elsif" are both returned as K_ELSIF.
     * ----------
     */
:=                              { return K_ASSIGN;                      }
=                               { return K_ASSIGN;                      }
\.\.                    { return K_DOTDOT;                      }
alias                   { return K_ALIAS;                       }
begin                   { return K_BEGIN;                       }
close                   { return K_CLOSE;                       }
constant                { return K_CONSTANT;            }
continue                { return K_CONTINUE;            }
cursor                  { return K_CURSOR;                      }
debug                   { return K_DEBUG;                       }
declare                 { return K_DECLARE;                     }
default                 { return K_DEFAULT;                     }
diagnostics             { return K_DIAGNOSTICS;         }
else                    { return K_ELSE;                        }
elseif          { return K_ELSIF;           }
elsif           { return K_ELSIF;           }
end                             { return K_END;                         }
exception               { return K_EXCEPTION;           }
execute                 { return K_EXECUTE;                     }
exit                    { return K_EXIT;                        }
fetch                   { return K_FETCH;                       }
for                             { return K_FOR;                         }
from                    { return K_FROM;                        }
get                             { return K_GET;                         }
if                              { return K_IF;                          }
in                              { return K_IN;                          }
info                    { return K_INFO;                        }
into                    { return K_INTO;                        }
is                              { return K_IS;                          }
log                             { return K_LOG;                         }
loop                    { return K_LOOP;                        }
next                    { return K_NEXT;                        }
not                             { return K_NOT;                         }
notice                  { return K_NOTICE;                      }
null                    { return K_NULL;                        }
open                    { return K_OPEN;                        }
or                              { return K_OR;                          }
perform                 { return K_PERFORM;                     }
raise                   { return K_RAISE;                       }
rename                  { return K_RENAME;                      }
result_oid              { return K_RESULT_OID;          }
return                  { return K_RETURN;                      }
reverse                 { return K_REVERSE;                     }
row_count               { return K_ROW_COUNT;           }
select                  { return K_SELECT;                      }
then                    { return K_THEN;                        }
to                              { return K_TO;                          }
type                    { return K_TYPE;                        }
warning                 { return K_WARNING;                     }
when                    { return K_WHEN;                        }
while                   { return K_WHILE;                       }

    /* "#option" is only recognized at the very start of a line */
^#option                { return O_OPTION;                      }
dump                    { return O_DUMP;                        }
 168
 169
    /* ----------
     * Special word rules
     *
     * We set plpgsql_error_lineno in each rule so that errors reported
     * in the pl_comp.c subroutines will point to the right place.
     *
     * Each rule hands the complete matched text (including any embedded
     * whitespace around the dots or before the '%') to a plpgsql_parse_*
     * routine, and returns whatever token code that routine returns.
     * Because flex takes the longest match, "a.b.c" is seen by the
     * tripword rule rather than by the word or dblword rules.
     *
     * First group: words that start with an identifier.
     * ----------
     */
{identifier}                                    {
        plpgsql_error_lineno = plpgsql_scanner_lineno();
        return plpgsql_parse_word(yytext); }
{identifier}{space}*\.{space}*{identifier}      {
        plpgsql_error_lineno = plpgsql_scanner_lineno();
        return plpgsql_parse_dblword(yytext); }
{identifier}{space}*\.{space}*{identifier}{space}*\.{space}*{identifier}        {
        plpgsql_error_lineno = plpgsql_scanner_lineno();
        return plpgsql_parse_tripword(yytext); }
{identifier}{space}*%TYPE               {
        plpgsql_error_lineno = plpgsql_scanner_lineno();
        return plpgsql_parse_wordtype(yytext); }
{identifier}{space}*\.{space}*{identifier}{space}*%TYPE {
        plpgsql_error_lineno = plpgsql_scanner_lineno();
        return plpgsql_parse_dblwordtype(yytext); }
{identifier}{space}*\.{space}*{identifier}{space}*\.{space}*{identifier}{space}*%TYPE   {
        plpgsql_error_lineno = plpgsql_scanner_lineno();
        return plpgsql_parse_tripwordtype(yytext); }
{identifier}{space}*%ROWTYPE    {
        plpgsql_error_lineno = plpgsql_scanner_lineno();
        return plpgsql_parse_wordrowtype(yytext); }
{identifier}{space}*\.{space}*{identifier}{space}*%ROWTYPE      {
        plpgsql_error_lineno = plpgsql_scanner_lineno();
        return plpgsql_parse_dblwordrowtype(yytext); }
    /*
     * Second group: the same shapes, but starting with a positional
     * parameter ($n) instead of an identifier.  They are routed to the
     * same plpgsql_parse_* routines as the identifier forms.
     */
{param}                                                 {
        plpgsql_error_lineno = plpgsql_scanner_lineno();
        return plpgsql_parse_word(yytext); }
{param}{space}*\.{space}*{identifier}   {
        plpgsql_error_lineno = plpgsql_scanner_lineno();
        return plpgsql_parse_dblword(yytext); }
{param}{space}*\.{space}*{identifier}{space}*\.{space}*{identifier}     {
        plpgsql_error_lineno = plpgsql_scanner_lineno();
        return plpgsql_parse_tripword(yytext); }
{param}{space}*%TYPE                    {
        plpgsql_error_lineno = plpgsql_scanner_lineno();
        return plpgsql_parse_wordtype(yytext); }
{param}{space}*\.{space}*{identifier}{space}*%TYPE      {
        plpgsql_error_lineno = plpgsql_scanner_lineno();
        return plpgsql_parse_dblwordtype(yytext); }
{param}{space}*\.{space}*{identifier}{space}*\.{space}*{identifier}{space}*%TYPE        {
        plpgsql_error_lineno = plpgsql_scanner_lineno();
        return plpgsql_parse_tripwordtype(yytext); }
{param}{space}*%ROWTYPE         {
        plpgsql_error_lineno = plpgsql_scanner_lineno();
        return plpgsql_parse_wordrowtype(yytext); }
{param}{space}*\.{space}*{identifier}{space}*%ROWTYPE   {
        plpgsql_error_lineno = plpgsql_scanner_lineno();
        return plpgsql_parse_dblwordrowtype(yytext); }

    /* A run of digits; the digits themselves remain available in yytext */
{digit}+                { return T_NUMBER;                      }

    /*
     * A double quote that did not start a complete {quoted_ident} (i.e. no
     * closing quote was found) ends up here and is reported as an error.
     */
\".                             {
                                plpgsql_error_lineno = plpgsql_scanner_lineno();
                                ereport(ERROR,
                                                (errcode(ERRCODE_DATATYPE_MISMATCH),
                                                 errmsg("unterminated quoted identifier")));
                        }
 234
    /* ----------
     * Ignore whitespace, but remember that it was seen: no token is
     * returned, scanning simply continues with plpgsql_SpaceScanned set
     * ----------
     */
{space}+                { plpgsql_SpaceScanned = true;          }
 240
    /* ----------
     * Eat up comments
     *
     * A double-dash comment runs to the end of the line and is dropped
     * without touching plpgsql_SpaceScanned.  A slash-star comment is
     * skipped in the IN_COMMENT start condition; there is no nesting
     * count, so the first star-slash ends it.  Closing a slash-star
     * comment sets plpgsql_SpaceScanned, as whitespace would.
     * ----------
     */
--[^\r\n]*              ;

\/\*                    { start_lineno = plpgsql_scanner_lineno();
                          BEGIN(IN_COMMENT);
                        }
<IN_COMMENT>\*\/        { BEGIN(INITIAL); plpgsql_SpaceScanned = true; }
<IN_COMMENT>\n          ;
<IN_COMMENT>.           ;
<IN_COMMENT><<EOF>>     {
                                /* report the line where the comment began */
                                plpgsql_error_lineno = start_lineno;
                                ereport(ERROR,
                                                (errcode(ERRCODE_DATATYPE_MISMATCH),
                                                 errmsg("unterminated comment")));
                        }
 259
    /* ----------
     * Collect anything inside of ''s and return one STRING token
     *
     * Hacking yytext/yyleng here lets us avoid using yymore(), which is
     * a win for performance.  It's safe because we know the underlying
     * input buffer is not changing.
     *
     * The opening rule remembers where the literal starts; the pieces of
     * the body are matched (and ignored) one by one in IN_STRING; the
     * closing-quote rule then widens yytext/yyleng to span from the
     * remembered start through the closing quote.  The returned token text
     * is therefore the raw literal, quotes and escapes included.
     * ----------
     */
'                       {
                          start_lineno = plpgsql_scanner_lineno();
                          start_charpos = yytext;
                          BEGIN(IN_STRING);
                        }
[eE]'           {
                          /* for now, treat the same as a regular literal */
                          start_lineno = plpgsql_scanner_lineno();
                          start_charpos = yytext;
                          BEGIN(IN_STRING);
                        }
    /* backslash plus one character: keeps an escaped quote from ending the string */
<IN_STRING>\\.          { }
<IN_STRING>\\           { /* can only happen with \ at EOF */ }
    /* doubled quote: part of the string body, not its end */
<IN_STRING>''           { }
<IN_STRING>'            {
                          /* tell plpgsql_get_string_value it's not a dollar quote */
                          dolqlen = 0;
                          /* adjust yytext/yyleng to describe whole string token */
                          yyleng += (yytext - start_charpos);
                          yytext = start_charpos;
                          BEGIN(INITIAL);
                          return T_STRING;
                        }
<IN_STRING>[^'\\]+      { }
<IN_STRING><<EOF>>      {
                                /* report the line where the string began */
                                plpgsql_error_lineno = start_lineno;
                                ereport(ERROR,
                                                (errcode(ERRCODE_DATATYPE_MISMATCH),
                                                 errmsg("unterminated string")));
                        }
 298
{dolqdelim}             {
                          /*
                           * Opening $foo$ delimiter: remember where the
                           * literal starts and keep a copy of the delimiter
                           * text to compare against candidate closers.
                           */
                          start_lineno = plpgsql_scanner_lineno();
                          start_charpos = yytext;
                          dolqstart = pstrdup(yytext);
                          BEGIN(IN_DOLLARQUOTE);
                        }
<IN_DOLLARQUOTE>{dolqdelim} {
                          if (strcmp(yytext, dolqstart) == 0)
                          {
                                        /* matching closer: the saved copy is no longer needed */
                                        pfree(dolqstart);
                                        /* tell plpgsql_get_string_value it is a dollar quote */
                                        dolqlen = yyleng;
                                        /* adjust yytext/yyleng to describe whole string token */
                                        yyleng += (yytext - start_charpos);
                                        yytext = start_charpos;
                                        BEGIN(INITIAL);
                                        return T_STRING;
                          }
                          else
                          {
                                        /*
                                         * When we fail to match $...$ to dolqstart, transfer
                                         * the $... part to the output, but put back the final
                                         * $ for rescanning.  Consider $delim$...$junk$delim$
                                         */
                                        yyless(yyleng-1);
                          }
                        }
<IN_DOLLARQUOTE>{dolqinside} { }
<IN_DOLLARQUOTE>.       { /* needed for $ inside the quoted text */ }
<IN_DOLLARQUOTE><<EOF>> {
                                /* report the line where the literal began */
                                plpgsql_error_lineno = start_lineno;
                                ereport(ERROR,
                                                (errcode(ERRCODE_DATATYPE_MISMATCH),
                                                 errmsg("unterminated dollar-quoted string")));
                        }
 335
    /* ----------
     * Any unmatched character is returned as is: the token code is the
     * character's own value.  This is the last rule, so it only applies
     * when no earlier rule matches at this position.
     * ----------
     */
.                       { return yytext[0];                     }
 341
 342 %%
 343
 344
 345 /*
 346  * This is the yylex routine called from outside. It exists to provide
 347  * a pushback facility, as well as to allow us to parse syntax that
 348  * requires more than one token of lookahead.
 349  */
 350 int
 351 plpgsql_yylex(void)
 352 {
 353         int cur_token;
 354
 355         if (have_pushback_token)
 356         {
 357                 have_pushback_token = false;
 358                 cur_token = pushback_token;
 359         }
 360         else if (have_lookahead_token)
 361         {
 362                 have_lookahead_token = false;
 363                 cur_token = lookahead_token;
 364         }
 365         else
 366                 cur_token = yylex();
 367
 368         /* Do we need to look ahead for a possible multiword token? */
 369         switch (cur_token)
 370         {
 371                 /* RETURN NEXT must be reduced to a single token */
 372                 case K_RETURN:
 373                         if (!have_lookahead_token)
 374                         {
 375                                 lookahead_token = yylex();
 376                                 have_lookahead_token = true;
 377                         }
 378                         if (lookahead_token == K_NEXT)
 379                         {
 380                                 have_lookahead_token = false;
 381                                 cur_token = K_RETURN_NEXT;
 382                         }
 383                         break;
 384
 385                 default:
 386                         break;
 387         }
 388
 389         return cur_token;
 390 }
 391
 392 /*
 393  * Push back a single token to be re-read by next plpgsql_yylex() call.
 394  */
 395 void
 396 plpgsql_push_back_token(int token)
 397 {
 398         if (have_pushback_token)
 399                 elog(ERROR, "cannot push back multiple tokens");
 400         pushback_token = token;
 401         have_pushback_token = true;
 402 }
 403
 404 /*
 405  * Report a syntax error.
 406  */
 407 void
 408 plpgsql_yyerror(const char *message)
 409 {
 410         const char *loc = yytext;
 411         int                     cursorpos;
 412
 413         plpgsql_error_lineno = plpgsql_scanner_lineno();
 414
 415         /* in multibyte encodings, return index in characters not bytes */
 416         cursorpos = pg_mbstrlen_with_len(scanbuf, loc - scanbuf) + 1;
 417
 418         if (*loc == YY_END_OF_BUFFER_CHAR)
 419         {
 420                 ereport(ERROR,
 421                                 (errcode(ERRCODE_SYNTAX_ERROR),
 422                                  /* translator: %s is typically "syntax error" */
 423                                  errmsg("%s at end of input", message),
 424                                  internalerrposition(cursorpos),
 425                                  internalerrquery(scanstr)));
 426         }
 427         else
 428         {
 429                 ereport(ERROR,
 430                                 (errcode(ERRCODE_SYNTAX_ERROR),
 431                                  /* translator: first %s is typically "syntax error" */
 432                                  errmsg("%s at or near \"%s\"", message, loc),
 433                                  internalerrposition(cursorpos),
 434                                  internalerrquery(scanstr)));
 435         }
 436 }
 437
 438 /*
 439  * Get the line number at which the current token ends.  This substitutes
 440  * for flex's very poorly implemented yylineno facility.
 441  *
 442  * We assume that flex has written a '\0' over the character following the
 443  * current token in scanbuf.  So, we just have to count the '\n' characters
 444  * before that.  We optimize this a little by keeping track of the last
 445  * '\n' seen so far.
 446  */
 447 int
 448 plpgsql_scanner_lineno(void)
 449 {
 450         const char *c;
 451
 452         while ((c = strchr(cur_line_start, '\n')) != NULL)
 453         {
 454                 cur_line_start = c + 1;
 455                 cur_line_num++;
 456         }
 457         return cur_line_num;
 458 }
 459
 460 /*
 461  * Called before any actual parsing is done
 462  *
 463  * Note: the passed "str" must remain valid until plpgsql_scanner_finish().
 464  * Although it is not fed directly to flex, we need the original string
 465  * to cite in error messages.
 466  */
 467 void
 468 plpgsql_scanner_init(const char *str, int functype)
 469 {
 470         Size    slen;
 471
 472         slen = strlen(str);
 473
 474         /*
 475          * Might be left over after ereport()
 476          */
 477         if (YY_CURRENT_BUFFER)
 478                 yy_delete_buffer(YY_CURRENT_BUFFER);
 479
 480         /*
 481          * Make a scan buffer with special termination needed by flex.
 482          */
 483         scanbuf = palloc(slen + 2);
 484         memcpy(scanbuf, str, slen);
 485         scanbuf[slen] = scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
 486         scanbufhandle = yy_scan_buffer(scanbuf, slen + 2);
 487
 488         /* Other setup */
 489         scanstr = str;
 490
 491     scanner_functype = functype;
 492     scanner_typereported = false;
 493
 494         have_pushback_token = false;
 495         have_lookahead_token = false;
 496
 497         cur_line_start = scanbuf;
 498         cur_line_num = 1;
 499
 500         /*----------
 501          * Hack: skip any initial newline, so that in the common coding layout
 502          *              CREATE FUNCTION ... AS '
 503          *                      code body
 504          *              ' LANGUAGE plpgsql;
 505          * we will think "line 1" is what the programmer thinks of as line 1.
 506          *----------
 507          */
 508     if (*cur_line_start == '\r')
 509         cur_line_start++;
 510     if (*cur_line_start == '\n')
 511         cur_line_start++;
 512
 513         BEGIN(INITIAL);
 514 }
 515
 516 /*
 517  * Called after parsing is done to clean up after plpgsql_scanner_init()
 518  */
 519 void
 520 plpgsql_scanner_finish(void)
 521 {
 522         yy_delete_buffer(scanbufhandle);
 523         pfree(scanbuf);
 524 }
 525
 526 /*
 527  * Called after a T_STRING token is read to get the string literal's value
 528  * as a palloc'd string.  (We make this a separate call because in many
 529  * scenarios there's no need to get the decoded value.)
 530  *
 531  * Note: we expect the literal to be the most recently lexed token.  This
 532  * would not work well if we supported multiple-token pushback or if
 533  * plpgsql_yylex() wanted to read ahead beyond a T_STRING token.
 534  */
 535 char *
 536 plpgsql_get_string_value(void)
 537 {
 538         char       *result;
 539         const char *cp;
 540         int                     len;
 541
 542         if (dolqlen > 0)
 543         {
 544                 /* Token is a $foo$...$foo$ string */
 545                 len = yyleng - 2 * dolqlen;
 546                 Assert(len >= 0);
 547                 result = (char *) palloc(len + 1);
 548                 memcpy(result, yytext + dolqlen, len);
 549                 result[len] = '\0';
 550         }
 551         else if (*yytext == 'E' || *yytext == 'e')
 552         {
 553                 /* Token is an E'...' string */
 554                 result = (char *) palloc(yyleng + 1);   /* more than enough room */
 555                 len = 0;
 556                 for (cp = yytext + 2; *cp; cp++)
 557                 {
 558                         if (*cp == '\'')
 559                         {
 560                                 if (cp[1] == '\'')
 561                                         result[len++] = *cp++;
 562                                 /* else it must be string end quote */
 563                         }
 564                         else if (*cp == '\\')
 565                         {
 566                                 if (cp[1] != '\0')      /* just a paranoid check */
 567                                         result[len++] = *(++cp);
 568                         }
 569                         else
 570                                 result[len++] = *cp;
 571                 }
 572                 result[len] = '\0';
 573         }
 574         else
 575         {
 576                 /* Token is a '...' string */
 577                 result = (char *) palloc(yyleng + 1);   /* more than enough room */
 578                 len = 0;
 579                 for (cp = yytext + 1; *cp; cp++)
 580                 {
 581                         if (*cp == '\'')
 582                         {
 583                                 if (cp[1] == '\'')
 584                                         result[len++] = *cp++;
 585                                 /* else it must be string end quote */
 586                         }
 587                         else if (*cp == '\\')
 588                         {
 589                                 if (cp[1] != '\0')      /* just a paranoid check */
 590                                         result[len++] = *(++cp);
 591                         }
 592                         else
 593                                 result[len++] = *cp;
 594                 }
 595                 result[len] = '\0';
 596         }
 597         return result;
 598 }