granicus.if.org Git - postgresql/blob - src/backend/utils/adt/jsonpath_scan.l

   1 %{
   2 /*-------------------------------------------------------------------------
   3  *
   4  * jsonpath_scan.l
   5  *      Lexical parser for jsonpath datatype
   6  *
   7  * Splits jsonpath string into tokens represented as JsonPathString structs.
   8  * Decodes unicode and hex escaped strings.
   9  *
  10  * Copyright (c) 2019, PostgreSQL Global Development Group
  11  *
  12  * IDENTIFICATION
  13  *      src/backend/utils/adt/jsonpath_scan.l
  14  *
  15  *-------------------------------------------------------------------------
  16  */
  17
  18 #include "postgres.h"
  19
  20 #include "mb/pg_wchar.h"
  21 #include "nodes/pg_list.h"
  22
  23 static JsonPathString scanstring;
  24
  25 /* Handles to the buffer that the lexer uses internally */
  26 static YY_BUFFER_STATE scanbufhandle;
  27 static char *scanbuf;
  28 static int      scanbuflen;
  29
  30 static void addstring(bool init, char *s, int l);
  31 static void addchar(bool init, char s);
  32 static enum yytokentype checkKeyword(void);
  33 static void parseUnicode(char *s, int l);
  34 static void parseHexChar(char *s);
  35
  36 /* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
  37 #undef fprintf
  38 #define fprintf(file, fmt, msg)  fprintf_to_ereport(fmt, msg)
  39
  40 static void
  41 fprintf_to_ereport(const char *fmt, const char *msg)
  42 {
  43         ereport(ERROR, (errmsg_internal("%s", msg)));
  44 }
  45
  46 %}
  47
  48 %option 8bit
  49 %option never-interactive
  50 %option nodefault
  51 %option noinput
  52 %option nounput
  53 %option noyywrap
  54 %option warn
  55 %option prefix="jsonpath_yy"
  56 %option bison-bridge
  57 %option noyyalloc
  58 %option noyyrealloc
  59 %option noyyfree
  60
  61 /*
   62  * We use exclusive states for quoted, single-quoted and non-quoted strings,
   63  * quoted variable names and C-style comments.
  64  * Exclusive states:
  65  *  <xq> - quoted strings
  66  *  <xnq> - non-quoted strings
  67  *  <xvq> - quoted variable names
  68  *  <xsq> - single-quoted strings
  69  *  <xc> - C-style comment
  70  */
  71
  72 %x xq
  73 %x xnq
  74 %x xvq
  75 %x xsq
  76 %x xc
  77
/* Punctuation characters; the {special} rule returns each one as its own token. */
special          [\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/]
/* Any character that is not special, a backslash, a quote or a blank. */
any                     [^\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/\\\"\' \t\n\r\f]
blank           [ \t\n\r\f]

/*
 * Numeric literals.  An integer has no leading zeros.  The *fail patterns
 * match incomplete forms: an integer followed by a bare '.', and a mantissa
 * followed by an exponent marker (optionally signed) with no digits.
 */
digit           [0-9]
integer         (0|[1-9]{digit}*)
decimal         {integer}\.{digit}+
decimalfail     {integer}\.
real            ({integer}|{decimal})[Ee][-+]?{digit}+
realfail1       ({integer}|{decimal})[Ee]
realfail2       ({integer}|{decimal})[Ee][-+]

/*
 * Escape sequences: \uXXXX (exactly four hex digits) or \u{X...} (one to six
 * hex digits in braces), and \xHH (exactly two hex digits).  The *fail
 * patterns match the same prefixes with too few digits or a missing '}'.
 */
hex_dig         [0-9A-Fa-f]
unicode         \\u({hex_dig}{4}|\{{hex_dig}{1,6}\})
unicodefail     \\u({hex_dig}{0,3}|\{{hex_dig}{0,6})
hex_char        \\x{hex_dig}{2}
hex_fail        \\x{hex_dig}{0,1}
  95
  96 %%
  97
<xnq>{any}+                                             {
                                                                        /* Ordinary characters extend the unquoted string being collected. */
                                                                        addstring(false, yytext, yyleng);
                                                                }

<xnq>{blank}+                                   {
                                                                        /*
                                                                         * Whitespace ends the unquoted string: hand the collected
                                                                         * text to the parser, return to INITIAL, and let
                                                                         * checkKeyword() choose the token type.
                                                                         */
                                                                        yylval->str = scanstring;
                                                                        BEGIN INITIAL;
                                                                        return checkKeyword();
                                                                }


<xnq>\/\*                                               {
                                                                        /*
                                                                         * Comment opener directly after unquoted text: the
                                                                         * collected string is stored in yylval and the scanner
                                                                         * enters the comment state.
                                                                         * NOTE(review): this action returns no token, so the
                                                                         * collected text does not appear to reach the parser --
                                                                         * confirm whether that is intended.
                                                                         */
                                                                        yylval->str = scanstring;
                                                                        BEGIN xc;
                                                                }

<xnq>({special}|\"|\')                  {
                                                                        /*
                                                                         * A special character or a quote also ends the unquoted
                                                                         * string; yyless(0) gives the character back so that it
                                                                         * is scanned again from the INITIAL state.
                                                                         */
                                                                        yylval->str = scanstring;
                                                                        yyless(0);
                                                                        BEGIN INITIAL;
                                                                        return checkKeyword();
                                                                }
 120
<xnq><<EOF>>                                    {
                                                                        /* End of input finishes an unquoted string exactly as blanks do. */
                                                                        yylval->str = scanstring;
                                                                        BEGIN INITIAL;
                                                                        return checkKeyword();
                                                                }

        /*
         * Backslash escapes, shared by the unquoted state and all three quoted
         * states.  An escaped quote or backslash is appended literally; the
         * single-letter escapes append the corresponding control character.
         */
<xnq,xq,xvq,xsq>\\[\"\'\\]              { addchar(false, yytext[1]); }

<xnq,xq,xvq,xsq>\\b                             { addchar(false, '\b'); }

<xnq,xq,xvq,xsq>\\f                             { addchar(false, '\f'); }

<xnq,xq,xvq,xsq>\\n                             { addchar(false, '\n'); }

<xnq,xq,xvq,xsq>\\r                             { addchar(false, '\r'); }

<xnq,xq,xvq,xsq>\\t                             { addchar(false, '\t'); }

<xnq,xq,xvq,xsq>\\v                             { addchar(false, '\v'); }

        /*
         * A run of one or more \u escapes is decoded in a single call to
         * parseUnicode(); a single \xHH escape goes through parseHexChar().
         */
<xnq,xq,xvq,xsq>{unicode}+              { parseUnicode(yytext, yyleng); }

<xnq,xq,xvq,xsq>{hex_char}              { parseHexChar(yytext); }

        /* Incomplete \u or \x escapes are reported as errors. */
<xnq,xq,xvq,xsq>{unicode}*{unicodefail} { yyerror(NULL, "invalid unicode sequence"); }

<xnq,xq,xvq,xsq>{hex_fail}              { yyerror(NULL, "invalid hex character sequence"); }

<xnq,xq,xvq,xsq>{unicode}+\\    {
                                                                        /* throw back the \\, and treat as unicode */
                                                                        yyless(yyleng - 1);
                                                                        parseUnicode(yytext, yyleng);
                                                                }

        /* Any other escaped character, or a lone backslash, is an error. */
<xnq,xq,xvq,xsq>\\.                             { yyerror(NULL, "escape sequence is invalid"); }

<xnq,xq,xvq,xsq>\\                              { yyerror(NULL, "unexpected end after backslash"); }

        /* Input ended while a quoted string or quoted variable name was still open. */
<xq,xvq,xsq><<EOF>>                             { yyerror(NULL, "unexpected end of quoted string"); }
 160
<xq>\"                                                  {
                                                                        /* Closing double quote: the collected text is a string literal. */
                                                                        yylval->str = scanstring;
                                                                        BEGIN INITIAL;
                                                                        return STRING_P;
                                                                }

<xvq>\"                                                 {
                                                                        /* Closing double quote of a quoted variable name. */
                                                                        yylval->str = scanstring;
                                                                        BEGIN INITIAL;
                                                                        return VARIABLE_P;
                                                                }

<xsq>\'                                                 {
                                                                        /* Closing single quote: also returned as a string literal. */
                                                                        yylval->str = scanstring;
                                                                        BEGIN INITIAL;
                                                                        return STRING_P;
                                                                }

        /* Inside quotes, everything except a backslash or the closing quote is copied as-is. */
<xq,xvq>[^\\\"]+                                { addstring(false, yytext, yyleng); }

<xsq>[^\\\']+                                   { addstring(false, yytext, yyleng); }

        /* Comment state: skip everything up to the closing star-slash, then resume in INITIAL. */
<xc>\*\/                                                { BEGIN INITIAL; }

<xc>[^\*]+                                              { }

<xc>\*                                                  { }

<xc><<EOF>>                                             { yyerror(NULL, "unexpected end of comment"); }

        /*
         * Operator tokens.  Rules without a start condition are not active in the
         * states above, because those are all declared exclusive (%x).
         */
\&\&                                                    { return AND_P; }

\|\|                                                    { return OR_P; }

\!                                                              { return NOT_P; }

\*\*                                                    { return ANY_P; }

\<                                                              { return LESS_P; }

\<\=                                                    { return LESSEQUAL_P; }

\=\=                                                    { return EQUAL_P; }

        /* Both spellings of "not equal" map to the same token. */
\<\>                                                    { return NOTEQUAL_P; }

\!\=                                                    { return NOTEQUAL_P; }

\>\=                                                    { return GREATEREQUAL_P; }

\>                                                              { return GREATER_P; }
 212
\${any}+                                                {
                                                                        /*
                                                                         * Unquoted variable: copy the text after the '$' into a
                                                                         * fresh scanstring and NUL-terminate it.
                                                                         */
                                                                        addstring(true, yytext + 1, yyleng - 1);
                                                                        addchar(false, '\0');
                                                                        yylval->str = scanstring;
                                                                        return VARIABLE_P;
                                                                }

\$\"                                                    {
                                                                        /* Quoted variable name: start an empty scanstring and collect in xvq. */
                                                                        addchar(true, '\0');
                                                                        BEGIN xvq;
                                                                }

        /* Any remaining special character is returned as a token equal to its own character code. */
{special}                                               { return *yytext; }

{blank}+                                                { /* ignore */ }

\/\*                                                    {
                                                                        /* Comment opener: reinitialize scanstring and skip the comment in xc. */
                                                                        addchar(true, '\0');
                                                                        BEGIN xc;
                                                                }

{real}                                                  {
                                                                        /* Number with an exponent: passed on as NUL-terminated text. */
                                                                        addstring(true, yytext, yyleng);
                                                                        addchar(false, '\0');
                                                                        yylval->str = scanstring;
                                                                        return NUMERIC_P;
                                                                }

{decimal}                                               {
                                                                        /* Number with a fractional part: passed on as NUL-terminated text. */
                                                                        addstring(true, yytext, yyleng);
                                                                        addchar(false, '\0');
                                                                        yylval->str = scanstring;
                                                                        return NUMERIC_P;
                                                                }

{integer}                                               {
                                                                        /* Plain integer: passed on as NUL-terminated text. */
                                                                        addstring(true, yytext, yyleng);
                                                                        addchar(false, '\0');
                                                                        yylval->str = scanstring;
                                                                        return INT_P;
                                                                }

{decimalfail}                                   {
                                                                        /* throw back the ., and treat as integer */
                                                                        yyless(yyleng - 1);
                                                                        addstring(true, yytext, yyleng);
                                                                        addchar(false, '\0');
                                                                        yylval->str = scanstring;
                                                                        return INT_P;
                                                                }

        /* An exponent marker with no digits after it is an error. */
({realfail1}|{realfail2})               { yyerror(NULL, "invalid floating point number"); }

{any}+                                                  {
                                                                        /* Start of an unquoted string; the xnq rules collect the rest. */
                                                                        addstring(true, yytext, yyleng);
                                                                        BEGIN xnq;
                                                                }

\"                                                              {
                                                                        /* Opening double quote: start an empty scanstring, collect in xq. */
                                                                        addchar(true, '\0');
                                                                        BEGIN xq;
                                                                }

\'                                                              {
                                                                        /* Opening single quote: start an empty scanstring, collect in xsq. */
                                                                        addchar(true, '\0');
                                                                        BEGIN xsq;
                                                                }

\\                                                              {
                                                                        /*
                                                                         * An unquoted string may begin with an escape: give the
                                                                         * backslash back, start an empty scanstring and switch to
                                                                         * xnq, where the escape rules are active.
                                                                         */
                                                                        yyless(0);
                                                                        addchar(true, '\0');
                                                                        BEGIN xnq;
                                                                }

<<EOF>>                                                 { yyterminate(); }
 288
 289 %%
 290
 291 void
 292 jsonpath_yyerror(JsonPathParseResult **result, const char *message)
 293 {
 294         if (*yytext == YY_END_OF_BUFFER_CHAR)
 295         {
 296                 ereport(ERROR,
 297                                 (errcode(ERRCODE_SYNTAX_ERROR),
 298                                  /* translator: %s is typically "syntax error" */
 299                                  errmsg("%s at end of jsonpath input", _(message))));
 300         }
 301         else
 302         {
 303                 ereport(ERROR,
 304                                 (errcode(ERRCODE_SYNTAX_ERROR),
 305                                  /* translator: first %s is typically "syntax error" */
 306                                  errmsg("%s at or near \"%s\" of jsonpath input",
 307                                                 _(message), yytext)));
 308         }
 309 }
 310
 311 typedef struct JsonPathKeyword
 312 {
 313         int16           len;
 314         bool            lowercase;
 315         int                     val;
 316         const char *keyword;
 317 } JsonPathKeyword;
 318
 319 /*
 320  * Array of key words should be sorted by length and then
 321  * alphabetical order
 322  */
 323 static const JsonPathKeyword keywords[] = {
 324         { 2, false,     IS_P,           "is"},
 325         { 2, false,     TO_P,           "to"},
 326         { 3, false,     ABS_P,          "abs"},
 327         { 3, false,     LAX_P,          "lax"},
 328         { 4, false,     FLAG_P,         "flag"},
 329         { 4, false,     LAST_P,         "last"},
 330         { 4, true,      NULL_P,         "null"},
 331         { 4, false,     SIZE_P,         "size"},
 332         { 4, true,      TRUE_P,         "true"},
 333         { 4, false,     TYPE_P,         "type"},
 334         { 4, false,     WITH_P,         "with"},
 335         { 5, true,      FALSE_P,        "false"},
 336         { 5, false,     FLOOR_P,        "floor"},
 337         { 6, false,     DOUBLE_P,       "double"},
 338         { 6, false,     EXISTS_P,       "exists"},
 339         { 6, false,     STARTS_P,       "starts"},
 340         { 6, false,     STRICT_P,       "strict"},
 341         { 7, false,     CEILING_P,      "ceiling"},
 342         { 7, false,     UNKNOWN_P,      "unknown"},
 343         { 8, false,     KEYVALUE_P,     "keyvalue"},
 344         { 10,false, LIKE_REGEX_P, "like_regex"},
 345 };
 346
 347 /* Check if current scanstring value is a keyword */
 348 static enum yytokentype
 349 checkKeyword()
 350 {
 351         int                                             res = IDENT_P;
 352         int                                             diff;
 353         const JsonPathKeyword  *StopLow = keywords,
 354                                                    *StopHigh = keywords + lengthof(keywords),
 355                                                    *StopMiddle;
 356
 357         if (scanstring.len > keywords[lengthof(keywords) - 1].len)
 358                 return res;
 359
 360         while (StopLow < StopHigh)
 361         {
 362                 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
 363
 364                 if (StopMiddle->len == scanstring.len)
 365                         diff = pg_strncasecmp(StopMiddle->keyword, scanstring.val,
 366                                                                   scanstring.len);
 367                 else
 368                         diff = StopMiddle->len - scanstring.len;
 369
 370                 if (diff < 0)
 371                         StopLow = StopMiddle + 1;
 372                 else if (diff > 0)
 373                         StopHigh = StopMiddle;
 374                 else
 375                 {
 376                         if (StopMiddle->lowercase)
 377                                 diff = strncmp(StopMiddle->keyword, scanstring.val,
 378                                                            scanstring.len);
 379
 380                         if (diff == 0)
 381                                 res = StopMiddle->val;
 382
 383                         break;
 384                 }
 385         }
 386
 387         return res;
 388 }
 389
 390 /*
 391  * Called before any actual parsing is done
 392  */
 393 static void
 394 jsonpath_scanner_init(const char *str, int slen)
 395 {
 396         if (slen <= 0)
 397                 slen = strlen(str);
 398
 399         /*
 400          * Might be left over after ereport()
 401          */
 402         yy_init_globals();
 403
 404         /*
 405          * Make a scan buffer with special termination needed by flex.
 406          */
 407
 408         scanbuflen = slen;
 409         scanbuf = palloc(slen + 2);
 410         memcpy(scanbuf, str, slen);
 411         scanbuf[slen] = scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
 412         scanbufhandle = yy_scan_buffer(scanbuf, slen + 2);
 413
 414         BEGIN(INITIAL);
 415 }
 416
 417
 418 /*
 419  * Called after parsing is done to clean up after jsonpath_scanner_init()
 420  */
 421 static void
 422 jsonpath_scanner_finish(void)
 423 {
 424         yy_delete_buffer(scanbufhandle);
 425         pfree(scanbuf);
 426 }
 427
 428 /*
 429  * Resize scanstring so that it can append string of given length.
 430  * Reinitialize if required.
 431  */
 432 static void
 433 resizeString(bool init, int appendLen)
 434 {
 435         if (init)
 436         {
 437                 scanstring.total = Max(32, appendLen);
 438                 scanstring.val = (char *) palloc(scanstring.total);
 439                 scanstring.len = 0;
 440         }
 441         else
 442         {
 443                 if (scanstring.len + appendLen >= scanstring.total)
 444                 {
 445                         while (scanstring.len + appendLen >= scanstring.total)
 446                                 scanstring.total *= 2;
 447                         scanstring.val = repalloc(scanstring.val, scanstring.total);
 448                 }
 449         }
 450 }
 451
 452 /* Add set of bytes at "s" of length "l" to scanstring */
 453 static void
 454 addstring(bool init, char *s, int l)
 455 {
 456         resizeString(init, l + 1);
 457         memcpy(scanstring.val + scanstring.len, s, l);
 458         scanstring.len += l;
 459 }
 460
 461 /* Add single byte "c" to scanstring */
 462 static void
 463 addchar(bool init, char c)
 464 {
 465         resizeString(init, 1);
 466         scanstring.val[scanstring.len] = c;
 467         if (c != '\0')
 468                 scanstring.len++;
 469 }
 470
 471 /* Interface to jsonpath parser */
 472 JsonPathParseResult *
 473 parsejsonpath(const char *str, int len)
 474 {
 475         JsonPathParseResult     *parseresult;
 476
 477         jsonpath_scanner_init(str, len);
 478
 479         if (jsonpath_yyparse((void *) &parseresult) != 0)
 480                 jsonpath_yyerror(NULL, "bogus input"); /* shouldn't happen */
 481
 482         jsonpath_scanner_finish();
 483
 484         return parseresult;
 485 }
 486
 487 /* Turn hex character into integer */
 488 static int
 489 hexval(char c)
 490 {
 491         if (c >= '0' && c <= '9')
 492                 return c - '0';
 493         if (c >= 'a' && c <= 'f')
 494                 return c - 'a' + 0xA;
 495         if (c >= 'A' && c <= 'F')
 496                 return c - 'A' + 0xA;
 497         jsonpath_yyerror(NULL, "invalid hexadecimal digit");
 498         return 0; /* not reached */
 499 }
 500
 501 /* Add given unicode character to scanstring */
 502 static void
 503 addUnicodeChar(int ch)
 504 {
 505         /*
 506          * For UTF8, replace the escape sequence by the actual
 507          * utf8 character in lex->strval. Do this also for other
 508          * encodings if the escape designates an ASCII character,
 509          * otherwise raise an error.
 510          */
 511
 512         if (ch == 0)
 513         {
 514                 /* We can't allow this, since our TEXT type doesn't */
 515                 ereport(ERROR,
 516                                 (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
 517                                  errmsg("unsupported Unicode escape sequence"),
 518                                   errdetail("\\u0000 cannot be converted to text.")));
 519         }
 520         else if (GetDatabaseEncoding() == PG_UTF8)
 521         {
 522                 char utf8str[5];
 523                 int utf8len;
 524
 525                 unicode_to_utf8(ch, (unsigned char *) utf8str);
 526                 utf8len = pg_utf_mblen((unsigned char *) utf8str);
 527                 addstring(false, utf8str, utf8len);
 528         }
 529         else if (ch <= 0x007f)
 530         {
 531                 /*
 532                  * This is the only way to designate things like a
 533                  * form feed character in JSON, so it's useful in all
 534                  * encodings.
 535                  */
 536                 addchar(false, (char) ch);
 537         }
 538         else
 539         {
 540                 ereport(ERROR,
 541                                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 542                                  errmsg("invalid input syntax for type %s", "jsonpath"),
 543                                  errdetail("Unicode escape values cannot be used for code "
 544                                                    "point values above 007F when the server encoding "
 545                                                    "is not UTF8.")));
 546         }
 547 }
 548
 549 /* Add unicode character and process its hi surrogate */
 550 static void
 551 addUnicode(int ch, int *hi_surrogate)
 552 {
 553         if (ch >= 0xd800 && ch <= 0xdbff)
 554         {
 555                 if (*hi_surrogate != -1)
 556                         ereport(ERROR,
 557                                         (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 558                                          errmsg("invalid input syntax for type %s", "jsonpath"),
 559                                          errdetail("Unicode high surrogate must not follow "
 560                                                            "a high surrogate.")));
 561                 *hi_surrogate = (ch & 0x3ff) << 10;
 562                 return;
 563         }
 564         else if (ch >= 0xdc00 && ch <= 0xdfff)
 565         {
 566                 if (*hi_surrogate == -1)
 567                         ereport(ERROR,
 568                                         (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 569                                          errmsg("invalid input syntax for type %s", "jsonpath"),
 570                                          errdetail("Unicode low surrogate must follow a high "
 571                                                            "surrogate.")));
 572                 ch = 0x10000 + *hi_surrogate + (ch & 0x3ff);
 573                 *hi_surrogate = -1;
 574         }
 575         else if (*hi_surrogate != -1)
 576         {
 577                 ereport(ERROR,
 578                                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 579                                  errmsg("invalid input syntax for type %s", "jsonpath"),
 580                                  errdetail("Unicode low surrogate must follow a high "
 581                                                    "surrogate.")));
 582         }
 583
 584         addUnicodeChar(ch);
 585 }
 586
 587 /*
 588  * parseUnicode was adopted from json_lex_string() in
 589  * src/backend/utils/adt/json.c
 590  */
 591 static void
 592 parseUnicode(char *s, int l)
 593 {
 594         int                     i = 2;
 595         int                     hi_surrogate = -1;
 596
 597         for (i = 2; i < l; i += 2)      /* skip '\u' */
 598         {
 599                 int                     ch = 0;
 600                 int                     j;
 601
 602                 if (s[i] == '{')        /* parse '\u{XX...}' */
 603                 {
 604                         while (s[++i] != '}' && i < l)
 605                                 ch = (ch << 4) | hexval(s[i]);
 606                         i++;    /* ski p '}' */
 607                 }
 608                 else            /* parse '\uXXXX' */
 609                 {
 610                         for (j = 0; j < 4 && i < l; j++)
 611                                 ch = (ch << 4) | hexval(s[i++]);
 612                 }
 613
 614                 addUnicode(ch, &hi_surrogate);
 615         }
 616
 617         if (hi_surrogate != -1)
 618         {
 619                 ereport(ERROR,
 620                                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 621                                  errmsg("invalid input syntax for type %s", "jsonpath"),
 622                                  errdetail("Unicode low surrogate must follow a high "
 623                                                    "surrogate.")));
 624         }
 625 }
 626
 627 /* Parse sequence of hex-encoded characters */
 628 static void
 629 parseHexChar(char *s)
 630 {
 631         int                     ch = (hexval(s[2]) << 4) |
 632                                           hexval(s[3]);
 633
 634         addUnicodeChar(ch);
 635 }
 636
 637 /*
 638  * Interface functions to make flex use palloc() instead of malloc().
 639  * It'd be better to make these static, but flex insists otherwise.
 640  */
 641
 642 void *
 643 jsonpath_yyalloc(yy_size_t bytes)
 644 {
 645         return palloc(bytes);
 646 }
 647
 648 void *
 649 jsonpath_yyrealloc(void *ptr, yy_size_t bytes)
 650 {
 651         if (ptr)
 652                 return repalloc(ptr, bytes);
 653         else
 654                 return palloc(bytes);
 655 }
 656
 657 void
 658 jsonpath_yyfree(void *ptr)
 659 {
 660         if (ptr)
 661                 pfree(ptr);
 662 }