granicus.if.org Git - postgresql/blob - src/backend/utils/adt/jsonpath_scan.l

   1 %{
   2 /*-------------------------------------------------------------------------
   3  *
   4  * jsonpath_scan.l
   5  *      Lexical parser for jsonpath datatype
   6  *
   7  * Splits jsonpath string into tokens represented as JsonPathString structs.
   8  * Decodes unicode and hex escaped strings.
   9  *
  10  * Copyright (c) 2019, PostgreSQL Global Development Group
  11  *
  12  * IDENTIFICATION
  13  *      src/backend/utils/adt/jsonpath_scan.l
  14  *
  15  *-------------------------------------------------------------------------
  16  */
  17
  18 #include "postgres.h"
  19
  20 #include "mb/pg_wchar.h"
  21 #include "nodes/pg_list.h"
  22
     /*
      * Text of the token currently being assembled.  It is filled in by
      * addstring()/addchar() and copied to yylval->str by the rule actions.
      */
  23 static JsonPathString scanstring;
  24
  25 /* Handles to the buffer that the lexer uses internally */
  26 static YY_BUFFER_STATE scanbufhandle;
  27 static char *scanbuf;
  28 static int      scanbuflen;
  29
     /* Forward declarations of the helpers called from the rule actions */
  30 static void addstring(bool init, char *s, int l);
  31 static void addchar(bool init, char s);
  32 static enum yytokentype checkKeyword(void);
  33 static void parseUnicode(char *s, int l);
  34 static void parseHexChar(char *s);
  35
  36 /* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
  37 #undef fprintf
  38 #define fprintf(file, fmt, msg)  fprintf_to_ereport(fmt, msg)
  39
  40 static void
  41 fprintf_to_ereport(const char *fmt, const char *msg)
  42 {
  43         ereport(ERROR, (errmsg_internal("%s", msg)));
  44 }
  45
  46 %}
  47
     /*
      * Scanner options.
      *
      * The jsonpath_yy prefix is why the externally visible functions defined
      * in this file are spelled jsonpath_yyerror(), jsonpath_yyalloc() and so
      * on.  The noyyalloc/noyyrealloc/noyyfree options go together with the
      * palloc-based jsonpath_yyalloc(), jsonpath_yyrealloc() and
      * jsonpath_yyfree() defined at the end of this file.
      *
      * NOTE(review): bison-bridge is presumably what makes yylval a pointer in
      * the rule actions (they use yylval->str) -- confirm against the flex
      * manual.
      */
  48 %option 8bit
  49 %option never-interactive
  50 %option nodefault
  51 %option noinput
  52 %option nounput
  53 %option noyywrap
  54 %option warn
  55 %option prefix="jsonpath_yy"
  56 %option bison-bridge
  57 %option noyyalloc
  58 %option noyyrealloc
  59 %option noyyfree
  60
  61 /*
  62  * We use exclusive states for quoted and non-quoted strings,
  63  * quoted variable names and C-style comments.
  64  * Exclusive states:
  65  *  <xq> - quoted strings
  66  *  <xnq> - non-quoted strings
  67  *  <xvq> - quoted variable names
  68  *  <xc> - C-style comment
  69  */
  70
  71 %x xq
  72 %x xnq
  73 %x xvq
  74 %x xc
  75
     /*
      * Named patterns used by the rules.
      *
      * special      - punctuation; outside the exclusive states each such
      *                character is returned to the parser as itself
      * blank        - whitespace; skipped between tokens, ends a non-quoted
      *                string
      * integer, decimal, real
      *              - numeric literals (no leading zeros in the integer part)
      * decimalfail  - integer followed by a bare dot; the rule gives the dot
      *                back and returns the integer
      * realfail1, realfail2
      *              - a number whose exponent has no digits; reported as an
      *                invalid floating point number
      * unicode      - backslash-u followed by four hex digits, or by one to
      *                six hex digits in braces
      * hex_char     - backslash-x followed by two hex digits
      * unicodefail, hex_fail
      *              - truncated forms of the two escapes above; reported as
      *                errors
      */
  76 special         [\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/]
  77 blank           [ \t\n\r\f]
  78 /* "other" means anything that's not special, blank, or '\' or '"' */
  79 other           [^\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/\\\" \t\n\r\f]
  80
  81 digit           [0-9]
  82 integer         (0|[1-9]{digit}*)
  83 decimal         {integer}\.{digit}+
  84 decimalfail     {integer}\.
  85 real            ({integer}|{decimal})[Ee][-+]?{digit}+
  86 realfail1       ({integer}|{decimal})[Ee]
  87 realfail2       ({integer}|{decimal})[Ee][-+]
  88
  89 hex_dig         [0-9A-Fa-f]
  90 unicode         \\u({hex_dig}{4}|\{{hex_dig}{1,6}\})
  91 unicodefail     \\u({hex_dig}{0,3}|\{{hex_dig}{0,6})
  92 hex_char        \\x{hex_dig}{2}
  93 hex_fail        \\x{hex_dig}{0,1}
  94
  95 %%
  96
     /*
      * Non-quoted strings (state xnq).
      *
      * Runs of ordinary characters are appended to scanstring.  Whitespace, a
      * special character, a double quote or end of input finishes the token:
      * scanstring is handed over in yylval->str, the scanner goes back to
      * INITIAL, and checkKeyword() decides whether the text is a keyword or a
      * plain identifier (IDENT_P).  For a special character or quote,
      * yyless(0) gives the character back so that it is scanned again in
      * INITIAL.
      */
  97 <xnq>{other}+                                   {
  98                                                                         addstring(false, yytext, yyleng);
  99                                                                 }
 100
 101 <xnq>{blank}+                                   {
 102                                                                         yylval->str = scanstring;
 103                                                                         BEGIN INITIAL;
 104                                                                         return checkKeyword();
 105                                                                 }
 106
     /*
      * NOTE(review): unlike the other xnq terminators, this action stores
      * scanstring in yylval but returns no token before entering the comment
      * state -- confirm that dropping the pending string here is intended.
      */
 107 <xnq>\/\*                                               {
 108                                                                         yylval->str = scanstring;
 109                                                                         BEGIN xc;
 110                                                                 }
 111
 112 <xnq>({special}|\")                             {
 113                                                                         yylval->str = scanstring;
 114                                                                         yyless(0);
 115                                                                         BEGIN INITIAL;
 116                                                                         return checkKeyword();
 117                                                                 }
 118
 119 <xnq><<EOF>>                                    {
 120                                                                         yylval->str = scanstring;
 121                                                                         BEGIN INITIAL;
 122                                                                         return checkKeyword();
 123                                                                 }
 124
     /*
      * Backslash escapes, shared by non-quoted strings, quoted strings and
      * quoted variable names.  Each action appends the decoded character(s)
      * to scanstring.  Consecutive unicode escapes are passed to
      * parseUnicode() in one call.  Truncated unicode or hex escapes are
      * reported as errors.  A backslash followed by any other character
      * matched by "." stands for that character itself, and a lone backslash
      * is an error.  Reaching end of input inside a quoted string or quoted
      * variable name is an error as well.
      */
 125 <xnq,xq,xvq>\\b                         { addchar(false, '\b'); }
 126
 127 <xnq,xq,xvq>\\f                         { addchar(false, '\f'); }
 128
 129 <xnq,xq,xvq>\\n                         { addchar(false, '\n'); }
 130
 131 <xnq,xq,xvq>\\r                         { addchar(false, '\r'); }
 132
 133 <xnq,xq,xvq>\\t                         { addchar(false, '\t'); }
 134
 135 <xnq,xq,xvq>\\v                         { addchar(false, '\v'); }
 136
 137 <xnq,xq,xvq>{unicode}+          { parseUnicode(yytext, yyleng); }
 138
 139 <xnq,xq,xvq>{hex_char}          { parseHexChar(yytext); }
 140
 141 <xnq,xq,xvq>{unicode}*{unicodefail}     { yyerror(NULL, "invalid unicode sequence"); }
 142
 143 <xnq,xq,xvq>{hex_fail}          { yyerror(NULL, "invalid hex character sequence"); }
 144
 145 <xnq,xq,xvq>{unicode}+\\        {
 146                                                                 /* throw back the \\, and treat as unicode */
 147                                                                 yyless(yyleng - 1);
 148                                                                 parseUnicode(yytext, yyleng);
 149                                                         }
 150
 151 <xnq,xq,xvq>\\.                         { addchar(false, yytext[1]); }
 152
 153 <xnq,xq,xvq>\\                          { yyerror(NULL, "unexpected end after backslash"); }
 154
 155 <xq,xvq><<EOF>>                         { yyerror(NULL, "unexpected end of quoted string"); }
 156
     /*
      * End and body of quoted text, and the comment state.
      *
      * The closing double quote returns the accumulated scanstring as
      * STRING_P (state xq) or VARIABLE_P (state xvq) and goes back to
      * INITIAL.  Any run of characters other than backslash and double quote
      * is appended to scanstring unchanged.
      *
      * Inside a comment (state xc) everything is discarded until the closing
      * delimiter, which switches back to INITIAL.  End of input inside a
      * comment is an error.
      */
 157 <xq>\"                                                  {
 158                                                                         yylval->str = scanstring;
 159                                                                         BEGIN INITIAL;
 160                                                                         return STRING_P;
 161                                                                 }
 162
 163 <xvq>\"                                                 {
 164                                                                         yylval->str = scanstring;
 165                                                                         BEGIN INITIAL;
 166                                                                         return VARIABLE_P;
 167                                                                 }
 168
 169 <xq,xvq>[^\\\"]+                                { addstring(false, yytext, yyleng); }
 170
 171 <xc>\*\/                                                { BEGIN INITIAL; }
 172
 173 <xc>[^\*]+                                              { }
 174
 175 <xc>\*                                                  { }
 176
 177 <xc><<EOF>>                                             { yyerror(NULL, "unexpected end of comment"); }
 178
     /*
      * Rules for the INITIAL state: operators, variables and punctuation.
      *
      * Multi-character operators get their own tokens; note that both the
      * less-than/greater-than pair and the bang-equals pair return
      * NOTEQUAL_P.  A dollar sign followed by ordinary characters returns
      * VARIABLE_P carrying the text after the dollar sign; a dollar sign
      * followed by a double quote starts a quoted variable name (state xvq)
      * with an empty scanstring.  Any other special character is returned as
      * itself, whitespace is skipped, and the comment opener resets
      * scanstring and enters state xc.
      */
 179 \&\&                                                    { return AND_P; }
 180
 181 \|\|                                                    { return OR_P; }
 182
 183 \!                                                              { return NOT_P; }
 184
 185 \*\*                                                    { return ANY_P; }
 186
 187 \<                                                              { return LESS_P; }
 188
 189 \<\=                                                    { return LESSEQUAL_P; }
 190
 191 \=\=                                                    { return EQUAL_P; }
 192
 193 \<\>                                                    { return NOTEQUAL_P; }
 194
 195 \!\=                                                    { return NOTEQUAL_P; }
 196
 197 \>\=                                                    { return GREATEREQUAL_P; }
 198
 199 \>                                                              { return GREATER_P; }
 200
 201 \${other}+                                              {
 202                                                                         addstring(true, yytext + 1, yyleng - 1);
 203                                                                         addchar(false, '\0');
 204                                                                         yylval->str = scanstring;
 205                                                                         return VARIABLE_P;
 206                                                                 }
 207
 208 \$\"                                                    {
 209                                                                         addchar(true, '\0');
 210                                                                         BEGIN xvq;
 211                                                                 }
 212
 213 {special}                                               { return *yytext; }
 214
 215 {blank}+                                                { /* ignore */ }
 216
 217 \/\*                                                    {
 218                                                                         addchar(true, '\0');
 219                                                                         BEGIN xc;
 220                                                                 }
 221
     /*
      * Rules for the INITIAL state: numbers and the start of strings.
      *
      * real and decimal literals return NUMERIC_P and integer literals return
      * INT_P; in each case the matched text is copied into a fresh scanstring
      * followed by a terminating NUL that is not counted in its length.  An
      * integer followed by a bare dot gives the dot back and is returned as
      * INT_P.  A number with an incomplete exponent is an error.
      *
      * A double quote starts a quoted string (state xq) with an empty
      * scanstring.  A backslash or an ordinary character starts a non-quoted
      * string (state xnq); the backslash is given back with yyless(0) so
      * that the escape rules of state xnq decode it.  End of input in
      * INITIAL terminates the scan.
      */
 222 {real}                                                  {
 223                                                                         addstring(true, yytext, yyleng);
 224                                                                         addchar(false, '\0');
 225                                                                         yylval->str = scanstring;
 226                                                                         return NUMERIC_P;
 227                                                                 }
 228
 229 {decimal}                                               {
 230                                                                         addstring(true, yytext, yyleng);
 231                                                                         addchar(false, '\0');
 232                                                                         yylval->str = scanstring;
 233                                                                         return NUMERIC_P;
 234                                                                 }
 235
 236 {integer}                                               {
 237                                                                         addstring(true, yytext, yyleng);
 238                                                                         addchar(false, '\0');
 239                                                                         yylval->str = scanstring;
 240                                                                         return INT_P;
 241                                                                 }
 242
 243 {decimalfail}                                   {
 244                                                                         /* throw back the ., and treat as integer */
 245                                                                         yyless(yyleng - 1);
 246                                                                         addstring(true, yytext, yyleng);
 247                                                                         addchar(false, '\0');
 248                                                                         yylval->str = scanstring;
 249                                                                         return INT_P;
 250                                                                 }
 251
 252 ({realfail1}|{realfail2})               { yyerror(NULL, "invalid floating point number"); }
 253
 254 \"                                                              {
 255                                                                         addchar(true, '\0');
 256                                                                         BEGIN xq;
 257                                                                 }
 258
 259 \\                                                              {
 260                                                                         yyless(0);
 261                                                                         addchar(true, '\0');
 262                                                                         BEGIN xnq;
 263                                                                 }
 264
 265 {other}+                                                {
 266                                                                         addstring(true, yytext, yyleng);
 267                                                                         BEGIN xnq;
 268                                                                 }
 269
 270 <<EOF>>                                                 { yyterminate(); }
 271
 272 %%
 273
 274 void
 275 jsonpath_yyerror(JsonPathParseResult **result, const char *message)
 276 {
 277         if (*yytext == YY_END_OF_BUFFER_CHAR)
 278         {
 279                 ereport(ERROR,
 280                                 (errcode(ERRCODE_SYNTAX_ERROR),
 281                                  /* translator: %s is typically "syntax error" */
 282                                  errmsg("%s at end of jsonpath input", _(message))));
 283         }
 284         else
 285         {
 286                 ereport(ERROR,
 287                                 (errcode(ERRCODE_SYNTAX_ERROR),
 288                                  /* translator: first %s is typically "syntax error" */
 289                                  errmsg("%s at or near \"%s\" of jsonpath input",
 290                                                 _(message), yytext)));
 291         }
 292 }
 293
     /*
      * One entry of the keyword table searched by checkKeyword().
      */
 294 typedef struct JsonPathKeyword
 295 {
     /* length of "keyword" in bytes; compared against scanstring.len */
 296         int16           len;
     /* if true, the keyword matches only when spelled exactly as in "keyword" */
 297         bool            lowercase;
     /* token value that checkKeyword() returns on a match */
 298         int                     val;
     /* keyword text */
 299         const char *keyword;
 300 } JsonPathKeyword;
 301
 302 /*
 303  * Array of key words should be sorted by length and then
 304  * alphabetical order
 305  */
 306 static const JsonPathKeyword keywords[] = {
 307         { 2, false,     IS_P,           "is"},
 308         { 2, false,     TO_P,           "to"},
 309         { 3, false,     ABS_P,          "abs"},
 310         { 3, false,     LAX_P,          "lax"},
 311         { 4, false,     FLAG_P,         "flag"},
 312         { 4, false,     LAST_P,         "last"},
 313         { 4, true,      NULL_P,         "null"},
 314         { 4, false,     SIZE_P,         "size"},
 315         { 4, true,      TRUE_P,         "true"},
 316         { 4, false,     TYPE_P,         "type"},
 317         { 4, false,     WITH_P,         "with"},
 318         { 5, true,      FALSE_P,        "false"},
 319         { 5, false,     FLOOR_P,        "floor"},
 320         { 6, false,     DOUBLE_P,       "double"},
 321         { 6, false,     EXISTS_P,       "exists"},
 322         { 6, false,     STARTS_P,       "starts"},
 323         { 6, false,     STRICT_P,       "strict"},
 324         { 7, false,     CEILING_P,      "ceiling"},
 325         { 7, false,     UNKNOWN_P,      "unknown"},
 326         { 8, false,     KEYVALUE_P,     "keyvalue"},
 327         { 10,false, LIKE_REGEX_P, "like_regex"},
 328 };
 329
 330 /* Check if current scanstring value is a keyword */
 331 static enum yytokentype
 332 checkKeyword()
 333 {
 334         int                                             res = IDENT_P;
 335         int                                             diff;
 336         const JsonPathKeyword  *StopLow = keywords,
 337                                                    *StopHigh = keywords + lengthof(keywords),
 338                                                    *StopMiddle;
 339
 340         if (scanstring.len > keywords[lengthof(keywords) - 1].len)
 341                 return res;
 342
 343         while (StopLow < StopHigh)
 344         {
 345                 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
 346
 347                 if (StopMiddle->len == scanstring.len)
 348                         diff = pg_strncasecmp(StopMiddle->keyword, scanstring.val,
 349                                                                   scanstring.len);
 350                 else
 351                         diff = StopMiddle->len - scanstring.len;
 352
 353                 if (diff < 0)
 354                         StopLow = StopMiddle + 1;
 355                 else if (diff > 0)
 356                         StopHigh = StopMiddle;
 357                 else
 358                 {
 359                         if (StopMiddle->lowercase)
 360                                 diff = strncmp(StopMiddle->keyword, scanstring.val,
 361                                                            scanstring.len);
 362
 363                         if (diff == 0)
 364                                 res = StopMiddle->val;
 365
 366                         break;
 367                 }
 368         }
 369
 370         return res;
 371 }
 372
 373 /*
 374  * Called before any actual parsing is done
 375  */
 376 static void
 377 jsonpath_scanner_init(const char *str, int slen)
 378 {
 379         if (slen <= 0)
 380                 slen = strlen(str);
 381
 382         /*
 383          * Might be left over after ereport()
 384          */
 385         yy_init_globals();
 386
 387         /*
 388          * Make a scan buffer with special termination needed by flex.
 389          */
 390
 391         scanbuflen = slen;
 392         scanbuf = palloc(slen + 2);
 393         memcpy(scanbuf, str, slen);
 394         scanbuf[slen] = scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
 395         scanbufhandle = yy_scan_buffer(scanbuf, slen + 2);
 396
 397         BEGIN(INITIAL);
 398 }
 399
 400
 401 /*
 402  * Called after parsing is done to clean up after jsonpath_scanner_init()
 403  */
 404 static void
 405 jsonpath_scanner_finish(void)
 406 {
     /* drop the flex buffer handle created by yy_scan_buffer() ... */
 407         yy_delete_buffer(scanbufhandle);
     /* ... and then the palloc'd copy of the input that it was scanning */
 408         pfree(scanbuf);
 409 }
 410
 411 /*
 412  * Resize scanstring so that it can append string of given length.
 413  * Reinitialize if required.
 414  */
 415 static void
 416 resizeString(bool init, int appendLen)
 417 {
 418         if (init)
 419         {
 420                 scanstring.total = Max(32, appendLen);
 421                 scanstring.val = (char *) palloc(scanstring.total);
 422                 scanstring.len = 0;
 423         }
 424         else
 425         {
 426                 if (scanstring.len + appendLen >= scanstring.total)
 427                 {
 428                         while (scanstring.len + appendLen >= scanstring.total)
 429                                 scanstring.total *= 2;
 430                         scanstring.val = repalloc(scanstring.val, scanstring.total);
 431                 }
 432         }
 433 }
 434
 435 /* Add set of bytes at "s" of length "l" to scanstring */
 436 static void
 437 addstring(bool init, char *s, int l)
 438 {
 439         resizeString(init, l + 1);
 440         memcpy(scanstring.val + scanstring.len, s, l);
 441         scanstring.len += l;
 442 }
 443
 444 /* Add single byte "c" to scanstring */
 445 static void
 446 addchar(bool init, char c)
 447 {
 448         resizeString(init, 1);
 449         scanstring.val[scanstring.len] = c;
 450         if (c != '\0')
 451                 scanstring.len++;
 452 }
 453
 454 /* Interface to jsonpath parser */
 455 JsonPathParseResult *
 456 parsejsonpath(const char *str, int len)
 457 {
 458         JsonPathParseResult     *parseresult;
 459
 460         jsonpath_scanner_init(str, len);
 461
 462         if (jsonpath_yyparse((void *) &parseresult) != 0)
 463                 jsonpath_yyerror(NULL, "bogus input"); /* shouldn't happen */
 464
 465         jsonpath_scanner_finish();
 466
 467         return parseresult;
 468 }
 469
 470 /* Turn hex character into integer */
 471 static int
 472 hexval(char c)
 473 {
 474         if (c >= '0' && c <= '9')
 475                 return c - '0';
 476         if (c >= 'a' && c <= 'f')
 477                 return c - 'a' + 0xA;
 478         if (c >= 'A' && c <= 'F')
 479                 return c - 'A' + 0xA;
 480         jsonpath_yyerror(NULL, "invalid hexadecimal digit");
 481         return 0; /* not reached */
 482 }
 483
 484 /* Add given unicode character to scanstring */
 485 static void
 486 addUnicodeChar(int ch)
 487 {
 488         /*
 489          * For UTF8, replace the escape sequence by the actual
 490          * utf8 character in lex->strval. Do this also for other
 491          * encodings if the escape designates an ASCII character,
 492          * otherwise raise an error.
 493          */
 494
 495         if (ch == 0)
 496         {
 497                 /* We can't allow this, since our TEXT type doesn't */
 498                 ereport(ERROR,
 499                                 (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
 500                                  errmsg("unsupported Unicode escape sequence"),
 501                                   errdetail("\\u0000 cannot be converted to text.")));
 502         }
 503         else if (GetDatabaseEncoding() == PG_UTF8)
 504         {
 505                 char utf8str[5];
 506                 int utf8len;
 507
 508                 unicode_to_utf8(ch, (unsigned char *) utf8str);
 509                 utf8len = pg_utf_mblen((unsigned char *) utf8str);
 510                 addstring(false, utf8str, utf8len);
 511         }
 512         else if (ch <= 0x007f)
 513         {
 514                 /*
 515                  * This is the only way to designate things like a
 516                  * form feed character in JSON, so it's useful in all
 517                  * encodings.
 518                  */
 519                 addchar(false, (char) ch);
 520         }
 521         else
 522         {
 523                 ereport(ERROR,
 524                                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 525                                  errmsg("invalid input syntax for type %s", "jsonpath"),
 526                                  errdetail("Unicode escape values cannot be used for code "
 527                                                    "point values above 007F when the server encoding "
 528                                                    "is not UTF8.")));
 529         }
 530 }
 531
 532 /* Add unicode character and process its hi surrogate */
 533 static void
 534 addUnicode(int ch, int *hi_surrogate)
 535 {
 536         if (ch >= 0xd800 && ch <= 0xdbff)
 537         {
 538                 if (*hi_surrogate != -1)
 539                         ereport(ERROR,
 540                                         (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 541                                          errmsg("invalid input syntax for type %s", "jsonpath"),
 542                                          errdetail("Unicode high surrogate must not follow "
 543                                                            "a high surrogate.")));
 544                 *hi_surrogate = (ch & 0x3ff) << 10;
 545                 return;
 546         }
 547         else if (ch >= 0xdc00 && ch <= 0xdfff)
 548         {
 549                 if (*hi_surrogate == -1)
 550                         ereport(ERROR,
 551                                         (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 552                                          errmsg("invalid input syntax for type %s", "jsonpath"),
 553                                          errdetail("Unicode low surrogate must follow a high "
 554                                                            "surrogate.")));
 555                 ch = 0x10000 + *hi_surrogate + (ch & 0x3ff);
 556                 *hi_surrogate = -1;
 557         }
 558         else if (*hi_surrogate != -1)
 559         {
 560                 ereport(ERROR,
 561                                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 562                                  errmsg("invalid input syntax for type %s", "jsonpath"),
 563                                  errdetail("Unicode low surrogate must follow a high "
 564                                                    "surrogate.")));
 565         }
 566
 567         addUnicodeChar(ch);
 568 }
 569
 570 /*
 571  * parseUnicode was adopted from json_lex_string() in
 572  * src/backend/utils/adt/json.c
 573  */
 574 static void
 575 parseUnicode(char *s, int l)
 576 {
 577         int                     i = 2;
 578         int                     hi_surrogate = -1;
 579
 580         for (i = 2; i < l; i += 2)      /* skip '\u' */
 581         {
 582                 int                     ch = 0;
 583                 int                     j;
 584
 585                 if (s[i] == '{')        /* parse '\u{XX...}' */
 586                 {
 587                         while (s[++i] != '}' && i < l)
 588                                 ch = (ch << 4) | hexval(s[i]);
 589                         i++;    /* skip '}' */
 590                 }
 591                 else            /* parse '\uXXXX' */
 592                 {
 593                         for (j = 0; j < 4 && i < l; j++)
 594                                 ch = (ch << 4) | hexval(s[i++]);
 595                 }
 596
 597                 addUnicode(ch, &hi_surrogate);
 598         }
 599
 600         if (hi_surrogate != -1)
 601         {
 602                 ereport(ERROR,
 603                                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 604                                  errmsg("invalid input syntax for type %s", "jsonpath"),
 605                                  errdetail("Unicode low surrogate must follow a high "
 606                                                    "surrogate.")));
 607         }
 608 }
 609
 610 /* Parse sequence of hex-encoded characters */
 611 static void
 612 parseHexChar(char *s)
 613 {
 614         int                     ch = (hexval(s[2]) << 4) |
 615                                           hexval(s[3]);
 616
 617         addUnicodeChar(ch);
 618 }
 619
 620 /*
 621  * Interface functions to make flex use palloc() instead of malloc().
 622  * It'd be better to make these static, but flex insists otherwise.
 623  */
 624
 625 void *
 626 jsonpath_yyalloc(yy_size_t bytes)
 627 {
 628         return palloc(bytes);
 629 }
 630
 631 void *
 632 jsonpath_yyrealloc(void *ptr, yy_size_t bytes)
 633 {
 634         if (ptr)
 635                 return repalloc(ptr, bytes);
 636         else
 637                 return palloc(bytes);
 638 }
 639
 640 void
 641 jsonpath_yyfree(void *ptr)
 642 {
 643         if (ptr)
 644                 pfree(ptr);
 645 }