granicus.if.org Git - postgresql/blob - src/backend/utils/adt/jsonpath_scan.l

   1 /*-------------------------------------------------------------------------
   2  *
   3  * jsonpath_scan.l
   4  *      Lexical parser for jsonpath datatype
   5  *
   6  * Copyright (c) 2019, PostgreSQL Global Development Group
   7  *
   8  * IDENTIFICATION
   9  *      src/backend/utils/adt/jsonpath_scan.l
  10  *
  11  *-------------------------------------------------------------------------
  12  */
  13
  14 %{
  15 #include "postgres.h"
  16
  17 #include "mb/pg_wchar.h"
  18 #include "nodes/pg_list.h"
  19 #include "utils/jsonpath_scanner.h"
  20
/* Buffer that accumulates the text of the token currently being scanned */
static string scanstring;

/* No reason to constrain amount of data slurped */
/* #define YY_READ_BUF_SIZE 16777216 */

/*
 * Handles to the buffer that the lexer uses internally.  They are set up by
 * jsonpath_scanner_init() and released by jsonpath_scanner_finish().
 */
static YY_BUFFER_STATE scanbufhandle;
static char *scanbuf;
static int      scanbuflen;

/*
 * Helpers that append to scanstring.  Passing init = true starts a fresh
 * buffer before anything is appended.
 */
static void addstring(bool init, char *s, int l);
static void addchar(bool init, char s);
static int checkSpecialVal(void); /* examine scanstring for the special
                                                                   * value */

/* Decoders for the \u... and \xXX escape runs matched by the rules below */
static void parseUnicode(char *s, int l);
static void parseHexChars(char *s, int l);

/* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
#undef fprintf
#define fprintf(file, fmt, msg)  fprintf_to_ereport(fmt, msg)
  42
  43 static void
  44 fprintf_to_ereport(const char *fmt, const char *msg)
  45 {
  46         ereport(ERROR, (errmsg_internal("%s", msg)));
  47 }
  48
/* Scanner actions call yyerror(); route those calls to jsonpath_yyerror() */
#define yyerror jsonpath_yyerror
  50 %}
  51
/*
 * Scanner options.  The prefix keeps the generated symbols in the
 * jsonpath_yy namespace.  noyyalloc/noyyrealloc/noyyfree tell flex not to
 * emit its own allocator routines; this file supplies jsonpath_yyalloc(),
 * jsonpath_yyrealloc() and jsonpath_yyfree() itself.
 */
%option 8bit
%option never-interactive
%option nodefault
%option noinput
%option nounput
%option noyywrap
%option warn
%option prefix="jsonpath_yy"
%option bison-bridge
%option noyyalloc
%option noyyrealloc
%option noyyfree

/*
 * Exclusive start conditions:
 *  xQUOTED       - inside a double-quoted string
 *  xNONQUOTED    - inside an unquoted identifier or keyword
 *  xVARQUOTED    - inside a double-quoted variable name ($"...")
 *  xSINGLEQUOTED - inside a single-quoted string
 *  xCOMMENT      - inside a C-style comment
 */
%x xQUOTED
%x xNONQUOTED
%x xVARQUOTED
%x xSINGLEQUOTED
%x xCOMMENT

/*
 * Named patterns:
 *  special  - punctuation returned to the parser as a one-character token
 *  any      - any character that is not special, a backslash, a quote or
 *             a blank
 *  blank    - whitespace
 *  hex_dig  - one hexadecimal digit
 *  unicode  - \uXXXX, or \u{X...} with one to six hexadecimal digits
 *  hex_char - \xXX
 */
special          [\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/]
any                     [^\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/\\\"\' \t\n\r\f]
blank           [ \t\n\r\f]
hex_dig         [0-9A-Fa-f]
unicode         \\u({hex_dig}{4}|\{{hex_dig}{1,6}\})
hex_char        \\x{hex_dig}{2}

  77
  78
  79 %%
  80
        /*
         * Operators made of one or two characters.  Where one operator is a
         * prefix of another (for example < and <=), the longest match wins.
         */
<INITIAL>\&\&                                   { return AND_P; }

<INITIAL>\|\|                                   { return OR_P; }

<INITIAL>\!                                             { return NOT_P; }

<INITIAL>\*\*                                   { return ANY_P; }

<INITIAL>\<                                             { return LESS_P; }

<INITIAL>\<\=                                   { return LESSEQUAL_P; }

<INITIAL>\=\=                                   { return EQUAL_P; }

<INITIAL>\<\>                                   { return NOTEQUAL_P; }

<INITIAL>\!\=                                   { return NOTEQUAL_P; }

<INITIAL>\>\=                                   { return GREATEREQUAL_P; }

<INITIAL>\>                                             { return GREATER_P; }

<INITIAL>\${any}+                               {
                                                                        /* unquoted variable: drop the leading '$', NUL-terminate */
                                                                        addstring(true, yytext + 1, yyleng - 1);
                                                                        addchar(false, '\0');
                                                                        yylval->str = scanstring;
                                                                        return VARIABLE_P;
                                                                }

<INITIAL>\$\"                                   {
                                                                        /* quoted variable: start an empty buffer and collect in xVARQUOTED */
                                                                        addchar(true, '\0');
                                                                        BEGIN xVARQUOTED;
                                                                }

        /* Any other punctuation character is its own token code */
<INITIAL>{special}                              { return *yytext; }

<INITIAL>{blank}+                               { /* ignore */ }

<INITIAL>\/\*                                   {
                                                                        /* comment opener; the comment body is skipped in xCOMMENT */
                                                                        addchar(true, '\0');
                                                                        BEGIN xCOMMENT;
                                                                }
 123
 124 <INITIAL>[0-9]+(\.[0-9]+)?[eE][+-]?[0-9]+  /* float */  {
 125                                                                         addstring(true, yytext, yyleng);
 126                                                                         addchar(false, '\0');
 127                                                                         yylval->str = scanstring;
 128                                                                         return NUMERIC_P;
 129                                                                 }
 130
 131 <INITIAL>\.[0-9]+[eE][+-]?[0-9]+  /* float */  {
 132                                                                         addstring(true, yytext, yyleng);
 133                                                                         addchar(false, '\0');
 134                                                                         yylval->str = scanstring;
 135                                                                         return NUMERIC_P;
 136                                                                 }
 137
 138 <INITIAL>([0-9]+)?\.[0-9]+              {
 139                                                                         addstring(true, yytext, yyleng);
 140                                                                         addchar(false, '\0');
 141                                                                         yylval->str = scanstring;
 142                                                                         return NUMERIC_P;
 143                                                                 }
 144
 145 <INITIAL>[0-9]+                                 {
 146                                                                         addstring(true, yytext, yyleng);
 147                                                                         addchar(false, '\0');
 148                                                                         yylval->str = scanstring;
 149                                                                         return INT_P;
 150                                                                 }
 151
<INITIAL>{any}+                                 {
                                                                        /* start of an unquoted identifier or keyword; keep collecting in xNONQUOTED */
                                                                        addstring(true, yytext, yyleng);
                                                                        BEGIN xNONQUOTED;
                                                                }

<INITIAL>\"                                             {
                                                                        /* opening double quote: start an empty buffer */
                                                                        addchar(true, '\0');
                                                                        BEGIN xQUOTED;
                                                                }

<INITIAL>\'                                             {
                                                                        /* opening single quote: start an empty buffer */
                                                                        addchar(true, '\0');
                                                                        BEGIN xSINGLEQUOTED;
                                                                }

<INITIAL>\\                                             {
                                                                        /*
                                                                         * An identifier may begin with an escape.  Push the
                                                                         * backslash back so the escape rules of xNONQUOTED
                                                                         * decode it.
                                                                         */
                                                                        yyless(0);
                                                                        addchar(true, '\0');
                                                                        BEGIN xNONQUOTED;
                                                                }

<xNONQUOTED>{any}+                              {
                                                                        /* more identifier characters */
                                                                        addstring(false, yytext, yyleng);
                                                                }

<xNONQUOTED>{blank}+                    {
                                                                        /* whitespace ends the identifier; classify it as keyword or IDENT_P */
                                                                        yylval->str = scanstring;
                                                                        BEGIN INITIAL;
                                                                        return checkSpecialVal();
                                                                }
 182
 183
 184 <xNONQUOTED>\/\*                                {
 185                                                                         yylval->str = scanstring;
 186                                                                         BEGIN xCOMMENT;
 187                                                                 }
 188
<xNONQUOTED>({special}|\"|\')   {
                                                                        /* punctuation or a quote ends the identifier; push it back for INITIAL to rescan */
                                                                        yylval->str = scanstring;
                                                                        yyless(0);
                                                                        BEGIN INITIAL;
                                                                        return checkSpecialVal();
                                                                }

<xNONQUOTED><<EOF>>                             {
                                                                        /* end of input ends the identifier */
                                                                        yylval->str = scanstring;
                                                                        BEGIN INITIAL;
                                                                        return checkSpecialVal();
                                                                }

        /*
         * Escape sequences, shared by identifiers and all quoted forms.
         * Rule order matters: the specific two-character escapes come before
         * the catch-all "\\." rule, which has the same match length and
         * therefore only fires when none of the earlier ones apply.
         */
<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\[\"\'\\] { addchar(false, yytext[1]); }

<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\b        { addchar(false, '\b'); }

<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\f        { addchar(false, '\f'); }

<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\n        { addchar(false, '\n'); }

<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\r        { addchar(false, '\r'); }

<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\t        { addchar(false, '\t'); }

<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\v        { addchar(false, '\v'); }

        /* A run of adjacent escapes is decoded in one call, so a surrogate pair is seen together */
<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>{unicode}+         { parseUnicode(yytext, yyleng); }

<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>{hex_char}+        { parseHexChars(yytext, yyleng); }

        /* "\x" or "\u" not followed by the required hex digits */
<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\x        { yyerror(NULL, "Hex character sequence is invalid"); }

<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\u        { yyerror(NULL, "Unicode sequence is invalid"); }

<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\.        { yyerror(NULL, "Escape sequence is invalid"); }

<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\         { yyerror(NULL, "Unexpected end after backslash"); }

<xQUOTED,xVARQUOTED,xSINGLEQUOTED><<EOF>>                       { yyerror(NULL, "Unexpected end of quoted string"); }
 229
<xQUOTED>\"                                             {
                                                                        /* closing double quote: the collected text is a string literal */
                                                                        yylval->str = scanstring;
                                                                        BEGIN INITIAL;
                                                                        return STRING_P;
                                                                }

<xVARQUOTED>\"                                  {
                                                                        /* closing quote of $"...": the collected text is a variable name */
                                                                        yylval->str = scanstring;
                                                                        BEGIN INITIAL;
                                                                        return VARIABLE_P;
                                                                }

<xSINGLEQUOTED>\'                               {
                                                                        /* closing single quote: the collected text is a string literal */
                                                                        yylval->str = scanstring;
                                                                        BEGIN INITIAL;
                                                                        return STRING_P;
                                                                }

        /* Ordinary characters inside quotes, up to the next backslash or closing quote */
<xQUOTED,xVARQUOTED>[^\\\"]+    { addstring(false, yytext, yyleng); }

<xSINGLEQUOTED>[^\\\']+                 { addstring(false, yytext, yyleng); }

<INITIAL><<EOF>>                                { yyterminate(); }

        /* Comment body: skip everything up to the closing star-slash */
<xCOMMENT>\*\/                                  { BEGIN INITIAL; }

<xCOMMENT>[^\*]+                                { }

<xCOMMENT>\*                                    { }

<xCOMMENT><<EOF>>                               { yyerror(NULL, "Unexpected end of comment"); }
 261
 262 %%
 263
 264 void
 265 jsonpath_yyerror(JsonPathParseResult **result, const char *message)
 266 {
 267         if (*yytext == YY_END_OF_BUFFER_CHAR)
 268         {
 269                 ereport(ERROR,
 270                                 (errcode(ERRCODE_SYNTAX_ERROR),
 271                                  errmsg("bad jsonpath representation"),
 272                                  /* translator: %s is typically "syntax error" */
 273                                  errdetail("%s at end of input", message)));
 274         }
 275         else
 276         {
 277                 ereport(ERROR,
 278                                 (errcode(ERRCODE_SYNTAX_ERROR),
 279                                  errmsg("bad jsonpath representation"),
 280                                  /* translator: first %s is typically "syntax error" */
 281                                  errdetail("%s at or near \"%s\"", message, yytext)));
 282         }
 283 }
 284
/* One entry of the keyword table searched by checkSpecialVal() */
typedef struct keyword
{
        int16           len;            /* length of the keyword text */
        bool            lowercase;      /* if true, matches only when spelled
                                                                 * exactly as listed; otherwise the
                                                                 * match is case-insensitive */
        int                     val;            /* token code returned on a match */
        const char      *keyword;       /* keyword text */
} keyword;

/*
 * The array of keywords must be sorted by length first and alphabetically
 * within each length: checkSpecialVal() binary-searches it using exactly
 * that ordering, and also relies on the last entry being the longest.
 */

static const keyword keywords[] = {
        { 2, false,     IS_P,           "is"},
        { 2, false,     TO_P,           "to"},
        { 3, false,     ABS_P,          "abs"},
        { 3, false,     LAX_P,          "lax"},
        { 4, false,     FLAG_P,         "flag"},
        { 4, false,     LAST_P,         "last"},
        { 4, true,      NULL_P,         "null"},
        { 4, false,     SIZE_P,         "size"},
        { 4, true,      TRUE_P,         "true"},
        { 4, false,     TYPE_P,         "type"},
        { 4, false,     WITH_P,         "with"},
        { 5, true,      FALSE_P,        "false"},
        { 5, false,     FLOOR_P,        "floor"},
        { 6, false,     DOUBLE_P,       "double"},
        { 6, false,     EXISTS_P,       "exists"},
        { 6, false,     STARTS_P,       "starts"},
        { 6, false,     STRICT_P,       "strict"},
        { 7, false,     CEILING_P,      "ceiling"},
        { 7, false,     UNKNOWN_P,      "unknown"},
        { 8, false,     KEYVALUE_P,     "keyvalue"},
        { 10,false, LIKE_REGEX_P, "like_regex"},
};
 321
 322 static int
 323 checkSpecialVal()
 324 {
 325         int                             res = IDENT_P;
 326         int                             diff;
 327         const keyword  *StopLow = keywords,
 328                                    *StopHigh = keywords + lengthof(keywords),
 329                                    *StopMiddle;
 330
 331         if (scanstring.len > keywords[lengthof(keywords) - 1].len)
 332                 return res;
 333
 334         while(StopLow < StopHigh)
 335         {
 336                 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
 337
 338                 if (StopMiddle->len == scanstring.len)
 339                         diff = pg_strncasecmp(StopMiddle->keyword, scanstring.val,
 340                                                                   scanstring.len);
 341                 else
 342                         diff = StopMiddle->len - scanstring.len;
 343
 344                 if (diff < 0)
 345                         StopLow = StopMiddle + 1;
 346                 else if (diff > 0)
 347                         StopHigh = StopMiddle;
 348                 else
 349                 {
 350                         if (StopMiddle->lowercase)
 351                                 diff = strncmp(StopMiddle->keyword, scanstring.val,
 352                                                            scanstring.len);
 353
 354                         if (diff == 0)
 355                                 res = StopMiddle->val;
 356
 357                         break;
 358                 }
 359         }
 360
 361         return res;
 362 }
 363
 364 /*
 365  * Called before any actual parsing is done
 366  */
 367 static void
 368 jsonpath_scanner_init(const char *str, int slen)
 369 {
 370         if (slen <= 0)
 371                 slen = strlen(str);
 372
 373         /*
 374          * Might be left over after ereport()
 375          */
 376         yy_init_globals();
 377
 378         /*
 379          * Make a scan buffer with special termination needed by flex.
 380          */
 381
 382         scanbuflen = slen;
 383         scanbuf = palloc(slen + 2);
 384         memcpy(scanbuf, str, slen);
 385         scanbuf[slen] = scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
 386         scanbufhandle = yy_scan_buffer(scanbuf, slen + 2);
 387
 388         BEGIN(INITIAL);
 389 }
 390
 391
 392 /*
 393  * Called after parsing is done to clean up after jsonpath_scanner_init()
 394  */
 395 static void
 396 jsonpath_scanner_finish(void)
 397 {
 398         yy_delete_buffer(scanbufhandle);
 399         pfree(scanbuf);
 400 }
 401
 402 static void
 403 addstring(bool init, char *s, int l)
 404 {
 405         if (init)
 406         {
 407                 scanstring.total = 32;
 408                 scanstring.val = palloc(scanstring.total);
 409                 scanstring.len = 0;
 410         }
 411
 412         if (s && l)
 413         {
 414                 while(scanstring.len + l + 1 >= scanstring.total)
 415                 {
 416                         scanstring.total *= 2;
 417                         scanstring.val = repalloc(scanstring.val, scanstring.total);
 418                 }
 419
 420                 memcpy(scanstring.val + scanstring.len, s, l);
 421                 scanstring.len += l;
 422         }
 423 }
 424
 425 static void
 426 addchar(bool init, char s)
 427 {
 428         if (init)
 429         {
 430                 scanstring.total = 32;
 431                 scanstring.val = palloc(scanstring.total);
 432                 scanstring.len = 0;
 433         }
 434         else if(scanstring.len + 1 >= scanstring.total)
 435         {
 436                 scanstring.total *= 2;
 437                 scanstring.val = repalloc(scanstring.val, scanstring.total);
 438         }
 439
 440         scanstring.val[ scanstring.len ] = s;
 441         if (s != '\0')
 442                 scanstring.len++;
 443 }
 444
 445 JsonPathParseResult *
 446 parsejsonpath(const char *str, int len)
 447 {
 448         JsonPathParseResult     *parseresult;
 449
 450         jsonpath_scanner_init(str, len);
 451
 452         if (jsonpath_yyparse((void*)&parseresult) != 0)
 453                 jsonpath_yyerror(NULL, "bugus input");
 454
 455         jsonpath_scanner_finish();
 456
 457         return parseresult;
 458 }
 459
 460 static int
 461 hexval(char c)
 462 {
 463         if (c >= '0' && c <= '9')
 464                 return c - '0';
 465         if (c >= 'a' && c <= 'f')
 466                 return c - 'a' + 0xA;
 467         if (c >= 'A' && c <= 'F')
 468                 return c - 'A' + 0xA;
 469         elog(ERROR, "invalid hexadecimal digit");
 470         return 0; /* not reached */
 471 }
 472
 473 static void
 474 addUnicodeChar(int ch)
 475 {
 476         /*
 477          * For UTF8, replace the escape sequence by the actual
 478          * utf8 character in lex->strval. Do this also for other
 479          * encodings if the escape designates an ASCII character,
 480          * otherwise raise an error.
 481          */
 482
 483         if (ch == 0)
 484         {
 485                 /* We can't allow this, since our TEXT type doesn't */
 486                 ereport(ERROR,
 487                                 (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
 488                                  errmsg("unsupported Unicode escape sequence"),
 489                                   errdetail("\\u0000 cannot be converted to text.")));
 490         }
 491         else if (GetDatabaseEncoding() == PG_UTF8)
 492         {
 493                 char utf8str[5];
 494                 int utf8len;
 495
 496                 unicode_to_utf8(ch, (unsigned char *) utf8str);
 497                 utf8len = pg_utf_mblen((unsigned char *) utf8str);
 498                 addstring(false, utf8str, utf8len);
 499         }
 500         else if (ch <= 0x007f)
 501         {
 502                 /*
 503                  * This is the only way to designate things like a
 504                  * form feed character in JSON, so it's useful in all
 505                  * encodings.
 506                  */
 507                 addchar(false, (char) ch);
 508         }
 509         else
 510         {
 511                 ereport(ERROR,
 512                                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 513                                  errmsg("invalid input syntax for type jsonpath"),
 514                                  errdetail("Unicode escape values cannot be used for code "
 515                                                    "point values above 007F when the server encoding "
 516                                                    "is not UTF8.")));
 517         }
 518 }
 519
 520 static void
 521 addUnicode(int ch, int *hi_surrogate)
 522 {
 523         if (ch >= 0xd800 && ch <= 0xdbff)
 524         {
 525                 if (*hi_surrogate != -1)
 526                         ereport(ERROR,
 527                                         (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 528                                          errmsg("invalid input syntax for type jsonpath"),
 529                                          errdetail("Unicode high surrogate must not follow "
 530                                                            "a high surrogate.")));
 531                 *hi_surrogate = (ch & 0x3ff) << 10;
 532                 return;
 533         }
 534         else if (ch >= 0xdc00 && ch <= 0xdfff)
 535         {
 536                 if (*hi_surrogate == -1)
 537                         ereport(ERROR,
 538                                         (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 539                                          errmsg("invalid input syntax for type jsonpath"),
 540                                          errdetail("Unicode low surrogate must follow a high "
 541                                                            "surrogate.")));
 542                 ch = 0x10000 + *hi_surrogate + (ch & 0x3ff);
 543                 *hi_surrogate = -1;
 544         }
 545         else if (*hi_surrogate != -1)
 546         {
 547                 ereport(ERROR,
 548                                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 549                                  errmsg("invalid input syntax for type jsonpath"),
 550                                  errdetail("Unicode low surrogate must follow a high "
 551                                                    "surrogate.")));
 552         }
 553
 554         addUnicodeChar(ch);
 555 }
 556
 557 /*
 558  * parseUnicode was adopted from json_lex_string() in
 559  * src/backend/utils/adt/json.c
 560  */
 561 static void
 562 parseUnicode(char *s, int l)
 563 {
 564         int                     i;
 565         int                     hi_surrogate = -1;
 566
 567         for (i = 2; i < l; i += 2)      /* skip '\u' */
 568         {
 569                 int                     ch = 0;
 570                 int                     j;
 571
 572                 if (s[i] == '{')        /* parse '\u{XX...}' */
 573                 {
 574                         while (s[++i] != '}' && i < l)
 575                                 ch = (ch << 4) | hexval(s[i]);
 576                         i++;    /* ski p '}' */
 577                 }
 578                 else            /* parse '\uXXXX' */
 579                 {
 580                         for (j = 0; j < 4 && i < l; j++)
 581                                 ch = (ch << 4) | hexval(s[i++]);
 582                 }
 583
 584                 addUnicode(ch, &hi_surrogate);
 585         }
 586
 587         if (hi_surrogate != -1)
 588         {
 589                 ereport(ERROR,
 590                                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 591                                  errmsg("invalid input syntax for type jsonpath"),
 592                                  errdetail("Unicode low surrogate must follow a high "
 593                                                    "surrogate.")));
 594         }
 595 }
 596
 597 static void
 598 parseHexChars(char *s, int l)
 599 {
 600         int i;
 601
 602         Assert(l % 4 /* \xXX */ == 0);
 603
 604         for (i = 0; i < l / 4; i++)
 605         {
 606                 int                     ch = (hexval(s[i * 4 + 2]) << 4) | hexval(s[i * 4 + 3]);
 607
 608                 addUnicodeChar(ch);
 609         }
 610 }
 611
 612 /*
 613  * Interface functions to make flex use palloc() instead of malloc().
 614  * It'd be better to make these static, but flex insists otherwise.
 615  */
 616
 617 void *
 618 jsonpath_yyalloc(yy_size_t bytes)
 619 {
 620         return palloc(bytes);
 621 }
 622
 623 void *
 624 jsonpath_yyrealloc(void *ptr, yy_size_t bytes)
 625 {
 626         if (ptr)
 627                 return repalloc(ptr, bytes);
 628         else
 629                 return palloc(bytes);
 630 }
 631
 632 void
 633 jsonpath_yyfree(void *ptr)
 634 {
 635         if (ptr)
 636                 pfree(ptr);
 637 }
 638