]> granicus.if.org Git - postgresql/blob - src/backend/utils/adt/jsonpath_scan.l
Improve error reporting in jsonpath
[postgresql] / src / backend / utils / adt / jsonpath_scan.l
1 %{
2 /*-------------------------------------------------------------------------
3  *
4  * jsonpath_scan.l
5  *      Lexical parser for jsonpath datatype
6  *
7  * Splits jsonpath string into tokens represented as JsonPathString structs.
8  * Decodes unicode and hex escaped strings.
9  *
10  * Copyright (c) 2019, PostgreSQL Global Development Group
11  *
12  * IDENTIFICATION
13  *      src/backend/utils/adt/jsonpath_scan.l
14  *
15  *-------------------------------------------------------------------------
16  */
17
18 #include "postgres.h"
19
20 #include "mb/pg_wchar.h"
21 #include "nodes/pg_list.h"
22
23 static JsonPathString scanstring;
24
25 /* Handles to the buffer that the lexer uses internally */
26 static YY_BUFFER_STATE scanbufhandle;
27 static char *scanbuf;
28 static int      scanbuflen;
29
30 static void addstring(bool init, char *s, int l);
31 static void addchar(bool init, char s);
32 static enum yytokentype checkKeyword(void);
33 static void parseUnicode(char *s, int l);
34 static void parseHexChar(char *s);
35
36 /* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
37 #undef fprintf
38 #define fprintf(file, fmt, msg)  fprintf_to_ereport(fmt, msg)
39
40 static void
41 fprintf_to_ereport(const char *fmt, const char *msg)
42 {
43         ereport(ERROR, (errmsg_internal("%s", msg)));
44 }
45
46 %}
47
/* Scanner generation options. */
48 %option 8bit
49 %option never-interactive
50 %option nodefault
51 %option noinput
52 %option nounput
53 %option noyywrap
54 %option warn
/* Generated symbols carry the jsonpath_yy prefix (cf. jsonpath_yyerror() below). */
55 %option prefix="jsonpath_yy"
/* The rules access the semantic value through a pointer: yylval->str. */
56 %option bison-bridge
/*
 * Memory management is supplied by this file: see jsonpath_yyalloc(),
 * jsonpath_yyrealloc() and jsonpath_yyfree() at the end.
 */
57 %option noyyalloc
58 %option noyyrealloc
59 %option noyyfree
60
61 /*
62  * We use exclusive states for quoted, single-quoted and non-quoted strings,
63  * quoted variable names and C-style comments.
64  * Exclusive states:
65  *  <xq> - quoted strings
66  *  <xnq> - non-quoted strings
67  *  <xvq> - quoted variable names
68  *  <xsq> - single-quoted strings
69  *  <xc> - C-style comment
70  */
71
72 %x xq
73 %x xnq
74 %x xvq
75 %x xsq
76 %x xc
77
/*
 * Character classes.
 *  special - punctuation; a single one is returned as its own token and any
 *            of them ends a non-quoted string
 *  any     - everything that is not special, a backslash, a quote or blank
 *  blank   - whitespace
 */
78 special          [\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/]
79 any                     [^\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/\\\"\' \t\n\r\f]
80 blank           [ \t\n\r\f]
81
/*
 * Numeric literals.  The "fail" patterns match incomplete literals:
 * decimalfail (digits followed by a bare '.') is scanned as an integer with
 * the '.' pushed back; realfail1/realfail2 (exponent without digits) are
 * reported as errors by the rules below.
 */
82 digit           [0-9]
83 integer         (0|[1-9]{digit}*)
84 decimal         {integer}\.{digit}+
85 decimalfail     {integer}\.
86 real            ({integer}|{decimal})[Ee][-+]?{digit}+
87 realfail1       ({integer}|{decimal})[Ee]
88 realfail2       ({integer}|{decimal})[Ee][-+]
89
/*
 * Escape sequences: \uXXXX, \u{X..X} with one to six hex digits, and \xXX.
 * unicodefail and hex_fail match the incomplete forms, which the rules
 * below report as errors.
 */
90 hex_dig         [0-9A-Fa-f]
91 unicode         \\u({hex_dig}{4}|\{{hex_dig}{1,6}\})
92 unicodefail     \\u({hex_dig}{0,3}|\{{hex_dig}{0,6})
93 hex_char        \\x{hex_dig}{2}
94 hex_fail        \\x{hex_dig}{0,1}
95
96 %%
97
	/*
	 * Rules for <xnq>: a non-quoted string is being collected in scanstring.
	 * Ordinary characters are appended as they come.
	 */
98 <xnq>{any}+                                             {
99                                                                         addstring(false, yytext, yyleng);
100                                                                 }
101
	/* Whitespace ends the string; it is returned as a keyword or IDENT_P. */
102 <xnq>{blank}+                                   {
103                                                                         yylval->str = scanstring;
104                                                                         BEGIN INITIAL;
105                                                                         return checkKeyword();
106                                                                 }
107
108
	/*
	 * NOTE(review): a comment that starts inside a non-quoted string stores
	 * scanstring in yylval but returns no token, and none of the <xc> rules
	 * returns one either, so the string collected so far does not appear to
	 * reach the parser -- confirm that this is intended.
	 */
109 <xnq>\/\*                                               {
110                                                                         yylval->str = scanstring;
111                                                                         BEGIN xc;
112                                                                 }
113
	/*
	 * A special character or a quote also ends the string.  yyless(0) pushes
	 * the terminator back so that it is scanned again in the INITIAL state.
	 */
114 <xnq>({special}|\"|\')                  {
115                                                                         yylval->str = scanstring;
116                                                                         yyless(0);
117                                                                         BEGIN INITIAL;
118                                                                         return checkKeyword();
119                                                                 }
120
	/* End of input ends the string as well. */
121 <xnq><<EOF>>                                    {
122                                                                         yylval->str = scanstring;
123                                                                         BEGIN INITIAL;
124                                                                         return checkKeyword();
125                                                                 }
126
	/*
	 * Escape sequences, shared by the non-quoted state and all quoted states.
	 * Each simple escape appends the single character it denotes.
	 */
127 <xnq,xq,xvq,xsq>\\[\"\'\\]              { addchar(false, yytext[1]); }
128
129 <xnq,xq,xvq,xsq>\\b                             { addchar(false, '\b'); }
130
131 <xnq,xq,xvq,xsq>\\f                             { addchar(false, '\f'); }
132
133 <xnq,xq,xvq,xsq>\\n                             { addchar(false, '\n'); }
134
135 <xnq,xq,xvq,xsq>\\r                             { addchar(false, '\r'); }
136
137 <xnq,xq,xvq,xsq>\\t                             { addchar(false, '\t'); }
138
139 <xnq,xq,xvq,xsq>\\v                             { addchar(false, '\v'); }
140
	/*
	 * A run of consecutive \u escapes is handed to parseUnicode() as one
	 * piece, which lets it pair a high surrogate with the low one after it.
	 */
141 <xnq,xq,xvq,xsq>{unicode}+              { parseUnicode(yytext, yyleng); }
142
143 <xnq,xq,xvq,xsq>{hex_char}              { parseHexChar(yytext); }
144
	/* Incomplete \u or \x escapes are errors. */
145 <xnq,xq,xvq,xsq>{unicode}*{unicodefail} { yyerror(NULL, "invalid unicode sequence"); }
146
147 <xnq,xq,xvq,xsq>{hex_fail}              { yyerror(NULL, "invalid hex character sequence"); }
148
149 <xnq,xq,xvq,xsq>{unicode}+\\    {
150                                                                         /* throw back the \\, and treat as unicode */
151                                                                         yyless(yyleng - 1);
152                                                                         parseUnicode(yytext, yyleng);
153                                                                 }
154
	/* Any other backslash sequence, or a backslash on its own, is an error. */
155 <xnq,xq,xvq,xsq>\\.                             { yyerror(NULL, "escape sequence is invalid"); }
156
157 <xnq,xq,xvq,xsq>\\                              { yyerror(NULL, "unexpected end after backslash"); }
158
	/* Input must not end inside a quoted string or quoted variable name. */
159 <xq,xvq,xsq><<EOF>>                             { yyerror(NULL, "unexpected end of quoted string"); }
160
	/*
	 * The closing quote ends the token: STRING_P for double- and
	 * single-quoted strings, VARIABLE_P for a quoted variable name.
	 */
161 <xq>\"                                                  {
162                                                                         yylval->str = scanstring;
163                                                                         BEGIN INITIAL;
164                                                                         return STRING_P;
165                                                                 }
166
167 <xvq>\"                                                 {
168                                                                         yylval->str = scanstring;
169                                                                         BEGIN INITIAL;
170                                                                         return VARIABLE_P;
171                                                                 }
172
173 <xsq>\'                                                 {
174                                                                         yylval->str = scanstring;
175                                                                         BEGIN INITIAL;
176                                                                         return STRING_P;
177                                                                 }
178
	/* Quoted content: any run without a backslash or the closing quote. */
179 <xq,xvq>[^\\\"]+                                { addstring(false, yytext, yyleng); }
180
181 <xsq>[^\\\']+                                   { addstring(false, yytext, yyleng); }
182
	/* <xc>: skip everything up to the closing star-slash, then resume in INITIAL. */
183 <xc>\*\/                                                { BEGIN INITIAL; }
184
185 <xc>[^\*]+                                              { }
186
187 <xc>\*                                                  { }
188
189 <xc><<EOF>>                                             { yyerror(NULL, "unexpected end of comment"); }
190
	/* INITIAL state: multi-character operators. */
191 \&\&                                                    { return AND_P; }
192
193 \|\|                                                    { return OR_P; }
194
195 \!                                                              { return NOT_P; }
196
197 \*\*                                                    { return ANY_P; }
198
199 \<                                                              { return LESS_P; }
200
201 \<\=                                                    { return LESSEQUAL_P; }
202
203 \=\=                                                    { return EQUAL_P; }
204
205 \<\>                                                    { return NOTEQUAL_P; }
206
207 \!\=                                                    { return NOTEQUAL_P; }
208
209 \>\=                                                    { return GREATEREQUAL_P; }
210
211 \>                                                              { return GREATER_P; }
212
	/* $name: the variable name is the matched text without the leading '$'. */
213 \${any}+                                                {
214                                                                         addstring(true, yytext + 1, yyleng - 1);
215                                                                         addchar(false, '\0');
216                                                                         yylval->str = scanstring;
217                                                                         return VARIABLE_P;
218                                                                 }
219
	/* $" starts a quoted variable name; scanstring is reset to empty. */
220 \$\"                                                    {
221                                                                         addchar(true, '\0');
222                                                                         BEGIN xvq;
223                                                                 }
224
	/* Any other special character is returned as its own character code. */
225 {special}                                               { return *yytext; }
226
227 {blank}+                                                { /* ignore */ }
228
	/* Start of a C-style comment. */
229 \/\*                                                    {
230                                                                         addchar(true, '\0');
231                                                                         BEGIN xc;
232                                                                 }
233
	/* Numeric literals: the NUL-terminated text is passed on unchanged. */
234 {real}                                                  {
235                                                                         addstring(true, yytext, yyleng);
236                                                                         addchar(false, '\0');
237                                                                         yylval->str = scanstring;
238                                                                         return NUMERIC_P;
239                                                                 }
240
241 {decimal}                                               {
242                                                                         addstring(true, yytext, yyleng);
243                                                                         addchar(false, '\0');
244                                                                         yylval->str = scanstring;
245                                                                         return NUMERIC_P;
246                                                                 }
247
248 {integer}                                               {
249                                                                         addstring(true, yytext, yyleng);
250                                                                         addchar(false, '\0');
251                                                                         yylval->str = scanstring;
252                                                                         return INT_P;
253                                                                 }
254
255 {decimalfail}                                   {
256                                                                         /* throw back the ., and treat as integer */
257                                                                         yyless(yyleng - 1);
258                                                                         addstring(true, yytext, yyleng);
259                                                                         addchar(false, '\0');
260                                                                         yylval->str = scanstring;
261                                                                         return INT_P;
262                                                                 }
263
264 ({realfail1}|{realfail2})               { yyerror(NULL, "invalid floating point number"); }
265
	/* Anything else starts a non-quoted string, continued in <xnq>. */
266 {any}+                                                  {
267                                                                         addstring(true, yytext, yyleng);
268                                                                         BEGIN xnq;
269                                                                 }
270
	/* An opening quote starts a quoted string with an empty scanstring. */
271 \"                                                              {
272                                                                         addchar(true, '\0');
273                                                                         BEGIN xq;
274                                                                 }
275
276 \'                                                              {
277                                                                         addchar(true, '\0');
278                                                                         BEGIN xsq;
279                                                                 }
280
	/*
	 * A leading backslash starts a non-quoted string; the backslash is pushed
	 * back so that the escape rules of <xnq> process it.
	 */
281 \\                                                              {
282                                                                         yyless(0);
283                                                                         addchar(true, '\0');
284                                                                         BEGIN xnq;
285                                                                 }
286
287 <<EOF>>                                                 { yyterminate(); }
288
289 %%
290
291 void
292 jsonpath_yyerror(JsonPathParseResult **result, const char *message)
293 {
294         if (*yytext == YY_END_OF_BUFFER_CHAR)
295         {
296                 ereport(ERROR,
297                                 (errcode(ERRCODE_SYNTAX_ERROR),
298                                  /* translator: %s is typically "syntax error" */
299                                  errmsg("%s at end of jsonpath input", _(message))));
300         }
301         else
302         {
303                 ereport(ERROR,
304                                 (errcode(ERRCODE_SYNTAX_ERROR),
305                                  /* translator: first %s is typically "syntax error" */
306                                  errmsg("%s at or near \"%s\" of jsonpath input",
307                                                 _(message), yytext)));
308         }
309 }
310
311 typedef struct JsonPathKeyword
312 {
313         int16           len;
314         bool            lowercase;
315         int                     val;
316         const char *keyword;
317 } JsonPathKeyword;
318
319 /*
320  * Array of key words should be sorted by length and then
321  * alphabetical order
322  */
323 static const JsonPathKeyword keywords[] = {
324         { 2, false,     IS_P,           "is"},
325         { 2, false,     TO_P,           "to"},
326         { 3, false,     ABS_P,          "abs"},
327         { 3, false,     LAX_P,          "lax"},
328         { 4, false,     FLAG_P,         "flag"},
329         { 4, false,     LAST_P,         "last"},
330         { 4, true,      NULL_P,         "null"},
331         { 4, false,     SIZE_P,         "size"},
332         { 4, true,      TRUE_P,         "true"},
333         { 4, false,     TYPE_P,         "type"},
334         { 4, false,     WITH_P,         "with"},
335         { 5, true,      FALSE_P,        "false"},
336         { 5, false,     FLOOR_P,        "floor"},
337         { 6, false,     DOUBLE_P,       "double"},
338         { 6, false,     EXISTS_P,       "exists"},
339         { 6, false,     STARTS_P,       "starts"},
340         { 6, false,     STRICT_P,       "strict"},
341         { 7, false,     CEILING_P,      "ceiling"},
342         { 7, false,     UNKNOWN_P,      "unknown"},
343         { 8, false,     KEYVALUE_P,     "keyvalue"},
344         { 10,false, LIKE_REGEX_P, "like_regex"},
345 };
346
347 /* Check if current scanstring value is a keyword */
348 static enum yytokentype
349 checkKeyword()
350 {
351         int                                             res = IDENT_P;
352         int                                             diff;
353         const JsonPathKeyword  *StopLow = keywords,
354                                                    *StopHigh = keywords + lengthof(keywords),
355                                                    *StopMiddle;
356
357         if (scanstring.len > keywords[lengthof(keywords) - 1].len)
358                 return res;
359
360         while (StopLow < StopHigh)
361         {
362                 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
363
364                 if (StopMiddle->len == scanstring.len)
365                         diff = pg_strncasecmp(StopMiddle->keyword, scanstring.val,
366                                                                   scanstring.len);
367                 else
368                         diff = StopMiddle->len - scanstring.len;
369
370                 if (diff < 0)
371                         StopLow = StopMiddle + 1;
372                 else if (diff > 0)
373                         StopHigh = StopMiddle;
374                 else
375                 {
376                         if (StopMiddle->lowercase)
377                                 diff = strncmp(StopMiddle->keyword, scanstring.val,
378                                                            scanstring.len);
379
380                         if (diff == 0)
381                                 res = StopMiddle->val;
382
383                         break;
384                 }
385         }
386
387         return res;
388 }
389
390 /*
391  * Called before any actual parsing is done
392  */
393 static void
394 jsonpath_scanner_init(const char *str, int slen)
395 {
396         if (slen <= 0)
397                 slen = strlen(str);
398
399         /*
400          * Might be left over after ereport()
401          */
402         yy_init_globals();
403
404         /*
405          * Make a scan buffer with special termination needed by flex.
406          */
407
408         scanbuflen = slen;
409         scanbuf = palloc(slen + 2);
410         memcpy(scanbuf, str, slen);
411         scanbuf[slen] = scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
412         scanbufhandle = yy_scan_buffer(scanbuf, slen + 2);
413
414         BEGIN(INITIAL);
415 }
416
417
418 /*
419  * Called after parsing is done to clean up after jsonpath_scanner_init()
420  */
421 static void
422 jsonpath_scanner_finish(void)
423 {
424         yy_delete_buffer(scanbufhandle);
425         pfree(scanbuf);
426 }
427
428 /*
429  * Resize scanstring so that it can append string of given length.
430  * Reinitialize if required.
431  */
432 static void
433 resizeString(bool init, int appendLen)
434 {
435         if (init)
436         {
437                 scanstring.total = Max(32, appendLen);
438                 scanstring.val = (char *) palloc(scanstring.total);
439                 scanstring.len = 0;
440         }
441         else
442         {
443                 if (scanstring.len + appendLen >= scanstring.total)
444                 {
445                         while (scanstring.len + appendLen >= scanstring.total)
446                                 scanstring.total *= 2;
447                         scanstring.val = repalloc(scanstring.val, scanstring.total);
448                 }
449         }
450 }
451
452 /* Add set of bytes at "s" of length "l" to scanstring */
453 static void
454 addstring(bool init, char *s, int l)
455 {
456         resizeString(init, l + 1);
457         memcpy(scanstring.val + scanstring.len, s, l);
458         scanstring.len += l;
459 }
460
461 /* Add single byte "c" to scanstring */
462 static void
463 addchar(bool init, char c)
464 {
465         resizeString(init, 1);
466         scanstring.val[scanstring.len] = c;
467         if (c != '\0')
468                 scanstring.len++;
469 }
470
471 /* Interface to jsonpath parser */
472 JsonPathParseResult *
473 parsejsonpath(const char *str, int len)
474 {
475         JsonPathParseResult     *parseresult;
476
477         jsonpath_scanner_init(str, len);
478
479         if (jsonpath_yyparse((void *) &parseresult) != 0)
480                 jsonpath_yyerror(NULL, "bogus input"); /* shouldn't happen */
481
482         jsonpath_scanner_finish();
483
484         return parseresult;
485 }
486
487 /* Turn hex character into integer */
488 static int
489 hexval(char c)
490 {
491         if (c >= '0' && c <= '9')
492                 return c - '0';
493         if (c >= 'a' && c <= 'f')
494                 return c - 'a' + 0xA;
495         if (c >= 'A' && c <= 'F')
496                 return c - 'A' + 0xA;
497         jsonpath_yyerror(NULL, "invalid hexadecimal digit");
498         return 0; /* not reached */
499 }
500
501 /* Add given unicode character to scanstring */
502 static void
503 addUnicodeChar(int ch)
504 {
505         /*
506          * For UTF8, replace the escape sequence by the actual
507          * utf8 character in lex->strval. Do this also for other
508          * encodings if the escape designates an ASCII character,
509          * otherwise raise an error.
510          */
511
512         if (ch == 0)
513         {
514                 /* We can't allow this, since our TEXT type doesn't */
515                 ereport(ERROR,
516                                 (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
517                                  errmsg("unsupported Unicode escape sequence"),
518                                   errdetail("\\u0000 cannot be converted to text.")));
519         }
520         else if (GetDatabaseEncoding() == PG_UTF8)
521         {
522                 char utf8str[5];
523                 int utf8len;
524
525                 unicode_to_utf8(ch, (unsigned char *) utf8str);
526                 utf8len = pg_utf_mblen((unsigned char *) utf8str);
527                 addstring(false, utf8str, utf8len);
528         }
529         else if (ch <= 0x007f)
530         {
531                 /*
532                  * This is the only way to designate things like a
533                  * form feed character in JSON, so it's useful in all
534                  * encodings.
535                  */
536                 addchar(false, (char) ch);
537         }
538         else
539         {
540                 ereport(ERROR,
541                                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
542                                  errmsg("invalid input syntax for type jsonpath"),
543                                  errdetail("Unicode escape values cannot be used for code "
544                                                    "point values above 007F when the server encoding "
545                                                    "is not UTF8.")));
546         }
547 }
548
549 /* Add unicode character and process its hi surrogate */
550 static void
551 addUnicode(int ch, int *hi_surrogate)
552 {
553         if (ch >= 0xd800 && ch <= 0xdbff)
554         {
555                 if (*hi_surrogate != -1)
556                         ereport(ERROR,
557                                         (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
558                                          errmsg("invalid input syntax for type jsonpath"),
559                                          errdetail("Unicode high surrogate must not follow "
560                                                            "a high surrogate.")));
561                 *hi_surrogate = (ch & 0x3ff) << 10;
562                 return;
563         }
564         else if (ch >= 0xdc00 && ch <= 0xdfff)
565         {
566                 if (*hi_surrogate == -1)
567                         ereport(ERROR,
568                                         (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
569                                          errmsg("invalid input syntax for type jsonpath"),
570                                          errdetail("Unicode low surrogate must follow a high "
571                                                            "surrogate.")));
572                 ch = 0x10000 + *hi_surrogate + (ch & 0x3ff);
573                 *hi_surrogate = -1;
574         }
575         else if (*hi_surrogate != -1)
576         {
577                 ereport(ERROR,
578                                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
579                                  errmsg("invalid input syntax for type jsonpath"),
580                                  errdetail("Unicode low surrogate must follow a high "
581                                                    "surrogate.")));
582         }
583
584         addUnicodeChar(ch);
585 }
586
587 /*
588  * parseUnicode was adopted from json_lex_string() in
589  * src/backend/utils/adt/json.c
590  */
591 static void
592 parseUnicode(char *s, int l)
593 {
594         int                     i = 2;
595         int                     hi_surrogate = -1;
596
597         for (i = 2; i < l; i += 2)      /* skip '\u' */
598         {
599                 int                     ch = 0;
600                 int                     j;
601
602                 if (s[i] == '{')        /* parse '\u{XX...}' */
603                 {
604                         while (s[++i] != '}' && i < l)
605                                 ch = (ch << 4) | hexval(s[i]);
606                         i++;    /* ski p '}' */
607                 }
608                 else            /* parse '\uXXXX' */
609                 {
610                         for (j = 0; j < 4 && i < l; j++)
611                                 ch = (ch << 4) | hexval(s[i++]);
612                 }
613
614                 addUnicode(ch, &hi_surrogate);
615         }
616
617         if (hi_surrogate != -1)
618         {
619                 ereport(ERROR,
620                                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
621                                  errmsg("invalid input syntax for type jsonpath"),
622                                  errdetail("Unicode low surrogate must follow a high "
623                                                    "surrogate.")));
624         }
625 }
626
/*
 * Decode one hex escape.  "s" points at the escape text; the two hex
 * digits are read from s[2] and s[3] and the resulting value is added to
 * scanstring via addUnicodeChar().
 */
static void
parseHexChar(char *s)
{
	int			high_nibble = hexval(s[2]);
	int			low_nibble = hexval(s[3]);

	addUnicodeChar((high_nibble << 4) | low_nibble);
}
636
637 /*
638  * Interface functions to make flex use palloc() instead of malloc().
639  * It'd be better to make these static, but flex insists otherwise.
640  */
641
642 void *
643 jsonpath_yyalloc(yy_size_t bytes)
644 {
645         return palloc(bytes);
646 }
647
648 void *
649 jsonpath_yyrealloc(void *ptr, yy_size_t bytes)
650 {
651         if (ptr)
652                 return repalloc(ptr, bytes);
653         else
654                 return palloc(bytes);
655 }
656
/* Release a lexer allocation; a NULL "ptr" is ignored */
void
jsonpath_yyfree(void *ptr)
{
	if (ptr == NULL)
		return;

	pfree(ptr);
}