]> granicus.if.org Git - postgresql/blob - src/backend/utils/adt/jsonpath_scan.l
Fix some minor spec-compliance issues in jsonpath lexer.
[postgresql] / src / backend / utils / adt / jsonpath_scan.l
1 %{
2 /*-------------------------------------------------------------------------
3  *
4  * jsonpath_scan.l
5  *      Lexical parser for jsonpath datatype
6  *
7  * Splits jsonpath string into tokens represented as JsonPathString structs.
8  * Decodes unicode and hex escaped strings.
9  *
10  * Copyright (c) 2019, PostgreSQL Global Development Group
11  *
12  * IDENTIFICATION
13  *      src/backend/utils/adt/jsonpath_scan.l
14  *
15  *-------------------------------------------------------------------------
16  */
17
18 #include "postgres.h"
19
20 #include "mb/pg_wchar.h"
21 #include "nodes/pg_list.h"
22
/*
 * Text of the token currently being assembled.  addstring()/addchar() write
 * into it, and the lexer actions copy it into yylval->str when a token is
 * complete.
 */
23 static JsonPathString scanstring;
24
25 /* Handles to the buffer that the lexer uses internally */
26 static YY_BUFFER_STATE scanbufhandle;
27 static char *scanbuf;
28 static int      scanbuflen;
29
/* Forward declarations of helpers defined in the user-code section below */
30 static void addstring(bool init, char *s, int l);
31 static void addchar(bool init, char s);
32 static enum yytokentype checkKeyword(void);
33 static void parseUnicode(char *s, int l);
34 static void parseHexChar(char *s);
35
36 /* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
37 #undef fprintf
38 #define fprintf(file, fmt, msg)  fprintf_to_ereport(fmt, msg)
39
/*
 * Target of the fprintf() override above: raises "msg" as an ERROR via
 * ereport().  "fmt" is accepted only to match the macro and is not used.
 */
40 static void
41 fprintf_to_ereport(const char *fmt, const char *msg)
42 {
43         ereport(ERROR, (errmsg_internal("%s", msg)));
44 }
45
46 %}
47
48 %option 8bit
49 %option never-interactive
50 %option nodefault
51 %option noinput
52 %option nounput
53 %option noyywrap
54 %option warn
55 %option prefix="jsonpath_yy"
56 %option bison-bridge
/*
 * The three options below suppress flex's own allocator routines; this file
 * supplies jsonpath_yyalloc/jsonpath_yyrealloc/jsonpath_yyfree instead.
 */
57 %option noyyalloc
58 %option noyyrealloc
59 %option noyyfree
60
61 /*
62  * We use exclusive states for quoted and non-quoted strings,
63  * quoted variable names and C-style comments.
64  * Exclusive states:
65  *  <xq> - quoted strings
66  *  <xnq> - non-quoted strings
67  *  <xvq> - quoted variable names
68  *  <xc> - C-style comment
69  */
70
71 %x xq
72 %x xnq
73 %x xvq
74 %x xc
75
/* Single characters that are returned to the parser as themselves */
76 special         [\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/]
77 blank           [ \t\n\r\f]
78 /* "other" means anything that's not special, blank, or '\' or '"' */
79 other           [^\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/\\\" \t\n\r\f]
80
/*
 * Numeric literals.  "integer" does not allow leading zeros.  The *fail
 * patterns match incomplete literals: the rules section turns decimalfail
 * back into an integer and reports realfail1/realfail2 as errors.
 */
81 digit           [0-9]
82 integer         (0|[1-9]{digit}*)
83 decimal         {integer}\.{digit}+
84 decimalfail     {integer}\.
85 real            ({integer}|{decimal})[Ee][-+]?{digit}+
86 realfail1       ({integer}|{decimal})[Ee]
87 realfail2       ({integer}|{decimal})[Ee][-+]
88
/*
 * Escapes: \uXXXX, \u{X} .. \u{XXXXXX} and \xXX.  unicodefail and hex_fail
 * match truncated forms so the rules section can report them as errors.
 */
89 hex_dig         [0-9A-Fa-f]
90 unicode         \\u({hex_dig}{4}|\{{hex_dig}{1,6}\})
91 unicodefail     \\u({hex_dig}{0,3}|\{{hex_dig}{0,6})
92 hex_char        \\x{hex_dig}{2}
93 hex_fail        \\x{hex_dig}{0,1}
94
95 %%
96
97 <xnq>{other}+                                   {
                                                                        /* <xnq>: keep appending to the unquoted string in scanstring */
98                                                                         addstring(false, yytext, yyleng);
99                                                                 }
100
101 <xnq>{blank}+                                   {
                                                                        /* whitespace ends the unquoted string; classify it as keyword or identifier */
102                                                                         yylval->str = scanstring;
103                                                                         BEGIN INITIAL;
104                                                                         return checkKeyword();
105                                                                 }
106
107 <xnq>\/\*                                               {
                                                                        /*
                                                                         * NOTE(review): unlike the other <xnq> terminators, this
                                                                         * action stores scanstring in yylval but returns no token
                                                                         * before entering <xc>, and <xc> returns to INITIAL.
                                                                         * Confirm that a string directly followed by a comment is
                                                                         * not silently dropped.
                                                                         */
108                                                                         yylval->str = scanstring;
109                                                                         BEGIN xc;
110                                                                 }
111
112 <xnq>({special}|\")                             {
                                                                        /* yyless(0) pushes the terminating character back so INITIAL rescans it */
113                                                                         yylval->str = scanstring;
114                                                                         yyless(0);
115                                                                         BEGIN INITIAL;
116                                                                         return checkKeyword();
117                                                                 }
118
119 <xnq><<EOF>>                                    {
                                                                        /* end of input also ends the unquoted string */
120                                                                         yylval->str = scanstring;
121                                                                         BEGIN INITIAL;
122                                                                         return checkKeyword();
123                                                                 }
124
125 <xnq,xq,xvq>\\b                         { addchar(false, '\b'); }
        /*
         * The rules in this group handle backslash escapes and are shared by
         * unquoted strings, quoted strings and quoted variable names.  Each
         * one appends the decoded character(s) to scanstring.
         */
126
127 <xnq,xq,xvq>\\f                         { addchar(false, '\f'); }
128
129 <xnq,xq,xvq>\\n                         { addchar(false, '\n'); }
130
131 <xnq,xq,xvq>\\r                         { addchar(false, '\r'); }
132
133 <xnq,xq,xvq>\\t                         { addchar(false, '\t'); }
134
135 <xnq,xq,xvq>\\v                         { addchar(false, '\v'); }
136
        /*
         * A run of adjacent unicode escapes is passed to parseUnicode() as a
         * single chunk; parseUnicode() tracks a pending high surrogate across
         * the escapes in that chunk.
         */
137 <xnq,xq,xvq>{unicode}+          { parseUnicode(yytext, yyleng); }
138
139 <xnq,xq,xvq>{hex_char}          { parseHexChar(yytext); }
140
        /* Truncated \u or \x escapes are reported as errors */
141 <xnq,xq,xvq>{unicode}*{unicodefail}     { yyerror(NULL, "invalid unicode sequence"); }
142
143 <xnq,xq,xvq>{hex_fail}          { yyerror(NULL, "invalid hex character sequence"); }
144
145 <xnq,xq,xvq>{unicode}+\\        {
146                                                                 /* throw back the \\, and treat as unicode */
147                                                                 yyless(yyleng - 1);
148                                                                 parseUnicode(yytext, yyleng);
149                                                         }
150
        /* Any other escaped character stands for itself */
151 <xnq,xq,xvq>\\.                         { addchar(false, yytext[1]); }
152
        /* A backslash that none of the rules above could extend */
153 <xnq,xq,xvq>\\                          { yyerror(NULL, "unexpected end after backslash"); }
154
155 <xq,xvq><<EOF>>                         { yyerror(NULL, "unexpected end of quoted string"); }
156
157 <xq>\"                                                  {
                                                                        /* closing quote of a string literal */
158                                                                         yylval->str = scanstring;
159                                                                         BEGIN INITIAL;
160                                                                         return STRING_P;
161                                                                 }
162
163 <xvq>\"                                                 {
                                                                        /* closing quote of a $"..." variable name */
164                                                                         yylval->str = scanstring;
165                                                                         BEGIN INITIAL;
166                                                                         return VARIABLE_P;
167                                                                 }
168
        /* Unescaped text inside quotes is copied verbatim */
169 <xq,xvq>[^\\\"]+                                { addstring(false, yytext, yyleng); }
170
        /* <xc>: skip everything up to the closing comment delimiter; no token is produced */
171 <xc>\*\/                                                { BEGIN INITIAL; }
172
173 <xc>[^\*]+                                              { }
174
175 <xc>\*                                                  { }
176
177 <xc><<EOF>>                                             { yyerror(NULL, "unexpected end of comment"); }
178
179 \&\&                                                    { return AND_P; }
        /*
         * Operator tokens.  The one-character operators here match the same
         * text as the {special} rule further down; they are listed first so
         * that they take precedence over it.
         */
180
181 \|\|                                                    { return OR_P; }
182
183 \!                                                              { return NOT_P; }
184
185 \*\*                                                    { return ANY_P; }
186
187 \<                                                              { return LESS_P; }
188
189 \<\=                                                    { return LESSEQUAL_P; }
190
191 \=\=                                                    { return EQUAL_P; }
192
193 \<\>                                                    { return NOTEQUAL_P; }
194
195 \!\=                                                    { return NOTEQUAL_P; }
196
197 \>\=                                                    { return GREATEREQUAL_P; }
198
199 \>                                                              { return GREATER_P; }
200
201 \${other}+                                              {
                                                                        /*
                                                                         * Unquoted variable: store the name without the leading '$'.
                                                                         * addchar(false, '\0') writes a terminator but does not
                                                                         * count it in scanstring.len.
                                                                         */
202                                                                         addstring(true, yytext + 1, yyleng - 1);
203                                                                         addchar(false, '\0');
204                                                                         yylval->str = scanstring;
205                                                                         return VARIABLE_P;
206                                                                 }
207
208 \$\"                                                    {
                                                                        /* quoted variable name: start an empty scanstring and collect it in <xvq> */
209                                                                         addchar(true, '\0');
210                                                                         BEGIN xvq;
211                                                                 }
212
        /* Remaining special characters are returned as their own character code */
213 {special}                                               { return *yytext; }
214
215 {blank}+                                                { /* ignore */ }
216
217 \/\*                                                    {
                                                                        /* comment start; scanstring is reinitialized before entering <xc> */
218                                                                         addchar(true, '\0');
219                                                                         BEGIN xc;
220                                                                 }
221
222 {real}                                                  {
                                                                        /* number with exponent: token text is copied and NUL-terminated */
223                                                                         addstring(true, yytext, yyleng);
224                                                                         addchar(false, '\0');
225                                                                         yylval->str = scanstring;
226                                                                         return NUMERIC_P;
227                                                                 }
228
229 {decimal}                                               {
                                                                        /* number with a fractional part */
230                                                                         addstring(true, yytext, yyleng);
231                                                                         addchar(false, '\0');
232                                                                         yylval->str = scanstring;
233                                                                         return NUMERIC_P;
234                                                                 }
235
236 {integer}                                               {
                                                                        /* plain integer */
237                                                                         addstring(true, yytext, yyleng);
238                                                                         addchar(false, '\0');
239                                                                         yylval->str = scanstring;
240                                                                         return INT_P;
241                                                                 }
242
243 {decimalfail}                                   {
244                                                                         /* throw back the ., and treat as integer */
245                                                                         yyless(yyleng - 1);
246                                                                         addstring(true, yytext, yyleng);
247                                                                         addchar(false, '\0');
248                                                                         yylval->str = scanstring;
249                                                                         return INT_P;
250                                                                 }
251
        /* An exponent marker with no digits after it is an error */
252 ({realfail1}|{realfail2})               { yyerror(NULL, "invalid floating point number"); }
253
254 \"                                                              {
                                                                        /* opening quote: start an empty scanstring and collect the literal in <xq> */
255                                                                         addchar(true, '\0');
256                                                                         BEGIN xq;
257                                                                 }
258
259 \\                                                              {
                                                                        /*
                                                                         * An unquoted string may begin with an escape: push the
                                                                         * backslash back so the <xnq> escape rules process it.
                                                                         */
260                                                                         yyless(0);
261                                                                         addchar(true, '\0');
262                                                                         BEGIN xnq;
263                                                                 }
264
265 {other}+                                                {
                                                                        /* start of an unquoted string; <xnq> collects the rest */
266                                                                         addstring(true, yytext, yyleng);
267                                                                         BEGIN xnq;
268                                                                 }
269
270 <<EOF>>                                                 { yyterminate(); }
271
272 %%
273
274 void
275 jsonpath_yyerror(JsonPathParseResult **result, const char *message)
276 {
277         if (*yytext == YY_END_OF_BUFFER_CHAR)
278         {
279                 ereport(ERROR,
280                                 (errcode(ERRCODE_SYNTAX_ERROR),
281                                  /* translator: %s is typically "syntax error" */
282                                  errmsg("%s at end of jsonpath input", _(message))));
283         }
284         else
285         {
286                 ereport(ERROR,
287                                 (errcode(ERRCODE_SYNTAX_ERROR),
288                                  /* translator: first %s is typically "syntax error" */
289                                  errmsg("%s at or near \"%s\" of jsonpath input",
290                                                 _(message), yytext)));
291         }
292 }
293
294 typedef struct JsonPathKeyword
295 {
296         int16           len;
297         bool            lowercase;
298         int                     val;
299         const char *keyword;
300 } JsonPathKeyword;
301
302 /*
303  * Array of key words should be sorted by length and then
304  * alphabetical order
305  */
306 static const JsonPathKeyword keywords[] = {
307         { 2, false,     IS_P,           "is"},
308         { 2, false,     TO_P,           "to"},
309         { 3, false,     ABS_P,          "abs"},
310         { 3, false,     LAX_P,          "lax"},
311         { 4, false,     FLAG_P,         "flag"},
312         { 4, false,     LAST_P,         "last"},
313         { 4, true,      NULL_P,         "null"},
314         { 4, false,     SIZE_P,         "size"},
315         { 4, true,      TRUE_P,         "true"},
316         { 4, false,     TYPE_P,         "type"},
317         { 4, false,     WITH_P,         "with"},
318         { 5, true,      FALSE_P,        "false"},
319         { 5, false,     FLOOR_P,        "floor"},
320         { 6, false,     DOUBLE_P,       "double"},
321         { 6, false,     EXISTS_P,       "exists"},
322         { 6, false,     STARTS_P,       "starts"},
323         { 6, false,     STRICT_P,       "strict"},
324         { 7, false,     CEILING_P,      "ceiling"},
325         { 7, false,     UNKNOWN_P,      "unknown"},
326         { 8, false,     KEYVALUE_P,     "keyvalue"},
327         { 10,false, LIKE_REGEX_P, "like_regex"},
328 };
329
330 /* Check if current scanstring value is a keyword */
331 static enum yytokentype
332 checkKeyword()
333 {
334         int                                             res = IDENT_P;
335         int                                             diff;
336         const JsonPathKeyword  *StopLow = keywords,
337                                                    *StopHigh = keywords + lengthof(keywords),
338                                                    *StopMiddle;
339
340         if (scanstring.len > keywords[lengthof(keywords) - 1].len)
341                 return res;
342
343         while (StopLow < StopHigh)
344         {
345                 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
346
347                 if (StopMiddle->len == scanstring.len)
348                         diff = pg_strncasecmp(StopMiddle->keyword, scanstring.val,
349                                                                   scanstring.len);
350                 else
351                         diff = StopMiddle->len - scanstring.len;
352
353                 if (diff < 0)
354                         StopLow = StopMiddle + 1;
355                 else if (diff > 0)
356                         StopHigh = StopMiddle;
357                 else
358                 {
359                         if (StopMiddle->lowercase)
360                                 diff = strncmp(StopMiddle->keyword, scanstring.val,
361                                                            scanstring.len);
362
363                         if (diff == 0)
364                                 res = StopMiddle->val;
365
366                         break;
367                 }
368         }
369
370         return res;
371 }
372
373 /*
374  * Called before any actual parsing is done
375  */
376 static void
377 jsonpath_scanner_init(const char *str, int slen)
378 {
379         if (slen <= 0)
380                 slen = strlen(str);
381
382         /*
383          * Might be left over after ereport()
384          */
385         yy_init_globals();
386
387         /*
388          * Make a scan buffer with special termination needed by flex.
389          */
390
391         scanbuflen = slen;
392         scanbuf = palloc(slen + 2);
393         memcpy(scanbuf, str, slen);
394         scanbuf[slen] = scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
395         scanbufhandle = yy_scan_buffer(scanbuf, slen + 2);
396
397         BEGIN(INITIAL);
398 }
399
400
401 /*
402  * Called after parsing is done to clean up after jsonpath_scanner_init()
403  */
404 static void
405 jsonpath_scanner_finish(void)
406 {
407         yy_delete_buffer(scanbufhandle);
408         pfree(scanbuf);
409 }
410
411 /*
412  * Resize scanstring so that it can append string of given length.
413  * Reinitialize if required.
414  */
415 static void
416 resizeString(bool init, int appendLen)
417 {
418         if (init)
419         {
420                 scanstring.total = Max(32, appendLen);
421                 scanstring.val = (char *) palloc(scanstring.total);
422                 scanstring.len = 0;
423         }
424         else
425         {
426                 if (scanstring.len + appendLen >= scanstring.total)
427                 {
428                         while (scanstring.len + appendLen >= scanstring.total)
429                                 scanstring.total *= 2;
430                         scanstring.val = repalloc(scanstring.val, scanstring.total);
431                 }
432         }
433 }
434
435 /* Add set of bytes at "s" of length "l" to scanstring */
436 static void
437 addstring(bool init, char *s, int l)
438 {
439         resizeString(init, l + 1);
440         memcpy(scanstring.val + scanstring.len, s, l);
441         scanstring.len += l;
442 }
443
444 /* Add single byte "c" to scanstring */
445 static void
446 addchar(bool init, char c)
447 {
448         resizeString(init, 1);
449         scanstring.val[scanstring.len] = c;
450         if (c != '\0')
451                 scanstring.len++;
452 }
453
454 /* Interface to jsonpath parser */
455 JsonPathParseResult *
456 parsejsonpath(const char *str, int len)
457 {
458         JsonPathParseResult     *parseresult;
459
460         jsonpath_scanner_init(str, len);
461
462         if (jsonpath_yyparse((void *) &parseresult) != 0)
463                 jsonpath_yyerror(NULL, "bogus input"); /* shouldn't happen */
464
465         jsonpath_scanner_finish();
466
467         return parseresult;
468 }
469
470 /* Turn hex character into integer */
471 static int
472 hexval(char c)
473 {
474         if (c >= '0' && c <= '9')
475                 return c - '0';
476         if (c >= 'a' && c <= 'f')
477                 return c - 'a' + 0xA;
478         if (c >= 'A' && c <= 'F')
479                 return c - 'A' + 0xA;
480         jsonpath_yyerror(NULL, "invalid hexadecimal digit");
481         return 0; /* not reached */
482 }
483
484 /* Add given unicode character to scanstring */
485 static void
486 addUnicodeChar(int ch)
487 {
488         /*
489          * For UTF8, replace the escape sequence by the actual
490          * utf8 character in lex->strval. Do this also for other
491          * encodings if the escape designates an ASCII character,
492          * otherwise raise an error.
493          */
494
495         if (ch == 0)
496         {
497                 /* We can't allow this, since our TEXT type doesn't */
498                 ereport(ERROR,
499                                 (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
500                                  errmsg("unsupported Unicode escape sequence"),
501                                   errdetail("\\u0000 cannot be converted to text.")));
502         }
503         else if (GetDatabaseEncoding() == PG_UTF8)
504         {
505                 char utf8str[5];
506                 int utf8len;
507
508                 unicode_to_utf8(ch, (unsigned char *) utf8str);
509                 utf8len = pg_utf_mblen((unsigned char *) utf8str);
510                 addstring(false, utf8str, utf8len);
511         }
512         else if (ch <= 0x007f)
513         {
514                 /*
515                  * This is the only way to designate things like a
516                  * form feed character in JSON, so it's useful in all
517                  * encodings.
518                  */
519                 addchar(false, (char) ch);
520         }
521         else
522         {
523                 ereport(ERROR,
524                                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
525                                  errmsg("invalid input syntax for type %s", "jsonpath"),
526                                  errdetail("Unicode escape values cannot be used for code "
527                                                    "point values above 007F when the server encoding "
528                                                    "is not UTF8.")));
529         }
530 }
531
532 /* Add unicode character and process its hi surrogate */
533 static void
534 addUnicode(int ch, int *hi_surrogate)
535 {
536         if (ch >= 0xd800 && ch <= 0xdbff)
537         {
538                 if (*hi_surrogate != -1)
539                         ereport(ERROR,
540                                         (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
541                                          errmsg("invalid input syntax for type %s", "jsonpath"),
542                                          errdetail("Unicode high surrogate must not follow "
543                                                            "a high surrogate.")));
544                 *hi_surrogate = (ch & 0x3ff) << 10;
545                 return;
546         }
547         else if (ch >= 0xdc00 && ch <= 0xdfff)
548         {
549                 if (*hi_surrogate == -1)
550                         ereport(ERROR,
551                                         (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
552                                          errmsg("invalid input syntax for type %s", "jsonpath"),
553                                          errdetail("Unicode low surrogate must follow a high "
554                                                            "surrogate.")));
555                 ch = 0x10000 + *hi_surrogate + (ch & 0x3ff);
556                 *hi_surrogate = -1;
557         }
558         else if (*hi_surrogate != -1)
559         {
560                 ereport(ERROR,
561                                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
562                                  errmsg("invalid input syntax for type %s", "jsonpath"),
563                                  errdetail("Unicode low surrogate must follow a high "
564                                                    "surrogate.")));
565         }
566
567         addUnicodeChar(ch);
568 }
569
570 /*
571  * parseUnicode was adopted from json_lex_string() in
572  * src/backend/utils/adt/json.c
573  */
574 static void
575 parseUnicode(char *s, int l)
576 {
577         int                     i = 2;
578         int                     hi_surrogate = -1;
579
580         for (i = 2; i < l; i += 2)      /* skip '\u' */
581         {
582                 int                     ch = 0;
583                 int                     j;
584
585                 if (s[i] == '{')        /* parse '\u{XX...}' */
586                 {
587                         while (s[++i] != '}' && i < l)
588                                 ch = (ch << 4) | hexval(s[i]);
589                         i++;    /* skip '}' */
590                 }
591                 else            /* parse '\uXXXX' */
592                 {
593                         for (j = 0; j < 4 && i < l; j++)
594                                 ch = (ch << 4) | hexval(s[i++]);
595                 }
596
597                 addUnicode(ch, &hi_surrogate);
598         }
599
600         if (hi_surrogate != -1)
601         {
602                 ereport(ERROR,
603                                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
604                                  errmsg("invalid input syntax for type %s", "jsonpath"),
605                                  errdetail("Unicode low surrogate must follow a high "
606                                                    "surrogate.")));
607         }
608 }
609
/*
 * Decode a single \xXX escape: "s" points at the backslash, and the two hex
 * digits at s[2] and s[3] form the character code appended to scanstring.
 */
static void
parseHexChar(char *s)
{
	int			ch;

	ch = (hexval(s[2]) << 4) | hexval(s[3]);
	addUnicodeChar(ch);
}
619
620 /*
621  * Interface functions to make flex use palloc() instead of malloc().
622  * It'd be better to make these static, but flex insists otherwise.
623  */
624
625 void *
626 jsonpath_yyalloc(yy_size_t bytes)
627 {
628         return palloc(bytes);
629 }
630
631 void *
632 jsonpath_yyrealloc(void *ptr, yy_size_t bytes)
633 {
634         if (ptr)
635                 return repalloc(ptr, bytes);
636         else
637                 return palloc(bytes);
638 }
639
640 void
641 jsonpath_yyfree(void *ptr)
642 {
643         if (ptr)
644                 pfree(ptr);
645 }