]> granicus.if.org Git - postgresql/blob - src/backend/utils/adt/jsonpath_scan.l
Partial implementation of SQL/JSON path language
[postgresql] / src / backend / utils / adt / jsonpath_scan.l
1 /*-------------------------------------------------------------------------
2  *
3  * jsonpath_scan.l
4  *      Lexical parser for jsonpath datatype
5  *
6  * Copyright (c) 2019, PostgreSQL Global Development Group
7  *
8  * IDENTIFICATION
9  *      src/backend/utils/adt/jsonpath_scan.l
10  *
11  *-------------------------------------------------------------------------
12  */
13
14 %{
15 #include "postgres.h"
16
17 #include "mb/pg_wchar.h"
18 #include "nodes/pg_list.h"
19 #include "utils/jsonpath_scanner.h"
20
/*
 * Accumulator for the text of the token currently being assembled.
 * addstring()/addchar() (re)initialize and grow it; the scanner rules hand
 * it to the parser by assigning it to yylval->str.
 */
static string scanstring;

/* No reason to constrain amount of data slurped */
/* #define YY_READ_BUF_SIZE 16777216 */

/* Handles to the buffer that the lexer uses internally */
static YY_BUFFER_STATE scanbufhandle;
static char *scanbuf;
static int      scanbuflen;

/* Append to scanstring; "init" starts a fresh accumulator first */
static void addstring(bool init, char *s, int l);
static void addchar(bool init, char s);
static int checkSpecialVal(void); /* examine scanstring for the special
                                                                   * value */

/* Decode the escape runs matched by the {unicode}+ and {hex_char}+ rules */
static void parseUnicode(char *s, int l);
static void parseHexChars(char *s, int l);
38
/* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
#undef fprintf
#define fprintf(file, fmt, msg)  fprintf_to_ereport(fmt, msg)

/*
 * Replacement target for the fprintf() macro above: raises an ERROR carrying
 * "msg" verbatim.  "fmt" is accepted only to match the macro's argument list
 * and is not used.
 */
static void
fprintf_to_ereport(const char *fmt, const char *msg)
{
        ereport(ERROR, (errmsg_internal("%s", msg)));
}

/* Scanner actions call yyerror(); route those calls to jsonpath_yyerror() */
#define yyerror jsonpath_yyerror
50 %}
51
/* Scanner generation options */
%option 8bit
%option never-interactive
%option nodefault
%option noinput
%option nounput
%option noyywrap
%option warn
%option prefix="jsonpath_yy"
%option bison-bridge
/* Memory routines are supplied at the end of this file (palloc-based) */
%option noyyalloc
%option noyyrealloc
%option noyyfree

/*
 * Start conditions used by the rules below:
 *   xQUOTED        inside a double-quoted string
 *   xNONQUOTED     inside an unquoted identifier or keyword
 *   xVARQUOTED     inside a double-quoted variable name (after $)
 *   xSINGLEQUOTED  inside a single-quoted string
 *   xCOMMENT       inside a C-style comment
 */
%x xQUOTED
%x xNONQUOTED
%x xVARQUOTED
%x xSINGLEQUOTED
%x xCOMMENT

/*
 * Character classes.  A "special" character is returned to the parser as
 * itself; "any" is every character that is not special, a quote, a
 * backslash or whitespace.  "unicode" and "hex_char" describe a single
 * well-formed escape sequence.
 */
special          [\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/]
any                     [^\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/\\\"\' \t\n\r\f]
blank           [ \t\n\r\f]
hex_dig         [0-9A-Fa-f]
unicode         \\u({hex_dig}{4}|\{{hex_dig}{1,6}\})
hex_char        \\x{hex_dig}{2}
77
78
79 %%
80
        /* Operators that have dedicated token codes */
<INITIAL>\&\&                                   { return AND_P; }

<INITIAL>\|\|                                   { return OR_P; }

<INITIAL>\!                                             { return NOT_P; }

<INITIAL>\*\*                                   { return ANY_P; }

<INITIAL>\<                                             { return LESS_P; }

<INITIAL>\<\=                                   { return LESSEQUAL_P; }

<INITIAL>\=\=                                   { return EQUAL_P; }

        /* both spellings of "not equal" yield the same token */
<INITIAL>\<\>                                   { return NOTEQUAL_P; }

<INITIAL>\!\=                                   { return NOTEQUAL_P; }

<INITIAL>\>\=                                   { return GREATEREQUAL_P; }

<INITIAL>\>                                             { return GREATER_P; }

        /* Unquoted variable: the text after the '$' is the variable name */
<INITIAL>\${any}+                               {
                                                                        addstring(true, yytext + 1, yyleng - 1);
                                                                        addchar(false, '\0');
                                                                        yylval->str = scanstring;
                                                                        return VARIABLE_P;
                                                                }

        /* Quoted variable: start an empty accumulator, collect in xVARQUOTED */
<INITIAL>\$\"                                   {
                                                                        addchar(true, '\0');
                                                                        BEGIN xVARQUOTED;
                                                                }

        /* Any other special character is returned as its own token code */
<INITIAL>{special}                              { return *yytext; }

<INITIAL>{blank}+                               { /* ignore */ }

        /* Comment opener: the accumulator is reset, then xCOMMENT skips the body */
<INITIAL>\/\*                                   {
                                                                        addchar(true, '\0');
                                                                        BEGIN xCOMMENT;
                                                                }

        /*
         * Numeric literals.  The matched text is passed on as a NUL-terminated
         * string; forms with a fraction or exponent are NUMERIC_P, plain digit
         * runs are INT_P.
         */
<INITIAL>[0-9]+(\.[0-9]+)?[eE][+-]?[0-9]+  /* float */  {
                                                                        addstring(true, yytext, yyleng);
                                                                        addchar(false, '\0');
                                                                        yylval->str = scanstring;
                                                                        return NUMERIC_P;
                                                                }

<INITIAL>\.[0-9]+[eE][+-]?[0-9]+  /* float */  {
                                                                        addstring(true, yytext, yyleng);
                                                                        addchar(false, '\0');
                                                                        yylval->str = scanstring;
                                                                        return NUMERIC_P;
                                                                }

<INITIAL>([0-9]+)?\.[0-9]+              {
                                                                        addstring(true, yytext, yyleng);
                                                                        addchar(false, '\0');
                                                                        yylval->str = scanstring;
                                                                        return NUMERIC_P;
                                                                }

<INITIAL>[0-9]+                                 {
                                                                        addstring(true, yytext, yyleng);
                                                                        addchar(false, '\0');
                                                                        yylval->str = scanstring;
                                                                        return INT_P;
                                                                }

        /* Start of an unquoted identifier/keyword; the rest is read in xNONQUOTED */
<INITIAL>{any}+                                 {
                                                                        addstring(true, yytext, yyleng);
                                                                        BEGIN xNONQUOTED;
                                                                }

        /* Opening quotes: start an empty accumulator and switch state */
<INITIAL>\"                                             {
                                                                        addchar(true, '\0');
                                                                        BEGIN xQUOTED;
                                                                }

<INITIAL>\'                                             {
                                                                        addchar(true, '\0');
                                                                        BEGIN xSINGLEQUOTED;
                                                                }

        /*
         * An unquoted word may begin with an escape: push the backslash back
         * and let the xNONQUOTED escape rules consume it.
         */
<INITIAL>\\                                             {
                                                                        yyless(0);
                                                                        addchar(true, '\0');
                                                                        BEGIN xNONQUOTED;
                                                                }
172
173 <xNONQUOTED>{any}+                              {
174                                                                         addstring(false, yytext, yyleng);
175                                                                 }
176
177 <xNONQUOTED>{blank}+                    {
178                                                                         yylval->str = scanstring;
179                                                                         BEGIN INITIAL;
180                                                                         return checkSpecialVal();
181                                                                 }
182
183
184 <xNONQUOTED>\/\*                                {
185                                                                         yylval->str = scanstring;
186                                                                         BEGIN xCOMMENT;
187                                                                 }
188
189 <xNONQUOTED>({special}|\"|\')   {
190                                                                         yylval->str = scanstring;
191                                                                         yyless(0);
192                                                                         BEGIN INITIAL;
193                                                                         return checkSpecialVal();
194                                                                 }
195
196 <xNONQUOTED><<EOF>>                             {
197                                                                         yylval->str = scanstring;
198                                                                         BEGIN INITIAL;
199                                                                         return checkSpecialVal();
200                                                                 }
201
        /*
         * Escape sequences, shared by unquoted words and all quoted forms.
         * Each recognized escape appends the character it denotes to scanstring.
         */
<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\[\"\'\\] { addchar(false, yytext[1]); }

<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\b        { addchar(false, '\b'); }

<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\f        { addchar(false, '\f'); }

<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\n        { addchar(false, '\n'); }

<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\r        { addchar(false, '\r'); }

<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\t        { addchar(false, '\t'); }

<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\v        { addchar(false, '\v'); }

        /* A whole run of adjacent escapes is decoded in a single call */
<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>{unicode}+         { parseUnicode(yytext, yyleng); }

<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>{hex_char}+        { parseHexChars(yytext, yyleng); }

        /*
         * Error cases.  These are reached only when none of the well-formed
         * escape patterns above matched at this position.
         */
<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\x        { yyerror(NULL, "Hex character sequence is invalid"); }

<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\u        { yyerror(NULL, "Unicode sequence is invalid"); }

<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\.        { yyerror(NULL, "Escape sequence is invalid"); }

<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\         { yyerror(NULL, "Unexpected end after backslash"); }

<xQUOTED,xVARQUOTED,xSINGLEQUOTED><<EOF>>                       { yyerror(NULL, "Unexpected end of quoted string"); }

        /* Closing quotes: hand the accumulated text to the parser */
<xQUOTED>\"                                             {
                                                                        yylval->str = scanstring;
                                                                        BEGIN INITIAL;
                                                                        return STRING_P;
                                                                }

<xVARQUOTED>\"                                  {
                                                                        yylval->str = scanstring;
                                                                        BEGIN INITIAL;
                                                                        return VARIABLE_P;
                                                                }

<xSINGLEQUOTED>\'                               {
                                                                        yylval->str = scanstring;
                                                                        BEGIN INITIAL;
                                                                        return STRING_P;
                                                                }

        /* Ordinary string content: everything up to the next backslash or closing quote */
<xQUOTED,xVARQUOTED>[^\\\"]+    { addstring(false, yytext, yyleng); }

<xSINGLEQUOTED>[^\\\']+                 { addstring(false, yytext, yyleng); }

<INITIAL><<EOF>>                                { yyterminate(); }

        /* Comment body is discarded; the closing marker returns to INITIAL */
<xCOMMENT>\*\/                                  { BEGIN INITIAL; }

<xCOMMENT>[^\*]+                                { }

<xCOMMENT>\*                                    { }

<xCOMMENT><<EOF>>                               { yyerror(NULL, "Unexpected end of comment"); }
261
262 %%
263
264 void
265 jsonpath_yyerror(JsonPathParseResult **result, const char *message)
266 {
267         if (*yytext == YY_END_OF_BUFFER_CHAR)
268         {
269                 ereport(ERROR,
270                                 (errcode(ERRCODE_SYNTAX_ERROR),
271                                  errmsg("bad jsonpath representation"),
272                                  /* translator: %s is typically "syntax error" */
273                                  errdetail("%s at end of input", message)));
274         }
275         else
276         {
277                 ereport(ERROR,
278                                 (errcode(ERRCODE_SYNTAX_ERROR),
279                                  errmsg("bad jsonpath representation"),
280                                  /* translator: first %s is typically "syntax error" */
281                                  errdetail("%s at or near \"%s\"", message, yytext)));
282         }
283 }
284
/*
 * One entry of the keyword table consulted by checkSpecialVal().
 */
typedef struct keyword
{
        int16   len;            /* length of the keyword's spelling */
        bool    lowercase;      /* if true, only the exact (case-sensitive)
                                 * spelling below counts as the keyword */
        int             val;    /* token code returned for this keyword */
        char    *keyword;       /* spelling */
} keyword;

/*
 * The array of keywords must be sorted by length first and alphabetically
 * within each length: checkSpecialVal() binary-searches it using exactly
 * that ordering.
 */

static keyword keywords[] = {
        { 2, false,     IS_P,           "is"},
        { 2, false,     TO_P,           "to"},
        { 3, false,     ABS_P,          "abs"},
        { 3, false,     LAX_P,          "lax"},
        { 4, false,     FLAG_P,         "flag"},
        { 4, false,     LAST_P,         "last"},
        { 4, true,      NULL_P,         "null"},
        { 4, false,     SIZE_P,         "size"},
        { 4, true,      TRUE_P,         "true"},
        { 4, false,     TYPE_P,         "type"},
        { 4, false,     WITH_P,         "with"},
        { 5, true,      FALSE_P,        "false"},
        { 5, false,     FLOOR_P,        "floor"},
        { 6, false,     DOUBLE_P,       "double"},
        { 6, false,     EXISTS_P,       "exists"},
        { 6, false,     STARTS_P,       "starts"},
        { 6, false,     STRICT_P,       "strict"},
        { 7, false,     CEILING_P,      "ceiling"},
        { 7, false,     UNKNOWN_P,      "unknown"},
        { 8, false,     KEYVALUE_P,     "keyvalue"},
        { 10,false, LIKE_REGEX_P, "like_regex"},
};
321
322 static int
323 checkSpecialVal()
324 {
325         int                     res = IDENT_P;
326         int                     diff;
327         keyword         *StopLow = keywords,
328                                 *StopHigh = keywords + lengthof(keywords),
329                                 *StopMiddle;
330
331         if (scanstring.len > keywords[lengthof(keywords) - 1].len)
332                 return res;
333
334         while(StopLow < StopHigh)
335         {
336                 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
337
338                 if (StopMiddle->len == scanstring.len)
339                         diff = pg_strncasecmp(StopMiddle->keyword, scanstring.val,
340                                                                   scanstring.len);
341                 else
342                         diff = StopMiddle->len - scanstring.len;
343
344                 if (diff < 0)
345                         StopLow = StopMiddle + 1;
346                 else if (diff > 0)
347                         StopHigh = StopMiddle;
348                 else
349                 {
350                         if (StopMiddle->lowercase)
351                                 diff = strncmp(StopMiddle->keyword, scanstring.val,
352                                                            scanstring.len);
353
354                         if (diff == 0)
355                                 res = StopMiddle->val;
356
357                         break;
358                 }
359         }
360
361         return res;
362 }
363
364 /*
365  * Called before any actual parsing is done
366  */
367 static void
368 jsonpath_scanner_init(const char *str, int slen)
369 {
370         if (slen <= 0)
371                 slen = strlen(str);
372
373         /*
374          * Might be left over after ereport()
375          */
376         yy_init_globals();
377
378         /*
379          * Make a scan buffer with special termination needed by flex.
380          */
381
382         scanbuflen = slen;
383         scanbuf = palloc(slen + 2);
384         memcpy(scanbuf, str, slen);
385         scanbuf[slen] = scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
386         scanbufhandle = yy_scan_buffer(scanbuf, slen + 2);
387
388         BEGIN(INITIAL);
389 }
390
391
392 /*
393  * Called after parsing is done to clean up after jsonpath_scanner_init()
394  */
395 static void
396 jsonpath_scanner_finish(void)
397 {
398         yy_delete_buffer(scanbufhandle);
399         pfree(scanbuf);
400 }
401
402 static void
403 addstring(bool init, char *s, int l)
404 {
405         if (init)
406         {
407                 scanstring.total = 32;
408                 scanstring.val = palloc(scanstring.total);
409                 scanstring.len = 0;
410         }
411
412         if (s && l)
413         {
414                 while(scanstring.len + l + 1 >= scanstring.total)
415                 {
416                         scanstring.total *= 2;
417                         scanstring.val = repalloc(scanstring.val, scanstring.total);
418                 }
419
420                 memcpy(scanstring.val + scanstring.len, s, l);
421                 scanstring.len += l;
422         }
423 }
424
425 static void
426 addchar(bool init, char s)
427 {
428         if (init)
429         {
430                 scanstring.total = 32;
431                 scanstring.val = palloc(scanstring.total);
432                 scanstring.len = 0;
433         }
434         else if(scanstring.len + 1 >= scanstring.total)
435         {
436                 scanstring.total *= 2;
437                 scanstring.val = repalloc(scanstring.val, scanstring.total);
438         }
439
440         scanstring.val[ scanstring.len ] = s;
441         if (s != '\0')
442                 scanstring.len++;
443 }
444
445 JsonPathParseResult *
446 parsejsonpath(const char *str, int len)
447 {
448         JsonPathParseResult     *parseresult;
449
450         jsonpath_scanner_init(str, len);
451
452         if (jsonpath_yyparse((void*)&parseresult) != 0)
453                 jsonpath_yyerror(NULL, "bugus input");
454
455         jsonpath_scanner_finish();
456
457         return parseresult;
458 }
459
460 static int
461 hexval(char c)
462 {
463         if (c >= '0' && c <= '9')
464                 return c - '0';
465         if (c >= 'a' && c <= 'f')
466                 return c - 'a' + 0xA;
467         if (c >= 'A' && c <= 'F')
468                 return c - 'A' + 0xA;
469         elog(ERROR, "invalid hexadecimal digit");
470         return 0; /* not reached */
471 }
472
473 static void
474 addUnicodeChar(int ch)
475 {
476         /*
477          * For UTF8, replace the escape sequence by the actual
478          * utf8 character in lex->strval. Do this also for other
479          * encodings if the escape designates an ASCII character,
480          * otherwise raise an error.
481          */
482
483         if (ch == 0)
484         {
485                 /* We can't allow this, since our TEXT type doesn't */
486                 ereport(ERROR,
487                                 (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
488                                  errmsg("unsupported Unicode escape sequence"),
489                                   errdetail("\\u0000 cannot be converted to text.")));
490         }
491         else if (GetDatabaseEncoding() == PG_UTF8)
492         {
493                 char utf8str[5];
494                 int utf8len;
495
496                 unicode_to_utf8(ch, (unsigned char *) utf8str);
497                 utf8len = pg_utf_mblen((unsigned char *) utf8str);
498                 addstring(false, utf8str, utf8len);
499         }
500         else if (ch <= 0x007f)
501         {
502                 /*
503                  * This is the only way to designate things like a
504                  * form feed character in JSON, so it's useful in all
505                  * encodings.
506                  */
507                 addchar(false, (char) ch);
508         }
509         else
510         {
511                 ereport(ERROR,
512                                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
513                                  errmsg("invalid input syntax for type jsonpath"),
514                                  errdetail("Unicode escape values cannot be used for code "
515                                                    "point values above 007F when the server encoding "
516                                                    "is not UTF8.")));
517         }
518 }
519
520 static void
521 addUnicode(int ch, int *hi_surrogate)
522 {
523         if (ch >= 0xd800 && ch <= 0xdbff)
524         {
525                 if (*hi_surrogate != -1)
526                         ereport(ERROR,
527                                         (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
528                                          errmsg("invalid input syntax for type jsonpath"),
529                                          errdetail("Unicode high surrogate must not follow "
530                                                            "a high surrogate.")));
531                 *hi_surrogate = (ch & 0x3ff) << 10;
532                 return;
533         }
534         else if (ch >= 0xdc00 && ch <= 0xdfff)
535         {
536                 if (*hi_surrogate == -1)
537                         ereport(ERROR,
538                                         (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
539                                          errmsg("invalid input syntax for type jsonpath"),
540                                          errdetail("Unicode low surrogate must follow a high "
541                                                            "surrogate.")));
542                 ch = 0x10000 + *hi_surrogate + (ch & 0x3ff);
543                 *hi_surrogate = -1;
544         }
545         else if (*hi_surrogate != -1)
546         {
547                 ereport(ERROR,
548                                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
549                                  errmsg("invalid input syntax for type jsonpath"),
550                                  errdetail("Unicode low surrogate must follow a high "
551                                                    "surrogate.")));
552         }
553
554         addUnicodeChar(ch);
555 }
556
557 /*
558  * parseUnicode was adopted from json_lex_string() in
559  * src/backend/utils/adt/json.c
560  */
561 static void
562 parseUnicode(char *s, int l)
563 {
564         int                     i;
565         int                     hi_surrogate = -1;
566
567         for (i = 2; i < l; i += 2)      /* skip '\u' */
568         {
569                 int                     ch = 0;
570                 int                     j;
571
572                 if (s[i] == '{')        /* parse '\u{XX...}' */
573                 {
574                         while (s[++i] != '}' && i < l)
575                                 ch = (ch << 4) | hexval(s[i]);
576                         i++;    /* ski p '}' */
577                 }
578                 else            /* parse '\uXXXX' */
579                 {
580                         for (j = 0; j < 4 && i < l; j++)
581                                 ch = (ch << 4) | hexval(s[i++]);
582                 }
583
584                 addUnicode(ch, &hi_surrogate);
585         }
586
587         if (hi_surrogate != -1)
588         {
589                 ereport(ERROR,
590                                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
591                                  errmsg("invalid input syntax for type jsonpath"),
592                                  errdetail("Unicode low surrogate must follow a high "
593                                                    "surrogate.")));
594         }
595 }
596
/*
 * parseHexChars
 *      Decode a run of two-digit hex escapes (four bytes each: backslash,
 *      'x', two hex digits) and append each resulting character to
 *      scanstring via addUnicodeChar().
 */
static void
parseHexChars(char *s, int l)
{
        char       *esc = s;
        int                     remaining;

        Assert(l % 4 /* \xXX */ == 0);

        /* one iteration per complete four-byte escape */
        for (remaining = l / 4; remaining > 0; remaining--)
        {
                int                     hi = hexval(esc[2]);
                int                     lo = hexval(esc[3]);

                addUnicodeChar((hi << 4) | lo);
                esc += 4;
        }
}
611
612 /*
613  * Interface functions to make flex use palloc() instead of malloc().
614  * It'd be better to make these static, but flex insists otherwise.
615  */
616
/* Allocation hook for the generated scanner: returns palloc(bytes). */
void *
jsonpath_yyalloc(yy_size_t bytes)
{
        return palloc(bytes);
}
622
623 void *
624 jsonpath_yyrealloc(void *ptr, yy_size_t bytes)
625 {
626         if (ptr)
627                 return repalloc(ptr, bytes);
628         else
629                 return palloc(bytes);
630 }
631
/*
 * Release hook for the generated scanner.  A NULL "ptr" is ignored;
 * anything else is handed to pfree().
 */
void
jsonpath_yyfree(void *ptr)
{
        if (ptr == NULL)
                return;

        pfree(ptr);
}
638