1 /*-------------------------------------------------------------------------
4 * Lexical parser for jsonpath datatype
6 * Copyright (c) 2019, PostgreSQL Global Development Group
9 * src/backend/utils/adt/jsonpath_scan.l
11 *-------------------------------------------------------------------------
17 #include "mb/pg_wchar.h"
18 #include "nodes/pg_list.h"
19 #include "utils/jsonpath_scanner.h"
21 static string scanstring;
23 /* No reason to constrain amount of data slurped */
24 /* #define YY_READ_BUF_SIZE 16777216 */
26 /* Handles to the buffer that the lexer uses internally */
27 static YY_BUFFER_STATE scanbufhandle;
29 static int scanbuflen;
31 static void addstring(bool init, char *s, int l);
32 static void addchar(bool init, char s);
33 static int checkSpecialVal(void); /* examine scanstring for the special
36 static void parseUnicode(char *s, int l);
37 static void parseHexChars(char *s, int l);
39 /* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
41 #define fprintf(file, fmt, msg) fprintf_to_ereport(fmt, msg)
44 fprintf_to_ereport(const char *fmt, const char *msg)
46 ereport(ERROR, (errmsg_internal("%s", msg)));
49 #define yyerror jsonpath_yyerror
53 %option never-interactive
59 %option prefix="jsonpath_yy"
71 special [\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/]
72 any [^\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/\\\"\' \t\n\r\f]
75 unicode \\u({hex_dig}{4}|\{{hex_dig}{1,6}\})
76 hex_char \\x{hex_dig}{2}
81 <INITIAL>\&\& { return AND_P; }
83 <INITIAL>\|\| { return OR_P; }
85 <INITIAL>\! { return NOT_P; }
87 <INITIAL>\*\* { return ANY_P; }
89 <INITIAL>\< { return LESS_P; }
91 <INITIAL>\<\= { return LESSEQUAL_P; }
93 <INITIAL>\=\= { return EQUAL_P; }
95 <INITIAL>\<\> { return NOTEQUAL_P; }
97 <INITIAL>\!\= { return NOTEQUAL_P; }
99 <INITIAL>\>\= { return GREATEREQUAL_P; }
101 <INITIAL>\> { return GREATER_P; }
104 addstring(true, yytext + 1, yyleng - 1);
105 addchar(false, '\0');
106 yylval->str = scanstring;
115 <INITIAL>{special} { return *yytext; }
117 <INITIAL>{blank}+ { /* ignore */ }
124 <INITIAL>[0-9]+(\.[0-9]+)?[eE][+-]?[0-9]+ /* float */ {
125 addstring(true, yytext, yyleng);
126 addchar(false, '\0');
127 yylval->str = scanstring;
131 <INITIAL>\.[0-9]+[eE][+-]?[0-9]+ /* float */ {
132 addstring(true, yytext, yyleng);
133 addchar(false, '\0');
134 yylval->str = scanstring;
138 <INITIAL>([0-9]+)?\.[0-9]+ {
139 addstring(true, yytext, yyleng);
140 addchar(false, '\0');
141 yylval->str = scanstring;
146 addstring(true, yytext, yyleng);
147 addchar(false, '\0');
148 yylval->str = scanstring;
153 addstring(true, yytext, yyleng);
174 addstring(false, yytext, yyleng);
177 <xNONQUOTED>{blank}+ {
178 yylval->str = scanstring;
180 return checkSpecialVal();
185 yylval->str = scanstring;
189 <xNONQUOTED>({special}|\"|\') {
190 yylval->str = scanstring;
193 return checkSpecialVal();
196 <xNONQUOTED><<EOF>> {
197 yylval->str = scanstring;
199 return checkSpecialVal();
202 <xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\[\"\'\\] { addchar(false, yytext[1]); }
204 <xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\b { addchar(false, '\b'); }
206 <xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\f { addchar(false, '\f'); }
208 <xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\n { addchar(false, '\n'); }
210 <xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\r { addchar(false, '\r'); }
212 <xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\t { addchar(false, '\t'); }
214 <xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\v { addchar(false, '\v'); }
216 <xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>{unicode}+ { parseUnicode(yytext, yyleng); }
218 <xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>{hex_char}+ { parseHexChars(yytext, yyleng); }
220 <xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\x { yyerror(NULL, "Hex character sequence is invalid"); }
222 <xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\u { yyerror(NULL, "Unicode sequence is invalid"); }
224 <xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\. { yyerror(NULL, "Escape sequence is invalid"); }
226 <xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\ { yyerror(NULL, "Unexpected end after backslash"); }
228 <xQUOTED,xVARQUOTED,xSINGLEQUOTED><<EOF>> { yyerror(NULL, "Unexpected end of quoted string"); }
231 yylval->str = scanstring;
237 yylval->str = scanstring;
243 yylval->str = scanstring;
248 <xQUOTED,xVARQUOTED>[^\\\"]+ { addstring(false, yytext, yyleng); }
250 <xSINGLEQUOTED>[^\\\']+ { addstring(false, yytext, yyleng); }
252 <INITIAL><<EOF>> { yyterminate(); }
254 <xCOMMENT>\*\/ { BEGIN INITIAL; }
260 <xCOMMENT><<EOF>> { yyerror(NULL, "Unexpected end of comment"); }
265 jsonpath_yyerror(JsonPathParseResult **result, const char *message)
267 if (*yytext == YY_END_OF_BUFFER_CHAR)
270 (errcode(ERRCODE_SYNTAX_ERROR),
271 errmsg("bad jsonpath representation"),
272 /* translator: %s is typically "syntax error" */
273 errdetail("%s at end of input", message)));
278 (errcode(ERRCODE_SYNTAX_ERROR),
279 errmsg("bad jsonpath representation"),
280 /* translator: first %s is typically "syntax error" */
281 errdetail("%s at or near \"%s\"", message, yytext)));
285 typedef struct keyword
294 * Array of keywords must be sorted by length and then
298 static const keyword keywords[] = {
299 { 2, false, IS_P, "is"},
300 { 2, false, TO_P, "to"},
301 { 3, false, ABS_P, "abs"},
302 { 3, false, LAX_P, "lax"},
303 { 4, false, FLAG_P, "flag"},
304 { 4, false, LAST_P, "last"},
305 { 4, true, NULL_P, "null"},
306 { 4, false, SIZE_P, "size"},
307 { 4, true, TRUE_P, "true"},
308 { 4, false, TYPE_P, "type"},
309 { 4, false, WITH_P, "with"},
310 { 5, true, FALSE_P, "false"},
311 { 5, false, FLOOR_P, "floor"},
312 { 6, false, DOUBLE_P, "double"},
313 { 6, false, EXISTS_P, "exists"},
314 { 6, false, STARTS_P, "starts"},
315 { 6, false, STRICT_P, "strict"},
316 { 7, false, CEILING_P, "ceiling"},
317 { 7, false, UNKNOWN_P, "unknown"},
318 { 8, false, KEYVALUE_P, "keyvalue"},
319 { 10,false, LIKE_REGEX_P, "like_regex"},
327 const keyword *StopLow = keywords,
328 *StopHigh = keywords + lengthof(keywords),
331 if (scanstring.len > keywords[lengthof(keywords) - 1].len)
334 while(StopLow < StopHigh)
336 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
338 if (StopMiddle->len == scanstring.len)
339 diff = pg_strncasecmp(StopMiddle->keyword, scanstring.val,
342 diff = StopMiddle->len - scanstring.len;
345 StopLow = StopMiddle + 1;
347 StopHigh = StopMiddle;
350 if (StopMiddle->lowercase)
351 diff = strncmp(StopMiddle->keyword, scanstring.val,
355 res = StopMiddle->val;
365 * Called before any actual parsing is done
368 jsonpath_scanner_init(const char *str, int slen)
374 * Might be left over after ereport()
379 * Make a scan buffer with special termination needed by flex.
383 scanbuf = palloc(slen + 2);
384 memcpy(scanbuf, str, slen);
385 scanbuf[slen] = scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
386 scanbufhandle = yy_scan_buffer(scanbuf, slen + 2);
393 * Called after parsing is done to clean up after jsonpath_scanner_init()
396 jsonpath_scanner_finish(void)
398 yy_delete_buffer(scanbufhandle);
403 addstring(bool init, char *s, int l)
407 scanstring.total = 32;
408 scanstring.val = palloc(scanstring.total);
414 while(scanstring.len + l + 1 >= scanstring.total)
416 scanstring.total *= 2;
417 scanstring.val = repalloc(scanstring.val, scanstring.total);
420 memcpy(scanstring.val + scanstring.len, s, l);
426 addchar(bool init, char s)
430 scanstring.total = 32;
431 scanstring.val = palloc(scanstring.total);
434 else if(scanstring.len + 1 >= scanstring.total)
436 scanstring.total *= 2;
437 scanstring.val = repalloc(scanstring.val, scanstring.total);
440 scanstring.val[ scanstring.len ] = s;
/*
 * Parse a jsonpath string of the given length.
 *
 * Sets up the scanner on the input with jsonpath_scanner_init(), runs the
 * grammar via jsonpath_yyparse(), and cleans up with
 * jsonpath_scanner_finish().  A non-zero result from jsonpath_yyparse() is
 * reported through jsonpath_yyerror().
 */
445 JsonPathParseResult *
446 parsejsonpath(const char *str, int len)
448 JsonPathParseResult *parseresult;
450 jsonpath_scanner_init(str, len);
452 if (jsonpath_yyparse((void*)&parseresult) != 0)
453 jsonpath_yyerror(NULL, "bogus input");
455 jsonpath_scanner_finish();
463 if (c >= '0' && c <= '9')
465 if (c >= 'a' && c <= 'f')
466 return c - 'a' + 0xA;
467 if (c >= 'A' && c <= 'F')
468 return c - 'A' + 0xA;
469 elog(ERROR, "invalid hexadecimal digit");
470 return 0; /* not reached */
474 addUnicodeChar(int ch)
477 * For UTF8, replace the escape sequence by the actual
478 * utf8 character in scanstring. Do this also for other
479 * encodings if the escape designates an ASCII character,
480 * otherwise raise an error.
485 /* We can't allow this, since our TEXT type doesn't */
487 (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
488 errmsg("unsupported Unicode escape sequence"),
489 errdetail("\\u0000 cannot be converted to text.")));
491 else if (GetDatabaseEncoding() == PG_UTF8)
496 unicode_to_utf8(ch, (unsigned char *) utf8str);
497 utf8len = pg_utf_mblen((unsigned char *) utf8str);
498 addstring(false, utf8str, utf8len);
500 else if (ch <= 0x007f)
503 * This is the only way to designate things like a
504 * form feed character in JSON, so it's useful in all
507 addchar(false, (char) ch);
512 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
513 errmsg("invalid input syntax for type jsonpath"),
514 errdetail("Unicode escape values cannot be used for code "
515 "point values above 007F when the server encoding "
521 addUnicode(int ch, int *hi_surrogate)
523 if (ch >= 0xd800 && ch <= 0xdbff)
525 if (*hi_surrogate != -1)
527 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
528 errmsg("invalid input syntax for type jsonpath"),
529 errdetail("Unicode high surrogate must not follow "
530 "a high surrogate.")));
531 *hi_surrogate = (ch & 0x3ff) << 10;
534 else if (ch >= 0xdc00 && ch <= 0xdfff)
536 if (*hi_surrogate == -1)
538 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
539 errmsg("invalid input syntax for type jsonpath"),
540 errdetail("Unicode low surrogate must follow a high "
542 ch = 0x10000 + *hi_surrogate + (ch & 0x3ff);
545 else if (*hi_surrogate != -1)
548 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
549 errmsg("invalid input syntax for type jsonpath"),
550 errdetail("Unicode low surrogate must follow a high "
558 * parseUnicode was adapted from json_lex_string() in
559 * src/backend/utils/adt/json.c
562 parseUnicode(char *s, int l)
565 int hi_surrogate = -1;
567 for (i = 2; i < l; i += 2) /* skip '\u' */
572 if (s[i] == '{') /* parse '\u{XX...}' */
574 while (s[++i] != '}' && i < l)
575 ch = (ch << 4) | hexval(s[i]);
578 else /* parse '\uXXXX' */
580 for (j = 0; j < 4 && i < l; j++)
581 ch = (ch << 4) | hexval(s[i++]);
584 addUnicode(ch, &hi_surrogate);
587 if (hi_surrogate != -1)
590 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
591 errmsg("invalid input syntax for type jsonpath"),
592 errdetail("Unicode low surrogate must follow a high "
598 parseHexChars(char *s, int l)
602 Assert(l % 4 /* \xXX */ == 0);
604 for (i = 0; i < l / 4; i++)
606 int ch = (hexval(s[i * 4 + 2]) << 4) | hexval(s[i * 4 + 3]);
613 * Interface functions to make flex use palloc() instead of malloc().
614 * It'd be better to make these static, but flex insists otherwise.
618 jsonpath_yyalloc(yy_size_t bytes)
620 return palloc(bytes);
624 jsonpath_yyrealloc(void *ptr, yy_size_t bytes)
627 return repalloc(ptr, bytes);
629 return palloc(bytes);
633 jsonpath_yyfree(void *ptr)