2 /*-------------------------------------------------------------------------
5 * Lexical parser for jsonpath datatype
7 * Splits jsonpath string into tokens represented as JsonPathString structs.
8 * Decodes unicode and hex escaped strings.
10 * Copyright (c) 2019, PostgreSQL Global Development Group
13 * src/backend/utils/adt/jsonpath_scan.l
15 *-------------------------------------------------------------------------
20 #include "mb/pg_wchar.h"
21 #include "nodes/pg_list.h"
23 static JsonPathString scanstring;
25 /* Handles to the buffer that the lexer uses internally */
26 static YY_BUFFER_STATE scanbufhandle;
28 static int scanbuflen;
30 static void addstring(bool init, char *s, int l);
31 static void addchar(bool init, char s);
32 static enum yytokentype checkKeyword(void);
33 static void parseUnicode(char *s, int l);
34 static void parseHexChar(char *s);
36 /* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
38 #define fprintf(file, fmt, msg) fprintf_to_ereport(fmt, msg)
41 fprintf_to_ereport(const char *fmt, const char *msg)
43 ereport(ERROR, (errmsg_internal("%s", msg)));
49 %option never-interactive
55 %option prefix="jsonpath_yy"
62 * We use exclusive states for quoted, single-quoted and non-quoted strings,
63 * quoted variable names and C-style comments.
65 * <xq> - quoted strings
66 * <xnq> - non-quoted strings
67 * <xvq> - quoted variable names
68 * <xsq> - single-quoted strings
69 * <xc> - C-style comment
78 special [\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/]
79 any [^\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/\\\"\' \t\n\r\f]
83 integer (0|[1-9]{digit}*)
84 decimal {integer}\.{digit}+
85 decimalfail {integer}\.
86 real ({integer}|{decimal})[Ee][-+]?{digit}+
87 realfail1 ({integer}|{decimal})[Ee]
88 realfail2 ({integer}|{decimal})[Ee][-+]
91 unicode \\u({hex_dig}{4}|\{{hex_dig}{1,6}\})
92 unicodefail \\u({hex_dig}{0,3}|\{{hex_dig}{0,6})
93 hex_char \\x{hex_dig}{2}
94 hex_fail \\x{hex_dig}{0,1}
99 addstring(false, yytext, yyleng);
103 yylval->str = scanstring;
105 return checkKeyword();
110 yylval->str = scanstring;
114 <xnq>({special}|\"|\') {
115 yylval->str = scanstring;
118 return checkKeyword();
122 yylval->str = scanstring;
124 return checkKeyword();
127 <xnq,xq,xvq,xsq>\\[\"\'\\] { addchar(false, yytext[1]); }
129 <xnq,xq,xvq,xsq>\\b { addchar(false, '\b'); }
131 <xnq,xq,xvq,xsq>\\f { addchar(false, '\f'); }
133 <xnq,xq,xvq,xsq>\\n { addchar(false, '\n'); }
135 <xnq,xq,xvq,xsq>\\r { addchar(false, '\r'); }
137 <xnq,xq,xvq,xsq>\\t { addchar(false, '\t'); }
139 <xnq,xq,xvq,xsq>\\v { addchar(false, '\v'); }
141 <xnq,xq,xvq,xsq>{unicode}+ { parseUnicode(yytext, yyleng); }
143 <xnq,xq,xvq,xsq>{hex_char} { parseHexChar(yytext); }
145 <xnq,xq,xvq,xsq>{unicode}*{unicodefail} { yyerror(NULL, "invalid unicode sequence"); }
147 <xnq,xq,xvq,xsq>{hex_fail} { yyerror(NULL, "invalid hex character sequence"); }
149 <xnq,xq,xvq,xsq>{unicode}+\\ {
150 /* throw back the \\, and treat as unicode */
152 parseUnicode(yytext, yyleng);
155 <xnq,xq,xvq,xsq>\\. { yyerror(NULL, "escape sequence is invalid"); }
157 <xnq,xq,xvq,xsq>\\ { yyerror(NULL, "unexpected end after backslash"); }
159 <xq,xvq,xsq><<EOF>> { yyerror(NULL, "unexpected end of quoted string"); }
162 yylval->str = scanstring;
168 yylval->str = scanstring;
174 yylval->str = scanstring;
179 <xq,xvq>[^\\\"]+ { addstring(false, yytext, yyleng); }
181 <xsq>[^\\\']+ { addstring(false, yytext, yyleng); }
183 <xc>\*\/ { BEGIN INITIAL; }
189 <xc><<EOF>> { yyerror(NULL, "unexpected end of comment"); }
191 \&\& { return AND_P; }
193 \|\| { return OR_P; }
197 \*\* { return ANY_P; }
199 \< { return LESS_P; }
201 \<\= { return LESSEQUAL_P; }
203 \=\= { return EQUAL_P; }
205 \<\> { return NOTEQUAL_P; }
207 \!\= { return NOTEQUAL_P; }
209 \>\= { return GREATEREQUAL_P; }
211 \> { return GREATER_P; }
214 addstring(true, yytext + 1, yyleng - 1);
215 addchar(false, '\0');
216 yylval->str = scanstring;
225 {special} { return *yytext; }
227 {blank}+ { /* ignore */ }
235 addstring(true, yytext, yyleng);
236 addchar(false, '\0');
237 yylval->str = scanstring;
242 addstring(true, yytext, yyleng);
243 addchar(false, '\0');
244 yylval->str = scanstring;
249 addstring(true, yytext, yyleng);
250 addchar(false, '\0');
251 yylval->str = scanstring;
256 /* throw back the ., and treat as integer */
258 addstring(true, yytext, yyleng);
259 addchar(false, '\0');
260 yylval->str = scanstring;
264 ({realfail1}|{realfail2}) { yyerror(NULL, "invalid floating point number"); }
267 addstring(true, yytext, yyleng);
287 <<EOF>> { yyterminate(); }
292 jsonpath_yyerror(JsonPathParseResult **result, const char *message)
294 if (*yytext == YY_END_OF_BUFFER_CHAR)
297 (errcode(ERRCODE_SYNTAX_ERROR),
298 /* translator: %s is typically "syntax error" */
299 errmsg("%s at end of jsonpath input", _(message))));
304 (errcode(ERRCODE_SYNTAX_ERROR),
305 /* translator: first %s is typically "syntax error" */
306 errmsg("%s at or near \"%s\" of jsonpath input",
307 _(message), yytext)));
311 typedef struct JsonPathKeyword
320 * Array of key words should be sorted by length and then
323 static const JsonPathKeyword keywords[] = {
324 { 2, false, IS_P, "is"},
325 { 2, false, TO_P, "to"},
326 { 3, false, ABS_P, "abs"},
327 { 3, false, LAX_P, "lax"},
328 { 4, false, FLAG_P, "flag"},
329 { 4, false, LAST_P, "last"},
330 { 4, true, NULL_P, "null"},
331 { 4, false, SIZE_P, "size"},
332 { 4, true, TRUE_P, "true"},
333 { 4, false, TYPE_P, "type"},
334 { 4, false, WITH_P, "with"},
335 { 5, true, FALSE_P, "false"},
336 { 5, false, FLOOR_P, "floor"},
337 { 6, false, DOUBLE_P, "double"},
338 { 6, false, EXISTS_P, "exists"},
339 { 6, false, STARTS_P, "starts"},
340 { 6, false, STRICT_P, "strict"},
341 { 7, false, CEILING_P, "ceiling"},
342 { 7, false, UNKNOWN_P, "unknown"},
343 { 8, false, KEYVALUE_P, "keyvalue"},
344 { 10,false, LIKE_REGEX_P, "like_regex"},
347 /* Check if current scanstring value is a keyword */
348 static enum yytokentype
353 const JsonPathKeyword *StopLow = keywords,
354 *StopHigh = keywords + lengthof(keywords),
357 if (scanstring.len > keywords[lengthof(keywords) - 1].len)
360 while (StopLow < StopHigh)
362 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
364 if (StopMiddle->len == scanstring.len)
365 diff = pg_strncasecmp(StopMiddle->keyword, scanstring.val,
368 diff = StopMiddle->len - scanstring.len;
371 StopLow = StopMiddle + 1;
373 StopHigh = StopMiddle;
376 if (StopMiddle->lowercase)
377 diff = strncmp(StopMiddle->keyword, scanstring.val,
381 res = StopMiddle->val;
391 * Called before any actual parsing is done
394 jsonpath_scanner_init(const char *str, int slen)
400 * Might be left over after ereport()
405 * Make a scan buffer with special termination needed by flex.
409 scanbuf = palloc(slen + 2);
410 memcpy(scanbuf, str, slen);
411 scanbuf[slen] = scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
412 scanbufhandle = yy_scan_buffer(scanbuf, slen + 2);
419 * Called after parsing is done to clean up after jsonpath_scanner_init()
422 jsonpath_scanner_finish(void)
424 yy_delete_buffer(scanbufhandle);
429 * Resize scanstring so that it can append string of given length.
430 * Reinitialize if required.
433 resizeString(bool init, int appendLen)
437 scanstring.total = Max(32, appendLen);
438 scanstring.val = (char *) palloc(scanstring.total);
443 if (scanstring.len + appendLen >= scanstring.total)
445 while (scanstring.len + appendLen >= scanstring.total)
446 scanstring.total *= 2;
447 scanstring.val = repalloc(scanstring.val, scanstring.total);
452 /* Add set of bytes at "s" of length "l" to scanstring */
454 addstring(bool init, char *s, int l)
456 resizeString(init, l + 1);
457 memcpy(scanstring.val + scanstring.len, s, l);
461 /* Add single byte "c" to scanstring */
463 addchar(bool init, char c)
465 resizeString(init, 1);
466 scanstring.val[scanstring.len] = c;
471 /* Interface to jsonpath parser */
472 JsonPathParseResult *
473 parsejsonpath(const char *str, int len)
475 JsonPathParseResult *parseresult;
477 jsonpath_scanner_init(str, len);
479 if (jsonpath_yyparse((void *) &parseresult) != 0)
480 jsonpath_yyerror(NULL, "bogus input"); /* shouldn't happen */
482 jsonpath_scanner_finish();
487 /* Turn hex character into integer */
491 if (c >= '0' && c <= '9')
493 if (c >= 'a' && c <= 'f')
494 return c - 'a' + 0xA;
495 if (c >= 'A' && c <= 'F')
496 return c - 'A' + 0xA;
497 jsonpath_yyerror(NULL, "invalid hexadecimal digit");
498 return 0; /* not reached */
501 /* Add given unicode character to scanstring */
503 addUnicodeChar(int ch)
506 * For UTF8, replace the escape sequence by the actual
507 * utf8 character in scanstring. Do this also for other
508 * encodings if the escape designates an ASCII character,
509 * otherwise raise an error.
514 /* We can't allow this, since our TEXT type doesn't */
516 (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
517 errmsg("unsupported Unicode escape sequence"),
518 errdetail("\\u0000 cannot be converted to text.")));
520 else if (GetDatabaseEncoding() == PG_UTF8)
525 unicode_to_utf8(ch, (unsigned char *) utf8str);
526 utf8len = pg_utf_mblen((unsigned char *) utf8str);
527 addstring(false, utf8str, utf8len);
529 else if (ch <= 0x007f)
532 * This is the only way to designate things like a
533 * form feed character in JSON, so it's useful in all
536 addchar(false, (char) ch);
541 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
542 errmsg("invalid input syntax for type %s", "jsonpath"),
543 errdetail("Unicode escape values cannot be used for code "
544 "point values above 007F when the server encoding "
549 /* Add unicode character and process its hi surrogate */
551 addUnicode(int ch, int *hi_surrogate)
553 if (ch >= 0xd800 && ch <= 0xdbff)
555 if (*hi_surrogate != -1)
557 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
558 errmsg("invalid input syntax for type %s", "jsonpath"),
559 errdetail("Unicode high surrogate must not follow "
560 "a high surrogate.")));
561 *hi_surrogate = (ch & 0x3ff) << 10;
564 else if (ch >= 0xdc00 && ch <= 0xdfff)
566 if (*hi_surrogate == -1)
568 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
569 errmsg("invalid input syntax for type %s", "jsonpath"),
570 errdetail("Unicode low surrogate must follow a high "
572 ch = 0x10000 + *hi_surrogate + (ch & 0x3ff);
575 else if (*hi_surrogate != -1)
578 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
579 errmsg("invalid input syntax for type %s", "jsonpath"),
580 errdetail("Unicode low surrogate must follow a high "
588 * parseUnicode was adapted from json_lex_string() in
589 * src/backend/utils/adt/json.c
592 parseUnicode(char *s, int l)
595 int hi_surrogate = -1;
597 for (i = 2; i < l; i += 2) /* skip '\u' */
602 if (s[i] == '{') /* parse '\u{XX...}' */
604 while (s[++i] != '}' && i < l)
605 ch = (ch << 4) | hexval(s[i]);
608 else /* parse '\uXXXX' */
610 for (j = 0; j < 4 && i < l; j++)
611 ch = (ch << 4) | hexval(s[i++]);
614 addUnicode(ch, &hi_surrogate);
617 if (hi_surrogate != -1)
620 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
621 errmsg("invalid input syntax for type %s", "jsonpath"),
622 errdetail("Unicode low surrogate must follow a high "
627 /* Parse a single \xHH hex escape sequence */
629 parseHexChar(char *s)
631 int ch = (hexval(s[2]) << 4) |
638 * Interface functions to make flex use palloc() instead of malloc().
639 * It'd be better to make these static, but flex insists otherwise.
643 jsonpath_yyalloc(yy_size_t bytes)
645 return palloc(bytes);
649 jsonpath_yyrealloc(void *ptr, yy_size_t bytes)
652 return repalloc(ptr, bytes);
654 return palloc(bytes);
658 jsonpath_yyfree(void *ptr)