%{
/*-------------------------------------------------------------------------
 *
 * jsonpath_scan.l
 *	Lexical parser for jsonpath datatype
 *
 * Splits jsonpath string into tokens represented as JsonPathString structs.
 * Decodes unicode and hex escaped strings.
 *
 * Copyright (c) 2019, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	src/backend/utils/adt/jsonpath_scan.l
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "mb/pg_wchar.h"
#include "nodes/pg_list.h"

/* Text of the token currently being assembled by addstring()/addchar() */
static JsonPathString scanstring;

/* Handles to the buffer that the lexer uses internally */
static YY_BUFFER_STATE scanbufhandle;
static char *scanbuf;
static int	scanbuflen;

static void addstring(bool init, char *s, int l);
static void addchar(bool init, char c);
static enum yytokentype checkKeyword(void);
static void parseUnicode(char *s, int l);
static void parseHexChar(char *s);

/* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
#undef fprintf
#define fprintf(file, fmt, msg)  fprintf_to_ereport(fmt, msg)

/*
 * Replacement for the scanner's fprintf() calls (see the macro above).
 *
 * "fmt" is ignored; "msg" is reported through ereport(ERROR) as an
 * untranslated internal message.
 */
static void
fprintf_to_ereport(const char *fmt, const char *msg)
{
	ereport(ERROR, (errmsg_internal("%s", msg)));
}

%}

%option 8bit
%option never-interactive
%option nodefault
%option noinput
%option nounput
%option noyywrap
%option warn
%option prefix="jsonpath_yy"
%option bison-bridge
%option noyyalloc
%option noyyrealloc
%option noyyfree

/*
 * We use exclusive states for quoted and non-quoted strings,
 * quoted variable names and C-style comments.
 * Exclusive states:
 *  <xq> - quoted strings
 *  <xnq> - non-quoted strings
 *  <xvq> - quoted variable names
 *  <xc> - C-style comment
 */

%x xq
%x xnq
%x xvq
%x xc

special		[\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/]
blank		[ \t\n\r\f]
/* "other" means anything that's not special, blank, or '\' or '"' */
other		[^\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/\\\" \t\n\r\f]

digit		[0-9]
integer		(0|[1-9]{digit}*)
decimal		{integer}\.{digit}+
decimalfail	{integer}\.
real		({integer}|{decimal})[Ee][-+]?{digit}+
realfail1	({integer}|{decimal})[Ee]
realfail2	({integer}|{decimal})[Ee][-+]

hex_dig		[0-9A-Fa-f]
unicode		\\u({hex_dig}{4}|\{{hex_dig}{1,6}\})
unicodefail	\\u({hex_dig}{0,3}|\{{hex_dig}{0,6})
hex_char	\\x{hex_dig}{2}
hex_fail	\\x{hex_dig}{0,1}

%%

<xnq>{other}+					{
									/* non-quoted string: keep accumulating */
									addstring(false, yytext, yyleng);
								}

<xnq>{blank}+					{
									/* whitespace ends a non-quoted string */
									yylval->str = scanstring;
									BEGIN INITIAL;
									return checkKeyword();
								}

<xnq>\/\*						{
									/*
									 * NOTE(review): no token is returned here,
									 * so the pending non-quoted string appears
									 * to be dropped once the comment ends --
									 * confirm that this is intended.
									 */
									yylval->str = scanstring;
									BEGIN xc;
								}

<xnq>({special}|\")				{
									/* push the delimiter back for INITIAL */
									yylval->str = scanstring;
									yyless(0);
									BEGIN INITIAL;
									return checkKeyword();
								}

<xnq><<EOF>>					{
									yylval->str = scanstring;
									BEGIN INITIAL;
									return checkKeyword();
								}

<xnq,xq,xvq>\\b					{ addchar(false, '\b'); }

<xnq,xq,xvq>\\f					{ addchar(false, '\f'); }

<xnq,xq,xvq>\\n					{ addchar(false, '\n'); }

<xnq,xq,xvq>\\r					{ addchar(false, '\r'); }

<xnq,xq,xvq>\\t					{ addchar(false, '\t'); }

<xnq,xq,xvq>\\v					{ addchar(false, '\v'); }

<xnq,xq,xvq>{unicode}+			{ parseUnicode(yytext, yyleng); }

<xnq,xq,xvq>{hex_char}			{ parseHexChar(yytext); }

<xnq,xq,xvq>{unicode}*{unicodefail}	{ yyerror(NULL, "invalid unicode sequence"); }

<xnq,xq,xvq>{hex_fail}			{ yyerror(NULL, "invalid hex character sequence"); }

<xnq,xq,xvq>{unicode}+\\		{
									/* throw back the \\, and treat as unicode */
									yyless(yyleng - 1);
									parseUnicode(yytext, yyleng);
								}

<xnq,xq,xvq>\\.					{ addchar(false, yytext[1]); }

<xnq,xq,xvq>\\					{ yyerror(NULL, "unexpected end after backslash"); }

<xq,xvq><<EOF>>					{ yyerror(NULL, "unexpected end of quoted string"); }

<xq>\"							{
									yylval->str = scanstring;
									BEGIN INITIAL;
									return STRING_P;
								}

<xvq>\"							{
									yylval->str = scanstring;
									BEGIN INITIAL;
									return VARIABLE_P;
								}

<xq,xvq>[^\\\"]+				{ addstring(false, yytext, yyleng); }

<xc>\*\/						{ BEGIN INITIAL; }

<xc>[^\*]+						{ }

<xc>\*							{ }

<xc><<EOF>>						{ yyerror(NULL, "unexpected end of comment"); }

\&\&							{ return AND_P; }

\|\|							{ return OR_P; }

\!								{ return NOT_P; }

\*\*							{ return ANY_P; }

\<								{ return LESS_P; }

\<\=							{ return LESSEQUAL_P; }

\=\=							{ return EQUAL_P; }

\<\>							{ return NOTEQUAL_P; }

\!\=							{ return NOTEQUAL_P; }

\>\=							{ return GREATEREQUAL_P; }

\>								{ return GREATER_P; }

\${other}+						{
									/* unquoted variable name, '$' stripped */
									addstring(true, yytext + 1, yyleng - 1);
									addchar(false, '\0');
									yylval->str = scanstring;
									return VARIABLE_P;
								}

\$\"							{
									addchar(true, '\0');
									BEGIN xvq;
								}

{special}						{ return *yytext; }

{blank}+						{ /* ignore */ }

\/\*							{
									addchar(true, '\0');
									BEGIN xc;
								}

{real}							{
									addstring(true, yytext, yyleng);
									addchar(false, '\0');
									yylval->str = scanstring;
									return NUMERIC_P;
								}

{decimal}						{
									addstring(true, yytext, yyleng);
									addchar(false, '\0');
									yylval->str = scanstring;
									return NUMERIC_P;
								}

{integer}						{
									addstring(true, yytext, yyleng);
									addchar(false, '\0');
									yylval->str = scanstring;
									return INT_P;
								}

{decimalfail}					{
									/* throw back the ., and treat as integer */
									yyless(yyleng - 1);
									addstring(true, yytext, yyleng);
									addchar(false, '\0');
									yylval->str = scanstring;
									return INT_P;
								}

({realfail1}|{realfail2})		{ yyerror(NULL, "invalid floating point number"); }

\"								{
									addchar(true, '\0');
									BEGIN xq;
								}

\\								{
									/* re-scan the backslash as an escape in <xnq> */
									yyless(0);
									addchar(true, '\0');
									BEGIN xnq;
								}

{other}+						{
									addstring(true, yytext, yyleng);
									BEGIN xnq;
								}

<<EOF>>							{ yyterminate(); }

%%

/*
 * Report a scanner or parser error; does not return normally, since both
 * paths raise ereport(ERROR).
 *
 * "result" is unused here.  "message" is passed through _() before being
 * embedded in the error text.
 */
void
jsonpath_yyerror(JsonPathParseResult **result, const char *message)
{
	if (*yytext == YY_END_OF_BUFFER_CHAR)
	{
		ereport(ERROR,
				(errcode(ERRCODE_SYNTAX_ERROR),
				 /* translator: %s is typically "syntax error" */
				 errmsg("%s at end of jsonpath input", _(message))));
	}
	else
	{
		ereport(ERROR,
				(errcode(ERRCODE_SYNTAX_ERROR),
				 /* translator: first %s is typically "syntax error" */
				 errmsg("%s at or near \"%s\" of jsonpath input",
						_(message), yytext)));
	}
}

typedef struct JsonPathKeyword
{
	int16		len;			/* length of "keyword" in bytes */
	bool		lowercase;		/* must match with exact (lower) case? */
	int			val;			/* token code returned on a match */
	const char *keyword;
} JsonPathKeyword;

/*
 * Array of key words should be sorted by length and then
 * alphabetical order; checkKeyword() binary-searches it with that ordering.
 */
static const JsonPathKeyword keywords[] = {
	{ 2, false,	IS_P,		"is"},
	{ 2, false,	TO_P,		"to"},
	{ 3, false,	ABS_P,		"abs"},
	{ 3, false,	LAX_P,		"lax"},
	{ 4, false,	FLAG_P,		"flag"},
	{ 4, false,	LAST_P,		"last"},
	{ 4, true,	NULL_P,		"null"},
	{ 4, false,	SIZE_P,		"size"},
	{ 4, true,	TRUE_P,		"true"},
	{ 4, false,	TYPE_P,		"type"},
	{ 4, false,	WITH_P,		"with"},
	{ 5, true,	FALSE_P,	"false"},
	{ 5, false,	FLOOR_P,	"floor"},
	{ 6, false,	DOUBLE_P,	"double"},
	{ 6, false,	EXISTS_P,	"exists"},
	{ 6, false,	STARTS_P,	"starts"},
	{ 6, false,	STRICT_P,	"strict"},
	{ 7, false,	CEILING_P,	"ceiling"},
	{ 7, false,	UNKNOWN_P,	"unknown"},
	{ 8, false,	DATETIME_P,	"datetime"},
	{ 8, false,	KEYVALUE_P,	"keyvalue"},
	{ 10,false, LIKE_REGEX_P, "like_regex"},
};

/*
 * Check if current scanstring value is a keyword.
 *
 * Returns the keyword's token code, or IDENT_P when scanstring matches no
 * entry.  Matching is case-insensitive, except that entries flagged
 * "lowercase" must also match byte-for-byte.
 */
static enum yytokentype
checkKeyword(void)
{
	int			res = IDENT_P;
	int			diff;
	const JsonPathKeyword *StopLow = keywords,
			   *StopHigh = keywords + lengthof(keywords),
			   *StopMiddle;

	/* Longer than the longest keyword: cannot match anything */
	if (scanstring.len > keywords[lengthof(keywords) - 1].len)
		return res;

	while (StopLow < StopHigh)
	{
		StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);

		/* Order by length first, then by case-insensitive content */
		if (StopMiddle->len == scanstring.len)
			diff = pg_strncasecmp(StopMiddle->keyword, scanstring.val,
								  scanstring.len);
		else
			diff = StopMiddle->len - scanstring.len;

		if (diff < 0)
			StopLow = StopMiddle + 1;
		else if (diff > 0)
			StopHigh = StopMiddle;
		else
		{
			if (StopMiddle->lowercase)
				diff = strncmp(StopMiddle->keyword, scanstring.val,
							   scanstring.len);

			if (diff == 0)
				res = StopMiddle->val;

			break;
		}
	}

	return res;
}

/*
 * Called before any actual parsing is done.
 *
 * "str" is the input text; when "slen" is not positive, strlen(str) is used
 * as its length.
 */
static void
jsonpath_scanner_init(const char *str, int slen)
{
	if (slen <= 0)
		slen = strlen(str);

	/*
	 * Might be left over after ereport()
	 */
	yy_init_globals();

	/*
	 * Make a scan buffer with special termination needed by flex.
	 */
	scanbuflen = slen;
	scanbuf = palloc(slen + 2);
	memcpy(scanbuf, str, slen);
	scanbuf[slen] = scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
	scanbufhandle = yy_scan_buffer(scanbuf, slen + 2);

	BEGIN(INITIAL);
}

/*
 * Called after parsing is done to clean up after jsonpath_scanner_init()
 */
static void
jsonpath_scanner_finish(void)
{
	yy_delete_buffer(scanbufhandle);
	pfree(scanbuf);
}

/*
 * Resize scanstring so that it can append string of given length.
 * Reinitialize if required.
 *
 * With "init" set, a fresh buffer of at least 32 bytes is allocated and the
 * length is reset to zero.  Otherwise the capacity is doubled until
 * len + appendLen is strictly below it.
 */
static void
resizeString(bool init, int appendLen)
{
	if (init)
	{
		scanstring.total = Max(32, appendLen);
		scanstring.val = (char *) palloc(scanstring.total);
		scanstring.len = 0;
	}
	else
	{
		if (scanstring.len + appendLen >= scanstring.total)
		{
			while (scanstring.len + appendLen >= scanstring.total)
				scanstring.total *= 2;
			scanstring.val = repalloc(scanstring.val, scanstring.total);
		}
	}
}

/*
 * Add set of bytes at "s" of length "l" to scanstring.
 *
 * Room for one extra byte is reserved, so a following addchar(false, '\0')
 * does not need to grow the buffer.
 */
static void
addstring(bool init, char *s, int l)
{
	resizeString(init, l + 1);
	memcpy(scanstring.val + scanstring.len, s, l);
	scanstring.len += l;
}

/*
 * Add single byte "c" to scanstring.
 *
 * A '\0' byte is stored but not counted in scanstring.len, so it acts as a
 * terminator that later appends overwrite.
 */
static void
addchar(bool init, char c)
{
	resizeString(init, 1);
	scanstring.val[scanstring.len] = c;
	if (c != '\0')
		scanstring.len++;
}

/*
 * Interface to jsonpath parser.
 *
 * Scans and parses "str" ("len" <= 0 means use strlen(str)) and returns the
 * result filled in by jsonpath_yyparse().
 */
JsonPathParseResult *
parsejsonpath(const char *str, int len)
{
	JsonPathParseResult	*parseresult;

	jsonpath_scanner_init(str, len);

	if (jsonpath_yyparse((void *) &parseresult) != 0)
		jsonpath_yyerror(NULL, "bogus input"); /* shouldn't happen */

	jsonpath_scanner_finish();

	return parseresult;
}

/* Turn hex character into integer; raises an error for a non-hex digit */
static int
hexval(char c)
{
	if (c >= '0' && c <= '9')
		return c - '0';
	if (c >= 'a' && c <= 'f')
		return c - 'a' + 0xA;
	if (c >= 'A' && c <= 'F')
		return c - 'A' + 0xA;
	jsonpath_yyerror(NULL, "invalid hexadecimal digit");
	return 0; /* not reached */
}

/* Add given unicode character to scanstring */
static void
addUnicodeChar(int ch)
{
	/*
	 * For UTF8, replace the escape sequence by the actual
	 * utf8 character in scanstring. Do this also for other
	 * encodings if the escape designates an ASCII character,
	 * otherwise raise an error.
	 */

	if (ch == 0)
	{
		/* We can't allow this, since our TEXT type doesn't */
		ereport(ERROR,
				(errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
				 errmsg("unsupported Unicode escape sequence"),
				  errdetail("\\u0000 cannot be converted to text.")));
	}
	else if (ch > 0x10FFFF)
	{
		/*
		 * The \u{...} form admits up to six hex digits, which can exceed
		 * the largest Unicode code point.  Reject such values rather than
		 * encoding something that is not a valid character.
		 */
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
				 errmsg("invalid input syntax for type %s", "jsonpath"),
				 errdetail("Unicode escape values cannot exceed "
						   "code point 10FFFF.")));
	}
	else if (GetDatabaseEncoding() == PG_UTF8)
	{
		char		utf8str[5];
		int			utf8len;

		unicode_to_utf8(ch, (unsigned char *) utf8str);
		utf8len = pg_utf_mblen((unsigned char *) utf8str);
		addstring(false, utf8str, utf8len);
	}
	else if (ch <= 0x007f)
	{
		/*
		 * This is the only way to designate things like a
		 * form feed character in JSON, so it's useful in all
		 * encodings.
		 */
		addchar(false, (char) ch);
	}
	else
	{
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
				 errmsg("invalid input syntax for type %s", "jsonpath"),
				 errdetail("Unicode escape values cannot be used for code "
						   "point values above 007F when the server encoding "
						   "is not UTF8.")));
	}
}

/*
 * Add unicode character and process its hi surrogate.
 *
 * "*hi_surrogate" is -1 when no high surrogate is pending; otherwise it
 * holds the high surrogate's contribution to the combined code point.
 */
static void
addUnicode(int ch, int *hi_surrogate)
{
	if (ch >= 0xd800 && ch <= 0xdbff)
	{
		if (*hi_surrogate != -1)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
					 errmsg("invalid input syntax for type %s", "jsonpath"),
					 errdetail("Unicode high surrogate must not follow "
							   "a high surrogate.")));
		/* remember it and wait for the low surrogate */
		*hi_surrogate = (ch & 0x3ff) << 10;
		return;
	}
	else if (ch >= 0xdc00 && ch <= 0xdfff)
	{
		if (*hi_surrogate == -1)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
					 errmsg("invalid input syntax for type %s", "jsonpath"),
					 errdetail("Unicode low surrogate must follow a high "
							   "surrogate.")));
		/* combine the pair into a single code point */
		ch = 0x10000 + *hi_surrogate + (ch & 0x3ff);
		*hi_surrogate = -1;
	}
	else if (*hi_surrogate != -1)
	{
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
				 errmsg("invalid input syntax for type %s", "jsonpath"),
				 errdetail("Unicode low surrogate must follow a high "
						   "surrogate.")));
	}

	addUnicodeChar(ch);
}

/*
 * parseUnicode was adopted from json_lex_string() in
 * src/backend/utils/adt/json.c
 *
 * "s" points at a run of "\uXXXX" and/or "\u{X...}" escapes that is "l"
 * bytes long; each decoded character is appended to scanstring.
 */
static void
parseUnicode(char *s, int l)
{
	int			i;
	int			hi_surrogate = -1;

	for (i = 2; i < l; i += 2)	/* skip '\u' */
	{
		int			ch = 0;
		int			j;

		if (s[i] == '{')		/* parse '\u{XX...}' */
		{
			/* check the bound before looking at s[i] */
			for (i++; i < l && s[i] != '}'; i++)
				ch = (ch << 4) | hexval(s[i]);
			i++;				/* skip '}' */
		}
		else					/* parse '\uXXXX' */
		{
			for (j = 0; j < 4 && i < l; j++)
				ch = (ch << 4) | hexval(s[i++]);
		}

		addUnicode(ch, &hi_surrogate);
	}

	/* a high surrogate at the very end has nothing to pair with */
	if (hi_surrogate != -1)
	{
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
				 errmsg("invalid input syntax for type %s", "jsonpath"),
				 errdetail("Unicode low surrogate must follow a high "
						   "surrogate.")));
	}
}

/*
 * Parse a single hex-encoded character of the form "\xHH" and append it
 * to scanstring.
 */
static void
parseHexChar(char *s)
{
	int			ch = (hexval(s[2]) << 4) |
					  hexval(s[3]);

	addUnicodeChar(ch);
}

/*
 * Interface functions to make flex use palloc() instead of malloc().
 * It'd be better to make these static, but flex insists otherwise.
 */

void *
jsonpath_yyalloc(yy_size_t bytes)
{
	return palloc(bytes);
}

void *
jsonpath_yyrealloc(void *ptr, yy_size_t bytes)
{
	if (ptr)
		return repalloc(ptr, bytes);
	else
		return palloc(bytes);
}

void
jsonpath_yyfree(void *ptr)
{
	/* keep the guard: pfree() is not assumed to tolerate NULL */
	if (ptr)
		pfree(ptr);
}