/*------------------------------------------------------------------------- * * json.c * JSON data type support. * * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION * src/backend/utils/adt/json.c * *------------------------------------------------------------------------- */ #include "postgres.h" #include "lib/stringinfo.h" #include "libpq/pqformat.h" #include "mb/pg_wchar.h" #include "utils/builtins.h" #include "utils/json.h" typedef enum { JSON_VALUE_INVALID, JSON_VALUE_STRING, JSON_VALUE_NUMBER, JSON_VALUE_OBJECT, JSON_VALUE_ARRAY, JSON_VALUE_TRUE, JSON_VALUE_FALSE, JSON_VALUE_NULL } JsonValueType; typedef struct { char *input; char *token_start; char *token_terminator; JsonValueType token_type; int line_number; char *line_start; } JsonLexContext; typedef enum { JSON_PARSE_VALUE, /* expecting a value */ JSON_PARSE_ARRAY_START, /* saw '[', expecting value or ']' */ JSON_PARSE_ARRAY_NEXT, /* saw array element, expecting ',' or ']' */ JSON_PARSE_OBJECT_START, /* saw '{', expecting label or '}' */ JSON_PARSE_OBJECT_LABEL, /* saw object label, expecting ':' */ JSON_PARSE_OBJECT_NEXT, /* saw object value, expecting ',' or '}' */ JSON_PARSE_OBJECT_COMMA /* saw object ',', expecting next label */ } JsonParseState; typedef struct JsonParseStack { JsonParseState state; } JsonParseStack; typedef enum { JSON_STACKOP_NONE, JSON_STACKOP_PUSH, JSON_STACKOP_PUSH_WITH_PUSHBACK, JSON_STACKOP_POP } JsonStackOp; static void json_validate_cstring(char *input); static void json_lex(JsonLexContext *lex); static void json_lex_string(JsonLexContext *lex); static void json_lex_number(JsonLexContext *lex, char *s); static void report_parse_error(JsonParseStack *stack, JsonLexContext *lex); static void report_invalid_token(JsonLexContext *lex); static char *extract_mb_char(char *s); extern Datum json_in(PG_FUNCTION_ARGS); /* * Input. */ Datum json_in(PG_FUNCTION_ARGS) { char *text = PG_GETARG_CSTRING(0); json_validate_cstring(text); PG_RETURN_TEXT_P(cstring_to_text(text)); } /* * Output. */ Datum json_out(PG_FUNCTION_ARGS) { Datum txt = PG_GETARG_DATUM(0); PG_RETURN_CSTRING(TextDatumGetCString(txt)); } /* * Binary send. */ Datum json_send(PG_FUNCTION_ARGS) { StringInfoData buf; text *t = PG_GETARG_TEXT_PP(0); pq_begintypsend(&buf); pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t)); PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); } /* * Binary receive. */ Datum json_recv(PG_FUNCTION_ARGS) { StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); text *result; char *str; int nbytes; str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes); /* * We need a null-terminated string to pass to json_validate_cstring(). * Rather than make a separate copy, make the temporary result one byte * bigger than it needs to be. */ result = palloc(nbytes + 1 + VARHDRSZ); SET_VARSIZE(result, nbytes + VARHDRSZ); memcpy(VARDATA(result), str, nbytes); str = VARDATA(result); str[nbytes] = '\0'; /* Validate it. */ json_validate_cstring(str); PG_RETURN_TEXT_P(result); } /* * Check whether supplied input is valid JSON. */ static void json_validate_cstring(char *input) { JsonLexContext lex; JsonParseStack *stack, *stacktop; int stacksize; /* Set up lexing context. */ lex.input = input; lex.token_terminator = lex.input; lex.line_number = 1; lex.line_start = input; /* Set up parse stack. */ stacksize = 32; stacktop = palloc(sizeof(JsonParseStack) * stacksize); stack = stacktop; stack->state = JSON_PARSE_VALUE; /* Main parsing loop. */ for (;;) { JsonStackOp op; /* Fetch next token. */ json_lex(&lex); /* Check for unexpected end of input. */ if (lex.token_start == NULL) report_parse_error(stack, &lex); redo: /* Figure out what to do with this token. */ op = JSON_STACKOP_NONE; switch (stack->state) { case JSON_PARSE_VALUE: if (lex.token_type != JSON_VALUE_INVALID) op = JSON_STACKOP_POP; else if (lex.token_start[0] == '[') stack->state = JSON_PARSE_ARRAY_START; else if (lex.token_start[0] == '{') stack->state = JSON_PARSE_OBJECT_START; else report_parse_error(stack, &lex); break; case JSON_PARSE_ARRAY_START: if (lex.token_type != JSON_VALUE_INVALID) stack->state = JSON_PARSE_ARRAY_NEXT; else if (lex.token_start[0] == ']') op = JSON_STACKOP_POP; else if (lex.token_start[0] == '[' || lex.token_start[0] == '{') { stack->state = JSON_PARSE_ARRAY_NEXT; op = JSON_STACKOP_PUSH_WITH_PUSHBACK; } else report_parse_error(stack, &lex); break; case JSON_PARSE_ARRAY_NEXT: if (lex.token_type != JSON_VALUE_INVALID) report_parse_error(stack, &lex); else if (lex.token_start[0] == ']') op = JSON_STACKOP_POP; else if (lex.token_start[0] == ',') op = JSON_STACKOP_PUSH; else report_parse_error(stack, &lex); break; case JSON_PARSE_OBJECT_START: if (lex.token_type == JSON_VALUE_STRING) stack->state = JSON_PARSE_OBJECT_LABEL; else if (lex.token_type == JSON_VALUE_INVALID && lex.token_start[0] == '}') op = JSON_STACKOP_POP; else report_parse_error(stack, &lex); break; case JSON_PARSE_OBJECT_LABEL: if (lex.token_type == JSON_VALUE_INVALID && lex.token_start[0] == ':') { stack->state = JSON_PARSE_OBJECT_NEXT; op = JSON_STACKOP_PUSH; } else report_parse_error(stack, &lex); break; case JSON_PARSE_OBJECT_NEXT: if (lex.token_type != JSON_VALUE_INVALID) report_parse_error(stack, &lex); else if (lex.token_start[0] == '}') op = JSON_STACKOP_POP; else if (lex.token_start[0] == ',') stack->state = JSON_PARSE_OBJECT_COMMA; else report_parse_error(stack, &lex); break; case JSON_PARSE_OBJECT_COMMA: if (lex.token_type == JSON_VALUE_STRING) stack->state = JSON_PARSE_OBJECT_LABEL; else report_parse_error(stack, &lex); break; default: elog(ERROR, "unexpected json parse state: %d", (int) stack->state); } /* Push or pop the stack, if needed. */ switch (op) { case JSON_STACKOP_PUSH: case JSON_STACKOP_PUSH_WITH_PUSHBACK: ++stack; if (stack >= &stacktop[stacksize]) { int stackoffset = stack - stacktop; stacksize = stacksize + 32; stacktop = repalloc(stacktop, sizeof(JsonParseStack) * stacksize); stack = stacktop + stackoffset; } stack->state = JSON_PARSE_VALUE; if (op == JSON_STACKOP_PUSH_WITH_PUSHBACK) goto redo; break; case JSON_STACKOP_POP: if (stack == stacktop) { /* Expect end of input. */ json_lex(&lex); if (lex.token_start != NULL) report_parse_error(NULL, &lex); return; } --stack; break; case JSON_STACKOP_NONE: /* nothing to do */ break; } } } /* * Lex one token from the input stream. */ static void json_lex(JsonLexContext *lex) { char *s; /* Skip leading whitespace. */ s = lex->token_terminator; while (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r') { if (*s == '\n') ++lex->line_number; ++s; } lex->token_start = s; /* Determine token type. */ if (strchr("{}[],:", s[0])) { /* strchr() doesn't return false on a NUL input. */ if (s[0] == '\0') { /* End of string. */ lex->token_start = NULL; lex->token_terminator = NULL; } else { /* Single-character token, some kind of punctuation mark. */ lex->token_terminator = s + 1; } lex->token_type = JSON_VALUE_INVALID; } else if (*s == '"') { /* String. */ json_lex_string(lex); lex->token_type = JSON_VALUE_STRING; } else if (*s == '-') { /* Negative number. */ json_lex_number(lex, s + 1); lex->token_type = JSON_VALUE_NUMBER; } else if (*s >= '0' && *s <= '9') { /* Positive number. */ json_lex_number(lex, s); lex->token_type = JSON_VALUE_NUMBER; } else { char *p; /* * We're not dealing with a string, number, legal punctuation mark, * or end of string. The only legal tokens we might find here are * true, false, and null, but for error reporting purposes we scan * until we see a non-alphanumeric character. That way, we can report * the whole word as an unexpected token, rather than just some * unintuitive prefix thereof. */ for (p = s; (*p >= 'a' && *p <= 'z') || (*p >= 'A' && *p <= 'Z') || (*p >= '0' && *p <= '9') || *p == '_' || IS_HIGHBIT_SET(*p); ++p) ; /* * We got some sort of unexpected punctuation or an otherwise * unexpected character, so just complain about that one character. */ if (p == s) { lex->token_terminator = s + 1; report_invalid_token(lex); } /* * We've got a real alphanumeric token here. If it happens to be * true, false, or null, all is well. If not, error out. */ lex->token_terminator = p; if (p - s == 4) { if (memcmp(s, "true", 4) == 0) lex->token_type = JSON_VALUE_TRUE; else if (memcmp(s, "null", 4) == 0) lex->token_type = JSON_VALUE_NULL; else report_invalid_token(lex); } else if (p - s == 5 && memcmp(s, "false", 5) == 0) lex->token_type = JSON_VALUE_FALSE; else report_invalid_token(lex); } } /* * The next token in the input stream is known to be a string; lex it. */ static void json_lex_string(JsonLexContext *lex) { char *s = lex->token_start + 1; for (s = lex->token_start + 1; *s != '"'; ++s) { /* Per RFC4627, these characters MUST be escaped. */ if (*s < 32) { /* A NUL byte marks the (premature) end of the string. */ if (*s == '\0') { lex->token_terminator = s; report_invalid_token(lex); } ereport(ERROR, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("invalid input syntax for type json"), errdetail_internal("line %d: Character \"%c\" must be escaped.", lex->line_number, *s))); } else if (*s == '\\') { /* OK, we have an escape character. */ ++s; if (*s == '\0') { lex->token_terminator = s; report_invalid_token(lex); } else if (*s == 'u') { int i; int ch = 0; for (i = 1; i <= 4; ++i) { if (s[i] == '\0') { lex->token_terminator = s + i; report_invalid_token(lex); } else if (s[i] >= '0' && s[i] <= '9') ch = (ch * 16) + (s[i] - '0'); else if (s[i] >= 'a' && s[i] <= 'f') ch = (ch * 16) + (s[i] - 'a') + 10; else if (s[i] >= 'A' && s[i] <= 'F') ch = (ch * 16) + (s[i] - 'A') + 10; else { ereport(ERROR, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("invalid input syntax for type json"), errdetail_internal("line %d: \"\\u\" must be followed by four hexadecimal digits.", lex->line_number))); } } /* Account for the four additional bytes we just parsed. */ s += 4; } else if (!strchr("\"\\/bfnrt", *s)) { /* Error out. */ ereport(ERROR, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("invalid input syntax for type json"), errdetail_internal("line %d: Invalid escape \"\\%s\".", lex->line_number, extract_mb_char(s)))); } } } /* Hooray, we found the end of the string! */ lex->token_terminator = s + 1; } /*------------------------------------------------------------------------- * The next token in the input stream is known to be a number; lex it. * * In JSON, a number consists of four parts: * * (1) An optional minus sign ('-'). * * (2) Either a single '0', or a string of one or more digits that does not * begin with a '0'. * * (3) An optional decimal part, consisting of a period ('.') followed by * one or more digits. (Note: While this part can be omitted * completely, it's not OK to have only the decimal point without * any digits afterwards.) * * (4) An optional exponent part, consisting of 'e' or 'E', optionally * followed by '+' or '-', followed by one or more digits. (Note: * As with the decimal part, if 'e' or 'E' is present, it must be * followed by at least one digit.) * * The 's' argument to this function points to the ostensible beginning * of part 2 - i.e. the character after any optional minus sign, and the * first character of the string if there is none. * *------------------------------------------------------------------------- */ static void json_lex_number(JsonLexContext *lex, char *s) { bool error = false; char *p; /* Part (1): leading sign indicator. */ /* Caller already did this for us; so do nothing. */ /* Part (2): parse main digit string. */ if (*s == '0') ++s; else if (*s >= '1' && *s <= '9') { do { ++s; } while (*s >= '0' && *s <= '9'); } else error = true; /* Part (3): parse optional decimal portion. */ if (*s == '.') { ++s; if (*s < '0' && *s > '9') error = true; else { do { ++s; } while (*s >= '0' && *s <= '9'); } } /* Part (4): parse optional exponent. */ if (*s == 'e' || *s == 'E') { ++s; if (*s == '+' || *s == '-') ++s; if (*s < '0' && *s > '9') error = true; else { do { ++s; } while (*s >= '0' && *s <= '9'); } } /* Check for trailing garbage. */ for (p = s; (*p >= 'a' && *p <= 'z') || (*p >= 'A' && *p <= 'Z') || (*p >= '0' && *p <= '9') || *p == '_' || IS_HIGHBIT_SET(*p); ++p) ; lex->token_terminator = p; if (p > s || error) report_invalid_token(lex); } /* * Report a parse error. */ static void report_parse_error(JsonParseStack *stack, JsonLexContext *lex) { char *detail = NULL; char *token = NULL; int toklen; /* Handle case where the input ended prematurely. */ if (lex->token_start == NULL) ereport(ERROR, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("invalid input syntax for type json: \"%s\"", lex->input), errdetail_internal("The input string ended unexpectedly."))); /* Work out the offending token. */ toklen = lex->token_terminator - lex->token_start; token = palloc(toklen + 1); memcpy(token, lex->token_start, toklen); token[toklen] = '\0'; /* Select correct detail message. */ if (stack == NULL) detail = "line %d: Expected end of input, but found \"%s\"."; else { switch (stack->state) { case JSON_PARSE_VALUE: detail = "line %d: Expected string, number, object, array, true, false, or null, but found \"%s\"."; break; case JSON_PARSE_ARRAY_START: detail = "line %d: Expected array element or \"]\", but found \"%s\"."; break; case JSON_PARSE_ARRAY_NEXT: detail = "line %d: Expected \",\" or \"]\", but found \"%s\"."; break; case JSON_PARSE_OBJECT_START: detail = "line %d: Expected string or \"}\", but found \"%s\"."; break; case JSON_PARSE_OBJECT_LABEL: detail = "line %d: Expected \":\", but found \"%s\"."; break; case JSON_PARSE_OBJECT_NEXT: detail = "line %d: Expected \",\" or \"}\", but found \"%s\"."; break; case JSON_PARSE_OBJECT_COMMA: detail = "line %d: Expected string, but found \"%s\"."; break; } } ereport(ERROR, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("invalid input syntax for type json: \"%s\"", lex->input), errdetail_internal(detail, lex->line_number, token))); } /* * Report an invalid input token. */ static void report_invalid_token(JsonLexContext *lex) { char *token; int toklen; toklen = lex->token_terminator - lex->token_start; token = palloc(toklen + 1); memcpy(token, lex->token_start, toklen); token[toklen] = '\0'; ereport(ERROR, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("invalid input syntax for type json"), errdetail_internal("line %d: Token \"%s\" is invalid.", lex->line_number, token))); } /* * Extract a single, possibly multi-byte char from the input string. */ static char * extract_mb_char(char *s) { char *res; int len; len = pg_mblen(s); res = palloc(len + 1); memcpy(res, s, len); res[len] = '\0'; return res; }