From e98624e5a9aaee0cf359d11c1fc4769ec848ea4d Mon Sep 17 00:00:00 2001 From: Stephen Dolan Date: Sat, 1 Sep 2012 19:16:43 +0100 Subject: [PATCH] First pass at a JSON parser --- c/Makefile | 9 +- c/jv.h | 11 +- c/jv_parse.c | 351 +++++++++++++++++++++++++++++++++++++++++++++++++++ c/jv_print.c | 59 ++++++++- c/jvtest.c | 2 - 5 files changed, 421 insertions(+), 11 deletions(-) create mode 100644 c/jv_parse.c diff --git a/c/Makefile b/c/Makefile index b77971d..08e7270 100644 --- a/c/Makefile +++ b/c/Makefile @@ -21,11 +21,12 @@ parsertest: parser.tab.c lexer.yy.c main.c opcode.c bytecode.c compile.c execute $(CC) -o $@ $^ -ljansson jvtest: jvtest.c jv.c jv_print.c - $(CC) -o $@ $^ -ljansson + $(CC) -DNO_JANSSON -o $@ $^ + +jv_parse: jv_parse.c jv.c jv_print.c jv_dtoa.c + $(CC) -DNO_JANSSON -o $@ $^ + test: jvtest valgrind --error-exitcode=1 -q --leak-check=full ./jvtest -jsparse: jsparse.l jv.c - flex -o jsparse.c jsparse.l - gcc -o jsparse jsparse.c -ljansson \ No newline at end of file diff --git a/c/jv.h b/c/jv.h index 52158b7..d815827 100644 --- a/c/jv.h +++ b/c/jv.h @@ -1,10 +1,11 @@ #ifndef JV_H #define JV_H -#include #include #include - +#include +#ifndef NO_JANSSON +#include static json_t* jv_lookup(json_t* t, json_t* k) { json_t* v; if (json_is_object(t) && json_is_string(k)) { @@ -38,7 +39,7 @@ static json_t* jv_insert(json_t* root, json_t* value, json_t** path, int pathlen } return jv_modify(root, *path, jv_insert(jv_lookup(root, *path), value, path+1, pathlen-1)); } - +#endif @@ -116,6 +117,10 @@ jv jv_object_iter_key(jv, int); jv jv_object_iter_value(jv, int); +void jv_dump(jv); + + + #endif diff --git a/c/jv_parse.c b/c/jv_parse.c new file mode 100644 index 0000000..4f7ef01 --- /dev/null +++ b/c/jv_parse.c @@ -0,0 +1,351 @@ +#include +#include +#include "jv.h" +#include "jv_dtoa.h" +jv stack[1000]; +int stackpos = 0; +jv next; +int hasnext; + +typedef const char* presult; + +#define TRY(x) do {presult msg__ = (x); if (msg__) return msg__; } while(0) +#ifdef __GNUC__ +#define pfunc __attribute__((warn_unused_result)) presult +#else +#define pfunc presult +#endif + + + +pfunc value(jv val) { + if (hasnext) return "Expected separator between values"; + hasnext = 1; + next = val; + return 0; +} + +void push(jv v) { + stack[stackpos++] = v; +} + +pfunc token(char ch) { + switch (ch) { + case '[': + if (hasnext) return "Expected separator between values"; + push(jv_array()); + break; + + case '{': + if (hasnext) return "Expected separator between values"; + push(jv_object()); + break; + + case ':': + if (!hasnext) + return "Expected string key before ':'"; + if (stackpos == 0 || jv_get_kind(stack[stackpos-1]) != JV_KIND_OBJECT) + return "':' not as part of an object"; + if (jv_get_kind(next) != JV_KIND_STRING) + return "Object keys must be strings"; + push(next); + hasnext = 0; + break; + + case ',': + if (!hasnext) + return "Expected value before ','"; + if (stackpos == 0) + return "',' not as part of an object or array"; + if (jv_get_kind(stack[stackpos-1]) == JV_KIND_ARRAY) { + stack[stackpos-1] = jv_array_append(stack[stackpos-1], next); + hasnext = 0; + } else if (jv_get_kind(stack[stackpos-1]) == JV_KIND_STRING) { + assert(stackpos > 1 && jv_get_kind(stack[stackpos-2]) == JV_KIND_OBJECT); + stack[stackpos-2] = jv_object_set(stack[stackpos-2], stack[stackpos-1], next); + stackpos--; + hasnext = 0; + } else { + // this case hits on input like {"a", "b"} + return "Objects must consist of key:value pairs"; + } + break; + + case ']': + if (stackpos == 0 || jv_get_kind(stack[stackpos-1]) != JV_KIND_ARRAY) + return "Unmatched ']'"; + if (hasnext) { + stack[stackpos-1] = jv_array_append(stack[stackpos-1], next); + hasnext = 0; + } else { + if (jv_array_length(jv_copy(stack[stackpos-1])) != 0) { + // this case hits on input like [1,2,3,] + return "Expected another array element"; + } + } + hasnext = 1; + next = stack[--stackpos]; + break; + + case '}': + if (stackpos == 0) + return "Unmatched '}'"; + if (hasnext) { + if (jv_get_kind(stack[stackpos-1]) != JV_KIND_STRING) + return "Objects must consist of key:value pairs"; + assert(stackpos > 1 && jv_get_kind(stack[stackpos-2]) == JV_KIND_OBJECT); + stack[stackpos-2] = jv_object_set(stack[stackpos-2], stack[stackpos-1], next); + stackpos--; + hasnext = 0; + } else { + if (jv_get_kind(stack[stackpos-1]) != JV_KIND_OBJECT) + return "Unmatched '}'"; + // FIXME: assert object empty + } + hasnext = 1; + next = stack[--stackpos]; + break; + } + return 0; +} + + +char tokenbuf[1000]; +int tokenpos; +struct dtoa_context dtoa; + +void tokenadd(char c) { + tokenbuf[tokenpos++] = c; +} + +int unhex4(char* hex) { + int r = 0; + for (int i=0; i<4; i++) { + char c = *hex++; + int n; + if ('0' <= c && c <= '9') n = c - '0'; + else if ('a' <= c && c <= 'f') n = c - 'a' + 10; + else if ('A' <= c && c <= 'F') n = c - 'A' + 10; + r <<= 4; + r |= n; + } + return r; +} + +int utf8_encode(int codepoint, char* out) { + assert(codepoint >= 0 && codepoint <= 0x10FFFF); + char* start = out; + if (codepoint <= 0x7F) { + *out++ = codepoint; + } else if (codepoint <= 0x7FF) { + *out++ = 0xC0 + ((codepoint & 0x7C0) >> 6); + *out++ = 0x80 + ((codepoint & 0x03F)); + } else if(codepoint <= 0xFFFF) { + *out++ = 0xE0 + ((codepoint & 0xF000) >> 12); + *out++ = 0x80 + ((codepoint & 0x0FC0) >> 6); + *out++ = 0x80 + ((codepoint & 0x003F)); + } else { + *out++ = 0xF0 + ((codepoint & 0x1C0000) >> 18); + *out++ = 0x80 + ((codepoint & 0x03F000) >> 12); + *out++ = 0x80 + ((codepoint & 0x000FC0) >> 6); + *out++ = 0x80 + ((codepoint & 0x00003F)); + } + return out - start; +} + +pfunc found_string() { + char* in = tokenbuf; + char* out = tokenbuf; + char* end = tokenbuf + tokenpos; + + while (in < end) { + char c = *in++; + if (c == '\\') { + if (in >= end) + return "Expected escape character at end of string"; + c = *in++; + switch (c) { + case '\\': + case '"': + case '/': *out++ = c; break; + case 'b': *out++ = '\b'; break; + case 'f': *out++ = '\f'; break; + case 't': *out++ = '\t'; break; + case 'n': *out++ = '\n'; break; + case 'r': *out++ = '\r'; break; + + case 'u': + /* ahh, the complicated case */ + if (in + 4 > end) + return "Invalid \\uXXXX escape"; + unsigned long codepoint = unhex4(in); + in += 4; + if (0xD800 <= codepoint && codepoint <= 0xDBFF) { + /* who thought UTF-16 surrogate pairs were a good idea? */ + if (in + 6 > end || in[0] != '\\' || in[1] != 'u') + return "Invalid \\uXXXX\\uXXXX surrogate pair escape"; + unsigned long surrogate = unhex4(in+2); + if (!(0xDC00 <= surrogate && surrogate <= 0xDFFF)) + return "Invalid \\uXXXX\\uXXXX surrogate pair escape"; + in += 6; + codepoint = 0x10000 + (((codepoint - 0xD800) << 10) + |(surrogate - 0xDC00)); + } + // FIXME assert valid codepoint + out += utf8_encode(codepoint, out); + break; + + default: + return "Invalid escape"; + } + } else { + *out++ = c; + } + } + TRY(value(jv_string_sized(tokenbuf, out - tokenbuf))); + tokenpos=0; + return 0; +} + +pfunc check_literal() { + if (tokenpos == 0) return 0; + + const char* pattern = 0; + int plen; + jv v; + switch (tokenbuf[0]) { + case 't': pattern = "true"; plen = 4; v = jv_true(); break; + case 'f': pattern = "false"; plen = 5; v = jv_false(); break; + case 'n': pattern = "null"; plen = 4; v = jv_null(); break; + } + if (pattern) { + if (tokenpos != plen) return "Invalid literal"; + for (int i=0; i -void jv_dump(jv x) { +#include "jv_dtoa.h" + +static void jv_dump_string(jv str, int ascii_only) { + assert(jv_get_kind(str) == JV_KIND_STRING); + const char* i = jv_string_value(str); + const char* end = i + jv_string_length(str); + while (i < end) { + int unicode_escape; + int c = (unsigned char)*i++; + if (0x20 <= c && c <= 0x7E) { + // printable ASCII + if (c == '"' || c == '\\') { + putchar('\\'); + } + putchar(c); + unicode_escape = 0; + } else if (c < 0x20 || c == 0x7F) { + // ASCII control character + switch (c) { + case '\b': + putchar('\\'); + putchar('b'); + break; + case '\t': + putchar('\\'); + putchar('t'); + break; + case '\r': + putchar('\\'); + putchar('r'); + break; + case '\n': + putchar('\\'); + putchar('n'); + break; + case '\f': + putchar('\\'); + putchar('f'); + break; + default: + unicode_escape = 1; + break; + } + } + } +} + +static void jv_dump_term(struct dtoa_context* C, jv x) { + char buf[JVP_DTOA_FMT_MAX_LEN]; switch (jv_get_kind(x)) { case JV_KIND_NULL: printf("null"); @@ -13,7 +61,7 @@ void jv_dump(jv x) { printf("true"); break; case JV_KIND_NUMBER: - printf("%f", jv_number_value(x)); + printf("%s", jvp_dtoa_fmt(C, buf, jv_number_value(x))); break; case JV_KIND_STRING: // FIXME: all sorts of broken @@ -43,3 +91,10 @@ void jv_dump(jv x) { } jv_free(x); } + +void jv_dump(jv x) { + struct dtoa_context C; + jvp_dtoa_context_init(&C); + jv_dump_term(&C, x); + jvp_dtoa_context_free(&C); +} diff --git a/c/jvtest.c b/c/jvtest.c index 667d706..2b5a627 100644 --- a/c/jvtest.c +++ b/c/jvtest.c @@ -3,8 +3,6 @@ #include #include "jv.h" -void jv_dump(jv); - int main(){ /// Arrays and numbers { -- 2.40.0