From eca74dcccfc743347709f48ee717bb1339d8dc3b Mon Sep 17 00:00:00 2001 From: dota17 Date: Wed, 8 Jan 2020 19:42:05 +0800 Subject: [PATCH] test utf8 --- json_tokener.c | 47 +++++++++++++++++++++++++++++++++++++-- json_tokener.h | 6 +++++ tests/test_parse.c | 35 +++++++++++++++++++++++++++++ tests/test_parse.expected | 26 +++++++++++++++++++++- 4 files changed, 111 insertions(+), 3 deletions(-) diff --git a/json_tokener.c b/json_tokener.c index fc8fb65..2a8451d 100644 --- a/json_tokener.c +++ b/json_tokener.c @@ -83,6 +83,7 @@ static const char* json_tokener_errors[] = { "object value separator ',' expected", "invalid string sequence", "expected comment", + "invalid utf-8 string", "buffer size overflow" }; @@ -222,8 +223,12 @@ struct json_object* json_tokener_parse_verbose(const char *str, : \ (((tok)->err = json_tokener_continue), 0) \ ) : \ - (((dest) = *str), 1) \ - ) + (((tok->flags & JSON_TOKENER_STRICT) && \ + (!json_tokener_validate_utf8(*str, nBytesp)))? \ + ((tok->err = json_tokener_error_parse_utf8_string), 0) \ + : \ + (((dest) = *str), 1) \ + )) /* ADVANCE_CHAR() macro: * Increments str & tok->char_offset. 
@@ -242,6 +247,9 @@ struct json_object* json_tokener_parse_ex(struct json_tokener *tok,
 {
   struct json_object *obj = NULL;
   char c = '\1';
+	unsigned int nBytes = 0; /* UTF-8 continuation bytes still expected; 0 = at a character boundary */
+	unsigned int *nBytesp = &nBytes; /* the PEEK_CHAR() macro passes this to json_tokener_validate_utf8() */
+
 #ifdef HAVE_USELOCALE
   locale_t oldlocale = uselocale(NULL);
   locale_t newloc;
@@ -948,6 +956,10 @@ struct json_object* json_tokener_parse_ex(struct json_tokener *tok,
   } /* while(PEEK_CHAR) */
 
  out:
+	if ((tok->flags & JSON_TOKENER_STRICT) && (nBytes != 0)) /* stopped with continuation bytes still pending */
+	{
+		tok->err = json_tokener_error_parse_utf8_string;
+	}
   if (c &&
      (state == json_tokener_state_finish) &&
      (tok->depth == 0) &&
@@ -985,6 +997,37 @@ struct json_object* json_tokener_parse_ex(struct json_tokener *tok,
   return NULL;
 }
 
+json_bool json_tokener_validate_utf8(const char c, unsigned int *nBytes)
+{
+	unsigned char chr = c; /* compare as unsigned: plain char may be signed */
+	if (*nBytes == 0) /* at a character boundary: c is ASCII or must be a lead byte */
+	{
+		if (chr >= 0x80)
+		{
+			if (chr < 0xC0 || chr > 0xFD) /* stray continuation byte, or 0xFE/0xFF which never occur in UTF-8 */
+				return 0;
+			else if (chr >= 0xFC) /* 5- and 6-byte leads stay accepted: the tests in this patch expect it */
+				*nBytes = 6;
+			else if (chr >= 0xF8)
+				*nBytes = 5;
+			else if (chr >= 0xF0)
+				*nBytes = 4;
+			else if (chr >= 0xE0)
+				*nBytes = 3;
+			else
+				*nBytes = 2;
+			(*nBytes)--; /* the lead byte itself has just been consumed */
+		}
+	}
+	else /* inside a multi-byte character: c must be a continuation byte (10xxxxxx) */
+	{
+		if ((chr & 0xC0) != 0x80)
+			return 0;
+		(*nBytes)--;
+	}
+	return 1;
+}
+
 void json_tokener_set_flags(struct json_tokener *tok, int flags)
 {
 	tok->flags = flags;
diff --git a/json_tokener.h b/json_tokener.h
index da2b24c..061f81b 100644
--- a/json_tokener.h
+++ b/json_tokener.h
@@ -38,6 +38,7 @@ enum json_tokener_error {
   json_tokener_error_parse_object_value_sep,
   json_tokener_error_parse_string,
   json_tokener_error_parse_comment,
+  json_tokener_error_parse_utf8_string,
   json_tokener_error_size
 };
 
@@ -162,6 +163,11 @@ JSON_EXPORT void json_tokener_reset(struct json_tokener *tok);
 JSON_EXPORT struct json_object* json_tokener_parse(const char *str);
 JSON_EXPORT struct json_object* json_tokener_parse_verbose(const char *str, enum json_tokener_error *error);
 
+/**
+ * Validate one byte of a UTF-8 stream (applied by the parser in strict mode); *nBytes counts the continuation bytes still expected.
+ * Returns 1 if the byte is acceptable at this position, 0 if it is not valid UTF-8.
+ */ +json_bool json_tokener_validate_utf8(const char c, unsigned int *nBytes); /** * Set flags that control how parsing will be done. */ diff --git a/tests/test_parse.c b/tests/test_parse.c index 807b457..14d4b11 100644 --- a/tests/test_parse.c +++ b/tests/test_parse.c @@ -355,6 +355,41 @@ struct incremental_step { { "[1,2,3,]", -1, 7, json_tokener_error_parse_unexpected, 3 }, { "{\"a\":1,}", -1, 7, json_tokener_error_parse_unexpected, 3 }, + // utf-8 test + // ASCII encoding + { "\x22\x31\x32\x33\x61\x73\x63\x24\x25\x26\x22",-1, -1, json_tokener_success, 3 }, + { "\x22\x31\x32\x33\x61\x73\x63\x24\x25\x26\x22",-1, -1, json_tokener_success, 1 }, + // utf-8 encoding + { "\x22\xe4\xb8\x96\xe7\x95\x8c\x22",-1, -1, json_tokener_success, 3 }, + { "\x22\xe4\xb8",-1, -1, json_tokener_error_parse_utf8_string, 2 }, + { "\x96\xe7\x95\x8c\x22",-1, 0, json_tokener_error_parse_utf8_string, 3 }, + { "\x22\xe4\xb8\x96\xe7\x95\x8c\x22",-1, -1, json_tokener_success, 1 }, + { "\x22\xcf\x80\xcf\x86\x22",-1, -1, json_tokener_success, 3 }, + { "\x22\xf0\xa5\x91\x95\x22",-1, -1, json_tokener_success, 3 }, + { "\x22\xf8\xa5\xa5\x91\x95\x22",-1, -1, json_tokener_success, 3 }, + { "\x22\xfd\xa5\xa5\xa5\x91\x95\x22",-1, -1, json_tokener_success, 3 }, + // wrong utf-8 encoding + { "\x22\xe6\x9d\x4e\x22",-1, 3, json_tokener_error_parse_utf8_string, 3 }, + { "\x22\xe6\x9d\x4e\x22",-1, 5, json_tokener_success, 1 }, + // GBK encoding + { "\x22\xc0\xee\xc5\xf4\x22",-1, 2, json_tokener_error_parse_utf8_string, 3 }, + { "\x22\xc0\xee\xc5\xf4\x22",-1, 6, json_tokener_success, 1 }, + // char after space + { "\x20\x20\x22\xe4\xb8\x96\x22",-1, -1, json_tokener_success, 3 }, + { "\x20\x20\x81\x22\xe4\xb8\x96\x22",-1, 2, json_tokener_error_parse_utf8_string, 3 }, + { "\x5b\x20\x81\x31\x5d",-1, 2, json_tokener_error_parse_utf8_string, 3 }, + // char in state inf + { "\x49\x6e\x66\x69\x6e\x69\x74\x79",9, 8, json_tokener_success, 1 }, + { "\x49\x6e\x66\x81\x6e\x69\x74\x79",-1, 3, 
json_tokener_error_parse_utf8_string, 3 }, + // char in escape unicode + { "\x22\x5c\x75\x64\x38\x35\x35\x5c\x75\x64\x63\x35\x35\x22",15, 14, json_tokener_success, 3 }, + { "\x22\x5c\x75\x64\x38\x35\x35\xc0\x75\x64\x63\x35\x35\x22",-1, 8, json_tokener_error_parse_utf8_string, 3 }, + { "\x22\x5c\x75\x64\x30\x30\x33\x31\xc0\x22",-1, 9, json_tokener_error_parse_utf8_string, 3 }, + // char in number + { "\x31\x31\x81\x31\x31",-1, 2, json_tokener_error_parse_utf8_string, 3 }, + // char in object + { "\x7b\x22\x31\x81\x22\x3a\x31\x7d",-1, 3, json_tokener_error_parse_utf8_string, 3 }, + { NULL, -1, -1, json_tokener_success, 0 }, }; diff --git a/tests/test_parse.expected b/tests/test_parse.expected index af075b0..a5c2454 100644 --- a/tests/test_parse.expected +++ b/tests/test_parse.expected @@ -183,5 +183,29 @@ json_tokener_parse_ex(tok, [1,2,3,] , 8) ... OK: got object of type [array] json_tokener_parse_ex(tok, [1,2,,3,] , 9) ... OK: got correct error: unexpected character json_tokener_parse_ex(tok, [1,2,3,] , 8) ... OK: got correct error: unexpected character json_tokener_parse_ex(tok, {"a":1,} , 8) ... OK: got correct error: unexpected character -End Incremental Tests OK=105 ERROR=0 +json_tokener_parse_ex(tok, "123asc$%&" , 11) ... OK: got object of type [string]: "123asc$%&" +json_tokener_parse_ex(tok, "123asc$%&" , 11) ... OK: got object of type [string]: "123asc$%&" +json_tokener_parse_ex(tok, "世界" , 8) ... OK: got object of type [string]: "世界" +json_tokener_parse_ex(tok, "ä¸ , 3) ... OK: got correct error: invalid utf-8 string +json_tokener_parse_ex(tok, –界" , 5) ... OK: got correct error: invalid utf-8 string +json_tokener_parse_ex(tok, "世界" , 8) ... OK: got object of type [string]: "世界" +json_tokener_parse_ex(tok, "πφ" , 6) ... OK: got object of type [string]: "πφ" +json_tokener_parse_ex(tok, "𥑕" , 6) ... OK: got object of type [string]: "𥑕" +json_tokener_parse_ex(tok, "ø¥¥‘•" , 7) ... 
OK: got object of type [string]: "ø¥¥‘•" +json_tokener_parse_ex(tok, "ý¥¥¥‘•" , 8) ... OK: got object of type [string]: "ý¥¥¥‘•" +json_tokener_parse_ex(tok, "æN" , 5) ... OK: got correct error: invalid utf-8 string +json_tokener_parse_ex(tok, "æN" , 5) ... OK: got object of type [string]: "æN" +json_tokener_parse_ex(tok, "ÀîÅô" , 6) ... OK: got correct error: invalid utf-8 string +json_tokener_parse_ex(tok, "ÀîÅô" , 6) ... OK: got object of type [string]: "ÀîÅô" +json_tokener_parse_ex(tok, "世" , 7) ... OK: got object of type [string]: "世" +json_tokener_parse_ex(tok, "世" , 8) ... OK: got correct error: invalid utf-8 string +json_tokener_parse_ex(tok, [ 1] , 5) ... OK: got correct error: invalid utf-8 string +json_tokener_parse_ex(tok, Infinity , 9) ... OK: got object of type [double]: Infinity +json_tokener_parse_ex(tok, Infnity , 8) ... OK: got correct error: invalid utf-8 string +json_tokener_parse_ex(tok, "\ud855\udc55", 15) ... OK: got object of type [string]: "𥑕" +json_tokener_parse_ex(tok, "\ud855Àudc55", 14) ... OK: got correct error: invalid utf-8 string +json_tokener_parse_ex(tok, "\ud0031À" , 10) ... OK: got correct error: invalid utf-8 string +json_tokener_parse_ex(tok, 1111 , 5) ... OK: got correct error: invalid utf-8 string +json_tokener_parse_ex(tok, {"1":1} , 8) ... OK: got correct error: invalid utf-8 string +End Incremental Tests OK=129 ERROR=0 ================================== -- 2.49.0