From c1b872d8174c9946f8d9383fe600e8ff6650647e Mon Sep 17 00:00:00 2001 From: dota17 Date: Tue, 2 Jun 2020 19:17:42 +0800 Subject: [PATCH] fix issue 616: support the surrogate pair in split file. --- json_tokener.c | 25 ++++++++--- json_tokener.h | 2 +- tests/test_parse.c | 95 ++++++++++++++++++++++++++++----------- tests/test_parse.expected | 26 ++++++++++- 4 files changed, 113 insertions(+), 35 deletions(-) diff --git a/json_tokener.c b/json_tokener.c index 0373d6f..69d7af2 100644 --- a/json_tokener.c +++ b/json_tokener.c @@ -630,8 +630,6 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char * case json_tokener_state_escape_unicode: { - unsigned int got_hi_surrogate = 0; - /* Handle a 4-byte sequence, or two sequences if a surrogate pair */ while (1) { @@ -643,14 +641,24 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char * { unsigned char unescaped_utf[4]; - if (got_hi_surrogate) + if (tok->got_hi_surrogate) { if (IS_LOW_SURROGATE(tok->ucs_char)) { + /* remove the utf8_replacement_char */ + /* which may generate during */ + /* parsing the high surrogate pair. */ + if (!strcmp( + tok->pb->buf, + (char *) + utf8_replacement_char)) + { + printbuf_reset(tok->pb); + } /* Recalculate the ucs_char, then fall thru to process normally */ tok->ucs_char = DECODE_SURROGATE_PAIR( - got_hi_surrogate, + tok->got_hi_surrogate, tok->ucs_char); } else @@ -662,7 +670,7 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char * (char *)utf8_replacement_char, 3); } - got_hi_surrogate = 0; + tok->got_hi_surrogate = 0; } if (tok->ucs_char < 0x80) @@ -686,7 +694,7 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char * * the beginning of another sequence, which * should be the low surrogate. */ - got_hi_surrogate = tok->ucs_char; + tok->got_hi_surrogate = tok->ucs_char; /* Not at end, and the next two chars should be "\u" */ if ((len == -1 || len > (tok->char_offset + 2)) && @@ -717,6 +725,8 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char * (char *) utf8_replacement_char, 3); + tok->ucs_char = 0; + tok->st_pos = 0; goto out; } tok->ucs_char = 0; @@ -786,7 +796,8 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char * if (!ADVANCE_CHAR(str, tok) || !PEEK_CHAR(c, tok)) { /* Clean up any pending chars */ - if (got_hi_surrogate) + if (tok->got_hi_surrogate && + strcmp(tok->pb->buf, (char *)utf8_replacement_char)) printbuf_memappend_fast( tok->pb, (char *)utf8_replacement_char, 3); goto out; diff --git a/json_tokener.h b/json_tokener.h index ea49ad8..8afa6ec 100644 --- a/json_tokener.h +++ b/json_tokener.h @@ -111,7 +111,7 @@ struct json_tokener * @deprecated See json_tokener_get_error() instead. */ enum json_tokener_error err; - unsigned int ucs_char; + unsigned int ucs_char, got_hi_surrogate; char quote_char; struct json_tokener_srec *stack; int flags; diff --git a/tests/test_parse.c b/tests/test_parse.c index 6014ac1..42b2fdf 100644 --- a/tests/test_parse.c +++ b/tests/test_parse.c @@ -224,6 +224,35 @@ struct incremental_step {"\": {\"bar", -1, -1, json_tokener_continue, 0}, {"\":13}}", -1, -1, json_tokener_success, 1}, + /* Check the UTF-16 surrogate pair */ + /* parse one char at every time */ + {"\"\\", -1, -1, json_tokener_continue, 0}, + {"u", -1, -1, json_tokener_continue, 0}, + {"d", -1, -1, json_tokener_continue, 0}, + {"8", -1, -1, json_tokener_continue, 0}, + {"3", -1, -1, json_tokener_continue, 0}, + {"4", -1, -1, json_tokener_continue, 0}, + {"\\", -1, -1, json_tokener_continue, 0}, + {"u", -1, -1, json_tokener_continue, 0}, + {"d", -1, -1, json_tokener_continue, 0}, + {"d", -1, -1, json_tokener_continue, 0}, + {"1", -1, -1, json_tokener_continue, 0}, + {"e\"", -1, -1, json_tokener_success, 1}, + /* parse two char at every time */ + {"\"\\u", -1, -1, json_tokener_continue, 0}, + {"d8", -1, -1, json_tokener_continue, 0}, + {"34", -1, -1, json_tokener_continue, 0}, + {"\\u", -1, -1, json_tokener_continue, 0}, + {"dd", -1, -1, json_tokener_continue, 0}, + {"1e\"", -1, -1, json_tokener_success, 1}, + /* check the low surrogate pair */ + {"\"\\ud834", -1, -1, json_tokener_continue, 0}, + {"\\udd1e\"", -1, -1, json_tokener_success, 1}, + {"\"\\ud834\\", -1, -1, json_tokener_continue, 0}, + {"udd1e\"", -1, -1, json_tokener_success, 1}, + {"\"\\ud834\\u", -1, -1, json_tokener_continue, 0}, + {"dd1e\"", -1, -1, json_tokener_success, 1}, + /* Check that json_tokener_reset actually resets */ {"{ \"foo", -1, -1, json_tokener_continue, 1}, {": \"bar\"}", -1, 0, json_tokener_error_parse_unexpected, 1}, @@ -239,11 +268,13 @@ struct incremental_step {"\"Y\"", -1, -1, json_tokener_success, 1}, /* Trailing characters should cause a failure in strict mode */ - {"{\"foo\":9}{\"bar\":8}", -1, 9, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT }, + {"{\"foo\":9}{\"bar\":8}", -1, 9, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT}, /* ... unless explicitly allowed. */ - {"{\"foo\":9}{\"bar\":8}", -1, 9, json_tokener_success, 0, JSON_TOKENER_STRICT|JSON_TOKENER_ALLOW_TRAILING_CHARS }, - {"{\"b\":8}ignored garbage", -1, 7, json_tokener_success, 1, JSON_TOKENER_STRICT|JSON_TOKENER_ALLOW_TRAILING_CHARS }, + {"{\"foo\":9}{\"bar\":8}", -1, 9, json_tokener_success, 0, + JSON_TOKENER_STRICT | JSON_TOKENER_ALLOW_TRAILING_CHARS}, + {"{\"b\":8}ignored garbage", -1, 7, json_tokener_success, 1, + JSON_TOKENER_STRICT | JSON_TOKENER_ALLOW_TRAILING_CHARS}, /* To stop parsing a number we need to reach a non-digit, e.g. a \0 */ {"1", 1, 1, json_tokener_continue, 0}, @@ -251,7 +282,7 @@ struct incremental_step {"2", 2, 1, json_tokener_success, 0}, {"12{", 3, 2, json_tokener_success, 1}, /* Parse number in strict model */ - {"[02]", -1, 3, json_tokener_error_parse_number, 1, JSON_TOKENER_STRICT }, + {"[02]", -1, 3, json_tokener_error_parse_number, 1, JSON_TOKENER_STRICT}, /* Similar tests for other kinds of objects: */ /* These could all return success immediately, since regardless of @@ -267,8 +298,8 @@ struct incremental_step {"Infinity", 9, 8, json_tokener_success, 1}, {"infinity", 9, 8, json_tokener_success, 1}, {"-infinity", 10, 9, json_tokener_success, 1}, - {"infinity", 9, 0, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT }, - {"-infinity", 10, 1, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT }, + {"infinity", 9, 0, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT}, + {"-infinity", 10, 1, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT}, {"inf", 3, 3, json_tokener_continue, 0}, {"inity", 6, 5, json_tokener_success, 1}, @@ -350,7 +381,7 @@ struct incremental_step {"\"\\a\"", -1, 2, json_tokener_error_parse_string, 1}, /* Check '\'' in strict model */ - {"\'foo\'", -1, 0, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT }, + {"\'foo\'", -1, 0, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT}, /* Parse array/object */ {"[1,2,3]", -1, -1, json_tokener_success, 0}, @@ -372,42 +403,54 @@ struct incremental_step {"[1,2,3,]", -1, -1, json_tokener_success, 0}, {"[1,2,,3,]", -1, 5, json_tokener_error_parse_unexpected, 0}, - {"[1,2,3,]", -1, 7, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT }, - {"{\"a\":1,}", -1, 7, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT }, + {"[1,2,3,]", -1, 7, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT}, + {"{\"a\":1,}", -1, 7, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT}, // utf-8 test // acsll encoding - {"\x22\x31\x32\x33\x61\x73\x63\x24\x25\x26\x22", -1, -1, json_tokener_success, 1, JSON_TOKENER_VALIDATE_UTF8 }, + {"\x22\x31\x32\x33\x61\x73\x63\x24\x25\x26\x22", -1, -1, json_tokener_success, 1, + JSON_TOKENER_VALIDATE_UTF8}, {"\x22\x31\x32\x33\x61\x73\x63\x24\x25\x26\x22", -1, -1, json_tokener_success, 1}, // utf-8 encoding - {"\x22\xe4\xb8\x96\xe7\x95\x8c\x22", -1, -1, json_tokener_success, 1, JSON_TOKENER_VALIDATE_UTF8 }, - {"\x22\xe4\xb8", -1, 3, json_tokener_error_parse_utf8_string, 0, JSON_TOKENER_VALIDATE_UTF8 }, - {"\x96\xe7\x95\x8c\x22", -1, 0, json_tokener_error_parse_utf8_string, 1, JSON_TOKENER_VALIDATE_UTF8 }, + {"\x22\xe4\xb8\x96\xe7\x95\x8c\x22", -1, -1, json_tokener_success, 1, + JSON_TOKENER_VALIDATE_UTF8}, + {"\x22\xe4\xb8", -1, 3, json_tokener_error_parse_utf8_string, 0, JSON_TOKENER_VALIDATE_UTF8}, + {"\x96\xe7\x95\x8c\x22", -1, 0, json_tokener_error_parse_utf8_string, 1, + JSON_TOKENER_VALIDATE_UTF8}, {"\x22\xe4\xb8\x96\xe7\x95\x8c\x22", -1, -1, json_tokener_success, 1}, - {"\x22\xcf\x80\xcf\x86\x22", -1, -1, json_tokener_success, 1, JSON_TOKENER_VALIDATE_UTF8 }, - {"\x22\xf0\xa5\x91\x95\x22", -1, -1, json_tokener_success, 1, JSON_TOKENER_VALIDATE_UTF8 }, + {"\x22\xcf\x80\xcf\x86\x22", -1, -1, json_tokener_success, 1, JSON_TOKENER_VALIDATE_UTF8}, + {"\x22\xf0\xa5\x91\x95\x22", -1, -1, json_tokener_success, 1, JSON_TOKENER_VALIDATE_UTF8}, // wrong utf-8 encoding - {"\x22\xe6\x9d\x4e\x22", -1, 3, json_tokener_error_parse_utf8_string, 1, JSON_TOKENER_VALIDATE_UTF8 }, + {"\x22\xe6\x9d\x4e\x22", -1, 3, json_tokener_error_parse_utf8_string, 1, + JSON_TOKENER_VALIDATE_UTF8}, {"\x22\xe6\x9d\x4e\x22", -1, 5, json_tokener_success, 1}, // GBK encoding - {"\x22\xc0\xee\xc5\xf4\x22", -1, 2, json_tokener_error_parse_utf8_string, 1, JSON_TOKENER_VALIDATE_UTF8 }, + {"\x22\xc0\xee\xc5\xf4\x22", -1, 2, json_tokener_error_parse_utf8_string, 1, + JSON_TOKENER_VALIDATE_UTF8}, {"\x22\xc0\xee\xc5\xf4\x22", -1, 6, json_tokener_success, 1}, // char after space - {"\x20\x20\x22\xe4\xb8\x96\x22", -1, -1, json_tokener_success, 1, JSON_TOKENER_VALIDATE_UTF8 }, - {"\x20\x20\x81\x22\xe4\xb8\x96\x22", -1, 2, json_tokener_error_parse_utf8_string, 1, JSON_TOKENER_VALIDATE_UTF8 }, - {"\x5b\x20\x81\x31\x5d", -1, 2, json_tokener_error_parse_utf8_string, 1, JSON_TOKENER_VALIDATE_UTF8 }, + {"\x20\x20\x22\xe4\xb8\x96\x22", -1, -1, json_tokener_success, 1, JSON_TOKENER_VALIDATE_UTF8}, + {"\x20\x20\x81\x22\xe4\xb8\x96\x22", -1, 2, json_tokener_error_parse_utf8_string, 1, + JSON_TOKENER_VALIDATE_UTF8}, + {"\x5b\x20\x81\x31\x5d", -1, 2, json_tokener_error_parse_utf8_string, 1, + JSON_TOKENER_VALIDATE_UTF8}, // char in state inf {"\x49\x6e\x66\x69\x6e\x69\x74\x79", 9, 8, json_tokener_success, 1}, - {"\x49\x6e\x66\x81\x6e\x69\x74\x79", -1, 3, json_tokener_error_parse_utf8_string, 1, JSON_TOKENER_VALIDATE_UTF8 }, + {"\x49\x6e\x66\x81\x6e\x69\x74\x79", -1, 3, json_tokener_error_parse_utf8_string, 1, + JSON_TOKENER_VALIDATE_UTF8}, // char in escape unicode - {"\x22\x5c\x75\x64\x38\x35\x35\x5c\x75\x64\x63\x35\x35\x22", 15, 14, json_tokener_success, 1, JSON_TOKENER_VALIDATE_UTF8 }, + {"\x22\x5c\x75\x64\x38\x35\x35\x5c\x75\x64\x63\x35\x35\x22", 15, 14, json_tokener_success, 1, + JSON_TOKENER_VALIDATE_UTF8}, {"\x22\x5c\x75\x64\x38\x35\x35\xc0\x75\x64\x63\x35\x35\x22", -1, 8, - json_tokener_error_parse_utf8_string, 1, JSON_TOKENER_VALIDATE_UTF8 }, - {"\x22\x5c\x75\x64\x30\x30\x33\x31\xc0\x22", -1, 9, json_tokener_error_parse_utf8_string, 1, JSON_TOKENER_VALIDATE_UTF8 }, + json_tokener_error_parse_utf8_string, 1, JSON_TOKENER_VALIDATE_UTF8}, + {"\x22\x5c\x75\x64\x30\x30\x33\x31\xc0\x22", -1, 9, json_tokener_error_parse_utf8_string, 1, + JSON_TOKENER_VALIDATE_UTF8}, // char in number - {"\x31\x31\x81\x31\x31", -1, 2, json_tokener_error_parse_utf8_string, 1, JSON_TOKENER_VALIDATE_UTF8 }, + {"\x31\x31\x81\x31\x31", -1, 2, json_tokener_error_parse_utf8_string, 1, + JSON_TOKENER_VALIDATE_UTF8}, // char in object - {"\x7b\x22\x31\x81\x22\x3a\x31\x7d", -1, 3, json_tokener_error_parse_utf8_string, 1, JSON_TOKENER_VALIDATE_UTF8 }, + {"\x7b\x22\x31\x81\x22\x3a\x31\x7d", -1, 3, json_tokener_error_parse_utf8_string, 1, + JSON_TOKENER_VALIDATE_UTF8}, {NULL, -1, -1, json_tokener_success, 0}, }; diff --git a/tests/test_parse.expected b/tests/test_parse.expected index 77e8be1..df74fbd 100644 --- a/tests/test_parse.expected +++ b/tests/test_parse.expected @@ -100,6 +100,30 @@ json_tokener_parse_ex(tok, // hello"foo", 13) ... OK: got correct error: contin json_tokener_parse_ex(tok, { "foo , 6) ... OK: got correct error: continue json_tokener_parse_ex(tok, ": {"bar , 8) ... OK: got correct error: continue json_tokener_parse_ex(tok, ":13}} , 6) ... OK: got object of type [object]: { "foo": { "bar": 13 } } +json_tokener_parse_ex(tok, "\ , 2) ... OK: got correct error: continue +json_tokener_parse_ex(tok, u , 1) ... OK: got correct error: continue +json_tokener_parse_ex(tok, d , 1) ... OK: got correct error: continue +json_tokener_parse_ex(tok, 8 , 1) ... OK: got correct error: continue +json_tokener_parse_ex(tok, 3 , 1) ... OK: got correct error: continue +json_tokener_parse_ex(tok, 4 , 1) ... OK: got correct error: continue +json_tokener_parse_ex(tok, \ , 1) ... OK: got correct error: continue +json_tokener_parse_ex(tok, u , 1) ... OK: got correct error: continue +json_tokener_parse_ex(tok, d , 1) ... OK: got correct error: continue +json_tokener_parse_ex(tok, d , 1) ... OK: got correct error: continue +json_tokener_parse_ex(tok, 1 , 1) ... OK: got correct error: continue +json_tokener_parse_ex(tok, e" , 2) ... OK: got object of type [string]: "𝄞" +json_tokener_parse_ex(tok, "\u , 3) ... OK: got correct error: continue +json_tokener_parse_ex(tok, d8 , 2) ... OK: got correct error: continue +json_tokener_parse_ex(tok, 34 , 2) ... OK: got correct error: continue +json_tokener_parse_ex(tok, \u , 2) ... OK: got correct error: continue +json_tokener_parse_ex(tok, dd , 2) ... OK: got correct error: continue +json_tokener_parse_ex(tok, 1e" , 3) ... OK: got object of type [string]: "𝄞" +json_tokener_parse_ex(tok, "\ud834 , 7) ... OK: got correct error: continue +json_tokener_parse_ex(tok, \udd1e" , 7) ... OK: got object of type [string]: "𝄞" +json_tokener_parse_ex(tok, "\ud834\ , 8) ... OK: got correct error: continue +json_tokener_parse_ex(tok, udd1e" , 6) ... OK: got object of type [string]: "𝄞" +json_tokener_parse_ex(tok, "\ud834\u , 9) ... OK: got correct error: continue +json_tokener_parse_ex(tok, dd1e" , 5) ... OK: got object of type [string]: "𝄞" json_tokener_parse_ex(tok, { "foo , 6) ... OK: got correct error: continue json_tokener_parse_ex(tok, : "bar"} , 8) ... OK: got correct error: unexpected character json_tokener_parse_ex(tok, { "foo , 6) ... OK: got correct error: continue @@ -216,5 +240,5 @@ json_tokener_parse_ex(tok, "\ud855 json_tokener_parse_ex(tok, "\ud0031À" , 10) ... OK: got correct error: invalid utf-8 string json_tokener_parse_ex(tok, 1111 , 5) ... OK: got correct error: invalid utf-8 string json_tokener_parse_ex(tok, {"1":1} , 8) ... OK: got correct error: invalid utf-8 string -End Incremental Tests OK=130 ERROR=0 +End Incremental Tests OK=154 ERROR=0 ================================== -- 2.40.0