From a68566bf6a2b44b5932072e85261a971097bc0a9 Mon Sep 17 00:00:00 2001 From: Eric Haszlakiewicz Date: Sun, 21 Jun 2020 18:17:40 +0000 Subject: [PATCH] Issue #616: Change the parsing of surrogate pairs in unicode escapes so it uses a couple of additional states instead of assuming the low surrogate is already present, to ensure that we correctly handle various cases of incremental parsing. --- ChangeLog | 2 + json_tokener.c | 306 ++++++++++++++++++-------------------- json_tokener.h | 2 + tests/test_parse.c | 26 +++- tests/test_parse.expected | 8 +- 5 files changed, 172 insertions(+), 172 deletions(-) diff --git a/ChangeLog b/ChangeLog index b785060..0ca4b59 100644 --- a/ChangeLog +++ b/ChangeLog @@ -25,6 +25,8 @@ Other changes Add json_object_array_shrink() and array_list_shrink() functions. * Add json_object_new_array_ext(int) and array_list_new_2(int) to allow arrays to be allocated with the exact size needed, when known. +* Parsing of surrogate pairs in unicode escapes now properly handles + incremental parsing. *** diff --git a/json_tokener.c b/json_tokener.c index 15ddd17..82cb8d9 100644 --- a/json_tokener.c +++ b/json_tokener.c @@ -295,7 +295,7 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char * } #endif - while (PEEK_CHAR(c, tok)) + while (PEEK_CHAR(c, tok)) // Note: c might be '\0' ! 
{ redo_char: @@ -628,9 +628,11 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char * } break; + // =================================================== + case json_tokener_state_escape_unicode: { - /* Handle a 4-byte sequence, or two sequences if a surrogate pair */ + /* Handle a 4-byte \uNNNN sequence, or two sequences if a surrogate pair */ while (1) { if (!c || !strchr(json_hex_chars, c)) @@ -638,181 +640,153 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char * tok->err = json_tokener_error_parse_string; goto out; } - tok->ucs_char |= ((unsigned int)jt_hexdigit(c) - << ((3 - tok->st_pos) * 4)); + tok->ucs_char |= + ((unsigned int)jt_hexdigit(c) << ((3 - tok->st_pos) * 4)); tok->st_pos++; - if (tok->st_pos < 4) - { - ADVANCE_CHAR(str, tok); - if (!PEEK_CHAR(c, tok)) - { - /* - * We're out of characters in the current call to - * json_tokener_parse(), but a subsequent call might - * provide us with more, so leave our current state - * as-is (including tok->high_surrogate) and return. - */ - goto out; - } - continue; - } - - /* Now, we have a full \uNNNN sequence in tok->ucs_char */ - - if (tok->high_surrogate) - { - if (IS_LOW_SURROGATE(tok->ucs_char)) - { - /* remove the utf8_replacement_char */ - /* which may generate during */ - /* parsing the high surrogate pair. 
*/ - if (!strcmp( - tok->pb->buf, - (char *) - utf8_replacement_char)) - { - printbuf_reset(tok->pb); - } - /* Recalculate the ucs_char, then fall thru to process normally */ - tok->ucs_char = - DECODE_SURROGATE_PAIR( - tok->high_surrogate, - tok->ucs_char); - } - else - { - /* High surrogate was not followed by a low surrogate - * Replace the high and process the rest normally - */ - printbuf_memappend_fast( - tok->pb, - (char *)utf8_replacement_char, - 3); - } - tok->high_surrogate = 0; - } + if (tok->st_pos >= 4) + break; - if (tok->ucs_char < 0x80) - { - unsigned char unescaped_utf[1]; - unescaped_utf[0] = tok->ucs_char; - printbuf_memappend_fast( - tok->pb, (char *)unescaped_utf, 1); - } - else if (tok->ucs_char < 0x800) - { - unsigned char unescaped_utf[2]; - unescaped_utf[0] = - 0xc0 | (tok->ucs_char >> 6); - unescaped_utf[1] = - 0x80 | (tok->ucs_char & 0x3f); - printbuf_memappend_fast( - tok->pb, (char *)unescaped_utf, 2); - } - else if (IS_HIGH_SURROGATE(tok->ucs_char)) + ADVANCE_CHAR(str, tok); + if (!PEEK_CHAR(c, tok)) { - /* Got a high surrogate. Remember it and look for - * the beginning of another \uNNNN sequence, which - * should be the low surrogate. + /* + * We're out of characters in the current call to + * json_tokener_parse(), but a subsequent call might + * provide us with more, so leave our current state + * as-is (including tok->high_surrogate) and return. */ - tok->high_surrogate = tok->ucs_char; - /* Not at end, and the next two chars should be "\u" */ - if ((len == -1 || - len > (tok->char_offset + 2)) && - // str[0] != '0' && // implied by json_hex_chars, above. - (str[1] == '\\') && (str[2] == 'u')) - { - /* Advance through the 16 bit surrogate, and move - * on to the next sequence. The next step is to - * process the following characters. 
- */ - if (!ADVANCE_CHAR(str, tok) || - !ADVANCE_CHAR(str, tok)) - { - printbuf_memappend_fast( - tok->pb, - (char *) - utf8_replacement_char, - 3); - } - /* Advance to the first char of the next sequence and - * continue processing with the next sequence. - */ - if (!ADVANCE_CHAR(str, tok) || - !PEEK_CHAR(c, tok)) - { - printbuf_memappend_fast( - tok->pb, - (char *) - utf8_replacement_char, - 3); - tok->ucs_char = 0; - tok->st_pos = 0; - goto out; - } - tok->ucs_char = 0; - tok->st_pos = 0; - /* other json_tokener_state_escape_unicode */ - continue; - } - else - { - /* Got a high surrogate without another sequence following - * it. Put a replacement char in for the high surrogate - * and pretend we finished. - */ - printbuf_memappend_fast( - tok->pb, - (char *)utf8_replacement_char, - 3); - } - } - else if (IS_LOW_SURROGATE(tok->ucs_char)) - { - /* Got a low surrogate not preceded by a high */ - printbuf_memappend_fast( - tok->pb, (char *)utf8_replacement_char, - 3); - } - else if (tok->ucs_char < 0x10000) - { - unsigned char unescaped_utf[3]; - unescaped_utf[0] = - 0xe0 | (tok->ucs_char >> 12); - unescaped_utf[1] = - 0x80 | ((tok->ucs_char >> 6) & 0x3f); - unescaped_utf[2] = - 0x80 | (tok->ucs_char & 0x3f); - printbuf_memappend_fast( - tok->pb, (char *)unescaped_utf, 3); + goto out; } - else if (tok->ucs_char < 0x110000) + } + tok->st_pos = 0; + + /* Now, we have a full \uNNNN sequence in tok->ucs_char */ + + /* If the *previous* sequence was a high surrogate ... 
*/ + if (tok->high_surrogate) + { + if (IS_LOW_SURROGATE(tok->ucs_char)) { - unsigned char unescaped_utf[4]; - unescaped_utf[0] = - 0xf0 | ((tok->ucs_char >> 18) & 0x07); - unescaped_utf[1] = - 0x80 | ((tok->ucs_char >> 12) & 0x3f); - unescaped_utf[2] = - 0x80 | ((tok->ucs_char >> 6) & 0x3f); - unescaped_utf[3] = - 0x80 | (tok->ucs_char & 0x3f); - printbuf_memappend_fast( - tok->pb, (char *)unescaped_utf, 4); + /* Recalculate the ucs_char, then fall thru to process normally */ + tok->ucs_char = DECODE_SURROGATE_PAIR(tok->high_surrogate, + tok->ucs_char); } else { - /* Don't know what we got--insert the replacement char */ - printbuf_memappend_fast( - tok->pb, (char *)utf8_replacement_char, - 3); + /* High surrogate was not followed by a low surrogate + * Replace the high and process the rest normally + */ + printbuf_memappend_fast(tok->pb, + (char *)utf8_replacement_char, 3); } - state = saved_state; // i.e. _state_string or _object_field + tok->high_surrogate = 0; + } + + if (tok->ucs_char < 0x80) + { + unsigned char unescaped_utf[1]; + unescaped_utf[0] = tok->ucs_char; + printbuf_memappend_fast(tok->pb, (char *)unescaped_utf, 1); + } + else if (tok->ucs_char < 0x800) + { + unsigned char unescaped_utf[2]; + unescaped_utf[0] = 0xc0 | (tok->ucs_char >> 6); + unescaped_utf[1] = 0x80 | (tok->ucs_char & 0x3f); + printbuf_memappend_fast(tok->pb, (char *)unescaped_utf, 2); + } + else if (IS_HIGH_SURROGATE(tok->ucs_char)) + { + /* + * The next two characters should be \u, HOWEVER, + * we can't simply peek ahead here, because the + * characters we need might not be passed to us + * until a subsequent call to json_tokener_parse. + * Instead, transition through a couple of states. + * (now): + * _escape_unicode => _unicode_need_escape + * (see a '\\' char): + * _unicode_need_escape => _unicode_need_u + * (see a 'u' char): + * _unicode_need_u => _escape_unicode + * ...and we'll end up back around here. 
+ */ + tok->high_surrogate = tok->ucs_char; + tok->ucs_char = 0; + state = json_tokener_state_escape_unicode_need_escape; break; } + else if (IS_LOW_SURROGATE(tok->ucs_char)) + { + /* Got a low surrogate not preceded by a high */ + printbuf_memappend_fast(tok->pb, (char *)utf8_replacement_char, 3); + } + else if (tok->ucs_char < 0x10000) + { + unsigned char unescaped_utf[3]; + unescaped_utf[0] = 0xe0 | (tok->ucs_char >> 12); + unescaped_utf[1] = 0x80 | ((tok->ucs_char >> 6) & 0x3f); + unescaped_utf[2] = 0x80 | (tok->ucs_char & 0x3f); + printbuf_memappend_fast(tok->pb, (char *)unescaped_utf, 3); + } + else if (tok->ucs_char < 0x110000) + { + unsigned char unescaped_utf[4]; + unescaped_utf[0] = 0xf0 | ((tok->ucs_char >> 18) & 0x07); + unescaped_utf[1] = 0x80 | ((tok->ucs_char >> 12) & 0x3f); + unescaped_utf[2] = 0x80 | ((tok->ucs_char >> 6) & 0x3f); + unescaped_utf[3] = 0x80 | (tok->ucs_char & 0x3f); + printbuf_memappend_fast(tok->pb, (char *)unescaped_utf, 4); + } + else + { + /* Don't know what we got--insert the replacement char */ + printbuf_memappend_fast(tok->pb, (char *)utf8_replacement_char, 3); + } + state = saved_state; // i.e. _state_string or _state_object_field } break; + case json_tokener_state_escape_unicode_need_escape: + // We get here after processing a high_surrogate + // require a '\\' char + if (!c || c != '\\') + { + /* Got a high surrogate without another sequence following + * it. Put a replacement char in for the high surrogate + * and pop back up to _state_string or _state_object_field. + */ + printbuf_memappend_fast(tok->pb, (char *)utf8_replacement_char, 3); + tok->high_surrogate = 0; + tok->ucs_char = 0; + tok->st_pos = 0; + state = saved_state; + goto redo_char; + } + state = json_tokener_state_escape_unicode_need_u; + break; + + case json_tokener_state_escape_unicode_need_u: + /* We already had a \ char, check that it's \u */ + if (!c || c != 'u') + { + /* Got a high surrogate with some non-unicode escape + * sequence following it. 
+ * Put a replacement char in for the high surrogate + * and handle the escape sequence normally. + */ + printbuf_memappend_fast(tok->pb, (char *)utf8_replacement_char, 3); + tok->high_surrogate = 0; + tok->ucs_char = 0; + tok->st_pos = 0; + state = json_tokener_state_string_escape; + goto redo_char; + } + state = json_tokener_state_escape_unicode; + break; + + // =================================================== + case json_tokener_state_boolean: { int size1, size2; @@ -1146,8 +1120,9 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char * } break; } - if (!ADVANCE_CHAR(str, tok)) - goto out; + (void)ADVANCE_CHAR(str, tok); + if (!c) // This is the char *before* advancing + break; } /* while(PEEK_CHAR) */ out: @@ -1156,7 +1131,8 @@ out: tok->err = json_tokener_error_parse_utf8_string; } if (c && (state == json_tokener_state_finish) && (tok->depth == 0) && - (tok->flags & (JSON_TOKENER_STRICT|JSON_TOKENER_ALLOW_TRAILING_CHARS)) == JSON_TOKENER_STRICT) + (tok->flags & (JSON_TOKENER_STRICT | JSON_TOKENER_ALLOW_TRAILING_CHARS)) == + JSON_TOKENER_STRICT) { /* unexpected char after JSON data */ tok->err = json_tokener_error_parse_unexpected; diff --git a/json_tokener.h b/json_tokener.h index 421ef14..c680603 100644 --- a/json_tokener.h +++ b/json_tokener.h @@ -59,6 +59,8 @@ enum json_tokener_state json_tokener_state_string, json_tokener_state_string_escape, json_tokener_state_escape_unicode, + json_tokener_state_escape_unicode_need_escape, + json_tokener_state_escape_unicode_need_u, json_tokener_state_boolean, json_tokener_state_number, json_tokener_state_array, diff --git a/tests/test_parse.c b/tests/test_parse.c index 57d584c..da82b51 100644 --- a/tests/test_parse.c +++ b/tests/test_parse.c @@ -68,8 +68,8 @@ static void single_incremental_parse(const char *test_string, int clear_serializ if (strcmp(all_at_once_str, new_str) != 0) { - printf("ERROR: failed to parse (%s) in %d byte chunks: %s != %s\n", - test_string, chunksize, 
all_at_once_str, new_str); + printf("ERROR: failed to parse (%s) in %d byte chunks: %s != %s\n", test_string, + chunksize, all_at_once_str, new_str); } json_tokener_free(tok); } @@ -193,8 +193,8 @@ static void test_utf8_parse() // json_tokener_parse doesn't support checking for byte order marks. // It's the responsibility of the caller to detect and skip a BOM. // Both of these checks return null. - char* utf8_bom = "\xEF\xBB\xBF"; - char* utf8_bom_and_chars = "\xEF\xBB\xBF{}"; + char *utf8_bom = "\xEF\xBB\xBF"; + char *utf8_bom_and_chars = "\xEF\xBB\xBF{}"; single_basic_parse(utf8_bom, 0); single_basic_parse(utf8_bom_and_chars, 0); } @@ -245,7 +245,7 @@ struct incremental_step int char_offset; enum json_tokener_error expected_error; int reset_tokener; /* Set to 1 to call json_tokener_reset() after parsing */ - int tok_flags; /* JSON_TOKENER_* flags to pass to json_tokener_set_flags() */ + int tok_flags; /* JSON_TOKENER_* flags to pass to json_tokener_set_flags() */ } incremental_steps[] = { /* Check that full json messages can be parsed, both w/ and w/o a reset */ @@ -268,7 +268,11 @@ struct incremental_step {"\": {\"bar", -1, -1, json_tokener_continue, 0}, {"\":13}}", -1, -1, json_tokener_success, 1}, - /* Check the UTF-16 surrogate pair */ + /* Check the UTF-16 surrogate pair handling in various ways. + * Note: \ud834\udd1e is U+1D11E, Musical Symbol G Clef + * Your terminal may not display these correctly, in particular + * PuTTY doesn't currently show this character. 
+ */ /* parse one char at every time */ {"\"\\", -1, -1, json_tokener_continue, 0}, {"u", -1, -1, json_tokener_continue, 0}, @@ -296,6 +300,16 @@ struct incremental_step {"udd1e\"", -1, -1, json_tokener_success, 1}, {"\"\\ud834\\u", -1, -1, json_tokener_continue, 0}, {"dd1e\"", -1, -1, json_tokener_success, 1}, + {"\"fff \\ud834\\ud", -1, -1, json_tokener_continue, 0}, + {"d1e bar\"", -1, -1, json_tokener_success, 1}, + {"\"fff \\ud834\\udd", -1, -1, json_tokener_continue, 0}, + {"1e bar\"", -1, -1, json_tokener_success, 1}, + + /* \ud83d\ude00 is U+1F600, Grinning Face + * Displays fine in PuTTY, though you may need "less -r" + */ + {"\"fff \\ud83d\\ude", -1, -1, json_tokener_continue, 0}, + {"00 bar\"", -1, -1, json_tokener_success, 1}, /* Check that json_tokener_reset actually resets */ {"{ \"foo", -1, -1, json_tokener_continue, 1}, diff --git a/tests/test_parse.expected b/tests/test_parse.expected index a4b3393..6ed5520 100644 --- a/tests/test_parse.expected +++ b/tests/test_parse.expected @@ -124,6 +124,12 @@ json_tokener_parse_ex(tok, "\ud834\ , 8) ... OK: got correct error: continu json_tokener_parse_ex(tok, udd1e" , 6) ... OK: got object of type [string]: "𝄞" json_tokener_parse_ex(tok, "\ud834\u , 9) ... OK: got correct error: continue json_tokener_parse_ex(tok, dd1e" , 5) ... OK: got object of type [string]: "𝄞" +json_tokener_parse_ex(tok, "fff \ud834\ud, 14) ... OK: got correct error: continue +json_tokener_parse_ex(tok, d1e bar" , 8) ... OK: got object of type [string]: "fff 𝄞 bar" +json_tokener_parse_ex(tok, "fff \ud834\udd, 15) ... OK: got correct error: continue +json_tokener_parse_ex(tok, 1e bar" , 7) ... OK: got object of type [string]: "fff 𝄞 bar" +json_tokener_parse_ex(tok, "fff \ud83d\ude, 15) ... OK: got correct error: continue +json_tokener_parse_ex(tok, 00 bar" , 7) ... OK: got object of type [string]: "fff 😀 bar" json_tokener_parse_ex(tok, { "foo , 6) ... OK: got correct error: continue json_tokener_parse_ex(tok, : "bar"} , 8) ... 
OK: got correct error: unexpected character json_tokener_parse_ex(tok, { "foo , 6) ... OK: got correct error: continue @@ -240,5 +246,5 @@ json_tokener_parse_ex(tok, "\ud855 json_tokener_parse_ex(tok, "\ud0031À" , 10) ... OK: got correct error: invalid utf-8 string json_tokener_parse_ex(tok, 1111 , 5) ... OK: got correct error: invalid utf-8 string json_tokener_parse_ex(tok, {"1":1} , 8) ... OK: got correct error: invalid utf-8 string -End Incremental Tests OK=154 ERROR=0 +End Incremental Tests OK=160 ERROR=0 ================================== -- 2.50.1