From: Stephen Dolan Date: Sat, 22 Jun 2013 12:34:24 +0000 (+0100) Subject: Fix various UTF8 parsing bugs. X-Git-Tag: jq-1.4~90^2~1 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=ff48bd6ec538b01d1057be8e93b94eef6914e9ef;p=jq Fix various UTF8 parsing bugs. In particular, parse bad UTF8 by replacing the broken bits with U+FFFD and resynchronise correctly after broken sequences. --- diff --git a/jv.c b/jv.c index 9316aec..0c69a56 100644 --- a/jv.c +++ b/jv.c @@ -377,6 +377,32 @@ static jvp_string* jvp_string_alloc(uint32_t size) { return s; } +/* Copy a UTF8 string, replacing all badly encoded points with U+FFFD */ +static jv_nontrivial jvp_string_copy_replace_bad(const char* data, uint32_t length) { + const char* end = data + length; + const char* i = data; + const char* cstart; + + uint32_t maxlength = length * 3 + 1; // worst case: all bad bytes, each becomes a 3-byte U+FFFD + jvp_string* s = jvp_string_alloc(maxlength); + char* out = s->data; + int c = 0; + + while ((i = jvp_utf8_next((cstart = i), end, &c))) { + if (c == -1) { + c = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER + } + out += jvp_utf8_encode(c, out); + assert(out < s->data + maxlength); + } + length = out - s->data; + s->data[length] = 0; + s->length_hashed = length << 1; + jv_nontrivial r = {&s->refcnt, {0,0}}; + return r; +} + +/* Assumes valid UTF8 */ static jv_nontrivial jvp_string_new(const char* data, uint32_t length) { jvp_string* s = jvp_string_alloc(length); s->length_hashed = length << 1; @@ -523,7 +549,9 @@ static int jvp_string_equal(jv_nontrivial* a, jv_nontrivial* b) { jv jv_string_sized(const char* str, int len) { jv j; j.kind = JV_KIND_STRING; - j.val.nontrivial = jvp_string_new(str, len); + j.val.nontrivial = jvp_utf8_is_valid(str, str+len) ? 
+ jvp_string_new(str, len) : + jvp_string_copy_replace_bad(str, len); return j; } @@ -568,14 +596,21 @@ jv jv_string_concat(jv a, jv b) { } jv jv_string_append_buf(jv a, const char* buf, int len) { - jvp_string_append(&a.val.nontrivial, buf, len); + if (jvp_utf8_is_valid(buf, buf+len)) { + jvp_string_append(&a.val.nontrivial, buf, len); + } else { + jv b; + b.kind = JV_KIND_STRING; + b.val.nontrivial = jvp_string_copy_replace_bad(buf, len); + a = jv_string_concat(a, b); + } return a; } jv jv_string_append_str(jv a, const char* str) { return jv_string_append_buf(a, str, strlen(str)); } - + jv jv_string_fmt(const char* fmt, ...) { int size = 1024; while (1) { diff --git a/jv_unicode.c b/jv_unicode.c index 375ad36..a5305cf 100644 --- a/jv_unicode.c +++ b/jv_unicode.c @@ -3,35 +3,56 @@ #include "jv_unicode.h" #include "jv_utf8_tables.h" -const char* jvp_utf8_next(const char* in, const char* end, int* codepoint) { +const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) { + assert(in <= end); if (in == end) { - codepoint = 0; return 0; } + int codepoint = -1; unsigned char first = (unsigned char)in[0]; int length = utf8_coding_length[first]; - if (length == 0 || length == UTF8_CONTINUATION_BYTE || in + length > end) { - *codepoint = -1; - return 0; - } - *codepoint = ((unsigned)in[0]) & utf8_coding_bits[first]; - for (int i=1; i end) { + /* String ends before UTF8 sequence ends */ + length = end - in; + } else { + codepoint = ((unsigned)in[0]) & utf8_coding_bits[first]; + for (int i=1; i 0x10FFFF) { + /* Outside Unicode range */ + codepoint = -1; } - *codepoint = (*codepoint << 6) | (ch & 0x3f); } + assert(length > 0); + *codepoint_ret = codepoint; return in + length; } -int jvp_utf8_verify(const char* in, const char* end) { - int codepoint = 0; +int jvp_utf8_is_valid(const char* in, const char* end) { + int codepoint; while ((in = jvp_utf8_next(in, end, &codepoint))) { if (codepoint == -1) return 0; } - return codepoint != -1; + return 1; } int 
jvp_utf8_encode_length(int codepoint) { diff --git a/jv_unicode.h b/jv_unicode.h index 78c7a40..579c910 100644 --- a/jv_unicode.h +++ b/jv_unicode.h @@ -2,7 +2,7 @@ #define JV_UNICODE_H const char* jvp_utf8_next(const char* in, const char* end, int* codepoint); - +int jvp_utf8_is_valid(const char* in, const char* end); int jvp_utf8_decode_length(char startchar);