From: William Langford <wlangfor@gmail.com> Date: Wed, 25 Jan 2017 04:05:47 +0000 (-0500) Subject: Handle cut-off UTF-8 sequences when reading files X-Git-Tag: jq-1.6rc1~92^2 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=e84d17196c03da6e6dc56f4fcd319a7fe84f8dbc;p=jq Handle cut-off UTF-8 sequences when reading files Read additional bytes from the file to complete the UTF-8 sequence so the bytes in it don't get converted to U+FFFD replacement characters. --- diff --git a/Makefile.am b/Makefile.am index 2a3aded..c1eaf6d 100644 --- a/Makefile.am +++ b/Makefile.am @@ -115,7 +115,7 @@ endif ### Tests (make check) -TESTS = tests/optionaltest tests/mantest tests/jqtest tests/onigtest tests/shtest +TESTS = tests/optionaltest tests/mantest tests/jqtest tests/onigtest tests/shtest tests/utf8test TESTS_ENVIRONMENT = NO_VALGRIND=$(NO_VALGRIND) diff --git a/src/jv_file.c b/src/jv_file.c index 33d327c..3159df5 100644 --- a/src/jv_file.c +++ b/src/jv_file.c @@ -4,6 +4,7 @@ #include <stdlib.h> #include <string.h> #include "jv.h" +#include "jv_unicode.h" jv jv_load_file(const char* filename, int raw) { FILE* file = fopen(filename, "r"); @@ -20,11 +21,23 @@ jv jv_load_file(const char* filename, int raw) { data = jv_array(); parser = jv_parser_new(0); } + + // To avoid mangling UTF-8 multi-byte sequences that cross the end of our read + // buffer, we need to be able to read the remainder of a sequence and add that + // before appending. 
+ const int max_utf8_len = 4; + char buf[4096+max_utf8_len]; while (!feof(file) && !ferror(file)) { - char buf[4096]; - size_t n = fread(buf, 1, sizeof(buf), file); + size_t n = fread(buf, 1, sizeof(buf)-max_utf8_len, file); + int len = 0; + if (n > 0 && jvp_utf8_backtrack(buf+(n-1), buf, &len) && len > 0) { /* n == 0 (empty file or read landing exactly on EOF): no last byte to inspect, and buf+(n-1) would point before buf */ + if (!feof(file) && !ferror(file)) { + n += fread(buf+n, 1, len, file); + } + } + if (raw) { - data = jv_string_concat(data, jv_string_sized(buf, (int)n)); + data = jv_string_append_buf(data, buf, n); } else { jv_parser_set_buf(parser, buf, n, !feof(file)); jv value; diff --git a/src/jv_unicode.c b/src/jv_unicode.c index fbf7454..b3a50b2 100644 --- a/src/jv_unicode.c +++ b/src/jv_unicode.c @@ -3,6 +3,29 @@ #include "jv_unicode.h" #include "jv_utf8_tables.h" +// jvp_utf8_backtrack returns the beginning of the last codepoint in the +// string, assuming that start is the last byte in the string. +// If the last codepoint is incomplete, returns the number of missing bytes via +// *missing_bytes. If there are no leading bytes or an invalid byte is +// encountered, NULL is returned and *missing_bytes is not altered. 
+const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_bytes) { + assert(min <= start); /* start == min (a one-byte buffer) is legal and handled by the branch below */ + if (min == start) { + return min; + } + int length = 0; + int seen = 1; + while (start >= min && (length = utf8_coding_length[(unsigned char)*start]) == UTF8_CONTINUATION_BYTE) { + start--; + seen++; + } + if (length == 0 || length == UTF8_CONTINUATION_BYTE || length - seen < 0) { + return NULL; + } + if (missing_bytes) *missing_bytes = length - seen; + return start; +} + const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) { assert(in <= end); if (in == end) { diff --git a/src/jv_unicode.h b/src/jv_unicode.h index 579c910..558721a 100644 --- a/src/jv_unicode.h +++ b/src/jv_unicode.h @@ -1,6 +1,7 @@ #ifndef JV_UNICODE_H #define JV_UNICODE_H +const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_bytes); const char* jvp_utf8_next(const char* in, const char* end, int* codepoint); int jvp_utf8_is_valid(const char* in, const char* end); diff --git a/tests/jq.test b/tests/jq.test index 630c344..ad4a51e 100644 --- a/tests/jq.test +++ b/tests/jq.test @@ -1312,4 +1312,3 @@ jq: error: syntax error, unexpected INVALID_CHARACTER, expecting $end (Unix shel (.[{}] = 0)? 
null - diff --git a/tests/utf8-truncate.jq b/tests/utf8-truncate.jq new file mode 100644 index 0000000..a6be863 --- /dev/null +++ b/tests/utf8-truncate.jq @@ -0,0 +1,3 @@ +def fill:"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; +def e:"æ¥æ¬èª"; +e == "æ¥æ¬èª" diff --git a/tests/utf8test b/tests/utf8test new file mode 100755 index 0000000..570731b --- /dev/null +++ b/tests/utf8test @@ -0,0 +1,10 @@ +#!/bin/sh + +. "${0%/*}/setup" "$@" + +if [ "`$VALGRIND $Q $JQ -nf $JQTESTDIR/utf8-truncate.jq`" != "true" ]; then + echo "UTF-8 byte sequences that span the jv_load_file read buffer are mangled" + exit 1 +fi + +exit 0