From: Alex Dowad
Date: Tue, 13 Oct 2020 13:17:00 +0000 (+0200)
Subject: UTF-16 text conversion handles truncated characters as illegal
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=d9ddeb6e85d1dc3ddc3a0ea2e2014b2bc527e946;p=php

UTF-16 text conversion handles truncated characters as illegal

This broke one old test (Zend/tests/multibyte_encoding_003.phpt), which used a PHP script encoded as UTF-16. The problem was that to terminate the test script, we need the text: "\n--EXPECT--". Out of that text, the terminating newline (0x0A byte) becomes part of the resulting test script; but a bare 0x0A byte with no accompanying 0x00 byte is not valid UTF-16. Since we now treat truncated UTF-16 characters as erroneous, an extra '?' is appended to the output as an 'illegal character' marker.

Really, if we are running PHP scripts which are treated as encoded in UTF-16 or some other arbitrary text encoding (not ASCII), and the script is not actually a valid string in that encoding, inserting '?' characters into the code which the PHP interpreter runs is a bad thing to do. In such cases, the script shouldn't be treated as UTF-16 (or whatever) at all. I wonder if mbstring's encoding detection is being used in 'non-strict' mode?
--- diff --git a/Zend/tests/multibyte/multibyte_encoding_003.phpt b/Zend/tests/multibyte/multibyte_encoding_003.phpt index a0983329f4..f0fb60f6cd 100644 Binary files a/Zend/tests/multibyte/multibyte_encoding_003.phpt and b/Zend/tests/multibyte/multibyte_encoding_003.phpt differ diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf16.c b/ext/mbstring/libmbfl/filters/mbfilter_utf16.c index a44ea371b3..58c63eb54c 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf16.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf16.c @@ -33,6 +33,7 @@ static int mbfl_filt_ident_utf16(int c, mbfl_identify_filter *filter); static int mbfl_filt_ident_utf16le(int c, mbfl_identify_filter *filter); static int mbfl_filt_ident_utf16be(int c, mbfl_identify_filter *filter); +static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter); static const char *mbfl_encoding_utf16_aliases[] = {"utf16", NULL}; @@ -93,7 +94,7 @@ const struct mbfl_convert_vtbl vtbl_utf16_wchar = { mbfl_filt_conv_common_ctor, NULL, mbfl_filt_conv_utf16_wchar, - mbfl_filt_conv_common_flush, + mbfl_filt_conv_utf16_wchar_flush, NULL, }; @@ -113,7 +114,7 @@ const struct mbfl_convert_vtbl vtbl_utf16be_wchar = { mbfl_filt_conv_common_ctor, NULL, mbfl_filt_conv_utf16be_wchar, - mbfl_filt_conv_common_flush, + mbfl_filt_conv_utf16_wchar_flush, NULL, }; @@ -133,7 +134,7 @@ const struct mbfl_convert_vtbl vtbl_utf16le_wchar = { mbfl_filt_conv_common_ctor, NULL, mbfl_filt_conv_utf16le_wchar, - mbfl_filt_conv_common_flush, + mbfl_filt_conv_utf16_wchar_flush, NULL, }; @@ -343,6 +344,24 @@ int mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter) return c; } +static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter) +{ + int status = filter->status; + int cache = filter->cache; + filter->status = filter->cache = 0; + + if (status & 0xF) { + /* Input string was truncated */ + CK((*filter->output_function)(cache | MBFL_WCSGROUP_THROUGH, filter->data)); + } + + if (filter->flush_function) { + 
(*filter->flush_function)(filter->data); + } + + return 0; +} + static int mbfl_filt_ident_utf16(int c, mbfl_identify_filter *filter) { if (filter->status == 0) {