From: Nikita Popov Date: Thu, 3 Aug 2017 19:53:21 +0000 (+0200) Subject: Revert/fix substitution character fallback X-Git-Tag: php-7.2.0beta3~49 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=fb9bf5b64b6c09b9d93bbd1dadd64884e0af66f3;p=php Revert/fix substitution character fallback The introduced checks were not correct in two respects: * It was checked whether the source encoding of the string matches the internal encoding, while the actually relevant encoding is the *target* encoding. * Even if the correct encoding is used, the checks are still too conservative. Just because something is not a "Unicode-encoding" does not mean that it does not map any non-ASCII characters. I've reverted the added checks and instead adjusted mbfl_convert to first try to use the provided substitution character and if that fails, perform the fallback to '?' at that point. This means that any codepoint mapped in the target encoding should now be correctly supported and anything else should fall back to '?'. --- diff --git a/ext/mbstring/libmbfl/mbfl/mbfl_convert.c b/ext/mbstring/libmbfl/mbfl/mbfl_convert.c index b553ad5d13..a73a0c80e5 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfl_convert.c +++ b/ext/mbstring/libmbfl/mbfl/mbfl_convert.c @@ -467,14 +467,26 @@ int mbfl_convert_filter_strcat(mbfl_convert_filter *filter, const unsigned char int mbfl_filt_conv_illegal_output(int c, mbfl_convert_filter *filter) { - int mode_backup, ret, n, m, r; + int mode_backup, substchar_backup, ret, n, m, r; ret = 0; + mode_backup = filter->illegal_mode; - filter->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE; + substchar_backup = filter->illegal_substchar; + + /* The used substitution character may not be supported by the target character encoding. + * If that happens, first try to use "?" instead and if that also fails, silently drop the + * character. */ + if (filter->illegal_mode == MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR + && filter->illegal_substchar != 0x3f) { + filter->illegal_substchar = 0x3f; + } else { + filter->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE; + } + switch (mode_backup) { case MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR: - ret = (*filter->filter_function)(filter->illegal_substchar, filter); + ret = (*filter->filter_function)(substchar_backup, filter); break; case MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG: if (c >= 0) { @@ -560,14 +572,16 @@ mbfl_filt_conv_illegal_output(int c, mbfl_convert_filter *filter) } ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)";"); } else { - ret = (*filter->filter_function)(filter->illegal_substchar, filter); + ret = (*filter->filter_function)(substchar_backup, filter); } } break; default: break; } + filter->illegal_mode = mode_backup; + filter->illegal_substchar = substchar_backup; filter->num_illegalchar++; return ret; diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index 732b5f65ba..9b9458afc4 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -937,6 +937,7 @@ static size_t php_mb_zend_encoding_converter(unsigned char **to, size_t *to_leng if (convd == NULL) { return -1; } + mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode)); mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar)); @@ -3254,29 +3255,9 @@ MBSTRING_API char *php_mb_convert_encoding(const char *input, size_t length, con php_error_docref(NULL, E_WARNING, "Unable to create character encoding converter"); return NULL; } - mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode)); - - if (string.no_encoding == MBSTRG(current_internal_encoding)->no_encoding) { - mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar)); - } else if (php_mb_is_no_encoding_unicode(string.no_encoding) && php_mb_is_no_encoding_unicode(MBSTRG(current_internal_encoding)->no_encoding)) { - - if (php_mb_is_no_encoding_utf8(string.no_encoding)) { - - if (MBSTRG(current_filter_illegal_substchar) > 0xd7ff && - 0xe000 > MBSTRG(current_filter_illegal_substchar) - ) { - mbfl_buffer_converter_illegal_substchar(convd, 0x3f); - } else { - mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar)); - } - - } else { - mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar)); - } - } else { - mbfl_buffer_converter_illegal_substchar(convd, 0x3f); - } + mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode)); + mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar)); /* do it */ ret = mbfl_buffer_converter_feed_result(convd, &string, &result); @@ -5199,17 +5180,7 @@ static inline char* php_mb_chr(zend_long cp, const char* enc, size_t *output_len if (php_mb_is_no_encoding_utf8(no_enc)) { if (0 > cp || cp > 0x10ffff || (cp > 0xd7ff && 0xe000 > cp)) { - if (php_mb_is_no_encoding_utf8(MBSTRG(current_internal_encoding)->no_encoding)) { - cp = MBSTRG(current_filter_illegal_substchar); - } else if (php_mb_is_no_encoding_unicode(MBSTRG(current_internal_encoding)->no_encoding)) { - if (0xd800 > MBSTRG(current_filter_illegal_substchar) || MBSTRG(current_filter_illegal_substchar) > 0xdfff) { - cp = MBSTRG(current_filter_illegal_substchar); - } else { - cp = 0x3f; - } - } else { - cp = 0x3f; - } + cp = MBSTRG(current_filter_illegal_substchar); } if (cp < 0x80) { diff --git a/ext/mbstring/tests/bug69086.phpt b/ext/mbstring/tests/bug69086.phpt index 921d61cca4..9566e10968 100644 --- a/ext/mbstring/tests/bug69086.phpt +++ b/ext/mbstring/tests/bug69086.phpt @@ -8,7 +8,13 @@ mb_substitute_character(0xfffd); var_dump("?" === mb_convert_encoding("\x80", "Shift_JIS", "EUC-JP")); mb_internal_encoding("UCS-4BE"); var_dump("\x00\x00\xff\xfd" === mb_convert_encoding("\x80", "UCS-4BE", "UTF-8")); + +mb_internal_encoding("UTF-8"); +mb_substitute_character(0xfffd); +var_dump("\u{fffd}" === mb_convert_encoding("\x80", "UTF-8", "EUC-JP-2004")); + ?> --EXPECT-- bool(true) bool(true) +bool(true) diff --git a/ext/mbstring/tests/mb_chr.phpt b/ext/mbstring/tests/mb_chr.phpt index 19e1a704ec..8ec35920c3 100644 --- a/ext/mbstring/tests/mb_chr.phpt +++ b/ext/mbstring/tests/mb_chr.phpt @@ -22,7 +22,7 @@ var_dump( mb_internal_encoding("EUC-JP"); mb_substitute_character(0xa4a2); var_dump( - "?" === mb_chr(0xd800, "UTF-8") + "\u{a4a2}" === mb_chr(0xd800, "UTF-8") ); // Invalid diff --git a/ext/mbstring/tests/mb_substitute_character_variation2.phpt b/ext/mbstring/tests/mb_substitute_character_variation2.phpt index 202561afc7..6248174aa6 100644 --- a/ext/mbstring/tests/mb_substitute_character_variation2.phpt +++ b/ext/mbstring/tests/mb_substitute_character_variation2.phpt @@ -35,5 +35,5 @@ var_dump(bin2hex(mb_convert_encoding($string_mb, "ISO-8859-1", "UTF-8"))); string(14) "3f3f3f3f3f3f3f" string(14) "42424242424242" string(0) "" -string(0) "" +string(14) "3f3f3f3f3f3f3f" ===DONE===