granicus.if.org Git - php/commitdiff
Revert/fix substitution character fallback
author Nikita Popov <nikita.ppv@gmail.com>
Thu, 3 Aug 2017 19:53:21 +0000 (21:53 +0200)
committer Nikita Popov <nikita.ppv@gmail.com>
Thu, 3 Aug 2017 19:53:59 +0000 (21:53 +0200)
The introduced checks were not correct in two respects:
 * It was checked whether the source encoding of the string matches
   the internal encoding, while the actually relevant encoding is
   the *target* encoding.
 * Even if the correct encoding is used, the checks are still too
   conservative. Just because something is not a "Unicode-encoding"
   does not mean that it does not map any non-ASCII characters.

I've reverted the added checks and instead adjusted mbfl_convert
to first try to use the provided substitution character and if
that fails, perform the fallback to '?' at that point. This means
that any codepoint mapped in the target encoding should now be
correctly supported and anything else should fall back to '?'.

ext/mbstring/libmbfl/mbfl/mbfl_convert.c
ext/mbstring/mbstring.c
ext/mbstring/tests/bug69086.phpt
ext/mbstring/tests/mb_chr.phpt
ext/mbstring/tests/mb_substitute_character_variation2.phpt

index b553ad5d13d45338572a4755744ae01e705cb5ac..a73a0c80e502aab26079bf61eb33ca08378dfdbc 100644 (file)
@@ -467,14 +467,26 @@ int mbfl_convert_filter_strcat(mbfl_convert_filter *filter, const unsigned char
 int
 mbfl_filt_conv_illegal_output(int c, mbfl_convert_filter *filter)
 {
-       int mode_backup, ret, n, m, r;
+       int mode_backup, substchar_backup, ret, n, m, r;
 
        ret = 0;
+
        mode_backup = filter->illegal_mode;
-       filter->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
+       substchar_backup = filter->illegal_substchar;
+
+       /* The used substitution character may not be supported by the target character encoding.
+        * If that happens, first try to use "?" instead and if that also fails, silently drop the
+        * character. */
+       if (filter->illegal_mode == MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR
+                       && filter->illegal_substchar != 0x3f) {
+               filter->illegal_substchar = 0x3f;
+       } else {
+               filter->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
+       }
+
        switch (mode_backup) {
        case MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR:
-               ret = (*filter->filter_function)(filter->illegal_substchar, filter);
+               ret = (*filter->filter_function)(substchar_backup, filter);
                break;
        case MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG:
                if (c >= 0) {
@@ -560,14 +572,16 @@ mbfl_filt_conv_illegal_output(int c, mbfl_convert_filter *filter)
                                }
                                ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)";");
                        } else {
-                               ret = (*filter->filter_function)(filter->illegal_substchar, filter);
+                               ret = (*filter->filter_function)(substchar_backup, filter);
                        }
                }
                break;
        default:
                break;
        }
+
        filter->illegal_mode = mode_backup;
+       filter->illegal_substchar = substchar_backup;
        filter->num_illegalchar++;
 
        return ret;
index 732b5f65ba3677a5c39e0be676df90149c39b6f8..9b9458afc4f3171c0ba544a865a78e26b9252bc0 100644 (file)
@@ -937,6 +937,7 @@ static size_t php_mb_zend_encoding_converter(unsigned char **to, size_t *to_leng
        if (convd == NULL) {
                return -1;
        }
+
        mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode));
        mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar));
 
@@ -3254,29 +3255,9 @@ MBSTRING_API char *php_mb_convert_encoding(const char *input, size_t length, con
                php_error_docref(NULL, E_WARNING, "Unable to create character encoding converter");
                return NULL;
        }
-       mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode));
-
-       if (string.no_encoding == MBSTRG(current_internal_encoding)->no_encoding) {
-               mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar));
-       } else if (php_mb_is_no_encoding_unicode(string.no_encoding) && php_mb_is_no_encoding_unicode(MBSTRG(current_internal_encoding)->no_encoding)) {
-
-               if (php_mb_is_no_encoding_utf8(string.no_encoding)) {
-
-                       if (MBSTRG(current_filter_illegal_substchar) > 0xd7ff &&
-                               0xe000 > MBSTRG(current_filter_illegal_substchar)
-                       ) {
-                               mbfl_buffer_converter_illegal_substchar(convd, 0x3f);
-                       } else {
-                               mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar));
-                       }
-
-               } else {
-                       mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar));
-               }
 
-       } else {
-               mbfl_buffer_converter_illegal_substchar(convd, 0x3f);
-       }
+       mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode));
+       mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar));
 
        /* do it */
        ret = mbfl_buffer_converter_feed_result(convd, &string, &result);
@@ -5199,17 +5180,7 @@ static inline char* php_mb_chr(zend_long cp, const char* enc, size_t *output_len
        if (php_mb_is_no_encoding_utf8(no_enc)) {
 
                if (0 > cp || cp > 0x10ffff || (cp > 0xd7ff && 0xe000 > cp)) {
-                       if (php_mb_is_no_encoding_utf8(MBSTRG(current_internal_encoding)->no_encoding)) {
-                               cp = MBSTRG(current_filter_illegal_substchar);
-                       } else if (php_mb_is_no_encoding_unicode(MBSTRG(current_internal_encoding)->no_encoding)) {
-                               if (0xd800 > MBSTRG(current_filter_illegal_substchar) || MBSTRG(current_filter_illegal_substchar) > 0xdfff) {
-                                       cp = MBSTRG(current_filter_illegal_substchar);
-                               } else {
-                                       cp = 0x3f;
-                               }
-                       } else {
-                               cp = 0x3f;
-                       }
+                       cp = MBSTRG(current_filter_illegal_substchar);
                }
 
                if (cp < 0x80) {
index 921d61cca41da32ef28e2952ebbaa922eca5c914..9566e10968e1b3dfd84cd5d44346e2cedab36c57 100644 (file)
@@ -8,7 +8,13 @@ mb_substitute_character(0xfffd);
 var_dump("?" === mb_convert_encoding("\x80", "Shift_JIS", "EUC-JP"));
 mb_internal_encoding("UCS-4BE");
 var_dump("\x00\x00\xff\xfd" === mb_convert_encoding("\x80", "UCS-4BE", "UTF-8"));
+
+mb_internal_encoding("UTF-8");
+mb_substitute_character(0xfffd);
+var_dump("\u{fffd}" === mb_convert_encoding("\x80", "UTF-8", "EUC-JP-2004"));
+
 ?>
 --EXPECT--
 bool(true)
 bool(true)
+bool(true)
index 19e1a704ec0265a25e6e75a05053cbe1bb48f7b1..8ec35920c3696805a1fac5062a581ba416890699 100644 (file)
@@ -22,7 +22,7 @@ var_dump(
 mb_internal_encoding("EUC-JP");
 mb_substitute_character(0xa4a2);
 var_dump(
-    "?" === mb_chr(0xd800, "UTF-8")
+    "\u{a4a2}" === mb_chr(0xd800, "UTF-8")
 );
 
 // Invalid
index 202561afc75cfd7dc3a5b438bad5b71f1fb7b863..6248174aa613de215b1046f2e72b5d65fa6ad8f1 100644 (file)
@@ -35,5 +35,5 @@ var_dump(bin2hex(mb_convert_encoding($string_mb, "ISO-8859-1", "UTF-8")));
 string(14) "3f3f3f3f3f3f3f"
 string(14) "42424242424242"
 string(0) ""
-string(0) ""
+string(14) "3f3f3f3f3f3f3f"
 ===DONE===