From a8a9e93e9a902ffd4099e3ba2a7a269da09120c5 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 3 Aug 2017 21:05:27 +0200 Subject: [PATCH] Revert/fix mb_substitute_character() codepoint checks The introduced checks did not treat "non-Unicode" encodings correctly, because they treated the passed integer as encoded in the internal encoding in that case, while in actuality the substitute character is always a Unicode codepoint. Additionally checking the codepoint against the internal encoding is not correct in any case, because the substitution character must be mapped in the *target* encoding of the conversion, which does not necessarily coincide with the internal encoding (the internal encoding is the default *source* encoding, not *target* encoding). This reverts the checks back to simple range checks, but in a way that still resolves #69079: Characters outside the Basic Multilingual Plane are now accepted and Surrogate Codepoints are rejected. A distinction between UTF-8 and non-UTF-8 encodings is not made for surrogate checks (as in the original patch), as surrogates are always illegal on their own. Specifying a surrogate as substitution character would only make sense if you could specify a substitution string with more than one character -- however we do not support that. --- ext/mbstring/mbstring.c | 72 ++++--------------- ext/mbstring/tests/bug69079.phpt | 29 ++++++-- ext/mbstring/tests/bug69086.phpt | 3 - ext/mbstring/tests/mb_chr.phpt | 13 ++-- .../tests/mb_substitute_character_basic.phpt | 2 +- .../mb_substitute_character_variation1.phpt | 34 ++++----- 6 files changed, 61 insertions(+), 92 deletions(-) diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index 932d6dda28..732b5f65ba 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -1978,69 +1978,21 @@ PHP_FUNCTION(mb_detect_order) static inline int php_mb_check_code_point(long cp) { - enum mbfl_no_encoding no_enc; - char* buf; - char buf_len; - - no_enc = MBSTRG(current_internal_encoding)->no_encoding; - - if (php_mb_is_no_encoding_utf8(no_enc)) { - - if ((cp > 0 && 0xd800 > cp) || (cp > 0xdfff && 0x110000 > cp)) { - return 1; - } - + if (cp <= 0 || cp >= 0x110000) { + /* Out of Unicode range */ return 0; - } else if (php_mb_is_no_encoding_unicode(no_enc)) { - - if (0 > cp || cp > 0x10ffff) { - return 0; - } - - return 1; - - // backward compatibility - } else if (php_mb_is_unsupported_no_encoding(no_enc)) { - return cp < 0xffff && cp > 0x0; } - if (cp < 0x100) { - buf_len = 1; - buf = (char *) safe_emalloc(buf_len, 1, 1); - buf[0] = cp; - buf[1] = 0; - } else if (cp < 0x10000) { - buf_len = 2; - buf = (char *) safe_emalloc(buf_len, 1, 1); - buf[0] = cp >> 8; - buf[1] = cp & 0xff; - buf[2] = 0; - } else if (cp < 0x1000000) { - buf_len = 3; - buf = (char *) safe_emalloc(buf_len, 1, 1); - buf[0] = cp >> 16; - buf[1] = (cp >> 8) & 0xff; - buf[2] = cp & 0xff; - buf[3] = 0; - } else { - buf_len = 4; - buf = (char *) safe_emalloc(buf_len, 1, 1); - buf[0] = cp >> 24; - buf[1] = (cp >> 16) & 0xff; - buf[2] = (cp >> 8) & 0xff; - buf[3] = cp & 0xff; - buf[4] = 0; - } - - if (php_mb_check_encoding(buf, buf_len, NULL)) { - efree(buf); - - return 1; + if (cp >= 0xd800 && cp <= 0xdfff) { + /* Surrogate code-point. These are never valid on their own and we only allow a single + * substitute character. */ + return 0; } - efree(buf); - - return 0; + /* As the we do not know the target encoding of the conversion operation that is going to + * use the substitution character, we cannot check whether the codepoint is actually mapped + * in the given encoding at this point. Thus we have to accept everything. */ + return 1; } /* {{{ proto mixed mb_substitute_character([mixed substchar]) @@ -2081,7 +2033,7 @@ PHP_FUNCTION(mb_substitute_character) MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR; MBSTRG(current_filter_illegal_substchar) = Z_LVAL_P(arg1); } else { - php_error_docref(NULL, E_WARNING, "Unknown character."); + php_error_docref(NULL, E_WARNING, "Unknown character"); RETURN_FALSE; } } @@ -2092,7 +2044,7 @@ PHP_FUNCTION(mb_substitute_character) MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR; MBSTRG(current_filter_illegal_substchar) = Z_LVAL_P(arg1); } else { - php_error_docref(NULL, E_WARNING, "Unknown character."); + php_error_docref(NULL, E_WARNING, "Unknown character"); RETURN_FALSE; } break; diff --git a/ext/mbstring/tests/bug69079.phpt b/ext/mbstring/tests/bug69079.phpt index 67c4d0cc2d..65b1fd45a5 100644 --- a/ext/mbstring/tests/bug69079.phpt +++ b/ext/mbstring/tests/bug69079.phpt @@ -4,11 +4,32 @@ Bug #69079 (enhancement for mb_substitute_character) --FILE-- ---EXPECT-- +--EXPECTF-- bool(true) -bool(true) \ No newline at end of file +string(8) "f09f9880" + +Warning: mb_substitute_character(): Unknown character in %s on line %d +bool(false) +string(2) "3f" + +Warning: mb_substitute_character(): Unknown character in %s on line %d +string(2) "63" +string(6) "8fa1ef" diff --git a/ext/mbstring/tests/bug69086.phpt b/ext/mbstring/tests/bug69086.phpt index 0540bcba78..921d61cca4 100644 --- a/ext/mbstring/tests/bug69086.phpt +++ b/ext/mbstring/tests/bug69086.phpt @@ -8,10 +8,7 @@ mb_substitute_character(0xfffd); var_dump("?" === mb_convert_encoding("\x80", "Shift_JIS", "EUC-JP")); mb_internal_encoding("UCS-4BE"); var_dump("\x00\x00\xff\xfd" === mb_convert_encoding("\x80", "UCS-4BE", "UTF-8")); -mb_substitute_character(0xd800); -var_dump("\x00\x00\x00\x3f" === mb_convert_encoding("\x80", "UCS-4BE", "UTF-8")); ?> --EXPECT-- bool(true) bool(true) -bool(true) \ No newline at end of file diff --git a/ext/mbstring/tests/mb_chr.phpt b/ext/mbstring/tests/mb_chr.phpt index 095ce90ae5..19e1a704ec 100644 --- a/ext/mbstring/tests/mb_chr.phpt +++ b/ext/mbstring/tests/mb_chr.phpt @@ -15,9 +15,8 @@ mb_substitute_character(0xfffd); var_dump( "\u{fffd}" === mb_chr(0xd800, "UTF-8") ); -mb_substitute_character(0xd800); var_dump( - "?" === mb_chr(0xd800, "UTF-8") + "\u{fffd}" === mb_chr(0xd800, "UTF-8") ); mb_internal_encoding("EUC-JP"); @@ -43,15 +42,15 @@ bool(true) bool(true) bool(true) -Warning: mb_chr(): Unknown encoding "typo" in %s on line 26 +Warning: mb_chr(): Unknown encoding "typo" in %s on line %d -Warning: mb_chr(): Unsupported encoding "pass" in %s on line 27 +Warning: mb_chr(): Unsupported encoding "pass" in %s on line %d -Warning: mb_chr(): Unsupported encoding "jis" in %s on line 28 +Warning: mb_chr(): Unsupported encoding "jis" in %s on line %d -Warning: mb_chr(): Unsupported encoding "cp50222" in %s on line 29 +Warning: mb_chr(): Unsupported encoding "cp50222" in %s on line %d -Warning: mb_chr(): Unsupported encoding "utf-7" in %s on line 30 +Warning: mb_chr(): Unsupported encoding "utf-7" in %s on line %d bool(false) bool(false) bool(false) diff --git a/ext/mbstring/tests/mb_substitute_character_basic.phpt b/ext/mbstring/tests/mb_substitute_character_basic.phpt index 30cda8a926..036203f416 100644 --- a/ext/mbstring/tests/mb_substitute_character_basic.phpt +++ b/ext/mbstring/tests/mb_substitute_character_basic.phpt @@ -38,6 +38,6 @@ int(1234) bool(true) string(4) "none" -Warning: mb_substitute_character(): Unknown character. in %s on line %d +Warning: mb_substitute_character(): Unknown character in %s on line %d bool(false) ===DONE=== diff --git a/ext/mbstring/tests/mb_substitute_character_variation1.phpt b/ext/mbstring/tests/mb_substitute_character_variation1.phpt index 3458bc22e8..fa786a1e5c 100644 --- a/ext/mbstring/tests/mb_substitute_character_variation1.phpt +++ b/ext/mbstring/tests/mb_substitute_character_variation1.phpt @@ -123,7 +123,7 @@ fclose($fp); *** Testing mb_substitute_character() : usage variation *** --int 0-- -Error: 2 - mb_substitute_character(): Unknown character., %s(%d) +Error: 2 - mb_substitute_character(): Unknown character, %s(%d) bool(false) --int 1-- @@ -133,30 +133,30 @@ bool(true) bool(true) --int -12345-- -Error: 2 - mb_substitute_character(): Unknown character., %s(%d) +Error: 2 - mb_substitute_character(): Unknown character, %s(%d) bool(false) --float 10.5-- bool(true) --float -10.5-- -Error: 2 - mb_substitute_character(): Unknown character., %s(%d) +Error: 2 - mb_substitute_character(): Unknown character, %s(%d) bool(false) --float 12.3456789000e10-- -Error: 2 - mb_substitute_character(): Unknown character., %s(%d) +Error: 2 - mb_substitute_character(): Unknown character, %s(%d) bool(false) --float -12.3456789000e10-- -Error: 2 - mb_substitute_character(): Unknown character., %s(%d) +Error: 2 - mb_substitute_character(): Unknown character, %s(%d) bool(false) --float .5-- -Error: 2 - mb_substitute_character(): Unknown character., %s(%d) +Error: 2 - mb_substitute_character(): Unknown character, %s(%d) bool(false) --empty array-- -Error: 2 - mb_substitute_character(): Unknown character., %s(%d) +Error: 2 - mb_substitute_character(): Unknown character, %s(%d) bool(false) --int indexed array-- @@ -169,25 +169,25 @@ bool(true) bool(true) --uppercase NULL-- -Error: 2 - mb_substitute_character(): Unknown character., %s(%d) +Error: 2 - mb_substitute_character(): Unknown character, %s(%d) bool(false) --lowercase null-- -Error: 2 - mb_substitute_character(): Unknown character., %s(%d) +Error: 2 - mb_substitute_character(): Unknown character, %s(%d) bool(false) --lowercase true-- bool(true) --lowercase false-- -Error: 2 - mb_substitute_character(): Unknown character., %s(%d) +Error: 2 - mb_substitute_character(): Unknown character, %s(%d) bool(false) --uppercase TRUE-- bool(true) --uppercase FALSE-- -Error: 2 - mb_substitute_character(): Unknown character., %s(%d) +Error: 2 - mb_substitute_character(): Unknown character, %s(%d) bool(false) --empty string DQ-- @@ -197,19 +197,19 @@ bool(true) bool(true) --string DQ-- -Error: 2 - mb_substitute_character(): Unknown character., %s(%d) +Error: 2 - mb_substitute_character(): Unknown character, %s(%d) bool(false) --string SQ-- -Error: 2 - mb_substitute_character(): Unknown character., %s(%d) +Error: 2 - mb_substitute_character(): Unknown character, %s(%d) bool(false) --mixed case string-- -Error: 2 - mb_substitute_character(): Unknown character., %s(%d) +Error: 2 - mb_substitute_character(): Unknown character, %s(%d) bool(false) --heredoc-- -Error: 2 - mb_substitute_character(): Unknown character., %s(%d) +Error: 2 - mb_substitute_character(): Unknown character, %s(%d) bool(false) --instance of classWithToString-- @@ -221,11 +221,11 @@ Error: 8 - Object of class classWithoutToString could not be converted to int, % bool(true) --undefined var-- -Error: 2 - mb_substitute_character(): Unknown character., %s(%d) +Error: 2 - mb_substitute_character(): Unknown character, %s(%d) bool(false) --unset var-- -Error: 2 - mb_substitute_character(): Unknown character., %s(%d) +Error: 2 - mb_substitute_character(): Unknown character, %s(%d) bool(false) ===DONE=== -- 2.40.0