From e53162a32b011ef22c3e0210e7af334d968f227c Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 3 Aug 2017 22:32:31 +0200 Subject: [PATCH] Return false on invalid codepoint in mb_chr() Instead of returning the encoding of the current substitution character. This allows a robust check for the failure case. The substitution character (especially the default of "?") is also a valid output of mb_chr() for a valid input (for "?" that would be 0x3f), so it's a bad choice for an error value. --- ext/mbstring/mbstring.c | 53 ++++++++++++++++------------------ ext/mbstring/tests/mb_chr.phpt | 20 ++----------- 2 files changed, 27 insertions(+), 46 deletions(-) diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index aca182acf6..5618d7a75b 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -106,8 +106,6 @@ static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding); static inline zend_bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc); -static inline zend_bool php_mb_is_no_encoding_unicode(enum mbfl_no_encoding no_enc); - static inline zend_bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc); /* }}} */ @@ -3172,13 +3170,6 @@ static inline zend_bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding } -/* See mbfl_no_encoding definition for list of unicode encodings */ -static inline zend_bool php_mb_is_no_encoding_unicode(enum mbfl_no_encoding no_enc) -{ - return (no_enc >= mbfl_no_encoding_ucs4 && no_enc <= mbfl_no_encoding_utf8_sb); -} - - /* See mbfl_no_encoding definition for list of UTF-8 encodings */ static inline zend_bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc) { @@ -5143,10 +5134,18 @@ static inline char* php_mb_chr(zend_long cp, const char* enc, size_t *output_len } } - if (php_mb_is_no_encoding_utf8(no_enc)) { + if (php_mb_is_unsupported_no_encoding(no_enc)) { + php_error_docref(NULL, E_WARNING, "Unsupported encoding \"%s\"", enc); + return NULL; + } + + if (cp < 0 || cp > 0x10ffff) { + return NULL; + } - if (0 > cp || cp > 0x10ffff || (cp > 0xd7ff && 0xe000 > cp)) { - cp = MBSTRG(current_filter_illegal_substchar); + if (php_mb_is_no_encoding_utf8(no_enc)) { + if (cp > 0xd7ff && 0xe000 > cp) { + return NULL; } if (cp < 0x80) { @@ -5182,20 +5181,6 @@ static inline char* php_mb_chr(zend_long cp, const char* enc, size_t *output_len } return ret; - - } else if (php_mb_is_unsupported_no_encoding(no_enc)) { - php_error_docref(NULL, E_WARNING, "Unsupported encoding \"%s\"", enc); - return NULL; - } - - if (0 > cp || 0x10ffff < cp) { - - if (php_mb_is_no_encoding_unicode(MBSTRG(current_internal_encoding)->no_encoding)) { - cp = MBSTRG(current_filter_illegal_substchar); - } else { - cp = 0x3f; - } - } buf_len = 4; @@ -5206,9 +5191,21 @@ static inline char* php_mb_chr(zend_long cp, const char* enc, size_t *output_len buf[3] = cp & 0xff; buf[4] = 0; - ret = php_mb_convert_encoding(buf, buf_len, enc, "UCS-4BE", &ret_len); - efree(buf); + { + long orig_illegalchars = MBSTRG(illegalchars); + MBSTRG(illegalchars) = 0; + ret = php_mb_convert_encoding(buf, buf_len, enc, "UCS-4BE", &ret_len); + if (MBSTRG(illegalchars) != 0) { + efree(buf); + efree(ret); + MBSTRG(illegalchars) = orig_illegalchars; + return NULL; + } + + MBSTRG(illegalchars) = orig_illegalchars; + } + efree(buf); if (output_len) { *output_len = ret_len; } diff --git a/ext/mbstring/tests/mb_chr.phpt b/ext/mbstring/tests/mb_chr.phpt index b99aa12b99..d61178af4d 100644 --- a/ext/mbstring/tests/mb_chr.phpt +++ b/ext/mbstring/tests/mb_chr.phpt @@ -7,22 +7,8 @@ mb_chr() var_dump( "\u{20bb7}" === mb_chr(0x20bb7), "\x8f\xa1\xef" === mb_chr(0x50aa, "EUC-JP-2004"), - "?" === mb_chr(0xd800) -); - -mb_internal_encoding("UCS-4BE"); -mb_substitute_character(0xfffd); -var_dump( - "\u{fffd}" === mb_chr(0xd800, "UTF-8") -); -var_dump( - "\u{fffd}" === mb_chr(0xd800, "UTF-8") -); - -mb_internal_encoding("EUC-JP"); -mb_substitute_character(0xa4a2); -var_dump( - "\u{a4a2}" === mb_chr(0xd800, "UTF-8") + false === mb_chr(0xd800), + false === mb_chr(0x1f600, "EUC-JP-2004") ); // Invalid @@ -39,8 +25,6 @@ bool(true) bool(true) bool(true) bool(true) -bool(true) -bool(true) Warning: mb_chr(): Unknown encoding "typo" in %s on line %d -- 2.50.1