From: George Peter Banyard Date: Sun, 3 May 2020 02:44:16 +0000 (+0200) Subject: Refactor mb_substitute_character() X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=7dd332f11059ca72bdb4f3bfdeebf4ce3d174239;p=php Refactor mb_substitute_character() Using the new Fast ZPP API for string|int|null This also fixes Bug #79448 which was too disruptive to fix in PHP 7.x --- diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index 84e28f4a4b..a70e16071b 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -1527,7 +1527,7 @@ PHP_FUNCTION(mb_detect_order) static inline int php_mb_check_code_point(zend_long cp) { - if (cp <= 0 || cp >= 0x110000) { + if (cp < 0 || cp >= 0x110000) { /* Out of Unicode range */ return 0; } @@ -1544,61 +1544,58 @@ static inline int php_mb_check_code_point(zend_long cp) return 1; } -/* {{{ proto mixed mb_substitute_character([mixed substchar]) +/* {{{ proto string|int|true mb_substitute_character([string|int|null substitute_character]) Sets the current substitute_character or returns the current substitute_character */ PHP_FUNCTION(mb_substitute_character) { - zval *arg1 = NULL; + zend_string *substitute_character = NULL; + zend_long substitute_codepoint; + zend_bool substitute_is_null = 1; - if (zend_parse_parameters(ZEND_NUM_ARGS(), "|z", &arg1) == FAILURE) { - RETURN_THROWS(); - } + ZEND_PARSE_PARAMETERS_START(0, 1) + Z_PARAM_OPTIONAL + Z_PARAM_STR_OR_LONG_OR_NULL(substitute_character, substitute_codepoint, substitute_is_null) + ZEND_PARSE_PARAMETERS_END(); - if (!arg1) { + if (substitute_is_null) { if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) { RETURN_STRING("none"); - } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) { + } + if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) { RETURN_STRING("long"); - } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) { + } + if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) { RETURN_STRING("entity"); - } else { - RETURN_LONG(MBSTRG(current_filter_illegal_substchar)); } - } else { - RETVAL_TRUE; - - switch (Z_TYPE_P(arg1)) { - case IS_STRING: - if (strncasecmp("none", Z_STRVAL_P(arg1), Z_STRLEN_P(arg1)) == 0) { - MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE; - } else if (strncasecmp("long", Z_STRVAL_P(arg1), Z_STRLEN_P(arg1)) == 0) { - MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG; - } else if (strncasecmp("entity", Z_STRVAL_P(arg1), Z_STRLEN_P(arg1)) == 0) { - MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY; - } else { - convert_to_long_ex(arg1); + RETURN_LONG(MBSTRG(current_filter_illegal_substchar)); + } - if (php_mb_check_code_point(Z_LVAL_P(arg1))) { - MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR; - MBSTRG(current_filter_illegal_substchar) = Z_LVAL_P(arg1); - } else { - php_error_docref(NULL, E_WARNING, "Unknown character"); - RETURN_FALSE; - } - } - break; - default: - convert_to_long_ex(arg1); - if (php_mb_check_code_point(Z_LVAL_P(arg1))) { - MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR; - MBSTRG(current_filter_illegal_substchar) = Z_LVAL_P(arg1); - } else { - php_error_docref(NULL, E_WARNING, "Unknown character"); - RETURN_FALSE; - } - break; + if (substitute_character != NULL) { + if (zend_string_equals_literal_ci(substitute_character, "none")) { + MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE; + RETURN_TRUE; + } + if (zend_string_equals_literal_ci(substitute_character, "long")) { + MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG; + RETURN_TRUE; + } + if (zend_string_equals_literal_ci(substitute_character, "entity")) { + MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY; + RETURN_TRUE; } + /* Invalid string value */ + zend_argument_value_error(1, "must be 'none', 'long', 'entity' or a valid codepoint"); + RETURN_THROWS(); + } + /* Integer codepoint passed */ + if (!php_mb_check_code_point(substitute_codepoint)) { + zend_argument_value_error(1, "is not a valid codepoint"); + RETURN_THROWS(); } + + MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR; + MBSTRG(current_filter_illegal_substchar) = substitute_codepoint; + RETURN_TRUE; } /* }}} */ diff --git a/ext/mbstring/mbstring.stub.php b/ext/mbstring/mbstring.stub.php index c2ae9e68b4..3a1fc929ac 100644 --- a/ext/mbstring/mbstring.stub.php +++ b/ext/mbstring/mbstring.stub.php @@ -12,8 +12,7 @@ function mb_http_output(string $encoding = UNKNOWN): string|bool {} function mb_detect_order(array|string $encoding = UNKNOWN): array|bool {} -/** @param string|int $substchar */ -function mb_substitute_character($substchar = UNKNOWN): string|int|bool {} +function mb_substitute_character(string|int|null $substitute_character = null): string|int|bool {} function mb_preferred_mime_name(string $encoding): string|false {} diff --git a/ext/mbstring/mbstring_arginfo.h b/ext/mbstring/mbstring_arginfo.h index 564e167876..57559f8ccd 100644 --- a/ext/mbstring/mbstring_arginfo.h +++ b/ext/mbstring/mbstring_arginfo.h @@ -19,7 +19,7 @@ ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_mb_detect_order, 0, 0, MAY_BE_AR ZEND_END_ARG_INFO() ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_mb_substitute_character, 0, 0, MAY_BE_STRING|MAY_BE_LONG|MAY_BE_BOOL) - ZEND_ARG_INFO(0, substchar) + ZEND_ARG_TYPE_MASK(0, substitute_character, MAY_BE_STRING|MAY_BE_LONG|MAY_BE_NULL, "null") ZEND_END_ARG_INFO() ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_mb_preferred_mime_name, 0, 1, MAY_BE_STRING|MAY_BE_FALSE) diff --git a/ext/mbstring/tests/bug69079.phpt b/ext/mbstring/tests/bug69079.phpt index 287b577ce4..c8bf4ce2a7 100644 --- a/ext/mbstring/tests/bug69079.phpt +++ b/ext/mbstring/tests/bug69079.phpt @@ -9,27 +9,32 @@ mb_internal_encoding('UTF-8'); var_dump(mb_substitute_character(0x1F600)); var_dump(bin2hex(mb_scrub("\xff"))); mb_substitute_character(0x3f); // Reset to '?', as the next call will fail -var_dump(mb_substitute_character(0xD800)); // Surrogate (illegal) +try { + var_dump(mb_substitute_character(0xD800)); // Surrogate (illegal) +} catch (\ValueError $e) { + echo $e->getMessage() . \PHP_EOL; +} var_dump(bin2hex(mb_scrub("\xff"))); mb_internal_encoding('EUC-JP-2004'); mb_substitute_character(0x63); // Reset to '?', as the next call will fail -mb_substitute_character(0x8fa1ef); // EUC-JP-2004 encoding of U+50AA (illegal) +try { + mb_substitute_character(0x8fa1ef); // EUC-JP-2004 encoding of U+50AA (illegal) +} catch (\ValueError $e) { + echo $e->getMessage() . \PHP_EOL; +} var_dump(bin2hex(mb_scrub("\x8d"))); mb_substitute_character(0x50aa); var_dump(bin2hex(mb_scrub("\x8d"))); ?> ---EXPECTF-- +--EXPECT-- bool(true) string(8) "f09f9880" - -Warning: mb_substitute_character(): Unknown character in %s on line %d -bool(false) +mb_substitute_character(): Argument #1 ($substitute_character) is not a valid codepoint string(2) "3f" - -Warning: mb_substitute_character(): Unknown character in %s on line %d +mb_substitute_character(): Argument #1 ($substitute_character) is not a valid codepoint string(2) "63" string(6) "8fa1ef" diff --git a/ext/mbstring/tests/mb_substitute_character.phpt b/ext/mbstring/tests/mb_substitute_character.phpt index 9e50823ef4..9848624e8a 100644 --- a/ext/mbstring/tests/mb_substitute_character.phpt +++ b/ext/mbstring/tests/mb_substitute_character.phpt @@ -23,9 +23,13 @@ var_dump(mb_substitute_character('entity')); var_dump(mb_substitute_character()); var_dump(bin2hex(mb_convert_encoding("\xe2\x99\xa0\xe3\x81\x82", "CP932", "UTF-8"))); -var_dump(mb_substitute_character('BAD_NAME')); +try { + var_dump(mb_substitute_character('BAD_NAME')); +} catch (\ValueError $e) { + echo $e->getMessage() . \PHP_EOL; +} ?> ---EXPECTF-- +--EXPECT-- bool(true) int(12356) string(8) "82a282a0" @@ -38,6 +42,4 @@ string(4) "82a0" bool(true) string(6) "entity" string(20) "262378323636303b82a0" - -Warning: mb_substitute_character(): Unknown character in %s on line %d -bool(false) +mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint diff --git a/ext/mbstring/tests/mb_substitute_character_basic.phpt b/ext/mbstring/tests/mb_substitute_character_basic.phpt index 63ae10f76c..3d5571aac4 100644 --- a/ext/mbstring/tests/mb_substitute_character_basic.phpt +++ b/ext/mbstring/tests/mb_substitute_character_basic.phpt @@ -3,7 +3,6 @@ Test mb_substitute_character() function : basic functionality --SKIPIF-- --FILE-- getMessage() . \PHP_EOL; +} ?> ---EXPECTF-- +--EXPECT-- *** Testing mb_substitute_character() : basic functionality *** int(63) bool(true) @@ -36,6 +42,6 @@ bool(true) int(1234) bool(true) string(4) "none" - -Warning: mb_substitute_character(): Unknown character in %s on line %d -bool(false) +bool(true) +string(4) "long" +mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint diff --git a/ext/mbstring/tests/mb_substitute_character_variation2.phpt b/ext/mbstring/tests/mb_substitute_character_variation2.phpt index 84b1d569c2..4fe4f04f37 100644 --- a/ext/mbstring/tests/mb_substitute_character_variation2.phpt +++ b/ext/mbstring/tests/mb_substitute_character_variation2.phpt @@ -3,7 +3,6 @@ Test mb_substitute_character() function : variation unmappable out char for conv --SKIPIF-- --FILE-- +--FILE-- + 1, 'two' => 2); + +//array of values to iterate over +$inputs = array( + + // int data + 'int 0' => 0, + 'int 1' => 1, + 'int 12345' => 12345, + 'int -12345' => -2345, + + // float data + 'float 10.5' => 10.5, + 'float -10.5' => -10.5, + 'float 10.0e19' => 10.0e19, // Cannot be represented as int + 'float -10.0e19' => -10.0e19, // Cannot be represented as int + 'float .5' => .5, + + // array data + 'empty array' => array(), + 'int indexed array' => $index_array, + 'associative array' => $assoc_array, + 'nested arrays' => array('foo', $index_array, $assoc_array), + + // null data + 'uppercase NULL' => NULL, + 'lowercase null' => null, + + // boolean data + 'lowercase true' => true, + 'lowercase false' =>false, + 'uppercase TRUE' =>TRUE, + 'uppercase FALSE' =>FALSE, + + // empty data + 'empty string DQ' => "", + 'empty string SQ' => '', + + // string data + 'string DQ' => "string", + 'string SQ' => 'string', + 'mixed case string' => "sTrInG", + 'heredoc' => $heredoc, + + // object data + 'instance of classWithToString' => new classWithToString(), + 'instance of classWithoutToString' => new classWithoutToString(), + + // undefined data + 'undefined var' => @$undefined_var, + + // unset data + 'unset var' => @$unset_var, +); + +// loop through each element of the array for substchar + +mb_internal_encoding('utf-8'); +foreach($inputs as $key =>$value) { + echo "--$key--\n"; + try { + var_dump( mb_substitute_character($value) ); + } catch (\ValueError|\TypeError $e) { + echo get_class($e) . ': ' . $e->getMessage() . \PHP_EOL; + } +} + +fclose($fp); + +?> +--EXPECT-- +*** Testing mb_substitute_character(): various types in strict typing mode *** +--int 0-- +bool(true) +--int 1-- +bool(true) +--int 12345-- +bool(true) +--int -12345-- +ValueError: mb_substitute_character(): Argument #1 ($substitute_character) is not a valid codepoint +--float 10.5-- +TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, float given +--float -10.5-- +TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, float given +--float 10.0e19-- +TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, float given +--float -10.0e19-- +TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, float given +--float .5-- +TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, float given +--empty array-- +TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, array given +--int indexed array-- +TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, array given +--associative array-- +TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, array given +--nested arrays-- +TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, array given +--uppercase NULL-- +int(12345) +--lowercase null-- +int(12345) +--lowercase true-- +TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, bool given +--lowercase false-- +TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, bool given +--uppercase TRUE-- +TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, bool given +--uppercase FALSE-- +TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, bool given +--empty string DQ-- +ValueError: mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint +--empty string SQ-- +ValueError: mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint +--string DQ-- +ValueError: mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint +--string SQ-- +ValueError: mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint +--mixed case string-- +ValueError: mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint +--heredoc-- +ValueError: mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint +--instance of classWithToString-- +TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, object given +--instance of classWithoutToString-- +TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, object given +--undefined var-- +int(12345) +--unset var-- +int(12345) diff --git a/ext/mbstring/tests/mb_substitute_character_variation1.phpt b/ext/mbstring/tests/mb_substitute_character_variation_weak_types.phpt similarity index 51% rename from ext/mbstring/tests/mb_substitute_character_variation1.phpt rename to ext/mbstring/tests/mb_substitute_character_variation_weak_types.phpt index 53a21c8787..f9bf85a371 100644 --- a/ext/mbstring/tests/mb_substitute_character_variation1.phpt +++ b/ext/mbstring/tests/mb_substitute_character_variation_weak_types.phpt @@ -3,26 +3,16 @@ Test mb_substitute_character() function : usage variation --SKIPIF-- --FILE-- 10.5, 'float -10.5' => -10.5, - 'float 12.3456789000e10' => 12.3456789000e10, - 'float -12.3456789000e10' => -12.3456789000e10, + 'float 10.0e19' => 10.0e19, // Cannot be represented as int + 'float -10.0e19' => -10.0e19, // Cannot be represented as int 'float .5' => .5, // array data @@ -111,118 +101,74 @@ $inputs = array( mb_internal_encoding('utf-8'); foreach($inputs as $key =>$value) { - echo "\n--$key--\n"; - var_dump( mb_substitute_character($value) ); -}; + echo "--$key--\n"; + try { + var_dump( mb_substitute_character($value) ); + } catch (\ValueError|\TypeError $e) { + echo get_class($e) . ': ' . $e->getMessage() . \PHP_EOL; + } +} fclose($fp); ?> ---EXPECTF-- -*** Testing mb_substitute_character() : usage variation *** - +--EXPECT-- +*** Testing mb_substitute_character(): various types in weak typing mode *** --int 0-- -Error: 2 - mb_substitute_character(): Unknown character, %s(%d) -bool(false) - +bool(true) --int 1-- bool(true) - --int 12345-- bool(true) - --int -12345-- -Error: 2 - mb_substitute_character(): Unknown character, %s(%d) -bool(false) - +ValueError: mb_substitute_character(): Argument #1 ($substitute_character) is not a valid codepoint --float 10.5-- bool(true) - --float -10.5-- -Error: 2 - mb_substitute_character(): Unknown character, %s(%d) -bool(false) - ---float 12.3456789000e10-- -Error: 2 - mb_substitute_character(): Unknown character, %s(%d) -bool(false) - ---float -12.3456789000e10-- -Error: 2 - mb_substitute_character(): Unknown character, %s(%d) -bool(false) - +ValueError: mb_substitute_character(): Argument #1 ($substitute_character) is not a valid codepoint +--float 10.0e19-- +ValueError: mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint +--float -10.0e19-- +ValueError: mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint --float .5-- -Error: 2 - mb_substitute_character(): Unknown character, %s(%d) -bool(false) - +bool(true) --empty array-- -Error: 2 - mb_substitute_character(): Unknown character, %s(%d) -bool(false) - +TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, array given --int indexed array-- -bool(true) - +TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, array given --associative array-- -bool(true) - +TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, array given --nested arrays-- -bool(true) - +TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, array given --uppercase NULL-- -Error: 2 - mb_substitute_character(): Unknown character, %s(%d) -bool(false) - +int(0) --lowercase null-- -Error: 2 - mb_substitute_character(): Unknown character, %s(%d) -bool(false) - +int(0) --lowercase true-- bool(true) - --lowercase false-- -Error: 2 - mb_substitute_character(): Unknown character, %s(%d) -bool(false) - +bool(true) --uppercase TRUE-- bool(true) - --uppercase FALSE-- -Error: 2 - mb_substitute_character(): Unknown character, %s(%d) -bool(false) - ---empty string DQ-- bool(true) - +--empty string DQ-- +ValueError: mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint --empty string SQ-- -bool(true) - +ValueError: mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint --string DQ-- -Error: 2 - mb_substitute_character(): Unknown character, %s(%d) -bool(false) - +ValueError: mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint --string SQ-- -Error: 2 - mb_substitute_character(): Unknown character, %s(%d) -bool(false) - +ValueError: mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint --mixed case string-- -Error: 2 - mb_substitute_character(): Unknown character, %s(%d) -bool(false) - +ValueError: mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint --heredoc-- -Error: 2 - mb_substitute_character(): Unknown character, %s(%d) -bool(false) - +ValueError: mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint --instance of classWithToString-- -Error: 8 - Object of class classWithToString could not be converted to int, %s(%d) -bool(true) - +ValueError: mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint --instance of classWithoutToString-- -Error: 8 - Object of class classWithoutToString could not be converted to int, %s(%d) -bool(true) - +TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, object given --undefined var-- -Error: 2 - mb_substitute_character(): Unknown character, %s(%d) -bool(false) - +int(0) --unset var-- -Error: 2 - mb_substitute_character(): Unknown character, %s(%d) -bool(false) +int(0)