static inline int php_mb_check_code_point(long cp)
{
- enum mbfl_no_encoding no_enc;
- char* buf;
- char buf_len;
-
- no_enc = MBSTRG(current_internal_encoding)->no_encoding;
-
- if (php_mb_is_no_encoding_utf8(no_enc)) {
-
- if ((cp > 0 && 0xd800 > cp) || (cp > 0xdfff && 0x110000 > cp)) {
- return 1;
- }
-
+ if (cp <= 0 || cp >= 0x110000) {
+ /* Out of Unicode range */
return 0;
- } else if (php_mb_is_no_encoding_unicode(no_enc)) {
-
- if (0 > cp || cp > 0x10ffff) {
- return 0;
- }
-
- return 1;
-
- // backward compatibility
- } else if (php_mb_is_unsupported_no_encoding(no_enc)) {
- return cp < 0xffff && cp > 0x0;
}
- if (cp < 0x100) {
- buf_len = 1;
- buf = (char *) safe_emalloc(buf_len, 1, 1);
- buf[0] = cp;
- buf[1] = 0;
- } else if (cp < 0x10000) {
- buf_len = 2;
- buf = (char *) safe_emalloc(buf_len, 1, 1);
- buf[0] = cp >> 8;
- buf[1] = cp & 0xff;
- buf[2] = 0;
- } else if (cp < 0x1000000) {
- buf_len = 3;
- buf = (char *) safe_emalloc(buf_len, 1, 1);
- buf[0] = cp >> 16;
- buf[1] = (cp >> 8) & 0xff;
- buf[2] = cp & 0xff;
- buf[3] = 0;
- } else {
- buf_len = 4;
- buf = (char *) safe_emalloc(buf_len, 1, 1);
- buf[0] = cp >> 24;
- buf[1] = (cp >> 16) & 0xff;
- buf[2] = (cp >> 8) & 0xff;
- buf[3] = cp & 0xff;
- buf[4] = 0;
- }
-
- if (php_mb_check_encoding(buf, buf_len, NULL)) {
- efree(buf);
-
- return 1;
+ if (cp >= 0xd800 && cp <= 0xdfff) {
+ /* Surrogate code-point. These are never valid on their own and we only allow a single
+ * substitute character. */
+ return 0;
}
- efree(buf);
-
- return 0;
+ /* As we do not know the target encoding of the conversion operation that is going to
+ * use the substitution character, we cannot check whether the codepoint is actually mapped
+ * in the given encoding at this point. Thus we have to accept everything. */
+ return 1;
}
/* {{{ proto mixed mb_substitute_character([mixed substchar])
MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
MBSTRG(current_filter_illegal_substchar) = Z_LVAL_P(arg1);
} else {
- php_error_docref(NULL, E_WARNING, "Unknown character.");
+ php_error_docref(NULL, E_WARNING, "Unknown character");
RETURN_FALSE;
}
}
MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
MBSTRG(current_filter_illegal_substchar) = Z_LVAL_P(arg1);
} else {
- php_error_docref(NULL, E_WARNING, "Unknown character.");
+ php_error_docref(NULL, E_WARNING, "Unknown character");
RETURN_FALSE;
}
break;
<?php extension_loaded('mbstring') or die('skip mbstring not available'); ?>
--FILE--
<?php
+
mb_internal_encoding('UTF-8');
-var_dump(mb_substitute_character(0x1f600));
+var_dump(mb_substitute_character(0x1F600));
+var_dump(bin2hex(mb_scrub("\xff")));
+mb_substitute_character(0x3f); // Reset to '?', as the next call will fail
+var_dump(mb_substitute_character(0xD800)); // Surrogate (illegal)
+var_dump(bin2hex(mb_scrub("\xff")));
+
mb_internal_encoding('EUC-JP-2004');
-var_dump(mb_substitute_character(0x8fa1ef));
+
+mb_substitute_character(0x63); // Reset to 'c', as the next call will fail
+mb_substitute_character(0x8fa1ef); // EUC-JP-2004 encoding of U+50AA (illegal)
+var_dump(bin2hex(mb_scrub("\x8d")));
+
+mb_substitute_character(0x50aa);
+var_dump(bin2hex(mb_scrub("\x8d")));
+
?>
---EXPECT--
+--EXPECTF--
bool(true)
-bool(true)
\ No newline at end of file
+string(8) "f09f9880"
+
+Warning: mb_substitute_character(): Unknown character in %s on line %d
+bool(false)
+string(2) "3f"
+
+Warning: mb_substitute_character(): Unknown character in %s on line %d
+string(2) "63"
+string(6) "8fa1ef"
var_dump("?" === mb_convert_encoding("\x80", "Shift_JIS", "EUC-JP"));
mb_internal_encoding("UCS-4BE");
var_dump("\x00\x00\xff\xfd" === mb_convert_encoding("\x80", "UCS-4BE", "UTF-8"));
-mb_substitute_character(0xd800);
-var_dump("\x00\x00\x00\x3f" === mb_convert_encoding("\x80", "UCS-4BE", "UTF-8"));
?>
--EXPECT--
bool(true)
bool(true)
-bool(true)
\ No newline at end of file
var_dump(
"\u{fffd}" === mb_chr(0xd800, "UTF-8")
);
-mb_substitute_character(0xd800);
var_dump(
- "?" === mb_chr(0xd800, "UTF-8")
+ "\u{fffd}" === mb_chr(0xd800, "UTF-8")
);
mb_internal_encoding("EUC-JP");
bool(true)
bool(true)
-Warning: mb_chr(): Unknown encoding "typo" in %s on line 26
+Warning: mb_chr(): Unknown encoding "typo" in %s on line %d
-Warning: mb_chr(): Unsupported encoding "pass" in %s on line 27
+Warning: mb_chr(): Unsupported encoding "pass" in %s on line %d
-Warning: mb_chr(): Unsupported encoding "jis" in %s on line 28
+Warning: mb_chr(): Unsupported encoding "jis" in %s on line %d
-Warning: mb_chr(): Unsupported encoding "cp50222" in %s on line 29
+Warning: mb_chr(): Unsupported encoding "cp50222" in %s on line %d
-Warning: mb_chr(): Unsupported encoding "utf-7" in %s on line 30
+Warning: mb_chr(): Unsupported encoding "utf-7" in %s on line %d
bool(false)
bool(false)
bool(false)
bool(true)
string(4) "none"
-Warning: mb_substitute_character(): Unknown character. in %s on line %d
+Warning: mb_substitute_character(): Unknown character in %s on line %d
bool(false)
===DONE===
*** Testing mb_substitute_character() : usage variation ***
--int 0--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
--int 1--
bool(true)
--int -12345--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
--float 10.5--
bool(true)
--float -10.5--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
--float 12.3456789000e10--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
--float -12.3456789000e10--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
--float .5--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
--empty array--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
--int indexed array--
bool(true)
--uppercase NULL--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
--lowercase null--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
--lowercase true--
bool(true)
--lowercase false--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
--uppercase TRUE--
bool(true)
--uppercase FALSE--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
--empty string DQ--
bool(true)
--string DQ--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
--string SQ--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
--mixed case string--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
--heredoc--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
--instance of classWithToString--
bool(true)
--undefined var--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
--unset var--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
===DONE===