From: Nikita Popov Date: Wed, 19 Jul 2017 20:36:53 +0000 (+0200) Subject: Avoid unnecessary encoding lookups in mbstring X-Git-Tag: php-7.3.0alpha1~1881 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=dead4f0b1b9a555bbea970f5399c01142414db85;p=php Avoid unnecessary encoding lookups in mbstring Extract part of php_mb_convert_encoding that does the actual work and use it whenever we already know the encoding. --- diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index 3edaf87f66..a8b7016ca1 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -62,6 +62,7 @@ #include "libmbfl/mbfl/mbfl_allocators.h" #include "libmbfl/mbfl/mbfilter_pass.h" +#include "libmbfl/filters/mbfilter_ucs4.h" #include "php_variables.h" #include "php_globals.h" @@ -3231,96 +3232,50 @@ static inline zend_bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc) return (no_enc >= mbfl_no_encoding_utf8 && no_enc <= mbfl_no_encoding_utf8_sb); } - -/* {{{ MBSTRING_API char *php_mb_convert_encoding() */ -MBSTRING_API char *php_mb_convert_encoding(const char *input, size_t length, const char *_to_encoding, const char *_from_encodings, size_t *output_len) +MBSTRING_API char *php_mb_convert_encoding_ex(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding *from_encoding, size_t *output_len) { mbfl_string string, result, *ret; - const mbfl_encoding *from_encoding, *to_encoding; mbfl_buffer_converter *convd; - size_t size; - const mbfl_encoding **list; - char *output=NULL; + char *output = NULL; if (output_len) { *output_len = 0; } - if (!input) { - return NULL; - } - /* new encoding */ - if (_to_encoding && strlen(_to_encoding)) { - to_encoding = mbfl_name2encoding(_to_encoding); - if (!to_encoding) { - php_error_docref(NULL, E_WARNING, "Unknown encoding \"%s\"", _to_encoding); - return NULL; - } - } else { - to_encoding = MBSTRG(current_internal_encoding); - } /* initialize string */ mbfl_string_init(&string); mbfl_string_init(&result); - from_encoding = MBSTRG(current_internal_encoding); string.no_encoding = from_encoding->no_encoding; string.no_language = MBSTRG(language); string.val = (unsigned char *)input; string.len = length; - /* pre-conversion encoding */ - if (_from_encodings) { - list = NULL; - size = 0; - php_mb_parse_encoding_list(_from_encodings, strlen(_from_encodings), &list, &size, 0); - if (size == 1) { - from_encoding = *list; - string.no_encoding = from_encoding->no_encoding; - } else if (size > 1) { - /* auto detect */ - from_encoding = mbfl_identify_encoding2(&string, list, size, MBSTRG(strict_detection)); - if (from_encoding) { - string.no_encoding = from_encoding->no_encoding; - } else { - php_error_docref(NULL, E_WARNING, "Unable to detect character encoding"); - from_encoding = &mbfl_encoding_pass; - to_encoding = from_encoding; - string.no_encoding = from_encoding->no_encoding; - } - } else { - php_error_docref(NULL, E_WARNING, "Illegal character encoding specified"); - } - if (list != NULL) { - efree((void *)list); - } - } - /* initialize converter */ convd = mbfl_buffer_converter_new2(from_encoding, to_encoding, string.len); if (convd == NULL) { php_error_docref(NULL, E_WARNING, "Unable to create character encoding converter"); return NULL; } - mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode)); + mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode)); if (string.no_encoding == MBSTRG(current_internal_encoding)->no_encoding) { mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar)); - } else if (php_mb_is_no_encoding_unicode(string.no_encoding) && php_mb_is_no_encoding_unicode(MBSTRG(current_internal_encoding)->no_encoding)) { + } else if (php_mb_is_no_encoding_unicode(string.no_encoding) + && php_mb_is_no_encoding_unicode(MBSTRG(current_internal_encoding)->no_encoding)) { if (php_mb_is_no_encoding_utf8(string.no_encoding)) { - if (MBSTRG(current_filter_illegal_substchar) > 0xd7ff && 0xe000 > MBSTRG(current_filter_illegal_substchar) ) { mbfl_buffer_converter_illegal_substchar(convd, 0x3f); } else { - mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar)); + mbfl_buffer_converter_illegal_substchar(convd, + MBSTRG(current_filter_illegal_substchar)); } - } else { - mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar)); + mbfl_buffer_converter_illegal_substchar(convd, + MBSTRG(current_filter_illegal_substchar)); } - } else { mbfl_buffer_converter_illegal_substchar(convd, 0x3f); } @@ -3340,6 +3295,59 @@ MBSTRING_API char *php_mb_convert_encoding(const char *input, size_t length, con } /* }}} */ +/* {{{ MBSTRING_API char *php_mb_convert_encoding() */ +MBSTRING_API char *php_mb_convert_encoding(const char *input, size_t length, const char *_to_encoding, const char *_from_encodings, size_t *output_len) +{ + const mbfl_encoding *from_encoding, *to_encoding; + + if (output_len) { + *output_len = 0; + } + if (!input) { + return NULL; + } + /* new encoding */ + if (_to_encoding && strlen(_to_encoding)) { + to_encoding = mbfl_name2encoding(_to_encoding); + if (!to_encoding) { + php_error_docref(NULL, E_WARNING, "Unknown encoding \"%s\"", _to_encoding); + return NULL; + } + } else { + to_encoding = MBSTRG(current_internal_encoding); + } + + /* pre-conversion encoding */ + from_encoding = MBSTRG(current_internal_encoding); + if (_from_encodings) { + const mbfl_encoding **list = NULL; + size_t size = 0; + php_mb_parse_encoding_list(_from_encodings, strlen(_from_encodings), &list, &size, 0); + if (size == 1) { + from_encoding = *list; + } else if (size > 1) { + /* auto detect */ + mbfl_string string; + mbfl_string_init(&string); + string.val = (unsigned char *)input; + string.len = length; + from_encoding = mbfl_identify_encoding2(&string, list, size, MBSTRG(strict_detection)); + if (!from_encoding) { + php_error_docref(NULL, E_WARNING, "Unable to detect character encoding"); + from_encoding = &mbfl_encoding_pass; + } + } else { + php_error_docref(NULL, E_WARNING, "Illegal character encoding specified"); + } + if (list != NULL) { + efree((void *)list); + } + } + + return php_mb_convert_encoding_ex(input, length, to_encoding, from_encoding, output_len); +} +/* }}} */ + MBSTRING_API HashTable *php_mb_convert_encoding_recursive(HashTable *input, const char *_to_encoding, const char *_from_encodings) { HashTable *output, *chash; @@ -5126,29 +5134,28 @@ PHP_FUNCTION(mb_check_encoding) /* }}} */ -static inline zend_long php_mb_ord(const char* str, size_t str_len, const char* enc) +static inline zend_long php_mb_ord(const char* str, size_t str_len, const char* enc_name) { + const mbfl_encoding *enc; enum mbfl_no_encoding no_enc; char* ret; size_t ret_len; - const mbfl_encoding *encoding; unsigned char char_len; zend_long cp; - if (enc == NULL) { - no_enc = MBSTRG(current_internal_encoding)->no_encoding; + if (enc_name == NULL) { + enc = MBSTRG(current_internal_encoding); } else { - no_enc = mbfl_name2no_encoding(enc); - - if (no_enc == mbfl_no_encoding_invalid) { - php_error_docref(NULL, E_WARNING, "Unknown encoding \"%s\"", enc); + enc = mbfl_name2encoding(enc_name); + if (!enc) { + php_error_docref(NULL, E_WARNING, "Unknown encoding \"%s\"", enc_name); return -1; } } + no_enc = enc->no_encoding; if (php_mb_is_no_encoding_unicode(no_enc)) { - - ret = php_mb_convert_encoding(str, str_len, "UCS-4BE", enc, &ret_len); + ret = php_mb_convert_encoding_ex(str, str_len, &mbfl_encoding_ucs4be, enc, &ret_len); if (ret == NULL) { return -1; @@ -5164,18 +5171,17 @@ static inline zend_long php_mb_ord(const char* str, size_t str_len, const char* return cp; } else if (php_mb_is_unsupported_no_encoding(no_enc)) { - php_error_docref(NULL, E_WARNING, "Unsupported encoding \"%s\"", enc); + php_error_docref(NULL, E_WARNING, "Unsupported encoding \"%s\"", enc_name); return -1; } - ret = php_mb_convert_encoding(str, str_len, enc, enc, &ret_len); + ret = php_mb_convert_encoding_ex(str, str_len, enc, enc, &ret_len); if (ret == NULL) { return -1; } - encoding = mbfl_no2encoding(no_enc); - char_len = php_mb_mbchar_bytes_ex(ret, encoding); + char_len = php_mb_mbchar_bytes_ex(ret, enc); if (char_len == 1) { cp = (unsigned char) ret[0]; @@ -5225,24 +5231,27 @@ PHP_FUNCTION(mb_ord) /* }}} */ -static inline char* php_mb_chr(zend_long cp, const char* enc, size_t *output_len) +static inline char* php_mb_chr(zend_long cp, const char* enc_name, size_t *output_len) { + const mbfl_encoding *enc; enum mbfl_no_encoding no_enc; char* buf; size_t buf_len; char* ret; size_t ret_len; - if (enc == NULL) { - no_enc = MBSTRG(current_internal_encoding)->no_encoding; + if (enc_name == NULL) { + enc = MBSTRG(current_internal_encoding); } else { - no_enc = mbfl_name2no_encoding(enc); - if (no_enc == mbfl_no_encoding_invalid) { - php_error_docref(NULL, E_WARNING, "Unknown encoding \"%s\"", enc); + enc = mbfl_name2encoding(enc_name); + if (!enc) { + php_error_docref(NULL, E_WARNING, "Unknown encoding \"%s\"", enc_name); return NULL; } } + no_enc = enc->no_encoding; + if (php_mb_is_no_encoding_utf8(no_enc)) { if (0 > cp || cp > 0x10ffff || (cp > 0xd7ff && 0xe000 > cp)) { @@ -5313,7 +5322,7 @@ static inline char* php_mb_chr(zend_long cp, const char* enc, size_t *output_len buf[3] = cp & 0xff; buf[4] = 0; - ret = php_mb_convert_encoding(buf, buf_len, enc, "UCS-4BE", &ret_len); + ret = php_mb_convert_encoding_ex(buf, buf_len, enc, &mbfl_encoding_ucs4be, &ret_len); efree(buf); if (output_len) { @@ -5323,7 +5332,7 @@ static inline char* php_mb_chr(zend_long cp, const char* enc, size_t *output_len return ret; } else if (php_mb_is_unsupported_no_encoding(no_enc)) { - php_error_docref(NULL, E_WARNING, "Unsupported encoding \"%s\"", enc); + php_error_docref(NULL, E_WARNING, "Unsupported encoding \"%s\"", enc_name); return NULL; } @@ -5363,7 +5372,7 @@ static inline char* php_mb_chr(zend_long cp, const char* enc, size_t *output_len buf[4] = 0; } - ret = php_mb_convert_encoding(buf, buf_len, enc, enc, &ret_len); + ret = php_mb_convert_encoding_ex(buf, buf_len, enc, enc, &ret_len); efree(buf); if (output_len) { @@ -5401,34 +5410,38 @@ PHP_FUNCTION(mb_chr) /* }}} */ -static inline char* php_mb_scrub(const char* str, size_t str_len, const char* enc) +static inline char* php_mb_scrub(const char* str, size_t str_len, const mbfl_encoding *enc) { size_t ret_len; - return php_mb_convert_encoding(str, str_len, enc, enc, &ret_len); + return php_mb_convert_encoding_ex(str, str_len, enc, enc, &ret_len); } /* {{{ proto bool mb_scrub([string str[, string encoding]]) */ PHP_FUNCTION(mb_scrub) { + const mbfl_encoding *enc; char* str; size_t str_len; - char *enc = NULL; - size_t enc_len; + char *enc_name = NULL; + size_t enc_name_len; char *ret; ZEND_PARSE_PARAMETERS_START(1, 2) Z_PARAM_STRING(str, str_len) Z_PARAM_OPTIONAL - Z_PARAM_STRING(enc, enc_len) + Z_PARAM_STRING(enc_name, enc_name_len) ZEND_PARSE_PARAMETERS_END(); - if (enc == NULL) { - enc = (char *) MBSTRG(current_internal_encoding)->name; - } else if (!mbfl_is_support_encoding(enc)) { - php_error_docref(NULL, E_WARNING, "Unknown encoding \"%s\"", enc); - RETURN_FALSE; + if (enc_name == NULL) { + enc = MBSTRG(current_internal_encoding); + } else { + enc = mbfl_name2encoding(enc_name); + if (!enc) { + php_error_docref(NULL, E_WARNING, "Unknown encoding \"%s\"", enc_name); + RETURN_FALSE; + } } ret = php_mb_scrub(str, str_len, enc); diff --git a/ext/mbstring/mbstring.h b/ext/mbstring/mbstring.h index 4772898c89..207227b036 100644 --- a/ext/mbstring/mbstring.h +++ b/ext/mbstring/mbstring.h @@ -140,6 +140,9 @@ MBSTRING_API char *php_mb_safe_strrchr_ex(const char *s, unsigned int c, MBSTRING_API char *php_mb_safe_strrchr(const char *s, unsigned int c, size_t nbytes); +MBSTRING_API char *php_mb_convert_encoding_ex( + const char *input, size_t length, + const mbfl_encoding *to_encoding, const mbfl_encoding *from_encoding, size_t *output_len); MBSTRING_API char * php_mb_convert_encoding(const char *input, size_t length, const char *_to_encoding, const char *_from_encodings, diff --git a/ext/mbstring/php_unicode.c b/ext/mbstring/php_unicode.c index 8b6a52156a..5584b2a3a4 100644 --- a/ext/mbstring/php_unicode.c +++ b/ext/mbstring/php_unicode.c @@ -43,6 +43,7 @@ #include "mbstring.h" #include "php_unicode.h" #include "unicode_data.h" +#include "libmbfl/filters/mbfilter_ucs4.h" ZEND_EXTERN_MODULE_GLOBALS(mbstring) @@ -268,20 +269,23 @@ MBSTRING_API unsigned long php_unicode_totitle(unsigned long code, enum mbfl_no_ } MBSTRING_API char *php_unicode_convert_case(int case_mode, const char *srcstr, size_t srclen, size_t *ret_len, - const char *src_encoding) + const char *src_encoding_name) { char *unicode, *newstr; size_t unicode_len; unsigned char *unicode_ptr; size_t i; - enum mbfl_no_encoding _src_encoding = mbfl_name2no_encoding(src_encoding); + enum mbfl_no_encoding src_no_encoding; - if (_src_encoding == mbfl_no_encoding_invalid) { - php_error_docref(NULL, E_WARNING, "Unknown encoding \"%s\"", src_encoding); + const mbfl_encoding *src_encoding = mbfl_name2encoding(src_encoding_name); + if (!src_encoding) { + php_error_docref(NULL, E_WARNING, "Unknown encoding \"%s\"", src_encoding_name); return NULL; } - unicode = php_mb_convert_encoding(srcstr, srclen, "UCS-4BE", src_encoding, &unicode_len); + src_no_encoding = src_encoding->no_encoding; + + unicode = php_mb_convert_encoding_ex(srcstr, srclen, &mbfl_encoding_ucs4be, src_encoding, &unicode_len); if (unicode == NULL) return NULL; @@ -291,14 +295,14 @@ MBSTRING_API char *php_unicode_convert_case(int case_mode, const char *srcstr, s case PHP_UNICODE_CASE_UPPER: for (i = 0; i < unicode_len; i+=4) { UINT32_TO_BE_ARY(&unicode_ptr[i], - php_unicode_toupper(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding)); + php_unicode_toupper(BE_ARY_TO_UINT32(&unicode_ptr[i]), src_no_encoding)); } break; case PHP_UNICODE_CASE_LOWER: for (i = 0; i < unicode_len; i+=4) { UINT32_TO_BE_ARY(&unicode_ptr[i], - php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding)); + php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i]), src_no_encoding)); } break; @@ -312,7 +316,7 @@ MBSTRING_API char *php_unicode_convert_case(int case_mode, const char *srcstr, s if (mode) { if (res) { UINT32_TO_BE_ARY(&unicode_ptr[i], - php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding)); + php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i]), src_no_encoding)); } else { mode = 0; } @@ -320,7 +324,7 @@ MBSTRING_API char *php_unicode_convert_case(int case_mode, const char *srcstr, s if (res) { mode = 1; UINT32_TO_BE_ARY(&unicode_ptr[i], - php_unicode_totitle(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding)); + php_unicode_totitle(BE_ARY_TO_UINT32(&unicode_ptr[i]), src_no_encoding)); } } } @@ -328,7 +332,8 @@ MBSTRING_API char *php_unicode_convert_case(int case_mode, const char *srcstr, s } - newstr = php_mb_convert_encoding(unicode, unicode_len, src_encoding, "UCS-4BE", ret_len); + newstr = php_mb_convert_encoding_ex( + unicode, unicode_len, src_encoding, &mbfl_encoding_ucs4be, ret_len); efree(unicode); return newstr;