granicus.if.org Git - php/commitdiff
Avoid unnecessary encoding lookups in mbstring
author Nikita Popov <nikita.ppv@gmail.com>
Wed, 19 Jul 2017 20:36:53 +0000 (22:36 +0200)
committer Nikita Popov <nikita.ppv@gmail.com>
Wed, 19 Jul 2017 21:59:42 +0000 (23:59 +0200)
Extract part of php_mb_convert_encoding that does the actual work
and use it whenever we already know the encoding.

ext/mbstring/mbstring.c
ext/mbstring/mbstring.h
ext/mbstring/php_unicode.c

index 3edaf87f661259fe8f54cf7d5867baffe1df1cb8..a8b7016ca1fe3471067e055be417a73dddf7696f 100644 (file)
@@ -62,6 +62,7 @@
 
 #include "libmbfl/mbfl/mbfl_allocators.h"
 #include "libmbfl/mbfl/mbfilter_pass.h"
+#include "libmbfl/filters/mbfilter_ucs4.h"
 
 #include "php_variables.h"
 #include "php_globals.h"
@@ -3231,96 +3232,50 @@ static inline zend_bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)
        return (no_enc >= mbfl_no_encoding_utf8 && no_enc <= mbfl_no_encoding_utf8_sb);
 }
 
-
-/* {{{ MBSTRING_API char *php_mb_convert_encoding() */
-MBSTRING_API char *php_mb_convert_encoding(const char *input, size_t length, const char *_to_encoding, const char *_from_encodings, size_t *output_len)
+MBSTRING_API char *php_mb_convert_encoding_ex(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding *from_encoding, size_t *output_len)
 {
        mbfl_string string, result, *ret;
-       const mbfl_encoding *from_encoding, *to_encoding;
        mbfl_buffer_converter *convd;
-       size_t size;
-       const mbfl_encoding **list;
-       char *output=NULL;
+       char *output = NULL;
 
        if (output_len) {
                *output_len = 0;
        }
-       if (!input) {
-               return NULL;
-       }
-       /* new encoding */
-       if (_to_encoding && strlen(_to_encoding)) {
-               to_encoding = mbfl_name2encoding(_to_encoding);
-               if (!to_encoding) {
-                       php_error_docref(NULL, E_WARNING, "Unknown encoding \"%s\"", _to_encoding);
-                       return NULL;
-               }
-       } else {
-               to_encoding = MBSTRG(current_internal_encoding);
-       }
 
        /* initialize string */
        mbfl_string_init(&string);
        mbfl_string_init(&result);
-       from_encoding = MBSTRG(current_internal_encoding);
        string.no_encoding = from_encoding->no_encoding;
        string.no_language = MBSTRG(language);
        string.val = (unsigned char *)input;
        string.len = length;
 
-       /* pre-conversion encoding */
-       if (_from_encodings) {
-               list = NULL;
-               size = 0;
-               php_mb_parse_encoding_list(_from_encodings, strlen(_from_encodings), &list, &size, 0);
-               if (size == 1) {
-                       from_encoding = *list;
-                       string.no_encoding = from_encoding->no_encoding;
-               } else if (size > 1) {
-                       /* auto detect */
-                       from_encoding = mbfl_identify_encoding2(&string, list, size, MBSTRG(strict_detection));
-                       if (from_encoding) {
-                               string.no_encoding = from_encoding->no_encoding;
-                       } else {
-                               php_error_docref(NULL, E_WARNING, "Unable to detect character encoding");
-                               from_encoding = &mbfl_encoding_pass;
-                               to_encoding = from_encoding;
-                               string.no_encoding = from_encoding->no_encoding;
-                       }
-               } else {
-                       php_error_docref(NULL, E_WARNING, "Illegal character encoding specified");
-               }
-               if (list != NULL) {
-                       efree((void *)list);
-               }
-       }
-
        /* initialize converter */
        convd = mbfl_buffer_converter_new2(from_encoding, to_encoding, string.len);
        if (convd == NULL) {
                php_error_docref(NULL, E_WARNING, "Unable to create character encoding converter");
                return NULL;
        }
-       mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode));
 
+       mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode));
        if (string.no_encoding == MBSTRG(current_internal_encoding)->no_encoding) {
                mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar));
-       } else if (php_mb_is_no_encoding_unicode(string.no_encoding) && php_mb_is_no_encoding_unicode(MBSTRG(current_internal_encoding)->no_encoding)) {
+       } else if (php_mb_is_no_encoding_unicode(string.no_encoding)
+                       && php_mb_is_no_encoding_unicode(MBSTRG(current_internal_encoding)->no_encoding)) {
 
                if (php_mb_is_no_encoding_utf8(string.no_encoding)) {
-
                        if (MBSTRG(current_filter_illegal_substchar) > 0xd7ff &&
                                0xe000 > MBSTRG(current_filter_illegal_substchar)
                        ) {
                                mbfl_buffer_converter_illegal_substchar(convd, 0x3f);
                        } else {
-                               mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar));
+                               mbfl_buffer_converter_illegal_substchar(convd,
+                                       MBSTRG(current_filter_illegal_substchar));
                        }
-
                } else {
-                       mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar));
+                       mbfl_buffer_converter_illegal_substchar(convd,
+                               MBSTRG(current_filter_illegal_substchar));
                }
-
        } else {
                mbfl_buffer_converter_illegal_substchar(convd, 0x3f);
        }
@@ -3340,6 +3295,59 @@ MBSTRING_API char *php_mb_convert_encoding(const char *input, size_t length, con
 }
 /* }}} */
 
+/* {{{ MBSTRING_API char *php_mb_convert_encoding() */
+MBSTRING_API char *php_mb_convert_encoding(const char *input, size_t length, const char *_to_encoding, const char *_from_encodings, size_t *output_len)
+{
+       const mbfl_encoding *from_encoding, *to_encoding;
+
+       if (output_len) {
+               *output_len = 0;
+       }
+       if (!input) {
+               return NULL;
+       }
+       /* new encoding */
+       if (_to_encoding && strlen(_to_encoding)) {
+               to_encoding = mbfl_name2encoding(_to_encoding);
+               if (!to_encoding) {
+                       php_error_docref(NULL, E_WARNING, "Unknown encoding \"%s\"", _to_encoding);
+                       return NULL;
+               }
+       } else {
+               to_encoding = MBSTRG(current_internal_encoding);
+       }
+
+       /* pre-conversion encoding */
+       from_encoding = MBSTRG(current_internal_encoding);
+       if (_from_encodings) {
+               const mbfl_encoding **list = NULL;
+               size_t size = 0;
+               php_mb_parse_encoding_list(_from_encodings, strlen(_from_encodings), &list, &size, 0);
+               if (size == 1) {
+                       from_encoding = *list;
+               } else if (size > 1) {
+                       /* auto detect */
+                       mbfl_string string;
+                       mbfl_string_init(&string);
+                       string.val = (unsigned char *)input;
+                       string.len = length;
+                       from_encoding = mbfl_identify_encoding2(&string, list, size, MBSTRG(strict_detection));
+                       if (!from_encoding) {
+                               php_error_docref(NULL, E_WARNING, "Unable to detect character encoding");
+                               from_encoding = &mbfl_encoding_pass;
+                       }
+               } else {
+                       php_error_docref(NULL, E_WARNING, "Illegal character encoding specified");
+               }
+               if (list != NULL) {
+                       efree((void *)list);
+               }
+       }
+
+       return php_mb_convert_encoding_ex(input, length, to_encoding, from_encoding, output_len);
+}
+/* }}} */
+
 MBSTRING_API HashTable *php_mb_convert_encoding_recursive(HashTable *input, const char *_to_encoding, const char *_from_encodings)
 {
        HashTable *output, *chash;
@@ -5126,29 +5134,28 @@ PHP_FUNCTION(mb_check_encoding)
 /* }}} */
 
 
-static inline zend_long php_mb_ord(const char* str, size_t str_len, const char* enc)
+static inline zend_long php_mb_ord(const char* str, size_t str_len, const char* enc_name)
 {
+       const mbfl_encoding *enc;
        enum mbfl_no_encoding no_enc;
        char* ret;
        size_t ret_len;
-       const mbfl_encoding *encoding;
        unsigned char char_len;
        zend_long cp;
 
-       if (enc == NULL) {
-               no_enc = MBSTRG(current_internal_encoding)->no_encoding;
+       if (enc_name == NULL) {
+               enc = MBSTRG(current_internal_encoding);
        } else {
-               no_enc = mbfl_name2no_encoding(enc);
-
-               if (no_enc == mbfl_no_encoding_invalid) {
-                       php_error_docref(NULL, E_WARNING, "Unknown encoding \"%s\"", enc);
+               enc = mbfl_name2encoding(enc_name);
+               if (!enc) {
+                       php_error_docref(NULL, E_WARNING, "Unknown encoding \"%s\"", enc_name);
                        return -1;
                }
        }
 
+       no_enc = enc->no_encoding;
        if (php_mb_is_no_encoding_unicode(no_enc)) {
-
-               ret = php_mb_convert_encoding(str, str_len, "UCS-4BE", enc, &ret_len);
+               ret = php_mb_convert_encoding_ex(str, str_len, &mbfl_encoding_ucs4be, enc, &ret_len);
 
                if (ret == NULL) {
                        return -1;
@@ -5164,18 +5171,17 @@ static inline zend_long php_mb_ord(const char* str, size_t str_len, const char*
                return cp;
 
        } else if (php_mb_is_unsupported_no_encoding(no_enc)) {
-               php_error_docref(NULL, E_WARNING, "Unsupported encoding \"%s\"", enc);
+               php_error_docref(NULL, E_WARNING, "Unsupported encoding \"%s\"", enc_name);
                return -1;
        }
 
-       ret = php_mb_convert_encoding(str, str_len, enc, enc, &ret_len);
+       ret = php_mb_convert_encoding_ex(str, str_len, enc, enc, &ret_len);
 
        if (ret == NULL) {
                return -1;
        }
 
-       encoding = mbfl_no2encoding(no_enc);
-       char_len = php_mb_mbchar_bytes_ex(ret, encoding);
+       char_len = php_mb_mbchar_bytes_ex(ret, enc);
 
        if (char_len == 1) {
                cp = (unsigned char) ret[0];
@@ -5225,24 +5231,27 @@ PHP_FUNCTION(mb_ord)
 /* }}} */
 
 
-static inline char* php_mb_chr(zend_long cp, const char* enc, size_t *output_len)
+static inline char* php_mb_chr(zend_long cp, const char* enc_name, size_t *output_len)
 {
+       const mbfl_encoding *enc;
        enum mbfl_no_encoding no_enc;
        char* buf;
        size_t buf_len;
        char* ret;
        size_t ret_len;
 
-       if (enc == NULL) {
-               no_enc = MBSTRG(current_internal_encoding)->no_encoding;
+       if (enc_name == NULL) {
+               enc = MBSTRG(current_internal_encoding);
        } else {
-               no_enc = mbfl_name2no_encoding(enc);
-               if (no_enc == mbfl_no_encoding_invalid) {
-                       php_error_docref(NULL, E_WARNING, "Unknown encoding \"%s\"", enc);
+               enc = mbfl_name2encoding(enc_name);
+               if (!enc) {
+                       php_error_docref(NULL, E_WARNING, "Unknown encoding \"%s\"", enc_name);
                        return NULL;
                }
        }
 
+       no_enc = enc->no_encoding;
+
        if (php_mb_is_no_encoding_utf8(no_enc)) {
 
                if (0 > cp || cp > 0x10ffff || (cp > 0xd7ff && 0xe000 > cp)) {
@@ -5313,7 +5322,7 @@ static inline char* php_mb_chr(zend_long cp, const char* enc, size_t *output_len
                buf[3] = cp & 0xff;
                buf[4] = 0;
 
-               ret = php_mb_convert_encoding(buf, buf_len, enc, "UCS-4BE", &ret_len);
+               ret = php_mb_convert_encoding_ex(buf, buf_len, enc, &mbfl_encoding_ucs4be, &ret_len);
                efree(buf);
 
                if (output_len) {
@@ -5323,7 +5332,7 @@ static inline char* php_mb_chr(zend_long cp, const char* enc, size_t *output_len
                return ret;
 
        } else if (php_mb_is_unsupported_no_encoding(no_enc)) {
-               php_error_docref(NULL, E_WARNING, "Unsupported encoding \"%s\"", enc);
+               php_error_docref(NULL, E_WARNING, "Unsupported encoding \"%s\"", enc_name);
                return NULL;
        }
 
@@ -5363,7 +5372,7 @@ static inline char* php_mb_chr(zend_long cp, const char* enc, size_t *output_len
                buf[4] = 0;
        }
 
-       ret = php_mb_convert_encoding(buf, buf_len, enc, enc, &ret_len);
+       ret = php_mb_convert_encoding_ex(buf, buf_len, enc, enc, &ret_len);
        efree(buf);
 
        if (output_len) {
@@ -5401,34 +5410,38 @@ PHP_FUNCTION(mb_chr)
 /* }}} */
 
 
-static inline char* php_mb_scrub(const char* str, size_t str_len, const char* enc)
+static inline char* php_mb_scrub(const char* str, size_t str_len, const mbfl_encoding *enc)
 {
        size_t ret_len;
 
-       return php_mb_convert_encoding(str, str_len, enc, enc, &ret_len);
+       return php_mb_convert_encoding_ex(str, str_len, enc, enc, &ret_len);
 }
 
 
 /* {{{ proto bool mb_scrub([string str[, string encoding]]) */
 PHP_FUNCTION(mb_scrub)
 {
+       const mbfl_encoding *enc;
        char* str;
        size_t str_len;
-       char *enc = NULL;
-       size_t enc_len;
+       char *enc_name = NULL;
+       size_t enc_name_len;
        char *ret;
 
        ZEND_PARSE_PARAMETERS_START(1, 2)
                Z_PARAM_STRING(str, str_len)
                Z_PARAM_OPTIONAL
-               Z_PARAM_STRING(enc, enc_len)
+               Z_PARAM_STRING(enc_name, enc_name_len)
        ZEND_PARSE_PARAMETERS_END();
 
-       if (enc == NULL) {
-               enc = (char *) MBSTRG(current_internal_encoding)->name;
-       } else if (!mbfl_is_support_encoding(enc)) {
-               php_error_docref(NULL, E_WARNING, "Unknown encoding \"%s\"", enc);
-               RETURN_FALSE;
+       if (enc_name == NULL) {
+               enc = MBSTRG(current_internal_encoding);
+       } else {
+               enc = mbfl_name2encoding(enc_name);
+               if (!enc) {
+                       php_error_docref(NULL, E_WARNING, "Unknown encoding \"%s\"", enc_name);
+                       RETURN_FALSE;
+               }
        }
 
        ret = php_mb_scrub(str, str_len, enc);
index 4772898c89c3b565a7fa1d3fceaff9220f443042..207227b036ab446e12df96a8c55fbfed66f292fa 100644 (file)
@@ -140,6 +140,9 @@ MBSTRING_API char *php_mb_safe_strrchr_ex(const char *s, unsigned int c,
 MBSTRING_API char *php_mb_safe_strrchr(const char *s, unsigned int c,
                                  size_t nbytes);
 
+MBSTRING_API char *php_mb_convert_encoding_ex(
+               const char *input, size_t length,
+               const mbfl_encoding *to_encoding, const mbfl_encoding *from_encoding, size_t *output_len);
 MBSTRING_API char * php_mb_convert_encoding(const char *input, size_t length,
                                       const char *_to_encoding,
                                       const char *_from_encodings,
index 8b6a52156a0ca6388198d9db1217466aa92b5747..5584b2a3a4ac49267591136486a07c0b78deeba2 100644 (file)
@@ -43,6 +43,7 @@
 #include "mbstring.h"
 #include "php_unicode.h"
 #include "unicode_data.h"
+#include "libmbfl/filters/mbfilter_ucs4.h"
 
 ZEND_EXTERN_MODULE_GLOBALS(mbstring)
 
@@ -268,20 +269,23 @@ MBSTRING_API unsigned long php_unicode_totitle(unsigned long code, enum mbfl_no_
 }
 
 MBSTRING_API char *php_unicode_convert_case(int case_mode, const char *srcstr, size_t srclen, size_t *ret_len,
-               const char *src_encoding)
+               const char *src_encoding_name)
 {
        char *unicode, *newstr;
        size_t unicode_len;
        unsigned char *unicode_ptr;
        size_t i;
-       enum mbfl_no_encoding _src_encoding = mbfl_name2no_encoding(src_encoding);
+       enum mbfl_no_encoding src_no_encoding;
 
-       if (_src_encoding == mbfl_no_encoding_invalid) {
-               php_error_docref(NULL, E_WARNING, "Unknown encoding \"%s\"", src_encoding);
+       const mbfl_encoding *src_encoding = mbfl_name2encoding(src_encoding_name);
+       if (!src_encoding) {
+               php_error_docref(NULL, E_WARNING, "Unknown encoding \"%s\"", src_encoding_name);
                return NULL;
        }
 
-       unicode = php_mb_convert_encoding(srcstr, srclen, "UCS-4BE", src_encoding, &unicode_len);
+       src_no_encoding = src_encoding->no_encoding;
+
+       unicode = php_mb_convert_encoding_ex(srcstr, srclen, &mbfl_encoding_ucs4be, src_encoding, &unicode_len);
        if (unicode == NULL)
                return NULL;
 
@@ -291,14 +295,14 @@ MBSTRING_API char *php_unicode_convert_case(int case_mode, const char *srcstr, s
                case PHP_UNICODE_CASE_UPPER:
                        for (i = 0; i < unicode_len; i+=4) {
                                UINT32_TO_BE_ARY(&unicode_ptr[i],
-                                       php_unicode_toupper(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding));
+                                       php_unicode_toupper(BE_ARY_TO_UINT32(&unicode_ptr[i]), src_no_encoding));
                        }
                        break;
 
                case PHP_UNICODE_CASE_LOWER:
                        for (i = 0; i < unicode_len; i+=4) {
                                UINT32_TO_BE_ARY(&unicode_ptr[i],
-                                       php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding));
+                                       php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i]), src_no_encoding));
                        }
                        break;
 
@@ -312,7 +316,7 @@ MBSTRING_API char *php_unicode_convert_case(int case_mode, const char *srcstr, s
                                if (mode) {
                                        if (res) {
                                                UINT32_TO_BE_ARY(&unicode_ptr[i],
-                                                       php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding));
+                                                       php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i]), src_no_encoding));
                                        } else {
                                                mode = 0;
                                        }
@@ -320,7 +324,7 @@ MBSTRING_API char *php_unicode_convert_case(int case_mode, const char *srcstr, s
                                        if (res) {
                                                mode = 1;
                                                UINT32_TO_BE_ARY(&unicode_ptr[i],
-                                                       php_unicode_totitle(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding));
+                                                       php_unicode_totitle(BE_ARY_TO_UINT32(&unicode_ptr[i]), src_no_encoding));
                                        }
                                }
                        }
@@ -328,7 +332,8 @@ MBSTRING_API char *php_unicode_convert_case(int case_mode, const char *srcstr, s
 
        }
 
-       newstr = php_mb_convert_encoding(unicode, unicode_len, src_encoding, "UCS-4BE", ret_len);
+       newstr = php_mb_convert_encoding_ex(
+               unicode, unicode_len, src_encoding, &mbfl_encoding_ucs4be, ret_len);
        efree(unicode);
 
        return newstr;