From: Dmitry Stogov Date: Fri, 19 Feb 2021 12:42:21 +0000 (+0300) Subject: Improve basename(). Avoid calling mblen() for ASCII compatible locales. X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=5e015425263c28d40fd49ee386135f02d0e76975;p=php Improve basename(). Avoid calling mblen() for ASCII compatible locales. --- diff --git a/Zend/zend_globals.h b/Zend/zend_globals.h index 825fad833c..e9b24fc0e3 100644 --- a/Zend/zend_globals.h +++ b/Zend/zend_globals.h @@ -95,6 +95,10 @@ struct _zend_compiler_globals { bool skip_shebang; bool increment_lineno; + bool variable_width_locale; /* UTF-8, Shift-JIS, Big5, ISO 2022, EUC, etc */ + bool ascii_compatible_locale; /* locale uses ASCII characters as singletons */ + /* and don't use them as lead/trail units */ + zend_string *doc_comment; uint32_t extra_fn_flags; diff --git a/Zend/zend_operators.c b/Zend/zend_operators.c index 0cdb3aa085..a23dad9e1e 100644 --- a/Zend/zend_operators.c +++ b/Zend/zend_operators.c @@ -30,12 +30,21 @@ #include "zend_exceptions.h" #include "zend_closures.h" +#include <locale.h> +#ifdef HAVE_LANGINFO_H +# include <langinfo.h> +#endif + #ifdef __SSE2__ #include <emmintrin.h> #endif +#if defined(ZEND_WIN32) && !defined(ZTS) && defined(_MSC_VER) +/* This performance improvement of tolower() on Windows gives 10-18% on bench.php */ +#define ZEND_USE_TOLOWER_L 1 +#endif + #ifdef ZEND_USE_TOLOWER_L -#include <locale.h> static _locale_t current_locale = NULL; /* this is true global! 
may lead to strange effects on ZTS, but so may setlocale() */ #define zend_tolower(c) _tolower_l(c, current_locale) @@ -2537,13 +2546,85 @@ ZEND_API bool ZEND_FASTCALL zend_object_is_true(zval *op) /* {{{ */ } /* }}} */ -#ifdef ZEND_USE_TOLOWER_L ZEND_API void zend_update_current_locale(void) /* {{{ */ { +#ifdef ZEND_USE_TOLOWER_L +# if defined(ZEND_WIN32) && defined(_MSC_VER) current_locale = _get_current_locale(); +# else + current_locale = uselocale(0); +# endif +#endif +#if defined(ZEND_WIN32) && defined(_MSC_VER) + if (MB_CUR_MAX > 1) { + unsigned int cp = ___lc_codepage_func(); + CG(variable_width_locale) = 1; + // TODO: EUC-* are also ASCII compatible ??? + CG(ascii_compatible_locale) = + cp == 65001; /* UTF-8 */ + } else { + CG(variable_width_locale) = 0; + CG(ascii_compatible_locale) = 1; + } +#elif defined(MB_CUR_MAX) + /* Check if current locale uses variable width encoding */ + if (MB_CUR_MAX > 1) { +#if HAVE_NL_LANGINFO + const char *charmap = nl_langinfo(CODESET); +#else + char buf[16]; + const char *charmap = NULL; + const char *locale = setlocale(LC_CTYPE, NULL); + + if (locale) { + const char *dot = strchr(locale, '.'); + const char *modifier; + + if (dot) { + dot++; + modifier = strchr(dot, '@'); + if (!modifier) { + charmap = dot; + } else if (modifier - dot < sizeof(buf)) { + memcpy(buf, dot, modifier - dot); + buf[modifier - dot] = '\0'; + charmap = buf; + } + } + } +#endif + CG(variable_width_locale) = 1; + CG(ascii_compatible_locale) = 0; + + if (charmap) { + size_t len = strlen(charmap); + static const char *ascii_compatible_charmaps[] = { + "utf-8", + "utf8", + // TODO: EUC-* are also ASCII compatible ??? 
+ NULL + }; + const char **p; + /* Check if current locale is ASCII compatible */ + for (p = ascii_compatible_charmaps; *p; p++) { + if (zend_binary_strcasecmp(charmap, len, *p, strlen(*p)) == 0) { + CG(ascii_compatible_locale) = 1; + break; + } + } + } + + } else { + CG(variable_width_locale) = 0; + CG(ascii_compatible_locale) = 1; + } +#else + /* We can't determine current charset. Assume the worst case */ + CG(variable_width_locale) = 1; + CG(ascii_compatible_locale) = 0; +#endif } /* }}} */ -#endif static zend_always_inline void zend_str_tolower_impl(char *dest, const char *str, size_t length) /* {{{ */ { unsigned char *p = (unsigned char*)str; diff --git a/Zend/zend_operators.h b/Zend/zend_operators.h index b3ad598b74..8996a3d959 100644 --- a/Zend/zend_operators.h +++ b/Zend/zend_operators.h @@ -450,16 +450,7 @@ ZEND_API zend_long ZEND_FASTCALL zend_atol(const char *str, size_t str_len); #define convert_to_object_ex(zv) convert_to_object(zv) #define convert_scalar_to_number_ex(zv) convert_scalar_to_number(zv) -#if defined(ZEND_WIN32) && !defined(ZTS) && defined(_MSC_VER) -/* This performance improvement of tolower() on Windows gives 10-18% on bench.php */ -#define ZEND_USE_TOLOWER_L 1 -#endif - -#ifdef ZEND_USE_TOLOWER_L ZEND_API void zend_update_current_locale(void); -#else -#define zend_update_current_locale() -#endif /* The offset in bytes between the value and type fields of a zval */ #define ZVAL_OFFSETOF_TYPE \ diff --git a/ext/standard/string.c b/ext/standard/string.c index f6af763fd3..d9185de802 100644 --- a/ext/standard/string.c +++ b/ext/standard/string.c @@ -1465,66 +1465,96 @@ PHP_FUNCTION(strtolower) /* {{{ php_basename */ PHPAPI zend_string *php_basename(const char *s, size_t len, const char *suffix, size_t suffix_len) { - /* State 0 is directly after a directory separator (or at the start of the string). - * State 1 is everything else. 
*/ - int state = 0; - const char *basename_start = s; - const char *basename_end = s; - while (len > 0) { - int inc_len = (*s == '\0' ? 1 : php_mblen(s, len)); + const char *basename_start; + const char *basename_end; + + if (CG(ascii_compatible_locale)) { +#ifdef ZEND_WIN32 + if ((len >= 2) && isalpha((int)((unsigned char *)s)[0]) && (s[1] == ':')) { + s += 2; + len -= 2; + } +#endif + + basename_end = s + len - 1; + + /* Strip trailing slashes */ + while (basename_end >= s && IS_SLASH_P(basename_end)) { + basename_end--; + } + if (basename_end < s) { + return ZSTR_EMPTY_ALLOC(); + } + + /* Extract filename */ + basename_start = basename_end; + basename_end++; + while (basename_start > s && !IS_SLASH_P(basename_start - 1)) { + basename_start--; + } + } else { + /* State 0 is directly after a directory separator (or at the start of the string). + * State 1 is everything else. */ + int state = 0; + + basename_start = s; + basename_end = s; + while (len > 0) { + int inc_len = (*s == '\0' ? 1 : php_mblen(s, len)); - switch (inc_len) { - case 0: - goto quit_loop; - case 1: + switch (inc_len) { + case 0: + goto quit_loop; + case 1: #if defined(PHP_WIN32) - if (*s == '/' || *s == '\\') { + if (*s == '/' || *s == '\\') { #else - if (*s == '/') { + if (*s == '/') { #endif - if (state == 1) { - state = 0; - basename_end = s; - } + if (state == 1) { + state = 0; + basename_end = s; + } #if defined(PHP_WIN32) - /* Catch relative paths in c:file.txt style. They're not to confuse - with the NTFS streams. This part ensures also, that no drive - letter traversing happens. */ - } else if ((*s == ':' && (s - basename_start == 1))) { - if (state == 0) { - basename_start = s; - state = 1; + /* Catch relative paths in c:file.txt style. They're not to confuse + with the NTFS streams. This part ensures also, that no drive + letter traversing happens. 
*/ + } else if ((*s == ':' && (s - basename_start == 1))) { + if (state == 0) { + basename_start = s; + state = 1; + } else { + basename_end = s; + state = 0; + } +#endif } else { - basename_end = s; - state = 0; + if (state == 0) { + basename_start = s; + state = 1; + } + } + break; + default: + if (inc_len < 0) { + /* If character is invalid, treat it like other non-significant characters. */ + inc_len = 1; + php_mb_reset(); } -#endif - } else { if (state == 0) { basename_start = s; state = 1; } - } - break; - default: - if (inc_len < 0) { - /* If character is invalid, treat it like other non-significant characters. */ - inc_len = 1; - php_mb_reset(); - } - if (state == 0) { - basename_start = s; - state = 1; - } - break; + break; + } + s += inc_len; + len -= inc_len; } - s += inc_len; - len -= inc_len; - } quit_loop: - if (state == 1) { - basename_end = s; + if (state == 1) { + basename_end = s; + } } if (suffix != NULL && suffix_len < (size_t)(basename_end - basename_start) && @@ -4604,7 +4634,6 @@ static zend_string *try_setlocale_str(zend_long cat, zend_string *loc) { retval = setlocale(cat, NULL); } # endif - zend_update_current_locale(); if (!retval) { return NULL; } @@ -4615,6 +4644,7 @@ static zend_string *try_setlocale_str(zend_long cat, zend_string *loc) { BG(locale_changed) = 1; if (cat == LC_CTYPE || cat == LC_ALL) { + zend_update_current_locale(); if (BG(ctype_string)) { zend_string_release_ex(BG(ctype_string), 0); }