]> granicus.if.org Git - php/commitdiff
Improve basename(). Avoid calling mblen() for ASCII compatible locales.
authorDmitry Stogov <dmitry@zend.com>
Fri, 19 Feb 2021 12:42:21 +0000 (15:42 +0300)
committerDmitry Stogov <dmitry@zend.com>
Fri, 19 Feb 2021 12:42:21 +0000 (15:42 +0300)
Zend/zend_globals.h
Zend/zend_operators.c
Zend/zend_operators.h
ext/standard/string.c

index 825fad833c6352b0036a7c07a736fdd4db164204..e9b24fc0e3741e71bff3b3ddd4c60e9c598cd157 100644 (file)
@@ -95,6 +95,10 @@ struct _zend_compiler_globals {
        bool skip_shebang;
        bool increment_lineno;
 
+       bool variable_width_locale;   /* UTF-8, Shift-JIS, Big5, ISO 2022, EUC, etc */
+       bool ascii_compatible_locale; /* locale uses ASCII characters as singletons */
+                                     /* and doesn't use them as lead/trail units   */
+
        zend_string *doc_comment;
        uint32_t extra_fn_flags;
 
index 0cdb3aa0857e59496158b81a70ac709e10c10035..a23dad9e1e1d57d447a8c7881f34de3332d9befc 100644 (file)
 #include "zend_exceptions.h"
 #include "zend_closures.h"
 
+#include <locale.h>
+#ifdef HAVE_LANGINFO_H
+# include <langinfo.h>
+#endif
+
 #ifdef __SSE2__
 #include <emmintrin.h>
 #endif
 
+#if defined(ZEND_WIN32) && !defined(ZTS) && defined(_MSC_VER)
+/* This performance improvement of tolower() on Windows gives 10-18% on bench.php */
+#define ZEND_USE_TOLOWER_L 1
+#endif
+
 #ifdef ZEND_USE_TOLOWER_L
-#include <locale.h>
 static _locale_t current_locale = NULL;
 /* this is true global! may lead to strange effects on ZTS, but so may setlocale() */
 #define zend_tolower(c) _tolower_l(c, current_locale)
@@ -2537,13 +2546,85 @@ ZEND_API bool ZEND_FASTCALL zend_object_is_true(zval *op) /* {{{ */
 }
 /* }}} */
 
-#ifdef ZEND_USE_TOLOWER_L
 ZEND_API void zend_update_current_locale(void) /* {{{ Re-reads the active locale and caches its properties: the locale handle used by zend_tolower() (only with ZEND_USE_TOLOWER_L) and the CG(variable_width_locale) / CG(ascii_compatible_locale) flags. Takes no arguments and returns nothing. */
 {
+#ifdef ZEND_USE_TOLOWER_L
+# if defined(ZEND_WIN32) && defined(_MSC_VER)
        current_locale = _get_current_locale();
+# else
+       current_locale = uselocale(0);
+# endif
+#endif
+#if defined(ZEND_WIN32) && defined(_MSC_VER)
+       if (MB_CUR_MAX > 1) { /* multibyte locale on MSVC: classify it by its code page */
+               unsigned int cp = ___lc_codepage_func();
+               CG(variable_width_locale) = 1;
+               // TODO: EUC-* code pages may also be ASCII compatible; only UTF-8 is recognised here
+               CG(ascii_compatible_locale) =
+                       cp == 65001; /* 65001 is the UTF-8 code page */
+       } else {
+               CG(variable_width_locale) = 0; /* single-byte locale */
+               CG(ascii_compatible_locale) = 1;
+       }
+#elif defined(MB_CUR_MAX)
+       /* Check if the current locale uses a variable width encoding */
+       if (MB_CUR_MAX > 1) {
+#if HAVE_NL_LANGINFO
+               const char *charmap = nl_langinfo(CODESET);
+#else
+               char buf[16]; /* holds the codeset name when it has to be cut off before '@' */
+               const char *charmap = NULL;
+               const char *locale = setlocale(LC_CTYPE, NULL); /* NULL argument: query only, the locale is not changed */
+
+               if (locale) {
+                       const char *dot = strchr(locale, '.'); /* the codeset is taken to be the text after the first '.' */
+                       const char *modifier;
+
+                       if (dot) {
+                               dot++;
+                               modifier = strchr(dot, '@');
+                               if (!modifier) {
+                                       charmap = dot; /* no "@modifier" suffix: the rest of the string is the codeset */
+                               } else if (modifier - dot < sizeof(buf)) { /* copy only if codeset plus NUL fits; otherwise charmap stays NULL */
+                                       memcpy(buf, dot, modifier - dot);
+                    buf[modifier - dot] = '\0';
+                    charmap = buf;
+                               }
+                       }
+               }
+#endif
+               CG(variable_width_locale) = 1;
+               CG(ascii_compatible_locale) = 0; /* default for multibyte locales; set to 1 below only on a charmap match */
+
+               if (charmap) {
+                       size_t len = strlen(charmap);
+                       static const char *ascii_compatible_charmaps[] = {
+                               "utf-8",
+                               "utf8",
+                               // TODO: EUC-* charmaps may also be ASCII compatible; they are not listed yet
+                               NULL
+                       };
+                       const char **p;
+                       /* Check if the current locale is ASCII compatible (case-insensitive match against the list above) */
+                       for (p = ascii_compatible_charmaps; *p; p++) {
+                               if (zend_binary_strcasecmp(charmap, len, *p, strlen(*p)) == 0) {
+                                       CG(ascii_compatible_locale) = 1;
+                                       break;
+                               }
+                       }
+               }
+
+       } else {
+               CG(variable_width_locale) = 0; /* single-byte locale */
+               CG(ascii_compatible_locale) = 1;
+       }
+#else
+       /* We can't determine the current charset. Assume the worst case: variable width and not ASCII compatible */
+       CG(variable_width_locale) = 1;
+       CG(ascii_compatible_locale) = 0;
+#endif
 }
 /* }}} */
-#endif
 
 static zend_always_inline void zend_str_tolower_impl(char *dest, const char *str, size_t length) /* {{{ */ {
        unsigned char *p = (unsigned char*)str;
index b3ad598b74efdb9650bccbf0dda0cda97c80b952..8996a3d9599d43c95e39d1c5b570ec30f7df1af6 100644 (file)
@@ -450,16 +450,7 @@ ZEND_API zend_long ZEND_FASTCALL zend_atol(const char *str, size_t str_len);
 #define convert_to_object_ex(zv) convert_to_object(zv)
 #define convert_scalar_to_number_ex(zv) convert_scalar_to_number(zv)
 
-#if defined(ZEND_WIN32) && !defined(ZTS) && defined(_MSC_VER)
-/* This performance improvement of tolower() on Windows gives 10-18% on bench.php */
-#define ZEND_USE_TOLOWER_L 1
-#endif
-
-#ifdef ZEND_USE_TOLOWER_L
 ZEND_API void zend_update_current_locale(void);
-#else
-#define zend_update_current_locale()
-#endif
 
 /* The offset in bytes between the value and type fields of a zval */
 #define ZVAL_OFFSETOF_TYPE     \
index f6af763fd3fac13432c0423255cd1ff19056198c..d9185de80271845ae6e641db612d1ceb8672933e 100644 (file)
@@ -1465,66 +1465,96 @@ PHP_FUNCTION(strtolower)
 /* {{{ php_basename */
 PHPAPI zend_string *php_basename(const char *s, size_t len, const char *suffix, size_t suffix_len)
 {
-       /* State 0 is directly after a directory separator (or at the start of the string).
-        * State 1 is everything else. */
-       int state = 0;
-       const char *basename_start = s;
-       const char *basename_end = s;
-       while (len > 0) {
-               int inc_len = (*s == '\0' ? 1 : php_mblen(s, len));
+       const char *basename_start;
+       const char *basename_end;
+
+       if (CG(ascii_compatible_locale)) {
+#ifdef ZEND_WIN32
+               if ((len >= 2) && isalpha((int)((unsigned char *)s)[0]) && (s[1] == ':')) {
+                       s += 2;
+                       len -= 2;
+               }
+#endif
+
+               basename_end = s + len - 1;
+
+               /* Strip trailing slashes */
+               while (basename_end >= s && IS_SLASH_P(basename_end)) {
+                       basename_end--;
+               }
+               if (basename_end < s) {
+                       return ZSTR_EMPTY_ALLOC();
+               }
+
+               /* Extract filename */
+               basename_start = basename_end;
+               basename_end++;
+               while (basename_start > s && !IS_SLASH_P(basename_start - 1)) {
+                       basename_start--;
+               }
+       } else {
+               /* State 0 is directly after a directory separator (or at the start of the string).
+                * State 1 is everything else. */
+               int state = 0;
+
+               basename_start = s;
+               basename_end = s;
+               while (len > 0) {
+                       int inc_len = (*s == '\0' ? 1 : php_mblen(s, len));
 
-               switch (inc_len) {
-                       case 0:
-                               goto quit_loop;
-                       case 1:
+                       switch (inc_len) {
+                               case 0:
+                                       goto quit_loop;
+                               case 1:
 #if defined(PHP_WIN32)
-                               if (*s == '/' || *s == '\\') {
+                                       if (*s == '/' || *s == '\\') {
 #else
-                               if (*s == '/') {
+                                       if (*s == '/') {
 #endif
-                                       if (state == 1) {
-                                               state = 0;
-                                               basename_end = s;
-                                       }
+                                               if (state == 1) {
+                                                       state = 0;
+                                                       basename_end = s;
+                                               }
 #if defined(PHP_WIN32)
-                               /* Catch relative paths in c:file.txt style. They're not to confuse
-                                  with the NTFS streams. This part ensures also, that no drive
-                                  letter traversing happens. */
-                               } else if ((*s == ':' && (s - basename_start == 1))) {
-                                       if (state == 0) {
-                                               basename_start = s;
-                                               state = 1;
+                                       /* Catch relative paths in c:file.txt style. They're not to confuse
+                                          with the NTFS streams. This part ensures also, that no drive
+                                          letter traversing happens. */
+                                       } else if ((*s == ':' && (s - basename_start == 1))) {
+                                               if (state == 0) {
+                                                       basename_start = s;
+                                                       state = 1;
+                                               } else {
+                                                       basename_end = s;
+                                                       state = 0;
+                                               }
+#endif
                                        } else {
-                                               basename_end = s;
-                                               state = 0;
+                                               if (state == 0) {
+                                                       basename_start = s;
+                                                       state = 1;
+                                               }
+                                       }
+                                       break;
+                               default:
+                                       if (inc_len < 0) {
+                                               /* If character is invalid, treat it like other non-significant characters. */
+                                               inc_len = 1;
+                                               php_mb_reset();
                                        }
-#endif
-                               } else {
                                        if (state == 0) {
                                                basename_start = s;
                                                state = 1;
                                        }
-                               }
-                               break;
-                       default:
-                               if (inc_len < 0) {
-                                       /* If character is invalid, treat it like other non-significant characters. */
-                                       inc_len = 1;
-                                       php_mb_reset();
-                               }
-                               if (state == 0) {
-                                       basename_start = s;
-                                       state = 1;
-                               }
-                               break;
+                                       break;
+                       }
+                       s += inc_len;
+                       len -= inc_len;
                }
-               s += inc_len;
-               len -= inc_len;
-       }
 
 quit_loop:
-       if (state == 1) {
-               basename_end = s;
+               if (state == 1) {
+                       basename_end = s;
+               }
        }
 
        if (suffix != NULL && suffix_len < (size_t)(basename_end - basename_start) &&
@@ -4604,7 +4634,6 @@ static zend_string *try_setlocale_str(zend_long cat, zend_string *loc) {
                retval = setlocale(cat, NULL);
        }
 # endif
-       zend_update_current_locale();
        if (!retval) {
                return NULL;
        }
@@ -4615,6 +4644,7 @@ static zend_string *try_setlocale_str(zend_long cat, zend_string *loc) {
 
                BG(locale_changed) = 1;
                if (cat == LC_CTYPE || cat == LC_ALL) {
+                       zend_update_current_locale();
                        if (BG(ctype_string)) {
                                zend_string_release_ex(BG(ctype_string), 0);
                        }