]> granicus.if.org Git - php/commitdiff
- Fix strrpos() logic (which was broken even in PHP 5).
authorAndrei Zmievski <andrei@php.net>
Mon, 2 Oct 2006 19:18:14 +0000 (19:18 +0000)
committerAndrei Zmievski <andrei@php.net>
Mon, 2 Oct 2006 19:18:14 +0000 (19:18 +0000)
- Optimizations for a couple of functions.

ext/standard/string.c
unicode-progress.txt

index a4f99147211a1193148754391fd7a3426f1c5a90..48caf27e43883f3cc47a8e2c88e7439e114d1b35 100644 (file)
@@ -2372,14 +2372,8 @@ PHP_FUNCTION(stristr)
                                php_error(E_WARNING, "Needle argument codepoint value out of range (0 - 0x10FFFF)");
                                RETURN_FALSE;
                        }
-                       if (U_IS_BMP(Z_LVAL_PP(needle))) {
-                               u_needle_char[needle_len++] = (UChar)Z_LVAL_PP(needle);
-                               u_needle_char[needle_len]   = 0;
-                       } else {
-                               u_needle_char[needle_len++] = (UChar)U16_LEAD(Z_LVAL_PP(needle));
-                               u_needle_char[needle_len++] = (UChar)U16_TRAIL(Z_LVAL_PP(needle));
-                               u_needle_char[needle_len]   = 0;
-                       }
+                       needle_len = zend_codepoint_to_uchar((UChar32)Z_LVAL_PP(needle), u_needle_char);
+                       u_needle_char[needle_len] = 0;
                        target.u = u_needle_char;
                } else {
                        needle_char[needle_len++] = (char)Z_LVAL_PP(needle);
@@ -2426,7 +2420,7 @@ PHP_FUNCTION(stristr)
    Finds first occurrence of a string within another */
 PHP_FUNCTION(strstr)
 {
-       void *haystack;
+       zstr haystack;
        int haystack_len;
        zend_uchar haystack_type;
        zval **needle;
@@ -2450,16 +2444,16 @@ PHP_FUNCTION(strstr)
                /* haystack type determines the needle type */
                if (haystack_type == IS_UNICODE) {
                        convert_to_unicode_ex(needle);
-                       found = zend_u_memnstr((UChar*)haystack,
+                       found = zend_u_memnstr(haystack.u,
                                                                   Z_USTRVAL_PP(needle),
                                                                   Z_USTRLEN_PP(needle),
-                                                                  (UChar*)haystack + haystack_len);
+                                                                  haystack.u + haystack_len);
                } else {
                        convert_to_string_ex(needle);
-                       found = php_memnstr((char*)haystack,
+                       found = php_memnstr(haystack.s,
                                                                Z_STRVAL_PP(needle),
                                                                Z_STRLEN_PP(needle),
-                                                               (char*)haystack + haystack_len);
+                                                               haystack.s + haystack_len);
                }
        } else {
                convert_to_long_ex(needle);
@@ -2468,39 +2462,33 @@ PHP_FUNCTION(strstr)
                                php_error(E_WARNING, "Needle argument codepoint value out of range (0 - 0x10FFFF)");
                                RETURN_FALSE;
                        }
-                       /* supplementary codepoint values may require 2 UChar's */
-                       if (U_IS_BMP(Z_LVAL_PP(needle))) {
-                               u_needle_char[n_len++] = (UChar) Z_LVAL_PP(needle);
-                               u_needle_char[n_len]   = 0;
-                       } else {
-                               u_needle_char[n_len++] = (UChar) U16_LEAD(Z_LVAL_PP(needle));
-                               u_needle_char[n_len++] = (UChar) U16_TRAIL(Z_LVAL_PP(needle));
-                               u_needle_char[n_len]   = 0;
-                       }
 
-                       found = zend_u_memnstr((UChar*)haystack,
+                       n_len = zend_codepoint_to_uchar((UChar32)Z_LVAL_PP(needle), u_needle_char);
+                       u_needle_char[n_len] = 0;
+
+                       found = zend_u_memnstr(haystack.u,
                                                                   u_needle_char,
                                                                   n_len,
-                                                                  (UChar*)haystack + haystack_len);
+                                                                  haystack.u + haystack_len);
                } else {
                        needle_char[0] = (char) Z_LVAL_PP(needle);
                        needle_char[1] = 0;
 
-                       found = php_memnstr((char*)haystack,
+                       found = php_memnstr(haystack.s,
                                                                needle_char,
                                                                1,
-                                                               (char*)haystack + haystack_len);
+                                                               haystack.s + haystack_len);
                }
        }
 
        if (found) {
                switch (haystack_type) {
                        case IS_UNICODE:
-                               found_offset = (UChar*)found - (UChar*)haystack;
+                               found_offset = (UChar*)found - haystack.u;
                                if (part) {
                                        UChar *ret;
                                        ret = eumalloc(found_offset + 1);
-                                       u_strncpy(ret, haystack, found_offset);
+                                       u_strncpy(ret, haystack.u, found_offset);
                                        ret[found_offset] = '\0';
                                        RETURN_UNICODEL(ret , found_offset, 0);
                                } else {
@@ -2509,11 +2497,11 @@ PHP_FUNCTION(strstr)
                                break;
 
                        case IS_STRING:
-                               found_offset = (char *)found - (char *)haystack;
+                               found_offset = (char *)found - haystack.s;
                                if (part) {
                                        char *ret;
                                        ret = emalloc(found_offset + 1);
-                                       strncpy(ret, haystack, found_offset);
+                                       strncpy(ret, haystack.s, found_offset);
                                        ret[found_offset] = '\0';
                                        RETURN_STRINGL(ret , found_offset, 0);
                                } else {
@@ -2535,7 +2523,7 @@ PHP_FUNCTION(strstr)
    Finds position of first occurrence of a string within another */
 PHP_FUNCTION(strpos)
 {
-       void *haystack;
+       zstr haystack;
        int haystack_len;
        zend_uchar haystack_type;
        zval **needle;
@@ -2544,6 +2532,7 @@ PHP_FUNCTION(strpos)
        char  needle_char[2];
        UChar u_needle_char[3];
        int n_len = 0;
+       int32_t cu_offset = 0;
 
        if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "tZ|l", &haystack,
                                                          &haystack_len, &haystack_type, &needle, &offset) == FAILURE) {
@@ -2551,7 +2540,7 @@ PHP_FUNCTION(strpos)
        }
 
        /*
-        * Unicode note: it's okay to not convert offset to codepoint offset here.
+        * Unicode note: it's okay to not convert offset to code unit offset here.
         * We'll just do a rough check that the offset does not exceed length in
         * code units, and leave the rest to zend_u_memnstr().
         */
@@ -2568,25 +2557,23 @@ PHP_FUNCTION(strpos)
 
                /* haystack type determines the needle type */
                if (haystack_type == IS_UNICODE) {
-                       int32_t cp_offset = 0;
                        convert_to_unicode_ex(needle);
-                       /* locate the codepoint at the specified offset */
-                       U16_FWD_N((UChar*)haystack, cp_offset, haystack_len, offset);
-                       found = zend_u_memnstr((UChar*)haystack + cp_offset,
+                       /* calculate code unit offset */
+                       U16_FWD_N(haystack.u, cu_offset, haystack_len, offset);
+                       found = zend_u_memnstr(haystack.u + cu_offset,
                                                                   Z_USTRVAL_PP(needle),
                                                                   Z_USTRLEN_PP(needle),
-                                                                  (UChar*)haystack + haystack_len);
+                                                                  haystack.u + haystack_len);
                } else {
                        convert_to_string_ex(needle);
-                       found = php_memnstr((char*)haystack + offset,
+                       found = php_memnstr(haystack.s + offset,
                                                                Z_STRVAL_PP(needle),
                                                                Z_STRLEN_PP(needle),
-                                                               (char*)haystack + haystack_len);
+                                                               haystack.s + haystack_len);
                }
        } else {
                convert_to_long_ex(needle);
                if (haystack_type == IS_UNICODE) {
-                       int32_t cp_offset = 0;
                        if (Z_LVAL_PP(needle) < 0 || Z_LVAL_PP(needle) > 0x10FFFF) {
                                php_error(E_WARNING, "Needle argument codepoint value out of range (0 - 0x10FFFF)");
                                RETURN_FALSE;
@@ -2594,30 +2581,33 @@ PHP_FUNCTION(strpos)
                        n_len += zend_codepoint_to_uchar(Z_LVAL_PP(needle), u_needle_char);
                        u_needle_char[n_len] = 0;
 
-                       /* locate the codepoint at the specified offset */
-                       U16_FWD_N((UChar*)haystack, cp_offset, haystack_len, offset);
-                       found = zend_u_memnstr((UChar*)haystack + cp_offset,
+                       /* calculate code unit offset */
+                       U16_FWD_N(haystack.u, cu_offset, haystack_len, offset);
+                       found = zend_u_memnstr(haystack.u + cu_offset,
                                                                   u_needle_char,
                                                                   n_len,
-                                                                  (UChar*)haystack + haystack_len);
+                                                                  haystack.u + haystack_len);
                } else {
                        needle_char[0] = (char) Z_LVAL_PP(needle);
                        needle_char[1] = 0;
 
-                       found = php_memnstr((char*)haystack + offset,
+                       found = php_memnstr(haystack.s + offset,
                                                                needle_char,
                                                                1,
-                                                               (char*)haystack + haystack_len);
+                                                               haystack.s + haystack_len);
                }
        }
 
        if (found) {
                if (haystack_type == IS_UNICODE) {
-                       /* simple subtraction will not suffice, since there may be
-                          supplementary codepoints */
-                       RETURN_LONG(u_countChar32(haystack, ((char *)found - (char *)haystack)/sizeof(UChar)));
+                       /* Simple subtraction will not suffice, since there may be
+                          supplementary codepoints. We count how many codepoints there are
+                          between the starting offset and the found location and add them
+                          to the starting codepoint offset. */
+                       RETURN_LONG(offset + u_countChar32(haystack.u + cu_offset,
+                                                                                          (UChar*)found - (haystack.u + cu_offset)));
                } else {
-                       RETURN_LONG((char *)found - (char *)haystack);
+                       RETURN_LONG((char *)found - haystack.s);
                }
        } else {
                RETURN_FALSE;
@@ -2636,7 +2626,7 @@ PHP_FUNCTION(stripos)
        void *haystack_dup = NULL, *needle_dup = NULL;
        char needle_char[2];
        char c = 0;
-       UChar u_needle_char[8];
+       UChar u_needle_char[3];
        void *found = NULL;
        int cu_offset = 0;
 
@@ -2647,6 +2637,12 @@ PHP_FUNCTION(stripos)
        if (Z_TYPE_PP(haystack) != IS_UNICODE && Z_TYPE_PP(haystack) != IS_STRING) {
                convert_to_text_ex(haystack);
        }
+
+       /*
+        * Unicode note: it's okay to not convert offset to code unit offset here.
+        * We'll just do a rough check that the offset does not exceed length in
+        * code units, and leave the rest to zend_u_memnstr().
+        */
        if (offset < 0 || offset > Z_UNILEN_PP(haystack)) {
                php_error_docref(NULL TSRMLS_CC, E_WARNING, "Offset not contained in string.");
                RETURN_FALSE;
@@ -2670,7 +2666,7 @@ PHP_FUNCTION(stripos)
                }
                needle_len = Z_UNILEN_PP(needle);
                if (Z_TYPE_PP(haystack) == IS_UNICODE) {
-                       /* calculate codeunit offset */
+                       /* calculate code unit offset */
                        U16_FWD_N(Z_USTRVAL_PP(haystack), cu_offset, haystack_len, offset);
                        found = php_u_stristr(Z_USTRVAL_PP(haystack) + cu_offset, Z_USTRVAL_PP(needle), haystack_len, needle_len TSRMLS_CC);
                } else {
@@ -2683,44 +2679,20 @@ PHP_FUNCTION(stripos)
                                                                (char *)haystack_dup + haystack_len);
                }
        } else {
-               switch (Z_TYPE_PP(needle)) {
-                       case IS_LONG:
-                       case IS_BOOL:
-                               if (Z_TYPE_PP(haystack) == IS_UNICODE) {
-                                       if (Z_LVAL_PP(needle) < 0 || Z_LVAL_PP(needle) > 0x10FFFF) {
-                                               php_error(E_WARNING, "Needle argument codepoint value out of range (0 - 0x10FFFF)");
-                                               RETURN_FALSE;      
-                                       }
-                                       needle_len = zend_codepoint_to_uchar((UChar32)Z_LVAL_PP(needle), u_needle_char);
-                               } else {
-                                       c = tolower((char)Z_LVAL_PP(needle));
-                               }
-                               break;
-                       case IS_DOUBLE:
-                               if (Z_TYPE_PP(haystack) == IS_UNICODE) {
-                                       if ((UChar32)Z_DVAL_PP(needle) < 0 || (UChar32)Z_DVAL_PP(needle) > 0x10FFFF) {
-                                               php_error(E_WARNING, "Needle argument codepoint value out of range (0 - 0x10FFFF)");
-                                               RETURN_FALSE;      
-                                       }
-                                       needle_len = zend_codepoint_to_uchar((UChar32)Z_DVAL_PP(needle), u_needle_char);
-                               } else {
-                                       c = tolower((char)Z_DVAL_PP(needle));
-                               }
-                               break;
-                       default:
-                               php_error_docref(NULL TSRMLS_CC, E_WARNING, "needle is not a string or an integer.");
-                               RETURN_FALSE;
-                               break;
-
-               }
+               convert_to_long_ex(needle);
                if (Z_TYPE_PP(haystack) == IS_UNICODE) {
-                       /* calculate codeunit offset */
-                       U16_FWD_N(Z_USTRVAL_PP(haystack), cu_offset, haystack_len, offset);
+                       if (Z_LVAL_PP(needle) < 0 || Z_LVAL_PP(needle) > 0x10FFFF) {
+                               php_error(E_WARNING, "Needle argument codepoint value out of range (0 - 0x10FFFF)");
+                               RETURN_FALSE;      
+                       }
+                       needle_len = zend_codepoint_to_uchar((UChar32)Z_LVAL_PP(needle), u_needle_char);
                        u_needle_char[needle_len] = 0;
+                       /* calculate code unit offset */
+                       U16_FWD_N(Z_USTRVAL_PP(haystack), cu_offset, haystack_len, offset);
                        found = php_u_stristr(Z_USTRVAL_PP(haystack) + cu_offset,
                                                                  u_needle_char, haystack_len, needle_len TSRMLS_CC);
-                                                                  
                } else {
+                       c = tolower((char)Z_LVAL_PP(needle));
                        needle_char[0] = c;
                        needle_char[1] = '\0';
                        haystack_dup = estrndup(Z_STRVAL_PP(haystack), haystack_len);
@@ -2767,6 +2739,7 @@ PHP_FUNCTION(strrpos)
        long offset = 0;
        char *p, *e, ord_needle[2];
        UChar *pos, *u_p, *u_e, u_ord_needle[3];
+       int cu_offset = 0;
 
        if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ZZ|l",
                                                          &zhaystack, &zneedle, &offset) == FAILURE) {
@@ -2790,14 +2763,8 @@ PHP_FUNCTION(strrpos)
                                php_error(E_WARNING, "Needle argument codepoint value out of range (0 - 0x10FFFF)");
                                RETURN_FALSE;
                        }
-                       if (U_IS_BMP(Z_LVAL_PP(zneedle))) {
-                               u_ord_needle[needle_len++] = (UChar)Z_LVAL_PP(zneedle);
-                               u_ord_needle[needle_len]   = 0;
-                       } else {
-                               u_ord_needle[needle_len++] = (UChar)U16_LEAD(Z_LVAL_PP(zneedle));
-                               u_ord_needle[needle_len++] = (UChar)U16_TRAIL(Z_LVAL_PP(zneedle));
-                               u_ord_needle[needle_len]   = 0;
-                       }
+                       needle_len = zend_codepoint_to_uchar((UChar32)Z_LVAL_PP(zneedle), u_ord_needle);
+                       u_ord_needle[needle_len] = 0;
                        needle.u = u_ord_needle;
                } else {
                        convert_to_long_ex(zneedle);
@@ -2810,40 +2777,60 @@ PHP_FUNCTION(strrpos)
        haystack = Z_UNIVAL_PP(zhaystack);
        haystack_len = Z_UNILEN_PP(zhaystack);
 
-       if ((haystack_len == 0) || (needle_len == 0)) {
+       if ((haystack_len == 0) || (needle_len == 0) || needle_len > haystack_len) {
                RETURN_FALSE;
        }
 
        if (Z_TYPE_PP(zhaystack) == IS_UNICODE) {
                if (offset >= 0) {
-                       u_p = haystack.u + offset;
+                       U16_FWD_N(haystack.u, cu_offset, haystack_len, offset);
+                       if (cu_offset > haystack_len - needle_len) {
+                               RETURN_FALSE;
+                       }
+                       u_p = haystack.u + cu_offset;
                        u_e = haystack.u + haystack_len - needle_len;
                } else {
                        u_p = haystack.u;
                        if (-offset > haystack_len) {
-                               u_e = haystack.u - needle_len;
-                       } else if (needle_len > -offset) {
-                               u_e = haystack.u + haystack_len - needle_len;
+                               RETURN_FALSE;
                        } else {
-                               u_e = haystack.u + haystack_len + offset;
+                               cu_offset = haystack_len;
+                               U16_BACK_N(haystack.u, 0, cu_offset, -offset);
+                               if (cu_offset == 0) {
+                                       RETURN_FALSE;
+                               }
+                               if (needle_len > haystack_len - cu_offset) {
+                                       u_e = haystack.u + haystack_len - needle_len;
+                               } else {
+                                       u_e = haystack.u + cu_offset;
+                               }
                        }
                }
 
                pos = u_strFindLast(u_p, u_e-u_p+needle_len, needle.u, needle_len);
                if (pos) {
-                       RETURN_LONG(pos - haystack.u);
+                       if (offset > 0) {
+                               RETURN_LONG(offset + u_countChar32(u_p, (UChar*)pos - u_p));
+                       } else {
+                               RETURN_LONG(u_countChar32(haystack.u, (UChar*)pos - haystack.u));
+                       }
                } else {
                        RETURN_FALSE;
                }
        } else {
                if (offset >= 0) {
+                       if (offset > haystack_len) {
+                               RETURN_FALSE;
+                       }
                        p = haystack.s + offset;
                        e = haystack.s + haystack_len - needle_len;
                } else {
-                       p = haystack.s;
                        if (-offset > haystack_len) {
-                               e = haystack.s - needle_len;
-                       } else if (needle_len > -offset) {
+                               RETURN_FALSE;
+                       }
+
+                       p = haystack.s;
+                       if (needle_len > -offset) {
                                e = haystack.s + haystack_len - needle_len;
                        } else {
                                e = haystack.s + haystack_len + offset;
index 017025f5808d514ef759f12539bcaa306e994d65..dddd13c6daec5d96267a5f5f948d0bc7eca1d266 100644 (file)
@@ -26,7 +26,6 @@ ext/standard
     sscanf()
         Params API. Rest - no idea yet.
 
-    stristr()
     strripos()
     str_replace()
     stri_replace()
@@ -158,6 +157,7 @@ ext/standard
     stripcslashes()
     stripslashes()
     stripos()
+    stristr()
     strpbrk()
     strpos()
     strrchr()