granicus.if.org Git - php/commitdiff
PCRE unicode/binary support
author: Dmitry Stogov <dmitry@php.net>
Fri, 6 Jul 2007 23:06:51 +0000 (23:06 +0000)
committer: Dmitry Stogov <dmitry@php.net>
Fri, 6 Jul 2007 23:06:51 +0000 (23:06 +0000)
ext/pcre/php_pcre.c
ext/pcre/php_pcre.h
ext/pcre/tests/bug27103.phpt
ext/pcre/tests/bug40909.phpt
ext/pcre/tests/invalid_utf8.phpt
ext/spl/spl_iterators.c
win32/sendmail.c

index 2c828f7646701792f6326d8cc4bddd83ba8daaf0..f008c5f34bcbe470a4d9f2ddf94b06d25ce84160 100644 (file)
@@ -191,7 +191,7 @@ static int pcre_clean_cache(void *data, void *arg TSRMLS_DC)
 
 /* {{{ pcre_get_compiled_regex_cache
  */
-PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(char *regex, int regex_len TSRMLS_DC)
+PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(zend_uchar utype, char *regex, int regex_len TSRMLS_DC)
 {
        pcre                            *re = NULL;
        pcre_extra                      *extra;
@@ -333,7 +333,7 @@ PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(char *regex, int regex_le
                }
        }
 
-       if (UG(unicode)) {
+       if (utype == IS_UNICODE) {
                coptions |= PCRE_UTF8;
        }
 
@@ -405,7 +405,7 @@ PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(char *regex, int regex_le
  */
 PHPAPI pcre* pcre_get_compiled_regex(char *regex, pcre_extra **extra, int *preg_options TSRMLS_DC)
 {
-       pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex, strlen(regex) TSRMLS_CC);
+       pcre_cache_entry * pce = pcre_get_compiled_regex_cache(ZEND_STR_TYPE, regex, strlen(regex) TSRMLS_CC);
 
        if (extra) {
                *extra = pce ? pce->extra : NULL;
@@ -422,7 +422,7 @@ PHPAPI pcre* pcre_get_compiled_regex(char *regex, pcre_extra **extra, int *preg_
  */
 PHPAPI pcre* pcre_get_compiled_regex_ex(char *regex, pcre_extra **extra, int *preg_options, int *compile_options TSRMLS_DC)
 {
-       pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex, strlen(regex) TSRMLS_CC);
+       pcre_cache_entry * pce = pcre_get_compiled_regex_cache(ZEND_STR_TYPE, regex, strlen(regex) TSRMLS_CC);
        
        if (extra) {
                *extra = pce ? pce->extra : NULL;
@@ -439,7 +439,7 @@ PHPAPI pcre* pcre_get_compiled_regex_ex(char *regex, pcre_extra **extra, int *pr
 /* }}} */
 
 /* {{{ add_offset_pair */
-static inline void add_offset_pair(zval *result, char *str, int len, int offset, char *name, offset_map_t *prev TSRMLS_DC)
+static inline void add_offset_pair(zval *result, zend_uchar utype, char *str, int len, int offset, char *name, offset_map_t *prev TSRMLS_DC)
 {
        zval *match_pair;
        int tmp;
@@ -449,16 +449,25 @@ static inline void add_offset_pair(zval *result, char *str, int len, int offset,
        INIT_PZVAL(match_pair);
 
        /* Add (match, offset) to the return value */
-       add_next_index_utf8_stringl(match_pair, str, len, 1);
+       if (utype == IS_UNICODE) {
+               add_next_index_utf8_stringl(match_pair, str, len, 1);
+       } else {
+               add_next_index_stringl(match_pair, str, len, 1);
+       }
 
        /* Calculate codepoint offset from the previous chunk */
        if (offset) {
-               tmp = prev->byte_offset;
-               while (tmp < offset) {
-                       U8_FWD_1(prev->str, tmp, offset);
-                       prev->cp_offset++;
-               }
-               prev->byte_offset = tmp;
+               if (utype == IS_UNICODE) {
+                       tmp = prev->byte_offset;
+                       while (tmp < offset) {
+                               U8_FWD_1(prev->str, tmp, offset);
+                               prev->cp_offset++;
+                       }
+                       prev->byte_offset = tmp;
+               } else {
+                       prev->cp_offset = offset;
+                       prev->byte_offset = offset;
+               }               
        }
        add_next_index_long(match_pair, prev->cp_offset);
        
@@ -511,7 +520,7 @@ static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global)
        }
        
        /* Compile regex or get it from cache. */
-       if ((pce = pcre_get_compiled_regex_cache(regex.s, regex_len TSRMLS_CC)) == NULL) {
+       if ((pce = pcre_get_compiled_regex_cache(str_type, regex.s, regex_len TSRMLS_CC)) == NULL) {
                if (str_type == IS_UNICODE) {
                        efree(regex_utf8);
                        efree(subject_utf8);
@@ -519,7 +528,7 @@ static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global)
                RETURN_FALSE;
        }
 
-       php_pcre_match_impl(pce, subject.s, subject_len, return_value, subpats, 
+       php_pcre_match_impl(pce, str_type, subject.s, subject_len, return_value, subpats, 
                global, ZEND_NUM_ARGS() >= 4, flags, start_offset TSRMLS_CC);
 
        if (str_type == IS_UNICODE) {
@@ -530,7 +539,7 @@ static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global)
 /* }}} */
 
 /* {{{ php_pcre_match_impl */
-PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *return_value,
+PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_uchar utype, char *subject, int subject_len, zval *return_value,
        zval *subpats, int global, int use_flags, long flags, long start_offset TSRMLS_DC)
 {
        zval                    *result_set,            /* Holds a set of subpatterns after
@@ -580,7 +589,7 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, char *subject, int subjec
                offset_capture = 0;
        }
 
-       if (UG(unicode)) {
+       if (utype == IS_UNICODE) {
                int k = 0;
                /* Calculate byte offset from codepoint offset */
                if (start_offset < 0) {
@@ -590,7 +599,6 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, char *subject, int subjec
                        U8_FWD_N(subject, k, subject_len, start_offset);
                }
                start_offset = k;
-               exoptions |= PCRE_NO_UTF8_CHECK;
        } else {
                /* Negative offset counts from the end of the string. */
                if (start_offset < 0) {
@@ -599,6 +607,9 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, char *subject, int subjec
                                start_offset = 0;
                        }
                }
+               if (!(pce->compile_options & PCRE_UTF8)) {
+                       exoptions |= PCRE_NO_UTF8_CHECK;
+               }
        }
 
        if (extra == NULL) {
@@ -712,11 +723,14 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, char *subject, int subjec
                                                /* For each subpattern, insert it into the appropriate array. */
                                                for (i = 0; i < count; i++) {
                                                        if (offset_capture) {
-                                                               add_offset_pair(match_sets[i], (char *)stringlist[i],
+                                                               add_offset_pair(match_sets[i], utype, (char *)stringlist[i],
                                                                                                offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], NULL, &map TSRMLS_CC);
-                                                       } else {
+                                                       } else if (utype == IS_UNICODE) {
                                                                add_next_index_utf8_stringl(match_sets[i], (char *)stringlist[i],
                                                                                                                        offsets[(i<<1)+1] - offsets[i<<1], 1);
+                                                       } else {
+                                                               add_next_index_stringl(match_sets[i], (char *)stringlist[i],
+                                                                                                               offsets[(i<<1)+1] - offsets[i<<1], 1);
                                                        }
                                                }
                                                /*
@@ -744,16 +758,23 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, char *subject, int subjec
                                                /* Add all the subpatterns to it */
                                                for (i = 0; i < count; i++) {
                                                        if (offset_capture) {
-                                                               add_offset_pair(result_set, (char *)stringlist[i],
+                                                               add_offset_pair(result_set, utype, (char *)stringlist[i],
                                                                                                offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1],
                                                                                                subpat_names[i], &map TSRMLS_CC);
-                                                       } else {
+                                                       } else if (utype == IS_UNICODE) {
                                                                if (subpat_names[i]) {
-                                                                       add_assoc_utf8_stringl(result_set, subpat_names[i], (char *)stringlist[i],
+                                                                       add_utf8_assoc_utf8_stringl(result_set, subpat_names[i], (char *)stringlist[i],
                                                                                                                   offsets[(i<<1)+1] - offsets[i<<1], 1);
                                                                }
                                                                add_next_index_utf8_stringl(result_set, (char *)stringlist[i],
                                                                                                                        offsets[(i<<1)+1] - offsets[i<<1], 1);
+                                                       } else {
+                                                               if (subpat_names[i]) {
+                                                                       add_rt_assoc_stringl(result_set, subpat_names[i], (char *)stringlist[i],
+                                                                                                          offsets[(i<<1)+1] - offsets[i<<1], 1);
+                                                               }
+                                                               add_next_index_stringl(result_set, (char *)stringlist[i],
+                                                                                                               offsets[(i<<1)+1] - offsets[i<<1], 1);
                                                        }
                                                }
                                                /* And add it to the output array */
@@ -763,16 +784,23 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, char *subject, int subjec
                                        /* For each subpattern, insert it into the subpatterns array. */
                                        for (i = 0; i < count; i++) {
                                                if (offset_capture) {
-                                                       add_offset_pair(subpats, (char *)stringlist[i],
+                                                       add_offset_pair(subpats, utype, (char *)stringlist[i],
                                                                                        offsets[(i<<1)+1] - offsets[i<<1],
                                                                                        offsets[i<<1], subpat_names[i], &map TSRMLS_CC);
-                                               } else {
+                                               } else if (utype == IS_UNICODE) {
                                                        if (subpat_names[i]) {
-                                                               add_assoc_utf8_stringl(subpats, subpat_names[i], (char *)stringlist[i],
+                                                               add_utf8_assoc_utf8_stringl(subpats, subpat_names[i], (char *)stringlist[i],
                                                                                                           offsets[(i<<1)+1] - offsets[i<<1], 1);
                                                        }
                                                        add_next_index_utf8_stringl(subpats, (char *)stringlist[i],
                                                                                                                offsets[(i<<1)+1] - offsets[i<<1], 1);
+                                               } else {
+                                                       if (subpat_names[i]) {
+                                                               add_rt_assoc_stringl(subpats, subpat_names[i], (char *)stringlist[i],
+                                                                                                  offsets[(i<<1)+1] - offsets[i<<1], 1);
+                                                       }
+                                                       add_next_index_stringl(subpats, (char *)stringlist[i],
+                                                                                                       offsets[(i<<1)+1] - offsets[i<<1], 1);
                                                }
                                        }
                                }
@@ -786,7 +814,7 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, char *subject, int subjec
                           to achieve this, unless we're already at the end of the string. */
                        if (g_notempty != 0 && start_offset < subject_len) {
                                offsets[0] = start_offset;
-                               if (UG(unicode) || pce->compile_options & PCRE_UTF8) {
+                               if (utype == IS_UNICODE || pce->compile_options & PCRE_UTF8) {
                                        offsets[1] = start_offset;
                                        U8_FWD_1(subject, offsets[1], subject_len);
                                } else {
@@ -1026,7 +1054,8 @@ static int preg_do_eval(char *eval_str, int eval_str_len, char *subject,
 
 /* {{{ php_pcre_replace
  */
-PHPAPI char *php_pcre_replace(char *regex,   int regex_len,
+PHPAPI char *php_pcre_replace(zend_uchar utype,
+                                                         char *regex,   int regex_len,
                                                          char *subject, int subject_len,
                                                          zval *replace_val, int is_callable_replace,
                                                          int *result_len, int limit, int *replace_count TSRMLS_DC)
@@ -1034,17 +1063,17 @@ PHPAPI char *php_pcre_replace(char *regex,   int regex_len,
        pcre_cache_entry        *pce;                       /* Compiled regular expression */
 
        /* Compile regex or get it from cache. */
-       if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) == NULL) {
+       if ((pce = pcre_get_compiled_regex_cache(utype, regex, regex_len TSRMLS_CC)) == NULL) {
                return NULL;
        }
 
-       return php_pcre_replace_impl(pce, subject, subject_len, replace_val, 
+       return php_pcre_replace_impl(pce, utype, subject, subject_len, replace_val, 
                is_callable_replace, result_len, limit, replace_count TSRMLS_CC);
 }
 /* }}} */
 
 /* {{{ php_pcre_replace_impl() */
-PHPAPI char *php_pcre_replace_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *replace_val, 
+PHPAPI char *php_pcre_replace_impl(pcre_cache_entry *pce, zend_uchar utype, char *subject, int subject_len, zval *replace_val, 
        int is_callable_replace, int *result_len, int limit, int *replace_count TSRMLS_DC)
 {
        pcre_extra              *extra = pce->extra;/* Holds results of studying */
@@ -1112,7 +1141,7 @@ PHPAPI char *php_pcre_replace_impl(pcre_cache_entry *pce, char *subject, int sub
        start_offset = 0;
        PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
 
-       if (UG(unicode)) {
+       if (utype != IS_UNICODE && !(pce->compile_options & PCRE_UTF8)) {
                exoptions |= PCRE_NO_UTF8_CHECK;
        }
        
@@ -1227,7 +1256,7 @@ PHPAPI char *php_pcre_replace_impl(pcre_cache_entry *pce, char *subject, int sub
                           to achieve this, unless we're already at the end of the string. */
                        if (g_notempty != 0 && start_offset < subject_len) {
                                offsets[0] = start_offset;
-                               if (UG(unicode) || pce->compile_options & PCRE_UTF8) {
+                               if (utype == IS_UNICODE || pce->compile_options & PCRE_UTF8) {
                                        offsets[1] = start_offset;
                                        U8_FWD_1(subject, offsets[1], subject_len);
                                } else {
@@ -1284,8 +1313,10 @@ static char *php_replace_in_subject(zval *regex, zval *replace, zval **subject,
        char            *subject_value,
                                *result;
        int                      subject_len;
+       zend_uchar   utype;
 
        /* Make sure we're dealing with strings. */     
+       utype = Z_TYPE_PP(subject);
        convert_to_string_with_converter_ex(subject, UG(utf8_conv));
 
        ZVAL_STRINGL(&empty_replace, "", 0, 0);
@@ -1325,7 +1356,8 @@ static char *php_replace_in_subject(zval *regex, zval *replace, zval **subject,
                        
                        /* Do the actual replacement and put the result back into subject_value
                           for further replacements. */
-                       if ((result = php_pcre_replace(Z_STRVAL_PP(regex_entry),
+                       if ((result = php_pcre_replace(utype,
+                                                                                  Z_STRVAL_PP(regex_entry),
                                                                                   Z_STRLEN_PP(regex_entry),
                                                                                   subject_value,
                                                                                   subject_len,
@@ -1344,7 +1376,8 @@ static char *php_replace_in_subject(zval *regex, zval *replace, zval **subject,
 
                return subject_value;
        } else {
-               result = php_pcre_replace(Z_STRVAL_P(regex),
+               result = php_pcre_replace(utype,
+                                                                 Z_STRVAL_P(regex),
                                                                  Z_STRLEN_P(regex),
                                                                  Z_STRVAL_PP(subject),
                                                                  Z_STRLEN_PP(subject),
@@ -1375,7 +1408,8 @@ static void preg_replace_impl(INTERNAL_FUNCTION_PARAMETERS, zend_bool is_callabl
        ulong                    num_key;
        zval                     callback_name;
        int                              replace_count=0;
-       int                             *replace_count_ptr=NULL; 
+       int                             *replace_count_ptr=NULL;
+       zend_uchar       utype;
        
        if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "z/z/z/|lz", &regex,
                                                          &replace, &subject, &limit, &zcount) == FAILURE) {
@@ -1419,21 +1453,34 @@ static void preg_replace_impl(INTERNAL_FUNCTION_PARAMETERS, zend_bool is_callabl
                   and add the result to the return_value array. */
                while (zend_hash_get_current_data(Z_ARRVAL_P(subject), (void **)&subject_entry) == SUCCESS) {
                        SEPARATE_ZVAL(subject_entry);
+                       utype = Z_TYPE_PP(subject_entry);
                        if ((result = php_replace_in_subject(regex, replace, subject_entry, &result_len, limit, is_callable_replace, replace_count_ptr TSRMLS_CC)) != NULL) {
 
                                /* Add to return array */
                                switch (zend_hash_get_current_key_ex(Z_ARRVAL_P(subject), &string_key, &string_key_len, &num_key, 0, NULL))
                                {
                                        case HASH_KEY_IS_UNICODE:
-                                               add_u_assoc_utf8_stringl_ex(return_value, IS_UNICODE, string_key, string_key_len, result, result_len, ZSTR_AUTOFREE);
+                                               if (utype == IS_UNICODE || (UG(unicode) && utype != IS_STRING)) {
+                                                       add_u_assoc_utf8_stringl_ex(return_value, IS_UNICODE, string_key, string_key_len, result, result_len, ZSTR_AUTOFREE);
+                                               } else {
+                                                       add_u_assoc_stringl_ex(return_value, IS_UNICODE, string_key, string_key_len, result, result_len, 0);
+                                               }
                                                break;
 
                                        case HASH_KEY_IS_STRING:
-                                               add_u_assoc_utf8_stringl_ex(return_value, IS_STRING, string_key, string_key_len, result, result_len, ZSTR_AUTOFREE);
+                                               if (utype == IS_UNICODE || (UG(unicode) && utype != IS_STRING)) {
+                                                       add_u_assoc_utf8_stringl_ex(return_value, IS_STRING, string_key, string_key_len, result, result_len, ZSTR_AUTOFREE);
+                                               } else {
+                                                       add_u_assoc_stringl_ex(return_value, IS_STRING, string_key, string_key_len, result, result_len, 0);
+                                               }
                                                break;
 
                                        case HASH_KEY_IS_LONG:
-                                               add_index_utf8_stringl(return_value, num_key, result, result_len, ZSTR_AUTOFREE);
+                                               if (utype == IS_UNICODE || (UG(unicode) && utype != IS_STRING)) {
+                                                       add_index_utf8_stringl(return_value, num_key, result, result_len, ZSTR_AUTOFREE);
+                                               } else {
+                                                       add_index_stringl(return_value, num_key, result, result_len, 0);
+                                               }
                                                break;
                                }
                        }
@@ -1441,8 +1488,13 @@ static void preg_replace_impl(INTERNAL_FUNCTION_PARAMETERS, zend_bool is_callabl
                        zend_hash_move_forward(Z_ARRVAL_P(subject));
                }
        } else {        /* if subject is not an array */
+               utype = Z_TYPE_P(subject);
                if ((result = php_replace_in_subject(regex, replace, &subject, &result_len, limit, is_callable_replace, replace_count_ptr TSRMLS_CC)) != NULL) {
-                       RETVAL_UTF8_STRINGL(result, result_len, ZSTR_AUTOFREE);
+                       if (utype == IS_UNICODE || (UG(unicode) && utype != IS_STRING)) {
+                               RETVAL_UTF8_STRINGL(result, result_len, ZSTR_AUTOFREE);
+                       } else {
+                               RETVAL_STRINGL(result, result_len, 0);
+                       }
                }
        }
        if (replace_count_ptr) {
@@ -1501,7 +1553,7 @@ PHP_FUNCTION(preg_split)
        }
 
        /* Compile regex or get it from cache. */
-       if ((pce = pcre_get_compiled_regex_cache(regex.s, regex_len TSRMLS_CC)) == NULL) {
+       if ((pce = pcre_get_compiled_regex_cache(str_type, regex.s, regex_len TSRMLS_CC)) == NULL) {
                if (str_type == IS_UNICODE) {
                        efree(regex_utf8);
                        efree(subject_utf8);
@@ -1509,7 +1561,7 @@ PHP_FUNCTION(preg_split)
                RETURN_FALSE;
        }
 
-       php_pcre_split_impl(pce, subject.s, subject_len, return_value, limit_val, flags TSRMLS_CC);
+       php_pcre_split_impl(pce, str_type, subject.s, subject_len, return_value, limit_val, flags TSRMLS_CC);
 
        if (str_type == IS_UNICODE) {
                efree(regex_utf8);
@@ -1520,7 +1572,7 @@ PHP_FUNCTION(preg_split)
 
 /* {{{ php_pcre_split_impl
  */
-PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *return_value,
+PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, zend_uchar utype, char *subject, int subject_len, zval *return_value,
        long limit_val, long flags TSRMLS_DC)
 {
        pcre_extra              *extra = NULL;          /* Holds results of studying */
@@ -1574,7 +1626,7 @@ PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, char *subject, int subjec
        match = NULL;
        PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
 
-       if (UG(unicode)) {
+       if (utype != IS_UNICODE && !(pce->compile_options & PCRE_UTF8)) {
                exoptions |= PCRE_NO_UTF8_CHECK;
        }
        
@@ -1598,11 +1650,15 @@ PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, char *subject, int subjec
 
                                if (offset_capture) {
                                        /* Add (match, offset) pair to the return value */
-                                       add_offset_pair(return_value, last_match, &subject[offsets[0]]-last_match, next_offset, NULL, &map TSRMLS_CC);
-                               } else {
+                                       add_offset_pair(return_value, utype, last_match, &subject[offsets[0]]-last_match, next_offset, NULL, &map TSRMLS_CC);
+                               } else if (utype == IS_UNICODE) {
                                        /* Add the piece to the return value */
                                        add_next_index_utf8_stringl(return_value, last_match,
                                                                                                &subject[offsets[0]]-last_match, 1);
+                               } else {
+                                       /* Add the piece to the return value */
+                                       add_next_index_stringl(return_value, last_match,
+                                                                                       &subject[offsets[0]]-last_match, 1);
                                }
 
                                /* One less left to do */
@@ -1620,11 +1676,14 @@ PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, char *subject, int subjec
                                        /* If we have matched a delimiter */
                                        if (!no_empty || match_len > 0) {
                                                if (offset_capture) {
-                                                       add_offset_pair(return_value, &subject[offsets[i<<1]], match_len,
+                                                       add_offset_pair(return_value, utype, &subject[offsets[i<<1]], match_len,
                                                                                        offsets[i<<1], NULL, &map TSRMLS_CC);
-                                               } else {
+                                               } else if (utype == IS_UNICODE) {
                                                        add_next_index_utf8_stringl(return_value, &subject[offsets[i<<1]],
                                                                                                                match_len, 1);
+                                               } else {
+                                                       add_next_index_stringl(return_value, &subject[offsets[i<<1]],
+                                                                                                       match_len, 1);
                                                }
                                        }
                                }
@@ -1636,7 +1695,7 @@ PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, char *subject, int subjec
                           to achieve this, unless we're already at the end of the string. */
                        if (g_notempty != 0 && start_offset < subject_len) {
                                offsets[0] = start_offset;
-                               if (UG(unicode) || pce->compile_options & PCRE_UTF8) {
+                               if (utype == IS_UNICODE || pce->compile_options & PCRE_UTF8) {
                                        offsets[1] = start_offset;
                                        U8_FWD_1(subject, offsets[1], subject_len);
                                } else {
@@ -1664,11 +1723,14 @@ PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, char *subject, int subjec
        {
                if (offset_capture) {
                        /* Add the last (match, offset) pair to the return value */
-                       add_offset_pair(return_value, &subject[start_offset],
+                       add_offset_pair(return_value, utype, &subject[start_offset],
                                                        subject_len - start_offset, start_offset, NULL, &map TSRMLS_CC);
-               } else {
+               } else if (utype == IS_UNICODE) {
                        /* Add the last piece to the return value */
                        add_next_index_utf8_stringl(return_value, last_match, subject + subject_len - last_match, 1);
+               } else {
+                       /* Add the last piece to the return value */
+                       add_next_index_stringl(return_value, last_match, subject + subject_len - last_match, 1);
                }
        }
        
@@ -1786,24 +1848,41 @@ PHP_FUNCTION(preg_quote)
    Searches array and returns entries which match regex */
 PHP_FUNCTION(preg_grep)
 {
-       char                            *regex;                 /* Regular expression */
+       zstr                             regex;                 /* Regular expression */
        int                                      regex_len;
+       char*                            regex_utf8;
+       int                                      regex_utf8_len;
+       zend_uchar           regex_type;
        zval                            *input;                 /* Input array */
        long                             flags = 0;             /* Match control flags */
        pcre_cache_entry        *pce;                   /* Compiled regular expression */
+       UErrorCode                       status = U_ZERO_ERROR;
 
        /* Get arguments and do error checking */
-       if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s&a|l", &regex,
-                                                         &regex_len, UG(utf8_conv), &input, &flags) == FAILURE) {
+       if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ta|l", &regex,
+                                                         &regex_len, &regex_type, &input, &flags) == FAILURE) {
                return;
        }
        
+       if (regex_type == IS_UNICODE) {
+               zend_unicode_to_string_ex(UG(utf8_conv), &regex_utf8, &regex_utf8_len, regex.u, regex_len, &status);
+               regex.s = regex_utf8;
+               regex_len = regex_utf8_len;
+       }
+
        /* Compile regex or get it from cache. */
-       if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) == NULL) {
+       if ((pce = pcre_get_compiled_regex_cache(regex_type, regex.s, regex_len TSRMLS_CC)) == NULL) {
+               if (regex_type == IS_UNICODE) {
+                       efree(regex_utf8);
+               }
                RETURN_FALSE;
        }
        
        php_pcre_grep_impl(pce, input, return_value, flags TSRMLS_CC);
+
+       if (regex_type == IS_UNICODE) {
+               efree(regex_utf8);
+       }
 }
 /* }}} */
 
@@ -1849,10 +1928,6 @@ PHPAPI void  php_pcre_grep_impl(pcre_cache_entry *pce, zval *input, zval *return
 
        PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
 
-       if (UG(unicode)) {
-               exoptions |= PCRE_NO_UTF8_CHECK;
-       }
-
        /* Go through the input array */
        zend_hash_internal_pointer_reset(Z_ARRVAL_P(input));
        while(zend_hash_get_current_data(Z_ARRVAL_P(input), (void **)&entry) == SUCCESS) {
@@ -1866,7 +1941,7 @@ PHPAPI void  php_pcre_grep_impl(pcre_cache_entry *pce, zval *input, zval *return
 
                /* Perform the match */
                count = pcre_exec(pce->re, extra, Z_STRVAL(subject), Z_STRLEN(subject),
-                                                 0, exoptions, offsets, size_offsets);
+                                                 0, exoptions | ((Z_TYPE_PP(entry) != IS_UNICODE && !(pce->compile_options & PCRE_UTF8))?PCRE_NO_UTF8_CHECK:0), offsets, size_offsets);
 
                /* Check for too many substrings condition. */
                if (count == 0) {
index 39cf6d3e85cd26ea6e0743d16c3377cf9839bd3e..d1f67587bab537c9b2210974f08439f5e288edc6 100644 (file)
@@ -41,7 +41,7 @@ PHP_FUNCTION(preg_split);
 PHP_FUNCTION(preg_quote);
 PHP_FUNCTION(preg_grep);
 
-PHPAPI char *php_pcre_replace(char *regex, int regex_len, char *subject, int subject_len, zval *replace_val, int is_callable_replace, int *result_len, int limit, int *replace_count TSRMLS_DC);
+PHPAPI char *php_pcre_replace(zend_uchar utype, char *regex, int regex_len, char *subject, int subject_len, zval *replace_val, int is_callable_replace, int *result_len, int limit, int *replace_count TSRMLS_DC);
 PHPAPI pcre* pcre_get_compiled_regex(char *regex, pcre_extra **extra, int *options TSRMLS_DC);
 PHPAPI pcre* pcre_get_compiled_regex_ex(char *regex, pcre_extra **extra, int *preg_options, int *coptions TSRMLS_DC);
 
@@ -61,15 +61,15 @@ typedef struct {
        zend_bool unicode_mode;
 } pcre_cache_entry;
 
-PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(char *regex, int regex_len TSRMLS_DC);
+PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(zend_uchar utype, char *regex, int regex_len TSRMLS_DC);
 
-PHPAPI void  php_pcre_match_impl(  pcre_cache_entry *pce, char *subject, int subject_len, zval *return_value,
+PHPAPI void  php_pcre_match_impl(  pcre_cache_entry *pce, zend_uchar utype, char *subject, int subject_len, zval *return_value,
        zval *subpats, int global, int use_flags, long flags, long start_offset TSRMLS_DC);
 
-PHPAPI char *php_pcre_replace_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *return_value, 
+PHPAPI char *php_pcre_replace_impl(pcre_cache_entry *pce, zend_uchar utype, char *subject, int subject_len, zval *return_value, 
        int is_callable_replace, int *result_len, int limit, int *replace_count TSRMLS_DC);
 
-PHPAPI void  php_pcre_split_impl(  pcre_cache_entry *pce, char *subject, int subject_len, zval *return_value,
+PHPAPI void  php_pcre_split_impl(  pcre_cache_entry *pce, zend_uchar utype, char *subject, int subject_len, zval *return_value,
        long limit_val, long flags TSRMLS_DC);
 
 PHPAPI void  php_pcre_grep_impl(   pcre_cache_entry *pce, zval *input, zval *return_value,
index 0be4c67d4ada10033275be0a4d92c151e64ef0c1..a7416298c49e90f5ecd7952c3058d60057e05f07 100644 (file)
@@ -14,9 +14,9 @@ function iter($ar)
                echo htmlentities($c, 0, "UTF-8"), ": ", strlen($c), "\n";
        }
 }
-$teststr = "\xe2\x82\xac hi there";
-iter(preg_split('//u', $teststr, -1, PREG_SPLIT_NO_EMPTY));
-preg_match_all('/./u', $teststr, $matches);
+$teststr = b"\xe2\x82\xac hi there";
+iter(preg_split(b'//u', $teststr, -1, PREG_SPLIT_NO_EMPTY));
+preg_match_all(b'/./u', $teststr, $matches);
 iter($matches[0]);
 ?>
 --EXPECT--
index f66a8f9abb56e2c7a33b979c06fad05f84e940b9..67132b098f99578a1d73f57572807f4fdcb6bf9e 100644 (file)
@@ -36,3 +36,22 @@ array(3) {
     string(19) ""simpleValueInside""
   }
 }
+--UEXPECT--
+int(1)
+array(3) {
+  [0]=>
+  array(1) {
+    [0]=>
+    unicode(33) " an_attribute="simpleValueInside""
+  }
+  [1]=>
+  array(1) {
+    [0]=>
+    unicode(12) "an_attribute"
+  }
+  [2]=>
+  array(1) {
+    [0]=>
+    unicode(19) ""simpleValueInside""
+  }
+}
index 8f9f40bb9159c79b99de4d19ef1a785ba72176bc..b770c198addb7ec9704b929f4a75ed705f13d2d1 100644 (file)
@@ -9,7 +9,7 @@ if (@preg_match_all('/./u', "", $matches) === false) {
 --FILE--
 <?php
 
-$string = urldecode("search%e4"); 
+$string = urldecode(b"search%e4"); 
 $result = preg_replace("#(&\#x*)([0-9A-F]+);*#iu","$1$2;",$string); 
 var_dump($result); 
 var_dump(preg_last_error());
index d0e729dfb80196ceeb4c643c713e4d1ca6f0f79b..719c81decdf50a326524b8c5818d2e86a6ec9416 100755 (executable)
@@ -1024,7 +1024,7 @@ static spl_dual_it_object* spl_dual_it_construct(INTERNAL_FUNCTION_PARAMETERS, z
                        }
                        intern->u.regex.mode = mode;
                        intern->u.regex.regex = estrndup(regex, regex_len);
-                       intern->u.regex.pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC);
+                       intern->u.regex.pce = pcre_get_compiled_regex_cache(ZEND_STR_TYPE, regex, regex_len TSRMLS_CC);
                        if (intern->u.regex.pce == NULL) {
                                /* pcre_get_compiled_regex_cache has already sent error */
                                php_set_error_handling(EH_NORMAL, NULL TSRMLS_CC);
@@ -1417,7 +1417,7 @@ SPL_METHOD(RegexIterator, accept)
                }
                zval_ptr_dtor(&intern->current.data);
                ALLOC_INIT_ZVAL(intern->current.data);
-               php_pcre_match_impl(intern->u.regex.pce, subject, subject_len, &zcount, 
+               php_pcre_match_impl(intern->u.regex.pce, ZEND_STR_TYPE, subject, subject_len, &zcount, 
                        intern->current.data, intern->u.regex.mode == REGIT_MODE_ALL_MATCHES, intern->u.regex.use_flags, intern->u.regex.preg_flags, 0 TSRMLS_CC);
                count = zend_hash_num_elements(Z_ARRVAL_P(intern->current.data));
                RETVAL_BOOL(count > 0);
@@ -1430,14 +1430,14 @@ SPL_METHOD(RegexIterator, accept)
                }
                zval_ptr_dtor(&intern->current.data);
                ALLOC_INIT_ZVAL(intern->current.data);
-               php_pcre_split_impl(intern->u.regex.pce, subject, subject_len, intern->current.data, -1, intern->u.regex.preg_flags TSRMLS_CC);
+               php_pcre_split_impl(intern->u.regex.pce, ZEND_STR_TYPE, subject, subject_len, intern->current.data, -1, intern->u.regex.preg_flags TSRMLS_CC);
                count = zend_hash_num_elements(Z_ARRVAL_P(intern->current.data));
                RETVAL_BOOL(count > 1);
                break;
 
        case REGIT_MODE_REPLACE:
                replacement = zend_read_property(intern->std.ce, getThis(), "replacement", sizeof("replacement")-1, 1 TSRMLS_CC);
-               result = php_pcre_replace_impl(intern->u.regex.pce, subject, subject_len, replacement, 0, &result_len, 0, NULL TSRMLS_CC);
+               result = php_pcre_replace_impl(intern->u.regex.pce, ZEND_STR_TYPE, subject, subject_len, replacement, 0, &result_len, 0, NULL TSRMLS_CC);
                
                if (intern->u.regex.flags & REGIT_USE_KEY) {
                        if (intern->current.key_type != HASH_KEY_IS_LONG) {
index 60ef962d2bf39f862b2066f6d4beabfce6716111..0445da6bfaae32436e17ea1bc0ab6c7b4d2dad45 100644 (file)
@@ -165,7 +165,7 @@ static char *php_win32_mail_trim_header(char *header TSRMLS_DC)
        MAKE_STD_ZVAL(replace);
        ZVAL_STRING(replace, PHP_WIN32_MAIL_UNIFY_REPLACE, 0);
 
-       result = php_pcre_replace(PHP_WIN32_MAIL_UNIFY_PATTERN, sizeof(PHP_WIN32_MAIL_UNIFY_PATTERN)-1,
+       result = php_pcre_replace(IS_STRING, PHP_WIN32_MAIL_UNIFY_PATTERN, sizeof(PHP_WIN32_MAIL_UNIFY_PATTERN)-1,
                                                          header, strlen(header),
                                                          replace,
                                                          0,
@@ -179,7 +179,7 @@ static char *php_win32_mail_trim_header(char *header TSRMLS_DC)
 
        ZVAL_STRING(replace, PHP_WIN32_MAIL_RMVDBL_REPLACE, 0);
 
-       result2 = php_pcre_replace(PHP_WIN32_MAIL_RMVDBL_PATTERN, sizeof(PHP_WIN32_MAIL_RMVDBL_PATTERN)-1,
+       result2 = php_pcre_replace(IS_STRING, PHP_WIN32_MAIL_RMVDBL_PATTERN, sizeof(PHP_WIN32_MAIL_RMVDBL_PATTERN)-1,
                                                           result, result_len,
                                                           replace,
                                                           0,