PCRE unicode/binary support

author Dmitry Stogov <dmitry@php.net>

Fri, 6 Jul 2007 23:06:51 +0000 (23:06 +0000)

committer Dmitry Stogov <dmitry@php.net>

Fri, 6 Jul 2007 23:06:51 +0000 (23:06 +0000)
author Dmitry Stogov <dmitry@php.net>
Fri, 6 Jul 2007 23:06:51 +0000 (23:06 +0000)
committer Dmitry Stogov <dmitry@php.net>
Fri, 6 Jul 2007 23:06:51 +0000 (23:06 +0000)
diff --git a/ext/pcre/php_pcre.c b/ext/pcre/php_pcre.c

index 2c828f7646701792f6326d8cc4bddd83ba8daaf0..f008c5f34bcbe470a4d9f2ddf94b06d25ce84160 100644 (file)
--- a/ext/pcre/php_pcre.c
+++ b/ext/pcre/php_pcre.c
@@ -191,7 +191,7 @@ static int pcre_clean_cache(void *data, void *arg TSRMLS_DC)
  
  /* {{{ pcre_get_compiled_regex_cache
   */
-PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(char *regex, int regex_len TSRMLS_DC)
+PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(zend_uchar utype, char *regex, int regex_len TSRMLS_DC)
  {
         pcre                            *re = NULL;
         pcre_extra                      *extra;
@@ -333,7 +333,7 @@ PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(char *regex, int regex_le
                 }
         }
  
-       if (UG(unicode)) {
+       if (utype == IS_UNICODE) {
                 coptions |= PCRE_UTF8;
         }
  
@@ -405,7 +405,7 @@ PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(char *regex, int regex_le
   */
  PHPAPI pcre* pcre_get_compiled_regex(char *regex, pcre_extra **extra, int *preg_options TSRMLS_DC)
  {
-       pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex, strlen(regex) TSRMLS_CC);
+       pcre_cache_entry * pce = pcre_get_compiled_regex_cache(ZEND_STR_TYPE, regex, strlen(regex) TSRMLS_CC);
  
         if (extra) {
                 *extra = pce ? pce->extra : NULL;
@@ -422,7 +422,7 @@ PHPAPI pcre* pcre_get_compiled_regex(char *regex, pcre_extra **extra, int *preg_
   */
  PHPAPI pcre* pcre_get_compiled_regex_ex(char *regex, pcre_extra **extra, int *preg_options, int *compile_options TSRMLS_DC)
  {
-       pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex, strlen(regex) TSRMLS_CC);
+       pcre_cache_entry * pce = pcre_get_compiled_regex_cache(ZEND_STR_TYPE, regex, strlen(regex) TSRMLS_CC);
         
         if (extra) {
                 *extra = pce ? pce->extra : NULL;
@@ -439,7 +439,7 @@ PHPAPI pcre* pcre_get_compiled_regex_ex(char *regex, pcre_extra **extra, int *pr
  /* }}} */
  
  /* {{{ add_offset_pair */
-static inline void add_offset_pair(zval *result, char *str, int len, int offset, char *name, offset_map_t *prev TSRMLS_DC)
+static inline void add_offset_pair(zval *result, zend_uchar utype, char *str, int len, int offset, char *name, offset_map_t *prev TSRMLS_DC)
  {
         zval *match_pair;
         int tmp;
@@ -449,16 +449,25 @@ static inline void add_offset_pair(zval *result, char *str, int len, int offset,
         INIT_PZVAL(match_pair);
  
         /* Add (match, offset) to the return value */
-       add_next_index_utf8_stringl(match_pair, str, len, 1);
+       if (utype == IS_UNICODE) {
+               add_next_index_utf8_stringl(match_pair, str, len, 1);
+       } else {
+               add_next_index_stringl(match_pair, str, len, 1);
+       }
  
         /* Calculate codepoint offset from the previous chunk */
         if (offset) {
-               tmp = prev->byte_offset;
-               while (tmp < offset) {
-                       U8_FWD_1(prev->str, tmp, offset);
-                       prev->cp_offset++;
-               }
-               prev->byte_offset = tmp;
+               if (utype == IS_UNICODE) {
+                       tmp = prev->byte_offset;
+                       while (tmp < offset) {
+                               U8_FWD_1(prev->str, tmp, offset);
+                               prev->cp_offset++;
+                       }
+                       prev->byte_offset = tmp;
+               } else {
+                       prev->cp_offset = offset;
+                       prev->byte_offset = offset;
+               }               
         }
         add_next_index_long(match_pair, prev->cp_offset);
         
@@ -511,7 +520,7 @@ static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global)
         }
         
         /* Compile regex or get it from cache. */
-       if ((pce = pcre_get_compiled_regex_cache(regex.s, regex_len TSRMLS_CC)) == NULL) {
+       if ((pce = pcre_get_compiled_regex_cache(str_type, regex.s, regex_len TSRMLS_CC)) == NULL) {
                 if (str_type == IS_UNICODE) {
                         efree(regex_utf8);
                         efree(subject_utf8);
@@ -519,7 +528,7 @@ static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global)
                 RETURN_FALSE;
         }
  
-       php_pcre_match_impl(pce, subject.s, subject_len, return_value, subpats, 
+       php_pcre_match_impl(pce, str_type, subject.s, subject_len, return_value, subpats, 
                 global, ZEND_NUM_ARGS() >= 4, flags, start_offset TSRMLS_CC);
  
         if (str_type == IS_UNICODE) {
@@ -530,7 +539,7 @@ static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global)
  /* }}} */
  
  /* {{{ php_pcre_match_impl */
-PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *return_value,
+PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_uchar utype, char *subject, int subject_len, zval *return_value,
         zval *subpats, int global, int use_flags, long flags, long start_offset TSRMLS_DC)
  {
         zval                    *result_set,            /* Holds a set of subpatterns after
@@ -580,7 +589,7 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, char *subject, int subjec
                 offset_capture = 0;
         }
  
-       if (UG(unicode)) {
+       if (utype == IS_UNICODE) {
                 int k = 0;
                 /* Calculate byte offset from codepoint offset */
                 if (start_offset < 0) {
@@ -590,7 +599,6 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, char *subject, int subjec
                         U8_FWD_N(subject, k, subject_len, start_offset);
                 }
                 start_offset = k;
-               exoptions |= PCRE_NO_UTF8_CHECK;
         } else {
                 /* Negative offset counts from the end of the string. */
                 if (start_offset < 0) {
@@ -599,6 +607,9 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, char *subject, int subjec
                                 start_offset = 0;
                         }
                 }
+               if (!(pce->compile_options & PCRE_UTF8)) {
+                       exoptions |= PCRE_NO_UTF8_CHECK;
+               }
         }
  
         if (extra == NULL) {
@@ -712,11 +723,14 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, char *subject, int subjec
                                                 /* For each subpattern, insert it into the appropriate array. */
                                                 for (i = 0; i < count; i++) {
                                                         if (offset_capture) {
-                                                               add_offset_pair(match_sets[i], (char *)stringlist[i],
+                                                               add_offset_pair(match_sets[i], utype, (char *)stringlist[i],
                                                                                                 offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], NULL, &map TSRMLS_CC);
-                                                       } else {
+                                                       } else if (utype == IS_UNICODE) {
                                                                 add_next_index_utf8_stringl(match_sets[i], (char *)stringlist[i],
                                                                                                                         offsets[(i<<1)+1] - offsets[i<<1], 1);
+                                                       } else {
+                                                               add_next_index_stringl(match_sets[i], (char *)stringlist[i],
+                                                                                                               offsets[(i<<1)+1] - offsets[i<<1], 1);
                                                         }
                                                 }
                                                 /*
@@ -744,16 +758,23 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, char *subject, int subjec
                                                 /* Add all the subpatterns to it */
                                                 for (i = 0; i < count; i++) {
                                                         if (offset_capture) {
-                                                               add_offset_pair(result_set, (char *)stringlist[i],
+                                                               add_offset_pair(result_set, utype, (char *)stringlist[i],
                                                                                                 offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1],
                                                                                                 subpat_names[i], &map TSRMLS_CC);
-                                                       } else {
+                                                       } else if (utype == IS_UNICODE) {
                                                                 if (subpat_names[i]) {
-                                                                       add_assoc_utf8_stringl(result_set, subpat_names[i], (char *)stringlist[i],
+                                                                       add_utf8_assoc_utf8_stringl(result_set, subpat_names[i], (char *)stringlist[i],
                                                                                                                    offsets[(i<<1)+1] - offsets[i<<1], 1);
                                                                 }
                                                                 add_next_index_utf8_stringl(result_set, (char *)stringlist[i],
                                                                                                                         offsets[(i<<1)+1] - offsets[i<<1], 1);
+                                                       } else {
+                                                               if (subpat_names[i]) {
+                                                                       add_rt_assoc_stringl(result_set, subpat_names[i], (char *)stringlist[i],
+                                                                                                          offsets[(i<<1)+1] - offsets[i<<1], 1);
+                                                               }
+                                                               add_next_index_stringl(result_set, (char *)stringlist[i],
+                                                                                                               offsets[(i<<1)+1] - offsets[i<<1], 1);
                                                         }
                                                 }
                                                 /* And add it to the output array */
@@ -763,16 +784,23 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, char *subject, int subjec
                                         /* For each subpattern, insert it into the subpatterns array. */
                                         for (i = 0; i < count; i++) {
                                                 if (offset_capture) {
-                                                       add_offset_pair(subpats, (char *)stringlist[i],
+                                                       add_offset_pair(subpats, utype, (char *)stringlist[i],
                                                                                         offsets[(i<<1)+1] - offsets[i<<1],
                                                                                         offsets[i<<1], subpat_names[i], &map TSRMLS_CC);
-                                               } else {
+                                               } else if (utype == IS_UNICODE) {
                                                         if (subpat_names[i]) {
-                                                               add_assoc_utf8_stringl(subpats, subpat_names[i], (char *)stringlist[i],
+                                                               add_utf8_assoc_utf8_stringl(subpats, subpat_names[i], (char *)stringlist[i],
                                                                                                            offsets[(i<<1)+1] - offsets[i<<1], 1);
                                                         }
                                                         add_next_index_utf8_stringl(subpats, (char *)stringlist[i],
                                                                                                                 offsets[(i<<1)+1] - offsets[i<<1], 1);
+                                               } else {
+                                                       if (subpat_names[i]) {
+                                                               add_rt_assoc_stringl(subpats, subpat_names[i], (char *)stringlist[i],
+                                                                                                  offsets[(i<<1)+1] - offsets[i<<1], 1);
+                                                       }
+                                                       add_next_index_stringl(subpats, (char *)stringlist[i],
+                                                                                                       offsets[(i<<1)+1] - offsets[i<<1], 1);
                                                 }
                                         }
                                 }
@@ -786,7 +814,7 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, char *subject, int subjec
                            to achieve this, unless we're already at the end of the string. */
                         if (g_notempty != 0 && start_offset < subject_len) {
                                 offsets[0] = start_offset;
-                               if (UG(unicode) || pce->compile_options & PCRE_UTF8) {
+                               if (utype == IS_UNICODE || pce->compile_options & PCRE_UTF8) {
                                         offsets[1] = start_offset;
                                         U8_FWD_1(subject, offsets[1], subject_len);
                                 } else {
@@ -1026,7 +1054,8 @@ static int preg_do_eval(char *eval_str, int eval_str_len, char *subject,
  
  /* {{{ php_pcre_replace
   */
-PHPAPI char *php_pcre_replace(char *regex,   int regex_len,
+PHPAPI char *php_pcre_replace(zend_uchar utype,
+                                                         char *regex,   int regex_len,
                                                           char *subject, int subject_len,
                                                           zval *replace_val, int is_callable_replace,
                                                           int *result_len, int limit, int *replace_count TSRMLS_DC)
@@ -1034,17 +1063,17 @@ PHPAPI char *php_pcre_replace(char *regex,   int regex_len,
         pcre_cache_entry        *pce;                       /* Compiled regular expression */
  
         /* Compile regex or get it from cache. */
-       if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) == NULL) {
+       if ((pce = pcre_get_compiled_regex_cache(utype, regex, regex_len TSRMLS_CC)) == NULL) {
                 return NULL;
         }
  
-       return php_pcre_replace_impl(pce, subject, subject_len, replace_val, 
+       return php_pcre_replace_impl(pce, utype, subject, subject_len, replace_val, 
                 is_callable_replace, result_len, limit, replace_count TSRMLS_CC);
  }
  /* }}} */
  
  /* {{{ php_pcre_replace_impl() */
-PHPAPI char *php_pcre_replace_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *replace_val, 
+PHPAPI char *php_pcre_replace_impl(pcre_cache_entry *pce, zend_uchar utype, char *subject, int subject_len, zval *replace_val, 
         int is_callable_replace, int *result_len, int limit, int *replace_count TSRMLS_DC)
  {
         pcre_extra              *extra = pce->extra;/* Holds results of studying */
@@ -1112,7 +1141,7 @@ PHPAPI char *php_pcre_replace_impl(pcre_cache_entry *pce, char *subject, int sub
         start_offset = 0;
         PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
  
-       if (UG(unicode)) {
+       if (utype != IS_UNICODE && !(pce->compile_options & PCRE_UTF8)) {
                 exoptions |= PCRE_NO_UTF8_CHECK;
         }
         
@@ -1227,7 +1256,7 @@ PHPAPI char *php_pcre_replace_impl(pcre_cache_entry *pce, char *subject, int sub
                            to achieve this, unless we're already at the end of the string. */
                         if (g_notempty != 0 && start_offset < subject_len) {
                                 offsets[0] = start_offset;
-                               if (UG(unicode) || pce->compile_options & PCRE_UTF8) {
+                               if (utype == IS_UNICODE || pce->compile_options & PCRE_UTF8) {
                                         offsets[1] = start_offset;
                                         U8_FWD_1(subject, offsets[1], subject_len);
                                 } else {
@@ -1284,8 +1313,10 @@ static char *php_replace_in_subject(zval *regex, zval *replace, zval **subject,
         char            *subject_value,
                                 *result;
         int                      subject_len;
+       zend_uchar   utype;
  
         /* Make sure we're dealing with strings. */     
+       utype = Z_TYPE_PP(subject);
         convert_to_string_with_converter_ex(subject, UG(utf8_conv));
  
         ZVAL_STRINGL(&empty_replace, "", 0, 0);
@@ -1325,7 +1356,8 @@ static char *php_replace_in_subject(zval *regex, zval *replace, zval **subject,
                         
                         /* Do the actual replacement and put the result back into subject_value
                            for further replacements. */
-                       if ((result = php_pcre_replace(Z_STRVAL_PP(regex_entry),
+                       if ((result = php_pcre_replace(utype,
+                                                                                  Z_STRVAL_PP(regex_entry),
                                                                                    Z_STRLEN_PP(regex_entry),
                                                                                    subject_value,
                                                                                    subject_len,
@@ -1344,7 +1376,8 @@ static char *php_replace_in_subject(zval *regex, zval *replace, zval **subject,
  
                 return subject_value;
         } else {
-               result = php_pcre_replace(Z_STRVAL_P(regex),
+               result = php_pcre_replace(utype,
+                                                                 Z_STRVAL_P(regex),
                                                                   Z_STRLEN_P(regex),
                                                                   Z_STRVAL_PP(subject),
                                                                   Z_STRLEN_PP(subject),
@@ -1375,7 +1408,8 @@ static void preg_replace_impl(INTERNAL_FUNCTION_PARAMETERS, zend_bool is_callabl
         ulong                    num_key;
         zval                     callback_name;
         int                              replace_count=0;
-       int                             *replace_count_ptr=NULL; 
+       int                             *replace_count_ptr=NULL;
+       zend_uchar       utype;
         
         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "z/z/z/|lz", &regex,
                                                           &replace, &subject, &limit, &zcount) == FAILURE) {
@@ -1419,21 +1453,34 @@ static void preg_replace_impl(INTERNAL_FUNCTION_PARAMETERS, zend_bool is_callabl
                    and add the result to the return_value array. */
                 while (zend_hash_get_current_data(Z_ARRVAL_P(subject), (void **)&subject_entry) == SUCCESS) {
                         SEPARATE_ZVAL(subject_entry);
+                       utype = Z_TYPE_PP(subject_entry);
                         if ((result = php_replace_in_subject(regex, replace, subject_entry, &result_len, limit, is_callable_replace, replace_count_ptr TSRMLS_CC)) != NULL) {
  
                                 /* Add to return array */
                                 switch (zend_hash_get_current_key_ex(Z_ARRVAL_P(subject), &string_key, &string_key_len, &num_key, 0, NULL))
                                 {
                                         case HASH_KEY_IS_UNICODE:
-                                               add_u_assoc_utf8_stringl_ex(return_value, IS_UNICODE, string_key, string_key_len, result, result_len, ZSTR_AUTOFREE);
+                                               if (utype == IS_UNICODE || (UG(unicode) && utype != IS_STRING)) {
+                                                       add_u_assoc_utf8_stringl_ex(return_value, IS_UNICODE, string_key, string_key_len, result, result_len, ZSTR_AUTOFREE);
+                                               } else {
+                                                       add_u_assoc_stringl_ex(return_value, IS_UNICODE, string_key, string_key_len, result, result_len, 0);
+                                               }
                                                 break;
  
                                         case HASH_KEY_IS_STRING:
-                                               add_u_assoc_utf8_stringl_ex(return_value, IS_STRING, string_key, string_key_len, result, result_len, ZSTR_AUTOFREE);
+                                               if (utype == IS_UNICODE || (UG(unicode) && utype != IS_STRING)) {
+                                                       add_u_assoc_utf8_stringl_ex(return_value, IS_STRING, string_key, string_key_len, result, result_len, ZSTR_AUTOFREE);
+                                               } else {
+                                                       add_u_assoc_stringl_ex(return_value, IS_STRING, string_key, string_key_len, result, result_len, 0);
+                                               }
                                                 break;
  
                                         case HASH_KEY_IS_LONG:
-                                               add_index_utf8_stringl(return_value, num_key, result, result_len, ZSTR_AUTOFREE);
+                                               if (utype == IS_UNICODE || (UG(unicode) && utype != IS_STRING)) {
+                                                       add_index_utf8_stringl(return_value, num_key, result, result_len, ZSTR_AUTOFREE);
+                                               } else {
+                                                       add_index_stringl(return_value, num_key, result, result_len, 0);
+                                               }
                                                 break;
                                 }
                         }
@@ -1441,8 +1488,13 @@ static void preg_replace_impl(INTERNAL_FUNCTION_PARAMETERS, zend_bool is_callabl
                         zend_hash_move_forward(Z_ARRVAL_P(subject));
                 }
         } else {        /* if subject is not an array */
+               utype = Z_TYPE_P(subject);
                 if ((result = php_replace_in_subject(regex, replace, &subject, &result_len, limit, is_callable_replace, replace_count_ptr TSRMLS_CC)) != NULL) {
-                       RETVAL_UTF8_STRINGL(result, result_len, ZSTR_AUTOFREE);
+                       if (utype == IS_UNICODE || (UG(unicode) && utype != IS_STRING)) {
+                               RETVAL_UTF8_STRINGL(result, result_len, ZSTR_AUTOFREE);
+                       } else {
+                               RETVAL_STRINGL(result, result_len, 0);
+                       }
                 }
         }
         if (replace_count_ptr) {
@@ -1501,7 +1553,7 @@ PHP_FUNCTION(preg_split)
         }
  
         /* Compile regex or get it from cache. */
-       if ((pce = pcre_get_compiled_regex_cache(regex.s, regex_len TSRMLS_CC)) == NULL) {
+       if ((pce = pcre_get_compiled_regex_cache(str_type, regex.s, regex_len TSRMLS_CC)) == NULL) {
                 if (str_type == IS_UNICODE) {
                         efree(regex_utf8);
                         efree(subject_utf8);
@@ -1509,7 +1561,7 @@ PHP_FUNCTION(preg_split)
                 RETURN_FALSE;
         }
  
-       php_pcre_split_impl(pce, subject.s, subject_len, return_value, limit_val, flags TSRMLS_CC);
+       php_pcre_split_impl(pce, str_type, subject.s, subject_len, return_value, limit_val, flags TSRMLS_CC);
  
         if (str_type == IS_UNICODE) {
                 efree(regex_utf8);
@@ -1520,7 +1572,7 @@ PHP_FUNCTION(preg_split)
  
  /* {{{ php_pcre_split_impl
   */
-PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *return_value,
+PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, zend_uchar utype, char *subject, int subject_len, zval *return_value,
         long limit_val, long flags TSRMLS_DC)
  {
         pcre_extra              *extra = NULL;          /* Holds results of studying */
@@ -1574,7 +1626,7 @@ PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, char *subject, int subjec
         match = NULL;
         PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
  
-       if (UG(unicode)) {
+       if (utype != IS_UNICODE && !(pce->compile_options & PCRE_UTF8)) {
                 exoptions |= PCRE_NO_UTF8_CHECK;
         }
         
@@ -1598,11 +1650,15 @@ PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, char *subject, int subjec
  
                                 if (offset_capture) {
                                         /* Add (match, offset) pair to the return value */
-                                       add_offset_pair(return_value, last_match, &subject[offsets[0]]-last_match, next_offset, NULL, &map TSRMLS_CC);
-                               } else {
+                                       add_offset_pair(return_value, utype, last_match, &subject[offsets[0]]-last_match, next_offset, NULL, &map TSRMLS_CC);
+                               } else if (utype == IS_UNICODE) {
                                         /* Add the piece to the return value */
                                         add_next_index_utf8_stringl(return_value, last_match,
                                                                                                 &subject[offsets[0]]-last_match, 1);
+                               } else {
+                                       /* Add the piece to the return value */
+                                       add_next_index_stringl(return_value, last_match,
+                                                                                       &subject[offsets[0]]-last_match, 1);
                                 }
  
                                 /* One less left to do */
@@ -1620,11 +1676,14 @@ PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, char *subject, int subjec
                                         /* If we have matched a delimiter */
                                         if (!no_empty || match_len > 0) {
                                                 if (offset_capture) {
-                                                       add_offset_pair(return_value, &subject[offsets[i<<1]], match_len,
+                                                       add_offset_pair(return_value, utype, &subject[offsets[i<<1]], match_len,
                                                                                         offsets[i<<1], NULL, &map TSRMLS_CC);
-                                               } else {
+                                               } else if (utype == IS_UNICODE) {
                                                         add_next_index_utf8_stringl(return_value, &subject[offsets[i<<1]],
                                                                                                                 match_len, 1);
+                                               } else {
+                                                       add_next_index_stringl(return_value, &subject[offsets[i<<1]],
+                                                                                                       match_len, 1);
                                                 }
                                         }
                                 }
@@ -1636,7 +1695,7 @@ PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, char *subject, int subjec
                            to achieve this, unless we're already at the end of the string. */
                         if (g_notempty != 0 && start_offset < subject_len) {
                                 offsets[0] = start_offset;
-                               if (UG(unicode) || pce->compile_options & PCRE_UTF8) {
+                               if (utype == IS_UNICODE || pce->compile_options & PCRE_UTF8) {
                                         offsets[1] = start_offset;
                                         U8_FWD_1(subject, offsets[1], subject_len);
                                 } else {
@@ -1664,11 +1723,14 @@ PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, char *subject, int subjec
         {
                 if (offset_capture) {
                         /* Add the last (match, offset) pair to the return value */
-                       add_offset_pair(return_value, &subject[start_offset],
+                       add_offset_pair(return_value, utype, &subject[start_offset],
                                                         subject_len - start_offset, start_offset, NULL, &map TSRMLS_CC);
-               } else {
+               } else if (utype == IS_UNICODE) {
                         /* Add the last piece to the return value */
                         add_next_index_utf8_stringl(return_value, last_match, subject + subject_len - last_match, 1);
+               } else {
+                       /* Add the last piece to the return value */
+                       add_next_index_stringl(return_value, last_match, subject + subject_len - last_match, 1);
                 }
         }
         
@@ -1786,24 +1848,41 @@ PHP_FUNCTION(preg_quote)
     Searches array and returns entries which match regex */
  PHP_FUNCTION(preg_grep)
  {
-       char                            *regex;                 /* Regular expression */
+       zstr                             regex;                 /* Regular expression */
         int                                      regex_len;
+       char*                            regex_utf8;
+       int                                      regex_utf8_len;
+       zend_uchar           regex_type;
         zval                            *input;                 /* Input array */
         long                             flags = 0;             /* Match control flags */
         pcre_cache_entry        *pce;                   /* Compiled regular expression */
+       UErrorCode                       status = U_ZERO_ERROR;
  
         /* Get arguments and do error checking */
-       if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s&a|l", &regex,
-                                                         &regex_len, UG(utf8_conv), &input, &flags) == FAILURE) {
+       if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ta|l", &regex,
+                                                         &regex_len, &regex_type, &input, &flags) == FAILURE) {
                 return;
         }
         
+       if (regex_type == IS_UNICODE) {
+               zend_unicode_to_string_ex(UG(utf8_conv), &regex_utf8, &regex_utf8_len, regex.u, regex_len, &status);
+               regex.s = regex_utf8;
+               regex_len = regex_utf8_len;
+       }
+
         /* Compile regex or get it from cache. */
-       if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) == NULL) {
+       if ((pce = pcre_get_compiled_regex_cache(regex_type, regex.s, regex_len TSRMLS_CC)) == NULL) {
+               if (regex_type == IS_UNICODE) {
+                       efree(regex_utf8);
+               }
                 RETURN_FALSE;
         }
         
         php_pcre_grep_impl(pce, input, return_value, flags TSRMLS_CC);
+
+       if (regex_type == IS_UNICODE) {
+               efree(regex_utf8);
+       }
  }
  /* }}} */
  
@@ -1849,10 +1928,6 @@ PHPAPI void  php_pcre_grep_impl(pcre_cache_entry *pce, zval *input, zval *return
  
         PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
  
-       if (UG(unicode)) {
-               exoptions |= PCRE_NO_UTF8_CHECK;
-       }
-
         /* Go through the input array */
         zend_hash_internal_pointer_reset(Z_ARRVAL_P(input));
         while(zend_hash_get_current_data(Z_ARRVAL_P(input), (void **)&entry) == SUCCESS) {
@@ -1866,7 +1941,7 @@ PHPAPI void  php_pcre_grep_impl(pcre_cache_entry *pce, zval *input, zval *return
  
                 /* Perform the match */
                 count = pcre_exec(pce->re, extra, Z_STRVAL(subject), Z_STRLEN(subject),
-                                                 0, exoptions, offsets, size_offsets);
+                                                 0, exoptions | ((Z_TYPE_PP(entry) != IS_UNICODE && !(pce->compile_options & PCRE_UTF8))?PCRE_NO_UTF8_CHECK:0), offsets, size_offsets);
  
                 /* Check for too many substrings condition. */
                 if (count == 0) {
diff --git a/ext/pcre/php_pcre.h b/ext/pcre/php_pcre.h

index 39cf6d3e85cd26ea6e0743d16c3377cf9839bd3e..d1f67587bab537c9b2210974f08439f5e288edc6 100644 (file)
--- a/ext/pcre/php_pcre.h
+++ b/ext/pcre/php_pcre.h
@@ -41,7 +41,7 @@ PHP_FUNCTION(preg_split);
  PHP_FUNCTION(preg_quote);
  PHP_FUNCTION(preg_grep);
  
-PHPAPI char *php_pcre_replace(char *regex, int regex_len, char *subject, int subject_len, zval *replace_val, int is_callable_replace, int *result_len, int limit, int *replace_count TSRMLS_DC);
+PHPAPI char *php_pcre_replace(zend_uchar utype, char *regex, int regex_len, char *subject, int subject_len, zval *replace_val, int is_callable_replace, int *result_len, int limit, int *replace_count TSRMLS_DC);
  PHPAPI pcre* pcre_get_compiled_regex(char *regex, pcre_extra **extra, int *options TSRMLS_DC);
  PHPAPI pcre* pcre_get_compiled_regex_ex(char *regex, pcre_extra **extra, int *preg_options, int *coptions TSRMLS_DC);
  
@@ -61,15 +61,15 @@ typedef struct {
         zend_bool unicode_mode;
  } pcre_cache_entry;
  
-PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(char *regex, int regex_len TSRMLS_DC);
+PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(zend_uchar utype, char *regex, int regex_len TSRMLS_DC);
  
-PHPAPI void  php_pcre_match_impl(  pcre_cache_entry *pce, char *subject, int subject_len, zval *return_value,
+PHPAPI void  php_pcre_match_impl(  pcre_cache_entry *pce, zend_uchar utype, char *subject, int subject_len, zval *return_value,
         zval *subpats, int global, int use_flags, long flags, long start_offset TSRMLS_DC);
  
-PHPAPI char *php_pcre_replace_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *return_value, 
+PHPAPI char *php_pcre_replace_impl(pcre_cache_entry *pce, zend_uchar utype, char *subject, int subject_len, zval *return_value, 
         int is_callable_replace, int *result_len, int limit, int *replace_count TSRMLS_DC);
  
-PHPAPI void  php_pcre_split_impl(  pcre_cache_entry *pce, char *subject, int subject_len, zval *return_value,
+PHPAPI void  php_pcre_split_impl(  pcre_cache_entry *pce, zend_uchar utype, char *subject, int subject_len, zval *return_value,
         long limit_val, long flags TSRMLS_DC);
  
  PHPAPI void  php_pcre_grep_impl(   pcre_cache_entry *pce, zval *input, zval *return_value,
diff --git a/ext/pcre/tests/bug27103.phpt b/ext/pcre/tests/bug27103.phpt

index 0be4c67d4ada10033275be0a4d92c151e64ef0c1..a7416298c49e90f5ecd7952c3058d60057e05f07 100644 (file)
--- a/ext/pcre/tests/bug27103.phpt
+++ b/ext/pcre/tests/bug27103.phpt
@@ -14,9 +14,9 @@ function iter($ar)
                 echo htmlentities($c, 0, "UTF-8"), ": ", strlen($c), "\n";
         }
  }
-$teststr = "\xe2\x82\xac hi there";
-iter(preg_split('//u', $teststr, -1, PREG_SPLIT_NO_EMPTY));
-preg_match_all('/./u', $teststr, $matches);
+$teststr = b"\xe2\x82\xac hi there";
+iter(preg_split(b'//u', $teststr, -1, PREG_SPLIT_NO_EMPTY));
+preg_match_all(b'/./u', $teststr, $matches);
  iter($matches[0]);
  ?>
  --EXPECT--
diff --git a/ext/pcre/tests/bug40909.phpt b/ext/pcre/tests/bug40909.phpt

index f66a8f9abb56e2c7a33b979c06fad05f84e940b9..67132b098f99578a1d73f57572807f4fdcb6bf9e 100644 (file)
--- a/ext/pcre/tests/bug40909.phpt
+++ b/ext/pcre/tests/bug40909.phpt
@@ -36,3 +36,22 @@ array(3) {
      string(19) ""simpleValueInside""
    }
  }
+--UEXPECT--
+int(1)
+array(3) {
+  [0]=>
+  array(1) {
+    [0]=>
+    unicode(33) " an_attribute="simpleValueInside""
+  }
+  [1]=>
+  array(1) {
+    [0]=>
+    unicode(12) "an_attribute"
+  }
+  [2]=>
+  array(1) {
+    [0]=>
+    unicode(19) ""simpleValueInside""
+  }
+}
diff --git a/ext/pcre/tests/invalid_utf8.phpt b/ext/pcre/tests/invalid_utf8.phpt

index 8f9f40bb9159c79b99de4d19ef1a785ba72176bc..b770c198addb7ec9704b929f4a75ed705f13d2d1 100644 (file)
--- a/ext/pcre/tests/invalid_utf8.phpt
+++ b/ext/pcre/tests/invalid_utf8.phpt
@@ -9,7 +9,7 @@ if (@preg_match_all('/./u', "", $matches) === false) {
  --FILE--
  <?php
  
-$string = urldecode("search%e4"); 
+$string = urldecode(b"search%e4"); 
  $result = preg_replace("#(&\#x*)([0-9A-F]+);*#iu","$1$2;",$string); 
  var_dump($result); 
  var_dump(preg_last_error());
diff --git a/ext/spl/spl_iterators.c b/ext/spl/spl_iterators.c

index d0e729dfb80196ceeb4c643c713e4d1ca6f0f79b..719c81decdf50a326524b8c5818d2e86a6ec9416 100755 (executable)
--- a/ext/spl/spl_iterators.c
+++ b/ext/spl/spl_iterators.c
@@ -1024,7 +1024,7 @@ static spl_dual_it_object* spl_dual_it_construct(INTERNAL_FUNCTION_PARAMETERS, z
                         }
                         intern->u.regex.mode = mode;
                         intern->u.regex.regex = estrndup(regex, regex_len);
-                       intern->u.regex.pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC);
+                       intern->u.regex.pce = pcre_get_compiled_regex_cache(ZEND_STR_TYPE, regex, regex_len TSRMLS_CC);
                         if (intern->u.regex.pce == NULL) {
                                 /* pcre_get_compiled_regex_cache has already sent error */
                                 php_set_error_handling(EH_NORMAL, NULL TSRMLS_CC);
@@ -1417,7 +1417,7 @@ SPL_METHOD(RegexIterator, accept)
                 }
                 zval_ptr_dtor(&intern->current.data);
                 ALLOC_INIT_ZVAL(intern->current.data);
-               php_pcre_match_impl(intern->u.regex.pce, subject, subject_len, &zcount, 
+               php_pcre_match_impl(intern->u.regex.pce, ZEND_STR_TYPE, subject, subject_len, &zcount, 
                         intern->current.data, intern->u.regex.mode == REGIT_MODE_ALL_MATCHES, intern->u.regex.use_flags, intern->u.regex.preg_flags, 0 TSRMLS_CC);
                 count = zend_hash_num_elements(Z_ARRVAL_P(intern->current.data));
                 RETVAL_BOOL(count > 0);
@@ -1430,14 +1430,14 @@ SPL_METHOD(RegexIterator, accept)
                 }
                 zval_ptr_dtor(&intern->current.data);
                 ALLOC_INIT_ZVAL(intern->current.data);
-               php_pcre_split_impl(intern->u.regex.pce, subject, subject_len, intern->current.data, -1, intern->u.regex.preg_flags TSRMLS_CC);
+               php_pcre_split_impl(intern->u.regex.pce, ZEND_STR_TYPE, subject, subject_len, intern->current.data, -1, intern->u.regex.preg_flags TSRMLS_CC);
                 count = zend_hash_num_elements(Z_ARRVAL_P(intern->current.data));
                 RETVAL_BOOL(count > 1);
                 break;
  
         case REGIT_MODE_REPLACE:
                 replacement = zend_read_property(intern->std.ce, getThis(), "replacement", sizeof("replacement")-1, 1 TSRMLS_CC);
-               result = php_pcre_replace_impl(intern->u.regex.pce, subject, subject_len, replacement, 0, &result_len, 0, NULL TSRMLS_CC);
+               result = php_pcre_replace_impl(intern->u.regex.pce, ZEND_STR_TYPE, subject, subject_len, replacement, 0, &result_len, 0, NULL TSRMLS_CC);
                 
                 if (intern->u.regex.flags & REGIT_USE_KEY) {
                         if (intern->current.key_type != HASH_KEY_IS_LONG) {
diff --git a/win32/sendmail.c b/win32/sendmail.c

index 60ef962d2bf39f862b2066f6d4beabfce6716111..0445da6bfaae32436e17ea1bc0ab6c7b4d2dad45 100644 (file)
--- a/win32/sendmail.c
+++ b/win32/sendmail.c
@@ -165,7 +165,7 @@ static char *php_win32_mail_trim_header(char *header TSRMLS_DC)
         MAKE_STD_ZVAL(replace);
         ZVAL_STRING(replace, PHP_WIN32_MAIL_UNIFY_REPLACE, 0);
  
-       result = php_pcre_replace(PHP_WIN32_MAIL_UNIFY_PATTERN, sizeof(PHP_WIN32_MAIL_UNIFY_PATTERN)-1,
+       result = php_pcre_replace(IS_STRING, PHP_WIN32_MAIL_UNIFY_PATTERN, sizeof(PHP_WIN32_MAIL_UNIFY_PATTERN)-1,
                                                           header, strlen(header),
                                                           replace,
                                                           0,
@@ -179,7 +179,7 @@ static char *php_win32_mail_trim_header(char *header TSRMLS_DC)
  
         ZVAL_STRING(replace, PHP_WIN32_MAIL_RMVDBL_REPLACE, 0);
  
-       result2 = php_pcre_replace(PHP_WIN32_MAIL_RMVDBL_PATTERN, sizeof(PHP_WIN32_MAIL_RMVDBL_PATTERN)-1,
+       result2 = php_pcre_replace(IS_STRING, PHP_WIN32_MAIL_RMVDBL_PATTERN, sizeof(PHP_WIN32_MAIL_RMVDBL_PATTERN)-1,
                                                            result, result_len,
                                                            replace,
                                                            0,
author	Dmitry Stogov <dmitry@php.net>
	Fri, 6 Jul 2007 23:06:51 +0000 (23:06 +0000)
committer	Dmitry Stogov <dmitry@php.net>
	Fri, 6 Jul 2007 23:06:51 +0000 (23:06 +0000)
ext/pcre/php_pcre.c		patch \| blob \| history
ext/pcre/php_pcre.h		patch \| blob \| history
ext/pcre/tests/bug27103.phpt		patch \| blob \| history
ext/pcre/tests/bug40909.phpt		patch \| blob \| history
ext/pcre/tests/invalid_utf8.phpt		patch \| blob \| history
ext/spl/spl_iterators.c		patch \| blob \| history
win32/sendmail.c		patch \| blob \| history