From 0330fb2cbf5b8d2f96aaa6f1de114e86d3dbbcfc Mon Sep 17 00:00:00 2001 From: Moriyoshi Koizumi Date: Sat, 31 Jan 2004 22:36:33 +0000 Subject: [PATCH] - Fix bug #27103 (preg_split('//u') incorrectly splits UTF-8 strings into octets). --- NEWS | 2 ++ ext/pcre/php_pcre.c | 46 +++++++++++++++++++++++++++++++++++++++------ ext/pcre/php_pcre.h | 2 ++ 3 files changed, 44 insertions(+), 6 deletions(-) diff --git a/NEWS b/NEWS index d8b2a8c339..b5ed1420d6 100644 --- a/NEWS +++ b/NEWS @@ -14,6 +14,8 @@ PHP NEWS (Derick) - Fixed problems with longlong values in mysqli. (Georg) - Fixed class name case preserving of user defined classes. (Marcus) +- Fixed bug #27103 (preg_split('//u') incorrectly splits UTF-8 strings into + octets). (Moriyoshi) - Fixed bug #27042 (SPL: SeekableIterator seek() broken). (Marcus) - Fixed bug #27008 (Every class method can be called as static). (Marcus) - Fixed bug #26938 (exec() has problems reading long lines). diff --git a/ext/pcre/php_pcre.c b/ext/pcre/php_pcre.c index c7702da11d..28babfca06 100644 --- a/ext/pcre/php_pcre.c +++ b/ext/pcre/php_pcre.c @@ -132,7 +132,17 @@ static PHP_MSHUTDOWN_FUNCTION(pcre) /* {{{ pcre_get_compiled_regex */ -PHPAPI pcre* pcre_get_compiled_regex(char *regex, pcre_extra **extra, int *preg_options TSRMLS_DC) { +PHPAPI pcre* pcre_get_compiled_regex(char *regex, pcre_extra **extra, int *preg_options TSRMLS_DC) +{ + int compile_options; + return pcre_get_compiled_regex_ex(regex, extra, preg_options, &compile_options); +} +/* }}} */ + +/* {{{ pcre_get_compiled_regex_ex + */ +PHPAPI pcre* pcre_get_compiled_regex_ex(char *regex, pcre_extra **extra, int *preg_options, int *compile_options TSRMLS_DC) +{ pcre *re = NULL; int coptions = 0; int soptions = 0; @@ -162,6 +172,7 @@ PHPAPI pcre* pcre_get_compiled_regex(char *regex, pcre_extra **extra, int *preg_ #endif *extra = pce->extra; *preg_options = pce->preg_options; + *compile_options = pce->compile_options; return pce->re; #if HAVE_SETLOCALE } @@ -236,7 +247,7 @@ PHPAPI pcre* pcre_get_compiled_regex(char *regex, pcre_extra **extra, int *preg_ /* Clear out preg options */ *preg_options = 0; - + /* Parse through the options, setting appropriate flags. Display a warning if we encounter an unknown modifier. */ while (*pp != 0) { @@ -297,13 +308,15 @@ PHPAPI pcre* pcre_get_compiled_regex(char *regex, pcre_extra **extra, int *preg_ } *preg_options = poptions; - + *compile_options = coptions; + efree(pattern); /* Store the compiled pattern and extra info in the cache. */ new_entry.re = re; new_entry.extra = *extra; new_entry.preg_options = poptions; + new_entry.compile_options = coptions; #if HAVE_SETLOCALE new_entry.locale = pestrdup(locale, 1); new_entry.tables = tables; @@ -1168,11 +1181,14 @@ PHP_FUNCTION(preg_split) **limit, /* Number of pieces to return */ **flags; pcre *re = NULL; /* Compiled regular expression */ + pcre *re_bump = NULL; /* Regex instance for empty matches */ pcre_extra *extra = NULL; /* Holds results of studying */ + pcre_extra *extra_bump = NULL; /* Almost dummy */ int *offsets; /* Array of subpattern offsets */ int size_offsets; /* Size of the offsets array */ int exoptions = 0; /* Execution options */ int preg_options = 0; /* Custom preg options */ + int coptions = 0; /* Custom preg options */ int argc; /* Argument count */ int limit_val = -1; /* Integer value of limit */ int no_empty = 0; /* If NO_EMPTY flag is set */ @@ -1210,7 +1226,7 @@ PHP_FUNCTION(preg_split) convert_to_string_ex(subject); /* Compile regex or get it from cache. */ - if ((re = pcre_get_compiled_regex(Z_STRVAL_PP(regex), &extra, &preg_options TSRMLS_CC)) == NULL) { + if ((re = pcre_get_compiled_regex_ex(Z_STRVAL_PP(regex), &extra, &preg_options, &coptions TSRMLS_CC)) == NULL) { RETURN_FALSE; } @@ -1284,8 +1300,26 @@ PHP_FUNCTION(preg_split) the start offset, and continue. Fudge the offset values to achieve this, unless we're already at the end of the string. */ if (g_notempty != 0 && start_offset < Z_STRLEN_PP(subject)) { - offsets[0] = start_offset; - offsets[1] = start_offset + 1; + if (coptions & PCRE_UTF8) { + if (re_bump == NULL) { + int dummy; + + if ((re_bump = pcre_get_compiled_regex("/./u", &extra_bump, &dummy TSRMLS_CC)) == NULL) { + RETURN_FALSE; + } + } + count = pcre_exec(re_bump, extra_bump, Z_STRVAL_PP(subject), + Z_STRLEN_PP(subject), start_offset, + exoptions, offsets, size_offsets); + if (count < 1) { + php_error_docref(NULL TSRMLS_CC,E_NOTICE, "Unknown error"); + offsets[0] = start_offset; + offsets[1] = start_offset + 1; + } + } else { + offsets[0] = start_offset; + offsets[1] = start_offset + 1; + } } else break; } diff --git a/ext/pcre/php_pcre.h b/ext/pcre/php_pcre.h index 25fc6abafe..2f05a27f6f 100644 --- a/ext/pcre/php_pcre.h +++ b/ext/pcre/php_pcre.h @@ -43,6 +43,7 @@ PHP_FUNCTION(preg_grep); PHPAPI char *php_pcre_replace(char *regex, int regex_len, char *subject, int subject_len, zval *replace_val, int is_callable_replace, int *result_len, int limit TSRMLS_DC); PHPAPI pcre* pcre_get_compiled_regex(char *regex, pcre_extra **extra, int *options TSRMLS_DC); +PHPAPI pcre* pcre_get_compiled_regex_ex(char *regex, pcre_extra **extra, int *preg_options, int *coptions TSRMLS_DC); extern zend_module_entry pcre_module_entry; #define pcre_module_ptr &pcre_module_entry @@ -51,6 +52,7 @@ typedef struct { pcre *re; pcre_extra *extra; int preg_options; + int compile_options; #if HAVE_SETLOCALE char *locale; unsigned const char *tables; -- 2.40.0