From 23e25f3319db021298310fb97cf537bcef4095ad Mon Sep 17 00:00:00 2001 From: "Christoph M. Becker" Date: Fri, 5 Jun 2015 14:40:03 +0200 Subject: [PATCH] Fixed Bug #53823 (preg_replace: * qualifier on unicode replace garbles the string) When advancing after empty matches, php_pcre_match_impl() as well as php_pcre_replace_impl() always have to advance to the next code point when the u modifier is given, instead of to the next byte. --- ext/pcre/php_pcre.c | 31 +++++++++++++++++++++--- ext/pcre/tests/bug53823.phpt | 13 ++++++++++ ext/pcre/tests/bug66121.phpt | 47 ++++++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+), 4 deletions(-) create mode 100644 ext/pcre/tests/bug53823.phpt create mode 100644 ext/pcre/tests/bug66121.phpt diff --git a/ext/pcre/php_pcre.c b/ext/pcre/php_pcre.c index e7274b841d..7cc16ca6e6 100644 --- a/ext/pcre/php_pcre.c +++ b/ext/pcre/php_pcre.c @@ -225,6 +225,25 @@ static char **make_subpats_table(int num_subpats, pcre_cache_entry *pce TSRMLS_D } /* }}} */ +/* {{{ static calculate_unit_length */ +/* Calculates the byte length of the character that begins at start. When PCRE_UTF8 is set in pce->compile_options, the result is the first byte plus every directly following UTF-8 continuation byte (bit pattern 10xxxxxx); otherwise the result is always 1. Assumes valid UTF-8 for PCRE_UTF8: the scan stops only at the first byte that is not a continuation byte. */ +static zend_always_inline int calculate_unit_length(pcre_cache_entry *pce, char *start) +{ + int unit_len; + + if (pce->compile_options & PCRE_UTF8) { + char *end = start; + + /* skip continuation bytes; the pre-increment steps past the first byte before testing, so unit_len is at least 1 */ + while ((*++end & 0xC0) == 0x80); + unit_len = end - start; + } else { + unit_len = 1; + } + return unit_len; +} +/* }}} */ + /* {{{ pcre_get_compiled_regex_cache */ PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(char *regex, int regex_len TSRMLS_DC) @@ -758,8 +777,10 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, char *subject, int subjec the start offset, and continue. Fudge the offset values to achieve this, unless we're already at the end of the string. 
*/ if (g_notempty != 0 && start_offset < subject_len) { + int unit_len = calculate_unit_length(pce, subject + start_offset); + offsets[0] = start_offset; - offsets[1] = start_offset + 1; + offsets[1] = start_offset + unit_len; } else break; } else { @@ -1206,10 +1227,12 @@ PHPAPI char *php_pcre_replace_impl(pcre_cache_entry *pce, char *subject, int sub the start offset, and continue. Fudge the offset values to achieve this, unless we're already at the end of the string. */ if (g_notempty != 0 && start_offset < subject_len) { + int unit_len = calculate_unit_length(pce, piece); + offsets[0] = start_offset; - offsets[1] = start_offset + 1; - memcpy(&result[*result_len], piece, 1); - (*result_len)++; + offsets[1] = start_offset + unit_len; + memcpy(&result[*result_len], piece, unit_len); + *result_len += unit_len; } else { new_len = *result_len + subject_len - start_offset; if (new_len + 1 > alloc_len) { diff --git a/ext/pcre/tests/bug53823.phpt b/ext/pcre/tests/bug53823.phpt new file mode 100644 index 0000000000..c1d8f999e0 --- /dev/null +++ b/ext/pcre/tests/bug53823.phpt @@ -0,0 +1,13 @@ +--TEST-- +Bug #53823 - preg_replace: * qualifier on unicode replace garbles the string +--FILE-- + +--EXPECT-- +string(10) "áéíóú" +NULL +NULL diff --git a/ext/pcre/tests/bug66121.phpt b/ext/pcre/tests/bug66121.phpt new file mode 100644 index 0000000000..89c2f2d5d8 --- /dev/null +++ b/ext/pcre/tests/bug66121.phpt @@ -0,0 +1,47 @@ +--TEST-- +Bug #66121 - UTF-8 lookbehinds match bytes instead of characters +--FILE-- + +--EXPECT-- +string(4) "*ක" +string(5) "*ම*" +string(2) "*k" +string(3) "*m*" +array(1) { + [0]=> + array(2) { + [0]=> + array(2) { + [0]=> + string(0) "" + [1]=> + int(0) + } + [1]=> + array(2) { + [0]=> + string(0) "" + [1]=> + int(3) + } + } +} +NULL +NULL +bool(false) +bool(false) -- 2.40.0