]> granicus.if.org Git - php/commitdiff
PCRE: Only remember valid UTF-8 if start offset zero
author Nikita Popov <nikita.ppv@gmail.com>
Fri, 7 Feb 2020 16:01:39 +0000 (17:01 +0100)
committer Nikita Popov <nikita.ppv@gmail.com>
Fri, 7 Feb 2020 16:01:39 +0000 (17:01 +0100)
PCRE only validates the string starting from the start offset
(minus maximum look-behind, but let's ignore that), so we can
only remember that the string is fully valid UTF-8 if the original
start offset is zero.

NEWS
ext/pcre/php_pcre.c
ext/pcre/tests/bug79241.phpt

diff --git a/NEWS b/NEWS
index 26885fae72050d43501249a6f653c8ec1e4e3c41..a08c2d77a9c8c4208b7a9f95283fc33bdb127cc8 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -13,6 +13,7 @@ PHP                                                                        NEWS
 - PCRE:
   . Fixed bug #79188 (Memory corruption in preg_replace/preg_replace_callback
     and unicode). (Nikita)
+  . Fixed bug #79241 (Segmentation fault on preg_match()). (Nikita)
 
 ?? ??? ????, PHP 7.4.3
 
index 104b8d4c97583f256a6b698590b38efba15716d1..c50bd2fba22820971fbb598e7fb698d4238cf025 100644 (file)
@@ -1167,7 +1167,7 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str,
        PCRE2_SPTR       mark = NULL;           /* Target for MARK name */
        zval                     marks;                         /* Array of marks for PREG_PATTERN_ORDER */
        pcre2_match_data *match_data;
-       PCRE2_SIZE               start_offset2;
+       PCRE2_SIZE               start_offset2, orig_start_offset;
 
        char *subject = ZSTR_VAL(subject_str);
        size_t subject_len = ZSTR_LEN(subject_str);
@@ -1263,8 +1263,10 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str,
                }
        }
 
-       options = (pce->compile_options & PCRE2_UTF) && !is_known_valid_utf8(subject_str, start_offset2)
-               ? 0 : PCRE2_NO_UTF_CHECK;
+       orig_start_offset = start_offset2;
+       options =
+               (pce->compile_options & PCRE2_UTF) && !is_known_valid_utf8(subject_str, orig_start_offset)
+                       ? 0 : PCRE2_NO_UTF_CHECK;
 
        /* Execute the regular expression. */
 #ifdef HAVE_PCRE_JIT_SUPPORT
@@ -1454,7 +1456,8 @@ error:
 
        if (PCRE_G(error_code) == PHP_PCRE_NO_ERROR) {
                /* If there was no error and we're in /u mode, remember that the string is valid UTF-8. */
-               if ((pce->compile_options & PCRE2_UTF) && !ZSTR_IS_INTERNED(subject_str)) {
+               if ((pce->compile_options & PCRE2_UTF)
+                               && !ZSTR_IS_INTERNED(subject_str) && orig_start_offset == 0) {
                        GC_ADD_FLAGS(subject_str, IS_STR_VALID_UTF8);
                }
 
index 92e5253735f9f64fd3c271a0e8f92ca86a821cf7..f6dbb8bea4e9090a0458eb57cca6631c1a6d5c94 100644 (file)
@@ -15,8 +15,19 @@ var_dump(preg_match($pattern, $text, $matches, 0, 0));
 var_dump(preg_match($pattern, $text, $matches, 0, 1));
 var_dump(preg_last_error() == PREG_BAD_UTF8_OFFSET_ERROR);
 
+echo "\n";
+
+$text = "VA\xff"; $text .= "LID";
+var_dump(preg_match($pattern, $text, $matches, 0, 4));
+var_dump(preg_match($pattern, $text, $matches, 0, 0));
+var_dump(preg_last_error() == PREG_BAD_UTF8_ERROR);
+
 ?>
 --EXPECT--
 int(0)
 bool(false)
 bool(true)
+
+int(1)
+bool(false)
+bool(true)