granicus.if.org Git - icu/commitdiff
ICU-21492 Fix regex compile assertion failure.
author Andy Heninger <andy.heninger@gmail.com>
Sat, 13 Feb 2021 21:28:10 +0000 (13:28 -0800)
committer Frank Yung-Fong Tang <ftang@google.com>
Thu, 18 Feb 2021 02:49:55 +0000 (18:49 -0800)
A regex pattern containing nested look-behind blocks could trigger an assertion
failure during pattern compilation. The problem was caused by an off-by-one
error in the code that computes an upper bound on the match length, needed
because look-behind expressions are constrained to not have unbounded match
length.

Nested look-behind blocks come into play because, when computing the maximum
match length of an outer block, any inner look-behind blocks are skipped over -
they do not directly contribute to the length matched by the outer block. The
problem was in the code that skips over these nested look-behind blocks.

icu4c/source/i18n/regexcmp.cpp
icu4c/source/test/testdata/regextst.txt

index b75d80fa5015951cbd15397c6b85e5e47e56dab0..ec8654d0ea07a5101010f49b04863da082f8a877 100644 (file)
@@ -3475,6 +3475,9 @@ int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
 //                     value may be longer than the actual maximum; it must
 //                     never be shorter.
 //
+//                     start, end: the range of the pattern to check.
+//                     end is inclusive.
+//
 //------------------------------------------------------------------------------
 int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
     if (U_FAILURE(*fStatus)) {
@@ -3720,14 +3723,14 @@ int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
                 // Look-behind.  Scan forward until the matching look-around end,
                 //   without processing the look-behind block.
                 int32_t dataLoc = URX_VAL(op);
-                for (loc = loc + 1; loc < end; ++loc) {
+                for (loc = loc + 1; loc <= end; ++loc) {
                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
                     int32_t opType = URX_TYPE(op);
                     if ((opType == URX_LA_END || opType == URX_LBN_END) && (URX_VAL(op) == dataLoc)) {
                         break;
                     }
                 }
-                U_ASSERT(loc < end);
+                U_ASSERT(loc <= end);
             }
             break;
 
index 4609ee98f9af6d95c653af903b0a6267ee4533e8..12146bc30b5bb0f02f5b7ad052ba48387891d4dd 100644 (file)
 #
 "(?w)\b"                     v2     "äää<0></0> äää"
 
+# Bug ICU-21492 Assertion failure with nested look-around expressions.
+#
+"(?<=(?:(?<=(?:(?<=(?:(?<=)){2})){3})){4}"   E  "<0></0>"  # orig failure from bug report, w mismatched parens.
+"(?:(?<=(?:(?<=)){2}))"            "<0></0>"               # Simplified case, with a valid pattern.
+
 #  Random debugging, Temporary
 #