]> granicus.if.org Git - icu/commitdiff
ICU-9719 Regular Expressions, add loop breaking to unbounded {min, max} loops.
authorAndy Heninger <andy.heninger@gmail.com>
Wed, 26 Jun 2013 00:27:11 +0000 (00:27 +0000)
committerAndy Heninger <andy.heninger@gmail.com>
Wed, 26 Jun 2013 00:27:11 +0000 (00:27 +0000)
X-SVN-Rev: 33848

icu4c/source/i18n/regexcmp.cpp
icu4c/source/i18n/rematch.cpp
icu4c/source/test/intltest/regextst.cpp
icu4c/source/test/testdata/regextst.txt

index 9133cdd968184dd34475d7b5d1cc15b758f6fc10..0ec61543295dd9976d737e290c57d38650bd306a 100644 (file)
@@ -2254,6 +2254,8 @@ void        RegexCompile::compileSet(UnicodeSet *theSet)
 //                      Except for the specific opcodes used, the code is the same
 //                      for all three types (greedy, non-greedy, possessive) of
 //                      intervals.  The opcodes are supplied as parameters.
+//                      (There are two sets of opcodes - greedy & possessive use the
+//                      same ones, while non-greedy has it's own.)
 //
 //                      The code for interval loops has this form:
 //                         0  CTR_INIT   counter loc (in stack frame)
@@ -2275,9 +2277,15 @@ void        RegexCompile::compileInterval(int32_t InitOp,  int32_t LoopOp)
     insertOp(topOfBlock);
 
     // The operands for the CTR_INIT opcode include the index in the matcher data
-    //   of the counter.  Allocate it now.
+    //   of the counter.  Allocate it now. There are two data items
+    //        counterLoc   -->  Loop counter
+    //               +1    -->  Input index (for breaking non-progressing loops)
+    //                          (Only present if unbounded upper limit on loop)
     int32_t   counterLoc = fRXPat->fFrameSize;
     fRXPat->fFrameSize++;
+    if (fIntervalUpper < 0) {
+        fRXPat->fFrameSize++;
+    }
 
     int32_t   op = URX_BUILD(InitOp, counterLoc);
     fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
index 944843e31d7714fe8ffcfd5b3a218130563a3770..1af47442afec21e110f9bf3f91c0ae7f61626374 100644 (file)
@@ -3481,7 +3481,7 @@ GC_Done:
         case URX_CTR_INIT:
             {
                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
-                fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
+                fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
 
                 // Pick up the three extra operands that CTR_INIT has, and
                 //    skip the pattern location counter past 
@@ -3497,7 +3497,9 @@ GC_Done:
                 if (minCount == 0) {
                     fp = StateSave(fp, loopLoc+1, status);
                 }
-                if (maxCount == 0) {
+                if (maxCount == -1) {
+                    fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
+                } else if (maxCount == 0) {
                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
                 }
             }
@@ -3511,18 +3513,22 @@ GC_Done:
                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
                 int32_t minCount  = (int32_t)pat[opValue+2];
                 int32_t maxCount  = (int32_t)pat[opValue+3];
-                // Increment the counter.  Note: we DIDN'T worry about counter
-                //   overflow, since the data comes from UnicodeStrings, which
-                //   stores its length in an int32_t. Do we have to think about
-                //   this now that we're using UText? Probably not, since the length
-                //   in UChar32s is still an int32_t.
                 (*pCounter)++;
-                U_ASSERT(*pCounter > 0);
-                if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
-                    U_ASSERT(*pCounter == maxCount || maxCount == -1);
+                if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
+                    U_ASSERT(*pCounter == maxCount);
                     break;
                 }
                 if (*pCounter >= minCount) {
+                    if (maxCount == -1) {
+                        // Loop has no hard upper bound.
+                        // Check that it is progressing through the input, break if it is not.
+                        int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
+                        if (fp->fInputIdx == *pLastInputIdx) {
+                            break;
+                        } else {
+                            *pLastInputIdx = fp->fInputIdx;
+                        }
+                    }
                     fp = StateSave(fp, fp->fPatIdx, status);
                 }
                 fp->fPatIdx = opValue + 4;    // Loop back.
@@ -3533,9 +3539,9 @@ GC_Done:
             {
                 // Initialize a non-greedy loop
                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
-                fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
+                fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
 
-                // Pick up the three extra operands that CTR_INIT has, and
+                // Pick up the three extra operands that CTR_INIT_NG has, and
                 //    skip the pattern location counter past 
                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
                 fp->fPatIdx += 3;
@@ -3545,6 +3551,9 @@ GC_Done:
                 U_ASSERT(minCount>=0);
                 U_ASSERT(maxCount>=minCount || maxCount==-1);
                 U_ASSERT(loopLoc>fp->fPatIdx);
+                if (maxCount == -1) {
+                    fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
+                }
 
                 if (minCount == 0) {
                     if (maxCount != 0) {
@@ -3564,19 +3573,13 @@ GC_Done:
                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
                 int32_t minCount  = (int32_t)pat[opValue+2];
                 int32_t maxCount  = (int32_t)pat[opValue+3];
-                // Increment the counter.  Note: we DIDN'T worry about counter
-                //   overflow, since the data comes from UnicodeStrings, which
-                //   stores its length in an int32_t. Do we have to think about
-                //   this now that we're using UText? Probably not, since the length
-                //   in UChar32s is still an int32_t.
-                (*pCounter)++;
-                U_ASSERT(*pCounter > 0);
 
-                if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
+                (*pCounter)++;
+                if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
                     // The loop has matched the maximum permitted number of times.
                     //   Break out of here with no action.  Matching will
                     //   continue with the following pattern.
-                    U_ASSERT(*pCounter == maxCount || maxCount == -1);
+                    U_ASSERT(*pCounter == maxCount);
                     break;
                 }
 
@@ -3586,8 +3589,20 @@ GC_Done:
                     fp->fPatIdx = opValue + 4;    // Loop back.
                 } else {
                     // We do have the minimum number of matches.
-                    //   Fall into the following pattern, but first do
-                    //   a state save to the top of the loop, so that a failure
+
+                    // If there is no upper bound on the loop iterations, check that the input index
+                    // is progressing, and stop the loop if it is not.
+                    if (maxCount == -1) {
+                        int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
+                        if (fp->fInputIdx == *pLastInputIdx) {
+                            break;
+                        }
+                        *pLastInputIdx = fp->fInputIdx;
+                    }
+
+                    // Loop Continuation: we will fall into the pattern following the loop
+                    //   (non-greedy, don't execute loop body first), but first do
+                    //   a state save to the top of the loop, so that a match failure
                     //   in the following pattern will try another iteration of the loop.
                     fp = StateSave(fp, opValue + 4, status);
                 }
@@ -4940,7 +4955,7 @@ GC_Done:
         case URX_CTR_INIT:
             {
                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
-                fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
+                fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
                 
                 // Pick up the three extra operands that CTR_INIT has, and
                 //    skip the pattern location counter past 
@@ -4956,7 +4971,9 @@ GC_Done:
                 if (minCount == 0) {
                     fp = StateSave(fp, loopLoc+1, status);
                 }
-                if (maxCount == 0) {
+                if (maxCount == -1) {
+                    fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
+                } else if (maxCount == 0) {
                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
                 }
             }
@@ -4970,18 +4987,22 @@ GC_Done:
                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
                 int32_t minCount  = (int32_t)pat[opValue+2];
                 int32_t maxCount  = (int32_t)pat[opValue+3];
-                // Increment the counter.  Note: we DIDN'T worry about counter
-                //   overflow, since the data comes from UnicodeStrings, which
-                //   stores its length in an int32_t. Do we have to think about
-                //   this now that we're using UText? Probably not, since the length
-                //   in UChar32s is still an int32_t.
                 (*pCounter)++;
-                U_ASSERT(*pCounter > 0);
-                if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
-                    U_ASSERT(*pCounter == maxCount || maxCount == -1);
+                if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
+                    U_ASSERT(*pCounter == maxCount);
                     break;
                 }
                 if (*pCounter >= minCount) {
+                    if (maxCount == -1) {
+                        // Loop has no hard upper bound.
+                        // Check that it is progressing through the input, break if it is not.
+                        int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
+                        if (fp->fInputIdx == *pLastInputIdx) {
+                            break;
+                        } else {
+                            *pLastInputIdx = fp->fInputIdx;
+                        }
+                    }
                     fp = StateSave(fp, fp->fPatIdx, status);
                 }
                 fp->fPatIdx = opValue + 4;    // Loop back.
@@ -4992,9 +5013,9 @@ GC_Done:
             {
                 // Initialize a non-greedy loop
                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
-                fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
+                fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
                 
-                // Pick up the three extra operands that CTR_INIT has, and
+                // Pick up the three extra operands that CTR_INIT_NG has, and
                 //    skip the pattern location counter past 
                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
                 fp->fPatIdx += 3;
@@ -5004,6 +5025,9 @@ GC_Done:
                 U_ASSERT(minCount>=0);
                 U_ASSERT(maxCount>=minCount || maxCount==-1);
                 U_ASSERT(loopLoc>fp->fPatIdx);
+                if (maxCount == -1) {
+                    fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
+                }
                 
                 if (minCount == 0) {
                     if (maxCount != 0) {
@@ -5023,19 +5047,13 @@ GC_Done:
                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
                 int32_t minCount  = (int32_t)pat[opValue+2];
                 int32_t maxCount  = (int32_t)pat[opValue+3];
-                // Increment the counter.  Note: we DIDN'T worry about counter
-                //   overflow, since the data comes from UnicodeStrings, which
-                //   stores its length in an int32_t. Do we have to think about
-                //   this now that we're using UText? Probably not, since the length
-                //   in UChar32s is still an int32_t.
+
                 (*pCounter)++;
-                U_ASSERT(*pCounter > 0);
-                
-                if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
+                if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
                     // The loop has matched the maximum permitted number of times.
                     //   Break out of here with no action.  Matching will
                     //   continue with the following pattern.
-                    U_ASSERT(*pCounter == maxCount || maxCount == -1);
+                    U_ASSERT(*pCounter == maxCount);
                     break;
                 }
                 
@@ -5045,8 +5063,20 @@ GC_Done:
                     fp->fPatIdx = opValue + 4;    // Loop back.
                 } else {
                     // We do have the minimum number of matches.
-                    //   Fall into the following pattern, but first do
-                    //   a state save to the top of the loop, so that a failure
+
+                    // If there is no upper bound on the loop iterations, check that the input index
+                    // is progressing, and stop the loop if it is not.
+                    if (maxCount == -1) {
+                        int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
+                        if (fp->fInputIdx == *pLastInputIdx) {
+                            break;
+                        }
+                        *pLastInputIdx = fp->fInputIdx;
+                    }
+
+                    // Loop Continuation: we will fall into the pattern following the loop
+                    //   (non-greedy, don't execute loop body first), but first do
+                    //   a state save to the top of the loop, so that a match failure
                     //   in the following pattern will try another iteration of the loop.
                     fp = StateSave(fp, opValue + 4, status);
                 }
index 66a26fbef37d1854f7403f6755a5949e19dbd493..3c97af53d357c15982c57f30fda127aefaa42389 100644 (file)
@@ -1,6 +1,6 @@
 /********************************************************************
  * COPYRIGHT:
- * Copyright (c) 2002-2012, International Business Machines Corporation and
+ * Copyright (c) 2002-2013, International Business Machines Corporation and
  * others. All Rights Reserved.
  ********************************************************************/
 
@@ -3581,6 +3581,9 @@ void RegexTest::regex_find(const UnicodeString &pattern,
         }
     }
     matcher->setTrace(FALSE);
+    if (U_FAILURE(status)) {
+        errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
+    }
 
     //
     // Match up the groups from the find() with the groups from the tags
@@ -3591,7 +3594,7 @@ void RegexTest::regex_find(const UnicodeString &pattern,
     //   G option in test means that capture group data is not available in the
     //     expected results, so the check needs to be suppressed.
     if (isMatch == FALSE && groupStarts.size() != 0) {
-        dataerrln("Error at line %d:  Match expected, but none found.", line);
+        errln("Error at line %d:  Match expected, but none found.", line);
         failed = TRUE;
         goto cleanupAndReturn;
     } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
@@ -4682,13 +4685,14 @@ void RegexTest::PerlTestsUTF8() {
 //
 //  Bug6149   Verify limits to heap expansion for backtrack stack.
 //             Use this pattern,
-//                 "(a?){1,}"
-//             The zero-length match will repeat forever.
-//                (That this goes into a loop is another bug)
+//                 "(a?){1,8000000}"
+//             Note: was an unbounded upperbounds, but that now has loop-breaking enabled.
+//                   This test is likely to be fragile, as further optimizations stop
+//                   more cases of pointless looping in the match engine.
 //
 //---------------------------------------------------------------
 void RegexTest::Bug6149() {
-    UnicodeString pattern("(a?){1,}");
+    UnicodeString pattern("(a?){1,8000000}");
     UnicodeString s("xyz");
     uint32_t flags = 0;
     UErrorCode status = U_ZERO_ERROR;
index 154248550381f7e0fb231cf69034fa79dfb7685a..5716ab54449aec6341470d75ef8fc2f98072de39 100644 (file)
 
 "(ab)?(?<=ab)cd|ef"             i  "<0><1>ab</1>cd</0>"
 
+# Bug 9719  Loop breaking on (zero length match){3,}   (unlimited upper bound).
+#
+
+"(?:abc){1,}abc"                   "<0>abcabcabcabcabc</0>"
+"(?:2*){2,}?a2\z"                  "<0>2a2</0>" 
+"(?:2*){2,}?a2\z"                  "2a3" 
+"(?:x?+){3,}+yz"                   "w<0>yz</0>"
+"(2*){2,}?a2\\z"                   "2a3"
+"(2*){2,}?a2\\z"                   "<0>2<1></1>a2\\z</0>"
+"(2*){2,}?a2\z"                    "<0>2<1></1>a2</0>"
+
+
 # Bug 10024
 #   Incorrect (unbounded) longest match length with {1, 20} style quantifiers.
 #   Unbounded match is disallowed in look-behind expressions.