From 20016a58db8e60caf49d4a144ffd37fabb1c722d Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Wed, 26 Jun 2013 00:27:11 +0000 Subject: [PATCH] ICU-9719 Regular Expressions, add loop breaking to unbounded {min, max} loops. X-SVN-Rev: 33848 --- icu4c/source/i18n/regexcmp.cpp | 10 +- icu4c/source/i18n/rematch.cpp | 122 +++++++++++++++--------- icu4c/source/test/intltest/regextst.cpp | 16 ++-- icu4c/source/test/testdata/regextst.txt | 12 +++ 4 files changed, 107 insertions(+), 53 deletions(-) diff --git a/icu4c/source/i18n/regexcmp.cpp b/icu4c/source/i18n/regexcmp.cpp index 9133cdd9681..0ec61543295 100644 --- a/icu4c/source/i18n/regexcmp.cpp +++ b/icu4c/source/i18n/regexcmp.cpp @@ -2254,6 +2254,8 @@ void RegexCompile::compileSet(UnicodeSet *theSet) // Except for the specific opcodes used, the code is the same // for all three types (greedy, non-greedy, possessive) of // intervals. The opcodes are supplied as parameters. +// (There are two sets of opcodes - greedy & possessive use the +// same ones, while non-greedy has it's own.) // // The code for interval loops has this form: // 0 CTR_INIT counter loc (in stack frame) @@ -2275,9 +2277,15 @@ void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp) insertOp(topOfBlock); // The operands for the CTR_INIT opcode include the index in the matcher data - // of the counter. Allocate it now. + // of the counter. Allocate it now. There are two data items + // counterLoc --> Loop counter + // +1 --> Input index (for breaking non-progressing loops) + // (Only present if unbounded upper limit on loop) int32_t counterLoc = fRXPat->fFrameSize; fRXPat->fFrameSize++; + if (fIntervalUpper < 0) { + fRXPat->fFrameSize++; + } int32_t op = URX_BUILD(InitOp, counterLoc); fRXPat->fCompiledPat->setElementAt(op, topOfBlock); diff --git a/icu4c/source/i18n/rematch.cpp b/icu4c/source/i18n/rematch.cpp index 944843e31d7..1af47442afe 100644 --- a/icu4c/source/i18n/rematch.cpp +++ b/icu4c/source/i18n/rematch.cpp @@ -3481,7 +3481,7 @@ GC_Done: case URX_CTR_INIT: { U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); - fp->fExtra[opValue] = 0; // Set the loop counter variable to zero + fp->fExtra[opValue] = 0; // Set the loop counter variable to zero // Pick up the three extra operands that CTR_INIT has, and // skip the pattern location counter past @@ -3497,7 +3497,9 @@ GC_Done: if (minCount == 0) { fp = StateSave(fp, loopLoc+1, status); } - if (maxCount == 0) { + if (maxCount == -1) { + fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking. + } else if (maxCount == 0) { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } @@ -3511,18 +3513,22 @@ GC_Done: int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; int32_t minCount = (int32_t)pat[opValue+2]; int32_t maxCount = (int32_t)pat[opValue+3]; - // Increment the counter. Note: we DIDN'T worry about counter - // overflow, since the data comes from UnicodeStrings, which - // stores its length in an int32_t. Do we have to think about - // this now that we're using UText? Probably not, since the length - // in UChar32s is still an int32_t. (*pCounter)++; - U_ASSERT(*pCounter > 0); - if ((uint64_t)*pCounter >= (uint32_t)maxCount) { - U_ASSERT(*pCounter == maxCount || maxCount == -1); + if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { + U_ASSERT(*pCounter == maxCount); break; } if (*pCounter >= minCount) { + if (maxCount == -1) { + // Loop has no hard upper bound. + // Check that it is progressing through the input, break if it is not. + int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; + if (fp->fInputIdx == *pLastInputIdx) { + break; + } else { + *pLastInputIdx = fp->fInputIdx; + } + } fp = StateSave(fp, fp->fPatIdx, status); } fp->fPatIdx = opValue + 4; // Loop back. @@ -3533,9 +3539,9 @@ GC_Done: { // Initialize a non-greedy loop U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); - fp->fExtra[opValue] = 0; // Set the loop counter variable to zero + fp->fExtra[opValue] = 0; // Set the loop counter variable to zero - // Pick up the three extra operands that CTR_INIT has, and + // Pick up the three extra operands that CTR_INIT_NG has, and // skip the pattern location counter past int32_t instrOperandLoc = (int32_t)fp->fPatIdx; fp->fPatIdx += 3; @@ -3545,6 +3551,9 @@ GC_Done: U_ASSERT(minCount>=0); U_ASSERT(maxCount>=minCount || maxCount==-1); U_ASSERT(loopLoc>fp->fPatIdx); + if (maxCount == -1) { + fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking. + } if (minCount == 0) { if (maxCount != 0) { @@ -3564,19 +3573,13 @@ GC_Done: int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; int32_t minCount = (int32_t)pat[opValue+2]; int32_t maxCount = (int32_t)pat[opValue+3]; - // Increment the counter. Note: we DIDN'T worry about counter - // overflow, since the data comes from UnicodeStrings, which - // stores its length in an int32_t. Do we have to think about - // this now that we're using UText? Probably not, since the length - // in UChar32s is still an int32_t. - (*pCounter)++; - U_ASSERT(*pCounter > 0); - if ((uint64_t)*pCounter >= (uint32_t)maxCount) { + (*pCounter)++; + if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { // The loop has matched the maximum permitted number of times. // Break out of here with no action. Matching will // continue with the following pattern. - U_ASSERT(*pCounter == maxCount || maxCount == -1); + U_ASSERT(*pCounter == maxCount); break; } @@ -3586,8 +3589,20 @@ GC_Done: fp->fPatIdx = opValue + 4; // Loop back. } else { // We do have the minimum number of matches. - // Fall into the following pattern, but first do - // a state save to the top of the loop, so that a failure + + // If there is no upper bound on the loop iterations, check that the input index + // is progressing, and stop the loop if it is not. + if (maxCount == -1) { + int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; + if (fp->fInputIdx == *pLastInputIdx) { + break; + } + *pLastInputIdx = fp->fInputIdx; + } + + // Loop Continuation: we will fall into the pattern following the loop + // (non-greedy, don't execute loop body first), but first do + // a state save to the top of the loop, so that a match failure // in the following pattern will try another iteration of the loop. fp = StateSave(fp, opValue + 4, status); } @@ -4940,7 +4955,7 @@ GC_Done: case URX_CTR_INIT: { U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); - fp->fExtra[opValue] = 0; // Set the loop counter variable to zero + fp->fExtra[opValue] = 0; // Set the loop counter variable to zero // Pick up the three extra operands that CTR_INIT has, and // skip the pattern location counter past @@ -4956,7 +4971,9 @@ GC_Done: if (minCount == 0) { fp = StateSave(fp, loopLoc+1, status); } - if (maxCount == 0) { + if (maxCount == -1) { + fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking. + } else if (maxCount == 0) { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } @@ -4970,18 +4987,22 @@ GC_Done: int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; int32_t minCount = (int32_t)pat[opValue+2]; int32_t maxCount = (int32_t)pat[opValue+3]; - // Increment the counter. Note: we DIDN'T worry about counter - // overflow, since the data comes from UnicodeStrings, which - // stores its length in an int32_t. Do we have to think about - // this now that we're using UText? Probably not, since the length - // in UChar32s is still an int32_t. (*pCounter)++; - U_ASSERT(*pCounter > 0); - if ((uint64_t)*pCounter >= (uint32_t)maxCount) { - U_ASSERT(*pCounter == maxCount || maxCount == -1); + if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { + U_ASSERT(*pCounter == maxCount); break; } if (*pCounter >= minCount) { + if (maxCount == -1) { + // Loop has no hard upper bound. + // Check that it is progressing through the input, break if it is not. + int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; + if (fp->fInputIdx == *pLastInputIdx) { + break; + } else { + *pLastInputIdx = fp->fInputIdx; + } + } fp = StateSave(fp, fp->fPatIdx, status); } fp->fPatIdx = opValue + 4; // Loop back. @@ -4992,9 +5013,9 @@ GC_Done: { // Initialize a non-greedy loop U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); - fp->fExtra[opValue] = 0; // Set the loop counter variable to zero + fp->fExtra[opValue] = 0; // Set the loop counter variable to zero - // Pick up the three extra operands that CTR_INIT has, and + // Pick up the three extra operands that CTR_INIT_NG has, and // skip the pattern location counter past int32_t instrOperandLoc = (int32_t)fp->fPatIdx; fp->fPatIdx += 3; @@ -5004,6 +5025,9 @@ GC_Done: U_ASSERT(minCount>=0); U_ASSERT(maxCount>=minCount || maxCount==-1); U_ASSERT(loopLoc>fp->fPatIdx); + if (maxCount == -1) { + fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking. + } if (minCount == 0) { if (maxCount != 0) { @@ -5023,19 +5047,13 @@ GC_Done: int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; int32_t minCount = (int32_t)pat[opValue+2]; int32_t maxCount = (int32_t)pat[opValue+3]; - // Increment the counter. Note: we DIDN'T worry about counter - // overflow, since the data comes from UnicodeStrings, which - // stores its length in an int32_t. Do we have to think about - // this now that we're using UText? Probably not, since the length - // in UChar32s is still an int32_t. + (*pCounter)++; - U_ASSERT(*pCounter > 0); - - if ((uint64_t)*pCounter >= (uint32_t)maxCount) { + if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { // The loop has matched the maximum permitted number of times. // Break out of here with no action. Matching will // continue with the following pattern. - U_ASSERT(*pCounter == maxCount || maxCount == -1); + U_ASSERT(*pCounter == maxCount); break; } @@ -5045,8 +5063,20 @@ GC_Done: fp->fPatIdx = opValue + 4; // Loop back. } else { // We do have the minimum number of matches. - // Fall into the following pattern, but first do - // a state save to the top of the loop, so that a failure + + // If there is no upper bound on the loop iterations, check that the input index + // is progressing, and stop the loop if it is not. + if (maxCount == -1) { + int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; + if (fp->fInputIdx == *pLastInputIdx) { + break; + } + *pLastInputIdx = fp->fInputIdx; + } + + // Loop Continuation: we will fall into the pattern following the loop + // (non-greedy, don't execute loop body first), but first do + // a state save to the top of the loop, so that a match failure // in the following pattern will try another iteration of the loop. fp = StateSave(fp, opValue + 4, status); } diff --git a/icu4c/source/test/intltest/regextst.cpp b/icu4c/source/test/intltest/regextst.cpp index 66a26fbef37..3c97af53d35 100644 --- a/icu4c/source/test/intltest/regextst.cpp +++ b/icu4c/source/test/intltest/regextst.cpp @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 2002-2012, International Business Machines Corporation and + * Copyright (c) 2002-2013, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ @@ -3581,6 +3581,9 @@ void RegexTest::regex_find(const UnicodeString &pattern, } } matcher->setTrace(FALSE); + if (U_FAILURE(status)) { + errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status)); + } // // Match up the groups from the find() with the groups from the tags @@ -3591,7 +3594,7 @@ void RegexTest::regex_find(const UnicodeString &pattern, // G option in test means that capture group data is not available in the // expected results, so the check needs to be suppressed. if (isMatch == FALSE && groupStarts.size() != 0) { - dataerrln("Error at line %d: Match expected, but none found.", line); + errln("Error at line %d: Match expected, but none found.", line); failed = TRUE; goto cleanupAndReturn; } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) { @@ -4682,13 +4685,14 @@ void RegexTest::PerlTestsUTF8() { // // Bug6149 Verify limits to heap expansion for backtrack stack. // Use this pattern, -// "(a?){1,}" -// The zero-length match will repeat forever. -// (That this goes into a loop is another bug) +// "(a?){1,8000000}" +// Note: was an unbounded upperbounds, but that now has loop-breaking enabled. +// This test is likely to be fragile, as further optimizations stop +// more cases of pointless looping in the match engine. // //--------------------------------------------------------------- void RegexTest::Bug6149() { - UnicodeString pattern("(a?){1,}"); + UnicodeString pattern("(a?){1,8000000}"); UnicodeString s("xyz"); uint32_t flags = 0; UErrorCode status = U_ZERO_ERROR; diff --git a/icu4c/source/test/testdata/regextst.txt b/icu4c/source/test/testdata/regextst.txt index 15424855038..5716ab54449 100644 --- a/icu4c/source/test/testdata/regextst.txt +++ b/icu4c/source/test/testdata/regextst.txt @@ -1146,6 +1146,18 @@ "(ab)?(?<=ab)cd|ef" i "<0><1>abcd" +# Bug 9719 Loop breaking on (zero length match){3,} (unlimited upper bound). +# + +"(?:abc){1,}abc" "<0>abcabcabcabcabc" +"(?:2*){2,}?a2\z" "<0>2a2" +"(?:2*){2,}?a2\z" "2a3" +"(?:x?+){3,}+yz" "w<0>yz" +"(2*){2,}?a2\\z" "2a3" +"(2*){2,}?a2\\z" "<0>2<1>a2\\z" +"(2*){2,}?a2\z" "<0>2<1>a2" + + # Bug 10024 # Incorrect (unbounded) longest match length with {1, 20} style quantifiers. # Unbounded match is disallowed in look-behind expressions. -- 2.40.0