granicus.if.org Git - icu/commitdiff
ICU-10024 fixes for Regexp, look-behind pattern fails to compile bug.
author: Andy Heninger <andy.heninger@gmail.com>
Tue, 18 Jun 2013 20:38:08 +0000 (20:38 +0000)
committer: Andy Heninger <andy.heninger@gmail.com>
Tue, 18 Jun 2013 20:38:08 +0000 (20:38 +0000)
X-SVN-Rev: 33835

icu4c/source/i18n/regexcmp.cpp
icu4c/source/i18n/rematch.cpp
icu4c/source/test/testdata/regextst.txt

index b17ef53c93d73df4603bf422b1a619a0c085c5b8..9133cdd968184dd34475d7b5d1cc15b758f6fc10 100644 (file)
@@ -1,7 +1,7 @@
 //
 //  file:  regexcmp.cpp
 //
-//  Copyright (C) 2002-2012 International Business Machines Corporation and others.
+//  Copyright (C) 2002-2013 International Business Machines Corporation and others.
 //  All Rights Reserved.
 //
 //  This file contains the ICU regular expression compiler, which is responsible
@@ -3335,14 +3335,46 @@ int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
 
         case URX_CTR_INIT:
         case URX_CTR_INIT_NG:
+            // For Loops, recursively call this function on the pattern for the loop body,
+            //   then multiply the result by the maximum loop count.
+            {
+                int32_t  loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
+                if (loopEndLoc == loc+4) {
+                    // Loop has an empty body. No effect on max match length.
+                    // Continue processing with code after the loop end.
+                    loc = loopEndLoc;
+                    break;
+                }
+                
+                int32_t maxLoopCount = fRXPat->fCompiledPat->elementAti(loc+3);
+                if (maxLoopCount == -1) {
+                    // Unbounded Loop. No upper bound on match length.
+                    currentLen = INT32_MAX;
+                    break;
+                }
+
+                U_ASSERT(loopEndLoc >= loc+4);
+                int32_t  blockLen = maxMatchLength(loc+4, loopEndLoc-1);  // Recursive call.
+                if (blockLen == INT32_MAX) {
+                    currentLen = blockLen;
+                    break;
+                }
+                currentLen += blockLen * maxLoopCount;
+                loc = loopEndLoc;
+                break;
+            }
+
         case URX_CTR_LOOP:
         case URX_CTR_LOOP_NG:
+            // These opcodes will be skipped over by code for URX_CTR_INIT.
+            // We shouldn't encounter them here.
+            U_ASSERT(FALSE);
+            break;
+
         case URX_LOOP_SR_I:
         case URX_LOOP_DOT_I:
         case URX_LOOP_C:
             // For anything to do with loops, make the match length unbounded.
-            //   Note:  INIT instructions are multi-word.  Can ignore because
-            //          INT32_MAX length will stop the per-instruction loop.
             currentLen = INT32_MAX;
             break;
 
index b416f16c811136b58f28147fe4e6f336cd647cde..944843e31d7714fe8ffcfd5b3a218130563a3770 100644 (file)
@@ -1,6 +1,6 @@
 /*
 **************************************************************************
-*   Copyright (C) 2002-2012 International Business Machines Corporation  *
+*   Copyright (C) 2002-2013 International Business Machines Corporation  *
 *   and others. All rights reserved.                                     *
 **************************************************************************
 */
@@ -2827,7 +2827,7 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
         #ifdef REGEX_RUN_DEBUG
         if (fTraceDebug) {
             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
-            printf("inputIdx=%d   inputChar=%x   sp=%3d   activeLimit=%d  ", fp->fInputIdx,
+            printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
                 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
             fPattern->dumpOp(fp->fPatIdx);
         }
@@ -3492,7 +3492,7 @@ GC_Done:
                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
                 U_ASSERT(minCount>=0);
                 U_ASSERT(maxCount>=minCount || maxCount==-1);
-                U_ASSERT(loopLoc>fp->fPatIdx);
+                U_ASSERT(loopLoc>=fp->fPatIdx);
 
                 if (minCount == 0) {
                     fp = StateSave(fp, loopLoc+1, status);
@@ -4211,7 +4211,7 @@ breakFromLoop:
         fMatchStart   = startIdx;
         fMatchEnd     = fp->fInputIdx;
         if (fTraceDebug) {
-            REGEX_RUN_DEBUG_PRINTF(("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd));
+            REGEX_RUN_DEBUG_PRINTF(("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd));
         }
     }
     else
@@ -4252,7 +4252,7 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
 #ifdef REGEX_RUN_DEBUG
     if (fTraceDebug)
     {
-        printf("MatchAt(startIdx=%ld)\n", startIdx);
+        printf("MatchAt(startIdx=%d)\n", startIdx);
         printf("Original Pattern: ");
         UChar32 c = utext_next32From(fPattern->fPattern, 0);
         while (c != U_SENTINEL) {
@@ -4321,7 +4321,7 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
 #ifdef REGEX_RUN_DEBUG
         if (fTraceDebug) {
             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
-            printf("inputIdx=%d   inputChar=%x   sp=%3d   activeLimit=%d  ", fp->fInputIdx,
+            printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
                    UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
             fPattern->dumpOp(fp->fPatIdx);
         }
@@ -4951,7 +4951,7 @@ GC_Done:
                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
                 U_ASSERT(minCount>=0);
                 U_ASSERT(maxCount>=minCount || maxCount==-1);
-                U_ASSERT(loopLoc>fp->fPatIdx);
+                U_ASSERT(loopLoc>=fp->fPatIdx);
                 
                 if (minCount == 0) {
                     fp = StateSave(fp, loopLoc+1, status);
@@ -5635,7 +5635,7 @@ breakFromLoop:
         fMatchStart   = startIdx;
         fMatchEnd     = fp->fInputIdx;
         if (fTraceDebug) {
-            REGEX_RUN_DEBUG_PRINTF(("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd));
+            REGEX_RUN_DEBUG_PRINTF(("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd));
         }
     }
     else
index 53bd73a7ef3f37435c5df6fd134fc71d923393db..154248550381f7e0fb231cf69034fa79dfb7685a 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (c) 2001-2012 International Business Machines
+# Copyright (c) 2001-2013 International Business Machines
 # Corporation and others. All Rights Reserved.
 #
 #  file:
 
 "(ab)?(?<=ab)cd|ef"             i  "<0><1>ab</1>cd</0>"
 
+# Bug 10024
+#   Incorrect (unbounded) longest match length with {1, 20} style quantifiers.
+#   Unbounded match is disallowed in look-behind expressions.
+#   Max match length is used to limit where to check for look-behind matches.
+
+"(?<=a{1,5})bc"                   "aaaa<0>bc</0>def"
+"(?<=(?:aa){3,20})bc"             "aaaaaa<0>bc</0>def"
+"(?<!abc {1,100}|def {1,100}|ghi {1,100})jkl"      "def jkl"
+"(?<!abc {1,100}|def {1,100}|ghi {1,100})jkl"      "rst <0>jkl</0>"
+"(?<=a{11})bc"                   "aaaaaaaaaaa<0>bc</0>"
+"(?<=a{11})bc"                   "aaaaaaaaaabc"
+"(?<=a{1,})bc"           E       "aaaa<0>bc</0>def"   # U_REGEX_LOOK_BEHIND_LIMIT error.
+"(?<=(?:){11})bc"                "<0>bc</0>"          # Empty (?:) expression.
+
+
 #  Random debugging, Temporary
 #
 #"^(?:a?b?)*$"                   "a--"