From: Andy Heninger Date: Tue, 18 Jun 2013 20:38:08 +0000 (+0000) Subject: ICU-10024 fixes for Regexp, look-behind pattern fails to compile bug. X-Git-Tag: milestone-59-0-1~2828 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=b129b4028015c5abb638a1c26a51e289eafbeed1;p=icu ICU-10024 fixes for Regexp, look-behind pattern fails to compile bug. X-SVN-Rev: 33835 --- diff --git a/icu4c/source/i18n/regexcmp.cpp b/icu4c/source/i18n/regexcmp.cpp index b17ef53c93d..9133cdd9681 100644 --- a/icu4c/source/i18n/regexcmp.cpp +++ b/icu4c/source/i18n/regexcmp.cpp @@ -1,7 +1,7 @@ // // file: regexcmp.cpp // -// Copyright (C) 2002-2012 International Business Machines Corporation and others. +// Copyright (C) 2002-2013 International Business Machines Corporation and others. // All Rights Reserved. // // This file contains the ICU regular expression compiler, which is responsible @@ -3335,14 +3335,46 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) { case URX_CTR_INIT: case URX_CTR_INIT_NG: + // For Loops, recursively call this function on the pattern for the loop body, + // then multiply the result by the maximum loop count. + { + int32_t loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1)); + if (loopEndLoc == loc+4) { + // Loop has an empty body. No effect on max match length. + // Continue processing with code after the loop end. + loc = loopEndLoc; + break; + } + + int32_t maxLoopCount = fRXPat->fCompiledPat->elementAti(loc+3); + if (maxLoopCount == -1) { + // Unbounded Loop. No upper bound on match length. + currentLen = INT32_MAX; + break; + } + + U_ASSERT(loopEndLoc >= loc+4); + int32_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Recursive call. + if (blockLen == INT32_MAX) { + currentLen = blockLen; + break; + } + currentLen += blockLen * maxLoopCount; + loc = loopEndLoc; + break; + } + case URX_CTR_LOOP: case URX_CTR_LOOP_NG: + // These opcodes will be skipped over by code for URX_CTR_INIT. 
+ // We shouldn't encounter them here. + U_ASSERT(FALSE); + break; + case URX_LOOP_SR_I: case URX_LOOP_DOT_I: case URX_LOOP_C: // For anything to do with loops, make the match length unbounded. - // Note: INIT instructions are multi-word. Can ignore because - // INT32_MAX length will stop the per-instruction loop. currentLen = INT32_MAX; break; diff --git a/icu4c/source/i18n/rematch.cpp b/icu4c/source/i18n/rematch.cpp index b416f16c811..944843e31d7 100644 --- a/icu4c/source/i18n/rematch.cpp +++ b/icu4c/source/i18n/rematch.cpp @@ -1,6 +1,6 @@ /* ************************************************************************** -* Copyright (C) 2002-2012 International Business Machines Corporation * +* Copyright (C) 2002-2013 International Business Machines Corporation * * and others. All rights reserved. * ************************************************************************** */ @@ -2827,7 +2827,7 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) { #ifdef REGEX_RUN_DEBUG if (fTraceDebug) { UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); - printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp->fInputIdx, + printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx, UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); fPattern->dumpOp(fp->fPatIdx); } @@ -3492,7 +3492,7 @@ GC_Done: int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; U_ASSERT(minCount>=0); U_ASSERT(maxCount>=minCount || maxCount==-1); - U_ASSERT(loopLoc>fp->fPatIdx); + U_ASSERT(loopLoc>=fp->fPatIdx); if (minCount == 0) { fp = StateSave(fp, loopLoc+1, status); @@ -4211,7 +4211,7 @@ breakFromLoop: fMatchStart = startIdx; fMatchEnd = fp->fInputIdx; if (fTraceDebug) { - REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd)); + REGEX_RUN_DEBUG_PRINTF(("Match. 
start=%ld end=%ld\n\n", fMatchStart, fMatchEnd)); } } else @@ -4252,7 +4252,7 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu #ifdef REGEX_RUN_DEBUG if (fTraceDebug) { - printf("MatchAt(startIdx=%ld)\n", startIdx); + printf("MatchAt(startIdx=%d)\n", startIdx); printf("Original Pattern: "); UChar32 c = utext_next32From(fPattern->fPattern, 0); while (c != U_SENTINEL) { @@ -4321,7 +4321,7 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu #ifdef REGEX_RUN_DEBUG if (fTraceDebug) { UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); - printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp->fInputIdx, + printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx, UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); fPattern->dumpOp(fp->fPatIdx); } @@ -4951,7 +4951,7 @@ GC_Done: int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; U_ASSERT(minCount>=0); U_ASSERT(maxCount>=minCount || maxCount==-1); - U_ASSERT(loopLoc>fp->fPatIdx); + U_ASSERT(loopLoc>=fp->fPatIdx); if (minCount == 0) { fp = StateSave(fp, loopLoc+1, status); @@ -5635,7 +5635,7 @@ breakFromLoop: fMatchStart = startIdx; fMatchEnd = fp->fInputIdx; if (fTraceDebug) { - REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd)); + REGEX_RUN_DEBUG_PRINTF(("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd)); } } else diff --git a/icu4c/source/test/testdata/regextst.txt b/icu4c/source/test/testdata/regextst.txt index 53bd73a7ef3..15424855038 100644 --- a/icu4c/source/test/testdata/regextst.txt +++ b/icu4c/source/test/testdata/regextst.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2001-2012 International Business Machines +# Copyright (c) 2001-2013 International Business Machines # Corporation and others. All Rights Reserved. # # file: @@ -1146,6 +1146,21 @@ "(ab)?(?<=ab)cd|ef" i "<0><1>abcd" +# Bug 10024 +# Incorrect (unbounded) longest match length with {1, 20} style quantifiers. 
+# Unbounded match is disallowed in look-behind expressions. +# Max match length is used to limit where to check for look-behind matches. + +"(?<=a{1,5})bc" "aaaa<0>bc</0>def" +"(?<=(?:aa){3,20})bc" "aaaaaa<0>bc</0>def" +"(?<!abc {1,100}|def {1,100}|ghi {1,100})jkl" "def jkl" +"(?<!abc {1,100}|def {1,100}|ghi {1,100})jkl" "rst <0>jkl</0>" +"(?<=a{11})bc" "aaaaaaaaaaa<0>bc</0>" +"(?<=a{11})bc" "aaaaaaaaaabc" +"(?<=a{1,})bc" E "aaaa<0>bc</0>def" # U_REGEX_LOOK_BEHIND_LIMIT error. +"(?<=(?:){11})bc" "<0>bc</0>" # Empty (?:) expression. + + # Random debugging, Temporary # #"^(?:a?b?)*$" "a--"