//
// file: regexcmp.cpp
//
-// Copyright (C) 2002-2012 International Business Machines Corporation and others.
+// Copyright (C) 2002-2013 International Business Machines Corporation and others.
// All Rights Reserved.
//
// This file contains the ICU regular expression compiler, which is responsible
case URX_CTR_INIT:
case URX_CTR_INIT_NG:
+ // For Loops, recursively call this function on the pattern for the loop body,
+ // then multiply the result by the maximum loop count.
+ {
+ int32_t loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
+ if (loopEndLoc == loc+4) {
+ // Loop has an empty body. No effect on max match length.
+ // Continue processing with code after the loop end.
+ loc = loopEndLoc;
+ break;
+ }
+
+ int32_t maxLoopCount = fRXPat->fCompiledPat->elementAti(loc+3);
+ if (maxLoopCount == -1) {
+ // Unbounded Loop. No upper bound on match length.
+ currentLen = INT32_MAX;
+ break;
+ }
+
+ U_ASSERT(loopEndLoc >= loc+4);
+ int32_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Recursive call.
+ if (blockLen == INT32_MAX) {
+ currentLen = blockLen;
+ break;
+ }
+ currentLen += blockLen * maxLoopCount;
+ loc = loopEndLoc;
+ break;
+ }
+
case URX_CTR_LOOP:
case URX_CTR_LOOP_NG:
+ // These opcodes will be skipped over by code for URX_CTR_INIT.
+ // We shouldn't encounter them here.
+ U_ASSERT(FALSE);
+ break;
+
case URX_LOOP_SR_I:
case URX_LOOP_DOT_I:
case URX_LOOP_C:
// For anything to do with loops, make the match length unbounded.
- // Note: INIT instructions are multi-word. Can ignore because
- // INT32_MAX length will stop the per-instruction loop.
currentLen = INT32_MAX;
break;
/*
**************************************************************************
-* Copyright (C) 2002-2012 International Business Machines Corporation *
+* Copyright (C) 2002-2013 International Business Machines Corporation *
* and others. All rights reserved. *
**************************************************************************
*/
#ifdef REGEX_RUN_DEBUG
if (fTraceDebug) {
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
- printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp->fInputIdx,
+ printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
fPattern->dumpOp(fp->fPatIdx);
}
int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
U_ASSERT(minCount>=0);
U_ASSERT(maxCount>=minCount || maxCount==-1);
- U_ASSERT(loopLoc>fp->fPatIdx);
+ U_ASSERT(loopLoc>=fp->fPatIdx);
if (minCount == 0) {
fp = StateSave(fp, loopLoc+1, status);
fMatchStart = startIdx;
fMatchEnd = fp->fInputIdx;
if (fTraceDebug) {
- REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd));
+ REGEX_RUN_DEBUG_PRINTF(("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd));
}
}
else
#ifdef REGEX_RUN_DEBUG
if (fTraceDebug)
{
- printf("MatchAt(startIdx=%ld)\n", startIdx);
+ printf("MatchAt(startIdx=%d)\n", startIdx);
printf("Original Pattern: ");
UChar32 c = utext_next32From(fPattern->fPattern, 0);
while (c != U_SENTINEL) {
#ifdef REGEX_RUN_DEBUG
if (fTraceDebug) {
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
- printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp->fInputIdx,
+ printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
fPattern->dumpOp(fp->fPatIdx);
}
int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
U_ASSERT(minCount>=0);
U_ASSERT(maxCount>=minCount || maxCount==-1);
- U_ASSERT(loopLoc>fp->fPatIdx);
+ U_ASSERT(loopLoc>=fp->fPatIdx);
if (minCount == 0) {
fp = StateSave(fp, loopLoc+1, status);
fMatchStart = startIdx;
fMatchEnd = fp->fInputIdx;
if (fTraceDebug) {
- REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd));
+ REGEX_RUN_DEBUG_PRINTF(("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd));
}
}
else
-# Copyright (c) 2001-2012 International Business Machines
+# Copyright (c) 2001-2013 International Business Machines
# Corporation and others. All Rights Reserved.
#
# file:
"(ab)?(?<=ab)cd|ef" i "<0><1>ab</1>cd</0>"
+# Bug 10024
+# Incorrect (unbounded) longest match length with {1,20}-style quantifiers.
+# Unbounded match is disallowed in look-behind expressions.
+# Max match length is used to limit where to check for look-behind matches.
+
+"(?<=a{1,5})bc" "aaaa<0>bc</0>def"
+"(?<=(?:aa){3,20})bc" "aaaaaa<0>bc</0>def"
+"(?<!abc {1,100}|def {1,100}|ghi {1,100})jkl" "def jkl"
+"(?<!abc {1,100}|def {1,100}|ghi {1,100})jkl" "rst <0>jkl</0>"
+"(?<=a{11})bc" "aaaaaaaaaaa<0>bc</0>"
+"(?<=a{11})bc" "aaaaaaaaaabc"
+"(?<=a{1,})bc" E "aaaa<0>bc</0>def" # U_REGEX_LOOK_BEHIND_LIMIT error.
+"(?<=(?:){11})bc" "<0>bc</0>" # Empty (?:) expression.
+
+
# Random debugging, Temporary
#
#"^(?:a?b?)*$" "a--"