]> granicus.if.org Git - icu/commitdiff
ICU-11554 Fix regex bug with look-behind matching & UTF-8 input.
authorAndy Heninger <andy.heninger@gmail.com>
Fri, 9 Oct 2015 20:01:46 +0000 (20:01 +0000)
committerAndy Heninger <andy.heninger@gmail.com>
Fri, 9 Oct 2015 20:01:46 +0000 (20:01 +0000)
X-SVN-Rev: 38056

icu4c/source/i18n/rematch.cpp
icu4c/source/i18n/repattrn.cpp
icu4c/source/test/intltest/regextst.cpp
icu4c/source/test/testdata/regextst.txt

index 4d4aa0534f557dc7c43d42c234c05fcd73d1e4cf..c741f81ea876d280efbcc51c9766c2d04b4958f2 100644 (file)
@@ -23,6 +23,7 @@
 #include "unicode/utf16.h"
 #include "uassert.h"
 #include "cmemory.h"
+#include "cstr.h"
 #include "uvector.h"
 #include "uvectr32.h"
 #include "uvectr64.h"
@@ -33,6 +34,7 @@
 
 // #include <malloc.h>        // Needed for heapcheck testing
 
+
 U_NAMESPACE_BEGIN
 
 // Default limit for the size of the back track stack, to avoid system
@@ -782,7 +784,7 @@ UBool RegexMatcher::find(UErrorCode &status) {
                     if (fMatch) {
                         return TRUE;
                     }
-                    UTEXT_SETNATIVEINDEX(fInputText, pos);
+                    UTEXT_SETNATIVEINDEX(fInputText, startPos);
                 }
                 if (startPos > testStartLimit) {
                     fMatch = FALSE;
@@ -2723,6 +2725,18 @@ inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatId
     return (REStackFrame *)newFP;
 }
 
+#if defined(REGEX_DEBUG)
+namespace {
+UnicodeString StringFromUText(UText *ut) {
+    UnicodeString result;
+    for (UChar32 c = utext_next32From(ut, 0); c != U_SENTINEL; c = UTEXT_NEXT32(ut)) {
+        result.append(c);
+    }
+    return result;
+}
+}
+#endif // REGEX_DEBUG
+
 
 //--------------------------------------------------------------------------------
 //
@@ -2742,32 +2756,10 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
     int32_t     opValue;               //    and the operand value.
 
 #ifdef REGEX_RUN_DEBUG
-    if (fTraceDebug)
-    {
+    if (fTraceDebug) {
         printf("MatchAt(startIdx=%ld)\n", startIdx);
-        printf("Original Pattern: ");
-        UChar32 c = utext_next32From(fPattern->fPattern, 0);
-        while (c != U_SENTINEL) {
-            if (c<32 || c>256) {
-                c = '.';
-            }
-            printf("%c", c);
-
-            c = UTEXT_NEXT32(fPattern->fPattern);
-        }
-        printf("\n");
-        printf("Input String: ");
-        c = utext_next32From(fInputText, 0);
-        while (c != U_SENTINEL) {
-            if (c<32 || c>256) {
-                c = '.';
-            }
-            printf("%c", c);
-
-            c = UTEXT_NEXT32(fInputText);
-        }
-        printf("\n");
-        printf("\n");
+        printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
+        printf("Input String:     \"%s\"\n\n", CStr(StringFromUText(fInputText))());
     }
 #endif
 
@@ -3936,28 +3928,38 @@ GC_Done:
                 //   of this op in the pattern.
                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
+                if (!UTEXT_USES_U16(fInputText)) {
+                    // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
+                    // The max length need not be exact; it just needs to be >= actual maximum.
+                    maxML *= 3;
+                }
                 U_ASSERT(minML <= maxML);
                 U_ASSERT(minML >= 0);
 
                 // Fetch (from data) the last input index where a match was attempted.
                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
-                int64_t  *lbStartIdx = &fData[opValue+2];
-                if (*lbStartIdx < 0) {
+                int64_t  &lbStartIdx = fData[opValue+2];
+                if (lbStartIdx < 0) {
                     // First time through loop.
-                    *lbStartIdx = fp->fInputIdx - minML;
+                    lbStartIdx = fp->fInputIdx - minML;
+                    if (lbStartIdx > 0) {
+                        // move index to a code point boundary, if it's not on one already.
+                        UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
+                        lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
+                    }
                 } else {
                     // 2nd through nth time through the loop.
                     // Back up start position for match by one.
-                    if (*lbStartIdx == 0) {
-                        (*lbStartIdx)--;
+                    if (lbStartIdx == 0) {
+                        (lbStartIdx)--;
                     } else {
-                        UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
+                        UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
                         (void)UTEXT_PREVIOUS32(fInputText);
-                        *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
+                        lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
                     }
                 }
 
-                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
+                if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
                     // We have tried all potential match starting points without
                     //  getting a match.  Backtrack out, and out of the
                     //   Look Behind altogether.
@@ -3972,7 +3974,7 @@ GC_Done:
                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
                 //      (successful match will fall off the end of the loop.)
                 fp = StateSave(fp, fp->fPatIdx-3, status);
-                fp->fInputIdx = *lbStartIdx;
+                fp->fInputIdx = lbStartIdx;
             }
             break;
 
@@ -4009,6 +4011,11 @@ GC_Done:
                 // Fetch the extra parameters of this op.
                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
+                if (!UTEXT_USES_U16(fInputText)) {
+                    // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
+                    // The max length need not be exact; it just needs to be >= actual maximum.
+                    maxML *= 3;
+                }
                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
                         continueLoc = URX_VAL(continueLoc);
                 U_ASSERT(minML <= maxML);
@@ -4017,23 +4024,28 @@ GC_Done:
 
                 // Fetch (from data) the last input index where a match was attempted.
                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
-                int64_t  *lbStartIdx = &fData[opValue+2];
-                if (*lbStartIdx < 0) {
+                int64_t  &lbStartIdx = fData[opValue+2];
+                if (lbStartIdx < 0) {
                     // First time through loop.
-                    *lbStartIdx = fp->fInputIdx - minML;
+                    lbStartIdx = fp->fInputIdx - minML;
+                    if (lbStartIdx > 0) {
+                        // move index to a code point boundary, if it's not on one already.
+                        UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
+                        lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
+                    }
                 } else {
                     // 2nd through nth time through the loop.
                     // Back up start position for match by one.
-                    if (*lbStartIdx == 0) {
-                        (*lbStartIdx)--;
+                    if (lbStartIdx == 0) {
+                        (lbStartIdx)--;
                     } else {
-                        UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
+                        UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
                         (void)UTEXT_PREVIOUS32(fInputText);
-                        *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
+                        lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
                     }
                 }
 
-                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
+                if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
                     // We have tried all potential match starting points without
                     //  getting a match, which means that the negative lookbehind as
                     //  a whole has succeeded.  Jump forward to the continue location
@@ -4048,7 +4060,7 @@ GC_Done:
                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
                 //      (successful match will cause a FAIL out of the loop altogether.)
                 fp = StateSave(fp, fp->fPatIdx-4, status);
-                fp->fInputIdx = *lbStartIdx;
+                fp->fInputIdx = lbStartIdx;
             }
             break;
 
@@ -4310,29 +4322,8 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
 #ifdef REGEX_RUN_DEBUG
     if (fTraceDebug) {
         printf("MatchAt(startIdx=%d)\n", startIdx);
-        printf("Original Pattern: ");
-        UChar32 c = utext_next32From(fPattern->fPattern, 0);
-        while (c != U_SENTINEL) {
-            if (c<32 || c>256) {
-                c = '.';
-            }
-            printf("%c", c);
-
-            c = UTEXT_NEXT32(fPattern->fPattern);
-        }
-        printf("\n");
-        printf("Input String: ");
-        c = utext_next32From(fInputText, 0);
-        while (c != U_SENTINEL) {
-            if (c<32 || c>256) {
-                c = '.';
-            }
-            printf("%c", c);
-
-            c = UTEXT_NEXT32(fInputText);
-        }
-        printf("\n");
-        printf("\n");
+        printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
+        printf("Input String:     \"%s\"\n\n", CStr(StringFromUText(fInputText))());
     }
 #endif
 
@@ -5450,21 +5441,24 @@ GC_Done:
 
                 // Fetch (from data) the last input index where a match was attempted.
                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
-                int64_t  *lbStartIdx = &fData[opValue+2];
-                if (*lbStartIdx < 0) {
+                int64_t  &lbStartIdx = fData[opValue+2];
+                if (lbStartIdx < 0) {
                     // First time through loop.
-                    *lbStartIdx = fp->fInputIdx - minML;
+                    lbStartIdx = fp->fInputIdx - minML;
+                    if (lbStartIdx > 0) {
+                        U16_SET_CP_START(inputBuf, 0, lbStartIdx);
+                    }
                 } else {
                     // 2nd through nth time through the loop.
                     // Back up start position for match by one.
-                    if (*lbStartIdx == 0) {
-                        (*lbStartIdx)--;
+                    if (lbStartIdx == 0) {
+                        lbStartIdx--;
                     } else {
-                        U16_BACK_1(inputBuf, 0, *lbStartIdx);
+                        U16_BACK_1(inputBuf, 0, lbStartIdx);
                     }
                 }
 
-                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
+                if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
                     // We have tried all potential match starting points without
                     //  getting a match.  Backtrack out, and out of the
                     //   Look Behind altogether.
@@ -5479,7 +5473,7 @@ GC_Done:
                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
                 //      (successful match will fall off the end of the loop.)
                 fp = StateSave(fp, fp->fPatIdx-3, status);
-                fp->fInputIdx =  *lbStartIdx;
+                fp->fInputIdx =  lbStartIdx;
             }
             break;
 
@@ -5524,21 +5518,24 @@ GC_Done:
 
                 // Fetch (from data) the last input index where a match was attempted.
                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
-                int64_t  *lbStartIdx = &fData[opValue+2];
-                if (*lbStartIdx < 0) {
+                int64_t  &lbStartIdx = fData[opValue+2];
+                if (lbStartIdx < 0) {
                     // First time through loop.
-                    *lbStartIdx = fp->fInputIdx - minML;
+                    lbStartIdx = fp->fInputIdx - minML;
+                    if (lbStartIdx > 0) {
+                        U16_SET_CP_START(inputBuf, 0, lbStartIdx);
+                    }
                 } else {
                     // 2nd through nth time through the loop.
                     // Back up start position for match by one.
-                    if (*lbStartIdx == 0) {
-                        (*lbStartIdx)--;   // Because U16_BACK is unsafe starting at 0.
+                    if (lbStartIdx == 0) {
+                        lbStartIdx--;   // Because U16_BACK is unsafe starting at 0.
                     } else {
-                        U16_BACK_1(inputBuf, 0, *lbStartIdx);
+                        U16_BACK_1(inputBuf, 0, lbStartIdx);
                     }
                 }
 
-                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
+                if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
                     // We have tried all potential match starting points without
                     //  getting a match, which means that the negative lookbehind as
                     //  a whole has succeeded.  Jump forward to the continue location
@@ -5553,7 +5550,7 @@ GC_Done:
                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
                 //      (successful match will cause a FAIL out of the loop altogether.)
                 fp = StateSave(fp, fp->fPatIdx-4, status);
-                fp->fInputIdx =  *lbStartIdx;
+                fp->fInputIdx =  lbStartIdx;
             }
             break;
 
index 58650d11374beb94d2e5978d8fb07e80818eca4d..9363e3bb357882b2cdaae56a7396c7ad73b5d2ae 100644 (file)
@@ -14,6 +14,7 @@
 
 #include "unicode/regex.h"
 #include "unicode/uclean.h"
+#include "cstr.h"
 #include "uassert.h"
 #include "uhash.h"
 #include "uvector.h"
@@ -675,7 +676,6 @@ int32_t  RegexPattern::split(UText *input,
 }
 
 
-
 //---------------------------------------------------------------------
 //
 //   dump    Output the compiled form of the pattern.
@@ -751,7 +751,11 @@ void   RegexPattern::dumpOp(int32_t index) const {
 
     case URX_ONECHAR:
     case URX_ONECHAR_I:
-        printf("%c", val<256?val:'?');
+        if (val < 0x20) {
+            printf("%#x", val);
+        } else {
+            printf("'%s'", CStr(UnicodeString(val))());
+        }
         break;
 
     case URX_STRING:
@@ -760,12 +764,8 @@ void   RegexPattern::dumpOp(int32_t index) const {
             int32_t lengthOp       = fCompiledPat->elementAti(index+1);
             U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN);
             int32_t length = URX_VAL(lengthOp);
-            int32_t i;
-            for (i=val; i<val+length; i++) {
-                UChar c = fLiteralText[i];
-                if (c < 32 || c >= 256) {c = '.';}
-                printf("%c", c);
-            }
+            UnicodeString str(fLiteralText, val, length);
+            printf("%s", CStr(str)());
         }
         break;
 
@@ -775,9 +775,7 @@ void   RegexPattern::dumpOp(int32_t index) const {
             UnicodeString s;
             UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val);
             set->toPattern(s, TRUE);
-            for (int32_t i=0; i<s.length(); i++) {
-                printf("%c", s.charAt(i));
-            }
+            printf("%s", CStr(s)());
         }
         break;
 
@@ -791,9 +789,7 @@ void   RegexPattern::dumpOp(int32_t index) const {
             }
             UnicodeSet *set = fStaticSets[val];
             set->toPattern(s, TRUE);
-            for (int32_t i=0; i<s.length(); i++) {
-                printf("%c", s.charAt(i));
-            }
+            printf("%s", CStr(s)());
         }
         break;
 
@@ -809,53 +805,27 @@ void   RegexPattern::dumpOp(int32_t index) const {
 
 void RegexPattern::dumpPattern() const {
 #if defined(REGEX_DEBUG)
-    // TODO: This function assumes an ASCII based charset.
     int      index;
-    int      i;
-
-    printf("Original Pattern:  ");
-    UChar32 c = utext_next32From(fPattern, 0);
-    while (c != U_SENTINEL) {
-        if (c<32 || c>256) {
-            c = '.';
-        }
-        printf("%c", c);
 
-        c = UTEXT_NEXT32(fPattern);
+    UnicodeString patStr;
+    for (UChar32 c = utext_next32From(fPattern, 0); c != U_SENTINEL; c = utext_next32(fPattern)) {
+        patStr.append(c);
     }
-    printf("\n");
+    printf("Original Pattern:  \"%s\"\n", CStr(patStr)());
     printf("   Min Match Length:  %d\n", fMinMatchLen);
     printf("   Match Start Type:  %s\n", START_OF_MATCH_STR(fStartType));
     if (fStartType == START_STRING) {
-        printf("    Initial match string: \"");
-        for (i=fInitialStringIdx; i<fInitialStringIdx+fInitialStringLen; i++) {
-            printf("%c", fLiteralText[i]);   // TODO:  non-printables, surrogates.
-        }
-        printf("\"\n");
-
+        UnicodeString initialString(fLiteralText,fInitialStringIdx, fInitialStringLen);
+        printf("   Initial match string: \"%s\"\n", CStr(initialString)());
     } else if (fStartType == START_SET) {
-        int32_t numSetChars = fInitialChars->size();
-        if (numSetChars > 20) {
-            numSetChars = 20;
-        }
-        printf("     Match First Chars : ");
-        for (i=0; i<numSetChars; i++) {
-            UChar32 c = fInitialChars->charAt(i);
-            if (0x20<c && c <0x7e) {
-                printf("%c ", c);
-            } else {
-                printf("%#x ", c);
-            }
-        }
-        if (numSetChars < fInitialChars->size()) {
-            printf(" ...");
-        }
-        printf("\n");
+        UnicodeString s;
+        fInitialChars->toPattern(s, TRUE);
+        printf("    Match First Chars: %s\n", CStr(s)());
 
     } else if (fStartType == START_CHAR) {
-        printf("    First char of Match : ");
-        if (0x20 < fInitialChar && fInitialChar<0x7e) {
-                printf("%c\n", fInitialChar);
+        printf("    First char of Match: ");
+        if (fInitialChar > 0x20) {
+                printf("'%s'\n", CStr(UnicodeString(fInitialChar))());
             } else {
                 printf("%#x\n", fInitialChar);
             }
@@ -869,10 +839,8 @@ void RegexPattern::dumpPattern() const {
         const UHashElement *el = NULL;
         while ((el = uhash_nextElement(fNamedCaptureMap, &pos))) {
             const UnicodeString *name = (const UnicodeString *)el->key.pointer;
-            char s[100];
-            name->extract(0, 99, s, sizeof(s), US_INV);  // capture group names are invariant.
             int32_t number = el->value.integer;
-            printf("   %d\t%s\n", number, s);
+            printf("   %d\t%s\n", number, CStr(*name)());
         }
     }
 
index bf13426041ee166bebc57a3d8ffc7207e22b49a1..c1a6a0a461e1526bad130b6fc5f64499b00abad5 100644 (file)
@@ -3583,7 +3583,7 @@ void RegexTest::regex_find(const UnicodeString &pattern,
 
         if (UTF8Matcher == NULL) {
             // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
-          logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
+            logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
             status = U_ZERO_ERROR;
         }
     }
@@ -3592,6 +3592,9 @@ void RegexTest::regex_find(const UnicodeString &pattern,
     //  Generate native indices for UTF8 versions of region and capture group info
     //
     if (UTF8Matcher != NULL) {
+        if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
+            UTF8Matcher->setTrace(TRUE);
+        }
         if (regionStart>=0)    (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
         if (regionEnd>=0)      (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
 
@@ -3671,6 +3674,9 @@ void RegexTest::regex_find(const UnicodeString &pattern,
         }
     }
     matcher->setTrace(FALSE);
+    if (UTF8Matcher) {
+        UTF8Matcher->setTrace(FALSE);
+    }
     if (U_FAILURE(status)) {
         errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
     }
@@ -3692,16 +3698,17 @@ void RegexTest::regex_find(const UnicodeString &pattern,
         failed = TRUE;
         goto cleanupAndReturn;
     }
+    if (isMatch && groupStarts.size() == 0) {
+        errln("Error at line %d: No match expected, but one found at position %d.", line, matcher->start(status));
+        failed = TRUE;
+    }
+    if (UTF8Matcher && isUTF8Match && groupStarts.size() == 0) {
+        errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line, UTF8Matcher->start(status));
+        failed = TRUE;
+    }
 
     if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
         // Only check for match / no match.  Don't check capture groups.
-        if (isMatch && groupStarts.size() == 0) {
-            errln("Error at line %d:  No match expected, but one found.", line);
-            failed = TRUE;
-        } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
-            errln("Error at line %d:  No match expected, but one found. (UTF8)", line);
-            failed = TRUE;
-        }
         goto cleanupAndReturn;
     }
 
index 9a7a6013284db7fcab338545b36f3557fd335a22..4cdca21b5accda7e33986207a327ead99947d632 100644 (file)
 "pre(.)post\1"          i       "pre\ud800post\ud800\udc00"         # case insensiteve backrefs take a different code path
 "pre(.)post\1"          i       "<0>pre<1>\ud800</1>post\ud800</0> fin"
 
+# Bug 11554
+#
+#    Maximum match length computation was assuming UTF-16.
+#    Used in look-behind matches to constrain how far back to look.
+
+"(?<=a\x{100000})spam"          "***a\x{100000}<0>spam</0>**"
+"(?<=aą)spam"                   "**aą<0>spam</0>**"
+"(?<=ąabc)spam"                 "**ąabc<0>spam</0>**"
+
+"(?<=a\x{100000})spam"          "***a\x{100001}spam**"
+"(?<=aą)spam"                   "**bąspam**"
+"(?<=ąabc)spam"                 "**ąabxspam**"
+
+# with negative look-behind
+
+"(?<!a\x{100000})spam"          "***a\x{100000}spam**"
+"(?<!aą)spam"                   "**aąspam**"
+"(?<!ąabc)spam"                 "**ąabcspam**"
+
+"(?<!a\x{100000})spam"          "***a\x{100001}<0>spam</0>**"
+"(?<!aą)spam"                   "**bą<0>spam</0>**"
+"(?<!ąabc)spam"                 "**ąabx<0>spam</0>**"
+
 #  Random debugging, Temporary
 #