ICU-11049 regular expressions, use same logic in UText and (UChar *) code paths when...

author Andy Heninger <andy.heninger@gmail.com>

Thu, 14 Aug 2014 17:44:05 +0000 (17:44 +0000)

committer Andy Heninger <andy.heninger@gmail.com>

Thu, 14 Aug 2014 17:44:05 +0000 (17:44 +0000)
author Andy Heninger <andy.heninger@gmail.com>
Thu, 14 Aug 2014 17:44:05 +0000 (17:44 +0000)
committer Andy Heninger <andy.heninger@gmail.com>
Thu, 14 Aug 2014 17:44:05 +0000 (17:44 +0000)
diff --git a/icu4c/source/i18n/rematch.cpp b/icu4c/source/i18n/rematch.cpp

index d3b8f8344eb6f0a686cb0e0a75ecdeb7b3241b5c..ace985fa685557a6a5b513d63a61b08d8d0a7c84 100644 (file)
--- a/icu4c/source/i18n/rematch.cpp
+++ b/icu4c/source/i18n/rematch.cpp
@@ -640,9 +640,9 @@ UBool RegexMatcher::find() {
              return FALSE;
          }
      } else {
-        // For now, let the matcher discover that it can't match on its own
-        // We don't know how long the match len is in native characters
-        testStartLimit = fActiveLimit;
+        // We don't know exactly how long the minimum match length is in native characters.
+        // Treat anything > 0 as 1.
+        testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0);
      }
  
      UChar32  c;
@@ -693,17 +693,17 @@ UBool RegexMatcher::find() {
          {
              // Match may start on any char from a pre-computed set.
              U_ASSERT(fPattern->fMinMatchLen > 0);
-            int64_t pos;
              UTEXT_SETNATIVEINDEX(fInputText, startPos);
              for (;;) {
+                int64_t pos = startPos;
                  c = UTEXT_NEXT32(fInputText);
-                pos = UTEXT_GETNATIVEINDEX(fInputText);
+                startPos = UTEXT_GETNATIVEINDEX(fInputText);
                  // c will be -1 (U_SENTINEL) at end of text, in which case we
                  // skip this next block (so we don't have a negative array index)
                  // and handle end of text in the following block.
                  if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
                                (c>=256 && fPattern->fInitialChars->contains(c)))) {
-                    MatchAt(startPos, FALSE, fDeferredStatus);
+                    MatchAt(pos, FALSE, fDeferredStatus);
                      if (U_FAILURE(fDeferredStatus)) {
                          return FALSE;
                      }
@@ -712,12 +712,11 @@ UBool RegexMatcher::find() {
                      }
                      UTEXT_SETNATIVEINDEX(fInputText, pos);
                  }
-                if (startPos >= testStartLimit) {
+                if (startPos > testStartLimit) {
                      fMatch = FALSE;
                      fHitEnd = TRUE;
                      return FALSE;
                  }
-                startPos = pos;
                     if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
                      return FALSE;
              }
@@ -730,13 +729,13 @@ UBool RegexMatcher::find() {
              // Match starts on exactly one char.
              U_ASSERT(fPattern->fMinMatchLen > 0);
              UChar32 theChar = fPattern->fInitialChar;
-            int64_t pos;
              UTEXT_SETNATIVEINDEX(fInputText, startPos);
              for (;;) {
+                int64_t pos = startPos;
                  c = UTEXT_NEXT32(fInputText);
-                pos = UTEXT_GETNATIVEINDEX(fInputText);
+                startPos = UTEXT_GETNATIVEINDEX(fInputText);
                  if (c == theChar) {
-                    MatchAt(startPos, FALSE, fDeferredStatus);
+                    MatchAt(pos, FALSE, fDeferredStatus);
                      if (U_FAILURE(fDeferredStatus)) {
                          return FALSE;
                      }
@@ -745,12 +744,11 @@ UBool RegexMatcher::find() {
                      }
                      UTEXT_SETNATIVEINDEX(fInputText, pos);
                  }
-                if (startPos >= testStartLimit) {
+                if (startPos > testStartLimit) {
                      fMatch = FALSE;
                      fHitEnd = TRUE;
                      return FALSE;
                  }
-                startPos = pos;
                     if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
                      return FALSE;
             }
@@ -917,6 +915,7 @@ UBool RegexMatcher::findUsingChunk() {
      //   the minimum length match would extend past the end of the input.
      //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
      //          Be aware of possible overflows if making changes here.
+    //   Note:  a match can begin at inputBuf + testLen; it is an inclusive limit.
      int32_t testLen  = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
      if (startPos > testLen) {
          fMatch = FALSE;
@@ -1012,7 +1011,7 @@ UBool RegexMatcher::findUsingChunk() {
                      return TRUE;
                  }
              }
-            if (pos >= testLen) {
+            if (startPos > testLen) {
                  fMatch = FALSE;
                  fHitEnd = TRUE;
                  return FALSE;
@@ -1021,7 +1020,7 @@ UBool RegexMatcher::findUsingChunk() {
                  return FALSE;
          }
      }
-        U_ASSERT(FALSE);
+    U_ASSERT(FALSE);
  
      case START_LINE:
      {
diff --git a/icu4c/source/test/intltest/regextst.cpp b/icu4c/source/test/intltest/regextst.cpp

index 5b697c5d390e78d0dce230be93007fd7eb6f8b19..bc7e3afed2741623ddb0e67118d5e3e15f6cfcd2 100644 (file)
--- a/icu4c/source/test/intltest/regextst.cpp
+++ b/icu4c/source/test/intltest/regextst.cpp
@@ -5314,43 +5314,58 @@ void RegexTest::TestBug11049() {
      // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
      // detect the bad read.
  
-    UnicodeString patternString("A|B|C");
-    UnicodeString txtString = UnicodeString("a string \\ud800\\udc00").unescape();
-    UChar *exactBuffer = new UChar[txtString.length()];
+    TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__);
+    TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__);
+
+    // Test again with a pattern starting with a single character, 
+    // which takes a different code path than starting with an OR expression,
+    // but with similar logic.
+    TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
+    TestCase11049("C", "string matches at end C", TRUE, __LINE__);
+}
+
+// Run a single test case from TestBug11049(). Internal function.
+void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
      UErrorCode status = U_ZERO_ERROR;
-    txtString.extract(exactBuffer, txtString.length(), status);
-    UText *ut = utext_openUChars(NULL, exactBuffer, txtString.length(), &status);
+    UnicodeString patternString = UnicodeString(pattern).unescape();
+    LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
  
-    LocalPointer<RegexPattern> pattern(RegexPattern::compile(patternString, 0, status));
+    UnicodeString dataString = UnicodeString(data).unescape();
+    UChar *exactBuffer = new UChar[dataString.length()];
+    dataString.extract(exactBuffer, dataString.length(), status);
+    UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
+
+    LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
      REGEX_CHECK_STATUS;
-    LocalPointer<RegexMatcher> matcher(pattern->matcher(status));
      matcher->reset(ut);
-    REGEX_CHECK_STATUS;
      UBool result = matcher->find();
-    REGEX_ASSERT(result == FALSE);
-
-    // Verify that match starting on the last char in input will be found.
-    txtString = UnicodeString("string matches at end C");
-    matcher->reset(txtString);
-    result = matcher->find();
-    REGEX_ASSERT(result == TRUE);
+    if (result != expectMatch) {
+        errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
+              __FILE__, lineNumber, expectMatch, result, pattern, data);
+    }
  
-    // Put an unpaired surrogate at the end of the input text,
-    // let valgrind verify that find() doesn't look off the end.
-    txtString = UnicodeString("a string \\ud800").unescape();
-    delete [] exactBuffer;
-    exactBuffer = new UChar[txtString.length()];
-    txtString.extract(exactBuffer, txtString.length(), status);
-    utext_openUChars(ut, exactBuffer, txtString.length(), &status);
+    // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
+    //   off-by-one on find() with match at the last code point.
+    //   Size of the original char * data (invariant charset) will be >= the size of the equivalent UTF-8
+    //   because string.unescape() will only shrink it.
+    char * utf8Buffer = new char[uprv_strlen(data)+1];
+    u_strToUTF8(utf8Buffer, uprv_strlen(data)+1, NULL, dataString.getBuffer(), dataString.length(), &status);
+    REGEX_CHECK_STATUS;
+    ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
+    REGEX_CHECK_STATUS;
      matcher->reset(ut);
      result = matcher->find();
-    REGEX_ASSERT(result == FALSE);
-    REGEX_CHECK_STATUS;
+    if (result != expectMatch) {
+        errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
+              __FILE__, lineNumber, expectMatch, result, pattern, data);
+    }
+    delete [] utf8Buffer;
  
      utext_close(ut);
      delete [] exactBuffer;
  }
  
  
+
  #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
  
diff --git a/icu4c/source/test/intltest/regextst.h b/icu4c/source/test/intltest/regextst.h

index bb8777a338d243f5608e78a1214cbe2d379f30df..28e21216e4cf7f5dcd18d20f7705575b53830ec8 100644 (file)
--- a/icu4c/source/test/intltest/regextst.h
+++ b/icu4c/source/test/intltest/regextst.h
@@ -63,6 +63,8 @@ public:
      virtual UChar *ReadAndConvertFile(const char *fileName, int32_t &len, const char *charset, UErrorCode &status);
      virtual const char *getPath(char buffer[2048], const char *filename);
  
+    virtual void TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber);
+
      static const char* extractToAssertBuf(const UnicodeString& message);
      
  };
author	Andy Heninger <andy.heninger@gmail.com>
	Thu, 14 Aug 2014 17:44:05 +0000 (17:44 +0000)
committer	Andy Heninger <andy.heninger@gmail.com>
	Thu, 14 Aug 2014 17:44:05 +0000 (17:44 +0000)
icu4c/source/i18n/rematch.cpp		patch \| blob \| history
icu4c/source/test/intltest/regextst.cpp		patch \| blob \| history
icu4c/source/test/intltest/regextst.h		patch \| blob \| history