ICU-12918 Dictionary Break Iterator Assertion Failure

author Andy Heninger <andy.heninger@gmail.com>

Wed, 18 Jan 2017 19:42:33 +0000 (19:42 +0000)

committer Andy Heninger <andy.heninger@gmail.com>

Wed, 18 Jan 2017 19:42:33 +0000 (19:42 +0000)
author Andy Heninger <andy.heninger@gmail.com>
Wed, 18 Jan 2017 19:42:33 +0000 (19:42 +0000)
committer Andy Heninger <andy.heninger@gmail.com>
Wed, 18 Jan 2017 19:42:33 +0000 (19:42 +0000)
diff --git a/icu4c/source/common/dictbe.cpp b/icu4c/source/common/dictbe.cpp

index 924f09bd5dd991e31bc088539e64da26abb3d1b2..93cb57cd4dc2cd386d2be841810006d3ff8b444b 100644 (file)
--- a/icu4c/source/common/dictbe.cpp
+++ b/icu4c/source/common/dictbe.cpp
@@ -1385,12 +1385,25 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
      // Now that we're done, convert positions in t_boundary[] (indices in 
      // the normalized input string) back to indices in the original input UText
      // while reversing t_boundary and pushing values to foundBreaks.
+    int32_t prevCPPos = -1;
+    int32_t prevUTextPos = -1;
      for (int32_t i = numBreaks-1; i >= 0; i--) {
          int32_t cpPos = t_boundary.elementAti(i);
+        U_ASSERT(cpPos > prevCPPos);
          int32_t utextPos =  inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
-        // Boundaries are added to foundBreaks output in ascending order.
-        U_ASSERT(foundBreaks.size() == 0 ||foundBreaks.peeki() < utextPos);
-        foundBreaks.push(utextPos, status);
+        U_ASSERT(utextPos >= prevUTextPos);
+        if (utextPos > prevUTextPos) {
+            // Boundaries are added to foundBreaks output in ascending order.
+            U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);
+            foundBreaks.push(utextPos, status);
+        } else {
+            // Normalization expanded the input text, the dictionary found a boundary
+            // within the expansion, giving two boundaries with the same index in the
+            // original text. Ignore the second. See ticket #12918.
+            --numBreaks;
+        }
+        prevCPPos = cpPos;
+        prevUTextPos = utextPos;
      }
  
      // inString goes out of scope
diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp

index a3102c12d843826c279838445734ccd4c2fd2ce2..96a2b7a9edd7f91cd1d17854a0ad3bd17da4d746 100644 (file)
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@@ -103,6 +103,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
      TESTCASE_AUTO(TestBug5532);
      TESTCASE_AUTO(TestBug7547);
      TESTCASE_AUTO(TestBug12797);
+    TESTCASE_AUTO(TestBug12918);
      TESTCASE_AUTO_END;
  }
  
@@ -4652,6 +4653,32 @@ void RBBITest::TestBug12797() {
      }
  }
  
+// Regression test for ICU ticket #12918. Opens a word break iterator over a
+// short string, walks every boundary with ubrk_next(), and reports an error
+// if the boundaries are not strictly increasing.
+void RBBITest::TestBug12918() {
+    // Before the fix for ticket #12918 this input triggered an assertion failure in dictbe.cpp
+    const UChar *crasherString = u"\u3325\u4a16";
+    UErrorCode status = U_ZERO_ERROR;
+    UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status);
+    if (U_FAILURE(status)) {
+        errln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
+        return;
+    }
+    ubrk_first(iter);
+    int32_t pos = 0;
+    int32_t lastPos = -1;
+    while((pos = ubrk_next(iter)) != UBRK_DONE) {
+        if (pos <= lastPos) {
+            errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
+            break;
+        }
+        // Remember this boundary so the next one is compared against it;
+        // without this update the monotonicity check above could never fail.
+        lastPos = pos;
+    }
+    ubrk_close(iter);
+}
  
  //
  //  TestDebug    -  A place-holder test for debugging purposes.
diff --git a/icu4c/source/test/intltest/rbbitst.h b/icu4c/source/test/intltest/rbbitst.h

index 6b2c2f0eb72f36ac9dcc5dc599729f92d000ff3d..91c3d99f6334446b0b566f043b608fd64892626d 100644 (file)
--- a/icu4c/source/test/intltest/rbbitst.h
+++ b/icu4c/source/test/intltest/rbbitst.h
@@ -76,6 +76,7 @@ public:
      void TestBug9983();
      void TestBug7547();
      void TestBug12797();
+    void TestBug12918();
  
      void TestDebug();
      void TestProperties();
author	Andy Heninger <andy.heninger@gmail.com>
	Wed, 18 Jan 2017 19:42:33 +0000 (19:42 +0000)
committer	Andy Heninger <andy.heninger@gmail.com>
	Wed, 18 Jan 2017 19:42:33 +0000 (19:42 +0000)
icu4c/source/common/dictbe.cpp		patch \| blob \| history
icu4c/source/test/intltest/rbbitst.cpp		patch \| blob \| history
icu4c/source/test/intltest/rbbitst.h		patch \| blob \| history