ICU-13549 CjkBreakEngine::divideUpDictionaryRange, problems with supplemental characters.

author Andy Heninger <andy.heninger@gmail.com>

Sun, 18 Feb 2018 22:44:18 +0000 (22:44 +0000)

committer Andy Heninger <andy.heninger@gmail.com>

Sun, 18 Feb 2018 22:44:18 +0000 (22:44 +0000)
author Andy Heninger <andy.heninger@gmail.com>
Sun, 18 Feb 2018 22:44:18 +0000 (22:44 +0000)
committer Andy Heninger <andy.heninger@gmail.com>
Sun, 18 Feb 2018 22:44:18 +0000 (22:44 +0000)
diff --git a/icu4c/source/common/dictbe.cpp b/icu4c/source/common/dictbe.cpp

index 18fa188a7ed95d00939355afc86793c607bc1e08..dde8072966e41519e3c47553ac54eafcff528ac3 100644 (file)
--- a/icu4c/source/common/dictbe.cpp
+++ b/icu4c/source/common/dictbe.cpp
@@ -1324,8 +1324,8 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
              }
              if (katakanaRunLength < kMaxKatakanaGroupLength) {
                  uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
-                if (newSnlp < (uint32_t)bestSnlp.elementAti(j)) {
-                    bestSnlp.setElementAt(newSnlp, j);
+                if (newSnlp < (uint32_t)bestSnlp.elementAti(i+katakanaRunLength)) {
+                    bestSnlp.setElementAt(newSnlp, i+katakanaRunLength);
                      prev.setElementAt(i, i+katakanaRunLength);  // prev[j] = i;
                  }
              }
diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt

index 0d4c1633b62e47a311c5b7bc193102d79102fb32..761b3e01b5b8bb7bc4a97d3b49985991af17c405 100644 (file)
--- a/icu4c/source/test/testdata/rbbitst.txt
+++ b/icu4c/source/test/testdata/rbbitst.txt
@@ -38,17 +38,8 @@
  
  
  #   Temp debugging tests
-<locale en>
-<rules>
-$s0=[;,*];
-$s1=[a-z];
-$s2=[i-n];
-$s3=[x-z];
-!!forward;
-($s0 | '?')*
-($s1 | $s2 | $s3)*;
-</rules>
-<data>•hello• •</data>
+#
+
  
  ## FILTERED BREAK TESTS
  
@@ -327,6 +318,15 @@ $s3=[x-z];
  <data>•ロ<400>から<400>売却<400>完了<400>時<400>の<400>時価<400>が<400>提示<400>さ<400>れ<400>て<400>いる<400></data>
  <data>•\U00011700<200>ロ<400>から<400>売却<400>完了<400>時<400>の<400>時価<400>が<400>提示<400>さ<400>れ<400>て<400>いる<400></data>
  
+#
+# Ticket #13549
+#   CjkBreakEngine::divideUpDictionaryRange: assertion failure.
+#
+<locale en>
+<word>
+<data>•\U00020029<400>\u3300<400>\U0002C400<400></data>
+<data>•\uFAD7<400>\u331B<400>\u87DF<400>\u006D<200>\uFFFD•</data>
+
  #
  # What Is Unicode in Japanese
  # From http://unicode.org/standard/translations/japanese.html
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/CjkBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/text/CjkBreakEngine.java

index b2c4c61b7fb25e61f0d330537beeb9c7c248fa7c..0e21779bdd03baa2777590f3bf0ecafc8fbbe125 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/CjkBreakEngine.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/CjkBreakEngine.java
@@ -102,7 +102,7 @@ class CjkBreakEngine extends DictionaryBreakEngine {
          boolean isNormalized = Normalizer.quickCheck(prenormstr, Normalizer.NFKC) == Normalizer.YES ||
                                 Normalizer.isNormalized(prenormstr, Normalizer.NFKC, 0);
          CharacterIterator text;
-        int numChars = 0;
+        int numCodePts = 0;
          if (isNormalized) {
              text = new java.text.StringCharacterIterator(prenormstr);
              int index = 0;
@@ -110,8 +110,8 @@ class CjkBreakEngine extends DictionaryBreakEngine {
              while (index < prenormstr.length()) {
                  int codepoint = prenormstr.codePointAt(index);
                  index += Character.charCount(codepoint);
-                numChars++;
-                charPositions[numChars] = index;
+                numCodePts++;
+                charPositions[numCodePts] = index;
              }
          } else {
              String normStr = Normalizer.normalize(prenormstr, Normalizer.NFKC);
@@ -122,37 +122,43 @@ class CjkBreakEngine extends DictionaryBreakEngine {
              charPositions[0] = 0;
              while (index < normalizer.endIndex()) {
                  normalizer.next();
-                numChars++;
+                numCodePts++;
                  index = normalizer.getIndex();
-                charPositions[numChars] = index;
+                charPositions[numCodePts] = index;
              }
          }
  
          // From here on out, do the algorithm. Note that our indices
          // refer to indices within the normalized string.
-        int[] bestSnlp = new int[numChars + 1];
+        int[] bestSnlp = new int[numCodePts + 1];
          bestSnlp[0] = 0;
-        for (int i = 1; i <= numChars; i++) {
+        for (int i = 1; i <= numCodePts; i++) {
              bestSnlp[i] = kint32max;
          }
  
-        int[] prev = new int[numChars + 1];
-        for (int i = 0; i <= numChars; i++) {
+        int[] prev = new int[numCodePts + 1];
+        for (int i = 0; i <= numCodePts; i++) {
              prev[i] = -1;
          }
  
          final int maxWordSize = 20;
-        int values[] = new int[numChars];
-        int lengths[] = new int[numChars];
+        int values[] = new int[numCodePts];
+        int lengths[] = new int[numCodePts];
          // dynamic programming to find the best segmentation
+
+        // In outer loop, i  is the code point index,
+        //                ix is the corresponding code unit index.
+        //    They differ when the string contains supplementary characters.
+        int ix = 0;
+        text.setIndex(ix);
          boolean is_prev_katakana = false;
-        for (int i = 0; i < numChars; i++) {
-            text.setIndex(i);
+        for (int i = 0; i < numCodePts; i++, text.setIndex(ix), next32(text)) {
+            ix = text.getIndex();
              if (bestSnlp[i] == kint32max) {
                  continue;
              }
  
-            int maxSearchLength = (i + maxWordSize < numChars) ? maxWordSize : (numChars - i);
+            int maxSearchLength = (i + maxWordSize < numCodePts) ? maxWordSize : (numCodePts - i);
              int[] count_ = new int[1];
              fDictionary.matches(text, maxSearchLength, lengths, count_, maxSearchLength, values);
              int count = count_[0];
@@ -162,7 +168,7 @@ class CjkBreakEngine extends DictionaryBreakEngine {
              // with the highest value possible (i.e. the least likely to occur).
              // Exclude Korean characters from this treatment, as they should be
              // left together by default.
-            text.setIndex(i);  // fDictionary.matches() advances the text position; undo that.
+            text.setIndex(ix);  // fDictionary.matches() advances the text position; undo that.
              if ((count == 0 || lengths[0] != 1) && current32(text) != DONE32 && !fHangulWordSet.contains(current32(text))) {
                  values[count] = maxSnlp;
                  lengths[count] = 1;
@@ -186,7 +192,7 @@ class CjkBreakEngine extends DictionaryBreakEngine {
              if (!is_prev_katakana && is_katakana) {
                  int j = i + 1;
                  next32(text);
-                while (j < numChars && (j - i) < kMaxKatakanaGroupLength && isKatakana(current32(text))) {
+                while (j < numCodePts && (j - i) < kMaxKatakanaGroupLength && isKatakana(current32(text))) {
                      next32(text);
                      ++j;
                  }
@@ -202,13 +208,13 @@ class CjkBreakEngine extends DictionaryBreakEngine {
              is_prev_katakana = is_katakana;
          }
  
-        int t_boundary[] = new int[numChars + 1];
+        int t_boundary[] = new int[numCodePts + 1];
          int numBreaks = 0;
-        if (bestSnlp[numChars] == kint32max) {
-            t_boundary[numBreaks] = numChars;
+        if (bestSnlp[numCodePts] == kint32max) {
+            t_boundary[numBreaks] = numCodePts;
              numBreaks++;
          } else {
-            for (int i = numChars; i > 0; i = prev[i]) {
+            for (int i = numCodePts; i > 0; i = prev[i]) {
                  t_boundary[numBreaks] = i;
                  numBreaks++;
              }
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt

index 1450a98d7be848ece5a2626aedd51295f08cbb9e..761b3e01b5b8bb7bc4a97d3b49985991af17c405 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
@@ -38,19 +38,8 @@
  
  
  #   Temp debugging tests
-<locale en>
-<word>
-<data><0>コンピューター<400>は<400>、<0>本質<400>的<400>に<400>は<400>数字<400>しか<400>扱う<400>こと<400>が<400>でき<400>ま<400>せん<400>。<0>\
-コンピューター<400>は<400>、<0>文字<400>や<400>記号<400>など<400>の<400>それぞれに<400>番号<400>を<400>割り振る<400>こと<400>によって<400>扱える<400>\
-よう<400>にし<400>ます<400>。<0>ユニ<400>コード<400>が<400>出来る<400>まで<400>は<400>、<0>これらの<400>番号<400>を<400>割り振る<400>仕組み<400>が<400>\
-何<400>百<400>種類<400>も<400>存在<400>しま<400>した<400>。<0>どの<400>一つ<400>を<400>とっても<400>、<0>十分<400>な<400>文字<400>を<400>含<400>\
-んで<400>は<400>いま<400>せん<400>で<400>した<400>。<0>例えば<400>、<0>欧州<400>連合<400>一つ<400>を<400>見<400>て<400>も<400>、<0>その<400>\
-すべて<400>の<400>言語<400>を<400>カバー<400>する<400>ため<400>に<400>は<400>、<0>いくつか<400>の<400>異なる<400>符号<400>化<400>の<400>仕組み<400>\
-が<400>必要<400>で<400>した<400>。<0>英語<400>の<400>よう<400>な<400>一つ<400>の<400>言語<400>に<400>限<400>って<400>も<400>、<0>一つ<400>だけ<400>\
-の<400>符号<400>化<400>の<400>仕組み<400>では<400>、<0>一般<400>的<400>に<400>使<400>われる<400>すべて<400>の<400>文字<400>、<0>句読点<400>、<0>\
-。<0></data>
+#
  
-#<data><0>コンピューター<400>は<400>、<0>本質<400>的<400>に<400>は<400>数字<400>しか<400>扱う<400>こと<400>が<400>でき<400>ま<400>せん<400>。<0>\
  
  ## FILTERED BREAK TESTS
  
@@ -329,6 +318,15 @@
  <data>•ロ<400>から<400>売却<400>完了<400>時<400>の<400>時価<400>が<400>提示<400>さ<400>れ<400>て<400>いる<400></data>
  <data>•\U00011700<200>ロ<400>から<400>売却<400>完了<400>時<400>の<400>時価<400>が<400>提示<400>さ<400>れ<400>て<400>いる<400></data>
  
+#
+# Ticket #13549
+#   CjkBreakEngine::divideUpDictionaryRange: assertion failure.
+#
+<locale en>
+<word>
+<data>•\U00020029<400>\u3300<400>\U0002C400<400></data>
+<data>•\uFAD7<400>\u331B<400>\u87DF<400>\u006D<200>\uFFFD•</data>
+
  #
  # What Is Unicode in Japanese
  # From http://unicode.org/standard/translations/japanese.html
author	Andy Heninger <andy.heninger@gmail.com>
	Sun, 18 Feb 2018 22:44:18 +0000 (22:44 +0000)
committer	Andy Heninger <andy.heninger@gmail.com>
	Sun, 18 Feb 2018 22:44:18 +0000 (22:44 +0000)
icu4c/source/common/dictbe.cpp		patch \| blob \| history
icu4c/source/test/testdata/rbbitst.txt		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/text/CjkBreakEngine.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt		patch \| blob \| history