granicus.if.org Git - icu/commitdiff
ICU-21699 Refactor codeunit handling
author: allenwtsu <allenwtsu@google.com>
Tue, 25 Jan 2022 11:27:47 +0000 (11:27 +0000)
committer: Frank Yung-Fong Tang <ftang@google.com>
Wed, 26 Jan 2022 23:41:34 +0000 (15:41 -0800)
See #1965

icu4c/source/common/dictbe.cpp
icu4c/source/test/testdata/rbbitst.txt
icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/CjkBreakEngine.java
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt

index c0af19ef171300601f5786641ebc45d658ea972e..4621bf4e24f303551c910bd577476b17f91b39c4 100644 (file)
@@ -1360,16 +1360,18 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
             int32_t prevIdx = numCodePts;
 
             int32_t codeUnitIdx = -1;
+            int32_t prevCodeUnitIdx = -1;
             int32_t length = -1;
             for (int32_t i = prev.elementAti(numCodePts); i > 0; i = prev.elementAti(i)) {
                 codeUnitIdx = inString.moveIndex32(0, i);
+                prevCodeUnitIdx = inString.moveIndex32(0, prevIdx);
                 // Calculate the length by using the code unit.
-                length = inString.moveIndex32(0, prevIdx) - codeUnitIdx;
+                length = prevCodeUnitIdx - codeUnitIdx;
                 prevIdx = i;
                 // Keep the breakpoint if the pattern is not in the fSkipSet and continuous Katakana
                 // characters don't occur.
                 if (!fSkipSet.containsKey(inString.tempSubString(codeUnitIdx, length))
-                    && (!isKatakana(inString.char32At(codeUnitIdx -1))
+                    && (!isKatakana(inString.char32At(inString.moveIndex32(codeUnitIdx, -1)))
                            || !isKatakana(inString.char32At(codeUnitIdx)))) {
                     t_boundary.addElement(i, status);
                     numBreaks++;
index 1be45e9f3c5d2e86cd4cec337ca3253b26a14d40..702bb47903841635ccc0bc284f7c4141a36e1648 100644 (file)
@@ -1898,7 +1898,9 @@ Bangkok)•</data>
 <data>•\uff3b\u643a\u5e2f•\u96fb\u8a71\uff3d•\u6b63\u3057\u3044•\u9078\u629e•</data>
 #純金製百人一首にサッカーボール -> 純金•製•百人一首に•サッカーボール
 <data>•\u7D14\u91D1•\u88FD•\u767E\u4EBA\u4E00\u9996\u306B•\u30B5\u30C3\u30AB\u30FC\u30DC\u30FC\u30EB•</data>
-
+#Kana supplement: 𛁈(U+1B048) -> \uD82C\uDC48, 𛀸(U+1B038) -> \uD82C\uDC38, 𛀙(U+1B019)-> \uD82C\uDC19</data>
+#𛁈る𛀸(しるこ)、あ𛀙よろし(あかよろし) -> 𛁈る𛀸•(しるこ)、•あ𛀙よろし•(あ•かよろし)
+<data>•\uD82C\uDC48\u308B\uD82C\uDC38•\uFF08\u3057\u308B\u3053\uFF09\u3001•\u3042\uD82C\uDC19\u3088\u308D\u3057•\uFF08\u3042•\u304B\u3088\u308D\u3057\uFF09•</data>
 
 ####################################################################################
 #
index 06b93683771304fd32537bf3baf900c4d9c117bd..166a4c752453e52a56349ecec6b22cf5c3f12dd9 100644 (file)
@@ -11,6 +11,7 @@ package com.ibm.icu.impl.breakiter;
 import static com.ibm.icu.impl.CharacterIteration.DONE32;
 import static com.ibm.icu.impl.CharacterIteration.current32;
 import static com.ibm.icu.impl.CharacterIteration.next32;
+import static com.ibm.icu.impl.CharacterIteration.previous32;
 
 import java.io.IOException;
 import java.text.CharacterIterator;
@@ -240,17 +241,18 @@ public class CjkBreakEngine extends DictionaryBreakEngine {
             t_boundary[numBreaks] = numCodePts;
             numBreaks++;
             int prevIdx = numCodePts;
-            int codeUnitIdx = 0, length = 0;
+            int codeUnitIdx = 0, prevCodeUnitIdx = 0, length = 0;
             for (int i = prev[numCodePts]; i > 0; i = prev[i]) {
                 codeUnitIdx = prenormstr.offsetByCodePoints(0, i);
-                length = prevIdx - i;
+                prevCodeUnitIdx = prenormstr.offsetByCodePoints(0, prevIdx);
+                length =  prevCodeUnitIdx - codeUnitIdx;
                 prevIdx = i;
                 String pattern = getPatternFromText(text, s, codeUnitIdx, length);
                 // Keep the breakpoint if the pattern is not in the fSkipSet and continuous Katakana
                 // characters don't occur.
-                text.setIndex(codeUnitIdx - 1);
+                text.setIndex(codeUnitIdx);
                 if (!fSkipSet.contains(pattern)
-                        && (!isKatakana(current32(text)) || !isKatakana(next32(text)))) {
+                        && (!isKatakana(current32(text)) || !isKatakana(previous32(text)))) {
                     t_boundary[numBreaks] = i;
                     numBreaks++;
                 }
@@ -308,11 +310,11 @@ public class CjkBreakEngine extends DictionaryBreakEngine {
     private String getPatternFromText(CharacterIterator text, StringBuffer sb, int start,
             int length) {
         sb.setLength(0);
-        if(length > 0) {
+        if (length > 0) {
             text.setIndex(start);
-            sb.appendCodePoint(current32(text));
-            for (int j = 1; j < length; j++) {
-                sb.appendCodePoint(next32(text));
+            sb.append(text.current());
+            for (int i = 1; i < length; i++) {
+                sb.append(text.next());
             }
         }
         return sb.toString();
index 346da988d7dd0b4baf4e78506f68f977013158c5..2a238a80f9c3f9ae31e35c5386a4f0b56ba680ba 100644 (file)
@@ -1898,6 +1898,9 @@ Bangkok)•</data>
 <data>•\uff3b\u643a\u5e2f•\u96fb\u8a71\uff3d•\u6b63\u3057\u3044•\u9078\u629e•</data>
 #純金製百人一首にサッカーボール -> 純金•製•百人一首に•サッカーボール
 <data>•\u7D14\u91D1•\u88FD•\u767E\u4EBA\u4E00\u9996\u306B•\u30B5\u30C3\u30AB\u30FC\u30DC\u30FC\u30EB•</data>
+#Kana supplement: 𛁈(U+1B048) -> \uD82C\uDC48, 𛀸(U+1B038) -> \uD82C\uDC38, 𛀙(U+1B019)-> \uD82C\uDC19</data>
+#𛁈る𛀸(しるこ)、あ𛀙よろし(あかよろし) -> 𛁈る𛀸•(しるこ)、•あ𛀙よろし•(あ•かよろし)
+<data>•\uD82C\uDC48\u308B\uD82C\uDC38•\uFF08\u3057\u308B\u3053\uFF09\u3001•\u3042\uD82C\uDC19\u3088\u308D\u3057•\uFF08\u3042•\u304B\u3088\u308D\u3057\uFF09•</data>
 
 
 ####################################################################################