granicus.if.org Git - icu/commitdiff
ICU-21878 Sync icu4j's CjkBreakEngine to icu4c's
author allenwtsu <allenwtsu@google.com>
Wed, 22 Dec 2021 15:50:44 +0000 (15:50 +0000)
committer Frank Yung-Fong Tang <ftang@google.com>
Thu, 30 Dec 2021 22:47:37 +0000 (14:47 -0800)
See #1953

icu4c/source/common/dictbe.cpp
icu4c/source/test/testdata/rbbitst.txt
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt

index 36a35c411a94743d9957f5fd71cd6cffa012ebbe..6b6d4297ad4e88011f6441c8e3e9994a15b87023 100644 (file)
@@ -1361,6 +1361,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
     // while reversing t_boundary and pushing values to foundBreaks.
     int32_t prevCPPos = -1;
     int32_t prevUTextPos = -1;
+    int correctedNumBreaks = 0;
     for (int32_t i = numBreaks-1; i >= 0; i--) {
         int32_t cpPos = t_boundary.elementAti(i);
         U_ASSERT(cpPos > prevCPPos);
@@ -1369,7 +1370,10 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
         if (utextPos > prevUTextPos) {
             // Boundaries are added to foundBreaks output in ascending order.
             U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);
-            foundBreaks.push(utextPos, status);
+            if (!(foundBreaks.contains(utextPos) || utextPos == rangeStart)) {
+                foundBreaks.push(utextPos, status);
+                correctedNumBreaks++;
+            }
         } else {
             // Normalization expanded the input text, the dictionary found a boundary
             // within the expansion, giving two boundaries with the same index in the
@@ -1381,9 +1385,14 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
     }
     (void)prevCPPos; // suppress compiler warnings about unused variable
 
+    if (!foundBreaks.isEmpty() && foundBreaks.peeki() == rangeEnd) {
+        foundBreaks.popi();
+        correctedNumBreaks--;
+    }
+
     // inString goes out of scope
     // inputMap goes out of scope
-    return numBreaks;
+    return correctedNumBreaks;
 }
 #endif
 
index 7fb30c9e8e3561ed1eaa54c4322e384b6beba129..1948360277d03048c7bb6405c8121ce76dcf8cb9 100644 (file)
 <word>
 <data>•ジョージア<400> •</data>
 
+<word>
+<data>•[<0>携帯<400>電話<400>]<0>お金<400>が<400>かかる<400>ん<400>です<400>。<0></data>
+
 # Test for #11723
 <word>
 <data>•アレルギー性<400>結膜炎<400></data>
index 7fb30c9e8e3561ed1eaa54c4322e384b6beba129..1948360277d03048c7bb6405c8121ce76dcf8cb9 100644 (file)
 <word>
 <data>•ジョージア<400> •</data>
 
+<word>
+<data>•[<0>携帯<400>電話<400>]<0>お金<400>が<400>かかる<400>ん<400>です<400>。<0></data>
+
 # Test for #11723
 <word>
 <data>•アレルギー性<400>結膜炎<400></data>