From: allenwtsu <allenwtsu@google.com>
Date: Wed, 22 Dec 2021 15:50:44 +0000 (+0000)
Subject: ICU-21878 Sync icu4j's CjkBreakEngine to icu4c's
X-Git-Tag: cldr/2022-02-08~15
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=08c3f99c0882ea84aebadd0fdb73f4d92fad859e;p=icu

ICU-21878 Sync icu4j's CjkBreakEngine to icu4c's

See #1953
---

diff --git a/icu4c/source/common/dictbe.cpp b/icu4c/source/common/dictbe.cpp
index 36a35c411a9..6b6d4297ad4 100644
--- a/icu4c/source/common/dictbe.cpp
+++ b/icu4c/source/common/dictbe.cpp
@@ -1361,6 +1361,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
     // while reversing t_boundary and pushing values to foundBreaks.
     int32_t prevCPPos = -1;
     int32_t prevUTextPos = -1;
+    int correctedNumBreaks = 0;
     for (int32_t i = numBreaks-1; i >= 0; i--) {
         int32_t cpPos = t_boundary.elementAti(i);
         U_ASSERT(cpPos > prevCPPos);
@@ -1369,7 +1370,10 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
         if (utextPos > prevUTextPos) {
             // Boundaries are added to foundBreaks output in ascending order.
             U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);
-            foundBreaks.push(utextPos, status);
+            if (!(foundBreaks.contains(utextPos) || utextPos == rangeStart)) {
+                foundBreaks.push(utextPos, status);
+                correctedNumBreaks++;
+            }
         } else {
             // Normalization expanded the input text, the dictionary found a boundary
             // within the expansion, giving two boundaries with the same index in the
@@ -1381,9 +1385,14 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
     }
     (void)prevCPPos; // suppress compiler warnings about unused variable
 
+    if (!foundBreaks.isEmpty() && foundBreaks.peeki() == rangeEnd) {
+        foundBreaks.popi();
+        correctedNumBreaks--;
+    }
+
     // inString goes out of scope
     // inputMap goes out of scope
-    return numBreaks;
+    return correctedNumBreaks;
 }
 #endif
 
diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt
index 7fb30c9e8e3..1948360277d 100644
--- a/icu4c/source/test/testdata/rbbitst.txt
+++ b/icu4c/source/test/testdata/rbbitst.txt
@@ -796,6 +796,9 @@
 <word>
 <data>•ジョージア<400> •</data>
 
+<word>
+<data>•[<0>携帯<400>電話<400>]<0>お金<400>が<400>かかる<400>ん<400>です<400>。<0></data>
+
 # Test for #11723
 <word>
 <data>•アレルギー性<400>結膜炎<400></data>
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
index 7fb30c9e8e3..1948360277d 100644
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
@@ -796,6 +796,9 @@
 <word>
 <data>•ジョージア<400> •</data>
 
+<word>
+<data>•[<0>携帯<400>電話<400>]<0>お金<400>が<400>かかる<400>ん<400>です<400>。<0></data>
+
 # Test for #11723
 <word>
 <data>•アレルギー性<400>結膜炎<400></data>