From: allenwtsu <allenwtsu@google.com> Date: Wed, 22 Dec 2021 15:50:44 +0000 (+0000) Subject: ICU-21878 Sync icu4j's CjkBreakEngine to icu4c's X-Git-Tag: cldr/2022-02-08~15 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=08c3f99c0882ea84aebadd0fdb73f4d92fad859e;p=icu ICU-21878 Sync icu4j's CjkBreakEngine to icu4c's See #1953 --- diff --git a/icu4c/source/common/dictbe.cpp b/icu4c/source/common/dictbe.cpp index 36a35c411a9..6b6d4297ad4 100644 --- a/icu4c/source/common/dictbe.cpp +++ b/icu4c/source/common/dictbe.cpp @@ -1361,6 +1361,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText, // while reversing t_boundary and pushing values to foundBreaks. int32_t prevCPPos = -1; int32_t prevUTextPos = -1; + int correctedNumBreaks = 0; for (int32_t i = numBreaks-1; i >= 0; i--) { int32_t cpPos = t_boundary.elementAti(i); U_ASSERT(cpPos > prevCPPos); @@ -1369,7 +1370,10 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText, if (utextPos > prevUTextPos) { // Boundaries are added to foundBreaks output in ascending order. 
U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos); - foundBreaks.push(utextPos, status); + if (!(foundBreaks.contains(utextPos) || utextPos == rangeStart)) { + foundBreaks.push(utextPos, status); + correctedNumBreaks++; + } } else { // Normalization expanded the input text, the dictionary found a boundary // within the expansion, giving two boundaries with the same index in the @@ -1381,9 +1385,14 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText, } (void)prevCPPos; // suppress compiler warnings about unused variable + if (!foundBreaks.isEmpty() && foundBreaks.peeki() == rangeEnd) { + foundBreaks.popi(); + correctedNumBreaks--; + } + // inString goes out of scope // inputMap goes out of scope - return numBreaks; + return correctedNumBreaks; } #endif diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt index 7fb30c9e8e3..1948360277d 100644 --- a/icu4c/source/test/testdata/rbbitst.txt +++ b/icu4c/source/test/testdata/rbbitst.txt @@ -796,6 +796,9 @@ <word> <data>•ジョージア<400> •</data> +<word> +<data>•［<0>携帯<400>電話<400>］<0>ãé<400>ã<400>ããã<400>ã<400>ã§ã<400>ã<0></data> + # Test for #11723 <word> <data>•アレルギー性<400>結膜炎<400></data> diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt index 7fb30c9e8e3..1948360277d 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt @@ -796,6 +796,9 @@ <word> <data>•ジョージア<400> •</data> +<word> +<data>•［<0>携帯<400>電話<400>］<0>ãé<400>ã<400>ããã<400>ã<400>ã§ã<400>ã<0></data> + # Test for #11723 <word> <data>•アレルギー性<400>結膜炎<400></data>