From: allenwtsu Date: Mon, 10 Jan 2022 14:07:13 +0000 (+0800) Subject: ICU-21699 Fix CjkBreakEngine performance issue X-Git-Tag: cldr/2022-02-08~13 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=06ef8867f35befee7340e35082fefc9d3561d230;p=icu ICU-21699 Fix CjkBreakEngine performance issue 1. vector.contains() performs a sequential comparison, which is O(n). When the vector is large, performance suffers noticeably. Remove this unnecessary vector.contains() check in C++. 2. In Java's CjkBreakEngine, replace "vector.contains()" with "if(pos > previous)" to handle duplicate breakpoint positions. This keeps the C++ and Java implementations in sync. Test: ant checkTest -Dtestclass='com.ibm.icu.dev.test.rbbi.RBBITest' (RBBITest#TestBreakAllChars() can generate duplicate positions for word breaks; it still passes with this change.) --- diff --git a/icu4c/source/common/dictbe.cpp b/icu4c/source/common/dictbe.cpp index 6b6d4297ad4..35d3cd48a7a 100644 --- a/icu4c/source/common/dictbe.cpp +++ b/icu4c/source/common/dictbe.cpp @@ -1370,7 +1370,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText, if (utextPos > prevUTextPos) { // Boundaries are added to foundBreaks output in ascending order. 
U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos); - if (!(foundBreaks.contains(utextPos) || utextPos == rangeStart)) { + if (utextPos != rangeStart) { foundBreaks.push(utextPos, status); correctedNumBreaks++; } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/CjkBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/CjkBreakEngine.java index a14c745e509..0404e031cc2 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/CjkBreakEngine.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/CjkBreakEngine.java @@ -209,12 +209,14 @@ public class CjkBreakEngine extends DictionaryBreakEngine { } int correctedNumBreaks = 0; + int previous = -1; for (int i = numBreaks - 1; i >= 0; i--) { int pos = charPositions[t_boundary[i]] + startPos; - if (!(foundBreaks.contains(pos) || pos == startPos)) { - foundBreaks.push(charPositions[t_boundary[i]] + startPos); + if (pos > previous && pos != startPos) { + foundBreaks.push(pos); correctedNumBreaks++; } + previous = pos; } if (!foundBreaks.isEmpty() && foundBreaks.peek() == endPos) {