1. vector.contains() performs a sequential comparison, which is O(n).
When the vector is large, this noticeably hurts performance.
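Remove the unnecessary vector.contains() check in C++; boundaries are added in ascending order and guarded by "utextPos > prevUTextPos", so a duplicate cannot be pushed.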
2. In Java's CjkBreakEngine, replace "vector.contains()" with "if (pos > previous)" to handle duplicate breakpoint positions.
This keeps the C++ and Java implementations in sync.
Test: ant checkTest -Dtestclass='com.ibm.icu.dev.test.rbbi.RBBITest'
(RBBITest#TestBreakAllChars() can generate duplicate positions for word breaks. It passes with this change.)
if (utextPos > prevUTextPos) {
// Boundaries are added to foundBreaks output in ascending order.
U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);
- if (!(foundBreaks.contains(utextPos) || utextPos == rangeStart)) {
+ if (utextPos != rangeStart) {
foundBreaks.push(utextPos, status);
correctedNumBreaks++;
}
}
int correctedNumBreaks = 0;
+ int previous = -1;
for (int i = numBreaks - 1; i >= 0; i--) {
int pos = charPositions[t_boundary[i]] + startPos;
- if (!(foundBreaks.contains(pos) || pos == startPos)) {
- foundBreaks.push(charPositions[t_boundary[i]] + startPos);
+ if (pos > previous && pos != startPos) {
+ foundBreaks.push(pos);
correctedNumBreaks++;
}
+ previous = pos;
}
if (!foundBreaks.isEmpty() && foundBreaks.peek() == endPos) {