}
U_ASSERT(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);
- // Find a boundary somewhere in the vicinity of the requested position.
- // Depending on the safe rules and the text data, it could be either before, at, or after
- // the requested position.
-
+ // Add boundaries to the cache near the specified position.
+ // The given position need not be a boundary itself.
+ // The input position must be within the range of the text, and
+ // on a code point boundary.
+ // If the requested position is a break boundary, leave the iteration
+ // position on it.
+ // If the requested position is not a boundary, leave the iteration
+ // position on the preceding boundary and include both the
+ // preceding and following boundaries in the cache.
+ // Additional boundaries, either preceding or following, may be added
+ // to the cache as a side effect.
// If the requested position is not near already cached positions, clear the existing cache,
// find a near-by boundary and begin new cache contents there.
- if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) {
- int32_t aBoundary = 0;
- int32_t ruleStatusIndex = 0;
- if (position > 20) {
- int32_t backupPos = fBI->handleSafePrevious(position);
-
- if (backupPos > 0) {
- // Advance to the boundary following the backup position.
- // There is a complication: the safe reverse rules identify pairs of code points
- // that are safe. If advancing from the safe point moves forwards by less than
- // two code points, we need to advance one more time to ensure that the boundary
- // is good, including a correct rules status value.
- //
- fBI->fPosition = backupPos;
- aBoundary = fBI->handleNext();
- if (aBoundary <= backupPos + 4) {
- // +4 is a quick test for possibly having advanced only one codepoint.
- // Four being the length of the longest potential code point, a supplementary in UTF-8
- utext_setNativeIndex(&fBI->fText, aBoundary);
- if (backupPos == utext_getPreviousNativeIndex(&fBI->fText)) {
- // The initial handleNext() only advanced by a single code point. Go again.
- aBoundary = fBI->handleNext(); // Safe rules identify safe pairs.
- }
+ // Threshold for a text position to be considered near to existing cache contents.
+ // TODO: See issue ICU-22024 "perf tuning of Cache needed."
+ // This value is subject to change. See the ticket for more details.
+ static constexpr int32_t CACHE_NEAR = 15;
+
+ int32_t aBoundary = -1;
+ int32_t ruleStatusIndex = 0;
+ bool retainCache = false;
+ if ((position > fBoundaries[fStartBufIdx] - CACHE_NEAR) && position < (fBoundaries[fEndBufIdx] + CACHE_NEAR)) {
+ // Requested position is near the existing cache. Retain it.
+ retainCache = true;
+ } else if (position <= CACHE_NEAR) {
+ // Requested position is near the start of the text. Fill cache from start, skipping
+ // the need to find a safe point.
+ retainCache = false;
+ aBoundary = 0;
+ } else {
+ // Requested position is not near the existing cache.
+ // Find a safe point to refill the cache from.
+ int32_t backupPos = fBI->handleSafePrevious(position);
+
+ if (fBoundaries[fEndBufIdx] < position && fBoundaries[fEndBufIdx] >= (backupPos - CACHE_NEAR)) {
+ // The requested position is beyond the end of the existing cache, but the
+ // reverse rules produced a position near or before the cached region.
+ // Retain the existing cache, and fill from the end of it.
+ retainCache = true;
+ } else if (backupPos < CACHE_NEAR) {
+ // The safe reverse rules moved us to near the start of text.
+ // Take that (index 0) as the backup boundary, avoiding the complication
+ // (in the following block) of moving forward from the safe point to a known boundary.
+ //
+ // Retain the cache if it begins not too far from the requested position.
+ aBoundary = 0;
+ retainCache = (fBoundaries[fStartBufIdx] <= (position + CACHE_NEAR));
+ } else {
+ // The safe reverse rules produced a position that is neither near the existing
+ // cache, nor near the start of text.
+ // Advance to the boundary following.
+ // There is a complication: the safe reverse rules identify pairs of code points
+ // that are safe. If advancing from the safe point moves forwards by less than
+ // two code points, we need to advance one more time to ensure that the boundary
+ // is good, including a correct rules status value.
+ retainCache = false;
+ fBI->fPosition = backupPos;
+ aBoundary = fBI->handleNext();
+ if (aBoundary != UBRK_DONE && aBoundary <= backupPos + 4) {
+ // +4 is a quick test for possibly having advanced only one codepoint.
+ // Four being the length of the longest potential code point, a supplementary in UTF-8
+ utext_setNativeIndex(&fBI->fText, aBoundary);
+ if (backupPos == utext_getPreviousNativeIndex(&fBI->fText)) {
+ // The initial handleNext() only advanced by a single code point. Go again.
+ aBoundary = fBI->handleNext(); // Safe rules identify safe pairs.
}
- ruleStatusIndex = fBI->fRuleStatusIndex;
}
+ if (aBoundary == UBRK_DONE) {
+ // Note (Andy Heninger): I don't think this condition can occur, but it's hard
+ // to prove that it can't. We ran off the end of the string looking a boundary
+ // following a safe point; choose the end of the string as that boundary.
+ aBoundary = utext_nativeLength(&fBI->fText);
+ }
+ ruleStatusIndex = fBI->fRuleStatusIndex;
}
+ }
+
+ if (!retainCache) {
+ U_ASSERT(aBoundary != -1);
reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point.
}
TESTCASE_AUTO(TestUnpairedSurrogate);
TESTCASE_AUTO(TestLSTMThai);
TESTCASE_AUTO(TestLSTMBurmese);
+ TESTCASE_AUTO(TestRandomAccess);
#if U_ENABLE_TRACING
TESTCASE_AUTO(TestTraceCreateCharacter);
runLSTMTestFromFile("Burmese_graphclust_model5_heavy_Test.txt", USCRIPT_MYANMAR);
}
+
+// Test preceding(index) and following(index), with semi-random indexes.
+// The random indexes are produced in clusters that are relatively closely spaced,
+// to increase the occurrences of hits to the internal break cache.
+
+void RBBITest::TestRandomAccess() {
+ static constexpr int32_t CACHE_SIZE = 128;
+
+ UnicodeString testData;
+ for (int i=0; i<CACHE_SIZE*2; ++i) {
+ testData.append(u"aaaa\n");
+ }
+
+ UErrorCode status = U_ZERO_ERROR;
+ LocalPointer<RuleBasedBreakIterator> bi(
+ (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status),
+ status);
+ if (!assertSuccess(WHERE, status)) { return; };
+
+ bi->setText(testData);
+
+ auto expectedPreceding = [](int from) {
+ if (from == 0) {return UBRK_DONE;}
+ if (from % 5 == 0) {return from - 5;}
+ return from - (from % 5);
+ };
+
+ auto expectedFollow = [testData](int from) {
+ if (from >= testData.length()) {return UBRK_DONE;}
+ if (from % 5 == 0) {return from + 5;}
+ return from + (5 - (from % 5));
+ };
+
+ auto randomStringIndex = [testData]() {
+ static icu_rand randomGenerator; // produces random uint32_t values.
+ static int lastNum;
+ static int clusterCount;
+ static constexpr int CLUSTER_SIZE = 100;
+ static constexpr int CLUSTER_LENGTH = 10;
+
+ if (clusterCount < CLUSTER_LENGTH) {
+ ++clusterCount;
+ lastNum += (randomGenerator() % CLUSTER_SIZE);
+ lastNum -= CLUSTER_SIZE / 2;
+ lastNum = std::max(0, lastNum);
+ // Deliberately test indexes > testData.length.
+ lastNum = std::min(testData.length() + 5, lastNum);
+ } else {
+ clusterCount = 0;
+ lastNum = randomGenerator() % testData.length();
+ }
+ return lastNum;
+ };
+
+ for (int i=0; i<5000; ++i) {
+ int idx = randomStringIndex();
+ assertEquals(WHERE, expectedFollow(idx), bi->following(idx));
+ idx = randomStringIndex();
+ assertEquals(WHERE, expectedPreceding(idx), bi->preceding(idx));
+ }
+}
+
#endif // #if !UCONFIG_NO_BREAK_ITERATION
void TestBug13590();
void TestLSTMThai();
void TestLSTMBurmese();
+ void TestRandomAccess();
#if U_ENABLE_TRACING
void TestTraceCreateCharacter();
boolean populateNear(int position) {
assert(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);
- // Find a boundary somewhere in the vicinity of the requested position.
- // Depending on the safe rules and the text data, it could be either before, at, or after
- // the requested position.
-
+ // Add boundaries to the cache near the specified position.
+ // The given position need not be a boundary itself.
+ // The input position must be within the range of the text, and
+ // on a code point boundary.
+ // If the requested position is a break boundary, leave the iteration
+ // position on it.
+ // If the requested position is not a boundary, leave the iteration
+ // position on the preceding boundary and include both the
+ // preceding and following boundaries in the cache.
+ // Additional boundaries, either preceding or following, may be added
+ // to the cache as a side effect.
// If the requested position is not near already cached positions, clear the existing cache,
// find a near-by boundary and begin new cache contents there.
- if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) {
- int aBoundary = fText.getBeginIndex();
- int ruleStatusIndex = 0;
- if (position > aBoundary + 20) {
- int backupPos = handleSafePrevious(position);
- if (backupPos > aBoundary) {
- // Advance to the boundary following the backup position.
- // There is a complication: the safe reverse rules identify pairs of code points
- // that are safe. If advancing from the safe point moves forwards by less than
- // two code points, we need to advance one more time to ensure that the boundary
- // is good, including a correct rules status value.
- //
- fPosition = backupPos;
+ // Threshold for a text position to be considered near to existing cache contents.
+ // TODO: See issue ICU-22024 "perf tuning of Cache needed."
+ // This value is subject to change. See the ticket for more details.
+ final int CACHE_NEAR = 15;
+
+ int startOfText = fText.getBeginIndex();
+ int aBoundary = -1;
+ int ruleStatusIndex = 0;
+ boolean retainCache = false;
+ if ((position > fBoundaries[fStartBufIdx] - CACHE_NEAR) && position < (fBoundaries[fEndBufIdx] + CACHE_NEAR)) {
+ // Requested position is near the existing cache. Retain it.
+ retainCache = true;
+ } else if (position <= startOfText + CACHE_NEAR) {
+ // Requested position is near the start of the text. Fill cache from start, skipping
+ // the need to find a safe point.
+ retainCache = false;
+ aBoundary = startOfText;
+ } else {
+ // Requested position is not near the existing cache.
+ // Find a safe point to refill the cache from.
+ int backupPos = handleSafePrevious(position);
+
+ if (fBoundaries[fEndBufIdx] < position && fBoundaries[fEndBufIdx] >= (backupPos - CACHE_NEAR)) {
+ // The requested position is beyond the end of the existing cache, but the
+ // reverse rules produced a position near or before the cached region.
+ // Retain the existing cache, and fill from the end of it.
+ retainCache = true;
+ } else if (backupPos < startOfText + CACHE_NEAR) {
+ // The safe reverse rules moved us to near the start of text.
+ // Take that (index 0) as the backup boundary, avoiding the complication
+ // (in the following block) of moving forward from the safe point to a known boundary.
+ //
+ // Retain the cache if it begins not too far from the requested position.
+ aBoundary = startOfText;
+ retainCache = (fBoundaries[fStartBufIdx] <= (position + CACHE_NEAR));
+ } else {
+ // The safe reverse rules produced a position that is neither near the existing
+ // cache, nor near the start of text.
+ // Advance to the boundary following.
+ // There is a complication: the safe reverse rules identify pairs of code points
+ // that are safe. If advancing from the safe point moves forwards by less than
+ // two code points, we need to advance one more time to ensure that the boundary
+ // is good, including a correct rules status value.
+ //
+ retainCache = false;
+ fPosition = backupPos;
+ aBoundary = handleNext();
+ if (aBoundary == backupPos + 1 ||
+ (aBoundary == backupPos + 2 &&
+ Character.isHighSurrogate(fText.setIndex(backupPos)) &&
+ Character.isLowSurrogate(fText.next()))) {
+ // The initial handleNext() only advanced by a single code point. Go again.
+ // Safe rules identify safe pairs.
aBoundary = handleNext();
- if (aBoundary == backupPos + 1 ||
- (aBoundary == backupPos + 2 &&
- Character.isHighSurrogate(fText.setIndex(backupPos)) &&
- Character.isLowSurrogate(fText.next()))) {
- // The initial handleNext() only advanced by a single code point. Go again.
- // Safe rules identify safe pairs.
- aBoundary = handleNext();
- }
+ }
+ if (aBoundary == BreakIterator.DONE) {
+ aBoundary = fText.getEndIndex();
}
ruleStatusIndex = fRuleStatusIndex;
}
+ }
+
+ if (!retainCache) {
+ assert(aBoundary != -1);
reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point.
}
+
// Fill in boundaries between existing cache content and the new requested position.
if (fBoundaries[fEndBufIdx] < position) {
bi = new RuleBasedBreakIterator(rules);
assertEquals("Rules does not match", rules, bi.toString());
}
+
+ /* Test preceding(index) and following(index), with semi-random indexes.
+ * The random indexes are produced in clusters that are relatively closely spaced,
+ * to increase the occurrences of hits to the internal break cache.
+ */
+ @Test
+ public void TestRandomAccess() {
+ final int CACHE_SIZE = 128;
+ final StringBuilder testData = new StringBuilder();
+ for (int i=0; i<CACHE_SIZE*2; ++i) {
+ testData.append("aaaa\n");
+ }
+ RuleBasedBreakIterator bi =
+ (RuleBasedBreakIterator)BreakIterator.getLineInstance(ULocale.ENGLISH);
+ bi.setText(testData);
+
+ class Fns { // This class exists only to allow declaring nested functions
+ // within TestRandomAccess().
+ public int expectedPreceding(int from) {
+ if (from == 0) {return BreakIterator.DONE;}
+ if (from % 5 == 0) {return from - 5;}
+ return from - (from % 5);
+ };
+
+ public int expectedFollow(int from) {
+ if (from >= testData.length()) {return BreakIterator.DONE;}
+ if (from % 5 == 0) {return from + 5;}
+ return from + (5 - (from % 5));
+ };
+
+ ICU_Rand randomGenerator = new ICU_Rand(0);
+ int lastNum;
+ int clusterCount;
+ static final int CLUSTER_SIZE = 100;
+ static final int CLUSTER_LENGTH = 10;
+ public int randomStringIndex() {
+ if (clusterCount < CLUSTER_LENGTH) {
+ ++clusterCount;
+ lastNum += (randomGenerator.next() % CLUSTER_SIZE);
+ lastNum -= CLUSTER_SIZE / 2;
+ lastNum = Math.max(0, lastNum);
+ lastNum = Math.min(testData.length() + 5, lastNum);
+ } else {
+ clusterCount = 0;
+ lastNum = randomGenerator.next() % testData.length();
+ }
+ return lastNum;
+ };
+ };
+ Fns fns = new Fns();
+
+ for (int i=0; i<5000; ++i) {
+ int idx = fns.randomStringIndex();
+ assertEquals("following" + idx, fns.expectedFollow(idx), bi.following(idx));
+ idx = fns.randomStringIndex();
+ assertEquals("preceding" + idx, fns.expectedPreceding(idx), bi.preceding(idx));
+ }
+ }
}