From: Andy Heninger Date: Tue, 10 May 2022 00:29:04 +0000 (-0700) Subject: ICU-21946 RBBI Break Cache Optimizations X-Git-Tag: cldr/2022-09-07~17 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=b95c6b1f3eb12eb84c2dabe438fe36be55a0772c;p=icu ICU-21946 RBBI Break Cache Optimizations Adjust RuleBasedBreakIterator::BreakCache::populateNear() to retain the cache the cache contents in additional cases where are still useful, resulting in improved performance. This change is related to PR #2039, which addressed the same problem. This one retains the cache contents in more situations. --- diff --git a/icu4c/source/common/rbbi_cache.cpp b/icu4c/source/common/rbbi_cache.cpp index 26d82df7811..4f83c5c604f 100644 --- a/icu4c/source/common/rbbi_cache.cpp +++ b/icu4c/source/common/rbbi_cache.cpp @@ -341,41 +341,87 @@ UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorC } U_ASSERT(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]); - // Find a boundary somewhere in the vicinity of the requested position. - // Depending on the safe rules and the text data, it could be either before, at, or after - // the requested position. - + // Add boundaries to the cache near the specified position. + // The given position need not be a boundary itself. + // The input position must be within the range of the text, and + // on a code point boundary. + // If the requested position is a break boundary, leave the iteration + // position on it. + // If the requested position is not a boundary, leave the iteration + // position on the preceding boundary and include both the + // preceding and following boundaries in the cache. + // Additional boundaries, either preceding or following, may be added + // to the cache as a side effect. // If the requested position is not near already cached positions, clear the existing cache, // find a near-by boundary and begin new cache contents there. - if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) { - int32_t aBoundary = 0; - int32_t ruleStatusIndex = 0; - if (position > 20) { - int32_t backupPos = fBI->handleSafePrevious(position); - - if (backupPos > 0) { - // Advance to the boundary following the backup position. - // There is a complication: the safe reverse rules identify pairs of code points - // that are safe. If advancing from the safe point moves forwards by less than - // two code points, we need to advance one more time to ensure that the boundary - // is good, including a correct rules status value. - // - fBI->fPosition = backupPos; - aBoundary = fBI->handleNext(); - if (aBoundary <= backupPos + 4) { - // +4 is a quick test for possibly having advanced only one codepoint. - // Four being the length of the longest potential code point, a supplementary in UTF-8 - utext_setNativeIndex(&fBI->fText, aBoundary); - if (backupPos == utext_getPreviousNativeIndex(&fBI->fText)) { - // The initial handleNext() only advanced by a single code point. Go again. - aBoundary = fBI->handleNext(); // Safe rules identify safe pairs. - } + // Threshold for a text position to be considered near to existing cache contents. + // TODO: See issue ICU-22024 "perf tuning of Cache needed." + // This value is subject to change. See the ticket for more details. + static constexpr int32_t CACHE_NEAR = 15; + + int32_t aBoundary = -1; + int32_t ruleStatusIndex = 0; + bool retainCache = false; + if ((position > fBoundaries[fStartBufIdx] - CACHE_NEAR) && position < (fBoundaries[fEndBufIdx] + CACHE_NEAR)) { + // Requested position is near the existing cache. Retain it. + retainCache = true; + } else if (position <= CACHE_NEAR) { + // Requested position is near the start of the text. Fill cache from start, skipping + // the need to find a safe point. + retainCache = false; + aBoundary = 0; + } else { + // Requested position is not near the existing cache. + // Find a safe point to refill the cache from. + int32_t backupPos = fBI->handleSafePrevious(position); + + if (fBoundaries[fEndBufIdx] < position && fBoundaries[fEndBufIdx] >= (backupPos - CACHE_NEAR)) { + // The requested position is beyond the end of the existing cache, but the + // reverse rules produced a position near or before the cached region. + // Retain the existing cache, and fill from the end of it. + retainCache = true; + } else if (backupPos < CACHE_NEAR) { + // The safe reverse rules moved us to near the start of text. + // Take that (index 0) as the backup boundary, avoiding the complication + // (in the following block) of moving forward from the safe point to a known boundary. + // + // Retain the cache if it begins not too far from the requested position. + aBoundary = 0; + retainCache = (fBoundaries[fStartBufIdx] <= (position + CACHE_NEAR)); + } else { + // The safe reverse rules produced a position that is neither near the existing + // cache, nor near the start of text. + // Advance to the boundary following. + // There is a complication: the safe reverse rules identify pairs of code points + // that are safe. If advancing from the safe point moves forwards by less than + // two code points, we need to advance one more time to ensure that the boundary + // is good, including a correct rules status value. + retainCache = false; + fBI->fPosition = backupPos; + aBoundary = fBI->handleNext(); + if (aBoundary != UBRK_DONE && aBoundary <= backupPos + 4) { + // +4 is a quick test for possibly having advanced only one codepoint. + // Four being the length of the longest potential code point, a supplementary in UTF-8 + utext_setNativeIndex(&fBI->fText, aBoundary); + if (backupPos == utext_getPreviousNativeIndex(&fBI->fText)) { + // The initial handleNext() only advanced by a single code point. Go again. + aBoundary = fBI->handleNext(); // Safe rules identify safe pairs. } - ruleStatusIndex = fBI->fRuleStatusIndex; } + if (aBoundary == UBRK_DONE) { + // Note (Andy Heninger): I don't think this condition can occur, but it's hard + // to prove that it can't. We ran off the end of the string looking a boundary + // following a safe point; choose the end of the string as that boundary. + aBoundary = utext_nativeLength(&fBI->fText); + } + ruleStatusIndex = fBI->fRuleStatusIndex; } + } + + if (!retainCache) { + U_ASSERT(aBoundary != -1); reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point. } diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 69db5db018c..09d71a7950a 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -139,6 +139,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha TESTCASE_AUTO(TestUnpairedSurrogate); TESTCASE_AUTO(TestLSTMThai); TESTCASE_AUTO(TestLSTMBurmese); + TESTCASE_AUTO(TestRandomAccess); #if U_ENABLE_TRACING TESTCASE_AUTO(TestTraceCreateCharacter); @@ -5453,4 +5454,66 @@ void RBBITest::TestLSTMBurmese() { runLSTMTestFromFile("Burmese_graphclust_model5_heavy_Test.txt", USCRIPT_MYANMAR); } + +// Test preceding(index) and following(index), with semi-random indexes. +// The random indexes are produced in clusters that are relatively closely spaced, +// to increase the occurrences of hits to the internal break cache. + +void RBBITest::TestRandomAccess() { + static constexpr int32_t CACHE_SIZE = 128; + + UnicodeString testData; + for (int i=0; i bi( + (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status), + status); + if (!assertSuccess(WHERE, status)) { return; }; + + bi->setText(testData); + + auto expectedPreceding = [](int from) { + if (from == 0) {return UBRK_DONE;} + if (from % 5 == 0) {return from - 5;} + return from - (from % 5); + }; + + auto expectedFollow = [testData](int from) { + if (from >= testData.length()) {return UBRK_DONE;} + if (from % 5 == 0) {return from + 5;} + return from + (5 - (from % 5)); + }; + + auto randomStringIndex = [testData]() { + static icu_rand randomGenerator; // produces random uint32_t values. + static int lastNum; + static int clusterCount; + static constexpr int CLUSTER_SIZE = 100; + static constexpr int CLUSTER_LENGTH = 10; + + if (clusterCount < CLUSTER_LENGTH) { + ++clusterCount; + lastNum += (randomGenerator() % CLUSTER_SIZE); + lastNum -= CLUSTER_SIZE / 2; + lastNum = std::max(0, lastNum); + // Deliberately test indexes > testData.length. + lastNum = std::min(testData.length() + 5, lastNum); + } else { + clusterCount = 0; + lastNum = randomGenerator() % testData.length(); + } + return lastNum; + }; + + for (int i=0; i<5000; ++i) { + int idx = randomStringIndex(); + assertEquals(WHERE, expectedFollow(idx), bi->following(idx)); + idx = randomStringIndex(); + assertEquals(WHERE, expectedPreceding(idx), bi->preceding(idx)); + } +} + #endif // #if !UCONFIG_NO_BREAK_ITERATION diff --git a/icu4c/source/test/intltest/rbbitst.h b/icu4c/source/test/intltest/rbbitst.h index d5d52119bc0..9d49b213efc 100644 --- a/icu4c/source/test/intltest/rbbitst.h +++ b/icu4c/source/test/intltest/rbbitst.h @@ -95,6 +95,7 @@ public: void TestBug13590(); void TestLSTMThai(); void TestLSTMBurmese(); + void TestRandomAccess(); #if U_ENABLE_TRACING void TestTraceCreateCharacter(); diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java index 507677579fc..d0cf532ec76 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java @@ -1436,42 +1436,89 @@ class BreakCache { boolean populateNear(int position) { assert(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]); - // Find a boundary somewhere in the vicinity of the requested position. - // Depending on the safe rules and the text data, it could be either before, at, or after - // the requested position. - + // Add boundaries to the cache near the specified position. + // The given position need not be a boundary itself. + // The input position must be within the range of the text, and + // on a code point boundary. + // If the requested position is a break boundary, leave the iteration + // position on it. + // If the requested position is not a boundary, leave the iteration + // position on the preceding boundary and include both the + // preceding and following boundaries in the cache. + // Additional boundaries, either preceding or following, may be added + // to the cache as a side effect. // If the requested position is not near already cached positions, clear the existing cache, // find a near-by boundary and begin new cache contents there. - if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) { - int aBoundary = fText.getBeginIndex(); - int ruleStatusIndex = 0; - if (position > aBoundary + 20) { - int backupPos = handleSafePrevious(position); - if (backupPos > aBoundary) { - // Advance to the boundary following the backup position. - // There is a complication: the safe reverse rules identify pairs of code points - // that are safe. If advancing from the safe point moves forwards by less than - // two code points, we need to advance one more time to ensure that the boundary - // is good, including a correct rules status value. - // - fPosition = backupPos; + // Threshold for a text position to be considered near to existing cache contents. + // TODO: See issue ICU-22024 "perf tuning of Cache needed." + // This value is subject to change. See the ticket for more details. + final int CACHE_NEAR = 15; + + int startOfText = fText.getBeginIndex(); + int aBoundary = -1; + int ruleStatusIndex = 0; + boolean retainCache = false; + if ((position > fBoundaries[fStartBufIdx] - CACHE_NEAR) && position < (fBoundaries[fEndBufIdx] + CACHE_NEAR)) { + // Requested position is near the existing cache. Retain it. + retainCache = true; + } else if (position <= startOfText + CACHE_NEAR) { + // Requested position is near the start of the text. Fill cache from start, skipping + // the need to find a safe point. + retainCache = false; + aBoundary = startOfText; + } else { + // Requested position is not near the existing cache. + // Find a safe point to refill the cache from. + int backupPos = handleSafePrevious(position); + + if (fBoundaries[fEndBufIdx] < position && fBoundaries[fEndBufIdx] >= (backupPos - CACHE_NEAR)) { + // The requested position is beyond the end of the existing cache, but the + // reverse rules produced a position near or before the cached region. + // Retain the existing cache, and fill from the end of it. + retainCache = true; + } else if (backupPos < startOfText + CACHE_NEAR) { + // The safe reverse rules moved us to near the start of text. + // Take that (index 0) as the backup boundary, avoiding the complication + // (in the following block) of moving forward from the safe point to a known boundary. + // + // Retain the cache if it begins not too far from the requested position. + aBoundary = startOfText; + retainCache = (fBoundaries[fStartBufIdx] <= (position + CACHE_NEAR)); + } else { + // The safe reverse rules produced a position that is neither near the existing + // cache, nor near the start of text. + // Advance to the boundary following. + // There is a complication: the safe reverse rules identify pairs of code points + // that are safe. If advancing from the safe point moves forwards by less than + // two code points, we need to advance one more time to ensure that the boundary + // is good, including a correct rules status value. + // + retainCache = false; + fPosition = backupPos; + aBoundary = handleNext(); + if (aBoundary == backupPos + 1 || + (aBoundary == backupPos + 2 && + Character.isHighSurrogate(fText.setIndex(backupPos)) && + Character.isLowSurrogate(fText.next()))) { + // The initial handleNext() only advanced by a single code point. Go again. + // Safe rules identify safe pairs. aBoundary = handleNext(); - if (aBoundary == backupPos + 1 || - (aBoundary == backupPos + 2 && - Character.isHighSurrogate(fText.setIndex(backupPos)) && - Character.isLowSurrogate(fText.next()))) { - // The initial handleNext() only advanced by a single code point. Go again. - // Safe rules identify safe pairs. - aBoundary = handleNext(); - } + } + if (aBoundary == BreakIterator.DONE) { + aBoundary = fText.getEndIndex(); } ruleStatusIndex = fRuleStatusIndex; } + } + + if (!retainCache) { + assert(aBoundary != -1); reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point. } + // Fill in boundaries between existing cache content and the new requested position. if (fBoundaries[fEndBufIdx] < position) { diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java index 45ade284baa..64aba243a93 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java @@ -946,4 +946,62 @@ public class RBBITest extends TestFmwk { bi = new RuleBasedBreakIterator(rules); assertEquals("Rules does not match", rules, bi.toString()); } + + /* Test preceding(index) and following(index), with semi-random indexes. + * The random indexes are produced in clusters that are relatively closely spaced, + * to increase the occurrences of hits to the internal break cache. + */ + @Test + public void TestRandomAccess() { + final int CACHE_SIZE = 128; + final StringBuilder testData = new StringBuilder(); + for (int i=0; i= testData.length()) {return BreakIterator.DONE;} + if (from % 5 == 0) {return from + 5;} + return from + (5 - (from % 5)); + }; + + ICU_Rand randomGenerator = new ICU_Rand(0); + int lastNum; + int clusterCount; + static final int CLUSTER_SIZE = 100; + static final int CLUSTER_LENGTH = 10; + public int randomStringIndex() { + if (clusterCount < CLUSTER_LENGTH) { + ++clusterCount; + lastNum += (randomGenerator.next() % CLUSTER_SIZE); + lastNum -= CLUSTER_SIZE / 2; + lastNum = Math.max(0, lastNum); + lastNum = Math.min(testData.length() + 5, lastNum); + } else { + clusterCount = 0; + lastNum = randomGenerator.next() % testData.length(); + } + return lastNum; + }; + }; + Fns fns = new Fns(); + + for (int i=0; i<5000; ++i) { + int idx = fns.randomStringIndex(); + assertEquals("following" + idx, fns.expectedFollow(idx), bi.following(idx)); + idx = fns.randomStringIndex(); + assertEquals("preceding" + idx, fns.expectedPreceding(idx), bi.preceding(idx)); + } + } }