ICU-21946 RBBI Break Cache Optimizations

author Andy Heninger <andy.heninger@gmail.com>

Tue, 10 May 2022 00:29:04 +0000 (17:29 -0700)

committer Markus Scherer <markus.icu@gmail.com>

Sat, 20 Aug 2022 23:16:30 +0000 (16:16 -0700)
author Andy Heninger <andy.heninger@gmail.com>
Tue, 10 May 2022 00:29:04 +0000 (17:29 -0700)
committer Markus Scherer <markus.icu@gmail.com>
Sat, 20 Aug 2022 23:16:30 +0000 (16:16 -0700)
diff --git a/icu4c/source/common/rbbi_cache.cpp b/icu4c/source/common/rbbi_cache.cpp

index 26d82df7811838e272aebcdcbbb02a4a69997948..4f83c5c604fea26dcc9df9cd287f791e66b7f25a 100644 (file)
--- a/icu4c/source/common/rbbi_cache.cpp
+++ b/icu4c/source/common/rbbi_cache.cpp
@@ -341,41 +341,87 @@ UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorC
      }
      U_ASSERT(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);
  
-    // Find a boundary somewhere in the vicinity of the requested position.
-    // Depending on the safe rules and the text data, it could be either before, at, or after
-    // the requested position.
-
+    // Add boundaries to the cache near the specified position.
+    // The given position need not be a boundary itself.
+    // The input position must be within the range of the text, and
+    // on a code point boundary.
+    // If the requested position is a break boundary, leave the iteration
+    // position on it.
+    // If the requested position is not a boundary, leave the iteration
+    // position on the preceding boundary and include both the
+    // preceding and following boundaries in the cache.
+    // Additional boundaries, either preceding or following, may be added
+    // to the cache as a side effect.
  
      // If the requested position is not near already cached positions, clear the existing cache,
      // find a near-by boundary and begin new cache contents there.
  
-    if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) {
-        int32_t aBoundary = 0;
-        int32_t ruleStatusIndex = 0;
-        if (position > 20) {
-            int32_t backupPos = fBI->handleSafePrevious(position);
-
-            if (backupPos > 0) {
-                // Advance to the boundary following the backup position.
-                // There is a complication: the safe reverse rules identify pairs of code points
-                // that are safe. If advancing from the safe point moves forwards by less than
-                // two code points, we need to advance one more time to ensure that the boundary
-                // is good, including a correct rules status value.
-                //
-                fBI->fPosition = backupPos;
-                aBoundary = fBI->handleNext();
-                if (aBoundary <= backupPos + 4) {
-                    // +4 is a quick test for possibly having advanced only one codepoint.
-                    // Four being the length of the longest potential code point, a supplementary in UTF-8
-                    utext_setNativeIndex(&fBI->fText, aBoundary);
-                    if (backupPos == utext_getPreviousNativeIndex(&fBI->fText)) {
-                        // The initial handleNext() only advanced by a single code point. Go again.
-                        aBoundary = fBI->handleNext();   // Safe rules identify safe pairs.
-                    }
+    // Threshold for a text position to be considered near to existing cache contents.
+    // TODO: See issue ICU-22024 "perf tuning of Cache needed."
+    //       This value is subject to change. See the ticket for more details.
+    static constexpr int32_t CACHE_NEAR = 15;
+
+    int32_t aBoundary = -1;
+    int32_t ruleStatusIndex = 0;
+    bool retainCache = false;
+    if ((position > fBoundaries[fStartBufIdx] - CACHE_NEAR) && position < (fBoundaries[fEndBufIdx] + CACHE_NEAR)) {
+        // Requested position is near the existing cache. Retain it.
+        retainCache = true;
+    } else if (position <= CACHE_NEAR) {
+        // Requested position is near the start of the text. Fill cache from start, skipping
+        // the need to find a safe point.
+        retainCache = false;
+        aBoundary = 0;
+    } else {
+        // Requested position is not near the existing cache.
+        // Find a safe point to refill the cache from.
+        int32_t backupPos = fBI->handleSafePrevious(position);
+
+        if (fBoundaries[fEndBufIdx] < position && fBoundaries[fEndBufIdx] >= (backupPos - CACHE_NEAR)) {
+            // The requested position is beyond the end of the existing cache, but the
+            // reverse rules produced a position near or before the cached region.
+            // Retain the existing cache, and fill from the end of it.
+            retainCache = true;
+        } else if (backupPos < CACHE_NEAR) {
+            // The safe reverse rules moved us to near the start of text.
+            // Take that (index 0) as the backup boundary, avoiding the complication
+            // (in the following block) of moving forward from the safe point to a known boundary.
+            //
+            // Retain the cache if it begins not too far from the requested position.
+            aBoundary = 0;
+            retainCache = (fBoundaries[fStartBufIdx] <= (position + CACHE_NEAR));
+        } else {
+            // The safe reverse rules produced a position that is neither near the existing
+            // cache, nor near the start of text.
+            // Advance to the boundary following.
+            // There is a complication: the safe reverse rules identify pairs of code points
+            // that are safe. If advancing from the safe point moves forwards by less than
+            // two code points, we need to advance one more time to ensure that the boundary
+            // is good, including a correct rules status value.
+            retainCache = false;
+            fBI->fPosition = backupPos;
+            aBoundary = fBI->handleNext();
+            if (aBoundary != UBRK_DONE && aBoundary <= backupPos + 4) {
+                // +4 is a quick test for possibly having advanced only one codepoint.
+                // Four being the length of the longest potential code point, a supplementary in UTF-8
+                utext_setNativeIndex(&fBI->fText, aBoundary);
+                if (backupPos == utext_getPreviousNativeIndex(&fBI->fText)) {
+                    // The initial handleNext() only advanced by a single code point. Go again.
+                    aBoundary = fBI->handleNext();   // Safe rules identify safe pairs.
                  }
-                ruleStatusIndex = fBI->fRuleStatusIndex;
              }
+            if (aBoundary == UBRK_DONE) {
+                // Note (Andy Heninger): I don't think this condition can occur, but it's hard
+                // to prove that it can't. We ran off the end of the string looking a boundary
+                // following a safe point; choose the end of the string as that boundary.
+                aBoundary = utext_nativeLength(&fBI->fText);
+            }
+            ruleStatusIndex = fBI->fRuleStatusIndex;
          }
+    }
+
+    if (!retainCache) {
+        U_ASSERT(aBoundary != -1);
          reset(aBoundary, ruleStatusIndex);        // Reset cache to hold aBoundary as a single starting point.
      }
  
diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp

index 69db5db018cb0baed3bd3e313da8e7f9d4f2604e..09d71a7950a302e1b21f5b40b6caf559c1fd1ba5 100644 (file)
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@@ -139,6 +139,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
      TESTCASE_AUTO(TestUnpairedSurrogate);
      TESTCASE_AUTO(TestLSTMThai);
      TESTCASE_AUTO(TestLSTMBurmese);
+    TESTCASE_AUTO(TestRandomAccess);
  
  #if U_ENABLE_TRACING
      TESTCASE_AUTO(TestTraceCreateCharacter);
@@ -5453,4 +5454,66 @@ void RBBITest::TestLSTMBurmese() {
      runLSTMTestFromFile("Burmese_graphclust_model5_heavy_Test.txt", USCRIPT_MYANMAR);
  }
  
+
+// Test preceding(index) and following(index), with semi-random indexes.
+// The random indexes are produced in clusters that are relatively closely spaced,
+// to increase the occurrences of hits to the internal break cache.
+
+void RBBITest::TestRandomAccess() {
+    static constexpr int32_t CACHE_SIZE = 128;
+
+    UnicodeString testData;
+    for (int i=0; i<CACHE_SIZE*2; ++i) {
+        testData.append(u"aaaa\n");
+    }
+
+    UErrorCode status = U_ZERO_ERROR;
+    LocalPointer<RuleBasedBreakIterator> bi(
+            (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status),
+            status);
+    if (!assertSuccess(WHERE, status)) { return; };
+
+    bi->setText(testData);
+
+    auto expectedPreceding = [](int from) {
+        if (from == 0) {return UBRK_DONE;}
+        if (from % 5 == 0) {return from - 5;}
+        return from - (from % 5);
+    };
+
+    auto expectedFollow = [testData](int from) {
+        if (from >= testData.length()) {return UBRK_DONE;}
+        if (from % 5 == 0) {return from + 5;}
+        return from + (5 - (from % 5));
+    };
+
+    auto randomStringIndex = [testData]() {
+        static icu_rand randomGenerator;  // produces random uint32_t values.
+        static int lastNum;
+        static int clusterCount;
+        static constexpr int CLUSTER_SIZE = 100;
+        static constexpr int CLUSTER_LENGTH = 10;
+
+        if (clusterCount < CLUSTER_LENGTH) {
+            ++clusterCount;
+            lastNum += (randomGenerator() % CLUSTER_SIZE);
+            lastNum -= CLUSTER_SIZE / 2;
+            lastNum = std::max(0, lastNum);
+            // Deliberately test indexes > testData.length.
+            lastNum = std::min(testData.length() + 5, lastNum);
+        } else {
+            clusterCount = 0;
+            lastNum = randomGenerator() % testData.length();
+        }
+        return lastNum;
+    };
+
+    for (int i=0; i<5000; ++i) {
+        int idx = randomStringIndex();
+        assertEquals(WHERE, expectedFollow(idx), bi->following(idx));
+        idx = randomStringIndex();
+        assertEquals(WHERE, expectedPreceding(idx), bi->preceding(idx));
+    }
+}
+
  #endif // #if !UCONFIG_NO_BREAK_ITERATION
diff --git a/icu4c/source/test/intltest/rbbitst.h b/icu4c/source/test/intltest/rbbitst.h

index d5d52119bc0031b5315bbe4de95f3141a3f13666..9d49b213efcba70c0b0b48ba2e3100b47bc43b2b 100644 (file)
--- a/icu4c/source/test/intltest/rbbitst.h
+++ b/icu4c/source/test/intltest/rbbitst.h
@@ -95,6 +95,7 @@ public:
      void TestBug13590();
      void TestLSTMThai();
      void TestLSTMBurmese();
+    void TestRandomAccess();
  
  #if U_ENABLE_TRACING
      void TestTraceCreateCharacter();
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java

index 507677579fcf14108a8fbf237e39fc301ef667d7..d0cf532ec761056527f7319a6e85f0c1a425dc9a 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java
@@ -1436,42 +1436,89 @@ class BreakCache {
      boolean populateNear(int position) {
          assert(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);
  
-        // Find a boundary somewhere in the vicinity of the requested position.
-        // Depending on the safe rules and the text data, it could be either before, at, or after
-        // the requested position.
-
+        // Add boundaries to the cache near the specified position.
+        // The given position need not be a boundary itself.
+        // The input position must be within the range of the text, and
+        // on a code point boundary.
+        // If the requested position is a break boundary, leave the iteration
+        // position on it.
+        // If the requested position is not a boundary, leave the iteration
+        // position on the preceding boundary and include both the
+        // preceding and following boundaries in the cache.
+        // Additional boundaries, either preceding or following, may be added
+        // to the cache as a side effect.
  
          // If the requested position is not near already cached positions, clear the existing cache,
          // find a near-by boundary and begin new cache contents there.
  
-        if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) {
-            int aBoundary = fText.getBeginIndex();
-            int ruleStatusIndex = 0;
-            if (position > aBoundary + 20) {
-                int backupPos = handleSafePrevious(position);
-                if (backupPos > aBoundary) {
-                    // Advance to the boundary following the backup position.
-                    // There is a complication: the safe reverse rules identify pairs of code points
-                    // that are safe. If advancing from the safe point moves forwards by less than
-                    // two code points, we need to advance one more time to ensure that the boundary
-                    // is good, including a correct rules status value.
-                    //
-                    fPosition = backupPos;
+        // Threshold for a text position to be considered near to existing cache contents.
+        // TODO: See issue ICU-22024 "perf tuning of Cache needed."
+        //       This value is subject to change. See the ticket for more details.
+        final int CACHE_NEAR = 15;
+
+        int startOfText = fText.getBeginIndex();
+        int aBoundary = -1;
+        int ruleStatusIndex = 0;
+        boolean retainCache = false;
+        if ((position > fBoundaries[fStartBufIdx] - CACHE_NEAR) && position < (fBoundaries[fEndBufIdx] + CACHE_NEAR)) {
+            // Requested position is near the existing cache. Retain it.
+            retainCache = true;
+        } else if (position <= startOfText + CACHE_NEAR) {
+            // Requested position is near the start of the text. Fill cache from start, skipping
+            // the need to find a safe point.
+            retainCache = false;
+            aBoundary = startOfText;
+        } else {
+            // Requested position is not near the existing cache.
+            // Find a safe point to refill the cache from.
+            int backupPos = handleSafePrevious(position);
+
+            if (fBoundaries[fEndBufIdx] < position && fBoundaries[fEndBufIdx] >= (backupPos - CACHE_NEAR)) {
+                // The requested position is beyond the end of the existing cache, but the
+                // reverse rules produced a position near or before the cached region.
+                // Retain the existing cache, and fill from the end of it.
+                retainCache = true;
+            } else if (backupPos < startOfText + CACHE_NEAR) {
+                // The safe reverse rules moved us to near the start of text.
+                // Take that (index 0) as the backup boundary, avoiding the complication
+                // (in the following block) of moving forward from the safe point to a known boundary.
+                //
+                // Retain the cache if it begins not too far from the requested position.
+                aBoundary = startOfText;
+                retainCache = (fBoundaries[fStartBufIdx] <= (position + CACHE_NEAR));
+            } else {
+                // The safe reverse rules produced a position that is neither near the existing
+                // cache, nor near the start of text.
+                // Advance to the boundary following.
+                // There is a complication: the safe reverse rules identify pairs of code points
+                // that are safe. If advancing from the safe point moves forwards by less than
+                // two code points, we need to advance one more time to ensure that the boundary
+                // is good, including a correct rules status value.
+                //
+                retainCache = false;
+                fPosition = backupPos;
+                aBoundary = handleNext();
+                if (aBoundary == backupPos + 1 ||
+                        (aBoundary == backupPos + 2 &&
+                        Character.isHighSurrogate(fText.setIndex(backupPos)) &&
+                        Character.isLowSurrogate(fText.next()))) {
+                    // The initial handleNext() only advanced by a single code point. Go again.
+                    // Safe rules identify safe pairs.
                      aBoundary = handleNext();
-                    if (aBoundary == backupPos + 1 ||
-                            (aBoundary == backupPos + 2 &&
-                            Character.isHighSurrogate(fText.setIndex(backupPos)) &&
-                            Character.isLowSurrogate(fText.next()))) {
-                        // The initial handleNext() only advanced by a single code point. Go again.
-                        // Safe rules identify safe pairs.
-                        aBoundary = handleNext();
-                    }
+                }
+                if (aBoundary == BreakIterator.DONE) {
+                    aBoundary = fText.getEndIndex();
                  }
                  ruleStatusIndex = fRuleStatusIndex;
              }
+        }
+
+        if (!retainCache) {
+            assert(aBoundary != -1);
              reset(aBoundary, ruleStatusIndex);               // Reset cache to hold aBoundary as a single starting point.
          }
  
+
          // Fill in boundaries between existing cache content and the new requested position.
  
          if (fBoundaries[fEndBufIdx] < position) {
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java

index 45ade284baa94dfe43fe69292d0616c9720a8a5f..64aba243a938e8e08ea9c08c45c6353b7cb155ae 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java
@@ -946,4 +946,62 @@ public class RBBITest extends TestFmwk {
          bi = new RuleBasedBreakIterator(rules);
          assertEquals("Rules does not match", rules, bi.toString());
      }
+
+    /* Test preceding(index) and following(index), with semi-random indexes.
+     * The random indexes are produced in clusters that are relatively closely spaced,
+     * to increase the occurrences of hits to the internal break cache.
+     */
+    @Test
+    public void TestRandomAccess() {
+        final int CACHE_SIZE = 128;
+        final StringBuilder testData = new StringBuilder();
+        for (int i=0; i<CACHE_SIZE*2; ++i) {
+            testData.append("aaaa\n");
+        }
+        RuleBasedBreakIterator bi =
+                (RuleBasedBreakIterator)BreakIterator.getLineInstance(ULocale.ENGLISH);
+        bi.setText(testData);
+
+        class Fns {     // This class exists only to allow declaring nested functions
+                        // within TestRandomAccess().
+            public int expectedPreceding(int from) {
+                if (from == 0) {return BreakIterator.DONE;}
+                if (from % 5 == 0) {return from - 5;}
+                return from - (from % 5);
+            };
+
+            public int expectedFollow(int from) {
+                if (from >= testData.length()) {return BreakIterator.DONE;}
+                if (from % 5 == 0) {return from + 5;}
+                return from + (5 - (from % 5));
+            };
+
+            ICU_Rand randomGenerator = new ICU_Rand(0);
+            int lastNum;
+            int clusterCount;
+            static final int CLUSTER_SIZE = 100;
+            static final int CLUSTER_LENGTH = 10;
+            public int randomStringIndex() {
+                if (clusterCount < CLUSTER_LENGTH) {
+                    ++clusterCount;
+                    lastNum += (randomGenerator.next() % CLUSTER_SIZE);
+                    lastNum -= CLUSTER_SIZE / 2;
+                    lastNum = Math.max(0, lastNum);
+                    lastNum = Math.min(testData.length() + 5, lastNum);
+                } else {
+                    clusterCount = 0;
+                    lastNum = randomGenerator.next() % testData.length();
+                }
+                return lastNum;
+            };
+        };
+        Fns fns = new Fns();
+
+        for (int i=0; i<5000; ++i) {
+            int idx = fns.randomStringIndex();
+            assertEquals("following" + idx, fns.expectedFollow(idx), bi.following(idx));
+            idx = fns.randomStringIndex();
+            assertEquals("preceding" + idx, fns.expectedPreceding(idx), bi.preceding(idx));
+        }
+    }
  }
author	Andy Heninger <andy.heninger@gmail.com>
	Tue, 10 May 2022 00:29:04 +0000 (17:29 -0700)
committer	Markus Scherer <markus.icu@gmail.com>
	Sat, 20 Aug 2022 23:16:30 +0000 (16:16 -0700)
icu4c/source/common/rbbi_cache.cpp		patch \| blob \| history
icu4c/source/test/intltest/rbbitst.cpp		patch \| blob \| history
icu4c/source/test/intltest/rbbitst.h		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java		patch \| blob \| history