ICU-13541 Improve RuleBasedBreakIterator construction time, patch from grhoten.

author Andy Heninger <andy.heninger@gmail.com>

Fri, 19 Jan 2018 22:30:56 +0000 (22:30 +0000)

committer Andy Heninger <andy.heninger@gmail.com>

Fri, 19 Jan 2018 22:30:56 +0000 (22:30 +0000)
author Andy Heninger <andy.heninger@gmail.com>
Fri, 19 Jan 2018 22:30:56 +0000 (22:30 +0000)
committer Andy Heninger <andy.heninger@gmail.com>
Fri, 19 Jan 2018 22:30:56 +0000 (22:30 +0000)
diff --git a/icu4c/source/common/rbbi.cpp b/icu4c/source/common/rbbi.cpp

index a02c48d427e68df2244574e7822541c8d47f6f44..61d187d36a3090d71bf8cd7aa75fcb14eb6fb89a 100644 (file)
--- a/icu4c/source/common/rbbi.cpp
+++ b/icu4c/source/common/rbbi.cpp
@@ -64,7 +64,9 @@ UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
   * Constructs a RuleBasedBreakIterator that uses the already-created
   * tables object that is passed in as a parameter.
   */
-RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status) {
+RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
+ : fSCharIter(UnicodeString())
+{
      init(status);
      fData = new RBBIDataWrapper(data, status); // status checked in constructor
      if (U_FAILURE(status)) {return;}
@@ -80,7 +82,9 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode
  //
  RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
                         uint32_t       ruleLength,
-                       UErrorCode     &status) {
+                       UErrorCode     &status)
+ : fSCharIter(UnicodeString())
+{
      init(status);
      if (U_FAILURE(status)) {
          return;
@@ -110,6 +114,7 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
  //
  //-------------------------------------------------------------------------------
  RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
+ : fSCharIter(UnicodeString())
  {
      init(status);
      fData = new RBBIDataWrapper(udm, status); // status checked in constructor
@@ -130,6 +135,7 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &sta
  RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  &rules,
                                                  UParseError          &parseError,
                                                  UErrorCode           &status)
+ : fSCharIter(UnicodeString())
  {
      init(status);
      if (U_FAILURE(status)) {return;}
@@ -152,7 +158,9 @@ RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  &rules,
  //                           Used when creating a RuleBasedBreakIterator from a set
  //                           of rules.
  //-------------------------------------------------------------------------------
-RuleBasedBreakIterator::RuleBasedBreakIterator() {
+RuleBasedBreakIterator::RuleBasedBreakIterator()
+ : fSCharIter(UnicodeString())
+{
      UErrorCode status = U_ZERO_ERROR;
      init(status);
  }
@@ -165,7 +173,8 @@ RuleBasedBreakIterator::RuleBasedBreakIterator() {
  //
  //-------------------------------------------------------------------------------
  RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
-: BreakIterator(other)
+: BreakIterator(other),
+  fSCharIter(UnicodeString())
  {
      UErrorCode status = U_ZERO_ERROR;
      this->init(status);
@@ -177,15 +186,11 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& oth
   * Destructor
   */
  RuleBasedBreakIterator::~RuleBasedBreakIterator() {
-    if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
+    if (fCharIter != &fSCharIter) {
          // fCharIter was adopted from the outside.
          delete fCharIter;
      }
      fCharIter = NULL;
-    delete fSCharIter;
-    fSCharIter = NULL;
-    delete fDCharIter;
-    fDCharIter = NULL;
  
      utext_close(fText);
  
@@ -226,17 +231,21 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
      UErrorCode status = U_ZERO_ERROR;
      fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
  
-    if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
+    if (fCharIter != &fSCharIter) {
          delete fCharIter;
      }
      fCharIter = NULL;
  
-    if (that.fCharIter != NULL ) {
+    if (that.fCharIter != NULL && that.fCharIter != &that.fSCharIter) {
         // This is a little bit tricky - it will initially appear that
          //  this->fCharIter is adopted, even if that->fCharIter was
          //  not adopted.  That's ok.
          fCharIter = that.fCharIter->clone();
      }
+    fSCharIter = that.fSCharIter;
+    if (fCharIter == NULL) {
+        fCharIter = &fSCharIter;
+    }
  
      if (fData != NULL) {
          fData->removeReference();
@@ -271,8 +280,6 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
  void RuleBasedBreakIterator::init(UErrorCode &status) {
      fText                 = NULL;
      fCharIter             = NULL;
-    fSCharIter            = NULL;
-    fDCharIter            = NULL;
      fData                 = NULL;
      fPosition             = 0;
      fRuleStatusIndex      = 0;
@@ -393,20 +400,13 @@ void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
      //   Return one over an empty string instead - this is the closest
      //   we can come to signaling a failure.
      //   (GetText() is obsolete, this failure is sort of OK)
-    if (fDCharIter == NULL) {
-        static const UChar c = 0;
-        fDCharIter = new UCharCharacterIterator(&c, 0);
-        if (fDCharIter == NULL) {
-            status = U_MEMORY_ALLOCATION_ERROR;
-            return;
-        }
-    }
+    fSCharIter.setText(UnicodeString());
  
-    if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
+    if (fCharIter != &fSCharIter) {
          // existing fCharIter was adopted from the outside.  Delete it now.
          delete fCharIter;
      }
-    fCharIter = fDCharIter;
+    fCharIter = &fSCharIter;
  
      this->first();
  }
@@ -439,7 +439,7 @@ void
  RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
      // If we are holding a CharacterIterator adopted from a
      //   previous call to this function, delete it now.
-    if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
+    if (fCharIter != &fSCharIter) {
          delete fCharIter;
      }
  
@@ -473,17 +473,13 @@ RuleBasedBreakIterator::setText(const UnicodeString& newText) {
      //   Needed in case someone calls getText().
      //  Can not, unfortunately, do this lazily on the (probably never)
      //  call to getText(), because getText is const.
-    if (fSCharIter == NULL) {
-        fSCharIter = new StringCharacterIterator(newText);
-    } else {
-        fSCharIter->setText(newText);
-    }
+    fSCharIter.setText(newText);
  
-    if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
+    if (fCharIter != &fSCharIter) {
          // old fCharIter was adopted from the outside.  Delete it.
          delete fCharIter;
      }
-    fCharIter = fSCharIter;
+    fCharIter = &fSCharIter;
  
      this->first();
  }
diff --git a/icu4c/source/common/rbbi_cache.cpp b/icu4c/source/common/rbbi_cache.cpp

index b3c3a38a0deced6a9fbfba6d3fe0bf8525baed7d..75da83e07997d1317c0fdc72ae29843197ffe9f0 100644 (file)
--- a/icu4c/source/common/rbbi_cache.cpp
+++ b/icu4c/source/common/rbbi_cache.cpp
@@ -26,14 +26,11 @@ U_NAMESPACE_BEGIN
   */
  
  RuleBasedBreakIterator::DictionaryCache::DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status) :
-        fBI(bi), fBreaks(NULL), fPositionInCache(-1),
+        fBI(bi), fBreaks(status), fPositionInCache(-1),
          fStart(0), fLimit(0), fFirstRuleStatusIndex(0), fOtherRuleStatusIndex(0) {
-    fBreaks = new UVector32(status);
  }
  
  RuleBasedBreakIterator::DictionaryCache::~DictionaryCache() {
-    delete fBreaks;
-    fBreaks = NULL;
  }
  
  void RuleBasedBreakIterator::DictionaryCache::reset() {
@@ -42,7 +39,7 @@ void RuleBasedBreakIterator::DictionaryCache::reset() {
      fLimit = 0;
      fFirstRuleStatusIndex = 0;
      fOtherRuleStatusIndex = 0;
-    fBreaks->removeAllElements();
+    fBreaks.removeAllElements();
  }
  
  UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_t *result, int32_t *statusIndex) {
@@ -54,13 +51,13 @@ UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_
      // Sequential iteration, move from previous boundary to the following
  
      int32_t r = 0;
-    if (fPositionInCache >= 0 && fPositionInCache < fBreaks->size() && fBreaks->elementAti(fPositionInCache) == fromPos) {
+    if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) {
          ++fPositionInCache;
-        if (fPositionInCache >= fBreaks->size()) {
+        if (fPositionInCache >= fBreaks.size()) {
              fPositionInCache = -1;
              return FALSE;
          }
-        r = fBreaks->elementAti(fPositionInCache);
+        r = fBreaks.elementAti(fPositionInCache);
          U_ASSERT(r > fromPos);
          *result = r;
          *statusIndex = fOtherRuleStatusIndex;
@@ -69,8 +66,8 @@ UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_
  
      // Random indexing. Linear search for the boundary following the given position.
  
-    for (fPositionInCache = 0; fPositionInCache < fBreaks->size(); ++fPositionInCache) {
-        r= fBreaks->elementAti(fPositionInCache);
+    for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) {
+        r= fBreaks.elementAti(fPositionInCache);
          if (r > fromPos) {
              *result = r;
              *statusIndex = fOtherRuleStatusIndex;
@@ -90,16 +87,16 @@ UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_
      }
  
      if (fromPos == fLimit) {
-        fPositionInCache = fBreaks->size() - 1;
+        fPositionInCache = fBreaks.size() - 1;
          if (fPositionInCache >= 0) {
-            U_ASSERT(fBreaks->elementAti(fPositionInCache) == fromPos);
+            U_ASSERT(fBreaks.elementAti(fPositionInCache) == fromPos);
          }
      }
  
      int32_t r;
-    if (fPositionInCache > 0 && fPositionInCache < fBreaks->size() && fBreaks->elementAti(fPositionInCache) == fromPos) {
+    if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) {
          --fPositionInCache;
-        r = fBreaks->elementAti(fPositionInCache);
+        r = fBreaks.elementAti(fPositionInCache);
          U_ASSERT(r < fromPos);
          *result = r;
          *statusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
@@ -111,8 +108,8 @@ UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_
          return FALSE;
      }
  
-    for (fPositionInCache = fBreaks->size()-1; fPositionInCache >= 0; --fPositionInCache) {
-        r = fBreaks->elementAti(fPositionInCache);
+    for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) {
+        r = fBreaks.elementAti(fPositionInCache);
          if (r < fromPos) {
              *result = r;
              *statusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
@@ -168,7 +165,7 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
          // Ask the language object if there are any breaks. It will add them to the cache and
          // leave the text pointer on the other side of its range, ready to search for the next one.
          if (lbe != NULL) {
-            foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBI->fBreakType, *fBreaks);
+            foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBI->fBreakType, fBreaks);
          }
  
          // Reload the loop variables for the next go-round
@@ -182,21 +179,21 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
  
      // printf("foundBreakCount = %d\n", foundBreakCount);
      if (foundBreakCount > 0) {
-        U_ASSERT(foundBreakCount == fBreaks->size());
-        if (startPos < fBreaks->elementAti(0)) {
+        U_ASSERT(foundBreakCount == fBreaks.size());
+        if (startPos < fBreaks.elementAti(0)) {
              // The dictionary did not place a boundary at the start of the segment of text.
              // Add one now. This should not commonly happen, but it would be easy for interactions
              // of the rules for dictionary segments and the break engine implementations to
              // inadvertently cause it. Cover it here, just in case.
-            fBreaks->insertElementAt(startPos, 0, status);
+            fBreaks.insertElementAt(startPos, 0, status);
          }
-        if (endPos > fBreaks->peeki()) {
-            fBreaks->push(endPos, status);
+        if (endPos > fBreaks.peeki()) {
+            fBreaks.push(endPos, status);
          }
          fPositionInCache = 0;
          // Note: Dictionary matching may extend beyond the original limit.
-        fStart = fBreaks->elementAti(0);
-        fLimit = fBreaks->peeki();
+        fStart = fBreaks.elementAti(0);
+        fLimit = fBreaks.peeki();
      } else {
          // there were no language-based breaks, even though the segment contained
          // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache
diff --git a/icu4c/source/common/rbbi_cache.h b/icu4c/source/common/rbbi_cache.h

index dea017a91262f9bb794a009e055f6d5a40600a54..b4338c37490e2b3cdf1aa6343aa1f434748a0c86 100644 (file)
--- a/icu4c/source/common/rbbi_cache.h
+++ b/icu4c/source/common/rbbi_cache.h
@@ -56,7 +56,7 @@ class RuleBasedBreakIterator::DictionaryCache: public UMemory {
  
      RuleBasedBreakIterator *fBI;
      
-    UVector32          *fBreaks;                // A vector containing the boundaries.
+    UVector32           fBreaks;                // A vector containing the boundaries.
      int32_t             fPositionInCache;       // Index in fBreaks of last boundary returned by following()
                                                  //    or preceding(). Optimizes sequential access.
      int32_t             fStart;                 // Text position of first boundary in cache.
diff --git a/icu4c/source/common/unicode/rbbi.h b/icu4c/source/common/unicode/rbbi.h

index a0c1f90415c87f3874bd7fe25683a783399b6f80..47f51f2b698e368d3de56764679bf0bef6b21827 100644 (file)
--- a/icu4c/source/common/unicode/rbbi.h
+++ b/icu4c/source/common/unicode/rbbi.h
@@ -29,7 +29,6 @@
  #include "unicode/udata.h"
  #include "unicode/parseerr.h"
  #include "unicode/schriter.h"
-#include "unicode/uchriter.h"
  
  U_NAMESPACE_BEGIN
  
@@ -72,14 +71,7 @@ private:
       *    a characterIterator that wraps that data.  Needed only for the
       *    implementation of getText(), a backwards compatibility issue.
       */
-    StringCharacterIterator *fSCharIter;
-
-    /**
-     *  When the input text is provided by a UText, this
-     *    dummy CharacterIterator over an empty string will
-     *    be returned from getText()
-     */
-    UCharCharacterIterator *fDCharIter;
+    StringCharacterIterator fSCharIter;
  
      /**
       * The rule data for this BreakIterator instance
author	Andy Heninger <andy.heninger@gmail.com>
	Fri, 19 Jan 2018 22:30:56 +0000 (22:30 +0000)
committer	Andy Heninger <andy.heninger@gmail.com>
	Fri, 19 Jan 2018 22:30:56 +0000 (22:30 +0000)
icu4c/source/common/rbbi.cpp		patch \| blob \| history
icu4c/source/common/rbbi_cache.cpp		patch \| blob \| history
icu4c/source/common/rbbi_cache.h		patch \| blob \| history
icu4c/source/common/unicode/rbbi.h		patch \| blob \| history