]> granicus.if.org Git - icu/commitdiff
ICU-13541 Improve RuleBasedBreakIterator construction time, patch from grhoten.
authorAndy Heninger <andy.heninger@gmail.com>
Fri, 19 Jan 2018 22:30:56 +0000 (22:30 +0000)
committerAndy Heninger <andy.heninger@gmail.com>
Fri, 19 Jan 2018 22:30:56 +0000 (22:30 +0000)
X-SVN-Rev: 40789

icu4c/source/common/rbbi.cpp
icu4c/source/common/rbbi_cache.cpp
icu4c/source/common/rbbi_cache.h
icu4c/source/common/unicode/rbbi.h

index a02c48d427e68df2244574e7822541c8d47f6f44..61d187d36a3090d71bf8cd7aa75fcb14eb6fb89a 100644 (file)
@@ -64,7 +64,9 @@ UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
  * Constructs a RuleBasedBreakIterator that uses the already-created
  * tables object that is passed in as a parameter.
  */
-RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status) {
+RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
+ : fSCharIter(UnicodeString())
+{
     init(status);
     fData = new RBBIDataWrapper(data, status); // status checked in constructor
     if (U_FAILURE(status)) {return;}
@@ -80,7 +82,9 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode
 //
 RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
                        uint32_t       ruleLength,
-                       UErrorCode     &status) {
+                       UErrorCode     &status)
+ : fSCharIter(UnicodeString())
+{
     init(status);
     if (U_FAILURE(status)) {
         return;
@@ -110,6 +114,7 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
 //
 //-------------------------------------------------------------------------------
 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
+ : fSCharIter(UnicodeString())
 {
     init(status);
     fData = new RBBIDataWrapper(udm, status); // status checked in constructor
@@ -130,6 +135,7 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &sta
 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  &rules,
                                                 UParseError          &parseError,
                                                 UErrorCode           &status)
+ : fSCharIter(UnicodeString())
 {
     init(status);
     if (U_FAILURE(status)) {return;}
@@ -152,7 +158,9 @@ RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  &rules,
 //                           Used when creating a RuleBasedBreakIterator from a set
 //                           of rules.
 //-------------------------------------------------------------------------------
-RuleBasedBreakIterator::RuleBasedBreakIterator() {
+RuleBasedBreakIterator::RuleBasedBreakIterator()
+ : fSCharIter(UnicodeString())
+{
     UErrorCode status = U_ZERO_ERROR;
     init(status);
 }
@@ -165,7 +173,8 @@ RuleBasedBreakIterator::RuleBasedBreakIterator() {
 //
 //-------------------------------------------------------------------------------
 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
-: BreakIterator(other)
+: BreakIterator(other),
+  fSCharIter(UnicodeString())
 {
     UErrorCode status = U_ZERO_ERROR;
     this->init(status);
@@ -177,15 +186,11 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& oth
  * Destructor
  */
 RuleBasedBreakIterator::~RuleBasedBreakIterator() {
-    if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
+    if (fCharIter != &fSCharIter) {
         // fCharIter was adopted from the outside.
         delete fCharIter;
     }
     fCharIter = NULL;
-    delete fSCharIter;
-    fSCharIter = NULL;
-    delete fDCharIter;
-    fDCharIter = NULL;
 
     utext_close(fText);
 
@@ -226,17 +231,21 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
     UErrorCode status = U_ZERO_ERROR;
     fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
 
-    if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
+    if (fCharIter != &fSCharIter) {
         delete fCharIter;
     }
     fCharIter = NULL;
 
-    if (that.fCharIter != NULL ) {
+    if (that.fCharIter != NULL && that.fCharIter != &that.fSCharIter) {
         // This is a little bit tricky - it will intially appear that
         //  this->fCharIter is adopted, even if that->fCharIter was
         //  not adopted.  That's ok.
         fCharIter = that.fCharIter->clone();
     }
+    fSCharIter = that.fSCharIter;
+    if (fCharIter == NULL) {
+        fCharIter = &fSCharIter;
+    }
 
     if (fData != NULL) {
         fData->removeReference();
@@ -271,8 +280,6 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
 void RuleBasedBreakIterator::init(UErrorCode &status) {
     fText                 = NULL;
     fCharIter             = NULL;
-    fSCharIter            = NULL;
-    fDCharIter            = NULL;
     fData                 = NULL;
     fPosition             = 0;
     fRuleStatusIndex      = 0;
@@ -393,20 +400,13 @@ void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
     //   Return one over an empty string instead - this is the closest
     //   we can come to signaling a failure.
     //   (GetText() is obsolete, this failure is sort of OK)
-    if (fDCharIter == NULL) {
-        static const UChar c = 0;
-        fDCharIter = new UCharCharacterIterator(&c, 0);
-        if (fDCharIter == NULL) {
-            status = U_MEMORY_ALLOCATION_ERROR;
-            return;
-        }
-    }
+    fSCharIter.setText(UnicodeString());
 
-    if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
+    if (fCharIter != &fSCharIter) {
         // existing fCharIter was adopted from the outside.  Delete it now.
         delete fCharIter;
     }
-    fCharIter = fDCharIter;
+    fCharIter = &fSCharIter;
 
     this->first();
 }
@@ -439,7 +439,7 @@ void
 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
     // If we are holding a CharacterIterator adopted from a
     //   previous call to this function, delete it now.
-    if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
+    if (fCharIter != &fSCharIter) {
         delete fCharIter;
     }
 
@@ -473,17 +473,13 @@ RuleBasedBreakIterator::setText(const UnicodeString& newText) {
     //   Needed in case someone calls getText().
     //  Can not, unfortunately, do this lazily on the (probably never)
     //  call to getText(), because getText is const.
-    if (fSCharIter == NULL) {
-        fSCharIter = new StringCharacterIterator(newText);
-    } else {
-        fSCharIter->setText(newText);
-    }
+    fSCharIter.setText(newText);
 
-    if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
+    if (fCharIter != &fSCharIter) {
         // old fCharIter was adopted from the outside.  Delete it.
         delete fCharIter;
     }
-    fCharIter = fSCharIter;
+    fCharIter = &fSCharIter;
 
     this->first();
 }
index b3c3a38a0deced6a9fbfba6d3fe0bf8525baed7d..75da83e07997d1317c0fdc72ae29843197ffe9f0 100644 (file)
@@ -26,14 +26,11 @@ U_NAMESPACE_BEGIN
  */
 
 RuleBasedBreakIterator::DictionaryCache::DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status) :
-        fBI(bi), fBreaks(NULL), fPositionInCache(-1),
+        fBI(bi), fBreaks(status), fPositionInCache(-1),
         fStart(0), fLimit(0), fFirstRuleStatusIndex(0), fOtherRuleStatusIndex(0) {
-    fBreaks = new UVector32(status);
 }
 
 RuleBasedBreakIterator::DictionaryCache::~DictionaryCache() {
-    delete fBreaks;
-    fBreaks = NULL;
 }
 
 void RuleBasedBreakIterator::DictionaryCache::reset() {
@@ -42,7 +39,7 @@ void RuleBasedBreakIterator::DictionaryCache::reset() {
     fLimit = 0;
     fFirstRuleStatusIndex = 0;
     fOtherRuleStatusIndex = 0;
-    fBreaks->removeAllElements();
+    fBreaks.removeAllElements();
 }
 
 UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_t *result, int32_t *statusIndex) {
@@ -54,13 +51,13 @@ UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_
     // Sequential iteration, move from previous boundary to the following
 
     int32_t r = 0;
-    if (fPositionInCache >= 0 && fPositionInCache < fBreaks->size() && fBreaks->elementAti(fPositionInCache) == fromPos) {
+    if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) {
         ++fPositionInCache;
-        if (fPositionInCache >= fBreaks->size()) {
+        if (fPositionInCache >= fBreaks.size()) {
             fPositionInCache = -1;
             return FALSE;
         }
-        r = fBreaks->elementAti(fPositionInCache);
+        r = fBreaks.elementAti(fPositionInCache);
         U_ASSERT(r > fromPos);
         *result = r;
         *statusIndex = fOtherRuleStatusIndex;
@@ -69,8 +66,8 @@ UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_
 
     // Random indexing. Linear search for the boundary following the given position.
 
-    for (fPositionInCache = 0; fPositionInCache < fBreaks->size(); ++fPositionInCache) {
-        r= fBreaks->elementAti(fPositionInCache);
+    for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) {
+        r= fBreaks.elementAti(fPositionInCache);
         if (r > fromPos) {
             *result = r;
             *statusIndex = fOtherRuleStatusIndex;
@@ -90,16 +87,16 @@ UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_
     }
 
     if (fromPos == fLimit) {
-        fPositionInCache = fBreaks->size() - 1;
+        fPositionInCache = fBreaks.size() - 1;
         if (fPositionInCache >= 0) {
-            U_ASSERT(fBreaks->elementAti(fPositionInCache) == fromPos);
+            U_ASSERT(fBreaks.elementAti(fPositionInCache) == fromPos);
         }
     }
 
     int32_t r;
-    if (fPositionInCache > 0 && fPositionInCache < fBreaks->size() && fBreaks->elementAti(fPositionInCache) == fromPos) {
+    if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) {
         --fPositionInCache;
-        r = fBreaks->elementAti(fPositionInCache);
+        r = fBreaks.elementAti(fPositionInCache);
         U_ASSERT(r < fromPos);
         *result = r;
         *statusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
@@ -111,8 +108,8 @@ UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_
         return FALSE;
     }
 
-    for (fPositionInCache = fBreaks->size()-1; fPositionInCache >= 0; --fPositionInCache) {
-        r = fBreaks->elementAti(fPositionInCache);
+    for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) {
+        r = fBreaks.elementAti(fPositionInCache);
         if (r < fromPos) {
             *result = r;
             *statusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
@@ -168,7 +165,7 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
         // Ask the language object if there are any breaks. It will add them to the cache and
         // leave the text pointer on the other side of its range, ready to search for the next one.
         if (lbe != NULL) {
-            foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBI->fBreakType, *fBreaks);
+            foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBI->fBreakType, fBreaks);
         }
 
         // Reload the loop variables for the next go-round
@@ -182,21 +179,21 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
 
     // printf("foundBreakCount = %d\n", foundBreakCount);
     if (foundBreakCount > 0) {
-        U_ASSERT(foundBreakCount == fBreaks->size());
-        if (startPos < fBreaks->elementAti(0)) {
+        U_ASSERT(foundBreakCount == fBreaks.size());
+        if (startPos < fBreaks.elementAti(0)) {
             // The dictionary did not place a boundary at the start of the segment of text.
             // Add one now. This should not commonly happen, but it would be easy for interactions
             // of the rules for dictionary segments and the break engine implementations to
             // inadvertently cause it. Cover it here, just in case.
-            fBreaks->insertElementAt(startPos, 0, status);
+            fBreaks.insertElementAt(startPos, 0, status);
         }
-        if (endPos > fBreaks->peeki()) {
-            fBreaks->push(endPos, status);
+        if (endPos > fBreaks.peeki()) {
+            fBreaks.push(endPos, status);
         }
         fPositionInCache = 0;
         // Note: Dictionary matching may extend beyond the original limit.
-        fStart = fBreaks->elementAti(0);
-        fLimit = fBreaks->peeki();
+        fStart = fBreaks.elementAti(0);
+        fLimit = fBreaks.peeki();
     } else {
         // there were no language-based breaks, even though the segment contained
         // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache
index dea017a91262f9bb794a009e055f6d5a40600a54..b4338c37490e2b3cdf1aa6343aa1f434748a0c86 100644 (file)
@@ -56,7 +56,7 @@ class RuleBasedBreakIterator::DictionaryCache: public UMemory {
 
     RuleBasedBreakIterator *fBI;
     
-    UVector32          *fBreaks;                // A vector containing the boundaries.
+    UVector32           fBreaks;                // A vector containing the boundaries.
     int32_t             fPositionInCache;       // Index in fBreaks of last boundary returned by following()
                                                 //    or preceding(). Optimizes sequential access.
     int32_t             fStart;                 // Text position of first boundary in cache.
index a0c1f90415c87f3874bd7fe25683a783399b6f80..47f51f2b698e368d3de56764679bf0bef6b21827 100644 (file)
@@ -29,7 +29,6 @@
 #include "unicode/udata.h"
 #include "unicode/parseerr.h"
 #include "unicode/schriter.h"
-#include "unicode/uchriter.h"
 
 U_NAMESPACE_BEGIN
 
@@ -72,14 +71,7 @@ private:
      *    a characterIterator that wraps that data.  Needed only for the
      *    implementation of getText(), a backwards compatibility issue.
      */
-    StringCharacterIterator *fSCharIter;
-
-    /**
-     *  When the input text is provided by a UText, this
-     *    dummy CharacterIterator over an empty string will
-     *    be returned from getText()
-     */
-    UCharCharacterIterator *fDCharIter;
+    StringCharacterIterator fSCharIter;
 
     /**
      * The rule data for this BreakIterator instance