granicus.if.org Git - icu/commitdiff
ICU-12507 ICU4C RBBI, switch to UTrie2
author Andy Heninger <andy.heninger@gmail.com>
Wed, 3 May 2017 23:44:14 +0000 (23:44 +0000)
committer Andy Heninger <andy.heninger@gmail.com>
Wed, 3 May 2017 23:44:14 +0000 (23:44 +0000)
X-SVN-Rev: 40105

icu4c/source/common/rbbi.cpp
icu4c/source/common/rbbidata.cpp
icu4c/source/common/rbbidata.h
icu4c/source/common/rbbisetb.cpp
icu4c/source/common/rbbisetb.h
icu4c/source/common/unicode/rbbi.h

diff --git a/icu4c/source/common/rbbi.cpp b/icu4c/source/common/rbbi.cpp
index d032604e04975c75a9013cd87d8fec98a338708b..d5458e7e7e9a064a1c6439f841bef61fe7395db1 100644
@@ -1078,7 +1078,7 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
             //        not the size of the character going in, which is a UChar32.
             //
-            UTRIE_GET16(&fData->fTrie, c, category);
+            category = UTRIE2_GET16(fData->fTrie, c);
 
             // Check the dictionary bit in the character's category.
             //    Counter is only used by dictionary based iterators (subclasses).
@@ -1275,7 +1275,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
             //        not the size of the character going in, which is a UChar32.
             //
-            UTRIE_GET16(&fData->fTrie, c, category);
+            category = UTRIE2_GET16(fData->fTrie, c);
 
             // Check the dictionary bit in the character's category.
             //    Counter is only used by dictionary based iterators (subclasses).
@@ -1510,26 +1510,6 @@ BreakIterator *  RuleBasedBreakIterator::createBufferClone(void * /*stackBuffer*
 }
 
 
-//-------------------------------------------------------------------------------
-//
-//  isDictionaryChar      Return true if the category lookup for this char
-//                        indicates that it is in the set of dictionary lookup
-//                        chars.
-//
-//                        This function is intended for use by dictionary based
-//                        break iterators.
-//
-//-------------------------------------------------------------------------------
-/*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32   c) {
-    if (fData == NULL) {
-        return FALSE;
-    }
-    uint16_t category;
-    UTRIE_GET16(&fData->fTrie, c, category);
-    return (category & 0x4000) != 0;
-}*/
-
-
 //-------------------------------------------------------------------------------
 //
 //  checkDictionary       This function handles all processing of characters in
@@ -1569,7 +1549,7 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
     int32_t     foundBreakCount = 0;
     UChar32     c = utext_current32(fText);
 
-    UTRIE_GET16(&fData->fTrie, c, category);
+    category = UTRIE2_GET16(fData->fTrie, c);
 
     // Is the character we're starting on a dictionary character? If so, we
     // need to back up to include the entire run; otherwise the results of
@@ -1581,7 +1561,7 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
             do {
                 utext_next32(fText);          // TODO:  recast to work directly with postincrement.
                 c = utext_current32(fText);
-                UTRIE_GET16(&fData->fTrie, c, category);
+                category = UTRIE2_GET16(fData->fTrie, c);
             } while (c != U_SENTINEL && (category & 0x4000));
             // Back up to the last dictionary character
             rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
@@ -1597,7 +1577,7 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
         else {
             do {
                 c = UTEXT_PREVIOUS32(fText);
-                UTRIE_GET16(&fData->fTrie, c, category);
+                category = UTRIE2_GET16(fData->fTrie, c);
             }
             while (c != U_SENTINEL && (category & 0x4000));
             // Back up to the last dictionary character
@@ -1611,7 +1591,7 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
             }
             rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
         }
-        UTRIE_GET16(&fData->fTrie, c, category);
+        category = UTRIE2_GET16(fData->fTrie, c);
     }
 
     // Loop through the text, looking for ranges of dictionary characters.
@@ -1622,13 +1602,13 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
     if (reverse) {
         utext_setNativeIndex(fText, rangeStart);
         c = utext_current32(fText);
-        UTRIE_GET16(&fData->fTrie, c, category);
+        category = UTRIE2_GET16(fData->fTrie, c);
     }
     while(U_SUCCESS(status)) {
         while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
             utext_next32(fText);           // TODO:  tweak for post-increment operation
             c = utext_current32(fText);
-            UTRIE_GET16(&fData->fTrie, c, category);
+            category = UTRIE2_GET16(fData->fTrie, c);
         }
         if (current >= rangeEnd) {
             break;
@@ -1646,7 +1626,7 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
 
         // Reload the loop variables for the next go-round
         c = utext_current32(fText);
-        UTRIE_GET16(&fData->fTrie, c, category);
+        category = UTRIE2_GET16(fData->fTrie, c);
     }
 
     // If we found breaks, build a new break cache. The first and last entries must
diff --git a/icu4c/source/common/rbbidata.cpp b/icu4c/source/common/rbbidata.cpp
index 63ed15f39b8c48570787e43d87d448998bea869f..916e2b6d1fd9d8f88c581e2cf8bff56e747ec1a8 100644
 #include "uassert.h"
 
 
-//-----------------------------------------------------------------------------------
-//
-//   Trie access folding function.  Copied as-is from properties code in uchar.c
-//
-//-----------------------------------------------------------------------------------
-U_CDECL_BEGIN
-static int32_t U_CALLCONV
-getFoldingOffset(uint32_t data) {
-    /* if bit 15 is set, then the folding offset is in bits 14..0 of the 16-bit trie result */
-    if(data&0x8000) {
-        return (int32_t)(data&0x7fff);
-    } else {
-        return 0;
-    }
-}
-U_CDECL_END
-
 U_NAMESPACE_BEGIN
 
 //-----------------------------------------------------------------------------
@@ -98,6 +81,7 @@ void RBBIDataWrapper::init0() {
     fSafeRevTable = NULL;
     fRuleSource = NULL;
     fRuleStatusTable = NULL;
+    fTrie = NULL;
     fUDataMem = NULL;
     fRefCount = 0;
     fDontFreeData = TRUE;
@@ -132,15 +116,14 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
     }
 
 
-    utrie_unserialize(&fTrie,
-                       (uint8_t *)data + fHeader->fTrie,
-                       fHeader->fTrieLen,
-                       &status);
+    fTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
+                                      (uint8_t *)data + fHeader->fTrie,
+                                      fHeader->fTrieLen,
+                                      NULL,           // *actual length
+                                      &status);
     if (U_FAILURE(status)) {
         return;
     }
-    fTrie.getFoldingOffset=getFoldingOffset;
-
 
     fRuleSource   = (UChar *)((char *)data + fHeader->fRuleSource);
     fRuleString.setTo(TRUE, fRuleSource, -1);
@@ -165,6 +148,8 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
 //-----------------------------------------------------------------------------
 RBBIDataWrapper::~RBBIDataWrapper() {
     U_ASSERT(fRefCount == 0);
+    utrie2_close(fTrie);
+    fTrie = NULL;
     if (fUDataMem) {
         udata_close(fUDataMem);
     } else if (!fDontFreeData) {
@@ -451,8 +436,8 @@ ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outD
     }
 
     // Trie table for character categories
-    utrie_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen),
-                            outBytes+ds->readUInt32(rbbiDH->fTrie), status);
+    utrie2_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen),
+                    outBytes+ds->readUInt32(rbbiDH->fTrie), status);
 
     // Source Rules Text.  It's UChar data
     ds->swapArray16(ds, inBytes+ds->readUInt32(rbbiDH->fRuleSource), ds->readUInt32(rbbiDH->fRuleSourceLen),
diff --git a/icu4c/source/common/rbbidata.h b/icu4c/source/common/rbbidata.h
index 8011bafd56bf83ca130fa2a00d413fc1391ead42..c64c65cb46633c0faaae93afc132a48ec2704b13 100644
@@ -52,7 +52,7 @@ ubrk_swap(const UDataSwapper *ds,
 #include "unicode/uobject.h"
 #include "unicode/unistr.h"
 #include "umutex.h"
-#include "utrie.h"
+#include "utrie2.h"
 
 U_NAMESPACE_BEGIN
 
@@ -181,7 +181,7 @@ public:
     /* number of int32_t values in the rule status table.   Used to sanity check indexing */
     int32_t             fStatusMaxIdx;
 
-    UTrie               fTrie;
+    UTrie2              *fTrie;
 
 private:
     u_atomic_int32_t    fRefCount;
diff --git a/icu4c/source/common/rbbisetb.cpp b/icu4c/source/common/rbbisetb.cpp
index f55388aeaca7163625c2772861cea940feb1f836..f473b16974a685e97bc3c349381104c4cf63c0e1 100644
@@ -35,7 +35,7 @@
 #if !UCONFIG_NO_BREAK_ITERATION
 
 #include "unicode/uniset.h"
-#include "utrie.h"
+#include "utrie2.h"
 #include "uvector.h"
 #include "uassert.h"
 #include "cmemory.h"
 #include "rbbisetb.h"
 #include "rbbinode.h"
 
-
-//------------------------------------------------------------------------
-//
-//   getFoldedRBBIValue        Call-back function used during building of Trie table.
-//                             Folding value: just store the offset (16 bits)
-//                             if there is any non-0 entry.
-//                             (It'd really be nice if the Trie builder would provide a
-//                             simple default, so this function could go away from here.)
-//
-//------------------------------------------------------------------------
-/* folding value: just store the offset (16 bits) if there is any non-0 entry */
-U_CDECL_BEGIN
-static uint32_t U_CALLCONV
-getFoldedRBBIValue(UNewTrie *trie, UChar32 start, int32_t offset) {
-    uint32_t value;
-    UChar32 limit;
-    UBool inBlockZero;
-
-    limit=start+0x400;
-    while(start<limit) {
-        value=utrie_get32(trie, start, &inBlockZero);
-        if(inBlockZero) {
-            start+=UTRIE_DATA_BLOCK_LENGTH;
-        } else if(value!=0) {
-            return (uint32_t)(offset|0x8000);
-        } else {
-            ++start;
-        }
-    }
-    return 0;
-}
-
-
-U_CDECL_END
-
-
-
 U_NAMESPACE_BEGIN
 
 //------------------------------------------------------------------------
@@ -116,7 +79,7 @@ RBBISetBuilder::~RBBISetBuilder()
         delete r;
     }
 
-    utrie_close(fTrie);
+    utrie2_close(fTrie);
 }
 
 
@@ -287,33 +250,30 @@ void RBBISetBuilder::build() {
     // Build the Trie table for mapping UChar32 values to the corresponding
     //   range group number
     //
-    fTrie = utrie_open(NULL,    //  Pre-existing trie to be filled in
-                      NULL,    //  Data array  (utrie will allocate one)
-                      100000,  //  Max Data Length
-                      0,       //  Initial value for all code points
-                      0,       //  Lead surrogate unit value
-                      TRUE);   //  Keep Latin 1 in separately
-
+    fTrie = utrie2_open(0,       //  Initial value for all code points
+                        0,       //  errorValue
+                        fStatus);
 
     for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
-        utrie_setRange32(fTrie, rlRange->fStartChar, rlRange->fEndChar+1, rlRange->fNum, TRUE);
+        utrie2_setRange32(fTrie, rlRange->fStartChar, rlRange->fEndChar, rlRange->fNum, TRUE, fStatus);
     }
 }
 
 
-
 //-----------------------------------------------------------------------------------
 //
 //  getTrieSize()    Return the size that will be required to serialize the Trie.
 //
 //-----------------------------------------------------------------------------------
 int32_t RBBISetBuilder::getTrieSize() /*const*/ {
-    fTrieSize  = utrie_serialize(fTrie,
-                                    NULL,                // Buffer
-                                    0,                   // Capacity
-                                    getFoldedRBBIValue,
-                                    TRUE,                // Reduce to 16 bits
-                                    fStatus);
+    utrie2_freeze(fTrie, UTRIE2_16_VALUE_BITS, fStatus);
+    fTrieSize  = utrie2_serialize(fTrie,
+                                  NULL,                // Buffer
+                                  0,                   // Capacity
+                                  fStatus);
+    if (*fStatus == U_BUFFER_OVERFLOW_ERROR) {
+        *fStatus = U_ZERO_ERROR;
+    }
     // RBBIDebugPrintf("Trie table size is %d\n", trieSize);
     return fTrieSize;
 }
@@ -327,12 +287,10 @@ int32_t RBBISetBuilder::getTrieSize() /*const*/ {
 //
 //-----------------------------------------------------------------------------------
 void RBBISetBuilder::serializeTrie(uint8_t *where) {
-    utrie_serialize(fTrie,
-                    where,                   // Buffer
-                    fTrieSize,               // Capacity
-                    getFoldedRBBIValue,
-                    TRUE,                    // Reduce to 16 bits
-                    fStatus);
+    utrie2_serialize(fTrie,
+                     where,                   // Buffer
+                     fTrieSize,               // Capacity
+                     fStatus);
 }
 
 //------------------------------------------------------------------------
diff --git a/icu4c/source/common/rbbisetb.h b/icu4c/source/common/rbbisetb.h
index a7d1e7af3bcfb217596f2d5a95cadc6e2d727fb9..5d1f011097e1affbc9a986212fdef2e958f27262 100644
 #include "unicode/utypes.h"
 #include "unicode/uobject.h"
 #include "rbbirb.h"
+#include "utrie2.h"
 #include "uvector.h"
 
-struct  UNewTrie;
-
 U_NAMESPACE_BEGIN
 
 //
@@ -109,7 +108,7 @@ private:
 
     RangeDescriptor       *fRangeList;      // Head of the linked list of RangeDescriptors
 
-    UNewTrie              *fTrie;           // The mapping TRIE that is the end result of processing
+    UTrie2               *fTrie;            // The mapping TRIE that is the end result of processing
     uint32_t              fTrieSize;        //  the Unicode Sets.
 
     // Groups correspond to character categories -
diff --git a/icu4c/source/common/unicode/rbbi.h b/icu4c/source/common/unicode/rbbi.h
index d654154008bc7fce2fa16fb6e9e1a38492f837fc..eb80b9b665e258c7c7ca7263a18d03b1d6bb5bdc 100644
@@ -32,8 +32,6 @@
 #include "unicode/uchriter.h"
 
 
-struct UTrie;
-
 U_NAMESPACE_BEGIN
 
 /** @internal */