]> granicus.if.org Git - icu/commitdiff
ICU-13569 rbbi state table opt, work in progress.
authorAndy Heninger <andy.heninger@gmail.com>
Thu, 8 Feb 2018 01:42:04 +0000 (01:42 +0000)
committerAndy Heninger <andy.heninger@gmail.com>
Thu, 8 Feb 2018 01:42:04 +0000 (01:42 +0000)
X-SVN-Rev: 40855

12 files changed:
icu4c/source/common/rbbi.cpp
icu4c/source/common/rbbidata.cpp
icu4c/source/common/rbbidata.h
icu4c/source/common/rbbirb.cpp
icu4c/source/common/rbbirb.h
icu4c/source/common/rbbisetb.cpp
icu4c/source/common/rbbisetb.h
icu4c/source/common/rbbitblb.cpp
icu4c/source/common/rbbitblb.h
icu4c/source/common/unicode/rbbi.h
icu4c/source/test/intltest/rbbitst.cpp
icu4c/source/test/intltest/rbbitst.h

index 27e0fe7e5d9a6ed1bf9549b283564282276de022..c9f26ac9ce0bc5e67b1d11cae9cb6fbd6df18ad0 100644 (file)
@@ -1338,6 +1338,10 @@ void RuleBasedBreakIterator::dumpCache() {
     fBreakCache->dumpCache();
 }
 
+/**
+ * Debugging function. Dumps this iterator's compiled rule data by calling
+ * printData() on fData. The body of RBBIDataWrapper::printData() is compiled
+ * only when RBBI_DEBUG is defined, so nothing is printed in other builds.
+ */
+void RuleBasedBreakIterator::dumpTables() {
+    fData->printData();
+}
+
 /**
  * Returns the description used to create this iterator
  */
index 33405708c0698fe2ffac929123c8222090f65234..5b00e9506228c68e1af8c24e222de0dbbf0adff4 100644 (file)
@@ -267,8 +267,8 @@ void  RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *tab
 #endif
 
 
-#ifdef RBBI_DEBUG
 void  RBBIDataWrapper::printData() {
+#ifdef RBBI_DEBUG
     RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader);
     RBBIDebugPrintf("   Version = {%d %d %d %d}\n", fHeader->fFormatVersion[0], fHeader->fFormatVersion[1],
                                                     fHeader->fFormatVersion[2], fHeader->fFormatVersion[3]);
@@ -285,8 +285,8 @@ void  RBBIDataWrapper::printData() {
         RBBIDebugPrintf("%c", fRuleSource[c]);
     }
     RBBIDebugPrintf("\n\n");
-}
 #endif
+}
 
 
 U_NAMESPACE_END
index 0e7beb7726694c775e58b800ed04bd49a1513416..1244a118dc523595f27bd9dda9b18245ff987d8c 100644 (file)
@@ -165,13 +165,8 @@ public:
     UBool                 operator ==(const RBBIDataWrapper &other) const;
     int32_t               hashCode();
     const UnicodeString  &getRuleSourceString() const;
-#ifdef RBBI_DEBUG
     void                  printData();
     void                  printTable(const char *heading, const RBBIStateTable *table);
-#else
-    #define printData()
-    #define printTable(heading, table)
-#endif
 
     /*                                     */
     /*   Pointers to items within the data */
index 84dfd4b58cfaebe62b38fc400416c669dc346a15..817a955a9698b57b53e67e71b749a6c6e5ce1757 100644 (file)
@@ -282,10 +282,10 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString    &rules,
     //
     // UnicodeSet processing.
     //    Munge the Unicode Sets to create a set of character categories.
-    //    Generate the mapping tables (TRIE) from input 32-bit characters to
+    //    Generate the mapping tables (TRIE) from input code points to
     //    the character categories.
     //
-    builder.fSetBuilder->build();
+    builder.fSetBuilder->buildRanges();
 
 
     //
@@ -317,6 +317,11 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString    &rules,
     }
 #endif
 
+    builder.optimizeTables();
+    builder.fSetBuilder->buildTrie();
+
+
+
     //
     //   Package up the compiled data into a memory image
     //      in the run-time format.
@@ -348,6 +353,20 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString    &rules,
     return This;
 }
 
+//
+//  optimizeTables()   Repeatedly ask the forward table builder for a pair of
+//                     character classes (columns) with identical transitions.
+//                     For each pair found, fold the right-hand class into the
+//                     left-hand one in the set builder, then drop the redundant
+//                     column from the forward state table.
+//
+void RBBIRuleBuilder::optimizeTables() {
+    int32_t keptClass      = 1;
+    int32_t redundantClass = 2;
+
+    for (;;) {
+        if (!fForwardTables->findDuplCharClassFrom(keptClass, redundantClass)) {
+            break;
+        }
+        fSetBuilder->mergeCategories(keptClass, redundantClass);
+        fForwardTables->removeColumn(redundantClass);
+    }
+}
+
 U_NAMESPACE_END
 
 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
index 0c307577e362b356f40065df882423af5b139e44..f890cf686e36cb20887cc435b439c566febfd0b8 100644 (file)
@@ -126,6 +126,14 @@ public:
         );
 
     virtual    ~RBBIRuleBuilder();
+
+    /**
+     * Fold together redundant character classes (table columns) and
+     * redundant states (table rows). Done after initial table generation,
+     * before serializing the result.
+     */
+    void optimizeTables();
+
     char                          *fDebugEnv;        // controls debug trace output
     UErrorCode                    *fStatus;          // Error reporting.  Keeping status
     UParseError                   *fParseError;      //   here avoids passing it everywhere.
index e97eba8d14de3e29a6f498b774481d91694cb97a..67bb460acaa5e968f0b8c2bafed1e9622c36762b 100644 (file)
@@ -91,7 +91,7 @@ RBBISetBuilder::~RBBISetBuilder()
 //                  from the Unicode Sets.
 //
 //------------------------------------------------------------------------
-void RBBISetBuilder::build() {
+void RBBISetBuilder::buildRanges() {
     RBBINode        *usetNode;
     RangeDescriptor *rlRange;
 
@@ -245,11 +245,16 @@ void RBBISetBuilder::build() {
 
     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();}
     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();}
+}
+
+
+//
+// Build the Trie table for mapping UChar32 values to the corresponding
+// range group number.
+//
+void RBBISetBuilder::buildTrie() {
+    RangeDescriptor *rlRange;
 
-    //
-    // Build the Trie table for mapping UChar32 values to the corresponding
-    //   range group number
-    //
     fTrie = utrie2_open(0,       //  Initial value for all code points.
                         0,       //  Error value for out-of-range input.
                         fStatus);
@@ -265,6 +270,20 @@ void RBBISetBuilder::build() {
 }
 
 
+void RBBISetBuilder::mergeCategories(int32_t left, int32_t right) {
+    U_ASSERT(left >= 1);
+    U_ASSERT(right > left);
+    for (RangeDescriptor *rd = fRangeList; rd != nullptr; rd = rd->fNext) {
+        if (rd->fNum == right) {
+            rd->fNum = left;
+        } else if (rd->fNum > right) {
+            rd->fNum--;
+        }
+    }
+    --fGroupCount;
+}
+
+
 //-----------------------------------------------------------------------------------
 //
 //  getTrieSize()    Return the size that will be required to serialize the Trie.
index 7cedb45b33550f4101b01b789fbb9891f8d2cdc3..3f0ec1a8a0c462de4d922cf357654934b62f6ce5 100644 (file)
@@ -82,7 +82,8 @@ public:
     RBBISetBuilder(RBBIRuleBuilder *rb);
     ~RBBISetBuilder();
 
-    void     build();
+    void     buildRanges();
+    void     buildTrie();
     void     addValToSets(UVector *sets,      uint32_t val);
     void     addValToSet (RBBINode *usetNode, uint32_t val);
     int32_t  getNumCharCategories() const;   // CharCategories are the same as input symbol set to the
@@ -93,6 +94,11 @@ public:
     UChar32  getFirstChar(int32_t  val) const;
     UBool    sawBOF() const;                 // Indicate whether any references to the {bof} pseudo
                                              //   character were encountered.
+    /** Merge two character categories that have been identified as having equivalent behavior.
+     *  The ranges belonging to the right category (table column) will be added to the left.
+     */
+    void     mergeCategories(int32_t left, int32_t right);
+
 #ifdef RBBI_DEBUG
     void     printSets();
     void     printRanges();
index 6a7e8f5113c1f06d55daeb66611119e7e3c114ca..a8bc0486199dad26c43db0126c66eb376dc2a2d4 100644 (file)
@@ -22,6 +22,7 @@
 #include "rbbidata.h"
 #include "cstring.h"
 #include "uassert.h"
+#include "uvectr32.h"
 #include "cmemory.h"
 
 U_NAMESPACE_BEGIN
@@ -1077,6 +1078,49 @@ void RBBITableBuilder::printPosSets(RBBINode *n) {
 }
 #endif
 
+//
+//    findDuplCharClassFrom()
+//
+bool RBBITableBuilder::findDuplCharClassFrom(int32_t &baseCategory, int32_t &duplCategory) {
+    int32_t numStates = fDStates->size();
+    int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
+
+    U_ASSERT(baseCategory < duplCategory);
+
+    uint16_t table_base;
+    uint16_t table_dupl;
+    for (; baseCategory < numCols-1; ++baseCategory) {
+        for (; duplCategory < numCols; ++duplCategory) {
+             for (int32_t state=0; state<numStates; state++) {
+                 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
+                 table_base = (uint16_t)sd->fDtran->elementAti(baseCategory);
+                 table_dupl = (uint16_t)sd->fDtran->elementAti(duplCategory);
+                 if (table_base != table_dupl) {
+                     break;
+                 }
+             }
+             if (table_base == table_dupl) {
+                 return true;
+             }
+        }
+    }
+    return false;
+}
+
+
+//
+//    removeColumn()
+//
+//       Delete the transition entry for the given column (character class)
+//       from the fDtran vector of every state descriptor in fDStates.
+//
+void RBBITableBuilder::removeColumn(int32_t column) {
+    const int32_t stateCount = fDStates->size();
+    int32_t stateIndex = 0;
+    while (stateIndex < stateCount) {
+        RBBIStateDescriptor *descriptor =
+                static_cast<RBBIStateDescriptor *>(fDStates->elementAt(stateIndex));
+        U_ASSERT(column < descriptor->fDtran->size());
+        descriptor->fDtran->removeElementAt(column);
+        ++stateIndex;
+    }
+}
+
+
 
 
 //-----------------------------------------------------------------------------
@@ -1106,7 +1150,6 @@ int32_t  RBBITableBuilder::getTableSize() const {
 }
 
 
-
 //-----------------------------------------------------------------------------
 //
 //   exportTable()    export the state transition table in the format required
@@ -1256,7 +1299,7 @@ RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatu
     fPositions = NULL;
     fDtran     = NULL;
 
-    fDtran     = new UVector(lastInputSymbol+1, *fStatus);
+    fDtran     = new UVector32(lastInputSymbol+1, *fStatus);
     if (U_FAILURE(*fStatus)) {
         return;
     }
@@ -1264,7 +1307,7 @@ RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatu
         *fStatus = U_MEMORY_ALLOCATION_ERROR;
         return;
     }
-    fDtran->setSize(lastInputSymbol+1, *fStatus);    // fDtran needs to be pre-sized.
+    fDtran->setSize(lastInputSymbol+1);    // fDtran needs to be pre-sized.
                                            //   It is indexed by input symbols, and will
                                            //   hold  the next state number for each
                                            //   symbol.
index 104150187852096b632591c24fd0c97b31c77414..375ed6edd2701f23b57a51c42c32c8e6585d53b9 100644 (file)
@@ -24,6 +24,7 @@ U_NAMESPACE_BEGIN
 
 class RBBIRuleScanner;
 class RBBIRuleBuilder;
+class UVector32;
 
 //
 //  class RBBITableBuilder is part of the RBBI rule compiler.
@@ -42,9 +43,23 @@ public:
     void     build();
     int32_t  getTableSize() const;      // Return the runtime size in bytes of
                                         //     the built state table
-    void     exportTable(void *where);  // fill in the runtime state table.
-                                        //     Sufficient memory must exist at
-                                        //     the specified location.
+
+    /** Fill in the runtime state table. Sufficient memory must exist at the specified location.
+     */
+    void     exportTable(void *where);
+
+    /** Find duplicate (redundant) character classes, beginning at the specified
+     *  pair, within this state table. This is an iterator-like function, used to
+     *  identify char classes (state table columns) that can be eliminated.
+     *
+     *  @param baseClass  in/out: first column of the pair; on a true return,
+     *                    the column to keep.
+     *  @param duplClass  in/out: second column of the pair; on a true return,
+     *                    the column that duplicates baseClass.
+     *  @return true if a pair of identical columns was found.
+     */
+    bool     findDuplCharClassFrom(int32_t &baseClass, int32_t &duplClass);
+
+    /** Remove a column from the state table. Used when two character categories
+     *  have been found equivalent, and merged together, to eliminate the unneeded table column.
+     */
+    void     removeColumn(int32_t column);
+
+
 
 
 private:
@@ -60,6 +75,12 @@ private:
     void     flagTaggedStates();
     void     mergeRuleStatusVals();
 
+    /**
+     * Merge redundant state table columns, eliminating character classes with identical behavior.
+     * Done after the state tables are generated, just before converting to their run-time format.
+     */
+    int32_t  mergeColumns();
+
     void     addRuleRootNodes(UVector *dest, RBBINode *node);
 
     // Set functions for UVector.
@@ -112,7 +133,7 @@ public:
                                            //   with this state.  Unordered (it's a set).
                                            //   UVector contents are RBBINode *
 
-    UVector          *fDtran;              // Transitions out of this state.
+    UVector32        *fDtran;              // Transitions out of this state.
                                            //   indexed by input character
                                            //   contents is int index of dest state
                                            //   in RBBITableBuilder.fDStates
index 3e09ec913acc48ed6a423a780e690821ade3f76f..165fabc7b5795c68db25eeedf7979ae2d3d3b12f 100644 (file)
@@ -60,10 +60,13 @@ private:
     UText  fText;
 
     /**
-     * The rule data for this BreakIterator instance
+     * The rule data for this BreakIterator instance.
+     * Not for general use; Public only for testing purposes.
      * @internal
      */
+public:
     RBBIDataWrapper    *fData;
+private:
 
     /** 
      *  The iteration state - current position, rule status for the current position,
@@ -683,6 +686,13 @@ private:
      *   @internal
      */
      void dumpCache();
+
+    /**
+     * Debugging function only.
+     * @internal
+     */
+    void dumpTables();
+
 #endif  /* U_HIDE_INTERNAL_API */
 };
 
index 98a1c900ceb0b4f56d7b0f0882f6beb0f0d8c69e..2565ef4f61cfb6a8ea28c51b3811ee2cbf47a363 100644 (file)
@@ -17,6 +17,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <vector>
 
 #include "unicode/brkiter.h"
 #include "unicode/localpointer.h"
 #include "cstr.h"
 #include "intltest.h"
 #include "rbbitst.h"
+#include "rbbidata.h"
 #include "utypeinfo.h"  // for 'typeid' to work
 #include "uvector.h"
 #include "uvectr32.h"
 
+
 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
 #include "unicode/filteredbrk.h"
 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
@@ -106,6 +109,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
     TESTCASE_AUTO(TestEmoji);
     TESTCASE_AUTO(TestBug12519);
     TESTCASE_AUTO(TestBug12677);
+    TESTCASE_AUTO(TestTableRedundancies);
     TESTCASE_AUTO_END;
 }
 
@@ -4454,6 +4458,66 @@ void RBBITest::TestBug12677() {
     assertEquals(WHERE, UnicodeString(u"!!forward; $x = [ab#]; '#' '?'; "),  rtRules);
 }
 
+
+void RBBITest::TestTableRedundancies() {
+    UErrorCode status = U_ZERO_ERROR;
+    RuleBasedBreakIterator *bi =
+            (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
+    // bi->dumpTables();
+
+    RBBIDataWrapper *dw = bi->fData;
+    const RBBIStateTable *fwtbl = dw->fForwardTable;
+    int32_t numCharClasses = dw->fHeader->fCatCount;
+    printf("Char Classes: %d     states: %d\n", numCharClasses, fwtbl->fNumStates);
+
+    // Check for duplicate columns
+
+    std::vector<UnicodeString> columns;
+    for (int32_t column = 0; column < numCharClasses; column++) {
+        UnicodeString s;
+        for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
+            RBBIStateTableRow  *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
+            s.append(row->fNextState[column]);
+        }
+        columns.push_back(s);
+    }
+    for (int c1=0; c1<numCharClasses; c1++) {
+        for (int c2 = c1+1; c2 < numCharClasses; c2++) {
+            if (columns.at(c1) == columns.at(c2)) {
+                printf("Duplicate columns (%d, %d)\n", c1, c2);
+                break;
+            }
+        }
+    }
+
+    // Check for duplicate states
+    std::vector<UnicodeString> rows;
+    for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
+        UnicodeString s;
+        RBBIStateTableRow  *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
+        if (row->fAccepting < -1) {
+            printf("row %d accepting = %d\n", r, row->fAccepting);
+        }
+        s.append(row->fAccepting + 1);   // values of -1 are expected.
+        s.append(row->fLookAhead);
+        s.append(row->fTagIdx);
+        for (int32_t column = 0; column < numCharClasses; column++) {
+            s.append(row->fNextState[column]);
+        }
+        rows.push_back(s);
+    }
+    for (int r1=0; r1<(int32_t)fwtbl->fNumStates; r1++) {
+        for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
+            if (rows.at(r1) == rows.at(r2)) {
+                printf("Duplicate rows (%d, %d)\n", r1, r2);
+                break;
+            }
+        }
+    }
+    delete bi;
+}
+
+
 //
 //  TestDebug    -  A place-holder test for debugging purposes.
 //                  For putting in fragments of other tests that can be invoked
index 71febf1cebfe758c491d97c09cf42f15030e3402..0977c98a0fb696aa9f75a2bc5f214801ab8b205a 100644 (file)
@@ -75,6 +75,7 @@ public:
     void TestEmoji();
     void TestBug12519();
     void TestBug12677();
+    void TestTableRedundancies();
 
     void TestDebug();
     void TestProperties();