From 3d4a3fbaa8aaf5bcf6088428cb9222b32c596c8f Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Thu, 8 Feb 2018 01:42:04 +0000 Subject: [PATCH] ICU-13569 rbbi state table opt, work in progress. X-SVN-Rev: 40855 --- icu4c/source/common/rbbi.cpp | 4 ++ icu4c/source/common/rbbidata.cpp | 4 +- icu4c/source/common/rbbidata.h | 5 -- icu4c/source/common/rbbirb.cpp | 23 ++++++++- icu4c/source/common/rbbirb.h | 8 ++++ icu4c/source/common/rbbisetb.cpp | 29 ++++++++++-- icu4c/source/common/rbbisetb.h | 8 +++- icu4c/source/common/rbbitblb.cpp | 49 ++++++++++++++++++-- icu4c/source/common/rbbitblb.h | 29 ++++++++++-- icu4c/source/common/unicode/rbbi.h | 12 ++++- icu4c/source/test/intltest/rbbitst.cpp | 64 ++++++++++++++++++++++++++ icu4c/source/test/intltest/rbbitst.h | 1 + 12 files changed, 213 insertions(+), 23 deletions(-) diff --git a/icu4c/source/common/rbbi.cpp b/icu4c/source/common/rbbi.cpp index 27e0fe7e5d9..c9f26ac9ce0 100644 --- a/icu4c/source/common/rbbi.cpp +++ b/icu4c/source/common/rbbi.cpp @@ -1338,6 +1338,10 @@ void RuleBasedBreakIterator::dumpCache() { fBreakCache->dumpCache(); } +void RuleBasedBreakIterator::dumpTables() { + fData->printData(); +} + /** * Returns the description used to create this iterator */ diff --git a/icu4c/source/common/rbbidata.cpp b/icu4c/source/common/rbbidata.cpp index 33405708c06..5b00e950622 100644 --- a/icu4c/source/common/rbbidata.cpp +++ b/icu4c/source/common/rbbidata.cpp @@ -267,8 +267,8 @@ void RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *tab #endif -#ifdef RBBI_DEBUG void RBBIDataWrapper::printData() { +#ifdef RBBI_DEBUG RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader); RBBIDebugPrintf(" Version = {%d %d %d %d}\n", fHeader->fFormatVersion[0], fHeader->fFormatVersion[1], fHeader->fFormatVersion[2], fHeader->fFormatVersion[3]); @@ -285,8 +285,8 @@ void RBBIDataWrapper::printData() { RBBIDebugPrintf("%c", fRuleSource[c]); } RBBIDebugPrintf("\n\n"); -} #endif +} U_NAMESPACE_END diff --git a/icu4c/source/common/rbbidata.h b/icu4c/source/common/rbbidata.h index 0e7beb77266..1244a118dc5 100644 --- a/icu4c/source/common/rbbidata.h +++ b/icu4c/source/common/rbbidata.h @@ -165,13 +165,8 @@ public: UBool operator ==(const RBBIDataWrapper &other) const; int32_t hashCode(); const UnicodeString &getRuleSourceString() const; -#ifdef RBBI_DEBUG void printData(); void printTable(const char *heading, const RBBIStateTable *table); -#else - #define printData() - #define printTable(heading, table) -#endif /* */ /* Pointers to items within the data */ diff --git a/icu4c/source/common/rbbirb.cpp b/icu4c/source/common/rbbirb.cpp index 84dfd4b58cf..817a955a969 100644 --- a/icu4c/source/common/rbbirb.cpp +++ b/icu4c/source/common/rbbirb.cpp @@ -282,10 +282,10 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules, // // UnicodeSet processing. // Munge the Unicode Sets to create a set of character categories. - // Generate the mapping tables (TRIE) from input 32-bit characters to + // Generate the mapping tables (TRIE) from input code points to // the character categories. // - builder.fSetBuilder->build(); + builder.fSetBuilder->buildRanges(); // @@ -317,6 +317,11 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules, } #endif + builder.optimizeTables(); + builder.fSetBuilder->buildTrie(); + + + // // Package up the compiled data into a memory image // in the run-time format. @@ -348,6 +353,20 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules, return This; } +void RBBIRuleBuilder::optimizeTables() { + int32_t leftClass; + int32_t rightClass; + + leftClass = 1; + rightClass = 2; + while (fForwardTables->findDuplCharClassFrom(leftClass, rightClass)) { + fSetBuilder->mergeCategories(leftClass, rightClass); + fForwardTables->removeColumn(rightClass); + } + + +} + U_NAMESPACE_END #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ diff --git a/icu4c/source/common/rbbirb.h b/icu4c/source/common/rbbirb.h index 0c307577e36..f890cf686e3 100644 --- a/icu4c/source/common/rbbirb.h +++ b/icu4c/source/common/rbbirb.h @@ -126,6 +126,14 @@ public: ); virtual ~RBBIRuleBuilder(); + + /** + * Fold together redundant character classes (table columns) and + * redundant states (table rows). Done after initial table generation, + * before serializing the result. + */ + void optimizeTables(); + char *fDebugEnv; // controls debug trace output UErrorCode *fStatus; // Error reporting. Keeping status UParseError *fParseError; // here avoids passing it everywhere. diff --git a/icu4c/source/common/rbbisetb.cpp b/icu4c/source/common/rbbisetb.cpp index e97eba8d14d..67bb460acaa 100644 --- a/icu4c/source/common/rbbisetb.cpp +++ b/icu4c/source/common/rbbisetb.cpp @@ -91,7 +91,7 @@ RBBISetBuilder::~RBBISetBuilder() // from the Unicode Sets. // //------------------------------------------------------------------------ -void RBBISetBuilder::build() { +void RBBISetBuilder::buildRanges() { RBBINode *usetNode; RangeDescriptor *rlRange; @@ -245,11 +245,16 @@ void RBBISetBuilder::build() { if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();} if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();} +} + + +// +// Build the Trie table for mapping UChar32 values to the corresponding +// range group number. +// +void RBBISetBuilder::buildTrie() { + RangeDescriptor *rlRange; - // - // Build the Trie table for mapping UChar32 values to the corresponding - // range group number - // fTrie = utrie2_open(0, // Initial value for all code points. 0, // Error value for out-of-range input. fStatus); @@ -265,6 +270,20 @@ void RBBISetBuilder::build() { } +void RBBISetBuilder::mergeCategories(int32_t left, int32_t right) { + U_ASSERT(left >= 1); + U_ASSERT(right > left); + for (RangeDescriptor *rd = fRangeList; rd != nullptr; rd = rd->fNext) { + if (rd->fNum == right) { + rd->fNum = left; + } else if (rd->fNum > right) { + rd->fNum--; + } + } + --fGroupCount; +} + + //----------------------------------------------------------------------------------- // // getTrieSize() Return the size that will be required to serialize the Trie. diff --git a/icu4c/source/common/rbbisetb.h b/icu4c/source/common/rbbisetb.h index 7cedb45b335..3f0ec1a8a0c 100644 --- a/icu4c/source/common/rbbisetb.h +++ b/icu4c/source/common/rbbisetb.h @@ -82,7 +82,8 @@ public: RBBISetBuilder(RBBIRuleBuilder *rb); ~RBBISetBuilder(); - void build(); + void buildRanges(); + void buildTrie(); void addValToSets(UVector *sets, uint32_t val); void addValToSet (RBBINode *usetNode, uint32_t val); int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the @@ -93,6 +94,11 @@ public: UChar32 getFirstChar(int32_t val) const; UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo // character were encountered. + /** merge two character categories that have been identified as having equivalent behavior. + * The ranges belonging to the right category (table column) will be added to the left. + */ + void mergeCategories(int32_t left, int32_t right); + #ifdef RBBI_DEBUG void printSets(); void printRanges(); diff --git a/icu4c/source/common/rbbitblb.cpp b/icu4c/source/common/rbbitblb.cpp index 6a7e8f5113c..a8bc0486199 100644 --- a/icu4c/source/common/rbbitblb.cpp +++ b/icu4c/source/common/rbbitblb.cpp @@ -22,6 +22,7 @@ #include "rbbidata.h" #include "cstring.h" #include "uassert.h" +#include "uvectr32.h" #include "cmemory.h" U_NAMESPACE_BEGIN @@ -1077,6 +1078,49 @@ void RBBITableBuilder::printPosSets(RBBINode *n) { } #endif +// +// findDuplCharClassFrom() +// +bool RBBITableBuilder::findDuplCharClassFrom(int32_t &baseCategory, int32_t &duplCategory) { + int32_t numStates = fDStates->size(); + int32_t numCols = fRB->fSetBuilder->getNumCharCategories(); + + U_ASSERT(baseCategory < duplCategory); + + uint16_t table_base; + uint16_t table_dupl; + for (; baseCategory < numCols-1; ++baseCategory) { + for (; duplCategory < numCols; ++duplCategory) { + for (int32_t state=0; stateelementAt(state); + table_base = (uint16_t)sd->fDtran->elementAti(baseCategory); + table_dupl = (uint16_t)sd->fDtran->elementAti(duplCategory); + if (table_base != table_dupl) { + break; + } + } + if (table_base == table_dupl) { + return true; + } + } + } + return false; +} + + +// +// removeColumn() +// +void RBBITableBuilder::removeColumn(int32_t column) { + int32_t numStates = fDStates->size(); + for (int32_t state=0; stateelementAt(state); + U_ASSERT(column < sd->fDtran->size()); + sd->fDtran->removeElementAt(column); + } +} + + //----------------------------------------------------------------------------- @@ -1106,7 +1150,6 @@ int32_t RBBITableBuilder::getTableSize() const { } - //----------------------------------------------------------------------------- // // exportTable() export the state transition table in the format required @@ -1256,7 +1299,7 @@ RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatu fPositions = NULL; fDtran = NULL; - fDtran = new UVector(lastInputSymbol+1, *fStatus); + fDtran = new UVector32(lastInputSymbol+1, *fStatus); if (U_FAILURE(*fStatus)) { return; } @@ -1264,7 +1307,7 @@ RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatu *fStatus = U_MEMORY_ALLOCATION_ERROR; return; } - fDtran->setSize(lastInputSymbol+1, *fStatus); // fDtran needs to be pre-sized. + fDtran->setSize(lastInputSymbol+1); // fDtran needs to be pre-sized. // It is indexed by input symbols, and will // hold the next state number for each // symbol. diff --git a/icu4c/source/common/rbbitblb.h b/icu4c/source/common/rbbitblb.h index 10415018785..375ed6edd27 100644 --- a/icu4c/source/common/rbbitblb.h +++ b/icu4c/source/common/rbbitblb.h @@ -24,6 +24,7 @@ U_NAMESPACE_BEGIN class RBBIRuleScanner; class RBBIRuleBuilder; +class UVector32; // // class RBBITableBuilder is part of the RBBI rule compiler. @@ -42,9 +43,23 @@ public: void build(); int32_t getTableSize() const; // Return the runtime size in bytes of // the built state table - void exportTable(void *where); // fill in the runtime state table. - // Sufficient memory must exist at - // the specified location. + + /** Fill in the runtime state table. Sufficient memory must exist at the specified location. + */ + void exportTable(void *where); + + /** Find duplicate (redundant) character classes, beginning after the specifed + * pair, within this state table. This is an iterator-like function, used to + * identify char classes (state table columns) that can be eliminated. + */ + bool findDuplCharClassFrom(int &baseClass, int &duplClass); + + /** Remove a column from the state table. Used when two character categories + * have been found equivalent, and merged together, to eliminate the uneeded table column. + */ + void removeColumn(int32_t column); + + private: @@ -60,6 +75,12 @@ private: void flagTaggedStates(); void mergeRuleStatusVals(); + /** + * Merge redundant state table columns, eliminating character classes with identical behavior. + * Done after the state tables are generated, just before converting to their run-time format. + */ + int32_t mergeColumns(); + void addRuleRootNodes(UVector *dest, RBBINode *node); // Set functions for UVector. @@ -112,7 +133,7 @@ public: // with this state. Unordered (it's a set). // UVector contents are RBBINode * - UVector *fDtran; // Transitions out of this state. + UVector32 *fDtran; // Transitions out of this state. // indexed by input character // contents is int index of dest state // in RBBITableBuilder.fDStates diff --git a/icu4c/source/common/unicode/rbbi.h b/icu4c/source/common/unicode/rbbi.h index 3e09ec913ac..165fabc7b57 100644 --- a/icu4c/source/common/unicode/rbbi.h +++ b/icu4c/source/common/unicode/rbbi.h @@ -60,10 +60,13 @@ private: UText fText; /** - * The rule data for this BreakIterator instance + * The rule data for this BreakIterator instance. + * Not for general use; Public only for testing purposes. * @internal */ +public: RBBIDataWrapper *fData; +private: /** * The iteration state - current position, rule status for the current position, @@ -683,6 +686,13 @@ private: * @internal */ void dumpCache(); + + /** + * Debugging function only. + * @internal + */ + void dumpTables(); + #endif /* U_HIDE_INTERNAL_API */ }; diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 98a1c900ceb..2565ef4f61c 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include "unicode/brkiter.h" #include "unicode/localpointer.h" @@ -39,10 +40,12 @@ #include "cstr.h" #include "intltest.h" #include "rbbitst.h" +#include "rbbidata.h" #include "utypeinfo.h" // for 'typeid' to work #include "uvector.h" #include "uvectr32.h" + #if !UCONFIG_NO_FILTERED_BREAK_ITERATION #include "unicode/filteredbrk.h" #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION @@ -106,6 +109,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha TESTCASE_AUTO(TestEmoji); TESTCASE_AUTO(TestBug12519); TESTCASE_AUTO(TestBug12677); + TESTCASE_AUTO(TestTableRedundancies); TESTCASE_AUTO_END; } @@ -4454,6 +4458,66 @@ void RBBITest::TestBug12677() { assertEquals(WHERE, UnicodeString(u"!!forward; $x = [ab#]; '#' '?'; "), rtRules); } + +void RBBITest::TestTableRedundancies() { + UErrorCode status = U_ZERO_ERROR; + RuleBasedBreakIterator *bi = + (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status); + // bi->dumpTables(); + + RBBIDataWrapper *dw = bi->fData; + const RBBIStateTable *fwtbl = dw->fForwardTable; + int32_t numCharClasses = dw->fHeader->fCatCount; + printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates); + + // Check for duplicate columns + + std::vector columns; + for (int32_t column = 0; column < numCharClasses; column++) { + UnicodeString s; + for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) { + RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r)); + s.append(row->fNextState[column]); + } + columns.push_back(s); + } + for (int c1=0; c1 rows; + for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) { + UnicodeString s; + RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r)); + if (row->fAccepting < -1) { + printf("row %d accepting = %d\n", r, row->fAccepting); + } + s.append(row->fAccepting + 1); // values of -1 are expected. + s.append(row->fLookAhead); + s.append(row->fTagIdx); + for (int32_t column = 0; column < numCharClasses; column++) { + s.append(row->fNextState[column]); + } + rows.push_back(s); + } + for (int r1=0; r1<(int32_t)fwtbl->fNumStates; r1++) { + for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) { + if (rows.at(r1) == rows.at(r2)) { + printf("Duplicate rows (%d, %d)\n", r1, r2); + break; + } + } + } + delete bi; +} + + // // TestDebug - A place-holder test for debugging purposes. // For putting in fragments of other tests that can be invoked diff --git a/icu4c/source/test/intltest/rbbitst.h b/icu4c/source/test/intltest/rbbitst.h index 71febf1cebf..0977c98a0fb 100644 --- a/icu4c/source/test/intltest/rbbitst.h +++ b/icu4c/source/test/intltest/rbbitst.h @@ -75,6 +75,7 @@ public: void TestEmoji(); void TestBug12519(); void TestBug12677(); + void TestTableRedundancies(); void TestDebug(); void TestProperties(); -- 2.40.0