fBreakCache->dumpCache();
}
+/**
+ *  Debugging aid: print this iterator's rule data by forwarding
+ *  to printData() on the wrapped data object.
+ */
+void RuleBasedBreakIterator::dumpTables() {
+    RBBIDataWrapper *data = fData;
+    data->printData();
+}
+
/**
* Returns the description used to create this iterator
*/
#endif
-#ifdef RBBI_DEBUG
void RBBIDataWrapper::printData() {
+#ifdef RBBI_DEBUG
RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader);
RBBIDebugPrintf(" Version = {%d %d %d %d}\n", fHeader->fFormatVersion[0], fHeader->fFormatVersion[1],
fHeader->fFormatVersion[2], fHeader->fFormatVersion[3]);
RBBIDebugPrintf("%c", fRuleSource[c]);
}
RBBIDebugPrintf("\n\n");
-}
#endif
+}
U_NAMESPACE_END
UBool operator ==(const RBBIDataWrapper &other) const;
int32_t hashCode();
const UnicodeString &getRuleSourceString() const;
-#ifdef RBBI_DEBUG
void printData();
void printTable(const char *heading, const RBBIStateTable *table);
-#else
- #define printData()
- #define printTable(heading, table)
-#endif
/* */
/* Pointers to items within the data */
//
// UnicodeSet processing.
// Munge the Unicode Sets to create a set of character categories.
- // Generate the mapping tables (TRIE) from input 32-bit characters to
+ // Generate the mapping tables (TRIE) from input code points to
// the character categories.
//
- builder.fSetBuilder->build();
+ builder.fSetBuilder->buildRanges();
//
}
#endif
+ builder.optimizeTables();
+ builder.fSetBuilder->buildTrie();
+
+
+
//
// Package up the compiled data into a memory image
// in the run-time format.
return This;
}
+//
+//  optimizeTables()   Repeatedly ask the forward state table for a pair of
+//                     duplicate character-class columns. For each pair found,
+//                     fold the right class into the left one in the set builder
+//                     and drop the right column from the table.
+//                     The search is seeded with the pair (1, 2).
+//
+void RBBIRuleBuilder::optimizeTables() {
+    int32_t leftClass = 1;
+    int32_t rightClass = 2;
+
+    for (;;) {
+        // The finder updates both indexes in place; they are carried over
+        // unchanged into the next search after each merge.
+        if (!fForwardTables->findDuplCharClassFrom(leftClass, rightClass)) {
+            break;
+        }
+        fSetBuilder->mergeCategories(leftClass, rightClass);
+        fForwardTables->removeColumn(rightClass);
+    }
+}
+
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
);
virtual ~RBBIRuleBuilder();
+
+ /**
+ * Fold together redundant character classes (table columns) and
+ * redundant states (table rows). Done after initial table generation,
+ * before serializing the result.
+ */
+ void optimizeTables();
+
char *fDebugEnv; // controls debug trace output
UErrorCode *fStatus; // Error reporting. Keeping status
UParseError *fParseError; // here avoids passing it everywhere.
// from the Unicode Sets.
//
//------------------------------------------------------------------------
-void RBBISetBuilder::build() {
+void RBBISetBuilder::buildRanges() {
RBBINode *usetNode;
RangeDescriptor *rlRange;
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();}
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();}
+}
+
+
+//
+// Build the Trie table for mapping UChar32 values to the corresponding
+// range group number.
+//
+void RBBISetBuilder::buildTrie() {
+ RangeDescriptor *rlRange;
- //
- // Build the Trie table for mapping UChar32 values to the corresponding
- // range group number
- //
fTrie = utrie2_open(0, // Initial value for all code points.
0, // Error value for out-of-range input.
fStatus);
}
+//
+//  mergeCategories()   Renumber the range list after category `right` has been
+//                      found equivalent to category `left`:
+//                        - ranges numbered `right` are renumbered to `left`;
+//                        - ranges numbered above `right` move down by one;
+//                        - all other ranges are left untouched.
+//                      The group count is reduced by one.
+//
+void RBBISetBuilder::mergeCategories(int32_t left, int32_t right) {
+    U_ASSERT(left >= 1);
+    U_ASSERT(right > left);
+
+    RangeDescriptor *range = fRangeList;
+    while (range != nullptr) {
+        if (range->fNum > right) {
+            // Close the gap left by the removed category.
+            range->fNum--;
+        } else if (range->fNum == right) {
+            range->fNum = left;
+        }
+        range = range->fNext;
+    }
+    --fGroupCount;
+}
+
+
//-----------------------------------------------------------------------------------
//
// getTrieSize() Return the size that will be required to serialize the Trie.
RBBISetBuilder(RBBIRuleBuilder *rb);
~RBBISetBuilder();
- void build();
+ void buildRanges();
+ void buildTrie();
void addValToSets(UVector *sets, uint32_t val);
void addValToSet (RBBINode *usetNode, uint32_t val);
int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the
UChar32 getFirstChar(int32_t val) const;
UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo
// character were encountered.
+ /** merge two character categories that have been identified as having equivalent behavior.
+ * The ranges belonging to the right category (table column) will be added to the left.
+ */
+ void mergeCategories(int32_t left, int32_t right);
+
#ifdef RBBI_DEBUG
void printSets();
void printRanges();
#include "rbbidata.h"
#include "cstring.h"
#include "uassert.h"
+#include "uvectr32.h"
#include "cmemory.h"
U_NAMESPACE_BEGIN
}
#endif
+//
+//    findDuplCharClassFrom()
+//
+//        Search for two character-category columns whose next-state entries are
+//        equal in every state of fDStates.
+//
+//        The search starts AT the pair (baseCategory, duplCategory) and walks
+//        forward: duplCategory runs up to the last column, then baseCategory
+//        advances by one and duplCategory restarts just after it.
+//
+//        baseCategory, duplCategory   in/out. On a true return they identify the
+//                                     duplicate pair. The caller may pass the same
+//                                     values back in to resume the search.
+//        returns                      true if a duplicate pair was found,
+//                                     false once all pairs have been examined.
+//
+bool RBBITableBuilder::findDuplCharClassFrom(int32_t &baseCategory, int32_t &duplCategory) {
+    int32_t numStates = fDStates->size();
+    int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
+
+    U_ASSERT(baseCategory < duplCategory);
+
+    while (baseCategory < numCols-1) {
+        for (; duplCategory < numCols; ++duplCategory) {
+            // With no states there is nothing to compare, so never report a match.
+            bool identical = (numStates > 0);
+            for (int32_t state=0; identical && state<numStates; state++) {
+                RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
+                uint16_t table_base = (uint16_t)sd->fDtran->elementAti(baseCategory);
+                uint16_t table_dupl = (uint16_t)sd->fDtran->elementAti(duplCategory);
+                identical = (table_base == table_dupl);
+            }
+            if (identical) {
+                return true;
+            }
+        }
+        // No duplicate of this base column remains; move to the next base column
+        // and restart the candidate column immediately after it. Without this
+        // reset the inner loop would never run again for later base columns.
+        ++baseCategory;
+        duplCategory = baseCategory + 1;
+    }
+    return false;
+}
+
+
+//
+//    removeColumn()   Delete the transition entry at index `column` from the
+//                     fDtran vector of every state descriptor in fDStates.
+//
+void RBBITableBuilder::removeColumn(int32_t column) {
+    const int32_t stateCount = fDStates->size();
+    for (int32_t ix = 0; ix != stateCount; ++ix) {
+        RBBIStateDescriptor *descriptor = (RBBIStateDescriptor *)fDStates->elementAt(ix);
+        // The column must exist in this state's transition vector.
+        U_ASSERT(column < descriptor->fDtran->size());
+        descriptor->fDtran->removeElementAt(column);
+    }
+}
+
+
//-----------------------------------------------------------------------------
}
-
//-----------------------------------------------------------------------------
//
// exportTable() export the state transition table in the format required
fPositions = NULL;
fDtran = NULL;
- fDtran = new UVector(lastInputSymbol+1, *fStatus);
+ fDtran = new UVector32(lastInputSymbol+1, *fStatus);
if (U_FAILURE(*fStatus)) {
return;
}
*fStatus = U_MEMORY_ALLOCATION_ERROR;
return;
}
- fDtran->setSize(lastInputSymbol+1, *fStatus); // fDtran needs to be pre-sized.
+ fDtran->setSize(lastInputSymbol+1); // fDtran needs to be pre-sized.
// It is indexed by input symbols, and will
// hold the next state number for each
// symbol.
class RBBIRuleScanner;
class RBBIRuleBuilder;
+class UVector32;
//
// class RBBITableBuilder is part of the RBBI rule compiler.
void build();
int32_t getTableSize() const; // Return the runtime size in bytes of
// the built state table
- void exportTable(void *where); // fill in the runtime state table.
- // Sufficient memory must exist at
- // the specified location.
+
+ /** Fill in the runtime state table. Sufficient memory must exist at the specified location.
+ */
+ void exportTable(void *where);
+
+    /** Find duplicate (redundant) character classes, beginning with the specified
+     *  pair, within this state table. This is an iterator-like function, used to
+     *  identify char classes (state table columns) that can be eliminated.
+     *  Both parameters are in/out: on a true return they hold the duplicate pair.
+     */
+    bool findDuplCharClassFrom(int32_t &baseClass, int32_t &duplClass);
+
+    /** Remove a column from the state table. Used when two character categories
+     *  have been found equivalent, and merged together, to eliminate the unneeded table column.
+     */
+ void removeColumn(int32_t column);
+
+
private:
void flagTaggedStates();
void mergeRuleStatusVals();
+ /**
+ * Merge redundant state table columns, eliminating character classes with identical behavior.
+ * Done after the state tables are generated, just before converting to their run-time format.
+ */
+ int32_t mergeColumns();
+
void addRuleRootNodes(UVector *dest, RBBINode *node);
// Set functions for UVector.
// with this state. Unordered (it's a set).
// UVector contents are RBBINode *
- UVector *fDtran; // Transitions out of this state.
+ UVector32 *fDtran; // Transitions out of this state.
// indexed by input character
// contents is int index of dest state
// in RBBITableBuilder.fDStates
UText fText;
/**
- * The rule data for this BreakIterator instance
+ * The rule data for this BreakIterator instance.
+ * Not for general use; Public only for testing purposes.
* @internal
*/
+public:
RBBIDataWrapper *fData;
+private:
/**
* The iteration state - current position, rule status for the current position,
* @internal
*/
void dumpCache();
+
+ /**
+ * Debugging function only.
+ * @internal
+ */
+ void dumpTables();
+
#endif /* U_HIDE_INTERNAL_API */
};
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <vector>
#include "unicode/brkiter.h"
#include "unicode/localpointer.h"
#include "cstr.h"
#include "intltest.h"
#include "rbbitst.h"
+#include "rbbidata.h"
#include "utypeinfo.h" // for 'typeid' to work
#include "uvector.h"
#include "uvectr32.h"
+
#if !UCONFIG_NO_FILTERED_BREAK_ITERATION
#include "unicode/filteredbrk.h"
#endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
TESTCASE_AUTO(TestEmoji);
TESTCASE_AUTO(TestBug12519);
TESTCASE_AUTO(TestBug12677);
+ TESTCASE_AUTO(TestTableRedundancies);
TESTCASE_AUTO_END;
}
assertEquals(WHERE, UnicodeString(u"!!forward; $x = [ab#]; '#' '?'; "), rtRules);
}
+
+//
+//  TestTableRedundancies   Inspect the forward state table of the English line
+//                          break iterator and print any character-class columns
+//                          or state rows that are exact duplicates of each other.
+//                          Duplicates are only printed; they are not reported
+//                          as test failures.
+//
+void RBBITest::TestTableRedundancies() {
+    UErrorCode status = U_ZERO_ERROR;
+
+    // Held in a LocalPointer so the iterator is released on every exit path.
+    LocalPointer<RuleBasedBreakIterator> bi(
+        (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status));
+    // If creation failed there is no iterator to inspect; report it and stop
+    // instead of dereferencing a null pointer below.
+    assertSuccess(WHERE, status);
+    if (U_FAILURE(status) || bi.isNull()) {
+        return;
+    }
+    // bi->dumpTables();
+
+    RBBIDataWrapper *dw = bi->fData;
+    const RBBIStateTable *fwtbl = dw->fForwardTable;
+    int32_t numCharClasses = dw->fHeader->fCatCount;
+    const int32_t numStates = (int32_t)fwtbl->fNumStates;
+    printf("Char Classes: %d states: %d\n", numCharClasses, numStates);
+
+    // Check for duplicate columns.
+    // Each column is flattened into a string of its next-state values so that
+    // two columns can be compared with a single string comparison.
+    // NOTE(review): rows are scanned from 1, not 0; the reason is not evident here.
+
+    std::vector<UnicodeString> columns;
+    for (int32_t column = 0; column < numCharClasses; column++) {
+        UnicodeString s;
+        for (int32_t r = 1; r < numStates; r++) {
+            RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
+            s.append(row->fNextState[column]);
+        }
+        columns.push_back(s);
+    }
+    for (int c1=0; c1<numCharClasses; c1++) {
+        for (int c2 = c1+1; c2 < numCharClasses; c2++) {
+            if (columns.at(c1) == columns.at(c2)) {
+                printf("Duplicate columns (%d, %d)\n", c1, c2);
+                break;      // report only the first duplicate of c1
+            }
+        }
+    }
+
+    // Check for duplicate states.
+    // Each row is flattened into a string of its accepting, look-ahead and tag
+    // fields followed by all of its next-state values.
+    std::vector<UnicodeString> rows;
+    for (int32_t r=0; r < numStates; r++) {
+        UnicodeString s;
+        RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
+        if (row->fAccepting < -1) {
+            printf("row %d accepting = %d\n", r, row->fAccepting);
+        }
+        s.append(row->fAccepting + 1);   // values of -1 are expected.
+        s.append(row->fLookAhead);
+        s.append(row->fTagIdx);
+        for (int32_t column = 0; column < numCharClasses; column++) {
+            s.append(row->fNextState[column]);
+        }
+        rows.push_back(s);
+    }
+    for (int r1=0; r1 < numStates; r1++) {
+        for (int r2 = r1+1; r2 < numStates; r2++) {
+            if (rows.at(r1) == rows.at(r2)) {
+                printf("Duplicate rows (%d, %d)\n", r1, r2);
+                break;      // report only the first duplicate of r1
+            }
+        }
+    }
+}
+
+
//
// TestDebug - A place-holder test for debugging purposes.
// For putting in fragments of other tests that can be invoked
void TestEmoji();
void TestBug12519();
void TestBug12677();
+ void TestTableRedundancies();
void TestDebug();
void TestProperties();