int32_t ruleStatusIndex = 0;
// TODO: check for position == length of text. Although may still need to back up to get rule status.
if (position > 20) {
- int32_t backupPos = fBI->handlePrevious(position);
+ int32_t backupPos = fBI->handleSafePrevious(position);
fBI->fPosition = backupPos;
- aBoundary = fBI->handleNext(); // Ignore dictionary, just finding a rule based boundary.
+ aBoundary = fBI->handleNext(); // Ignore dictionary, just finding a rule based boundary.
+ if (aBoundary == backupPos + 1) { // TODO: + 1 is wrong for supplementals.
+ // Safe rules work on pairs. +1 from start pos may be a false match.
+ aBoundary = fBI->handleNext();
+ }
ruleStatusIndex = fBI->fRuleStatusIndex;
}
- reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point.
+ reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point.
}
// Fill in boundaries between existing cache content and the new requested position.
if (backupPosition <= 0) {
backupPosition = 0;
} else {
- backupPosition = fBI->handlePrevious(backupPosition);
+ backupPosition = fBI->handleSafePrevious(backupPosition);
}
if (backupPosition == UBRK_DONE || backupPosition == 0) {
position = 0;
positionStatusIdx = 0;
} else {
fBI->fPosition = backupPosition; // TODO: pass starting position in a clearer way.
- position = fBI->handleNext();
+ position = fBI->handleNext(); // TODO: supplementals don't work with the +1.
+ if (position == backupPosition + 1) {
+ position = fBI->handleNext(); // Safe rules identify safe pairs.
+ };
positionStatusIdx = fBI->fRuleStatusIndex;
}
//
int32_t headerSize = align8(sizeof(RBBIDataHeader));
int32_t forwardTableSize = align8(fForwardTables->getTableSize());
- int32_t safeRevTableSize = align8(fSafeRevTables->getTableSize());
+ int32_t reverseTableSize = align8(fForwardTables->getSafeTableSize());
+ int32_t safeRevTableSize = align8(fSafeRevTables->getTableSize()); // TODO: remove hand-written rules.
int32_t trieSize = align8(fSetBuilder->getTrieSize());
int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t));
int32_t rulesSize = align8((fStrippedRules.length()+1) * sizeof(UChar));
int32_t totalSize = headerSize
- + forwardTableSize
+ + forwardTableSize
+ + reverseTableSize
+ safeRevTableSize
+ statusTableSize + trieSize + rulesSize;
data->fLength = totalSize;
data->fCatCount = fSetBuilder->getNumCharCategories();
- // Only save the forward table and the safe reverse table,
- // because these are the only ones used at run-time.
- //
- // For the moment, we still build the other tables if they are present in the rule source files,
- // for backwards compatibility. Old rule files need to work, and this is the simplest approach.
- //
- // Additional backwards compatibility consideration: if no safe rules are provided, consider the
- // reverse rules to actually be the safe reverse rules.
-
data->fFTable = headerSize;
data->fFTableLen = forwardTableSize;
- // Do not save Reverse Table.
- data->fRTable = data->fFTable + forwardTableSize;
- data->fRTableLen = 0;
+ data->fRTable = data->fFTable + data->fFTableLen;
+ data->fRTableLen = reverseTableSize;
// Do not save the Safe Forward table.
- data->fSFTable = data->fRTable + 0;
+ data->fSFTable = data->fRTable + data->fRTableLen;
data->fSFTableLen = 0;
- data->fSRTable = data->fSFTable + 0;
+ // Hand written reverse rules. TODO: remove, once synthesized ones are working.
+ data->fSRTable = data->fSFTable + data->fSFTableLen;
data->fSRTableLen = safeRevTableSize;
U_ASSERT(safeRevTableSize > 0);
uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
fForwardTables->exportTable((uint8_t *)data + data->fFTable);
+ fForwardTables->exportSafeTable((uint8_t *)data + data->fRTable);
fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
if (fForwardTables == nullptr || fSafeRevTables == nullptr)
{
status = U_MEMORY_ALLOCATION_ERROR;
- delete fForwardTables; fForwardTables = nullptr;
- delete fSafeRevTables; fSafeRevTables = nullptr;
return nullptr;
}
fForwardTables->build();
- fForwardTables->buildSafe(status);
fSafeRevTables->build();
+ optimizeTables();
+ fForwardTables->buildSafe(status);
+
+ if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
#ifdef RBBI_DEBUG
if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
+ fForwardTables
fForwardTables->printRuleStatusTable();
+ fForwardTables->printSafeTable();
}
#endif
- optimizeTables();
fSetBuilder->buildTrie();
//
}
// fDStates is UVector<RBBIStateDescriptor *>
fDStates = new UVector(status);
- // SafeTable is UVector<UnicodeString *>. Contents owned by the UVector.
- fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status);
- if (U_SUCCESS(status) && (fDStates == nullptr || fSafeTable == nullptr)) {
- status = U_MEMORY_ALLOCATION_ERROR;;
+ if (U_SUCCESS(status) && fDStates == nullptr ) {
+ status = U_MEMORY_ALLOCATION_ERROR;
}
}
// for all tables. Merge the ones from this table into the global set.
//
mergeRuleStatusVals();
-
- if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
}
if (wantedEndState == endState) {
int32_t pair = c1 << 16 | c2;
safePairs.addElement(pair, status);
- // printf("(%d, %d) ", c1, c2);
+ printf("(%d, %d) ", c1, c2);
}
}
- //printf("\n");
+ printf("\n");
}
// Populate the initial safe table.
// Row 1 is the start sate.
// Row 2 and beyond are other states, initially one per char class, but
// after initial construction, many of the states will be combined, compacting the table.)
+ // The String holds the nextState data only. The four leading fields of a row, fAccepting,
+ // fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
+
+ U_ASSERT(fSafeTable == nullptr);
fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status);
for (int32_t row=0; row<numCharClasses + 2; ++row) {
- fSafeTable->addElement(new UnicodeString(numCharClasses+4, 0, numCharClasses+4), status);
+ fSafeTable->addElement(new UnicodeString(numCharClasses, 0, numCharClasses+4), status);
}
// From the start state, each input char class transitions to the state for that input.
UnicodeString &startState = *(UnicodeString *)fSafeTable->elementAt(1);
for (int32_t charClass=0; charClass < numCharClasses; ++charClass) {
- // Note: +2 for the start & stop state; +4 for header columns in state table.
- startState.setCharAt(charClass+4, charClass+2);
+ // Note: +2 for the start & stop state.
+ startState.setCharAt(charClass, charClass+2);
}
// Initially make every other state table row look like the start state row,
for (int32_t row=2; row<numCharClasses+2; ++row) {
- UnicodeString &rowState = *(UnicodeString *)fSafeTable->elementAt(1);
+ UnicodeString &rowState = *(UnicodeString *)fSafeTable->elementAt(row);
rowState = startState; // UnicodeString assignment, copies contents.
}
int32_t c2 = pair & 0x0000ffff;
UnicodeString &rowState = *(UnicodeString *)fSafeTable->elementAt(c2 + 2);
- rowState.setCharAt(c1 + 4, 0);
+ rowState.setCharAt(c1, 0);
}
// Merge similar states.
}
+
+//-----------------------------------------------------------------------------
+//
+// getSafeTableSize() Calculate the size of the runtime form of this
+// safe state table.
+//
+//-----------------------------------------------------------------------------
+int32_t RBBITableBuilder::getSafeTableSize() const {
+ int32_t size = 0;
+ int32_t numRows;
+ int32_t numCols;
+ int32_t rowSize;
+
+ if (fSafeTable == nullptr) {
+ return 0;
+ }
+
+ size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table.
+
+ numRows = fSafeTable->size();
+ numCols = fRB->fSetBuilder->getNumCharCategories();
+
+ rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
+ size += numRows * rowSize;
+ return size;
+}
+
+
+//-----------------------------------------------------------------------------
+//
+// exportTable() export the state transition table in the format required
+// by the runtime engine. getTableSize() bytes of memory
+// must be available at the output address "where".
+//
+//-----------------------------------------------------------------------------
+void RBBITableBuilder::exportSafeTable(void *where) {
+ RBBIStateTable *table = (RBBIStateTable *)where;
+ uint32_t state;
+ int col;
+
+ if (U_FAILURE(*fStatus) || fSafeTable == nullptr) {
+ return;
+ }
+
+ int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
+ if (catCount > 0x7fff ||
+ fSafeTable->size() > 0x7fff) {
+ *fStatus = U_BRK_INTERNAL_ERROR;
+ return;
+ }
+
+ table->fRowLen = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
+ table->fNumStates = fSafeTable->size();
+ table->fFlags = 0;
+ table->fReserved = 0;
+
+ for (state=0; state<table->fNumStates; state++) {
+ UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(state);
+ RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
+ row->fAccepting = 0;
+ row->fLookAhead = 0;
+ row->fTagIdx = 0;
+ row->fReserved = 0;
+ for (col=0; col<catCount; col++) {
+ row->fNextState[col] = rowString->charAt(col);
+ }
+ }
+}
+
+
+
+
//-----------------------------------------------------------------------------
//
// printSet Debug function. Print the contents of a UVector
#endif
+//-----------------------------------------------------------------------------
+//
+// printSafeTable Debug Function. Dump the fully constructed safe table.
+//
+//-----------------------------------------------------------------------------
+#ifdef RBBI_DEBUG
+void RBBITableBuilder::printSafeTable() {
+ int c; // input "character"
+ int n; // state number
+
+ RBBIDebugPrintf(" Safe Reverse Table \n");
+ if (fSafeTable == nullptr) {
+ RBBIDebugPrintf(" --- nullptr ---\n");
+ return;
+ }
+ RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
+ RBBIDebugPrintf(" | Acc LA Tag");
+ for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
+ RBBIDebugPrintf(" %2d", c);
+ }
+ RBBIDebugPrintf("\n");
+ RBBIDebugPrintf(" |---------------");
+ for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
+ RBBIDebugPrintf("---");
+ }
+ RBBIDebugPrintf("\n");
+
+ for (n=0; n<fSafeTable->size(); n++) {
+ UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(n);
+ RBBIDebugPrintf(" %3d | " , n);
+ RBBIDebugPrintf("%3d %3d %5d ", 0, 0, 0); // Accepting, LookAhead, Tags
+ for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
+ RBBIDebugPrintf(" %2d", rowString->charAt(c));
+ }
+ RBBIDebugPrintf("\n");
+ }
+ RBBIDebugPrintf("\n\n");
+}
+#endif
+
+
//-----------------------------------------------------------------------------
//
// for tracing without a lot of unwanted extra stuff happening.
//
void RBBITest::TestDebug(void) {
+ UErrorCode status = U_ZERO_ERROR;
+ LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
+ BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
+ const UnicodeString &rules = bi->getRules();
+ UParseError pe;
+ LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
+ assertSuccess(WHERE, status);
+
+#if 0
+ bi->dumpTables();
+
+ RBBIDataWrapper *dw = bi->fData;
+ const RBBIStateTable *fwtbl = dw->fForwardTable;
+ int32_t numCharClasses = dw->fHeader->fCatCount;
+ printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates);
+
+ for (int32_t c1=0; c1<numCharClasses; ++c1) {
+ for (int32_t c2=0; c2 < numCharClasses; ++c2) {
+ int32_t wantedEndState = -1;
+ int32_t endState = 0;
+ for (int32_t startState = 1; startState < (int32_t)fwtbl->fNumStates; ++startState) {
+ RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * startState));
+ int32_t s2 = row->fNextState[c1];
+ row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * s2));
+ endState = row->fNextState[c2];
+ if (wantedEndState < 0) {
+ wantedEndState = endState;
+ } else {
+ if (wantedEndState != endState) {
+ break;
+ }
+ }
+ }
+ if (wantedEndState == endState) {
+ printf("(%d, %d) ", c1, c2);
+ }
+ }
+ printf("\n");
+ }
+ printf("\n");
+#endif
}
void RBBITest::TestProperties() {