}
+//-----------------------------------------------------------------------------------
+//
+// handleSafePrevious()
+//
+// Iterate backwards using the safe reverse rules.
+// The logic of this function is similar to handleNext(), but simpler
+// because the safe table does not require as many options.
+//
+//-----------------------------------------------------------------------------------
+int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
+    // Walk backwards from fromPosition, feeding the category of each code point
+    // to the safe reverse state table, and return the text index at which the
+    // iteration stopped: either the machine reached STOP_STATE, or the start of
+    // the text was hit. Returns BreakIterator::DONE when there is no rule data
+    // or when the starting position is already index 0.
+    int32_t            state;
+    uint16_t           category = 0;
+    RBBIStateTableRow  *row;
+    UChar32            c;
+    int32_t            result = 0;
+
+    // fData must be validated before it is dereferenced to fetch the safe table.
+    if (fData == NULL) {
+        return BreakIterator::DONE;
+    }
+
+    const RBBIStateTable *stateTable = fData->fSafeRevTable;
+    UTEXT_SETNATIVEINDEX(&fText, fromPosition);
+    #ifdef RBBI_DEBUG
+        if (gTrace) {
+            RBBIDebugPuts("Handle Previous pos char state category");
+        }
+    #endif
+
+    // if we're already at the start of the text, return DONE.
+    if (UTEXT_GETNATIVEINDEX(&fText)==0) {
+        return BreakIterator::DONE;
+    }
+
+    // Set the initial state for the state machine
+    c = UTEXT_PREVIOUS32(&fText);
+    state = START_STATE;
+    row = (RBBIStateTableRow *)
+            (stateTable->fTableData + (stateTable->fRowLen * state));
+
+    // loop until we reach the start of the text or transition to state 0
+    //
+    for (; c != U_SENTINEL; c = UTEXT_PREVIOUS32(&fText)) {
+
+        // look up the current character's character category, which tells us
+        // which column in the state table to look at.
+        // Note: the 16 in UTRIE2_GET16 refers to the size of the data being returned,
+        //       not the size of the character going in, which is a UChar32.
+        //
+        // Mask off the dictionary flag bit. For reverse iteration it is not used.
+        category = UTRIE2_GET16(fData->fTrie, c);
+        category &= ~0x4000;
+
+        #ifdef RBBI_DEBUG
+            if (gTrace) {
+                RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(&fText));
+                if (0x20<=c && c<0x7f) {
+                    RBBIDebugPrintf("\"%c\" ", c);
+                } else {
+                    RBBIDebugPrintf("%5x ", c);
+                }
+                RBBIDebugPrintf("%3d %3d\n", state, category);
+            }
+        #endif
+
+        // State Transition - move machine to its next state
+        //
+        // fNextState is a variable-length array.
+        U_ASSERT(category<fData->fHeader->fCatCount);
+        state = row->fNextState[category];  /*Not accessing beyond memory*/
+        row = (RBBIStateTableRow *)
+            (stateTable->fTableData + (stateTable->fRowLen * state));
+
+        if (state == STOP_STATE) {
+            // This is the normal exit from the lookup state machine.
+            // Transition to state zero means we have found a safe point.
+            break;
+        }
+    }
+
+    // The state machine is done. Report the text position where it stopped.
+    result = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
+    #ifdef RBBI_DEBUG
+        if (gTrace) {
+            RBBIDebugPrintf("result = %d\n\n", result);
+        }
+    #endif
+    return result;
+}
+
//-------------------------------------------------------------------------------
//
// getRuleStatus() Return the break rule tag associated with the current
void RBBIDataWrapper::init0() {
fHeader = NULL;
fForwardTable = NULL;
- fReverseTable = NULL;
- fSafeFwdTable = NULL;
fSafeRevTable = NULL;
fRuleSource = NULL;
fRuleStatusTable = NULL;
if (data->fFTableLen != 0) {
fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable);
}
- if (data->fRTableLen != 0) {
- fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable);
- }
- if (data->fSFTableLen != 0) {
- fSafeFwdTable = (RBBIStateTable *)((char *)data + fHeader->fSFTable);
- }
if (data->fSRTableLen != 0) {
fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable);
}
- // Rule Compatibility Hacks
- // If a rule set includes reverse rules but does not explicitly include safe reverse rules,
- // the reverse rules are to be treated as safe reverse rules.
-
- if (fSafeRevTable == NULL && fReverseTable != NULL) {
- fSafeRevTable = fReverseTable;
- fReverseTable = NULL;
- }
-
fTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
(uint8_t *)data + fHeader->fTrie,
fHeader->fTrieLen,
RBBIDebugPrintf(" number of character categories = %d\n\n", fHeader->fCatCount);
printTable("Forward State Transition Table", fForwardTable);
- printTable("Reverse State Transition Table", fReverseTable);
- printTable("Safe Forward State Transition Table", fSafeFwdTable);
printTable("Safe Reverse State Transition Table", fSafeRevTable);
RBBIDebugPrintf("\nOrignal Rules source:\n");
/* */
const RBBIDataHeader *fHeader;
const RBBIStateTable *fForwardTable;
- const RBBIStateTable *fReverseTable;
- const RBBIStateTable *fSafeFwdTable;
const RBBIStateTable *fSafeRevTable;
const UChar *fRuleSource;
const int32_t *fRuleStatusTable;
fSafeRevTree = NULL;
fDefaultTree = &fForwardTree;
fForwardTables = NULL;
- fReverseTables = NULL;
- fSafeFwdTables = NULL;
fSafeRevTables = NULL;
fRuleStatusVals = NULL;
fChainRules = FALSE;
delete fUSetNodes;
delete fSetBuilder;
delete fForwardTables;
- delete fReverseTables;
- delete fSafeFwdTables;
delete fSafeRevTables;
delete fForwardTree;
//
int32_t headerSize = align8(sizeof(RBBIDataHeader));
int32_t forwardTableSize = align8(fForwardTables->getTableSize());
- int32_t reverseTableSize = align8(fReverseTables->getTableSize());
- int32_t safeFwdTableSize = align8(fSafeFwdTables->getTableSize());
int32_t safeRevTableSize = align8(fSafeRevTables->getTableSize());
int32_t trieSize = align8(fSetBuilder->getTrieSize());
int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t));
int32_t rulesSize = align8((fStrippedRules.length()+1) * sizeof(UChar));
- (void)safeFwdTableSize;
-
int32_t totalSize = headerSize
+ forwardTableSize
- + /* reverseTableSize */ 0
- + /* safeFwdTableSize */ 0
- + (safeRevTableSize ? safeRevTableSize : reverseTableSize)
+ + safeRevTableSize
+ statusTableSize + trieSize + rulesSize;
RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize);
data->fSFTableLen = 0;
data->fSRTable = data->fSFTable + 0;
- if (safeRevTableSize > 0) {
- data->fSRTableLen = safeRevTableSize;
- } else if (reverseTableSize > 0) {
- data->fSRTableLen = reverseTableSize;
- } else {
- U_ASSERT(FALSE); // Rule build should have failed for lack of a reverse table
- // before reaching this point.
- }
-
-
+ data->fSRTableLen = safeRevTableSize;
+ U_ASSERT(safeRevTableSize > 0);
+
data->fTrie = data->fSRTable + data->fSRTableLen;
data->fTrieLen = fSetBuilder->getTrieSize();
data->fStatusTable = data->fTrie + trieSize;
uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
fForwardTables->exportTable((uint8_t *)data + data->fFTable);
- // fReverseTables->exportTable((uint8_t *)data + data->fRTable);
- // fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable);
- if (safeRevTableSize > 0) {
- fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
- } else {
- fReverseTables->exportTable((uint8_t *)data + data->fSRTable);
- }
+ fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
}
-
-
-
-
//----------------------------------------------------------------------------------------
//
// createRuleBasedBreakIterator construct from source rules that are passed in
UParseError *parseError,
UErrorCode &status)
{
- // status checked below
-
//
// Read the input rules, generate a parse tree, symbol table,
// and list of all Unicode Sets referenced by the rules.
if (U_FAILURE(status)) { // status checked here bcos build below doesn't
return NULL;
}
- builder.fScanner->parse();
+
+ RBBIDataHeader *data = builder.build(status);
+
+ if (U_FAILURE(status)) {
+ return nullptr;
+ }
+
+ //
+ // Create a break iterator from the compiled rules.
+ // (Identical to creation from stored pre-compiled rules)
+ //
+ // status is checked after init in construction.
+ RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
+ if (U_FAILURE(status)) {
+ delete This;
+ This = NULL;
+ }
+ else if(This == NULL) { // test for NULL
+ status = U_MEMORY_ALLOCATION_ERROR;
+ }
+ return This;
+}
+
+RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
+ if (U_FAILURE(status)) {
+ return nullptr;
+ }
+
+ fScanner->parse();
+ if (U_FAILURE(status)) {
+ return nullptr;
+ }
//
// UnicodeSet processing.
// Generate the mapping tables (TRIE) from input code points to
// the character categories.
//
- builder.fSetBuilder->buildRanges();
-
+ fSetBuilder->buildRanges();
//
// Generate the DFA state transition table.
//
- builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree);
- builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree);
- builder.fSafeFwdTables = new RBBITableBuilder(&builder, &builder.fSafeFwdTree);
- builder.fSafeRevTables = new RBBITableBuilder(&builder, &builder.fSafeRevTree);
- if (builder.fForwardTables == NULL || builder.fReverseTables == NULL ||
- builder.fSafeFwdTables == NULL || builder.fSafeRevTables == NULL)
+ fForwardTables = new RBBITableBuilder(this, &fForwardTree, status);
+ fSafeRevTables = new RBBITableBuilder(this, &fSafeRevTree, status);
+ if (fForwardTables == nullptr || fSafeRevTables == nullptr)
{
status = U_MEMORY_ALLOCATION_ERROR;
- delete builder.fForwardTables; builder.fForwardTables = NULL;
- delete builder.fReverseTables; builder.fReverseTables = NULL;
- delete builder.fSafeFwdTables; builder.fSafeFwdTables = NULL;
- delete builder.fSafeRevTables; builder.fSafeRevTables = NULL;
- return NULL;
+ delete fForwardTables; fForwardTables = nullptr;
+ delete fSafeRevTables; fSafeRevTables = nullptr;
+ return nullptr;
}
- builder.fForwardTables->build();
- builder.fReverseTables->build();
- builder.fSafeFwdTables->build();
- builder.fSafeRevTables->build();
+ fForwardTables->build();
+ fForwardTables->buildSafe(status);
+ fSafeRevTables->build();
#ifdef RBBI_DEBUG
- if (builder.fDebugEnv && uprv_strstr(builder.fDebugEnv, "states")) {
- builder.fForwardTables->printRuleStatusTable();
+ if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
+ fForwardTables->printRuleStatusTable();
}
#endif
- builder.optimizeTables();
- builder.fSetBuilder->buildTrie();
-
-
+ optimizeTables();
+ fSetBuilder->buildTrie();
//
// Package up the compiled data into a memory image
// in the run-time format.
//
- RBBIDataHeader *data = builder.flattenData(); // returns NULL if error
- if (U_FAILURE(*builder.fStatus)) {
- return NULL;
- }
-
-
- //
- // Clean up the compiler related stuff
- //
-
-
- //
- // Create a break iterator from the compiled rules.
- // (Identical to creation from stored pre-compiled rules)
- //
- // status is checked after init in construction.
- RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
+ RBBIDataHeader *data = flattenData(); // returns NULL if error
if (U_FAILURE(status)) {
- delete This;
- This = NULL;
- }
- else if(This == NULL) { // test for NULL
- status = U_MEMORY_ALLOCATION_ERROR;
+ return nullptr;
}
- return This;
+ return data;
}
void RBBIRuleBuilder::optimizeTables() {
while (fForwardTables->findDuplCharClassFrom(leftClass, rightClass)) {
fSetBuilder->mergeCategories(leftClass, rightClass);
fForwardTables->removeColumn(rightClass);
- fReverseTables->removeColumn(rightClass);
- fSafeFwdTables->removeColumn(rightClass);
fSafeRevTables->removeColumn(rightClass);
}
fForwardTables->removeDuplicateStates();
- fReverseTables->removeDuplicateStates();
- fSafeFwdTables->removeDuplicateStates();
fSafeRevTables->removeDuplicateStates();
-
-
-
}
U_NAMESPACE_END
RBBIRuleBuilder(const UnicodeString &rules,
UParseError *parseErr,
UErrorCode &status
- );
+ );
virtual ~RBBIRuleBuilder();
+ /**
+ * Build the state tables and char class Trie from the source rules.
+ */
+ RBBIDataHeader *build(UErrorCode &status);
+
+
/**
* Fold together redundant character classes (table columns) and
* redundant states (table rows). Done after initial table generation,
UVector *fUSetNodes; // Vector of all uset nodes.
RBBITableBuilder *fForwardTables; // State transition tables
- RBBITableBuilder *fReverseTables;
- RBBITableBuilder *fSafeFwdTables;
RBBITableBuilder *fSafeRevTables;
UVector *fRuleStatusVals; // The values that can be returned
// (forward, reverse, safe_forward, safe_reverse)
// OR this rule into the appropriate group of them.
//
- RBBINode **destRules = (fReverseRule? &fRB->fReverseTree : fRB->fDefaultTree);
+ RBBINode **destRules = (fReverseRule? &fRB->fSafeRevTree : fRB->fDefaultTree);
if (*destRules != NULL) {
// This is not the first rule encounted.
}
//
- // If there were NO user specified reverse rules, set up the equivalent of ".*;"
+ // If there were NO user specified safe reverse rules, set up the equivalent of ".*;"
//
- if (fRB->fReverseTree == NULL) {
- fRB->fReverseTree = pushNewNode(RBBINode::opStar);
+ if (fRB->fSafeRevTree == NULL) {
+ fRB->fSafeRevTree = pushNewNode(RBBINode::opStar);
RBBINode *operand = pushNewNode(RBBINode::setRef);
if (U_FAILURE(*fRB->fStatus)) {
return;
}
findSetFor(UnicodeString(TRUE, kAny, 3), operand);
- fRB->fReverseTree->fLeftChild = operand;
- operand->fParent = fRB->fReverseTree;
+ fRB->fSafeRevTree->fLeftChild = operand;
+ operand->fParent = fRB->fSafeRevTree;
fNodeStackPtr -= 2;
}
U_NAMESPACE_BEGIN
-RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode) :
- fTree(*rootNode) {
- fRB = rb;
- fStatus = fRB->fStatus;
- UErrorCode status = U_ZERO_ERROR;
- fDStates = new UVector(status);
- if (U_FAILURE(*fStatus)) {
- return;
- }
+RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status) :
+ fRB(rb),
+ fTree(*rootNode),
+ fStatus(&status),
+ fDStates(nullptr),
+ fSafeTable(nullptr) {
if (U_FAILURE(status)) {
- *fStatus = status;
return;
}
- if (fDStates == NULL) {
- *fStatus = U_MEMORY_ALLOCATION_ERROR;;
+ // fDStates is UVector<RBBIStateDescriptor *>
+ fDStates = new UVector(status);
+ // SafeTable is UVector<UnicodeString *>. Contents owned by the UVector.
+ fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status);
+ if (U_SUCCESS(status) && (fDStates == nullptr || fSafeTable == nullptr)) {
+ status = U_MEMORY_ALLOCATION_ERROR;;
}
}
for (i=0; i<fDStates->size(); i++) {
delete (RBBIStateDescriptor *)fDStates->elementAt(i);
}
- delete fDStates;
+ delete fDStates;
+ delete fSafeTable;
}
}
+/**
+ * Synthesize a safe state table from the main state table.
+ */
+void RBBITableBuilder::buildSafe(UErrorCode &status) {
+    // Outline:
+    //   1. Find "safe" char class pairs (c1, c2): pairs for which, starting from
+    //      every state other than state 0, the transitions on c1 and then c2
+    //      always end in the same state.
+    //   2. Build a table with a stop row (0), a start row (1) and one row per
+    //      char class (class + 2). Each row is a UnicodeString used as a vector
+    //      of 16 bit values, with 4 leading header columns.
+    //   3. From every row, input class c initially leads to row c + 2.
+    //   4. For each safe pair, row (c2 + 2), column c1 is set to 0 (stop).
+    //
+    // On failure, status is set and fSafeTable may be left partially populated.
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    UVector32 safePairs(status);
+
+    int32_t numCharClasses = fRB->fSetBuilder->getNumCharCategories();
+    int32_t numStates = fDStates->size();
+
+    for (int32_t c1=0; c1<numCharClasses; ++c1) {
+        for (int32_t c2=0; c2 < numCharClasses; ++c2) {
+            int32_t wantedEndState = -1;
+            int32_t endState = 0;
+            for (int32_t startState = 1; startState < numStates; ++startState) {
+                RBBIStateDescriptor *startStateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(startState));
+                int32_t s2 = startStateD->fDtran->elementAti(c1);
+                RBBIStateDescriptor *s2StateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(s2));
+                endState = s2StateD->fDtran->elementAti(c2);
+                if (wantedEndState < 0) {
+                    wantedEndState = endState;
+                } else if (wantedEndState != endState) {
+                    // Two start states disagree; (c1, c2) is not a safe pair.
+                    break;
+                }
+            }
+            if (wantedEndState == endState) {
+                // Pack the pair into one int32: c1 in the high half, c2 in the low half.
+                int32_t pair = c1 << 16 | c2;
+                safePairs.addElement(pair, status);
+            }
+        }
+    }
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    // Populate the initial safe table.
+    // The table as a whole is UVector<UnicodeString>
+    // Each row is represented by a UnicodeString, being used as a Vector<int16>.
+    // Row 0 is the stop state.
+    // Row 1 is the start state.
+    // Row 2 and beyond are other states, initially one per char class, but
+    //   after initial construction, many of the states will be combined, compacting the table.
+    //
+    // The constructor already allocated an (empty) fSafeTable; release it before
+    // replacing it so that it is not leaked.
+    delete fSafeTable;
+    fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status);
+    if (fSafeTable == nullptr) {
+        if (U_SUCCESS(status)) {
+            status = U_MEMORY_ALLOCATION_ERROR;
+        }
+        return;
+    }
+    if (U_FAILURE(status)) {
+        return;
+    }
+    for (int32_t row=0; row<numCharClasses + 2; ++row) {
+        UnicodeString *rowString = new UnicodeString(numCharClasses+4, 0, numCharClasses+4);
+        if (rowString == nullptr) {
+            status = U_MEMORY_ALLOCATION_ERROR;
+            return;
+        }
+        fSafeTable->addElement(rowString, status);
+        if (U_FAILURE(status)) {
+            // NOTE(review): not expected, the vector was created with room for
+            // numCharClasses + 2 elements. Confirm ownership of rowString here.
+            return;
+        }
+    }
+
+    // From the start state, each input char class transitions to the state for that input.
+    UnicodeString &startState = *static_cast<UnicodeString *>(fSafeTable->elementAt(1));
+    for (int32_t charClass=0; charClass < numCharClasses; ++charClass) {
+        // Note: +2 for the start & stop state; +4 for header columns in state table.
+        startState.setCharAt(charClass+4, charClass+2);
+    }
+
+    // Initially make every other state table row look like the start state row.
+    // Each row must be fetched by its own index; fetching row 1 here would only
+    // assign the start row to itself and leave the remaining rows all zero.
+    for (int32_t row=2; row<numCharClasses+2; ++row) {
+        UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(row));
+        rowState = startState;   // UnicodeString assignment, copies contents.
+    }
+
+    // Run through the safe pairs, set the next state to zero when a pair has been seen.
+    // Zero being the stop state, meaning we found a safe point.
+    // The row is selected by c2 and the column by c1.
+    for (int32_t pairIdx=0; pairIdx<safePairs.size(); pairIdx++) {
+        int32_t pair = safePairs.elementAti(pairIdx);
+        int32_t c1 = (pair >> 16) & 0x0000ffff;
+        int32_t c2 = pair & 0x0000ffff;
+
+        UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(c2 + 2));
+        rowState.setCharAt(c1 + 4, 0);
+    }
+
+    // TODO: merge similar states (not yet implemented here).
+
+}
//-----------------------------------------------------------------------------
//
class RBBITableBuilder : public UMemory {
public:
- RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode);
+ RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status);
~RBBITableBuilder();
void build();
- int32_t getTableSize() const; // Return the runtime size in bytes of
- // the built state table
+
+ /** Return the runtime size in bytes of the built state table. */
+ int32_t getTableSize() const;
/** Fill in the runtime state table. Sufficient memory must exist at the specified location.
*/
/** Check for, and remove dupicate states (table rows). */
void removeDuplicateStates();
+ void buildSafe(UErrorCode &status);
+
+ /** Return the runtime size in bytes of the built safe reverse state table. */
+ int32_t getSafeTableSize() const;
+
+ /** Fill in the runtime safe state table. Sufficient memory must exist at the specified location.
+ */
+ void exportSafeTable(void *where);
+
private:
void calcNullable(RBBINode *n);
// table for.
UErrorCode *fStatus;
+ /** State Descriptors, UVector<RBBIStateDescriptor *> */
UVector *fDStates; // D states (Aho's terminology)
// Index is state number
// Contents are RBBIStateDescriptor pointers.
+ /** Synthesized safe table, UVector of UnicodeString, one string per table row. */
+ UVector *fSafeTable;
+
RBBITableBuilder(const RBBITableBuilder &other); // forbid copying of this class
RBBITableBuilder &operator=(const RBBITableBuilder &other); // forbid copying of this class
*/
int32_t handlePrevious(int32_t fromPosition);
+ /**
+ * Iterate backwards from an arbitrary position in the input text using the
+ * synthesized Safe Reverse rules.
+ * This locates a "Safe Position" from which the forward break rules
+ * will operate correctly. A Safe Position is not necessarily a boundary itself.
+ *
+ * @param fromPosition the position in the input text to begin the iteration.
+ * @internal
+ */
+ int32_t handleSafePrevious(int32_t fromPosition);
+
/**
* Find a rule-based boundary by running the state machine.
* Input
}
/**
- * Sort the vector, assuming it constains ints.
+ * Sort the vector, assuming it contains ints.
* (A more general sort would take a comparison function, but it's
* not clear whether UVector's UElementComparator or
* UComparator from uprv_sortAray would be more appropriate.)
#include "uvectr32.h"
#include "cmemory.h"
#include "putilimp.h"
+#include "uarrsort.h"
U_NAMESPACE_BEGIN
}
-
-
+/**
+ * Sort the vector, assuming it contains ints.
+ */
+void UVector32::sorti(UErrorCode &ec) {
+    // Nothing is done when the incoming error code already signals a failure.
+    if (U_FAILURE(ec)) {
+        return;
+    }
+    // Sort the int32_t elements in place; any sorting error is reported through ec.
+    uprv_sortArray(elements, count, sizeof(int32_t),
+                   uprv_int32Comparator, nullptr, false, &ec);
+}
U_NAMESPACE_END
*/
void sortedInsert(int32_t elem, UErrorCode& ec);
+ /**
+ * Sort the contents of the int32_t vector.
+ */
+ void sorti(UErrorCode &ec);
+
/**
* Returns a pointer to the internal array holding the vector.
*/