]> granicus.if.org Git - icu/commitdiff
ICU-13194 rbbi safe rule synth, work in progress.
author Andy Heninger <andy.heninger@gmail.com>
Sat, 17 Mar 2018 00:34:48 +0000 (00:34 +0000)
committer Andy Heninger <andy.heninger@gmail.com>
Sat, 17 Mar 2018 00:34:48 +0000 (00:34 +0000)
X-SVN-Rev: 41118

12 files changed:
icu4c/source/common/rbbi.cpp
icu4c/source/common/rbbidata.cpp
icu4c/source/common/rbbidata.h
icu4c/source/common/rbbirb.cpp
icu4c/source/common/rbbirb.h
icu4c/source/common/rbbiscan.cpp
icu4c/source/common/rbbitblb.cpp
icu4c/source/common/rbbitblb.h
icu4c/source/common/unicode/rbbi.h
icu4c/source/common/uvector.cpp
icu4c/source/common/uvectr32.cpp
icu4c/source/common/uvectr32.h

index 69f92d94c602c33cc65cb7d873956a3efca08aed..83d1c0b59a9e43c620c7bd39eb92741a2fddefb6 100644 (file)
@@ -1100,6 +1100,91 @@ int32_t RuleBasedBreakIterator::handlePrevious(int32_t fromPosition) {
 }
 
 
+//-----------------------------------------------------------------------------------
+//
+//  handleSafePrevious()
+//      Iterate backwards from fromPosition using the safe reverse rules.
+//      The logic of this function is similar to handleNext(), but simpler
+//      because the safe table does not require as many options.
+//      Returns the text index at which the state machine stopped, or
+//      BreakIterator::DONE if fData is NULL or iteration starts at index 0.
+//-----------------------------------------------------------------------------------
+int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
+    int32_t             state;
+    uint16_t            category        = 0;
+    RBBIStateTableRow  *row;
+    UChar32             c;
+    int32_t             result          = 0;
+
+    // fData must be validated before it is dereferenced to fetch the safe reverse table.
+    if (fData == NULL) {
+        return BreakIterator::DONE;
+    }
+    const RBBIStateTable *stateTable = fData->fSafeRevTable;
+    UTEXT_SETNATIVEINDEX(&fText, fromPosition);
+    #ifdef RBBI_DEBUG
+        if (gTrace) {
+            RBBIDebugPuts("Handle Previous   pos   char  state category");
+        }
+    #endif
+
+    // if we're already at the start of the text, return DONE.
+    if (UTEXT_GETNATIVEINDEX(&fText)==0) {
+        return BreakIterator::DONE;
+    }
+
+    //  Set the initial state for the state machine
+    c = UTEXT_PREVIOUS32(&fText);
+    state = START_STATE;
+    row = (RBBIStateTableRow *)
+            (stateTable->fTableData + (stateTable->fRowLen * state));
+
+    // loop until we reach the start of the text or transition to state 0
+    for (; c != U_SENTINEL; c = UTEXT_PREVIOUS32(&fText)) {
+        // look up the current character's character category, which tells us
+        // which column in the state table to look at.
+        // Note:  the 16 in UTRIE2_GET16 refers to the size of the data being returned,
+        //        not the size of the character going in, which is a UChar32.
+        // AND off the dictionary flag bit. For reverse iteration it is not used.
+        category = UTRIE2_GET16(fData->fTrie, c);
+        category &= ~0x4000;
+
+        #ifdef RBBI_DEBUG
+            if (gTrace) {
+                RBBIDebugPrintf("             %4d   ", (int32_t)utext_getNativeIndex(&fText));
+                if (0x20<=c && c<0x7f) {
+                    RBBIDebugPrintf("\"%c\"  ", c);
+                } else {
+                    RBBIDebugPrintf("%5x  ", c);
+                }
+                RBBIDebugPrintf("%3d  %3d\n", state, category);
+            }
+        #endif
+
+        // State Transition - move machine to its next state
+        // fNextState is a variable-length array.
+        U_ASSERT(category<fData->fHeader->fCatCount);
+        state = row->fNextState[category];  /*Not accessing beyond memory*/
+        row = (RBBIStateTableRow *)
+            (stateTable->fTableData + (stateTable->fRowLen * state));
+
+        if (state == STOP_STATE) {
+            // This is the normal exit from the lookup state machine.
+            // Transition to state zero means we have found a safe point.
+            break;
+        }
+    }
+
+    // The state machine is done.  Check whether it found a match...
+    result = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
+    #ifdef RBBI_DEBUG
+        if (gTrace) {
+            RBBIDebugPrintf("result = %d\n\n", result);
+        }
+    #endif
+    return result;
+}
+
 //-------------------------------------------------------------------------------
 //
 //   getRuleStatus()   Return the break rule tag associated with the current
index 5b00e9506228c68e1af8c24e222de0dbbf0adff4..ebd60c42a6de229e2d20109236c2d34f387ea2e1 100644 (file)
@@ -80,8 +80,6 @@ UBool RBBIDataWrapper::isDataVersionAcceptable(const UVersionInfo version) {
 void RBBIDataWrapper::init0() {
     fHeader = NULL;
     fForwardTable = NULL;
-    fReverseTable = NULL;
-    fSafeFwdTable = NULL;
     fSafeRevTable = NULL;
     fRuleSource   = NULL;
     fRuleStatusTable = NULL;
@@ -108,25 +106,10 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
     if (data->fFTableLen != 0) {
         fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable);
     }
-    if (data->fRTableLen != 0) {
-        fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable);
-    }
-    if (data->fSFTableLen != 0) {
-        fSafeFwdTable = (RBBIStateTable *)((char *)data + fHeader->fSFTable);
-    }
     if (data->fSRTableLen != 0) {
         fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable);
     }
 
-    // Rule Compatibility Hacks
-    //    If a rule set includes reverse rules but does not explicitly include safe reverse rules,
-    //    the reverse rules are to be treated as safe reverse rules.
-
-    if (fSafeRevTable == NULL && fReverseTable != NULL) {
-        fSafeRevTable = fReverseTable;
-        fReverseTable = NULL;
-    }
-
     fTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
                                       (uint8_t *)data + fHeader->fTrie,
                                       fHeader->fTrieLen,
@@ -276,8 +259,6 @@ void  RBBIDataWrapper::printData() {
     RBBIDebugPrintf("   number of character categories = %d\n\n", fHeader->fCatCount);
 
     printTable("Forward State Transition Table", fForwardTable);
-    printTable("Reverse State Transition Table", fReverseTable);
-    printTable("Safe Forward State Transition Table", fSafeFwdTable);
     printTable("Safe Reverse State Transition Table", fSafeRevTable);
 
     RBBIDebugPrintf("\nOrignal Rules source:\n");
index 1244a118dc523595f27bd9dda9b18245ff987d8c..6a5ebede04681922e2c6c8b1e66159527340e4ee 100644 (file)
@@ -173,8 +173,6 @@ public:
     /*                                     */
     const RBBIDataHeader     *fHeader;
     const RBBIStateTable     *fForwardTable;
-    const RBBIStateTable     *fReverseTable;
-    const RBBIStateTable     *fSafeFwdTable;
     const RBBIStateTable     *fSafeRevTable;
     const UChar              *fRuleSource;
     const int32_t            *fRuleStatusTable; 
index 61e596d6ed70fa987d35c3c84eee765ce9ca67af..ac629c6222cd039df0a23f77372af5e5a098f54e 100644 (file)
@@ -63,8 +63,6 @@ RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString   &rules,
     fSafeRevTree        = NULL;
     fDefaultTree        = &fForwardTree;
     fForwardTables      = NULL;
-    fReverseTables      = NULL;
-    fSafeFwdTables      = NULL;
     fSafeRevTables      = NULL;
     fRuleStatusVals     = NULL;
     fChainRules         = FALSE;
@@ -115,8 +113,6 @@ RBBIRuleBuilder::~RBBIRuleBuilder() {
     delete fUSetNodes;
     delete fSetBuilder;
     delete fForwardTables;
-    delete fReverseTables;
-    delete fSafeFwdTables;
     delete fSafeRevTables;
 
     delete fForwardTree;
@@ -158,20 +154,14 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
     //
     int32_t headerSize        = align8(sizeof(RBBIDataHeader));
     int32_t forwardTableSize  = align8(fForwardTables->getTableSize());
-    int32_t reverseTableSize  = align8(fReverseTables->getTableSize());
-    int32_t safeFwdTableSize  = align8(fSafeFwdTables->getTableSize());
     int32_t safeRevTableSize  = align8(fSafeRevTables->getTableSize());
     int32_t trieSize          = align8(fSetBuilder->getTrieSize());
     int32_t statusTableSize   = align8(fRuleStatusVals->size() * sizeof(int32_t));
     int32_t rulesSize         = align8((fStrippedRules.length()+1) * sizeof(UChar));
 
-    (void)safeFwdTableSize;
-
     int32_t         totalSize = headerSize
                                 + forwardTableSize 
-                                + /* reverseTableSize */ 0
-                                + /* safeFwdTableSize */ 0
-                                + (safeRevTableSize ? safeRevTableSize : reverseTableSize)
+                                + safeRevTableSize
                                 + statusTableSize + trieSize + rulesSize;
 
     RBBIDataHeader  *data     = (RBBIDataHeader *)uprv_malloc(totalSize);
@@ -211,16 +201,9 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
     data->fSFTableLen    = 0;
 
     data->fSRTable       = data->fSFTable + 0;
-    if (safeRevTableSize > 0) {
-        data->fSRTableLen    = safeRevTableSize;
-    } else if (reverseTableSize > 0) {
-        data->fSRTableLen    = reverseTableSize;
-    } else {
-        U_ASSERT(FALSE);    // Rule build should have failed for lack of a reverse table
-                            // before reaching this point.
-    }
-        
-
+    data->fSRTableLen    = safeRevTableSize;
+    U_ASSERT(safeRevTableSize > 0);
     data->fTrie          = data->fSRTable + data->fSRTableLen;
     data->fTrieLen       = fSetBuilder->getTrieSize();
     data->fStatusTable   = data->fTrie    + trieSize;
@@ -231,13 +214,7 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
     uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
 
     fForwardTables->exportTable((uint8_t *)data + data->fFTable);
-    // fReverseTables->exportTable((uint8_t *)data + data->fRTable);
-    // fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable);
-    if (safeRevTableSize > 0) {
-        fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
-    } else {
-        fReverseTables->exportTable((uint8_t *)data + data->fSRTable);
-    }
+    fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
 
     fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
 
@@ -252,10 +229,6 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
 }
 
 
-
-
-
-
 //----------------------------------------------------------------------------------------
 //
 //  createRuleBasedBreakIterator    construct from source rules that are passed in
@@ -267,8 +240,6 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString    &rules,
                                     UParseError      *parseError,
                                     UErrorCode       &status)
 {
-    // status checked below
-
     //
     // Read the input rules, generate a parse tree, symbol table,
     // and list of all Unicode Sets referenced by the rules.
@@ -277,7 +248,38 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString    &rules,
     if (U_FAILURE(status)) { // status checked here bcos build below doesn't
         return NULL;
     }
-    builder.fScanner->parse();
+
+    RBBIDataHeader *data = builder.build(status);
+
+    if (U_FAILURE(status)) {
+        return nullptr;
+    }
+
+    //
+    //  Create a break iterator from the compiled rules.
+    //     (Identical to creation from stored pre-compiled rules)
+    //
+    // status is checked after init in construction.
+    RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
+    if (U_FAILURE(status)) {
+        delete This;
+        This = NULL;
+    } 
+    else if(This == NULL) { // test for NULL
+        status = U_MEMORY_ALLOCATION_ERROR;
+    }
+    return This;
+}
+
+RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return nullptr;
+    }
+
+    fScanner->parse();
+    if (U_FAILURE(status)) {
+        return nullptr;
+    }
 
     //
     // UnicodeSet processing.
@@ -285,72 +287,43 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString    &rules,
     //    Generate the mapping tables (TRIE) from input code points to
     //    the character categories.
     //
-    builder.fSetBuilder->buildRanges();
-
+    fSetBuilder->buildRanges();
 
     //
     //   Generate the DFA state transition table.
     //
-    builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree);
-    builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree);
-    builder.fSafeFwdTables = new RBBITableBuilder(&builder, &builder.fSafeFwdTree);
-    builder.fSafeRevTables = new RBBITableBuilder(&builder, &builder.fSafeRevTree);
-    if (builder.fForwardTables == NULL || builder.fReverseTables == NULL ||
-        builder.fSafeFwdTables == NULL || builder.fSafeRevTables == NULL)
+    fForwardTables = new RBBITableBuilder(this, &fForwardTree, status);
+    fSafeRevTables = new RBBITableBuilder(this, &fSafeRevTree, status);
+    if (fForwardTables == nullptr || fSafeRevTables == nullptr)
     {
         status = U_MEMORY_ALLOCATION_ERROR;
-        delete builder.fForwardTables; builder.fForwardTables = NULL;
-        delete builder.fReverseTables; builder.fReverseTables = NULL;
-        delete builder.fSafeFwdTables; builder.fSafeFwdTables = NULL;
-        delete builder.fSafeRevTables; builder.fSafeRevTables = NULL;
-        return NULL;
+        delete fForwardTables; fForwardTables = nullptr;
+        delete fSafeRevTables; fSafeRevTables = nullptr;
+        return nullptr;
     }
 
-    builder.fForwardTables->build();
-    builder.fReverseTables->build();
-    builder.fSafeFwdTables->build();
-    builder.fSafeRevTables->build();
+    fForwardTables->build();
+    fForwardTables->buildSafe(status);
+    fSafeRevTables->build();
 
 #ifdef RBBI_DEBUG
-    if (builder.fDebugEnv && uprv_strstr(builder.fDebugEnv, "states")) {
-        builder.fForwardTables->printRuleStatusTable();
+    if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
+        fForwardTables->printRuleStatusTable();
     }
 #endif
 
-    builder.optimizeTables();
-    builder.fSetBuilder->buildTrie();
-
-
+    optimizeTables();
+    fSetBuilder->buildTrie();
 
     //
     //   Package up the compiled data into a memory image
     //      in the run-time format.
     //
-    RBBIDataHeader *data = builder.flattenData(); // returns NULL if error
-    if (U_FAILURE(*builder.fStatus)) {
-        return NULL;
-    }
-
-
-    //
-    //  Clean up the compiler related stuff
-    //
-
-
-    //
-    //  Create a break iterator from the compiled rules.
-    //     (Identical to creation from stored pre-compiled rules)
-    //
-    // status is checked after init in construction.
-    RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
+    RBBIDataHeader *data = flattenData(); // returns NULL if error
     if (U_FAILURE(status)) {
-        delete This;
-        This = NULL;
-    } 
-    else if(This == NULL) { // test for NULL
-        status = U_MEMORY_ALLOCATION_ERROR;
+        return nullptr;
     }
-    return This;
+    return data;
 }
 
 void RBBIRuleBuilder::optimizeTables() {
@@ -362,18 +335,11 @@ void RBBIRuleBuilder::optimizeTables() {
     while (fForwardTables->findDuplCharClassFrom(leftClass, rightClass)) {
         fSetBuilder->mergeCategories(leftClass, rightClass);
         fForwardTables->removeColumn(rightClass);
-        fReverseTables->removeColumn(rightClass);
-        fSafeFwdTables->removeColumn(rightClass);
         fSafeRevTables->removeColumn(rightClass);
     }
 
     fForwardTables->removeDuplicateStates();
-    fReverseTables->removeDuplicateStates();
-    fSafeFwdTables->removeDuplicateStates();
     fSafeRevTables->removeDuplicateStates();
-
-
-
 }
 
 U_NAMESPACE_END
index f890cf686e36cb20887cc435b439c566febfd0b8..ca71f40c7dc1ea7b76b8dc9dc89a20cc4a7441b4 100644 (file)
@@ -123,10 +123,16 @@ public:
     RBBIRuleBuilder(const UnicodeString  &rules,
                     UParseError          *parseErr,
                     UErrorCode           &status
-        );
+    );
 
     virtual    ~RBBIRuleBuilder();
 
+    /**
+     *  Build the state tables and char class Trie from the source rules.
+     */
+    RBBIDataHeader  *build(UErrorCode &status);
+
+
     /**
      * Fold together redundant character classes (table columns) and
      * redundant states (table rows). Done after initial table generation,
@@ -163,8 +169,6 @@ public:
     UVector                       *fUSetNodes;       // Vector of all uset nodes.
 
     RBBITableBuilder              *fForwardTables;   // State transition tables
-    RBBITableBuilder              *fReverseTables;
-    RBBITableBuilder              *fSafeFwdTables;
     RBBITableBuilder              *fSafeRevTables;
 
     UVector                       *fRuleStatusVals;  // The values that can be returned
index 60f3d197c291882cf9c98a6b1af001e628cdbeaa..6b64969f125b9df63195934e8ccb48e2ec6cc56f 100644 (file)
@@ -372,7 +372,7 @@ UBool RBBIRuleScanner::doParseActions(int32_t action)
         //  (forward, reverse, safe_forward, safe_reverse)
         //  OR this rule into the appropriate group of them.
         //
-        RBBINode **destRules = (fReverseRule? &fRB->fReverseTree : fRB->fDefaultTree);
+        RBBINode **destRules = (fReverseRule? &fRB->fSafeRevTree : fRB->fDefaultTree);
 
         if (*destRules != NULL) {
             // This is not the first rule encounted.
@@ -1123,17 +1123,17 @@ void RBBIRuleScanner::parse() {
     }
 
     //
-    // If there were NO user specified reverse rules, set up the equivalent of ".*;"
+    // If there were NO user specified safe reverse rules, set up the equivalent of ".*;"
     //
-    if (fRB->fReverseTree == NULL) {
-        fRB->fReverseTree  = pushNewNode(RBBINode::opStar);
+    if (fRB->fSafeRevTree == NULL) {
+        fRB->fSafeRevTree  = pushNewNode(RBBINode::opStar);
         RBBINode  *operand = pushNewNode(RBBINode::setRef);
         if (U_FAILURE(*fRB->fStatus)) {
             return;
         }
         findSetFor(UnicodeString(TRUE, kAny, 3), operand);
-        fRB->fReverseTree->fLeftChild = operand;
-        operand->fParent              = fRB->fReverseTree;
+        fRB->fSafeRevTree->fLeftChild = operand;
+        operand->fParent              = fRB->fSafeRevTree;
         fNodeStackPtr -= 2;
     }
 
index 58168922d4bca22c98e714836b2034da1db5da9e..ee2d172c18542e70a03e88f5f48b86c86e54d8f5 100644 (file)
 
 U_NAMESPACE_BEGIN
 
-RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode) :
- fTree(*rootNode) {
-    fRB                 = rb;
-    fStatus             = fRB->fStatus;
-    UErrorCode status   = U_ZERO_ERROR;
-    fDStates            = new UVector(status);
-    if (U_FAILURE(*fStatus)) {
-        return;
-    }
+// Construct a table builder for the parse tree at *rootNode.
+// The address of status is stored in fStatus; status is also checked on entry.
+RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status) :
+        fRB(rb),
+        fTree(*rootNode),
+        fStatus(&status),
+        fDStates(nullptr),
+        fSafeTable(nullptr) {
+    // Nothing is allocated when the incoming status already reports a failure.
+    // NOTE(review): fDStates is then left as nullptr, while the destructor calls
+    //               fDStates->size(); confirm that path cannot be reached.
     if (U_FAILURE(status)) {
-        *fStatus = status;
         return;
     }
-    if (fDStates == NULL) {
-        *fStatus = U_MEMORY_ALLOCATION_ERROR;;
+    // fDStates is UVector<RBBIStateDescriptor *>
+    fDStates = new UVector(status);
+    // SafeTable is UVector<UnicodeString *>.  Contents owned by the UVector.
+    fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status);
+    // Report an allocation failure only if no earlier error is already recorded.
+    if (U_SUCCESS(status) && (fDStates == nullptr || fSafeTable == nullptr)) {
+        status = U_MEMORY_ALLOCATION_ERROR;
     }
 }
 
@@ -52,7 +52,8 @@ RBBITableBuilder::~RBBITableBuilder() {
     for (i=0; i<fDStates->size(); i++) {
         delete (RBBIStateDescriptor *)fDStates->elementAt(i);
     }
-    delete   fDStates;
+    delete fDStates;
+    delete fSafeTable;
 }
 
 
@@ -1277,6 +1278,89 @@ void RBBITableBuilder::exportTable(void *where) {
 }
 
 
+/**
+ *   Synthesize a safe state table (fSafeTable) from the main state table.
+ *   @param status  ICU error code; nothing is built if it already indicates a failure.
+ */
+void RBBITableBuilder::buildSafe(UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    // Find safe char class pairs: (c1, c2) is recorded when, from every start state
+    // examined (1 and up), c1 followed by c2 ends in the same state. Packed as c1 << 16 | c2.
+    UVector32 safePairs(status);
+    int32_t numCharClasses = fRB->fSetBuilder->getNumCharCategories();
+    int32_t numStates = fDStates->size();
+
+    for (int32_t c1=0; c1<numCharClasses; ++c1) {
+        for (int32_t c2=0; c2 < numCharClasses; ++c2) {
+            int32_t wantedEndState = -1;
+            int32_t endState = 0;
+            for (int32_t startState = 1; startState < numStates; ++startState) {
+                RBBIStateDescriptor *startStateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(startState));
+                int32_t s2 = startStateD->fDtran->elementAti(c1);
+                RBBIStateDescriptor *s2StateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(s2));
+                endState = s2StateD->fDtran->elementAti(c2);
+                if (wantedEndState < 0) {
+                    wantedEndState = endState;
+                } else if (wantedEndState != endState) {
+                    break;
+                }
+            }
+            if (wantedEndState == endState) {
+                int32_t pair = c1 << 16 | c2;
+                safePairs.addElement(pair, status);
+            }
+        }
+    }
+
+    // Populate the initial safe table.
+    // The table as a whole is UVector<UnicodeString>
+    // Each row is represented by a UnicodeString, being used as a Vector<int16>.
+    // Row 0 is the stop state.
+    // Row 1 is the start state.
+    // Row 2 and beyond are other states, initially one per char class, but
+    //   after initial construction, many of the states will be combined, compacting the table.
+    // Free any table left over from construction so that it is not leaked.
+    delete fSafeTable;
+    fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status);
+    if (fSafeTable == nullptr) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+    }
+    for (int32_t row=0; U_SUCCESS(status) && row<numCharClasses + 2; ++row) {
+        fSafeTable->addElement(new UnicodeString(numCharClasses+4, 0, numCharClasses+4), status);
+    }
+    if (U_FAILURE(status)) {
+        return;    // Rows may be missing; do not dereference them below.
+    }
+
+    // From the start state, each input char class transitions to the state for that input.
+    UnicodeString &startState = *(UnicodeString *)fSafeTable->elementAt(1);
+    for (int32_t charClass=0; charClass < numCharClasses; ++charClass) {
+        // Note: +2 for the start & stop state; +4 for header columns in state table.
+        startState.setCharAt(charClass+4, charClass+2);
+    }
+
+    // Initially make every other state table row look like the start state row,
+    for (int32_t row=2; row<numCharClasses+2; ++row) {
+        UnicodeString &rowState = *(UnicodeString *)fSafeTable->elementAt(row);
+        rowState = startState;   // UnicodeString assignment, copies contents.
+    }
+
+    // Run through the safe pairs, make next state to zero when pair has been seen.
+    // Zero being the stop state, meaning we found a safe point.
+    for (int32_t pairIdx=0; pairIdx<safePairs.size(); pairIdx++) {
+        int32_t pair = safePairs.elementAti(pairIdx);
+        int32_t c1 = (pair >> 16) & 0x0000ffff;
+        int32_t c2 = pair & 0x0000ffff;
+
+        UnicodeString &rowState = *(UnicodeString *)fSafeTable->elementAt(c2 + 2);
+        rowState.setCharAt(c1 + 4, 0);
+    }
+
+    // TODO: Merge similar states.
+}
 
 //-----------------------------------------------------------------------------
 //
index 09b57b5cf0f4c3f44463c0ed0a26ec148b223aa5..104d544b172444c1296bf12d3356b9fe06d9161c 100644 (file)
@@ -37,12 +37,13 @@ class UVector32;
 
 class RBBITableBuilder : public UMemory {
 public:
-    RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode);
+    RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status);
     ~RBBITableBuilder();
 
     void     build();
-    int32_t  getTableSize() const;      // Return the runtime size in bytes of
-                                        //     the built state table
+
+    /** Return the runtime size in bytes of the built state table.  */
+    int32_t  getTableSize() const;
 
     /** Fill in the runtime state table. Sufficient memory must exist at the specified location.
      */
@@ -62,6 +63,15 @@ public:
     /** Check for, and remove dupicate states (table rows). */
     void     removeDuplicateStates();
 
+    void     buildSafe(UErrorCode &status);
+
+    /** Return the runtime size in bytes of the built safe reverse state table. */
+    int32_t  getSafeTableSize() const;
+
+    /** Fill in the runtime safe state table. Sufficient memory must exist at the specified location.
+     */
+    void     exportSafeTable(void *where);
+
 
 private:
     void     calcNullable(RBBINode *n);
@@ -126,10 +136,14 @@ private:
                                            //   table for.
     UErrorCode       *fStatus;
 
+    /** State Descriptors, UVector<RBBIStateDescriptor> */
     UVector          *fDStates;            //  D states (Aho's terminology)
                                            //  Index is state number
                                            //  Contents are RBBIStateDescriptor pointers.
 
+    /** Synthesized safe table, UVector of UnicodeString, one string per table row.   */
+    UVector          *fSafeTable;
+
 
     RBBITableBuilder(const RBBITableBuilder &other); // forbid copying of this class
     RBBITableBuilder &operator=(const RBBITableBuilder &other); // forbid copying of this class
index f0ac4bc4d78a33f459a6f7b8cd1d1c525c463603..deba07a399db17c80f991d5304c7929ad03bcc1c 100644 (file)
@@ -648,6 +648,17 @@ private:
      */
     int32_t handlePrevious(int32_t fromPosition);
 
+    /**
+     * Iterate backwards from an arbitrary position in the input text using the
+     * synthesized Safe Reverse rules.
+     * This locates a "Safe Position" from which the forward break rules
+     * will operate correctly. A Safe Position is not necessarily a boundary itself.
+     *
+     * @param fromPosition the position in the input text to begin the iteration.
+     * @internal
+     */
+    int32_t handleSafePrevious(int32_t fromPosition);
+
     /**
      * Find a rule-based boundary by running the state machine.
      * Input
index cf19edf646fb0a2618cb5198c431dcb313d1e0aa..4088855b7a3ab4bdcd49c5e736f674dad4f99963 100644 (file)
@@ -518,7 +518,7 @@ sortiComparator(const void * /*context */, const void *left, const void *right)
 }
 
 /**
-  * Sort the vector, assuming it constains ints.
+  * Sort the vector, assuming it contains ints.
   *     (A more general sort would take a comparison function, but it's
   *     not clear whether UVector's UElementComparator or
   *     UComparator from uprv_sortAray would be more appropriate.)
index d1ae6599585086c5ac8d20cb0a6216d495756c4a..484777885ee202e1fcb9ad38a46fef6e67633fcf 100644 (file)
@@ -13,6 +13,7 @@
 #include "uvectr32.h"
 #include "cmemory.h"
 #include "putilimp.h"
+#include "uarrsort.h"
 
 U_NAMESPACE_BEGIN
 
@@ -328,8 +329,15 @@ void UVector32::sortedInsert(int32_t tok, UErrorCode& ec) {
 }
 
 
-
-
+/**
+  * Sort the vector, assuming it contains ints.
+  */
+void UVector32::sorti(UErrorCode &ec) {
+    if (U_FAILURE(ec)) {
+        return;     // A prior error is left untouched; nothing is sorted.
+    }
+    uprv_sortArray(elements, count, sizeof(int32_t), uprv_int32Comparator, nullptr, false, &ec);
+}
 
 U_NAMESPACE_END
 
index 9112d9fe4aa2f94a45e7f0404c4e0b80bfb172e6..7efd767ae28ffc59de08e2422f0679ce8f2efd28 100644 (file)
@@ -160,6 +160,11 @@ public:
      */
     void sortedInsert(int32_t elem, UErrorCode& ec);
 
+    /**
+     * Sort the contents of the int32_t vector.
+     */
+    void sorti(UErrorCode &ec);
+
     /**
      * Returns a pointer to the internal array holding the vector.
      */