]> granicus.if.org Git - icu/commitdiff
ICU-13194 RBBI safe rules, work in progress.
authorAndy Heninger <andy.heninger@gmail.com>
Thu, 22 Mar 2018 17:31:00 +0000 (17:31 +0000)
committerAndy Heninger <andy.heninger@gmail.com>
Thu, 22 Mar 2018 17:31:00 +0000 (17:31 +0000)
X-SVN-Rev: 41135

icu4c/source/common/rbbi_cache.cpp
icu4c/source/common/rbbirb.cpp
icu4c/source/common/rbbitblb.cpp
icu4c/source/common/rbbitblb.h
icu4c/source/test/intltest/rbbitst.cpp
icu4c/source/test/testdata/rbbitst.txt

index 44c66fbea363eb9e04350f8c072372c33875e14c..33f03d90013f501741ab244e7ada6dd0297ebc1e 100644 (file)
@@ -356,12 +356,16 @@ UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorC
         int32_t ruleStatusIndex = 0;
         // TODO: check for position == length of text. Although may still need to back up to get rule status.
         if (position > 20) {
-            int32_t backupPos = fBI->handlePrevious(position);
+            int32_t backupPos = fBI->handleSafePrevious(position);
             fBI->fPosition = backupPos;
-            aBoundary = fBI->handleNext();                // Ignore dictionary, just finding a rule based boundary.
+            aBoundary = fBI->handleNext();        // Ignore dictionary, just finding a rule based boundary.
+            if (aBoundary == backupPos + 1) {   // TODO: + 1 is wrong for supplementals.
+                // Safe rules work on pairs. +1 from start pos may be a false match.
+                aBoundary = fBI->handleNext();
+            }
             ruleStatusIndex = fBI->fRuleStatusIndex;
         }
-        reset(aBoundary, ruleStatusIndex);               // Reset cache to hold aBoundary as a single starting point.
+        reset(aBoundary, ruleStatusIndex);        // Reset cache to hold aBoundary as a single starting point.
     }
     
     // Fill in boundaries between existing cache content and the new requested position.
@@ -485,14 +489,17 @@ UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status)
         if (backupPosition <= 0) {
             backupPosition = 0;
         } else {
-            backupPosition = fBI->handlePrevious(backupPosition);
+            backupPosition = fBI->handleSafePrevious(backupPosition);
         }
         if (backupPosition == UBRK_DONE || backupPosition == 0) {
             position = 0;
             positionStatusIdx = 0;
         } else {
             fBI->fPosition = backupPosition;  // TODO: pass starting position in a clearer way.
-            position = fBI->handleNext();
+            position = fBI->handleNext();     // TODO: supplementals don't work with the +1.
+            if (position == backupPosition + 1) {
+                position = fBI->handleNext();   // Safe rules identify safe pairs.
+            };
             positionStatusIdx = fBI->fRuleStatusIndex;
 
         }
index ac629c6222cd039df0a23f77372af5e5a098f54e..043cef25f1536fd7f600254eb9c532ab80fb6461 100644 (file)
@@ -154,13 +154,15 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
     //
     int32_t headerSize        = align8(sizeof(RBBIDataHeader));
     int32_t forwardTableSize  = align8(fForwardTables->getTableSize());
-    int32_t safeRevTableSize  = align8(fSafeRevTables->getTableSize());
+    int32_t reverseTableSize  = align8(fForwardTables->getSafeTableSize());
+    int32_t safeRevTableSize  = align8(fSafeRevTables->getTableSize());  // TODO: remove hand-written rules.
     int32_t trieSize          = align8(fSetBuilder->getTrieSize());
     int32_t statusTableSize   = align8(fRuleStatusVals->size() * sizeof(int32_t));
     int32_t rulesSize         = align8((fStrippedRules.length()+1) * sizeof(UChar));
 
     int32_t         totalSize = headerSize
-                                + forwardTableSize 
+                                + forwardTableSize
+                                + reverseTableSize
                                 + safeRevTableSize
                                 + statusTableSize + trieSize + rulesSize;
 
@@ -180,27 +182,18 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
     data->fLength           = totalSize;
     data->fCatCount         = fSetBuilder->getNumCharCategories();
 
-    // Only save the forward table and the safe reverse table,
-    // because these are the only ones used at run-time.
-    //
-    // For the moment, we still build the other tables if they are present in the rule source files,
-    // for backwards compatibility. Old rule files need to work, and this is the simplest approach.
-    //
-    // Additional backwards compatibility consideration: if no safe rules are provided, consider the
-    // reverse rules to actually be the safe reverse rules.
-
     data->fFTable        = headerSize;
     data->fFTableLen     = forwardTableSize;
 
-    // Do not save Reverse Table.
-    data->fRTable        = data->fFTable  + forwardTableSize;
-    data->fRTableLen     = 0;
+    data->fRTable        = data->fFTable  + data->fFTableLen;
+    data->fRTableLen     = reverseTableSize;
 
     // Do not save the Safe Forward table.
-    data->fSFTable       = data->fRTable + 0;
+    data->fSFTable       = data->fRTable + data->fRTableLen;
     data->fSFTableLen    = 0;
 
-    data->fSRTable       = data->fSFTable + 0;
+    // Hand written reverse rules. TODO: remove, once synthesized ones are working.
+    data->fSRTable       = data->fSFTable + data->fSFTableLen;
     data->fSRTableLen    = safeRevTableSize;
     U_ASSERT(safeRevTableSize > 0);
  
@@ -214,6 +207,7 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
     uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
 
     fForwardTables->exportTable((uint8_t *)data + data->fFTable);
+    fForwardTables->exportSafeTable((uint8_t *)data + data->fRTable);
     fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
 
     fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
@@ -297,22 +291,24 @@ RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
     if (fForwardTables == nullptr || fSafeRevTables == nullptr)
     {
         status = U_MEMORY_ALLOCATION_ERROR;
-        delete fForwardTables; fForwardTables = nullptr;
-        delete fSafeRevTables; fSafeRevTables = nullptr;
         return nullptr;
     }
 
     fForwardTables->build();
-    fForwardTables->buildSafe(status);
     fSafeRevTables->build();
+    optimizeTables();
+    fForwardTables->buildSafe(status);
 
+
+    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
 #ifdef RBBI_DEBUG
     if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
+        fForwardTables
         fForwardTables->printRuleStatusTable();
+        fForwardTables->printSafeTable();
     }
 #endif
 
-    optimizeTables();
     fSetBuilder->buildTrie();
 
     //
index ee2d172c18542e70a03e88f5f48b86c86e54d8f5..dd84b4c27b73246ab40698ba4081a1301a116abf 100644 (file)
@@ -38,10 +38,8 @@ RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UEr
     }
     // fDStates is UVector<RBBIStateDescriptor *>
     fDStates = new UVector(status);
-    // SafeTable is UVector<UnicodeString *>.  Contents owned by the UVector.
-    fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status);
-    if (U_SUCCESS(status) && (fDStates == nullptr || fSafeTable == nullptr)) {
-        status = U_MEMORY_ALLOCATION_ERROR;;
+    if (U_SUCCESS(status) && fDStates == nullptr ) {
+        status = U_MEMORY_ALLOCATION_ERROR;
     }
 }
 
@@ -190,8 +188,6 @@ void  RBBITableBuilder::build() {
     //    for all tables.  Merge the ones from this table into the global set.
     //
     mergeRuleStatusVals();
-
-    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
 }
 
 
@@ -1316,10 +1312,10 @@ void RBBITableBuilder::buildSafe(UErrorCode &status) {
             if (wantedEndState == endState) {
                 int32_t pair = c1 << 16 | c2;
                 safePairs.addElement(pair, status);
-                // printf("(%d, %d) ", c1, c2);
+                printf("(%d, %d) ", c1, c2);
             }
         }
-        //printf("\n");
+        printf("\n");
     }
 
     // Populate the initial safe table.
@@ -1329,21 +1325,25 @@ void RBBITableBuilder::buildSafe(UErrorCode &status) {
     // Row 1 is the start sate.
     // Row 2 and beyond are other states, initially one per char class, but
     //   after initial construction, many of the states will be combined, compacting the table.)
+    // The String holds the nextState data only. The four leading fields of a row, fAccepting,
+    // fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
+
+    U_ASSERT(fSafeTable == nullptr);
     fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status);
     for (int32_t row=0; row<numCharClasses + 2; ++row) {
-        fSafeTable->addElement(new UnicodeString(numCharClasses+4, 0, numCharClasses+4), status);
+        fSafeTable->addElement(new UnicodeString(numCharClasses, 0, numCharClasses+4), status);
     }
 
     // From the start state, each input char class transitions to the state for that input.
     UnicodeString &startState = *(UnicodeString *)fSafeTable->elementAt(1);
     for (int32_t charClass=0; charClass < numCharClasses; ++charClass) {
-        // Note: +2 for the start & stop state; +4 for header columns in state table.
-        startState.setCharAt(charClass+4, charClass+2);
+        // Note: +2 for the start & stop state.
+        startState.setCharAt(charClass, charClass+2);
     }
 
     // Initially make every other state table row look like the start state row,
     for (int32_t row=2; row<numCharClasses+2; ++row) {
-        UnicodeString &rowState = *(UnicodeString *)fSafeTable->elementAt(1);
+        UnicodeString &rowState = *(UnicodeString *)fSafeTable->elementAt(row);
         rowState = startState;   // UnicodeString assignment, copies contents.
     }
 
@@ -1355,13 +1355,85 @@ void RBBITableBuilder::buildSafe(UErrorCode &status) {
         int32_t c2 = pair & 0x0000ffff;
 
         UnicodeString &rowState = *(UnicodeString *)fSafeTable->elementAt(c2 + 2);
-        rowState.setCharAt(c1 + 4, 0);
+        rowState.setCharAt(c1, 0);
     }
 
     // Merge similar states.
 
 }
 
+
+//-----------------------------------------------------------------------------
+//
+//   getSafeTableSize()    Calculate the size of the runtime form of this
+//                         safe state table.
+//
+//-----------------------------------------------------------------------------
+int32_t  RBBITableBuilder::getSafeTableSize() const {
+    int32_t    size = 0;
+    int32_t    numRows;
+    int32_t    numCols;
+    int32_t    rowSize;
+
+    if (fSafeTable == nullptr) {
+        return 0;
+    }
+
+    size    = offsetof(RBBIStateTable, fTableData);    // The header, with no rows to the table.
+
+    numRows = fSafeTable->size();
+    numCols = fRB->fSetBuilder->getNumCharCategories();
+
+    rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
+    size   += numRows * rowSize;
+    return size;
+}
+
+
+//-----------------------------------------------------------------------------
+//
+//   exportTable()    export the state transition table in the format required
+//                    by the runtime engine.  getTableSize() bytes of memory
+//                    must be available at the output address "where".
+//
+//-----------------------------------------------------------------------------
+void RBBITableBuilder::exportSafeTable(void *where) {
+    RBBIStateTable    *table = (RBBIStateTable *)where;
+    uint32_t           state;
+    int                col;
+
+    if (U_FAILURE(*fStatus) || fSafeTable == nullptr) {
+        return;
+    }
+
+    int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
+    if (catCount > 0x7fff ||
+            fSafeTable->size() > 0x7fff) {
+        *fStatus = U_BRK_INTERNAL_ERROR;
+        return;
+    }
+
+    table->fRowLen    = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
+    table->fNumStates = fSafeTable->size();
+    table->fFlags     = 0;
+    table->fReserved  = 0;
+
+    for (state=0; state<table->fNumStates; state++) {
+        UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(state);
+        RBBIStateTableRow   *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
+        row->fAccepting = 0;
+        row->fLookAhead = 0;
+        row->fTagIdx    = 0;
+        row->fReserved  = 0;
+        for (col=0; col<catCount; col++) {
+            row->fNextState[col] = rowString->charAt(col);
+        }
+    }
+}
+
+
+
+
 //-----------------------------------------------------------------------------
 //
 //   printSet    Debug function.   Print the contents of a UVector
@@ -1415,6 +1487,47 @@ void RBBITableBuilder::printStates() {
 #endif
 
 
+//-----------------------------------------------------------------------------
+//
+//   printSafeTable    Debug Function.  Dump the fully constructed safe table.
+//
+//-----------------------------------------------------------------------------
+#ifdef RBBI_DEBUG
+void RBBITableBuilder::printSafeTable() {
+    int     c;    // input "character"
+    int     n;    // state number
+
+    RBBIDebugPrintf("    Safe Reverse Table \n");
+    if (fSafeTable == nullptr) {
+        RBBIDebugPrintf("   --- nullptr ---\n");
+        return;
+    }
+    RBBIDebugPrintf("state |           i n p u t     s y m b o l s \n");
+    RBBIDebugPrintf("      | Acc  LA    Tag");
+    for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
+        RBBIDebugPrintf(" %2d", c);
+    }
+    RBBIDebugPrintf("\n");
+    RBBIDebugPrintf("      |---------------");
+    for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
+        RBBIDebugPrintf("---");
+    }
+    RBBIDebugPrintf("\n");
+
+    for (n=0; n<fSafeTable->size(); n++) {
+        UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(n);
+        RBBIDebugPrintf("  %3d | " , n);
+        RBBIDebugPrintf("%3d %3d %5d ", 0, 0, 0);  // Accepting, LookAhead, Tags
+        for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
+            RBBIDebugPrintf(" %2d", rowString->charAt(c));
+        }
+        RBBIDebugPrintf("\n");
+    }
+    RBBIDebugPrintf("\n\n");
+}
+#endif
+
+
 
 //-----------------------------------------------------------------------------
 //
index 104d544b172444c1296bf12d3356b9fe06d9161c..6b5bb86196d5e7b1ea532624a36c33087614d6d7 100644 (file)
@@ -123,11 +123,13 @@ public:
     void     printPosSets(RBBINode *n /* = NULL*/);
     void     printStates();
     void     printRuleStatusTable();
+    void     printSafeTable();
 #else
     #define  printSet(s)
     #define  printPosSets(n)
     #define  printStates()
     #define  printRuleStatusTable()
+    #define printSafeTable()
 #endif
 
 private:
index 02785167d76fbe5d8f322b6b1575c7a33aa31784..23ecf10e9276e21efaa19be2b4f12d8b82de9a6c 100644 (file)
@@ -4544,6 +4544,47 @@ void RBBITest::TestBug13447() {
 //                  for tracing  without a lot of unwanted extra stuff happening.
 //
 void RBBITest::TestDebug(void) {
+    UErrorCode status = U_ZERO_ERROR;
+    LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
+            BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
+    const UnicodeString &rules = bi->getRules();
+    UParseError pe;
+    LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
+    assertSuccess(WHERE, status);
+
+#if 0
+    bi->dumpTables();
+
+    RBBIDataWrapper *dw = bi->fData;
+    const RBBIStateTable *fwtbl = dw->fForwardTable;
+    int32_t numCharClasses = dw->fHeader->fCatCount;
+    printf("Char Classes: %d     states: %d\n", numCharClasses, fwtbl->fNumStates);
+
+    for (int32_t c1=0; c1<numCharClasses; ++c1) {
+        for (int32_t c2=0; c2 < numCharClasses; ++c2) {
+            int32_t wantedEndState = -1;
+            int32_t endState = 0;
+            for (int32_t startState = 1; startState < (int32_t)fwtbl->fNumStates; ++startState) {
+                RBBIStateTableRow  *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * startState));
+                int32_t s2 = row->fNextState[c1];
+                row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * s2));
+                endState = row->fNextState[c2];
+                if (wantedEndState < 0) {
+                    wantedEndState = endState;
+                } else {
+                    if (wantedEndState != endState) {
+                        break;
+                    }
+                }
+            }
+            if (wantedEndState == endState) {
+                printf("(%d, %d) ", c1, c2);
+            }
+        }
+        printf("\n");
+    }
+    printf("\n");
+#endif
 }
 
 void RBBITest::TestProperties() {
index 761b3e01b5b8bb7bc4a97d3b49985991af17c405..38a69fd974256220f87c28e21120671463cf7348 100644 (file)
@@ -1370,7 +1370,7 @@ Bangkok)•</data>
 <rules>
 !!forward;
     Hello\ World;
-!!reverse;
+!!safe_reverse;
     .*;
 </rules>
 <data>•Hello World•</data>
@@ -1379,7 +1379,7 @@ Bangkok)•</data>
 !!quoted_literals_only;
 !!forward;
     Hello\ World;
-!!reverse;
+!!safe_reverse;
     .*;
 </badrules>
 
@@ -1387,7 +1387,7 @@ Bangkok)•</data>
 !!quoted_literals_only;
 !!forward;
     'Hello World';
-!!reverse;
+!!safe_reverse;
     .*;
 </rules>
 <data>•Hello World•</data>
@@ -1399,7 +1399,7 @@ Bangkok)•</data>
 <rules>
 !!forward;
 .;
-!!reverse;
+!!safe_reverse;
 .*;
 </rules>
 <data>•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•</data>