]> granicus.if.org Git - icu/commitdiff
ICU-13194 RBBI safe tables, all tests passing!
authorAndy Heninger <andy.heninger@gmail.com>
Mon, 26 Mar 2018 23:01:16 +0000 (23:01 +0000)
committerAndy Heninger <andy.heninger@gmail.com>
Mon, 26 Mar 2018 23:01:16 +0000 (23:01 +0000)
X-SVN-Rev: 41155

icu4c/source/common/rbbi.cpp
icu4c/source/common/rbbi_cache.cpp
icu4c/source/common/rbbidata.cpp
icu4c/source/common/rbbidata.h
icu4c/source/common/rbbirb.cpp
icu4c/source/common/rbbitblb.cpp
icu4c/source/test/cintltst/cbiapts.c
icu4c/source/test/intltest/rbbitst.cpp
icu4c/source/test/testdata/rbbitst.txt

index 83d1c0b59a9e43c620c7bd39eb92741a2fddefb6..b48f028483fe16ff5489a77f87ac56197bc9e651 100644 (file)
@@ -651,7 +651,7 @@ UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
     }
 
     // Adjust offset to be on a code point boundary and not beyond the end of the text.
-    // Note that isBoundary() is always be false for offsets that are not on code point boundaries.
+    // Note that isBoundary() is always false for offsets that are not on code point boundaries.
     // But we still need the side effect of leaving iteration at the following boundary.
 
     utext_setNativeIndex(&fText, offset);
@@ -1116,7 +1116,7 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
     UChar32             c;
     int32_t             result          = 0;
 
-    const RBBIStateTable *stateTable = fData->fSafeRevTable;
+    const RBBIStateTable *stateTable = fData->fReverseTable;
     UTEXT_SETNATIVEINDEX(&fText, fromPosition);
     #ifdef RBBI_DEBUG
         if (gTrace) {
index 33f03d90013f501741ab244e7ada6dd0297ebc1e..60316ce6420dc5ecf5dc96ebbb5eebdd13059b5c 100644 (file)
@@ -206,7 +206,7 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
  *   BreakCache implemetation
  */
 
-RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status) : 
+RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status) :
         fBI(bi), fSideBuffer(status) {
     reset();
 }
@@ -299,7 +299,7 @@ void RuleBasedBreakIterator::BreakCache::previous(UErrorCode &status) {
     fBI->fPosition = fTextIdx;
     fBI->fRuleStatusIndex = fStatuses[fBufIdx];
     return;
-}    
+}
 
 
 UBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) {
@@ -317,7 +317,7 @@ UBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) {
         fTextIdx = fBoundaries[fBufIdx];
         return TRUE;
     }
-    
+
     int32_t min = fStartBufIdx;
     int32_t max = fEndBufIdx;
     while (min != max) {
@@ -354,20 +354,33 @@ UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorC
     if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) {
         int32_t aBoundary = 0;
         int32_t ruleStatusIndex = 0;
-        // TODO: check for position == length of text. Although may still need to back up to get rule status.
         if (position > 20) {
             int32_t backupPos = fBI->handleSafePrevious(position);
-            fBI->fPosition = backupPos;
-            aBoundary = fBI->handleNext();        // Ignore dictionary, just finding a rule based boundary.
-            if (aBoundary == backupPos + 1) {   // TODO: + 1 is wrong for supplementals.
-                // Safe rules work on pairs. +1 from start pos may be a false match.
+
+            if (backupPos > 0) {
+                // Advance to the boundary following the backup position.
+                // There is a complication: the safe reverse rules identify pairs of code points
+                // that are safe. If advancing from the safe point moves forwards by less than
+                // two code points, we need to advance one more time to ensure that the boundary
+                // is good, including a correct rules status value.
+                //
+                fBI->fPosition = backupPos;
                 aBoundary = fBI->handleNext();
+                if (aBoundary <= backupPos + 4) {
+                    // +4 is a quick test for possibly having advanced only one codepoint.
+                    // Four being the length of the longest potential code point, a supplementary in UTF-8
+                    utext_setNativeIndex(&fBI->fText, aBoundary);
+                    if (backupPos == utext_getPreviousNativeIndex(&fBI->fText)) {
+                        // The initial handleNext() only advanced by a single code point. Go again.
+                        aBoundary = fBI->handleNext();   // Safe rules identify safe pairs.
+                    }
+                }
+                ruleStatusIndex = fBI->fRuleStatusIndex;
             }
-            ruleStatusIndex = fBI->fRuleStatusIndex;
         }
         reset(aBoundary, ruleStatusIndex);        // Reset cache to hold aBoundary as a single starting point.
     }
-    
+
     // Fill in boundaries between existing cache content and the new requested position.
 
     if (fBoundaries[fEndBufIdx] < position) {
@@ -495,13 +508,24 @@ UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status)
             position = 0;
             positionStatusIdx = 0;
         } else {
-            fBI->fPosition = backupPosition;  // TODO: pass starting position in a clearer way.
-            position = fBI->handleNext();     // TODO: supplementals don't work with the +1.
-            if (position == backupPosition + 1) {
-                position = fBI->handleNext();   // Safe rules identify safe pairs.
+            // Advance to the boundary following the backup position.
+            // There is a complication: the safe reverse rules identify pairs of code points
+            // that are safe. If advancing from the safe point moves forwards by less than
+            // two code points, we need to advance one more time to ensure that the boundary
+            // is good, including a correct rules status value.
+            //
+            fBI->fPosition = backupPosition;
+            position = fBI->handleNext();
+            if (position <= backupPosition + 4) {
+                // +4 is a quick test for possibly having advanced only one codepoint.
+                // Four being the length of the longest potential code point, a supplementary in UTF-8
+                utext_setNativeIndex(&fBI->fText, position);
+                if (backupPosition == utext_getPreviousNativeIndex(&fBI->fText)) {
+                    // The initial handleNext() only advanced by a single code point. Go again.
+                    position = fBI->handleNext();   // Safe rules identify safe pairs.
+                }
             };
             positionStatusIdx = fBI->fRuleStatusIndex;
-
         }
     } while (position >= fromPosition);
 
@@ -540,7 +564,7 @@ UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status)
             }
             U_ASSERT(position==dictSegEndPosition || position>=fromPosition);
         }
-            
+
         if (!segmentHandledByDictionary && position < fromPosition) {
             fSideBuffer.addElement(position, status);
             fSideBuffer.addElement(positionStatusIdx, status);
@@ -566,7 +590,7 @@ UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status)
             break;
         }
     }
-      
+
     return success;
 }
 
index ebd60c42a6de229e2d20109236c2d34f387ea2e1..385ab08b3295b4eafc2473b4195c8f5b0a7e1f00 100644 (file)
@@ -106,7 +106,10 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
     if (data->fFTableLen != 0) {
         fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable);
     }
-    if (data->fSRTableLen != 0) {
+    if (data->fRTableLen != 0) {
+        fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable);
+    }
+    if (data->fSRTableLen != 0) {   // TODO: obsolete. Remove.
         fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable);
     }
 
index 6a5ebede04681922e2c6c8b1e66159527340e4ee..83b96f261de4d9147d768f138f25f8d43628dfc2 100644 (file)
@@ -173,7 +173,8 @@ public:
     /*                                     */
     const RBBIDataHeader     *fHeader;
     const RBBIStateTable     *fForwardTable;
-    const RBBIStateTable     *fSafeRevTable;
+    const RBBIStateTable     *fReverseTable;      // auto-generated safe reverse.
+    const RBBIStateTable     *fSafeRevTable;      // hand-written safe reverse. TODO: delete this.
     const UChar              *fRuleSource;
     const int32_t            *fRuleStatusTable; 
 
index 043cef25f1536fd7f600254eb9c532ab80fb6461..4a3698893851dafb8b7244b7d60d73339d0b2bec 100644 (file)
@@ -300,10 +300,9 @@ RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
     fForwardTables->buildSafe(status);
 
 
-    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
 #ifdef RBBI_DEBUG
     if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
-        fForwardTables
+        fForwardTables->printStates();
         fForwardTables->printRuleStatusTable();
         fForwardTables->printSafeTable();
     }
index dd84b4c27b73246ab40698ba4081a1301a116abf..c3bcc722293e1d97fe6728dab4d399235ca66db6 100644 (file)
@@ -1312,10 +1312,10 @@ void RBBITableBuilder::buildSafe(UErrorCode &status) {
             if (wantedEndState == endState) {
                 int32_t pair = c1 << 16 | c2;
                 safePairs.addElement(pair, status);
-                printf("(%d, %d) ", c1, c2);
+                // printf("(%d, %d) ", c1, c2);
             }
         }
-        printf("\n");
+        // printf("\n");
     }
 
     // Populate the initial safe table.
@@ -1358,7 +1358,7 @@ void RBBITableBuilder::buildSafe(UErrorCode &status) {
         rowState.setCharAt(c1, 0);
     }
 
-    // Merge similar states.
+    // TODO: Merge similar states.
 
 }
 
@@ -1392,9 +1392,9 @@ int32_t  RBBITableBuilder::getSafeTableSize() const {
 
 //-----------------------------------------------------------------------------
 //
-//   exportTable()    export the state transition table in the format required
-//                    by the runtime engine.  getTableSize() bytes of memory
-//                    must be available at the output address "where".
+//   exportSafeTable()   export the state transition table in the format required
+//                       by the runtime engine.  getTableSize() bytes of memory
+//                       must be available at the output address "where".
 //
 //-----------------------------------------------------------------------------
 void RBBITableBuilder::exportSafeTable(void *where) {
index 9f68f0b7f7ed5a8db739d13ebd8d9e6b6b8b814a..fa78413be498f7967acfc4d0efc8de5248670668 100644 (file)
@@ -308,14 +308,14 @@ static void TestBreakIteratorCAPI()
 
 
     log_verbose("\nTesting the functions for sentence\n");
-    ubrk_first(sentence);
+    pos = ubrk_first(sentence);
     pos = ubrk_current(sentence);
     log_verbose("Current(sentence) = %d\n", (int32_t)pos);
        pos = ubrk_last(sentence);
     if(pos!=49)
         log_err("error ubrk_last for sentence did not return 49\n");
     log_verbose("Last (sentence) = %d\n", (int32_t)pos);
-    ubrk_first(sentence);
+    pos = ubrk_first(sentence);
     to = ubrk_following( sentence, 0 );
     if (to == 0) log_err("ubrk_following returned 0\n");
     to = ubrk_preceding( sentence, to );
index 23ecf10e9276e21efaa19be2b4f12d8b82de9a6c..ddd809cb8ec89f5a2be0c8da86ec7e86581063a9 100644 (file)
@@ -4551,40 +4551,6 @@ void RBBITest::TestDebug(void) {
     UParseError pe;
     LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
     assertSuccess(WHERE, status);
-
-#if 0
-    bi->dumpTables();
-
-    RBBIDataWrapper *dw = bi->fData;
-    const RBBIStateTable *fwtbl = dw->fForwardTable;
-    int32_t numCharClasses = dw->fHeader->fCatCount;
-    printf("Char Classes: %d     states: %d\n", numCharClasses, fwtbl->fNumStates);
-
-    for (int32_t c1=0; c1<numCharClasses; ++c1) {
-        for (int32_t c2=0; c2 < numCharClasses; ++c2) {
-            int32_t wantedEndState = -1;
-            int32_t endState = 0;
-            for (int32_t startState = 1; startState < (int32_t)fwtbl->fNumStates; ++startState) {
-                RBBIStateTableRow  *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * startState));
-                int32_t s2 = row->fNextState[c1];
-                row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * s2));
-                endState = row->fNextState[c2];
-                if (wantedEndState < 0) {
-                    wantedEndState = endState;
-                } else {
-                    if (wantedEndState != endState) {
-                        break;
-                    }
-                }
-            }
-            if (wantedEndState == endState) {
-                printf("(%d, %d) ", c1, c2);
-            }
-        }
-        printf("\n");
-    }
-    printf("\n");
-#endif
 }
 
 void RBBITest::TestProperties() {
index 38a69fd974256220f87c28e21120671463cf7348..1ed2d17228f1d4954c86300a0ed9d12a19ba3ea1 100644 (file)
@@ -39,7 +39,9 @@
 
 #   Temp debugging tests
 #
-
+<word>
+<data>•
+•</data>
 
 ## FILTERED BREAK TESTS