ICU-13194 RBBI safe tables, all tests passing!

author Andy Heninger <andy.heninger@gmail.com>

Mon, 26 Mar 2018 23:01:16 +0000 (23:01 +0000)

committer Andy Heninger <andy.heninger@gmail.com>

Mon, 26 Mar 2018 23:01:16 +0000 (23:01 +0000)
author Andy Heninger <andy.heninger@gmail.com>
Mon, 26 Mar 2018 23:01:16 +0000 (23:01 +0000)
committer Andy Heninger <andy.heninger@gmail.com>
Mon, 26 Mar 2018 23:01:16 +0000 (23:01 +0000)
diff --git a/icu4c/source/common/rbbi.cpp b/icu4c/source/common/rbbi.cpp

index 83d1c0b59a9e43c620c7bd39eb92741a2fddefb6..b48f028483fe16ff5489a77f87ac56197bc9e651 100644 (file)
--- a/icu4c/source/common/rbbi.cpp
+++ b/icu4c/source/common/rbbi.cpp
@@ -651,7 +651,7 @@ UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
      }
  
      // Adjust offset to be on a code point boundary and not beyond the end of the text.
-    // Note that isBoundary() is always be false for offsets that are not on code point boundaries.
+    // Note that isBoundary() is always false for offsets that are not on code point boundaries.
      // But we still need the side effect of leaving iteration at the following boundary.
  
      utext_setNativeIndex(&fText, offset);
@@ -1116,7 +1116,7 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
      UChar32             c;
      int32_t             result          = 0;
  
-    const RBBIStateTable *stateTable = fData->fSafeRevTable;
+    const RBBIStateTable *stateTable = fData->fReverseTable;
      UTEXT_SETNATIVEINDEX(&fText, fromPosition);
      #ifdef RBBI_DEBUG
          if (gTrace) {
diff --git a/icu4c/source/common/rbbi_cache.cpp b/icu4c/source/common/rbbi_cache.cpp

index 33f03d90013f501741ab244e7ada6dd0297ebc1e..60316ce6420dc5ecf5dc96ebbb5eebdd13059b5c 100644 (file)
--- a/icu4c/source/common/rbbi_cache.cpp
+++ b/icu4c/source/common/rbbi_cache.cpp
@@ -206,7 +206,7 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
   *   BreakCache implemetation
   */
  
-RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status) : 
+RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status) :
          fBI(bi), fSideBuffer(status) {
      reset();
  }
@@ -299,7 +299,7 @@ void RuleBasedBreakIterator::BreakCache::previous(UErrorCode &status) {
      fBI->fPosition = fTextIdx;
      fBI->fRuleStatusIndex = fStatuses[fBufIdx];
      return;
-}    
+}
  
  
  UBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) {
@@ -317,7 +317,7 @@ UBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) {
          fTextIdx = fBoundaries[fBufIdx];
          return TRUE;
      }
-    
+
      int32_t min = fStartBufIdx;
      int32_t max = fEndBufIdx;
      while (min != max) {
@@ -354,20 +354,33 @@ UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorC
      if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) {
          int32_t aBoundary = 0;
          int32_t ruleStatusIndex = 0;
-        // TODO: check for position == length of text. Although may still need to back up to get rule status.
          if (position > 20) {
              int32_t backupPos = fBI->handleSafePrevious(position);
-            fBI->fPosition = backupPos;
-            aBoundary = fBI->handleNext();        // Ignore dictionary, just finding a rule based boundary.
-            if (aBoundary == backupPos + 1) {   // TODO: + 1 is wrong for supplementals.
-                // Safe rules work on pairs. +1 from start pos may be a false match.
+
+            if (backupPos > 0) {
+                // Advance to the boundary following the backup position.
+                // There is a complication: the safe reverse rules identify pairs of code points
+                // that are safe. If advancing from the safe point moves forwards by less than
+                // two code points, we need to advance one more time to ensure that the boundary
+                // is good, including a correct rules status value.
+                //
+                fBI->fPosition = backupPos;
                  aBoundary = fBI->handleNext();
+                if (aBoundary <= backupPos + 4) {
+                    // +4 is a quick test for possibly having advanced only one codepoint.
+                    // Four being the length of the longest potential code point, a supplementary in UTF-8
+                    utext_setNativeIndex(&fBI->fText, aBoundary);
+                    if (backupPos == utext_getPreviousNativeIndex(&fBI->fText)) {
+                        // The initial handleNext() only advanced by a single code point. Go again.
+                        aBoundary = fBI->handleNext();   // Safe rules identify safe pairs.
+                    }
+                }
+                ruleStatusIndex = fBI->fRuleStatusIndex;
              }
-            ruleStatusIndex = fBI->fRuleStatusIndex;
          }
          reset(aBoundary, ruleStatusIndex);        // Reset cache to hold aBoundary as a single starting point.
      }
-    
+
      // Fill in boundaries between existing cache content and the new requested position.
  
      if (fBoundaries[fEndBufIdx] < position) {
@@ -495,13 +508,24 @@ UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status)
              position = 0;
              positionStatusIdx = 0;
          } else {
-            fBI->fPosition = backupPosition;  // TODO: pass starting position in a clearer way.
-            position = fBI->handleNext();     // TODO: supplementals don't work with the +1.
-            if (position == backupPosition + 1) {
-                position = fBI->handleNext();   // Safe rules identify safe pairs.
+            // Advance to the boundary following the backup position.
+            // There is a complication: the safe reverse rules identify pairs of code points
+            // that are safe. If advancing from the safe point moves forwards by less than
+            // two code points, we need to advance one more time to ensure that the boundary
+            // is good, including a correct rules status value.
+            //
+            fBI->fPosition = backupPosition;
+            position = fBI->handleNext();
+            if (position <= backupPosition + 4) {
+                // +4 is a quick test for possibly having advanced only one codepoint.
+                // Four being the length of the longest potential code point, a supplementary in UTF-8
+                utext_setNativeIndex(&fBI->fText, position);
+                if (backupPosition == utext_getPreviousNativeIndex(&fBI->fText)) {
+                    // The initial handleNext() only advanced by a single code point. Go again.
+                    position = fBI->handleNext();   // Safe rules identify safe pairs.
+                }
              };
              positionStatusIdx = fBI->fRuleStatusIndex;
-
          }
      } while (position >= fromPosition);
  
@@ -540,7 +564,7 @@ UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status)
              }
              U_ASSERT(position==dictSegEndPosition || position>=fromPosition);
          }
-            
+
          if (!segmentHandledByDictionary && position < fromPosition) {
              fSideBuffer.addElement(position, status);
              fSideBuffer.addElement(positionStatusIdx, status);
@@ -566,7 +590,7 @@ UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status)
              break;
          }
      }
-      
+
      return success;
  }
  
diff --git a/icu4c/source/common/rbbidata.cpp b/icu4c/source/common/rbbidata.cpp

index ebd60c42a6de229e2d20109236c2d34f387ea2e1..385ab08b3295b4eafc2473b4195c8f5b0a7e1f00 100644 (file)
--- a/icu4c/source/common/rbbidata.cpp
+++ b/icu4c/source/common/rbbidata.cpp
@@ -106,7 +106,10 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
      if (data->fFTableLen != 0) {
          fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable);
      }
-    if (data->fSRTableLen != 0) {
+    if (data->fRTableLen != 0) {
+        fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable);
+    }
+    if (data->fSRTableLen != 0) {   // TODO: obsolete. Remove.
          fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable);
      }
  
diff --git a/icu4c/source/common/rbbidata.h b/icu4c/source/common/rbbidata.h

index 6a5ebede04681922e2c6c8b1e66159527340e4ee..83b96f261de4d9147d768f138f25f8d43628dfc2 100644 (file)
--- a/icu4c/source/common/rbbidata.h
+++ b/icu4c/source/common/rbbidata.h
@@ -173,7 +173,8 @@ public:
      /*                                     */
      const RBBIDataHeader     *fHeader;
      const RBBIStateTable     *fForwardTable;
-    const RBBIStateTable     *fSafeRevTable;
+    const RBBIStateTable     *fReverseTable;      // auto-generated safe reverse.
+    const RBBIStateTable     *fSafeRevTable;      // hand-written safe reverse. TODO: delete this.
      const UChar              *fRuleSource;
      const int32_t            *fRuleStatusTable; 
  
diff --git a/icu4c/source/common/rbbirb.cpp b/icu4c/source/common/rbbirb.cpp

index 043cef25f1536fd7f600254eb9c532ab80fb6461..4a3698893851dafb8b7244b7d60d73339d0b2bec 100644 (file)
--- a/icu4c/source/common/rbbirb.cpp
+++ b/icu4c/source/common/rbbirb.cpp
@@ -300,10 +300,9 @@ RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
      fForwardTables->buildSafe(status);
  
  
-    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
  #ifdef RBBI_DEBUG
      if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
-        fForwardTables
+        fForwardTables->printStates();
          fForwardTables->printRuleStatusTable();
          fForwardTables->printSafeTable();
      }
diff --git a/icu4c/source/common/rbbitblb.cpp b/icu4c/source/common/rbbitblb.cpp

index dd84b4c27b73246ab40698ba4081a1301a116abf..c3bcc722293e1d97fe6728dab4d399235ca66db6 100644 (file)
--- a/icu4c/source/common/rbbitblb.cpp
+++ b/icu4c/source/common/rbbitblb.cpp
@@ -1312,10 +1312,10 @@ void RBBITableBuilder::buildSafe(UErrorCode &status) {
              if (wantedEndState == endState) {
                  int32_t pair = c1 << 16 | c2;
                  safePairs.addElement(pair, status);
-                printf("(%d, %d) ", c1, c2);
+                // printf("(%d, %d) ", c1, c2);
              }
          }
-        printf("\n");
+        // printf("\n");
      }
  
      // Populate the initial safe table.
@@ -1358,7 +1358,7 @@ void RBBITableBuilder::buildSafe(UErrorCode &status) {
          rowState.setCharAt(c1, 0);
      }
  
-    // Merge similar states.
+    // TODO: Merge similar states.
  
  }
  
@@ -1392,9 +1392,9 @@ int32_t  RBBITableBuilder::getSafeTableSize() const {
  
  //-----------------------------------------------------------------------------
  //
-//   exportTable()    export the state transition table in the format required
-//                    by the runtime engine.  getTableSize() bytes of memory
-//                    must be available at the output address "where".
+//   exportSafeTable()   export the state transition table in the format required
+//                       by the runtime engine.  getTableSize() bytes of memory
+//                       must be available at the output address "where".
  //
  //-----------------------------------------------------------------------------
  void RBBITableBuilder::exportSafeTable(void *where) {
diff --git a/icu4c/source/test/cintltst/cbiapts.c b/icu4c/source/test/cintltst/cbiapts.c

index 9f68f0b7f7ed5a8db739d13ebd8d9e6b6b8b814a..fa78413be498f7967acfc4d0efc8de5248670668 100644 (file)
--- a/icu4c/source/test/cintltst/cbiapts.c
+++ b/icu4c/source/test/cintltst/cbiapts.c
@@ -308,14 +308,14 @@ static void TestBreakIteratorCAPI()
  
  
      log_verbose("\nTesting the functions for sentence\n");
-    ubrk_first(sentence);
+    pos = ubrk_first(sentence);
      pos = ubrk_current(sentence);
      log_verbose("Current(sentence) = %d\n", (int32_t)pos);
         pos = ubrk_last(sentence);
      if(pos!=49)
          log_err("error ubrk_last for sentence did not return 49\n");
      log_verbose("Last (sentence) = %d\n", (int32_t)pos);
-    ubrk_first(sentence);
+    pos = ubrk_first(sentence);
      to = ubrk_following( sentence, 0 );
      if (to == 0) log_err("ubrk_following returned 0\n");
      to = ubrk_preceding( sentence, to );
diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp

index 23ecf10e9276e21efaa19be2b4f12d8b82de9a6c..ddd809cb8ec89f5a2be0c8da86ec7e86581063a9 100644 (file)
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@@ -4551,40 +4551,6 @@ void RBBITest::TestDebug(void) {
      UParseError pe;
      LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
      assertSuccess(WHERE, status);
-
-#if 0
-    bi->dumpTables();
-
-    RBBIDataWrapper *dw = bi->fData;
-    const RBBIStateTable *fwtbl = dw->fForwardTable;
-    int32_t numCharClasses = dw->fHeader->fCatCount;
-    printf("Char Classes: %d     states: %d\n", numCharClasses, fwtbl->fNumStates);
-
-    for (int32_t c1=0; c1<numCharClasses; ++c1) {
-        for (int32_t c2=0; c2 < numCharClasses; ++c2) {
-            int32_t wantedEndState = -1;
-            int32_t endState = 0;
-            for (int32_t startState = 1; startState < (int32_t)fwtbl->fNumStates; ++startState) {
-                RBBIStateTableRow  *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * startState));
-                int32_t s2 = row->fNextState[c1];
-                row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * s2));
-                endState = row->fNextState[c2];
-                if (wantedEndState < 0) {
-                    wantedEndState = endState;
-                } else {
-                    if (wantedEndState != endState) {
-                        break;
-                    }
-                }
-            }
-            if (wantedEndState == endState) {
-                printf("(%d, %d) ", c1, c2);
-            }
-        }
-        printf("\n");
-    }
-    printf("\n");
-#endif
  }
  
  void RBBITest::TestProperties() {
diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt

index 38a69fd974256220f87c28e21120671463cf7348..1ed2d17228f1d4954c86300a0ed9d12a19ba3ea1 100644 (file)
--- a/icu4c/source/test/testdata/rbbitst.txt
+++ b/icu4c/source/test/testdata/rbbitst.txt
@@ -39,7 +39,9 @@
  
  #   Temp debugging tests
  #
-
+<word>
+<data>•
+•</data>
  
  ## FILTERED BREAK TESTS
author	Andy Heninger <andy.heninger@gmail.com>
	Mon, 26 Mar 2018 23:01:16 +0000 (23:01 +0000)
committer	Andy Heninger <andy.heninger@gmail.com>
	Mon, 26 Mar 2018 23:01:16 +0000 (23:01 +0000)
icu4c/source/common/rbbi.cpp		patch \| blob \| history
icu4c/source/common/rbbi_cache.cpp		patch \| blob \| history
icu4c/source/common/rbbidata.cpp		patch \| blob \| history
icu4c/source/common/rbbidata.h		patch \| blob \| history
icu4c/source/common/rbbirb.cpp		patch \| blob \| history
icu4c/source/common/rbbitblb.cpp		patch \| blob \| history
icu4c/source/test/cintltst/cbiapts.c		patch \| blob \| history
icu4c/source/test/intltest/rbbitst.cpp		patch \| blob \| history
icu4c/source/test/testdata/rbbitst.txt		patch \| blob \| history