From: Andy Heninger Date: Mon, 26 Mar 2018 23:01:16 +0000 (+0000) Subject: ICU-13194 RBBI safe tables, all tests passing! X-Git-Tag: release-62-rc~204^2~16 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=b1b0be93ea29c41dbd89fc7367a177187f804068;p=icu ICU-13194 RBBI safe tables, all tests passing! X-SVN-Rev: 41155 --- diff --git a/icu4c/source/common/rbbi.cpp b/icu4c/source/common/rbbi.cpp index 83d1c0b59a9..b48f028483f 100644 --- a/icu4c/source/common/rbbi.cpp +++ b/icu4c/source/common/rbbi.cpp @@ -651,7 +651,7 @@ UBool RuleBasedBreakIterator::isBoundary(int32_t offset) { } // Adjust offset to be on a code point boundary and not beyond the end of the text. - // Note that isBoundary() is always be false for offsets that are not on code point boundaries. + // Note that isBoundary() is always false for offsets that are not on code point boundaries. // But we still need the side effect of leaving iteration at the following boundary. utext_setNativeIndex(&fText, offset); @@ -1116,7 +1116,7 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) { UChar32 c; int32_t result = 0; - const RBBIStateTable *stateTable = fData->fSafeRevTable; + const RBBIStateTable *stateTable = fData->fReverseTable; UTEXT_SETNATIVEINDEX(&fText, fromPosition); #ifdef RBBI_DEBUG if (gTrace) { diff --git a/icu4c/source/common/rbbi_cache.cpp b/icu4c/source/common/rbbi_cache.cpp index 33f03d90013..60316ce6420 100644 --- a/icu4c/source/common/rbbi_cache.cpp +++ b/icu4c/source/common/rbbi_cache.cpp @@ -206,7 +206,7 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo * BreakCache implemetation */ -RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status) : +RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status) : fBI(bi), fSideBuffer(status) { reset(); } @@ -299,7 +299,7 @@ void RuleBasedBreakIterator::BreakCache::previous(UErrorCode &status) { fBI->fPosition = fTextIdx; fBI->fRuleStatusIndex = fStatuses[fBufIdx]; return; -} +} UBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) { @@ -317,7 +317,7 @@ UBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) { fTextIdx = fBoundaries[fBufIdx]; return TRUE; } - + int32_t min = fStartBufIdx; int32_t max = fEndBufIdx; while (min != max) { @@ -354,20 +354,33 @@ UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorC if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) { int32_t aBoundary = 0; int32_t ruleStatusIndex = 0; - // TODO: check for position == length of text. Although may still need to back up to get rule status. if (position > 20) { int32_t backupPos = fBI->handleSafePrevious(position); - fBI->fPosition = backupPos; - aBoundary = fBI->handleNext(); // Ignore dictionary, just finding a rule based boundary. - if (aBoundary == backupPos + 1) { // TODO: + 1 is wrong for supplementals. - // Safe rules work on pairs. +1 from start pos may be a false match. + + if (backupPos > 0) { + // Advance to the boundary following the backup position. + // There is a complication: the safe reverse rules identify pairs of code points + // that are safe. If advancing from the safe point moves forwards by less than + // two code points, we need to advance one more time to ensure that the boundary + // is good, including a correct rules status value. + // + fBI->fPosition = backupPos; aBoundary = fBI->handleNext(); + if (aBoundary <= backupPos + 4) { + // +4 is a quick test for possibly having advanced only one codepoint. + // Four being the length of the longest potential code point, a supplementary in UTF-8 + utext_setNativeIndex(&fBI->fText, aBoundary); + if (backupPos == utext_getPreviousNativeIndex(&fBI->fText)) { + // The initial handleNext() only advanced by a single code point. Go again. + aBoundary = fBI->handleNext(); // Safe rules identify safe pairs. + } + } + ruleStatusIndex = fBI->fRuleStatusIndex; } - ruleStatusIndex = fBI->fRuleStatusIndex; } reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point. } - + // Fill in boundaries between existing cache content and the new requested position. if (fBoundaries[fEndBufIdx] < position) { @@ -495,13 +508,24 @@ UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status) position = 0; positionStatusIdx = 0; } else { - fBI->fPosition = backupPosition; // TODO: pass starting position in a clearer way. - position = fBI->handleNext(); // TODO: supplementals don't work with the +1. - if (position == backupPosition + 1) { - position = fBI->handleNext(); // Safe rules identify safe pairs. + // Advance to the boundary following the backup position. + // There is a complication: the safe reverse rules identify pairs of code points + // that are safe. If advancing from the safe point moves forwards by less than + // two code points, we need to advance one more time to ensure that the boundary + // is good, including a correct rules status value. + // + fBI->fPosition = backupPosition; + position = fBI->handleNext(); + if (position <= backupPosition + 4) { + // +4 is a quick test for possibly having advanced only one codepoint. + // Four being the length of the longest potential code point, a supplementary in UTF-8 + utext_setNativeIndex(&fBI->fText, position); + if (backupPosition == utext_getPreviousNativeIndex(&fBI->fText)) { + // The initial handleNext() only advanced by a single code point. Go again. + position = fBI->handleNext(); // Safe rules identify safe pairs. + } }; positionStatusIdx = fBI->fRuleStatusIndex; - } } while (position >= fromPosition); @@ -540,7 +564,7 @@ UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status) } U_ASSERT(position==dictSegEndPosition || position>=fromPosition); } - + if (!segmentHandledByDictionary && position < fromPosition) { fSideBuffer.addElement(position, status); fSideBuffer.addElement(positionStatusIdx, status); @@ -566,7 +590,7 @@ UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status) break; } } - + return success; } diff --git a/icu4c/source/common/rbbidata.cpp b/icu4c/source/common/rbbidata.cpp index ebd60c42a6d..385ab08b329 100644 --- a/icu4c/source/common/rbbidata.cpp +++ b/icu4c/source/common/rbbidata.cpp @@ -106,7 +106,10 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) { if (data->fFTableLen != 0) { fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable); } - if (data->fSRTableLen != 0) { + if (data->fRTableLen != 0) { + fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable); + } + if (data->fSRTableLen != 0) { // TODO: obsolete. Remove. fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable); } diff --git a/icu4c/source/common/rbbidata.h b/icu4c/source/common/rbbidata.h index 6a5ebede046..83b96f261de 100644 --- a/icu4c/source/common/rbbidata.h +++ b/icu4c/source/common/rbbidata.h @@ -173,7 +173,8 @@ public: /* */ const RBBIDataHeader *fHeader; const RBBIStateTable *fForwardTable; - const RBBIStateTable *fSafeRevTable; + const RBBIStateTable *fReverseTable; // auto-generated safe reverse. + const RBBIStateTable *fSafeRevTable; // hand-written safe reverse. TODO: delete this. const UChar *fRuleSource; const int32_t *fRuleStatusTable; diff --git a/icu4c/source/common/rbbirb.cpp b/icu4c/source/common/rbbirb.cpp index 043cef25f15..4a369889385 100644 --- a/icu4c/source/common/rbbirb.cpp +++ b/icu4c/source/common/rbbirb.cpp @@ -300,10 +300,9 @@ RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) { fForwardTables->buildSafe(status); - if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();}; #ifdef RBBI_DEBUG if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) { - fForwardTables + fForwardTables->printStates(); fForwardTables->printRuleStatusTable(); fForwardTables->printSafeTable(); } diff --git a/icu4c/source/common/rbbitblb.cpp b/icu4c/source/common/rbbitblb.cpp index dd84b4c27b7..c3bcc722293 100644 --- a/icu4c/source/common/rbbitblb.cpp +++ b/icu4c/source/common/rbbitblb.cpp @@ -1312,10 +1312,10 @@ void RBBITableBuilder::buildSafe(UErrorCode &status) { if (wantedEndState == endState) { int32_t pair = c1 << 16 | c2; safePairs.addElement(pair, status); - printf("(%d, %d) ", c1, c2); + // printf("(%d, %d) ", c1, c2); } } - printf("\n"); + // printf("\n"); } // Populate the initial safe table. @@ -1358,7 +1358,7 @@ void RBBITableBuilder::buildSafe(UErrorCode &status) { rowState.setCharAt(c1, 0); } - // Merge similar states. + // TODO: Merge similar states. } @@ -1392,9 +1392,9 @@ int32_t RBBITableBuilder::getSafeTableSize() const { //----------------------------------------------------------------------------- // -// exportTable() export the state transition table in the format required -// by the runtime engine. getTableSize() bytes of memory -// must be available at the output address "where". +// exportSafeTable() export the state transition table in the format required +// by the runtime engine. getTableSize() bytes of memory +// must be available at the output address "where". // //----------------------------------------------------------------------------- void RBBITableBuilder::exportSafeTable(void *where) { diff --git a/icu4c/source/test/cintltst/cbiapts.c b/icu4c/source/test/cintltst/cbiapts.c index 9f68f0b7f7e..fa78413be49 100644 --- a/icu4c/source/test/cintltst/cbiapts.c +++ b/icu4c/source/test/cintltst/cbiapts.c @@ -308,14 +308,14 @@ static void TestBreakIteratorCAPI() log_verbose("\nTesting the functions for sentence\n"); - ubrk_first(sentence); + pos = ubrk_first(sentence); pos = ubrk_current(sentence); log_verbose("Current(sentence) = %d\n", (int32_t)pos); pos = ubrk_last(sentence); if(pos!=49) log_err("error ubrk_last for sentence did not return 49\n"); log_verbose("Last (sentence) = %d\n", (int32_t)pos); - ubrk_first(sentence); + pos = ubrk_first(sentence); to = ubrk_following( sentence, 0 ); if (to == 0) log_err("ubrk_following returned 0\n"); to = ubrk_preceding( sentence, to ); diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 23ecf10e927..ddd809cb8ec 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -4551,40 +4551,6 @@ void RBBITest::TestDebug(void) { UParseError pe; LocalPointer newbi(new RuleBasedBreakIterator(rules, pe, status)); assertSuccess(WHERE, status); - -#if 0 - bi->dumpTables(); - - RBBIDataWrapper *dw = bi->fData; - const RBBIStateTable *fwtbl = dw->fForwardTable; - int32_t numCharClasses = dw->fHeader->fCatCount; - printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates); - - for (int32_t c1=0; c1fNumStates; ++startState) { - RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * startState)); - int32_t s2 = row->fNextState[c1]; - row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * s2)); - endState = row->fNextState[c2]; - if (wantedEndState < 0) { - wantedEndState = endState; - } else { - if (wantedEndState != endState) { - break; - } - } - } - if (wantedEndState == endState) { - printf("(%d, %d) ", c1, c2); - } - } - printf("\n"); - } - printf("\n"); -#endif } void RBBITest::TestProperties() { diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt index 38a69fd9742..1ed2d17228f 100644 --- a/icu4c/source/test/testdata/rbbitst.txt +++ b/icu4c/source/test/testdata/rbbitst.txt @@ -39,7 +39,9 @@ # Temp debugging tests # - + +• +• ## FILTERED BREAK TESTS