From: Andy Heninger Date: Tue, 27 Mar 2018 05:03:10 +0000 (+0000) Subject: ICU-13194 RBBI safe tables, added another test. X-Git-Tag: release-62-rc~204^2~15 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=e5ab76b130f0af126fa6a100704e36deeea89ef4;p=icu ICU-13194 RBBI safe tables, added another test. X-SVN-Rev: 41157 --- diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index ddd809cb8ec..02e88bc2766 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "unicode/brkiter.h" @@ -111,6 +112,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha TESTCASE_AUTO(TestBug12677); TESTCASE_AUTO(TestTableRedundancies); TESTCASE_AUTO(TestBug13447); + TESTCASE_AUTO(TestReverse); TESTCASE_AUTO_END; } @@ -1817,7 +1819,7 @@ int32_t RBBICharMonkey::next(int32_t prevPos) { // a break if there are three or more contiguous RIs. If there are // only two, a break following will occur via other rules, and will include // any trailing extend characters, which is needed behavior. - if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1) + if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) { break; } @@ -3121,11 +3123,11 @@ int32_t RBBILineMonkey::next(int32_t startPos) { // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes. 
// PR x (ID | EB | EM) // (ID | EB | EM) x PO - if (fPR->contains(prevChar) && + if (fPR->contains(prevChar) && (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar))) { continue; } - if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && + if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && fPO->contains(thisChar)) { continue; } @@ -4422,7 +4424,7 @@ void RBBITest::TestBug12519() { return; } assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status)); - + assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status)); assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr); @@ -4462,7 +4464,7 @@ void RBBITest::TestBug12677() { void RBBITest::TestTableRedundancies() { UErrorCode status = U_ZERO_ERROR; - + LocalPointer bi ( (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status)); assertSuccess(WHERE, status); @@ -4538,6 +4540,85 @@ void RBBITest::TestBug13447() { assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus()); } +// TestReverse exercises both the synthesized safe reverse rules and the logic +// for filling the break iterator cache when starting from random positions +// in the text. +// +// It's a monkey test, working on random data, with the expected data obtained +// from forward iteration (no safe rules involved), comparing with results +// when indexing into the interior of the string (safe rules needed). 
+ +void RBBITest::TestReverse() { + UErrorCode status = U_ZERO_ERROR; + + TestReverse(std::unique_ptr((RuleBasedBreakIterator *) + BreakIterator::createCharacterInstance(Locale::getEnglish(), status))); + assertSuccess(WHERE, status); + TestReverse(std::unique_ptr((RuleBasedBreakIterator *) + BreakIterator::createWordInstance(Locale::getEnglish(), status))); + assertSuccess(WHERE, status); + TestReverse(std::unique_ptr((RuleBasedBreakIterator *) + BreakIterator::createLineInstance(Locale::getEnglish(), status))); + assertSuccess(WHERE, status); + TestReverse(std::unique_ptr((RuleBasedBreakIterator *) + BreakIterator::createSentenceInstance(Locale::getEnglish(), status))); + assertSuccess(WHERE, status); +} + +void RBBITest::TestReverse(std::unique_ptrbi) { + if (!bi) { + errln(WHERE); + return; + } + + // From the mapping trie in the break iterator's internal data, create a + // vector of UnicodeStrings, one for each character category, containing + // all of the code points that map to that category. Unicode planes 0 and 1 only, + // to avoid an excess of unassigned code points. + + RBBIDataWrapper *data = bi->fData; + int32_t categoryCount = data->fHeader->fCatCount; + UTrie2 *trie = data->fTrie; + + std::vector strings(categoryCount, UnicodeString()); + for (int cp=0; cp<0x1fff0; ++cp) { + int cat = utrie2_get32(trie, cp); + cat &= ~0x4000; // And off the dictionary bit from the category. 
+ assertTrue(WHERE, cat < categoryCount && cat >= 0); + if (cat < 0 || cat >= categoryCount) return; + strings[cat].append(cp); + } + + icu_rand randomGen; + const int testStringLength = 10000; + UnicodeString testString; + + for (int i=0; i 0) { + int cp = strings[charClass].char32At(randomGen() % strings[charClass].length()); + testString.append(cp); + } + } + + typedef std::pair Result; + std::vector expectedResults; + bi->setText(testString); + for (int i=0; iisBoundary(i); + int ruleStatus = bi->getRuleStatus(); + expectedResults.push_back(std::make_pair(isboundary, ruleStatus)); + } + + for (int i=testString.length()-1; i>=0; --i) { + bi->setText(testString); // clears the internal break cache + Result expected = expectedResults[i]; + assertEquals(WHERE, expected.first, bi->isBoundary(i)); + assertEquals(WHERE, expected.second, bi->getRuleStatus()); + } +} + + // // TestDebug - A place-holder test for debugging purposes. // For putting in fragments of other tests that can be invoked diff --git a/icu4c/source/test/intltest/rbbitst.h b/icu4c/source/test/intltest/rbbitst.h index 21fdfb9f01a..20774cc8ea2 100644 --- a/icu4c/source/test/intltest/rbbitst.h +++ b/icu4c/source/test/intltest/rbbitst.h @@ -17,6 +17,8 @@ #if !UCONFIG_NO_BREAK_ITERATION +#include + #include "intltest.h" #include "unicode/brkiter.h" #include "unicode/rbbi.h" @@ -77,6 +79,8 @@ public: void TestBug12677(); void TestTableRedundancies(); void TestBug13447(); + void TestReverse(); + void TestReverse(std::unique_ptrbi); void TestDebug(); void TestProperties();