From: Andy Heninger Date: Tue, 9 Jun 2020 20:19:17 +0000 (-0700) Subject: ICU-13565 Break Iteration, remove the dictionary bit from the implementation. X-Git-Tag: cldr/2020-09-22~180 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=1eef3623293115fe2cd2fc52ccf85a723f7ad70f;p=icu ICU-13565 Break Iteration, remove the dictionary bit from the implementation. For identifying text that needs to be handled by a word dictionary for Break Iteration, change from using a bit in the character category to sorting all dictionary categories together, and recording the boundary between the non-dictionary and dictionary ranges. This is internal to the implementation. It does not affect behavior. It does increase the number of character categories that can be handled using a compact 8 bit Trie, from 127 to 255. --- diff --git a/icu4c/source/common/rbbi.cpp b/icu4c/source/common/rbbi.cpp index 87f5f714107..369e400630d 100644 --- a/icu4c/source/common/rbbi.cpp +++ b/icu4c/source/common/rbbi.cpp @@ -763,15 +763,15 @@ int32_t RuleBasedBreakIterator::handleNext() { bool use8BitsTrie = ucptrie_getValueWidth(fData->fTrie) == UCPTRIE_VALUE_BITS_8; if (statetable->fFlags & RBBI_8BITS_ROWS) { if (use8BitsTrie) { - return handleNext(); + return handleNext(); } else { - return handleNext(); + return handleNext(); } } else { if (use8BitsTrie) { - return handleNext(); + return handleNext(); } else { - return handleNext(); + return handleNext(); } } } @@ -781,15 +781,15 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) { bool use8BitsTrie = ucptrie_getValueWidth(fData->fTrie) == UCPTRIE_VALUE_BITS_8; if (statetable->fFlags & RBBI_8BITS_ROWS) { if (use8BitsTrie) { - return handleSafePrevious(fromPosition); + return handleSafePrevious(fromPosition); } else { - return handleSafePrevious(fromPosition); + return handleSafePrevious(fromPosition); } } else { if (use8BitsTrie) { - return handleSafePrevious(fromPosition); + return handleSafePrevious(fromPosition);
} else { - return handleSafePrevious(fromPosition); + return handleSafePrevious(fromPosition); } } } @@ -801,7 +801,7 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) { // Run the state machine to find a boundary // //----------------------------------------------------------------------------------- -template +template int32_t RuleBasedBreakIterator::handleNext() { int32_t state; uint16_t category = 0; @@ -815,6 +815,7 @@ int32_t RuleBasedBreakIterator::handleNext() { const RBBIStateTable *statetable = fData->fForwardTable; const char *tableData = statetable->fTableData; uint32_t tableRowLen = statetable->fRowLen; + uint32_t dictStart = statetable->fDictCategoriesStart; #ifdef RBBI_DEBUG if (gTrace) { RBBIDebugPuts("Handle Next pos char state category"); @@ -876,17 +877,7 @@ int32_t RuleBasedBreakIterator::handleNext() { // look up the current character's character category, which tells us // which column in the state table to look at. category = trieFunc(fData->fTrie, c); - - // Check the dictionary bit in the character's category. - // Counter is only used by dictionary based iteration. - // Chars that need to be handled by a dictionary have a flag bit set - // in their category values. - // - if ((category & dictMask) != 0) { - fDictionaryCharCount++; - // And off the dictionary flag bit. - category &= ~dictMask; - } + fDictionaryCharCount += (category >= dictStart); } #ifdef RBBI_DEBUG @@ -993,7 +984,7 @@ int32_t RuleBasedBreakIterator::handleNext() { // because the safe table does not require as many options. // //----------------------------------------------------------------------------------- -template +template int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) { int32_t state; @@ -1030,7 +1021,6 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) { // // Off the dictionary flag bit. For reverse iteration it is not used. 
category = trieFunc(fData->fTrie, c); - category &= ~dictMask; #ifdef RBBI_DEBUG if (gTrace) { diff --git a/icu4c/source/common/rbbi_cache.cpp b/icu4c/source/common/rbbi_cache.cpp index 4ea9e3e28b2..63ff3001c70 100644 --- a/icu4c/source/common/rbbi_cache.cpp +++ b/icu4c/source/common/rbbi_cache.cpp @@ -119,8 +119,6 @@ UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPos, int32_t endPos, int32_t firstRuleStatus, int32_t otherRuleStatus) { - uint32_t dictMask = ucptrie_getValueWidth(fBI->fData->fTrie) == UCPTRIE_VALUE_BITS_8 ? - kDictBitFor8BitsTrie : kDictBit; if ((endPos - startPos) <= 1) { return; } @@ -145,9 +143,11 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo utext_setNativeIndex(text, rangeStart); UChar32 c = utext_current32(text); category = ucptrie_get(fBI->fData->fTrie, c); + uint32_t dictStart = fBI->fData->fForwardTable->fDictCategoriesStart; while(U_SUCCESS(status)) { - while((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd && (category & dictMask) == 0) { + while((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd + && (category < dictStart)) { utext_next32(text); // TODO: cleaner loop structure. c = utext_current32(text); category = ucptrie_get(fBI->fData->fTrie, c); diff --git a/icu4c/source/common/rbbidata.h b/icu4c/source/common/rbbidata.h index 963050d58e3..efbd4bea112 100644 --- a/icu4c/source/common/rbbidata.h +++ b/icu4c/source/common/rbbidata.h @@ -101,18 +101,18 @@ struct RBBIStateTableRowT { // Value 0: not an accepting state. // 1: (ACCEPTING_UNCONDITIONAL) Unconditional Accepting state. // >1: Look-ahead match has completed. - // Actual boundary position happened earlier + // Actual boundary position happened earlier. // Value here == fLookAhead in earlier - // state, at actual boundary pos. + // state, at actual boundary pos. 
T fLookAhead; // Non-zero if this row is for a state that // corresponds to a '/' in the rule source. // Value is the same as the fAccepting - // value for the rule (which will appear - // in a different state. + // value for the rule (which will appear + // in a different state. T fTagsIdx; // Non-zero if this row covers a {tagged} position - // from a rule. Value is the index in the - // StatusTable of the set of matching - // tags (rule status values) + // from a rule. Value is the index in the + // StatusTable of the set of matching + // tags (rule status values) T fNextState[1]; // Next State, indexed by char category. // Variable-length array declared with length 1 // to disable bounds checkers. @@ -132,14 +132,17 @@ union RBBIStateTableRow { }; struct RBBIStateTable { - uint32_t fNumStates; /* Number of states. */ - uint32_t fRowLen; /* Length of a state table row, in bytes. */ - uint32_t fFlags; /* Option Flags for this state table */ - char fTableData[1]; /* First RBBIStateTableRow begins here. */ - /* Variable-length array declared with length 1 */ - /* to disable bounds checkers. */ - /* (making it char[] simplifies ugly address */ - /* arithmetic for indexing variable length rows.) */ + uint32_t fNumStates; // Number of states. + uint32_t fRowLen; // Length of a state table row, in bytes. + uint32_t fDictCategoriesStart; // Char category number of the first dictionary + // char class, or the largest category number + 1 + // if there are no dictionary categories. + uint32_t fFlags; // Option Flags for this state table. + char fTableData[1]; // First RBBIStateTableRow begins here. + // Variable-length array declared with length 1 + // to disable bounds checkers. + // (making it char[] simplifies ugly address + // arithmetic for indexing variable length rows.)
}; constexpr uint32_t RBBI_LOOKAHEAD_HARD_BREAK = 1; diff --git a/icu4c/source/common/rbbirb.cpp b/icu4c/source/common/rbbirb.cpp index 9c507527b8d..e5c250dfe40 100644 --- a/icu4c/source/common/rbbirb.cpp +++ b/icu4c/source/common/rbbirb.cpp @@ -287,9 +287,7 @@ RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) { // // UnicodeSet processing. - // Munge the Unicode Sets to create a set of character categories. - // Generate the mapping tables (TRIE) from input code points to - // the character categories. + // Munge the Unicode Sets to create an initial set of character categories. // fSetBuilder->buildRanges(); @@ -303,6 +301,12 @@ RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) { } fForwardTable->buildForwardTable(); + + // State table and character category optimization. + // Merge equivalent rows and columns. + // Note that this process alters the initial set of character categories, + // causing the representation of UnicodeSets in the parse tree to become invalid. + optimizeTables(); fForwardTable->buildSafeReverseTable(status); @@ -315,6 +319,9 @@ RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) { } #endif + // Generate the mapping tables (TRIE) from input code points to + // the character categories. + // fSetBuilder->buildTrie(); // diff --git a/icu4c/source/common/rbbisetb.cpp b/icu4c/source/common/rbbisetb.cpp index 23dbc19d7cf..29faeb8c456 100644 --- a/icu4c/source/common/rbbisetb.cpp +++ b/icu4c/source/common/rbbisetb.cpp @@ -19,7 +19,7 @@ // by the RBBI rules. // - compute a set of non-overlapping character ranges // with all characters within a range belonging to the same -// set of input uniocde sets. +// set of input unicode sets. // - Derive a set of non-overlapping UnicodeSet (like things) // that will correspond to columns in the state table for // the RBBI execution engine. 
All characters within one @@ -45,7 +45,7 @@ U_NAMESPACE_BEGIN -const int32_t kMaxCharCategoriesFor8BitsTrie = 127; +const int32_t kMaxCharCategoriesFor8BitsTrie = 255; //------------------------------------------------------------------------ // // Constructor @@ -55,12 +55,12 @@ RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb) { fRB = rb; fStatus = rb->fStatus; - fRangeList = 0; + fRangeList = nullptr; fMutableTrie = nullptr; fTrie = nullptr; fTrieSize = 0; fGroupCount = 0; - fSawBOF = FALSE; + fSawBOF = false; } @@ -196,24 +196,47 @@ void RBBISetBuilder::buildRanges() { // // Numbering: # 0 (state table column 0) is unused. // # 1 is reserved - table column 1 is for end-of-input - // # 2 is reserved - table column 2 is for beginning-in-input + // # 2 is reserved - table column 2 is for beginning-of-input // # 3 is the first range list. // RangeDescriptor *rlSearchRange; - for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { + int32_t dictGroupCount = 0; + + for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) { for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) { if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) { rlRange->fNum = rlSearchRange->fNum; + rlRange->fIncludesDict = rlSearchRange->fIncludesDict; break; } } if (rlRange->fNum == 0) { - fGroupCount ++; - rlRange->fNum = fGroupCount+2; - rlRange->setDictionaryFlag(); - addValToSets(rlRange->fIncludesSets, fGroupCount+2); + rlRange->fFirstInGroup = true; + if (rlRange->isDictionaryRange()) { + rlRange->fNum = ++dictGroupCount; + rlRange->fIncludesDict = true; + } else { + fGroupCount++; + rlRange->fNum = fGroupCount+2; + addValToSets(rlRange->fIncludesSets, rlRange->fNum); + } + } + } + + // Move the character category numbers for any dictionary ranges up, so that they + // immediately follow the non-dictionary ranges. 
+ + fDictCategoriesStart = fGroupCount + 3; + for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) { + if (rlRange->fIncludesDict) { + rlRange->fNum += fDictCategoriesStart - 1; + if (rlRange->fFirstInGroup) { + addValToSets(rlRange->fIncludesSets, rlRange->fNum); + } } } + fGroupCount += dictGroupCount; + // Handle input sets that contain the special string {eof}. // Column 1 of the state table is reserved for EOF on input. @@ -222,13 +245,11 @@ void RBBISetBuilder::buildRanges() { // references to {bof}.) // Add this column value (1 or 2) to the equivalent expression // subtree for each UnicodeSet that contains the string {eof} - // Because {bof} and {eof} are not a characters in the normal sense, - // they doesn't affect the computation of ranges or TRIE. - static const UChar eofUString[] = {0x65, 0x6f, 0x66, 0}; - static const UChar bofUString[] = {0x62, 0x6f, 0x66, 0}; + // Because {bof} and {eof} are not characters in the normal sense, + // they don't affect the computation of the ranges or TRIE. - UnicodeString eofString(eofUString); - UnicodeString bofString(bofUString); + UnicodeString eofString(u"eof"); + UnicodeString bofString(u"bof"); for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni); if (usetNode==NULL) { @@ -255,24 +276,16 @@ void RBBISetBuilder::buildRanges() { // range group number. // void RBBISetBuilder::buildTrie() { - RangeDescriptor *rlRange; - fMutableTrie = umutablecptrie_open( 0, // Initial value for all code points. 0, // Error value for out-of-range input. 
fStatus); - bool use8Bits = getNumCharCategories() <= kMaxCharCategoriesFor8BitsTrie; - for (rlRange = fRangeList; rlRange!=0 && U_SUCCESS(*fStatus); rlRange=rlRange->fNext) { - uint32_t value = rlRange->fNum; - if (use8Bits && ((value & RuleBasedBreakIterator::kDictBit) != 0)) { - U_ASSERT((value & RuleBasedBreakIterator::kDictBitFor8BitsTrie) == 0); - value = RuleBasedBreakIterator::kDictBitFor8BitsTrie | (value & ~RuleBasedBreakIterator::kDictBit); - } + for (RangeDescriptor *range = fRangeList; range!=nullptr && U_SUCCESS(*fStatus); range=range->fNext) { umutablecptrie_setRange(fMutableTrie, - rlRange->fStartChar, // Range start - rlRange->fEndChar, // Range end (inclusive) - value, // value for range + range->fStartChar, // Range start + range->fEndChar, // Range end (inclusive) + range->fNum, // value for range fStatus); } } @@ -281,16 +294,21 @@ void RBBISetBuilder::buildTrie() { void RBBISetBuilder::mergeCategories(IntPair categories) { U_ASSERT(categories.first >= 1); U_ASSERT(categories.second > categories.first); + U_ASSERT((categories.first < fDictCategoriesStart && categories.second < fDictCategoriesStart) || + (categories.first >= fDictCategoriesStart && categories.second >= fDictCategoriesStart)); + for (RangeDescriptor *rd = fRangeList; rd != nullptr; rd = rd->fNext) { - int32_t rangeNum = rd->fNum & ~RuleBasedBreakIterator::kDictBit; - int32_t rangeDict = rd->fNum & RuleBasedBreakIterator::kDictBit; + int32_t rangeNum = rd->fNum; if (rangeNum == categories.second) { - rd->fNum = categories.first | rangeDict; + rd->fNum = categories.first; } else if (rangeNum > categories.second) { rd->fNum--; } } --fGroupCount; + if (categories.second <= fDictCategoriesStart) { + --fDictCategoriesStart; + } } @@ -395,6 +413,16 @@ int32_t RBBISetBuilder::getNumCharCategories() const { } +//------------------------------------------------------------------------ +// +// getDictCategoriesStart +// 
+//------------------------------------------------------------------------ +int32_t RBBISetBuilder::getDictCategoriesStart() const { + return fDictCategoriesStart; +} + + //------------------------------------------------------------------------ // // sawBOF @@ -414,7 +442,7 @@ UBool RBBISetBuilder::sawBOF() const { UChar32 RBBISetBuilder::getFirstChar(int32_t category) const { RangeDescriptor *rlRange; UChar32 retVal = (UChar32)-1; - for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { + for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) { if (rlRange->fNum == category) { retVal = rlRange->fStartChar; break; @@ -424,7 +452,6 @@ UChar32 RBBISetBuilder::getFirstChar(int32_t category) const { } - //------------------------------------------------------------------------ // // printRanges A debugging function. @@ -437,16 +464,16 @@ void RBBISetBuilder::printRanges() { int i; RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n"); - for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { - RBBIDebugPrintf("%2i %4x-%4x ", rlRange->fNum, rlRange->fStartChar, rlRange->fEndChar); + for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) { + RBBIDebugPrintf("%4x-%4x ", rlRange->fStartChar, rlRange->fEndChar); for (i=0; i<rlRange->fIncludesSets->size(); i++) { RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i); - UnicodeString setName = UNICODE_STRING("anon", 4); + UnicodeString setName {u"anon"}; RBBINode *setRef = usetNode->fParent; - if (setRef != NULL) { + if (setRef != nullptr) { RBBINode *varRef = setRef->fParent; - if (varRef != NULL && varRef->fType == RBBINode::varRef) { + if (varRef != nullptr && varRef->fType == RBBINode::varRef) { setName = varRef->fText; } } @@ -466,19 +493,15 @@ void RBBISetBuilder::printRanges() { //------------------------------------------------------------------------ #ifdef RBBI_DEBUG void RBBISetBuilder::printRangeGroups() { - RangeDescriptor *rlRange; - RangeDescriptor *tRange;
int i; - int lastPrintedGroupNum = 0; RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n"); - for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { - int groupNum = rlRange->fNum & 0xbfff; - if (groupNum > lastPrintedGroupNum) { - lastPrintedGroupNum = groupNum; + for (RangeDescriptor *rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) { + if (rlRange->fFirstInGroup) { + int groupNum = rlRange->fNum; RBBIDebugPrintf("%2i ", groupNum); - if (rlRange->fNum & RuleBasedBreakIterator::kDictBit) { RBBIDebugPrintf(" ");} + if (groupNum >= fDictCategoriesStart) { RBBIDebugPrintf(" ");} for (i=0; i<rlRange->fIncludesSets->size(); i++) { RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i); @@ -494,7 +517,7 @@ void RBBISetBuilder::printRangeGroups() { } i = 0; - for (tRange = rlRange; tRange != 0; tRange = tRange->fNext) { + for (RangeDescriptor *tRange = rlRange; tRange != nullptr; tRange = tRange->fNext) { if (tRange->fNum == rlRange->fNum) { if (i++ % 5 == 0) { RBBIDebugPrintf("\n "); @@ -561,28 +584,22 @@ void RBBISetBuilder::printSets() { // //------------------------------------------------------------------------------------- -RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) { - int i; - - this->fStartChar = other.fStartChar; - this->fEndChar = other.fEndChar; - this->fNum = other.fNum; - this->fNext = NULL; - UErrorCode oldstatus = status; - this->fIncludesSets = new UVector(status); - if (U_FAILURE(oldstatus)) { - status = oldstatus; - } +RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) : + fStartChar(other.fStartChar), fEndChar {other.fEndChar}, fNum {other.fNum}, + fIncludesDict{other.fIncludesDict}, fFirstInGroup{other.fFirstInGroup} { + if (U_FAILURE(status)) { return; } - /* test for NULL */ - if (this->fIncludesSets == 0) { + fIncludesSets = new UVector(status); + if (this->fIncludesSets == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; + } + if
(U_FAILURE(status)) { return; } - for (i=0; i<other.fIncludesSets->size(); i++) { + for (int32_t i=0; i<other.fIncludesSets->size(); i++) { this->fIncludesSets->addElement(other.fIncludesSets->elementAt(i), status); } } @@ -594,24 +611,13 @@ RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &statu // //------------------------------------------------------------------------------------- RangeDescriptor::RangeDescriptor(UErrorCode &status) { - this->fStartChar = 0; - this->fEndChar = 0; - this->fNum = 0; - this->fNext = NULL; - UErrorCode oldstatus = status; - this->fIncludesSets = new UVector(status); - if (U_FAILURE(oldstatus)) { - status = oldstatus; - } if (U_FAILURE(status)) { return; } - /* test for NULL */ - if(this->fIncludesSets == 0) { + fIncludesSets = new UVector(status); + if (fIncludesSets == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; - return; } - } @@ -622,7 +628,7 @@ RangeDescriptor::RangeDescriptor(UErrorCode &status) { //------------------------------------------------------------------------------------- RangeDescriptor::~RangeDescriptor() { delete fIncludesSets; - fIncludesSets = NULL; + fIncludesSets = nullptr; } //------------------------------------------------------------------------------------- @@ -633,7 +639,7 @@ RangeDescriptor::~RangeDescriptor() { void RangeDescriptor::split(UChar32 where, UErrorCode &status) { U_ASSERT(where>fStartChar && where<=fEndChar); RangeDescriptor *nr = new RangeDescriptor(*this, status); - if(nr == 0) { + if(nr == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } @@ -652,27 +658,22 @@ void RangeDescriptor::split(UChar32 where, UErrorCode &status) { //------------------------------------------------------------------------------------- // -// RangeDescriptor::setDictionaryFlag +// RangeDescriptor::isDictionaryRange // -// Character Category Numbers that include characters from -// the original Unicode Set named "dictionary" have bit 14 -// set to 1.
The RBBI runtime engine uses this to trigger -// use of the word dictionary. +// Test whether this range includes characters from +// the original Unicode Set named "dictionary". // -// This function looks through the Unicode Sets that it -// (the range) includes, and sets the bit in fNum when -// "dictionary" is among them. +// This function looks through the Unicode Sets that +// the range includes, checking for one named "dictionary" // // TODO: a faster way would be to find the set node for // "dictionary" just once, rather than looking it // up by name every time. // //------------------------------------------------------------------------------------- -void RangeDescriptor::setDictionaryFlag() { - int i; - +bool RangeDescriptor::isDictionaryRange() { static const char16_t *dictionary = u"dictionary"; - for (i=0; i<fIncludesSets->size(); i++) { + for (int32_t i=0; i<fIncludesSets->size(); i++) { RBBINode *usetNode = (RBBINode *)fIncludesSets->elementAt(i); RBBINode *setRef = usetNode->fParent; if (setRef != nullptr) { @@ -680,16 +681,14 @@ void RangeDescriptor::setDictionaryFlag() { if (varRef && varRef->fType == RBBINode::varRef) { const UnicodeString *setName = &varRef->fText; if (setName->compare(dictionary, -1) == 0) { - fNum |= RuleBasedBreakIterator::kDictBit; - break; + return true; } } } } + return false; } - - U_NAMESPACE_END #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ diff --git a/icu4c/source/common/rbbisetb.h b/icu4c/source/common/rbbisetb.h index cc031a2924d..6409a4ea579 100644 --- a/icu4c/source/common/rbbisetb.h +++ b/icu4c/source/common/rbbisetb.h @@ -41,25 +41,26 @@ U_NAMESPACE_BEGIN // class RangeDescriptor : public UMemory { public: - UChar32 fStartChar; // Start of range, unicode 32 bit value. - UChar32 fEndChar; // End of range, unicode 32 bit value. - int32_t fNum; // runtime-mapped input value for this range. - UVector *fIncludesSets; // vector of the the original - // Unicode sets that include this range.
- // (Contains ptrs to uset nodes) - RangeDescriptor *fNext; // Next RangeDescriptor in the linked list. + UChar32 fStartChar {}; // Start of range, unicode 32 bit value. + UChar32 fEndChar {}; // End of range, unicode 32 bit value. + int32_t fNum {0}; // runtime-mapped input value for this range. + bool fIncludesDict {false}; // True if the range includes $dictionary. + bool fFirstInGroup {false}; // True if first range in a group with the same fNum. + UVector *fIncludesSets {nullptr}; // vector of the original + // Unicode sets that include this range. + // (Contains ptrs to uset nodes) + RangeDescriptor *fNext {nullptr}; // Next RangeDescriptor in the linked list. RangeDescriptor(UErrorCode &status); RangeDescriptor(const RangeDescriptor &other, UErrorCode &status); ~RangeDescriptor(); void split(UChar32 where, UErrorCode &status); // Split this range in two at "where", with // where appearing in the second (higher) part. - void setDictionaryFlag(); // Check whether this range appears as part of + bool isDictionaryRange(); // Check whether this range appears as part of // the Unicode set named "dictionary" -private: - RangeDescriptor(const RangeDescriptor &other); // forbid copying of this class - RangeDescriptor &operator=(const RangeDescriptor &other); // forbid copying of this class + RangeDescriptor(const RangeDescriptor &other) = delete; // forbid default copying of this class + RangeDescriptor &operator=(const RangeDescriptor &other) = delete; // forbid assigning of this class }; @@ -90,6 +91,8 @@ public: int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the // runtime state machine, which are the same as // columns in the DFA state table + int32_t getDictCategoriesStart() const; // First char category that includes $dictionary, or + // last category + 1 if there are no dictionary categories. int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie.
void serializeTrie(uint8_t *where); // write out the serialized Trie. UChar32 getFirstChar(int32_t val) const; @@ -113,8 +116,6 @@ public: #endif private: - void numberSets(); - RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us. UErrorCode *fStatus; @@ -124,14 +125,13 @@ private: UCPTrie *fTrie; // the Unicode Sets. uint32_t fTrieSize; - // Groups correspond to character categories - - // groups of ranges that are in the same original UnicodeSets. - // fGroupCount is the index of the last used group. - // fGroupCount+1 is also the number of columns in the RBBI state table being compiled. - // State table column 0 is not used. Column 1 is for end-of-input. - // column 2 is for group 0. Funny counting. + // Number of range groups, which are groups of ranges that are in the same original UnicodeSets. int32_t fGroupCount; + // The number of the first dictionary char category. + // If there are no Dictionary categories, set to the last category + 1. + int32_t fDictCategoriesStart; + UBool fSawBOF; RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class diff --git a/icu4c/source/common/rbbitblb.cpp b/icu4c/source/common/rbbitblb.cpp index ebf1f858c56..09a6aaa0189 100644 --- a/icu4c/source/common/rbbitblb.cpp +++ b/icu4c/source/common/rbbitblb.cpp @@ -1155,7 +1155,13 @@ bool RBBITableBuilder::findDuplCharClassFrom(IntPair *categories) { int32_t numCols = fRB->fSetBuilder->getNumCharCategories(); for (; categories->first < numCols-1; categories->first++) { - for (categories->second=categories->first+1; categories->second < numCols; categories->second++) { + // Note: dictionary & non-dictionary columns cannot be merged. + // The limitSecond value prevents considering mixed pairs. + // Dictionary categories are >= DictCategoriesStart. + // Non dict categories are < DictCategoriesStart. + int limitSecond = categories->first < fRB->fSetBuilder->getDictCategoriesStart() ? 
+ fRB->fSetBuilder->getDictCategoriesStart() : numCols; + for (categories->second=categories->first+1; categories->second < limitSecond; categories->second++) { // Initialized to different values to prevent returning true if numStates = 0 (implies no duplicates). uint16_t table_base = 0; uint16_t table_dupl = 1; @@ -1379,6 +1385,7 @@ void RBBITableBuilder::exportTable(void *where) { } table->fNumStates = fDStates->size(); + table->fDictCategoriesStart = fRB->fSetBuilder->getDictCategoriesStart(); table->fFlags = 0; if (use8BitsForTable()) { table->fRowLen = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount; @@ -1652,12 +1659,12 @@ void RBBITableBuilder::printStates() { RBBIDebugPrintf("state | i n p u t s y m b o l s \n"); RBBIDebugPrintf(" | Acc LA Tag"); for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { - RBBIDebugPrintf(" %2d", c); + RBBIDebugPrintf(" %3d", c); } RBBIDebugPrintf("\n"); RBBIDebugPrintf(" |---------------"); for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { - RBBIDebugPrintf("---"); + RBBIDebugPrintf("----"); } RBBIDebugPrintf("\n"); @@ -1666,7 +1673,7 @@ void RBBITableBuilder::printStates() { RBBIDebugPrintf(" %3d | " , n); RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx); for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { - RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c)); + RBBIDebugPrintf(" %3d", sd->fDtran->elementAti(c)); } RBBIDebugPrintf("\n"); } diff --git a/icu4c/source/common/unicode/rbbi.h b/icu4c/source/common/unicode/rbbi.h index 843144064ab..2ebe931b1c0 100644 --- a/icu4c/source/common/unicode/rbbi.h +++ b/icu4c/source/common/unicode/rbbi.h @@ -677,10 +677,10 @@ private: typedef uint16_t (*PTrieFunc)(const UCPTrie *, UChar32); - template + template int32_t handleSafePrevious(int32_t fromPosition); - template + template int32_t handleNext(); @@ -705,17 +705,6 @@ private: * @internal */ void dumpTables(); - - /** - * Bit for dictionary based category - */ - static
constexpr int32_t kDictBit = 0x4000; - - /** - * Bit for dictionary based category in 8bits trie - */ - static constexpr int32_t kDictBitFor8BitsTrie = 0x0080; - #endif /* U_HIDE_INTERNAL_API */ }; diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 002ab94893e..84c6cd782b3 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -4657,7 +4657,8 @@ void RBBITest::TestTableRedundancies() { } // Ignore column (char class) 0 while checking; it's special, and may have duplicates. for (int c1=1; c1fDictCategoriesStart ? fwtbl->fDictCategoriesStart : numCharClasses; + for (int c2 = c1+1; c2 < limit; c2++) { if (columns.at(c1) == columns.at(c2)) { errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2); goto out; @@ -4952,15 +4953,15 @@ void RBBITest::testTrieStateTable(int32_t numChar, bool expectedTrieWidthIn8Bits } void RBBITest::Test8BitsTrieWith8BitStateTable() { - testTrieStateTable(123, true /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */); + testTrieStateTable(251, true /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */); } void RBBITest::Test16BitsTrieWith8BitStateTable() { - testTrieStateTable(124, false /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */); + testTrieStateTable(252, false /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */); } void RBBITest::Test16BitsTrieWith16BitStateTable() { - testTrieStateTable(255, false /* expectedTrieWidthIn8Bits */, false /* expectedStateRowIn8Bits */); + testTrieStateTable(253, false /* expectedTrieWidthIn8Bits */, false /* expectedStateRowIn8Bits */); } void RBBITest::Test8BitsTrieWith16BitStateTable() { diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/RBBIDataWrapper.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/RBBIDataWrapper.java index c0375bdb4f6..fd81e6ecff5 100644 --- 
a/icu4j/main/classes/core/src/com/ibm/icu/impl/RBBIDataWrapper.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/RBBIDataWrapper.java @@ -41,10 +41,20 @@ public final class RBBIDataWrapper { * Length of a table row in bytes. Note mismatch with table data, which is short[]. */ public int fRowLen; + /** + * Char category number of the first dictionary char class, + * or the largest category number + 1 if there are no dictionary categories. + */ + public int fDictCategoriesStart; /** * Option Flags for this state table. */ public int fFlags; + /** + * Length in bytes of the state table header, of all the int32 fields + * preceding fTable in the serialized form. + */ + public static int fHeaderSize = 16; /** * Linear array of next state values, accessed as short[state, char_class] */ @@ -57,14 +67,15 @@ public final class RBBIDataWrapper { if (length == 0) { return null; } - if (length < 12) { + if (length < fHeaderSize) { throw new IOException("Invalid RBBI state table length."); } RBBIStateTable This = new RBBIStateTable(); This.fNumStates = bytes.getInt(); This.fRowLen = bytes.getInt(); + This.fDictCategoriesStart = bytes.getInt(); This.fFlags = bytes.getInt(); - int lengthOfTable = length - 12; // length in bytes. + int lengthOfTable = length - fHeaderSize; // length in bytes. boolean use8Bits = (This.fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) == RBBIDataWrapper.RBBI_8BITS_ROWS; if (use8Bits) { This.fTable = new char[lengthOfTable]; @@ -82,6 +93,7 @@ public final class RBBIDataWrapper { public int put(DataOutputStream bytes) throws IOException { bytes.writeInt(fNumStates); bytes.writeInt(fRowLen); + bytes.writeInt(fDictCategoriesStart); bytes.writeInt(fFlags); if ((fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) == RBBIDataWrapper.RBBI_8BITS_ROWS) { int tableLen = fRowLen * fNumStates; // fRowLen is bytes.
@@ -95,8 +107,8 @@ public final class RBBIDataWrapper { bytes.writeChar(fTable[i]); } } - int bytesWritten = 12 + fRowLen * fNumStates; // total bytes written, - // including 12 for the header. + int bytesWritten = fHeaderSize + fRowLen * fNumStates; // total bytes written, + // including the header. while (bytesWritten % 8 != 0) { bytes.writeByte(0); ++bytesWritten; @@ -118,6 +130,7 @@ public final class RBBIDataWrapper { RBBIStateTable otherST = (RBBIStateTable)other; if (fNumStates != otherST.fNumStates) return false; if (fRowLen != otherST.fRowLen) return false; + if (fDictCategoriesStart != otherST.fDictCategoriesStart) return false; if (fFlags != otherST.fFlags) return false; return Arrays.equals(fTable, otherST.fTable); } @@ -216,9 +229,6 @@ public final class RBBIDataWrapper { public final static int RBBI_BOF_REQUIRED = 2; public final static int RBBI_8BITS_ROWS = 4; - public final static int DICT_BIT = 0x4000; - public final static int DICT_BIT_FOR_8BITS_TRIE = 0x0080; - /** * Data Header. A struct-like class with the fields from the RBBI data file header. * Not intended for public use, declared public for testing purposes only. @@ -496,7 +506,6 @@ public final class RBBIDataWrapper { int char32; int category; int lastNewline[] = new int[n+1]; - int dictMask = fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8 ? DICT_BIT_FOR_8BITS_TRIE : DICT_BIT; for (category = 0; category <= fHeader.fCatCount; category ++) { catStrings[category] = ""; @@ -505,7 +514,6 @@ public final class RBBIDataWrapper { out.println("--------------------"); for (char32 = 0; char32<=0x10ffff; char32++) { category = fTrie.get(char32); - category &= ~dictMask; // Mask off dictionary bit. 
if (category < 0 || category > fHeader.fCatCount) { out.println("Error, bad category " + Integer.toHexString(category) + " for char " + Integer.toHexString(char32)); diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java index dfe3d2adddd..7f3b2e665f2 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java @@ -67,7 +67,7 @@ class RBBIRuleBuilder { // // Status {tag} values. These structures are common to all of the rule sets (Forward, Reverse, etc.). // - Map, Integer> fStatusSets = new HashMap, Integer>(); // Status value sets encountered so far. + Map, Integer> fStatusSets = new HashMap<>(); // Status value sets encountered so far. // Map Key is the set of values. // Map Value is the runtime array index. @@ -146,8 +146,8 @@ class RBBIRuleBuilder { ICUDebug.value("rbbi") : null; fRules = rules; fStrippedRules = new StringBuilder(rules); - fUSetNodes = new ArrayList(); - fRuleStatusVals = new ArrayList(); + fUSetNodes = new ArrayList<>(); + fRuleStatusVals = new ArrayList<>(); fScanner = new RBBIRuleScanner(this); fSetBuilder = new RBBISetBuilder(this); } @@ -294,9 +294,7 @@ class RBBIRuleBuilder { // // UnicodeSet processing. - // Munge the Unicode Sets to create a set of character categories. - // Generate the mapping tables (TRIE) from input code points to - // the character categories. + // Munge the Unicode Sets to create an initial set of character categories. // fSetBuilder.buildRanges(); @@ -305,6 +303,10 @@ class RBBIRuleBuilder { // fForwardTable = new RBBITableBuilder(this, fForwardTree); fForwardTable.buildForwardTable(); + // State table and character category optimization. + // Merge equivalent rows and columns. 
+ // Note that this process alters the the initial set of character categories, + // causing the representation of UnicodeSets in the parse tree to become invalid. optimizeTables(); fForwardTable.buildSafeReverseTable(); @@ -315,7 +317,9 @@ class RBBIRuleBuilder { fForwardTable.printRuleStatusTable(); fForwardTable.printReverseTable(); } - + // Generate the mapping tables (TRIE) from input code points to + // the character categories. + // fSetBuilder.buildTrie(); // // Package up the compiled data, writing it to an output stream diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBISetBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBISetBuilder.java index e7189a58c80..fa0c73325b6 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBISetBuilder.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBISetBuilder.java @@ -29,7 +29,7 @@ import com.ibm.icu.util.MutableCodePointTrie; // by the RBBI rules. // - compute a set of non-overlapping character ranges // with all characters within a range belonging to the same -// set of input uniocde sets. +// set of input unicode sets. // - Derive a set of non-overlapping UnicodeSet (like things) // that will correspond to columns in the state table for // the RBBI execution engine. All characters within one @@ -41,23 +41,27 @@ import com.ibm.icu.util.MutableCodePointTrie; // class RBBISetBuilder { static class RangeDescriptor { - int fStartChar; // Start of range, unicode 32 bit value. - int fEndChar; // End of range, unicode 32 bit value. - int fNum; // runtime-mapped input value for this range. - List fIncludesSets; // vector of the the original - // Unicode sets that include this range. - // (Contains ptrs to uset nodes) - RangeDescriptor fNext; // Next RangeDescriptor in the linked list. + int fStartChar = 0; // Start of range, unicode 32 bit value. + int fEndChar = 0; // End of range, unicode 32 bit value. + int fNum = 0; // runtime-mapped input value for this range. 
+ boolean fIncludesDict = false; // True if the range includes $dictionary. + boolean fFirstInGroup = false; // True if first range in a group with the same fNum. + List fIncludesSets; // vector of the the original + // Unicode sets that include this range. + // (Contains ptrs to uset nodes) + RangeDescriptor fNext; // Next RangeDescriptor in the linked list. RangeDescriptor() { - fIncludesSets = new ArrayList(); + fIncludesSets = new ArrayList<>(); } RangeDescriptor(RangeDescriptor other) { fStartChar = other.fStartChar; fEndChar = other.fEndChar; fNum = other.fNum; - fIncludesSets = new ArrayList(other.fIncludesSets); + fIncludesDict = other.fIncludesDict; + fFirstInGroup = other.fFirstInGroup; + fIncludesSets = new ArrayList<>(other.fIncludesSets); } //------------------------------------------------------------------------------------- @@ -82,28 +86,18 @@ class RBBISetBuilder { } - //------------------------------------------------------------------------------------- - // - // RangeDescriptor::setDictionaryFlag - // - // Character Category Numbers that include characters from - // the original Unicode Set named "dictionary" have bit 14 - // set to 1. The RBBI runtime engine uses this to trigger - // use of the word dictionary. - // - // This function looks through the Unicode Sets that it - // (the range) includes, and sets the bit in fNum when - // "dictionary" is among them. - // + /** + * Test whether this range includes characters from the original Unicode Set named "dictionary". + * + * This function looks through the Unicode Sets that + * the range includes, checking for one named "dictionary" + */ // TODO: a faster way would be to find the set node for // "dictionary" just once, rather than looking it // up by name every time. 
// - // ------------------------------------------------------------------------------------- - void setDictionaryFlag() { - int i; - - for (i=0; i= 1); assert(categories.second > categories.first); + assert((categories.first < fDictCategoriesStart && categories.second < fDictCategoriesStart) || + (categories.first >= fDictCategoriesStart && categories.second >= fDictCategoriesStart)); for (RangeDescriptor rd = fRangeList; rd != null; rd = rd.fNext) { - int rangeNum = rd.fNum & ~DICT_BIT; - int rangeDict = rd.fNum & DICT_BIT; + int rangeNum = rd.fNum; if (rangeNum == categories.second) { - rd.fNum = categories.first | rangeDict; + rd.fNum = categories.first; } else if (rangeNum > categories.second) { rd.fNum--; } } --fGroupCount; + if (categories.second <= fDictCategoriesStart) { + --fDictCategoriesStart; + } } //----------------------------------------------------------------------------------- @@ -425,6 +435,16 @@ class RBBISetBuilder { } + //------------------------------------------------------------------------ + // + // getDictCategoriesStart + // + //------------------------------------------------------------------------ + int getDictCategoriesStart() { + return fDictCategoriesStart; + } + + //------------------------------------------------------------------------ // // sawBOF @@ -454,7 +474,6 @@ class RBBISetBuilder { } - //------------------------------------------------------------------------ // // printRanges A debugging function. 
@@ -468,7 +487,7 @@ class RBBISetBuilder { System.out.print("\n\n Nonoverlapping Ranges ...\n"); for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) { - System.out.print(" " + rlRange.fNum + " " + rlRange.fStartChar + "-" + rlRange.fEndChar); + System.out.printf("%04x-%04x ", rlRange.fStartChar, rlRange.fEndChar); for (i=0; i lastPrintedGroupNum) { - lastPrintedGroupNum = groupNum; + for (RangeDescriptor rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) { + if (rlRange.fFirstInGroup) { + int groupNum = rlRange.fNum; if (groupNum<10) {System.out.print(" ");} System.out.print(groupNum + " "); - if ((rlRange.fNum & DICT_BIT) != 0) { System.out.print(" ");} + if (groupNum >= fDictCategoriesStart) { System.out.print(" ");} for (i=0; i= DictCategoriesStart. + // Non dict categories are < DictCategoriesStart. + int limitSecond = categories.first < fRB.fSetBuilder.getDictCategoriesStart() ? + fRB.fSetBuilder.getDictCategoriesStart() : numCols; + for (categories.second=categories.first+1; categories.second < limitSecond; ++categories.second) { for (int state=0; state= dictStart) { fDictionaryCharCount++; - // And off the dictionary flag bit. - category &= ~dictMask; } if (TRACE) { @@ -1004,9 +997,6 @@ public class RuleBasedBreakIterator extends BreakIterator { CharacterIterator text = fText; CodePointTrie trie = fRData.fTrie; char[] stateTable = fRData.fRTable.fTable; - int flagsState = fRData.fRTable.fFlags; - int dictMask = fRData.fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8 ? - RBBIDataWrapper.DICT_BIT_FOR_8BITS_TRIE : RBBIDataWrapper.DICT_BIT; CISetIndex32(text, fromPosition); if (TRACE) { @@ -1032,7 +1022,6 @@ public class RuleBasedBreakIterator extends BreakIterator { // // And off the dictionary flag bit. For reverse iteration it is not used. 
category = (short) trie.get(c); - category &= ~dictMask; if (TRACE) { System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); System.out.print(RBBIDataWrapper.intToHexString(c, 10)); @@ -1212,8 +1201,6 @@ public class RuleBasedBreakIterator extends BreakIterator { int category; int current; int foundBreakCount = 0; - int dictMask = fRData.fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8 ? - RBBIDataWrapper.DICT_BIT_FOR_8BITS_TRIE : RBBIDataWrapper.DICT_BIT; // Loop through the text, looking for ranges of dictionary characters. // For each span, find the appropriate break engine, and ask it to find @@ -1222,9 +1209,10 @@ public class RuleBasedBreakIterator extends BreakIterator { fText.setIndex(rangeStart); int c = CharacterIteration.current32(fText); category = (short)fRData.fTrie.get(c); + int dictStart = fRData.fFTable.fDictCategoriesStart; while(true) { - while((current = fText.getIndex()) < rangeEnd && (category & dictMask) == 0) { + while((current = fText.getIndex()) < rangeEnd && (category < dictStart)) { c = CharacterIteration.next32(fText); // pre-increment category = (short)fRData.fTrie.get(c); } diff --git a/icu4j/main/shared/data/icudata.jar b/icu4j/main/shared/data/icudata.jar index 0d7cbef46a5..158ef449747 100644 --- a/icu4j/main/shared/data/icudata.jar +++ b/icu4j/main/shared/data/icudata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bdf00a19b05bc52e17c2aea74e87cc1872a824d5a9cced226078c46a194a8799 -size 13141762 +oid sha256:53e4c3251f31233ffcfe3ff4229ea43d81422a3fa071ee774ed835e5e969d22c +size 13142859 diff --git a/icu4j/main/shared/data/icutzdata.jar b/icu4j/main/shared/data/icutzdata.jar index 8fd2b94f214..f80547f225e 100644 --- a/icu4j/main/shared/data/icutzdata.jar +++ b/icu4j/main/shared/data/icutzdata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6d2882ccb44134313ff0365eb24776d4e859fa9dd223f10d608d65fdfd7f23d9 +oid 
sha256:72b712d8d19a5aa8d1cb36f070337010c29595c63d917cf81e3213a5ea5be2e7 size 94529 diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java index 493098ad605..504236095a5 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java @@ -408,7 +408,7 @@ public class RBBITest extends TestFmwk { } } - List threads = new ArrayList(); + List threads = new ArrayList<>(); for (int n = 0; n<4; ++n) { threads.add(new Thread(new WorkerThread())); } @@ -513,7 +513,7 @@ public class RBBITest extends TestFmwk { } private static final BreakIterator BREAK_ITERATOR_CACHE = BreakIterator.getWordInstance(ULocale.ROOT); public static List getBoundary(String toParse) { - List retVal = new ArrayList(); + List retVal = new ArrayList<>(); BreakIterator bi = (BreakIterator) BREAK_ITERATOR_CACHE.clone(); bi.setText(toParse); for (int boundary=bi.first(); boundary != BreakIterator.DONE; boundary = bi.next()) { @@ -579,19 +579,20 @@ public class RBBITest extends TestFmwk { int numCharClasses = dw.fHeader.fCatCount; // Check for duplicate columns (character categories) - List columns = new ArrayList(); + List columns = new ArrayList<>(); for (int column=0; column rows = new ArrayList(); + List rows = new ArrayList<>(); for (int r=0; r breakIterators = new ArrayList(); + List breakIterators = new ArrayList<>(); breakIterators.add((RuleBasedBreakIterator)BreakIterator.getCharacterInstance(Locale.ENGLISH)); breakIterators.add((RuleBasedBreakIterator)BreakIterator.getWordInstance(Locale.ENGLISH)); breakIterators.add((RuleBasedBreakIterator)BreakIterator.getSentenceInstance(Locale.ENGLISH)); @@ -723,17 +724,17 @@ public class RBBITest extends TestFmwk { @Test public void Test8BitsTrieWith8BitStateTable() { - testTrieStateTable(123, true /* expectUCPTrieValueWidthIn8Bits */, true /* expectStateRowIn8Bits */); + 
testTrieStateTable(251, true /* expectUCPTrieValueWidthIn8Bits */, true /* expectStateRowIn8Bits */); } @Test public void Test16BitsTrieWith8BitStateTable() { - testTrieStateTable(124, false /* expectUCPTrieValueWidthIn8Bits */, true /* expectStateRowIn8Bits */); + testTrieStateTable(252, false /* expectUCPTrieValueWidthIn8Bits */, true /* expectStateRowIn8Bits */); } @Test public void Test16BitsTrieWith16BitStateTable() { - testTrieStateTable(255, false /* expectUCPTrieValueWidthIn8Bits */, false /* expectStateRowIn8Bits */); + testTrieStateTable(253, false /* expectUCPTrieValueWidthIn8Bits */, false /* expectStateRowIn8Bits */); } @Test