ICU-13565 Break Iteration, remove the dictionary bit from the implementation.

author Andy Heninger <andy.heninger@gmail.com>

Tue, 9 Jun 2020 20:19:17 +0000 (13:19 -0700)

committer Andy Heninger <andy.heninger@gmail.com>

Wed, 17 Jun 2020 19:00:14 +0000 (12:00 -0700)
author Andy Heninger <andy.heninger@gmail.com>
Tue, 9 Jun 2020 20:19:17 +0000 (13:19 -0700)
committer Andy Heninger <andy.heninger@gmail.com>
Wed, 17 Jun 2020 19:00:14 +0000 (12:00 -0700)
diff --git a/icu4c/source/common/rbbi.cpp b/icu4c/source/common/rbbi.cpp

index 87f5f714107b0468b5997c0dd6f87eb41051ddca..369e400630dce3db1199cca55c07d7af03a524a6 100644 (file)
--- a/icu4c/source/common/rbbi.cpp
+++ b/icu4c/source/common/rbbi.cpp
@@ -763,15 +763,15 @@ int32_t RuleBasedBreakIterator::handleNext() {
      bool use8BitsTrie = ucptrie_getValueWidth(fData->fTrie) == UCPTRIE_VALUE_BITS_8;
      if (statetable->fFlags & RBBI_8BITS_ROWS) {
          if (use8BitsTrie) {
-            return handleNext<RBBIStateTableRow8, TrieFunc8, kDictBitFor8BitsTrie>();
+            return handleNext<RBBIStateTableRow8, TrieFunc8>();
          } else {
-            return handleNext<RBBIStateTableRow8, TrieFunc16, kDictBit>();
+            return handleNext<RBBIStateTableRow8, TrieFunc16>();
          }
      } else {
          if (use8BitsTrie) {
-            return handleNext<RBBIStateTableRow16, TrieFunc8, kDictBitFor8BitsTrie>();
+            return handleNext<RBBIStateTableRow16, TrieFunc8>();
          } else {
-            return handleNext<RBBIStateTableRow16, TrieFunc16, kDictBit>();
+            return handleNext<RBBIStateTableRow16, TrieFunc16>();
          }
      }
  }
@@ -781,15 +781,15 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
      bool use8BitsTrie = ucptrie_getValueWidth(fData->fTrie) == UCPTRIE_VALUE_BITS_8;
      if (statetable->fFlags & RBBI_8BITS_ROWS) {
          if (use8BitsTrie) {
-            return handleSafePrevious<RBBIStateTableRow8, TrieFunc8, kDictBitFor8BitsTrie>(fromPosition);
+            return handleSafePrevious<RBBIStateTableRow8, TrieFunc8>(fromPosition);
          } else {
-            return handleSafePrevious<RBBIStateTableRow8, TrieFunc16, kDictBit>(fromPosition);
+            return handleSafePrevious<RBBIStateTableRow8, TrieFunc16>(fromPosition);
          }
      } else {
          if (use8BitsTrie) {
-            return handleSafePrevious<RBBIStateTableRow16, TrieFunc8, kDictBitFor8BitsTrie>(fromPosition);
+            return handleSafePrevious<RBBIStateTableRow16, TrieFunc8>(fromPosition);
          } else {
-            return handleSafePrevious<RBBIStateTableRow16, TrieFunc16, kDictBit>(fromPosition);
+            return handleSafePrevious<RBBIStateTableRow16, TrieFunc16>(fromPosition);
          }
      }
  }
@@ -801,7 +801,7 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
  //     Run the state machine to find a boundary
  //
  //-----------------------------------------------------------------------------------
-template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc, uint16_t dictMask>
+template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc>
  int32_t RuleBasedBreakIterator::handleNext() {
      int32_t             state;
      uint16_t            category        = 0;
@@ -815,6 +815,7 @@ int32_t RuleBasedBreakIterator::handleNext() {
      const RBBIStateTable *statetable       = fData->fForwardTable;
      const char         *tableData          = statetable->fTableData;
      uint32_t            tableRowLen        = statetable->fRowLen;
+    uint32_t            dictStart          = statetable->fDictCategoriesStart;
      #ifdef RBBI_DEBUG
          if (gTrace) {
              RBBIDebugPuts("Handle Next   pos   char  state category");
@@ -876,17 +877,7 @@ int32_t RuleBasedBreakIterator::handleNext() {
              // look up the current character's character category, which tells us
              // which column in the state table to look at.
              category = trieFunc(fData->fTrie, c);
-
-            // Check the dictionary bit in the character's category.
-            //    Counter is only used by dictionary based iteration.
-            //    Chars that need to be handled by a dictionary have a flag bit set
-            //    in their category values.
-            //
-            if ((category & dictMask) != 0)  {
-                fDictionaryCharCount++;
-                //  And off the dictionary flag bit.
-                category &= ~dictMask;
-            }
+            fDictionaryCharCount += (category >= dictStart);
          }
  
         #ifdef RBBI_DEBUG
@@ -993,7 +984,7 @@ int32_t RuleBasedBreakIterator::handleNext() {
  //      because the safe table does not require as many options.
  //
  //-----------------------------------------------------------------------------------
-template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc, uint16_t dictMask>
+template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc>
  int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
  
      int32_t             state;
@@ -1030,7 +1021,6 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
          //
          //  Off the dictionary flag bit. For reverse iteration it is not used.
          category = trieFunc(fData->fTrie, c);
-        category &= ~dictMask;
  
          #ifdef RBBI_DEBUG
              if (gTrace) {
diff --git a/icu4c/source/common/rbbi_cache.cpp b/icu4c/source/common/rbbi_cache.cpp

index 4ea9e3e28b2ef4a33f1733c485b49ce029cecf2a..63ff3001c7034e0c2f0e90f2dfc065a3d296ad8d 100644 (file)
--- a/icu4c/source/common/rbbi_cache.cpp
+++ b/icu4c/source/common/rbbi_cache.cpp
@@ -119,8 +119,6 @@ UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_
  
  void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPos, int32_t endPos,
                                         int32_t firstRuleStatus, int32_t otherRuleStatus) {
-    uint32_t dictMask = ucptrie_getValueWidth(fBI->fData->fTrie) == UCPTRIE_VALUE_BITS_8 ?
-        kDictBitFor8BitsTrie : kDictBit;
      if ((endPos - startPos) <= 1) {
          return;
      }
@@ -145,9 +143,11 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
      utext_setNativeIndex(text, rangeStart);
      UChar32     c = utext_current32(text);
      category = ucptrie_get(fBI->fData->fTrie, c);
+    uint32_t dictStart = fBI->fData->fForwardTable->fDictCategoriesStart;
  
      while(U_SUCCESS(status)) {
-        while((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd && (category & dictMask) == 0) {
+        while((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd
+                && (category < dictStart)) {
              utext_next32(text);           // TODO: cleaner loop structure.
              c = utext_current32(text);
              category = ucptrie_get(fBI->fData->fTrie, c);
diff --git a/icu4c/source/common/rbbidata.h b/icu4c/source/common/rbbidata.h

index 963050d58e3f9dd8323c6fd0cda3133eb6074b55..efbd4bea1122c8613e12346902feaca89f7fcac1 100644 (file)
--- a/icu4c/source/common/rbbidata.h
+++ b/icu4c/source/common/rbbidata.h
@@ -101,18 +101,18 @@ struct RBBIStateTableRowT {
                                     //  Value 0: not an accepting state.
                                     //        1: (ACCEPTING_UNCONDITIONAL) Unconditional Accepting state.
                                     //       >1: Look-ahead match has completed.
-                                   //           Actual boundary position happened earlier
+                                   //           Actual boundary position happened earlier.
                                     //           Value here == fLookAhead in earlier
-                                   //              state, at actual boundary pos.
+                                   //           state, at actual boundary pos.
      T               fLookAhead;    //  Non-zero if this row is for a state that
                                     //    corresponds to a '/' in the rule source.
                                     //    Value is the same as the fAccepting
-                                   //      value for the rule (which will appear
-                                   //      in a different state.
+                                   //    value for the rule (which will appear
+                                   //    in a different state.)
      T               fTagsIdx;      //  Non-zero if this row covers a {tagged} position
-                                   //     from a rule.  Value is the index in the
-                                   //     StatusTable of the set of matching
-                                   //     tags (rule status values)
+                                   //    from a rule.  Value is the index in the
+                                   //    StatusTable of the set of matching
+                                   //    tags (rule status values)
      T               fNextState[1]; //  Next State, indexed by char category.
                                     //    Variable-length array declared with length 1
                                     //    to disable bounds checkers.
@@ -132,14 +132,17 @@ union RBBIStateTableRow {
  };
  
  struct RBBIStateTable {
-    uint32_t         fNumStates;    /*  Number of states.                                 */
-    uint32_t         fRowLen;       /*  Length of a state table row, in bytes.            */
-    uint32_t         fFlags;        /*  Option Flags for this state table                 */
-    char             fTableData[1]; /*  First RBBIStateTableRow begins here.              */
-                                    /*    Variable-length array declared with length 1    */
-                                    /*    to disable bounds checkers.                     */
-                                    /*    (making it char[] simplifies ugly address       */
-                                    /*     arithmetic for indexing variable length rows.) */
+    uint32_t         fNumStates;            // Number of states.
+    uint32_t         fRowLen;               // Length of a state table row, in bytes.
+    uint32_t         fDictCategoriesStart;  // Char category number of the first dictionary
+                                            //   char class, or the largest category number + 1
+                                            //   if there are no dictionary categories.
+    uint32_t         fFlags;                // Option Flags for this state table.
+    char             fTableData[1];         // First RBBIStateTableRow begins here.
+                                            //   Variable-length array declared with length 1
+                                            //   to disable bounds checkers.
+                                            //   (making it char[] simplifies ugly address
+                                            //   arithmetic for indexing variable length rows.)
  };
  
  constexpr uint32_t RBBI_LOOKAHEAD_HARD_BREAK = 1;
diff --git a/icu4c/source/common/rbbirb.cpp b/icu4c/source/common/rbbirb.cpp

index 9c507527b8d704d2ca52daae3c954d6d2cfcff8b..e5c250dfe4056f2aaa7bbaafd07d827c7ac8694c 100644 (file)
--- a/icu4c/source/common/rbbirb.cpp
+++ b/icu4c/source/common/rbbirb.cpp
@@ -287,9 +287,7 @@ RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
  
      //
      // UnicodeSet processing.
-    //    Munge the Unicode Sets to create a set of character categories.
-    //    Generate the mapping tables (TRIE) from input code points to
-    //    the character categories.
+    //    Munge the Unicode Sets to create an initial set of character categories.
      //
      fSetBuilder->buildRanges();
  
@@ -303,6 +301,12 @@ RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
      }
  
      fForwardTable->buildForwardTable();
+
+    // State table and character category optimization.
+    // Merge equivalent rows and columns.
+    // Note that this process alters the initial set of character categories,
+    // causing the representation of UnicodeSets in the parse tree to become invalid.
+
      optimizeTables();
      fForwardTable->buildSafeReverseTable(status);
  
@@ -315,6 +319,9 @@ RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
      }
  #endif
  
+    //    Generate the mapping tables (TRIE) from input code points to
+    //    the character categories.
+    //
      fSetBuilder->buildTrie();
  
      //
diff --git a/icu4c/source/common/rbbisetb.cpp b/icu4c/source/common/rbbisetb.cpp

index 23dbc19d7cf5aecccff18f66af957fc138a402a7..29faeb8c456d723b7f025f0ab0012b1b5ea8e0c1 100644 (file)
--- a/icu4c/source/common/rbbisetb.cpp
+++ b/icu4c/source/common/rbbisetb.cpp
@@ -19,7 +19,7 @@
  //                      by the RBBI rules.
  //                   -  compute a set of non-overlapping character ranges
  //                      with all characters within a range belonging to the same
-//                      set of input uniocde sets.
+//                      set of input unicode sets.
  //                   -  Derive a set of non-overlapping UnicodeSet (like things)
  //                      that will correspond to columns in the state table for
  //                      the RBBI execution engine.  All characters within one
@@ -45,7 +45,7 @@
  
  U_NAMESPACE_BEGIN
  
-const int32_t kMaxCharCategoriesFor8BitsTrie = 127;
+const int32_t kMaxCharCategoriesFor8BitsTrie = 255;
  //------------------------------------------------------------------------
  //
  //   Constructor
@@ -55,12 +55,12 @@ RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb)
  {
      fRB             = rb;
      fStatus         = rb->fStatus;
-    fRangeList      = 0;
+    fRangeList      = nullptr;
      fMutableTrie    = nullptr;
      fTrie           = nullptr;
      fTrieSize       = 0;
      fGroupCount     = 0;
-    fSawBOF         = FALSE;
+    fSawBOF         = false;
  }
  
  
@@ -196,24 +196,47 @@ void RBBISetBuilder::buildRanges() {
      //
      //    Numbering: # 0  (state table column 0) is unused.
      //               # 1  is reserved - table column 1 is for end-of-input
-    //               # 2  is reserved - table column 2 is for beginning-in-input
+    //               # 2  is reserved - table column 2 is for beginning-of-input
      //               # 3  is the first range list.
      //
      RangeDescriptor *rlSearchRange;
-    for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
+    int32_t dictGroupCount = 0;
+
+    for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
          for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) {
              if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) {
                  rlRange->fNum = rlSearchRange->fNum;
+                rlRange->fIncludesDict = rlSearchRange->fIncludesDict;
                  break;
              }
          }
          if (rlRange->fNum == 0) {
-            fGroupCount ++;
-            rlRange->fNum = fGroupCount+2; 
-            rlRange->setDictionaryFlag();
-            addValToSets(rlRange->fIncludesSets, fGroupCount+2);
+            rlRange->fFirstInGroup = true;
+            if (rlRange->isDictionaryRange()) {
+                rlRange->fNum = ++dictGroupCount;
+                rlRange->fIncludesDict = true;
+            } else {
+                fGroupCount++;
+                rlRange->fNum = fGroupCount+2;
+                addValToSets(rlRange->fIncludesSets, rlRange->fNum);
+            }
+        }
+    }
+
+    // Move the character category numbers for any dictionary ranges up, so that they
+    // immediately follow the non-dictionary ranges.
+
+    fDictCategoriesStart = fGroupCount + 3;
+    for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
+        if (rlRange->fIncludesDict) {
+            rlRange->fNum += fDictCategoriesStart - 1;
+            if (rlRange->fFirstInGroup) {
+                addValToSets(rlRange->fIncludesSets, rlRange->fNum);
+            }
          }
      }
+    fGroupCount += dictGroupCount;
+
  
      // Handle input sets that contain the special string {eof}.
      //   Column 1 of the state table is reserved for EOF on input.
@@ -222,13 +245,11 @@ void RBBISetBuilder::buildRanges() {
      //             references to {bof}.)
      //   Add this column value (1 or 2) to the equivalent expression
      //     subtree for each UnicodeSet that contains the string {eof}
-    //   Because {bof} and {eof} are not a characters in the normal sense,
-    //   they doesn't affect the computation of ranges or TRIE.
-    static const UChar eofUString[] = {0x65, 0x6f, 0x66, 0};
-    static const UChar bofUString[] = {0x62, 0x6f, 0x66, 0};
+    //   Because {bof} and {eof} are not characters in the normal sense,
+    //   they don't affect the computation of the ranges or TRIE.
  
-    UnicodeString eofString(eofUString);
-    UnicodeString bofString(bofUString);
+    UnicodeString eofString(u"eof");
+    UnicodeString bofString(u"bof");
      for (ni=0; ; ni++) {        // Loop over each of the UnicodeSets encountered in the input rules
          usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
          if (usetNode==NULL) {
@@ -255,24 +276,16 @@ void RBBISetBuilder::buildRanges() {
  // range group number.
  //
  void RBBISetBuilder::buildTrie() {
-    RangeDescriptor *rlRange;
-
      fMutableTrie = umutablecptrie_open(
                          0,       //  Initial value for all code points.
                          0,       //  Error value for out-of-range input.
                          fStatus);
  
-    bool use8Bits = getNumCharCategories() <= kMaxCharCategoriesFor8BitsTrie;
-    for (rlRange = fRangeList; rlRange!=0 && U_SUCCESS(*fStatus); rlRange=rlRange->fNext) {
-        uint32_t value = rlRange->fNum;
-        if (use8Bits && ((value & RuleBasedBreakIterator::kDictBit) != 0)) {
-            U_ASSERT((value & RuleBasedBreakIterator::kDictBitFor8BitsTrie) == 0);
-            value = RuleBasedBreakIterator::kDictBitFor8BitsTrie | (value & ~RuleBasedBreakIterator::kDictBit);
-        }
+    for (RangeDescriptor *range = fRangeList; range!=nullptr && U_SUCCESS(*fStatus); range=range->fNext) {
          umutablecptrie_setRange(fMutableTrie,
-                                rlRange->fStartChar,     // Range start
-                                rlRange->fEndChar,       // Range end (inclusive)
-                                value,           // value for range
+                                range->fStartChar,     // Range start
+                                range->fEndChar,       // Range end (inclusive)
+                                range->fNum,           // value for range
                                  fStatus);
      }
  }
@@ -281,16 +294,21 @@ void RBBISetBuilder::buildTrie() {
  void RBBISetBuilder::mergeCategories(IntPair categories) {
      U_ASSERT(categories.first >= 1);
      U_ASSERT(categories.second > categories.first);
+    U_ASSERT((categories.first <  fDictCategoriesStart && categories.second <  fDictCategoriesStart) ||
+             (categories.first >= fDictCategoriesStart && categories.second >= fDictCategoriesStart));
+
      for (RangeDescriptor *rd = fRangeList; rd != nullptr; rd = rd->fNext) {
-        int32_t rangeNum = rd->fNum & ~RuleBasedBreakIterator::kDictBit;
-        int32_t rangeDict = rd->fNum & RuleBasedBreakIterator::kDictBit;
+        int32_t rangeNum = rd->fNum;
          if (rangeNum == categories.second) {
-            rd->fNum = categories.first | rangeDict;
+            rd->fNum = categories.first;
          } else if (rangeNum > categories.second) {
              rd->fNum--;
          }
      }
      --fGroupCount;
+    if (categories.second <= fDictCategoriesStart) {
+        --fDictCategoriesStart;
+    }
  }
  
  
@@ -395,6 +413,16 @@ int32_t  RBBISetBuilder::getNumCharCategories() const {
  }
  
  
+//------------------------------------------------------------------------
+//
+//   getDictCategoriesStart
+//
+//------------------------------------------------------------------------
+int32_t  RBBISetBuilder::getDictCategoriesStart() const {
+    return fDictCategoriesStart;
+}
+
+
  //------------------------------------------------------------------------
  //
  //   sawBOF
@@ -414,7 +442,7 @@ UBool  RBBISetBuilder::sawBOF() const {
  UChar32  RBBISetBuilder::getFirstChar(int32_t category) const {
      RangeDescriptor   *rlRange;
      UChar32            retVal = (UChar32)-1;
-    for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
+    for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
          if (rlRange->fNum == category) {
              retVal = rlRange->fStartChar;
              break;
@@ -424,7 +452,6 @@ UChar32  RBBISetBuilder::getFirstChar(int32_t category) const {
  }
  
  
-
  //------------------------------------------------------------------------
  //
  //   printRanges        A debugging function.
@@ -437,16 +464,16 @@ void RBBISetBuilder::printRanges() {
      int                    i;
  
      RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n");
-    for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
-        RBBIDebugPrintf("%2i  %4x-%4x  ", rlRange->fNum, rlRange->fStartChar, rlRange->fEndChar);
+    for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
+        RBBIDebugPrintf("%4x-%4x  ", rlRange->fStartChar, rlRange->fEndChar);
  
          for (i=0; i<rlRange->fIncludesSets->size(); i++) {
              RBBINode       *usetNode    = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
-            UnicodeString   setName = UNICODE_STRING("anon", 4);
+            UnicodeString   setName {u"anon"};
              RBBINode       *setRef = usetNode->fParent;
-            if (setRef != NULL) {
+            if (setRef != nullptr) {
                  RBBINode *varRef = setRef->fParent;
-                if (varRef != NULL  &&  varRef->fType == RBBINode::varRef) {
+                if (varRef != nullptr  &&  varRef->fType == RBBINode::varRef) {
                      setName = varRef->fText;
                  }
              }
@@ -466,19 +493,15 @@ void RBBISetBuilder::printRanges() {
  //------------------------------------------------------------------------
  #ifdef RBBI_DEBUG
  void RBBISetBuilder::printRangeGroups() {
-    RangeDescriptor       *rlRange;
-    RangeDescriptor       *tRange;
      int                    i;
-    int                    lastPrintedGroupNum = 0;
  
      RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n");
-    for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
-        int groupNum = rlRange->fNum & 0xbfff;
-        if (groupNum > lastPrintedGroupNum) {
-            lastPrintedGroupNum = groupNum;
+    for (RangeDescriptor *rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
+        if (rlRange->fFirstInGroup) {
+            int groupNum = rlRange->fNum;
              RBBIDebugPrintf("%2i  ", groupNum);
  
-            if (rlRange->fNum & RuleBasedBreakIterator::kDictBit) { RBBIDebugPrintf(" <DICT> ");}
+            if (groupNum >= fDictCategoriesStart) { RBBIDebugPrintf(" <DICT> ");}
  
              for (i=0; i<rlRange->fIncludesSets->size(); i++) {
                  RBBINode       *usetNode    = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
@@ -494,7 +517,7 @@ void RBBISetBuilder::printRangeGroups() {
              }
  
              i = 0;
-            for (tRange = rlRange; tRange != 0; tRange = tRange->fNext) {
+            for (RangeDescriptor *tRange = rlRange; tRange != nullptr; tRange = tRange->fNext) {
                  if (tRange->fNum == rlRange->fNum) {
                      if (i++ % 5 == 0) {
                          RBBIDebugPrintf("\n    ");
@@ -561,28 +584,22 @@ void RBBISetBuilder::printSets() {
  //
  //-------------------------------------------------------------------------------------
  
-RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) {
-    int  i;
-
-    this->fStartChar    = other.fStartChar;
-    this->fEndChar      = other.fEndChar;
-    this->fNum          = other.fNum;
-    this->fNext         = NULL;
-    UErrorCode oldstatus = status;
-    this->fIncludesSets = new UVector(status);
-    if (U_FAILURE(oldstatus)) {
-        status = oldstatus;
-    }
+RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) :
+        fStartChar(other.fStartChar), fEndChar {other.fEndChar}, fNum {other.fNum},
+        fIncludesDict{other.fIncludesDict}, fFirstInGroup{other.fFirstInGroup} {
+
      if (U_FAILURE(status)) {
          return;
      }
-    /* test for NULL */
-    if (this->fIncludesSets == 0) {
+    fIncludesSets = new UVector(status);
+    if (this->fIncludesSets == nullptr) {
          status = U_MEMORY_ALLOCATION_ERROR;
+    }
+    if (U_FAILURE(status)) {
          return;
      }
  
-    for (i=0; i<other.fIncludesSets->size(); i++) {
+    for (int32_t i=0; i<other.fIncludesSets->size(); i++) {
          this->fIncludesSets->addElement(other.fIncludesSets->elementAt(i), status);
      }
  }
@@ -594,24 +611,13 @@ RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &statu
  //
  //-------------------------------------------------------------------------------------
  RangeDescriptor::RangeDescriptor(UErrorCode &status) {
-    this->fStartChar    = 0;
-    this->fEndChar      = 0;
-    this->fNum          = 0;
-    this->fNext         = NULL;
-    UErrorCode oldstatus = status;
-    this->fIncludesSets = new UVector(status);
-    if (U_FAILURE(oldstatus)) {
-        status = oldstatus;
-    }
      if (U_FAILURE(status)) {
          return;
      }
-    /* test for NULL */
-    if(this->fIncludesSets == 0) {
+    fIncludesSets = new UVector(status);
+    if (fIncludesSets == nullptr) {
          status = U_MEMORY_ALLOCATION_ERROR;
-        return;
      }
-
  }
  
  
@@ -622,7 +628,7 @@ RangeDescriptor::RangeDescriptor(UErrorCode &status) {
  //-------------------------------------------------------------------------------------
  RangeDescriptor::~RangeDescriptor() {
      delete  fIncludesSets;
-    fIncludesSets = NULL;
+    fIncludesSets = nullptr;
  }
  
  //-------------------------------------------------------------------------------------
@@ -633,7 +639,7 @@ RangeDescriptor::~RangeDescriptor() {
  void RangeDescriptor::split(UChar32 where, UErrorCode &status) {
      U_ASSERT(where>fStartChar && where<=fEndChar);
      RangeDescriptor *nr = new RangeDescriptor(*this, status);
-    if(nr == 0) {
+    if(nr == nullptr) {
          status = U_MEMORY_ALLOCATION_ERROR;
          return;
      }
@@ -652,27 +658,22 @@ void RangeDescriptor::split(UChar32 where, UErrorCode &status) {
  
  //-------------------------------------------------------------------------------------
  //
-//   RangeDescriptor::setDictionaryFlag
+//   RangeDescriptor::isDictionaryRange
  //
-//            Character Category Numbers that include characters from
-//            the original Unicode Set named "dictionary" have bit 14
-//            set to 1.  The RBBI runtime engine uses this to trigger
-//            use of the word dictionary.
+//            Test whether this range includes characters from
+//            the original Unicode Set named "dictionary".
  //
-//            This function looks through the Unicode Sets that it
-//            (the range) includes, and sets the bit in fNum when
-//            "dictionary" is among them.
+//            This function looks through the Unicode Sets that
+//            the range includes, checking for one named "dictionary"
  //
  //            TODO:  a faster way would be to find the set node for
  //                   "dictionary" just once, rather than looking it
  //                   up by name every time.
  //
  //-------------------------------------------------------------------------------------
-void RangeDescriptor::setDictionaryFlag() {
-    int i;
-
+bool RangeDescriptor::isDictionaryRange() {
      static const char16_t *dictionary = u"dictionary";
-    for (i=0; i<fIncludesSets->size(); i++) {
+    for (int32_t i=0; i<fIncludesSets->size(); i++) {
          RBBINode *usetNode  = (RBBINode *)fIncludesSets->elementAt(i);
          RBBINode *setRef = usetNode->fParent;
          if (setRef != nullptr) {
@@ -680,16 +681,14 @@ void RangeDescriptor::setDictionaryFlag() {
              if (varRef && varRef->fType == RBBINode::varRef) {
                  const UnicodeString *setName = &varRef->fText;
                  if (setName->compare(dictionary, -1) == 0) {
-                    fNum |= RuleBasedBreakIterator::kDictBit;
-                    break;
+                    return true;
                  }
              }
          }
      }
+    return false;
  }
  
-
-
  U_NAMESPACE_END
  
  #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
diff --git a/icu4c/source/common/rbbisetb.h b/icu4c/source/common/rbbisetb.h

index cc031a2924d137e9dd42cc8fa6b525820efcb38e..6409a4ea57983220f8df3c8aaf1c49614d8ab1ab 100644 (file)
--- a/icu4c/source/common/rbbisetb.h
+++ b/icu4c/source/common/rbbisetb.h
@@ -41,25 +41,26 @@ U_NAMESPACE_BEGIN
  //
  class RangeDescriptor : public UMemory {
  public:
-    UChar32            fStartChar;      // Start of range, unicode 32 bit value.
-    UChar32            fEndChar;        // End of range, unicode 32 bit value.
-    int32_t            fNum;            // runtime-mapped input value for this range.
-    UVector           *fIncludesSets;   // vector of the the original
-                                        //   Unicode sets that include this range.
-                                        //    (Contains ptrs to uset nodes)
-    RangeDescriptor   *fNext;           // Next RangeDescriptor in the linked list.
+    UChar32            fStartChar {};            // Start of range, unicode 32 bit value.
+    UChar32            fEndChar {};              // End of range, unicode 32 bit value.
+    int32_t            fNum {0};                 // runtime-mapped input value for this range.
+    bool               fIncludesDict {false};    // True if the range includes $dictionary.
+    bool               fFirstInGroup {false};    // True if first range in a group with the same fNum.
+    UVector           *fIncludesSets {nullptr};  // vector of the original
+                                                 //   Unicode sets that include this range.
+                                                 //    (Contains ptrs to uset nodes)
+    RangeDescriptor   *fNext {nullptr};          // Next RangeDescriptor in the linked list.
  
      RangeDescriptor(UErrorCode &status);
      RangeDescriptor(const RangeDescriptor &other, UErrorCode &status);
      ~RangeDescriptor();
      void split(UChar32 where, UErrorCode &status);   // Spit this range in two at "where", with
                                          //   where appearing in the second (higher) part.
-    void setDictionaryFlag();           // Check whether this range appears as part of
+    bool isDictionaryRange();           // Check whether this range appears as part of
                                          //   the Unicode set named "dictionary"
  
-private:
-    RangeDescriptor(const RangeDescriptor &other); // forbid copying of this class
-    RangeDescriptor &operator=(const RangeDescriptor &other); // forbid copying of this class
+    RangeDescriptor(const RangeDescriptor &other) = delete; // forbid default copying of this class
+    RangeDescriptor &operator=(const RangeDescriptor &other) = delete; // forbid assigning of this class
  };
  
  
@@ -90,6 +91,8 @@ public:
      int32_t  getNumCharCategories() const;   // CharCategories are the same as input symbol set to the
                                               //    runtime state machine, which are the same as
                                               //    columns in the DFA state table
+    int32_t  getDictCategoriesStart() const; // First char category that includes $dictionary, or
+                                             // last category + 1 if there are no dictionary categories.
      int32_t  getTrieSize() /*const*/;        // Size in bytes of the serialized Trie.
      void     serializeTrie(uint8_t *where);  // write out the serialized Trie.
      UChar32  getFirstChar(int32_t  val) const;
@@ -113,8 +116,6 @@ public:
  #endif
  
  private:
-    void           numberSets();
-
      RBBIRuleBuilder       *fRB;             // The RBBI Rule Compiler that owns us.
      UErrorCode            *fStatus;
  
@@ -124,14 +125,13 @@ private:
      UCPTrie               *fTrie;           //  the Unicode Sets.
      uint32_t               fTrieSize;
  
-    // Groups correspond to character categories -
-    //       groups of ranges that are in the same original UnicodeSets.
-    //       fGroupCount is the index of the last used group.
-    //       fGroupCount+1 is also the number of columns in the RBBI state table being compiled.
-    //       State table column 0 is not used.  Column 1 is for end-of-input.
-    //       column 2 is for group 0.  Funny counting.
+    // Number of range groups, which are groups of ranges that are in the same original UnicodeSets.
      int32_t               fGroupCount;
  
+    // The number of the first dictionary char category.
+    // If there are no Dictionary categories, set to the last category + 1.
+    int32_t               fDictCategoriesStart;
+
      UBool                 fSawBOF;
  
      RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class
diff --git a/icu4c/source/common/rbbitblb.cpp b/icu4c/source/common/rbbitblb.cpp

index ebf1f858c56ee42a80a1b6b56c370ac7b6766423..09a6aaa01896a01dd03bf18b49f2da4e66bca21f 100644 (file)
--- a/icu4c/source/common/rbbitblb.cpp
+++ b/icu4c/source/common/rbbitblb.cpp
@@ -1155,7 +1155,13 @@ bool RBBITableBuilder::findDuplCharClassFrom(IntPair *categories) {
      int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
  
      for (; categories->first < numCols-1; categories->first++) {
-        for (categories->second=categories->first+1; categories->second < numCols; categories->second++) {
+        // Note: dictionary & non-dictionary columns cannot be merged.
+        //       The limitSecond value prevents considering mixed pairs.
+        //       Dictionary categories are >= DictCategoriesStart.
+        //       Non dict categories are   <  DictCategoriesStart.
+        int limitSecond = categories->first < fRB->fSetBuilder->getDictCategoriesStart() ?
+            fRB->fSetBuilder->getDictCategoriesStart() : numCols;
+        for (categories->second=categories->first+1; categories->second < limitSecond; categories->second++) {
              // Initialized to different values to prevent returning true if numStates = 0 (implies no duplicates).
              uint16_t table_base = 0;
              uint16_t table_dupl = 1;
@@ -1379,6 +1385,7 @@ void RBBITableBuilder::exportTable(void *where) {
      }
  
      table->fNumStates = fDStates->size();
+    table->fDictCategoriesStart = fRB->fSetBuilder->getDictCategoriesStart();
      table->fFlags     = 0;
      if (use8BitsForTable()) {
          table->fRowLen    = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount;
@@ -1652,12 +1659,12 @@ void RBBITableBuilder::printStates() {
      RBBIDebugPrintf("state |           i n p u t     s y m b o l s \n");
      RBBIDebugPrintf("      | Acc  LA    Tag");
      for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
-        RBBIDebugPrintf(" %2d", c);
+        RBBIDebugPrintf(" %3d", c);
      }
      RBBIDebugPrintf("\n");
      RBBIDebugPrintf("      |---------------");
      for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
-        RBBIDebugPrintf("---");
+        RBBIDebugPrintf("----");
      }
      RBBIDebugPrintf("\n");
  
@@ -1666,7 +1673,7 @@ void RBBITableBuilder::printStates() {
          RBBIDebugPrintf("  %3d | " , n);
          RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
          for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
-            RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c));
+            RBBIDebugPrintf(" %3d", sd->fDtran->elementAti(c));
          }
          RBBIDebugPrintf("\n");
      }
diff --git a/icu4c/source/common/unicode/rbbi.h b/icu4c/source/common/unicode/rbbi.h

index 843144064ab5100677ac2f59682b584efe901110..2ebe931b1c0217f371e1b9715c67393ba00a7e9e 100644 (file)
--- a/icu4c/source/common/unicode/rbbi.h
+++ b/icu4c/source/common/unicode/rbbi.h
@@ -677,10 +677,10 @@ private:
  
      typedef uint16_t (*PTrieFunc)(const UCPTrie *, UChar32);
  
-    template<typename RowType, PTrieFunc trieFunc, uint16_t dictMask>
+    template<typename RowType, PTrieFunc trieFunc>
      int32_t handleSafePrevious(int32_t fromPosition);
  
-    template<typename RowType, PTrieFunc trieFunc, uint16_t dictMask>
+    template<typename RowType, PTrieFunc trieFunc>
      int32_t handleNext();
  
  
@@ -705,17 +705,6 @@ private:
       * @internal
       */
      void dumpTables();
-
-    /**
-     * Bit for dictionary based category
-     */
-    static constexpr int32_t kDictBit = 0x4000;
-
-    /**
-     * Bit for dictionary based category in 8bits trie
-     */
-    static constexpr int32_t kDictBitFor8BitsTrie = 0x0080;
-
  #endif  /* U_HIDE_INTERNAL_API */
  };
  
diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp

index 002ab94893e08d23cc6e8308a2cf02763b2421d1..84c6cd782b3b3365a161c8a69b1b75c72440a966 100644 (file)
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@@ -4657,7 +4657,8 @@ void RBBITest::TestTableRedundancies() {
      }
      // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
      for (int c1=1; c1<numCharClasses; c1++) {
-        for (int c2 = c1+1; c2 < numCharClasses; c2++) {
+        int limit = c1 < (int)fwtbl->fDictCategoriesStart ? fwtbl->fDictCategoriesStart : numCharClasses;
+        for (int c2 = c1+1; c2 < limit; c2++) {
              if (columns.at(c1) == columns.at(c2)) {
                  errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
                  goto out;
@@ -4952,15 +4953,15 @@ void RBBITest::testTrieStateTable(int32_t numChar, bool expectedTrieWidthIn8Bits
  }
  
  void RBBITest::Test8BitsTrieWith8BitStateTable() {
-    testTrieStateTable(123, true /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
+    testTrieStateTable(251, true /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
  }
  
  void RBBITest::Test16BitsTrieWith8BitStateTable() {
-    testTrieStateTable(124, false /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
+    testTrieStateTable(252, false /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
  }
  
  void RBBITest::Test16BitsTrieWith16BitStateTable() {
-    testTrieStateTable(255, false /* expectedTrieWidthIn8Bits */, false /* expectedStateRowIn8Bits */);
+    testTrieStateTable(253, false /* expectedTrieWidthIn8Bits */, false /* expectedStateRowIn8Bits */);
  }
  
  void RBBITest::Test8BitsTrieWith16BitStateTable() {
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/RBBIDataWrapper.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/RBBIDataWrapper.java

index c0375bdb4f645b4e2b2b54dbf2569bd8f7a9263b..fd81e6ecff5f5421ac4e261a911597fdf0a997d2 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/RBBIDataWrapper.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/RBBIDataWrapper.java
@@ -41,10 +41,20 @@ public final class RBBIDataWrapper {
           * Length of a table row in bytes. Note mismatch with table data, which is short[].
           */
          public int     fRowLen;
+        /**
+         * Char category number of the first dictionary char class,
+         * or the largest category number + 1 if there are no dictionary categories.
+         */
+        public int     fDictCategoriesStart;
          /**
           * Option Flags for this state table.
           */
          public int     fFlags;
+        /**
+         * Length in bytes of the state table header, of all the int32 fields
+         * preceding fTable in the serialized form.
+         */
+        public static int fHeaderSize = 16;
          /**
           * Linear array of next state values, accessed as short[state, char_class]
           */
@@ -57,14 +67,15 @@ public final class RBBIDataWrapper {
              if (length == 0) {
                  return null;
              }
-            if (length < 12) {
+            if (length < fHeaderSize) {
                  throw new IOException("Invalid RBBI state table length.");
              }
              RBBIStateTable This = new RBBIStateTable();
              This.fNumStates = bytes.getInt();
              This.fRowLen    = bytes.getInt();
+            This.fDictCategoriesStart = bytes.getInt();
              This.fFlags     = bytes.getInt();
-            int lengthOfTable = length - 12;   // length in bytes.
+            int lengthOfTable = length - fHeaderSize;   // length in bytes.
              boolean use8Bits = (This.fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) == RBBIDataWrapper.RBBI_8BITS_ROWS;
              if (use8Bits) {
                  This.fTable = new char[lengthOfTable];
@@ -82,6 +93,7 @@ public final class RBBIDataWrapper {
          public int put(DataOutputStream bytes) throws IOException {
              bytes.writeInt(fNumStates);
              bytes.writeInt(fRowLen);
+            bytes.writeInt(fDictCategoriesStart);
              bytes.writeInt(fFlags);
              if ((fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) == RBBIDataWrapper.RBBI_8BITS_ROWS) {
                  int tableLen = fRowLen * fNumStates;  // fRowLen is bytes.
@@ -95,8 +107,8 @@ public final class RBBIDataWrapper {
                      bytes.writeChar(fTable[i]);
                  }
              }
-            int bytesWritten = 12 + fRowLen * fNumStates;   // total bytes written,
-                                                            // including 12 for the header.
+            int bytesWritten = fHeaderSize + fRowLen * fNumStates;   // total bytes written,
+                                                                     // including the header.
              while (bytesWritten % 8 != 0) {
                  bytes.writeByte(0);
                  ++bytesWritten;
@@ -118,6 +130,7 @@ public final class RBBIDataWrapper {
              RBBIStateTable otherST = (RBBIStateTable)other;
              if (fNumStates != otherST.fNumStates) return false;
              if (fRowLen    != otherST.fRowLen)    return false;
+            if (fDictCategoriesStart != otherST.fDictCategoriesStart) return false;
              if (fFlags     != otherST.fFlags)     return false;
              return Arrays.equals(fTable, otherST.fTable);
          }
@@ -216,9 +229,6 @@ public final class RBBIDataWrapper {
      public final static int      RBBI_BOF_REQUIRED         = 2;
      public final static int      RBBI_8BITS_ROWS           = 4;
  
-    public final static int      DICT_BIT                  = 0x4000;
-    public final static int      DICT_BIT_FOR_8BITS_TRIE   = 0x0080;
-
      /**
       * Data Header.  A struct-like class with the fields from the RBBI data file header.
       * Not intended for public use, declared public for testing purposes only.
@@ -496,7 +506,6 @@ public final class RBBIDataWrapper {
          int      char32;
          int      category;
          int      lastNewline[] = new int[n+1];
-        int      dictMask = fTrie.getValueWidth() ==  CodePointTrie.ValueWidth.BITS_8 ? DICT_BIT_FOR_8BITS_TRIE : DICT_BIT;
  
          for (category = 0; category <= fHeader.fCatCount; category ++) {
              catStrings[category] = "";
@@ -505,7 +514,6 @@ public final class RBBIDataWrapper {
          out.println("--------------------");
          for (char32 = 0; char32<=0x10ffff; char32++) {
              category = fTrie.get(char32);
-            category &= ~dictMask;            // Mask off dictionary bit.
              if (category < 0 || category > fHeader.fCatCount) {
                  out.println("Error, bad category " + Integer.toHexString(category) +
                          " for char " + Integer.toHexString(char32));
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java

index dfe3d2adddddb8b977f3d379f8d09cb4c53fc358..7f3b2e665f2e87d2616d29cb8b163e068b25b577 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java
@@ -67,7 +67,7 @@ class RBBIRuleBuilder {
      //
      // Status {tag} values.   These structures are common to all of the rule sets (Forward, Reverse, etc.).
      //
-    Map<Set<Integer>, Integer> fStatusSets = new HashMap<Set<Integer>, Integer>(); // Status value sets encountered so far.
+    Map<Set<Integer>, Integer> fStatusSets = new HashMap<>(); // Status value sets encountered so far.
                                                                                     //  Map Key is the set of values.
                                                                                     //  Map Value is the runtime array index.
  
@@ -146,8 +146,8 @@ class RBBIRuleBuilder {
                              ICUDebug.value("rbbi") : null;
          fRules          = rules;
          fStrippedRules  = new StringBuilder(rules);
-        fUSetNodes      = new ArrayList<RBBINode>();
-        fRuleStatusVals = new ArrayList<Integer>();
+        fUSetNodes      = new ArrayList<>();
+        fRuleStatusVals = new ArrayList<>();
          fScanner        = new RBBIRuleScanner(this);
          fSetBuilder     = new RBBISetBuilder(this);
      }
@@ -294,9 +294,7 @@ class RBBIRuleBuilder {
  
          //
          // UnicodeSet processing.
-        //    Munge the Unicode Sets to create a set of character categories.
-        //    Generate the mapping tables (TRIE) from input code points to
-        //    the character categories.
+        //    Munge the Unicode Sets to create an initial set of character categories.
          //
          fSetBuilder.buildRanges();
  
@@ -305,6 +303,10 @@ class RBBIRuleBuilder {
          //
          fForwardTable = new RBBITableBuilder(this, fForwardTree);
          fForwardTable.buildForwardTable();
+        // State table and character category optimization.
+        // Merge equivalent rows and columns.
+        // Note that this process alters the initial set of character categories,
+        // causing the representation of UnicodeSets in the parse tree to become invalid.
          optimizeTables();
          fForwardTable.buildSafeReverseTable();
  
@@ -315,7 +317,9 @@ class RBBIRuleBuilder {
              fForwardTable.printRuleStatusTable();
              fForwardTable.printReverseTable();
          }
-
+        //    Generate the mapping tables (TRIE) from input code points to
+        //    the character categories.
+        //
          fSetBuilder.buildTrie();
          //
          //   Package up the compiled data, writing it to an output stream
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBISetBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBISetBuilder.java

index e7189a58c803541c9c146cfb6ff8a1c0c990b7f5..fa0c73325b61341d7cbdf85925acc2cc26f79fde 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBISetBuilder.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBISetBuilder.java
@@ -29,7 +29,7 @@ import com.ibm.icu.util.MutableCodePointTrie;
  //                      by the RBBI rules.
  //                   -  compute a set of non-overlapping character ranges
  //                      with all characters within a range belonging to the same
-//                      set of input uniocde sets.
+//                      set of input unicode sets.
  //                   -  Derive a set of non-overlapping UnicodeSet (like things)
  //                      that will correspond to columns in the state table for
  //                      the RBBI execution engine.  All characters within one
@@ -41,23 +41,27 @@ import com.ibm.icu.util.MutableCodePointTrie;
  //
  class RBBISetBuilder {
      static class RangeDescriptor  {
-           int                fStartChar;      // Start of range, unicode 32 bit value.
-           int                fEndChar;        // End of range, unicode 32 bit value.
-           int                fNum;            // runtime-mapped input value for this range.
-           List<RBBINode>     fIncludesSets;    // vector of the the original
-                                                 //   Unicode sets that include this range.
-                                                //    (Contains ptrs to uset nodes)
-            RangeDescriptor   fNext;           // Next RangeDescriptor in the linked list.
+           int                fStartChar = 0;         // Start of range, unicode 32 bit value.
+           int                fEndChar = 0;           // End of range, unicode 32 bit value.
+           int                fNum = 0;               // runtime-mapped input value for this range.
+           boolean            fIncludesDict = false;  // True if the range includes $dictionary.
+           boolean            fFirstInGroup = false;  // True if first range in a group with the same fNum.
+           List<RBBINode>     fIncludesSets;          // vector of the original
+                                                      //   Unicode sets that include this range.
+                                                      //    (Contains ptrs to uset nodes)
+            RangeDescriptor   fNext;                  // Next RangeDescriptor in the linked list.
  
              RangeDescriptor() {
-                fIncludesSets = new ArrayList<RBBINode>();
+                fIncludesSets = new ArrayList<>();
              }
  
              RangeDescriptor(RangeDescriptor other) {
                  fStartChar = other.fStartChar;
                  fEndChar   = other.fEndChar;
                  fNum       = other.fNum;
-                fIncludesSets = new ArrayList<RBBINode>(other.fIncludesSets);
+                fIncludesDict = other.fIncludesDict;
+                fFirstInGroup = other.fFirstInGroup;
+                fIncludesSets = new ArrayList<>(other.fIncludesSets);
              }
  
              //-------------------------------------------------------------------------------------
@@ -82,28 +86,18 @@ class RBBISetBuilder {
              }
  
  
-            //-------------------------------------------------------------------------------------
-            //
-            //          RangeDescriptor::setDictionaryFlag
-            //
-            //          Character Category Numbers that include characters from
-            //          the original Unicode Set named "dictionary" have bit 14
-            //          set to 1.  The RBBI runtime engine uses this to trigger
-            //          use of the word dictionary.
-            //
-            //          This function looks through the Unicode Sets that it
-            //          (the range) includes, and sets the bit in fNum when
-            //          "dictionary" is among them.
-            //
+            /**
+             * Test whether this range includes characters from the original Unicode Set named "dictionary".
+             *
+             * This function looks through the Unicode Sets that
+             * the range includes, checking for one named "dictionary".
+             */
              //          TODO:  a faster way would be to find the set node for
              //          "dictionary" just once, rather than looking it
              //          up by name every time.
              //
-            // -------------------------------------------------------------------------------------
-            void setDictionaryFlag() {
-                int i;
-
-                for (i=0; i<this.fIncludesSets.size(); i++) {
+            boolean isDictionaryRange() {
+                for (int i=0; i<this.fIncludesSets.size(); i++) {
                      RBBINode        usetNode    = fIncludesSets.get(i);
                      String          setName = "";
                      RBBINode        setRef = usetNode.fParent;
@@ -114,11 +108,10 @@ class RBBISetBuilder {
                          }
                      }
                      if (setName.equals("dictionary")) {
-                        this.fNum |= DICT_BIT;
-                        break;
+                        return true;
                      }
                  }
-
+                return false;
          }
      }
  
@@ -130,19 +123,18 @@ class RBBISetBuilder {
                                             //  the Unicode Sets.
      CodePointTrie         fFrozenTrie;
  
-    // Groups correspond to character categories -
-    //       groups of ranges that are in the same original UnicodeSets.
-    //       fGroupCount is the index of the last used group.
-    //       fGroupCount+1 is also the number of columns in the RBBI state table being compiled.
-    //       State table column 0 is not used.  Column 1 is for end-of-input.
-    //       column 2 is for group 0.  Funny counting.
+    /**
+     * Number of range groups, which are groups of ranges that are in the same original UnicodeSets.
+     */
      int                fGroupCount;
+    /**
+     * The number of the first dictionary char category.
+     * If there are no Dictionary categories, set to the last category + 1.
+     */
+    int                fDictCategoriesStart;
  
      boolean             fSawBOF;
  
-    static final int    DICT_BIT = 0x4000;
-    static final int    DICT_BIT_FOR_8BITS_TRIE  = 0x0080;
-
  
      //------------------------------------------------------------------------
      //
@@ -239,25 +231,49 @@ class RBBISetBuilder {
          //
          //    Numbering: # 0  (state table column 0) is unused.
          //               # 1  is reserved - table column 1 is for end-of-input
-        //               # 2  is reserved - table column 2 is for beginning-in-input
+        //               # 2  is reserved - table column 2 is for beginning-of-input
          //               # 3  is the first range list.
          //
          RangeDescriptor rlSearchRange;
+        int dictGroupCount = 0;
+
          for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
              for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange.fNext) {
                  if (rlRange.fIncludesSets.equals(rlSearchRange.fIncludesSets)) {
                      rlRange.fNum = rlSearchRange.fNum;
+                    rlRange.fIncludesDict = rlSearchRange.fIncludesDict;
                      break;
                  }
              }
              if (rlRange.fNum == 0) {
-                fGroupCount ++;
-                rlRange.fNum = fGroupCount+2;
-                rlRange.setDictionaryFlag();
-                addValToSets(rlRange.fIncludesSets, fGroupCount+2);
+                rlRange.fFirstInGroup = true;
+                if (rlRange.isDictionaryRange()) {
+                    rlRange.fNum = ++dictGroupCount;
+                    rlRange.fIncludesDict = true;
+                } else {
+                    fGroupCount++;
+                    rlRange.fNum = fGroupCount + 2;
+                    addValToSets(rlRange.fIncludesSets, fGroupCount + 2);
+                }
              }
          }
  
+        // Move the character category numbers for any dictionary ranges up, so that they
+        // immediately follow the non-dictionary ranges.
+
+        fDictCategoriesStart = fGroupCount + 3;
+        for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
+            if (rlRange.fIncludesDict) {
+                rlRange.fNum += fDictCategoriesStart - 1;
+                if (rlRange.fFirstInGroup) {
+                    addValToSets(rlRange.fIncludesSets, rlRange.fNum);
+                }
+            }
+        }
+        fGroupCount += dictGroupCount;
+
+
+
          // Handle input sets that contain the special string {eof}.
          //   Column 1 of the state table is reserved for EOF on input.
          //   Column 2 is reserved for before-the-start-input.
@@ -288,31 +304,21 @@ class RBBISetBuilder {
      }
  
  
-    private static final int MAX_CHAR_CATEGORIES_FOR_8BITS_TRIE = 127;
+    private static final int MAX_CHAR_CATEGORIES_FOR_8BITS_TRIE = 255;
  
      /**
       * Build the Trie table for mapping UChar32 values to the corresponding
       * range group number.
       */
      void buildTrie() {
-        boolean use8Bits = getNumCharCategories() <= MAX_CHAR_CATEGORIES_FOR_8BITS_TRIE;
-        RangeDescriptor rlRange;
-
          fTrie = new MutableCodePointTrie(0,       //   Initial value for all code points.
                                           0);      //   Error value for out-of-range input.
  
-        for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
-            int value = rlRange.fNum;
-            if (use8Bits && ((value & DICT_BIT) != 0)) {
-                assert((value & DICT_BIT_FOR_8BITS_TRIE) == 0);
-                // switch to the bit from DICT_BIT to DICT_BIT_FOR_8BITS_TRIE
-                value = DICT_BIT_FOR_8BITS_TRIE | (value & ~DICT_BIT);
-            }
-            fTrie.setRange(
-                    rlRange.fStartChar,     // Range start
-                    rlRange.fEndChar,       // Range end (inclusive)
-                    value                  // value for range
-                    );
+        for (RangeDescriptor rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
+            fTrie.setRange(rlRange.fStartChar,     // Range start
+                           rlRange.fEndChar,       // Range end (inclusive)
+                           rlRange.fNum            // value for range
+                          );
          }
      }
  
@@ -324,16 +330,20 @@ class RBBISetBuilder {
      void mergeCategories(IntPair categories) {
          assert(categories.first >= 1);
          assert(categories.second > categories.first);
+        assert((categories.first <  fDictCategoriesStart && categories.second <  fDictCategoriesStart) ||
+                (categories.first >= fDictCategoriesStart && categories.second >= fDictCategoriesStart));
          for (RangeDescriptor rd = fRangeList; rd != null; rd = rd.fNext) {
-            int rangeNum = rd.fNum & ~DICT_BIT;
-            int rangeDict = rd.fNum & DICT_BIT;
+            int rangeNum = rd.fNum;
              if (rangeNum == categories.second) {
-                rd.fNum = categories.first | rangeDict;
+                rd.fNum = categories.first;
              } else if (rangeNum > categories.second) {
                  rd.fNum--;
              }
          }
          --fGroupCount;
+        if (categories.second <= fDictCategoriesStart) {
+            --fDictCategoriesStart;
+        }
      }
  
      //-----------------------------------------------------------------------------------
@@ -425,6 +435,16 @@ class RBBISetBuilder {
      }
  
  
+    //------------------------------------------------------------------------
+    //
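+    //   getDictCategoriesStart    Return the number of the first dictionary char category,
+    //                             or the last category + 1 if there are no dictionary categories.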
+    //------------------------------------------------------------------------
+    int  getDictCategoriesStart() {
+        return fDictCategoriesStart;
+    }
+
+
      //------------------------------------------------------------------------
      //
      //           sawBOF
@@ -454,7 +474,6 @@ class RBBISetBuilder {
      }
  
  
-
      //------------------------------------------------------------------------
      //
      //           printRanges        A debugging function.
@@ -468,7 +487,7 @@ class RBBISetBuilder {
  
          System.out.print("\n\n Nonoverlapping Ranges ...\n");
          for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
-            System.out.print(" " + rlRange.fNum + "   " + rlRange.fStartChar + "-" + rlRange.fEndChar);
+            System.out.printf("%04x-%04x ", rlRange.fStartChar, rlRange.fEndChar);
  
              for (i=0; i<rlRange.fIncludesSets.size(); i++) {
                  RBBINode       usetNode    = rlRange.fIncludesSets.get(i);
@@ -496,20 +515,16 @@ class RBBISetBuilder {
      //------------------------------------------------------------------------
      ///CLOVER:OFF
      void printRangeGroups() {
-        RangeDescriptor       rlRange;
-        RangeDescriptor       tRange;
          int                    i;
-        int                    lastPrintedGroupNum = 0;
  
          System.out.print("\nRanges grouped by Unicode Set Membership...\n");
-        for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
-            int groupNum = rlRange.fNum & 0xbfff;
-            if (groupNum > lastPrintedGroupNum) {
-                lastPrintedGroupNum = groupNum;
+        for (RangeDescriptor rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
+            if (rlRange.fFirstInGroup) {
+                int groupNum = rlRange.fNum;
                  if (groupNum<10) {System.out.print(" ");}
                  System.out.print(groupNum + " ");
  
-                if ((rlRange.fNum & DICT_BIT) != 0) { System.out.print(" <DICT> ");}
+                if (groupNum >= fDictCategoriesStart) { System.out.print(" <DICT> ");}
  
                  for (i=0; i<rlRange.fIncludesSets.size(); i++) {
                      RBBINode       usetNode    = rlRange.fIncludesSets.get(i);
@@ -525,7 +540,7 @@ class RBBISetBuilder {
                  }
  
                  i = 0;
-                for (tRange = rlRange; tRange != null; tRange = tRange.fNext) {
+                for (RangeDescriptor tRange = rlRange; tRange != null; tRange = tRange.fNext) {
                      if (tRange.fNum == rlRange.fNum) {
                          if (i++ % 5 == 0) {
                              System.out.print("\n    ");
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java

index 7e9ab11a353b0cc13e7d024b638ebabe75792eb6..f981115532568fd5f3f9e9e5f30f602ddb530d69 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java
@@ -905,7 +905,13 @@ class RBBITableBuilder {
             int table_base = 0;
             int table_dupl = 0;
             for (; categories.first < numCols-1; ++categories.first) {
-               for (categories.second=categories.first+1; categories.second < numCols; ++categories.second) {
+               // Note: dictionary & non-dictionary columns cannot be merged.
+               //       The limitSecond value prevents considering mixed pairs.
+               //       Dictionary categories are >= DictCategoriesStart.
+               //       Non dict categories are   <  DictCategoriesStart.
+               int limitSecond = categories.first < fRB.fSetBuilder.getDictCategoriesStart() ?
+                   fRB.fSetBuilder.getDictCategoriesStart() : numCols;
+               for (categories.second=categories.first+1; categories.second < limitSecond; ++categories.second) {
                     for (int state=0; state<numStates; state++) {
                         RBBIStateDescriptor sd = fDStates.get(state);
                         table_base = sd.fDtran[categories.first];
@@ -1103,7 +1109,7 @@ class RBBITableBuilder {
             if (fRB.fTreeRoots[fRootIx] == null) {
                 return 0;
             }
-           int size    = 12;    // The header of 4 ints, with no rows to the table.
+           int size    = RBBIDataWrapper.RBBIStateTable.fHeaderSize;    // The header, with no rows to the table.
             int numRows = fDStates.size();
             int numCols = fRB.fSetBuilder.getNumCharCategories();
             boolean use8Bits = numRows <= MAX_STATE_FOR_8BITS_TABLE;
@@ -1132,17 +1138,18 @@ class RBBITableBuilder {
             Assert.assrt(fRB.fSetBuilder.getNumCharCategories() < 0x7fff &&
                 fDStates.size() < 0x7fff);
             table.fNumStates = fDStates.size();
+           table.fDictCategoriesStart = fRB.fSetBuilder.getDictCategoriesStart();
             boolean use8Bits = table.fNumStates <= MAX_STATE_FOR_8BITS_TABLE;
  
             // Size of table size in shorts.
             int rowLen = RBBIDataWrapper.NEXTSTATES + fRB.fSetBuilder.getNumCharCategories();   // Row Length in shorts.
             int tableSize;
             if (use8Bits) {
-               tableSize = (getTableSize() - 12);       // fTable length in bytes.
+               tableSize = (getTableSize() - RBBIDataWrapper.RBBIStateTable.fHeaderSize);       // fTable length in bytes.
                 table.fTable = new char[tableSize];
                 table.fRowLen = rowLen;                          // Row length in bytes.
             } else {
-               tableSize = (getTableSize() - 12) / 2;   // fTable length in shorts.
+               tableSize = (getTableSize() - RBBIDataWrapper.RBBIStateTable.fHeaderSize) / 2;   // fTable length in shorts.
                 table.fTable = new char[tableSize];
                 table.fRowLen = rowLen * 2;                      // Row length in bytes.
             }
@@ -1275,7 +1282,7 @@ class RBBITableBuilder {
             if (fSafeTable == null) {
                 return 0;
             }
-           int size    = 12;    // The header of 4 ints, with no rows to the table.
+           int size    = RBBIDataWrapper.RBBIStateTable.fHeaderSize;    // The header, with no rows to the table.
             int numRows = fSafeTable.size();
             int numCols = fSafeTable.get(0).length;
             boolean use8Bits = numRows <= MAX_STATE_FOR_8BITS_TABLE;
@@ -1303,7 +1310,7 @@ class RBBITableBuilder {
             int rowLen = RBBIDataWrapper.NEXTSTATES + numCharCategories;
             // TODO: tableSize is basically numStates * numCharCategories,
             //       except for alignment padding. Clean up here, and in main exportTable().
-           int tableSize = (getSafeTableSize() - 12);           // fTable length in bytes.
+           int tableSize = (getSafeTableSize() - RBBIDataWrapper.RBBIStateTable.fHeaderSize);     // fTable length in bytes.
             if (use8Bits) {
                 table.fFlags  |= RBBIDataWrapper.RBBI_8BITS_ROWS;
                 table.fTable = new char[tableSize];
@@ -1357,12 +1364,12 @@ class RBBITableBuilder {
             System.out.print("state |           i n p u t     s y m b o l s \n");
             System.out.print("      | Acc  LA    Tag");
             for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {
-               RBBINode.printInt(c, 3);
+               RBBINode.printInt(c, 4);
             }
             System.out.print("\n");
             System.out.print("      |---------------");
             for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {
-               System.out.print("---");
+               System.out.print("----");
             }
             System.out.print("\n");
  
@@ -1376,7 +1383,7 @@ class RBBITableBuilder {
                 RBBINode.printInt(sd.fTagsIdx, 6);
                 System.out.print(" ");
                 for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {
-                   RBBINode.printInt(sd.fDtran[c], 3);
+                   RBBINode.printInt(sd.fDtran[c], 4);
                 }
                 System.out.print("\n");
             }
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java

index 5832dcd7a7ff867ca57a4a07261c5c43d42d43b8..8206eadd314de76d5d3c0e644f56c8bcdc3eef07 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java
@@ -843,9 +843,8 @@ public class RuleBasedBreakIterator extends BreakIterator {
          int row             = fRData.getRowIndex(state);
          short category      = 3;
          int flagsState      = fRData.fFTable.fFlags;
+        int dictStart       = fRData.fFTable.fDictCategoriesStart;
          int mode            = RBBI_RUN;
-        int dictMask = fRData.fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8 ?
-            RBBIDataWrapper.DICT_BIT_FOR_8BITS_TRIE : RBBIDataWrapper.DICT_BIT;
          if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
              category = 2;
              mode     = RBBI_START;
@@ -882,15 +881,9 @@ public class RuleBasedBreakIterator extends BreakIterator {
                  //
                  category = (short) trie.get(c);
  
-                // Check the dictionary bit in the character's category.
-                //    Counter is only used by dictionary based iterators (subclasses).
-                //    Chars that need to be handled by a dictionary have a flag bit set
-                //    in their category values.
-                //
-                if ((category & dictMask) != 0)  {
+                // Check for categories that require word dictionary handling.
+                if (category >= dictStart) {
                      fDictionaryCharCount++;
-                    //  And off the dictionary flag bit.
-                    category &= ~dictMask;
                  }
  
                  if (TRACE) {
@@ -1004,9 +997,6 @@ public class RuleBasedBreakIterator extends BreakIterator {
          CharacterIterator text = fText;
          CodePointTrie trie = fRData.fTrie;
          char[] stateTable  = fRData.fRTable.fTable;
-        int flagsState      = fRData.fRTable.fFlags;
-        int dictMask = fRData.fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8 ?
-            RBBIDataWrapper.DICT_BIT_FOR_8BITS_TRIE : RBBIDataWrapper.DICT_BIT;
  
          CISetIndex32(text, fromPosition);
          if (TRACE) {
@@ -1032,7 +1022,6 @@ public class RuleBasedBreakIterator extends BreakIterator {
              //
              //  And off the dictionary flag bit. For reverse iteration it is not used.
              category = (short) trie.get(c);
-            category &= ~dictMask;
              if (TRACE) {
                  System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
                  System.out.print(RBBIDataWrapper.intToHexString(c, 10));
@@ -1212,8 +1201,6 @@ public class RuleBasedBreakIterator extends BreakIterator {
              int         category;
              int         current;
              int         foundBreakCount = 0;
-            int dictMask = fRData.fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8 ?
-                RBBIDataWrapper.DICT_BIT_FOR_8BITS_TRIE : RBBIDataWrapper.DICT_BIT;
  
              // Loop through the text, looking for ranges of dictionary characters.
              // For each span, find the appropriate break engine, and ask it to find
@@ -1222,9 +1209,10 @@ public class RuleBasedBreakIterator extends BreakIterator {
              fText.setIndex(rangeStart);
              int     c = CharacterIteration.current32(fText);
              category = (short)fRData.fTrie.get(c);
+            int dictStart = fRData.fFTable.fDictCategoriesStart;
  
              while(true) {
-                while((current = fText.getIndex()) < rangeEnd && (category & dictMask) == 0) {
+                while((current = fText.getIndex()) < rangeEnd && (category < dictStart)) {
                      c = CharacterIteration.next32(fText);    // pre-increment
                      category = (short)fRData.fTrie.get(c);
                  }
diff --git a/icu4j/main/shared/data/icudata.jar b/icu4j/main/shared/data/icudata.jar

index 0d7cbef46a5a28601b65f10fa78f576501d8bfee..158ef449747885c168be4ffaadc23522bb239b2c 100644 (file)
--- a/icu4j/main/shared/data/icudata.jar
+++ b/icu4j/main/shared/data/icudata.jar
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
-oid sha256:bdf00a19b05bc52e17c2aea74e87cc1872a824d5a9cced226078c46a194a8799
-size 13141762
+oid sha256:53e4c3251f31233ffcfe3ff4229ea43d81422a3fa071ee774ed835e5e969d22c
+size 13142859
diff --git a/icu4j/main/shared/data/icutzdata.jar b/icu4j/main/shared/data/icutzdata.jar

index 8fd2b94f214a3cb8b1cd716c1ecddf6639a48593..f80547f225e754113b4d09771940973592d34df1 100644 (file)
--- a/icu4j/main/shared/data/icutzdata.jar
+++ b/icu4j/main/shared/data/icutzdata.jar
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
-oid sha256:6d2882ccb44134313ff0365eb24776d4e859fa9dd223f10d608d65fdfd7f23d9
+oid sha256:72b712d8d19a5aa8d1cb36f070337010c29595c63d917cf81e3213a5ea5be2e7
  size 94529
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java

index 493098ad605c68575c158bdbcd1833fd5be7a859..504236095a5e4148c10b123acfc1ea2023f37001 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java
@@ -408,7 +408,7 @@ public class RBBITest extends TestFmwk {
              }
          }
  
-        List<Thread> threads = new ArrayList<Thread>();
+        List<Thread> threads = new ArrayList<>();
          for (int n = 0; n<4; ++n) {
              threads.add(new Thread(new WorkerThread()));
          }
@@ -513,7 +513,7 @@ public class RBBITest extends TestFmwk {
          }
          private static final BreakIterator BREAK_ITERATOR_CACHE = BreakIterator.getWordInstance(ULocale.ROOT);
          public static List<Integer> getBoundary(String toParse) {
-            List<Integer> retVal = new ArrayList<Integer>();
+            List<Integer> retVal = new ArrayList<>();
              BreakIterator bi = (BreakIterator) BREAK_ITERATOR_CACHE.clone();
              bi.setText(toParse);
              for (int boundary=bi.first(); boundary != BreakIterator.DONE; boundary = bi.next()) {
@@ -579,19 +579,20 @@ public class RBBITest extends TestFmwk {
          int numCharClasses = dw.fHeader.fCatCount;
  
          // Check for duplicate columns (character categories)
-        List<String> columns = new ArrayList<String>();
+        List<String> columns = new ArrayList<>();
          for (int column=0; column<numCharClasses; column++) {
              StringBuilder s = new StringBuilder();
              for (int r = 1; r < fwtbl.fNumStates; r++) {
                  int row = dw.getRowIndex(r);
                  char tableVal = fwtbl.fTable[row + RBBIDataWrapper.NEXTSTATES + column];
-                s.append((char)tableVal);
+                s.append(tableVal);
              }
              columns.add(s.toString());
          }
          // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
          for (int c1=1; c1<numCharClasses; c1++) {
-            for (int c2 = c1+1; c2 < numCharClasses; c2++) {
+            int limit = c1 < fwtbl.fDictCategoriesStart ? fwtbl.fDictCategoriesStart : numCharClasses;
+            for (int c2 = c1+1; c2 < limit; c2++) {
                  assertFalse(String.format("Duplicate columns (%d, %d)", c1, c2), columns.get(c1).equals(columns.get(c2)));
                  // if (columns.get(c1).equals(columns.get(c2))) {
                  //    System.out.printf("Duplicate columns (%d, %d)\n", c1, c2);
@@ -600,7 +601,7 @@ public class RBBITest extends TestFmwk {
          }
  
          // Check for duplicate states.
-        List<String> rows = new ArrayList<String>();
+        List<String> rows = new ArrayList<>();
          for (int r=0; r<fwtbl.fNumStates; r++) {
              StringBuilder s = new StringBuilder();
              int row = dw.getRowIndex(r);
@@ -643,7 +644,7 @@ public class RBBITest extends TestFmwk {
      public void TestTableRebuild() {
          // Test to verify that rebuilding the state tables from rule source for the standard
          // break iterator types yields the same tables as are imported from ICU4C as part of the default data.
-        List<RuleBasedBreakIterator> breakIterators = new ArrayList<RuleBasedBreakIterator>();
+        List<RuleBasedBreakIterator> breakIterators = new ArrayList<>();
          breakIterators.add((RuleBasedBreakIterator)BreakIterator.getCharacterInstance(Locale.ENGLISH));
          breakIterators.add((RuleBasedBreakIterator)BreakIterator.getWordInstance(Locale.ENGLISH));
          breakIterators.add((RuleBasedBreakIterator)BreakIterator.getSentenceInstance(Locale.ENGLISH));
@@ -723,17 +724,17 @@ public class RBBITest extends TestFmwk {
  
      @Test
      public void Test8BitsTrieWith8BitStateTable() {
-        testTrieStateTable(123,  true /* expectUCPTrieValueWidthIn8Bits */,  true /* expectStateRowIn8Bits */);
+        testTrieStateTable(251,  true /* expectUCPTrieValueWidthIn8Bits */,  true /* expectStateRowIn8Bits */);
      }
  
      @Test
      public void Test16BitsTrieWith8BitStateTable() {
-        testTrieStateTable(124, false /* expectUCPTrieValueWidthIn8Bits */,  true /* expectStateRowIn8Bits */);
+        testTrieStateTable(252, false /* expectUCPTrieValueWidthIn8Bits */,  true /* expectStateRowIn8Bits */);
      }
  
      @Test
      public void Test16BitsTrieWith16BitStateTable() {
-        testTrieStateTable(255, false /* expectUCPTrieValueWidthIn8Bits */, false /* expectStateRowIn8Bits */);
+        testTrieStateTable(253, false /* expectUCPTrieValueWidthIn8Bits */, false /* expectStateRowIn8Bits */);
      }
  
      @Test
author	Andy Heninger <andy.heninger@gmail.com>
	Tue, 9 Jun 2020 20:19:17 +0000 (13:19 -0700)
committer	Andy Heninger <andy.heninger@gmail.com>
	Wed, 17 Jun 2020 19:00:14 +0000 (12:00 -0700)
icu4c/source/common/rbbi.cpp		patch \| blob \| history
icu4c/source/common/rbbi_cache.cpp		patch \| blob \| history
icu4c/source/common/rbbidata.h		patch \| blob \| history
icu4c/source/common/rbbirb.cpp		patch \| blob \| history
icu4c/source/common/rbbisetb.cpp		patch \| blob \| history
icu4c/source/common/rbbisetb.h		patch \| blob \| history
icu4c/source/common/rbbitblb.cpp		patch \| blob \| history
icu4c/source/common/unicode/rbbi.h		patch \| blob \| history
icu4c/source/test/intltest/rbbitst.cpp		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/RBBIDataWrapper.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/text/RBBISetBuilder.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java		patch \| blob \| history
icu4j/main/shared/data/icudata.jar		patch \| blob \| history
icu4j/main/shared/data/icutzdata.jar		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java		patch \| blob \| history