ICU-13194 RBBI auto reverse tables: size reduction, and remove hand written rules.

author Andy Heninger <andy.heninger@gmail.com>

Wed, 28 Mar 2018 01:20:13 +0000 (01:20 +0000)

committer Andy Heninger <andy.heninger@gmail.com>

Wed, 28 Mar 2018 01:20:13 +0000 (01:20 +0000)
author Andy Heninger <andy.heninger@gmail.com>
Wed, 28 Mar 2018 01:20:13 +0000 (01:20 +0000)
committer Andy Heninger <andy.heninger@gmail.com>
Wed, 28 Mar 2018 01:20:13 +0000 (01:20 +0000)
diff --git a/icu4c/source/common/rbbi.cpp b/icu4c/source/common/rbbi.cpp

index b48f028483fe16ff5489a77f87ac56197bc9e651..c5ea2770ba98549b34a6e65312d6b4c5b9449bdf 100644 (file)
--- a/icu4c/source/common/rbbi.cpp
+++ b/icu4c/source/common/rbbi.cpp
@@ -937,169 +937,6 @@ int32_t RuleBasedBreakIterator::handleNext() {
  }
  
  
-
-//-----------------------------------------------------------------------------------
-//
-//  handlePrevious()
-//
-//      Iterate backwards using the safe reverse rules.
-//      The logic of this function is very similar to handleNext(), above.
-//
-//-----------------------------------------------------------------------------------
-int32_t RuleBasedBreakIterator::handlePrevious(int32_t fromPosition) {
-    int32_t             state;
-    uint16_t            category        = 0;
-    RBBIRunMode         mode;
-    RBBIStateTableRow  *row;
-    UChar32             c;
-    LookAheadResults    lookAheadMatches;
-    int32_t             result          = 0;
-    int32_t             initialPosition = 0;
-
-    const RBBIStateTable *stateTable = fData->fSafeRevTable;
-    UTEXT_SETNATIVEINDEX(&fText, fromPosition);
-    #ifdef RBBI_DEBUG
-        if (gTrace) {
-            RBBIDebugPuts("Handle Previous   pos   char  state category");
-        }
-    #endif
-
-    // if we're already at the start of the text, return DONE.
-    if (fData == NULL || UTEXT_GETNATIVEINDEX(&fText)==0) {
-        return BreakIterator::DONE;
-    }
-
-    //  Set up the starting char.
-    initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
-    result          = initialPosition;
-    c               = UTEXT_PREVIOUS32(&fText);
-
-    //  Set the initial state for the state machine
-    state = START_STATE;
-    row = (RBBIStateTableRow *)
-            (stateTable->fTableData + (stateTable->fRowLen * state));
-    category = 3;
-    mode     = RBBI_RUN;
-    if (stateTable->fFlags & RBBI_BOF_REQUIRED) {
-        category = 2;
-        mode     = RBBI_START;
-    }
-
-
-    // loop until we reach the start of the text or transition to state 0
-    //
-    for (;;) {
-        if (c == U_SENTINEL) {
-            // Reached end of input string.
-            if (mode == RBBI_END) {
-                // We have already run the loop one last time with the
-                //   character set to the psueudo {eof} value.  Now it is time
-                //   to unconditionally bail out.
-                break;
-            }
-            // Run the loop one last time with the fake end-of-input character category.
-            mode = RBBI_END;
-            category = 1;
-        }
-
-        //
-        // Get the char category.  An incoming category of 1 or 2 means that
-        //      we are preset for doing the beginning or end of input, and
-        //      that we shouldn't get a category from an actual text input character.
-        //
-        if (mode == RBBI_RUN) {
-            // look up the current character's character category, which tells us
-            // which column in the state table to look at.
-            // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
-            //        not the size of the character going in, which is a UChar32.
-            //
-            //  And off the dictionary flag bit. For reverse iteration it is not used.
-            category = UTRIE2_GET16(fData->fTrie, c);
-            category &= ~0x4000;
-        }
-
-        #ifdef RBBI_DEBUG
-            if (gTrace) {
-                RBBIDebugPrintf("             %4d   ", (int32_t)utext_getNativeIndex(&fText));
-                if (0x20<=c && c<0x7f) {
-                    RBBIDebugPrintf("\"%c\"  ", c);
-                } else {
-                    RBBIDebugPrintf("%5x  ", c);
-                }
-                RBBIDebugPrintf("%3d  %3d\n", state, category);
-            }
-        #endif
-
-        // State Transition - move machine to its next state
-        //
-
-        // fNextState is a variable-length array.
-        U_ASSERT(category<fData->fHeader->fCatCount);
-        state = row->fNextState[category];  /*Not accessing beyond memory*/
-        row = (RBBIStateTableRow *)
-            (stateTable->fTableData + (stateTable->fRowLen * state));
-
-        if (row->fAccepting == -1) {
-            // Match found, common case.
-            result = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
-        }
-
-        int16_t completedRule = row->fAccepting;
-        if (completedRule > 0) {
-            // Lookahead match is completed.
-            int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
-            if (lookaheadResult >= 0) {
-                UTEXT_SETNATIVEINDEX(&fText, lookaheadResult);
-                return lookaheadResult;
-            }
-        }
-        int16_t rule = row->fLookAhead;
-        if (rule != 0) {
-            // At the position of a '/' in a look-ahead match. Record it.
-            int32_t  pos = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
-            lookAheadMatches.setPosition(rule, pos);
-        }
-
-        if (state == STOP_STATE) {
-            // This is the normal exit from the lookup state machine.
-            // We have advanced through the string until it is certain that no
-            //   longer match is possible, no matter what characters follow.
-            break;
-        }
-
-        // Move (backwards) to the next character to process.
-        // If this is a beginning-of-input loop iteration, don't advance
-        //    the input position.  The next iteration will be processing the
-        //    first real input character.
-        if (mode == RBBI_RUN) {
-            c = UTEXT_PREVIOUS32(&fText);
-        } else {
-            if (mode == RBBI_START) {
-                mode = RBBI_RUN;
-            }
-        }
-    }
-
-    // The state machine is done.  Check whether it found a match...
-
-    // If the iterator failed to advance in the match engine, force it ahead by one.
-    //   (This really indicates a defect in the break rules.  They should always match
-    //    at least one character.)
-    if (result == initialPosition) {
-        UTEXT_SETNATIVEINDEX(&fText, initialPosition);
-        UTEXT_PREVIOUS32(&fText);
-        result = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
-    }
-
-    #ifdef RBBI_DEBUG
-        if (gTrace) {
-            RBBIDebugPrintf("result = %d\n\n", result);
-        }
-    #endif
-    return result;
-}
-
-
  //-----------------------------------------------------------------------------------
  //
  //  handleSafePrevious()
diff --git a/icu4c/source/common/rbbidata.cpp b/icu4c/source/common/rbbidata.cpp

index 385ab08b3295b4eafc2473b4195c8f5b0a7e1f00..1d4c9e5895f376fa0b02e681d47181dae2c8cf47 100644 (file)
--- a/icu4c/source/common/rbbidata.cpp
+++ b/icu4c/source/common/rbbidata.cpp
@@ -80,7 +80,7 @@ UBool RBBIDataWrapper::isDataVersionAcceptable(const UVersionInfo version) {
  void RBBIDataWrapper::init0() {
      fHeader = NULL;
      fForwardTable = NULL;
-    fSafeRevTable = NULL;
+    fReverseTable = NULL;
      fRuleSource   = NULL;
      fRuleStatusTable = NULL;
      fTrie         = NULL;
@@ -109,9 +109,6 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
      if (data->fRTableLen != 0) {
          fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable);
      }
-    if (data->fSRTableLen != 0) {   // TODO: obsolete. Remove.
-        fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable);
-    }
  
      fTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
                                        (uint8_t *)data + fHeader->fTrie,
@@ -262,7 +259,7 @@ void  RBBIDataWrapper::printData() {
      RBBIDebugPrintf("   number of character categories = %d\n\n", fHeader->fCatCount);
  
      printTable("Forward State Transition Table", fForwardTable);
-    printTable("Safe Reverse State Transition Table", fSafeRevTable);
+    printTable("Reverse State Transition Table", fReverseTable);
  
      RBBIDebugPrintf("\nOrignal Rules source:\n");
      for (int32_t c=0; fRuleSource[c] != 0; c++) {
@@ -402,28 +399,6 @@ ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outD
                              outBytes+tableStartOffset+topSize, status);
      }
  
-    // Safe Forward state table.  Same layout as forward table, above.
-    tableStartOffset = ds->readUInt32(rbbiDH->fSFTable);
-    tableLength      = ds->readUInt32(rbbiDH->fSFTableLen);
-
-    if (tableLength > 0) {
-        ds->swapArray32(ds, inBytes+tableStartOffset, topSize, 
-                            outBytes+tableStartOffset, status);
-        ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
-                            outBytes+tableStartOffset+topSize, status);
-    }
-
-    // Safe Reverse state table.  Same layout as forward table, above.
-    tableStartOffset = ds->readUInt32(rbbiDH->fSRTable);
-    tableLength      = ds->readUInt32(rbbiDH->fSRTableLen);
-
-    if (tableLength > 0) {
-        ds->swapArray32(ds, inBytes+tableStartOffset, topSize, 
-                            outBytes+tableStartOffset, status);
-        ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
-                            outBytes+tableStartOffset+topSize, status);
-    }
-
      // Trie table for character categories
      utrie2_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen),
                      outBytes+ds->readUInt32(rbbiDH->fTrie), status);
diff --git a/icu4c/source/common/rbbidata.h b/icu4c/source/common/rbbidata.h

index 83b96f261de4d9147d768f138f25f8d43628dfc2..4a66b67ac887ad13bb72426f43271df720f10b69 100644 (file)
--- a/icu4c/source/common/rbbidata.h
+++ b/icu4c/source/common/rbbidata.h
@@ -81,10 +81,6 @@ struct RBBIDataHeader {
      uint32_t         fFTableLen;
      uint32_t         fRTable;         /*  Offset to the reverse state transition table. */
      uint32_t         fRTableLen;
-    uint32_t         fSFTable;        /*  safe point forward transition table */
-    uint32_t         fSFTableLen;
-    uint32_t         fSRTable;        /*  safe point reverse transition table */
-    uint32_t         fSRTableLen;
      uint32_t         fTrie;           /*  Offset to Trie data for character categories */
      uint32_t         fTrieLen;
      uint32_t         fRuleSource;     /*  Offset to the source for for the break */
@@ -173,8 +169,7 @@ public:
      /*                                     */
      const RBBIDataHeader     *fHeader;
      const RBBIStateTable     *fForwardTable;
-    const RBBIStateTable     *fReverseTable;      // auto-generated safe reverse.
-    const RBBIStateTable     *fSafeRevTable;      // hand-written safe reverse. TODO: delete this.
+    const RBBIStateTable     *fReverseTable;
      const UChar              *fRuleSource;
      const int32_t            *fRuleStatusTable; 
  
diff --git a/icu4c/source/common/rbbirb.cpp b/icu4c/source/common/rbbirb.cpp

index 4a3698893851dafb8b7244b7d60d73339d0b2bec..90752ba1468b2f19fb0d395d8dc9d34435990d94 100644 (file)
--- a/icu4c/source/common/rbbirb.cpp
+++ b/icu4c/source/common/rbbirb.cpp
@@ -62,8 +62,7 @@ RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString   &rules,
      fSafeFwdTree        = NULL;
      fSafeRevTree        = NULL;
      fDefaultTree        = &fForwardTree;
-    fForwardTables      = NULL;
-    fSafeRevTables      = NULL;
+    fForwardTable       = NULL;
      fRuleStatusVals     = NULL;
      fChainRules         = FALSE;
      fLBCMNoChain        = FALSE;
@@ -112,9 +111,7 @@ RBBIRuleBuilder::~RBBIRuleBuilder() {
  
      delete fUSetNodes;
      delete fSetBuilder;
-    delete fForwardTables;
-    delete fSafeRevTables;
-
+    delete fForwardTable;
      delete fForwardTree;
      delete fReverseTree;
      delete fSafeFwdTree;
@@ -153,9 +150,8 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
      //     without the padding.
      //
      int32_t headerSize        = align8(sizeof(RBBIDataHeader));
-    int32_t forwardTableSize  = align8(fForwardTables->getTableSize());
-    int32_t reverseTableSize  = align8(fForwardTables->getSafeTableSize());
-    int32_t safeRevTableSize  = align8(fSafeRevTables->getTableSize());  // TODO: remove hand-written rules.
+    int32_t forwardTableSize  = align8(fForwardTable->getTableSize());
+    int32_t reverseTableSize  = align8(fForwardTable->getSafeTableSize());
      int32_t trieSize          = align8(fSetBuilder->getTrieSize());
      int32_t statusTableSize   = align8(fRuleStatusVals->size() * sizeof(int32_t));
      int32_t rulesSize         = align8((fStrippedRules.length()+1) * sizeof(UChar));
@@ -163,7 +159,6 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
      int32_t         totalSize = headerSize
                                  + forwardTableSize
                                  + reverseTableSize
-                                + safeRevTableSize
                                  + statusTableSize + trieSize + rulesSize;
  
      RBBIDataHeader  *data     = (RBBIDataHeader *)uprv_malloc(totalSize);
@@ -188,16 +183,7 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
      data->fRTable        = data->fFTable  + data->fFTableLen;
      data->fRTableLen     = reverseTableSize;
  
-    // Do not save the Safe Forward table.
-    data->fSFTable       = data->fRTable + data->fRTableLen;
-    data->fSFTableLen    = 0;
-
-    // Hand written reverse rules. TODO: remove, once synthesized ones are working.
-    data->fSRTable       = data->fSFTable + data->fSFTableLen;
-    data->fSRTableLen    = safeRevTableSize;
-    U_ASSERT(safeRevTableSize > 0);
- 
-    data->fTrie          = data->fSRTable + data->fSRTableLen;
+    data->fTrie          = data->fRTable + data->fRTableLen;
      data->fTrieLen       = fSetBuilder->getTrieSize();
      data->fStatusTable   = data->fTrie    + trieSize;
      data->fStatusTableLen= statusTableSize;
@@ -206,10 +192,8 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
  
      uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
  
-    fForwardTables->exportTable((uint8_t *)data + data->fFTable);
-    fForwardTables->exportSafeTable((uint8_t *)data + data->fRTable);
-    fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
-
+    fForwardTable->exportTable((uint8_t *)data + data->fFTable);
+    fForwardTable->exportSafeTable((uint8_t *)data + data->fRTable);
      fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
  
      int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable);
@@ -286,25 +270,22 @@ RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
      //
      //   Generate the DFA state transition table.
      //
-    fForwardTables = new RBBITableBuilder(this, &fForwardTree, status);
-    fSafeRevTables = new RBBITableBuilder(this, &fSafeRevTree, status);
-    if (fForwardTables == nullptr || fSafeRevTables == nullptr)
-    {
+    fForwardTable = new RBBITableBuilder(this, &fForwardTree, status);
+    if (fForwardTable == nullptr) {
          status = U_MEMORY_ALLOCATION_ERROR;
          return nullptr;
      }
  
-    fForwardTables->build();
-    fSafeRevTables->build();
+    fForwardTable->buildForwardTable();
      optimizeTables();
-    fForwardTables->buildSafe(status);
+    fForwardTable->buildSafeReverseTable(status);
  
  
  #ifdef RBBI_DEBUG
      if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
-        fForwardTables->printStates();
-        fForwardTables->printRuleStatusTable();
-        fForwardTables->printSafeTable();
+        fForwardTable->printStates();
+        fForwardTable->printRuleStatusTable();
+        fForwardTable->printReverseTable();
      }
  #endif
  
@@ -327,14 +308,11 @@ void RBBIRuleBuilder::optimizeTables() {
  
      leftClass = 3;
      rightClass = 0;
-    while (fForwardTables->findDuplCharClassFrom(leftClass, rightClass)) {
+    while (fForwardTable->findDuplCharClassFrom(leftClass, rightClass)) {
          fSetBuilder->mergeCategories(leftClass, rightClass);
-        fForwardTables->removeColumn(rightClass);
-        fSafeRevTables->removeColumn(rightClass);
+        fForwardTable->removeColumn(rightClass);
      }
-
-    fForwardTables->removeDuplicateStates();
-    fSafeRevTables->removeDuplicateStates();
+    fForwardTable->removeDuplicateStates();
  }
  
  U_NAMESPACE_END
diff --git a/icu4c/source/common/rbbirb.h b/icu4c/source/common/rbbirb.h

index ca71f40c7dc1ea7b76b8dc9dc89a20cc4a7441b4..59ff66f9044fd20bec16faf65adc641c783fd968 100644 (file)
--- a/icu4c/source/common/rbbirb.h
+++ b/icu4c/source/common/rbbirb.h
@@ -168,8 +168,7 @@ public:
      RBBISetBuilder                *fSetBuilder;      // Set and Character Category builder.
      UVector                       *fUSetNodes;       // Vector of all uset nodes.
  
-    RBBITableBuilder              *fForwardTables;   // State transition tables
-    RBBITableBuilder              *fSafeRevTables;
+    RBBITableBuilder              *fForwardTable;    // State transition table, build time form.
  
      UVector                       *fRuleStatusVals;  // The values that can be returned
                                                       //   from getRuleStatus().
diff --git a/icu4c/source/common/rbbitblb.cpp b/icu4c/source/common/rbbitblb.cpp

index c3bcc722293e1d97fe6728dab4d399235ca66db6..2a2e7132bc9352acfa7c65a1e9d43a0afb0b0cd5 100644 (file)
--- a/icu4c/source/common/rbbitblb.cpp
+++ b/icu4c/source/common/rbbitblb.cpp
@@ -61,7 +61,7 @@ RBBITableBuilder::~RBBITableBuilder() {
  //                               table from the RBBI rules parse tree.
  //
  //-----------------------------------------------------------------------------
-void  RBBITableBuilder::build() {
+void  RBBITableBuilder::buildForwardTable() {
  
      if (U_FAILURE(*fStatus)) {
          return;
@@ -1150,6 +1150,35 @@ bool RBBITableBuilder::findDuplicateState(int32_t &firstState, int32_t &duplStat
      return false;
  }
  
+
+bool RBBITableBuilder::findDuplicateSafeState(int32_t *firstState, int32_t *duplState) {   // Search fSafeTable for a pair of interchangeable rows.
+    int32_t numStates = fSafeTable->size();
+    // Resume scanning at the caller-supplied *firstState; the final row has no later row to pair with.
+    for (; *firstState<numStates-1; ++(*firstState)) {
+        UnicodeString *firstRow = static_cast<UnicodeString *>(fSafeTable->elementAt(*firstState));
+        for (*duplState=*firstState+1; *duplState<numStates; ++(*duplState)) {   // candidates are the rows after *firstState
+            UnicodeString *duplRow = static_cast<UnicodeString *>(fSafeTable->elementAt(*duplState));
+            bool rowsMatch = true;
+            int32_t numCols = firstRow->length();   // NOTE(review): assumes duplRow is the same length - confirm
+            for (int32_t col=0; col < numCols; ++col) {
+                int32_t firstVal = firstRow->charAt(col);
+                int32_t duplVal = duplRow->charAt(col);
+                if (!((firstVal == duplVal) ||   // entries agree when they are equal, or ...
+                        ((firstVal == *firstState || firstVal == *duplState) &&   // ... when each entry refers to
+                        (duplVal  == *firstState || duplVal  == *duplState)))) {   //     one of the two states being compared.
+                    rowsMatch = false;
+                    break;
+                }
+            }
+            if (rowsMatch) {
+                return true;   // *firstState and *duplState now identify the matching pair
+            }
+        }
+    }
+    return false;   // scanned to the end of the table without finding a pair
+}
+
+
  void RBBITableBuilder::removeState(int32_t keepState, int32_t duplState) {
      U_ASSERT(keepState < duplState);
      U_ASSERT(duplState < fDStates->size());
@@ -1185,6 +1214,29 @@ void RBBITableBuilder::removeState(int32_t keepState, int32_t duplState) {
      }
  }
  
+void RBBITableBuilder::removeSafeState(int32_t keepState, int32_t duplState) {   // Delete row duplState; renumber references in the rest.
+    U_ASSERT(keepState < duplState);
+    U_ASSERT(duplState < fSafeTable->size());
+    // Drop the duplicate row first, then rewrite the state numbers stored in every surviving row.
+    fSafeTable->removeElementAt(duplState);   // Note that fSafeTable has a deleter function
+                                              // and will auto-delete the removed element.
+    int32_t numStates = fSafeTable->size();   // row count after the removal
+    int32_t numCols = fRB->fSetBuilder->getNumCharCategories();   // NOTE(review): presumably equals each row's length - confirm
+    for (int32_t state=0; state<numStates; ++state) {
+        UnicodeString *sd = (UnicodeString *)fSafeTable->elementAt(state);
+        for (int32_t col=0; col<numCols; col++) {
+            int32_t existingVal = sd->charAt(col);
+            int32_t newVal = existingVal;
+            if (existingVal == duplState) {
+                newVal = keepState;   // references to the removed state are redirected to the kept one
+            } else if (existingVal > duplState) {
+                newVal = existingVal - 1;   // rows after the removed one have moved down by one index
+            }
+            sd->setCharAt(col, newVal);   // written back even when the value is unchanged
+        }
+    }
+}
+
  
  /*
   * RemoveDuplicateStates
@@ -1198,6 +1250,7 @@ void RBBITableBuilder::removeDuplicateStates() {
      }
  }
  
+
  //-----------------------------------------------------------------------------
  //
  //   getTableSize()    Calculate the size of the runtime form of this
@@ -1277,7 +1330,7 @@ void RBBITableBuilder::exportTable(void *where) {
  /**
   *   Synthesize a safe state table from the main state table.
   */
-void RBBITableBuilder::buildSafe(UErrorCode &status) {
+void RBBITableBuilder::buildSafeReverseTable(UErrorCode &status) {
      // Find safe char class pairs.
  
      // make a state table row for each trailing class, and map from class to row.
@@ -1358,8 +1411,13 @@ void RBBITableBuilder::buildSafe(UErrorCode &status) {
          rowState.setCharAt(c1, 0);
      }
  
-    // TODO: Merge similar states.
-
+    // Remove duplicate or redundant rows from the table.
+    int32_t firstState = 1;
+    int32_t duplicateState = 0;    // initial value is not used; set by findDuplicateSafeState().
+    while (findDuplicateSafeState(&firstState, &duplicateState)) {
+        // printf("Removing duplicate safe states (%d, %d)\n", firstState, duplicateState);
+        removeSafeState(firstState, duplicateState);
+    }
  }
  
  
@@ -1493,7 +1551,7 @@ void RBBITableBuilder::printStates() {
  //
  //-----------------------------------------------------------------------------
  #ifdef RBBI_DEBUG
-void RBBITableBuilder::printSafeTable() {
+void RBBITableBuilder::printReverseTable() {
      int     c;    // input "character"
      int     n;    // state number
  
diff --git a/icu4c/source/common/rbbitblb.h b/icu4c/source/common/rbbitblb.h

index 6b5bb86196d5e7b1ea532624a36c33087614d6d7..9ab1942fb3037d1ac7c904b8372179d5696d4722 100644 (file)
--- a/icu4c/source/common/rbbitblb.h
+++ b/icu4c/source/common/rbbitblb.h
@@ -40,7 +40,7 @@ public:
      RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status);
      ~RBBITableBuilder();
  
-    void     build();
+    void     buildForwardTable();
  
      /** Return the runtime size in bytes of the built state table.  */
      int32_t  getTableSize() const;
@@ -63,7 +63,8 @@ public:
      /** Check for, and remove dupicate states (table rows). */
      void     removeDuplicateStates();
  
-    void     buildSafe(UErrorCode &status);
+    /** Build the safe reverse table from the already-constructed forward table. */
+    void     buildSafeReverseTable(UErrorCode &status);
  
      /** Return the runtime size in bytes of the built safe reverse state table. */
      int32_t  getSafeTableSize() const;
@@ -109,6 +110,21 @@ private:
       */
      void removeState(int32_t keepState, int32_t duplState);
  
+    /** Find the next duplicate state in the safe reverse table. An iterator function.
+     * @param firstState ptr to state variable. Begin looking at this state, set to the first of the
+     *                   pair of duplicates on return.
+     * @param duplicateState ptr to where to return the duplicate state of firstState. Output only.
+     * @return true if a duplicate pair of states was found.
+     */
+    bool findDuplicateSafeState(int32_t *firstState, int32_t *duplicateState);
+
+    /** Remove a duplicate state from the safe table.
+     * @param keepState First of the duplicate pair. Keep it.
+     * @param duplState Duplicate state. Remove it. Redirect all table references to the duplicate state
+     *                  to refer to keepState instead.
+     */
+    void removeSafeState(int32_t keepState, int32_t duplState);
+
      // Set functions for UVector.
      //   TODO:  make a USet subclass of UVector
  
@@ -123,7 +139,7 @@ public:
      void     printPosSets(RBBINode *n /* = NULL*/);
      void     printStates();
      void     printRuleStatusTable();
-    void     printSafeTable();
+    void     printReverseTable();
  #else
      #define  printSet(s)
      #define  printPosSets(n)
diff --git a/icu4c/source/data/brkitr/rules/char.txt b/icu4c/source/data/brkitr/rules/char.txt

index 1bfdef637a959dea8649e86be146bc514e383fd2..a3d0c9f934ca4d62707af75d4d3834858b85bb2e 100644 (file)
--- a/icu4c/source/data/brkitr/rules/char.txt
+++ b/icu4c/source/data/brkitr/rules/char.txt
@@ -49,7 +49,6 @@ $EmojiNRK    = [[\p{Emoji}] - [\p{Grapheme_Cluster_Break = Regional_Indicator}*\
  ## -------------------------------------------------
  !!chain;
  !!lookAheadHardBreak;
-!!forward;
  
  $CR $LF;
  
@@ -80,9 +79,3 @@ $Prepend [^$Control $CR $LF];
  
  # GB 999 Match a single code point if no other rule applies.
  .;
-
-## -------------------------------------------------
-
-!!safe_reverse;
-$Regional_Indicator $Regional_Indicator;
-($Extend | $ZWJ | $EmojiNRK | $Extended_Pict)+ .;
diff --git a/icu4c/source/data/brkitr/rules/line.txt b/icu4c/source/data/brkitr/rules/line.txt

index 756b19d74f69e6f85554c56363683b55ce06fb08..7c3baea755ddd25ffae0c6c8ac5f756d472bd23f 100644 (file)
--- a/icu4c/source/data/brkitr/rules/line.txt
+++ b/icu4c/source/data/brkitr/rules/line.txt
@@ -99,8 +99,6 @@ $ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]];
  
  ## -------------------------------------------------
  
-!!forward;
-
  #
  # CAN_CM  is the set of characters that may combine with CM combining chars.
  #         Note that Linebreak UAX 14's concept of a combining char and the rules
@@ -338,36 +336,3 @@ $EB $CM* $EM;
  # LB 31 Break everywhere else.
  #       Match a single code point if no other rule applies.
  .;
-
-## -------------------------------------------------
-
-!!safe_reverse;
-
-# LB 9
-^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
-
-# LB 14
-$SP+ $CM* $OP;
-
-# LB 15
-$SP+ $CM* $QU;
-
-# LB 16
-$SP+ $CM* ($CL | $CP);
-
-# LB 17
-$SP+ $CM* $B2;
-
-# LB 21
-$CM* ($HY | $BA) $CM* $HL;
-
-# LB 25
-($CM* ($IS | $SY))+ $CM* $NU;
-($CL | $CP) $CM* ($NU | $IS | $SY);
-
-#  LB 30
-($CM* $RI)+;
-
-# For dictionary-based break
-$dictionary $dictionary;
-
diff --git a/icu4c/source/data/brkitr/rules/line_fi.txt b/icu4c/source/data/brkitr/rules/line_fi.txt

index 5d0f666b830305918273720d38332e8edb5a6bf9..9bb72de04b4403037a61b1e4b887d145bf5df771 100644 (file)
--- a/icu4c/source/data/brkitr/rules/line_fi.txt
+++ b/icu4c/source/data/brkitr/rules/line_fi.txt
@@ -105,8 +105,6 @@ $ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]];
  
  ## -------------------------------------------------
  
-!!forward;
-
  #
  # CAN_CM  is the set of characters that may combine with CM combining chars.
  #         Note that Linebreak UAX 14's concept of a combining char and the rules
@@ -347,35 +345,3 @@ $EB $CM* $EM;
  # LB 31 Break everywhere else.
  #       Match a single code point if no other rule applies.
  .;
-
-## -------------------------------------------------
-
-!!safe_reverse;
-
-# LB 9
-^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
-
-# LB 14
-$SP+ $CM* $OP;
-
-# LB 15
-$SP+ $CM* $QU;
-
-# LB 16
-$SP+ $CM* ($CL | $CP);
-
-# LB 17
-$SP+ $CM* $B2;
-
-# LB 21
-$CM* ($HY | $BA | $HH) $CM* $HL;
-
-# LB 25
-($CM* ($IS | $SY))+ $CM* $NU;
-($CL | $CP) $CM* ($NU | $IS | $SY);
-
-#  LB 30
-($CM* $RI)+;
-
-# For dictionary-based break
-$dictionary $dictionary;
diff --git a/icu4c/source/data/brkitr/rules/line_loose.txt b/icu4c/source/data/brkitr/rules/line_loose.txt

index ba18f1cd2c48a577b3b656c3a5054a8cb9319df3..409ae07ba83826858fa7e8756ea7b7e1cd817241 100644 (file)
--- a/icu4c/source/data/brkitr/rules/line_loose.txt
+++ b/icu4c/source/data/brkitr/rules/line_loose.txt
@@ -108,8 +108,6 @@ $ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]];
  
  ## -------------------------------------------------
  
-!!forward;
-
  #
  # CAN_CM  is the set of characters that may combine with CM combining chars.
  #         Note that Linebreak UAX 14's concept of a combining char and the rules
@@ -350,35 +348,3 @@ $EB $CM* $EM;
  # LB 31 Break everywhere else.
  #       Match a single code point if no other rule applies.
  .;
-
-## -------------------------------------------------
-
-!!safe_reverse;
-
-# LB 9
-^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
-
-# LB 14
-$SP+ $CM* $OP;
-
-# LB 15
-$SP+ $CM* $QU;
-
-# LB 16
-$SP+ $CM* ($CL | $CP);
-
-# LB 17
-$SP+ $CM* $B2;
-
-# LB 21
-$CM* ($HY | $BA) $CM* $HL;
-
-# LB 25
-($CM* ($IS | $SY))+ $CM* $NU;
-($CL | $CP) $CM* ($NU | $IS | $SY);
-
-#  LB 30
-($CM* $RI)+;
-
-# For dictionary-based break
-$dictionary $dictionary;
diff --git a/icu4c/source/data/brkitr/rules/line_loose_cj.txt b/icu4c/source/data/brkitr/rules/line_loose_cj.txt

index 48791c4d5eb0f4e375e09ad1d9b83da195a9d75d..5de9318e73dda9f42726a5c0e52f0aa296e72ac0 100644 (file)
--- a/icu4c/source/data/brkitr/rules/line_loose_cj.txt
+++ b/icu4c/source/data/brkitr/rules/line_loose_cj.txt
@@ -118,8 +118,6 @@ $ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]];
  
  ## -------------------------------------------------
  
-!!forward;
-
  #
  # CAN_CM  is the set of characters that may combine with CM combining chars.
  #         Note that Linebreak UAX 14's concept of a combining char and the rules
@@ -364,35 +362,3 @@ $EB $CM* $EM;
  # LB 31 Break everywhere else.
  #       Match a single code point if no other rule applies.
  .;
-
-## -------------------------------------------------
-
-!!safe_reverse;
-
-# LB 9
-^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
-
-# LB 14
-$SP+ $CM* $OP;
-
-# LB 15
-$SP+ $CM* $QU;
-
-# LB 16
-$SP+ $CM* ($CL | $CP);
-
-# LB 17
-$SP+ $CM* $B2;
-
-# LB 21
-$CM* ($HY | $BA | $BAX) $CM* $HL;
-
-# LB 25
-($CM* ($IS | $SY))+ $CM* $NU;
-($CL | $CP) $CM* ($NU | $IS | $SY);
-
-#  LB 30
-($CM* $RI)+;
-
-# For dictionary-based break
-$dictionary $dictionary;
diff --git a/icu4c/source/data/brkitr/rules/line_loose_fi.txt b/icu4c/source/data/brkitr/rules/line_loose_fi.txt

index cc09db7969ddb902c5ec9403cd3da8540c6a6ef2..7efa3c9377b8d1c711993494101781819fe6c000 100644 (file)
--- a/icu4c/source/data/brkitr/rules/line_loose_fi.txt
+++ b/icu4c/source/data/brkitr/rules/line_loose_fi.txt
@@ -104,8 +104,6 @@ $ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]];
  
  ## -------------------------------------------------
  
-!!forward;
-
  #
  # CAN_CM  is the set of characters that may combine with CM combining chars.
  #         Note that Linebreak UAX 14's concept of a combining char and the rules
@@ -349,35 +347,3 @@ $EB $CM* $EM;
  # LB 31 Break everywhere else.
  #       Match a single code point if no other rule applies.
  .;
-
-## -------------------------------------------------
-
-!!safe_reverse;
-
-# LB 9
-^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
-
-# LB 14
-$SP+ $CM* $OP;
-
-# LB 15
-$SP+ $CM* $QU;
-
-# LB 16
-$SP+ $CM* ($CL | $CP);
-
-# LB 17
-$SP+ $CM* $B2;
-
-# LB 21
-$CM* ($HY | $BA | $HH) $CM* $HL;
-
-# LB 25
-($CM* ($IS | $SY))+ $CM* $NU;
-($CL | $CP) $CM* ($NU | $IS | $SY);
-
-#  LB 30
-($CM* $RI)+;
-
-# For dictionary-based break
-$dictionary $dictionary;
diff --git a/icu4c/source/data/brkitr/rules/line_normal.txt b/icu4c/source/data/brkitr/rules/line_normal.txt

index bc417cf567fd66fcdb68cb851ebbbce0778bb935..5e63223a92946678dedbabe2e5019f48653765e1 100644 (file)
--- a/icu4c/source/data/brkitr/rules/line_normal.txt
+++ b/icu4c/source/data/brkitr/rules/line_normal.txt
@@ -103,8 +103,6 @@ $ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]];
  
  ## -------------------------------------------------
  
-!!forward;
-
  #
  # CAN_CM  is the set of characters that may combine with CM combining chars.
  #         Note that Linebreak UAX 14's concept of a combining char and the rules
@@ -342,35 +340,3 @@ $EB $CM* $EM;
  # LB 31 Break everywhere else.
  #       Match a single code point if no other rule applies.
  .;
-
-## -------------------------------------------------
-
-!!safe_reverse;
-
-# LB 9
-^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
-
-# LB 14
-$SP+ $CM* $OP;
-
-# LB 15
-$SP+ $CM* $QU;
-
-# LB 16
-$SP+ $CM* ($CL | $CP);
-
-# LB 17
-$SP+ $CM* $B2;
-
-# LB 21
-$CM* ($HY | $BA) $CM* $HL;
-
-# LB 25
-($CM* ($IS | $SY))+ $CM* $NU;
-($CL | $CP) $CM* ($NU | $IS | $SY);
-
-#  LB 30
-($CM* $RI)+;
-
-# For dictionary-based break
-$dictionary $dictionary;
diff --git a/icu4c/source/data/brkitr/rules/line_normal_cj.txt b/icu4c/source/data/brkitr/rules/line_normal_cj.txt

index 5dbcd85ad40ac8e033e892f06273910e9a78a9f6..bbce68f9cef0c252363c640180dd628acfa12bb8 100644 (file)
--- a/icu4c/source/data/brkitr/rules/line_normal_cj.txt
+++ b/icu4c/source/data/brkitr/rules/line_normal_cj.txt
@@ -106,8 +106,6 @@ $ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]];
  
  ## -------------------------------------------------
  
-!!forward;
-
  #
  # CAN_CM  is the set of characters that may combine with CM combining chars.
  #         Note that Linebreak UAX 14's concept of a combining char and the rules
@@ -348,35 +346,3 @@ $EB $CM* $EM;
  # LB 31 Break everywhere else.
  #       Match a single code point if no other rule applies.
  .;
-
-## -------------------------------------------------
-
-!!safe_reverse;
-
-# LB 9
-^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
-
-# LB 14
-$SP+ $CM* $OP;
-
-# LB 15
-$SP+ $CM* $QU;
-
-# LB 16
-$SP+ $CM* ($CL | $CP);
-
-# LB 17
-$SP+ $CM* $B2;
-
-# LB 21
-$CM* ($HY | $BA | $BAX) $CM* $HL;
-
-# LB 25
-($CM* ($IS | $SY))+ $CM* $NU;
-($CL | $CP) $CM* ($NU | $IS | $SY);
-
-#  LB 30
-($CM* $RI)+;
-
-# For dictionary-based break
-$dictionary $dictionary;
diff --git a/icu4c/source/data/brkitr/rules/line_normal_fi.txt b/icu4c/source/data/brkitr/rules/line_normal_fi.txt

index 5cd5553605f6485697104ddc5c5f2dcd1dbf9b1f..07df9cdcd6e1e5bd19cc21abb7a671ea08a6fb9d 100644 (file)
--- a/icu4c/source/data/brkitr/rules/line_normal_fi.txt
+++ b/icu4c/source/data/brkitr/rules/line_normal_fi.txt
@@ -103,8 +103,6 @@ $ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]];
  
  ## -------------------------------------------------
  
-!!forward;
-
  #
  # CAN_CM  is the set of characters that may combine with CM combining chars.
  #         Note that Linebreak UAX 14's concept of a combining char and the rules
@@ -345,36 +343,3 @@ $EB $CM* $EM;
  # LB 31 Break everywhere else.
  #       Match a single code point if no other rule applies.
  .;
-
-## -------------------------------------------------
-
-!!safe_reverse;
-
-# LB 9
-^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
-^$CM+ $SP / .;
-
-# LB 14
-$SP+ $CM* $OP;
-
-# LB 15
-$SP+ $CM* $QU;
-
-# LB 16
-$SP+ $CM* ($CL | $CP);
-
-# LB 17
-$SP+ $CM* $B2;
-
-# LB 21
-$CM* ($HY | $BA | $HH) $CM* $HL;
-
-# LB 25
-($CM* ($IS | $SY))+ $CM* $NU;
-($CL | $CP) $CM* ($NU | $IS | $SY);
-
-#  LB 30
-($CM* $RI)+;
-
-# For dictionary-based break
-$dictionary $dictionary;
diff --git a/icu4c/source/data/brkitr/rules/sent.txt b/icu4c/source/data/brkitr/rules/sent.txt

index 95e6f030ff62010ef934f6df008e71b7b5295483..41fd3fcff906254b88f6c374c52ddcf3e70d9577 100644 (file)
--- a/icu4c/source/data/brkitr/rules/sent.txt
+++ b/icu4c/source/data/brkitr/rules/sent.txt
@@ -50,7 +50,6 @@ $CloseEx    = $Close   ($Extend | $Format)*;
  ## -------------------------------------------------
  
  !!chain;
-!!forward;
  
  # Rule 3 - break after separators.  Keep CR/LF together.
  #
@@ -82,32 +81,3 @@ $ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;
  #Rule 12
  [[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .;
  [[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep $LF $CR {eof}] | $CR $LF){100};
-
-## -------------------------------------------------
-
-!!safe_reverse;
-
-$SpEx_R       = ($Extend | $Format)* $Sp;
-$ATermEx_R    = ($Extend | $Format)* $ATerm;
-$STermEx_R    = ($Extend | $Format)* $STerm;
-$CloseEx_R    = ($Extend | $Format)* $Close;
-
-[{bof}] (.? | $LF $CR) [^$Sep $CR $LF]* [$Sep $CR $LF {eof}] ($SpEx_R* $CloseEx_R* ($STermEx_R | $ATermEx_R))*;
-#.*;
-
-# Explanation for this rule:
-#
-#    It needs to back over
-#        The $Sep at which we probably begin
-#        All of the non $Sep chars leading to the preceding $Sep
-#        The preceding $Sep, which will be the second one that the rule matches.
-#        Any immediately preceding STerm or ATerm sequences.  We need to see these
-#              to get the correct rule status when moving forwards again.
-#
-# [{bof}]           inhibit rule chaining.  Without this, rule would loop on itself and match
-#                   the entire string. TODO: can bof be replaced with ^
-#
-# (.? | $LF $CR)    Match one $Sep instance.  Use .? rather than $Sep because position might be
-#                   at the beginning of the string at this point, and we don't want to fail.
-#                   Can only use {eof} once, and it is used later.
-#
diff --git a/icu4c/source/data/brkitr/rules/sent_el.txt b/icu4c/source/data/brkitr/rules/sent_el.txt

index fec60ed76c3e38e60d85cb2a1b74d7b27105f7b4..079698251241c5215e2944441c3810dc69ca13e9 100644 (file)
--- a/icu4c/source/data/brkitr/rules/sent_el.txt
+++ b/icu4c/source/data/brkitr/rules/sent_el.txt
@@ -51,7 +51,6 @@ $CloseEx    = $Close   ($Extend | $Format)*;
  ## -------------------------------------------------
  
  !!chain;
-!!forward;
  
  # Rule 3 - break after separators.  Keep CR/LF together.
  #
@@ -83,40 +82,3 @@ $ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;
  #Rule 12
  [[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .;
  [[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep $LF $CR {eof}] | $CR $LF){100};
-
-## -------------------------------------------------
-
-!!safe_reverse;
-
-$SpEx_R       = ($Extend | $Format)* $Sp;
-$ATermEx_R    = ($Extend | $Format)* $ATerm;
-$STermEx_R    = ($Extend | $Format)* $STerm;
-$CloseEx_R    = ($Extend | $Format)* $Close;
-
-#
-#  Reverse rules.
-#     For now, use the old style inexact reverse rules, which are easier
-#     to write, but less efficient.
-#     TODO:  exact reverse rules.  It appears that exact reverse rules
-#            may require improving support for look-ahead breaks in the
-#            builder.  Needs more investigation.
-#
-
-[{bof}] (.? | $LF $CR) [^$Sep $CR $LF]* [$Sep $CR $LF {eof}] ($SpEx_R* $CloseEx_R* ($STermEx_R | $ATermEx_R))*;
-
-# Explanation for this rule:
-#
-#    It needs to back over
-#        The $Sep at which we probably begin
-#        All of the non $Sep chars leading to the preceding $Sep
-#        The preceding $Sep, which will be the second one that the rule matches.
-#        Any immediately preceding STerm or ATerm sequences.  We need to see these
-#              to get the correct rule status when moving forwards again.
-#
-# [{bof}]           inhibit rule chaining.  Without this, rule would loop on itself and match
-#                   the entire string.
-#
-# (.? | $LF $CR)    Match one $Sep instance.  Use .? rather than $Sep because position might be
-#                   at the beginning of the string at this point, and we don't want to fail.
-#                   Can only use {eof} once, and it is used later.
-#
diff --git a/icu4c/source/data/brkitr/rules/title.txt b/icu4c/source/data/brkitr/rules/title.txt

index 0634a9ee15af1b195ad36a07cc86a710176c3cb1..3be2d3097c3855d494f3827c27e7be6ab7e426c2 100644 (file)
--- a/icu4c/source/data/brkitr/rules/title.txt
+++ b/icu4c/source/data/brkitr/rules/title.txt
@@ -13,8 +13,6 @@ $CaseIgnorable   = [[:Mn:][:Me:][:Cf:][:Lm:][:Sk:] \u0027 \u00AD \u2019];
  $Cased           = [[:Upper_Case:][:Lower_Case:][:Lt:]  - $CaseIgnorable];
  $NotCased        = [[^ $Cased] - $CaseIgnorable];
  
-!!forward;
-
  #  If the iterator begins on a CaseIgnorable, advance it past it/them.
  #  This can occur at the start-of-text, or after application of the
  #  safe-reverse rule.
@@ -26,12 +24,3 @@ $NotCased        = [[^ $Cased] - $CaseIgnorable];
  #         the uncased characters following the word.
  
  $Cased ($Cased | $CaseIgnorable)* ($NotCased | $CaseIgnorable)*;
-
-
-!!safe_reverse;
-
-# Safe Reverse: the exact forward rule must not start in the middle
-#  of a word, so the safe reverse skips over any Cased characters,
-#  leaving it just before the start of a word.
-
-($Cased | $CaseIgnorable)*;
diff --git a/icu4c/source/data/brkitr/rules/word.txt b/icu4c/source/data/brkitr/rules/word.txt

index 617205debc26ef868974112cbaa5c027f3e9773a..bb7044326f4bd3b4d60b37308a24208ef956f462 100644 (file)
--- a/icu4c/source/data/brkitr/rules/word.txt
+++ b/icu4c/source/data/brkitr/rules/word.txt
@@ -97,9 +97,6 @@ $IdeographicEx  = $Ideographic  ($Extend |  $Format | $ZWJ)*;
  
  ## -------------------------------------------------
  
-!!forward;
-
-
  # Rule 3 - CR x LF
  #
  $CR $LF;
@@ -197,27 +194,3 @@ $KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji foun
  # Rule 999
  #     Match a single code point if no other rule applies.
  .;
-
-
-## -------------------------------------------------
-
-!!safe_reverse;
-
-# rule 3
-($Extend | $Format | $ZWJ)+ .?;
-
-# rule 6
-($MidLetter | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* ($Hebrew_Letter | $ALetterPlus);
-
-# rule 7b
-$Double_Quote ($Format | $Extend | $ZWJ)* $Hebrew_Letter;
-
-
-# rule 11
-($MidNum | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* $Numeric;
-
-# rule 13c
-$Regional_Indicator ($Format | $Extend | $ZWJ)* $Regional_Indicator;
-
-# For dictionary-based break
-$dictionary $dictionary;
diff --git a/icu4c/source/data/brkitr/rules/word_POSIX.txt b/icu4c/source/data/brkitr/rules/word_POSIX.txt

index 5ea6a05ce4b969a7df581a099c2d412291eee9d5..c30f6d769a16c244312072459a64384c4a83a8b3 100644 (file)
--- a/icu4c/source/data/brkitr/rules/word_POSIX.txt
+++ b/icu4c/source/data/brkitr/rules/word_POSIX.txt
@@ -97,9 +97,6 @@ $IdeographicEx  = $Ideographic  ($Extend |  $Format | $ZWJ)*;
  
  ## -------------------------------------------------
  
-!!forward;
-
-
  # Rule 3 - CR x LF
  #
  $CR $LF;
@@ -197,27 +194,3 @@ $KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji foun
  # Rule 999
  #     Match a single code point if no other rule applies.
  .;
-
-
-## -------------------------------------------------
-
-!!safe_reverse;
-
-# rule 3
-($Extend | $Format | $ZWJ)+ .?;
-
-# rule 6
-($MidLetter | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* ($Hebrew_Letter | $ALetterPlus);
-
-# rule 7b
-$Double_Quote ($Format | $Extend | $ZWJ)* $Hebrew_Letter;
-
-
-# rule 11
-($MidNum | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* $Numeric;
-
-# rule 13c
-$Regional_Indicator ($Format | $Extend | $ZWJ)* $Regional_Indicator;
-
-# For dictionary-based break
-$dictionary $dictionary;
author	Andy Heninger <andy.heninger@gmail.com>
	Wed, 28 Mar 2018 01:20:13 +0000 (01:20 +0000)
committer	Andy Heninger <andy.heninger@gmail.com>
	Wed, 28 Mar 2018 01:20:13 +0000 (01:20 +0000)
icu4c/source/common/rbbi.cpp		patch \| blob \| history
icu4c/source/common/rbbidata.cpp		patch \| blob \| history
icu4c/source/common/rbbidata.h		patch \| blob \| history
icu4c/source/common/rbbirb.cpp		patch \| blob \| history
icu4c/source/common/rbbirb.h		patch \| blob \| history
icu4c/source/common/rbbitblb.cpp		patch \| blob \| history
icu4c/source/common/rbbitblb.h		patch \| blob \| history
icu4c/source/data/brkitr/rules/char.txt		patch \| blob \| history
icu4c/source/data/brkitr/rules/line.txt		patch \| blob \| history
icu4c/source/data/brkitr/rules/line_fi.txt		patch \| blob \| history
icu4c/source/data/brkitr/rules/line_loose.txt		patch \| blob \| history
icu4c/source/data/brkitr/rules/line_loose_cj.txt		patch \| blob \| history
icu4c/source/data/brkitr/rules/line_loose_fi.txt		patch \| blob \| history
icu4c/source/data/brkitr/rules/line_normal.txt		patch \| blob \| history
icu4c/source/data/brkitr/rules/line_normal_cj.txt		patch \| blob \| history
icu4c/source/data/brkitr/rules/line_normal_fi.txt		patch \| blob \| history
icu4c/source/data/brkitr/rules/sent.txt		patch \| blob \| history
icu4c/source/data/brkitr/rules/sent_el.txt		patch \| blob \| history
icu4c/source/data/brkitr/rules/title.txt		patch \| blob \| history
icu4c/source/data/brkitr/rules/word.txt		patch \| blob \| history
icu4c/source/data/brkitr/rules/word_POSIX.txt		patch \| blob \| history