ICU-13194 RBBI safe tables, Java port, work in progress.

author Andy Heninger <andy.heninger@gmail.com>

Thu, 29 Mar 2018 16:09:26 +0000 (16:09 +0000)

committer Andy Heninger <andy.heninger@gmail.com>

Thu, 29 Mar 2018 16:09:26 +0000 (16:09 +0000)
author Andy Heninger <andy.heninger@gmail.com>
Thu, 29 Mar 2018 16:09:26 +0000 (16:09 +0000)
committer Andy Heninger <andy.heninger@gmail.com>
Thu, 29 Mar 2018 16:09:26 +0000 (16:09 +0000)
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIDataWrapper.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIDataWrapper.java

index d881946895ad55dc70107c6945b9ad358d27bae1..200f814fd49e3f8eeac58bfa4f10e00f5a267a85 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIDataWrapper.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIDataWrapper.java
@@ -177,25 +177,13 @@ public final class RBBIDataWrapper {
       */
      @Deprecated
      public RBBIStateTable   fRTable;
-    /**
-     * @internal
-     * @deprecated This API is ICU internal only.
-     */
-    @Deprecated
-    public RBBIStateTable   fSFTable;
-    /**
-     * @internal
-     * @deprecated This API is ICU internal only.
-     */
-    @Deprecated
-    public RBBIStateTable   fSRTable;
  
      Trie2          fTrie;
      String         fRuleSource;
      int            fStatusTable[];
  
      static final int DATA_FORMAT = 0x42726b20;     // "Brk "
-    static final int FORMAT_VERSION = 0x04000000;  // 4.0.0.0
+    static final int FORMAT_VERSION = 0x05000000;  // 5.0.0.0
  
      private static final class IsAcceptable implements Authenticate {
          @Override
@@ -210,7 +198,7 @@ public final class RBBIDataWrapper {
      // Indexes to fields in the ICU4C style binary form of the RBBI Data Header
      //   Used by the rule compiler when flattening the data.
      //
-    final static int    DH_SIZE           = 24;
+    final static int    DH_SIZE           = 20;
      final static int    DH_MAGIC          = 0;
      final static int    DH_FORMATVERSION  = 1;
      final static int    DH_LENGTH         = 2;
@@ -219,16 +207,12 @@ public final class RBBIDataWrapper {
      final static int    DH_FTABLELEN      = 5;
      final static int    DH_RTABLE         = 6;
      final static int    DH_RTABLELEN      = 7;
-    final static int    DH_SFTABLE        = 8;
-    final static int    DH_SFTABLELEN     = 9;
-    final static int    DH_SRTABLE        = 10;
-    final static int    DH_SRTABLELEN     = 11;
-    final static int    DH_TRIE           = 12;
-    final static int    DH_TRIELEN        = 13;
-    final static int    DH_RULESOURCE     = 14;
-    final static int    DH_RULESOURCELEN  = 15;
-    final static int    DH_STATUSTABLE    = 16;
-    final static int    DH_STATUSTABLELEN = 17;
+    final static int    DH_TRIE           = 8;
+    final static int    DH_TRIELEN        = 9;
+    final static int    DH_RULESOURCE     = 10;
+    final static int    DH_RULESOURCELEN  = 11;
+    final static int    DH_STATUSTABLE    = 12;
+    final static int    DH_STATUSTABLELEN = 13;
  
  
      // Index offsets to the fields in a state table row.
@@ -299,10 +283,6 @@ public final class RBBIDataWrapper {
          int         fFTableLen;
          int         fRTable;         //  Offset to the reverse state transition table.
          int         fRTableLen;
-        int         fSFTable;        //  safe point forward transition table
-        int         fSFTableLen;
-        int         fSRTable;        //  safe point reverse transition table
-        int         fSRTableLen;
          int         fTrie;           //  Offset to Trie data for character categories
          int         fTrieLen;
          int         fRuleSource;     //  Offset to the source for for the break
@@ -358,10 +338,6 @@ public final class RBBIDataWrapper {
          This.fHeader.fFTableLen      = bytes.getInt();
          This.fHeader.fRTable         = bytes.getInt();
          This.fHeader.fRTableLen      = bytes.getInt();
-        This.fHeader.fSFTable        = bytes.getInt();
-        This.fHeader.fSFTableLen     = bytes.getInt();
-        This.fHeader.fSRTable        = bytes.getInt();
-        This.fHeader.fSRTableLen     = bytes.getInt();
          This.fHeader.fTrie           = bytes.getInt();
          This.fHeader.fTrieLen        = bytes.getInt();
          This.fHeader.fRuleSource     = bytes.getInt();
@@ -406,41 +382,6 @@ public final class RBBIDataWrapper {
          This.fRTable = RBBIStateTable.get(bytes, This.fHeader.fRTableLen);
          pos += This.fHeader.fRTableLen;
  
-        //
-        // Read in the Safe Forward state table
-        //
-        if (This.fHeader.fSFTableLen > 0) {
-            // Skip over any padding in the file
-            ICUBinary.skipBytes(bytes, This.fHeader.fSFTable - pos);
-            pos = This.fHeader.fSFTable;
-
-            // Create & fill the table itself.
-            This.fSFTable = RBBIStateTable.get(bytes, This.fHeader.fSFTableLen);
-            pos += This.fHeader.fSFTableLen;
-        }
-
-        //
-        // Read in the Safe Reverse state table
-        //
-        if (This.fHeader.fSRTableLen > 0) {
-            // Skip over any padding in the file
-            ICUBinary.skipBytes(bytes, This.fHeader.fSRTable - pos);
-            pos = This.fHeader.fSRTable;
-
-            // Create & fill the table itself.
-            This.fSRTable = RBBIStateTable.get(bytes, This.fHeader.fSRTableLen);
-            pos += This.fHeader.fSRTableLen;
-        }
-
-        // Rule Compatibility Hacks
-        //    If a rule set includes reverse rules but does not explicitly include safe reverse rules,
-        //    the reverse rules are to be treated as safe reverse rules.
-
-        if (This.fSRTable == null && This.fRTable != null) {
-            This.fSRTable = This.fRTable;
-            This.fRTable = null;
-        }
-
          //
          // Unserialize the Character categories TRIE
          //     Because we can't be absolutely certain where the Trie deserialize will
@@ -512,10 +453,6 @@ public final class RBBIDataWrapper {
          dumpTable(out, fFTable);
          out.println("Reverse State Table");
          dumpTable(out, fRTable);
-        out.println("Forward Safe Points Table");
-        dumpTable(out, fSFTable);
-        out.println("Reverse Safe Points Table");
-        dumpTable(out, fSRTable);
  
          dumpCharCategories(out);
          out.println("Source Rules: " + fRuleSource);
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java

index 66c87c770becb7e177e1e67c8299d6fac001b009..8cce28129eb6c21fcbf7f949bb27b271aeee2b8f 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java
@@ -40,8 +40,8 @@ class RBBIRuleBuilder {
      RBBINode[]         fTreeRoots = new RBBINode[4];
      static final int   fForwardTree = 0;  // Indexes into the above fTreeRoots array
      static final int   fReverseTree = 1;  //   for each of the trees.
-    static final int   fSafeFwdTree = 2;  //   (in C, these are pointer variables and
-    static final int   fSafeRevTree = 3;  //    there is no array.)
+    //                                    //   (in C, these are pointer variables and
+    //                                    //    there is no array.)
      int fDefaultTree = fForwardTree;      // For rules not qualified with a !
                                            //   the tree to which they belong to.
  
@@ -57,10 +57,7 @@ class RBBIRuleBuilder {
  
      RBBISetBuilder fSetBuilder;           // Set and Character Category builder.
      List<RBBINode> fUSetNodes;            // Vector of all uset nodes.
-    RBBITableBuilder fForwardTables;      // State transition tables
-    RBBITableBuilder fReverseTables;
-    RBBITableBuilder fSafeFwdTables;
-    RBBITableBuilder fSafeRevTables;
+    RBBITableBuilder fForwardTable;       // State transition tables
  
      //
      // Status {tag} values.   These structures are common to all of the rule sets (Forward, Reverse, etc.).
@@ -176,20 +173,16 @@ class RBBIRuleBuilder {
          //   Sections sizes actually stored in the header are for the actual data
          //     without the padding.
          //
-        int headerSize       = 24 * 4;     // align8(sizeof(RBBIDataHeader));
-        int forwardTableSize = align8(fForwardTables.getTableSize());
-        int reverseTableSize = align8(fReverseTables.getTableSize());
-        // int safeFwdTableSize = align8(fSafeFwdTables.getTableSize());
-        int safeRevTableSize = align8(fSafeRevTables.getTableSize());
+        int headerSize       = RBBIDataWrapper.DH_SIZE * 4;     // align8(sizeof(RBBIDataHeader));
+        int forwardTableSize = align8(fForwardTable.getTableSize());
+        int reverseTableSize = align8(fForwardTable.getSafeTableSize());
          int trieSize         = align8(fSetBuilder.getTrieSize());
          int statusTableSize  = align8(fRuleStatusVals.size() * 4);
          int rulesSize        = align8((strippedRules.length()) * 2);
  
          int totalSize = headerSize
                  + forwardTableSize
-                + /* reverseTableSize */ 0
-                + /* safeFwdTableSize */ 0
-                + (safeRevTableSize > 0 ? safeRevTableSize : reverseTableSize)
+                + reverseTableSize
                  + statusTableSize + trieSize + rulesSize;
          int outputPos = 0;               // Track stream position, starting from RBBIDataHeader.
  
@@ -207,39 +200,14 @@ class RBBIRuleBuilder {
          header[RBBIDataWrapper.DH_LENGTH]        = totalSize;            // fLength, the total size of all rule sections.
          header[RBBIDataWrapper.DH_CATCOUNT]      = fSetBuilder.getNumCharCategories(); // fCatCount.
  
-        // Only save the forward table and the safe reverse table,
-        // because these are the only ones used at run-time.
-        //
-        // For the moment, we still build the other tables if they are present in the rule source files,
-        // for backwards compatibility. Old rule files need to work, and this is the simplest approach.
-        //
-        // Additional backwards compatibility consideration: if no safe rules are provided, consider the
-        // reverse rules to actually be the safe reverse rules.
-
          header[RBBIDataWrapper.DH_FTABLE]        = headerSize;           // fFTable
          header[RBBIDataWrapper.DH_FTABLELEN]     = forwardTableSize;     // fTableLen
  
-        // Do not save Reverse Table.
          header[RBBIDataWrapper.DH_RTABLE]        = header[RBBIDataWrapper.DH_FTABLE] + forwardTableSize; // fRTable
-        header[RBBIDataWrapper.DH_RTABLELEN]     = 0;                    // fRTableLen
-
-        // Do not save the Safe Forward table.
-        header[RBBIDataWrapper.DH_SFTABLE]       = header[RBBIDataWrapper.DH_RTABLE]
-                                                     + 0;                // fSTable
-        header[RBBIDataWrapper.DH_SFTABLELEN]    = 0;                    // fSTableLen
-
-        // Safe reverse table. Use if present, otherwise save regular reverse table as the safe reverse.
-        header[RBBIDataWrapper.DH_SRTABLE]       = header[RBBIDataWrapper.DH_SFTABLE]
-                                                     + 0;                // fSRTable
-        if (safeRevTableSize > 0) {
-            header[RBBIDataWrapper.DH_SRTABLELEN] = safeRevTableSize;
-        } else {
-            assert reverseTableSize > 0;
-            header[RBBIDataWrapper.DH_SRTABLELEN] = reverseTableSize;
-        }
+        header[RBBIDataWrapper.DH_RTABLELEN]     = reverseTableSize;     // fRTableLen
  
-        header[RBBIDataWrapper.DH_TRIE]          = header[RBBIDataWrapper.DH_SRTABLE]
-                                                     + header[RBBIDataWrapper.DH_SRTABLELEN]; // fTrie
+        header[RBBIDataWrapper.DH_TRIE]          = header[RBBIDataWrapper.DH_RTABLE]
+                                                     + header[RBBIDataWrapper.DH_RTABLELEN]; // fTrie
          header[RBBIDataWrapper.DH_TRIELEN]       = fSetBuilder.getTrieSize(); // fTrieLen
          header[RBBIDataWrapper.DH_STATUSTABLE]   = header[RBBIDataWrapper.DH_TRIE]
                                                       + header[RBBIDataWrapper.DH_TRIELEN];
@@ -253,49 +221,25 @@ class RBBIRuleBuilder {
          }
  
          // Write out the actual state tables.
-        RBBIDataWrapper.RBBIStateTable table = fForwardTables.exportTable();
-        assert(outputPos == header[4]);
+        RBBIDataWrapper.RBBIStateTable table = fForwardTable.exportTable();
+        assert(outputPos == header[RBBIDataWrapper.DH_FTABLE]);
          outputPos += table.put(dos);
  
-        /* do not write the reverse table
-        tableData = fReverseTables.exportTable();
-        Assert.assrt(outputPos == header[6]);
-        for (i = 0; i < tableData.length; i++) {
-            dos.writeShort(tableData[i]);
-            outputPos += 2;
-        }
-        */
-
-        /* do not write safe forwards table
-        Assert.assrt(outputPos == header[8]);
-        tableData = fSafeFwdTables.exportTable();
-        for (i = 0; i < tableData.length; i++) {
-            dos.writeShort(tableData[i]);
-            outputPos += 2;
-        }
-        */
-
-        // Write the safe reverse table.
-        // If not present, write the plain reverse table (old style rule compatibility)
-        assert(outputPos == header[10]);
-        if (safeRevTableSize > 0) {
-            table = fSafeRevTables.exportTable();
-        } else {
-            table = fReverseTables.exportTable();
-        }
+        table = fForwardTable.exportSafeTable();
+        Assert.assrt(outputPos == header[RBBIDataWrapper.DH_RTABLE]);
          outputPos += table.put(dos);
  
          // write out the Trie table
-        Assert.assrt(outputPos == header[12]);
+        Assert.assrt(outputPos == header[RBBIDataWrapper.DH_TRIE]);
          fSetBuilder.serializeTrie(os);
-        outputPos += header[13];
+        outputPos += header[RBBIDataWrapper.DH_TRIELEN];
          while (outputPos % 8 != 0) { // pad to an 8 byte boundary
              dos.write(0);
              outputPos += 1;
          }
  
          // Write out the status {tag} table.
-        Assert.assrt(outputPos == header[16]);
+        Assert.assrt(outputPos == header[RBBIDataWrapper.DH_STATUSTABLE]);
          for (Integer val : fRuleStatusVals) {
              dos.writeInt(val.intValue());
              outputPos += 4;
@@ -308,7 +252,7 @@ class RBBIRuleBuilder {
  
          // Write out the stripped rules (rules with extra spaces removed
          //   These go last in the data area, even though they are not last in the header.
-        Assert.assrt(outputPos == header[14]);
+        Assert.assrt(outputPos == header[RBBIDataWrapper.DH_RULESOURCE]);
          dos.writeChars(strippedRules);
          outputPos += strippedRules.length() * 2;
          while (outputPos % 8 != 0) { // pad to an 8 byte boundary
@@ -330,7 +274,15 @@ class RBBIRuleBuilder {
          // and list of all Unicode Sets referenced by the rules.
          //
          RBBIRuleBuilder builder = new RBBIRuleBuilder(rules);
-        builder.fScanner.parse();
+        builder.build(os);
+    }
+
+    /**
+     * Compile rules to the binary form, write that to an output stream.
+     *
+     */
+    void build(OutputStream os) throws IOException {
+        fScanner.parse();
  
          //
          // UnicodeSet processing.
@@ -338,31 +290,30 @@ class RBBIRuleBuilder {
          //    Generate the mapping tables (TRIE) from input code points to
          //    the character categories.
          //
-        builder.fSetBuilder.buildRanges();
+        fSetBuilder.buildRanges();
  
          //
          //   Generate the DFA state transition table.
          //
-        builder.fForwardTables = new RBBITableBuilder(builder, fForwardTree);
-        builder.fReverseTables = new RBBITableBuilder(builder, fReverseTree);
-        builder.fSafeFwdTables = new RBBITableBuilder(builder, fSafeFwdTree);
-        builder.fSafeRevTables = new RBBITableBuilder(builder, fSafeRevTree);
-        builder.fForwardTables.build();
-        builder.fReverseTables.build();
-        builder.fSafeFwdTables.build();
-        builder.fSafeRevTables.build();
-        if (builder.fDebugEnv != null
-                && builder.fDebugEnv.indexOf("states") >= 0) {
-            builder.fForwardTables.printRuleStatusTable();
+        fForwardTable = new RBBITableBuilder(this, fForwardTree);
+        fForwardTable.buildForwardTable();
+        optimizeTables();
+        fForwardTable.buildSafeReverseTable();
+
+
+        if (fDebugEnv != null
+                && fDebugEnv.indexOf("states") >= 0) {
+            fForwardTable.printStates();
+            fForwardTable.printRuleStatusTable();
+            fForwardTable.printReverseTable();
          }
  
-        builder.optimizeTables();
-        builder.fSetBuilder.buildTrie();
+        fSetBuilder.buildTrie();
          //
          //   Package up the compiled data, writing it to an output stream
          //      in the serialization format.  This is the same as the ICU4C runtime format.
          //
-        builder.flattenData(os);
+        flattenData(os);
      }
  
      static class IntPair {
@@ -377,17 +328,10 @@ class RBBIRuleBuilder {
  
      void optimizeTables() {
          IntPair duplPair = new IntPair(3, 0);
-        while (fForwardTables.findDuplCharClassFrom(duplPair)) {
+        while (fForwardTable.findDuplCharClassFrom(duplPair)) {
              fSetBuilder.mergeCategories(duplPair.first, duplPair.second);
-            fForwardTables.removeColumn(duplPair.second);
-            fReverseTables.removeColumn(duplPair.second);
-            fSafeFwdTables.removeColumn(duplPair.second);
-            fSafeRevTables.removeColumn(duplPair.second);
+            fForwardTable.removeColumn(duplPair.second);
          }
-
-        fForwardTables.removeDuplicateStates();
-        fReverseTables.removeDuplicateStates();
-        fSafeFwdTables.removeDuplicateStates();
-        fSafeRevTables.removeDuplicateStates();
+        fForwardTable.removeDuplicateStates();
      }
  }
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java

index 2a4d0582541ad711c6970c3de980f8b73e5c47c0..b2ae50823781718323bb1a5bc32fecc9010606ba 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java
@@ -95,7 +95,7 @@ class RBBITableBuilder {
         //                               table from the RBBI rules parse tree.
         //
         //-----------------------------------------------------------------------------
-       void  build() {
+       void  buildForwardTable() {
             // If there were no rules, just return.  This situation can easily arise
             //   for the reverse rules.
             if (fRB.fTreeRoots[fRootIx]==null) {
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java

index 74a44d871c4785201364fd2b73601c8fbc444ad5..12c703e5b66be67559d825dc2bad4985afa98e92 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java
@@ -12,7 +12,6 @@ package com.ibm.icu.text;
  import static com.ibm.icu.impl.CharacterIteration.DONE32;
  import static com.ibm.icu.impl.CharacterIteration.next32;
  import static com.ibm.icu.impl.CharacterIteration.nextTrail32;
-import static com.ibm.icu.impl.CharacterIteration.previous32;
  
  import java.io.ByteArrayOutputStream;
  import java.io.IOException;
@@ -510,7 +509,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
          checkOffset(offset, fText);
  
          // Adjust offset to be on a code point boundary and not beyond the end of the text.
-        // Note that isBoundary() is always be false for offsets that are not on code point boundaries.
+        // Note that isBoundary() is always false for offsets that are not on code point boundaries.
          // But we still need the side effect of leaving iteration at the following boundary.
          int adjustedOffset = CISetIndex32(fText, offset);
  
@@ -966,142 +965,71 @@ public class RuleBasedBreakIterator extends BreakIterator {
       * This locates a "Safe Position" from which the forward break rules
       * will operate correctly. A Safe Position is not necessarily a boundary itself.
       *
-     * The logic of this function is very similar to handleNext(), above.
+     * The logic of this function is very similar to handleNext(), above, but simpler
+     * because the safe table does not require as many options.
       *
       * @param fromPosition the position in the input text to begin the iteration.
       * @internal
       */
-    private int handlePrevious(int fromPosition) {
-        if (fText == null) {
-            return 0;
+    private int handleSafePrevious(int fromPosition) {
+        int             state;
+        short           category = 0;
+        int             result = 0;
+
+        // caches for quicker access
+        CharacterIterator text = fText;
+        Trie2 trie = fRData.fTrie;
+        short[] stateTable  = fRData.fRTable.fTable;
+
+        CISetIndex32(text, fromPosition);
+        if (TRACE) {
+            System.out.print("Handle Previous   pos   char  state category");
          }
  
-        int            state;
-        int            category           = 0;
-        int            mode;
-        int            row;
-        int            c;
-        int            result             = 0;
-        int            initialPosition    = fromPosition;
-        fLookAheadMatches.reset();
-        short[] stateTable = fRData.fSRTable.fTable;
-        CISetIndex32(fText, fromPosition);
-        if (fromPosition == fText.getBeginIndex()) {
+        // if we're already at the start of the text, return DONE.
+        if (text.getIndex() == text.getBeginIndex()) {
              return BreakIterator.DONE;
          }
  
-        // set up the starting char
-        result          = initialPosition;
-        c               = previous32(fText);
-
-        // Set up the initial state for the state machine
+        //  Set the initial state for the state machine
+        int c = CharacterIteration.previous32(text);
          state = START_STATE;
-        row = fRData.getRowIndex(state);
-        category = 3;   // TODO:  obsolete?  from the old start/run mode scheme?
-        mode     = RBBI_RUN;
-        if ((fRData.fSRTable.fFlags & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
-            category = 2;
-            mode     = RBBI_START;
-        }
-
-        if (TRACE) {
-            System.out.println("Handle Prev   pos   char  state category ");
-        }
+        int row = fRData.getRowIndex(state);
  
-        // loop until we reach the beginning of the text or transition to state 0
+        // loop until we reach the start of the text or transition to state 0
          //
-        mainLoop: for (;;) {
-            if (c == DONE32) {
-                // Reached end of input string.
-                if (mode == RBBI_END) {
-                    // We have already done the {eof} iteration.  Now is the time
-                    // to unconditionally bail out.
-                    break mainLoop;
-                }
-                mode = RBBI_END;
-                category = 1;
-            }
-
-            if (mode == RBBI_RUN) {
-                // look up the current character's category, which tells us
-                // which column in the state table to look at.
-                //
-                //  And off the dictionary flag bit. For reverse iteration it is not used.
-                category = (short) fRData.fTrie.get(c);
-                category &= ~0x4000;
-            }
+        for (; c != DONE32; c = CharacterIteration.previous32(text)) {
  
+            // look up the current character's character category, which tells us
+            // which column in the state table to look at.
+            //
+            //  And off the dictionary flag bit. For reverse iteration it is not used.
+            category = (short) trie.get(c);
+            category &= ~0x4000;
              if (TRACE) {
-                System.out.print("             " + fText.getIndex() + "   ");
-                if (0x20 <= c && c < 0x7f) {
-                    System.out.print("  " + c + "  ");
-                } else {
-                    System.out.print(" " + Integer.toHexString(c) + " ");
-                }
-                System.out.println(" " + state + "  " + category + " ");
+                System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
+                System.out.print(RBBIDataWrapper.intToHexString(c, 10));
+                System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
              }
  
              // State Transition - move machine to its next state
              //
+            assert(category < fRData.fHeader.fCatCount);
              state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
-            row = fRData.getRowIndex(state);
-
-            if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
-                // Match found, common case, could have lookahead so we move
-                // on to check it
-                result = fText.getIndex();
-            }
-
-
-            int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING];
-            if (completedRule > 0) {
-                // Lookahead match is completed.
-                int lookaheadResult = fLookAheadMatches.getPosition(completedRule);
-                if (lookaheadResult >= 0) {
-                    result = lookaheadResult;
-                    break mainLoop;
-                }
-            }
-            int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
-            if (rule != 0) {
-                // At the position of a '/' in a look-ahead match. Record it.
-                int pos = fText.getIndex();
-                fLookAheadMatches.setPosition(rule, pos);
-            }
+            row   = fRData.getRowIndex(state);
  
              if (state == STOP_STATE) {
-                // Normal loop exit is here
-                break mainLoop;
-            }
-
-            // then move iterator position backwards one character
-            //
-            if (mode == RBBI_RUN) {
-                c = previous32(fText);
-            } else {
-                if (mode == RBBI_START) {
-                    mode = RBBI_RUN;
-                }
+                // This is the normal exit from the lookup state machine.
+                // Transition to state zero means we have found a safe point.
+                break;
              }
-
-
-        }   // End of the main loop.
-
-        // The state machine is done.  Check whether it found a match...
-        //
-        // If the iterator failed to move in the match engine, force it back by one code point.
-        //   (This really indicates a defect in the break rules.  They should always match
-        //    at least one character.)
-        if (result == initialPosition) {
-            CISetIndex32(fText, initialPosition);
-            previous32(fText);
-            result = fText.getIndex();
          }
  
+        // The state machine is done.
+        result = text.getIndex();
          if (TRACE) {
-            System.out.println("Result = " + result);
+            System.out.println("result = " + result);
          }
-
          return result;
      }
  
@@ -1493,11 +1421,26 @@ class BreakCache {
          if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) {
              int aBoundary = fText.getBeginIndex();
              int ruleStatusIndex = 0;
-            // TODO: check for position == length of text. Although may still need to back up to get rule status.
              if (position > aBoundary + 20) {
-                int backupPos = handlePrevious(position);
-                fPosition = backupPos;
-                aBoundary = handleNext();                // Ignore dictionary, just finding a rule based boundary.
+                int backupPos = handleSafePrevious(position);
+                if (backupPos > aBoundary) {
+                    // Advance to the boundary following the backup position.
+                    // There is a complication: the safe reverse rules identify pairs of code points
+                    // that are safe. If advancing from the safe point moves forwards by less than
+                    // two code points, we need to advance one more time to ensure that the boundary
+                    // is good, including a correct rules status value.
+                    //
+                    fPosition = backupPos;
+                    aBoundary = handleNext();
+                    if (aBoundary == backupPos + 1 ||
+                            (aBoundary == backupPos + 2 &&
+                            Character.isHighSurrogate(fText.setIndex(backupPos)) &&
+                            Character.isLowSurrogate(fText.next()))) {
+                        // The initial handleNext() only advanced by a single code point. Go again.
+                        // Safe rules identify safe pairs.
+                        aBoundary = handleNext();
+                    }
+                }
                  ruleStatusIndex = fRuleStatusIndex;
              }
              reset(aBoundary, ruleStatusIndex);               // Reset cache to hold aBoundary as a single starting point.
@@ -1628,21 +1571,34 @@ class BreakCache {
              if (backupPosition <= textBegin) {
                  backupPosition = textBegin;
              } else {
-                backupPosition = handlePrevious(backupPosition);
+                backupPosition = handleSafePrevious(backupPosition);
              }
              if (backupPosition == BreakIterator.DONE || backupPosition == textBegin) {
                  position = textBegin;
                  positionStatusIdx = 0;
              } else {
+                // Advance to the boundary following the backup position.
+                // There is a complication: the safe reverse rules identify pairs of code points
+                // that are safe. If advancing from the safe point moves forwards by less than
+                // two code points, we need to advance one more time to ensure that the boundary
+                // is good, including a correct rules status value.
+                //
                  fPosition = backupPosition;  // TODO: pass starting position in a clearer way.
                  position = handleNext();
+                if (position == backupPosition + 1 ||
+                        (position == backupPosition + 2 &&
+                        Character.isHighSurrogate(fText.setIndex(backupPosition)) &&
+                        Character.isLowSurrogate(fText.next()))) {
+                    // The initial handleNext() only advanced by a single code point. Go again.
+                    // Safe rules identify safe pairs.
+                    position = handleNext();
+                }
                  positionStatusIdx = fRuleStatusIndex;
-
              }
          } while (position >= fromPosition);
  
          // Find boundaries between the one we just located and the first already-cached boundary
-        // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer..
+        // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer.
  
          fSideBuffer.removeAllElements();
          fSideBuffer.push(position);
author	Andy Heninger <andy.heninger@gmail.com>
	Thu, 29 Mar 2018 16:09:26 +0000 (16:09 +0000)
committer	Andy Heninger <andy.heninger@gmail.com>
	Thu, 29 Mar 2018 16:09:26 +0000 (16:09 +0000)
icu4j/main/classes/core/src/com/ibm/icu/text/RBBIDataWrapper.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java		patch \| blob \| history