From: Andy Heninger Date: Thu, 29 Mar 2018 16:09:26 +0000 (+0000) Subject: ICU-13194 RBBI safe tables, Java port, work in progress. X-Git-Tag: release-62-rc~204^2~11 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=ed5b77c406c2ee4383aee13cff8eba6520b7a46c;p=icu ICU-13194 RBBI safe tables, Java port, work in progress. X-SVN-Rev: 41170 --- diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIDataWrapper.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIDataWrapper.java index d881946895a..200f814fd49 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIDataWrapper.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIDataWrapper.java @@ -177,25 +177,13 @@ public final class RBBIDataWrapper { */ @Deprecated public RBBIStateTable fRTable; - /** - * @internal - * @deprecated This API is ICU internal only. - */ - @Deprecated - public RBBIStateTable fSFTable; - /** - * @internal - * @deprecated This API is ICU internal only. - */ - @Deprecated - public RBBIStateTable fSRTable; Trie2 fTrie; String fRuleSource; int fStatusTable[]; static final int DATA_FORMAT = 0x42726b20; // "Brk " - static final int FORMAT_VERSION = 0x04000000; // 4.0.0.0 + static final int FORMAT_VERSION = 0x05000000; // 5.0.0.0 private static final class IsAcceptable implements Authenticate { @Override @@ -210,7 +198,7 @@ public final class RBBIDataWrapper { // Indexes to fields in the ICU4C style binary form of the RBBI Data Header // Used by the rule compiler when flattening the data. 
// - final static int DH_SIZE = 24; + final static int DH_SIZE = 20; final static int DH_MAGIC = 0; final static int DH_FORMATVERSION = 1; final static int DH_LENGTH = 2; @@ -219,16 +207,12 @@ public final class RBBIDataWrapper { final static int DH_FTABLELEN = 5; final static int DH_RTABLE = 6; final static int DH_RTABLELEN = 7; - final static int DH_SFTABLE = 8; - final static int DH_SFTABLELEN = 9; - final static int DH_SRTABLE = 10; - final static int DH_SRTABLELEN = 11; - final static int DH_TRIE = 12; - final static int DH_TRIELEN = 13; - final static int DH_RULESOURCE = 14; - final static int DH_RULESOURCELEN = 15; - final static int DH_STATUSTABLE = 16; - final static int DH_STATUSTABLELEN = 17; + final static int DH_TRIE = 8; + final static int DH_TRIELEN = 9; + final static int DH_RULESOURCE = 10; + final static int DH_RULESOURCELEN = 11; + final static int DH_STATUSTABLE = 12; + final static int DH_STATUSTABLELEN = 13; // Index offsets to the fields in a state table row. @@ -299,10 +283,6 @@ public final class RBBIDataWrapper { int fFTableLen; int fRTable; // Offset to the reverse state transition table. 
int fRTableLen; - int fSFTable; // safe point forward transition table - int fSFTableLen; - int fSRTable; // safe point reverse transition table - int fSRTableLen; int fTrie; // Offset to Trie data for character categories int fTrieLen; int fRuleSource; // Offset to the source for the break @@ -358,10 +338,6 @@ public final class RBBIDataWrapper { This.fHeader.fFTableLen = bytes.getInt(); This.fHeader.fRTable = bytes.getInt(); This.fHeader.fRTableLen = bytes.getInt(); - This.fHeader.fSFTable = bytes.getInt(); - This.fHeader.fSFTableLen = bytes.getInt(); - This.fHeader.fSRTable = bytes.getInt(); - This.fHeader.fSRTableLen = bytes.getInt(); This.fHeader.fTrie = bytes.getInt(); This.fHeader.fTrieLen = bytes.getInt(); This.fHeader.fRuleSource = bytes.getInt(); @@ -406,41 +382,6 @@ public final class RBBIDataWrapper { This.fRTable = RBBIStateTable.get(bytes, This.fHeader.fRTableLen); pos += This.fHeader.fRTableLen; - // - // Read in the Safe Forward state table - // - if (This.fHeader.fSFTableLen > 0) { - // Skip over any padding in the file - ICUBinary.skipBytes(bytes, This.fHeader.fSFTable - pos); - pos = This.fHeader.fSFTable; - - // Create & fill the table itself. - This.fSFTable = RBBIStateTable.get(bytes, This.fHeader.fSFTableLen); - pos += This.fHeader.fSFTableLen; - } - - // - // Read in the Safe Reverse state table - // - if (This.fHeader.fSRTableLen > 0) { - // Skip over any padding in the file - ICUBinary.skipBytes(bytes, This.fHeader.fSRTable - pos); - pos = This.fHeader.fSRTable; - - // Create & fill the table itself. - This.fSRTable = RBBIStateTable.get(bytes, This.fHeader.fSRTableLen); - pos += This.fHeader.fSRTableLen; - } - - // Rule Compatibility Hacks - // If a rule set includes reverse rules but does not explicitly include safe reverse rules, - // the reverse rules are to be treated as safe reverse rules. 
- - if (This.fSRTable == null && This.fRTable != null) { - This.fSRTable = This.fRTable; - This.fRTable = null; - } - // // Unserialize the Character categories TRIE // Because we can't be absolutely certain where the Trie deserialize will @@ -512,10 +453,6 @@ public final class RBBIDataWrapper { dumpTable(out, fFTable); out.println("Reverse State Table"); dumpTable(out, fRTable); - out.println("Forward Safe Points Table"); - dumpTable(out, fSFTable); - out.println("Reverse Safe Points Table"); - dumpTable(out, fSRTable); dumpCharCategories(out); out.println("Source Rules: " + fRuleSource); diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java index 66c87c770be..8cce28129eb 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java @@ -40,8 +40,8 @@ class RBBIRuleBuilder { RBBINode[] fTreeRoots = new RBBINode[4]; static final int fForwardTree = 0; // Indexes into the above fTreeRoots array static final int fReverseTree = 1; // for each of the trees. - static final int fSafeFwdTree = 2; // (in C, these are pointer variables and - static final int fSafeRevTree = 3; // there is no array.) + // // (in C, these are pointer variables and + // // there is no array.) int fDefaultTree = fForwardTree; // For rules not qualified with a ! // the tree to which they belong to. @@ -57,10 +57,7 @@ class RBBIRuleBuilder { RBBISetBuilder fSetBuilder; // Set and Character Category builder. List fUSetNodes; // Vector of all uset nodes. - RBBITableBuilder fForwardTables; // State transition tables - RBBITableBuilder fReverseTables; - RBBITableBuilder fSafeFwdTables; - RBBITableBuilder fSafeRevTables; + RBBITableBuilder fForwardTable; // State transition tables // // Status {tag} values. These structures are common to all of the rule sets (Forward, Reverse, etc.). 
@@ -176,20 +173,16 @@ class RBBIRuleBuilder { // Sections sizes actually stored in the header are for the actual data // without the padding. // - int headerSize = 24 * 4; // align8(sizeof(RBBIDataHeader)); - int forwardTableSize = align8(fForwardTables.getTableSize()); - int reverseTableSize = align8(fReverseTables.getTableSize()); - // int safeFwdTableSize = align8(fSafeFwdTables.getTableSize()); - int safeRevTableSize = align8(fSafeRevTables.getTableSize()); + int headerSize = RBBIDataWrapper.DH_SIZE * 4; // align8(sizeof(RBBIDataHeader)); + int forwardTableSize = align8(fForwardTable.getTableSize()); + int reverseTableSize = align8(fForwardTable.getSafeTableSize()); int trieSize = align8(fSetBuilder.getTrieSize()); int statusTableSize = align8(fRuleStatusVals.size() * 4); int rulesSize = align8((strippedRules.length()) * 2); int totalSize = headerSize + forwardTableSize - + /* reverseTableSize */ 0 - + /* safeFwdTableSize */ 0 - + (safeRevTableSize > 0 ? safeRevTableSize : reverseTableSize) + + reverseTableSize + statusTableSize + trieSize + rulesSize; int outputPos = 0; // Track stream position, starting from RBBIDataHeader. @@ -207,39 +200,14 @@ class RBBIRuleBuilder { header[RBBIDataWrapper.DH_LENGTH] = totalSize; // fLength, the total size of all rule sections. header[RBBIDataWrapper.DH_CATCOUNT] = fSetBuilder.getNumCharCategories(); // fCatCount. - // Only save the forward table and the safe reverse table, - // because these are the only ones used at run-time. - // - // For the moment, we still build the other tables if they are present in the rule source files, - // for backwards compatibility. Old rule files need to work, and this is the simplest approach. - // - // Additional backwards compatibility consideration: if no safe rules are provided, consider the - // reverse rules to actually be the safe reverse rules. 
- header[RBBIDataWrapper.DH_FTABLE] = headerSize; // fFTable header[RBBIDataWrapper.DH_FTABLELEN] = forwardTableSize; // fTableLen - // Do not save Reverse Table. header[RBBIDataWrapper.DH_RTABLE] = header[RBBIDataWrapper.DH_FTABLE] + forwardTableSize; // fRTable - header[RBBIDataWrapper.DH_RTABLELEN] = 0; // fRTableLen - - // Do not save the Safe Forward table. - header[RBBIDataWrapper.DH_SFTABLE] = header[RBBIDataWrapper.DH_RTABLE] - + 0; // fSTable - header[RBBIDataWrapper.DH_SFTABLELEN] = 0; // fSTableLen - - // Safe reverse table. Use if present, otherwise save regular reverse table as the safe reverse. - header[RBBIDataWrapper.DH_SRTABLE] = header[RBBIDataWrapper.DH_SFTABLE] - + 0; // fSRTable - if (safeRevTableSize > 0) { - header[RBBIDataWrapper.DH_SRTABLELEN] = safeRevTableSize; - } else { - assert reverseTableSize > 0; - header[RBBIDataWrapper.DH_SRTABLELEN] = reverseTableSize; - } + header[RBBIDataWrapper.DH_RTABLELEN] = reverseTableSize; // fRTableLen - header[RBBIDataWrapper.DH_TRIE] = header[RBBIDataWrapper.DH_SRTABLE] - + header[RBBIDataWrapper.DH_SRTABLELEN]; // fTrie + header[RBBIDataWrapper.DH_TRIE] = header[RBBIDataWrapper.DH_RTABLE] + + header[RBBIDataWrapper.DH_RTABLELEN]; // fTrie header[RBBIDataWrapper.DH_TRIELEN] = fSetBuilder.getTrieSize(); // fTrieLen header[RBBIDataWrapper.DH_STATUSTABLE] = header[RBBIDataWrapper.DH_TRIE] + header[RBBIDataWrapper.DH_TRIELEN]; @@ -253,49 +221,25 @@ class RBBIRuleBuilder { } // Write out the actual state tables. 
- RBBIDataWrapper.RBBIStateTable table = fForwardTables.exportTable(); - assert(outputPos == header[4]); + RBBIDataWrapper.RBBIStateTable table = fForwardTable.exportTable(); + assert(outputPos == header[RBBIDataWrapper.DH_FTABLE]); outputPos += table.put(dos); - /* do not write the reverse table - tableData = fReverseTables.exportTable(); - Assert.assrt(outputPos == header[6]); - for (i = 0; i < tableData.length; i++) { - dos.writeShort(tableData[i]); - outputPos += 2; - } - */ - - /* do not write safe forwards table - Assert.assrt(outputPos == header[8]); - tableData = fSafeFwdTables.exportTable(); - for (i = 0; i < tableData.length; i++) { - dos.writeShort(tableData[i]); - outputPos += 2; - } - */ - - // Write the safe reverse table. - // If not present, write the plain reverse table (old style rule compatibility) - assert(outputPos == header[10]); - if (safeRevTableSize > 0) { - table = fSafeRevTables.exportTable(); - } else { - table = fReverseTables.exportTable(); - } + table = fForwardTable.exportSafeTable(); + Assert.assrt(outputPos == header[RBBIDataWrapper.DH_RTABLE]); outputPos += table.put(dos); // write out the Trie table - Assert.assrt(outputPos == header[12]); + Assert.assrt(outputPos == header[RBBIDataWrapper.DH_TRIE]); fSetBuilder.serializeTrie(os); - outputPos += header[13]; + outputPos += header[RBBIDataWrapper.DH_TRIELEN]; while (outputPos % 8 != 0) { // pad to an 8 byte boundary dos.write(0); outputPos += 1; } // Write out the status {tag} table. - Assert.assrt(outputPos == header[16]); + Assert.assrt(outputPos == header[RBBIDataWrapper.DH_STATUSTABLE]); for (Integer val : fRuleStatusVals) { dos.writeInt(val.intValue()); outputPos += 4; @@ -308,7 +252,7 @@ class RBBIRuleBuilder { // Write out the stripped rules (rules with extra spaces removed // These go last in the data area, even though they are not last in the header. 
- Assert.assrt(outputPos == header[14]); + Assert.assrt(outputPos == header[RBBIDataWrapper.DH_RULESOURCE]); dos.writeChars(strippedRules); outputPos += strippedRules.length() * 2; while (outputPos % 8 != 0) { // pad to an 8 byte boundary @@ -330,7 +274,15 @@ class RBBIRuleBuilder { // and list of all Unicode Sets referenced by the rules. // RBBIRuleBuilder builder = new RBBIRuleBuilder(rules); - builder.fScanner.parse(); + builder.build(os); + } + + /** + * Compile rules to the binary form, write that to an output stream. + * + */ + void build(OutputStream os) throws IOException { + fScanner.parse(); // // UnicodeSet processing. @@ -338,31 +290,30 @@ class RBBIRuleBuilder { // Generate the mapping tables (TRIE) from input code points to // the character categories. // - builder.fSetBuilder.buildRanges(); + fSetBuilder.buildRanges(); // // Generate the DFA state transition table. // - builder.fForwardTables = new RBBITableBuilder(builder, fForwardTree); - builder.fReverseTables = new RBBITableBuilder(builder, fReverseTree); - builder.fSafeFwdTables = new RBBITableBuilder(builder, fSafeFwdTree); - builder.fSafeRevTables = new RBBITableBuilder(builder, fSafeRevTree); - builder.fForwardTables.build(); - builder.fReverseTables.build(); - builder.fSafeFwdTables.build(); - builder.fSafeRevTables.build(); - if (builder.fDebugEnv != null - && builder.fDebugEnv.indexOf("states") >= 0) { - builder.fForwardTables.printRuleStatusTable(); + fForwardTable = new RBBITableBuilder(this, fForwardTree); + fForwardTable.buildForwardTable(); + optimizeTables(); + fForwardTable.buildSafeReverseTable(); + + + if (fDebugEnv != null + && fDebugEnv.indexOf("states") >= 0) { + fForwardTable.printStates(); + fForwardTable.printRuleStatusTable(); + fForwardTable.printReverseTable(); } - builder.optimizeTables(); - builder.fSetBuilder.buildTrie(); + fSetBuilder.buildTrie(); // // Package up the compiled data, writing it to an output stream // in the serialization format. 
This is the same as the ICU4C runtime format. // - builder.flattenData(os); + flattenData(os); } static class IntPair { @@ -377,17 +328,10 @@ class RBBIRuleBuilder { void optimizeTables() { IntPair duplPair = new IntPair(3, 0); - while (fForwardTables.findDuplCharClassFrom(duplPair)) { + while (fForwardTable.findDuplCharClassFrom(duplPair)) { fSetBuilder.mergeCategories(duplPair.first, duplPair.second); - fForwardTables.removeColumn(duplPair.second); - fReverseTables.removeColumn(duplPair.second); - fSafeFwdTables.removeColumn(duplPair.second); - fSafeRevTables.removeColumn(duplPair.second); + fForwardTable.removeColumn(duplPair.second); } - - fForwardTables.removeDuplicateStates(); - fReverseTables.removeDuplicateStates(); - fSafeFwdTables.removeDuplicateStates(); - fSafeRevTables.removeDuplicateStates(); + fForwardTable.removeDuplicateStates(); } } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java index 2a4d0582541..b2ae5082378 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java @@ -95,7 +95,7 @@ class RBBITableBuilder { // table from the RBBI rules parse tree. // //----------------------------------------------------------------------------- - void build() { + void buildForwardTable() { // If there were no rules, just return. This situation can easily arise // for the reverse rules. 
if (fRB.fTreeRoots[fRootIx]==null) { diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java index 74a44d871c4..12c703e5b66 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java @@ -12,7 +12,6 @@ package com.ibm.icu.text; import static com.ibm.icu.impl.CharacterIteration.DONE32; import static com.ibm.icu.impl.CharacterIteration.next32; import static com.ibm.icu.impl.CharacterIteration.nextTrail32; -import static com.ibm.icu.impl.CharacterIteration.previous32; import java.io.ByteArrayOutputStream; import java.io.IOException; @@ -510,7 +509,7 @@ public class RuleBasedBreakIterator extends BreakIterator { checkOffset(offset, fText); // Adjust offset to be on a code point boundary and not beyond the end of the text. - // Note that isBoundary() is always be false for offsets that are not on code point boundaries. + // Note that isBoundary() is always false for offsets that are not on code point boundaries. // But we still need the side effect of leaving iteration at the following boundary. int adjustedOffset = CISetIndex32(fText, offset); @@ -966,142 +965,71 @@ public class RuleBasedBreakIterator extends BreakIterator { * This locates a "Safe Position" from which the forward break rules * will operate correctly. A Safe Position is not necessarily a boundary itself. * - * The logic of this function is very similar to handleNext(), above. + * The logic of this function is very similar to handleNext(), above, but simpler + * because the safe table does not require as many options. * * @param fromPosition the position in the input text to begin the iteration. 
* @internal */ - private int handlePrevious(int fromPosition) { - if (fText == null) { - return 0; + private int handleSafePrevious(int fromPosition) { + int state; + short category = 0; + int result = 0; + + // caches for quicker access + CharacterIterator text = fText; + Trie2 trie = fRData.fTrie; + short[] stateTable = fRData.fRTable.fTable; + + CISetIndex32(text, fromPosition); + if (TRACE) { + System.out.print("Handle Previous pos char state category"); } - int state; - int category = 0; - int mode; - int row; - int c; - int result = 0; - int initialPosition = fromPosition; - fLookAheadMatches.reset(); - short[] stateTable = fRData.fSRTable.fTable; - CISetIndex32(fText, fromPosition); - if (fromPosition == fText.getBeginIndex()) { + // if we're already at the start of the text, return DONE. + if (text.getIndex() == text.getBeginIndex()) { return BreakIterator.DONE; } - // set up the starting char - result = initialPosition; - c = previous32(fText); - - // Set up the initial state for the state machine + // Set the initial state for the state machine + int c = CharacterIteration.previous32(text); state = START_STATE; - row = fRData.getRowIndex(state); - category = 3; // TODO: obsolete? from the old start/run mode scheme? - mode = RBBI_RUN; - if ((fRData.fSRTable.fFlags & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) { - category = 2; - mode = RBBI_START; - } - - if (TRACE) { - System.out.println("Handle Prev pos char state category "); - } + int row = fRData.getRowIndex(state); - // loop until we reach the beginning of the text or transition to state 0 + // loop until we reach the start of the text or transition to state 0 // - mainLoop: for (;;) { - if (c == DONE32) { - // Reached end of input string. - if (mode == RBBI_END) { - // We have already done the {eof} iteration. Now is the time - // to unconditionally bail out. 
- break mainLoop; - } - mode = RBBI_END; - category = 1; - } - - if (mode == RBBI_RUN) { - // look up the current character's category, which tells us - // which column in the state table to look at. - // - // And off the dictionary flag bit. For reverse iteration it is not used. - category = (short) fRData.fTrie.get(c); - category &= ~0x4000; - } + for (; c != DONE32; c = CharacterIteration.previous32(text)) { + // look up the current character's character category, which tells us + // which column in the state table to look at. + // + // And off the dictionary flag bit. For reverse iteration it is not used. + category = (short) trie.get(c); + category &= ~0x4000; if (TRACE) { - System.out.print(" " + fText.getIndex() + " "); - if (0x20 <= c && c < 0x7f) { - System.out.print(" " + c + " "); - } else { - System.out.print(" " + Integer.toHexString(c) + " "); - } - System.out.println(" " + state + " " + category + " "); + System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); + System.out.print(RBBIDataWrapper.intToHexString(c, 10)); + System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); } // State Transition - move machine to its next state // + assert(category < fRData.fHeader.fCatCount); state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; - row = fRData.getRowIndex(state); - - if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) { - // Match found, common case, could have lookahead so we move - // on to check it - result = fText.getIndex(); - } - - - int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING]; - if (completedRule > 0) { - // Lookahead match is completed. - int lookaheadResult = fLookAheadMatches.getPosition(completedRule); - if (lookaheadResult >= 0) { - result = lookaheadResult; - break mainLoop; - } - } - int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD]; - if (rule != 0) { - // At the position of a '/' in a look-ahead match. Record it. 
- int pos = fText.getIndex(); - fLookAheadMatches.setPosition(rule, pos); - } + row = fRData.getRowIndex(state); if (state == STOP_STATE) { - // Normal loop exit is here - break mainLoop; - } - - // then move iterator position backwards one character - // - if (mode == RBBI_RUN) { - c = previous32(fText); - } else { - if (mode == RBBI_START) { - mode = RBBI_RUN; - } + // This is the normal exit from the lookup state machine. + // Transition to state zero means we have found a safe point. + break; } - - - } // End of the main loop. - - // The state machine is done. Check whether it found a match... - // - // If the iterator failed to move in the match engine, force it back by one code point. - // (This really indicates a defect in the break rules. They should always match - // at least one character.) - if (result == initialPosition) { - CISetIndex32(fText, initialPosition); - previous32(fText); - result = fText.getIndex(); } + // The state machine is done. + result = text.getIndex(); if (TRACE) { - System.out.println("Result = " + result); + System.out.println("result = " + result); } - return result; } @@ -1493,11 +1421,26 @@ class BreakCache { if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) { int aBoundary = fText.getBeginIndex(); int ruleStatusIndex = 0; - // TODO: check for position == length of text. Although may still need to back up to get rule status. if (position > aBoundary + 20) { - int backupPos = handlePrevious(position); - fPosition = backupPos; - aBoundary = handleNext(); // Ignore dictionary, just finding a rule based boundary. + int backupPos = handleSafePrevious(position); + if (backupPos > aBoundary) { + // Advance to the boundary following the backup position. + // There is a complication: the safe reverse rules identify pairs of code points + // that are safe. 
If advancing from the safe point moves forwards by less than + // two code points, we need to advance one more time to ensure that the boundary + // is good, including a correct rules status value. + // + fPosition = backupPos; + aBoundary = handleNext(); + if (aBoundary == backupPos + 1 || + (aBoundary == backupPos + 2 && + Character.isHighSurrogate(fText.setIndex(backupPos)) && + Character.isLowSurrogate(fText.next()))) { + // The initial handleNext() only advanced by a single code point. Go again. + // Safe rules identify safe pairs. + aBoundary = handleNext(); + } + } ruleStatusIndex = fRuleStatusIndex; } reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point. @@ -1628,21 +1571,34 @@ class BreakCache { if (backupPosition <= textBegin) { backupPosition = textBegin; } else { - backupPosition = handlePrevious(backupPosition); + backupPosition = handleSafePrevious(backupPosition); } if (backupPosition == BreakIterator.DONE || backupPosition == textBegin) { position = textBegin; positionStatusIdx = 0; } else { + // Advance to the boundary following the backup position. + // There is a complication: the safe reverse rules identify pairs of code points + // that are safe. If advancing from the safe point moves forwards by less than + // two code points, we need to advance one more time to ensure that the boundary + // is good, including a correct rules status value. + // fPosition = backupPosition; // TODO: pass starting position in a clearer way. position = handleNext(); + if (position == backupPosition + 1 || + (position == backupPosition + 2 && + Character.isHighSurrogate(fText.setIndex(backupPosition)) && + Character.isLowSurrogate(fText.next()))) { + // The initial handleNext() only advanced by a single code point. Go again. + // Safe rules identify safe pairs. 
+ position = handleNext(); + } positionStatusIdx = fRuleStatusIndex; - } } while (position >= fromPosition); // Find boundaries between the one we just located and the first already-cached boundary - // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer.. + // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer. fSideBuffer.removeAllElements(); fSideBuffer.push(position);