*/
@Deprecated
public RBBIStateTable fRTable;
- /**
- * @internal
- * @deprecated This API is ICU internal only.
- */
- @Deprecated
- public RBBIStateTable fSFTable;
- /**
- * @internal
- * @deprecated This API is ICU internal only.
- */
- @Deprecated
- public RBBIStateTable fSRTable;
Trie2 fTrie;
String fRuleSource;
int fStatusTable[];
static final int DATA_FORMAT = 0x42726b20; // "Brk "
- static final int FORMAT_VERSION = 0x04000000; // 4.0.0.0
+ static final int FORMAT_VERSION = 0x05000000; // 5.0.0.0
private static final class IsAcceptable implements Authenticate {
@Override
// Indexes to fields in the ICU4C style binary form of the RBBI Data Header
// Used by the rule compiler when flattening the data.
//
- final static int DH_SIZE = 24;
+ final static int DH_SIZE = 20;
final static int DH_MAGIC = 0;
final static int DH_FORMATVERSION = 1;
final static int DH_LENGTH = 2;
final static int DH_FTABLELEN = 5;
final static int DH_RTABLE = 6;
final static int DH_RTABLELEN = 7;
- final static int DH_SFTABLE = 8;
- final static int DH_SFTABLELEN = 9;
- final static int DH_SRTABLE = 10;
- final static int DH_SRTABLELEN = 11;
- final static int DH_TRIE = 12;
- final static int DH_TRIELEN = 13;
- final static int DH_RULESOURCE = 14;
- final static int DH_RULESOURCELEN = 15;
- final static int DH_STATUSTABLE = 16;
- final static int DH_STATUSTABLELEN = 17;
+ final static int DH_TRIE = 8;
+ final static int DH_TRIELEN = 9;
+ final static int DH_RULESOURCE = 10;
+ final static int DH_RULESOURCELEN = 11;
+ final static int DH_STATUSTABLE = 12;
+ final static int DH_STATUSTABLELEN = 13;
// Index offsets to the fields in a state table row.
int fFTableLen;
int fRTable; // Offset to the reverse state transition table.
int fRTableLen;
- int fSFTable; // safe point forward transition table
- int fSFTableLen;
- int fSRTable; // safe point reverse transition table
- int fSRTableLen;
int fTrie; // Offset to Trie data for character categories
int fTrieLen;
int fRuleSource; // Offset to the source for for the break
This.fHeader.fFTableLen = bytes.getInt();
This.fHeader.fRTable = bytes.getInt();
This.fHeader.fRTableLen = bytes.getInt();
- This.fHeader.fSFTable = bytes.getInt();
- This.fHeader.fSFTableLen = bytes.getInt();
- This.fHeader.fSRTable = bytes.getInt();
- This.fHeader.fSRTableLen = bytes.getInt();
This.fHeader.fTrie = bytes.getInt();
This.fHeader.fTrieLen = bytes.getInt();
This.fHeader.fRuleSource = bytes.getInt();
This.fRTable = RBBIStateTable.get(bytes, This.fHeader.fRTableLen);
pos += This.fHeader.fRTableLen;
- //
- // Read in the Safe Forward state table
- //
- if (This.fHeader.fSFTableLen > 0) {
- // Skip over any padding in the file
- ICUBinary.skipBytes(bytes, This.fHeader.fSFTable - pos);
- pos = This.fHeader.fSFTable;
-
- // Create & fill the table itself.
- This.fSFTable = RBBIStateTable.get(bytes, This.fHeader.fSFTableLen);
- pos += This.fHeader.fSFTableLen;
- }
-
- //
- // Read in the Safe Reverse state table
- //
- if (This.fHeader.fSRTableLen > 0) {
- // Skip over any padding in the file
- ICUBinary.skipBytes(bytes, This.fHeader.fSRTable - pos);
- pos = This.fHeader.fSRTable;
-
- // Create & fill the table itself.
- This.fSRTable = RBBIStateTable.get(bytes, This.fHeader.fSRTableLen);
- pos += This.fHeader.fSRTableLen;
- }
-
- // Rule Compatibility Hacks
- // If a rule set includes reverse rules but does not explicitly include safe reverse rules,
- // the reverse rules are to be treated as safe reverse rules.
-
- if (This.fSRTable == null && This.fRTable != null) {
- This.fSRTable = This.fRTable;
- This.fRTable = null;
- }
-
//
// Unserialize the Character categories TRIE
// Because we can't be absolutely certain where the Trie deserialize will
dumpTable(out, fFTable);
out.println("Reverse State Table");
dumpTable(out, fRTable);
- out.println("Forward Safe Points Table");
- dumpTable(out, fSFTable);
- out.println("Reverse Safe Points Table");
- dumpTable(out, fSRTable);
dumpCharCategories(out);
out.println("Source Rules: " + fRuleSource);
RBBINode[] fTreeRoots = new RBBINode[4];
static final int fForwardTree = 0; // Indexes into the above fTreeRoots array
static final int fReverseTree = 1; // for each of the trees.
- static final int fSafeFwdTree = 2; // (in C, these are pointer variables and
- static final int fSafeRevTree = 3; // there is no array.)
+ // (In C, the tree roots are separate pointer variables and
+ // there is no array.)
int fDefaultTree = fForwardTree; // For rules not qualified with a !
// the tree to which they belong to.
RBBISetBuilder fSetBuilder; // Set and Character Category builder.
List<RBBINode> fUSetNodes; // Vector of all uset nodes.
- RBBITableBuilder fForwardTables; // State transition tables
- RBBITableBuilder fReverseTables;
- RBBITableBuilder fSafeFwdTables;
- RBBITableBuilder fSafeRevTables;
+ RBBITableBuilder fForwardTable; // State transition tables
//
// Status {tag} values. These structures are common to all of the rule sets (Forward, Reverse, etc.).
// Sections sizes actually stored in the header are for the actual data
// without the padding.
//
- int headerSize = 24 * 4; // align8(sizeof(RBBIDataHeader));
- int forwardTableSize = align8(fForwardTables.getTableSize());
- int reverseTableSize = align8(fReverseTables.getTableSize());
- // int safeFwdTableSize = align8(fSafeFwdTables.getTableSize());
- int safeRevTableSize = align8(fSafeRevTables.getTableSize());
+ int headerSize = RBBIDataWrapper.DH_SIZE * 4; // align8(sizeof(RBBIDataHeader));
+ int forwardTableSize = align8(fForwardTable.getTableSize());
+ int reverseTableSize = align8(fForwardTable.getSafeTableSize());
int trieSize = align8(fSetBuilder.getTrieSize());
int statusTableSize = align8(fRuleStatusVals.size() * 4);
int rulesSize = align8((strippedRules.length()) * 2);
int totalSize = headerSize
+ forwardTableSize
- + /* reverseTableSize */ 0
- + /* safeFwdTableSize */ 0
- + (safeRevTableSize > 0 ? safeRevTableSize : reverseTableSize)
+ + reverseTableSize
+ statusTableSize + trieSize + rulesSize;
int outputPos = 0; // Track stream position, starting from RBBIDataHeader.
header[RBBIDataWrapper.DH_LENGTH] = totalSize; // fLength, the total size of all rule sections.
header[RBBIDataWrapper.DH_CATCOUNT] = fSetBuilder.getNumCharCategories(); // fCatCount.
- // Only save the forward table and the safe reverse table,
- // because these are the only ones used at run-time.
- //
- // For the moment, we still build the other tables if they are present in the rule source files,
- // for backwards compatibility. Old rule files need to work, and this is the simplest approach.
- //
- // Additional backwards compatibility consideration: if no safe rules are provided, consider the
- // reverse rules to actually be the safe reverse rules.
-
header[RBBIDataWrapper.DH_FTABLE] = headerSize; // fFTable
header[RBBIDataWrapper.DH_FTABLELEN] = forwardTableSize; // fTableLen
- // Do not save Reverse Table.
header[RBBIDataWrapper.DH_RTABLE] = header[RBBIDataWrapper.DH_FTABLE] + forwardTableSize; // fRTable
- header[RBBIDataWrapper.DH_RTABLELEN] = 0; // fRTableLen
-
- // Do not save the Safe Forward table.
- header[RBBIDataWrapper.DH_SFTABLE] = header[RBBIDataWrapper.DH_RTABLE]
- + 0; // fSTable
- header[RBBIDataWrapper.DH_SFTABLELEN] = 0; // fSTableLen
-
- // Safe reverse table. Use if present, otherwise save regular reverse table as the safe reverse.
- header[RBBIDataWrapper.DH_SRTABLE] = header[RBBIDataWrapper.DH_SFTABLE]
- + 0; // fSRTable
- if (safeRevTableSize > 0) {
- header[RBBIDataWrapper.DH_SRTABLELEN] = safeRevTableSize;
- } else {
- assert reverseTableSize > 0;
- header[RBBIDataWrapper.DH_SRTABLELEN] = reverseTableSize;
- }
+ header[RBBIDataWrapper.DH_RTABLELEN] = reverseTableSize; // fRTableLen
- header[RBBIDataWrapper.DH_TRIE] = header[RBBIDataWrapper.DH_SRTABLE]
- + header[RBBIDataWrapper.DH_SRTABLELEN]; // fTrie
+ header[RBBIDataWrapper.DH_TRIE] = header[RBBIDataWrapper.DH_RTABLE]
+ + header[RBBIDataWrapper.DH_RTABLELEN]; // fTrie
header[RBBIDataWrapper.DH_TRIELEN] = fSetBuilder.getTrieSize(); // fTrieLen
header[RBBIDataWrapper.DH_STATUSTABLE] = header[RBBIDataWrapper.DH_TRIE]
+ header[RBBIDataWrapper.DH_TRIELEN];
}
// Write out the actual state tables.
- RBBIDataWrapper.RBBIStateTable table = fForwardTables.exportTable();
- assert(outputPos == header[4]);
+ RBBIDataWrapper.RBBIStateTable table = fForwardTable.exportTable();
+ assert(outputPos == header[RBBIDataWrapper.DH_FTABLE]);
outputPos += table.put(dos);
- /* do not write the reverse table
- tableData = fReverseTables.exportTable();
- Assert.assrt(outputPos == header[6]);
- for (i = 0; i < tableData.length; i++) {
- dos.writeShort(tableData[i]);
- outputPos += 2;
- }
- */
-
- /* do not write safe forwards table
- Assert.assrt(outputPos == header[8]);
- tableData = fSafeFwdTables.exportTable();
- for (i = 0; i < tableData.length; i++) {
- dos.writeShort(tableData[i]);
- outputPos += 2;
- }
- */
-
- // Write the safe reverse table.
- // If not present, write the plain reverse table (old style rule compatibility)
- assert(outputPos == header[10]);
- if (safeRevTableSize > 0) {
- table = fSafeRevTables.exportTable();
- } else {
- table = fReverseTables.exportTable();
- }
+ table = fForwardTable.exportSafeTable();
+ Assert.assrt(outputPos == header[RBBIDataWrapper.DH_RTABLE]);
outputPos += table.put(dos);
// write out the Trie table
- Assert.assrt(outputPos == header[12]);
+ Assert.assrt(outputPos == header[RBBIDataWrapper.DH_TRIE]);
fSetBuilder.serializeTrie(os);
- outputPos += header[13];
+ outputPos += header[RBBIDataWrapper.DH_TRIELEN];
while (outputPos % 8 != 0) { // pad to an 8 byte boundary
dos.write(0);
outputPos += 1;
}
// Write out the status {tag} table.
- Assert.assrt(outputPos == header[16]);
+ Assert.assrt(outputPos == header[RBBIDataWrapper.DH_STATUSTABLE]);
for (Integer val : fRuleStatusVals) {
dos.writeInt(val.intValue());
outputPos += 4;
// Write out the stripped rules (rules with extra spaces removed
// These go last in the data area, even though they are not last in the header.
- Assert.assrt(outputPos == header[14]);
+ Assert.assrt(outputPos == header[RBBIDataWrapper.DH_RULESOURCE]);
dos.writeChars(strippedRules);
outputPos += strippedRules.length() * 2;
while (outputPos % 8 != 0) { // pad to an 8 byte boundary
// and list of all Unicode Sets referenced by the rules.
//
RBBIRuleBuilder builder = new RBBIRuleBuilder(rules);
- builder.fScanner.parse();
+ builder.build(os);
+ }
+
+ /**
+ * Compile rules to the binary form, write that to an output stream.
+ *
+ */
+ void build(OutputStream os) throws IOException {
+ fScanner.parse();
//
// UnicodeSet processing.
// Generate the mapping tables (TRIE) from input code points to
// the character categories.
//
- builder.fSetBuilder.buildRanges();
+ fSetBuilder.buildRanges();
//
// Generate the DFA state transition table.
//
- builder.fForwardTables = new RBBITableBuilder(builder, fForwardTree);
- builder.fReverseTables = new RBBITableBuilder(builder, fReverseTree);
- builder.fSafeFwdTables = new RBBITableBuilder(builder, fSafeFwdTree);
- builder.fSafeRevTables = new RBBITableBuilder(builder, fSafeRevTree);
- builder.fForwardTables.build();
- builder.fReverseTables.build();
- builder.fSafeFwdTables.build();
- builder.fSafeRevTables.build();
- if (builder.fDebugEnv != null
- && builder.fDebugEnv.indexOf("states") >= 0) {
- builder.fForwardTables.printRuleStatusTable();
+ fForwardTable = new RBBITableBuilder(this, fForwardTree);
+ fForwardTable.buildForwardTable();
+ optimizeTables();
+ fForwardTable.buildSafeReverseTable();
+
+
+ if (fDebugEnv != null
+ && fDebugEnv.indexOf("states") >= 0) {
+ fForwardTable.printStates();
+ fForwardTable.printRuleStatusTable();
+ fForwardTable.printReverseTable();
}
- builder.optimizeTables();
- builder.fSetBuilder.buildTrie();
+ fSetBuilder.buildTrie();
//
// Package up the compiled data, writing it to an output stream
// in the serialization format. This is the same as the ICU4C runtime format.
//
- builder.flattenData(os);
+ flattenData(os);
}
static class IntPair {
void optimizeTables() {
IntPair duplPair = new IntPair(3, 0);
- while (fForwardTables.findDuplCharClassFrom(duplPair)) {
+ while (fForwardTable.findDuplCharClassFrom(duplPair)) {
fSetBuilder.mergeCategories(duplPair.first, duplPair.second);
- fForwardTables.removeColumn(duplPair.second);
- fReverseTables.removeColumn(duplPair.second);
- fSafeFwdTables.removeColumn(duplPair.second);
- fSafeRevTables.removeColumn(duplPair.second);
+ fForwardTable.removeColumn(duplPair.second);
}
-
- fForwardTables.removeDuplicateStates();
- fReverseTables.removeDuplicateStates();
- fSafeFwdTables.removeDuplicateStates();
- fSafeRevTables.removeDuplicateStates();
+ fForwardTable.removeDuplicateStates();
}
}
// table from the RBBI rules parse tree.
//
//-----------------------------------------------------------------------------
- void build() {
+ void buildForwardTable() {
// If there were no rules, just return. This situation can easily arise
// for the reverse rules.
if (fRB.fTreeRoots[fRootIx]==null) {
import static com.ibm.icu.impl.CharacterIteration.DONE32;
import static com.ibm.icu.impl.CharacterIteration.next32;
import static com.ibm.icu.impl.CharacterIteration.nextTrail32;
-import static com.ibm.icu.impl.CharacterIteration.previous32;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
checkOffset(offset, fText);
// Adjust offset to be on a code point boundary and not beyond the end of the text.
- // Note that isBoundary() is always be false for offsets that are not on code point boundaries.
+ // Note that isBoundary() is always false for offsets that are not on code point boundaries.
// But we still need the side effect of leaving iteration at the following boundary.
int adjustedOffset = CISetIndex32(fText, offset);
* This locates a "Safe Position" from which the forward break rules
* will operate correctly. A Safe Position is not necessarily a boundary itself.
*
- * The logic of this function is very similar to handleNext(), above.
+ * The logic of this function is very similar to handleNext(), above, but simpler
+ * because the safe table does not require as many options.
*
* @param fromPosition the position in the input text to begin the iteration.
* @internal
*/
- private int handlePrevious(int fromPosition) {
- if (fText == null) {
- return 0;
+ private int handleSafePrevious(int fromPosition) {
+ int state;
+ short category = 0;
+ int result = 0;
+
+ // caches for quicker access
+ CharacterIterator text = fText;
+ Trie2 trie = fRData.fTrie;
+ short[] stateTable = fRData.fRTable.fTable;
+
+ CISetIndex32(text, fromPosition);
+ if (TRACE) {
+ System.out.print("Handle Previous pos char state category");
}
- int state;
- int category = 0;
- int mode;
- int row;
- int c;
- int result = 0;
- int initialPosition = fromPosition;
- fLookAheadMatches.reset();
- short[] stateTable = fRData.fSRTable.fTable;
- CISetIndex32(fText, fromPosition);
- if (fromPosition == fText.getBeginIndex()) {
+ // if we're already at the start of the text, return DONE.
+ if (text.getIndex() == text.getBeginIndex()) {
return BreakIterator.DONE;
}
- // set up the starting char
- result = initialPosition;
- c = previous32(fText);
-
- // Set up the initial state for the state machine
+ // Set the initial state for the state machine
+ int c = CharacterIteration.previous32(text);
state = START_STATE;
- row = fRData.getRowIndex(state);
- category = 3; // TODO: obsolete? from the old start/run mode scheme?
- mode = RBBI_RUN;
- if ((fRData.fSRTable.fFlags & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
- category = 2;
- mode = RBBI_START;
- }
-
- if (TRACE) {
- System.out.println("Handle Prev pos char state category ");
- }
+ int row = fRData.getRowIndex(state);
- // loop until we reach the beginning of the text or transition to state 0
+ // loop until we reach the start of the text or transition to state 0
//
- mainLoop: for (;;) {
- if (c == DONE32) {
- // Reached end of input string.
- if (mode == RBBI_END) {
- // We have already done the {eof} iteration. Now is the time
- // to unconditionally bail out.
- break mainLoop;
- }
- mode = RBBI_END;
- category = 1;
- }
-
- if (mode == RBBI_RUN) {
- // look up the current character's category, which tells us
- // which column in the state table to look at.
- //
- // And off the dictionary flag bit. For reverse iteration it is not used.
- category = (short) fRData.fTrie.get(c);
- category &= ~0x4000;
- }
+ for (; c != DONE32; c = CharacterIteration.previous32(text)) {
+ // look up the current character's character category, which tells us
+ // which column in the state table to look at.
+ //
+ // Mask off the dictionary flag bit. For reverse iteration it is not used.
+ category = (short) trie.get(c);
+ category &= ~0x4000;
if (TRACE) {
- System.out.print(" " + fText.getIndex() + " ");
- if (0x20 <= c && c < 0x7f) {
- System.out.print(" " + c + " ");
- } else {
- System.out.print(" " + Integer.toHexString(c) + " ");
- }
- System.out.println(" " + state + " " + category + " ");
+ System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5));
+ System.out.print(RBBIDataWrapper.intToHexString(c, 10));
+ System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
}
// State Transition - move machine to its next state
//
+ assert(category < fRData.fHeader.fCatCount);
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
- row = fRData.getRowIndex(state);
-
- if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
- // Match found, common case, could have lookahead so we move
- // on to check it
- result = fText.getIndex();
- }
-
-
- int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING];
- if (completedRule > 0) {
- // Lookahead match is completed.
- int lookaheadResult = fLookAheadMatches.getPosition(completedRule);
- if (lookaheadResult >= 0) {
- result = lookaheadResult;
- break mainLoop;
- }
- }
- int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
- if (rule != 0) {
- // At the position of a '/' in a look-ahead match. Record it.
- int pos = fText.getIndex();
- fLookAheadMatches.setPosition(rule, pos);
- }
+ row = fRData.getRowIndex(state);
if (state == STOP_STATE) {
- // Normal loop exit is here
- break mainLoop;
- }
-
- // then move iterator position backwards one character
- //
- if (mode == RBBI_RUN) {
- c = previous32(fText);
- } else {
- if (mode == RBBI_START) {
- mode = RBBI_RUN;
- }
+ // This is the normal exit from the lookup state machine.
+ // Transition to state zero means we have found a safe point.
+ break;
}
-
-
- } // End of the main loop.
-
- // The state machine is done. Check whether it found a match...
- //
- // If the iterator failed to move in the match engine, force it back by one code point.
- // (This really indicates a defect in the break rules. They should always match
- // at least one character.)
- if (result == initialPosition) {
- CISetIndex32(fText, initialPosition);
- previous32(fText);
- result = fText.getIndex();
}
+ // The state machine is done.
+ result = text.getIndex();
if (TRACE) {
- System.out.println("Result = " + result);
+ System.out.println("result = " + result);
}
-
return result;
}
if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) {
int aBoundary = fText.getBeginIndex();
int ruleStatusIndex = 0;
- // TODO: check for position == length of text. Although may still need to back up to get rule status.
if (position > aBoundary + 20) {
- int backupPos = handlePrevious(position);
- fPosition = backupPos;
- aBoundary = handleNext(); // Ignore dictionary, just finding a rule based boundary.
+ int backupPos = handleSafePrevious(position);
+ if (backupPos > aBoundary) {
+ // Advance to the boundary following the backup position.
+ // There is a complication: the safe reverse rules identify pairs of code points
+ // that are safe. If advancing from the safe point moves forwards by less than
+ // two code points, we need to advance one more time to ensure that the boundary
+ // is good, including a correct rules status value.
+ //
+ fPosition = backupPos;
+ aBoundary = handleNext();
+ if (aBoundary == backupPos + 1 ||
+ (aBoundary == backupPos + 2 &&
+ Character.isHighSurrogate(fText.setIndex(backupPos)) &&
+ Character.isLowSurrogate(fText.next()))) {
+ // The initial handleNext() only advanced by a single code point. Go again.
+ // Safe rules identify safe pairs.
+ aBoundary = handleNext();
+ }
+ }
ruleStatusIndex = fRuleStatusIndex;
}
reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point.
if (backupPosition <= textBegin) {
backupPosition = textBegin;
} else {
- backupPosition = handlePrevious(backupPosition);
+ backupPosition = handleSafePrevious(backupPosition);
}
if (backupPosition == BreakIterator.DONE || backupPosition == textBegin) {
position = textBegin;
positionStatusIdx = 0;
} else {
+ // Advance to the boundary following the backup position.
+ // There is a complication: the safe reverse rules identify pairs of code points
+ // that are safe. If advancing from the safe point moves forwards by less than
+ // two code points, we need to advance one more time to ensure that the boundary
+ // is good, including a correct rules status value.
+ //
fPosition = backupPosition; // TODO: pass starting position in a clearer way.
position = handleNext();
+ if (position == backupPosition + 1 ||
+ (position == backupPosition + 2 &&
+ Character.isHighSurrogate(fText.setIndex(backupPosition)) &&
+ Character.isLowSurrogate(fText.next()))) {
+ // The initial handleNext() only advanced by a single code point. Go again.
+ // Safe rules identify safe pairs.
+ position = handleNext();
+ }
positionStatusIdx = fRuleStatusIndex;
-
}
} while (position >= fromPosition);
// Find boundaries between the one we just located and the first already-cached boundary
- // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer..
+ // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer.
fSideBuffer.removeAllElements();
fSideBuffer.push(position);