From 3180a6400ef94df5d494116ae7a68f34e55a5cf0 Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Fri, 30 Mar 2018 01:12:50 +0000 Subject: [PATCH] ICU-13194 RBBI safe tables Java port, work in progress. X-SVN-Rev: 41172 --- .../src/com/ibm/icu/text/RBBIRuleBuilder.java | 9 +- .../src/com/ibm/icu/text/RBBIRuleScanner.java | 14 +- .../com/ibm/icu/text/RBBITableBuilder.java | 277 ++++++++++++++++-- 3 files changed, 264 insertions(+), 36 deletions(-) diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java index 8cce28129eb..12e7b0e0b58 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java @@ -35,13 +35,16 @@ class RBBIRuleBuilder { // // There are four separate parse trees generated, one for each of the // forward rules, reverse rules, safe forward rules and safe reverse rules. - // This array references the root of each of the trees. + // This array references the root of each of the trees. + // Only fForwardTree data is actually used to generate a state table. + // The other three are retained for back compatibility with old rule files, + // which may have safe and reverse rules. These are still parsed. // RBBINode[] fTreeRoots = new RBBINode[4]; static final int fForwardTree = 0; // Indexes into the above fTreeRoots array static final int fReverseTree = 1; // for each of the trees. - // // (in C, these are pointer variables and - // // there is no array.) + static final int fSafeFwdTree = 3; // (in C, these are pointer variables and + static final int fSafeRevTree = 4; // there is no array.) int fDefaultTree = fForwardTree; // For rules not qualified with a ! // the tree to which they belong to. diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java index afdc927b2f2..8fb44c71a3f 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java @@ -292,7 +292,7 @@ class RBBIRuleScanner { // OR this rule into the appropriate group of them. // - int destRules = (fReverseRule ? RBBIRuleBuilder.fReverseTree : fRB.fDefaultTree); + int destRules = (fReverseRule ? RBBIRuleBuilder.fSafeRevTree : fRB.fDefaultTree); if (fRB.fTreeRoots[destRules] != null) { // This is not the first rule encountered. @@ -972,18 +972,6 @@ class RBBIRuleScanner { error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX); } - // - // If there were NO user specified reverse rules, set up the equivalent of ".*;" - // - if (fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] == null) { - fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] = pushNewNode(RBBINode.opStar); - RBBINode operand = pushNewNode(RBBINode.setRef); - findSetFor(kAny, operand, null); - fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].fLeftChild = operand; - operand.fParent = fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree]; - fNodeStackPtr -= 2; - } - // // Parsing of the input RBBI rules is complete. // We now have a parse tree for the rule expressions diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java index b2ae5082378..a40a1bff46a 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java @@ -23,18 +23,16 @@ import com.ibm.icu.lang.UCharacter; import com.ibm.icu.lang.UProperty; import com.ibm.icu.text.RBBIRuleBuilder.IntPair; -// -// class RBBITableBuilder is part of the RBBI rule compiler. -// It builds the state transition table used by the RBBI runtime -// from the expression syntax tree generated by the rule scanner. -// -// This class is part of the RBBI implementation only. -// There is no user-visible public API here. -// +/** + * This class is part of the RBBI rule compiler. + * It builds the state transition table used by the RBBI runtime + * from the expression syntax tree generated by the rule scanner. + * + * This class is part of the RBBI implementation only. + * There is no user-visible public API here. + */ class RBBITableBuilder { - - // // RBBIStateDescriptor - The DFA is initially constructed as a set of these descriptors, // one for each state. @@ -65,13 +63,15 @@ class RBBITableBuilder { private RBBIRuleBuilder fRB; - private int fRootIx; // The array index into RBBIRuleBuilder.fTreeRoots - // for the parse tree to operate on. - // Too bad Java can't do indirection more easily! - private List fDStates; // D states (Aho's terminology) - // Index is state number - // Contents are RBBIStateDescriptor pointers. + /** The array index into RBBIRuleBuilder.fTreeRoots for the parse tree to operate on. */ + private int fRootIx; + + /** D states (Aho's terminology). Index is state number. */ + private List fDStates; + + /** Synthesized safe table, a List of row arrays. */ + private List fSafeTable; //----------------------------------------------------------------------------- // @@ -91,8 +91,8 @@ class RBBITableBuilder { //----------------------------------------------------------------------------- // - // RBBITableBuilder::build - This is the main function for building the DFA state transtion - // table from the RBBI rules parse tree. + // RBBITableBuilder::buildForwardTable - This is the main function for building + // the DFA state transition table from the RBBI rules parse tree. // //----------------------------------------------------------------------------- void buildForwardTable() { @@ -195,8 +195,6 @@ class RBBITableBuilder { // for all tables. Merge the ones from this table into the global set. // mergeRuleStatusVals(); - - if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("states")>=0) {printStates();} } @@ -924,6 +922,40 @@ class RBBITableBuilder { return false; } + /** + * Find the next duplicate state in the safe reverse table. An iterator function. + * @param states in/out parameter, specifies where to start looking for duplicates, + * and returns the first pair of duplicates found, if any. + * @return true if duplicate states were found, false otherwise. + * @internal + */ + boolean findDuplicateSafeState(RBBIRuleBuilder.IntPair states) { + int numStates = fSafeTable.size(); + + for (; states.first duplState) { + newVal = existingVal - 1; + } + row[col] = (short)newVal; + } + } + } + /** * Check for, and remove duplicate states (table rows). @@ -1047,6 +1106,146 @@ class RBBITableBuilder { return table; } + /** + * Synthesize a safe state table from the main state table. + */ + void buildSafeReverseTable() { + // Find safe char class pairs. + + // make a state table row for each trailing class, and map from class to row. + + // For each pair + // startRow[p1] = p2 + // p2row[p2] = stopRow + // For each unfilled in cell + // set to row corresponding to its column. + + // Each safe pair is stored as two chars in the safePair stringBuilder. + StringBuilder safePairs = new StringBuilder(); + + int numCharClasses = fRB.fSetBuilder.getNumCharCategories(); + int numStates = fDStates.size(); + + for (int c1=0; c1 + // Row 0 is the stop state. + // Row 1 is the start sate. + // Row 2 and beyond are other states, initially one per char class, but + // after initial construction, many of the states will be combined, compacting the table.) + // The String holds the nextState data only. The four leading fields of a row, fAccepting, + // fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building. + + assert(fSafeTable == null); + fSafeTable = new ArrayList(); + for (int row=0; row