]> granicus.if.org Git - icu/commitdiff
ICU-13194 RBBI safe tables Java port, work in progress.
authorAndy Heninger <andy.heninger@gmail.com>
Fri, 30 Mar 2018 01:12:50 +0000 (01:12 +0000)
committerAndy Heninger <andy.heninger@gmail.com>
Fri, 30 Mar 2018 01:12:50 +0000 (01:12 +0000)
X-SVN-Rev: 41172

icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java
icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java
icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java

index 8cce28129eb6c21fcbf7f949bb27b271aeee2b8f..12e7b0e0b5886eb820b49cec00d9be583b0dd2aa 100644 (file)
@@ -35,13 +35,16 @@ class RBBIRuleBuilder {
     //
     //  There are four separate parse trees generated, one for each of the
     //    forward rules, reverse rules, safe forward rules and safe reverse rules.
-    //  This array references the root of each of the trees.
+    //    This array references the root of each of the trees.
+    //    Only fForwardTree data is actually used to generate a state table.
+    //    The other three are retained for back compatibility with old rule files,
+    //    which may have safe and reverse rules. These are still parsed.
     //
     RBBINode[]         fTreeRoots = new RBBINode[4];
     static final int   fForwardTree = 0;  // Indexes into the above fTreeRoots array
     static final int   fReverseTree = 1;  //   for each of the trees.
-    //                                    //   (in C, these are pointer variables and
-    //                                    //    there is no array.)
+    static final int   fSafeFwdTree = 3;  //   (in C, these are pointer variables and
+    static final int   fSafeRevTree = 4;  //    there is no array.)
     int fDefaultTree = fForwardTree;      // For rules not qualified with a !
                                           //   the tree to which they belong to.
 
index afdc927b2f27cee03d2f2dae7187ed8d3082a660..8fb44c71a3f71f05fbcfa155c74d9908b7dd7315 100644 (file)
@@ -292,7 +292,7 @@ class RBBIRuleScanner {
             //  OR this rule into the appropriate group of them.
             //
 
-            int destRules = (fReverseRule ? RBBIRuleBuilder.fReverseTree : fRB.fDefaultTree);
+            int destRules = (fReverseRule ? RBBIRuleBuilder.fSafeRevTree : fRB.fDefaultTree);
 
             if (fRB.fTreeRoots[destRules] != null) {
                 // This is not the first rule encountered.
@@ -972,18 +972,6 @@ class RBBIRuleScanner {
             error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
         }
 
-        //
-        // If there were NO user specified reverse rules, set up the equivalent of ".*;"
-        //
-        if (fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] == null) {
-            fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] = pushNewNode(RBBINode.opStar);
-            RBBINode operand = pushNewNode(RBBINode.setRef);
-            findSetFor(kAny, operand, null);
-            fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].fLeftChild = operand;
-            operand.fParent = fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree];
-            fNodeStackPtr -= 2;
-        }
-
         //
         // Parsing of the input RBBI rules is complete.
         // We now have a parse tree for the rule expressions
index b2ae50823781718323bb1a5bc32fecc9010606ba..a40a1bff46a73bcfef63515671be141ca830a52e 100644 (file)
@@ -23,18 +23,16 @@ import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.lang.UProperty;
 import com.ibm.icu.text.RBBIRuleBuilder.IntPair;
 
-//
-//  class RBBITableBuilder is part of the RBBI rule compiler.
-//                         It builds the state transition table used by the RBBI runtime
-//                         from the expression syntax tree generated by the rule scanner.
-//
-//                         This class is part of the RBBI implementation only.
-//                         There is no user-visible public API here.
-//
+/**
+ *  This class is part of the RBBI rule compiler.
+ *  It builds the state transition table used by the RBBI runtime
+ *  from the expression syntax tree generated by the rule scanner.
+ *
+ *  This class is part of the RBBI implementation only.
+ *  There is no user-visible public API here.
+ */
 class RBBITableBuilder {
 
-
-
     //
     //  RBBIStateDescriptor - The DFA is initially constructed as a set of these descriptors,
     //                        one for each state.
@@ -65,13 +63,15 @@ class RBBITableBuilder {
 
 
     private  RBBIRuleBuilder  fRB;
-    private  int             fRootIx;             // The array index into RBBIRuleBuilder.fTreeRoots
-                                                   //   for the parse tree to operate on.
-                                                   //   Too bad Java can't do indirection more easily!
 
-    private  List<RBBIStateDescriptor> fDStates;    //  D states (Aho's terminology)
-                                                    //  Index is state number
-                                                    //  Contents are RBBIStateDescriptor pointers.
+    /** The array index into RBBIRuleBuilder.fTreeRoots for the parse tree to operate on. */
+    private  int  fRootIx;
+
+    /** D states (Aho's terminology). Index is state number. */
+    private  List<RBBIStateDescriptor> fDStates;
+
+    /** Synthesized safe table, a List of row arrays.  */
+    private List<short[]>    fSafeTable;
 
     //-----------------------------------------------------------------------------
     //
@@ -91,8 +91,8 @@ class RBBITableBuilder {
 
        //-----------------------------------------------------------------------------
        //
-       //   RBBITableBuilder::build  -  This is the main function for building the DFA state transtion
-       //                               table from the RBBI rules parse tree.
+       //   RBBITableBuilder::buildForwardTable  -  This is the main function for building
+       //                          the DFA state transition table from the RBBI rules parse tree.
        //
        //-----------------------------------------------------------------------------
        void  buildForwardTable() {
@@ -195,8 +195,6 @@ class RBBITableBuilder {
            //    for all tables.  Merge the ones from this table into the global set.
            //
            mergeRuleStatusVals();
-
-           if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("states")>=0) {printStates();}
        }
 
 
@@ -924,6 +922,40 @@ class RBBITableBuilder {
            return false;
        }
 
+       /**
+        *  Find the next duplicate state in the safe reverse table. An iterator function.
+        *  @param states in/out parameter, specifies where to start looking for duplicates,
+        *                and returns the first pair of duplicates found, if any.
+        *  @return true if duplicate states were found, false otherwise.
+        *  @internal
+        */
+       boolean findDuplicateSafeState(RBBIRuleBuilder.IntPair states) {
+           int numStates = fSafeTable.size();
+
+           for (; states.first<numStates-1; ++states.first) {
+               short[] firstRow = fSafeTable.get(states.first);
+               for (states.second=states.first+1; states.second<numStates; ++states.second) {
+                   short[] duplRow = fSafeTable.get(states.second);
+                   boolean rowsMatch = true;
+                   int numCols = firstRow.length;
+                   for (int col=0; col < numCols; ++col) {
+                       int firstVal = firstRow[col];
+                       int duplVal = duplRow[col];
+                       if (!((firstVal == duplVal) ||
+                               ((firstVal == states.first || firstVal == states.second) &&
+                                       (duplVal  == states.first || duplVal  == states.second)))) {
+                           rowsMatch = false;
+                           break;
+                       }
+                   }
+                   if (rowsMatch) {
+                       return true;
+                   }
+               }
+           }
+           return false;
+       }
+
        /**
         * Remove a duplicate state (row) from the state table. All references to the deleted state are
         * redirected to "keepState", the first encountered of the duplicated pair of states.
@@ -964,6 +996,33 @@ class RBBITableBuilder {
            }
        }
 
+       /**
+        * Remove a duplicate state (row) from the safe table. Every reference to the deleted
+        * state is redirected to keepState, and references to states numbered above the deleted
+        * one are decremented, because those rows shift down by one position.
+        * @param keepState The first of the duplicate pair of states, the one to be kept.
+        * @param duplState The second of the duplicate pair, the one to be removed.
+        * @internal
+        */
+       void removeSafeState(int keepState, int duplState) {
+           assert(keepState < duplState);
+           // The row being removed lives in fSafeTable, so that is the list to range-check against.
+           assert(duplState < fSafeTable.size());
+
+           fSafeTable.remove(duplState);
+           int numStates = fSafeTable.size();
+           for (int state=0; state<numStates; ++state) {
+               short[] row = fSafeTable.get(state);
+               for (int col=0; col<row.length; col++) {
+                   int existingVal = row[col];
+                   int newVal = existingVal;
+                   if (existingVal == duplState) {
+                       // Transition into the removed row: send it to the surviving twin.
+                       newVal = keepState;
+                   } else if (existingVal > duplState) {
+                       // Rows after the removed one have each moved up by one index.
+                       newVal = existingVal - 1;
+                   }
+                   row[col] = (short)newVal;
+               }
+           }
+       }
+
 
        /**
         *  Check for, and remove duplicate states (table rows).
@@ -1047,6 +1106,146 @@ class RBBITableBuilder {
            return table;
        }
 
+       /**
+        *   Synthesize a safe state table from the main state table.
+        *
+        *   The result is stored in fSafeTable as a list of rows, one short[] of next-state
+        *   values per state, with one column per character class. Duplicate rows are
+        *   removed before returning.
+        */
+       void buildSafeReverseTable() {
+           // Outline:
+           //   1. Find the "safe" pairs of char classes (c1, c2): pairs for which the forward
+           //      table ends in one and the same state after consuming c1 then c2, no matter
+           //      which state (from state 1 upward) it started in.
+           //   2. Build an initial table with a stop row, a start row, and one row per char class.
+           //   3. For each safe pair, make the second step of the pair transition to the stop row.
+           //   4. Merge duplicate rows.
+
+           // Each safe pair is stored as two chars in the safePairs StringBuilder:
+           // c1 at an even index, immediately followed by c2.
+           StringBuilder safePairs = new StringBuilder();
+
+           int numCharClasses = fRB.fSetBuilder.getNumCharCategories();
+           int numStates = fDStates.size();
+
+           for (int c1=0; c1<numCharClasses; ++c1) {
+               for (int c2=0; c2 < numCharClasses; ++c2) {
+                   // wantedEndState is the end state reached from the first start state tried (-1 until set).
+                   // endState is the end state reached from the start state currently being tried.
+                   int wantedEndState = -1;
+                   int endState = 0;
+                   for (int startState = 1; startState < numStates; ++startState) {
+                       RBBIStateDescriptor startStateD = fDStates.get(startState);
+                       int s2 = startStateD.fDtran[c1];
+                       RBBIStateDescriptor s2StateD = fDStates.get(s2);
+                       endState = s2StateD.fDtran[c2];
+                       if (wantedEndState < 0) {
+                           wantedEndState = endState;
+                       } else {
+                           if (wantedEndState != endState) {
+                               // Two start states disagree, so this pair is not safe.
+                               break;
+                           }
+                       }
+                   }
+                   // Equal only when the loop ran to completion with every start state agreeing.
+                   // (When there are no states to try, -1 != 0 and the pair is not recorded.)
+                   if (wantedEndState == endState) {
+                       safePairs.append((char)c1);
+                       safePairs.append((char)c2);
+                       // System.out.printf("(%d, %d) ", c1, c2);
+                   }
+               }
+               // System.out.printf("\n");
+           }
+
+           // Populate the initial safe table.
+           // The table as a whole is a List<short[]>
+           // Row 0 is the stop state.
+           // Row 1 is the start state.
+           // Row 2 and beyond are other states, initially one per char class, but
+           //   after initial construction, many of the states will be combined, compacting the table.
+           // Each short[] row holds the nextState data only. The four leading fields of a row, fAccepting,
+           // fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
+
+           assert(fSafeTable == null);
+           fSafeTable = new ArrayList<short[]>();
+           for (int row=0; row<numCharClasses + 2; ++row) {
+               fSafeTable.add(new short[numCharClasses]);
+           }
+
+           // From the start state, each input char class transitions to the state for that input.
+           short[] startState = fSafeTable.get(1);
+           for (int charClass=0; charClass < numCharClasses; ++charClass) {
+               // Note: +2 to skip the start & stop state rows.
+               startState[charClass] = (short)(charClass+2);
+           }
+
+           // Initially make every other state table row look like the start state row
+           //    (except for the stop state, which remains all 0)
+           for (int row=2; row<numCharClasses+2; ++row) {
+               System.arraycopy(startState, 0, fSafeTable.get(row), 0, startState.length);
+           }
+
+           // Run through the safe pairs, set the next state to zero when pair has been seen.
+           // Zero being the stop state, meaning we found a safe point.
+           // The cell cleared is in the row for c2, at the column for c1.
+           // NOTE(review): presumably because the safe table is run backwards over the text,
+           //               so c2 is encountered before c1 - confirm against the runtime.
+           for (int pairIdx=0; pairIdx<safePairs.length(); pairIdx+=2) {
+               int c1 = safePairs.charAt(pairIdx);
+               int c2 = safePairs.charAt(pairIdx + 1);
+
+               short[] rowState = fSafeTable.get(c2 + 2);
+               rowState[c1] = 0;
+           }
+
+           // Remove duplicate or redundant rows from the table.
+           // The search starts at row 1; each call resumes from the pair left in 'states'.
+           RBBIRuleBuilder.IntPair states = new RBBIRuleBuilder.IntPair(1, 0);
+           while (findDuplicateSafeState(states)) {
+               // System.out.printf("Removing duplicate safe states (%d, %d)\n", states.first, states.second);
+               removeSafeState(states.first, states.second);
+           }
+       }
+
+
+       /**
+        *  Calculate the size of the runtime form of this safe state table.
+        */
+       int getSafeTableSize() {
+           if (fSafeTable == null) {
+               return 0;
+           }
+           int size    = 16;    // The header of 4 ints, with no rows to the table.
+           int numRows = fSafeTable.size();
+           int numCols = fSafeTable.get(0).length;
+           int rowSize = 8 + 2*numCols;
+           size += numRows * rowSize;
+           // TODO: there are redundant round-up. Figure out best place, get rid of the rest.
+           size = (size + 7) & ~7;   // round up to a multiple of 8 bytes
+           return size;
+       }
+
+
+       /**
+        *  Create a RBBIDataWrapper.RBBIStateTable for the safe reverse table.
+        *  RBBIDataWrapper.RBBIStateTable is similar to struct RBBIStateTable in ICU4C,
+        *  in common/rbbidata.h
+        */
+       RBBIDataWrapper.RBBIStateTable exportSafeTable() {
+           RBBIDataWrapper.RBBIStateTable table = new RBBIDataWrapper.RBBIStateTable();
+           table.fNumStates = fSafeTable.size();
+           int numCharCategories = fSafeTable.get(0).length;
+
+           // Size of table size in shorts.
+           //  the "4" is the size of struct RBBIStateTableRow, the row header part only.
+           int rowLen = 4 + numCharCategories;
+           // TODO: tableSize is basically numStates * numCharCategories,
+           //       except for alignment padding. Clean up here, and in main exportTable().
+           int tableSize = (getSafeTableSize() - 16) / 2;   // fTable length in shorts.
+           table.fTable = new short[tableSize];
+           table.fRowLen = rowLen * 2;                      // Row length in bytes.
+
+           for (int state=0; state<table.fNumStates; state++) {
+               short[] rowArray = fSafeTable.get(state);
+               int row = state * rowLen;
+
+               for (int col=0; col<numCharCategories; col++) {
+                   table.fTable[row + RBBIDataWrapper.NEXTSTATES + col] = rowArray[col];
+               }
+           }
+           return table;
+       }
 
 
        //-----------------------------------------------------------------------------
@@ -1104,6 +1303,44 @@ class RBBITableBuilder {
        }
 
 
+       /**
+        * Debug Function.  Dump the fully constructed safe reverse table.
+        */
+       void printReverseTable() {
+           int     c;    // input "character"
+
+           System.out.printf("    Safe Reverse Table \n");
+           if (fSafeTable == null) {
+               System.out.printf("   --- nullptr ---\n");
+               return;
+           }
+           int numCharCategories = fSafeTable.get(0).length;
+           System.out.printf("state |           i n p u t     s y m b o l s \n");
+           System.out.printf("      | Acc  LA    Tag");
+           for (c=0; c< numCharCategories;  c++) {
+               System.out.printf(" %2d", c);
+           }
+           System.out.printf("\n");
+           System.out.printf("      |---------------");
+           for (c=0; c<numCharCategories; c++) {
+               System.out.printf("---");
+           }
+           System.out.printf("\n");
+
+           for (int n=0; n<fSafeTable.size(); n++) {
+               short rowArray[]  = fSafeTable.get(n);
+               System.out.printf("  %3d | " , n);
+               System.out.printf("%3d %3d %5d ", 0, 0, 0);  // Accepting, LookAhead, Tags
+               for (c=0; c<numCharCategories; c++) {
+                   System.out.printf(" %2d", rowArray[c]);
+               }
+               System.out.printf("\n");
+           }
+           System.out.printf("\n\n");
+       }
+
+
+
 
 
        //-----------------------------------------------------------------------------