ICU-13569 RBBI state table minimization, Java now works.

author Andy Heninger <andy.heninger@gmail.com>

Wed, 14 Feb 2018 23:44:50 +0000 (23:44 +0000)

committer Andy Heninger <andy.heninger@gmail.com>

Wed, 14 Feb 2018 23:44:50 +0000 (23:44 +0000)
author Andy Heninger <andy.heninger@gmail.com>
Wed, 14 Feb 2018 23:44:50 +0000 (23:44 +0000)
committer Andy Heninger <andy.heninger@gmail.com>
Wed, 14 Feb 2018 23:44:50 +0000 (23:44 +0000)
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java

index 87ea903bd25795107ff811aaf5ff44dca0948052..76921c564c36d35d3f73afb1c003e5866339ab32 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java
@@ -372,25 +372,29 @@ class RBBIRuleBuilder {
          builder.flattenData(os);
      }
  
-    static class ClassPair {
-        int left = 3;
-        int right = 0;
+    static class IntPair {
+        int first = 0;
+        int second = 0;
+        IntPair() {};
+        IntPair(int f, int s) {
+            first = f;
+            second = s;
+        }
      }
  
      void optimizeTables() {
-        ClassPair duplPair = new ClassPair();
-
+        IntPair duplPair = new IntPair(3, 0);
          while (fForwardTables.findDuplCharClassFrom(duplPair)) {
-            fSetBuilder.mergeCategories(duplPair);
-            fForwardTables.removeColumn(duplPair.right);
-            fReverseTables.removeColumn(duplPair.right);
-            fSafeFwdTables.removeColumn(duplPair.right);
-            fSafeRevTables.removeColumn(duplPair.right);
+            fSetBuilder.mergeCategories(duplPair.first, duplPair.second);
+            fForwardTables.removeColumn(duplPair.second);
+            fReverseTables.removeColumn(duplPair.second);
+            fSafeFwdTables.removeColumn(duplPair.second);
+            fSafeRevTables.removeColumn(duplPair.second);
          }
  
          fForwardTables.removeDuplicateStates();
          fReverseTables.removeDuplicateStates();
          fSafeFwdTables.removeDuplicateStates();
          fSafeRevTables.removeDuplicateStates();
-
+    }
  }
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBISetBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBISetBuilder.java

index 9f5a8a50a2c5aa2edab942db905215ffb769ed1f..ada2258010199b7ba83f0711caa5bf26d6cc79ce 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBISetBuilder.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBISetBuilder.java
@@ -305,6 +305,10 @@ class RBBISetBuilder {
          }
      }
  
+    /**
+     * Merge two character categories that have been identified as having equivalent behavior.
+     * The ranges belonging to the right category (table column) will be added to the left.
+     */
      void mergeCategories(int left, int right) {
          assert(left >= 1);
          assert(right > left);
@@ -319,6 +323,7 @@ class RBBISetBuilder {
          }
          --fGroupCount;
      }
+
      //-----------------------------------------------------------------------------------
      //
      //          getTrieSize()    Return the size that will be required to serialize the Trie.
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java

index 4d71e76f8120e55534608253eec36c2839f6727c..9130ad81b5809b11ef92bd73a7e301c0ef017b7c 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java
@@ -10,6 +10,7 @@
  package com.ibm.icu.text;
  
  import java.util.ArrayList;
+import java.util.Arrays;
  import java.util.Collection;
  import java.util.HashSet;
  import java.util.List;
@@ -20,6 +21,7 @@ import java.util.TreeSet;
  import com.ibm.icu.impl.Assert;
  import com.ibm.icu.lang.UCharacter;
  import com.ibm.icu.lang.UProperty;
+import com.ibm.icu.text.RBBIRuleBuilder.IntPair;
  
  //
  //  class RBBITableBuilder is part of the RBBI rule compiler.
@@ -832,128 +834,148 @@ class RBBITableBuilder {
  
  
  
-//
-//    findDuplCharClassFrom()
-//
-boolean findDuplCharClassFrom(RBBIRuleBuilder.ClassPair classPair) {
-    int numStates = fDStates.size();
-    int numCols = fRB.fSetBuilder.getNumCharCategories();
-
-    uint16_t table_base;
-    uint16_t table_dupl;
-    for (; baseCategory < numCols-1; ++baseCategory) {
-        for (duplCategory=baseCategory+1; duplCategory < numCols; ++duplCategory) {
-             for (int state=0; state<numStates; state++) {
-                 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates.elementAt(state);
-                 table_base = (uint16_t)sd.fDtran.elementAti(baseCategory);
-                 table_dupl = (uint16_t)sd.fDtran.elementAti(duplCategory);
-                 if (table_base != table_dupl) {
-                     break;
-                 }
-             }
-             if (table_base == table_dupl) {
-                 return true;
-             }
-        }
-    }
-    return false;
-}
+       /**
+        *  Find duplicate (redundant) character classes, beginning at the specified
+        *  pair, within this state table. This is an iterator-like function, used to
+        *  identify character classes (state table columns) that can be eliminated.
+        *  @param categories in/out parameter, specifies where to start looking for duplicates,
+        *                and returns the first pair of duplicates found, if any.
+        *  @return true if duplicate char classes were found, false otherwise.
+        *  @internal
+        */
+       boolean findDuplCharClassFrom(RBBIRuleBuilder.IntPair categories) {
+           int numStates = fDStates.size();
+           int numCols = fRB.fSetBuilder.getNumCharCategories();
+
+           int table_base = 0;
+           int table_dupl = 0;
+           for (; categories.first < numCols-1; ++categories.first) {
+               for (categories.second=categories.first+1; categories.second < numCols; ++categories.second) {
+                   for (int state=0; state<numStates; state++) {
+                       RBBIStateDescriptor sd = fDStates.get(state);
+                       table_base = sd.fDtran[categories.first];
+                       table_dupl = sd.fDtran[categories.second];
+                       if (table_base != table_dupl) {
+                           break;
+                       }
+                   }
+                   if (table_base == table_dupl) {
+                       return true;
+                   }
+               }
+           }
+           return false;
+       }
  
+       /**
+        * Remove a column from the state table. Used when two character categories
+        * have been found equivalent, and merged together, to eliminate the unneeded table column.
+        */
+       void removeColumn(int column) {
+           int numStates = fDStates.size();
+           for (int state=0; state<numStates; state++) {
+               RBBIStateDescriptor sd = fDStates.get(state);
+               assert(column < sd.fDtran.length);
+               int[] newArray = Arrays.copyOf(sd.fDtran, sd.fDtran.length - 1);
+               System.arraycopy(sd.fDtran, column+1, newArray, column, newArray.length - column);
+               sd.fDtran = newArray;
+           }
+       }
  
-//
-//    removeColumn()
-//
-void removeColumn(int column) {
-    int numStates = fDStates.size();
-    for (int state=0; state<numStates; state++) {
-        RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates.elementAt(state);
-        U_ASSERT(column < sd.fDtran.size());
-        sd.fDtran.removeElementAt(column);
-    }
-}
  
-/*
- * findDuplicateState
- */
-bool findDuplicateState(int &firstState, int &duplState) {
-    int numStates = fDStates.size();
-    int numCols = fRB.fSetBuilder.getNumCharCategories();
-
-    for (; firstState<numStates-1; ++firstState) {
-        RBBIStateDescriptor *firstSD = (RBBIStateDescriptor *)fDStates.elementAt(firstState);
-        for (duplState=firstState+1; duplState<numStates; ++duplState) {
-            RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates.elementAt(duplState);
-            if (firstSD.fAccepting != duplSD.fAccepting ||
-                firstSD.fLookAhead != duplSD.fLookAhead ||
-                firstSD.fTagsIdx   != duplSD.fTagsIdx) {
-                continue;
-            }
-            bool rowsMatch = true;
-            for (int col=0; col < numCols; ++col) {
-                int firstVal = firstSD.fDtran.elementAti(col);
-                int duplVal = duplSD.fDtran.elementAti(col);
-                if (!((firstVal == duplVal) ||
-                        ((firstVal == firstState || firstVal == duplState) &&
-                        (duplVal  == firstState || duplVal  == duplState)))) {
-                    rowsMatch = false;
-                    break;
-                }
-            }
-            if (rowsMatch) {
-                return true;
-            }
-        }
-    }
-    return false;
-}
+       /**
+        *  Find duplicate (redundant) states, beginning at the specified pair,
+        *  within this state table. This is an iterator-like function, used to
+        *  identify states (state table rows) that can be eliminated.
+        *  @param states in/out parameter, specifies where to start looking for duplicates,
+        *                and returns the first pair of duplicates found, if any.
+        *  @return true if duplicate states were found, false otherwise.
+        *  @internal
+        */
+       boolean findDuplicateState(RBBIRuleBuilder.IntPair states) {
+           int numStates = fDStates.size();
+           int numCols = fRB.fSetBuilder.getNumCharCategories();
+
+           for (; states.first<numStates-1; ++states.first) {
+               RBBIStateDescriptor firstSD = fDStates.get(states.first);
+               for (states.second=states.first+1; states.second<numStates; ++states.second) {
+                   RBBIStateDescriptor duplSD = fDStates.get(states.second);
+                   if (firstSD.fAccepting != duplSD.fAccepting ||
+                           firstSD.fLookAhead != duplSD.fLookAhead ||
+                           firstSD.fTagsIdx   != duplSD.fTagsIdx) {
+                       continue;
+                   }
+                   boolean rowsMatch = true;
+                   for (int col=0; col < numCols; ++col) {
+                       int firstVal = firstSD.fDtran[col];
+                       int duplVal = duplSD.fDtran[col];
+                       if (!((firstVal == duplVal) ||
+                               ((firstVal == states.first || firstVal == states.second) &&
+                                       (duplVal  == states.first || duplVal  == states.second)))) {
+                           rowsMatch = false;
+                           break;
+                       }
+                   }
+                   if (rowsMatch) {
+                       return true;
+                   }
+               }
+           }
+           return false;
+       }
  
-void removeState(int keepState, int duplState) {
-    U_ASSERT(keepState < duplState);
-    U_ASSERT(duplState < fDStates.size());
-
-    RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates.elementAt(duplState);
-    fDStates.removeElementAt(duplState);
-    delete duplSD;
-
-    int numStates = fDStates.size();
-    int numCols = fRB.fSetBuilder.getNumCharCategories();
-    for (int state=0; state<numStates; ++state) {
-        RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates.elementAt(state);
-        for (int col=0; col<numCols; col++) {
-            int existingVal = sd.fDtran.elementAti(col);
-            int newVal = existingVal;
-            if (existingVal == duplState) {
-                newVal = keepState;
-            } else if (existingVal > duplState) {
-                newVal = existingVal - 1;
-            }
-            sd.fDtran.setElementAt(newVal, col);
-        }
-        if (sd.fAccepting == duplState) {
-            sd.fAccepting = keepState;
-        } else if (sd.fAccepting > duplState) {
-            sd.fAccepting--;
-        }
-        if (sd.fLookAhead == duplState) {
-            sd.fLookAhead = keepState;
-        } else if (sd.fLookAhead > duplState) {
-            sd.fLookAhead--;
-        }
-    }
-}
+       /**
+        * Remove a duplicate state (row) from the state table. All references to the deleted state are
+        * redirected to "keepState", the first encountered of the duplicated pair of states.
+        * @param keepState The first of the duplicate pair of states, the one to be kept.
+        * @param duplState The second of the duplicate pair, the one to be removed.
+        * @internal
+        */
+       void removeState(int keepState, int duplState) {
+           assert(keepState < duplState);
+           assert(duplState < fDStates.size());
  
+           fDStates.remove(duplState);
  
-/*
- * RemoveDuplicateStates
- */
-void removeDuplicateStates() {
-    int firstState = 3;
-    int duplicateState = 0;
-    while (findDuplicateState(firstState, duplicateState)) {
-        // printf("Removing duplicate states (%d, %d)\n", firstState, duplicateState);
-        removeState(firstState, duplicateState);
-    }
-}
+           int numStates = fDStates.size();
+           int numCols = fRB.fSetBuilder.getNumCharCategories();
+           for (int state=0; state<numStates; ++state) {
+               RBBIStateDescriptor sd = fDStates.get(state);
+               for (int col=0; col<numCols; col++) {
+                   int existingVal = sd.fDtran[col];
+                   int newVal = existingVal;
+                   if (existingVal == duplState) {
+                       newVal = keepState;
+                   } else if (existingVal > duplState) {
+                       newVal = existingVal - 1;
+                   }
+                   sd.fDtran[col] = newVal;
+               }
+               if (sd.fAccepting == duplState) {
+                   sd.fAccepting = keepState;
+               } else if (sd.fAccepting > duplState) {
+                   sd.fAccepting--;
+               }
+               if (sd.fLookAhead == duplState) {
+                   sd.fLookAhead = keepState;
+               } else if (sd.fLookAhead > duplState) {
+                   sd.fLookAhead--;
+               }
+           }
+       }
+
+
+       /**
+        *  Check for, and remove duplicate states (table rows).
+        *  @internal
+        */
+       void removeDuplicateStates() {
+           IntPair dupls = new IntPair(3, 0);
+           while (findDuplicateState(dupls)) {
+               // System.out.printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
+               removeState(dupls.first, dupls.second);
+           }
+       }
  
  
         //-----------------------------------------------------------------------------
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java

index e0aff62172a55840e71ac9c45543ba1250e8fb83..e1dd1e3b2858db1a5c17eb0368159b60a2be686f 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java
@@ -590,10 +590,10 @@ public class RBBITest extends TestFmwk {
          // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
          for (int c1=1; c1<numCharClasses; c1++) {
              for (int c2 = c1+1; c2 < numCharClasses; c2++) {
-                // assertFalse(String.format("Duplicate columns (%d, %d)", c1, c2), columns.get(c1).equals(columns.get(c2)));
-                if (columns.get(c1).equals(columns.get(c2))) {
-                    System.out.printf("Duplicate columns (%d, %d)\n", c1, c2);
-                }
+                assertFalse(String.format("Duplicate columns (%d, %d)", c1, c2), columns.get(c1).equals(columns.get(c2)));
+                // if (columns.get(c1).equals(columns.get(c2))) {
+                //    System.out.printf("Duplicate columns (%d, %d)\n", c1, c2);
+                // }
              }
          }
  
@@ -615,10 +615,10 @@ public class RBBITest extends TestFmwk {
  
          for (int r1=0; r1 < dw.getStateTableNumStates(fwtbl); r1++) {
              for (int r2= r1+1; r2 < dw.getStateTableNumStates(fwtbl); r2++) {
-                // assertFalse(String.format("Duplicate states (%d, %d)", r1, r2), rows.get(r1).equals(rows.get(r2)));
-                if (rows.get(r1).equals(rows.get(r2))) {
-                    System.out.printf("Duplicate states (%d, %d)\n", r1, r2);
-                }
+                assertFalse(String.format("Duplicate states (%d, %d)", r1, r2), rows.get(r1).equals(rows.get(r2)));
+                // if (rows.get(r1).equals(rows.get(r2))) {
+                //     System.out.printf("Duplicate states (%d, %d)\n", r1, r2);
+                // }
              }
          }
      }
author	Andy Heninger <andy.heninger@gmail.com>
	Wed, 14 Feb 2018 23:44:50 +0000 (23:44 +0000)
committer	Andy Heninger <andy.heninger@gmail.com>
	Wed, 14 Feb 2018 23:44:50 +0000 (23:44 +0000)
icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java		patch | blob | history
icu4j/main/classes/core/src/com/ibm/icu/text/RBBISetBuilder.java		patch | blob | history
icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java		patch | blob | history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java		patch | blob | history