]> granicus.if.org Git - icu/commitdiff
ICU-13346 add Edits.Iterator.previous() for mapping near-earlier indexes; compress...
authorMarkus Scherer <markus.icu@gmail.com>
Mon, 18 Sep 2017 22:59:48 +0000 (22:59 +0000)
committerMarkus Scherer <markus.icu@gmail.com>
Mon, 18 Sep 2017 22:59:48 +0000 (22:59 +0000)
X-SVN-Rev: 40426

icu4j/main/classes/core/src/com/ibm/icu/text/Edits.java
icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java

index 5dc36c8711521be9597533c4711feb833c1838f7..aede16b94299a2856b7c61f16080b6a95a7237f7 100644 (file)
@@ -18,10 +18,10 @@ public final class Edits {
     private static final int MAX_UNCHANGED_LENGTH = 0x1000;
     private static final int MAX_UNCHANGED = MAX_UNCHANGED_LENGTH - 1;
 
-    // 0wwwcccccccccccc with w=1..6 records ccc+1 replacements of w:w text units.
-    // No length change.
-    private static final int MAX_SHORT_WIDTH = 6;
-    private static final int MAX_SHORT_CHANGE_LENGTH = 0xfff;
+    // 0mmmnnnccccccccc with m=1..6 records ccc+1 replacements of m:n text units.
+    private static final int MAX_SHORT_CHANGE_OLD_LENGTH = 6;
+    private static final int MAX_SHORT_CHANGE_NEW_LENGTH = 7;
+    private static final int SHORT_CHANGE_NUM_MASK = 0x1ff;
     private static final int MAX_SHORT_CHANGE = 0x6fff;
 
     // 0111mmmmmmnnnnnn records a replacement of m text units with n.
@@ -103,20 +103,6 @@ public final class Edits {
      * @provisional This API might change or be removed in a future release.
      */
     public void addReplace(int oldLength, int newLength) {
-        if(oldLength == newLength && 0 < oldLength && oldLength <= MAX_SHORT_WIDTH) {
-            // Replacement of short oldLength text units by same-length new text.
-            // Merge into previous short-replacement record, if any.
-            ++numChanges;
-            int last = lastUnit();
-            if(MAX_UNCHANGED < last && last < MAX_SHORT_CHANGE &&
-                    (last >> 12) == oldLength && (last & 0xfff) < MAX_SHORT_CHANGE_LENGTH) {
-                setLastUnit(last + 1);
-                return;
-            }
-            append(oldLength << 12);
-            return;
-        }
-
         if(oldLength < 0 || newLength < 0) {
             throw new IllegalArgumentException(
                     "addReplace(" + oldLength + ", " + newLength +
@@ -136,6 +122,21 @@ public final class Edits {
             delta += newDelta;
         }
 
+        if(0 < oldLength && oldLength <= MAX_SHORT_CHANGE_OLD_LENGTH &&
+                newLength <= MAX_SHORT_CHANGE_NEW_LENGTH) {
+            // Merge into previous same-lengths short-replacement record, if any.
+            int u = (oldLength << 12) | (newLength << 9);
+            int last = lastUnit();
+            if(MAX_UNCHANGED < last && last < MAX_SHORT_CHANGE &&
+                    (last & ~SHORT_CHANGE_NUM_MASK) == u &&
+                    (last & SHORT_CHANGE_NUM_MASK) < SHORT_CHANGE_NUM_MASK) {
+                setLastUnit(last + 1);
+                return;
+            }
+            append(u);
+            return;
+        }
+
         int head = 0x7000;
         if (oldLength < LENGTH_IN_1TRAIL && newLength < LENGTH_IN_1TRAIL) {
             head |= oldLength << 6;
@@ -225,9 +226,14 @@ public final class Edits {
         private final char[] array;
         private int index;
         private final int length;
+        /**
+         * 0 if we are not within compressed equal-length changes.
+         * Otherwise the number of remaining changes, including the current one.
+         */
         private int remaining;
         private final boolean onlyChanges_, coarse;
 
+        private int dir;  // iteration direction: back(<0), initial(0), forward(>0)
         private boolean changed;
         private int oldLength_, newLength_;
         private int srcIndex, replIndex, destIndex;
@@ -258,7 +264,7 @@ public final class Edits {
             }
         }
 
-        private void updateIndexes() {
+        private void updateNextIndexes() {
             srcIndex += oldLength_;
             if (changed) {
                 replIndex += newLength_;
@@ -266,8 +272,17 @@ public final class Edits {
             destIndex += newLength_;
         }
 
+        private void updatePreviousIndexes() {
+            srcIndex -= oldLength_;
+            if (changed) {
+                replIndex -= newLength_;
+            }
+            destIndex -= newLength_;
+        }
+
         private boolean noNext() {
-            // No change beyond the string.
+            // No change before or beyond the string.
+            dir = 0;
             changed = false;
             oldLength_ = newLength_ = 0;
             return false;
@@ -284,13 +299,32 @@ public final class Edits {
         }
 
         private boolean next(boolean onlyChanges) {
-            // We have an errorCode in case we need to start guarding against integer overflows.
-            // It is also convenient for caller loops if we bail out when an error was set elsewhere.
-            updateIndexes();
-            if (remaining > 0) {
-                // Fine-grained iterator: Continue a sequence of equal-length changes.
-                --remaining;
-                return true;
+            // Forward iteration: Update the string indexes to the limit of the current span,
+            // and post-increment-read array units to assemble a new span.
+            // Leaves the array index one after the last unit of that span.
+            if (dir > 0) {
+                updateNextIndexes();
+            } else {
+                if (dir < 0) {
+                    // Turn around from previous() to next().
+                    // Post-increment-read the same span again.
+                    if (remaining > 0) {
+                        // Fine-grained iterator:
+                        // Stay on the current one of a sequence of compressed changes.
+                        ++index;  // next() rests on the index after the sequence unit.
+                        dir = 1;
+                        return true;
+                    }
+                }
+                dir = 1;
+            }
+            if (remaining >= 1) {
+                // Fine-grained iterator: Continue a sequence of compressed changes.
+                if (remaining > 1) {
+                    --remaining;
+                    return true;
+                }
+                remaining = 0;
             }
             if (index >= length) {
                 return noNext();
@@ -306,7 +340,7 @@ public final class Edits {
                 }
                 newLength_ = oldLength_;
                 if (onlyChanges) {
-                    updateIndexes();
+                    updateNextIndexes();
                     if (index >= length) {
                         return noNext();
                     }
@@ -318,14 +352,19 @@ public final class Edits {
             }
             changed = true;
             if (u <= MAX_SHORT_CHANGE) {
+                int oldLen = u >> 12;
+                int newLen = (u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH;
+                int num = (u & SHORT_CHANGE_NUM_MASK) + 1;
                 if (coarse) {
-                    int w = u >> 12;
-                    int len = (u & 0xfff) + 1;
-                    oldLength_ = newLength_ = len * w;
+                    oldLength_ = num * oldLen;
+                    newLength_ = num * newLen;
                 } else {
-                    // Split a sequence of equal-length changes that was compressed into one unit.
-                    oldLength_ = newLength_ = u >> 12;
-                    remaining = u & 0xfff;
+                    // Split a sequence of changes that was compressed into one unit.
+                    oldLength_ = oldLen;
+                    newLength_ = newLen;
+                    if (num > 1) {
+                        remaining = num;  // This is the first of two or more changes.
+                    }
                     return true;
                 }
             } else {
@@ -340,19 +379,121 @@ public final class Edits {
             while (index < length && (u = array[index]) > MAX_UNCHANGED) {
                 ++index;
                 if (u <= MAX_SHORT_CHANGE) {
-                    int w = u >> 12;
-                    int len = (u & 0xfff) + 1;
-                    len = len * w;
-                    oldLength_ += len;
-                    newLength_ += len;
+                    int num = (u & SHORT_CHANGE_NUM_MASK) + 1;
+                    oldLength_ += (u >> 12) * num;
+                    newLength_ += ((u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH) * num;
                 } else {
                     assert(u <= 0x7fff);
-                    int oldLen = readLength((u >> 6) & 0x3f);
-                    int newLen = readLength(u & 0x3f);
-                    oldLength_ += oldLen;
-                    newLength_ += newLen;
+                    oldLength_ += readLength((u >> 6) & 0x3f);
+                    newLength_ += readLength(u & 0x3f);
+                }
+            }
+            return true;
+        }
+
+        private boolean previous() {
+            // Backward iteration: Pre-decrement-read array units to assemble a new span,
+            // then update the string indexes to the start of that span.
+            // Leaves the array index on the head unit of that span.
+            if (dir >= 0) {
+                if (dir > 0) {
+                    // Turn around from next() to previous().
+                    // Set the string indexes to the span limit and
+                    // pre-decrement-read the same span again.
+                    if (remaining > 0) {
+                        // Fine-grained iterator:
+                        // Stay on the current one of a sequence of compressed changes.
+                        --index;  // previous() rests on the sequence unit.
+                        dir = -1;
+                        return true;
+                    }
+                    updateNextIndexes();
                 }
+                dir = -1;
             }
+            if (remaining > 0) {
+                // Fine-grained iterator: Continue a sequence of compressed changes.
+                int u = array[index];
+                assert(MAX_UNCHANGED < u && u <= MAX_SHORT_CHANGE);
+                if (remaining <= (u & SHORT_CHANGE_NUM_MASK)) {
+                    ++remaining;
+                    updatePreviousIndexes();
+                    return true;
+                }
+                remaining = 0;
+            }
+            if (index <= 0) {
+                return noNext();
+            }
+            int u = array[--index];
+            if (u <= MAX_UNCHANGED) {
+                // Combine adjacent unchanged ranges.
+                changed = false;
+                oldLength_ = u + 1;
+                while (index > 0 && (u = array[index - 1]) <= MAX_UNCHANGED) {
+                    --index;
+                    oldLength_ += u + 1;
+                }
+                newLength_ = oldLength_;
+                // No need to handle onlyChanges as long as previous() is called only from findIndex().
+                updatePreviousIndexes();
+                return true;
+            }
+            changed = true;
+            if (u <= MAX_SHORT_CHANGE) {
+                int oldLen = u >> 12;
+                int newLen = (u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH;
+                int num = (u & SHORT_CHANGE_NUM_MASK) + 1;
+                if (coarse) {
+                    oldLength_ = num * oldLen;
+                    newLength_ = num * newLen;
+                } else {
+                    // Split a sequence of changes that was compressed into one unit.
+                    oldLength_ = oldLen;
+                    newLength_ = newLen;
+                    if (num > 1) {
+                        remaining = 1;  // This is the last of two or more changes.
+                    }
+                    updatePreviousIndexes();
+                    return true;
+                }
+            } else {
+                if (u <= 0x7fff) {
+                    // The change is encoded in u alone.
+                    oldLength_ = readLength((u >> 6) & 0x3f);
+                    newLength_ = readLength(u & 0x3f);
+                } else {
+                    // Back up to the head of the change, read the lengths,
+                    // and reset the index to the head again.
+                    assert(index > 0);
+                    while ((u = array[--index]) > 0x7fff) {}
+                    assert(u > MAX_SHORT_CHANGE);
+                    int headIndex = index++;
+                    oldLength_ = readLength((u >> 6) & 0x3f);
+                    newLength_ = readLength(u & 0x3f);
+                    index = headIndex;
+                }
+                if (!coarse) {
+                    updatePreviousIndexes();
+                    return true;
+                }
+            }
+            // Combine adjacent changes.
+            while (index > 0 && (u = array[index - 1]) > MAX_UNCHANGED) {
+                --index;
+                if (u <= MAX_SHORT_CHANGE) {
+                    int num = (u & SHORT_CHANGE_NUM_MASK) + 1;
+                    oldLength_ += (u >> 12) * num;
+                    newLength_ += ((u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH) * num;
+                } else if (u <= 0x7fff) {
+                    // Read the lengths, and reset the index to the head again.
+                    int headIndex = index++;
+                    oldLength_ += readLength((u >> 6) & 0x3f);
+                    newLength_ += readLength(u & 0x3f);
+                    index = headIndex;
+                }
+            }
+            updatePreviousIndexes();
             return true;
         }
 
@@ -410,7 +551,43 @@ public final class Edits {
                 spanLength = newLength_;
             }
             if (i < spanStart) {
+                if (i >= (spanStart / 2)) {
+                    // Search backwards.
+                    for (;;) {
+                        boolean hasPrevious = previous();
+                        assert(hasPrevious);  // because i>=0 and the first span starts at 0
+                        spanStart = findSource ? srcIndex : destIndex;
+                        if (i >= spanStart) {
+                            // The index is in the current span.
+                            return 0;
+                        }
+                        if (remaining > 0) {
+                            // Is the index in one of the remaining compressed edits?
+                            // spanStart is the start of the current span, first of the remaining ones.
+                            spanLength = findSource ? oldLength_ : newLength_;
+                            int u = array[index];
+                            assert(MAX_UNCHANGED < u && u <= MAX_SHORT_CHANGE);
+                            int num = (u & SHORT_CHANGE_NUM_MASK) + 1 - remaining;
+                            int len = num * spanLength;
+                            if (i >= (spanStart - len)) {
+                                int n = ((spanStart - i - 1) / spanLength) + 1;
+                                // 1 <= n <= num
+                                srcIndex -= n * oldLength_;
+                                replIndex -= n * newLength_;
+                                destIndex -= n * newLength_;
+                                remaining += n;
+                                return 0;
+                            }
+                            // Skip all of these edits at once.
+                            srcIndex -= num * oldLength_;
+                            replIndex -= num * newLength_;
+                            destIndex -= num * newLength_;
+                            remaining = 0;
+                        }
+                    }
+                }
                 // Reset the iterator to the start.
+                dir = 0;
                 index = remaining = oldLength_ = newLength_ = srcIndex = replIndex = destIndex = 0;
             } else if (i < (spanStart + spanLength)) {
                 // The index is in the current span.
@@ -428,21 +605,21 @@ public final class Edits {
                     // The index is in the current span.
                     return 0;
                 }
-                if (remaining > 0) {
+                if (remaining > 1) {
                     // Is the index in one of the remaining compressed edits?
-                    // spanStart is the start of the current span, before the remaining ones.
-                    int len = (remaining + 1) * spanLength;
+                    // spanStart is the start of the current span, first of the remaining ones.
+                    int len = remaining * spanLength;
                     if (i < (spanStart + len)) {
-                        int n = (i - spanStart) / spanLength;  // 1 <= n <= remaining
-                        len = n * spanLength;
-                        srcIndex += len;
-                        replIndex += len;
-                        destIndex += len;
+                        int n = (i - spanStart) / spanLength;  // 1 <= n <= remaining - 1
+                        srcIndex += n * oldLength_;
+                        replIndex += n * newLength_;
+                        destIndex += n * newLength_;
                         remaining -= n;
                         return 0;
                     }
                     // Make next() skip all of these edits at once.
-                    oldLength_ = newLength_ = len;
+                    oldLength_ *= remaining;
+                    newLength_ *= remaining;
                     remaining = 0;
                 }
             }
index c150f3772e59309d1ad7ac1fc480e0e91e2c93d4..09e4e2ef6c3178986e04bb3e061275821ee03b0e 100644 (file)
@@ -951,12 +951,18 @@ public final class UCharacterCaseTest extends TestFmwk
                 srcIndexes.add(srcIndex);
                 if (expected[i].oldLength > 1) {
                     srcIndexes.add(srcIndex + 1);
+                    if (expected[i].oldLength > 2) {
+                        srcIndexes.add(srcIndex + expected[i].oldLength - 1);
+                    }
                 }
             }
             if (expected[i].newLength > 0) {
                 destIndexes.add(destIndex);
-                if (expected[i].newLength > 0) {
+                if (expected[i].newLength > 1) {
                     destIndexes.add(destIndex + 1);
+                    if (expected[i].newLength > 2) {
+                        destIndexes.add(destIndex + expected[i].newLength - 1);
+                    }
                 }
             }
             srcIndex += expected[i].oldLength;
@@ -967,18 +973,31 @@ public final class UCharacterCaseTest extends TestFmwk
         srcIndexes.add(srcLength + 1);
         destIndexes.add(destLength + 1);
         Collections.reverse(destIndexes);
-        for (int i : srcIndexes) {
-            assertEquals(name + " destIndexFromSrc(" + i + "):",
-                              destIndexFromSrc(expected, srcLength, destLength, i),
-                              ei2.destinationIndexFromSourceIndex(i));
+        // Zig-zag across the indexes to stress next() <-> previous().
+        for (int i = 0; i < srcIndexes.size(); ++i) {
+            for (int j : ZIG_ZAG) {
+                if ((i + j) < srcIndexes.size()) {
+                    int si = srcIndexes.get(i + j);
+                    assertEquals(name + " destIndexFromSrc(" + si + "):",
+                            destIndexFromSrc(expected, srcLength, destLength, si),
+                            ei2.destinationIndexFromSourceIndex(si));
+                }
+            }
         }
-        for (int i : destIndexes) {
-            assertEquals(name + " srcIndexFromDest(" + i + "):",
-                              srcIndexFromDest(expected, srcLength, destLength, i),
-                              ei2.sourceIndexFromDestinationIndex(i));
+        for (int i = 0; i < destIndexes.size(); ++i) {
+            for (int j : ZIG_ZAG) {
+                if ((i + j) < destIndexes.size()) {
+                    int di = destIndexes.get(i + j);
+                    assertEquals(name + " srcIndexFromDest(" + di + "):",
+                            srcIndexFromDest(expected, srcLength, destLength, di),
+                            ei2.sourceIndexFromDestinationIndex(di));
+                }
+            }
         }
     }
 
+    private static final int[] ZIG_ZAG = { 0, 1, 2, 3, 2, 1 };
+
     @Test
     public void TestEdits() {
         Edits edits = new Edits();
@@ -992,21 +1011,21 @@ public final class UCharacterCaseTest extends TestFmwk
         assertFalse("unchanged 10003 hasChanges", edits.hasChanges());
         assertEquals("unchanged 10003 numberOfChanges", 0, edits.numberOfChanges());
         assertEquals("unchanged 10003", 0, edits.lengthDelta());
-        edits.addReplace(1, 1);  // multiple short equal-length edits are compressed
+        edits.addReplace(2, 1);  // multiple short equal-lengths edits are compressed
         edits.addUnchanged(0);
-        edits.addReplace(1, 1);
-        edits.addReplace(1, 1);
+        edits.addReplace(2, 1);
+        edits.addReplace(2, 1);
         edits.addReplace(0, 10);
         edits.addReplace(100, 0);
         edits.addReplace(3000, 4000);  // variable-length encoding
         edits.addReplace(100000, 100000);
         assertTrue("some edits hasChanges", edits.hasChanges());
         assertEquals("some edits numberOfChanges", 7, edits.numberOfChanges());
-        assertEquals("some edits", 10 - 100 + 1000, edits.lengthDelta());
+        assertEquals("some edits", -3 + 10 - 100 + 1000, edits.lengthDelta());
 
         EditChange[] coarseExpectedChanges = new EditChange[] {
                 new EditChange(false, 10003, 10003),
-                new EditChange(true, 103103, 104013)
+                new EditChange(true, 103106, 104013)
         };
         checkEditsIter("coarse",
                 edits.getCoarseIterator(), edits.getCoarseIterator(),
@@ -1017,9 +1036,9 @@ public final class UCharacterCaseTest extends TestFmwk
 
         EditChange[] fineExpectedChanges = new EditChange[] {
                 new EditChange(false, 10003, 10003),
-                new EditChange(true, 1, 1),
-                new EditChange(true, 1, 1),
-                new EditChange(true, 1, 1),
+                new EditChange(true, 2, 1),
+                new EditChange(true, 2, 1),
+                new EditChange(true, 2, 1),
                 new EditChange(true, 0, 10),
                 new EditChange(true, 100, 0),
                 new EditChange(true, 3000, 4000),
@@ -1040,6 +1059,27 @@ public final class UCharacterCaseTest extends TestFmwk
         assertFalse("reset then iterator", ei.next());
     }
 
+    @Test
+    public void TestEditsFindFwdBwd() {
+        // Some users need index mappings to be efficient when they are out of order.
+        // The most interesting failure case for this test is it taking a very long time.
+        Edits e = new Edits();
+        int N = 200000;
+        for (int i = 0; i < N; ++i) {
+            e.addUnchanged(1);
+            e.addReplace(3, 1);
+        }
+        Edits.Iterator iter = e.getFineIterator();
+        for (int i = 0; i <= N; i += 2) {
+            assertEquals("ascending", i * 2, iter.sourceIndexFromDestinationIndex(i));
+            assertEquals("ascending", i * 2 + 1, iter.sourceIndexFromDestinationIndex(i + 1));
+        }
+        for (int i = N; i >= 0; i -= 2) {
+            assertEquals("descending", i * 2 + 1, iter.sourceIndexFromDestinationIndex(i + 1));
+            assertEquals("descending", i * 2, iter.sourceIndexFromDestinationIndex(i));
+        }
+    }
+
     @Test
     public void TestMergeEdits() {
         Edits ab = new Edits(), bc = new Edits(), ac = new Edits(), expected_ac = new Edits();