]> granicus.if.org Git - icu/commitdiff
ICU-13307 Java Edits.mergedAndAppend(ab, bc); map indexes only from inside spans...
authorMarkus Scherer <markus.icu@gmail.com>
Wed, 23 Aug 2017 20:42:30 +0000 (20:42 +0000)
committerMarkus Scherer <markus.icu@gmail.com>
Wed, 23 Aug 2017 20:42:30 +0000 (20:42 +0000)
X-SVN-Rev: 40348

icu4j/main/classes/core/src/com/ibm/icu/text/Edits.java
icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java

index 9f413cf9fd3393417cb73b894fd96a49e60a7905..5dc36c8711521be9597533c4711feb833c1838f7 100644 (file)
@@ -409,12 +409,7 @@ public final class Edits {
                 spanStart = destIndex;
                 spanLength = newLength_;
             }
-            // If we are at the start or limit of an empty span, then we search from
-            // the start of the string so that we always return
-            // the first of several consecutive empty spans, for consistent results.
-            // We do not currently track the properties of the previous span,
-            // so for now we always reset if we are at the start of the current span.
-            if (i <= spanStart) {
+            if (i < spanStart) {
                 // Reset the iterator to the start.
                 index = remaining = oldLength_ = newLength_ = srcIndex = replIndex = destIndex = 0;
             } else if (i < (spanStart + spanLength)) {
@@ -429,8 +424,8 @@ public final class Edits {
                     spanStart = destIndex;
                     spanLength = newLength_;
                 }
-                if (i == spanStart || i < (spanStart + spanLength)) {
-                    // The index is in the current span, or at an empty one.
+                if (i < (spanStart + spanLength)) {
+                    // The index is in the current span.
                     return 0;
                 }
                 if (remaining > 0) {
@@ -615,4 +610,167 @@ public final class Edits {
     public Iterator getFineIterator() {
         return new Iterator(array, length, false, false);
     }
+
+    /**
+     * Merges the two input Edits and appends the result to this object.
+     *
+     * <p>Consider two string transformations (for example, normalization and case mapping)
+     * where each records Edits in addition to writing an output string.<br>
+     * Edits ab reflect how substrings of input string a
+     * map to substrings of intermediate string b.<br>
+     * Edits bc reflect how substrings of intermediate string b
+     * map to substrings of output string c.<br>
+     * This function merges ab and bc such that the additional edits
+     * recorded in this object reflect how substrings of input string a
+     * map to substrings of output string c.
+     *
+     * <p>If unrelated Edits are passed in where the output string of the first
+     * has a different length than the input string of the second,
+     * then an IllegalArgumentException is thrown.
+     *
+     * @param ab reflects how substrings of input string a
+     *     map to substrings of intermediate string b.
+     * @param bc reflects how substrings of intermediate string b
+     *     map to substrings of output string c.
+     * @return this, with the merged edits appended
+     * @draft ICU 60
+     * @provisional This API might change or be removed in a future release.
+     */
+    public Edits mergeAndAppend(Edits ab, Edits bc) {
+        // Picture string a --(Edits ab)--> string b --(Edits bc)--> string c.
+        // Parallel iteration over both Edits.
+        Iterator abIter = ab.getFineIterator();
+        Iterator bcIter = bc.getFineIterator();
+        boolean abHasNext = true, bcHasNext = true;
+        // Copy iterator state into local variables, so that we can modify and subdivide spans.
+        // ab old & new length, bc old & new length
+        int aLength = 0, ab_bLength = 0, bc_bLength = 0, cLength = 0;
+        // When we have different-intermediate-length changes, we accumulate a larger change.
+        int pending_aLength = 0, pending_cLength = 0;
+        for (;;) {
+            // At this point, for each of the two iterators:
+            // Either we are done with the locally cached current edit,
+            // and its intermediate-string length has been reset,
+            // or we will continue to work with a truncated remainder of this edit.
+            //
+            // If the current edit is done, and the iterator has not yet reached the end,
+            // then we fetch the next edit. This is true for at least one of the iterators.
+            //
+            // Normally it does not matter whether we fetch from ab and then bc or vice versa.
+            // However, the result is observably different when
+            // ab deletions meet bc insertions at the same intermediate-string index.
+            // Some users expect the bc insertions to come first, so we fetch from bc first.
+            if (bc_bLength == 0) {
+                if (bcHasNext && (bcHasNext = bcIter.next())) {
+                    bc_bLength = bcIter.oldLength();
+                    cLength = bcIter.newLength();
+                    if (bc_bLength == 0) {
+                        // insertion
+                        if (ab_bLength == 0 || !abIter.hasChange()) {
+                            addReplace(pending_aLength, pending_cLength + cLength);
+                            pending_aLength = pending_cLength = 0;
+                        } else {
+                            pending_cLength += cLength;
+                        }
+                        continue;
+                    }
+                }
+                // else see if the other iterator is done, too.
+            }
+            if (ab_bLength == 0) {
+                if (abHasNext && (abHasNext = abIter.next())) {
+                    aLength = abIter.oldLength();
+                    ab_bLength = abIter.newLength();
+                    if (ab_bLength == 0) {
+                        // deletion
+                        if (bc_bLength == bcIter.oldLength() || !bcIter.hasChange()) {
+                            addReplace(pending_aLength + aLength, pending_cLength);
+                            pending_aLength = pending_cLength = 0;
+                        } else {
+                            pending_aLength += aLength;
+                        }
+                        continue;
+                    }
+                } else if (bc_bLength == 0) {
+                    // Both iterators are done at the same time:
+                    // The intermediate-string lengths match.
+                    break;
+                } else {
+                    throw new IllegalArgumentException(
+                            "The ab output string is shorter than the bc input string.");
+                }
+            }
+            if (bc_bLength == 0) {
+                throw new IllegalArgumentException(
+                        "The bc input string is shorter than the ab output string.");
+            }
+            //  Done fetching: ab_bLength > 0 && bc_bLength > 0
+
+            // The current state has two parts:
+            // - Past: We accumulate a longer ac edit in the "pending" variables.
+            // - Current: We have copies of the current ab/bc edits in local variables.
+            //   At least one side is newly fetched.
+            //   One side might be a truncated remainder of an edit we fetched earlier.
+
+            if (!abIter.hasChange() && !bcIter.hasChange()) {
+                // An unchanged span all the way from string a to string c.
+                if (pending_aLength != 0 || pending_cLength != 0) {
+                    addReplace(pending_aLength, pending_cLength);
+                    pending_aLength = pending_cLength = 0;
+                }
+                int unchangedLength = aLength <= cLength ? aLength : cLength;
+                addUnchanged(unchangedLength);
+                ab_bLength = aLength -= unchangedLength;
+                bc_bLength = cLength -= unchangedLength;
+                // At least one of the unchanged spans is now empty.
+                continue;
+            }
+            if (!abIter.hasChange() && bcIter.hasChange()) {
+                // Unchanged a->b but changed b->c.
+                if (ab_bLength >= bc_bLength) {
+                    // Split the longer unchanged span into change + remainder.
+                    addReplace(pending_aLength + bc_bLength, pending_cLength + cLength);
+                    pending_aLength = pending_cLength = 0;
+                    aLength = ab_bLength -= bc_bLength;
+                    bc_bLength = 0;
+                    continue;
+                }
+                // Handle the shorter unchanged span below like a change.
+            } else if (abIter.hasChange() && !bcIter.hasChange()) {
+                // Changed a->b and then unchanged b->c.
+                if (ab_bLength <= bc_bLength) {
+                    // Split the longer unchanged span into change + remainder.
+                    addReplace(pending_aLength + aLength, pending_cLength + ab_bLength);
+                    pending_aLength = pending_cLength = 0;
+                    cLength = bc_bLength -= ab_bLength;
+                    ab_bLength = 0;
+                    continue;
+                }
+                // Handle the shorter unchanged span below like a change.
+            } else {  // both abIter.hasChange() && bcIter.hasChange()
+                if (ab_bLength == bc_bLength) {
+                    // Changes on both sides up to the same position. Emit & reset.
+                    addReplace(pending_aLength + aLength, pending_cLength + cLength);
+                    pending_aLength = pending_cLength = 0;
+                    ab_bLength = bc_bLength = 0;
+                    continue;
+                }
+            }
+            // Accumulate the a->c change, reset the shorter side,
+            // keep a remainder of the longer one.
+            pending_aLength += aLength;
+            pending_cLength += cLength;
+            if (ab_bLength < bc_bLength) {
+                bc_bLength -= ab_bLength;
+                cLength = ab_bLength = 0;
+            } else {  // ab_bLength > bc_bLength
+                ab_bLength -= bc_bLength;
+                aLength = bc_bLength = 0;
+            }
+        }
+        if (pending_aLength != 0 || pending_cLength != 0) {
+            addReplace(pending_aLength, pending_cLength);
+        }
+        return this;
+    }
 }
index 1df35243816db58223accf5c70ba604c1eec7e93..a0c3dee246cd13a1ed44b3011f528a3e42aa3722 100644 (file)
@@ -13,6 +13,7 @@ package com.ibm.icu.dev.test.lang;
 
 import java.io.BufferedReader;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
 import java.util.Locale;
 
@@ -777,6 +778,88 @@ public final class UCharacterCaseTest extends TestFmwk
         }
     }
 
+    private static String printOneEdit(Edits.Iterator ei) {
+        if (ei.hasChange()) {
+            return "" + ei.oldLength() + "->" + ei.newLength();
+        } else {
+            return "" + ei.oldLength() + "=" + ei.newLength();
+        }
+    }
+
+    /**
+     * Maps indexes according to the expected edits.
+     * A destination index can occur multiple times when there are source deletions.
+     * Map according to the last occurrence, normally in a non-empty destination span.
+     * Simplest is to search from the back.
+     */
+    private static int srcIndexFromDest(
+            EditChange expected[], int srcLength, int destLength, int index) {
+        int srcIndex = srcLength;
+        int destIndex = destLength;
+        int i = expected.length;
+        while (index < destIndex && i > 0) {
+            --i;
+            int prevSrcIndex = srcIndex - expected[i].oldLength;
+            int prevDestIndex = destIndex - expected[i].newLength;
+            if (index == prevDestIndex) {
+                return prevSrcIndex;
+            } else if (index > prevDestIndex) {
+                if (expected[i].change) {
+                    // In a change span, map to its end.
+                    return srcIndex;
+                } else {
+                    // In an unchanged span, offset within it.
+                    return prevSrcIndex + (index - prevDestIndex);
+                }
+            }
+            srcIndex = prevSrcIndex;
+            destIndex = prevDestIndex;
+        }
+        // index is outside the string.
+        return srcIndex;
+    }
+
+    private static int destIndexFromSrc(
+            EditChange expected[], int srcLength, int destLength, int index) {
+        int srcIndex = srcLength;
+        int destIndex = destLength;
+        int i = expected.length;
+        while (index < srcIndex && i > 0) {
+            --i;
+            int prevSrcIndex = srcIndex - expected[i].oldLength;
+            int prevDestIndex = destIndex - expected[i].newLength;
+            if (index == prevSrcIndex) {
+                return prevDestIndex;
+            } else if (index > prevSrcIndex) {
+                if (expected[i].change) {
+                    // In a change span, map to its end.
+                    return destIndex;
+                } else {
+                    // In an unchanged span, offset within it.
+                    return prevDestIndex + (index - prevSrcIndex);
+                }
+            }
+            srcIndex = prevSrcIndex;
+            destIndex = prevDestIndex;
+        }
+        // index is outside the string.
+        return destIndex;
+    }
+
+    private void checkEqualEdits(String name, Edits e1, Edits e2) {
+        Edits.Iterator ei1 = e1.getFineIterator();
+        Edits.Iterator ei2 = e2.getFineIterator();
+        for (int i = 0;; ++i) {
+            boolean ei1HasNext = ei1.next();
+            boolean ei2HasNext = ei2.next();
+            assertEquals(name + " next()[" + i + "]", ei1HasNext, ei2HasNext);
+            assertEquals(name + " edit[" + i + "]", printOneEdit(ei1), printOneEdit(ei2));
+            if (!ei1HasNext || !ei2HasNext) {
+                break;
+            }
+        }
+    }
+
     private static void checkEditsIter(
             String name, Edits.Iterator ei1, Edits.Iterator ei2,  // two equal iterators
             EditChange[] expected, boolean withUnchanged) {
@@ -786,8 +869,6 @@ public final class UCharacterCaseTest extends TestFmwk
         int expSrcIndex = 0;
         int expDestIndex = 0;
         int expReplIndex = 0;
-        int expSrcIndexFromDest = 0;  // for sourceIndexFromDestinationIndex()
-        int expDestIndexFromSrc = 0;  // for destinationIndexFromSourceIndex()
         for (int expIndex = 0; expIndex < expected.length; ++expIndex) {
             EditChange expect = expected[expIndex];
             String msg = name + ' ' + expIndex;
@@ -801,7 +882,7 @@ public final class UCharacterCaseTest extends TestFmwk
                 assertEquals(msg, expReplIndex, ei1.replacementIndex());
             }
 
-            if (expect.oldLength > 0 && expDestIndex == expDestIndexFromSrc) {
+            if (expect.oldLength > 0) {
                 assertTrue(msg, ei2.findSourceIndex(expSrcIndex));
                 assertEquals(msg, expect.change, ei2.hasChange());
                 assertEquals(msg, expect.oldLength, ei2.oldLength());
@@ -817,7 +898,7 @@ public final class UCharacterCaseTest extends TestFmwk
                 }
             }
 
-            if (expect.newLength > 0 && expSrcIndex == expSrcIndexFromDest) {
+            if (expect.newLength > 0) {
                 assertTrue(msg, ei2.findDestinationIndex(expDestIndex));
                 assertEquals(msg, expect.change, ei2.hasChange());
                 assertEquals(msg, expect.oldLength, ei2.oldLength());
@@ -833,45 +914,11 @@ public final class UCharacterCaseTest extends TestFmwk
                 }
             }
 
-            // Span starts.
-            assertEquals(name, expDestIndexFromSrc,
-                    ei2.destinationIndexFromSourceIndex(expSrcIndex));
-            assertEquals(name, expSrcIndexFromDest,
-                    ei2.sourceIndexFromDestinationIndex(expDestIndex));
-
-            // Inside unchanged span map offsets 1:1.
-            if (!expect.change && expect.oldLength >= 2) {
-                assertEquals(name, expDestIndex + 1,
-                        ei2.destinationIndexFromSourceIndex(expSrcIndex + 1));
-                assertEquals(name, expSrcIndex + 1,
-                        ei2.sourceIndexFromDestinationIndex(expDestIndex + 1));
-            }
-
-            // Inside change span map to the span limit.
-            int expSrcLimit = expSrcIndex + expect.oldLength;
-            int expDestLimit = expDestIndex + expect.newLength;
-            if (expect.change) {
-                if (expect.oldLength >= 2) {
-                    assertEquals(name, expDestLimit,
-                            ei2.destinationIndexFromSourceIndex(expSrcIndex + 1));
-                }
-                if (expect.newLength >= 2) {
-                    assertEquals(name, expSrcLimit,
-                            ei2.sourceIndexFromDestinationIndex(expDestIndex + 1));
-                }
-            }
-
-            expSrcIndex = expSrcLimit;
-            expDestIndex = expDestLimit;
+            expSrcIndex += expect.oldLength;
+            expDestIndex += expect.newLength;
             if (expect.change) {
                 expReplIndex += expect.newLength;
             }
-            if (expect.newLength > 0) {
-                expSrcIndexFromDest = expSrcIndex;
-            }
-            if (expect.oldLength > 0) {
-                expDestIndexFromSrc = expDestIndex;
-            }
         }
         String msg = name + " end";
         assertFalse(msg, ei1.next());
@@ -884,8 +931,49 @@ public final class UCharacterCaseTest extends TestFmwk
 
         assertFalse(name, ei2.findSourceIndex(expSrcIndex));
         assertFalse(name, ei2.findDestinationIndex(expDestIndex));
-        assertEquals(name, expDestIndex, ei2.destinationIndexFromSourceIndex(expSrcIndex));
-        assertEquals(name, expSrcIndex, ei2.sourceIndexFromDestinationIndex(expDestIndex));
+
+        // Check mapping of all indexes against a simple implementation
+        // that works on the expected changes.
+        // Iterate once forward, once backward, to cover more runtime conditions.
+        int srcLength = expSrcIndex;
+        int destLength = expDestIndex;
+        List<Integer> srcIndexes = new ArrayList<Integer>();
+        List<Integer> destIndexes = new ArrayList<Integer>();
+        srcIndexes.add(-1);
+        destIndexes.add(-1);
+        int srcIndex = 0;
+        int destIndex = 0;
+        for (int i = 0; i < expected.length; ++i) {
+            if (expected[i].oldLength > 0) {
+                srcIndexes.add(srcIndex);
+                if (expected[i].oldLength > 1) {
+                    srcIndexes.add(srcIndex + 1);
+                }
+            }
+            if (expected[i].newLength > 0) {
+                destIndexes.add(destIndex);
+                if (expected[i].newLength > 0) {
+                    destIndexes.add(destIndex + 1);
+                }
+            }
+            srcIndex += expected[i].oldLength;
+            destIndex += expected[i].newLength;
+        }
+        srcIndexes.add(srcLength);
+        destIndexes.add(destLength);
+        srcIndexes.add(srcLength + 1);
+        destIndexes.add(destLength + 1);
+        Collections.reverse(destIndexes);
+        for (int i : srcIndexes) {
+            assertEquals(name + " destIndexFromSrc(" + i + "):",
+                              destIndexFromSrc(expected, srcLength, destLength, i),
+                              ei2.destinationIndexFromSourceIndex(i));
+        }
+        for (int i : destIndexes) {
+            assertEquals(name + " srcIndexFromDest(" + i + "):",
+                              srcIndexFromDest(expected, srcLength, destLength, i),
+                              ei2.sourceIndexFromDestinationIndex(i));
+        }
     }
 
     @Test
@@ -949,6 +1037,167 @@ public final class UCharacterCaseTest extends TestFmwk
         assertFalse("reset then iterator", ei.next());
     }
 
+    @Test
+    public void TestMergeEdits() {
+        Edits ab = new Edits(), bc = new Edits(), ac = new Edits(), expected_ac = new Edits();
+
+        // Simple: Two parallel non-changes.
+        ab.addUnchanged(2);
+        bc.addUnchanged(2);
+        expected_ac.addUnchanged(2);
+
+        // Simple: Two aligned changes.
+        ab.addReplace(3, 2);
+        bc.addReplace(2, 1);
+        expected_ac.addReplace(3, 1);
+
+        // Unequal non-changes.
+        ab.addUnchanged(5);
+        bc.addUnchanged(3);
+        expected_ac.addUnchanged(3);
+        // ab ahead by 2
+
+        // Overlapping changes accumulate until they share a boundary.
+        ab.addReplace(4, 3);
+        bc.addReplace(3, 2);
+        ab.addReplace(4, 3);
+        bc.addReplace(3, 2);
+        ab.addReplace(4, 3);
+        bc.addReplace(3, 2);
+        bc.addUnchanged(4);
+        expected_ac.addReplace(14, 8);
+        // bc ahead by 2
+
+        // Balance out intermediate-string lengths.
+        ab.addUnchanged(2);
+        expected_ac.addUnchanged(2);
+
+        // Insert something and delete it: Should disappear.
+        ab.addReplace(0, 5);
+        ab.addReplace(0, 2);
+        bc.addReplace(7, 0);
+
+        // Parallel change to make a new boundary.
+        ab.addReplace(1, 2);
+        bc.addReplace(2, 3);
+        expected_ac.addReplace(1, 3);
+
+        // Multiple ab deletions should remain separate at the boundary.
+        ab.addReplace(1, 0);
+        ab.addReplace(2, 0);
+        ab.addReplace(3, 0);
+        expected_ac.addReplace(1, 0);
+        expected_ac.addReplace(2, 0);
+        expected_ac.addReplace(3, 0);
+
+        // Unequal non-changes can be split for another boundary.
+        ab.addUnchanged(2);
+        bc.addUnchanged(1);
+        expected_ac.addUnchanged(1);
+        // ab ahead by 1
+
+        // Multiple bc insertions should create a boundary and remain separate.
+        bc.addReplace(0, 4);
+        bc.addReplace(0, 5);
+        bc.addReplace(0, 6);
+        expected_ac.addReplace(0, 4);
+        expected_ac.addReplace(0, 5);
+        expected_ac.addReplace(0, 6);
+        // ab ahead by 1
+
+        // Multiple ab deletions in the middle of a bc change are merged.
+        bc.addReplace(2, 2);
+        // bc ahead by 1
+        ab.addReplace(1, 0);
+        ab.addReplace(2, 0);
+        ab.addReplace(3, 0);
+        ab.addReplace(4, 1);
+        expected_ac.addReplace(11, 2);
+
+        // Multiple bc insertions in the middle of an ab change are merged.
+        ab.addReplace(5, 6);
+        bc.addReplace(3, 3);
+        // ab ahead by 3
+        bc.addReplace(0, 4);
+        bc.addReplace(0, 5);
+        bc.addReplace(0, 6);
+        bc.addReplace(3, 7);
+        expected_ac.addReplace(5, 25);
+
+        // Delete around a deletion.
+        ab.addReplace(4, 4);
+        ab.addReplace(3, 0);
+        ab.addUnchanged(2);
+        bc.addReplace(2, 2);
+        bc.addReplace(4, 0);
+        expected_ac.addReplace(9, 2);
+
+        // Insert into an insertion.
+        ab.addReplace(0, 2);
+        bc.addReplace(1, 1);
+        bc.addReplace(0, 8);
+        bc.addUnchanged(4);
+        expected_ac.addReplace(0, 10);
+        // bc ahead by 3
+
+        // Balance out intermediate-string lengths.
+        ab.addUnchanged(3);
+        expected_ac.addUnchanged(3);
+
+        // Deletions meet insertions.
+        // Output order is arbitrary in principle, but we expect insertions first
+        // and want to keep it that way.
+        ab.addReplace(2, 0);
+        ab.addReplace(4, 0);
+        ab.addReplace(6, 0);
+        bc.addReplace(0, 1);
+        bc.addReplace(0, 3);
+        bc.addReplace(0, 5);
+        expected_ac.addReplace(0, 1);
+        expected_ac.addReplace(0, 3);
+        expected_ac.addReplace(0, 5);
+        expected_ac.addReplace(2, 0);
+        expected_ac.addReplace(4, 0);
+        expected_ac.addReplace(6, 0);
+
+        // End with a non-change, so that further edits are never reordered.
+        ab.addUnchanged(1);
+        bc.addUnchanged(1);
+        expected_ac.addUnchanged(1);
+
+        ac.mergeAndAppend(ab, bc);
+        checkEqualEdits("ab+bc", expected_ac, ac);
+
+        // Append more Edits.
+        Edits ab2 = new Edits(), bc2 = new Edits();
+        ab2.addUnchanged(5);
+        bc2.addReplace(1, 2);
+        bc2.addUnchanged(4);
+        expected_ac.addReplace(1, 2);
+        expected_ac.addUnchanged(4);
+        ac.mergeAndAppend(ab2, bc2);
+        checkEqualEdits("ab2+bc2", expected_ac, ac);
+
+        // Append empty edits.
+        Edits empty = new Edits();
+        ac.mergeAndAppend(empty, empty);
+        checkEqualEdits("empty+empty", expected_ac, ac);
+
+        // Error: Append more edits with mismatched intermediate-string lengths.
+        Edits mismatch = new Edits();
+        mismatch.addReplace(1, 1);
+        try {
+            ac.mergeAndAppend(ab2, mismatch);
+            fail("ab2+mismatch did not yield IllegalArgumentException");
+        } catch (IllegalArgumentException expected) {
+        }
+        try {
+            ac.mergeAndAppend(mismatch, bc2);
+            fail("mismatch+bc2 did not yield IllegalArgumentException");
+        } catch (IllegalArgumentException expected) {
+        }
+    }
+
     @Test
     public void TestCaseMapWithEdits() {
         StringBuilder sb = new StringBuilder();