From: Markus Scherer Date: Fri, 8 Sep 2017 18:49:08 +0000 (+0000) Subject: ICU-13346 compress some repeated m:n replacements even when m!=n X-Git-Tag: release-60-rc~160 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=a1e9154f8c0dff9970ab8619a46fc49cfde9d7bb;p=icu ICU-13346 compress some repeated m:n replacements even when m!=n X-SVN-Rev: 40376 --- diff --git a/icu4c/source/common/edits.cpp b/icu4c/source/common/edits.cpp index c94fa57b4b5..9ec005624fe 100644 --- a/icu4c/source/common/edits.cpp +++ b/icu4c/source/common/edits.cpp @@ -17,10 +17,10 @@ namespace { const int32_t MAX_UNCHANGED_LENGTH = 0x1000; const int32_t MAX_UNCHANGED = MAX_UNCHANGED_LENGTH - 1; -// 0wwwcccccccccccc with w=1..6 records ccc+1 replacements of w:w text units. -// No length change. -const int32_t MAX_SHORT_WIDTH = 6; -const int32_t MAX_SHORT_CHANGE_LENGTH = 0xfff; +// 0mmmnnnccccccccc with m=1..6 records ccc+1 replacements of m:n text units. +const int32_t MAX_SHORT_CHANGE_OLD_LENGTH = 6; +const int32_t MAX_SHORT_CHANGE_NEW_LENGTH = 7; +const int32_t SHORT_CHANGE_NUM_MASK = 0x1ff; const int32_t MAX_SHORT_CHANGE = 0x6fff; // 0111mmmmmmnnnnnn records a replacement of m text units with n. @@ -138,20 +138,6 @@ void Edits::addUnchanged(int32_t unchangedLength) { void Edits::addReplace(int32_t oldLength, int32_t newLength) { if(U_FAILURE(errorCode_)) { return; } - if(oldLength == newLength && 0 < oldLength && oldLength <= MAX_SHORT_WIDTH) { - // Replacement of short oldLength text units by same-length new text. - // Merge into previous short-replacement record, if any. - ++numChanges; - int32_t last = lastUnit(); - if(MAX_UNCHANGED < last && last < MAX_SHORT_CHANGE && - (last >> 12) == oldLength && (last & 0xfff) < MAX_SHORT_CHANGE_LENGTH) { - setLastUnit(last + 1); - return; - } - append(oldLength << 12); - return; - } - if(oldLength < 0 || newLength < 0) { errorCode_ = U_ILLEGAL_ARGUMENT_ERROR; return; @@ -171,6 +157,21 @@ void Edits::addReplace(int32_t oldLength, int32_t newLength) { delta += newDelta; } + if(0 < oldLength && oldLength <= MAX_SHORT_CHANGE_OLD_LENGTH && + newLength <= MAX_SHORT_CHANGE_NEW_LENGTH) { + // Merge into previous same-lengths short-replacement record, if any. + int32_t u = (oldLength << 12) | (newLength << 9); + int32_t last = lastUnit(); + if(MAX_UNCHANGED < last && last < MAX_SHORT_CHANGE && + (last & ~SHORT_CHANGE_NUM_MASK) == u && + (last & SHORT_CHANGE_NUM_MASK) < SHORT_CHANGE_NUM_MASK) { + setLastUnit(last + 1); + return; + } + append(u); + return; + } + int32_t head = 0x7000; if (oldLength < LENGTH_IN_1TRAIL && newLength < LENGTH_IN_1TRAIL) { head |= oldLength << 6; @@ -457,7 +458,7 @@ UBool Edits::Iterator::next(UBool onlyChanges, UErrorCode &errorCode) { // Post-increment-read the same span again. if (remaining > 0) { // Fine-grained iterator: - // Stay on the current one of a sequence of equal-length changes. + // Stay on the current one of a sequence of compressed changes. ++index; // next() rests on the index after the sequence unit. dir = 1; return TRUE; @@ -466,7 +467,7 @@ UBool Edits::Iterator::next(UBool onlyChanges, UErrorCode &errorCode) { dir = 1; } if (remaining >= 1) { - // Fine-grained iterator: Continue a sequence of equal-length changes. + // Fine-grained iterator: Continue a sequence of compressed changes. if (remaining > 1) { --remaining; return TRUE; @@ -499,16 +500,18 @@ UBool Edits::Iterator::next(UBool onlyChanges, UErrorCode &errorCode) { } changed = TRUE; if (u <= MAX_SHORT_CHANGE) { + int32_t oldLen = u >> 12; + int32_t newLen = (u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH; + int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1; if (coarse) { - int32_t w = u >> 12; - int32_t len = (u & 0xfff) + 1; - oldLength_ = newLength_ = len * w; + oldLength_ = num * oldLen; + newLength_ = num * newLen; } else { - // Split a sequence of equal-length changes that was compressed into one unit. - oldLength_ = newLength_ = u >> 12; - u &= 0xfff; - if (u > 0) { - remaining = u + 1; // This is the first of two or more changes. + // Split a sequence of changes that was compressed into one unit. + oldLength_ = oldLen; + newLength_ = newLen; + if (num > 1) { + remaining = num; // This is the first of two or more changes. } return TRUE; } @@ -524,11 +527,9 @@ UBool Edits::Iterator::next(UBool onlyChanges, UErrorCode &errorCode) { while (index < length && (u = array[index]) > MAX_UNCHANGED) { ++index; if (u <= MAX_SHORT_CHANGE) { - int32_t w = u >> 12; - int32_t len = (u & 0xfff) + 1; - len = len * w; - oldLength_ += len; - newLength_ += len; + int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1; + oldLength_ += (u >> 12) * num; + newLength_ += ((u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH) * num; } else { U_ASSERT(u <= 0x7fff); oldLength_ += readLength((u >> 6) & 0x3f); @@ -552,7 +553,7 @@ UBool Edits::Iterator::previous(UErrorCode &errorCode) { // pre-decrement-read the same span again. if (remaining > 0) { // Fine-grained iterator: - // Stay on the current one of a sequence of equal-length changes. + // Stay on the current one of a sequence of compressed changes. --index; // previous() rests on the sequence unit. dir = -1; return TRUE; @@ -562,10 +563,10 @@ UBool Edits::Iterator::previous(UErrorCode &errorCode) { dir = -1; } if (remaining > 0) { - // Fine-grained iterator: Continue a sequence of equal-length changes. + // Fine-grained iterator: Continue a sequence of compressed changes. int32_t u = array[index]; U_ASSERT(MAX_UNCHANGED < u && u <= MAX_SHORT_CHANGE); - if (remaining <= (u & 0xfff)) { + if (remaining <= (u & SHORT_CHANGE_NUM_MASK)) { ++remaining; updatePreviousIndexes(); return TRUE; @@ -591,15 +592,17 @@ UBool Edits::Iterator::previous(UErrorCode &errorCode) { } changed = TRUE; if (u <= MAX_SHORT_CHANGE) { + int32_t oldLen = u >> 12; + int32_t newLen = (u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH; + int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1; if (coarse) { - int32_t w = u >> 12; - int32_t len = (u & 0xfff) + 1; - oldLength_ = newLength_ = len * w; + oldLength_ = num * oldLen; + newLength_ = num * newLen; } else { - // Split a sequence of equal-length changes that was compressed into one unit. - oldLength_ = newLength_ = u >> 12; - u &= 0xfff; - if (u > 0) { + // Split a sequence of changes that was compressed into one unit. + oldLength_ = oldLen; + newLength_ = newLen; + if (num > 1) { remaining = 1; // This is the last of two or more changes. } updatePreviousIndexes(); @@ -630,11 +633,9 @@ UBool Edits::Iterator::previous(UErrorCode &errorCode) { while (index > 0 && (u = array[index - 1]) > MAX_UNCHANGED) { --index; if (u <= MAX_SHORT_CHANGE) { - int32_t w = u >> 12; - int32_t len = (u & 0xfff) + 1; - len = len * w; - oldLength_ += len; - newLength_ += len; + int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1; + oldLength_ += (u >> 12) * num; + newLength_ += ((u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH) * num; } else if (u <= 0x7fff) { // Read the lengths, and reset the index to the head again. int32_t headIndex = index++; @@ -672,24 +673,24 @@ int32_t Edits::Iterator::findIndex(int32_t i, UBool findSource, UErrorCode &erro if (remaining > 0) { // Is the index in one of the remaining compressed edits? // spanStart is the start of the current span, first of the remaining ones. + spanLength = findSource ? oldLength_ : newLength_; int32_t u = array[index]; U_ASSERT(MAX_UNCHANGED < u && u <= MAX_SHORT_CHANGE); - int32_t total = (u & 0xfff) + 1; - int32_t len = (total - remaining) * oldLength_; + int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1 - remaining; + int32_t len = num * spanLength; if (i >= (spanStart - len)) { - int32_t n = ((spanStart - i - 1) / oldLength_) + 1; - // 1 <= n <= (total - remaining) - len = n * oldLength_; - srcIndex -= len; - replIndex -= len; - destIndex -= len; + int32_t n = ((spanStart - i - 1) / spanLength) + 1; + // 1 <= n <= num + srcIndex -= n * oldLength_; + replIndex -= n * newLength_; + destIndex -= n * newLength_; remaining += n; return 0; } // Skip all of these edits at once. - srcIndex -= len; - replIndex -= len; - destIndex -= len; + srcIndex -= num * oldLength_; + replIndex -= num * newLength_; + destIndex -= num * newLength_; remaining = 0; } } @@ -719,15 +720,15 @@ int32_t Edits::Iterator::findIndex(int32_t i, UBool findSource, UErrorCode &erro int32_t len = remaining * spanLength; if (i < (spanStart + len)) { int32_t n = (i - spanStart) / spanLength; // 1 <= n <= remaining - 1 - len = n * spanLength; - srcIndex += len; - replIndex += len; - destIndex += len; + srcIndex += n * oldLength_; + replIndex += n * newLength_; + destIndex += n * newLength_; remaining -= n; return 0; } // Make next() skip all of these edits at once. - oldLength_ = newLength_ = len; + oldLength_ *= remaining; + newLength_ *= remaining; remaining = 0; } } diff --git a/icu4c/source/test/intltest/strcase.cpp b/icu4c/source/test/intltest/strcase.cpp index 9905a63c812..a963f517a26 100644 --- a/icu4c/source/test/intltest/strcase.cpp +++ b/icu4c/source/test/intltest/strcase.cpp @@ -923,23 +923,23 @@ void StringCaseTest::TestEdits() { assertFalse("unchanged 10003 hasChanges", edits.hasChanges()); assertEquals("unchanged 10003 numberOfChanges", 0, edits.numberOfChanges()); assertEquals("unchanged 10003", 0, edits.lengthDelta()); - edits.addReplace(1, 1); // multiple short equal-length edits are compressed + edits.addReplace(2, 1); // multiple short equal-lengths edits are compressed edits.addUnchanged(0); - edits.addReplace(1, 1); - edits.addReplace(1, 1); + edits.addReplace(2, 1); + edits.addReplace(2, 1); edits.addReplace(0, 10); edits.addReplace(100, 0); edits.addReplace(3000, 4000); // variable-length encoding edits.addReplace(100000, 100000); assertTrue("some edits hasChanges", edits.hasChanges()); assertEquals("some edits numberOfChanges", 7, edits.numberOfChanges()); - assertEquals("some edits", 10 - 100 + 1000, edits.lengthDelta()); + assertEquals("some edits", -3 + 10 - 100 + 1000, edits.lengthDelta()); UErrorCode outErrorCode = U_ZERO_ERROR; assertFalse("edits done: copyErrorTo", edits.copyErrorTo(outErrorCode)); static const EditChange coarseExpectedChanges[] = { { FALSE, 10003, 10003 }, - { TRUE, 103103, 104013 } + { TRUE, 103106, 104013 } }; TestUtility::checkEditsIter(*this, u"coarse", edits.getCoarseIterator(), edits.getCoarseIterator(), @@ -950,9 +950,9 @@ void StringCaseTest::TestEdits() { static const EditChange fineExpectedChanges[] = { { FALSE, 10003, 10003 }, - { TRUE, 1, 1 }, - { TRUE, 1, 1 }, - { TRUE, 1, 1 }, + { TRUE, 2, 1 }, + { TRUE, 2, 1 }, + { TRUE, 2, 1 }, { TRUE, 0, 10 }, { TRUE, 100, 0 }, { TRUE, 3000, 4000 }, @@ -1032,14 +1032,17 @@ void StringCaseTest::TestEditsFindFwdBwd() { Edits e; constexpr int32_t N = 200000; for (int32_t i = 0; i < N; ++i) { + e.addUnchanged(1); e.addReplace(3, 1); } Edits::Iterator iter = e.getFineIterator(); - for (int32_t i = 0; i <= N; ++i) { - assertEquals("ascending", i * 3, iter.sourceIndexFromDestinationIndex(i, errorCode)); + for (int32_t i = 0; i <= N; i += 2) { + assertEquals("ascending", i * 2, iter.sourceIndexFromDestinationIndex(i, errorCode)); + assertEquals("ascending", i * 2 + 1, iter.sourceIndexFromDestinationIndex(i + 1, errorCode)); } - for (int32_t i = N; i >= 0; --i) { - assertEquals("descending", i * 3, iter.sourceIndexFromDestinationIndex(i, errorCode)); + for (int32_t i = N; i >= 0; i -= 2) { + assertEquals("descending", i * 2 + 1, iter.sourceIndexFromDestinationIndex(i + 1, errorCode)); + assertEquals("descending", i * 2, iter.sourceIndexFromDestinationIndex(i, errorCode)); } }