From: Markus Scherer Date: Tue, 28 Aug 2018 19:53:34 +0000 (-0700) Subject: ICU-13530 fix bugs, add tests, clarify docs (#83) X-Git-Tag: release-63-rc~98 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=000c266045acae6eb99dc0ba76ffa589929531ea;p=icu ICU-13530 fix bugs, add tests, clarify docs (#83) * ICU-13530 test & fix cptrie.getRange() when small null data block matches the start of a non-null fast data block * ICU-13530 fix test bug * ICU-13530 test & fix bug calculating worst-case data array capacity at start of compaction * ICU-13530 docs: clarify buildImmutable() discards upper bits, trie then returns unsigned ints; range iteration slower than if ranges were stored directly * ICU-13530 accept feedback on docs --- diff --git a/icu4c/source/common/ucptrie_impl.h b/icu4c/source/common/ucptrie_impl.h index 00481a3a8a8..8202628afaf 100644 --- a/icu4c/source/common/ucptrie_impl.h +++ b/icu4c/source/common/ucptrie_impl.h @@ -274,6 +274,11 @@ U_CFUNC void umutablecptrie_setName(UMutableCPTrie *builder, const char *name); * The header.dataNullOffset (split across two header fields, high bits in header.options) * is the offset of a widely shared data block filled with one single value. * It helps quickly skip over large ranges of data with that value. + * The builder must ensure that if the start of any data block (fast or small) + * matches the dataNullOffset, then the whole block must be filled with the null value. + * Special care must be taken if there is no fast null data block + * but a small one, which is shorter, and it matches the *start* of some fast data block. + * * Similarly, the header.index3NullOffset is the index-array offset of an index-3 block * where all index entries point to the dataNullOffset. * If there is no such data or index-3 block, then these offsets are set to diff --git a/icu4c/source/common/umutablecptrie.cpp b/icu4c/source/common/umutablecptrie.cpp index b1bdb0aabae..f23b5e19261 100644 --- a/icu4c/source/common/umutablecptrie.cpp +++ b/icu4c/source/common/umutablecptrie.cpp @@ -91,7 +91,7 @@ private: void maskValues(uint32_t mask); UChar32 findHighStart() const; int32_t compactWholeDataBlocks(int32_t fastILimit, AllSameBlocks &allSameBlocks); - int32_t compactData(int32_t fastILimit, uint32_t *newData); + int32_t compactData(int32_t fastILimit, uint32_t *newData, int32_t dataNullIndex); int32_t compactIndex(int32_t fastILimit, UErrorCode &errorCode); int32_t compactTrie(int32_t fastILimit, UErrorCode &errorCode); @@ -599,12 +599,12 @@ int32_t findSameBlock(const uint16_t *p, int32_t pStart, int32_t length, return -1; } -int32_t findAllSameBlock(const uint32_t *p, int32_t length, uint32_t value, - int32_t blockLength) { - // Ensure that we do not even partially get past length. - length -= blockLength; +int32_t findAllSameBlock(const uint32_t *p, int32_t start, int32_t limit, + uint32_t value, int32_t blockLength) { + // Ensure that we do not even partially get past limit. + limit -= blockLength; - for (int32_t block = 0; block <= length; ++block) { + for (int32_t block = start; block <= limit; ++block) { if (p[block] == value) { for (int32_t i = 1;; ++i) { if (i == blockLength) { @@ -665,6 +665,15 @@ int32_t getAllSameOverlap(const uint32_t *p, int32_t length, uint32_t value, return length - i; } +bool isStartOfSomeFastBlock(uint32_t dataOffset, const uint32_t index[], int32_t fastILimit) { + for (int32_t i = 0; i < fastILimit; i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK) { + if (index[i] == dataOffset) { + return true; + } + } + return false; +} + /** * Finds the start of the last range in the trie by enumerating backward. * Indexes for code points higher than this will be omitted. @@ -775,6 +784,9 @@ int32_t MutableCodePointTrie::compactWholeDataBlocks(int32_t fastILimit, AllSame // ASCII data will be stored as a linear table, even if the following code // does not yet count it that way. int32_t newDataCapacity = ASCII_LIMIT; + // Add room for a small data null block in case it would match the start of + // a fast data block where dataNullOffset must not be set in that case. + newDataCapacity += UCPTRIE_SMALL_DATA_BLOCK_LENGTH; // Add room for special values (errorValue, highValue) and padding. newDataCapacity += 4; int32_t iLimit = highStart >> UCPTRIE_SHIFT_3; @@ -816,6 +828,7 @@ int32_t MutableCodePointTrie::compactWholeDataBlocks(int32_t fastILimit, AllSame if (getDataBlock(i) < 0) { return -1; } + newDataCapacity += blockLength; continue; } } @@ -918,7 +931,8 @@ void printBlock(const uint32_t *block, int32_t blockLength, uint32_t value, * * It does not try to find an optimal order of writing, deduplicating, and overlapping blocks. */ -int32_t MutableCodePointTrie::compactData(int32_t fastILimit, uint32_t *newData) { +int32_t MutableCodePointTrie::compactData(int32_t fastILimit, + uint32_t *newData, int32_t dataNullIndex) { #ifdef UCPTRIE_DEBUG int32_t countSame=0, sumOverlaps=0; bool printData = dataLength == 29088 /* line.brk */ || @@ -941,14 +955,31 @@ int32_t MutableCodePointTrie::compactData(int32_t fastILimit, uint32_t *newData) int32_t iLimit = highStart >> UCPTRIE_SHIFT_3; int32_t blockLength = UCPTRIE_FAST_DATA_BLOCK_LENGTH; int32_t inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK; + int32_t fastLength = 0; for (int32_t i = ASCII_I_LIMIT; i < iLimit; i += inc) { if (i == fastILimit) { blockLength = UCPTRIE_SMALL_DATA_BLOCK_LENGTH; inc = 1; + fastLength = newDataLength; } if (flags[i] == ALL_SAME) { uint32_t value = index[i]; - int32_t n = findAllSameBlock(newData, newDataLength, value, blockLength); + int32_t n; + // Find an earlier part of the data array of length blockLength + // that is filled with this value. + // If we find a match, and the current block is the data null block, + // and it is not a fast block but matches the start of a fast block, + // then we need to continue looking. + // This is because this small block is shorter than the fast block, + // and not all of the rest of the fast block is filled with this value. + // Otherwise trie.getRange() would detect that the fast block starts at + // dataNullOffset and assume incorrectly that it is filled with the null value. + for (int32_t start = 0; + (n = findAllSameBlock(newData, start, newDataLength, + value, blockLength)) >= 0 && + i == dataNullIndex && i >= fastILimit && n < fastLength && + isStartOfSomeFastBlock(n, index, fastILimit); + start = n + 1) {} if (n >= 0) { DEBUG_DO(++countSame); index[i] = n; @@ -1306,7 +1337,8 @@ int32_t MutableCodePointTrie::compactTrie(int32_t fastILimit, UErrorCode &errorC } uprv_memcpy(newData, asciiData, sizeof(asciiData)); - int32_t newDataLength = compactData(fastILimit, newData); + int32_t dataNullIndex = allSameBlocks.findMostUsed(); + int32_t newDataLength = compactData(fastILimit, newData, dataNullIndex); U_ASSERT(newDataLength <= newDataCapacity); uprv_free(data); data = newData; @@ -1318,7 +1350,6 @@ int32_t MutableCodePointTrie::compactTrie(int32_t fastILimit, UErrorCode &errorC return 0; } - int32_t dataNullIndex = allSameBlocks.findMostUsed(); if (dataNullIndex >= 0) { dataNullOffset = index[dataNullIndex]; #ifdef UCPTRIE_DEBUG diff --git a/icu4c/source/common/unicode/ucptrie.h b/icu4c/source/common/unicode/ucptrie.h index e9b61df0c9b..9ed604210b0 100644 --- a/icu4c/source/common/unicode/ucptrie.h +++ b/icu4c/source/common/unicode/ucptrie.h @@ -89,17 +89,19 @@ enum UCPTrieValueWidth { */ UCPTRIE_VALUE_BITS_ANY = -1, /** - * 16 bits per UCPTrie data value. + * The trie stores 16 bits per data value. + * It returns them as unsigned values 0..0xffff=65535. * @draft ICU 63 */ UCPTRIE_VALUE_BITS_16, /** - * 32 bits per UCPTrie data value. + * The trie stores 32 bits per data value. * @draft ICU 63 */ UCPTRIE_VALUE_BITS_32, /** - * 8 bits per UCPTrie data value. + * The trie stores 8 bits per data value. + * It returns them as unsigned values 0..0xff=255. * @draft ICU 63 */ UCPTRIE_VALUE_BITS_8 @@ -272,6 +274,8 @@ UCPTrieValueFilter(const void *context, uint32_t value); /** * Returns the last code point such that all those from start to there have the same value. * Can be used to efficiently iterate over all same-value ranges in a trie. + * (This is normally faster than iterating over code points and get()ting each value, + * but much slower than a data structure that stores ranges directly.) * * If the UCPTrieValueFilter function pointer is not NULL, then * the value to be delivered is passed through that function, and the return value is the end diff --git a/icu4c/source/common/unicode/umutablecptrie.h b/icu4c/source/common/unicode/umutablecptrie.h index 811471ca571..f9d4c963910 100644 --- a/icu4c/source/common/unicode/umutablecptrie.h +++ b/icu4c/source/common/unicode/umutablecptrie.h @@ -129,6 +129,9 @@ umutablecptrie_get(const UMutableCPTrie *trie, UChar32 c); /** * Returns the last code point such that all those from start to there have the same value. * Can be used to efficiently iterate over all same-value ranges in a trie. + * (This is normally faster than iterating over code points and get()ting each value, + * but much slower than a data structure that stores ranges directly.) + * * The trie can be modified between calls to this function. * * If the UCPTrieValueFilter function pointer is not NULL, then @@ -189,6 +192,15 @@ umutablecptrie_setRange(UMutableCPTrie *trie, * Compacts the data and builds an immutable UCPTrie according to the parameters. * After this, the mutable trie will be empty. * + * The mutable trie stores 32-bit values until buildImmutable() is called. + * If values shorter than 32 bits are to be stored in the immutable trie, + * then the upper bits are discarded. + * For example, when the mutable trie contains values 0x81, -0x7f, and 0xa581, + * and the value width is 8 bits, then each of these is stored as 0x81 + * and the immutable trie will return that as an unsigned value. + * (Some implementations may want to make productive temporary use of the upper bits + * until buildImmutable() discards them.) + * * Not every possible set of mappings can be built into a UCPTrie, * because of limitations resulting from speed and space optimizations. * Every Unicode assigned character can be mapped to a unique value. diff --git a/icu4c/source/test/cintltst/ucptrietest.c b/icu4c/source/test/cintltst/ucptrietest.c index b92f2ed1814..9969a62937a 100644 --- a/icu4c/source/test/cintltst/ucptrietest.c +++ b/icu4c/source/test/cintltst/ucptrietest.c @@ -1490,6 +1490,75 @@ TrieTestGetRangesFixedSurr(void) { umutablecptrie_close(mutableTrie); } +static void TestSmallNullBlockMatchesFast(void) { + // The initial builder+getRange code had a bug: + // When there is no null data block in the fast-index range, + // but a fast-range data block starts with enough values to match a small data block, + // then getRange() got confused. + // The builder must prevent this. + static const SetRange setRanges[] = { + { 0, 0x880, 1 }, + // U+0880..U+088F map to initial value 0, potential match for small null data block. + { 0x890, 0x1040, 2 }, + // U+1040..U+1050 map to 0. + // First small null data block in a small-type trie. + // In a fast-type trie, it is ok to match a small null data block at U+1041 + // but not at U+1040. + { 0x1051, 0x10000, 3 }, + // No fast data block (block length 64) filled with 0 regardless of trie type. + // Need more blocks filled with 0 than the largest range above, + // and need a highStart above that so that it actually counts. + { 0x20000, 0x110000, 9 } + }; + + static const CheckRange checkRanges[] = { + { 0x0880, 1 }, + { 0x0890, 0 }, + { 0x1040, 2 }, + { 0x1051, 0 }, + { 0x10000, 3 }, + { 0x20000, 0 }, + { 0x110000, 9 } + }; + + testTrieRanges("small0-in-fast", FALSE, + setRanges, UPRV_LENGTHOF(setRanges), + checkRanges, UPRV_LENGTHOF(checkRanges)); +} + +static void ShortAllSameBlocksTest(void) { + static const char *const testName = "short-all-same"; + // Many all-same-value blocks but only of the small block length used in the mutable trie. + // The builder code needs to turn a group of short ALL_SAME blocks below fastLimit + // into a MIXED block, and reserve data array capacity for that. + UErrorCode errorCode = U_ZERO_ERROR; + UMutableCPTrie *mutableTrie = umutablecptrie_open(0, 0xad, &errorCode); + CheckRange checkRanges[0x101]; + int32_t i; + if (U_FAILURE(errorCode)) { + log_err("error: umutablecptrie_open(%s) failed: %s\n", testName, u_errorName(errorCode)); + return; + } + for (i = 0; i < 0x1000; i += 0x10) { + uint32_t value = i >> 4; + umutablecptrie_setRange(mutableTrie, i, i + 0xf, value, &errorCode); + checkRanges[value].limit = i + 0x10; + checkRanges[value].value = value; + } + checkRanges[0x100].limit = 0x110000; + checkRanges[0x100].value = 0; + if (U_FAILURE(errorCode)) { + log_err("error: setting values into a mutable trie (%s) failed - %s\n", + testName, u_errorName(errorCode)); + umutablecptrie_close(mutableTrie); + return; + } + + mutableTrie = testTrieSerializeAllValueWidth(testName, mutableTrie, FALSE, + checkRanges, UPRV_LENGTHOF(checkRanges)); + umutablecptrie_close(mutableTrie); +} + void addUCPTrieTest(TestNode** root) { addTest(root, &TrieTestSet1, "tsutil/ucptrietest/TrieTestSet1"); @@ -1503,4 +1572,6 @@ addUCPTrieTest(TestNode** root) { addTest(root, &ManyAllSameBlocksTest, "tsutil/ucptrietest/ManyAllSameBlocksTest"); addTest(root, &MuchDataTest, "tsutil/ucptrietest/MuchDataTest"); addTest(root, &TrieTestGetRangesFixedSurr, "tsutil/ucptrietest/TrieTestGetRangesFixedSurr"); + addTest(root, &TestSmallNullBlockMatchesFast, "tsutil/ucptrietest/TestSmallNullBlockMatchesFast"); + addTest(root, &ShortAllSameBlocksTest, "tsutil/ucptrietest/ShortAllSameBlocksTest"); } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/util/CodePointMap.java b/icu4j/main/classes/core/src/com/ibm/icu/util/CodePointMap.java index ee871db82ce..c0c34357905 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/util/CodePointMap.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/util/CodePointMap.java @@ -331,10 +331,14 @@ public abstract class CodePointMap implements Iterable { /** * Sets the range object to a range of code points beginning with the start parameter. - * The range end is the the last code point such that + * The range start is the same as the start input parameter + * (even if there are preceding code points that have the same value). + * The range end is the last code point such that * all those from start to there have the same value. * Returns false if start is not 0..U+10FFFF. * Can be used to efficiently iterate over all same-value ranges in a map. + * (This is normally faster than iterating over code points and get()ting each value, + * but may be much slower than a data structure that stores ranges directly.) * *

If the {@link ValueFilter} parameter is not null, then * the value to be delivered is passed through that filter, and the return value is the end @@ -365,7 +369,9 @@ public abstract class CodePointMap implements Iterable { /** * Sets the range object to a range of code points beginning with the start parameter. - * The range end is the the last code point such that + * The range start is the same as the start input parameter + * (even if there are preceding code points that have the same value). + * The range end is the last code point such that * all those from start to there have the same value. * Returns false if start is not 0..U+10FFFF. * diff --git a/icu4j/main/classes/core/src/com/ibm/icu/util/CodePointTrie.java b/icu4j/main/classes/core/src/com/ibm/icu/util/CodePointTrie.java index 71de6fc76d0..71a8936baf4 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/util/CodePointTrie.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/util/CodePointTrie.java @@ -70,21 +70,23 @@ public abstract class CodePointTrie extends CodePointMap { */ public enum ValueWidth { /** - * 16 bits per CodePointTrie data value. + * The trie stores 16 bits per data value. + * It returns them as unsigned values 0..0xffff=65535. * * @draft ICU 63 * @provisional This API might change or be removed in a future release. */ BITS_16, /** - * 32 bits per CodePointTrie data value. + * The trie stores 32 bits per data value. * * @draft ICU 63 * @provisional This API might change or be removed in a future release. */ BITS_32, /** - * 8 bits per CodePointTrie data value. + * The trie stores 8 bits per data value. + * It returns them as unsigned values 0..0xff=255. * * @draft ICU 63 * @provisional This API might change or be removed in a future release. diff --git a/icu4j/main/classes/core/src/com/ibm/icu/util/MutableCodePointTrie.java b/icu4j/main/classes/core/src/com/ibm/icu/util/MutableCodePointTrie.java index c5497818182..de9e6bc6dd5 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/util/MutableCodePointTrie.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/util/MutableCodePointTrie.java @@ -157,6 +157,9 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl /** * {@inheritDoc} + * + *

The trie can be modified between calls to this function. + * * @draft ICU 63 * @provisional This API might change or be removed in a future release. */ @@ -309,6 +312,15 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl * Compacts the data and builds an immutable {@link CodePointTrie} according to the parameters. * After this, the mutable trie will be empty. * + *

The mutable trie stores 32-bit values until buildImmutable() is called. + * If values shorter than 32 bits are to be stored in the immutable trie, + * then the upper bits are discarded. + * For example, when the mutable trie contains values 0x81, -0x7f, and 0xa581, + * and the value width is 8 bits, then each of these is stored as 0x81 + * and the immutable trie will return that as an unsigned value. + * (Some implementations may want to make productive temporary use of the upper bits + * until buildImmutable() discards them.) + * *

Not every possible set of mappings can be built into a CodePointTrie, * because of limitations resulting from speed and space optimizations. * Every Unicode assigned character can be mapped to a unique value. @@ -556,11 +568,12 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl return -1; } - private static int findAllSameBlock(int[] p, int length, int value, int blockLength) { - // Ensure that we do not even partially get past length. - length -= blockLength; + private static int findAllSameBlock(int[] p, int start, int limit, + int value, int blockLength) { + // Ensure that we do not even partially get past limit. + limit -= blockLength; - for (int block = 0; block <= length; ++block) { + for (int block = start; block <= limit; ++block) { if (p[block] == value) { for (int i = 1;; ++i) { if (i == blockLength) { @@ -614,6 +627,15 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl return length - i; } + private static boolean isStartOfSomeFastBlock(int dataOffset, int[] index, int fastILimit) { + for (int i = 0; i < fastILimit; i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK) { + if (index[i] == dataOffset) { + return true; + } + } + return false; + } + /** * Finds the start of the last range in the trie by enumerating backward. * Indexes for code points higher than this will be omitted. @@ -720,6 +742,9 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl // ASCII data will be stored as a linear table, even if the following code // does not yet count it that way. int newDataCapacity = ASCII_LIMIT; + // Add room for a small data null block in case it would match the start of + // a fast data block where dataNullOffset must not be set in that case. + newDataCapacity += CodePointTrie.SMALL_DATA_BLOCK_LENGTH; // Add room for special values (errorValue, highValue) and padding. newDataCapacity += 4; int iLimit = highStart >> CodePointTrie.SHIFT_3; @@ -761,6 +786,7 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl if (getDataBlock(i) < 0) { return -1; } + newDataCapacity += blockLength; continue; } } @@ -810,7 +836,7 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl * * It does not try to find an optimal order of writing, deduplicating, and overlapping blocks. */ - private int compactData(int fastILimit, int[] newData) { + private int compactData(int fastILimit, int[] newData, int dataNullIndex) { // The linear ASCII data has been copied into newData already. int newDataLength = 0; for (int i = 0; newDataLength < ASCII_LIMIT; @@ -821,14 +847,31 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl int iLimit = highStart >> CodePointTrie.SHIFT_3; int blockLength = CodePointTrie.FAST_DATA_BLOCK_LENGTH; int inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK; + int fastLength = 0; for (int i = ASCII_I_LIMIT; i < iLimit; i += inc) { if (i == fastILimit) { blockLength = CodePointTrie.SMALL_DATA_BLOCK_LENGTH; inc = 1; + fastLength = newDataLength; } if (flags[i] == ALL_SAME) { int value = index[i]; - int n = findAllSameBlock(newData, newDataLength, value, blockLength); + // Find an earlier part of the data array of length blockLength + // that is filled with this value. + // If we find a match, and the current block is the data null block, + // and it is not a fast block but matches the start of a fast block, + // then we need to continue looking. + // This is because this small block is shorter than the fast block, + // and not all of the rest of the fast block is filled with this value. + // Otherwise trie.getRange() would detect that the fast block starts at + // dataNullOffset and assume incorrectly that it is filled with the null value. + int n; + for (int start = 0; + (n = findAllSameBlock(newData, start, newDataLength, + value, blockLength)) >= 0 && + i == dataNullIndex && i >= fastILimit && n < fastLength && + isStartOfSomeFastBlock(n, index, fastILimit); + start = n + 1) {} if (n >= 0) { index[i] = n; } else { @@ -1068,7 +1111,6 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl if (indexLength >= (CodePointTrie.NO_INDEX3_NULL_OFFSET + CodePointTrie.INDEX_3_BLOCK_LENGTH)) { // The index-3 offsets exceed 15 bits, or // the last one cannot be distinguished from the no-null-block value. - // TODO: review exceptions / error codes throw new IndexOutOfBoundsException( "The trie data exceeds limitations of the data structure."); } @@ -1143,18 +1185,17 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl int newDataCapacity = compactWholeDataBlocks(fastILimit, allSameBlocks); int[] newData = Arrays.copyOf(asciiData, newDataCapacity); - int newDataLength = compactData(fastILimit, newData); + int dataNullIndex = allSameBlocks.findMostUsed(); + int newDataLength = compactData(fastILimit, newData, dataNullIndex); assert(newDataLength <= newDataCapacity); data = newData; dataLength = newDataLength; if (dataLength > (0x3ffff + CodePointTrie.SMALL_DATA_BLOCK_LENGTH)) { // The offset of the last data block is too high to be stored in the index table. - // TODO: review exceptions / error codes throw new IndexOutOfBoundsException( "The trie data exceeds limitations of the data structure."); } - int dataNullIndex = allSameBlocks.findMostUsed(); if (dataNullIndex >= 0) { dataNullOffset = index[dataNullIndex]; initialValue = data[dataNullOffset]; diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/CodePointTrieTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/CodePointTrieTest.java index 819800ad71f..86f156dcf0a 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/CodePointTrieTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/CodePointTrieTest.java @@ -10,6 +10,7 @@ package com.ibm.icu.dev.test.util; import java.io.ByteArrayOutputStream; import java.nio.ByteBuffer; +import java.util.ArrayList; import java.util.Arrays; import org.junit.Test; @@ -18,9 +19,14 @@ import org.junit.runners.JUnit4; import com.ibm.icu.dev.test.TestFmwk; import com.ibm.icu.impl.Normalizer2Impl.UTF16Plus; +import com.ibm.icu.impl.Utility; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UProperty; +import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.util.CodePointMap; import com.ibm.icu.util.CodePointTrie; import com.ibm.icu.util.MutableCodePointTrie; +import com.ibm.icu.util.VersionInfo; @RunWith(JUnit4.class) public final class CodePointTrieTest extends TestFmwk { @@ -32,6 +38,11 @@ public final class CodePointTrieTest extends TestFmwk { this.value = value; } + @Override + public String toString() { + return Utility.hex(start) + ".." + Utility.hex(limit - 1) + ':' + Utility.hex(value); + } + final int start, limit; final int value; } @@ -63,6 +74,11 @@ public final class CodePointTrieTest extends TestFmwk { this.value = value; } + @Override + public String toString() { + return "≤" + Utility.hex(limit - 1) + ':' + Utility.hex(value); + } + final int limit; final int value; } @@ -982,4 +998,119 @@ public final class CodePointTrieTest extends TestFmwk { testGetRangesFixedSurr("fixedSurr4", mutableTrie, CodePointMap.RangeOption.FIXED_ALL_SURROGATES, checkRangesFixedSurr4); } + + @Test + public void TestSmallNullBlockMatchesFast() { + // The initial builder+getRange code had a bug: + // When there is no null data block in the fast-index range, + // but a fast-range data block starts with enough values to match a small data block, + // then getRange() got confused. + // The builder must prevent this. + final SetRange setRanges[] = { + new SetRange(0, 0x880, 1), + // U+0880..U+088F map to initial value 0, potential match for small null data block. + new SetRange(0x890, 0x1040, 2), + // U+1040..U+1050 map to 0. + // First small null data block in a small-type trie. + // In a fast-type trie, it is ok to match a small null data block at U+1041 + // but not at U+1040. + new SetRange(0x1051, 0x10000, 3), + // No fast data block (block length 64) filled with 0 regardless of trie type. + // Need more blocks filled with 0 than the largest range above, + // and need a highStart above that so that it actually counts. + new SetRange(0x20000, 0x110000, 9) + }; + + final CheckRange checkRanges[] = { + new CheckRange(0x0880, 1), + new CheckRange(0x0890, 0), + new CheckRange(0x1040, 2), + new CheckRange(0x1051, 0), + new CheckRange(0x10000, 3), + new CheckRange(0x20000, 0), + new CheckRange(0x110000, 9) + }; + + testTrieRanges("small0-in-fast", false, setRanges, checkRanges); + } + + @Test + public void ShortAllSameBlocksTest() { + // Many all-same-value blocks but only of the small block length used in the mutable trie. + // The builder code needs to turn a group of short ALL_SAME blocks below fastLimit + // into a MIXED block, and reserve data array capacity for that. + MutableCodePointTrie mutableTrie = new MutableCodePointTrie(0, 0xad); + CheckRange[] checkRanges = new CheckRange[0x101]; + for (int i = 0; i < 0x1000; i += 0x10) { + int value = i >> 4; + mutableTrie.setRange(i, i + 0xf, value); + checkRanges[value] = new CheckRange(i + 0x10, value); + } + checkRanges[0x100] = new CheckRange(0x110000, 0); + + mutableTrie = testTrieSerializeAllValueWidth( + "short-all-same", mutableTrie, false, checkRanges); + } + + private void testIntProperty(String testName, String baseSetPattern, int property) { + UnicodeSet uni11 = new UnicodeSet(baseSetPattern); + MutableCodePointTrie mutableTrie = new MutableCodePointTrie(0, 0xad); + ArrayList checkRanges = new ArrayList<>(); + int start = 0; + int limit = 0; + int prevValue = 0; + for (UnicodeSet.EntryRange range : uni11.ranges()) { + // Ranges are in ascending order, each range is non-empty, + // and there is a gap from one to the next. + // Each code point in a range could have a different value. + // Any of them can be 0. + // We keep initial value 0 between ranges. + if (prevValue != 0) { + mutableTrie.setRange(start, limit - 1, prevValue); + checkRanges.add(new CheckRange(limit, prevValue)); + start = limit; + prevValue = 0; + } + int c = limit = range.codepoint; + do { + int value; + if (property == UProperty.AGE) { + VersionInfo version = UCharacter.getAge(c); + value = (version.getMajor() << 4) | version.getMinor(); + } else { + value = UCharacter.getIntPropertyValue(c, property); + } + if (value != prevValue) { + if (start < limit) { + if (prevValue != 0) { + mutableTrie.setRange(start, limit - 1, prevValue); + } + checkRanges.add(new CheckRange(limit, prevValue)); + } + start = c; + prevValue = value; + } + limit = ++c; + } while (c <= range.codepointEnd); + } + if (prevValue != 0) { + mutableTrie.setRange(start, limit - 1, prevValue); + checkRanges.add(new CheckRange(limit, prevValue)); + } + if (limit < 0x110000) { + checkRanges.add(new CheckRange(0x110000, 0)); + } + testTrieSerializeAllValueWidth(testName, mutableTrie, false, + checkRanges.toArray(new CheckRange[checkRanges.size()])); + } + + @Test + public void AgePropertyTest() { + testIntProperty("age", "[:age=11:]", UProperty.AGE); + } + + @Test + public void BlockPropertyTest() { + testIntProperty("block", "[:^blk=No_Block:]", UProperty.BLOCK); + } }