From: Markus Scherer Date: Thu, 11 Dec 2014 17:04:32 +0000 (+0000) Subject: ICU-10829 simplify U+FFFE collation: U+FFFE use common non-primary weights, adjust... X-Git-Tag: milestone-59-0-1~1373 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=030eff56d3c51c1dc238912f19940885c5f9f010;p=icu ICU-10829 simplify U+FFFE collation: U+FFFE use common non-primary weights, adjust code for that, test order of U+FFFE equivalent to ucol_mergeSortkeys() but not necessarily same sort keys, omit case level if lowerFirst and only common weights X-SVN-Rev: 36856 --- diff --git a/icu4c/source/common/unicode/uvernum.h b/icu4c/source/common/unicode/uvernum.h index de784e2f6fb..ce11bb43baa 100644 --- a/icu4c/source/common/unicode/uvernum.h +++ b/icu4c/source/common/unicode/uvernum.h @@ -146,7 +146,7 @@ * This value may change in subsequent releases of ICU. * @stable ICU 2.4 */ -#define UCOL_RUNTIME_VERSION 8 +#define UCOL_RUNTIME_VERSION 9 /** * Collation builder code version. diff --git a/icu4c/source/data/in/coll/ucadata-implicithan.icu b/icu4c/source/data/in/coll/ucadata-implicithan.icu index e29e35bf845..50f6869f375 100644 Binary files a/icu4c/source/data/in/coll/ucadata-implicithan.icu and b/icu4c/source/data/in/coll/ucadata-implicithan.icu differ diff --git a/icu4c/source/data/in/coll/ucadata-unihan.icu b/icu4c/source/data/in/coll/ucadata-unihan.icu index 941c5d34a62..92feef5258d 100644 Binary files a/icu4c/source/data/in/coll/ucadata-unihan.icu and b/icu4c/source/data/in/coll/ucadata-unihan.icu differ diff --git a/icu4c/source/data/unidata/FractionalUCA.txt b/icu4c/source/data/unidata/FractionalUCA.txt index e8de4821bb5..65306a5af8b 100644 --- a/icu4c/source/data/unidata/FractionalUCA.txt +++ b/icu4c/source/data/unidata/FractionalUCA.txt @@ -47051,7 +47051,7 @@ FDD1 FDD0; [E4, 05, 05] # unassigned first primary # SPECIAL MAX/MIN COLLATION ELEMENTS -FFFE; [02, 02, 02] # Special LOWEST primary, for merge/interleaving +FFFE; [02, 05, 05] # Special LOWEST primary, 
for merge/interleaving FFFF; [EF FF, 05, 05] # Special HIGHEST primary, for ranges diff --git a/icu4c/source/i18n/collation.h b/icu4c/source/i18n/collation.h index 3d2ea8c6a1c..a840cff4008 100644 --- a/icu4c/source/i18n/collation.h +++ b/icu4c/source/i18n/collation.h @@ -29,17 +29,19 @@ public: // Special sort key bytes for all levels. static const uint8_t TERMINATOR_BYTE = 0; static const uint8_t LEVEL_SEPARATOR_BYTE = 1; + + /** The secondary/tertiary lower limit for tailoring before any root elements. */ + static const uint32_t BEFORE_WEIGHT16 = 0x0100; + /** * Merge-sort-key separator. - * Must not be used as the lead byte of any CE weight, - * nor as primary compression low terminator. + * Same as the unique primary and identical-level weights of U+FFFE. + * Must not be used as primary compression low terminator. * Otherwise usable. */ static const uint8_t MERGE_SEPARATOR_BYTE = 2; static const uint32_t MERGE_SEPARATOR_PRIMARY = 0x02000000; // U+FFFE - static const uint32_t MERGE_SEPARATOR_WEIGHT16 = 0x0200; // U+FFFE - static const uint32_t MERGE_SEPARATOR_LOWER32 = 0x02000200; // U+FFFE - static const uint32_t MERGE_SEPARATOR_CE32 = 0x02000202; // U+FFFE + static const uint32_t MERGE_SEPARATOR_CE32 = 0x02000505; // U+FFFE /** * Primary compression low terminator, must be greater than MERGE_SEPARATOR_BYTE. diff --git a/icu4c/source/i18n/collationbuilder.cpp b/icu4c/source/i18n/collationbuilder.cpp index 6e3b482f095..79b50927cb6 100644 --- a/icu4c/source/i18n/collationbuilder.cpp +++ b/icu4c/source/i18n/collationbuilder.cpp @@ -450,8 +450,8 @@ CollationBuilder::addReset(int32_t strength, const UnicodeString &str, } nodes.setElementAt(node, index); int32_t nextIndex = nextIndexFromNode(node); - // Insert default nodes with weights 02 and 05, reset to the 02 node. - node = nodeFromWeight16(BEFORE_WEIGHT16) | nodeFromStrength(strength); + // Insert default nodes with weights 01 and 05, reset to the 01 node. 
+ node = nodeFromWeight16(Collation::BEFORE_WEIGHT16) | nodeFromStrength(strength); index = insertNodeBetween(index, nextIndex, node, errorCode); node = nodeFromWeight16(Collation::COMMON_WEIGHT16) | hasBefore3 | nodeFromStrength(strength); @@ -961,7 +961,7 @@ CollationBuilder::findCommonNode(int32_t index, int32_t strength) const { index = nextIndexFromNode(node); node = nodes.elementAti(index); U_ASSERT(!isTailoredNode(node) && strengthFromNode(node) == strength && - weight16FromNode(node) == BEFORE_WEIGHT16); + weight16FromNode(node) == Collation::BEFORE_WEIGHT16); // Skip to the explicit common node. do { index = nextIndexFromNode(node); @@ -1398,7 +1398,7 @@ CollationBuilder::makeTailoredCEs(UErrorCode &errorCode) { // Gap at the beginning of the tertiary CE range. t = rootElements.getTertiaryBoundary() - 0x100; tLimit = rootElements.getFirstTertiaryCE() & Collation::ONLY_TERTIARY_MASK; - } else if(t == BEFORE_WEIGHT16) { + } else if(t == Collation::BEFORE_WEIGHT16) { tLimit = Collation::COMMON_WEIGHT16; } else if(!pIsTailored && !sIsTailored) { // p and s are root weights. @@ -1441,7 +1441,7 @@ CollationBuilder::makeTailoredCEs(UErrorCode &errorCode) { // Gap at the beginning of the secondary CE range. s = rootElements.getSecondaryBoundary() - 0x100; sLimit = rootElements.getFirstSecondaryCE() >> 16; - } else if(s == BEFORE_WEIGHT16) { + } else if(s == Collation::BEFORE_WEIGHT16) { sLimit = Collation::COMMON_WEIGHT16; } else if(!pIsTailored) { // p is a root primary. diff --git a/icu4c/source/i18n/collationbuilder.h b/icu4c/source/i18n/collationbuilder.h index df8be2a7c19..0582b167199 100644 --- a/icu4c/source/i18n/collationbuilder.h +++ b/icu4c/source/i18n/collationbuilder.h @@ -215,9 +215,6 @@ private: static int32_t ceStrength(int64_t ce); - /** The secondary/tertiary lower limit for tailoring before the common weight. 
*/ - static const uint32_t BEFORE_WEIGHT16 = Collation::MERGE_SEPARATOR_WEIGHT16; - /** At most 1M nodes, limited by the 20 bits in node bit fields. */ static const int32_t MAX_INDEX = 0xfffff; /** diff --git a/icu4c/source/i18n/collationcompare.cpp b/icu4c/source/i18n/collationcompare.cpp index 6f9107e9db5..5d5a4c0866d 100644 --- a/icu4c/source/i18n/collationcompare.cpp +++ b/icu4c/source/i18n/collationcompare.cpp @@ -136,18 +136,17 @@ CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterat int32_t rightStart = 0; for(;;) { // Find the merge separator or the NO_CE terminator. + uint32_t p; int32_t leftLimit = leftStart; - uint32_t leftLower32; - while((leftLower32 = (uint32_t)left.getCE(leftLimit)) > - Collation::MERGE_SEPARATOR_LOWER32 || - leftLower32 == 0) { + while((p = (uint32_t)(left.getCE(leftLimit) >> 32)) > + Collation::MERGE_SEPARATOR_PRIMARY || + p == 0) { ++leftLimit; } int32_t rightLimit = rightStart; - uint32_t rightLower32; - while((rightLower32 = (uint32_t)right.getCE(rightLimit)) > - Collation::MERGE_SEPARATOR_LOWER32 || - rightLower32 == 0) { + while((p = (uint32_t)(right.getCE(rightLimit) >> 32)) > + Collation::MERGE_SEPARATOR_PRIMARY || + p == 0) { ++rightLimit; } @@ -175,7 +174,7 @@ CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterat // Both strings have the same number of merge separators, // or else there would have been a primary-level difference. U_ASSERT(left.getCE(leftLimit) == right.getCE(rightLimit)); - if(left.getCE(leftLimit) == Collation::NO_CE) { break; } + if(p == Collation::NO_CE_PRIMARY) { break; } // Skip both merge separators and continue. 
leftStart = leftLimit + 1; rightStart = rightLimit + 1; @@ -276,20 +275,19 @@ CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterat if(leftTertiary != rightTertiary) { if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { - // Pass through NO_CE and MERGE_SEPARATOR - // and keep real tertiary weights larger than the MERGE_SEPARATOR. + // Pass through NO_CE and keep real tertiary weights larger than that. // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut), // to keep tertiary CEs well-formed. // Their case+tertiary weights must be greater than those of // primary and secondary CEs. - if(leftTertiary > Collation::MERGE_SEPARATOR_WEIGHT16) { + if(leftTertiary > Collation::NO_CE_WEIGHT16) { if(leftLower32 > 0xffff) { leftTertiary ^= 0xc000; } else { leftTertiary += 0x4000; } } - if(rightTertiary > Collation::MERGE_SEPARATOR_WEIGHT16) { + if(rightTertiary > Collation::NO_CE_WEIGHT16) { if(rightLower32 > 0xffff) { rightTertiary ^= 0xc000; } else { @@ -316,11 +314,9 @@ CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterat do { int64_t ce = left.getCE(leftIndex++); leftQuaternary = (uint32_t)ce & 0xffff; - if(leftQuaternary == 0) { - // Variable primary or completely ignorable. + if(leftQuaternary <= Collation::NO_CE_WEIGHT16) { + // Variable primary or completely ignorable or NO_CE. leftQuaternary = (uint32_t)(ce >> 32); - } else if(leftQuaternary <= Collation::MERGE_SEPARATOR_WEIGHT16) { - // Leave NO_CE or MERGE_SEPARATOR as is. } else { // Regular CE, not tertiary ignorable. // Preserve the quaternary weight in bits 7..6. @@ -332,11 +328,9 @@ CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterat do { int64_t ce = right.getCE(rightIndex++); rightQuaternary = (uint32_t)ce & 0xffff; - if(rightQuaternary == 0) { - // Variable primary or completely ignorable. 
+ if(rightQuaternary <= Collation::NO_CE_WEIGHT16) { + // Variable primary or completely ignorable or NO_CE. rightQuaternary = (uint32_t)(ce >> 32); - } else if(rightQuaternary <= Collation::MERGE_SEPARATOR_WEIGHT16) { - // Leave NO_CE or MERGE_SEPARATOR as is. } else { // Regular CE, not tertiary ignorable. // Preserve the quaternary weight in bits 7..6. @@ -353,7 +347,7 @@ CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterat } return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER; } - if(leftQuaternary == Collation::NO_CE_WEIGHT16) { break; } + if(leftQuaternary == Collation::NO_CE_PRIMARY) { break; } } return UCOL_EQUAL; } diff --git a/icu4c/source/i18n/collationkeys.cpp b/icu4c/source/i18n/collationkeys.cpp index 6006811377f..978621a6475 100644 --- a/icu4c/source/i18n/collationkeys.cpp +++ b/icu4c/source/i18n/collationkeys.cpp @@ -262,7 +262,7 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter, int32_t commonQuaternaries = 0; uint32_t prevSecondary = 0; - UBool anyMergeSeparators = FALSE; + int32_t secSegmentStart = 0; for(;;) { // No need to keep all CEs in the buffer when we write a sort key. @@ -350,7 +350,11 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter, uint32_t s = lower32 >> 16; if(s == 0) { // secondary ignorable - } else if(s == Collation::COMMON_WEIGHT16) { + } else if(s == Collation::COMMON_WEIGHT16 && + ((options & CollationSettings::BACKWARD_SECONDARY) == 0 || + p != Collation::MERGE_SEPARATOR_PRIMARY)) { + // s is a common secondary weight, and + // backwards-secondary is off or the ce is not the merge separator. ++commonSecondaries; } else if((options & CollationSettings::BACKWARD_SECONDARY) == 0) { if(commonSecondaries != 0) { @@ -389,16 +393,28 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter, } // commonSecondaries == 0 } - // Reduce separators so that we can look for byte<=1 later. 
- if(s <= Collation::MERGE_SEPARATOR_WEIGHT16) { - if(s == Collation::MERGE_SEPARATOR_WEIGHT16) { - anyMergeSeparators = TRUE; + if(0 < p && p <= Collation::MERGE_SEPARATOR_PRIMARY) { + // The backwards secondary level compares secondary weights backwards + // within segments separated by the merge separator (U+FFFE). + uint8_t *secs = secondaries.data(); + int32_t last = secondaries.length() - 1; + if(secSegmentStart < last) { + uint8_t *p = secs + secSegmentStart; + uint8_t *q = secs + last; + do { + uint8_t b = *p; + *p++ = *q; + *q-- = b; + } while(p < q); } - secondaries.appendByte((s >> 8) - 1); + secondaries.appendByte(p == Collation::NO_CE_PRIMARY ? + Collation::LEVEL_SEPARATOR_BYTE : Collation::MERGE_SEPARATOR_BYTE); + prevSecondary = 0; + secSegmentStart = secondaries.length(); } else { secondaries.appendReverseWeight16(s); + prevSecondary = s; } - prevSecondary = s; } } @@ -411,19 +427,23 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter, } else { uint32_t c = (lower32 >> 8) & 0xff; // case bits & tertiary lead byte U_ASSERT((c & 0xc0) != 0xc0); - if((c & 0xc0) == 0 && c > Collation::MERGE_SEPARATOR_BYTE) { + if((c & 0xc0) == 0 && c > Collation::LEVEL_SEPARATOR_BYTE) { ++commonCases; } else { if((options & CollationSettings::UPPER_FIRST) == 0) { // lowerFirst: Compress common weights to nibbles 1..7..13, mixed=14, upper=15. - if(commonCases != 0) { + // If there are only common (=lowest) weights in the whole level, + // then we need not write anything. + // Level length differences are handled already on the next-higher level. 
+ if(commonCases != 0 && + (c > Collation::LEVEL_SEPARATOR_BYTE || !cases.isEmpty())) { --commonCases; while(commonCases >= CASE_LOWER_FIRST_COMMON_MAX_COUNT) { cases.appendByte(CASE_LOWER_FIRST_COMMON_MIDDLE << 4); commonCases -= CASE_LOWER_FIRST_COMMON_MAX_COUNT; } uint32_t b; - if(c <= Collation::MERGE_SEPARATOR_BYTE) { + if(c <= Collation::LEVEL_SEPARATOR_BYTE) { b = CASE_LOWER_FIRST_COMMON_LOW + commonCases; } else { b = CASE_LOWER_FIRST_COMMON_HIGH - commonCases; @@ -431,7 +451,7 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter, cases.appendByte(b << 4); commonCases = 0; } - if(c > Collation::MERGE_SEPARATOR_BYTE) { + if(c > Collation::LEVEL_SEPARATOR_BYTE) { c = (CASE_LOWER_FIRST_COMMON_HIGH + (c >> 6)) << 4; // 14 or 15 } } else { @@ -447,11 +467,11 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter, cases.appendByte((CASE_UPPER_FIRST_COMMON_LOW + commonCases) << 4); commonCases = 0; } - if(c > Collation::MERGE_SEPARATOR_BYTE) { + if(c > Collation::LEVEL_SEPARATOR_BYTE) { c = (CASE_UPPER_FIRST_COMMON_LOW - (c >> 6)) << 4; // 2 or 1 } } - // c is a separator byte 01 or 02, + // c is a separator byte 01, // or a left-shifted nibble 0x10, 0x20, ... 0xf0. cases.appendByte(c); } @@ -510,14 +530,14 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter, // Their case+tertiary weights must be greater than those of // primary and secondary CEs. // - // Separators 01..02 -> 01..02 (unchanged) - // Lowercase 03..04 -> 83..84 (includes uncased) + // Separator 01 -> 01 (unchanged) + // Lowercase 02..04 -> 82..84 (includes uncased) // Common weight 05 -> 85..C5 (common-weight compression range) // Lowercase 06..3F -> C6..FF - // Mixed case 43..7F -> 43..7F - // Uppercase 83..BF -> 03..3F + // Mixed case 42..7F -> 42..7F + // Uppercase 82..BF -> 02..3F // Tertiary CE 86..BF -> C6..FF - if(t <= Collation::MERGE_SEPARATOR_WEIGHT16) { + if(t <= Collation::NO_CE_WEIGHT16) { // Keep separators unchanged. 
} else if(lower32 > 0xffff) { // Invert case bits of primary & secondary CEs. @@ -551,24 +571,22 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter, if((levels & Collation::QUATERNARY_LEVEL_FLAG) != 0) { uint32_t q = lower32 & 0xffff; - if((q & 0xc0) == 0 && q > Collation::MERGE_SEPARATOR_WEIGHT16) { + if((q & 0xc0) == 0 && q > Collation::NO_CE_WEIGHT16) { ++commonQuaternaries; - } else if(q <= Collation::MERGE_SEPARATOR_WEIGHT16 && + } else if(q == Collation::NO_CE_WEIGHT16 && (options & CollationSettings::ALTERNATE_MASK) == 0 && - (quaternaries.isEmpty() || - quaternaries[quaternaries.length() - 1] == Collation::MERGE_SEPARATOR_BYTE)) { - // If alternate=non-ignorable and there are only - // common quaternary weights between two separators, - // then we need not write anything between these separators. + quaternaries.isEmpty()) { + // If alternate=non-ignorable and there are only common quaternary weights, + // then we need not write anything. // The only weights greater than the merge separator and less than the common weight // are shifted primary weights, which are not generated for alternate=non-ignorable. // There are also exactly as many quaternary weights as tertiary weights, // so level length differences are handled already on tertiary level. // Any above-common quaternary weight will compare greater regardless. - quaternaries.appendByte(q >> 8); + quaternaries.appendByte(Collation::LEVEL_SEPARATOR_BYTE); } else { - if(q <= Collation::MERGE_SEPARATOR_WEIGHT16) { - q >>= 8; + if(q == Collation::NO_CE_WEIGHT16) { + q = Collation::LEVEL_SEPARATOR_BYTE; } else { q = 0xfc + ((q >> 6) & 3); } @@ -602,42 +620,7 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter, if(!callback.needToWrite(Collation::SECONDARY_LEVEL)) { return; } ok &= secondaries.isOk(); sink.Append(Collation::LEVEL_SEPARATOR_BYTE); - uint8_t *secs = secondaries.data(); - int32_t length = secondaries.length() - 1; // Ignore the trailing NO_CE. 
- if((options & CollationSettings::BACKWARD_SECONDARY) != 0) { - // The backwards secondary level compares secondary weights backwards - // within segments separated by the merge separator (U+FFFE, weight 02). - // The separator weights 01 & 02 were reduced to 00 & 01 so that - // we do not accidentally separate at a _second_ weight byte of 02. - int32_t start = 0; - for(;;) { - // Find the merge separator or the NO_CE terminator. - int32_t limit; - if(anyMergeSeparators) { - limit = start; - while(secs[limit] > 1) { ++limit; } - } else { - limit = length; - } - // Reverse this segment. - if(start < limit) { - uint8_t *p = secs + start; - uint8_t *q = secs + limit - 1; - while(p < q) { - uint8_t s = *p; - *p++ = *q; - *q-- = s; - } - } - // Did we reach the end of the string? - if(secs[limit] == 0) { break; } - // Restore the merge separator. - secs[limit] = 2; - // Skip the merge separator and continue. - start = limit + 1; - } - } - sink.Append(reinterpret_cast<char *>(secs), length); + secondaries.appendTo(sink); } if((levels & Collation::CASE_LEVEL_FLAG) != 0) { @@ -649,21 +632,12 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter, uint8_t b = 0; for(int32_t i = 0; i < length; ++i) { uint8_t c = (uint8_t)cases[i]; - if(c <= Collation::MERGE_SEPARATOR_BYTE) { - U_ASSERT(c != 0); - if(b != 0) { - sink.Append(b); - b = 0; - } - sink.Append(c); + U_ASSERT((c & 0xf) == 0 && c != 0); + if(b == 0) { + b = c; } else { - U_ASSERT((c & 0xf) == 0); - if(b == 0) { - b = c; - } else { - sink.Append(b | (c >> 4)); - b = 0; - } + sink.Append(b | (c >> 4)); + b = 0; } } if(b != 0) { diff --git a/icu4c/source/i18n/collationrootelements.cpp b/icu4c/source/i18n/collationrootelements.cpp index d59048b75b7..f5b17b9f2cd 100644 --- a/icu4c/source/i18n/collationrootelements.cpp +++ b/icu4c/source/i18n/collationrootelements.cpp @@ -124,7 +124,7 @@ CollationRootElements::getSecondaryBefore(uint32_t p, uint32_t s) const { sec = elements[index] >> 16; } else { index =
findPrimary(p) + 1; - previousSec = Collation::MERGE_SEPARATOR_WEIGHT16; + previousSec = Collation::BEFORE_WEIGHT16; sec = Collation::COMMON_WEIGHT16; } U_ASSERT(s >= sec); @@ -149,12 +149,12 @@ CollationRootElements::getTertiaryBefore(uint32_t p, uint32_t s, uint32_t t) con previousTer = 0; } else { index = (int32_t)elements[IX_FIRST_SECONDARY_INDEX]; - previousTer = Collation::MERGE_SEPARATOR_WEIGHT16; + previousTer = Collation::BEFORE_WEIGHT16; } secTer = elements[index] & ~SEC_TER_DELTA_FLAG; } else { index = findPrimary(p) + 1; - previousTer = Collation::MERGE_SEPARATOR_WEIGHT16; + previousTer = Collation::BEFORE_WEIGHT16; secTer = Collation::COMMON_SEC_AND_TER_CE; } uint32_t st = (s << 16) | t; diff --git a/icu4c/source/i18n/collationweights.cpp b/icu4c/source/i18n/collationweights.cpp index 17c044f8e92..a73c26fa88d 100644 --- a/icu4c/source/i18n/collationweights.cpp +++ b/icu4c/source/i18n/collationweights.cpp @@ -126,7 +126,7 @@ CollationWeights::initForSecondary() { maxBytes[1] = 0; minBytes[2] = 0; maxBytes[2] = 0; - minBytes[3] = Collation::MERGE_SEPARATOR_BYTE + 1; + minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1; maxBytes[3] = 0xff; minBytes[4] = 2; maxBytes[4] = 0xff; @@ -142,7 +142,7 @@ CollationWeights::initForTertiary() { maxBytes[2] = 0; // We use only 6 bits per byte. // The other bits are used for case & quaternary weights. 
- minBytes[3] = Collation::MERGE_SEPARATOR_BYTE + 1; + minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1; maxBytes[3] = 0x3f; minBytes[4] = 2; maxBytes[4] = 0x3f; diff --git a/icu4c/source/test/intltest/collationtest.cpp b/icu4c/source/test/intltest/collationtest.cpp index d8094072000..fa14d349c00 100644 --- a/icu4c/source/test/intltest/collationtest.cpp +++ b/icu4c/source/test/intltest/collationtest.cpp @@ -114,6 +114,8 @@ private: UBool getCollationKey(const char *norm, const UnicodeString &line, const UChar *s, int32_t length, CollationKey &key, IcuTestErrorCode &errorCode); + UBool getMergedCollationKey(const UChar *s, int32_t length, + CollationKey &key, IcuTestErrorCode &errorCode); UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine, const UnicodeString &prevString, const UnicodeString &s, UCollationResult expectedOrder, Collation::Level expectedLevel, @@ -172,11 +174,9 @@ void CollationTest::TestMinMax() { return; } int64_t ce = ces.elementAti(0); - int64_t expected = - ((int64_t)Collation::MERGE_SEPARATOR_PRIMARY << 32) | - Collation::MERGE_SEPARATOR_LOWER32; + int64_t expected = Collation::makeCE(Collation::MERGE_SEPARATOR_PRIMARY); if(ce != expected) { - errln("CE(U+fffe)=%04lx != 02.02.02", (long)ce); + errln("CE(U+fffe)=%04lx != 02..", (long)ce); } ce = ces.elementAti(1); @@ -617,11 +617,8 @@ UBool isValidCE(const CollationRootElements &re, const CollationData &data, } // Minimum & maximum lead bytes. if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) || - (s1 != 0 && s1 <= Collation::MERGE_SEPARATOR_BYTE) || - (t1 != 0 && t1 <= Collation::MERGE_SEPARATOR_BYTE)) { - return FALSE; - } - if(t1 != 0 && t1 > 0x3f) { + s1 == Collation::LEVEL_SEPARATOR_BYTE || + t1 == Collation::LEVEL_SEPARATOR_BYTE || t1 > 0x3f) { return FALSE; } if(c > 2) { @@ -1372,7 +1369,39 @@ UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line return FALSE; } - // If s contains U+FFFE, check that merged segments make the same key. 
+ // Check that internalNextSortKeyPart() makes the same key, with several part sizes. + static const int32_t partSizes[] = { 32, 3, 1 }; + for(int32_t psi = 0; psi < UPRV_LENGTHOF(partSizes); ++psi) { + int32_t partSize = partSizes[psi]; + CharString parts; + if(!getSortKeyParts(s, length, parts, 32, errorCode)) { + infoln(fileTestName); + errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s", + norm, (int)partSize, errorCode.errorName()); + infoln(line); + return FALSE; + } + if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) { + infoln(fileTestName); + errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)", + norm, (int)partSize); + infoln(line); + infoln(printCollationKey(key)); + infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length())); + return FALSE; + } + } + return TRUE; +} + +/** + * Changes the key to the merged segments of the U+FFFE-separated substrings of s. + * Leaves key unchanged if s does not contain U+FFFE. + * @return TRUE if the key was successfully changed + */ +UBool CollationTest::getMergedCollationKey(const UChar *s, int32_t length, + CollationKey &key, IcuTestErrorCode &errorCode) { + if(errorCode.isFailure()) { return FALSE; } LocalMemory<uint8_t> mergedKey; int32_t mergedKeyLength = 0; int32_t mergedKeyCapacity = 0; @@ -1382,7 +1411,7 @@ UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line if(i == sLength) { if(segmentStart == 0) { // s does not contain any U+FFFE.
- break; + return FALSE; } } else if(s[i] != 0xfffe) { ++i; @@ -1423,41 +1452,7 @@ UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line if(i == sLength) { break; } segmentStart = ++i; } - if(segmentStart != 0 && - (mergedKeyLength != keyLength || - uprv_memcmp(mergedKey.getAlias(), keyBytes, keyLength) != 0)) { - infoln(fileTestName); - errln("Collator(%s).getCollationKey(with U+FFFE) != " - "ucol_mergeSortkeys(segments)", - norm); - infoln(line); - infoln(printCollationKey(key)); - infoln(printSortKey(mergedKey.getAlias(), mergedKeyLength)); - return FALSE; - } - - // Check that internalNextSortKeyPart() makes the same key, with several part sizes. - static const int32_t partSizes[] = { 32, 3, 1 }; - for(int32_t psi = 0; psi < UPRV_LENGTHOF(partSizes); ++psi) { - int32_t partSize = partSizes[psi]; - CharString parts; - if(!getSortKeyParts(s, length, parts, 32, errorCode)) { - infoln(fileTestName); - errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s", - norm, (int)partSize, errorCode.errorName()); - infoln(line); - return FALSE; - } - if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) { - infoln(fileTestName); - errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)", - norm, (int)partSize); - infoln(line); - infoln(printCollationKey(key)); - infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length())); - return FALSE; - } - } + key = CollationKey(mergedKey.getAlias(), mergedKeyLength); return TRUE; } @@ -1488,6 +1483,29 @@ const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buf return buffer; } +int32_t getDifferenceLevel(const CollationKey &prevKey, const CollationKey &key, + UCollationResult order, UBool collHasCaseLevel) { + if(order == UCOL_EQUAL) { + return Collation::NO_LEVEL; + } + int32_t prevKeyLength; + const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength); + int32_t keyLength; + const uint8_t *bytes =
key.getByteArray(keyLength); + int32_t level = Collation::PRIMARY_LEVEL; + for(int32_t i = 0;; ++i) { + uint8_t b = prevBytes[i]; + if(b != bytes[i]) { break; } + if(b == Collation::LEVEL_SEPARATOR_BYTE) { + ++level; + if(level == Collation::CASE_LEVEL && !collHasCaseLevel) { + ++level; + } + } + } + return level; +} + } UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prevFileLine, @@ -1649,23 +1667,9 @@ UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prev infoln(printCollationKey(key)); return FALSE; } + UBool collHasCaseLevel = coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON; + int32_t level = getDifferenceLevel(prevKey, key, order, collHasCaseLevel); if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) { - int32_t prevKeyLength; - const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength); - int32_t keyLength; - const uint8_t *bytes = key.getByteArray(keyLength); - int32_t level = Collation::PRIMARY_LEVEL; - for(int32_t i = 0;; ++i) { - uint8_t b = prevBytes[i]; - if(b != bytes[i]) { break; } - if(b == Collation::LEVEL_SEPARATOR_BYTE) { - ++level; - if(level == Collation::CASE_LEVEL && - coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_OFF) { - ++level; - } - } - } if(level != expectedLevel) { infoln(fileTestName); errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d", @@ -1677,6 +1681,45 @@ UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prev return FALSE; } } + + // If either string contains U+FFFE, then their sort keys must compare the same as + // the merged sort keys of each string's between-FFFE segments. + // + // It is not required that + // sortkey(str1 + "\uFFFE" + str2) == mergeSortkeys(sortkey(str1), sortkey(str2)) + // only that those two methods yield the same order. + // + // Use bit-wise OR so that getMergedCollationKey() is always called for both strings. 
+ if((getMergedCollationKey(prevString.getBuffer(), prevString.length(), prevKey, errorCode) | + getMergedCollationKey(s.getBuffer(), s.length(), key, errorCode)) || + errorCode.isFailure()) { + order = prevKey.compareTo(key, errorCode); + if(order != expectedOrder || errorCode.isFailure()) { + infoln(fileTestName); + errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey" + "(previous, current segments between U+FFFE)).compareTo() wrong order: %d != %d (%s)", + (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName()); + infoln(prevFileLine); + infoln(fileLine); + infoln(printCollationKey(prevKey)); + infoln(printCollationKey(key)); + return FALSE; + } + int32_t mergedLevel = getDifferenceLevel(prevKey, key, order, collHasCaseLevel); + if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) { + if(mergedLevel != level) { + infoln(fileTestName); + errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey" + "(previous, current segments between U+FFFE)).compareTo()=%d wrong level: %d != %d", + (int)fileLineNumber, norm, order, mergedLevel, level); + infoln(prevFileLine); + infoln(fileLine); + infoln(printCollationKey(prevKey)); + infoln(printCollationKey(key)); + return FALSE; + } + } + } return TRUE; }