granicus.if.org Git - icu/commitdiff
ICU-10829 simplify U+FFFE collation: U+FFFE use common non-primary weights, adjust...
author Markus Scherer <markus.icu@gmail.com>
Thu, 11 Dec 2014 17:04:32 +0000 (17:04 +0000)
committer Markus Scherer <markus.icu@gmail.com>
Thu, 11 Dec 2014 17:04:32 +0000 (17:04 +0000)
X-SVN-Rev: 36856

12 files changed:
icu4c/source/common/unicode/uvernum.h
icu4c/source/data/in/coll/ucadata-implicithan.icu
icu4c/source/data/in/coll/ucadata-unihan.icu
icu4c/source/data/unidata/FractionalUCA.txt
icu4c/source/i18n/collation.h
icu4c/source/i18n/collationbuilder.cpp
icu4c/source/i18n/collationbuilder.h
icu4c/source/i18n/collationcompare.cpp
icu4c/source/i18n/collationkeys.cpp
icu4c/source/i18n/collationrootelements.cpp
icu4c/source/i18n/collationweights.cpp
icu4c/source/test/intltest/collationtest.cpp

index de784e2f6fb7c48a5b1e1a00e06ef8faab675761..ce11bb43baae4c2879ad2e4200a51c50aa5e85c6 100644 (file)
  * This value may change in subsequent releases of ICU.
  * @stable ICU 2.4
  */
-#define UCOL_RUNTIME_VERSION 8
+#define UCOL_RUNTIME_VERSION 9
 
 /**
  * Collation builder code version.
index e29e35bf845f34717036226cca300494041b201d..50f6869f37522d6cb2d0ce55f9846b2433c4ce57 100644 (file)
Binary files a/icu4c/source/data/in/coll/ucadata-implicithan.icu and b/icu4c/source/data/in/coll/ucadata-implicithan.icu differ
index 941c5d34a6283f670796ff1e575c3b3005d69b5a..92feef5258d55ac4f4789ceefb452f2a6dfc3233 100644 (file)
Binary files a/icu4c/source/data/in/coll/ucadata-unihan.icu and b/icu4c/source/data/in/coll/ucadata-unihan.icu differ
index e8de4821bb5c982159c63a94ec552d48240593eb..65306a5af8b67b5f80ee894ffc268467ba4d9002 100644 (file)
@@ -47051,7 +47051,7 @@ FDD1 FDD0;      [E4, 05, 05]    # unassigned first primary
 
 # SPECIAL MAX/MIN COLLATION ELEMENTS
 
-FFFE;  [02, 02, 02]    # Special LOWEST primary, for merge/interleaving
+FFFE;  [02, 05, 05]    # Special LOWEST primary, for merge/interleaving
 FFFF;  [EF FF, 05, 05] # Special HIGHEST primary, for ranges
 
 
index 3d2ea8c6a1c9a5839a154a31ba658a0f97fe373a..a840cff40085fa943f0d20f400a7f49294cadb0c 100644 (file)
@@ -29,17 +29,19 @@ public:
     // Special sort key bytes for all levels.
     static const uint8_t TERMINATOR_BYTE = 0;
     static const uint8_t LEVEL_SEPARATOR_BYTE = 1;
+
+    /** The secondary/tertiary lower limit for tailoring before any root elements. */
+    static const uint32_t BEFORE_WEIGHT16 = 0x0100;
+
     /**
      * Merge-sort-key separator.
-     * Must not be used as the lead byte of any CE weight,
-     * nor as primary compression low terminator.
+     * Same as the unique primary and identical-level weights of U+FFFE.
+     * Must not be used as primary compression low terminator.
      * Otherwise usable.
      */
     static const uint8_t MERGE_SEPARATOR_BYTE = 2;
     static const uint32_t MERGE_SEPARATOR_PRIMARY = 0x02000000;  // U+FFFE
-    static const uint32_t MERGE_SEPARATOR_WEIGHT16 = 0x0200;  // U+FFFE
-    static const uint32_t MERGE_SEPARATOR_LOWER32 = 0x02000200;  // U+FFFE
-    static const uint32_t MERGE_SEPARATOR_CE32 = 0x02000202;  // U+FFFE
+    static const uint32_t MERGE_SEPARATOR_CE32 = 0x02000505;  // U+FFFE
 
     /**
      * Primary compression low terminator, must be greater than MERGE_SEPARATOR_BYTE.
index 6e3b482f0958144cabbcbe22348b17c4189900a9..79b50927cb68efc0714b49dff569b1bf416f393a 100644 (file)
@@ -450,8 +450,8 @@ CollationBuilder::addReset(int32_t strength, const UnicodeString &str,
             }
             nodes.setElementAt(node, index);
             int32_t nextIndex = nextIndexFromNode(node);
-            // Insert default nodes with weights 02 and 05, reset to the 02 node.
-            node = nodeFromWeight16(BEFORE_WEIGHT16) | nodeFromStrength(strength);
+            // Insert default nodes with weights 01 and 05, reset to the 01 node.
+            node = nodeFromWeight16(Collation::BEFORE_WEIGHT16) | nodeFromStrength(strength);
             index = insertNodeBetween(index, nextIndex, node, errorCode);
             node = nodeFromWeight16(Collation::COMMON_WEIGHT16) | hasBefore3 |
                     nodeFromStrength(strength);
@@ -961,7 +961,7 @@ CollationBuilder::findCommonNode(int32_t index, int32_t strength) const {
     index = nextIndexFromNode(node);
     node = nodes.elementAti(index);
     U_ASSERT(!isTailoredNode(node) && strengthFromNode(node) == strength &&
-            weight16FromNode(node) == BEFORE_WEIGHT16);
+            weight16FromNode(node) == Collation::BEFORE_WEIGHT16);
     // Skip to the explicit common node.
     do {
         index = nextIndexFromNode(node);
@@ -1398,7 +1398,7 @@ CollationBuilder::makeTailoredCEs(UErrorCode &errorCode) {
                                 // Gap at the beginning of the tertiary CE range.
                                 t = rootElements.getTertiaryBoundary() - 0x100;
                                 tLimit = rootElements.getFirstTertiaryCE() & Collation::ONLY_TERTIARY_MASK;
-                            } else if(t == BEFORE_WEIGHT16) {
+                            } else if(t == Collation::BEFORE_WEIGHT16) {
                                 tLimit = Collation::COMMON_WEIGHT16;
                             } else if(!pIsTailored && !sIsTailored) {
                                 // p and s are root weights.
@@ -1441,7 +1441,7 @@ CollationBuilder::makeTailoredCEs(UErrorCode &errorCode) {
                                     // Gap at the beginning of the secondary CE range.
                                     s = rootElements.getSecondaryBoundary() - 0x100;
                                     sLimit = rootElements.getFirstSecondaryCE() >> 16;
-                                } else if(s == BEFORE_WEIGHT16) {
+                                } else if(s == Collation::BEFORE_WEIGHT16) {
                                     sLimit = Collation::COMMON_WEIGHT16;
                                 } else if(!pIsTailored) {
                                     // p is a root primary.
index df8be2a7c19fc657f10e1af519e9d745afc185a2..0582b1671995a7ac684b87c9450202ef8c6ec438 100644 (file)
@@ -215,9 +215,6 @@ private:
 
     static int32_t ceStrength(int64_t ce);
 
-    /** The secondary/tertiary lower limit for tailoring before the common weight. */
-    static const uint32_t BEFORE_WEIGHT16 = Collation::MERGE_SEPARATOR_WEIGHT16;
-
     /** At most 1M nodes, limited by the 20 bits in node bit fields. */
     static const int32_t MAX_INDEX = 0xfffff;
     /**
index 6f9107e9db516416fdbdec4493f6cd585d9897ac..5d5a4c0866db6fa4e78596a984d7886959c7c78f 100644 (file)
@@ -136,18 +136,17 @@ CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterat
             int32_t rightStart = 0;
             for(;;) {
                 // Find the merge separator or the NO_CE terminator.
+                uint32_t p;
                 int32_t leftLimit = leftStart;
-                uint32_t leftLower32;
-                while((leftLower32 = (uint32_t)left.getCE(leftLimit)) >
-                            Collation::MERGE_SEPARATOR_LOWER32 ||
-                        leftLower32 == 0) {
+                while((p = (uint32_t)(left.getCE(leftLimit) >> 32)) >
+                            Collation::MERGE_SEPARATOR_PRIMARY ||
+                        p == 0) {
                     ++leftLimit;
                 }
                 int32_t rightLimit = rightStart;
-                uint32_t rightLower32;
-                while((rightLower32 = (uint32_t)right.getCE(rightLimit)) >
-                            Collation::MERGE_SEPARATOR_LOWER32 ||
-                        rightLower32 == 0) {
+                while((p = (uint32_t)(right.getCE(rightLimit) >> 32)) >
+                            Collation::MERGE_SEPARATOR_PRIMARY ||
+                        p == 0) {
                     ++rightLimit;
                 }
 
@@ -175,7 +174,7 @@ CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterat
                 // Both strings have the same number of merge separators,
                 // or else there would have been a primary-level difference.
                 U_ASSERT(left.getCE(leftLimit) == right.getCE(rightLimit));
-                if(left.getCE(leftLimit) == Collation::NO_CE) { break; }
+                if(p == Collation::NO_CE_PRIMARY) { break; }
                 // Skip both merge separators and continue.
                 leftStart = leftLimit + 1;
                 rightStart = rightLimit + 1;
@@ -276,20 +275,19 @@ CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterat
 
         if(leftTertiary != rightTertiary) {
             if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
-                // Pass through NO_CE and MERGE_SEPARATOR
-                // and keep real tertiary weights larger than the MERGE_SEPARATOR.
+                // Pass through NO_CE and keep real tertiary weights larger than that.
                 // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut),
                 // to keep tertiary CEs well-formed.
                 // Their case+tertiary weights must be greater than those of
                 // primary and secondary CEs.
-                if(leftTertiary > Collation::MERGE_SEPARATOR_WEIGHT16) {
+                if(leftTertiary > Collation::NO_CE_WEIGHT16) {
                     if(leftLower32 > 0xffff) {
                         leftTertiary ^= 0xc000;
                     } else {
                         leftTertiary += 0x4000;
                     }
                 }
-                if(rightTertiary > Collation::MERGE_SEPARATOR_WEIGHT16) {
+                if(rightTertiary > Collation::NO_CE_WEIGHT16) {
                     if(rightLower32 > 0xffff) {
                         rightTertiary ^= 0xc000;
                     } else {
@@ -316,11 +314,9 @@ CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterat
         do {
             int64_t ce = left.getCE(leftIndex++);
             leftQuaternary = (uint32_t)ce & 0xffff;
-            if(leftQuaternary == 0) {
-                // Variable primary or completely ignorable.
+            if(leftQuaternary <= Collation::NO_CE_WEIGHT16) {
+                // Variable primary or completely ignorable or NO_CE.
                 leftQuaternary = (uint32_t)(ce >> 32);
-            } else if(leftQuaternary <= Collation::MERGE_SEPARATOR_WEIGHT16) {
-                // Leave NO_CE or MERGE_SEPARATOR as is.
             } else {
                 // Regular CE, not tertiary ignorable.
                 // Preserve the quaternary weight in bits 7..6.
@@ -332,11 +328,9 @@ CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterat
         do {
             int64_t ce = right.getCE(rightIndex++);
             rightQuaternary = (uint32_t)ce & 0xffff;
-            if(rightQuaternary == 0) {
-                // Variable primary or completely ignorable.
+            if(rightQuaternary <= Collation::NO_CE_WEIGHT16) {
+                // Variable primary or completely ignorable or NO_CE.
                 rightQuaternary = (uint32_t)(ce >> 32);
-            } else if(rightQuaternary <= Collation::MERGE_SEPARATOR_WEIGHT16) {
-                // Leave NO_CE or MERGE_SEPARATOR as is.
             } else {
                 // Regular CE, not tertiary ignorable.
                 // Preserve the quaternary weight in bits 7..6.
@@ -353,7 +347,7 @@ CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterat
             }
             return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
         }
-        if(leftQuaternary == Collation::NO_CE_WEIGHT16) { break; }
+        if(leftQuaternary == Collation::NO_CE_PRIMARY) { break; }
     }
     return UCOL_EQUAL;
 }
index 6006811377fec607d99b0f7e300033b9d5eaf399..978621a6475251cafc6a08bf5550d79e3469cde6 100644 (file)
@@ -262,7 +262,7 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
     int32_t commonQuaternaries = 0;
 
     uint32_t prevSecondary = 0;
-    UBool anyMergeSeparators = FALSE;
+    int32_t secSegmentStart = 0;
 
     for(;;) {
         // No need to keep all CEs in the buffer when we write a sort key.
@@ -350,7 +350,11 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
             uint32_t s = lower32 >> 16;
             if(s == 0) {
                 // secondary ignorable
-            } else if(s == Collation::COMMON_WEIGHT16) {
+            } else if(s == Collation::COMMON_WEIGHT16 &&
+                    ((options & CollationSettings::BACKWARD_SECONDARY) == 0 ||
+                        p != Collation::MERGE_SEPARATOR_PRIMARY)) {
+                // s is a common secondary weight, and
+                // backwards-secondary is off or the ce is not the merge separator.
                 ++commonSecondaries;
             } else if((options & CollationSettings::BACKWARD_SECONDARY) == 0) {
                 if(commonSecondaries != 0) {
@@ -389,16 +393,28 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
                     }
                     // commonSecondaries == 0
                 }
-                // Reduce separators so that we can look for byte<=1 later.
-                if(s <= Collation::MERGE_SEPARATOR_WEIGHT16) {
-                    if(s == Collation::MERGE_SEPARATOR_WEIGHT16) {
-                        anyMergeSeparators = TRUE;
+                if(0 < p && p <= Collation::MERGE_SEPARATOR_PRIMARY) {
+                    // The backwards secondary level compares secondary weights backwards
+                    // within segments separated by the merge separator (U+FFFE).
+                    uint8_t *secs = secondaries.data();
+                    int32_t last = secondaries.length() - 1;
+                    if(secSegmentStart < last) {
+                        uint8_t *p = secs + secSegmentStart;
+                        uint8_t *q = secs + last;
+                        do {
+                            uint8_t b = *p;
+                            *p++ = *q;
+                            *q-- = b;
+                        } while(p < q);
                     }
-                    secondaries.appendByte((s >> 8) - 1);
+                    secondaries.appendByte(p == Collation::NO_CE_PRIMARY ?
+                        Collation::LEVEL_SEPARATOR_BYTE : Collation::MERGE_SEPARATOR_BYTE);
+                    prevSecondary = 0;
+                    secSegmentStart = secondaries.length();
                 } else {
                     secondaries.appendReverseWeight16(s);
+                    prevSecondary = s;
                 }
-                prevSecondary = s;
             }
         }
 
@@ -411,19 +427,23 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
             } else {
                 uint32_t c = (lower32 >> 8) & 0xff;  // case bits & tertiary lead byte
                 U_ASSERT((c & 0xc0) != 0xc0);
-                if((c & 0xc0) == 0 && c > Collation::MERGE_SEPARATOR_BYTE) {
+                if((c & 0xc0) == 0 && c > Collation::LEVEL_SEPARATOR_BYTE) {
                     ++commonCases;
                 } else {
                     if((options & CollationSettings::UPPER_FIRST) == 0) {
                         // lowerFirst: Compress common weights to nibbles 1..7..13, mixed=14, upper=15.
-                        if(commonCases != 0) {
+                        // If there are only common (=lowest) weights in the whole level,
+                        // then we need not write anything.
+                        // Level length differences are handled already on the next-higher level.
+                        if(commonCases != 0 &&
+                                (c > Collation::LEVEL_SEPARATOR_BYTE || !cases.isEmpty())) {
                             --commonCases;
                             while(commonCases >= CASE_LOWER_FIRST_COMMON_MAX_COUNT) {
                                 cases.appendByte(CASE_LOWER_FIRST_COMMON_MIDDLE << 4);
                                 commonCases -= CASE_LOWER_FIRST_COMMON_MAX_COUNT;
                             }
                             uint32_t b;
-                            if(c <= Collation::MERGE_SEPARATOR_BYTE) {
+                            if(c <= Collation::LEVEL_SEPARATOR_BYTE) {
                                 b = CASE_LOWER_FIRST_COMMON_LOW + commonCases;
                             } else {
                                 b = CASE_LOWER_FIRST_COMMON_HIGH - commonCases;
@@ -431,7 +451,7 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
                             cases.appendByte(b << 4);
                             commonCases = 0;
                         }
-                        if(c > Collation::MERGE_SEPARATOR_BYTE) {
+                        if(c > Collation::LEVEL_SEPARATOR_BYTE) {
                             c = (CASE_LOWER_FIRST_COMMON_HIGH + (c >> 6)) << 4;  // 14 or 15
                         }
                     } else {
@@ -447,11 +467,11 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
                             cases.appendByte((CASE_UPPER_FIRST_COMMON_LOW + commonCases) << 4);
                             commonCases = 0;
                         }
-                        if(c > Collation::MERGE_SEPARATOR_BYTE) {
+                        if(c > Collation::LEVEL_SEPARATOR_BYTE) {
                             c = (CASE_UPPER_FIRST_COMMON_LOW - (c >> 6)) << 4;  // 2 or 1
                         }
                     }
-                    // c is a separator byte 01 or 02,
+                    // c is a separator byte 01,
                     // or a left-shifted nibble 0x10, 0x20, ... 0xf0.
                     cases.appendByte(c);
                 }
@@ -510,14 +530,14 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
                 // Their case+tertiary weights must be greater than those of
                 // primary and secondary CEs.
                 //
-                // Separators    01..02 -> 01..02  (unchanged)
-                // Lowercase     03..04 -> 83..84  (includes uncased)
+                // Separator         01 -> 01      (unchanged)
+                // Lowercase     02..04 -> 82..84  (includes uncased)
                 // Common weight     05 -> 85..C5  (common-weight compression range)
                 // Lowercase     06..3F -> C6..FF
-                // Mixed case    43..7F -> 43..7F
-                // Uppercase     83..BF -> 03..3F
+                // Mixed case    42..7F -> 42..7F
+                // Uppercase     82..BF -> 02..3F
                 // Tertiary CE   86..BF -> C6..FF
-                if(t <= Collation::MERGE_SEPARATOR_WEIGHT16) {
+                if(t <= Collation::NO_CE_WEIGHT16) {
                     // Keep separators unchanged.
                 } else if(lower32 > 0xffff) {
                     // Invert case bits of primary & secondary CEs.
@@ -551,24 +571,22 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
 
         if((levels & Collation::QUATERNARY_LEVEL_FLAG) != 0) {
             uint32_t q = lower32 & 0xffff;
-            if((q & 0xc0) == 0 && q > Collation::MERGE_SEPARATOR_WEIGHT16) {
+            if((q & 0xc0) == 0 && q > Collation::NO_CE_WEIGHT16) {
                 ++commonQuaternaries;
-            } else if(q <= Collation::MERGE_SEPARATOR_WEIGHT16 &&
+            } else if(q == Collation::NO_CE_WEIGHT16 &&
                     (options & CollationSettings::ALTERNATE_MASK) == 0 &&
-                    (quaternaries.isEmpty() ||
-                        quaternaries[quaternaries.length() - 1] == Collation::MERGE_SEPARATOR_BYTE)) {
-                // If alternate=non-ignorable and there are only
-                // common quaternary weights between two separators,
-                // then we need not write anything between these separators.
+                    quaternaries.isEmpty()) {
+                // If alternate=non-ignorable and there are only common quaternary weights,
+                // then we need not write anything.
                 // The only weights greater than the merge separator and less than the common weight
                 // are shifted primary weights, which are not generated for alternate=non-ignorable.
                 // There are also exactly as many quaternary weights as tertiary weights,
                 // so level length differences are handled already on tertiary level.
                 // Any above-common quaternary weight will compare greater regardless.
-                quaternaries.appendByte(q >> 8);
+                quaternaries.appendByte(Collation::LEVEL_SEPARATOR_BYTE);
             } else {
-                if(q <= Collation::MERGE_SEPARATOR_WEIGHT16) {
-                    q >>= 8;
+                if(q == Collation::NO_CE_WEIGHT16) {
+                    q = Collation::LEVEL_SEPARATOR_BYTE;
                 } else {
                     q = 0xfc + ((q >> 6) & 3);
                 }
@@ -602,42 +620,7 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
         if(!callback.needToWrite(Collation::SECONDARY_LEVEL)) { return; }
         ok &= secondaries.isOk();
         sink.Append(Collation::LEVEL_SEPARATOR_BYTE);
-        uint8_t *secs = secondaries.data();
-        int32_t length = secondaries.length() - 1;  // Ignore the trailing NO_CE.
-        if((options & CollationSettings::BACKWARD_SECONDARY) != 0) {
-            // The backwards secondary level compares secondary weights backwards
-            // within segments separated by the merge separator (U+FFFE, weight 02).
-            // The separator weights 01 & 02 were reduced to 00 & 01 so that
-            // we do not accidentally separate at a _second_ weight byte of 02.
-            int32_t start = 0;
-            for(;;) {
-                // Find the merge separator or the NO_CE terminator.
-                int32_t limit;
-                if(anyMergeSeparators) {
-                    limit = start;
-                    while(secs[limit] > 1) { ++limit; }
-                } else {
-                    limit = length;
-                }
-                // Reverse this segment.
-                if(start < limit) {
-                    uint8_t *p = secs + start;
-                    uint8_t *q = secs + limit - 1;
-                    while(p < q) {
-                        uint8_t s = *p;
-                        *p++ = *q;
-                        *q-- = s;
-                    }
-                }
-                // Did we reach the end of the string?
-                if(secs[limit] == 0) { break; }
-                // Restore the merge separator.
-                secs[limit] = 2;
-                // Skip the merge separator and continue.
-                start = limit + 1;
-            }
-        }
-        sink.Append(reinterpret_cast<char *>(secs), length);
+        secondaries.appendTo(sink);
     }
 
     if((levels & Collation::CASE_LEVEL_FLAG) != 0) {
@@ -649,21 +632,12 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
         uint8_t b = 0;
         for(int32_t i = 0; i < length; ++i) {
             uint8_t c = (uint8_t)cases[i];
-            if(c <= Collation::MERGE_SEPARATOR_BYTE) {
-                U_ASSERT(c != 0);
-                if(b != 0) {
-                    sink.Append(b);
-                    b = 0;
-                }
-                sink.Append(c);
+            U_ASSERT((c & 0xf) == 0 && c != 0);
+            if(b == 0) {
+                b = c;
             } else {
-                U_ASSERT((c & 0xf) == 0);
-                if(b == 0) {
-                    b = c;
-                } else {
-                    sink.Append(b | (c >> 4));
-                    b = 0;
-                }
+                sink.Append(b | (c >> 4));
+                b = 0;
             }
         }
         if(b != 0) {
index d59048b75b7c9c3905947e3e751b5bc0aaa618f3..f5b17b9f2cd6b287bf5af7f0ffa29a1e2fbb9999 100644 (file)
@@ -124,7 +124,7 @@ CollationRootElements::getSecondaryBefore(uint32_t p, uint32_t s) const {
         sec = elements[index] >> 16;
     } else {
         index = findPrimary(p) + 1;
-        previousSec = Collation::MERGE_SEPARATOR_WEIGHT16;
+        previousSec = Collation::BEFORE_WEIGHT16;
         sec = Collation::COMMON_WEIGHT16;
     }
     U_ASSERT(s >= sec);
@@ -149,12 +149,12 @@ CollationRootElements::getTertiaryBefore(uint32_t p, uint32_t s, uint32_t t) con
             previousTer = 0;
         } else {
             index = (int32_t)elements[IX_FIRST_SECONDARY_INDEX];
-            previousTer = Collation::MERGE_SEPARATOR_WEIGHT16;
+            previousTer = Collation::BEFORE_WEIGHT16;
         }
         secTer = elements[index] & ~SEC_TER_DELTA_FLAG;
     } else {
         index = findPrimary(p) + 1;
-        previousTer = Collation::MERGE_SEPARATOR_WEIGHT16;
+        previousTer = Collation::BEFORE_WEIGHT16;
         secTer = Collation::COMMON_SEC_AND_TER_CE;
     }
     uint32_t st = (s << 16) | t;
index 17c044f8e92d068d86066f61480b20ed7f17795e..a73c26fa88d34e04bcf0e88b4c04d587ab503756 100644 (file)
@@ -126,7 +126,7 @@ CollationWeights::initForSecondary() {
     maxBytes[1] = 0;
     minBytes[2] = 0;
     maxBytes[2] = 0;
-    minBytes[3] = Collation::MERGE_SEPARATOR_BYTE + 1;
+    minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1;
     maxBytes[3] = 0xff;
     minBytes[4] = 2;
     maxBytes[4] = 0xff;
@@ -142,7 +142,7 @@ CollationWeights::initForTertiary() {
     maxBytes[2] = 0;
     // We use only 6 bits per byte.
     // The other bits are used for case & quaternary weights.
-    minBytes[3] = Collation::MERGE_SEPARATOR_BYTE + 1;
+    minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1;
     maxBytes[3] = 0x3f;
     minBytes[4] = 2;
     maxBytes[4] = 0x3f;
index d80940720007ef09e27e015a896dd2e80ed6bebe..fa14d349c0082c66d324494932d6e1975009dc37 100644 (file)
@@ -114,6 +114,8 @@ private:
     UBool getCollationKey(const char *norm, const UnicodeString &line,
                           const UChar *s, int32_t length,
                           CollationKey &key, IcuTestErrorCode &errorCode);
+    UBool getMergedCollationKey(const UChar *s, int32_t length,
+                                CollationKey &key, IcuTestErrorCode &errorCode);
     UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
                           const UnicodeString &prevString, const UnicodeString &s,
                           UCollationResult expectedOrder, Collation::Level expectedLevel,
@@ -172,11 +174,9 @@ void CollationTest::TestMinMax() {
         return;
     }
     int64_t ce = ces.elementAti(0);
-    int64_t expected =
-        ((int64_t)Collation::MERGE_SEPARATOR_PRIMARY << 32) |
-        Collation::MERGE_SEPARATOR_LOWER32;
+    int64_t expected = Collation::makeCE(Collation::MERGE_SEPARATOR_PRIMARY);
     if(ce != expected) {
-        errln("CE(U+fffe)=%04lx != 02.02.02", (long)ce);
+        errln("CE(U+fffe)=%04lx != 02..", (long)ce);
     }
 
     ce = ces.elementAti(1);
@@ -617,11 +617,8 @@ UBool isValidCE(const CollationRootElements &re, const CollationData &data,
     }
     // Minimum & maximum lead bytes.
     if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) ||
-            (s1 != 0 && s1 <= Collation::MERGE_SEPARATOR_BYTE) ||
-            (t1 != 0 && t1 <= Collation::MERGE_SEPARATOR_BYTE)) {
-        return FALSE;
-    }
-    if(t1 != 0 && t1 > 0x3f) {
+            s1 == Collation::LEVEL_SEPARATOR_BYTE ||
+            t1 == Collation::LEVEL_SEPARATOR_BYTE || t1 > 0x3f) {
         return FALSE;
     }
     if(c > 2) {
@@ -1372,7 +1369,39 @@ UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line
         return FALSE;
     }
 
-    // If s contains U+FFFE, check that merged segments make the same key.
+    // Check that internalNextSortKeyPart() makes the same key, with several part sizes.
+    static const int32_t partSizes[] = { 32, 3, 1 };
+    for(int32_t psi = 0; psi < UPRV_LENGTHOF(partSizes); ++psi) {
+        int32_t partSize = partSizes[psi];
+        CharString parts;
+        if(!getSortKeyParts(s, length, parts, 32, errorCode)) {
+            infoln(fileTestName);
+            errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
+                  norm, (int)partSize, errorCode.errorName());
+            infoln(line);
+            return FALSE;
+        }
+        if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) {
+            infoln(fileTestName);
+            errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)",
+                  norm, (int)partSize);
+            infoln(line);
+            infoln(printCollationKey(key));
+            infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length()));
+            return FALSE;
+        }
+    }
+    return TRUE;
+}
+
+/**
+ * Changes the key to the merged segments of the U+FFFE-separated substrings of s.
+ * Leaves key unchanged if s does not contain U+FFFE.
+ * @return TRUE if the key was successfully changed
+ */
+UBool CollationTest::getMergedCollationKey(const UChar *s, int32_t length,
+                                           CollationKey &key, IcuTestErrorCode &errorCode) {
+    if(errorCode.isFailure()) { return FALSE; }
     LocalMemory<uint8_t> mergedKey;
     int32_t mergedKeyLength = 0;
     int32_t mergedKeyCapacity = 0;
@@ -1382,7 +1411,7 @@ UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line
         if(i == sLength) {
             if(segmentStart == 0) {
                 // s does not contain any U+FFFE.
-                break;
+                return FALSE;
             }
         } else if(s[i] != 0xfffe) {
             ++i;
@@ -1423,41 +1452,7 @@ UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line
         if(i == sLength) { break; }
         segmentStart = ++i;
     }
-    if(segmentStart != 0 &&
-            (mergedKeyLength != keyLength ||
-            uprv_memcmp(mergedKey.getAlias(), keyBytes, keyLength) != 0)) {
-        infoln(fileTestName);
-        errln("Collator(%s).getCollationKey(with U+FFFE) != "
-              "ucol_mergeSortkeys(segments)",
-              norm);
-        infoln(line);
-        infoln(printCollationKey(key));
-        infoln(printSortKey(mergedKey.getAlias(), mergedKeyLength));
-        return FALSE;
-    }
-
-    // Check that internalNextSortKeyPart() makes the same key, with several part sizes.
-    static const int32_t partSizes[] = { 32, 3, 1 };
-    for(int32_t psi = 0; psi < UPRV_LENGTHOF(partSizes); ++psi) {
-        int32_t partSize = partSizes[psi];
-        CharString parts;
-        if(!getSortKeyParts(s, length, parts, 32, errorCode)) {
-            infoln(fileTestName);
-            errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
-                  norm, (int)partSize, errorCode.errorName());
-            infoln(line);
-            return FALSE;
-        }
-        if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) {
-            infoln(fileTestName);
-            errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)",
-                  norm, (int)partSize);
-            infoln(line);
-            infoln(printCollationKey(key));
-            infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length()));
-            return FALSE;
-        }
-    }
+    key = CollationKey(mergedKey.getAlias(), mergedKeyLength);
     return TRUE;
 }
 
@@ -1488,6 +1483,29 @@ const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buf
     return buffer;
 }
 
+int32_t getDifferenceLevel(const CollationKey &prevKey, const CollationKey &key,
+                           UCollationResult order, UBool collHasCaseLevel) {
+    if(order == UCOL_EQUAL) {
+        return Collation::NO_LEVEL;
+    }
+    int32_t prevKeyLength;
+    const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength);
+    int32_t keyLength;
+    const uint8_t *bytes = key.getByteArray(keyLength);
+    int32_t level = Collation::PRIMARY_LEVEL;
+    for(int32_t i = 0;; ++i) {
+        uint8_t b = prevBytes[i];
+        if(b != bytes[i]) { break; }
+        if(b == Collation::LEVEL_SEPARATOR_BYTE) {
+            ++level;
+            if(level == Collation::CASE_LEVEL && !collHasCaseLevel) {
+                ++level;
+            }
+        }
+    }
+    return level;
+}
+
 }
 
 UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
@@ -1649,23 +1667,9 @@ UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prev
         infoln(printCollationKey(key));
         return FALSE;
     }
+    UBool collHasCaseLevel = coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON;
+    int32_t level = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
     if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
-        int32_t prevKeyLength;
-        const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength);
-        int32_t keyLength;
-        const uint8_t *bytes = key.getByteArray(keyLength);
-        int32_t level = Collation::PRIMARY_LEVEL;
-        for(int32_t i = 0;; ++i) {
-            uint8_t b = prevBytes[i];
-            if(b != bytes[i]) { break; }
-            if(b == Collation::LEVEL_SEPARATOR_BYTE) {
-                ++level;
-                if(level == Collation::CASE_LEVEL &&
-                        coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_OFF) {
-                    ++level;
-                }
-            }
-        }
         if(level != expectedLevel) {
             infoln(fileTestName);
             errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d",
@@ -1677,6 +1681,45 @@ UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prev
             return FALSE;
         }
     }
+
+    // If either string contains U+FFFE, then their sort keys must compare the same as
+    // the merged sort keys of each string's between-FFFE segments.
+    //
+    // It is not required that
+    //   sortkey(str1 + "\uFFFE" + str2) == mergeSortkeys(sortkey(str1), sortkey(str2))
+    // only that those two methods yield the same order.
+    //
+    // Use bit-wise OR so that getMergedCollationKey() is always called for both strings.
+    if((getMergedCollationKey(prevString.getBuffer(), prevString.length(), prevKey, errorCode) |
+                getMergedCollationKey(s.getBuffer(), s.length(), key, errorCode)) ||
+            errorCode.isFailure()) {
+        order = prevKey.compareTo(key, errorCode);
+        if(order != expectedOrder || errorCode.isFailure()) {
+            infoln(fileTestName);
+            errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
+                "(previous, current segments between U+FFFE)).compareTo() wrong order: %d != %d (%s)",
+                (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
+            infoln(prevFileLine);
+            infoln(fileLine);
+            infoln(printCollationKey(prevKey));
+            infoln(printCollationKey(key));
+            return FALSE;
+        }
+        int32_t mergedLevel = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
+        if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
+            if(mergedLevel != level) {
+                infoln(fileTestName);
+                errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
+                    "(previous, current segments between U+FFFE)).compareTo()=%d wrong level: %d != %d",
+                    (int)fileLineNumber, norm, order, mergedLevel, level);
+                infoln(prevFileLine);
+                infoln(fileLine);
+                infoln(printCollationKey(prevKey));
+                infoln(printCollationKey(key));
+                return FALSE;
+            }
+        }
+    }
     return TRUE;
 }