granicus.if.org Git - icu/commitdiff
ICU-10524 normalization one-way mapping with trailing ccc>1 has no compose-boundary...
author: Markus Scherer <markus.icu@gmail.com>
Fri, 25 Aug 2017 22:46:12 +0000 (22:46 +0000)
committer: Markus Scherer <markus.icu@gmail.com>
Fri, 25 Aug 2017 22:46:12 +0000 (22:46 +0000)
X-SVN-Rev: 40355

icu4c/source/common/norm2_nfc_data.h
icu4c/source/data/in/nfc.nrm
icu4c/source/data/in/nfkc.nrm
icu4c/source/data/in/nfkc_cf.nrm
icu4c/source/data/in/uts46.nrm
icu4c/source/test/intltest/tstnorm.cpp
icu4c/source/test/intltest/tstnorm.h
icu4c/source/tools/gennorm2/n2builder.cpp
icu4c/source/tools/gennorm2/n2builder.h
icu4j/main/shared/data/icudata.jar
icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/BasicTest.java

index a8e33878dfbe817a43ab3f234a1dd1d37b42d763..8f5c4346db5ffe9e6d26bd35f01534936b878593 100644 (file)
@@ -300,21 +300,21 @@ static const uint16_t norm2_nfc_data_trieIndex[9776]={
 1,1,1,1,0x864,0x198d,1,1,1,1,1,1,0x868,0x1993,1,0x86c,
 0x1999,1,1,1,1,1,1,1,0xfc0e,1,1,1,1,1,1,1,
 1,1,1,1,1,1,1,1,1,0xfe12,1,1,1,0xffcc,0xffb8,0xffcc,
-0xffcc,1,1,1,0x29dd,0x29e3,0x29e9,0x29ef,0x29f5,0x29fb,0x2a01,0x2a07,1,1,1,1,
+0xffcc,1,1,1,0x29dc,0x29e2,0x29e8,0x29ee,0x29f4,0x29fa,0x2a00,0x2a06,1,1,1,1,
 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 1,1,1,1,1,1,1,1,0xfe0e,1,0xfc00,1,1,1,1,1,
 1,1,1,0x870,1,1,1,0x199f,0x19a5,0xfe12,1,1,1,1,1,1,
-1,1,1,0xfc00,1,1,1,1,0x2a0d,0x2a13,1,0x2a19,1,1,1,1,
-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0x2a1f,
-1,1,0x2a25,1,1,1,1,1,0xfe0e,1,1,1,1,1,1,1,
+1,1,1,0xfc00,1,1,1,1,0x2a0c,0x2a12,1,0x2a18,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0x2a1e,
+1,1,0x2a24,1,1,1,1,1,0xfe0e,1,1,1,1,1,1,1,
 1,1,1,1,1,1,1,1,1,0xfe12,1,1,1,1,1,1,
-1,1,1,1,1,0x2a2b,0x2a31,0x2a37,1,1,0x2a3d,1,1,1,1,1,
+1,1,1,1,1,0x2a2a,0x2a30,0x2a36,1,1,0x2a3c,1,1,1,1,1,
 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 1,1,1,1,1,1,1,1,0xfe0e,1,1,1,1,1,1,1,
 1,1,1,1,1,1,1,1,1,0xfe12,1,1,1,1,1,1,
 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0x878,
 0x19ab,1,1,0x19b1,0x19b7,0xfe12,1,1,1,1,1,1,1,1,0xfc00,0xfc00,
-1,1,1,1,0x2a43,0x2a49,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,0x2a42,0x2a48,1,1,1,1,1,1,1,1,1,1,
 1,1,1,1,1,1,1,1,1,1,0x884,1,0x19bd,1,1,1,
 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 1,1,1,1,1,1,1,1,1,1,1,1,1,1,0xfc00,1,
@@ -342,7 +342,7 @@ static const uint16_t norm2_nfc_data_trieIndex[9776]={
 1,1,1,0x2a4f,1,1,1,1,1,1,1,1,1,0x2a55,1,1,
 1,1,0x2a5b,1,1,1,1,0x2a61,1,1,1,1,0x2a67,1,1,1,
 1,1,1,1,1,1,1,1,1,0x2a6d,1,1,1,1,1,1,
-1,0xff02,0xff04,0x3c40,0xff08,0x3c48,0x2a73,1,0x2a79,1,0xff04,0xff04,0xff04,0xff04,1,1,
+1,0xff02,0xff04,0x3c40,0xff08,0x3c48,0x2a72,1,0x2a78,1,0xff04,0xff04,0xff04,0xff04,1,1,
 0xff04,0x3c50,0xffcc,0xffcc,0xfe12,1,0xffcc,0xffcc,1,1,1,1,1,1,1,1,
 1,1,1,0x2a7f,1,1,1,1,1,1,1,1,1,0x2a85,1,1,
 1,1,0x2a8b,1,1,1,1,0x2a91,1,1,1,1,0x2a97,1,1,1,
@@ -406,15 +406,15 @@ static const uint16_t norm2_nfc_data_trieIndex[9776]={
 0x21ef,0x21f9,0x2203,0x220d,0x10d8,0x10e6,0x2217,0x2221,0x222b,0x2235,1,1,0x10f4,0x1102,0x223f,0x2249,
 0x2253,0x225d,1,1,0x1110,0x1122,0x2267,0x2271,0x227b,0x2285,0x228f,0x2299,1,0x1134,1,0x22a3,
 1,0x22ad,1,0x22b7,0x1146,0x115c,0x1174,0x1182,0x1190,0x119e,0x11ac,0x11ba,0x11c6,0x11dc,0x11f4,0x1202,
-0x1210,0x121e,0x122c,0x123a,0x1246,0x3b8e,0x22bf,0x3b97,0x1250,0x3b9e,0x22c5,0x3ba7,0x22cb,0x3baf,0x22d1,0x3bb7,
+0x1210,0x121e,0x122c,0x123a,0x1246,0x3b8e,0x22bf,0x3b96,0x1250,0x3b9e,0x22c5,0x3ba6,0x22cb,0x3bae,0x22d1,0x3bb6,
 0x125a,0x3bbe,1,1,0x22d8,0x22e2,0x22f1,0x2301,0x2311,0x2321,0x2331,0x2341,0x234c,0x2356,0x2365,0x2375,
 0x2385,0x2395,0x23a5,0x23b5,0x23c0,0x23ca,0x23d9,0x23e9,0x23f9,0x2409,0x2419,0x2429,0x2434,0x243e,0x244d,0x245d,
 0x246d,0x247d,0x248d,0x249d,0x24a8,0x24b2,0x24c1,0x24d1,0x24e1,0x24f1,0x2501,0x2511,0x251c,0x2526,0x2535,0x2545,
-0x2555,0x2565,0x2575,0x2585,0x258f,0x2595,0x259d,0x25a4,0x25ad,1,0x1264,0x25b7,0x25bf,0x25c5,0x25cb,0x3bc7,
-0x25d0,1,0x2aa2,0x8f0,1,0x25d7,0x25df,0x25e6,0x25ef,1,0x126e,0x25f9,0x2601,0x3bcf,0x2607,0x3bd7,
-0x260c,0x2613,0x2619,0x261f,0x2625,0x262b,0x2633,0x3be1,1,1,0x263b,0x2643,0x264b,0x2651,0x2657,0x3beb,
-1,0x265d,0x2663,0x2669,0x266f,0x2675,0x267d,0x3bf5,0x2685,0x268b,0x2691,0x2699,0x26a1,0x26a7,0x26ad,0x3bff,
-0x26b3,0x26b9,0x3c07,0x2aa7,1,1,0x26c1,0x26c8,0x26d1,1,0x1278,0x26db,0x26e3,0x3c0f,0x26e9,0x3c17,
+0x2555,0x2565,0x2575,0x2585,0x258f,0x2595,0x259d,0x25a4,0x25ad,1,0x1264,0x25b7,0x25bf,0x25c5,0x25cb,0x3bc6,
+0x25d0,1,0x2aa2,0x8f0,1,0x25d7,0x25df,0x25e6,0x25ef,1,0x126e,0x25f9,0x2601,0x3bce,0x2607,0x3bd6,
+0x260c,0x2613,0x2619,0x261f,0x2625,0x262b,0x2633,0x3be0,1,1,0x263b,0x2643,0x264b,0x2651,0x2657,0x3bea,
+1,0x265d,0x2663,0x2669,0x266f,0x2675,0x267d,0x3bf4,0x2685,0x268b,0x2691,0x2699,0x26a1,0x26a7,0x26ad,0x3bfe,
+0x26b3,0x26b9,0x3c06,0x2aa7,1,1,0x26c1,0x26c8,0x26d1,1,0x1278,0x26db,0x26e3,0x3c0e,0x26e9,0x3c16,
 0x26ee,0x2aab,0x8fc,1,0xfa09,0xfa09,1,1,1,1,1,1,1,1,1,1,
 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 1,1,1,1,0xffcc,0xffcc,0xfe02,0xfe02,0xffcc,0xffcc,0xffcc,0xffcc,0xfe02,0xfe02,0xfe02,0xffcc,
@@ -512,10 +512,10 @@ static const uint16_t norm2_nfc_data_trieIndex[9776]={
 0x311b,0x3009,0x311f,0x3123,0x3127,0x312b,0x312f,0x3011,0x2f09,0x3133,0x3015,0x3137,0x3019,0x313b,0x2ae1,0x313f,
 0x3145,0x314b,0x3151,0x3155,0x3159,0x315d,0x3163,0x3169,0x316f,0x3173,1,1,1,1,1,1,
 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-1,1,1,1,1,1,1,1,1,0x3177,0xfe34,0x317d,1,1,1,1,
-1,1,1,1,1,1,0x3183,0x3189,0x3191,0x319b,0x31a3,0x31a9,0x31af,0x31b5,0x31bb,0x31c1,
-0x31c7,0x31cd,0x31d3,1,0x31d9,0x31df,0x31e5,0x31eb,0x31f1,1,0x31f7,1,0x31fd,0x3203,1,0x3209,
-0x320f,1,0x3215,0x321b,0x3221,0x3227,0x322d,0x3233,0x3239,0x323f,0x3245,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,0x3176,0xfe34,0x317c,1,1,1,1,
+1,1,1,1,1,1,0x3182,0x3188,0x3190,0x319a,0x31a2,0x31a8,0x31ae,0x31b4,0x31ba,0x31c0,
+0x31c6,0x31cc,0x31d2,1,0x31d8,0x31de,0x31e4,0x31ea,0x31f0,1,0x31f6,1,0x31fc,0x3202,1,0x3208,
+0x320e,1,0x3214,0x321a,0x3220,0x3226,0x322c,0x3232,0x3238,0x323e,0x3244,1,1,1,1,1,
 1,1,1,1,1,1,1,1,1,1,1,1,0xffcc,0xffcc,0xffcc,0xffcc,
 0xffcc,0xffcc,0xffcc,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,0xffcc,0xffcc,1,1,1,1,
 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
@@ -560,13 +560,13 @@ static const uint16_t norm2_nfc_data_trieIndex[9776]={
 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 1,1,1,1,1,1,0xfe02,1,1,1,1,1,1,1,1,1,
 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-1,1,1,1,1,1,0x324b,0x3255,0x3269,0x3281,0x3299,0x32b1,0x32c9,0xffb0,0xffb0,0xfe02,
+1,1,1,1,1,1,0x324a,0x3254,0x3268,0x3280,0x3298,0x32b0,0x32c8,0xffb0,0xffb0,0xfe02,
 0xfe02,0xfe02,1,1,1,0xffc4,0xffb0,0xffb0,0xffb0,0xffb0,0xffb0,1,1,1,1,1,
 1,1,1,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,1,1,0xffcc,0xffcc,0xffcc,
 0xffcc,0xffcc,0xffb8,0xffb8,1,1,1,1,1,1,1,1,1,1,1,1,
 1,1,1,1,1,1,1,1,1,1,0xffcc,0xffcc,0xffcc,0xffcc,1,1,
-1,1,1,1,1,1,1,1,1,1,1,0x32d7,0x32e1,0x32f5,0x330d,0x3325,
-0x333d,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,0x32d6,0x32e0,0x32f4,0x330c,0x3324,
+0x333c,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 1,1,0xffcc,0xffcc,0xffcc,1,1,1,1,1,1,1,1,1,1,1,
 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
index d92f277c25df9f5f9f054daf0f81b35be0d328a5..bb2e35639ad5bcfca4b5eca2c6d67c9f78d39520 100644 (file)
Binary files a/icu4c/source/data/in/nfc.nrm and b/icu4c/source/data/in/nfc.nrm differ
index 1289141ba583b7bcf8c3fa5697e93c6f9b9dce25..23411447a90e00cd3b7e429acec5df8f313ecb4d 100644 (file)
Binary files a/icu4c/source/data/in/nfkc.nrm and b/icu4c/source/data/in/nfkc.nrm differ
index c6b1e1a8351d810ca381af56fd0a272a91052a1a..275fcaa4f9b3266e04c41250b592f64dfee27ec3 100644 (file)
Binary files a/icu4c/source/data/in/nfkc_cf.nrm and b/icu4c/source/data/in/nfkc_cf.nrm differ
index ee765dec403d717c4118aebc1094c6d165d85216..562a67b27a4c60d07903fd4273a9b3c9be993c0c 100644 (file)
Binary files a/icu4c/source/data/in/uts46.nrm and b/icu4c/source/data/in/uts46.nrm differ
index 6ca80f9decd7525b3cd4aa809560f105ece429ec..6bc965e16e57486d60b7df245a4057dca2e3f468 100644 (file)
@@ -59,6 +59,7 @@ void BasicNormalizerTest::runIndexedTest(int32_t index, UBool exec,
     TESTCASE_AUTO(TestLowMappingToEmpty_FCD);
     TESTCASE_AUTO(TestNormalizeIllFormedText);
     TESTCASE_AUTO(TestComposeJamoTBase);
+    TESTCASE_AUTO(TestComposeBoundaryAfter);
     TESTCASE_AUTO_END;
 }
 
@@ -1754,4 +1755,21 @@ BasicNormalizerTest::TestComposeJamoTBase() {
     assertTrue("isNormalizedUTF8(normalized)", nfkc->isNormalizedUTF8(result8, errorCode));
 }
 
+void
+BasicNormalizerTest::TestComposeBoundaryAfter() {
+    IcuTestErrorCode errorCode(*this, "TestComposeBoundaryAfter");
+    const Normalizer2 *nfkc = Normalizer2::getNFKCInstance(errorCode);
+    if(errorCode.logDataIfFailureAndReset("Normalizer2::getNFKCInstance() call failed")) {
+        return;
+    }
+    // U+02DA and U+FB2C do not have compose-boundaries-after.
+    UnicodeString s(u"\u02DA\u0339 \uFB2C\u05B6");
+    UnicodeString expected(u" \u0339\u030A \u05E9\u05B6\u05BC\u05C1");
+    UnicodeString result = nfkc->normalize(s, errorCode);
+    assertSuccess("nfkc", errorCode.get());
+    assertEquals("nfkc", expected, result);
+    assertFalse("U+02DA boundary-after", nfkc->hasBoundaryAfter(0x2DA));
+    assertFalse("U+FB2C boundary-after", nfkc->hasBoundaryAfter(0xFB2C));
+}
+
 #endif /* #if !UCONFIG_NO_NORMALIZATION */
index 2891e8c98ee6855207ea9fe703d8d1baf4a783e7..db7edfbdf446151c27a1caf130c1b3f9542f40f0 100644 (file)
@@ -52,6 +52,7 @@ public:
     void TestLowMappingToEmpty_FCD();
     void TestNormalizeIllFormedText();
     void TestComposeJamoTBase();
+    void TestComposeBoundaryAfter();
 
 private:
     UnicodeString canonTests[24][3];
index b457fe216aeb0e1530f0a9d417f1ccbbaf4fb2c0..d3aad1214ce26867f0292019b5ed48f3f7373b3c 100644 (file)
@@ -209,7 +209,8 @@ void Normalizer2DataBuilder::removeMapping(UChar32 c) {
     norms.mappingSet.add(c);
 }
 
-UBool Normalizer2DataBuilder::mappingHasCompBoundaryAfter(const BuilderReorderingBuffer &buffer) const {
+UBool Normalizer2DataBuilder::mappingHasCompBoundaryAfter(const BuilderReorderingBuffer &buffer,
+                                                          Norm::MappingType mappingType) const {
     if(buffer.isEmpty()) {
         return FALSE;  // Maps-to-empty-string is no boundary of any kind.
     }
@@ -217,6 +218,15 @@ UBool Normalizer2DataBuilder::mappingHasCompBoundaryAfter(const BuilderReorderin
     if(lastStarterIndex<0) {
         return FALSE;  // no starter
     }
+    const int32_t lastIndex=buffer.length()-1;
+    if(mappingType==Norm::ONE_WAY && lastStarterIndex<lastIndex && buffer.ccAt(lastIndex)>1) {
+        // One-way mapping where after the last starter is at least one combining mark
+        // with a combining class greater than 1,
+        // which means that another combining mark can reorder before it.
+        // By contrast, in a round-trip mapping this does not prevent a boundary as long as
+        // the starter or composite does not combine-forward with a following combining mark.
+        return FALSE;
+    }
     UChar32 starter=buffer.charAt(lastStarterIndex);
     if(lastStarterIndex==0 && norms.combinesBack(starter)) {
         // The last starter is at the beginning of the mapping and combines backward.
@@ -227,7 +237,7 @@ UBool Normalizer2DataBuilder::mappingHasCompBoundaryAfter(const BuilderReorderin
             0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1)))) {
         // A Jamo leading consonant or an LV pair combines-forward if it is at the end,
         // otherwise it is blocked.
-        return lastStarterIndex!=buffer.length()-1;
+        return lastStarterIndex!=lastIndex;
     }
     // Note: There can be no Hangul syllable in the fully decomposed mapping.
 
@@ -344,7 +354,7 @@ void Normalizer2DataBuilder::postProcess(Norm &norm) {
         norm.hasCompBoundaryBefore=
             !buffer.isEmpty() && norm.leadCC==0 && !norms.combinesBack(buffer.charAt(0));
         norm.hasCompBoundaryAfter=
-            norm.compositions==nullptr && mappingHasCompBoundaryAfter(buffer);
+            norm.compositions==nullptr && mappingHasCompBoundaryAfter(buffer, norm.mappingType);
 
         if(norm.combinesBack) {
             norm.error="combines-back and decomposes, not possible in Unicode normalization";
index eb92bf382f3539dfb7cb65576145218342bd9455..61b36be00448cd6d6b6cc7670e7b211422c1fbe7 100644 (file)
@@ -84,7 +84,8 @@ private:
      * or its mapping contains no starter,
      * or the last starter combines-forward.
      */
-    UBool mappingHasCompBoundaryAfter(const BuilderReorderingBuffer &buffer) const;
+    UBool mappingHasCompBoundaryAfter(const BuilderReorderingBuffer &buffer,
+                                      Norm::MappingType mappingType) const;
     /** Returns TRUE if the mapping by itself recomposes, that is, it is not comp-normalized. */
     UBool mappingRecomposes(const BuilderReorderingBuffer &buffer) const;
     void postProcess(Norm &norm);
index 17b32c5fb6eb9424fbcc0afe527f7dfc1afa138f..30a878de14210d91d297ec64bd959a09b73e4887 100755 (executable)
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:193787da8cd2caebf1901892beccad07f8e7f3c714ef482681784bc583be5c60
-size 12226288
+oid sha256:36b9c089da215705724fac836c41e0db1922f8ea85bf734c601f70fc623187ad
+size 12226253
index 834d0bcc3f47970edfca42920cfec7143848ee5d..107ee60eed9b9a728e0080ed702bb040af6b8da2 100644 (file)
@@ -2867,6 +2867,18 @@ public class BasicTest extends TestFmwk {
         assertTrue("isNormalized(normalized)", nfkc.isNormalized(result));
     }
 
+    @Test
+    public void TestComposeBoundaryAfter() {
+        Normalizer2 nfkc = Normalizer2.getNFKCInstance();
+        // U+02DA and U+FB2C do not have compose-boundaries-after.
+        String s = "\u02DA\u0339 \uFB2C\u05B6";
+        String expected = " \u0339\u030A \u05E9\u05B6\u05BC\u05C1";
+        String result = nfkc.normalize(s);
+        assertEquals("nfkc", expected, result);
+        assertFalse("U+02DA boundary-after", nfkc.hasBoundaryAfter(0x2DA));
+        assertFalse("U+FB2C boundary-after", nfkc.hasBoundaryAfter(0xFB2C));
+    }
+
     @Test
     public void TestNFC() {
         // Coverage tests.