From: Markus Scherer Date: Fri, 25 Aug 2017 22:46:12 +0000 (+0000) Subject: ICU-10524 normalization one-way mapping with trailing ccc>1 has no compose-boundary... X-Git-Tag: release-60-rc~173 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=2f87cf4c46e23e2f61af2b9b64b75e16544e985b;p=icu ICU-10524 normalization one-way mapping with trailing ccc>1 has no compose-boundary-after X-SVN-Rev: 40355 --- diff --git a/icu4c/source/common/norm2_nfc_data.h b/icu4c/source/common/norm2_nfc_data.h index a8e33878dfb..8f5c4346db5 100644 --- a/icu4c/source/common/norm2_nfc_data.h +++ b/icu4c/source/common/norm2_nfc_data.h @@ -300,21 +300,21 @@ static const uint16_t norm2_nfc_data_trieIndex[9776]={ 1,1,1,1,0x864,0x198d,1,1,1,1,1,1,0x868,0x1993,1,0x86c, 0x1999,1,1,1,1,1,1,1,0xfc0e,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,0xfe12,1,1,1,0xffcc,0xffb8,0xffcc, -0xffcc,1,1,1,0x29dd,0x29e3,0x29e9,0x29ef,0x29f5,0x29fb,0x2a01,0x2a07,1,1,1,1, +0xffcc,1,1,1,0x29dc,0x29e2,0x29e8,0x29ee,0x29f4,0x29fa,0x2a00,0x2a06,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,0xfe0e,1,0xfc00,1,1,1,1,1, 1,1,1,0x870,1,1,1,0x199f,0x19a5,0xfe12,1,1,1,1,1,1, -1,1,1,0xfc00,1,1,1,1,0x2a0d,0x2a13,1,0x2a19,1,1,1,1, -1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0x2a1f, -1,1,0x2a25,1,1,1,1,1,0xfe0e,1,1,1,1,1,1,1, +1,1,1,0xfc00,1,1,1,1,0x2a0c,0x2a12,1,0x2a18,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0x2a1e, +1,1,0x2a24,1,1,1,1,1,0xfe0e,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,0xfe12,1,1,1,1,1,1, -1,1,1,1,1,0x2a2b,0x2a31,0x2a37,1,1,0x2a3d,1,1,1,1,1, +1,1,1,1,1,0x2a2a,0x2a30,0x2a36,1,1,0x2a3c,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,0xfe0e,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,0xfe12,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0x878, 0x19ab,1,1,0x19b1,0x19b7,0xfe12,1,1,1,1,1,1,1,1,0xfc00,0xfc00, -1,1,1,1,0x2a43,0x2a49,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,0x2a42,0x2a48,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,0x884,1,0x19bd,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,0xfc00,1, @@ -342,7 +342,7 @@ 
static const uint16_t norm2_nfc_data_trieIndex[9776]={ 1,1,1,0x2a4f,1,1,1,1,1,1,1,1,1,0x2a55,1,1, 1,1,0x2a5b,1,1,1,1,0x2a61,1,1,1,1,0x2a67,1,1,1, 1,1,1,1,1,1,1,1,1,0x2a6d,1,1,1,1,1,1, -1,0xff02,0xff04,0x3c40,0xff08,0x3c48,0x2a73,1,0x2a79,1,0xff04,0xff04,0xff04,0xff04,1,1, +1,0xff02,0xff04,0x3c40,0xff08,0x3c48,0x2a72,1,0x2a78,1,0xff04,0xff04,0xff04,0xff04,1,1, 0xff04,0x3c50,0xffcc,0xffcc,0xfe12,1,0xffcc,0xffcc,1,1,1,1,1,1,1,1, 1,1,1,0x2a7f,1,1,1,1,1,1,1,1,1,0x2a85,1,1, 1,1,0x2a8b,1,1,1,1,0x2a91,1,1,1,1,0x2a97,1,1,1, @@ -406,15 +406,15 @@ static const uint16_t norm2_nfc_data_trieIndex[9776]={ 0x21ef,0x21f9,0x2203,0x220d,0x10d8,0x10e6,0x2217,0x2221,0x222b,0x2235,1,1,0x10f4,0x1102,0x223f,0x2249, 0x2253,0x225d,1,1,0x1110,0x1122,0x2267,0x2271,0x227b,0x2285,0x228f,0x2299,1,0x1134,1,0x22a3, 1,0x22ad,1,0x22b7,0x1146,0x115c,0x1174,0x1182,0x1190,0x119e,0x11ac,0x11ba,0x11c6,0x11dc,0x11f4,0x1202, -0x1210,0x121e,0x122c,0x123a,0x1246,0x3b8e,0x22bf,0x3b97,0x1250,0x3b9e,0x22c5,0x3ba7,0x22cb,0x3baf,0x22d1,0x3bb7, +0x1210,0x121e,0x122c,0x123a,0x1246,0x3b8e,0x22bf,0x3b96,0x1250,0x3b9e,0x22c5,0x3ba6,0x22cb,0x3bae,0x22d1,0x3bb6, 0x125a,0x3bbe,1,1,0x22d8,0x22e2,0x22f1,0x2301,0x2311,0x2321,0x2331,0x2341,0x234c,0x2356,0x2365,0x2375, 0x2385,0x2395,0x23a5,0x23b5,0x23c0,0x23ca,0x23d9,0x23e9,0x23f9,0x2409,0x2419,0x2429,0x2434,0x243e,0x244d,0x245d, 0x246d,0x247d,0x248d,0x249d,0x24a8,0x24b2,0x24c1,0x24d1,0x24e1,0x24f1,0x2501,0x2511,0x251c,0x2526,0x2535,0x2545, -0x2555,0x2565,0x2575,0x2585,0x258f,0x2595,0x259d,0x25a4,0x25ad,1,0x1264,0x25b7,0x25bf,0x25c5,0x25cb,0x3bc7, -0x25d0,1,0x2aa2,0x8f0,1,0x25d7,0x25df,0x25e6,0x25ef,1,0x126e,0x25f9,0x2601,0x3bcf,0x2607,0x3bd7, -0x260c,0x2613,0x2619,0x261f,0x2625,0x262b,0x2633,0x3be1,1,1,0x263b,0x2643,0x264b,0x2651,0x2657,0x3beb, -1,0x265d,0x2663,0x2669,0x266f,0x2675,0x267d,0x3bf5,0x2685,0x268b,0x2691,0x2699,0x26a1,0x26a7,0x26ad,0x3bff, -0x26b3,0x26b9,0x3c07,0x2aa7,1,1,0x26c1,0x26c8,0x26d1,1,0x1278,0x26db,0x26e3,0x3c0f,0x26e9,0x3c17, 
+0x2555,0x2565,0x2575,0x2585,0x258f,0x2595,0x259d,0x25a4,0x25ad,1,0x1264,0x25b7,0x25bf,0x25c5,0x25cb,0x3bc6, +0x25d0,1,0x2aa2,0x8f0,1,0x25d7,0x25df,0x25e6,0x25ef,1,0x126e,0x25f9,0x2601,0x3bce,0x2607,0x3bd6, +0x260c,0x2613,0x2619,0x261f,0x2625,0x262b,0x2633,0x3be0,1,1,0x263b,0x2643,0x264b,0x2651,0x2657,0x3bea, +1,0x265d,0x2663,0x2669,0x266f,0x2675,0x267d,0x3bf4,0x2685,0x268b,0x2691,0x2699,0x26a1,0x26a7,0x26ad,0x3bfe, +0x26b3,0x26b9,0x3c06,0x2aa7,1,1,0x26c1,0x26c8,0x26d1,1,0x1278,0x26db,0x26e3,0x3c0e,0x26e9,0x3c16, 0x26ee,0x2aab,0x8fc,1,0xfa09,0xfa09,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,0xffcc,0xffcc,0xfe02,0xfe02,0xffcc,0xffcc,0xffcc,0xffcc,0xfe02,0xfe02,0xfe02,0xffcc, @@ -512,10 +512,10 @@ static const uint16_t norm2_nfc_data_trieIndex[9776]={ 0x311b,0x3009,0x311f,0x3123,0x3127,0x312b,0x312f,0x3011,0x2f09,0x3133,0x3015,0x3137,0x3019,0x313b,0x2ae1,0x313f, 0x3145,0x314b,0x3151,0x3155,0x3159,0x315d,0x3163,0x3169,0x316f,0x3173,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -1,1,1,1,1,1,1,1,1,0x3177,0xfe34,0x317d,1,1,1,1, -1,1,1,1,1,1,0x3183,0x3189,0x3191,0x319b,0x31a3,0x31a9,0x31af,0x31b5,0x31bb,0x31c1, -0x31c7,0x31cd,0x31d3,1,0x31d9,0x31df,0x31e5,0x31eb,0x31f1,1,0x31f7,1,0x31fd,0x3203,1,0x3209, -0x320f,1,0x3215,0x321b,0x3221,0x3227,0x322d,0x3233,0x3239,0x323f,0x3245,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,0x3176,0xfe34,0x317c,1,1,1,1, +1,1,1,1,1,1,0x3182,0x3188,0x3190,0x319a,0x31a2,0x31a8,0x31ae,0x31b4,0x31ba,0x31c0, +0x31c6,0x31cc,0x31d2,1,0x31d8,0x31de,0x31e4,0x31ea,0x31f0,1,0x31f6,1,0x31fc,0x3202,1,0x3208, +0x320e,1,0x3214,0x321a,0x3220,0x3226,0x322c,0x3232,0x3238,0x323e,0x3244,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,0xffcc,0xffcc,0xffcc,0xffcc, 0xffcc,0xffcc,0xffcc,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,0xffcc,0xffcc,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, @@ -560,13 +560,13 @@ static const uint16_t norm2_nfc_data_trieIndex[9776]={ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,0xfe02,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 
-1,1,1,1,1,1,0x324b,0x3255,0x3269,0x3281,0x3299,0x32b1,0x32c9,0xffb0,0xffb0,0xfe02, +1,1,1,1,1,1,0x324a,0x3254,0x3268,0x3280,0x3298,0x32b0,0x32c8,0xffb0,0xffb0,0xfe02, 0xfe02,0xfe02,1,1,1,0xffc4,0xffb0,0xffb0,0xffb0,0xffb0,0xffb0,1,1,1,1,1, 1,1,1,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,1,1,0xffcc,0xffcc,0xffcc, 0xffcc,0xffcc,0xffb8,0xffb8,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,0xffcc,0xffcc,0xffcc,0xffcc,1,1, -1,1,1,1,1,1,1,1,1,1,1,0x32d7,0x32e1,0x32f5,0x330d,0x3325, -0x333d,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,0x32d6,0x32e0,0x32f4,0x330c,0x3324, +0x333c,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,0xffcc,0xffcc,0xffcc,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, diff --git a/icu4c/source/data/in/nfc.nrm b/icu4c/source/data/in/nfc.nrm index d92f277c25d..bb2e35639ad 100644 Binary files a/icu4c/source/data/in/nfc.nrm and b/icu4c/source/data/in/nfc.nrm differ diff --git a/icu4c/source/data/in/nfkc.nrm b/icu4c/source/data/in/nfkc.nrm index 1289141ba58..23411447a90 100644 Binary files a/icu4c/source/data/in/nfkc.nrm and b/icu4c/source/data/in/nfkc.nrm differ diff --git a/icu4c/source/data/in/nfkc_cf.nrm b/icu4c/source/data/in/nfkc_cf.nrm index c6b1e1a8351..275fcaa4f9b 100644 Binary files a/icu4c/source/data/in/nfkc_cf.nrm and b/icu4c/source/data/in/nfkc_cf.nrm differ diff --git a/icu4c/source/data/in/uts46.nrm b/icu4c/source/data/in/uts46.nrm index ee765dec403..562a67b27a4 100644 Binary files a/icu4c/source/data/in/uts46.nrm and b/icu4c/source/data/in/uts46.nrm differ diff --git a/icu4c/source/test/intltest/tstnorm.cpp b/icu4c/source/test/intltest/tstnorm.cpp index 6ca80f9decd..6bc965e16e5 100644 --- a/icu4c/source/test/intltest/tstnorm.cpp +++ b/icu4c/source/test/intltest/tstnorm.cpp @@ -59,6 +59,7 @@ void BasicNormalizerTest::runIndexedTest(int32_t index, UBool exec, TESTCASE_AUTO(TestLowMappingToEmpty_FCD); TESTCASE_AUTO(TestNormalizeIllFormedText); TESTCASE_AUTO(TestComposeJamoTBase); + 
TESTCASE_AUTO(TestComposeBoundaryAfter); TESTCASE_AUTO_END; } @@ -1754,4 +1755,21 @@ BasicNormalizerTest::TestComposeJamoTBase() { assertTrue("isNormalizedUTF8(normalized)", nfkc->isNormalizedUTF8(result8, errorCode)); } +void +BasicNormalizerTest::TestComposeBoundaryAfter() { + IcuTestErrorCode errorCode(*this, "TestComposeBoundaryAfter"); + const Normalizer2 *nfkc = Normalizer2::getNFKCInstance(errorCode); + if(errorCode.logDataIfFailureAndReset("Normalizer2::getNFKCInstance() call failed")) { + return; + } + // U+02DA and U+FB2C do not have compose-boundaries-after. + UnicodeString s(u"\u02DA\u0339 \uFB2C\u05B6"); + UnicodeString expected(u" \u0339\u030A \u05E9\u05B6\u05BC\u05C1"); + UnicodeString result = nfkc->normalize(s, errorCode); + assertSuccess("nfkc", errorCode.get()); + assertEquals("nfkc", expected, result); + assertFalse("U+02DA boundary-after", nfkc->hasBoundaryAfter(0x2DA)); + assertFalse("U+FB2C boundary-after", nfkc->hasBoundaryAfter(0xFB2C)); +} + #endif /* #if !UCONFIG_NO_NORMALIZATION */ diff --git a/icu4c/source/test/intltest/tstnorm.h b/icu4c/source/test/intltest/tstnorm.h index 2891e8c98ee..db7edfbdf44 100644 --- a/icu4c/source/test/intltest/tstnorm.h +++ b/icu4c/source/test/intltest/tstnorm.h @@ -52,6 +52,7 @@ public: void TestLowMappingToEmpty_FCD(); void TestNormalizeIllFormedText(); void TestComposeJamoTBase(); + void TestComposeBoundaryAfter(); private: UnicodeString canonTests[24][3]; diff --git a/icu4c/source/tools/gennorm2/n2builder.cpp b/icu4c/source/tools/gennorm2/n2builder.cpp index b457fe216ae..d3aad1214ce 100644 --- a/icu4c/source/tools/gennorm2/n2builder.cpp +++ b/icu4c/source/tools/gennorm2/n2builder.cpp @@ -209,7 +209,8 @@ void Normalizer2DataBuilder::removeMapping(UChar32 c) { norms.mappingSet.add(c); } -UBool Normalizer2DataBuilder::mappingHasCompBoundaryAfter(const BuilderReorderingBuffer &buffer) const { +UBool Normalizer2DataBuilder::mappingHasCompBoundaryAfter(const BuilderReorderingBuffer &buffer, + Norm::MappingType 
mappingType) const { if(buffer.isEmpty()) { return FALSE; // Maps-to-empty-string is no boundary of any kind. } @@ -217,6 +218,15 @@ UBool Normalizer2DataBuilder::mappingHasCompBoundaryAfter(const BuilderReorderin if(lastStarterIndex<0) { return FALSE; // no starter } + const int32_t lastIndex=buffer.length()-1; + if(mappingType==Norm::ONE_WAY && lastStarterIndex<lastIndex && buffer.ccAt(lastIndex)>1) { + // One-way mapping where after the last starter is at least one combining mark + // with a combining class greater than 1, + // which means that another combining mark can reorder before it. + // By contrast, in a round-trip mapping this does not prevent a boundary as long as + // the starter or composite does not combine-forward with a following combining mark. + return FALSE; + } UChar32 starter=buffer.charAt(lastStarterIndex); if(lastStarterIndex==0 && norms.combinesBack(starter)) { // The last starter is at the beginning of the mapping and combines backward. @@ -227,7 +237,7 @@ UBool Normalizer2DataBuilder::mappingHasCompBoundaryAfter(const BuilderReorderin 0