From: Markus Scherer Date: Thu, 30 Jun 2011 23:22:17 +0000 (+0000) Subject: ICU-8606 add Normalizer2.getCombiningClass(c) X-Git-Tag: milestone-59-0-1~4695 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=4abbf7161ae580f06076b50ded461e7e4ef2af08;p=icu ICU-8606 add Normalizer2.getCombiningClass(c) X-SVN-Rev: 30263 --- diff --git a/icu4c/source/common/filterednormalizer2.cpp b/icu4c/source/common/filterednormalizer2.cpp index a23a459c794..9ca56bf7cce 100644 --- a/icu4c/source/common/filterednormalizer2.cpp +++ b/icu4c/source/common/filterednormalizer2.cpp @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 2009-2010, International Business Machines +* Copyright (C) 2009-2011, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -153,6 +153,11 @@ FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) c return set.contains(c) && norm2.getDecomposition(c, decomposition); } +uint8_t +FilteredNormalizer2::getCombiningClass(UChar32 c) const { + return set.contains(c) ? 
norm2.getCombiningClass(c) : 0; +} + UBool FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const { uprv_checkCanGetBuffer(s, errorCode); diff --git a/icu4c/source/common/normalizer2.cpp b/icu4c/source/common/normalizer2.cpp index 93f074f4972..92cc7a7a7d2 100644 --- a/icu4c/source/common/normalizer2.cpp +++ b/icu4c/source/common/normalizer2.cpp @@ -189,6 +189,11 @@ public: return TRUE; } + virtual uint8_t + getCombiningClass(UChar32 c) const { + return impl.getCC(impl.getNorm16(c)); + } + // quick checks virtual UBool isNormalized(const UnicodeString &s, UErrorCode &errorCode) const { @@ -632,6 +637,11 @@ Normalizer2::getInstance(const char *packageName, return NULL; } +uint8_t +Normalizer2::getCombiningClass(UChar32 /*c*/) const { + return 0; +} + UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(Normalizer2) U_NAMESPACE_END @@ -782,6 +792,11 @@ unorm2_getDecomposition(const UNormalizer2 *norm2, } } +U_DRAFT uint8_t U_EXPORT2 +unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c) { + return reinterpret_cast<const Normalizer2 *>(norm2)->getCombiningClass(c); +} + U_DRAFT UBool U_EXPORT2 unorm2_isNormalized(const UNormalizer2 *norm2, const UChar *s, int32_t length, @@ -847,9 +862,9 @@ unorm2_isInert(const UNormalizer2 *norm2, UChar32 c) { U_CAPI uint8_t U_EXPORT2 u_getCombiningClass(UChar32 c) { UErrorCode errorCode=U_ZERO_ERROR; - const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode); + const Normalizer2 *nfd=Normalizer2Factory::getNFDInstance(errorCode); if(U_SUCCESS(errorCode)) { - return impl->getCC(impl->getNorm16(c)); + return nfd->getCombiningClass(c); } else { return 0; } diff --git a/icu4c/source/common/unicode/normalizer2.h b/icu4c/source/common/unicode/normalizer2.h index 98058dc3eae..1486ca881ca 100644 --- a/icu4c/source/common/unicode/normalizer2.h +++ b/icu4c/source/common/unicode/normalizer2.h @@ -190,6 +190,17 @@ public: virtual UBool getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0; + /** + * Gets the 
combining class of c. + * The default implementation returns 0 + * but all standard implementations return the Unicode Canonical_Combining_Class value. + * @param c code point + * @return c's combining class + * @draft ICU 49 + */ + virtual uint8_t + getCombiningClass(UChar32 c) const; + /** * Tests if the string is normalized. * Internally, in cases where the quickCheck() method would return "maybe" @@ -394,6 +405,17 @@ public: virtual UBool getDecomposition(UChar32 c, UnicodeString &decomposition) const; + /** + * Gets the combining class of c. + * The default implementation returns 0 + * but all standard implementations return the Unicode Canonical_Combining_Class value. + * @param c code point + * @return c's combining class + * @draft ICU 49 + */ + virtual uint8_t + getCombiningClass(UChar32 c) const; + /** * Tests if the string is normalized. * For details see the Normalizer2 base class documentation. diff --git a/icu4c/source/common/unicode/unorm2.h b/icu4c/source/common/unicode/unorm2.h index a522b4735a8..c7e1a4845d7 100644 --- a/icu4c/source/common/unicode/unorm2.h +++ b/icu4c/source/common/unicode/unorm2.h @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 2009-2010, International Business Machines +* Copyright (C) 2009-2011, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -280,6 +280,18 @@ unorm2_getDecomposition(const UNormalizer2 *norm2, UChar32 c, UChar *decomposition, int32_t capacity, UErrorCode *pErrorCode); +/** + * Gets the combining class of c. + * The default implementation returns 0 + * but all standard implementations return the Unicode Canonical_Combining_Class value. 
+ * @param norm2 UNormalizer2 instance + * @param c code point + * @return c's combining class + * @draft ICU 49 + */ +U_DRAFT uint8_t U_EXPORT2 +unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c); + /** * Tests if the string is normalized. * Internally, in cases where the quickCheck() method would return "maybe" diff --git a/icu4c/source/common/uts46.cpp b/icu4c/source/common/uts46.cpp index 59792e5e6aa..682f9ed4bdb 100644 --- a/icu4c/source/common/uts46.cpp +++ b/icu4c/source/common/uts46.cpp @@ -1120,7 +1120,7 @@ UTS46::isLabelOkContextJ(const UChar *label, int32_t labelLength) const { UChar32 c; int32_t j=i; U16_PREV_UNSAFE(label, j, c); - if(u_getCombiningClass(c)==9) { + if(uts46Norm2.getCombiningClass(c)==9) { continue; } // check precontext (Joining_Type:{L,D})(Joining_Type:T)* @@ -1163,7 +1163,7 @@ UTS46::isLabelOkContextJ(const UChar *label, int32_t labelLength) const { UChar32 c; int32_t j=i; U16_PREV_UNSAFE(label, j, c); - if(u_getCombiningClass(c)!=9) { + if(uts46Norm2.getCombiningClass(c)!=9) { return FALSE; } } diff --git a/icu4c/source/test/cintltst/cucdtst.c b/icu4c/source/test/cintltst/cucdtst.c index 44e9c858db7..fc1ba351ef7 100644 --- a/icu4c/source/test/cintltst/cucdtst.c +++ b/icu4c/source/test/cintltst/cucdtst.c @@ -925,6 +925,14 @@ static void TestIdentifier() } /* for each line of UnicodeData.txt, check some of the properties */ +typedef struct UnicodeDataContext { +#if UCONFIG_NO_NORMALIZATION + const void *dummy; +#else + const UNormalizer2 *nfkc; +#endif +} UnicodeDataContext; + /* * ### TODO * This test fails incorrectly if the First or Last code point of a repetitive area @@ -950,6 +958,10 @@ unicodeDataLineFn(void *context, int32_t i; int8_t type; +#if !UCONFIG_NO_NORMALIZATION + const UNormalizer2 *nfkc; +#endif + /* get the character code, field 0 */ c=strtoul(fields[0][0], &end, 16); if(end<=fields[0][0] || end!=fields[0][1]) { @@ -985,6 +997,10 @@ unicodeDataLineFn(void *context, if(value!=u_getCombiningClass(c) || 
value!=(uint32_t)u_getIntPropertyValue(c, UCHAR_CANONICAL_COMBINING_CLASS)) { log_err("error: u_getCombiningClass(U+%04lx)==%hu instead of %lu\n", c, u_getCombiningClass(c), value); } + nfkc=((UnicodeDataContext *)context)->nfkc; + if(value!=unorm2_getCombiningClass(nfkc, c)) { + log_err("error: unorm2_getCombiningClass(nfkc, U+%04lx)==%hu instead of %lu\n", c, unorm2_getCombiningClass(nfkc, c), value); + } #endif /* get BiDi category, field 4 */ @@ -1191,6 +1207,8 @@ static void TestUnicodeData() UChar32 c; int8_t type; + UnicodeDataContext context; + u_versionFromString(expectVersionArray, U_UNICODE_VERSION); u_getUnicodeVersion(versionArray); if(memcmp(versionArray, expectVersionArray, U_MAX_VERSION_LENGTH) != 0) @@ -1212,7 +1230,14 @@ static void TestUnicodeData() } errorCode=U_ZERO_ERROR; - parseUCDFile("UnicodeData.txt", fields, 15, unicodeDataLineFn, NULL, &errorCode); +#if !UCONFIG_NO_NORMALIZATION + context.nfkc=unorm2_getInstance(NULL, "nfkc", UNORM2_COMPOSE, &errorCode); + if(U_FAILURE(errorCode)) { + log_data_err("error: unable to open an NFKC UNormalizer2 - %s\n", u_errorName(errorCode)); + return; + } +#endif + parseUCDFile("UnicodeData.txt", fields, 15, unicodeDataLineFn, &context, &errorCode); if(U_FAILURE(errorCode)) { return; /* if we couldn't parse UnicodeData.txt, we should return */ } diff --git a/icu4c/source/test/intltest/tstnorm.cpp b/icu4c/source/test/intltest/tstnorm.cpp index 8b6391c19c7..d0aeaac746e 100644 --- a/icu4c/source/test/intltest/tstnorm.cpp +++ b/icu4c/source/test/intltest/tstnorm.cpp @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 1997-2010, International Business Machines Corporation and + * Copyright (c) 1997-2011, International Business Machines Corporation and * others. All Rights Reserved. 
********************************************************************/ @@ -1458,9 +1458,7 @@ BasicNormalizerTest::TestFilteredNormalizer2Coverage() { dataerrln("Normalizer2Factory::getNFCInstance() call failed - %s", u_errorName(status)); return; } - UnicodeSet filter(UNICODE_STRING_SIMPLE("[^\\u00a0-\\u00ff]"), errorCode); - UnicodeString newString1 = UNICODE_STRING_SIMPLE("[^\\u0100-\\u01ff]"); - UnicodeString newString2 = UNICODE_STRING_SIMPLE("[^\\u0200-\\u02ff]"); + UnicodeSet filter(UNICODE_STRING_SIMPLE("[^\\u00a0-\\u00ff\\u0310-\\u031f]"), errorCode); FilteredNormalizer2 fn2(*nfcNorm2, filter); UChar32 char32 = 0x0054; @@ -1473,6 +1471,20 @@ BasicNormalizerTest::TestFilteredNormalizer2Coverage() { errln("FilteredNormalizer2.hasBoundaryAfter() failed."); } + UChar32 c; + for(c=0; c<=0x3ff; ++c) { + uint8_t expectedCC= filter.contains(c) ? nfcNorm2->getCombiningClass(c) : 0; + uint8_t cc=fn2.getCombiningClass(c); + if(cc!=expectedCC) { + errln( + UnicodeString("FilteredNormalizer2(NFC, ^A0-FF,310-31F).getCombiningClass(U+")+ + hex(c)+ + ")==filtered NFC.getCC()"); + } + } + + UnicodeString newString1 = UNICODE_STRING_SIMPLE("[^\\u0100-\\u01ff]"); + UnicodeString newString2 = UNICODE_STRING_SIMPLE("[^\\u0200-\\u02ff]"); fn2.append(newString1, newString2, errorCode); if (U_FAILURE(errorCode)) { errln("FilteredNormalizer2.append() failed.");