ICU-8606 add Normalizer2.getCombiningClass(c)

author Markus Scherer <markus.icu@gmail.com>

Thu, 30 Jun 2011 23:22:17 +0000 (23:22 +0000)

committer Markus Scherer <markus.icu@gmail.com>

Thu, 30 Jun 2011 23:22:17 +0000 (23:22 +0000)
author Markus Scherer <markus.icu@gmail.com>
Thu, 30 Jun 2011 23:22:17 +0000 (23:22 +0000)
committer Markus Scherer <markus.icu@gmail.com>
Thu, 30 Jun 2011 23:22:17 +0000 (23:22 +0000)
diff --git a/icu4c/source/common/filterednormalizer2.cpp b/icu4c/source/common/filterednormalizer2.cpp

index a23a459c794d92d314b5770f3a02ba103c53d653..9ca56bf7ccedbd8b5351fd207846533d1b59bb03 100644 (file)
--- a/icu4c/source/common/filterednormalizer2.cpp
+++ b/icu4c/source/common/filterednormalizer2.cpp
@@ -1,7 +1,7 @@
  /*
  *******************************************************************************
  *
-*   Copyright (C) 2009-2010, International Business Machines
+*   Copyright (C) 2009-2011, International Business Machines
  *   Corporation and others.  All Rights Reserved.
  *
  *******************************************************************************
@@ -153,6 +153,11 @@ FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) c
      return set.contains(c) && norm2.getDecomposition(c, decomposition);
  }
  
+uint8_t
+FilteredNormalizer2::getCombiningClass(UChar32 c) const {
+    return set.contains(c) ? norm2.getCombiningClass(c) : 0;  // code points outside the filter set report 0
+}
+
  UBool
  FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
      uprv_checkCanGetBuffer(s, errorCode);
diff --git a/icu4c/source/common/normalizer2.cpp b/icu4c/source/common/normalizer2.cpp

index 93f074f4972cf3a80c34bc505de0db2d582ef699..92cc7a7a7d259e6232161037aee88bffac4acba6 100644 (file)
--- a/icu4c/source/common/normalizer2.cpp
+++ b/icu4c/source/common/normalizer2.cpp
@@ -189,6 +189,11 @@ public:
          return TRUE;
      }
  
+    virtual uint8_t
+    getCombiningClass(UChar32 c) const {
+        return impl.getCC(impl.getNorm16(c));  // fetch c's norm16 value, then let impl derive the combining class from it
+    }
+
      // quick checks
      virtual UBool
      isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
@@ -632,6 +637,11 @@ Normalizer2::getInstance(const char *packageName,
      return NULL;
  }
  
+uint8_t
+Normalizer2::getCombiningClass(UChar32 /*c*/) const {
+    return 0;  // base-class default: ignores c; subclasses override this virtual to return real values
+}
+
  UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(Normalizer2)
  
  U_NAMESPACE_END
@@ -782,6 +792,11 @@ unorm2_getDecomposition(const UNormalizer2 *norm2,
      }
  }
  
+U_DRAFT uint8_t U_EXPORT2
+unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c) {
+    return reinterpret_cast<const Normalizer2 *>(norm2)->getCombiningClass(c);  // C handle is a Normalizer2; no NULL check on norm2
+}
+
  U_DRAFT UBool U_EXPORT2
  unorm2_isNormalized(const UNormalizer2 *norm2,
                      const UChar *s, int32_t length,
@@ -847,9 +862,9 @@ unorm2_isInert(const UNormalizer2 *norm2, UChar32 c) {
  U_CAPI uint8_t U_EXPORT2
  u_getCombiningClass(UChar32 c) {
      UErrorCode errorCode=U_ZERO_ERROR;
-    const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
+    const Normalizer2 *nfd=Normalizer2Factory::getNFDInstance(errorCode);
      if(U_SUCCESS(errorCode)) {
-        return impl->getCC(impl->getNorm16(c));
+        return nfd->getCombiningClass(c);
      } else {
          return 0;
      }
diff --git a/icu4c/source/common/unicode/normalizer2.h b/icu4c/source/common/unicode/normalizer2.h

index 98058dc3eae6a4bd62b64dbc113c5a176720d8a6..1486ca881cafd1e358c3175403849587d4b998e7 100644 (file)
--- a/icu4c/source/common/unicode/normalizer2.h
+++ b/icu4c/source/common/unicode/normalizer2.h
@@ -190,6 +190,17 @@ public:
      virtual UBool
      getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
  
+    /**
+     * Gets the combining class of c.
+     * The default implementation returns 0
+     * but all standard implementations return the Unicode Canonical_Combining_Class value.
+     * @param c code point
+     * @return c's combining class
+     * @draft ICU 49
+     */
+    virtual uint8_t
+    getCombiningClass(UChar32 c) const;
+
      /**
       * Tests if the string is normalized.
       * Internally, in cases where the quickCheck() method would return "maybe"
@@ -394,6 +405,17 @@ public:
      virtual UBool
      getDecomposition(UChar32 c, UnicodeString &decomposition) const;
  
+    /**
+     * Gets the combining class of c.
+     * Returns 0 if c is not in the filter set;
+     * otherwise returns the combining class from the wrapped Normalizer2 instance.
+     * @param c code point
+     * @return c's combining class
+     * @draft ICU 49
+     */
+    virtual uint8_t
+    getCombiningClass(UChar32 c) const;
+
      /**
       * Tests if the string is normalized.
       * For details see the Normalizer2 base class documentation.
diff --git a/icu4c/source/common/unicode/unorm2.h b/icu4c/source/common/unicode/unorm2.h

index a522b4735a8d6ac4ebda2dacb05249db4a2a2f1c..c7e1a4845d71fe1cbf9379cd93eaf176e8b9bc7b 100644 (file)
--- a/icu4c/source/common/unicode/unorm2.h
+++ b/icu4c/source/common/unicode/unorm2.h
@@ -1,7 +1,7 @@
  /*
  *******************************************************************************
  *
-*   Copyright (C) 2009-2010, International Business Machines
+*   Copyright (C) 2009-2011, International Business Machines
  *   Corporation and others.  All Rights Reserved.
  *
  *******************************************************************************
@@ -280,6 +280,18 @@ unorm2_getDecomposition(const UNormalizer2 *norm2,
                          UChar32 c, UChar *decomposition, int32_t capacity,
                          UErrorCode *pErrorCode);
  
+/**
+ * Gets the combining class of c.
+ * The default implementation returns 0
+ * but all standard implementations return the Unicode Canonical_Combining_Class value.
+ * @param norm2 UNormalizer2 instance
+ * @param c code point
+ * @return c's combining class
+ * @draft ICU 49
+ */
+U_DRAFT uint8_t U_EXPORT2
+unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c);
+
  /**
   * Tests if the string is normalized.
   * Internally, in cases where the quickCheck() method would return "maybe"
diff --git a/icu4c/source/common/uts46.cpp b/icu4c/source/common/uts46.cpp

index 59792e5e6aae4a71076884f80395f5abdf1cfd02..682f9ed4bdb399a928639c71d6a5920c61b76710 100644 (file)
--- a/icu4c/source/common/uts46.cpp
+++ b/icu4c/source/common/uts46.cpp
@@ -1120,7 +1120,7 @@ UTS46::isLabelOkContextJ(const UChar *label, int32_t labelLength) const {
              UChar32 c;
              int32_t j=i;
              U16_PREV_UNSAFE(label, j, c);
-            if(u_getCombiningClass(c)==9) {
+            if(uts46Norm2.getCombiningClass(c)==9) {
                  continue;
              }
              // check precontext (Joining_Type:{L,D})(Joining_Type:T)*
@@ -1163,7 +1163,7 @@ UTS46::isLabelOkContextJ(const UChar *label, int32_t labelLength) const {
              UChar32 c;
              int32_t j=i;
              U16_PREV_UNSAFE(label, j, c);
-            if(u_getCombiningClass(c)!=9) {
+            if(uts46Norm2.getCombiningClass(c)!=9) {
                  return FALSE;
              }
          }
diff --git a/icu4c/source/test/cintltst/cucdtst.c b/icu4c/source/test/cintltst/cucdtst.c

index 44e9c858db710aa1ed5e94bceb8db80a7e9d70f8..fc1ba351ef7f2f1728987fcccdd35d561513aa74 100644 (file)
--- a/icu4c/source/test/cintltst/cucdtst.c
+++ b/icu4c/source/test/cintltst/cucdtst.c
@@ -925,6 +925,14 @@ static void TestIdentifier()
  }
  
  /* for each line of UnicodeData.txt, check some of the properties */
+typedef struct UnicodeDataContext { /* handed to unicodeDataLineFn() through its void *context argument */
+#if UCONFIG_NO_NORMALIZATION
+    const void *dummy; /* placeholder member so the struct is not empty when normalization is compiled out */
+#else
+    const UNormalizer2 *nfkc; /* NFKC instance whose unorm2_getCombiningClass() results are checked per line */
+#endif
+} UnicodeDataContext;
+
  /*
   * ### TODO
   * This test fails incorrectly if the First or Last code point of a repetitive area
@@ -950,6 +958,10 @@ unicodeDataLineFn(void *context,
      int32_t i;
      int8_t type;
  
+#if !UCONFIG_NO_NORMALIZATION
+    const UNormalizer2 *nfkc;
+#endif
+
      /* get the character code, field 0 */
      c=strtoul(fields[0][0], &end, 16);
      if(end<=fields[0][0] || end!=fields[0][1]) {
@@ -985,6 +997,10 @@ unicodeDataLineFn(void *context,
      if(value!=u_getCombiningClass(c) || value!=(uint32_t)u_getIntPropertyValue(c, UCHAR_CANONICAL_COMBINING_CLASS)) {
          log_err("error: u_getCombiningClass(U+%04lx)==%hu instead of %lu\n", c, u_getCombiningClass(c), value);
      }
+    nfkc=((UnicodeDataContext *)context)->nfkc;
+    if(value!=unorm2_getCombiningClass(nfkc, c)) {
+        log_err("error: unorm2_getCombiningClass(nfkc, U+%04lx)==%hu instead of %lu\n", c, unorm2_getCombiningClass(nfkc, c), value);
+    }
  #endif
  
      /* get BiDi category, field 4 */
@@ -1191,6 +1207,8 @@ static void TestUnicodeData()
      UChar32 c;
      int8_t type;
  
+    UnicodeDataContext context;
+
      u_versionFromString(expectVersionArray, U_UNICODE_VERSION);
      u_getUnicodeVersion(versionArray);
      if(memcmp(versionArray, expectVersionArray, U_MAX_VERSION_LENGTH) != 0)
@@ -1212,7 +1230,14 @@ static void TestUnicodeData()
      }
  
      errorCode=U_ZERO_ERROR;
-    parseUCDFile("UnicodeData.txt", fields, 15, unicodeDataLineFn, NULL, &errorCode);
+#if !UCONFIG_NO_NORMALIZATION
+    context.nfkc=unorm2_getInstance(NULL, "nfkc", UNORM2_COMPOSE, &errorCode);
+    if(U_FAILURE(errorCode)) {
+        log_data_err("error: unable to open an NFKC UNormalizer2 - %s\n", u_errorName(errorCode));
+        return;
+    }
+#endif
+    parseUCDFile("UnicodeData.txt", fields, 15, unicodeDataLineFn, &context, &errorCode);
      if(U_FAILURE(errorCode)) {
          return; /* if we couldn't parse UnicodeData.txt, we should return */
      }
diff --git a/icu4c/source/test/intltest/tstnorm.cpp b/icu4c/source/test/intltest/tstnorm.cpp

index 8b6391c19c7957b3d00ac72fb26bbef0421eb0f0..d0aeaac746eb228802866191c45c7102735bef70 100644 (file)
--- a/icu4c/source/test/intltest/tstnorm.cpp
+++ b/icu4c/source/test/intltest/tstnorm.cpp
@@ -1,6 +1,6 @@
  /********************************************************************
   * COPYRIGHT: 
- * Copyright (c) 1997-2010, International Business Machines Corporation and
+ * Copyright (c) 1997-2011, International Business Machines Corporation and
   * others. All Rights Reserved.
   ********************************************************************/
  
@@ -1458,9 +1458,7 @@ BasicNormalizerTest::TestFilteredNormalizer2Coverage() {
          dataerrln("Normalizer2Factory::getNFCInstance() call failed - %s", u_errorName(status));
          return;
      }
-    UnicodeSet filter(UNICODE_STRING_SIMPLE("[^\\u00a0-\\u00ff]"), errorCode);
-    UnicodeString newString1 = UNICODE_STRING_SIMPLE("[^\\u0100-\\u01ff]");
-    UnicodeString newString2 = UNICODE_STRING_SIMPLE("[^\\u0200-\\u02ff]");
+    UnicodeSet filter(UNICODE_STRING_SIMPLE("[^\\u00a0-\\u00ff\\u0310-\\u031f]"), errorCode);
      FilteredNormalizer2 fn2(*nfcNorm2, filter);
  
      UChar32 char32 = 0x0054;
@@ -1473,6 +1471,20 @@ BasicNormalizerTest::TestFilteredNormalizer2Coverage() {
          errln("FilteredNormalizer2.hasBoundaryAfter() failed.");
      }
  
+    UChar32 c;
+    for(c=0; c<=0x3ff; ++c) {
+        uint8_t expectedCC= filter.contains(c) ? nfcNorm2->getCombiningClass(c) : 0;
+        uint8_t cc=fn2.getCombiningClass(c);
+        if(cc!=expectedCC) {
+            errln(
+                UnicodeString("FilteredNormalizer2(NFC, ^A0-FF,310-31F).getCombiningClass(U+")+
+                hex(c)+
+                ")==filtered NFC.getCC()");
+        }
+    }
+
+    UnicodeString newString1 = UNICODE_STRING_SIMPLE("[^\\u0100-\\u01ff]");
+    UnicodeString newString2 = UNICODE_STRING_SIMPLE("[^\\u0200-\\u02ff]");
      fn2.append(newString1, newString2, errorCode);
      if (U_FAILURE(errorCode)) {
          errln("FilteredNormalizer2.append() failed.");
author	Markus Scherer <markus.icu@gmail.com>
	Thu, 30 Jun 2011 23:22:17 +0000 (23:22 +0000)
committer	Markus Scherer <markus.icu@gmail.com>
	Thu, 30 Jun 2011 23:22:17 +0000 (23:22 +0000)
icu4c/source/common/filterednormalizer2.cpp		patch \| blob \| history
icu4c/source/common/normalizer2.cpp		patch \| blob \| history
icu4c/source/common/unicode/normalizer2.h		patch \| blob \| history
icu4c/source/common/unicode/unorm2.h		patch \| blob \| history
icu4c/source/common/uts46.cpp		patch \| blob \| history
icu4c/source/test/cintltst/cucdtst.c		patch \| blob \| history
icu4c/source/test/intltest/tstnorm.cpp		patch \| blob \| history