ICU-13271 add Normalizer2::isNormalizedUTF8()

author Markus Scherer <markus.icu@gmail.com>

Thu, 20 Jul 2017 22:08:30 +0000 (22:08 +0000)

committer Markus Scherer <markus.icu@gmail.com>

Thu, 20 Jul 2017 22:08:30 +0000 (22:08 +0000)
author Markus Scherer <markus.icu@gmail.com>
Thu, 20 Jul 2017 22:08:30 +0000 (22:08 +0000)
committer Markus Scherer <markus.icu@gmail.com>
Thu, 20 Jul 2017 22:08:30 +0000 (22:08 +0000)
diff --git a/icu4c/source/common/filterednormalizer2.cpp b/icu4c/source/common/filterednormalizer2.cpp

index eef199797c10e1b541040754b46b10e4197e9f6a..f627b601ce070638aa7eaa816ff254e8065965fa 100644 (file)
--- a/icu4c/source/common/filterednormalizer2.cpp
+++ b/icu4c/source/common/filterednormalizer2.cpp
@@ -244,6 +244,31 @@ FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode)
      return TRUE;
  }
  
+UBool  // TRUE only if every span of sp that is handed to norm2 is reported as normalized
+FilteredNormalizer2::isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const {
+    if(U_FAILURE(errorCode)) {  // a failure code passed in by the caller short-circuits to FALSE
+        return FALSE;
+    }
+    const char *s = sp.data();  // start of the bytes not yet examined
+    int32_t length = sp.length();  // number of bytes still to examine
+    USetSpanCondition spanCondition = USET_SPAN_SIMPLE;  // the first span is taken with USET_SPAN_SIMPLE
+    while (length > 0) {
+        int32_t spanLength = set.spanUTF8(s, length, spanCondition);  // length of the next span under the current condition
+        if (spanCondition == USET_SPAN_NOT_CONTAINED) {
+            spanCondition = USET_SPAN_SIMPLE;  // NOT_CONTAINED span: skipped without any normalization check
+        } else {
+            if (!norm2.isNormalizedUTF8(StringPiece(s, spanLength), errorCode) ||  // SIMPLE span: delegate to the wrapped normalizer
+                    U_FAILURE(errorCode)) {  // an error raised by the delegate also yields FALSE
+                return FALSE;
+            }
+            spanCondition = USET_SPAN_NOT_CONTAINED;  // alternate the span condition for the next iteration
+        }
+        s += spanLength;  // advance past the span just handled
+        length -= spanLength;
+    }
+    return TRUE;  // also reached for empty input, since the loop body never runs
+}
+
  UNormalizationCheckResult
  FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
      uprv_checkCanGetBuffer(s, errorCode);
diff --git a/icu4c/source/common/norm2allmodes.h b/icu4c/source/common/norm2allmodes.h

index 638fc87b5c441dff2cb5630af31fa27eeecd8a04..3305f05f64f1905a155397b5e3c01a43af9caab2 100644 (file)
--- a/icu4c/source/common/norm2allmodes.h
+++ b/icu4c/source/common/norm2allmodes.h
@@ -270,6 +270,14 @@ private:
          }
          return impl.compose(sArray, sArray+s.length(), onlyContiguous, FALSE, buffer, errorCode);
      }
+    virtual UBool  // UTF-8 normalization test implemented directly on impl.composeUTF8()
+    isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const override {
+        if(U_FAILURE(errorCode)) {  // a failure code passed in by the caller short-circuits to FALSE
+            return FALSE;
+        }
+        const uint8_t *s = reinterpret_cast<const uint8_t *>(sp.data());  // composeUTF8() is given uint8_t pointers
+        return impl.composeUTF8(0, onlyContiguous, s, s + sp.length(), nullptr, nullptr, errorCode);  // NOTE(review): the two nullptr arguments are presumably "no output sink" and "no edits" - confirm against the composeUTF8() declaration
+    }
      virtual UNormalizationCheckResult
      quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override {
          if(U_FAILURE(errorCode)) {
diff --git a/icu4c/source/common/normalizer2.cpp b/icu4c/source/common/normalizer2.cpp

index a62a8d930bacfad6cdd9789916fbe835322ad637..8915c1ddc31d543ccb4ce6bf4ae2fcc78853fc58 100644 (file)
--- a/icu4c/source/common/normalizer2.cpp
+++ b/icu4c/source/common/normalizer2.cpp
@@ -73,6 +73,11 @@ Normalizer2::getCombiningClass(UChar32 /*c*/) const {
      return 0;
  }
  
+UBool  // default implementation: converts the UTF-8 input to a UnicodeString and delegates to isNormalized()
+Normalizer2::isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const {
+    return U_SUCCESS(errorCode) && isNormalized(UnicodeString::fromUTF8(s), errorCode);  // && short-circuits: no conversion and a false result if errorCode already indicates failure
+}
+
  // Normalizer2 implementation for the old UNORM_NONE.
  class NoopNormalizer2 : public Normalizer2 {
      virtual ~NoopNormalizer2();
@@ -139,8 +144,12 @@ class NoopNormalizer2 : public Normalizer2 {
      }
      // No need to override the default getRawDecomposition().
      virtual UBool
-    isNormalized(const UnicodeString &, UErrorCode &) const override {
-        return TRUE;
+    isNormalized(const UnicodeString &, UErrorCode &errorCode) const override {
+        return U_SUCCESS(errorCode);
+    }
+    virtual UBool
+    isNormalizedUTF8(StringPiece, UErrorCode &errorCode) const override {
+        return U_SUCCESS(errorCode);
      }
      virtual UNormalizationCheckResult
      quickCheck(const UnicodeString &, UErrorCode &) const override {
diff --git a/icu4c/source/common/normalizer2impl.h b/icu4c/source/common/normalizer2impl.h

index fdc0d3b3c5e34a11498465b34edfdd138d4dc631..9dd4d1e5ab188b699e72ec0ba54ca1929182e09e 100644 (file)
--- a/icu4c/source/common/normalizer2impl.h
+++ b/icu4c/source/common/normalizer2impl.h
@@ -420,20 +420,20 @@ public:
  
          // Norm16 value thresholds for quick check combinations and types of extra data.
  
-        // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[.
+        /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */
          IX_MIN_YES_NO,
-        // Mappings are comp-normalized.
+        /** Mappings are comp-normalized. */
          IX_MIN_NO_NO,
          IX_LIMIT_NO_NO,
          IX_MIN_MAYBE_YES,
  
-        // Mappings only in [minYesNoMappingsOnly..minNoNo[.
+        /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */
          IX_MIN_YES_NO_MAPPINGS_ONLY,
-        // Mappings are not comp-normalized but have a comp boundary before.
+        /** Mappings are not comp-normalized but have a comp boundary before. */
          IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE,
-        // Mappings do not have a comp boundary before.
+        /** Mappings do not have a comp boundary before. */
          IX_MIN_NO_NO_COMP_NO_MAYBE_CC,
-        // Mappings to the empty string.
+        /** Mappings to the empty string. */
          IX_MIN_NO_NO_EMPTY,
  
          IX_MIN_LCCC_CP,
diff --git a/icu4c/source/common/unicode/normalizer2.h b/icu4c/source/common/unicode/normalizer2.h

index f71ccb8b839373dca8abfff1ae4394f845a16074..631e29bafef18390b3726c6495d5fd7bce131e40 100644 (file)
--- a/icu4c/source/common/unicode/normalizer2.h
+++ b/icu4c/source/common/unicode/normalizer2.h
@@ -371,6 +371,30 @@ public:
       */
      virtual UBool
      isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
+    /**
+     * Tests if the UTF-8 string is normalized.
+     * Internally, in cases where the quickCheck() method would return "maybe"
+     * (which is only possible for the two COMPOSE modes) this method
+     * resolves to "yes" or "no" to provide a definitive result,
+     * at the cost of doing more work in those cases.
+     *
+     * This works for all normalization modes,
+     * but it is currently optimized for UTF-8 only for "compose" modes,
+     * such as for NFC, NFKC, and NFKC_Casefold
+     * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
+     * For other modes it currently converts to UTF-16 and calls isNormalized().
+     *
+     * @param s UTF-8 input string
+     * @param errorCode Standard ICU error code. Its input value must
+     *                  pass the U_SUCCESS() test, or else the function returns
+     *                  immediately. Check for U_FAILURE() on output or use with
+     *                  function chaining. (See User Guide for details.)
+     * @return TRUE if s is normalized
+     * @draft ICU 60
+     */
+    virtual UBool
+    isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;
+
  
      /**
       * Tests if the string is normalized.
@@ -641,6 +665,29 @@ public:
       */
      virtual UBool
      isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override;
+    /**
+     * Tests if the UTF-8 string is normalized.
+     * Internally, in cases where the quickCheck() method would return "maybe"
+     * (which is only possible for the two COMPOSE modes) this method
+     * resolves to "yes" or "no" to provide a definitive result,
+     * at the cost of doing more work in those cases.
+     *
+     * This works for all normalization modes,
+     * but it is currently optimized for UTF-8 only for "compose" modes,
+     * such as for NFC, NFKC, and NFKC_Casefold
+     * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
+     * For other modes it currently converts to UTF-16 and calls isNormalized().
+     *
+     * @param s UTF-8 input string
+     * @param errorCode Standard ICU error code. Its input value must
+     *                  pass the U_SUCCESS() test, or else the function returns
+     *                  immediately. Check for U_FAILURE() on output or use with
+     *                  function chaining. (See User Guide for details.)
+     * @return TRUE if s is normalized
+     * @draft ICU 60
+     */
+    virtual UBool
+    isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const override;
      /**
       * Tests if the string is normalized.
       * For details see the Normalizer2 base class documentation.
diff --git a/icu4c/source/test/intltest/normconf.cpp b/icu4c/source/test/intltest/normconf.cpp

index b8af4fd3c16405d87689273623344267d54b0605..4d2a9f76f071d64790afa0eb47016d6825b77689 100644 (file)
--- a/icu4c/source/test/intltest/normconf.cpp
+++ b/icu4c/source/test/intltest/normconf.cpp
@@ -280,6 +280,15 @@ void NormalizerConformanceTest::TestConformance(FileStream *input, int32_t optio
      }
  }
  
+namespace {
+
+UBool isNormalizedUTF8(const Normalizer2 &norm2, const UnicodeString &s, UErrorCode &errorCode) {  // test helper: runs norm2.isNormalizedUTF8() on the UTF-8 form of s
+    std::string s8;  // receives the UTF-8 conversion of s
+    return norm2.isNormalizedUTF8(s.toUTF8String(s8), errorCode);  // the value returned by toUTF8String(s8) is passed as the StringPiece argument
+}
+
+}  // namespace
+
  /**
   * Verify the conformance of the given line of the Unicode
   * normalization (UTR 15) test suite file.  For each line,
@@ -342,18 +351,38 @@ UBool NormalizerConformanceTest::checkConformance(const UnicodeString* field,
          dataerrln("Normalizer error: isNormalized(NFC(s), UNORM_NFC) is FALSE");
          pass = FALSE;
      }
-    if(field[0]!=field[1] && Normalizer::isNormalized(field[0], UNORM_NFC, options, status)) {
-        errln("Normalizer error: isNormalized(s, UNORM_NFC) is TRUE");
+    if(options==0 && !isNormalizedUTF8(*nfc, field[1], status)) {
+        dataerrln("Normalizer error: nfc.isNormalizedUTF8(NFC(s)) is FALSE");
          pass = FALSE;
      }
+    if(field[0]!=field[1]) {
+        if(Normalizer::isNormalized(field[0], UNORM_NFC, options, status)) {
+            errln("Normalizer error: isNormalized(s, UNORM_NFC) is TRUE");
+            pass = FALSE;
+        }
+        if(isNormalizedUTF8(*nfc, field[0], status)) {
+            errln("Normalizer error: nfc.isNormalizedUTF8(s) is TRUE");
+            pass = FALSE;
+        }
+    }
      if(!Normalizer::isNormalized(field[3], UNORM_NFKC, options, status)) {
          dataerrln("Normalizer error: isNormalized(NFKC(s), UNORM_NFKC) is FALSE");
          pass = FALSE;
      }
-    if(field[0]!=field[3] && Normalizer::isNormalized(field[0], UNORM_NFKC, options, status)) {
-        errln("Normalizer error: isNormalized(s, UNORM_NFKC) is TRUE");
+    if(options==0 && !isNormalizedUTF8(*nfkc, field[3], status)) {
+        dataerrln("Normalizer error: nfkc.isNormalizedUTF8(NFKC(s)) is FALSE");
          pass = FALSE;
      }
+    if(field[0]!=field[3]) {
+        if(Normalizer::isNormalized(field[0], UNORM_NFKC, options, status)) {
+            errln("Normalizer error: isNormalized(s, UNORM_NFKC) is TRUE");
+            pass = FALSE;
+        }
+        if(options==0 && isNormalizedUTF8(*nfkc, field[0], status)) {
+            errln("Normalizer error: nfkc.isNormalizedUTF8(s) is TRUE");
+            pass = FALSE;
+        }
+    }
  
      // test FCD quick check and "makeFCD"
      Normalizer::normalize(field[0], UNORM_FCD, options, fcd, status);
diff --git a/icu4c/source/test/intltest/tstnorm.cpp b/icu4c/source/test/intltest/tstnorm.cpp

index 240a3a81064339aeac08929c5c21912c3ef5f96b..b79232e5bfdec768781d310610f8210eedc8eda8 100644 (file)
--- a/icu4c/source/test/intltest/tstnorm.cpp
+++ b/icu4c/source/test/intltest/tstnorm.cpp
@@ -58,6 +58,7 @@ void BasicNormalizerTest::runIndexedTest(int32_t index, UBool exec,
      TESTCASE_AUTO(TestLowMappingToEmpty_D);
      TESTCASE_AUTO(TestLowMappingToEmpty_FCD);
      TESTCASE_AUTO(TestNormalizeIllFormedText);
+    TESTCASE_AUTO(TestComposeJamoTBase);
      TESTCASE_AUTO_END;
  }
  
@@ -1566,6 +1567,9 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
              expectedChanges, UPRV_LENGTHOF(expectedChanges),
              TRUE, errorCode);
  
+    assertFalse("isNormalizedUTF8(source)", nfkc_cf->isNormalizedUTF8(src, errorCode));
+    assertTrue("isNormalizedUTF8(normalized)", nfkc_cf->isNormalizedUTF8(result, errorCode));
+
      // Omit unchanged text.
      expected = u8"aääạ\u0308ạ\u0308가각갃";
      result.clear();
@@ -1605,6 +1609,9 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
              filteredChanges, UPRV_LENGTHOF(filteredChanges),
              TRUE, errorCode);
  
+    assertFalse("filtered isNormalizedUTF8(source)", fn2.isNormalizedUTF8(src, errorCode));
+    assertTrue("filtered isNormalizedUTF8(normalized)", fn2.isNormalizedUTF8(result, errorCode));
+
      // Omit unchanged text.
      // Note that the result is not normalized because the inner normalizer
      // does not see text across filter spans.
@@ -1711,4 +1718,32 @@ BasicNormalizerTest::TestNormalizeIllFormedText() {
      assertEquals("normalizeUTF8", expected8.c_str(), result8.c_str());
  }
  
+void
+BasicNormalizerTest::TestComposeJamoTBase() {
+    // Algorithmic composition of Hangul syllables must not combine with JAMO_T_BASE = U+11A7
+    // which is not a conjoining Jamo Trailing consonant.
+    IcuTestErrorCode errorCode(*this, "TestComposeJamoTBase");
+    const Normalizer2 *nfkc = Normalizer2::getNFKCInstance(errorCode);
+    if(errorCode.logDataIfFailureAndReset("Normalizer2::getNFKCInstance() call failed")) {  // no NFKC instance available: nothing to test
+        return;
+    }
+    UnicodeString s(u"\u1100\u1161\u11A7\u1100\u314F\u11A7가\u11A7");  // three sequences, each ending in U+11A7
+    UnicodeString expected(u"가\u11A7가\u11A7가\u11A7");  // each sequence becomes the syllable followed by a separate U+11A7
+    UnicodeString result = nfkc->normalize(s, errorCode);
+    assertSuccess("normalize(LV+11A7)", errorCode.get());
+    assertEquals("normalize(LV+11A7)", expected, result);
+    assertFalse("isNormalized(LV+11A7)", nfkc->isNormalized(s, errorCode));  // the raw input must be reported as not normalized
+    assertTrue("isNormalized(normalized)", nfkc->isNormalized(result, errorCode));  // the normalize() output must be reported as normalized
+
+    std::string s8(u8"\u1100\u1161\u11A7\u1100\u314F\u11A7가\u11A7");  // same text as s, as a UTF-8 literal
+    std::string expected8(u8"가\u11A7가\u11A7가\u11A7");  // same text as expected, as a UTF-8 literal
+    std::string result8;
+    StringByteSink<std::string> sink(&result8, expected8.length());  // sink writes into result8; NOTE(review): second argument is presumably an initial capacity hint - confirm
+    nfkc->normalizeUTF8(0, s8, sink, nullptr, errorCode);
+    assertSuccess("normalizeUTF8(LV+11A7)", errorCode.get());
+    assertEquals("normalizeUTF8(LV+11A7)", expected8.c_str(), result8.c_str());
+    assertFalse("isNormalizedUTF8(LV+11A7)", nfkc->isNormalizedUTF8(s8, errorCode));  // UTF-8 counterpart of the isNormalized(s) check
+    assertTrue("isNormalizedUTF8(normalized)", nfkc->isNormalizedUTF8(result8, errorCode));  // UTF-8 counterpart of the isNormalized(result) check
+}
+
  #endif /* #if !UCONFIG_NO_NORMALIZATION */
diff --git a/icu4c/source/test/intltest/tstnorm.h b/icu4c/source/test/intltest/tstnorm.h

index d98aafd70942671a5acb4d657b7027fb98fd779b..2891e8c98ee6855207ea9fe703d8d1baf4a783e7 100644 (file)
--- a/icu4c/source/test/intltest/tstnorm.h
+++ b/icu4c/source/test/intltest/tstnorm.h
@@ -51,6 +51,7 @@ public:
      void TestLowMappingToEmpty_D();
      void TestLowMappingToEmpty_FCD();
      void TestNormalizeIllFormedText();
+    void TestComposeJamoTBase();
  
  private:
      UnicodeString canonTests[24][3];
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/Normalizer2Impl.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/Normalizer2Impl.java

index 9cb231590b1c9715a653f63c3f6026aa3c92cf46..735d89e9853cc86f1f930dc50c90c16ced90402c 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/Normalizer2Impl.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/Normalizer2Impl.java
@@ -901,20 +901,21 @@ public final class Normalizer2Impl {
      public static final int IX_MIN_COMP_NO_MAYBE_CP=9;
  
      // Norm16 value thresholds for quick check combinations and types of extra data.
-    // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[.
+
+    /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */
      public static final int IX_MIN_YES_NO=10;
-    // Mappings are comp-normalized.
+    /** Mappings are comp-normalized. */
      public static final int IX_MIN_NO_NO=11;
      public static final int IX_LIMIT_NO_NO=12;
      public static final int IX_MIN_MAYBE_YES=13;
  
-    // Mappings only in [minYesNoMappingsOnly..minNoNo[.
+    /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */
      public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14;
-    // Mappings are not comp-normalized but have a comp boundary before.
+    /** Mappings are not comp-normalized but have a comp boundary before. */
      public static final int IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE=15;
-    // Mappings do not have a comp boundary before.
+    /** Mappings do not have a comp boundary before. */
      public static final int IX_MIN_NO_NO_COMP_NO_MAYBE_CC=16;
-    // Mappings to the empty string.
+    /** Mappings to the empty string. */
      public static final int IX_MIN_NO_NO_EMPTY=17;
  
      public static final int IX_MIN_LCCC_CP=18;
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/BasicTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/BasicTest.java

index f146ba959fbead3a628d547838b6c1965adfd219..834d0bcc3f47970edfca42920cfec7143848ee5d 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/BasicTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/BasicTest.java
@@ -2854,6 +2854,19 @@ public class BasicTest extends TestFmwk {
          assertEquals("normalize", expected, result);
      }
  
+    @Test
+    public void TestComposeJamoTBase() {
+        // Algorithmic composition of Hangul syllables must not combine with JAMO_T_BASE = U+11A7
+        // which is not a conjoining Jamo Trailing consonant.
+        Normalizer2 nfkc = Normalizer2.getNFKCInstance();
+        String s = "\u1100\u1161\u11A7\u1100\u314F\u11A7가\u11A7";  // three sequences, each ending in U+11A7
+        String expected = "가\u11A7가\u11A7가\u11A7";  // each sequence becomes the syllable followed by a separate U+11A7
+        String result = nfkc.normalize(s);
+        assertEquals("normalize(LV+11A7)", expected, result);
+        assertFalse("isNormalized(LV+11A7)", nfkc.isNormalized(s));  // the raw input must be reported as not normalized
+        assertTrue("isNormalized(normalized)", nfkc.isNormalized(result));  // the normalize() output must be reported as normalized
+    }
+
      @Test
      public void TestNFC() {
          // Coverage tests.
author	Markus Scherer <markus.icu@gmail.com>
	Thu, 20 Jul 2017 22:08:30 +0000 (22:08 +0000)
committer	Markus Scherer <markus.icu@gmail.com>
	Thu, 20 Jul 2017 22:08:30 +0000 (22:08 +0000)
icu4c/source/common/filterednormalizer2.cpp		patch \| blob \| history
icu4c/source/common/norm2allmodes.h		patch \| blob \| history
icu4c/source/common/normalizer2.cpp		patch \| blob \| history
icu4c/source/common/normalizer2impl.h		patch \| blob \| history
icu4c/source/common/unicode/normalizer2.h		patch \| blob \| history
icu4c/source/test/intltest/normconf.cpp		patch \| blob \| history
icu4c/source/test/intltest/tstnorm.cpp		patch \| blob \| history
icu4c/source/test/intltest/tstnorm.h		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/Normalizer2Impl.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/BasicTest.java		patch \| blob \| history