return TRUE;
}
+UBool
+FilteredNormalizer2::isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const {
+    // Walks the UTF-8 text as alternating spans: a span of text inside the
+    // filter set must be normalized according to norm2, and the following span
+    // of text outside the set is skipped without being checked.
+    if(U_FAILURE(errorCode)) {
+        return FALSE;
+    }
+    const char *pos = sp.data();
+    int32_t remaining = sp.length();
+    // The first span is taken with USET_SPAN_SIMPLE, i.e. it is an in-set span.
+    UBool inSet = TRUE;
+    while (remaining > 0) {
+        int32_t spanLength =
+            set.spanUTF8(pos, remaining, inSet ? USET_SPAN_SIMPLE : USET_SPAN_NOT_CONTAINED);
+        if (inSet) {
+            // Only in-set text is handed to the wrapped normalizer.
+            if (!norm2.isNormalizedUTF8(StringPiece(pos, spanLength), errorCode)) {
+                return FALSE;
+            }
+            if (U_FAILURE(errorCode)) {
+                return FALSE;
+            }
+        }
+        // Switch to the other kind of span and advance past the one just handled.
+        inSet = !inSet;
+        pos += spanLength;
+        remaining -= spanLength;
+    }
+    return TRUE;
+}
+
UNormalizationCheckResult
FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
uprv_checkCanGetBuffer(s, errorCode);
}
return impl.compose(sArray, sArray+s.length(), onlyContiguous, FALSE, buffer, errorCode);
}
+    // Tests whether the UTF-8 text is normalized by running impl.composeUTF8()
+    // over it with neither a sink nor an Edits object (both nullptr) and
+    // returning that call's UBool result.
+    virtual UBool
+    isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const override {
+        if(U_FAILURE(errorCode)) {
+            return FALSE;
+        }
+        const uint8_t *start = reinterpret_cast<const uint8_t *>(sp.data());
+        const uint8_t *limit = start + sp.length();
+        return impl.composeUTF8(0, onlyContiguous, start, limit, nullptr, nullptr, errorCode);
+    }
virtual UNormalizationCheckResult
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override {
if(U_FAILURE(errorCode)) {
return 0;
}
+// Default implementation: converts the UTF-8 input to a UnicodeString and
+// defers to isNormalized(). Returns FALSE right away if errorCode already
+// indicates a failure.
+UBool
+Normalizer2::isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const {
+    if(U_FAILURE(errorCode)) {
+        return FALSE;
+    }
+    const UnicodeString utf16 = UnicodeString::fromUTF8(s);
+    return isNormalized(utf16, errorCode) ? TRUE : FALSE;
+}
+
// Normalizer2 implementation for the old UNORM_NONE.
class NoopNormalizer2 : public Normalizer2 {
virtual ~NoopNormalizer2();
}
// No need to override the default getRawDecomposition().
virtual UBool
- isNormalized(const UnicodeString &, UErrorCode &) const override {
- return TRUE;
+ isNormalized(const UnicodeString &, UErrorCode &errorCode) const override {
+ return U_SUCCESS(errorCode);  // The string argument is unnamed and never inspected; only the incoming error code decides the result.
+ }
+ virtual UBool
+ isNormalizedUTF8(StringPiece, UErrorCode &errorCode) const override {
+ return U_SUCCESS(errorCode);  // Same rule as isNormalized(): the UTF-8 input is not inspected.
}
virtual UNormalizationCheckResult
quickCheck(const UnicodeString &, UErrorCode &) const override {
// Norm16 value thresholds for quick check combinations and types of extra data.
- // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[.
+ /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */
IX_MIN_YES_NO,
- // Mappings are comp-normalized.
+ /** Mappings are comp-normalized. */
IX_MIN_NO_NO,
IX_LIMIT_NO_NO,
IX_MIN_MAYBE_YES,
- // Mappings only in [minYesNoMappingsOnly..minNoNo[.
+ /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */
IX_MIN_YES_NO_MAPPINGS_ONLY,
- // Mappings are not comp-normalized but have a comp boundary before.
+ /** Mappings are not comp-normalized but have a comp boundary before. */
IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE,
- // Mappings do not have a comp boundary before.
+ /** Mappings do not have a comp boundary before. */
IX_MIN_NO_NO_COMP_NO_MAYBE_CC,
- // Mappings to the empty string.
+ /** Mappings to the empty string. */
IX_MIN_NO_NO_EMPTY,
IX_MIN_LCCC_CP,
*/
virtual UBool
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
+ /**
+ * Tests if the UTF-8 string is normalized.
+ * Internally, in cases where the quickCheck() method would return "maybe"
+ * (which is only possible for the two COMPOSE modes) this method
+ * resolves to "yes" or "no" to provide a definitive result,
+ * at the cost of doing more work in those cases.
+ *
+ * This works for all normalization modes,
+ * but it is currently optimized for UTF-8 only for "compose" modes,
+ * such as for NFC, NFKC, and NFKC_Casefold
+ * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
+ * For other modes it currently converts to UTF-16 and calls isNormalized().
+ *
+ * @param s UTF-8 input string
+ * @param errorCode Standard ICU error code. Its input value must
+ * pass the U_SUCCESS() test, or else the function returns
+ * immediately. Check for U_FAILURE() on output or use with
+ * function chaining. (See User Guide for details.)
+ * @return TRUE if s is normalized; FALSE if it is not,
+ * or if errorCode indicates a failure on input
+ * @draft ICU 60
+ */
+ virtual UBool
+ isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;
+
+
/**
* Tests if the string is normalized.
*/
virtual UBool
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override;
+ /**
+ * Tests if the UTF-8 string is normalized.
+ * Internally, in cases where the quickCheck() method would return "maybe"
+ * (which is only possible for the two COMPOSE modes) this method
+ * resolves to "yes" or "no" to provide a definitive result,
+ * at the cost of doing more work in those cases.
+ *
+ * In this filtered normalizer, only the spans of s whose characters are
+ * contained in the filter set are tested with the wrapped normalizer;
+ * spans outside the filter set are skipped.
+ *
+ * This works for all normalization modes,
+ * but it is currently optimized for UTF-8 only for "compose" modes,
+ * such as for NFC, NFKC, and NFKC_Casefold
+ * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
+ * For other modes it currently converts to UTF-16 and calls isNormalized().
+ *
+ * @param s UTF-8 input string
+ * @param errorCode Standard ICU error code. Its input value must
+ * pass the U_SUCCESS() test, or else the function returns
+ * immediately. Check for U_FAILURE() on output or use with
+ * function chaining. (See User Guide for details.)
+ * @return TRUE if s is normalized; FALSE if it is not,
+ * or if errorCode indicates a failure on input
+ * @draft ICU 60
+ */
+ virtual UBool
+ isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const override;
/**
* Tests if the string is normalized.
* For details see the Normalizer2 base class documentation.
}
}
+namespace {
+
+// Test helper: converts the UTF-16 string s to UTF-8 and asks norm2 whether
+// that UTF-8 text is normalized.
+UBool isNormalizedUTF8(const Normalizer2 &norm2, const UnicodeString &s, UErrorCode &errorCode) {
+    std::string utf8;
+    s.toUTF8String(utf8);
+    return norm2.isNormalizedUTF8(utf8, errorCode);
+}
+
+}  // namespace
+
+
/**
* Verify the conformance of the given line of the Unicode
* normalization (UTR 15) test suite file. For each line,
dataerrln("Normalizer error: isNormalized(NFC(s), UNORM_NFC) is FALSE");
pass = FALSE;
}
- if(field[0]!=field[1] && Normalizer::isNormalized(field[0], UNORM_NFC, options, status)) {
- errln("Normalizer error: isNormalized(s, UNORM_NFC) is TRUE");
+ if(options==0 && !isNormalizedUTF8(*nfc, field[1], status)) {
+ dataerrln("Normalizer error: nfc.isNormalizedUTF8(NFC(s)) is FALSE");
pass = FALSE;
}
+ if(field[0]!=field[1]) {
+ if(Normalizer::isNormalized(field[0], UNORM_NFC, options, status)) {
+ errln("Normalizer error: isNormalized(s, UNORM_NFC) is TRUE");
+ pass = FALSE;
+ }
+ if(isNormalizedUTF8(*nfc, field[0], status)) {
+ errln("Normalizer error: nfc.isNormalizedUTF8(s) is TRUE");
+ pass = FALSE;
+ }
+ }
if(!Normalizer::isNormalized(field[3], UNORM_NFKC, options, status)) {
dataerrln("Normalizer error: isNormalized(NFKC(s), UNORM_NFKC) is FALSE");
pass = FALSE;
}
- if(field[0]!=field[3] && Normalizer::isNormalized(field[0], UNORM_NFKC, options, status)) {
- errln("Normalizer error: isNormalized(s, UNORM_NFKC) is TRUE");
+ if(options==0 && !isNormalizedUTF8(*nfkc, field[3], status)) {
+ dataerrln("Normalizer error: nfkc.isNormalizedUTF8(NFKC(s)) is FALSE");
pass = FALSE;
}
+ if(field[0]!=field[3]) {
+ if(Normalizer::isNormalized(field[0], UNORM_NFKC, options, status)) {
+ errln("Normalizer error: isNormalized(s, UNORM_NFKC) is TRUE");
+ pass = FALSE;
+ }
+ if(options==0 && isNormalizedUTF8(*nfkc, field[0], status)) {
+ errln("Normalizer error: nfkc.isNormalizedUTF8(s) is TRUE");
+ pass = FALSE;
+ }
+ }
// test FCD quick check and "makeFCD"
Normalizer::normalize(field[0], UNORM_FCD, options, fcd, status);
TESTCASE_AUTO(TestLowMappingToEmpty_D);
TESTCASE_AUTO(TestLowMappingToEmpty_FCD);
TESTCASE_AUTO(TestNormalizeIllFormedText);
+ TESTCASE_AUTO(TestComposeJamoTBase);
TESTCASE_AUTO_END;
}
expectedChanges, UPRV_LENGTHOF(expectedChanges),
TRUE, errorCode);
+ assertFalse("isNormalizedUTF8(source)", nfkc_cf->isNormalizedUTF8(src, errorCode));
+ assertTrue("isNormalizedUTF8(normalized)", nfkc_cf->isNormalizedUTF8(result, errorCode));
+
// Omit unchanged text.
expected = u8"aääạ\u0308ạ\u0308가각갃";
result.clear();
filteredChanges, UPRV_LENGTHOF(filteredChanges),
TRUE, errorCode);
+ assertFalse("filtered isNormalizedUTF8(source)", fn2.isNormalizedUTF8(src, errorCode));
+ assertTrue("filtered isNormalizedUTF8(normalized)", fn2.isNormalizedUTF8(result, errorCode));
+
// Omit unchanged text.
// Note that the result is not normalized because the inner normalizer
// does not see text across filter spans.
assertEquals("normalizeUTF8", expected8.c_str(), result8.c_str());
}
+void
+BasicNormalizerTest::TestComposeJamoTBase() {
+    // JAMO_T_BASE = U+11A7 is not a conjoining Jamo Trailing consonant, so the
+    // algorithmic composition of Hangul syllables must not combine with it.
+    // The same input is checked through the UTF-16 API and the UTF-8 API.
+    IcuTestErrorCode errorCode(*this, "TestComposeJamoTBase");
+    const Normalizer2 *nfkc = Normalizer2::getNFKCInstance(errorCode);
+    if(errorCode.logDataIfFailureAndReset("Normalizer2::getNFKCInstance() call failed")) {
+        return;
+    }
+
+    // UTF-16: normalize(), then isNormalized() on the input and on the output.
+    const UnicodeString input16(u"\u1100\u1161\u11A7\u1100\u314F\u11A7가\u11A7");
+    const UnicodeString expected16(u"가\u11A7가\u11A7가\u11A7");
+    const UnicodeString normalized16 = nfkc->normalize(input16, errorCode);
+    assertSuccess("normalize(LV+11A7)", errorCode.get());
+    assertEquals("normalize(LV+11A7)", expected16, normalized16);
+    assertFalse("isNormalized(LV+11A7)", nfkc->isNormalized(input16, errorCode));
+    assertTrue("isNormalized(normalized)", nfkc->isNormalized(normalized16, errorCode));
+
+    // UTF-8: normalizeUTF8() into a std::string sink, then isNormalizedUTF8().
+    const std::string input8(u8"\u1100\u1161\u11A7\u1100\u314F\u11A7가\u11A7");
+    const std::string expected8(u8"가\u11A7가\u11A7가\u11A7");
+    std::string normalized8;
+    StringByteSink<std::string> sink(&normalized8, expected8.length());
+    nfkc->normalizeUTF8(0, input8, sink, nullptr, errorCode);
+    assertSuccess("normalizeUTF8(LV+11A7)", errorCode.get());
+    assertEquals("normalizeUTF8(LV+11A7)", expected8.c_str(), normalized8.c_str());
+    assertFalse("isNormalizedUTF8(LV+11A7)", nfkc->isNormalizedUTF8(input8, errorCode));
+    assertTrue("isNormalizedUTF8(normalized)", nfkc->isNormalizedUTF8(normalized8, errorCode));
+}
+
#endif /* #if !UCONFIG_NO_NORMALIZATION */
void TestLowMappingToEmpty_D();
void TestLowMappingToEmpty_FCD();
void TestNormalizeIllFormedText();
+ void TestComposeJamoTBase();
private:
UnicodeString canonTests[24][3];
public static final int IX_MIN_COMP_NO_MAYBE_CP=9;
// Norm16 value thresholds for quick check combinations and types of extra data.
- // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[.
+
+ /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */
public static final int IX_MIN_YES_NO=10;
- // Mappings are comp-normalized.
+ /** Mappings are comp-normalized. */
public static final int IX_MIN_NO_NO=11;
public static final int IX_LIMIT_NO_NO=12;
public static final int IX_MIN_MAYBE_YES=13;
- // Mappings only in [minYesNoMappingsOnly..minNoNo[.
+ /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */
public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14;
- // Mappings are not comp-normalized but have a comp boundary before.
+ /** Mappings are not comp-normalized but have a comp boundary before. */
public static final int IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE=15;
- // Mappings do not have a comp boundary before.
+ /** Mappings do not have a comp boundary before. */
public static final int IX_MIN_NO_NO_COMP_NO_MAYBE_CC=16;
- // Mappings to the empty string.
+ /** Mappings to the empty string. */
public static final int IX_MIN_NO_NO_EMPTY=17;
public static final int IX_MIN_LCCC_CP=18;
assertEquals("normalize", expected, result);
}
+    @Test
+    public void TestComposeJamoTBase() {
+        // JAMO_T_BASE = U+11A7 is not a conjoining Jamo Trailing consonant, so the
+        // algorithmic composition of Hangul syllables must not combine with it.
+        final Normalizer2 nfkc = Normalizer2.getNFKCInstance();
+        final String input = "\u1100\u1161\u11A7\u1100\u314F\u11A7가\u11A7";
+        final String expected = "가\u11A7가\u11A7가\u11A7";
+        final String normalized = nfkc.normalize(input);
+        assertEquals("normalize(LV+11A7)", expected, normalized);
+        // The raw input must be reported as not normalized, the output as normalized.
+        assertFalse("isNormalized(LV+11A7)", nfkc.isNormalized(input));
+        assertTrue("isNormalized(normalized)", nfkc.isNormalized(normalized));
+    }
+
@Test
public void TestNFC() {
// Coverage tests.