ICU-21141 Fix titlecase of accented Dutch ij digraph

author Elango <elango@unicode.org>

Tue, 15 Feb 2022 23:27:24 +0000 (23:27 +0000)

committer Elango <elango@unicode.org>

Wed, 16 Feb 2022 17:07:09 +0000 (09:07 -0800)
author Elango <elango@unicode.org>
Tue, 15 Feb 2022 23:27:24 +0000 (23:27 +0000)
committer Elango <elango@unicode.org>
Wed, 16 Feb 2022 17:07:09 +0000 (09:07 -0800)
diff --git a/icu4c/source/common/ucasemap.cpp b/icu4c/source/common/ucasemap.cpp

index ed72bda828fc1c85fe114c56bb6b2761f08abd83..b6e7f2b744d57cb97811305274001548250ac8ec 100644 (file)
--- a/icu4c/source/common/ucasemap.cpp
+++ b/icu4c/source/common/ucasemap.cpp
@@ -420,6 +420,96 @@ void toUpper(int32_t caseLocale, uint32_t options,
  
  #if !UCONFIG_NO_BREAK_ITERATION
  
+namespace {
+
+constexpr uint8_t ACUTE_BYTE0 = u8"\u0301"[0];  // first UTF-8 byte of U+0301 COMBINING ACUTE ACCENT
+constexpr uint8_t ACUTE_BYTE1 = u8"\u0301"[1];  // second UTF-8 byte of U+0301
+
+/**
+ * Input: c is a letter I with or without acute accent.
+ * start is the index in src after c; if it is not less than segmentLimit, there is no Dutch IJ.
+ * If a plain i/I is followed by a plain j/J,
+ * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute,
+ * and no other combining mark follows, then we output the rest of the digraph accordingly.
+ *
+ * @return the src index after the titlecased sequence, or the start index if no Dutch IJ
+ */
+int32_t maybeTitleDutchIJ(const uint8_t *src, UChar32 c, int32_t start, int32_t segmentLimit,
+                          ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
+    // A precomposed I with acute is two bytes in UTF-8, so it can end exactly at segmentLimit
+    // even when the caller saw one more byte after the letter's first byte. Never read src[segmentLimit].
+    if (start >= segmentLimit) { return start; }
+    int32_t index = start;
+    bool withAcute = false;
+
+    // If the conditions are met, then the following variables tell us what to output.
+    int32_t unchanged1 = 0;  // code units before the j, or the whole sequence (0..5)
+    bool doTitleJ = false;  // true if the j needs to be titlecased
+    int32_t unchanged2 = 0;  // after the j (0 or 2)
+
+    // next byte after the first letter
+    UChar32 c2 = src[index++];
+
+    // Is the first letter an i/I with accent?
+    if (c == u'I') {
+        if (c2 == ACUTE_BYTE0 && index < segmentLimit && src[index++] == ACUTE_BYTE1) {
+            withAcute = true;
+            unchanged1 = 2;  // ACUTE is 2 code units in UTF-8
+            if (index == segmentLimit) { return start; }
+            c2 = src[index++];
+        }
+    } else {  // Í
+        withAcute = true;
+    }
+
+    // Is the next character a j/J?
+    if (c2 == u'j') {
+        doTitleJ = true;
+    } else if (c2 == u'J') {
+        ++unchanged1;
+    } else {
+        return start;
+    }
+
+    // A plain i/I must be followed by a plain j/J.
+    // An i/I with acute must be followed by a j/J with acute.
+    if (withAcute) {
+        if ((index + 1) >= segmentLimit || src[index++] != ACUTE_BYTE0 || src[index++] != ACUTE_BYTE1) {
+            return start;
+        }
+        if (doTitleJ) {
+            unchanged2 = 2;  // ACUTE is 2 code units in UTF-8
+        } else {
+            unchanged1 += 2;  // ACUTE is 2 code units in UTF-8
+        }
+    }
+
+    // There must not be another combining mark.
+    if (index < segmentLimit) {
+        int32_t cp;
+        int32_t i = index;
+        U8_NEXT(src, i, segmentLimit, cp);
+        uint32_t typeMask = U_GET_GC_MASK(cp);
+        if ((typeMask & U_GC_M_MASK) != 0) {
+            return start;
+        }
+    }
+
+    // Output the rest of the Dutch IJ.
+    ByteSinkUtil::appendUnchanged(src + start, unchanged1, sink, options, edits, errorCode);
+    start += unchanged1;
+    if (doTitleJ) {
+        ByteSinkUtil::appendCodePoint(1, u'J', sink, edits);
+        ++start;
+    }
+    ByteSinkUtil::appendUnchanged(src + start, unchanged2, sink, options, edits, errorCode);
+
+    U_ASSERT(start + unchanged2 == index);
+    return index;
+}
+
+}  // namespace
+
  U_CFUNC void U_CALLCONV
  ucasemap_internalUTF8ToTitle(
          int32_t caseLocale, uint32_t options, BreakIterator *iter,
@@ -505,18 +595,13 @@ ucasemap_internalUTF8ToTitle(
  
                  /* Special case Dutch IJ titlecasing */
                  if (titleStart+1 < index &&
-                        caseLocale == UCASE_LOC_DUTCH &&
-                        (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) {
-                    if (src[titleStart+1] == 0x006A) {
-                        ByteSinkUtil::appendCodePoint(1, 0x004A, sink, edits);
-                        titleLimit++;
-                    } else if (src[titleStart+1] == 0x004A) {
-                        // Keep the capital J from getting lowercased.
-                        if (!ByteSinkUtil::appendUnchanged(src+titleStart+1, 1,
-                                                           sink, options, edits, errorCode)) {
-                            return;
-                        }
-                        titleLimit++;
+                    caseLocale == UCASE_LOC_DUTCH) {
+                    if (c < 0) {
+                        c = ~c;
+                    }
+
+                    if (c == u'I' || c == u'Í') {
+                        titleLimit = maybeTitleDutchIJ(src, c, titleLimit, index, sink, options, edits, errorCode);
                      }
                  }
  
diff --git a/icu4c/source/common/ustrcase.cpp b/icu4c/source/common/ustrcase.cpp

index 36b19e75f2d7ae0ba295a80e912242408cff6cd8..acd37a598ab12c4929b2dd096060417fd3336f93 100644 (file)
--- a/icu4c/source/common/ustrcase.cpp
+++ b/icu4c/source/common/ustrcase.cpp
@@ -36,6 +36,12 @@
  #include "ustr_imp.h"
  #include "uassert.h"
  
+/**
+ * UTF-16 code unit (char16_t literal) for U+0301 COMBINING ACUTE ACCENT
+ * @internal
+ */
+#define ACUTE u'\u0301'
+
  U_NAMESPACE_BEGIN
  
  namespace {
@@ -396,6 +402,93 @@ U_NAMESPACE_USE
  
  #if !UCONFIG_NO_BREAK_ITERATION
  
+namespace {
+
+/**
+ * Input: c is a letter I with or without acute accent.
+ * start is the index in src after c, and is less than segmentLimit.
+ * If a plain i/I is followed by a plain j/J,
+ * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute,
+ * and no other combining mark follows, then we output the rest of the digraph accordingly.
+ *
+ * @return the src index after the titlecased sequence, or the start index if no Dutch IJ
+ */
+int32_t maybeTitleDutchIJ(const UChar *src, UChar32 c, int32_t start, int32_t segmentLimit,
+                          UChar *dest, int32_t &destIndex, int32_t destCapacity, uint32_t options,
+                          icu::Edits *edits) {
+
+    int32_t index = start;
+    bool withAcute = false;
+
+    // If the conditions are met, then the following variables tell us what to output.
+    int32_t unchanged1 = 0;  // code units before the j, or the whole sequence (0..3)
+    bool doTitleJ = false;  // true if the j needs to be titlecased
+    int32_t unchanged2 = 0;  // after the j (0 or 1)
+
+    // next character after the first letter
+    UChar c2 = src[index++];
+
+    // Is the first letter an i/I with accent?
+    if (c == u'I') {
+        if (c2 == ACUTE) {
+            withAcute = true;
+            unchanged1 = 1;  // ACUTE is 1 code unit in UTF-16
+            if (index == segmentLimit) { return start; }
+            c2 = src[index++];
+        }
+    } else {  // Í
+        withAcute = true;
+    }
+
+    // Is the next character a j/J?
+    if (c2 == u'j') {
+        doTitleJ = true;
+    } else if (c2 == u'J') {
+        ++unchanged1;  // the J itself stays as is
+    } else {
+        return start;
+    }
+
+    // A plain i/I must be followed by a plain j/J.
+    // An i/I with acute must be followed by a j/J with acute.
+    if (withAcute) {
+        if (index == segmentLimit || src[index++] != ACUTE) { return start; }
+        if (doTitleJ) {
+            unchanged2 = 1;  // the acute after the titlecased j
+        } else {
+            ++unchanged1;  // the acute after the unchanged J
+        }
+    }
+
+    // There must not be another combining mark.
+    if (index < segmentLimit) {
+        int32_t cp;
+        int32_t i = index;  // look ahead on a copy so that index is not advanced
+        U16_NEXT(src, i, segmentLimit, cp);
+        uint32_t typeMask = U_GET_GC_MASK(cp);
+        if ((typeMask & U_GC_M_MASK) != 0) {
+            return start;
+        }
+    }
+    // NOTE(review): the append results below are not checked for a negative destIndex here; confirm the caller does.
+    // Output the rest of the Dutch IJ.
+    destIndex = appendUnchanged(dest, destIndex, destCapacity, src + start, unchanged1, options, edits);
+    start += unchanged1;
+    if (doTitleJ) {
+        destIndex = appendUChar(dest, destIndex, destCapacity, u'J');
+        if (edits != nullptr) {
+            edits->addReplace(1, 1);  // one code unit (j) replaced by one code unit (J)
+        }
+        ++start;
+    }
+    destIndex = appendUnchanged(dest, destIndex, destCapacity, src + start, unchanged2, options, edits);
+
+    U_ASSERT(start + unchanged2 == index);
+    return index;
+}
+
+}  // namespace
+
  U_CFUNC int32_t U_CALLCONV
  ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *iter,
                           UChar *dest, int32_t destCapacity,
@@ -412,14 +505,14 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it
      csc.limit=srcLength;
      int32_t destIndex=0;
      int32_t prev=0;
-    UBool isFirstIndex=TRUE;
+    bool isFirstIndex=true;
  
      /* titlecasing loop */
      while(prev<srcLength) {
          /* find next index where to titlecase */
          int32_t index;
          if(isFirstIndex) {
-            isFirstIndex=FALSE;
+            isFirstIndex=false;
              index=iter->first();
          } else {
              index=iter->next();
@@ -446,7 +539,7 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it
                  // Stop with titleStart<titleLimit<=index
                  // if there is a character to be titlecased,
                  // or else stop with titleStart==titleLimit==index.
-                UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
+                bool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
                  while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
                      titleStart=titleLimit;
                      if(titleLimit==index) {
@@ -479,27 +572,15 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it
  
                  /* Special case Dutch IJ titlecasing */
                  if (titleStart+1 < index &&
-                        caseLocale == UCASE_LOC_DUTCH &&
-                        (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) {
-                    if (src[titleStart+1] == 0x006A) {
-                        destIndex=appendUChar(dest, destIndex, destCapacity, 0x004A);
-                        if(destIndex<0) {
-                            errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
-                            return 0;
-                        }
-                        if(edits!=NULL) {
-                            edits->addReplace(1, 1);
-                        }
-                        titleLimit++;
-                    } else if (src[titleStart+1] == 0x004A) {
-                        // Keep the capital J from getting lowercased.
-                        destIndex=appendUnchanged(dest, destIndex, destCapacity,
-                                                  src+titleStart+1, 1, options, edits);
-                        if(destIndex<0) {
-                            errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
-                            return 0;
-                        }
-                        titleLimit++;
+                        caseLocale == UCASE_LOC_DUTCH) {
+                    if (c < 0) {
+                        c = ~c;
+                    }
+
+                    if (c == u'I' || c == u'Í') {
+                        titleLimit = maybeTitleDutchIJ(src, c, titleStart + 1, index, 
+                                                       dest, destIndex, destCapacity, options, 
+                                                       edits);
                      }
                  }
  
diff --git a/icu4c/source/test/intltest/strcase.cpp b/icu4c/source/test/intltest/strcase.cpp

index 006bcd64ed3f07f6b87c3fa6d358d29af008f6a0..14df2a36bdb4f626204e9f70de06f72641199e5a 100644 (file)
--- a/icu4c/source/test/intltest/strcase.cpp
+++ b/icu4c/source/test/intltest/strcase.cpp
@@ -51,6 +51,7 @@ public:
                          void *iter, const char *localeID, uint32_t options);
      void TestCasing();
      void TestTitleOptions();
+    void TestDutchTitle();
      void TestFullCaseFoldingIterator();
      void TestGreekUpper();
      void TestArmenian();
@@ -95,6 +96,7 @@ StringCaseTest::runIndexedTest(int32_t index, UBool exec, const char *&name, cha
  #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILE_IO && !UCONFIG_NO_LEGACY_CONVERSION
      TESTCASE_AUTO(TestCasing);
      TESTCASE_AUTO(TestTitleOptions);
+    TESTCASE_AUTO(TestDutchTitle);
  #endif
      TESTCASE_AUTO(TestFullCaseFoldingIterator);
      TESTCASE_AUTO(TestGreekUpper);
@@ -451,6 +453,7 @@ StringCaseTest::TestCasingImpl(const UnicodeString &input,
      }
      if(result!=output) {
          dataerrln("error: UnicodeString.%s() got a wrong result for a test case from casing.res", name);
+        dataerrln(UnicodeString("input = [") + input + "], expected = [" + output + "], actual = [" + result + "]");
      }
  #if !UCONFIG_NO_BREAK_ITERATION
      if(whichCase==TEST_TITLE && options==0) {
@@ -667,6 +670,104 @@ StringCaseTest::TestTitleOptions() {
  #endif
  }
  
+#if !UCONFIG_NO_BREAK_ITERATION
+void StringCaseTest::TestDutchTitle() {  // Titlecasing of the Dutch ij digraph, plain and with acute accents.
+    IcuTestErrorCode errorCode(*this, "TestDutchTitle");
+
+    Locale nl("nl");  // Dutch
+    LocalPointer<BreakIterator> iter(
+        BreakIterator::createWordInstance(nl, errorCode));
+    
+    // Dutch titlecase check in English
+    TestCasingImpl(
+        u"ijssel igloo IJMUIDEN",
+        u"Ijssel Igloo Ijmuiden",
+        TEST_TITLE,
+        nullptr, 
+        "en",
+        0);
+
+    // Dutch titlecase check in Dutch
+    TestCasingImpl(
+        u"ijssel igloo IJMUIDEN", 
+        u"IJssel Igloo IJmuiden", 
+        TEST_TITLE,
+        nullptr, 
+        "nl",
+        0);
+
+    // Dutch titlecase check in Dutch with nolowercase option
+    if (U_SUCCESS(errorCode)) {
+        iter->setText(u"ijssel igloo IjMUIdEN iPoD ijenough");  // NOTE(review): iter is never passed on; the call below gets nullptr - confirm intent
+        TestCasingImpl(
+            u"ijssel igloo IjMUIdEN iPoD ijenough", 
+            u"IJssel Igloo IJMUIdEN IPoD IJenough", 
+            TEST_TITLE,
+            nullptr, 
+            "nl",
+            U_TITLECASE_NO_LOWERCASE);
+    }
+
+    errorCode.reset();
+
+    // Accented IJ testing
+
+    struct dutchTitleTestCase {
+        const UnicodeString input;
+        const UnicodeString expectedFull;  // expected result with U_TITLECASE_NO_LOWERCASE only
+        const UnicodeString expectedOnlyChanged;  // expected result when U_OMIT_UNCHANGED_TEXT is also set
+    } dutchTitleTestCases[] = {
+        // input,            expectedFull,      expectedOnlyChanged
+        {u"ij",              u"IJ",             u"IJ"},
+        {u"IJ",              u"IJ",             u""},
+        {u"íj́",              u"ÍJ́",             u"ÍJ"},
+        {u"ÍJ́",              u"ÍJ́",             u""},
+        {u"íJ́",              u"ÍJ́",             u"Í"},
+        {u"Ij́",              u"Ij́",             u""},
+        {u"ij́",              u"Ij́",             u"I"},
+        {u"ïj́",              u"Ïj́",             u"Ï"},
+        {u"íj\u0308",        u"Íj\u0308",       u"Í"},
+        {u"íj́\U0001D16E",    u"Íj́\U0001D16E",   u"Í"},
+        {u"íj\u1ABE",        u"Íj\u1ABE",       u"Í"},
+
+        {u"ijabc",              u"IJabc",             u"IJ"},
+        {u"IJabc",              u"IJabc",             u""},
+        {u"íj́abc",              u"ÍJ́abc",             u"ÍJ"},
+        {u"ÍJ́abc",              u"ÍJ́abc",             u""},
+        {u"íJ́abc",              u"ÍJ́abc",             u"Í"},
+        {u"Ij́abc",              u"Ij́abc",             u""},
+        {u"ij́abc",              u"Ij́abc",             u"I"},
+        {u"ïj́abc",              u"Ïj́abc",             u"Ï"},
+        {u"íjabc\u0308",        u"Íjabc\u0308",       u"Í"},
+        {u"íj́abc\U0001D16E",    u"ÍJ́abc\U0001D16E",   u"ÍJ"},
+        {u"íjabc\u1ABE",        u"Íjabc\u1ABE",       u"Í"},
+    };
+
+    for (const auto& cas : dutchTitleTestCases) {
+        const UnicodeString &input = cas.input;
+        const UnicodeString &expectedFull = cas.expectedFull;
+        const UnicodeString &expectedOnlyChanged = cas.expectedOnlyChanged;
+
+        for (const auto& isOnlyChanged : {true, false}) {  // run every case with and without U_OMIT_UNCHANGED_TEXT
+            uint32_t testOptions = U_TITLECASE_NO_LOWERCASE
+                | (isOnlyChanged ? U_OMIT_UNCHANGED_TEXT : 0);
+            
+            const UnicodeString &expected = isOnlyChanged ? expectedOnlyChanged : expectedFull;
+            
+            TestCasingImpl(
+                input,
+                expected,
+                TEST_TITLE,
+                nullptr,
+                "nl",
+                testOptions
+            );
+        }
+        
+    }
+}
+#endif
+
  void
  StringCaseTest::TestFullCaseFoldingIterator() {
      UnicodeString ffi=UNICODE_STRING_SIMPLE("ffi");
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMapImpl.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMapImpl.java

index 9c48035acc6dc8ef1bc2e84ed38b51b92ac010b3..052e52c592f4806ef9eec09d9bc02d0d5f95dabf 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMapImpl.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMapImpl.java
@@ -70,6 +70,10 @@ public final class CaseMapImpl {
              cpStart=cpLimit=limit;
          }
  
+        public void moveTo(int i) {
+            cpStart=cpLimit=i;  // both code point boundaries are set to i
+        }
+
          /**
           * Iterate forward through the string to fetch the next code point
           * to be case-mapped, and set the context indexes for it.
@@ -189,6 +193,13 @@ public final class CaseMapImpl {
          return options | newOption;
      }
  
+    private static final char ACUTE = '\u0301';
+
+    private static final int U_GC_M_MASK =
+            (1 << UCharacterCategory.NON_SPACING_MARK) |
+            (1 << UCharacterCategory.COMBINING_SPACING_MARK) |
+            (1 << UCharacterCategory.ENCLOSING_MARK);
+
      private static final int LNS =
              (1 << UCharacterCategory.UPPERCASE_LETTER) |
              (1 << UCharacterCategory.LOWERCASE_LETTER) |
@@ -726,34 +737,25 @@ public final class CaseMapImpl {
                      }
  
                      if(titleStart<index) {
-                        int titleLimit=iter.getCPLimit();
                          // titlecase c which is from [titleStart..titleLimit[
                          c = UCaseProps.INSTANCE.toFullTitle(c, iter, dest, caseLocale);
                          appendResult(c, dest, iter.getCPLength(), options, edits);
  
                          // Special case Dutch IJ titlecasing
+                        int titleLimit;
                          if (titleStart+1 < index && caseLocale == UCaseProps.LOC_DUTCH) {
-                            char c1 = src.charAt(titleStart);
-                            if ((c1 == 'i' || c1 == 'I')) {
-                                char c2 = src.charAt(titleStart+1);
-                                if (c2 == 'j') {
-                                    dest.append('J');
-                                    if (edits != null) {
-                                        edits.addReplace(1, 1);
-                                    }
-                                    c = iter.nextCaseMapCP();
-                                    titleLimit++;
-                                    assert c == c2;
-                                    assert titleLimit == iter.getCPLimit();
-                                } else if (c2 == 'J') {
-                                    // Keep the capital J from getting lowercased.
-                                    appendUnchanged(src, titleStart + 1, 1, dest, options, edits);
-                                    c = iter.nextCaseMapCP();
-                                    titleLimit++;
-                                    assert c == c2;
-                                    assert titleLimit == iter.getCPLimit();
-                                }
+                            if (c < 0) {
+                                c = ~c;
+                            }
+                            if (c == 'I' || c == 'Í') {
+                                titleLimit = maybeTitleDutchIJ(src, c, titleStart + 1, index, dest, options, edits);
+                                iter.moveTo(titleLimit);
+                            }
+                            else {
+                                titleLimit = iter.getCPLimit();
                              }
+                        } else {
+                            titleLimit = iter.getCPLimit();
                          }
  
                          // lowercase [titleLimit..index[
@@ -779,6 +781,82 @@ public final class CaseMapImpl {
          }
      }
  
+    /**
+     * Input: c is a letter I with or without acute accent.
+     * start is the index in src after c, and is less than segmentLimit.
+     * If a plain i/I is followed by a plain j/J,
+     * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute,
+     * and no other combining mark follows, then we output the rest of the digraph accordingly.
+     *
+     * @return the src index after the titlecased sequence, or the start index if no Dutch IJ
+     * @throws IOException if appending to dest throws it
+     */
+    private static <A extends Appendable> int maybeTitleDutchIJ(
+            CharSequence src, int c, int start, int segmentLimit,
+            A dest, int options, Edits edits) throws IOException {
+        int index = start;
+        boolean withAcute = false;
+
+        // If the conditions are met, then the following variables tell us what to output.
+        int unchanged1 = 0;  // code units before the j, or the whole sequence (0..3)
+        boolean doTitleJ = false;  // true if the j needs to be titlecased
+        int unchanged2 = 0;  // after the j (0 or 1)
+
+        // next character after the first letter
+        char c2 = src.charAt(index++);
+
+        // Is the first letter an i/I with accent?
+        if (c == 'I') {
+            if (c2 == ACUTE) {
+                withAcute = true;
+                unchanged1 = 1;  // ACUTE is 1 code unit in UTF-16
+                if (index == segmentLimit) { return start; }
+                c2 = src.charAt(index++);
+            }
+        } else {  // Í
+            withAcute = true;
+        }
+        // Is the next character a j/J?
+        if (c2 == 'j') {
+            doTitleJ = true;
+        } else if (c2 == 'J') {
+            ++unchanged1;  // the J itself stays as is
+        } else {
+            return start;
+        }
+        // A plain i/I must be followed by a plain j/J.
+        // An i/I with acute must be followed by a j/J with acute.
+        if (withAcute) {
+            if (index == segmentLimit || src.charAt(index++) != ACUTE) { return start; }
+            if (doTitleJ) {
+                unchanged2 = 1;  // the acute after the titlecased j
+            } else {
+                ++unchanged1;  // the acute after the unchanged J
+            }
+        }
+        // There must not be another combining mark.
+        if (index < segmentLimit) {
+            int cp = Character.codePointAt(src, index);
+            int bit = 1 << UCharacter.getType(cp);
+            if ((bit & U_GC_M_MASK) != 0) {
+                return start;
+            }
+        }
+        // Output the rest of the Dutch IJ.
+        appendUnchanged(src, start, unchanged1, dest, options, edits);
+        start += unchanged1;
+        if (doTitleJ) {
+            dest.append('J');
+            if (edits != null) {
+                edits.addReplace(1, 1);  // one char (j) replaced by one char (J)
+            }
+            ++start;
+        }
+        appendUnchanged(src, start, unchanged2, dest, options, edits);
+        assert start + unchanged2 == index;
+        return index;
+    }
+
      public static String fold(int options, CharSequence src) {
          if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) {
              if (src.length() == 0) {
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java

index 4562ee9a27047eaa572aec6d8a09749fb1e0060d..f56f2950e31d3fa39a04214483f50bb2926be199 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java
@@ -13,6 +13,7 @@ package com.ibm.icu.dev.test.lang;
  
  import java.io.BufferedReader;
  import java.util.ArrayList;
+import java.util.Arrays;
  import java.util.Collections;
  import java.util.List;
  import java.util.Locale;
@@ -23,6 +24,7 @@ import org.junit.runners.JUnit4;
  
  import com.ibm.icu.dev.test.TestFmwk;
  import com.ibm.icu.dev.test.TestUtil;
+import com.ibm.icu.impl.CaseMapImpl;
  import com.ibm.icu.impl.Utility;
  import com.ibm.icu.lang.UCharacter;
  import com.ibm.icu.lang.UProperty;
@@ -467,6 +469,67 @@ public final class UCharacterCaseTest extends TestFmwk
          assertEquals("Dutch titlecase check in Dutch with nolowercase option",
                  "IJssel Igloo IJMUIdEN IPoD IJenough",
                  UCharacter.toTitleCase(LOC_DUTCH, "ijssel igloo IjMUIdEN iPoD ijenough", iter, options));
+
+        // Accented IJ testing
+
+        String[][] dutchIJCasesData = {
+                // input,           expectedFull,     expOnlyChanged
+                {"ij",              "IJ",             "IJ"},
+                {"IJ",              "IJ",             ""},
+                {"íj́",              "ÍJ́",             "ÍJ"},
+                {"ÍJ́",              "ÍJ́",             ""},
+                {"íJ́",              "ÍJ́",             "Í"},
+                {"Ij́",              "Ij́",             ""},
+                {"ij́",              "Ij́",             "I"},
+                {"ïj́",              "Ïj́",             "Ï"},
+                {"íj\u0308",        "Íj\u0308",       "Í"},
+                {"íj́\uD834\uDD6E",  "Íj́\uD834\uDD6E", "Í"}, // \uD834\uDD6E == \U0001D16E
+                {"íj\u1ABE",        "Íj\u1ABE",       "Í"},
+
+                {"ijabc",              "IJabc",             "IJ"},
+                {"IJabc",              "IJabc",             ""},
+                {"íj́abc",              "ÍJ́abc",             "ÍJ"},
+                {"ÍJ́abc",              "ÍJ́abc",             ""},
+                {"íJ́abc",              "ÍJ́abc",             "Í"},
+                {"Ij́abc",              "Ij́abc",             ""},
+                {"ij́abc",              "Ij́abc",             "I"},
+                {"ïj́abc",              "Ïj́abc",             "Ï"},
+                {"íjabc\u0308",        "Íjabc\u0308",       "Í"},
+                {"íj́abc\uD834\uDD6E",  "ÍJ́abc\uD834\uDD6E", "ÍJ"},
+                {"íjabc\u1ABE",        "Íjabc\u1ABE",       "Í"},
+        };
+
+        for (String[] caseDatum : dutchIJCasesData) {
+            String input = caseDatum[0];
+            String expectedFull = caseDatum[1];
+            String expectedOnlyChanged = caseDatum[2];
+
+            for (boolean isOnlyChanged : Arrays.asList(true, false)) {
+                String testMsg = "Dutch accented ij"
+                        + (isOnlyChanged ? ", only changes" : "");
+
+                int testOptions = UCharacter.TITLECASE_NO_LOWERCASE
+                        | (isOnlyChanged ? CaseMapImpl.OMIT_UNCHANGED_TEXT : 0);
+
+                CaseMap.Title titleCaseMapBase = CaseMap.toTitle().noLowercase();
+                CaseMap.Title titleCaseMap = isOnlyChanged ? titleCaseMapBase.omitUnchangedText() : titleCaseMapBase;
+
+                String expected = isOnlyChanged ? expectedOnlyChanged : expectedFull;
+
+                // Newer API for title casing
+                StringBuilder resultBuilder = new StringBuilder();
+                Edits edits = new Edits();
+                titleCaseMap.apply(DUTCH_LOCALE_, null, input, resultBuilder, edits);
+                String result = resultBuilder.toString();
+                assertEquals(testMsg + ", [" + input + "]",
+                        expected, result);
+
+                // Older API for title casing (vs. Newer API)
+                String oldApiResult = UCharacter.toTitleCase(LOC_DUTCH, input, null, testOptions);
+                assertEquals(testMsg + ", Title.apply() vs UCharacter.toTitleCase()" + ", [" + input + "]",
+                        result, oldApiResult);
+            }
+        }
      }
  
      @Test
author	Elango <elango@unicode.org>
	Tue, 15 Feb 2022 23:27:24 +0000 (23:27 +0000)
committer	Elango <elango@unicode.org>
	Wed, 16 Feb 2022 17:07:09 +0000 (09:07 -0800)
icu4c/source/common/ucasemap.cpp		patch \| blob \| history
icu4c/source/common/ustrcase.cpp		patch \| blob \| history
icu4c/source/test/intltest/strcase.cpp		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMapImpl.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java		patch \| blob \| history