#if !UCONFIG_NO_BREAK_ITERATION
+namespace {
+
+constexpr uint8_t ACUTE_BYTE0 = u8"\u0301"[0];
+
+constexpr uint8_t ACUTE_BYTE1 = u8"\u0301"[1];
+
+/**
+ * Input: c is a letter I with or without acute accent.
+ * start is the index in src after c, and is less than segmentLimit.
+ * If a plain i/I is followed by a plain j/J,
+ * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute,
+ * then we output accordingly.
+ *
+ * @return the src index after the titlecased sequence, or the start index if no Dutch IJ
+ */
+int32_t maybeTitleDutchIJ(const uint8_t *src, UChar32 c, int32_t start, int32_t segmentLimit,
+ ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
+
+ int32_t index = start;
+ bool withAcute = false;
+
+ // If the conditions are met, then the following variables tell us what to output.
+ int32_t unchanged1 = 0; // code units before the j, or the whole sequence (0..3)
+ bool doTitleJ = false; // true if the j needs to be titlecased
+ int32_t unchanged2 = 0; // after the j (0 or 1)
+
+ // next character after the first letter
+ UChar32 c2;
+ c2 = src[index++];
+
+ // Is the first letter an i/I with accent?
+ if (c == u'I') {
+ if (c2 == ACUTE_BYTE0 && index < segmentLimit && src[index++] == ACUTE_BYTE1) {
+ withAcute = true;
+ unchanged1 = 2; // ACUTE is 2 code units in UTF-8
+ if (index == segmentLimit) { return start; }
+ c2 = src[index++];
+ }
+ } else { // Í
+ withAcute = true;
+ }
+
+ // Is the next character a j/J?
+ if (c2 == u'j') {
+ doTitleJ = true;
+ } else if (c2 == u'J') {
+ ++unchanged1;
+ } else {
+ return start;
+ }
+
+ // A plain i/I must be followed by a plain j/J.
+ // An i/I with acute must be followed by a j/J with acute.
+ if (withAcute) {
+ if ((index + 1) >= segmentLimit || src[index++] != ACUTE_BYTE0 || src[index++] != ACUTE_BYTE1) {
+ return start;
+ }
+ if (doTitleJ) {
+ unchanged2 = 2; // ACUTE is 2 code units in UTF-8
+ } else {
+ unchanged1 = unchanged1 + 2; // ACUTE is 2 code units in UTF-8
+ }
+ }
+
+ // There must not be another combining mark.
+ if (index < segmentLimit) {
+ int32_t cp;
+ int32_t i = index;
+ U8_NEXT(src, i, segmentLimit, cp);
+ uint32_t typeMask = U_GET_GC_MASK(cp);
+ if ((typeMask & U_GC_M_MASK) != 0) {
+ return start;
+ }
+ }
+
+ // Output the rest of the Dutch IJ.
+ ByteSinkUtil::appendUnchanged(src + start, unchanged1, sink, options, edits, errorCode);
+ start += unchanged1;
+ if (doTitleJ) {
+ ByteSinkUtil::appendCodePoint(1, u'J', sink, edits);
+ ++start;
+ }
+ ByteSinkUtil::appendUnchanged(src + start, unchanged2, sink, options, edits, errorCode);
+
+ U_ASSERT(start + unchanged2 == index);
+ return index;
+}
+
+} // namespace
+
U_CFUNC void U_CALLCONV
ucasemap_internalUTF8ToTitle(
int32_t caseLocale, uint32_t options, BreakIterator *iter,
/* Special case Dutch IJ titlecasing */
if (titleStart+1 < index &&
- caseLocale == UCASE_LOC_DUTCH &&
- (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) {
- if (src[titleStart+1] == 0x006A) {
- ByteSinkUtil::appendCodePoint(1, 0x004A, sink, edits);
- titleLimit++;
- } else if (src[titleStart+1] == 0x004A) {
- // Keep the capital J from getting lowercased.
- if (!ByteSinkUtil::appendUnchanged(src+titleStart+1, 1,
- sink, options, edits, errorCode)) {
- return;
- }
- titleLimit++;
+ caseLocale == UCASE_LOC_DUTCH) {
+ if (c < 0) {
+ c = ~c;
+ }
+
+ if (c == u'I' || c == u'Í') {
+ titleLimit = maybeTitleDutchIJ(src, c, titleLimit, index, sink, options, edits, errorCode);
}
}
#include "ustr_imp.h"
#include "uassert.h"
+/**
+ * Code point for COMBINING ACUTE ACCENT
+ * @internal
+ */
+#define ACUTE u'\u0301'
+
U_NAMESPACE_BEGIN
namespace {
#if !UCONFIG_NO_BREAK_ITERATION
+namespace {
+
+/**
+ * Input: c is a letter I with or without acute accent.
+ * start is the index in src after c, and is less than segmentLimit.
+ * If a plain i/I is followed by a plain j/J,
+ * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute,
+ * then we output accordingly.
+ *
+ * @return the src index after the titlecased sequence, or the start index if no Dutch IJ
+ */
+int32_t maybeTitleDutchIJ(const UChar *src, UChar32 c, int32_t start, int32_t segmentLimit,
+ UChar *dest, int32_t &destIndex, int32_t destCapacity, uint32_t options,
+ icu::Edits *edits) {
+
+ int32_t index = start;
+ bool withAcute = false;
+
+ // If the conditions are met, then the following variables tell us what to output.
+ int32_t unchanged1 = 0; // code units before the j, or the whole sequence (0..3)
+ bool doTitleJ = false; // true if the j needs to be titlecased
+ int32_t unchanged2 = 0; // after the j (0 or 1)
+
+ // next character after the first letter
+ UChar c2 = src[index++];
+
+ // Is the first letter an i/I with accent?
+ if (c == u'I') {
+ if (c2 == ACUTE) {
+ withAcute = true;
+ unchanged1 = 1;
+ if (index == segmentLimit) { return start; }
+ c2 = src[index++];
+ }
+ } else { // Í
+ withAcute = true;
+ }
+
+ // Is the next character a j/J?
+ if (c2 == u'j') {
+ doTitleJ = true;
+ } else if (c2 == u'J') {
+ ++unchanged1;
+ } else {
+ return start;
+ }
+
+ // A plain i/I must be followed by a plain j/J.
+ // An i/I with acute must be followed by a j/J with acute.
+ if (withAcute) {
+ if (index == segmentLimit || src[index++] != ACUTE) { return start; }
+ if (doTitleJ) {
+ unchanged2 = 1;
+ } else {
+ ++unchanged1;
+ }
+ }
+
+ // There must not be another combining mark.
+ if (index < segmentLimit) {
+ int32_t cp;
+ int32_t i = index;
+ U16_NEXT(src, i, segmentLimit, cp);
+ uint32_t typeMask = U_GET_GC_MASK(cp);
+ if ((typeMask & U_GC_M_MASK) != 0) {
+ return start;
+ }
+ }
+
+ // Output the rest of the Dutch IJ.
+ destIndex = appendUnchanged(dest, destIndex, destCapacity, src + start, unchanged1, options, edits);
+ start += unchanged1;
+ if (doTitleJ) {
+ destIndex = appendUChar(dest, destIndex, destCapacity, u'J');
+ if (edits != nullptr) {
+ edits->addReplace(1, 1);
+ }
+ ++start;
+ }
+ destIndex = appendUnchanged(dest, destIndex, destCapacity, src + start, unchanged2, options, edits);
+
+ U_ASSERT(start + unchanged2 == index);
+ return index;
+}
+
+} // namespace
+
U_CFUNC int32_t U_CALLCONV
ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *iter,
UChar *dest, int32_t destCapacity,
csc.limit=srcLength;
int32_t destIndex=0;
int32_t prev=0;
- UBool isFirstIndex=TRUE;
+ bool isFirstIndex=true;
/* titlecasing loop */
while(prev<srcLength) {
/* find next index where to titlecase */
int32_t index;
if(isFirstIndex) {
- isFirstIndex=FALSE;
+ isFirstIndex=false;
index=iter->first();
} else {
index=iter->next();
// Stop with titleStart<titleLimit<=index
// if there is a character to be titlecased,
// or else stop with titleStart==titleLimit==index.
- UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
+ bool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
titleStart=titleLimit;
if(titleLimit==index) {
/* Special case Dutch IJ titlecasing */
if (titleStart+1 < index &&
- caseLocale == UCASE_LOC_DUTCH &&
- (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) {
- if (src[titleStart+1] == 0x006A) {
- destIndex=appendUChar(dest, destIndex, destCapacity, 0x004A);
- if(destIndex<0) {
- errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
- return 0;
- }
- if(edits!=NULL) {
- edits->addReplace(1, 1);
- }
- titleLimit++;
- } else if (src[titleStart+1] == 0x004A) {
- // Keep the capital J from getting lowercased.
- destIndex=appendUnchanged(dest, destIndex, destCapacity,
- src+titleStart+1, 1, options, edits);
- if(destIndex<0) {
- errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
- return 0;
- }
- titleLimit++;
+ caseLocale == UCASE_LOC_DUTCH) {
+ if (c < 0) {
+ c = ~c;
+ }
+
+ if (c == u'I' || c == u'Í') {
+ titleLimit = maybeTitleDutchIJ(src, c, titleStart + 1, index,
+ dest, destIndex, destCapacity, options,
+ edits);
}
}
void *iter, const char *localeID, uint32_t options);
void TestCasing();
void TestTitleOptions();
+ void TestDutchTitle();
void TestFullCaseFoldingIterator();
void TestGreekUpper();
void TestArmenian();
#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILE_IO && !UCONFIG_NO_LEGACY_CONVERSION
TESTCASE_AUTO(TestCasing);
TESTCASE_AUTO(TestTitleOptions);
+ TESTCASE_AUTO(TestDutchTitle);
#endif
TESTCASE_AUTO(TestFullCaseFoldingIterator);
TESTCASE_AUTO(TestGreekUpper);
}
if(result!=output) {
dataerrln("error: UnicodeString.%s() got a wrong result for a test case from casing.res", name);
+ dataerrln(UnicodeString("input = [") + input + "], expected = [" + output + "], actual = [" + result + "]");
}
#if !UCONFIG_NO_BREAK_ITERATION
if(whichCase==TEST_TITLE && options==0) {
#endif
}
+#if !UCONFIG_NO_BREAK_ITERATION
+void StringCaseTest::TestDutchTitle() {
+ IcuTestErrorCode errorCode(*this, "TestDutchTitle");
+
+ Locale nl("nl"); // Dutch
+ LocalPointer<BreakIterator> iter(
+ BreakIterator::createWordInstance(nl, errorCode));
+
+ // Dutch titlecase check in English
+ TestCasingImpl(
+ u"ijssel igloo IJMUIDEN",
+ u"Ijssel Igloo Ijmuiden",
+ TEST_TITLE,
+ nullptr,
+ "en",
+ 0);
+
+ // Dutch titlecase check in Dutch
+ TestCasingImpl(
+ u"ijssel igloo IJMUIDEN",
+ u"IJssel Igloo IJmuiden",
+ TEST_TITLE,
+ nullptr,
+ "nl",
+ 0);
+
+ // Dutch titlecase check in Dutch with nolowercase option
+ if (U_SUCCESS(errorCode)) {
+ iter->setText(u"ijssel igloo IjMUIdEN iPoD ijenough");
+ TestCasingImpl(
+ u"ijssel igloo IjMUIdEN iPoD ijenough",
+ u"IJssel Igloo IJMUIdEN IPoD IJenough",
+ TEST_TITLE,
+ nullptr,
+ "nl",
+ U_TITLECASE_NO_LOWERCASE);
+ }
+
+ errorCode.reset();
+
+ // Accented IJ testing
+
+ struct dutchTitleTestCase {
+ const UnicodeString input;
+ const UnicodeString expectedFull;
+ const UnicodeString expectedOnlyChanged;
+ } dutchTitleTestCases[] = {
+ // input, expectedFull, expectedOnlyChanged
+ {u"ij", u"IJ", u"IJ"},
+ {u"IJ", u"IJ", u""},
+ {u"íj́", u"ÍJ́", u"ÍJ"},
+ {u"ÍJ́", u"ÍJ́", u""},
+ {u"íJ́", u"ÍJ́", u"Í"},
+ {u"Ij́", u"Ij́", u""},
+ {u"ij́", u"Ij́", u"I"},
+ {u"ïj́", u"Ïj́", u"Ï"},
+ {u"íj\u0308", u"Íj\u0308", u"Í"},
+ {u"íj́\U0001D16E", u"Íj́\U0001D16E", u"Í"},
+ {u"íj\u1ABE", u"Íj\u1ABE", u"Í"},
+
+ {u"ijabc", u"IJabc", u"IJ"},
+ {u"IJabc", u"IJabc", u""},
+ {u"íj́abc", u"ÍJ́abc", u"ÍJ"},
+ {u"ÍJ́abc", u"ÍJ́abc", u""},
+ {u"íJ́abc", u"ÍJ́abc", u"Í"},
+ {u"Ij́abc", u"Ij́abc", u""},
+ {u"ij́abc", u"Ij́abc", u"I"},
+ {u"ïj́abc", u"Ïj́abc", u"Ï"},
+ {u"íjabc\u0308", u"Íjabc\u0308", u"Í"},
+ {u"íj́abc\U0001D16E", u"ÍJ́abc\U0001D16E", u"ÍJ"},
+ {u"íjabc\u1ABE", u"Íjabc\u1ABE", u"Í"},
+ };
+
+ for (const auto& cas : dutchTitleTestCases) {
+ const UnicodeString &input = cas.input;
+ const UnicodeString &expectedFull = cas.expectedFull;
+ const UnicodeString &expectedOnlyChanged = cas.expectedOnlyChanged;
+
+ for (const auto& isOnlyChanged : {true, false}) {
+ uint32_t testOptions = U_TITLECASE_NO_LOWERCASE
+ | (isOnlyChanged ? U_OMIT_UNCHANGED_TEXT : 0);
+
+ const UnicodeString &expected = isOnlyChanged ? expectedOnlyChanged : expectedFull;
+
+ TestCasingImpl(
+ input,
+ expected,
+ TEST_TITLE,
+ nullptr,
+ "nl",
+ testOptions
+ );
+ }
+
+ }
+}
+#endif
+
void
StringCaseTest::TestFullCaseFoldingIterator() {
UnicodeString ffi=UNICODE_STRING_SIMPLE("ffi");
cpStart=cpLimit=limit;
}
+ public void moveTo(int i) {
+ cpStart=cpLimit=i;
+ }
+
/**
* Iterate forward through the string to fetch the next code point
* to be case-mapped, and set the context indexes for it.
return options | newOption;
}
+ private static final char ACUTE = '\u0301';
+
+ private static final int U_GC_M_MASK =
+ (1 << UCharacterCategory.NON_SPACING_MARK) |
+ (1 << UCharacterCategory.COMBINING_SPACING_MARK) |
+ (1 << UCharacterCategory.ENCLOSING_MARK);
+
private static final int LNS =
(1 << UCharacterCategory.UPPERCASE_LETTER) |
(1 << UCharacterCategory.LOWERCASE_LETTER) |
}
if(titleStart<index) {
- int titleLimit=iter.getCPLimit();
// titlecase c which is from [titleStart..titleLimit[
c = UCaseProps.INSTANCE.toFullTitle(c, iter, dest, caseLocale);
appendResult(c, dest, iter.getCPLength(), options, edits);
// Special case Dutch IJ titlecasing
+ int titleLimit;
if (titleStart+1 < index && caseLocale == UCaseProps.LOC_DUTCH) {
- char c1 = src.charAt(titleStart);
- if ((c1 == 'i' || c1 == 'I')) {
- char c2 = src.charAt(titleStart+1);
- if (c2 == 'j') {
- dest.append('J');
- if (edits != null) {
- edits.addReplace(1, 1);
- }
- c = iter.nextCaseMapCP();
- titleLimit++;
- assert c == c2;
- assert titleLimit == iter.getCPLimit();
- } else if (c2 == 'J') {
- // Keep the capital J from getting lowercased.
- appendUnchanged(src, titleStart + 1, 1, dest, options, edits);
- c = iter.nextCaseMapCP();
- titleLimit++;
- assert c == c2;
- assert titleLimit == iter.getCPLimit();
- }
+ if (c < 0) {
+ c = ~c;
+ }
+ if (c == 'I' || c == 'Í') {
+ titleLimit = maybeTitleDutchIJ(src, c, titleStart + 1, index, dest, options, edits);
+ iter.moveTo(titleLimit);
+ }
+ else {
+ titleLimit = iter.getCPLimit();
}
+ } else {
+ titleLimit = iter.getCPLimit();
}
// lowercase [titleLimit..index[
}
}
+ /**
+ * Input: c is a letter I with or without acute accent.
+ * start is the index in src after c, and is less than segmentLimit.
+ * If a plain i/I is followed by a plain j/J,
+ * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute,
+ * then we output accordingly.
+ *
+ * @return the src index after the titlecased sequence, or the start index if no Dutch IJ
+ * @throws IOException
+ */
+ private static <A extends Appendable> int maybeTitleDutchIJ(
+ CharSequence src, int c, int start, int segmentLimit,
+ A dest, int options, Edits edits) throws IOException {
+ int index = start;
+ boolean withAcute = false;
+
+ // If the conditions are met, then the following variables tell us what to output.
+ int unchanged1 = 0; // code units before the j, or the whole sequence (0..3)
+ boolean doTitleJ = false; // true if the j needs to be titlecased
+ int unchanged2 = 0; // after the j (0 or 1)
+
+ // next character after the first letter
+ char c2 = src.charAt(index++);
+
+ // Is the first letter an i/I with accent?
+ if (c == 'I') {
+ if (c2 == ACUTE) {
+ withAcute = true;
+ unchanged1 = 1;
+ if (index == segmentLimit) { return start; }
+ c2 = src.charAt(index++);
+ }
+ } else { // Í
+ withAcute = true;
+ }
+ // Is the next character a j/J?
+ if (c2 == 'j') {
+ doTitleJ = true;
+ } else if (c2 == 'J') {
+ ++unchanged1;
+ } else {
+ return start;
+ }
+ // A plain i/I must be followed by a plain j/J.
+ // An i/I with acute must be followed by a j/J with acute.
+ if (withAcute) {
+ if (index == segmentLimit || src.charAt(index++) != ACUTE) { return start; }
+ if (doTitleJ) {
+ unchanged2 = 1;
+ } else {
+ ++unchanged1;
+ }
+ }
+ // There must not be another combining mark.
+ if (index < segmentLimit) {
+ int cp = Character.codePointAt(src, index);
+ int bit = 1 << UCharacter.getType(cp);
+ if ((bit & U_GC_M_MASK) != 0) {
+ return start;
+ }
+ }
+ // Output the rest of the Dutch IJ.
+ appendUnchanged(src, start, unchanged1, dest, options, edits);
+ start += unchanged1;
+ if (doTitleJ) {
+ dest.append('J');
+ if (edits != null) {
+ edits.addReplace(1, 1);
+ }
+ ++start;
+ }
+ appendUnchanged(src, start, unchanged2, dest, options, edits);
+ assert start + unchanged2 == index;
+ return index;
+ }
+
public static String fold(int options, CharSequence src) {
if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) {
if (src.length() == 0) {
import java.io.BufferedReader;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.dev.test.TestUtil;
+import com.ibm.icu.impl.CaseMapImpl;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
assertEquals("Dutch titlecase check in Dutch with nolowercase option",
"IJssel Igloo IJMUIdEN IPoD IJenough",
UCharacter.toTitleCase(LOC_DUTCH, "ijssel igloo IjMUIdEN iPoD ijenough", iter, options));
+
+ // Accented IJ testing
+
+ String[][] dutchIJCasesData = {
+ // input, expectedFull, expOnlyChanged
+ {"ij", "IJ", "IJ"},
+ {"IJ", "IJ", ""},
+ {"íj́", "ÍJ́", "ÍJ"},
+ {"ÍJ́", "ÍJ́", ""},
+ {"íJ́", "ÍJ́", "Í"},
+ {"Ij́", "Ij́", ""},
+ {"ij́", "Ij́", "I"},
+ {"ïj́", "Ïj́", "Ï"},
+ {"íj\u0308", "Íj\u0308", "Í"},
+ {"íj́\uD834\uDD6E", "Íj́\uD834\uDD6E", "Í"}, // \uD834\uDD6E == \U0001D16E
+ {"íj\u1ABE", "Íj\u1ABE", "Í"},
+
+ {"ijabc", "IJabc", "IJ"},
+ {"IJabc", "IJabc", ""},
+ {"íj́abc", "ÍJ́abc", "ÍJ"},
+ {"ÍJ́abc", "ÍJ́abc", ""},
+ {"íJ́abc", "ÍJ́abc", "Í"},
+ {"Ij́abc", "Ij́abc", ""},
+ {"ij́abc", "Ij́abc", "I"},
+ {"ïj́abc", "Ïj́abc", "Ï"},
+ {"íjabc\u0308", "Íjabc\u0308", "Í"},
+ {"íj́abc\uD834\uDD6E", "ÍJ́abc\uD834\uDD6E", "ÍJ"},
+ {"íjabc\u1ABE", "Íjabc\u1ABE", "Í"},
+ };
+
+ for (String[] caseDatum : dutchIJCasesData) {
+ String input = caseDatum[0];
+ String expectedFull = caseDatum[1];
+ String expectedOnlyChanged = caseDatum[2];
+
+ for (boolean isOnlyChanged : Arrays.asList(true, false)) {
+ String testMsg = "Dutch accented ij"
+ + (isOnlyChanged ? ", only changes" : "");
+
+ int testOptions = UCharacter.TITLECASE_NO_LOWERCASE
+ | (isOnlyChanged ? CaseMapImpl.OMIT_UNCHANGED_TEXT : 0);
+
+ CaseMap.Title titleCaseMapBase = CaseMap.toTitle().noLowercase();
+ CaseMap.Title titleCaseMap = isOnlyChanged ? titleCaseMapBase.omitUnchangedText() : titleCaseMapBase;
+
+ String expected = isOnlyChanged ? expectedOnlyChanged : expectedFull;
+
+ // Newer API for title casing
+ StringBuilder resultBuilder = new StringBuilder();
+ Edits edits = new Edits();
+ titleCaseMap.apply(DUTCH_LOCALE_, null, input, resultBuilder, edits);
+ String result = resultBuilder.toString();
+ assertEquals(testMsg + ", [" + input + "]",
+ expected, result);
+
+ // Older API for title casing (vs. Newer API)
+ String oldApiResult = UCharacter.toTitleCase(LOC_DUTCH, input, null, testOptions);
+ assertEquals(testMsg + ", Title.apply() vs UCharacter.toTitleCase()" + ", [" + input + "]",
+ result, oldApiResult);
+ }
+ }
}
@Test