From cfef2fb339a0934fdda1dbbf6e5be153a74008d8 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Fri, 9 Jun 2017 23:04:03 +0000 Subject: [PATCH] ICU-9198 add titlecasing options: wholeString, sentences, adjustToCased X-SVN-Rev: 40164 --- icu4c/source/common/ucase.h | 8 +- icu4c/source/common/ucasemap.cpp | 47 ++-- icu4c/source/common/ucasemap_imp.h | 55 ++++- .../common/ucasemap_titlecase_brkiter.cpp | 18 +- icu4c/source/common/unicode/casemap.h | 10 +- icu4c/source/common/unicode/stringoptions.h | 82 +++++-- icu4c/source/common/unicode/ucasemap.h | 2 +- icu4c/source/common/unicode/unistr.h | 6 +- .../common/unistr_titlecase_brkiter.cpp | 31 ++- .../source/common/ustr_titlecase_brkiter.cpp | 199 +++++++++++++--- icu4c/source/common/ustrcase.cpp | 47 ++-- icu4c/source/test/intltest/strcase.cpp | 56 +++++ .../src/com/ibm/icu/impl/CaseMapImpl.java | 220 ++++++++++++++++-- .../ibm/icu/impl/LocaleDisplayNamesImpl.java | 15 +- .../core/src/com/ibm/icu/impl/UCaseProps.java | 8 +- .../core/src/com/ibm/icu/lang/UCharacter.java | 70 ++---- .../core/src/com/ibm/icu/text/CaseMap.java | 89 +++++-- .../icu/dev/test/lang/UCharacterCaseTest.java | 57 +++++ .../dev/test/translit/TransliteratorTest.java | 52 +++-- 19 files changed, 830 insertions(+), 242 deletions(-) diff --git a/icu4c/source/common/ucase.h b/icu4c/source/common/ucase.h index 0240641132d..9d6365eadfc 100644 --- a/icu4c/source/common/ucase.h +++ b/icu4c/source/common/ucase.h @@ -69,10 +69,16 @@ enum { /** * Bit mask for getting just the options from a string compare options word * that are relevant for case folding (of a single string or code point). + * + * Currently only bit 0 for U_FOLD_CASE_EXCLUDE_SPECIAL_I. + * It is conceivable that at some point we might use one more bit for using uppercase sharp s. + * It is conceivable that at some point we might want the option to use only simple case foldings + * when operating on strings. + * * See stringoptions.h. 
* @internal */ -#define _FOLD_CASE_OPTIONS_MASK 0xff +#define _FOLD_CASE_OPTIONS_MASK 7 /* single-code point functions */ diff --git a/icu4c/source/common/ucasemap.cpp b/icu4c/source/common/ucasemap.cpp index c21c4453b76..1f83c0d6a06 100644 --- a/icu4c/source/common/ucasemap.cpp +++ b/icu4c/source/common/ucasemap.cpp @@ -381,7 +381,7 @@ ucasemap_internalUTF8ToTitle( const uint8_t *src, int32_t srcLength, icu::Edits *edits, UErrorCode &errorCode) { - if(U_FAILURE(errorCode)) { + if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) { return 0; } @@ -408,45 +408,38 @@ ucasemap_internalUTF8ToTitle( } /* - * Unicode 4 & 5 section 3.13 Default Case Operations: - * - * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex - * #29, "Text Boundaries." Between each pair of word boundaries, find the first - * cased character F. If F exists, map F to default_title(F); then map each - * subsequent character C to default_lower(C). - * - * In this implementation, segment [prev..index[ into 3 parts: - * a) uncased characters (copy as-is) [prev..titleStart[ - * b) first case letter (titlecase) [titleStart..titleLimit[ + * Segment [prev..index[ into 3 parts: + * a) skipped characters (copy as-is) [prev..titleStart[ + * b) first letter (titlecase) [titleStart..titleLimit[ * c) subsequent characters (lowercase) [titleLimit..index[ */ if(prev &ownedIter, UErrorCode &errorCode); + +#endif + +U_NAMESPACE_END + #include "unicode/unistr.h" // for UStringCaseMapper /* diff --git a/icu4c/source/common/ucasemap_titlecase_brkiter.cpp b/icu4c/source/common/ucasemap_titlecase_brkiter.cpp index a253850fa29..2e09a5548a1 100644 --- a/icu4c/source/common/ucasemap_titlecase_brkiter.cpp +++ b/icu4c/source/common/ucasemap_titlecase_brkiter.cpp @@ -42,11 +42,8 @@ int32_t CaseMap::utf8ToTitle( UText utext=UTEXT_INITIALIZER; utext_openUTF8(&utext, src, srcLength, &errorCode); LocalPointer ownedIter; + iter = ustrcase_getTitleBreakIterator(nullptr, locale, options, iter, 
ownedIter, errorCode); if(iter==NULL) { - iter=BreakIterator::createWordInstance(Locale(locale), errorCode); - ownedIter.adoptInstead(iter); - } - if(U_FAILURE(errorCode)) { utext_close(&utext); return 0; } @@ -88,12 +85,19 @@ ucasemap_utf8ToTitle(UCaseMap *csm, } UText utext=UTEXT_INITIALIZER; utext_openUTF8(&utext, (const char *)src, srcLength, pErrorCode); - if(csm->iter==NULL) { - csm->iter=BreakIterator::createWordInstance(Locale(csm->locale), *pErrorCode); - } if (U_FAILURE(*pErrorCode)) { return 0; } + if(csm->iter==NULL) { + LocalPointer ownedIter; + BreakIterator *iter = ustrcase_getTitleBreakIterator( + nullptr, csm->locale, csm->options, nullptr, ownedIter, *pErrorCode); + if (iter == nullptr) { + utext_close(&utext); + return 0; + } + csm->iter = ownedIter.orphan(); + } csm->iter->setText(&utext, *pErrorCode); int32_t length=ucasemap_mapUTF8( csm->caseLocale, csm->options, csm->iter, diff --git a/icu4c/source/common/unicode/casemap.h b/icu4c/source/common/unicode/casemap.h index 1b8af69a26f..581f1ab532a 100644 --- a/icu4c/source/common/unicode/casemap.h +++ b/icu4c/source/common/unicode/casemap.h @@ -113,7 +113,9 @@ public: * * @param locale The locale ID. ("" = root locale, NULL = default locale.) * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, - * U_TITLECASE_NO_LOWERCASE, U_TITLECASE_NO_BREAK_ADJUSTMENT. + * U_TITLECASE_NO_LOWERCASE, + * U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED, + * U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES. * @param iter A break iterator to find the first characters of words that are to be titlecased. * It is set to the source string (setText()) * and used one or more times for iteration (first() and next()). @@ -272,9 +274,11 @@ public: * * @param locale The locale ID. ("" = root locale, NULL = default locale.) * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, - * U_TITLECASE_NO_LOWERCASE, U_TITLECASE_NO_BREAK_ADJUSTMENT. 
+ * U_TITLECASE_NO_LOWERCASE, + * U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED, + * U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES. * @param iter A break iterator to find the first characters of words that are to be titlecased. - * It is set to the source string (setText()) + * It is set to the source string (setUText()) * and used one or more times for iteration (first() and next()). * If NULL, then a word break iterator for the locale is used * (or something equivalent). diff --git a/icu4c/source/common/unicode/stringoptions.h b/icu4c/source/common/unicode/stringoptions.h index 975e193609d..270b9de691c 100644 --- a/icu4c/source/common/unicode/stringoptions.h +++ b/icu4c/source/common/unicode/stringoptions.h @@ -39,49 +39,101 @@ */ #define U_FOLD_CASE_EXCLUDE_SPECIAL_I 1 +#ifndef U_HIDE_DRAFT_API + +/** + * Titlecase the string as a whole rather than each word. + * (Titlecase only the character at index 0, possibly adjusted.) + * Option bits value for titlecasing APIs that take an options bit set. + * + * It is an error to specify multiple titlecasing iterator options together, + * including both an options bit and an explicit BreakIterator. + * + * @see U_TITLECASE_ADJUST_TO_CASED + * @draft ICU 60 + */ +#define U_TITLECASE_WHOLE_STRING 0x20 + +/** + * Titlecase sentences rather than words. + * (Titlecase only the first character of each sentence, possibly adjusted.) + * Option bits value for titlecasing APIs that take an options bit set. + * + * It is an error to specify multiple titlecasing iterator options together, + * including both an options bit and an explicit BreakIterator. + * + * @see U_TITLECASE_ADJUST_TO_CASED + * @draft ICU 60 + */ +#define U_TITLECASE_SENTENCES 0x40 + +#endif // U_HIDE_DRAFT_API + /** * Do not lowercase non-initial parts of words when titlecasing. * Option bit for titlecasing APIs that take an options bit set. 
* - * By default, titlecasing will titlecase the first cased character - * of a word and lowercase all other characters. + * By default, titlecasing will titlecase the character at each + * (possibly adjusted) BreakIterator index and + * lowercase all other characters up to the next iterator index. * With this option, the other characters will not be modified. * + * @see U_TITLECASE_ADJUST_TO_CASED + * @see UnicodeString::toTitle + * @see CaseMap::toTitle * @see ucasemap_setOptions * @see ucasemap_toTitle * @see ucasemap_utf8ToTitle - * @see UnicodeString::toTitle * @stable ICU 3.8 */ #define U_TITLECASE_NO_LOWERCASE 0x100 /** - * Do not adjust the titlecasing indexes from BreakIterator::next() indexes; + * Do not adjust the titlecasing BreakIterator indexes; * titlecase exactly the characters at breaks from the iterator. * Option bit for titlecasing APIs that take an options bit set. * * By default, titlecasing will take each break iterator index, - * adjust it by looking for the next cased character, and titlecase that one. - * Other characters are lowercased. + * adjust it to the next relevant character (see U_TITLECASE_ADJUST_TO_CASED), + * and titlecase that one. * - * This follows Unicode 4 & 5 section 3.13 Default Case Operations: + * Other characters are lowercased. * - * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex - * #29, "Text Boundaries." Between each pair of word boundaries, find the first - * cased character F. If F exists, map F to default_title(F); then map each - * subsequent character C to default_lower(C). + * It is an error to specify multiple titlecasing adjustment options together. 
* + * @see U_TITLECASE_ADJUST_TO_CASED + * @see U_TITLECASE_NO_LOWERCASE + * @see UnicodeString::toTitle + * @see CaseMap::toTitle * @see ucasemap_setOptions * @see ucasemap_toTitle * @see ucasemap_utf8ToTitle - * @see UnicodeString::toTitle - * @see U_TITLECASE_NO_LOWERCASE * @stable ICU 3.8 */ #define U_TITLECASE_NO_BREAK_ADJUSTMENT 0x200 #ifndef U_HIDE_DRAFT_API +/** + * Adjust each titlecasing BreakIterator index to the next cased character. + * (See the Unicode Standard, chapter 3, Default Case Conversion, R3 toTitlecase(X).) + * Option bit for titlecasing APIs that take an options bit set. + * + * This used to be the default index adjustment in ICU. + * Since ICU 60, the default index adjustment is to the next character that is + * a letter, number, symbol, or private use code point. + * (Uncased modifier letters are skipped.) + * The difference in behavior is small for word titlecasing, + * but the new adjustment is much better for whole-string and sentence titlecasing: + * It yields "49ers" and "«丰(abc)»" instead of "49Ers" and "«丰(Abc)»". + * + * It is an error to specify multiple titlecasing adjustment options together. + * + * @see U_TITLECASE_NO_BREAK_ADJUSTMENT + * @draft ICU 60 + */ +#define U_TITLECASE_ADJUST_TO_CASED 0x400 + /** * Omit unchanged text when recording how source substrings * relate to changed and unchanged result substrings. 
@@ -126,7 +178,9 @@ // // Internal: (may change or be removed) // ucase.h #define _STRCASECMP_OPTIONS_MASK 0xffff -// ucase.h #define _FOLD_CASE_OPTIONS_MASK 0xff +// ucase.h #define _FOLD_CASE_OPTIONS_MASK 7 +// ucasemap_imp.h #define U_TITLECASE_ITERATOR_MASK 0xe0 +// ucasemap_imp.h #define U_TITLECASE_ADJUSTMENT_MASK 0x600 // ustr_imp.h #define _STRNCMP_STYLE 0x1000 // unormcmp.cpp #define _COMPARE_EQUIV 0x80000 diff --git a/icu4c/source/common/unicode/ucasemap.h b/icu4c/source/common/unicode/ucasemap.h index 7c69bdc2076..6b253e3d638 100644 --- a/icu4c/source/common/unicode/ucasemap.h +++ b/icu4c/source/common/unicode/ucasemap.h @@ -202,7 +202,7 @@ ucasemap_setBreakIterator(UCaseMap *csm, UBreakIterator *iterToAdopt, UErrorCode * The standard titlecase iterator for the root locale implements the * algorithm of Unicode TR 21. * - * This function uses only the setUText(), first(), next() and close() methods of the + * This function uses only the setText(), first() and next() methods of the * provided break iterator. * * The result may be longer or shorter than the original. diff --git a/icu4c/source/common/unicode/unistr.h b/icu4c/source/common/unicode/unistr.h index 445d57c911a..ede23973c92 100644 --- a/icu4c/source/common/unicode/unistr.h +++ b/icu4c/source/common/unicode/unistr.h @@ -2775,11 +2775,11 @@ public: * break iterator is opened. * Otherwise the provided iterator is set to the string's text. * @param locale The locale to consider. + * @param options Options bit set, usually 0. See U_TITLECASE_NO_LOWERCASE, + * U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED, + * U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES. * @param options Options bit set, see ucasemap_open(). * @return A reference to this. 
- * @see U_TITLECASE_NO_LOWERCASE - * @see U_TITLECASE_NO_BREAK_ADJUSTMENT - * @see ucasemap_open * @stable ICU 3.8 */ UnicodeString &toTitle(BreakIterator *titleIter, const Locale &locale, uint32_t options); diff --git a/icu4c/source/common/unistr_titlecase_brkiter.cpp b/icu4c/source/common/unistr_titlecase_brkiter.cpp index a0ff7719317..4969884b0dc 100644 --- a/icu4c/source/common/unistr_titlecase_brkiter.cpp +++ b/icu4c/source/common/unistr_titlecase_brkiter.cpp @@ -30,31 +30,26 @@ U_NAMESPACE_BEGIN UnicodeString & -UnicodeString::toTitle(BreakIterator *titleIter) { - return toTitle(titleIter, Locale::getDefault(), 0); +UnicodeString::toTitle(BreakIterator *iter) { + return toTitle(iter, Locale::getDefault(), 0); } UnicodeString & -UnicodeString::toTitle(BreakIterator *titleIter, const Locale &locale) { - return toTitle(titleIter, locale, 0); +UnicodeString::toTitle(BreakIterator *iter, const Locale &locale) { + return toTitle(iter, locale, 0); } UnicodeString & -UnicodeString::toTitle(BreakIterator *titleIter, const Locale &locale, uint32_t options) { - BreakIterator *bi=titleIter; - if(bi==NULL) { - UErrorCode errorCode=U_ZERO_ERROR; - bi=BreakIterator::createWordInstance(locale, errorCode); - if(U_FAILURE(errorCode)) { - setToBogus(); - return *this; +UnicodeString::toTitle(BreakIterator *iter, const Locale &locale, uint32_t options) { + LocalPointer ownedIter; + UErrorCode errorCode = U_ZERO_ERROR; + iter = ustrcase_getTitleBreakIterator(&locale, "", options, iter, ownedIter, errorCode); + if (iter == nullptr) { + setToBogus(); + return *this; } - } - caseMap(ustrcase_getCaseLocale(locale.getBaseName()), options, bi, ustrcase_internalToTitle); - if(titleIter==NULL) { - delete bi; - } - return *this; + caseMap(ustrcase_getCaseLocale(locale.getBaseName()), options, iter, ustrcase_internalToTitle); + return *this; } U_NAMESPACE_END diff --git a/icu4c/source/common/ustr_titlecase_brkiter.cpp b/icu4c/source/common/ustr_titlecase_brkiter.cpp index 
0b2ba02064b..d71cdb6035e 100644 --- a/icu4c/source/common/ustr_titlecase_brkiter.cpp +++ b/icu4c/source/common/ustr_titlecase_brkiter.cpp @@ -23,46 +23,153 @@ #include "unicode/brkiter.h" #include "unicode/casemap.h" +#include "unicode/chariter.h" #include "unicode/localpointer.h" #include "unicode/ubrk.h" #include "unicode/ucasemap.h" +#include "unicode/utext.h" #include "cmemory.h" +#include "uassert.h" #include "ucase.h" #include "ucasemap_imp.h" -U_NAMESPACE_USE +U_NAMESPACE_BEGIN -/* functions available in the common library (for unistr_case.cpp) */ +/** + * Whole-string BreakIterator. + * Titlecasing only calls setText(), first(), and next(). + * We implement the rest only to satisfy the abstract interface. + */ +class WholeStringBreakIterator : public BreakIterator { +public: + WholeStringBreakIterator() : BreakIterator(), length(0) {} + ~WholeStringBreakIterator() override; + UBool operator==(const BreakIterator&) const override; + BreakIterator *clone() const override; + static UClassID U_EXPORT2 getStaticClassID(); + UClassID getDynamicClassID() const override; + CharacterIterator &getText() const override; + UText *getUText(UText *fillIn, UErrorCode &errorCode) const override; + void setText(const UnicodeString &text) override; + void setText(UText *text, UErrorCode &errorCode) override; + void adoptText(CharacterIterator* it) override; + int32_t first() override; + int32_t last() override; + int32_t previous() override; + int32_t next() override; + int32_t current() const override; + int32_t following(int32_t offset) override; + int32_t preceding(int32_t offset) override; + UBool isBoundary(int32_t offset) override; + int32_t next(int32_t n) override; + BreakIterator *createBufferClone(void *stackBuffer, int32_t &BufferSize, + UErrorCode &errorCode) override; + BreakIterator &refreshInputText(UText *input, UErrorCode &errorCode) override; -/* public API functions */ +private: + int32_t length; +}; -U_CAPI int32_t U_EXPORT2 -u_strToTitle(UChar *dest, 
int32_t destCapacity, - const UChar *src, int32_t srcLength, - UBreakIterator *titleIter, - const char *locale, - UErrorCode *pErrorCode) { - LocalPointer ownedIter; - BreakIterator *iter; - if(titleIter!=NULL) { - iter=reinterpret_cast(titleIter); - } else { - iter=BreakIterator::createWordInstance(Locale(locale), *pErrorCode); - ownedIter.adoptInstead(iter); +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(WholeStringBreakIterator) + +WholeStringBreakIterator::~WholeStringBreakIterator() {} +UBool WholeStringBreakIterator::operator==(const BreakIterator&) const { return FALSE; } +BreakIterator *WholeStringBreakIterator::clone() const { return nullptr; } + +CharacterIterator &WholeStringBreakIterator::getText() const { + U_ASSERT(FALSE); // really should not be called + // Returns a null reference. + // Otherwise we would have to define a dummy CharacterIterator, + // and either have it as a field and const_cast it to a non-const reference, + // or have it via a pointer and return a reference to that. 
+ CharacterIterator *none = nullptr; + return *none; +} +UText *WholeStringBreakIterator::getUText(UText * /*fillIn*/, UErrorCode &errorCode) const { + if (U_SUCCESS(errorCode)) { + errorCode = U_UNSUPPORTED_ERROR; } - if(U_FAILURE(*pErrorCode)) { - return 0; + return nullptr; +} + +void WholeStringBreakIterator::setText(const UnicodeString &text) { + length = text.length(); +} +void WholeStringBreakIterator::setText(UText *text, UErrorCode &errorCode) { + if (U_SUCCESS(errorCode)) { + int64_t length64 = utext_nativeLength(text); + if (length64 <= INT32_MAX) { + length = (int32_t)length64; + } else { + errorCode = U_INDEX_OUTOFBOUNDS_ERROR; + } } - UnicodeString s(srcLength<0, src, srcLength); - iter->setText(s); - return ustrcase_mapWithOverlap( - ustrcase_getCaseLocale(locale), 0, iter, - dest, destCapacity, - src, srcLength, - ustrcase_internalToTitle, *pErrorCode); +} +void WholeStringBreakIterator::adoptText(CharacterIterator* it) { + U_ASSERT(FALSE); // should not be called + length = it->getLength(); + delete it; } -U_NAMESPACE_BEGIN +int32_t WholeStringBreakIterator::first() { return 0; } +int32_t WholeStringBreakIterator::last() { return length; } +int32_t WholeStringBreakIterator::previous() { return 0; } +int32_t WholeStringBreakIterator::next() { return length; } +int32_t WholeStringBreakIterator::current() const { return 0; } +int32_t WholeStringBreakIterator::following(int32_t /*offset*/) { return length; } +int32_t WholeStringBreakIterator::preceding(int32_t /*offset*/) { return 0; } +UBool WholeStringBreakIterator::isBoundary(int32_t /*offset*/) { return FALSE; } +int32_t WholeStringBreakIterator::next(int32_t /*n*/) { return length; } + +BreakIterator *WholeStringBreakIterator::createBufferClone( + void * /*stackBuffer*/, int32_t & /*BufferSize*/, UErrorCode &errorCode) { + if (U_SUCCESS(errorCode)) { + errorCode = U_UNSUPPORTED_ERROR; + } + return nullptr; +} +BreakIterator &WholeStringBreakIterator::refreshInputText( + UText * /*input*/, 
UErrorCode &errorCode) { + if (U_SUCCESS(errorCode)) { + errorCode = U_UNSUPPORTED_ERROR; + } + return *this; +} + +U_CFUNC +BreakIterator *ustrcase_getTitleBreakIterator( + const Locale *locale, const char *locID, uint32_t options, BreakIterator *iter, + LocalPointer &ownedIter, UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { return nullptr; } + options &= U_TITLECASE_ITERATOR_MASK; + if (options != 0 && iter != nullptr) { + errorCode = U_ILLEGAL_ARGUMENT_ERROR; + return nullptr; + } + if (iter == nullptr) { + switch (options) { + case 0: + iter = BreakIterator::createWordInstance( + locale != nullptr ? *locale : Locale(locID), errorCode); + break; + case U_TITLECASE_WHOLE_STRING: + iter = new WholeStringBreakIterator(); + if (iter == nullptr) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + } + break; + case U_TITLECASE_SENTENCES: + iter = BreakIterator::createSentenceInstance( + locale != nullptr ? *locale : Locale(locID), errorCode); + break; + default: + errorCode = U_ILLEGAL_ARGUMENT_ERROR; + break; + } + ownedIter.adoptInstead(iter); + } + return iter; +} int32_t CaseMap::toTitle( const char *locale, uint32_t options, BreakIterator *iter, @@ -70,11 +177,8 @@ int32_t CaseMap::toTitle( UChar *dest, int32_t destCapacity, Edits *edits, UErrorCode &errorCode) { LocalPointer ownedIter; + iter = ustrcase_getTitleBreakIterator(nullptr, locale, options, iter, ownedIter, errorCode); if(iter==NULL) { - iter=BreakIterator::createWordInstance(Locale(locale), errorCode); - ownedIter.adoptInstead(iter); - } - if(U_FAILURE(errorCode)) { return 0; } UnicodeString s(srcLength<0, src, srcLength); @@ -88,6 +192,30 @@ int32_t CaseMap::toTitle( U_NAMESPACE_END +U_NAMESPACE_USE + +U_CAPI int32_t U_EXPORT2 +u_strToTitle(UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + UBreakIterator *titleIter, + const char *locale, + UErrorCode *pErrorCode) { + LocalPointer ownedIter; + BreakIterator *iter = ustrcase_getTitleBreakIterator( + nullptr, locale, 0, 
reinterpret_cast(titleIter), + ownedIter, *pErrorCode); + if (iter == nullptr) { + return 0; + } + UnicodeString s(srcLength<0, src, srcLength); + iter->setText(s); + return ustrcase_mapWithOverlap( + ustrcase_getCaseLocale(locale), 0, iter, + dest, destCapacity, + src, srcLength, + ustrcase_internalToTitle, *pErrorCode); +} + U_CAPI int32_t U_EXPORT2 ucasemap_toTitle(UCaseMap *csm, UChar *dest, int32_t destCapacity, @@ -97,10 +225,13 @@ ucasemap_toTitle(UCaseMap *csm, return 0; } if (csm->iter == NULL) { - csm->iter = BreakIterator::createWordInstance(Locale(csm->locale), *pErrorCode); - } - if (U_FAILURE(*pErrorCode)) { - return 0; + LocalPointer ownedIter; + BreakIterator *iter = ustrcase_getTitleBreakIterator( + nullptr, csm->locale, csm->options, nullptr, ownedIter, *pErrorCode); + if (iter == nullptr) { + return 0; + } + csm->iter = ownedIter.orphan(); } UnicodeString s(srcLength<0, src, srcLength); csm->iter->setText(s); diff --git a/icu4c/source/common/ustrcase.cpp b/icu4c/source/common/ustrcase.cpp index 57f6c8b755c..6fffb90a389 100644 --- a/icu4c/source/common/ustrcase.cpp +++ b/icu4c/source/common/ustrcase.cpp @@ -237,7 +237,7 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it const UChar *src, int32_t srcLength, icu::Edits *edits, UErrorCode &errorCode) { - if(U_FAILURE(errorCode)) { + if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) { return 0; } @@ -264,45 +264,38 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it } /* - * Unicode 4 & 5 section 3.13 Default Case Operations: - * - * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex - * #29, "Text Boundaries." Between each pair of word boundaries, find the first - * cased character F. If F exists, map F to default_title(F); then map each - * subsequent character C to default_lower(C). 
- * - * In this implementation, segment [prev..index[ into 3 parts: - * a) uncased characters (copy as-is) [prev..titleStart[ - * b) first case letter (titlecase) [titleStart..titleLimit[ + * Segment [prev..index[ into 3 parts: + * a) skipped characters (copy as-is) [prev..titleStart[ + * b) first letter (titlecase) [titleStart..titleLimit[ * c) subsequent characters (lowercase) [titleLimit..index[ */ if(prev %s not illegal argument", + errorCode.errorName()); + } + errorCode.reset(); + CaseMap::toTitle("", U_TITLECASE_WHOLE_STRING|U_TITLECASE_SENTENCES, nullptr, + u"", 0, nullptr, 0, nullptr, errorCode); + if (errorCode.get() != U_ILLEGAL_ARGUMENT_ERROR) { + errln("CaseMap::toTitle(multiple iterator options) -> %s not illegal argument", + errorCode.errorName()); + } + errorCode.reset(); + LocalPointer iter( + BreakIterator::createCharacterInstance(Locale::getRoot(), errorCode)); + CaseMap::toTitle("", U_TITLECASE_WHOLE_STRING, iter.getAlias(), + u"", 0, nullptr, 0, nullptr, errorCode); + if (errorCode.get() != U_ILLEGAL_ARGUMENT_ERROR) { + errln("CaseMap::toTitle(iterator option + iterator) -> %s not illegal argument", + errorCode.errorName()); + } + errorCode.reset(); +} + void StringCaseTest::TestFullCaseFoldingIterator() { UnicodeString ffi=UNICODE_STRING_SIMPLE("ffi"); diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMapImpl.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMapImpl.java index f28e60ed5ea..b59b54fdc33 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMapImpl.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMapImpl.java @@ -3,11 +3,15 @@ package com.ibm.icu.impl; import java.io.IOException; +import java.text.CharacterIterator; +import java.util.Locale; import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UCharacterCategory; import com.ibm.icu.text.BreakIterator; import com.ibm.icu.text.Edits; import com.ibm.icu.util.ICUUncheckedIOException; +import com.ibm.icu.util.ULocale; public final class 
CaseMapImpl { /** @@ -134,11 +138,192 @@ public final class CaseMapImpl { protected int dir; // 0=initial state >0=forward <0=backward } + public static final int TITLECASE_WHOLE_STRING = 0x20; + public static final int TITLECASE_SENTENCES = 0x40; + + /** + * Bit mask for the titlecasing iterator options bit field. + * Currently only 3 out of 8 values are used: + * 0 (words), TITLECASE_WHOLE_STRING, TITLECASE_SENTENCES. + * See stringoptions.h. + * @internal + */ + private static final int TITLECASE_ITERATOR_MASK = 0xe0; + + public static final int TITLECASE_ADJUST_TO_CASED = 0x400; + + /** + * Bit mask for the titlecasing index adjustment options bit set. + * Currently two bits are defined: + * TITLECASE_NO_BREAK_ADJUSTMENT, TITLECASE_ADJUST_TO_CASED. + * See stringoptions.h. + * @internal + */ + private static final int TITLECASE_ADJUSTMENT_MASK = 0x600; + + public static int addTitleAdjustmentOption(int options, int newOption) { + int adjOptions = options & TITLECASE_ADJUSTMENT_MASK; + if (adjOptions !=0 && adjOptions != newOption) { + throw new IllegalArgumentException("multiple titlecasing index adjustment options"); + } + return options | newOption; + } + + private static final int LNS = + (1 << UCharacterCategory.UPPERCASE_LETTER) | + (1 << UCharacterCategory.LOWERCASE_LETTER) | + (1 << UCharacterCategory.TITLECASE_LETTER) | + // Not MODIFIER_LETTER: We count only cased modifier letters. + (1 << UCharacterCategory.OTHER_LETTER) | + + (1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER) | + (1 << UCharacterCategory.LETTER_NUMBER) | + (1 << UCharacterCategory.OTHER_NUMBER) | + + (1 << UCharacterCategory.MATH_SYMBOL) | + (1 << UCharacterCategory.CURRENCY_SYMBOL) | + (1 << UCharacterCategory.MODIFIER_SYMBOL) | + (1 << UCharacterCategory.OTHER_SYMBOL) | + + (1 << UCharacterCategory.PRIVATE_USE); + + private static boolean isLNS(int c) { + // Letter, number, symbol, + // or a private use code point because those are typically used as letters or numbers. 
+ // Consider modifier letters only if they are cased. + int gc = UCharacterProperty.INSTANCE.getType(c); + return ((1 << gc) & LNS) != 0 || + (gc == UCharacterCategory.MODIFIER_LETTER && + UCaseProps.INSTANCE.getType(c) != UCaseProps.NONE); + } + + public static int addTitleIteratorOption(int options, int newOption) { + int iterOptions = options & TITLECASE_ITERATOR_MASK; + if (iterOptions !=0 && iterOptions != newOption) { + throw new IllegalArgumentException("multiple titlecasing iterator options"); + } + return options | newOption; + } + + public static BreakIterator getTitleBreakIterator( + Locale locale, int options, BreakIterator iter) { + options &= TITLECASE_ITERATOR_MASK; + if (options != 0 && iter != null) { + throw new IllegalArgumentException( + "titlecasing iterator option together with an explicit iterator"); + } + if (iter == null) { + switch (options) { + case 0: + iter = BreakIterator.getWordInstance(locale); + break; + case TITLECASE_WHOLE_STRING: + iter = new WholeStringBreakIterator(); + break; + case TITLECASE_SENTENCES: + iter = BreakIterator.getSentenceInstance(locale); + break; + default: + throw new IllegalArgumentException("unknown titlecasing iterator option"); + } + } + return iter; + } + + public static BreakIterator getTitleBreakIterator( + ULocale locale, int options, BreakIterator iter) { + options &= TITLECASE_ITERATOR_MASK; + if (options != 0 && iter != null) { + throw new IllegalArgumentException( + "titlecasing iterator option together with an explicit iterator"); + } + if (iter == null) { + switch (options) { + case 0: + iter = BreakIterator.getWordInstance(locale); + break; + case TITLECASE_WHOLE_STRING: + iter = new WholeStringBreakIterator(); + break; + case TITLECASE_SENTENCES: + iter = BreakIterator.getSentenceInstance(locale); + break; + default: + throw new IllegalArgumentException("unknown titlecasing iterator option"); + } + } + return iter; + } + /** * Omit unchanged text when case-mapping with Edits. 
*/ public static final int OMIT_UNCHANGED_TEXT = 0x4000; + private static final class WholeStringBreakIterator extends BreakIterator { + private int length; + + private static void notImplemented() { + throw new UnsupportedOperationException("should not occur"); + } + + @Override + public int first() { + return 0; + } + + @Override + public int last() { + notImplemented(); + return 0; + } + + @Override + public int next(int n) { + notImplemented(); + return 0; + } + + @Override + public int next() { + return length; + } + + @Override + public int previous() { + notImplemented(); + return 0; + } + + @Override + public int following(int offset) { + notImplemented(); + return 0; + } + + @Override + public int current() { + notImplemented(); + return 0; + } + + @Override + public CharacterIterator getText() { + notImplemented(); + return null; + } + + @Override + public void setText(CharacterIterator newText) { + length = newText.getEndIndex(); + } + + @Override + public void setText(String newText) { + length = newText.length(); + } + } + private static int appendCodePoint(Appendable a, int c) throws IOException { if (c <= Character.MAX_VALUE) { a.append((char)c); @@ -266,32 +451,33 @@ public final class CaseMapImpl { } /* - * Unicode 4 & 5 section 3.13 Default Case Operations: - * - * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex - * #29, "Text Boundaries." Between each pair of word boundaries, find the first - * cased character F. If F exists, map F to default_title(F); then map each - * subsequent character C to default_lower(C). 
- * - * In this implementation, segment [prev..index[ into 3 parts: - * a) uncased characters (copy as-is) [prev..titleStart[ - * b) first case letter (titlecase) [titleStart..titleLimit[ + * Segment [prev..index[ into 3 parts: + * a) skipped characters (copy as-is) [prev..titleStart[ + * b) first letter (titlecase) [titleStart..titleLimit[ * c) subsequent characters (lowercase) [titleLimit..index[ */ if(prev=0 - && UCaseProps.NONE==UCaseProps.INSTANCE.getType(c)) {} + if ((options&UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT)==0) { + // Adjust the titlecasing index to the next cased character, + // or to the next letter/number/symbol/private use. + // Stop with titleStart=0) {} // If c<0 then we have only uncased characters in [prev..index[ // and stopped with titleStart==titleLimit==index. titleStart=iter.getCPStart(); - appendUnchanged(src, prev, titleStart-prev, dest, options, edits); + if (prev < titleStart) { + appendUnchanged(src, prev, titleStart-prev, dest, options, edits); + } } if(titleStartIt is an error to specify multiple titlecasing iterator options together, + * including both an option and an explicit BreakIterator. + * + * @return an options object with this option. + * @see #adjustToCased() + * @draft ICU 60 + * @provisional This API might change or be removed in a future release. + */ + public Title wholeString() { + return new Title(CaseMapImpl.addTitleIteratorOption( + internalOptions, CaseMapImpl.TITLECASE_WHOLE_STRING)); + } + + /** + * Returns an instance that behaves like this one but + * titlecases sentences rather than words. + * (Titlecases only the first character of each sentence, possibly adjusted.) + * + *

It is an error to specify multiple titlecasing iterator options together, + * including both an option and an explicit BreakIterator. + * + * @return an options object with this option. + * @see #adjustToCased() + * @draft ICU 60 + * @provisional This API might change or be removed in a future release. + */ + public Title sentences() { + return new Title(CaseMapImpl.addTitleIteratorOption( + internalOptions, CaseMapImpl.TITLECASE_SENTENCES)); + } + /** * {@inheritDoc} * @draft ICU 59 @@ -191,12 +227,14 @@ public abstract class CaseMap { * Returns an instance that behaves like this one but * does not lowercase non-initial parts of words when titlecasing. * - *

<p>By default, titlecasing will titlecase the first cased character - * of a word and lowercase all other characters. + *

<p>By default, titlecasing will titlecase the character at each + * (possibly adjusted) BreakIterator index and + * lowercase all other characters up to the next iterator index. * With this option, the other characters will not be modified. * * @return an options object with this option. * @see UCharacter#TITLECASE_NO_LOWERCASE + * @see #adjustToCased() * @draft ICU 59 * @provisional This API might change or be removed in a future release. */ @@ -204,22 +242,16 @@ public abstract class CaseMap { return new Title(internalOptions | UCharacter.TITLECASE_NO_LOWERCASE); } - // TODO: update references to the Unicode Standard for recent version /** * Returns an instance that behaves like this one but - * does not adjust the titlecasing indexes from BreakIterator::next() indexes; + * does not adjust the titlecasing BreakIterator indexes; * titlecases exactly the characters at breaks from the iterator. * *

<p>By default, titlecasing will take each break iterator index, - * adjust it by looking for the next cased character, and titlecase that one. - * Other characters are lowercased. - * - *

<p>This follows Unicode 4 & 5 section 3.13 Default Case Operations: + * adjust it to the next relevant character (see {@link #adjustToCased()}), + * and titlecase that one. * - * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex - * #29, "Text Boundaries." Between each pair of word boundaries, find the first - * cased character F. If F exists, map F to default_title(F); then map each - * subsequent character C to default_lower(C). + *

<p>Other characters are lowercased. * * @return an options object with this option. * @see UCharacter#TITLECASE_NO_BREAK_ADJUSTMENT @@ -227,7 +259,33 @@ public abstract class CaseMap { * @provisional This API might change or be removed in a future release. */ public Title noBreakAdjustment() { - return new Title(internalOptions | UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT); + return new Title(CaseMapImpl.addTitleAdjustmentOption( + internalOptions, UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT)); + } + + /** + * Returns an instance that behaves like this one but + * adjusts each titlecasing BreakIterator index to the next cased character. + * (See the Unicode Standard, chapter 3, Default Case Conversion, R3 toTitlecase(X).) + * + *

<p>This used to be the default index adjustment in ICU. + * Since ICU 60, the default index adjustment is to the next character that is + * a letter, number, symbol, or private use code point. + * (Uncased modifier letters are skipped.) + * The difference in behavior is small for word titlecasing, + * but the new adjustment is much better for whole-string and sentence titlecasing: + * It yields "49ers" and "«丰(abc)»" instead of "49Ers" and "«丰(Abc)»". + * + *

It is an error to specify multiple titlecasing adjustment options together. + * + * @return an options object with this option. + * @see #noBreakAdjustment() + * @draft ICU 60 + * @provisional This API might change or be removed in a future release. + */ + public Title adjustToCased() { + return new Title(CaseMapImpl.addTitleAdjustmentOption( + internalOptions, CaseMapImpl.TITLECASE_ADJUST_TO_CASED)); } /** @@ -259,9 +317,10 @@ public abstract class CaseMap { */ public A apply( Locale locale, BreakIterator iter, CharSequence src, A dest, Edits edits) { - if (iter == null) { - iter = BreakIterator.getWordInstance(locale); + if (iter == null && locale == null) { + locale = Locale.getDefault(); } + iter = CaseMapImpl.getTitleBreakIterator(locale, internalOptions, iter); iter.setText(src.toString()); return CaseMapImpl.toTitle( getCaseLocale(locale), internalOptions, iter, src, dest, edits); diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java index 6f8a67983b8..8075fef6c88 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java @@ -343,6 +343,63 @@ public final class UCharacterCaseTest extends TestFmwk } } + // Not a @Test. See ICU4C intltest strcase.cpp TestCasingImpl(). + void TestCasingImpl(String input, String output, CaseMap.Title toTitle, Locale locale) { + String result = toTitle.apply(locale, null, input, new StringBuilder(), null).toString(); + assertEquals("toTitle(" + input + ')', output, result); + } + + @Test + public void TestTitleOptions() { + Locale root = Locale.ROOT; + // New options in ICU 60. + TestCasingImpl("ʻcAt! ʻeTc.", "ʻCat! ʻetc.", + CaseMap.toTitle().wholeString(), root); + TestCasingImpl("a ʻCaT. A ʻdOg! ʻeTc.", "A ʻCaT. A ʻdOg! 
ʻETc.", + CaseMap.toTitle().sentences().noLowercase(), root); + TestCasingImpl("49eRs", "49ers", + CaseMap.toTitle().wholeString(), root); + TestCasingImpl("«丰(aBc)»", "«丰(abc)»", + CaseMap.toTitle().wholeString(), root); + TestCasingImpl("49eRs", "49Ers", + CaseMap.toTitle().wholeString().adjustToCased(), root); + TestCasingImpl("«丰(aBc)»", "«丰(Abc)»", + CaseMap.toTitle().wholeString().adjustToCased(), root); + TestCasingImpl(" john. Smith", " John. Smith", + CaseMap.toTitle().wholeString().noLowercase(), root); + TestCasingImpl(" john. Smith", " john. smith", + CaseMap.toTitle().wholeString().noBreakAdjustment(), root); + TestCasingImpl("«ijs»", "«IJs»", + CaseMap.toTitle().wholeString(), new Locale("nl", "BE")); + TestCasingImpl("«ijs»", "«İjs»", + CaseMap.toTitle().wholeString(), new Locale("tr", "DE")); + + // Test conflicting settings. + // If & when we add more options, then the ORed combinations may become + // indistinguishable from valid values. + try { + CaseMap.toTitle().noBreakAdjustment().adjustToCased(). + apply(root, null, "", new StringBuilder(), null); + fail("CaseMap.toTitle(multiple adjustment options) " + + "did not throw an IllegalArgumentException"); + } catch(IllegalArgumentException expected) { + } + try { + CaseMap.toTitle().wholeString().sentences(). 
+ apply(root, null, "", new StringBuilder(), null); + fail("CaseMap.toTitle(multiple iterator options) " + + "did not throw an IllegalArgumentException"); + } catch(IllegalArgumentException expected) { + } + BreakIterator iter = BreakIterator.getCharacterInstance(root); + try { + CaseMap.toTitle().wholeString().apply(root, iter, "", new StringBuilder(), null); + fail("CaseMap.toTitle(iterator option + iterator) " + + "did not throw an IllegalArgumentException"); + } catch(IllegalArgumentException expected) { + } + } + @Test public void TestDutchTitle() { ULocale LOC_DUTCH = new ULocale("nl"); diff --git a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java index 02953cbe14d..a53ea50e58b 100644 --- a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java +++ b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java @@ -133,7 +133,7 @@ public class TransliteratorTest extends TestFmwk { Transliterator hanLatin = Transliterator.getInstance("Han-Latin"); assertTransform("Transform", "z\u00E0o Unicode", hanLatin, "\u9020Unicode"); assertTransform("Transform", "z\u00E0i chu\u00E0ng z\u00E0o Unicode zh\u012B qi\u00E1n", hanLatin, "\u5728\u5275\u9020Unicode\u4E4B\u524D"); - } + } @Test public void TestRegistry() { @@ -510,15 +510,19 @@ public class TransliteratorTest extends TestFmwk { Transliterator hex = Transliterator.getInstance("Any-Hex"); hex.setFilter(new UnicodeFilter() { + @Override public boolean contains(int c) { return c != 'c'; } + @Override public String toPattern(boolean escapeUnprintable) { return ""; } + @Override public boolean matchesIndexValue(int v) { return false; } + @Override public void addMatchSetTo(UnicodeSet toUnionTo) {} }); String s = "abcde"; @@ -1561,6 +1565,7 @@ public class TransliteratorTest extends TestFmwk { public NameableNullTrans(String id) { super(id, null); } + 
@Override protected void handleTransliterate(Replaceable text, Position offsets, boolean incremental) { offsets.start = offsets.limit; @@ -1570,6 +1575,7 @@ public class TransliteratorTest extends TestFmwk { public TestFact(String theID) { id = theID; } + @Override public Transliterator getInstance(String ignoredID) { return new NameableNullTrans(id); } @@ -1873,8 +1879,8 @@ public class TransliteratorTest extends TestFmwk { t.setFilter(new UnicodeSet("[:Ll:]")); expect(t, "aAaA", "bAbA"); } finally { - Transliterator.unregister("a_to_A"); - Transliterator.unregister("A_to_b"); + Transliterator.unregister("a_to_A"); + Transliterator.unregister("A_to_b"); } } @@ -2731,6 +2737,7 @@ public class TransliteratorTest extends TestFmwk { //System.out.println("Registering: " + ID + ", " + t.toRules(true)); Transliterator.registerFactory(ID, singleton); } + @Override public Transliterator getInstance(String ID) { return (Transliterator) m.get(ID); } @@ -2751,8 +2758,17 @@ public class TransliteratorTest extends TestFmwk { String casefold = UCharacter.foldCase(s, true); assertEquals("Casefold", casefold, toCasefold.transform(s)); - String title = UCharacter.toTitleCase(ULocale.ROOT, s, null); - assertEquals("Title", title, toTitle.transform(s)); + if (i != 0x0345) { + // ICU 60 changes the default titlecasing index adjustment. + // For word breaks it is mostly the same as before, + // but it is different for the iota subscript (the only cased combining mark). + // This should be ok because the iota subscript is not supposed to appear + // at the start of a word. + // The title Transliterator is far below feature parity with the + // UCharacter and CaseMap titlecasing functions. 
+ String title = UCharacter.toTitleCase(ULocale.ROOT, s, null); + assertEquals("Title", title, toTitle.transform(s)); + } String upper = UCharacter.toUpperCase(ULocale.ROOT, s); assertEquals("Upper", upper, toUpper.transform(s)); @@ -3008,6 +3024,7 @@ public class TransliteratorTest extends TestFmwk { Transliterator.registerFactory(ID, singleton); } + @Override public Transliterator getInstance(String ID) { return (Transliterator) m.get(new CaseInsensitiveString(ID)); } @@ -3040,7 +3057,7 @@ public class TransliteratorTest extends TestFmwk { */ @Test public void TestAny() { - UnicodeSet alphabetic = (UnicodeSet) new UnicodeSet("[:alphabetic:]").freeze(); + UnicodeSet alphabetic = new UnicodeSet("[:alphabetic:]").freeze(); StringBuffer testString = new StringBuffer(); for (int i = 0; i < UScript.CODE_LIMIT; ++i) { UnicodeSet sample = new UnicodeSet().applyPropertyAlias("script", UScript.getShortName(i)).retainAll(alphabetic); @@ -3142,7 +3159,7 @@ public class TransliteratorTest extends TestFmwk { // add all the trail characters if (!nonStarters.containsSome(trailString)) { - continue; + continue; } UnicodeSet trailSet = leadToTrail.get(first); if (trailSet == null) { @@ -3190,7 +3207,7 @@ public class TransliteratorTest extends TestFmwk { // disorderedMarks.add(s); // disorderedMarks.add(nfc.normalize(s)); // addDerivedStrings(nfc, disorderedMarks, s); - // } + // } // s = nfd.getDecomposition(i); // if (s != null) { // disorderedMarks.add(s); @@ -3292,6 +3309,10 @@ public class TransliteratorTest extends TestFmwk { addSourceTarget(s, empiricalSource, t, empiricalTarget); } } + if (rule.contains("title")) { + // See the comment in TestCasing() about the iota subscript. 
+ empiricalSource.remove(0x345); + } assertEquals("getSource(" + ruleDisplay + ")", empiricalSource, actualSource, SetAssert.MISSING_OK); assertEquals("getTarget(" + ruleDisplay + ")", empiricalTarget, actualTarget, SetAssert.MISSING_OK); } @@ -3336,8 +3357,8 @@ public class TransliteratorTest extends TestFmwk { String direction = t == t0 ? "FORWARD\t" : "REVERSE\t"; targetIndex++; UnicodeSet expectedTarget = testPair.length <= targetIndex ? expectedSource - : testPair[targetIndex] == null ? expectedSource - : testPair[targetIndex].length() == 0 ? expectedSource + : testPair[targetIndex] == null ? expectedSource + : testPair[targetIndex].length() == 0 ? expectedSource : new UnicodeSet(testPair[targetIndex]); ok = assertEquals(direction + "getSource\t\"" + test + '"', expectedSource, source); if (!ok) { // for debugging @@ -3410,7 +3431,7 @@ public class TransliteratorTest extends TestFmwk { }; for (String[] row : startTests) { int actual = findSharedStartLength(row[1], row[2]); - assertEquals("findSharedStartLength(" + row[1] + "," + row[2] + ")", + assertEquals("findSharedStartLength(" + row[1] + "," + row[2] + ")", Integer.parseInt(row[0]), actual); } @@ -3423,8 +3444,8 @@ public class TransliteratorTest extends TestFmwk { }; for (String[] row : endTests) { int actual = findSharedEndLength(row[1], row[2]); - assertEquals("findSharedEndLength(" + row[1] + "," + row[2] + ")", - Integer.parseInt(row[0]), + assertEquals("findSharedEndLength(" + row[1] + "," + row[2] + ")", + Integer.parseInt(row[0]), actual); } } @@ -3916,7 +3937,7 @@ the ::BEGIN/::END stuff) @Test public void TestThai() { Transliterator tr = Transliterator.getInstance("Any-Latin", Transliterator.FORWARD); - String thaiText = + String thaiText = "\u0e42\u0e14\u0e22\u0e1e\u0e37\u0e49\u0e19\u0e10\u0e32\u0e19\u0e41\u0e25\u0e49\u0e27, \u0e04\u0e2d" + "\u0e21\u0e1e\u0e34\u0e27\u0e40\u0e15\u0e2d\u0e23\u0e4c\u0e08\u0e30\u0e40\u0e01\u0e35\u0e48\u0e22" + 
"\u0e27\u0e02\u0e49\u0e2d\u0e07\u0e01\u0e31\u0e1a\u0e40\u0e23\u0e37\u0e48\u0e2d\u0e07\u0e02\u0e2d" + @@ -3948,7 +3969,7 @@ the ::BEGIN/::END stuff) "\u0e17\u0e04\u0e19\u0e34\u0e04\u0e17\u0e35\u0e48\u0e43\u0e0a\u0e49\u0e01\u0e31\u0e19\u0e2d\u0e22" + "\u0e39\u0e48\u0e17\u0e31\u0e48\u0e27\u0e44\u0e1b."; - String latinText = + String latinText = "doy ph\u1ee5\u0304\u0302n \u1e6d\u0304h\u0101n l\u00e6\u0302w, khxmphiwtexr\u0312 ca ke\u012b\u0300" + "ywk\u0304\u0125xng k\u1ea1b re\u1ee5\u0304\u0300xng k\u0304hxng t\u1ea1wlek\u0304h. khxmphiwtexr" + "\u0312 c\u1ea1d k\u0115b t\u1ea1w x\u1ea1ks\u0304\u02b9r l\u00e6a x\u1ea1kk\u0304h ra x\u1ee5\u0304" + @@ -4041,6 +4062,7 @@ the ::BEGIN/::END stuff) this.expectedData = expectedData; } + @Override public void run() { errorMsg = null; StringBuffer inBuf = new StringBuffer(testData); -- 2.40.0