granicus.if.org Git - icu/commitdiff
ICU-9198 add titlecasing options: wholeString, sentences, adjustToCased
author: Markus Scherer <markus.icu@gmail.com>
Fri, 9 Jun 2017 23:04:03 +0000 (23:04 +0000)
committer: Markus Scherer <markus.icu@gmail.com>
Fri, 9 Jun 2017 23:04:03 +0000 (23:04 +0000)
X-SVN-Rev: 40164

19 files changed:
icu4c/source/common/ucase.h
icu4c/source/common/ucasemap.cpp
icu4c/source/common/ucasemap_imp.h
icu4c/source/common/ucasemap_titlecase_brkiter.cpp
icu4c/source/common/unicode/casemap.h
icu4c/source/common/unicode/stringoptions.h
icu4c/source/common/unicode/ucasemap.h
icu4c/source/common/unicode/unistr.h
icu4c/source/common/unistr_titlecase_brkiter.cpp
icu4c/source/common/ustr_titlecase_brkiter.cpp
icu4c/source/common/ustrcase.cpp
icu4c/source/test/intltest/strcase.cpp
icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMapImpl.java
icu4j/main/classes/core/src/com/ibm/icu/impl/LocaleDisplayNamesImpl.java
icu4j/main/classes/core/src/com/ibm/icu/impl/UCaseProps.java
icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java
icu4j/main/classes/core/src/com/ibm/icu/text/CaseMap.java
icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java
icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java

index 0240641132d643161a8fd1131119677887dc8efd..9d6365eadfcca2564d8549f2a461c6aae866e177 100644 (file)
@@ -69,10 +69,16 @@ enum {
 /**
  * Bit mask for getting just the options from a string compare options word
  * that are relevant for case folding (of a single string or code point).
+ *
+ * Currently only bit 0 for U_FOLD_CASE_EXCLUDE_SPECIAL_I.
+ * It is conceivable that at some point we might use one more bit for using uppercase sharp s.
+ * It is conceivable that at some point we might want the option to use only simple case foldings
+ * when operating on strings.
+ *
  * See stringoptions.h.
  * @internal
  */
-#define _FOLD_CASE_OPTIONS_MASK 0xff
+#define _FOLD_CASE_OPTIONS_MASK 7
 
 /* single-code point functions */
 
index c21c4453b7641fa8ad4c6a1094316061cc0266b8..1f83c0d6a06bcca3482283b692d4edf28b57a9db 100644 (file)
@@ -381,7 +381,7 @@ ucasemap_internalUTF8ToTitle(
         const uint8_t *src, int32_t srcLength,
         icu::Edits *edits,
         UErrorCode &errorCode) {
-    if(U_FAILURE(errorCode)) {
+    if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
         return 0;
     }
 
@@ -408,45 +408,38 @@ ucasemap_internalUTF8ToTitle(
         }
 
         /*
-         * Unicode 4 & 5 section 3.13 Default Case Operations:
-         *
-         * R3  toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
-         * #29, "Text Boundaries." Between each pair of word boundaries, find the first
-         * cased character F. If F exists, map F to default_title(F); then map each
-         * subsequent character C to default_lower(C).
-         *
-         * In this implementation, segment [prev..index[ into 3 parts:
-         * a) uncased characters (copy as-is) [prev..titleStart[
-         * b) first case letter (titlecase)         [titleStart..titleLimit[
+         * Segment [prev..index[ into 3 parts:
+         * a) skipped characters (copy as-is) [prev..titleStart[
+         * b) first letter (titlecase)              [titleStart..titleLimit[
          * c) subsequent characters (lowercase)                 [titleLimit..index[
          */
         if(prev<index) {
-            /* find and copy uncased characters [prev..titleStart[ */
+            /* find and copy skipped characters [prev..titleStart[ */
             int32_t titleStart=prev;
             int32_t titleLimit=prev;
             UChar32 c;
             U8_NEXT(src, titleLimit, index, c);
-            if((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(c)) {
-                /* Adjust the titlecasing index (titleStart) to the next cased character. */
-                for(;;) {
+            if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
+                // Adjust the titlecasing index to the next cased character,
+                // or to the next letter/number/symbol/private use.
+                // Stop with titleStart<titleLimit<=index
+                // if there is a character to be titlecased,
+                // or else stop with titleStart==titleLimit==index.
+                UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
+                while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
                     titleStart=titleLimit;
                     if(titleLimit==index) {
-                        /*
-                         * only uncased characters in [prev..index[
-                         * stop with titleStart==titleLimit==index
-                         */
                         break;
                     }
                     U8_NEXT(src, titleLimit, index, c);
-                    if(UCASE_NONE!=ucase_getType(c)) {
-                        break; /* cased letter at [titleStart..titleLimit[ */
-                    }
                 }
-                destIndex=appendUnchanged(dest, destIndex, destCapacity,
-                                          src+prev, titleStart-prev, options, edits);
-                if(destIndex<0) {
-                    errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
-                    return 0;
+                if (prev < titleStart) {
+                    destIndex=appendUnchanged(dest, destIndex, destCapacity,
+                                              src+prev, titleStart-prev, options, edits);
+                    if(destIndex<0) {
+                        errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+                        return 0;
+                    }
                 }
             }
 
index 5a670964f634f8ea17d5719a1dbe63ed5c7e6d4c..345a734658b26afd72248452f2585a2a0c4ec967 100644 (file)
@@ -9,8 +9,27 @@
 
 #include "unicode/utypes.h"
 #include "unicode/ucasemap.h"
+#include "unicode/uchar.h"
 #include "ucase.h"
 
+/**
+ * Bit mask for the titlecasing iterator options bit field.
+ * Currently only 3 out of 8 values are used:
+ * 0 (words), U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
+ * See stringoptions.h.
+ * @internal
+ */
+#define U_TITLECASE_ITERATOR_MASK 0xe0
+
+/**
+ * Bit mask for the titlecasing index adjustment options bit set.
+ * Currently two bits are defined:
+ * U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED.
+ * See stringoptions.h.
+ * @internal
+ */
+#define U_TITLECASE_ADJUSTMENT_MASK 0x600
+
 /**
  * Internal API, used by u_strcasecmp() etc.
  * Compare strings case-insensitively,
@@ -23,7 +42,7 @@ u_strcmpFold(const UChar *s1, int32_t length1,
              UErrorCode *pErrorCode);
 
 /**
- * Interanl API, used for detecting length of
+ * Internal API, used for detecting length of
  * shared prefix case-insensitively.
  * @param s1            input string 1
  * @param length1       length of string 1, or -1 (NULL terminated)
@@ -52,6 +71,40 @@ uprv_haveProperties(UErrorCode *pErrorCode);
 
 #ifdef __cplusplus
 
+U_NAMESPACE_BEGIN
+
+/** Returns TRUE if the options are valid. Otherwise FALSE, and sets an error. */
+inline UBool ustrcase_checkTitleAdjustmentOptions(uint32_t options, UErrorCode &errorCode) {
+    if (U_FAILURE(errorCode)) { return FALSE; }
+    if ((options & U_TITLECASE_ADJUSTMENT_MASK) == U_TITLECASE_ADJUSTMENT_MASK) {
+        // Both options together.
+        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        return FALSE;
+    }
+    return TRUE;
+}
+
+inline UBool ustrcase_isLNS(UChar32 c) {
+    // Letter, number, symbol,
+    // or a private use code point because those are typically used as letters or numbers.
+    // Consider modifier letters only if they are cased.
+    const uint32_t LNS = (U_GC_L_MASK|U_GC_N_MASK|U_GC_S_MASK|U_GC_CO_MASK) & ~U_GC_LM_MASK;
+    int gc = u_charType(c);
+    return (U_MASK(gc) & LNS) != 0 || (gc == U_MODIFIER_LETTER && ucase_getType(c) != UCASE_NONE);
+}
+
+#if !UCONFIG_NO_BREAK_ITERATION
+
+/** Returns nullptr if error. Pass in either locale or locID; a non-null locale takes precedence over locID. */
+U_CFUNC
+BreakIterator *ustrcase_getTitleBreakIterator(
+        const Locale *locale, const char *locID, uint32_t options, BreakIterator *iter,
+        LocalPointer<BreakIterator> &ownedIter, UErrorCode &errorCode);
+
+#endif
+
+U_NAMESPACE_END
+
 #include "unicode/unistr.h"  // for UStringCaseMapper
 
 /*
index a253850fa290cf8f195e52449bd0ae1fcb4d1bb2..2e09a5548a1c579654fbd92048ddec7c94a0a425 100644 (file)
@@ -42,11 +42,8 @@ int32_t CaseMap::utf8ToTitle(
     UText utext=UTEXT_INITIALIZER;
     utext_openUTF8(&utext, src, srcLength, &errorCode);
     LocalPointer<BreakIterator> ownedIter;
+    iter = ustrcase_getTitleBreakIterator(nullptr, locale, options, iter, ownedIter, errorCode);
     if(iter==NULL) {
-        iter=BreakIterator::createWordInstance(Locale(locale), errorCode);
-        ownedIter.adoptInstead(iter);
-    }
-    if(U_FAILURE(errorCode)) {
         utext_close(&utext);
         return 0;
     }
@@ -88,12 +85,19 @@ ucasemap_utf8ToTitle(UCaseMap *csm,
     }
     UText utext=UTEXT_INITIALIZER;
     utext_openUTF8(&utext, (const char *)src, srcLength, pErrorCode);
-    if(csm->iter==NULL) {
-        csm->iter=BreakIterator::createWordInstance(Locale(csm->locale), *pErrorCode);
-    }
     if (U_FAILURE(*pErrorCode)) {
         return 0;
     }
+    if(csm->iter==NULL) {
+        LocalPointer<BreakIterator> ownedIter;
+        BreakIterator *iter = ustrcase_getTitleBreakIterator(
+            nullptr, csm->locale, csm->options, nullptr, ownedIter, *pErrorCode);
+        if (iter == nullptr) {
+            utext_close(&utext);
+            return 0;
+        }
+        csm->iter = ownedIter.orphan();
+    }
     csm->iter->setText(&utext, *pErrorCode);
     int32_t length=ucasemap_mapUTF8(
             csm->caseLocale, csm->options, csm->iter,
index 1b8af69a26f6f2470044adc325908e331c479f5a..581f1ab532ae6a628787f856cfa80b2337640572 100644 (file)
@@ -113,7 +113,9 @@ public:
      *
      * @param locale    The locale ID. ("" = root locale, NULL = default locale.)
      * @param options   Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT,
-     *                  U_TITLECASE_NO_LOWERCASE, U_TITLECASE_NO_BREAK_ADJUSTMENT.
+     *                  U_TITLECASE_NO_LOWERCASE,
+     *                  U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED,
+     *                  U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
      * @param iter      A break iterator to find the first characters of words that are to be titlecased.
      *                  It is set to the source string (setText())
      *                  and used one or more times for iteration (first() and next()).
@@ -272,9 +274,11 @@ public:
      *
      * @param locale    The locale ID. ("" = root locale, NULL = default locale.)
      * @param options   Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT,
-     *                  U_TITLECASE_NO_LOWERCASE, U_TITLECASE_NO_BREAK_ADJUSTMENT.
+     *                  U_TITLECASE_NO_LOWERCASE,
+     *                  U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED,
+     *                  U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
      * @param iter      A break iterator to find the first characters of words that are to be titlecased.
-     *                  It is set to the source string (setText())
+     *                  It is set to the source string (setUText())
      *                  and used one or more times for iteration (first() and next()).
      *                  If NULL, then a word break iterator for the locale is used
      *                  (or something equivalent).
index 975e193609de791c4f0df053ca3c53f68228a73a..270b9de691c9546af830e4806f0bbb0f0c14779b 100644 (file)
  */
 #define U_FOLD_CASE_EXCLUDE_SPECIAL_I 1
 
+#ifndef U_HIDE_DRAFT_API
+
+/**
+ * Titlecase the string as a whole rather than each word.
+ * (Titlecase only the character at index 0, possibly adjusted.)
+ * Option bits value for titlecasing APIs that take an options bit set.
+ *
+ * It is an error to specify multiple titlecasing iterator options together,
+ * including both an options bit and an explicit BreakIterator.
+ *
+ * @see U_TITLECASE_ADJUST_TO_CASED
+ * @draft ICU 60
+ */
+#define U_TITLECASE_WHOLE_STRING 0x20
+
+/**
+ * Titlecase sentences rather than words.
+ * (Titlecase only the first character of each sentence, possibly adjusted.)
+ * Option bits value for titlecasing APIs that take an options bit set.
+ *
+ * It is an error to specify multiple titlecasing iterator options together,
+ * including both an options bit and an explicit BreakIterator.
+ *
+ * @see U_TITLECASE_ADJUST_TO_CASED
+ * @draft ICU 60
+ */
+#define U_TITLECASE_SENTENCES 0x40
+
+#endif  // U_HIDE_DRAFT_API
+
 /**
  * Do not lowercase non-initial parts of words when titlecasing.
  * Option bit for titlecasing APIs that take an options bit set.
  *
- * By default, titlecasing will titlecase the first cased character
- * of a word and lowercase all other characters.
+ * By default, titlecasing will titlecase the character at each
+ * (possibly adjusted) BreakIterator index and
+ * lowercase all other characters up to the next iterator index.
  * With this option, the other characters will not be modified.
  *
+ * @see U_TITLECASE_ADJUST_TO_CASED
+ * @see UnicodeString::toTitle
+ * @see CaseMap::toTitle
  * @see ucasemap_setOptions
  * @see ucasemap_toTitle
  * @see ucasemap_utf8ToTitle
- * @see UnicodeString::toTitle
  * @stable ICU 3.8
  */
 #define U_TITLECASE_NO_LOWERCASE 0x100
 
 /**
- * Do not adjust the titlecasing indexes from BreakIterator::next() indexes;
+ * Do not adjust the titlecasing BreakIterator indexes;
  * titlecase exactly the characters at breaks from the iterator.
  * Option bit for titlecasing APIs that take an options bit set.
  *
  * By default, titlecasing will take each break iterator index,
- * adjust it by looking for the next cased character, and titlecase that one.
- * Other characters are lowercased.
+ * adjust it to the next relevant character (see U_TITLECASE_ADJUST_TO_CASED),
+ * and titlecase that one.
  *
- * This follows Unicode 4 & 5 section 3.13 Default Case Operations:
+ * Other characters are lowercased.
  *
- * R3  toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
- * #29, "Text Boundaries." Between each pair of word boundaries, find the first
- * cased character F. If F exists, map F to default_title(F); then map each
- * subsequent character C to default_lower(C).
+ * It is an error to specify multiple titlecasing adjustment options together.
  *
+ * @see U_TITLECASE_ADJUST_TO_CASED
+ * @see U_TITLECASE_NO_LOWERCASE
+ * @see UnicodeString::toTitle
+ * @see CaseMap::toTitle
  * @see ucasemap_setOptions
  * @see ucasemap_toTitle
  * @see ucasemap_utf8ToTitle
- * @see UnicodeString::toTitle
- * @see U_TITLECASE_NO_LOWERCASE
  * @stable ICU 3.8
  */
 #define U_TITLECASE_NO_BREAK_ADJUSTMENT 0x200
 
 #ifndef U_HIDE_DRAFT_API
 
+/**
+ * Adjust each titlecasing BreakIterator index to the next cased character.
+ * (See the Unicode Standard, chapter 3, Default Case Conversion, R3 toTitlecase(X).)
+ * Option bit for titlecasing APIs that take an options bit set.
+ *
+ * This used to be the default index adjustment in ICU.
+ * Since ICU 60, the default index adjustment is to the next character that is
+ * a letter, number, symbol, or private use code point.
+ * (Uncased modifier letters are skipped.)
+ * The difference in behavior is small for word titlecasing,
+ * but the new adjustment is much better for whole-string and sentence titlecasing:
+ * It yields "49ers" and "«丰(abc)»" instead of "49Ers" and "«丰(Abc)»".
+ *
+ * It is an error to specify multiple titlecasing adjustment options together.
+ *
+ * @see U_TITLECASE_NO_BREAK_ADJUSTMENT
+ * @draft ICU 60
+ */
+#define U_TITLECASE_ADJUST_TO_CASED 0x400
+
 /**
  * Omit unchanged text when recording how source substrings
  * relate to changed and unchanged result substrings.
 //
 // Internal: (may change or be removed)
 // ucase.h #define _STRCASECMP_OPTIONS_MASK 0xffff
-// ucase.h #define _FOLD_CASE_OPTIONS_MASK 0xff
+// ucase.h #define _FOLD_CASE_OPTIONS_MASK 7
+// ucasemap_imp.h #define U_TITLECASE_ITERATOR_MASK 0xe0
+// ucasemap_imp.h #define U_TITLECASE_ADJUSTMENT_MASK 0x600
 // ustr_imp.h #define _STRNCMP_STYLE 0x1000
 // unormcmp.cpp #define _COMPARE_EQUIV 0x80000
 
index 7c69bdc20764e0c8da99780565f869f2c9bcde23..6b253e3d638475e535179283467914f180a6a55d 100644 (file)
@@ -202,7 +202,7 @@ ucasemap_setBreakIterator(UCaseMap *csm, UBreakIterator *iterToAdopt, UErrorCode
  * The standard titlecase iterator for the root locale implements the
  * algorithm of Unicode TR 21.
  *
- * This function uses only the setUText(), first(), next() and close() methods of the
+ * This function uses only the setText(), first() and next() methods of the
  * provided break iterator.
  *
  * The result may be longer or shorter than the original.
index 445d57c911a72f967b9267f1aa9f1fd93925c05a..ede23973c92d67633c31225dfe6b6cfa37b979f7 100644 (file)
@@ -2775,11 +2775,10 @@ public:
    *                  break iterator is opened.
    *                  Otherwise the provided iterator is set to the string's text.
    * @param locale    The locale to consider.
+   * @param options   Options bit set, usually 0. See U_TITLECASE_NO_LOWERCASE,
+   *                  U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED,
+   *                  U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
-   * @param options Options bit set, see ucasemap_open().
    * @return A reference to this.
-   * @see U_TITLECASE_NO_LOWERCASE
-   * @see U_TITLECASE_NO_BREAK_ADJUSTMENT
-   * @see ucasemap_open
    * @stable ICU 3.8
    */
   UnicodeString &toTitle(BreakIterator *titleIter, const Locale &locale, uint32_t options);
index a0ff7719317d892ca9c6cb9d000c0860526f6c29..4969884b0dc9b95c98d30f7e760298833944b67c 100644 (file)
 U_NAMESPACE_BEGIN
 
 UnicodeString &
-UnicodeString::toTitle(BreakIterator *titleIter) {
-  return toTitle(titleIter, Locale::getDefault(), 0);
+UnicodeString::toTitle(BreakIterator *iter) {
+    return toTitle(iter, Locale::getDefault(), 0);
 }
 
 UnicodeString &
-UnicodeString::toTitle(BreakIterator *titleIter, const Locale &locale) {
-  return toTitle(titleIter, locale, 0);
+UnicodeString::toTitle(BreakIterator *iter, const Locale &locale) {
+    return toTitle(iter, locale, 0);
 }
 
 UnicodeString &
-UnicodeString::toTitle(BreakIterator *titleIter, const Locale &locale, uint32_t options) {
-  BreakIterator *bi=titleIter;
-  if(bi==NULL) {
-    UErrorCode errorCode=U_ZERO_ERROR;
-    bi=BreakIterator::createWordInstance(locale, errorCode);
-    if(U_FAILURE(errorCode)) {
-      setToBogus();
-      return *this;
+UnicodeString::toTitle(BreakIterator *iter, const Locale &locale, uint32_t options) {
+    LocalPointer<BreakIterator> ownedIter;
+    UErrorCode errorCode = U_ZERO_ERROR;
+    iter = ustrcase_getTitleBreakIterator(&locale, "", options, iter, ownedIter, errorCode);
+    if (iter == nullptr) {
+        setToBogus();
+        return *this;
     }
-  }
-  caseMap(ustrcase_getCaseLocale(locale.getBaseName()), options, bi, ustrcase_internalToTitle);
-  if(titleIter==NULL) {
-    delete bi;
-  }
-  return *this;
+    caseMap(ustrcase_getCaseLocale(locale.getBaseName()), options, iter, ustrcase_internalToTitle);
+    return *this;
 }
 
 U_NAMESPACE_END
index 0b2ba02064b3249c72783411d6ad3d064b45513d..d71cdb6035ed012c347a790b99c8c32daca42a13 100644 (file)
 
 #include "unicode/brkiter.h"
 #include "unicode/casemap.h"
+#include "unicode/chariter.h"
 #include "unicode/localpointer.h"
 #include "unicode/ubrk.h"
 #include "unicode/ucasemap.h"
+#include "unicode/utext.h"
 #include "cmemory.h"
+#include "uassert.h"
 #include "ucase.h"
 #include "ucasemap_imp.h"
 
-U_NAMESPACE_USE
+U_NAMESPACE_BEGIN
 
-/* functions available in the common library (for unistr_case.cpp) */
+/**
+ * Whole-string BreakIterator.
+ * Titlecasing only calls setText(), first(), and next().
+ * We implement the rest only to satisfy the abstract interface.
+ */
+class WholeStringBreakIterator : public BreakIterator {
+public:
+    WholeStringBreakIterator() : BreakIterator(), length(0) {}
+    ~WholeStringBreakIterator() override;
+    UBool operator==(const BreakIterator&) const override;
+    BreakIterator *clone() const override;
+    static UClassID U_EXPORT2 getStaticClassID();
+    UClassID getDynamicClassID() const override;
+    CharacterIterator &getText() const override;
+    UText *getUText(UText *fillIn, UErrorCode &errorCode) const override;
+    void  setText(const UnicodeString &text) override;
+    void  setText(UText *text, UErrorCode &errorCode) override;
+    void  adoptText(CharacterIterator* it) override;
+    int32_t first() override;
+    int32_t last() override;
+    int32_t previous() override;
+    int32_t next() override;
+    int32_t current() const override;
+    int32_t following(int32_t offset) override;
+    int32_t preceding(int32_t offset) override;
+    UBool isBoundary(int32_t offset) override;
+    int32_t next(int32_t n) override;
+    BreakIterator *createBufferClone(void *stackBuffer, int32_t &BufferSize,
+                                     UErrorCode &errorCode) override;
+    BreakIterator &refreshInputText(UText *input, UErrorCode &errorCode) override;
 
-/* public API functions */
+private:
+    int32_t length;
+};
 
-U_CAPI int32_t U_EXPORT2
-u_strToTitle(UChar *dest, int32_t destCapacity,
-             const UChar *src, int32_t srcLength,
-             UBreakIterator *titleIter,
-             const char *locale,
-             UErrorCode *pErrorCode) {
-    LocalPointer<BreakIterator> ownedIter;
-    BreakIterator *iter;
-    if(titleIter!=NULL) {
-        iter=reinterpret_cast<BreakIterator *>(titleIter);
-    } else {
-        iter=BreakIterator::createWordInstance(Locale(locale), *pErrorCode);
-        ownedIter.adoptInstead(iter);
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(WholeStringBreakIterator)
+
+WholeStringBreakIterator::~WholeStringBreakIterator() {}
+UBool WholeStringBreakIterator::operator==(const BreakIterator&) const { return FALSE; }
+BreakIterator *WholeStringBreakIterator::clone() const { return nullptr; }
+
+CharacterIterator &WholeStringBreakIterator::getText() const {
+    U_ASSERT(FALSE);  // really should not be called
+    // Returns a null reference, which is undefined behavior if this function is ever called.
+    // Otherwise we would have to define a dummy CharacterIterator,
+    // and either have it as a field and const_cast it to a non-const reference,
+    // or have it via a pointer and return a reference to that.
+    CharacterIterator *none = nullptr;
+    return *none;
+}
+UText *WholeStringBreakIterator::getUText(UText * /*fillIn*/, UErrorCode &errorCode) const {
+    if (U_SUCCESS(errorCode)) {
+        errorCode = U_UNSUPPORTED_ERROR;
     }
-    if(U_FAILURE(*pErrorCode)) {
-        return 0;
+    return nullptr;
+}
+
+void  WholeStringBreakIterator::setText(const UnicodeString &text) {
+    length = text.length();
+}
+void  WholeStringBreakIterator::setText(UText *text, UErrorCode &errorCode) {
+    if (U_SUCCESS(errorCode)) {
+        int64_t length64 = utext_nativeLength(text);
+        if (length64 <= INT32_MAX) {
+            length = (int32_t)length64;
+        } else {
+            errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
+        }
     }
-    UnicodeString s(srcLength<0, src, srcLength);
-    iter->setText(s);
-    return ustrcase_mapWithOverlap(
-        ustrcase_getCaseLocale(locale), 0, iter,
-        dest, destCapacity,
-        src, srcLength,
-        ustrcase_internalToTitle, *pErrorCode);
+}
+void  WholeStringBreakIterator::adoptText(CharacterIterator* it) {
+    U_ASSERT(FALSE);  // should not be called
+    length = it->getLength();
+    delete it;
 }
 
-U_NAMESPACE_BEGIN
+int32_t WholeStringBreakIterator::first() { return 0; }
+int32_t WholeStringBreakIterator::last() { return length; }
+int32_t WholeStringBreakIterator::previous() { return 0; }
+int32_t WholeStringBreakIterator::next() { return length; }
+int32_t WholeStringBreakIterator::current() const { return 0; }
+int32_t WholeStringBreakIterator::following(int32_t /*offset*/) { return length; }
+int32_t WholeStringBreakIterator::preceding(int32_t /*offset*/) { return 0; }
+UBool WholeStringBreakIterator::isBoundary(int32_t /*offset*/) { return FALSE; }
+int32_t WholeStringBreakIterator::next(int32_t /*n*/) { return length; }
+
+BreakIterator *WholeStringBreakIterator::createBufferClone(
+        void * /*stackBuffer*/, int32_t & /*BufferSize*/, UErrorCode &errorCode) {
+    if (U_SUCCESS(errorCode)) {
+        errorCode = U_UNSUPPORTED_ERROR;
+    }
+    return nullptr;
+}
+BreakIterator &WholeStringBreakIterator::refreshInputText(
+        UText * /*input*/, UErrorCode &errorCode) {
+    if (U_SUCCESS(errorCode)) {
+        errorCode = U_UNSUPPORTED_ERROR;
+    }
+    return *this;
+}
+
+U_CFUNC
+BreakIterator *ustrcase_getTitleBreakIterator(
+        const Locale *locale, const char *locID, uint32_t options, BreakIterator *iter,
+        LocalPointer<BreakIterator> &ownedIter, UErrorCode &errorCode) {
+    if (U_FAILURE(errorCode)) { return nullptr; }
+    options &= U_TITLECASE_ITERATOR_MASK;
+    if (options != 0 && iter != nullptr) {
+        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        return nullptr;
+    }
+    if (iter == nullptr) {
+        switch (options) {
+        case 0:
+            iter = BreakIterator::createWordInstance(
+                locale != nullptr ? *locale : Locale(locID), errorCode);
+            break;
+        case U_TITLECASE_WHOLE_STRING:
+            iter = new WholeStringBreakIterator();
+            if (iter == nullptr) {
+                errorCode = U_MEMORY_ALLOCATION_ERROR;
+            }
+            break;
+        case U_TITLECASE_SENTENCES:
+            iter = BreakIterator::createSentenceInstance(
+                locale != nullptr ? *locale : Locale(locID), errorCode);
+            break;
+        default:
+            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+            break;
+        }
+        ownedIter.adoptInstead(iter);
+    }
+    return iter;
+}
 
 int32_t CaseMap::toTitle(
         const char *locale, uint32_t options, BreakIterator *iter,
@@ -70,11 +177,8 @@ int32_t CaseMap::toTitle(
         UChar *dest, int32_t destCapacity, Edits *edits,
         UErrorCode &errorCode) {
     LocalPointer<BreakIterator> ownedIter;
+    iter = ustrcase_getTitleBreakIterator(nullptr, locale, options, iter, ownedIter, errorCode);
     if(iter==NULL) {
-        iter=BreakIterator::createWordInstance(Locale(locale), errorCode);
-        ownedIter.adoptInstead(iter);
-    }
-    if(U_FAILURE(errorCode)) {
         return 0;
     }
     UnicodeString s(srcLength<0, src, srcLength);
@@ -88,6 +192,30 @@ int32_t CaseMap::toTitle(
 
 U_NAMESPACE_END
 
+U_NAMESPACE_USE
+
+U_CAPI int32_t U_EXPORT2
+u_strToTitle(UChar *dest, int32_t destCapacity,
+             const UChar *src, int32_t srcLength,
+             UBreakIterator *titleIter,
+             const char *locale,
+             UErrorCode *pErrorCode) {
+    LocalPointer<BreakIterator> ownedIter;
+    BreakIterator *iter = ustrcase_getTitleBreakIterator(
+        nullptr, locale, 0, reinterpret_cast<BreakIterator *>(titleIter),
+        ownedIter, *pErrorCode);
+    if (iter == nullptr) {
+        return 0;
+    }
+    UnicodeString s(srcLength<0, src, srcLength);
+    iter->setText(s);
+    return ustrcase_mapWithOverlap(
+        ustrcase_getCaseLocale(locale), 0, iter,
+        dest, destCapacity,
+        src, srcLength,
+        ustrcase_internalToTitle, *pErrorCode);
+}
+
 U_CAPI int32_t U_EXPORT2
 ucasemap_toTitle(UCaseMap *csm,
                  UChar *dest, int32_t destCapacity,
@@ -97,10 +225,13 @@ ucasemap_toTitle(UCaseMap *csm,
         return 0;
     }
     if (csm->iter == NULL) {
-        csm->iter = BreakIterator::createWordInstance(Locale(csm->locale), *pErrorCode);
-    }
-    if (U_FAILURE(*pErrorCode)) {
-        return 0;
+        LocalPointer<BreakIterator> ownedIter;
+        BreakIterator *iter = ustrcase_getTitleBreakIterator(
+            nullptr, csm->locale, csm->options, nullptr, ownedIter, *pErrorCode);
+        if (iter == nullptr) {
+            return 0;
+        }
+        csm->iter = ownedIter.orphan();
     }
     UnicodeString s(srcLength<0, src, srcLength);
     csm->iter->setText(s);
index 57f6c8b755cbbbb3da30c9436507865f78eedd9d..6fffb90a3891b4ba2435cfd7c0237414c4838a5e 100644 (file)
@@ -237,7 +237,7 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it
                          const UChar *src, int32_t srcLength,
                          icu::Edits *edits,
                          UErrorCode &errorCode) {
-    if(U_FAILURE(errorCode)) {
+    if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
         return 0;
     }
 
@@ -264,45 +264,38 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it
         }
 
         /*
-         * Unicode 4 & 5 section 3.13 Default Case Operations:
-         *
-         * R3  toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
-         * #29, "Text Boundaries." Between each pair of word boundaries, find the first
-         * cased character F. If F exists, map F to default_title(F); then map each
-         * subsequent character C to default_lower(C).
-         *
-         * In this implementation, segment [prev..index[ into 3 parts:
-         * a) uncased characters (copy as-is) [prev..titleStart[
-         * b) first case letter (titlecase)         [titleStart..titleLimit[
+         * Segment [prev..index[ into 3 parts:
+         * a) skipped characters (copy as-is) [prev..titleStart[
+         * b) first letter (titlecase)              [titleStart..titleLimit[
          * c) subsequent characters (lowercase)                 [titleLimit..index[
          */
         if(prev<index) {
-            /* find and copy uncased characters [prev..titleStart[ */
+            // Find and copy skipped characters [prev..titleStart[
             int32_t titleStart=prev;
             int32_t titleLimit=prev;
             UChar32 c;
             U16_NEXT(src, titleLimit, index, c);
-            if((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(c)) {
-                /* Adjust the titlecasing index (titleStart) to the next cased character. */
-                for(;;) {
+            if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
+                // Adjust the titlecasing index to the next cased character,
+                // or to the next letter/number/symbol/private use.
+                // Stop with titleStart<titleLimit<=index
+                // if there is a character to be titlecased,
+                // or else stop with titleStart==titleLimit==index.
+                UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
+                while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
                     titleStart=titleLimit;
                     if(titleLimit==index) {
-                        /*
-                         * only uncased characters in [prev..index[
-                         * stop with titleStart==titleLimit==index
-                         */
                         break;
                     }
                     U16_NEXT(src, titleLimit, index, c);
-                    if(UCASE_NONE!=ucase_getType(c)) {
-                        break; /* cased letter at [titleStart..titleLimit[ */
-                    }
                 }
-                destIndex=appendUnchanged(dest, destIndex, destCapacity,
-                                          src+prev, titleStart-prev, options, edits);
-                if(destIndex<0) {
-                    errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
-                    return 0;
+                if (prev < titleStart) {
+                    destIndex=appendUnchanged(dest, destIndex, destCapacity,
+                                              src+prev, titleStart-prev, options, edits);
+                    if(destIndex<0) {
+                        errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+                        return 0;
+                    }
                 }
             }
 
index 3edad8e698782a6cc3dfa5ed2cb3905e33a4c7a2..a3901b2302e6351ca0a0cbf4947d7e88e60f8114 100644 (file)
@@ -19,6 +19,7 @@
 */
 
 #include "unicode/std_string.h"
+#include "unicode/brkiter.h"
 #include "unicode/casemap.h"
 #include "unicode/edits.h"
 #include "unicode/uchar.h"
@@ -49,6 +50,7 @@ public:
                         int32_t whichCase,
                         void *iter, const char *localeID, uint32_t options);
     void TestCasing();
+    void TestTitleOptions();
     void TestFullCaseFoldingIterator();
     void TestGreekUpper();
     void TestLongUpper();
@@ -84,6 +86,7 @@ StringCaseTest::runIndexedTest(int32_t index, UBool exec, const char *&name, cha
     TESTCASE_AUTO(TestCaseConversion);
 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILE_IO && !UCONFIG_NO_LEGACY_CONVERSION
     TESTCASE_AUTO(TestCasing);
+    TESTCASE_AUTO(TestTitleOptions);
 #endif
     TESTCASE_AUTO(TestFullCaseFoldingIterator);
     TESTCASE_AUTO(TestGreekUpper);
@@ -593,6 +596,59 @@ StringCaseTest::TestCasing() {
 #endif
 }
 
+// Exercises the titlecasing iterator options (whole string, sentences) and
+// index adjustment options (adjust-to-cased, no break adjustment),
+// then verifies that conflicting combinations are rejected.
+// Each TestCasingImpl() call passes the input, the expected result, the case
+// operation, no explicit iterator, a locale ID, and the options bits.
+void
+StringCaseTest::TestTitleOptions() {
+    // New options in ICU 60.
+    // Whole string: only one character is titlecased, the rest is lowercased.
+    TestCasingImpl(u"ʻcAt! ʻeTc.", u"ʻCat! ʻetc.", TEST_TITLE,
+                   nullptr, "", U_TITLECASE_WHOLE_STRING);
+    // Sentences without lowercasing: one titlecased character per sentence,
+    // all other characters are left as they are.
+    TestCasingImpl(u"a ʻCaT. A ʻdOg! ʻeTc.", u"A ʻCaT. A ʻdOg! ʻETc.", TEST_TITLE,
+                   nullptr, "", U_TITLECASE_SENTENCES|U_TITLECASE_NO_LOWERCASE);
+    // Default adjustment: the leading digit / CJK letter is the titlecasing
+    // position, so the following Latin letters are only lowercased.
+    TestCasingImpl(u"49eRs", u"49ers", TEST_TITLE,
+                   nullptr, "", U_TITLECASE_WHOLE_STRING);
+    TestCasingImpl(u"«丰(aBc)»", u"«丰(abc)»", TEST_TITLE,
+                   nullptr, "", U_TITLECASE_WHOLE_STRING);
+    // Adjust to cased: the first cased letter is titlecased instead.
+    TestCasingImpl(u"49eRs", u"49Ers", TEST_TITLE,
+                   nullptr, "", U_TITLECASE_WHOLE_STRING|U_TITLECASE_ADJUST_TO_CASED);
+    TestCasingImpl(u"«丰(aBc)»", u"«丰(Abc)»", TEST_TITLE,
+                   nullptr, "", U_TITLECASE_WHOLE_STRING|U_TITLECASE_ADJUST_TO_CASED);
+    // The leading space is skipped by the default adjustment ...
+    TestCasingImpl(u" john. Smith", u" John. Smith", TEST_TITLE,
+                   nullptr, "", U_TITLECASE_WHOLE_STRING|U_TITLECASE_NO_LOWERCASE);
+    // ... but not when break adjustment is turned off: nothing is uppercased,
+    // and the rest of the string is lowercased.
+    TestCasingImpl(u" john. Smith", u" john. smith", TEST_TITLE,
+                   nullptr, "", U_TITLECASE_WHOLE_STRING|U_TITLECASE_NO_BREAK_ADJUSTMENT);
+    // Language-specific titlecasing still applies after the index adjustment.
+    TestCasingImpl(u"«ijs»", u"«IJs»", TEST_TITLE,
+                   nullptr, "nl-BE", U_TITLECASE_WHOLE_STRING);
+    TestCasingImpl(u"«ijs»", u"«İjs»", TEST_TITLE,
+                   nullptr, "tr-DE", U_TITLECASE_WHOLE_STRING);
+
+    // Test conflicting settings.
+    // If & when we add more options, then the ORed combinations may become
+    // indistinguishable from valid values.
+    IcuTestErrorCode errorCode(*this, "TestTitleOptions");
+    // Two different index adjustment options at once.
+    CaseMap::toTitle("", U_TITLECASE_NO_BREAK_ADJUSTMENT|U_TITLECASE_ADJUST_TO_CASED, nullptr,
+                     u"", 0, nullptr, 0, nullptr, errorCode);
+    if (errorCode.get() != U_ILLEGAL_ARGUMENT_ERROR) {
+        errln("CaseMap::toTitle(multiple adjustment options) -> %s not illegal argument",
+              errorCode.errorName());
+    }
+    // Clear the expected failure before the next check.
+    errorCode.reset();
+    // Two different iterator options at once.
+    CaseMap::toTitle("", U_TITLECASE_WHOLE_STRING|U_TITLECASE_SENTENCES, nullptr,
+                     u"", 0, nullptr, 0, nullptr, errorCode);
+    if (errorCode.get() != U_ILLEGAL_ARGUMENT_ERROR) {
+        errln("CaseMap::toTitle(multiple iterator options) -> %s not illegal argument",
+              errorCode.errorName());
+    }
+    errorCode.reset();
+    // An iterator option together with an explicit BreakIterator.
+    LocalPointer<BreakIterator> iter(
+        BreakIterator::createCharacterInstance(Locale::getRoot(), errorCode));
+    CaseMap::toTitle("", U_TITLECASE_WHOLE_STRING, iter.getAlias(),
+                     u"", 0, nullptr, 0, nullptr, errorCode);
+    if (errorCode.get() != U_ILLEGAL_ARGUMENT_ERROR) {
+        errln("CaseMap::toTitle(iterator option + iterator) -> %s not illegal argument",
+              errorCode.errorName());
+    }
+    errorCode.reset();
+}
+
 void
 StringCaseTest::TestFullCaseFoldingIterator() {
     UnicodeString ffi=UNICODE_STRING_SIMPLE("ffi");
index f28e60ed5eaa358434124237189ec49a34a9877e..b59b54fdc33b19d2587a50dffef8e0f55a75266e 100644 (file)
@@ -3,11 +3,15 @@
 package com.ibm.icu.impl;
 
 import java.io.IOException;
+import java.text.CharacterIterator;
+import java.util.Locale;
 
 import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UCharacterCategory;
 import com.ibm.icu.text.BreakIterator;
 import com.ibm.icu.text.Edits;
 import com.ibm.icu.util.ICUUncheckedIOException;
+import com.ibm.icu.util.ULocale;
 
 public final class CaseMapImpl {
     /**
@@ -134,11 +138,192 @@ public final class CaseMapImpl {
         protected int dir; // 0=initial state  >0=forward  <0=backward
     }
 
+    public static final int TITLECASE_WHOLE_STRING = 0x20;
+    public static final int TITLECASE_SENTENCES = 0x40;
+
+    /**
+     * Bit mask for the titlecasing iterator options bit field.
+     * Currently only 3 out of 8 values are used:
+     * 0 (words), TITLECASE_WHOLE_STRING, TITLECASE_SENTENCES.
+     * See stringoptions.h.
+     * @internal
+     */
+    private static final int TITLECASE_ITERATOR_MASK = 0xe0;
+
+    public static final int TITLECASE_ADJUST_TO_CASED = 0x400;
+
+    /**
+     * Bit mask for the titlecasing index adjustment options bit set.
+     * Currently two bits are defined:
+     * TITLECASE_NO_BREAK_ADJUSTMENT, TITLECASE_ADJUST_TO_CASED.
+     * See stringoptions.h.
+     * @internal
+     */
+    private static final int TITLECASE_ADJUSTMENT_MASK = 0x600;
+
+    public static int addTitleAdjustmentOption(int options, int newOption) {
+        int adjOptions = options & TITLECASE_ADJUSTMENT_MASK;
+        if (adjOptions !=0 && adjOptions != newOption) {
+            throw new IllegalArgumentException("multiple titlecasing index adjustment options");
+        }
+        return options | newOption;
+    }
+
+    /**
+     * Bit set of general categories, with one bit per UCharacterCategory value,
+     * that isLNS() accepts unconditionally:
+     * letters (except modifier letters), numbers, symbols, and private use.
+     */
+    private static final int LNS =
+            (1 << UCharacterCategory.UPPERCASE_LETTER) |
+            (1 << UCharacterCategory.LOWERCASE_LETTER) |
+            (1 << UCharacterCategory.TITLECASE_LETTER) |
+            // Not MODIFIER_LETTER: We count only cased modifier letters.
+            (1 << UCharacterCategory.OTHER_LETTER) |
+
+            (1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER) |
+            (1 << UCharacterCategory.LETTER_NUMBER) |
+            (1 << UCharacterCategory.OTHER_NUMBER) |
+
+            (1 << UCharacterCategory.MATH_SYMBOL) |
+            (1 << UCharacterCategory.CURRENCY_SYMBOL) |
+            (1 << UCharacterCategory.MODIFIER_SYMBOL) |
+            (1 << UCharacterCategory.OTHER_SYMBOL) |
+
+            (1 << UCharacterCategory.PRIVATE_USE);
+
+    private static boolean isLNS(int c) {
+        // Letter, number, symbol,
+        // or a private use code point because those are typically used as letters or numbers.
+        // Consider modifier letters only if they are cased.
+        int gc = UCharacterProperty.INSTANCE.getType(c);
+        return ((1 << gc) & LNS) != 0 ||
+                (gc == UCharacterCategory.MODIFIER_LETTER &&
+                    UCaseProps.INSTANCE.getType(c) != UCaseProps.NONE);
+    }
+
+    public static int addTitleIteratorOption(int options, int newOption) {
+        int iterOptions = options & TITLECASE_ITERATOR_MASK;
+        if (iterOptions !=0 && iterOptions != newOption) {
+            throw new IllegalArgumentException("multiple titlecasing iterator options");
+        }
+        return options | newOption;
+    }
+
+    public static BreakIterator getTitleBreakIterator(
+            Locale locale, int options, BreakIterator iter) {
+        options &= TITLECASE_ITERATOR_MASK;
+        if (options != 0 && iter != null) {
+            throw new IllegalArgumentException(
+                    "titlecasing iterator option together with an explicit iterator");
+        }
+        if (iter == null) {
+            switch (options) {
+            case 0:
+                iter = BreakIterator.getWordInstance(locale);
+                break;
+            case TITLECASE_WHOLE_STRING:
+                iter = new WholeStringBreakIterator();
+                break;
+            case TITLECASE_SENTENCES:
+                iter = BreakIterator.getSentenceInstance(locale);
+                break;
+            default:
+                throw new IllegalArgumentException("unknown titlecasing iterator option");
+            }
+        }
+        return iter;
+    }
+
+    public static BreakIterator getTitleBreakIterator(
+            ULocale locale, int options, BreakIterator iter) {
+        options &= TITLECASE_ITERATOR_MASK;
+        if (options != 0 && iter != null) {
+            throw new IllegalArgumentException(
+                    "titlecasing iterator option together with an explicit iterator");
+        }
+        if (iter == null) {
+            switch (options) {
+            case 0:
+                iter = BreakIterator.getWordInstance(locale);
+                break;
+            case TITLECASE_WHOLE_STRING:
+                iter = new WholeStringBreakIterator();
+                break;
+            case TITLECASE_SENTENCES:
+                iter = BreakIterator.getSentenceInstance(locale);
+                break;
+            default:
+                throw new IllegalArgumentException("unknown titlecasing iterator option");
+            }
+        }
+        return iter;
+    }
+
     /**
      * Omit unchanged text when case-mapping with Edits.
      */
     public static final int OMIT_UNCHANGED_TEXT = 0x4000;
 
+    private static final class WholeStringBreakIterator extends BreakIterator {
+        private int length;
+
+        private static void notImplemented() {
+            throw new UnsupportedOperationException("should not occur");
+        }
+
+        @Override
+        public int first() {
+            return 0;
+        }
+
+        @Override
+        public int last() {
+            notImplemented();
+            return 0;
+        }
+
+        @Override
+        public int next(int n) {
+            notImplemented();
+            return 0;
+        }
+
+        @Override
+        public int next() {
+            return length;
+        }
+
+        @Override
+        public int previous() {
+            notImplemented();
+            return 0;
+        }
+
+        @Override
+        public int following(int offset) {
+            notImplemented();
+            return 0;
+        }
+
+        @Override
+        public int current() {
+            notImplemented();
+            return 0;
+        }
+
+        @Override
+        public CharacterIterator getText() {
+            notImplemented();
+            return null;
+        }
+
+        @Override
+        public void setText(CharacterIterator newText) {
+            length = newText.getEndIndex();
+        }
+
+        @Override
+        public void setText(String newText) {
+            length = newText.length();
+        }
+    }
+
     private static int appendCodePoint(Appendable a, int c) throws IOException {
         if (c <= Character.MAX_VALUE) {
             a.append((char)c);
@@ -266,32 +451,33 @@ public final class CaseMapImpl {
                 }
 
                 /*
-                 * Unicode 4 & 5 section 3.13 Default Case Operations:
-                 *
-                 * R3  toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
-                 * #29, "Text Boundaries." Between each pair of word boundaries, find the first
-                 * cased character F. If F exists, map F to default_title(F); then map each
-                 * subsequent character C to default_lower(C).
-                 *
-                 * In this implementation, segment [prev..index[ into 3 parts:
-                 * a) uncased characters (copy as-is) [prev..titleStart[
-                 * b) first case letter (titlecase)         [titleStart..titleLimit[
+                 * Segment [prev..index[ into 3 parts:
+                 * a) skipped characters (copy as-is) [prev..titleStart[
+                 * b) first letter (titlecase)              [titleStart..titleLimit[
                  * c) subsequent characters (lowercase)                 [titleLimit..index[
                  */
                 if(prev<index) {
-                    // find and copy uncased characters [prev..titleStart[
+                    // Find and copy skipped characters [prev..titleStart[
                     int titleStart=prev;
                     iter.setLimit(index);
                     int c=iter.nextCaseMapCP();
-                    if((options&UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT)==0
-                            && UCaseProps.NONE==UCaseProps.INSTANCE.getType(c)) {
-                        // Adjust the titlecasing index (titleStart) to the next cased character.
-                        while((c=iter.nextCaseMapCP())>=0
-                                && UCaseProps.NONE==UCaseProps.INSTANCE.getType(c)) {}
+                    if ((options&UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
+                        // Adjust the titlecasing index to the next cased character,
+                        // or to the next letter/number/symbol/private use.
+                        // Stop with titleStart<titleLimit<=index
+                        // if there is a character to be titlecased,
+                        // or else stop with titleStart==titleLimit==index.
+                        boolean toCased = (options&CaseMapImpl.TITLECASE_ADJUST_TO_CASED) != 0;
+                        while ((toCased ?
+                                    UCaseProps.NONE==UCaseProps.INSTANCE.getType(c) :
+                                        !CaseMapImpl.isLNS(c)) &&
+                                (c=iter.nextCaseMapCP())>=0) {}
                         // If c<0 then we have only skipped characters in [prev..index[
                         // and stopped with titleStart==titleLimit==index.
                         titleStart=iter.getCPStart();
-                        appendUnchanged(src, prev, titleStart-prev, dest, options, edits);
+                        if (prev < titleStart) {
+                            appendUnchanged(src, prev, titleStart-prev, dest, options, edits);
+                        }
                     }
 
                     if(titleStart<index) {
index 03ff19efdddfa43da8b8d906d028e74440fcbe72..2b5076cff8184c449b24e05b035b6854f4bf1618 100644 (file)
@@ -26,6 +26,7 @@ import com.ibm.icu.impl.locale.AsciiUtil;
 import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.lang.UScript;
 import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.text.CaseMap;
 import com.ibm.icu.text.DisplayContext;
 import com.ibm.icu.text.DisplayContext.Type;
 import com.ibm.icu.text.LocaleDisplayNames;
@@ -86,6 +87,13 @@ public class LocaleDisplayNamesImpl extends LocaleDisplayNames {
      */
     private transient BreakIterator capitalizationBrkIter = null;
 
+    private static final CaseMap.Title TO_TITLE_WHOLE_STRING_NO_LOWERCASE =
+            CaseMap.toTitle().wholeString().noLowercase();
+
+    private static String toTitleWholeStringNoLowercase(ULocale locale, String s) {
+        return TO_TITLE_WHOLE_STRING_NO_LOWERCASE.apply(
+                locale.toLocale(), null, s, new StringBuilder(), null).toString();
+    }
 
     public static LocaleDisplayNames getInstance(ULocale locale, DialectHandling dialectHandling) {
         synchronized (cache) {
@@ -602,9 +610,12 @@ public class LocaleDisplayNamesImpl extends LocaleDisplayNames {
         ULocale minimized = ULocale.minimizeSubtags(modified, ULocale.Minimize.FAVOR_SCRIPT);
         String tempName = modified.getDisplayName(locale);
         boolean titlecase = capContext == DisplayContext.CAPITALIZATION_FOR_UI_LIST_OR_MENU;
-        String nameInDisplayLocale =  titlecase ? UCharacter.toTitleFirst(locale, tempName) : tempName;
+        String nameInDisplayLocale =
+                titlecase ? toTitleWholeStringNoLowercase(locale, tempName) : tempName;
         tempName = modified.getDisplayName(modified);
-        String nameInSelf = capContext == DisplayContext.CAPITALIZATION_FOR_UI_LIST_OR_MENU ? UCharacter.toTitleFirst(modified, tempName) : tempName;
+        String nameInSelf = capContext ==
+                DisplayContext.CAPITALIZATION_FOR_UI_LIST_OR_MENU ?
+                        toTitleWholeStringNoLowercase(modified, tempName) : tempName;
         return new UiListItem(minimized, modified, nameInDisplayLocale, nameInSelf);
     }
 
index 6b5619d23cee0f41b6c0d5e09991fb7c2bd638f3..0882229766e5c14b83df7becaf04a43e175b6c4c 100644 (file)
@@ -1124,9 +1124,15 @@ public final class UCaseProps {
     /**
      * Bit mask for getting just the options from a string compare options word
      * that are relevant for case folding (of a single string or code point).
+     *
+     * Currently only bit 0 for FOLD_CASE_EXCLUDE_SPECIAL_I.
+     * It is conceivable that at some point we might use one more bit for using uppercase sharp s.
+     * It is conceivable that at some point we might want the option to use only simple case foldings
+     * when operating on strings.
+     *
      * @internal
      */
-    private static final int FOLD_CASE_OPTIONS_MASK = 0xff;
+    private static final int FOLD_CASE_OPTIONS_MASK = 7;
 
     /* return the simple case folding mapping for c */
     public final int fold(int c, int options) {
index 4416e1208c29c916039fea1e9f602cba833ba610..fc970dc790bb20973e36e84abab60cd92e34d29c 100644 (file)
@@ -5185,22 +5185,14 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
      */
     public static String toTitleCase(ULocale locale, String str,
             BreakIterator titleIter, int options) {
-        if(titleIter == null) {
-            if (locale == null) {
-                locale = ULocale.getDefault();
-            }
-            titleIter = BreakIterator.getWordInstance(locale);
+        if (titleIter == null && locale == null) {
+            locale = ULocale.getDefault();
         }
+        titleIter = CaseMapImpl.getTitleBreakIterator(locale, options, titleIter);
         titleIter.setText(str);
         return toTitleCase(getCaseLocale(locale), options, titleIter, str);
     }
 
-
-    private static final int BREAK_MASK =
-            (1<<UCharacterCategory.DECIMAL_DIGIT_NUMBER)
-            | (1<<UCharacterCategory.OTHER_LETTER)
-            | (1<<UCharacterCategory.MODIFIER_LETTER);
-
     /**
      * Return a string with just the first word titlecased, for menus and UI, etc. This does not affect most of the string,
      * and sometimes has no effect at all; the original string is returned whenever casing
@@ -5225,49 +5217,14 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
      */
     @Deprecated
     public static String toTitleFirst(ULocale locale, String str) {
-        int c = 0;
-        for (int i = 0; i < str.length(); i += UCharacter.charCount(c)) {
-            c = UCharacter.codePointAt(str, i);
-            int propertyMask = UCharacter.getIntPropertyValue(c, UProperty.GENERAL_CATEGORY_MASK);
-            if ((propertyMask & BREAK_MASK) != 0) { // handle "49ers", initial CJK
-                break;
-            }
-            if (UCaseProps.INSTANCE.getType(c) == UCaseProps.NONE) {
-                continue;
-            }
-
-            // we now have the first cased character
-            // What we really want is something like:
-            // String titled = UCharacter.toTitleCase(locale, str, i, outputCharsTaken);
-            // That is, just give us the titlecased string, for the locale, at i and following,
-            // and tell us how many characters are replaced.
-            // The following won't work completely: it needs some more substantial changes to UCaseProps
-
-            String substring = str.substring(i, i+UCharacter.charCount(c));
-            String titled = UCharacter.toTitleCase(locale, substring, BreakIterator.getSentenceInstance(locale), 0);
-
-            // skip if no change
-            if (titled.codePointAt(0) == c) {
-                // Using 0 is safe, since any change in titling will not have first initial character
-                break;
-            }
-            StringBuilder result = new StringBuilder(str.length()).append(str, 0, i);
-            int startOfSuffix;
-
-            // handle dutch, but check first for 'i', since that's faster. Should be built into UCaseProps.
-
-            if (c == 'i' && locale.getLanguage().equals("nl") && i < str.length() && str.charAt(i+1) == 'j') {
-                result.append("IJ");
-                startOfSuffix = 2;
-            } else {
-                result.append(titled);
-                startOfSuffix = i + UCharacter.charCount(c);
-            }
-
-            // add the remainder, and return
-            return result.append(str, startOfSuffix, str.length()).toString();
-        }
-        return str; // no change
+        return toTitleCase(locale, str, null,
+                CaseMapImpl.TITLECASE_WHOLE_STRING|TITLECASE_NO_LOWERCASE);
+        // TODO: Remove this function.
+        // Move something like the following helper function into CLDR.
+        // private static final CaseMap.Title TO_TITLE_WHOLE_STRING_NO_LOWERCASE =
+        //         CaseMap.toTitle().wholeString().noLowercase();
+        // return TO_TITLE_WHOLE_STRING_NO_LOWERCASE.apply(
+        //         locale.toLocale(), null, str, new StringBuilder(), null).toString();
     }
 
     /**
@@ -5295,9 +5252,10 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
     public static String toTitleCase(Locale locale, String str,
             BreakIterator titleIter,
             int options) {
-        if(titleIter == null) {
-            titleIter = BreakIterator.getWordInstance(locale);
+        if (titleIter == null && locale == null) {
+            locale = Locale.getDefault();
         }
+        titleIter = CaseMapImpl.getTitleBreakIterator(locale, options, titleIter);
         titleIter.setText(str);
         return toTitleCase(getCaseLocale(locale), options, titleIter, str);
     }
index e998c66248ad51042985dee369183757c6430024..2f05a967f258dfb9e72051ae747fd66a98e206a6 100644 (file)
@@ -174,6 +174,42 @@ public abstract class CaseMap {
         private static final Title OMIT_UNCHANGED = new Title(CaseMapImpl.OMIT_UNCHANGED_TEXT);
         private Title(int opt) { super(opt); }
 
+        /**
+         * Returns an instance that behaves like this one but
+         * titlecases the string as a whole rather than each word.
+         * (Titlecases only the character at index 0, possibly adjusted.)
+         *
+         * <p>It is an error to specify multiple titlecasing iterator options together,
+         * including both an option and an explicit BreakIterator.
+         *
+         * @return an options object with this option.
+         * @see #adjustToCased()
+         * @draft ICU 60
+         * @provisional This API might change or be removed in a future release.
+         */
+        public Title wholeString() {
+            return new Title(CaseMapImpl.addTitleIteratorOption(
+                    internalOptions, CaseMapImpl.TITLECASE_WHOLE_STRING));
+        }
+
+        /**
+         * Returns an instance that behaves like this one but
+         * titlecases sentences rather than words.
+         * (Titlecases only the first character of each sentence, possibly adjusted.)
+         *
+         * <p>It is an error to specify multiple titlecasing iterator options together,
+         * including both an option and an explicit BreakIterator.
+         *
+         * @return an options object with this option.
+         * @see #adjustToCased()
+         * @draft ICU 60
+         * @provisional This API might change or be removed in a future release.
+         */
+        public Title sentences() {
+            return new Title(CaseMapImpl.addTitleIteratorOption(
+                    internalOptions, CaseMapImpl.TITLECASE_SENTENCES));
+        }
+
         /**
          * {@inheritDoc}
          * @draft ICU 59
@@ -191,12 +227,14 @@ public abstract class CaseMap {
          * Returns an instance that behaves like this one but
          * does not lowercase non-initial parts of words when titlecasing.
          *
-         * <p>By default, titlecasing will titlecase the first cased character
-         * of a word and lowercase all other characters.
+         * <p>By default, titlecasing will titlecase the character at each
+         * (possibly adjusted) BreakIterator index and
+         * lowercase all other characters up to the next iterator index.
          * With this option, the other characters will not be modified.
          *
          * @return an options object with this option.
          * @see UCharacter#TITLECASE_NO_LOWERCASE
+         * @see #adjustToCased()
          * @draft ICU 59
          * @provisional This API might change or be removed in a future release.
          */
@@ -204,22 +242,16 @@ public abstract class CaseMap {
             return new Title(internalOptions | UCharacter.TITLECASE_NO_LOWERCASE);
         }
 
-        // TODO: update references to the Unicode Standard for recent version
         /**
          * Returns an instance that behaves like this one but
-         * does not adjust the titlecasing indexes from BreakIterator::next() indexes;
+         * does not adjust the titlecasing BreakIterator indexes;
          * titlecases exactly the characters at breaks from the iterator.
          *
          * <p>By default, titlecasing will take each break iterator index,
-         * adjust it by looking for the next cased character, and titlecase that one.
-         * Other characters are lowercased.
-         *
-         * <p>This follows Unicode 4 &amp; 5 section 3.13 Default Case Operations:
+         * adjust it to the next relevant character (see {@link #adjustToCased()}),
+         * and titlecase that one.
          *
-         * R3  toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
-         * #29, "Text Boundaries." Between each pair of word boundaries, find the first
-         * cased character F. If F exists, map F to default_title(F); then map each
-         * subsequent character C to default_lower(C).
+         * <p>Other characters are lowercased.
          *
          * @return an options object with this option.
          * @see UCharacter#TITLECASE_NO_BREAK_ADJUSTMENT
@@ -227,7 +259,33 @@ public abstract class CaseMap {
          * @provisional This API might change or be removed in a future release.
          */
         public Title noBreakAdjustment() {
-            return new Title(internalOptions | UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT);
+            return new Title(CaseMapImpl.addTitleAdjustmentOption(
+                    internalOptions, UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT));
+        }
+
+        /**
+         * Returns an instance that behaves like this one but
+         * adjusts each titlecasing BreakIterator index to the next cased character.
+         * (See the Unicode Standard, chapter 3, Default Case Conversion, R3 toTitlecase(X).)
+         *
+         * <p>This used to be the default index adjustment in ICU.
+         * Since ICU 60, the default index adjustment is to the next character that is
+         * a letter, number, symbol, or private use code point.
+         * (Uncased modifier letters are skipped.)
+         * The difference in behavior is small for word titlecasing,
+         * but the new adjustment is much better for whole-string and sentence titlecasing:
+         * It yields "49ers" and "«丰(abc)»" instead of "49Ers" and "«丰(Abc)»".
+         *
+         * <p>It is an error to specify multiple titlecasing adjustment options together.
+         *
+         * @return an options object with this option.
+         * @see #noBreakAdjustment()
+         * @draft ICU 60
+         * @provisional This API might change or be removed in a future release.
+         */
+        public Title adjustToCased() {
+            return new Title(CaseMapImpl.addTitleAdjustmentOption(
+                    internalOptions, CaseMapImpl.TITLECASE_ADJUST_TO_CASED));
         }
 
         /**
@@ -259,9 +317,10 @@ public abstract class CaseMap {
          */
          public <A extends Appendable> A apply(
                  Locale locale, BreakIterator iter, CharSequence src, A dest, Edits edits) {
-             if (iter == null) {
-                 iter = BreakIterator.getWordInstance(locale);
+             if (iter == null && locale == null) {
+                 locale = Locale.getDefault();
              }
+             iter = CaseMapImpl.getTitleBreakIterator(locale, internalOptions, iter);
              iter.setText(src.toString());
              return CaseMapImpl.toTitle(
                      getCaseLocale(locale), internalOptions, iter, src, dest, edits);
index 6f8a67983b882cd14e612c72b5f5a9a0b1212024..8075fef6c8854870d313faf9224dee6b3cd87aa7 100644 (file)
@@ -343,6 +343,63 @@ public final class UCharacterCaseTest extends TestFmwk
          }
     }
 
+    // Helper, not a @Test: titlecases input with the given options and locale
+    // (no explicit BreakIterator, no Edits) and checks the result against output.
+    // See ICU4C intltest strcase.cpp TestCasingImpl().
+    void TestCasingImpl(String input, String output, CaseMap.Title toTitle, Locale locale) {
+        StringBuilder actual = toTitle.apply(locale, null, input, new StringBuilder(), null);
+        String message = "toTitle(" + input + ')';
+        assertEquals(message, output, actual.toString());
+    }
+
+    // Exercises the titlecasing iterator options (wholeString, sentences) and
+    // index adjustment options (adjustToCased, noBreakAdjustment),
+    // then verifies that conflicting combinations throw IllegalArgumentException.
+    @Test
+    public void TestTitleOptions() {
+        Locale root = Locale.ROOT;
+        // New options in ICU 60.
+        // Whole string: only one character is titlecased, the rest is lowercased.
+        TestCasingImpl("ʻcAt! ʻeTc.", "ʻCat! ʻetc.",
+                CaseMap.toTitle().wholeString(), root);
+        // Sentences without lowercasing: one titlecased character per sentence,
+        // all other characters are left as they are.
+        TestCasingImpl("a ʻCaT. A ʻdOg! ʻeTc.", "A ʻCaT. A ʻdOg! ʻETc.",
+                CaseMap.toTitle().sentences().noLowercase(), root);
+        // Default adjustment: the leading digit / CJK letter is the titlecasing
+        // position, so the following Latin letters are only lowercased.
+        TestCasingImpl("49eRs", "49ers",
+                CaseMap.toTitle().wholeString(), root);
+        TestCasingImpl("«丰(aBc)»", "«丰(abc)»",
+                CaseMap.toTitle().wholeString(), root);
+        // Adjust to cased: the first cased letter is titlecased instead.
+        TestCasingImpl("49eRs", "49Ers",
+                CaseMap.toTitle().wholeString().adjustToCased(), root);
+        TestCasingImpl("«丰(aBc)»", "«丰(Abc)»",
+                CaseMap.toTitle().wholeString().adjustToCased(), root);
+        // The leading space is skipped by the default adjustment ...
+        TestCasingImpl(" john. Smith", " John. Smith",
+                CaseMap.toTitle().wholeString().noLowercase(), root);
+        // ... but not with noBreakAdjustment(): nothing is uppercased,
+        // and the rest of the string is lowercased.
+        TestCasingImpl(" john. Smith", " john. smith",
+                CaseMap.toTitle().wholeString().noBreakAdjustment(), root);
+        // Language-specific titlecasing still applies after the index adjustment.
+        TestCasingImpl("«ijs»", "«IJs»",
+                CaseMap.toTitle().wholeString(), new Locale("nl", "BE"));
+        TestCasingImpl("«ijs»", "«İjs»",
+                CaseMap.toTitle().wholeString(), new Locale("tr", "DE"));
+
+        // Test conflicting settings.
+        // If & when we add more options, then the ORed combinations may become
+        // indistinguishable from valid values.
+        // Two different index adjustment options at once.
+        try {
+            CaseMap.toTitle().noBreakAdjustment().adjustToCased().
+                    apply(root, null, "", new StringBuilder(), null);
+            fail("CaseMap.toTitle(multiple adjustment options) " +
+                    "did not throw an IllegalArgumentException");
+        } catch(IllegalArgumentException expected) {
+            // Expected: nothing more to check.
+        }
+        // Two different iterator options at once.
+        try {
+            CaseMap.toTitle().wholeString().sentences().
+                    apply(root, null, "", new StringBuilder(), null);
+            fail("CaseMap.toTitle(multiple iterator options) " +
+                    "did not throw an IllegalArgumentException");
+        } catch(IllegalArgumentException expected) {
+            // Expected: nothing more to check.
+        }
+        // An iterator option together with an explicit BreakIterator.
+        BreakIterator iter = BreakIterator.getCharacterInstance(root);
+        try {
+            CaseMap.toTitle().wholeString().apply(root, iter, "", new StringBuilder(), null);
+            fail("CaseMap.toTitle(iterator option + iterator) " +
+                    "did not throw an IllegalArgumentException");
+        } catch(IllegalArgumentException expected) {
+            // Expected: nothing more to check.
+        }
+    }
+
     @Test
     public void TestDutchTitle() {
         ULocale LOC_DUTCH = new ULocale("nl");
index 02953cbe14da5f6f4ff5a3eb3f144a6c95ec4da6..a53ea50e58b60eb4a7dc4b92741e49af22daac41 100644 (file)
@@ -133,7 +133,7 @@ public class TransliteratorTest extends TestFmwk {
         Transliterator hanLatin = Transliterator.getInstance("Han-Latin");
         assertTransform("Transform", "z\u00E0o Unicode", hanLatin, "\u9020Unicode");
         assertTransform("Transform", "z\u00E0i chu\u00E0ng z\u00E0o Unicode zh\u012B qi\u00E1n", hanLatin, "\u5728\u5275\u9020Unicode\u4E4B\u524D");
-    } 
+    }
 
     @Test
     public void TestRegistry() {
@@ -510,15 +510,19 @@ public class TransliteratorTest extends TestFmwk {
 
         Transliterator hex = Transliterator.getInstance("Any-Hex");
         hex.setFilter(new UnicodeFilter() {
+            @Override
             public boolean contains(int c) {
                 return c != 'c';
             }
+            @Override
             public String toPattern(boolean escapeUnprintable) {
                 return "";
             }
+            @Override
             public boolean matchesIndexValue(int v) {
                 return false;
             }
+            @Override
             public void addMatchSetTo(UnicodeSet toUnionTo) {}
         });
         String s = "abcde";
@@ -1561,6 +1565,7 @@ public class TransliteratorTest extends TestFmwk {
             public NameableNullTrans(String id) {
                 super(id, null);
             }
+            @Override
             protected void handleTransliterate(Replaceable text,
                     Position offsets, boolean incremental) {
                 offsets.start = offsets.limit;
@@ -1570,6 +1575,7 @@ public class TransliteratorTest extends TestFmwk {
         public TestFact(String theID) {
             id = theID;
         }
+        @Override
         public Transliterator getInstance(String ignoredID) {
             return new NameableNullTrans(id);
         }
@@ -1873,8 +1879,8 @@ public class TransliteratorTest extends TestFmwk {
                 t.setFilter(new UnicodeSet("[:Ll:]"));
                 expect(t, "aAaA", "bAbA");
             } finally {
-                Transliterator.unregister("a_to_A"); 
-                Transliterator.unregister("A_to_b");   
+                Transliterator.unregister("a_to_A");
+                Transliterator.unregister("A_to_b");
             }
         }
 
@@ -2731,6 +2737,7 @@ public class TransliteratorTest extends TestFmwk {
             //System.out.println("Registering: " + ID + ", " + t.toRules(true));
             Transliterator.registerFactory(ID, singleton);
         }
+        @Override
         public Transliterator getInstance(String ID) {
             return (Transliterator) m.get(ID);
         }
@@ -2751,8 +2758,17 @@ public class TransliteratorTest extends TestFmwk {
             String casefold = UCharacter.foldCase(s, true);
             assertEquals("Casefold", casefold, toCasefold.transform(s));
 
-            String title = UCharacter.toTitleCase(ULocale.ROOT, s, null);
-            assertEquals("Title", title, toTitle.transform(s));
+            if (i != 0x0345) {
+                // ICU 60 changes the default titlecasing index adjustment.
+                // For word breaks it is mostly the same as before,
+                // but it is different for the iota subscript (the only cased combining mark).
+                // This should be ok because the iota subscript is not supposed to appear
+                // at the start of a word.
+                // The title Transliterator is far below feature parity with the
+                // UCharacter and CaseMap titlecasing functions.
+                String title = UCharacter.toTitleCase(ULocale.ROOT, s, null);
+                assertEquals("Title", title, toTitle.transform(s));
+            }
 
             String upper = UCharacter.toUpperCase(ULocale.ROOT, s);
             assertEquals("Upper", upper, toUpper.transform(s));
@@ -3008,6 +3024,7 @@ public class TransliteratorTest extends TestFmwk {
             Transliterator.registerFactory(ID, singleton);
         }
 
+        @Override
         public Transliterator getInstance(String ID) {
             return (Transliterator) m.get(new CaseInsensitiveString(ID));
         }
@@ -3040,7 +3057,7 @@ public class TransliteratorTest extends TestFmwk {
      */
     @Test
     public void TestAny() {
-        UnicodeSet alphabetic = (UnicodeSet) new UnicodeSet("[:alphabetic:]").freeze();
+        UnicodeSet alphabetic = new UnicodeSet("[:alphabetic:]").freeze();
         StringBuffer testString = new StringBuffer();
         for (int i = 0; i < UScript.CODE_LIMIT; ++i) {
             UnicodeSet sample = new UnicodeSet().applyPropertyAlias("script", UScript.getShortName(i)).retainAll(alphabetic);
@@ -3142,7 +3159,7 @@ public class TransliteratorTest extends TestFmwk {
 
             // add all the trail characters
             if (!nonStarters.containsSome(trailString)) {
-                continue; 
+                continue;
             }
             UnicodeSet trailSet = leadToTrail.get(first);
             if (trailSet == null) {
@@ -3190,7 +3207,7 @@ public class TransliteratorTest extends TestFmwk {
         //                disorderedMarks.add(s);
         //                disorderedMarks.add(nfc.normalize(s));
         //                addDerivedStrings(nfc, disorderedMarks, s);
-        //            }            
+        //            }
         //            s = nfd.getDecomposition(i);
         //            if (s != null) {
         //                disorderedMarks.add(s);
@@ -3292,6 +3309,10 @@ public class TransliteratorTest extends TestFmwk {
                     addSourceTarget(s, empiricalSource, t, empiricalTarget);
                 }
             }
+            if (rule.contains("title")) {
+                // See the comment in TestCasing() about the iota subscript.
+                empiricalSource.remove(0x345);
+            }
             assertEquals("getSource(" + ruleDisplay + ")", empiricalSource, actualSource, SetAssert.MISSING_OK);
             assertEquals("getTarget(" + ruleDisplay + ")", empiricalTarget, actualTarget, SetAssert.MISSING_OK);
         }
@@ -3336,8 +3357,8 @@ public class TransliteratorTest extends TestFmwk {
                 String direction = t == t0 ? "FORWARD\t" : "REVERSE\t";
                 targetIndex++;
                 UnicodeSet expectedTarget = testPair.length <= targetIndex ? expectedSource
-                        : testPair[targetIndex] == null ? expectedSource 
-                                : testPair[targetIndex].length() == 0 ? expectedSource 
+                        : testPair[targetIndex] == null ? expectedSource
+                                : testPair[targetIndex].length() == 0 ? expectedSource
                                         : new UnicodeSet(testPair[targetIndex]);
                 ok = assertEquals(direction + "getSource\t\"" + test + '"', expectedSource, source);
                 if (!ok) { // for debugging
@@ -3410,7 +3431,7 @@ public class TransliteratorTest extends TestFmwk {
         };
         for (String[] row : startTests) {
             int actual = findSharedStartLength(row[1], row[2]);
-            assertEquals("findSharedStartLength(" + row[1] + "," + row[2] + ")", 
+            assertEquals("findSharedStartLength(" + row[1] + "," + row[2] + ")",
                     Integer.parseInt(row[0]),
                     actual);
         }
@@ -3423,8 +3444,8 @@ public class TransliteratorTest extends TestFmwk {
         };
         for (String[] row : endTests) {
             int actual = findSharedEndLength(row[1], row[2]);
-            assertEquals("findSharedEndLength(" + row[1] + "," + row[2] + ")", 
-                    Integer.parseInt(row[0]), 
+            assertEquals("findSharedEndLength(" + row[1] + "," + row[2] + ")",
+                    Integer.parseInt(row[0]),
                     actual);
         }
     }
@@ -3916,7 +3937,7 @@ the ::BEGIN/::END stuff)
     @Test
     public void TestThai() {
         Transliterator tr = Transliterator.getInstance("Any-Latin", Transliterator.FORWARD);
-        String thaiText = 
+        String thaiText =
             "\u0e42\u0e14\u0e22\u0e1e\u0e37\u0e49\u0e19\u0e10\u0e32\u0e19\u0e41\u0e25\u0e49\u0e27, \u0e04\u0e2d" +
             "\u0e21\u0e1e\u0e34\u0e27\u0e40\u0e15\u0e2d\u0e23\u0e4c\u0e08\u0e30\u0e40\u0e01\u0e35\u0e48\u0e22" +
             "\u0e27\u0e02\u0e49\u0e2d\u0e07\u0e01\u0e31\u0e1a\u0e40\u0e23\u0e37\u0e48\u0e2d\u0e07\u0e02\u0e2d" +
@@ -3948,7 +3969,7 @@ the ::BEGIN/::END stuff)
             "\u0e17\u0e04\u0e19\u0e34\u0e04\u0e17\u0e35\u0e48\u0e43\u0e0a\u0e49\u0e01\u0e31\u0e19\u0e2d\u0e22" +
             "\u0e39\u0e48\u0e17\u0e31\u0e48\u0e27\u0e44\u0e1b.";
 
-        String latinText = 
+        String latinText =
             "doy ph\u1ee5\u0304\u0302n \u1e6d\u0304h\u0101n l\u00e6\u0302w, khxmphiwtexr\u0312 ca ke\u012b\u0300" +
             "ywk\u0304\u0125xng k\u1ea1b re\u1ee5\u0304\u0300xng k\u0304hxng t\u1ea1wlek\u0304h. khxmphiwtexr" +
             "\u0312 c\u1ea1d k\u0115b t\u1ea1w x\u1ea1ks\u0304\u02b9r l\u00e6a x\u1ea1kk\u0304h ra x\u1ee5\u0304" +
@@ -4041,6 +4062,7 @@ the ::BEGIN/::END stuff)
             this.expectedData = expectedData;
         }
 
+        @Override
         public void run() {
             errorMsg = null;
             StringBuffer inBuf = new StringBuffer(testData);