/**
* Bit mask for getting just the options from a string compare options word
* that are relevant for case folding (of a single string or code point).
+ *
+ * Currently only bit 0 is used, for U_FOLD_CASE_EXCLUDE_SPECIAL_I.
+ * The mask value 7 covers bits 0..2, which leaves room for two more options:
+ * It is conceivable that at some point we might use one more bit for using uppercase sharp s,
+ * and that we might want the option to use only simple case foldings
+ * when operating on strings.
+ *
* See stringoptions.h.
* @internal
*/
-#define _FOLD_CASE_OPTIONS_MASK 0xff
+#define _FOLD_CASE_OPTIONS_MASK 7
/* single-code point functions */
const uint8_t *src, int32_t srcLength,
icu::Edits *edits,
UErrorCode &errorCode) {
- if(U_FAILURE(errorCode)) {
+ if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
return 0;
}
}
/*
- * Unicode 4 & 5 section 3.13 Default Case Operations:
- *
- * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
- * #29, "Text Boundaries." Between each pair of word boundaries, find the first
- * cased character F. If F exists, map F to default_title(F); then map each
- * subsequent character C to default_lower(C).
- *
- * In this implementation, segment [prev..index[ into 3 parts:
- * a) uncased characters (copy as-is) [prev..titleStart[
- * b) first case letter (titlecase) [titleStart..titleLimit[
+ * Segment [prev..index[ into 3 parts:
+ * a) skipped characters (copy as-is) [prev..titleStart[
+ * b) first letter (titlecase) [titleStart..titleLimit[
* c) subsequent characters (lowercase) [titleLimit..index[
*/
if(prev<index) {
- /* find and copy uncased characters [prev..titleStart[ */
+ /* find and copy skipped characters [prev..titleStart[ */
int32_t titleStart=prev;
int32_t titleLimit=prev;
UChar32 c;
U8_NEXT(src, titleLimit, index, c);
- if((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(c)) {
- /* Adjust the titlecasing index (titleStart) to the next cased character. */
- for(;;) {
+ if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
+ // Adjust the titlecasing index to the next cased character,
+ // or to the next letter/number/symbol/private use.
+ // Stop with titleStart<titleLimit<=index
+ // if there is a character to be titlecased,
+ // or else stop with titleStart==titleLimit==index.
+ UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
+ while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
titleStart=titleLimit;
if(titleLimit==index) {
- /*
- * only uncased characters in [prev..index[
- * stop with titleStart==titleLimit==index
- */
break;
}
U8_NEXT(src, titleLimit, index, c);
- if(UCASE_NONE!=ucase_getType(c)) {
- break; /* cased letter at [titleStart..titleLimit[ */
- }
}
- destIndex=appendUnchanged(dest, destIndex, destCapacity,
- src+prev, titleStart-prev, options, edits);
- if(destIndex<0) {
- errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
- return 0;
+ if (prev < titleStart) {
+ destIndex=appendUnchanged(dest, destIndex, destCapacity,
+ src+prev, titleStart-prev, options, edits);
+ if(destIndex<0) {
+ errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
}
}
#include "unicode/utypes.h"
#include "unicode/ucasemap.h"
+#include "unicode/uchar.h"
#include "ucase.h"
+/**
+ * Bit mask for the titlecasing iterator options bit field.
+ * The field occupies bits 5..7 of the options word and can therefore hold 8 values.
+ * Currently only 3 out of 8 values are used:
+ * 0 (words), U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
+ * ustrcase_getTitleBreakIterator() sets U_ILLEGAL_ARGUMENT_ERROR for any other value.
+ * See stringoptions.h.
+ * @internal
+ */
+#define U_TITLECASE_ITERATOR_MASK 0xe0
+
+/**
+ * Bit mask for the titlecasing index adjustment options bit set.
+ * Currently two bits are defined:
+ * U_TITLECASE_NO_BREAK_ADJUSTMENT (0x200), U_TITLECASE_ADJUST_TO_CASED (0x400).
+ * Setting both bits together is an error;
+ * ustrcase_checkTitleAdjustmentOptions() checks for that.
+ * See stringoptions.h.
+ * @internal
+ */
+#define U_TITLECASE_ADJUSTMENT_MASK 0x600
+
/**
* Internal API, used by u_strcasecmp() etc.
* Compare strings case-insensitively,
UErrorCode *pErrorCode);
/**
- * Interanl API, used for detecting length of
+ * Internal API, used for detecting length of
* shared prefix case-insensitively.
* @param s1 input string 1
* @param length1 length of string 1, or -1 (NULL terminated)
#ifdef __cplusplus
+U_NAMESPACE_BEGIN
+
+/**
+ * Validates the titlecasing index adjustment bits of an options word.
+ *
+ * @param options Options bit set; only the U_TITLECASE_ADJUSTMENT_MASK bits are examined.
+ * @param errorCode In/out error code. If it already indicates a failure,
+ *        then it is left unchanged and FALSE is returned.
+ *        Set to U_ILLEGAL_ARGUMENT_ERROR if all bits of the mask are set,
+ *        that is, if both adjustment options are specified together.
+ * @return TRUE if the options are valid. Otherwise FALSE, and sets an error.
+ */
+inline UBool ustrcase_checkTitleAdjustmentOptions(uint32_t options, UErrorCode &errorCode) {
+ if (U_FAILURE(errorCode)) { return FALSE; }
+ if ((options & U_TITLECASE_ADJUSTMENT_MASK) == U_TITLECASE_ADJUSTMENT_MASK) {
+ // Both options together.
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ return FALSE;
+ }
+ return TRUE;
+}
+
+inline UBool ustrcase_isLNS(UChar32 c) {
+ // Returns whether c is a letter, number, or symbol,
+ // or a private use code point because those are typically used as letters or numbers.
+ // Consider modifier letters only if they are cased: Lm is removed from the mask, and a modifier letter is accepted only when ucase_getType(c) is not UCASE_NONE.
+ const uint32_t LNS = (U_GC_L_MASK|U_GC_N_MASK|U_GC_S_MASK|U_GC_CO_MASK) & ~U_GC_LM_MASK;
+ int gc = u_charType(c); // general category of c
+ return (U_MASK(gc) & LNS) != 0 || (gc == U_MODIFIER_LETTER && ucase_getType(c) != UCASE_NONE);
+}
+
+#if !UCONFIG_NO_BREAK_ITERATION
+
+/**
+ * Returns the BreakIterator to be used for titlecasing.
+ * Returns nullptr if error. Pass in either locale or locID, not both.
+ *
+ * @param locale Locale for a newly created word or sentence iterator,
+ *        or nullptr to use Locale(locID) instead.
+ * @param locID Locale ID, used only when locale is nullptr.
+ * @param options Options bit set; only the U_TITLECASE_ITERATOR_MASK bits are examined.
+ * @param iter Caller-provided iterator, or nullptr.
+ *        It is an error (U_ILLEGAL_ARGUMENT_ERROR) to pass an iterator
+ *        together with a non-zero iterator option.
+ * @param ownedIter Adopts the iterator that this function creates when iter is nullptr.
+ *        Not touched when an iterator is passed in.
+ * @param errorCode In/out error code; nothing is done if it already indicates a failure.
+ * @return The passed-in iterator, or else the newly created one; nullptr if error.
+ */
+U_CFUNC
+BreakIterator *ustrcase_getTitleBreakIterator(
+ const Locale *locale, const char *locID, uint32_t options, BreakIterator *iter,
+ LocalPointer<BreakIterator> &ownedIter, UErrorCode &errorCode);
+
+#endif
+
+U_NAMESPACE_END
+
#include "unicode/unistr.h" // for UStringCaseMapper
/*
UText utext=UTEXT_INITIALIZER;
utext_openUTF8(&utext, src, srcLength, &errorCode);
LocalPointer<BreakIterator> ownedIter;
+ iter = ustrcase_getTitleBreakIterator(nullptr, locale, options, iter, ownedIter, errorCode);
if(iter==NULL) {
- iter=BreakIterator::createWordInstance(Locale(locale), errorCode);
- ownedIter.adoptInstead(iter);
- }
- if(U_FAILURE(errorCode)) {
utext_close(&utext);
return 0;
}
}
UText utext=UTEXT_INITIALIZER;
utext_openUTF8(&utext, (const char *)src, srcLength, pErrorCode);
- if(csm->iter==NULL) {
- csm->iter=BreakIterator::createWordInstance(Locale(csm->locale), *pErrorCode);
- }
if (U_FAILURE(*pErrorCode)) {
return 0;
}
+ if(csm->iter==NULL) {
+ LocalPointer<BreakIterator> ownedIter;
+ BreakIterator *iter = ustrcase_getTitleBreakIterator(
+ nullptr, csm->locale, csm->options, nullptr, ownedIter, *pErrorCode);
+ if (iter == nullptr) {
+ utext_close(&utext);
+ return 0;
+ }
+ csm->iter = ownedIter.orphan();
+ }
csm->iter->setText(&utext, *pErrorCode);
int32_t length=ucasemap_mapUTF8(
csm->caseLocale, csm->options, csm->iter,
*
* @param locale The locale ID. ("" = root locale, NULL = default locale.)
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT,
- * U_TITLECASE_NO_LOWERCASE, U_TITLECASE_NO_BREAK_ADJUSTMENT.
+ * U_TITLECASE_NO_LOWERCASE,
+ * U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED,
+ * U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
* @param iter A break iterator to find the first characters of words that are to be titlecased.
* It is set to the source string (setText())
* and used one or more times for iteration (first() and next()).
*
* @param locale The locale ID. ("" = root locale, NULL = default locale.)
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT,
- * U_TITLECASE_NO_LOWERCASE, U_TITLECASE_NO_BREAK_ADJUSTMENT.
+ * U_TITLECASE_NO_LOWERCASE,
+ * U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED,
+ * U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
* @param iter A break iterator to find the first characters of words that are to be titlecased.
- * It is set to the source string (setText())
+ * It is set to the source string (setUText())
* and used one or more times for iteration (first() and next()).
* If NULL, then a word break iterator for the locale is used
* (or something equivalent).
*/
#define U_FOLD_CASE_EXCLUDE_SPECIAL_I 1
+#ifndef U_HIDE_DRAFT_API
+
+/**
+ * Titlecase the string as a whole rather than each word.
+ * (Titlecase only the character at index 0, possibly adjusted.)
+ * Option bits value for titlecasing APIs that take an options bit set.
+ *
+ * It is an error to specify multiple titlecasing iterator options together,
+ * including both an options bit and an explicit BreakIterator.
+ *
+ * @see U_TITLECASE_ADJUST_TO_CASED
+ * @draft ICU 60
+ */
+#define U_TITLECASE_WHOLE_STRING 0x20
+
+/**
+ * Titlecase sentences rather than words.
+ * (Titlecase only the first character of each sentence, possibly adjusted.)
+ * Option bits value for titlecasing APIs that take an options bit set.
+ *
+ * It is an error to specify multiple titlecasing iterator options together,
+ * including both an options bit and an explicit BreakIterator.
+ *
+ * @see U_TITLECASE_ADJUST_TO_CASED
+ * @draft ICU 60
+ */
+#define U_TITLECASE_SENTENCES 0x40
+
+#endif // U_HIDE_DRAFT_API
+
/**
* Do not lowercase non-initial parts of words when titlecasing.
* Option bit for titlecasing APIs that take an options bit set.
*
- * By default, titlecasing will titlecase the first cased character
- * of a word and lowercase all other characters.
+ * By default, titlecasing will titlecase the character at each
+ * (possibly adjusted) BreakIterator index and
+ * lowercase all other characters up to the next iterator index.
* With this option, the other characters will not be modified.
*
+ * @see U_TITLECASE_ADJUST_TO_CASED
+ * @see UnicodeString::toTitle
+ * @see CaseMap::toTitle
* @see ucasemap_setOptions
* @see ucasemap_toTitle
* @see ucasemap_utf8ToTitle
- * @see UnicodeString::toTitle
* @stable ICU 3.8
*/
#define U_TITLECASE_NO_LOWERCASE 0x100
/**
- * Do not adjust the titlecasing indexes from BreakIterator::next() indexes;
+ * Do not adjust the titlecasing BreakIterator indexes;
* titlecase exactly the characters at breaks from the iterator.
* Option bit for titlecasing APIs that take an options bit set.
*
* By default, titlecasing will take each break iterator index,
- * adjust it by looking for the next cased character, and titlecase that one.
- * Other characters are lowercased.
+ * adjust it to the next relevant character (see U_TITLECASE_ADJUST_TO_CASED),
+ * and titlecase that one.
*
- * This follows Unicode 4 & 5 section 3.13 Default Case Operations:
+ * Other characters are lowercased.
*
- * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
- * #29, "Text Boundaries." Between each pair of word boundaries, find the first
- * cased character F. If F exists, map F to default_title(F); then map each
- * subsequent character C to default_lower(C).
+ * It is an error to specify multiple titlecasing adjustment options together.
*
+ * @see U_TITLECASE_ADJUST_TO_CASED
+ * @see U_TITLECASE_NO_LOWERCASE
+ * @see UnicodeString::toTitle
+ * @see CaseMap::toTitle
* @see ucasemap_setOptions
* @see ucasemap_toTitle
* @see ucasemap_utf8ToTitle
- * @see UnicodeString::toTitle
- * @see U_TITLECASE_NO_LOWERCASE
* @stable ICU 3.8
*/
#define U_TITLECASE_NO_BREAK_ADJUSTMENT 0x200
#ifndef U_HIDE_DRAFT_API
+/**
+ * Adjust each titlecasing BreakIterator index to the next cased character.
+ * (See the Unicode Standard, chapter 3, Default Case Conversion, R3 toTitlecase(X).)
+ * Option bit for titlecasing APIs that take an options bit set.
+ *
+ * This used to be the default index adjustment in ICU.
+ * Since ICU 60, the default index adjustment is to the next character that is
+ * a letter, number, symbol, or private use code point.
+ * (Uncased modifier letters are skipped.)
+ * The difference in behavior is small for word titlecasing,
+ * but the new adjustment is much better for whole-string and sentence titlecasing:
+ * It yields "49ers" and "«丰(abc)»" instead of "49Ers" and "«丰(Abc)»".
+ *
+ * It is an error to specify multiple titlecasing adjustment options together.
+ *
+ * @see U_TITLECASE_NO_BREAK_ADJUSTMENT
+ * @draft ICU 60
+ */
+#define U_TITLECASE_ADJUST_TO_CASED 0x400
+
/**
* Omit unchanged text when recording how source substrings
* relate to changed and unchanged result substrings.
//
// Internal: (may change or be removed)
// ucase.h #define _STRCASECMP_OPTIONS_MASK 0xffff
-// ucase.h #define _FOLD_CASE_OPTIONS_MASK 0xff
+// ucase.h #define _FOLD_CASE_OPTIONS_MASK 7
+// ucasemap_imp.h #define U_TITLECASE_ITERATOR_MASK 0xe0
+// ucasemap_imp.h #define U_TITLECASE_ADJUSTMENT_MASK 0x600
// ustr_imp.h #define _STRNCMP_STYLE 0x1000
// unormcmp.cpp #define _COMPARE_EQUIV 0x80000
* The standard titlecase iterator for the root locale implements the
* algorithm of Unicode TR 21.
*
- * This function uses only the setUText(), first(), next() and close() methods of the
+ * This function uses only the setText(), first() and next() methods of the
* provided break iterator.
*
* The result may be longer or shorter than the original.
* break iterator is opened.
* Otherwise the provided iterator is set to the string's text.
* @param locale The locale to consider.
+ * @param options Options bit set, usually 0. See U_TITLECASE_NO_LOWERCASE,
+ * U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED,
+ * U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
* See also ucasemap_open().
* @return A reference to this.
- * @see U_TITLECASE_NO_LOWERCASE
- * @see U_TITLECASE_NO_BREAK_ADJUSTMENT
- * @see ucasemap_open
* @stable ICU 3.8
*/
UnicodeString &toTitle(BreakIterator *titleIter, const Locale &locale, uint32_t options);
U_NAMESPACE_BEGIN
UnicodeString &
-UnicodeString::toTitle(BreakIterator *titleIter) {
- return toTitle(titleIter, Locale::getDefault(), 0);
+UnicodeString::toTitle(BreakIterator *iter) {
+ return toTitle(iter, Locale::getDefault(), 0);
}
UnicodeString &
-UnicodeString::toTitle(BreakIterator *titleIter, const Locale &locale) {
- return toTitle(titleIter, locale, 0);
+UnicodeString::toTitle(BreakIterator *iter, const Locale &locale) {
+ return toTitle(iter, locale, 0);
}
UnicodeString &
-UnicodeString::toTitle(BreakIterator *titleIter, const Locale &locale, uint32_t options) {
- BreakIterator *bi=titleIter;
- if(bi==NULL) {
- UErrorCode errorCode=U_ZERO_ERROR;
- bi=BreakIterator::createWordInstance(locale, errorCode);
- if(U_FAILURE(errorCode)) {
- setToBogus();
- return *this;
+UnicodeString::toTitle(BreakIterator *iter, const Locale &locale, uint32_t options) {
+ LocalPointer<BreakIterator> ownedIter;
+ UErrorCode errorCode = U_ZERO_ERROR;
+ iter = ustrcase_getTitleBreakIterator(&locale, "", options, iter, ownedIter, errorCode);
+ if (iter == nullptr) {
+ setToBogus();
+ return *this;
}
- }
- caseMap(ustrcase_getCaseLocale(locale.getBaseName()), options, bi, ustrcase_internalToTitle);
- if(titleIter==NULL) {
- delete bi;
- }
- return *this;
+ caseMap(ustrcase_getCaseLocale(locale.getBaseName()), options, iter, ustrcase_internalToTitle);
+ return *this;
}
U_NAMESPACE_END
#include "unicode/brkiter.h"
#include "unicode/casemap.h"
+#include "unicode/chariter.h"
#include "unicode/localpointer.h"
#include "unicode/ubrk.h"
#include "unicode/ucasemap.h"
+#include "unicode/utext.h"
#include "cmemory.h"
+#include "uassert.h"
#include "ucase.h"
#include "ucasemap_imp.h"
-U_NAMESPACE_USE
+U_NAMESPACE_BEGIN
-/* functions available in the common library (for unistr_case.cpp) */
+/**
+ * Whole-string BreakIterator.
+ * Treats the entire text as a single segment:
+ * first() returns 0 and next() returns the text length.
+ * Only the length of the text is stored, not the text itself.
+ *
+ * Titlecasing only calls setText(), first(), and next().
+ * We implement the rest only to satisfy the abstract interface.
+ */
+class WholeStringBreakIterator : public BreakIterator {
+public:
+ WholeStringBreakIterator() : BreakIterator(), length(0) {}
+ ~WholeStringBreakIterator() override;
+ UBool operator==(const BreakIterator&) const override;
+ BreakIterator *clone() const override;
+ static UClassID U_EXPORT2 getStaticClassID();
+ UClassID getDynamicClassID() const override;
+ CharacterIterator &getText() const override;
+ UText *getUText(UText *fillIn, UErrorCode &errorCode) const override;
+ void setText(const UnicodeString &text) override;
+ void setText(UText *text, UErrorCode &errorCode) override;
+ void adoptText(CharacterIterator* it) override;
+ int32_t first() override;
+ int32_t last() override;
+ int32_t previous() override;
+ int32_t next() override;
+ int32_t current() const override;
+ int32_t following(int32_t offset) override;
+ int32_t preceding(int32_t offset) override;
+ UBool isBoundary(int32_t offset) override;
+ int32_t next(int32_t n) override;
+ BreakIterator *createBufferClone(void *stackBuffer, int32_t &BufferSize,
+ UErrorCode &errorCode) override;
+ BreakIterator &refreshInputText(UText *input, UErrorCode &errorCode) override;
-/* public API functions */
+private:
+ int32_t length; // length of the text most recently set via setText() or adoptText(); initially 0
+};
-U_CAPI int32_t U_EXPORT2
-u_strToTitle(UChar *dest, int32_t destCapacity,
- const UChar *src, int32_t srcLength,
- UBreakIterator *titleIter,
- const char *locale,
- UErrorCode *pErrorCode) {
- LocalPointer<BreakIterator> ownedIter;
- BreakIterator *iter;
- if(titleIter!=NULL) {
- iter=reinterpret_cast<BreakIterator *>(titleIter);
- } else {
- iter=BreakIterator::createWordInstance(Locale(locale), *pErrorCode);
- ownedIter.adoptInstead(iter);
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(WholeStringBreakIterator)
+
+WholeStringBreakIterator::~WholeStringBreakIterator() {}
+UBool WholeStringBreakIterator::operator==(const BreakIterator&) const { return FALSE; }
+BreakIterator *WholeStringBreakIterator::clone() const { return nullptr; }
+
+CharacterIterator &WholeStringBreakIterator::getText() const {
+ U_ASSERT(FALSE); // really should not be called
+ // Returns a null reference. NOTE(review): dereferencing a null pointer is undefined behavior in C++; this is only tolerable as long as nothing ever calls getText() on this class.
+ // Otherwise we would have to define a dummy CharacterIterator,
+ // and either have it as a field and const_cast it to a non-const reference,
+ // or have it via a pointer and return a reference to that.
+ CharacterIterator *none = nullptr;
+ return *none;
+}
+UText *WholeStringBreakIterator::getUText(UText * /*fillIn*/, UErrorCode &errorCode) const {
+ if (U_SUCCESS(errorCode)) {
+ errorCode = U_UNSUPPORTED_ERROR;
}
- if(U_FAILURE(*pErrorCode)) {
- return 0;
+ return nullptr;
+}
+
+void WholeStringBreakIterator::setText(const UnicodeString &text) {
+ length = text.length();
+}
+void WholeStringBreakIterator::setText(UText *text, UErrorCode &errorCode) {
+ if (U_SUCCESS(errorCode)) {
+ int64_t length64 = utext_nativeLength(text);
+ if (length64 <= INT32_MAX) {
+ length = (int32_t)length64;
+ } else {
+ errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
+ }
}
- UnicodeString s(srcLength<0, src, srcLength);
- iter->setText(s);
- return ustrcase_mapWithOverlap(
- ustrcase_getCaseLocale(locale), 0, iter,
- dest, destCapacity,
- src, srcLength,
- ustrcase_internalToTitle, *pErrorCode);
+}
+void WholeStringBreakIterator::adoptText(CharacterIterator* it) {
+ U_ASSERT(FALSE); // should not be called
+ length = it->getLength();
+ delete it;
}
-U_NAMESPACE_BEGIN
+int32_t WholeStringBreakIterator::first() { return 0; } // start of the single segment
+int32_t WholeStringBreakIterator::last() { return length; } // end of the single segment
+int32_t WholeStringBreakIterator::previous() { return 0; } // no iteration position is tracked
+int32_t WholeStringBreakIterator::next() { return length; } // end of the single segment, regardless of how often it is called
+int32_t WholeStringBreakIterator::current() const { return 0; } // always 0: no iteration position is tracked
+int32_t WholeStringBreakIterator::following(int32_t /*offset*/) { return length; } // offset is ignored
+int32_t WholeStringBreakIterator::preceding(int32_t /*offset*/) { return 0; } // offset is ignored
+UBool WholeStringBreakIterator::isBoundary(int32_t /*offset*/) { return FALSE; } // always FALSE, even for 0 and length
+int32_t WholeStringBreakIterator::next(int32_t /*n*/) { return length; } // n is ignored
+
+BreakIterator *WholeStringBreakIterator::createBufferClone(
+ void * /*stackBuffer*/, int32_t & /*BufferSize*/, UErrorCode &errorCode) {
+ if (U_SUCCESS(errorCode)) {
+ errorCode = U_UNSUPPORTED_ERROR;
+ }
+ return nullptr;
+}
+BreakIterator &WholeStringBreakIterator::refreshInputText(
+ UText * /*input*/, UErrorCode &errorCode) {
+ if (U_SUCCESS(errorCode)) {
+ errorCode = U_UNSUPPORTED_ERROR;
+ }
+ return *this;
+}
+
+U_CFUNC
+BreakIterator *ustrcase_getTitleBreakIterator(
+ const Locale *locale, const char *locID, uint32_t options, BreakIterator *iter,
+ LocalPointer<BreakIterator> &ownedIter, UErrorCode &errorCode) {
+ if (U_FAILURE(errorCode)) { return nullptr; }
+ options &= U_TITLECASE_ITERATOR_MASK; // look only at the iterator-selection bit field
+ if (options != 0 && iter != nullptr) { // an iterator option conflicts with an explicit iterator
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ return nullptr;
+ }
+ if (iter == nullptr) { // no iterator passed in: create the one selected by the options
+ switch (options) {
+ case 0: // default: word iterator
+ iter = BreakIterator::createWordInstance(
+ locale != nullptr ? *locale : Locale(locID), errorCode);
+ break;
+ case U_TITLECASE_WHOLE_STRING: // the whole text is one segment
+ iter = new WholeStringBreakIterator();
+ if (iter == nullptr) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ }
+ break;
+ case U_TITLECASE_SENTENCES:
+ iter = BreakIterator::createSentenceInstance(
+ locale != nullptr ? *locale : Locale(locID), errorCode);
+ break;
+ default: // any other value of the bit field, including both option bits together
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ break;
+ }
+ ownedIter.adoptInstead(iter); // hand the newly created iterator (or nullptr) to the caller's LocalPointer
+ }
+ return iter; // the passed-in iterator, or the new one; nullptr e.g. for an invalid option value
+}
int32_t CaseMap::toTitle(
const char *locale, uint32_t options, BreakIterator *iter,
UChar *dest, int32_t destCapacity, Edits *edits,
UErrorCode &errorCode) {
LocalPointer<BreakIterator> ownedIter;
+ iter = ustrcase_getTitleBreakIterator(nullptr, locale, options, iter, ownedIter, errorCode);
if(iter==NULL) {
- iter=BreakIterator::createWordInstance(Locale(locale), errorCode);
- ownedIter.adoptInstead(iter);
- }
- if(U_FAILURE(errorCode)) {
return 0;
}
UnicodeString s(srcLength<0, src, srcLength);
U_NAMESPACE_END
+U_NAMESPACE_USE
+/**
+ * C API function: titlecases src into dest.
+ * Uses titleIter if it is not NULL; otherwise requests a default iterator
+ * for the locale with options 0 (which selects a word break iterator),
+ * held by the local ownedIter for the duration of this call.
+ * Returns 0 without mapping if no iterator could be obtained.
+ */
+U_CAPI int32_t U_EXPORT2
+u_strToTitle(UChar *dest, int32_t destCapacity,
+ const UChar *src, int32_t srcLength,
+ UBreakIterator *titleIter,
+ const char *locale,
+ UErrorCode *pErrorCode) {
+ LocalPointer<BreakIterator> ownedIter;
+ BreakIterator *iter = ustrcase_getTitleBreakIterator(
+ nullptr, locale, 0, reinterpret_cast<BreakIterator *>(titleIter),
+ ownedIter, *pErrorCode);
+ if (iter == nullptr) {
+ return 0;
+ }
+ UnicodeString s(srcLength<0, src, srcLength); // first argument tells whether src is NUL-terminated (srcLength<0)
+ iter->setText(s);
+ return ustrcase_mapWithOverlap(
+ ustrcase_getCaseLocale(locale), 0, iter,
+ dest, destCapacity,
+ src, srcLength,
+ ustrcase_internalToTitle, *pErrorCode);
+}
+
U_CAPI int32_t U_EXPORT2
ucasemap_toTitle(UCaseMap *csm,
UChar *dest, int32_t destCapacity,
return 0;
}
if (csm->iter == NULL) {
- csm->iter = BreakIterator::createWordInstance(Locale(csm->locale), *pErrorCode);
- }
- if (U_FAILURE(*pErrorCode)) {
- return 0;
+ LocalPointer<BreakIterator> ownedIter;
+ BreakIterator *iter = ustrcase_getTitleBreakIterator(
+ nullptr, csm->locale, csm->options, nullptr, ownedIter, *pErrorCode);
+ if (iter == nullptr) {
+ return 0;
+ }
+ csm->iter = ownedIter.orphan();
}
UnicodeString s(srcLength<0, src, srcLength);
csm->iter->setText(s);
const UChar *src, int32_t srcLength,
icu::Edits *edits,
UErrorCode &errorCode) {
- if(U_FAILURE(errorCode)) {
+ if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
return 0;
}
}
/*
- * Unicode 4 & 5 section 3.13 Default Case Operations:
- *
- * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
- * #29, "Text Boundaries." Between each pair of word boundaries, find the first
- * cased character F. If F exists, map F to default_title(F); then map each
- * subsequent character C to default_lower(C).
- *
- * In this implementation, segment [prev..index[ into 3 parts:
- * a) uncased characters (copy as-is) [prev..titleStart[
- * b) first case letter (titlecase) [titleStart..titleLimit[
+ * Segment [prev..index[ into 3 parts:
+ * a) skipped characters (copy as-is) [prev..titleStart[
+ * b) first letter (titlecase) [titleStart..titleLimit[
* c) subsequent characters (lowercase) [titleLimit..index[
*/
if(prev<index) {
- /* find and copy uncased characters [prev..titleStart[ */
+ // Find and copy skipped characters [prev..titleStart[
int32_t titleStart=prev;
int32_t titleLimit=prev;
UChar32 c;
U16_NEXT(src, titleLimit, index, c);
- if((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(c)) {
- /* Adjust the titlecasing index (titleStart) to the next cased character. */
- for(;;) {
+ if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
+ // Adjust the titlecasing index to the next cased character,
+ // or to the next letter/number/symbol/private use.
+ // Stop with titleStart<titleLimit<=index
+ // if there is a character to be titlecased,
+ // or else stop with titleStart==titleLimit==index.
+ UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
+ while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
titleStart=titleLimit;
if(titleLimit==index) {
- /*
- * only uncased characters in [prev..index[
- * stop with titleStart==titleLimit==index
- */
break;
}
U16_NEXT(src, titleLimit, index, c);
- if(UCASE_NONE!=ucase_getType(c)) {
- break; /* cased letter at [titleStart..titleLimit[ */
- }
}
- destIndex=appendUnchanged(dest, destIndex, destCapacity,
- src+prev, titleStart-prev, options, edits);
- if(destIndex<0) {
- errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
- return 0;
+ if (prev < titleStart) {
+ destIndex=appendUnchanged(dest, destIndex, destCapacity,
+ src+prev, titleStart-prev, options, edits);
+ if(destIndex<0) {
+ errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
}
}
*/
#include "unicode/std_string.h"
+#include "unicode/brkiter.h"
#include "unicode/casemap.h"
#include "unicode/edits.h"
#include "unicode/uchar.h"
int32_t whichCase,
void *iter, const char *localeID, uint32_t options);
void TestCasing();
+ void TestTitleOptions();
void TestFullCaseFoldingIterator();
void TestGreekUpper();
void TestLongUpper();
TESTCASE_AUTO(TestCaseConversion);
#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILE_IO && !UCONFIG_NO_LEGACY_CONVERSION
TESTCASE_AUTO(TestCasing);
+ TESTCASE_AUTO(TestTitleOptions);
#endif
TESTCASE_AUTO(TestFullCaseFoldingIterator);
TESTCASE_AUTO(TestGreekUpper);
#endif
}
+void
+StringCaseTest::TestTitleOptions() {
+ // New options in ICU 60. Each call passes the input, the expected titlecased result, no explicit iterator, the locale ID, and the options.
+ TestCasingImpl(u"ʻcAt! ʻeTc.", u"ʻCat! ʻetc.", TEST_TITLE,
+ nullptr, "", U_TITLECASE_WHOLE_STRING); // only the first letter of the whole string is titlecased; the rest is lowercased
+ TestCasingImpl(u"a ʻCaT. A ʻdOg! ʻeTc.", u"A ʻCaT. A ʻdOg! ʻETc.", TEST_TITLE,
+ nullptr, "", U_TITLECASE_SENTENCES|U_TITLECASE_NO_LOWERCASE); // first letter of each sentence; other characters are not modified
+ TestCasingImpl(u"49eRs", u"49ers", TEST_TITLE,
+ nullptr, "", U_TITLECASE_WHOLE_STRING); // default adjustment stops at the digit
+ TestCasingImpl(u"«丰(aBc)»", u"«丰(abc)»", TEST_TITLE,
+ nullptr, "", U_TITLECASE_WHOLE_STRING); // default adjustment skips the punctuation and stops at the uncased letter
+ TestCasingImpl(u"49eRs", u"49Ers", TEST_TITLE,
+ nullptr, "", U_TITLECASE_WHOLE_STRING|U_TITLECASE_ADJUST_TO_CASED); // adjusts to the first cased character instead
+ TestCasingImpl(u"«丰(aBc)»", u"«丰(Abc)»", TEST_TITLE,
+ nullptr, "", U_TITLECASE_WHOLE_STRING|U_TITLECASE_ADJUST_TO_CASED);
+ TestCasingImpl(u" john. Smith", u" John. Smith", TEST_TITLE,
+ nullptr, "", U_TITLECASE_WHOLE_STRING|U_TITLECASE_NO_LOWERCASE); // leading space is skipped; the rest is left as is
+ TestCasingImpl(u" john. Smith", u" john. smith", TEST_TITLE,
+ nullptr, "", U_TITLECASE_WHOLE_STRING|U_TITLECASE_NO_BREAK_ADJUSTMENT); // no adjustment: the space at index 0 takes the titlecase slot; everything after is lowercased
+ TestCasingImpl(u"«ijs»", u"«IJs»", TEST_TITLE,
+ nullptr, "nl-BE", U_TITLECASE_WHOLE_STRING); // Dutch locale: "ij" becomes "IJ"
+ TestCasingImpl(u"«ijs»", u"«İjs»", TEST_TITLE,
+ nullptr, "tr-DE", U_TITLECASE_WHOLE_STRING); // Turkish locale: "i" becomes dotted capital I
+
+ // Test conflicting settings: each call below must report U_ILLEGAL_ARGUMENT_ERROR.
+ // If & when we add more options, then the ORed combinations may become
+ // indistinguishable from valid values.
+ IcuTestErrorCode errorCode(*this, "TestTitleOptions");
+ CaseMap::toTitle("", U_TITLECASE_NO_BREAK_ADJUSTMENT|U_TITLECASE_ADJUST_TO_CASED, nullptr,
+ u"", 0, nullptr, 0, nullptr, errorCode);
+ if (errorCode.get() != U_ILLEGAL_ARGUMENT_ERROR) {
+ errln("CaseMap::toTitle(multiple adjustment options) -> %s not illegal argument",
+ errorCode.errorName());
+ }
+ errorCode.reset(); // clear the expected error before the next check
+ CaseMap::toTitle("", U_TITLECASE_WHOLE_STRING|U_TITLECASE_SENTENCES, nullptr,
+ u"", 0, nullptr, 0, nullptr, errorCode);
+ if (errorCode.get() != U_ILLEGAL_ARGUMENT_ERROR) {
+ errln("CaseMap::toTitle(multiple iterator options) -> %s not illegal argument",
+ errorCode.errorName());
+ }
+ errorCode.reset();
+ LocalPointer<BreakIterator> iter(
+ BreakIterator::createCharacterInstance(Locale::getRoot(), errorCode));
+ CaseMap::toTitle("", U_TITLECASE_WHOLE_STRING, iter.getAlias(),
+ u"", 0, nullptr, 0, nullptr, errorCode); // an iterator option plus an explicit iterator
+ if (errorCode.get() != U_ILLEGAL_ARGUMENT_ERROR) {
+ errln("CaseMap::toTitle(iterator option + iterator) -> %s not illegal argument",
+ errorCode.errorName());
+ }
+ errorCode.reset();
+}
+
void
StringCaseTest::TestFullCaseFoldingIterator() {
UnicodeString ffi=UNICODE_STRING_SIMPLE("ffi");
package com.ibm.icu.impl;
import java.io.IOException;
+import java.text.CharacterIterator;
+import java.util.Locale;
import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UCharacterCategory;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.Edits;
import com.ibm.icu.util.ICUUncheckedIOException;
+import com.ibm.icu.util.ULocale;
public final class CaseMapImpl {
/**
protected int dir; // 0=initial state >0=forward <0=backward
}
+ public static final int TITLECASE_WHOLE_STRING = 0x20; // iterator option: getTitleBreakIterator() returns a WholeStringBreakIterator
+ public static final int TITLECASE_SENTENCES = 0x40; // iterator option: getTitleBreakIterator() returns a sentence BreakIterator
+
+ /**
+ * Bit mask for the titlecasing iterator options bit field.
+ * The field occupies bits 5..7 of the options word and can therefore hold 8 values.
+ * Currently only 3 out of 8 values are used:
+ * 0 (words), TITLECASE_WHOLE_STRING, TITLECASE_SENTENCES.
+ * See stringoptions.h (the C/C++ header that defines the corresponding U_TITLECASE_ macros).
+ * @internal
+ */
+ private static final int TITLECASE_ITERATOR_MASK = 0xe0;
+
+ public static final int TITLECASE_ADJUST_TO_CASED = 0x400; // adjustment option; same value as U_TITLECASE_ADJUST_TO_CASED in stringoptions.h
+
+ /**
+ * Bit mask for the titlecasing index adjustment options bit set.
+ * Currently two bits are defined:
+ * TITLECASE_NO_BREAK_ADJUSTMENT, TITLECASE_ADJUST_TO_CASED.
+ * See stringoptions.h (the C/C++ header that defines the corresponding U_TITLECASE_ macros).
+ * @internal
+ */
+ private static final int TITLECASE_ADJUSTMENT_MASK = 0x600;
+ /**
+ * Adds a titlecasing index adjustment option to an options bit set.
+ *
+ * @param options the current options bit set
+ * @param newOption the adjustment option to add
+ * @return options with newOption set
+ * @throws IllegalArgumentException if options already contains
+ *         adjustment bits that differ from newOption
+ */
+ public static int addTitleAdjustmentOption(int options, int newOption) {
+ int adjOptions = options & TITLECASE_ADJUSTMENT_MASK;
+ if (adjOptions !=0 && adjOptions != newOption) {
+ throw new IllegalArgumentException("multiple titlecasing index adjustment options");
+ }
+ return options | newOption;
+ }
+ /**
+ * Bit set of general categories, one bit per UCharacterCategory value, used by isLNS():
+ * letters other than modifier letters, numbers, symbols, and private use.
+ */
+ private static final int LNS =
+ (1 << UCharacterCategory.UPPERCASE_LETTER) |
+ (1 << UCharacterCategory.LOWERCASE_LETTER) |
+ (1 << UCharacterCategory.TITLECASE_LETTER) |
+ // Not MODIFIER_LETTER: We count only cased modifier letters.
+ (1 << UCharacterCategory.OTHER_LETTER) |
+
+ (1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER) |
+ (1 << UCharacterCategory.LETTER_NUMBER) |
+ (1 << UCharacterCategory.OTHER_NUMBER) |
+
+ (1 << UCharacterCategory.MATH_SYMBOL) |
+ (1 << UCharacterCategory.CURRENCY_SYMBOL) |
+ (1 << UCharacterCategory.MODIFIER_SYMBOL) |
+ (1 << UCharacterCategory.OTHER_SYMBOL) |
+
+ (1 << UCharacterCategory.PRIVATE_USE);
+ /**
+ * Tests whether a code point is a letter, number, symbol, or private use code point.
+ *
+ * @param c the code point to test
+ * @return true if the general category of c is in LNS,
+ *         or if c is a modifier letter whose case type is not UCaseProps.NONE
+ */
+ private static boolean isLNS(int c) {
+ // Letter, number, symbol,
+ // or a private use code point because those are typically used as letters or numbers.
+ // Consider modifier letters only if they are cased.
+ int gc = UCharacterProperty.INSTANCE.getType(c);
+ return ((1 << gc) & LNS) != 0 ||
+ (gc == UCharacterCategory.MODIFIER_LETTER &&
+ UCaseProps.INSTANCE.getType(c) != UCaseProps.NONE);
+ }
+ /**
+ * Adds a titlecasing iterator option to an options bit set.
+ *
+ * @param options the current options bit set
+ * @param newOption the iterator option to add
+ * @return options with newOption set
+ * @throws IllegalArgumentException if options already contains
+ *         iterator bits that differ from newOption
+ */
+ public static int addTitleIteratorOption(int options, int newOption) {
+ int iterOptions = options & TITLECASE_ITERATOR_MASK;
+ if (iterOptions !=0 && iterOptions != newOption) {
+ throw new IllegalArgumentException("multiple titlecasing iterator options");
+ }
+ return options | newOption;
+ }
+ /**
+ * Returns the BreakIterator to use for titlecasing.
+ *
+ * @param locale the locale for a newly created word or sentence iterator
+ * @param options options bit set; only the TITLECASE_ITERATOR_MASK bits are examined
+ * @param iter an explicit iterator, or null to have one selected by the options
+ * @return iter if it is not null; otherwise a word iterator (option value 0),
+ *         a WholeStringBreakIterator, or a sentence iterator
+ * @throws IllegalArgumentException if an iterator option is combined with an explicit iterator,
+ *         or if the iterator option value is not one of the known values
+ */
+ public static BreakIterator getTitleBreakIterator(
+ Locale locale, int options, BreakIterator iter) {
+ options &= TITLECASE_ITERATOR_MASK;
+ if (options != 0 && iter != null) {
+ throw new IllegalArgumentException(
+ "titlecasing iterator option together with an explicit iterator");
+ }
+ if (iter == null) {
+ switch (options) {
+ case 0:
+ iter = BreakIterator.getWordInstance(locale);
+ break;
+ case TITLECASE_WHOLE_STRING:
+ iter = new WholeStringBreakIterator();
+ break;
+ case TITLECASE_SENTENCES:
+ iter = BreakIterator.getSentenceInstance(locale);
+ break;
+ default:
+ throw new IllegalArgumentException("unknown titlecasing iterator option");
+ }
+ }
+ return iter;
+ }
+ /**
+ * Returns the BreakIterator to use for titlecasing.
+ * Same logic as the overload that takes a java.util.Locale, but for a ULocale.
+ *
+ * @param locale the locale for a newly created word or sentence iterator
+ * @param options options bit set; only the TITLECASE_ITERATOR_MASK bits are examined
+ * @param iter an explicit iterator, or null to have one selected by the options
+ * @return iter if it is not null; otherwise a word iterator (option value 0),
+ *         a WholeStringBreakIterator, or a sentence iterator
+ * @throws IllegalArgumentException if an iterator option is combined with an explicit iterator,
+ *         or if the iterator option value is not one of the known values
+ */
+ public static BreakIterator getTitleBreakIterator(
+ ULocale locale, int options, BreakIterator iter) {
+ options &= TITLECASE_ITERATOR_MASK;
+ if (options != 0 && iter != null) {
+ throw new IllegalArgumentException(
+ "titlecasing iterator option together with an explicit iterator");
+ }
+ if (iter == null) {
+ switch (options) {
+ case 0:
+ iter = BreakIterator.getWordInstance(locale);
+ break;
+ case TITLECASE_WHOLE_STRING:
+ iter = new WholeStringBreakIterator();
+ break;
+ case TITLECASE_SENTENCES:
+ iter = BreakIterator.getSentenceInstance(locale);
+ break;
+ default:
+ throw new IllegalArgumentException("unknown titlecasing iterator option");
+ }
+ }
+ return iter;
+ }
+
/**
* Omit unchanged text when case-mapping with Edits.
*/
public static final int OMIT_UNCHANGED_TEXT = 0x4000;
+ /**
+  * Minimal BreakIterator that treats the entire text as one segment:
+  * first() returns 0 and next() returns the length recorded by setText().
+  * Only first(), next() and the two setText() overloads are functional;
+  * every other method throws UnsupportedOperationException.
+  */
+ private static final class WholeStringBreakIterator extends BreakIterator {
+     /** Boundary reported by next(); set from the text passed to setText(). */
+     private int length;
+
+     /** Creates the exception thrown by all unsupported operations. */
+     private static UnsupportedOperationException notImplemented() {
+         return new UnsupportedOperationException("should not occur");
+     }
+
+     @Override
+     public int first() {
+         return 0;
+     }
+
+     @Override
+     public int next() {
+         return length;
+     }
+
+     @Override
+     public void setText(String newText) {
+         length = newText.length();
+     }
+
+     @Override
+     public void setText(CharacterIterator newText) {
+         length = newText.getEndIndex();
+     }
+
+     // Everything below is unsupported.
+
+     @Override
+     public int last() {
+         throw notImplemented();
+     }
+
+     @Override
+     public int next(int n) {
+         throw notImplemented();
+     }
+
+     @Override
+     public int previous() {
+         throw notImplemented();
+     }
+
+     @Override
+     public int following(int offset) {
+         throw notImplemented();
+     }
+
+     @Override
+     public int current() {
+         throw notImplemented();
+     }
+
+     @Override
+     public CharacterIterator getText() {
+         throw notImplemented();
+     }
+ }
+
+
private static int appendCodePoint(Appendable a, int c) throws IOException {
if (c <= Character.MAX_VALUE) {
a.append((char)c);
}
/*
- * Unicode 4 & 5 section 3.13 Default Case Operations:
- *
- * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
- * #29, "Text Boundaries." Between each pair of word boundaries, find the first
- * cased character F. If F exists, map F to default_title(F); then map each
- * subsequent character C to default_lower(C).
- *
- * In this implementation, segment [prev..index[ into 3 parts:
- * a) uncased characters (copy as-is) [prev..titleStart[
- * b) first case letter (titlecase) [titleStart..titleLimit[
+ * Segment [prev..index[ into 3 parts:
+ * a) skipped characters (copy as-is) [prev..titleStart[
+ * b) first letter (titlecase) [titleStart..titleLimit[
* c) subsequent characters (lowercase) [titleLimit..index[
*/
if(prev<index) {
- // find and copy uncased characters [prev..titleStart[
+ // Find and copy skipped characters [prev..titleStart[
int titleStart=prev;
iter.setLimit(index);
int c=iter.nextCaseMapCP();
- if((options&UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT)==0
- && UCaseProps.NONE==UCaseProps.INSTANCE.getType(c)) {
- // Adjust the titlecasing index (titleStart) to the next cased character.
- while((c=iter.nextCaseMapCP())>=0
- && UCaseProps.NONE==UCaseProps.INSTANCE.getType(c)) {}
+ if ((options&UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
+ // Adjust the titlecasing index to the next cased character,
+ // or to the next letter/number/symbol/private use.
+ // Stop with titleStart<titleLimit<=index
+ // if there is a character to be titlecased,
+ // or else stop with titleStart==titleLimit==index.
+ boolean toCased = (options&CaseMapImpl.TITLECASE_ADJUST_TO_CASED) != 0;
+ while ((toCased ?
+ UCaseProps.NONE==UCaseProps.INSTANCE.getType(c) :
+ !CaseMapImpl.isLNS(c)) &&
+ (c=iter.nextCaseMapCP())>=0) {}
// If c<0 then we have only uncased characters in [prev..index[
// and stopped with titleStart==titleLimit==index.
titleStart=iter.getCPStart();
- appendUnchanged(src, prev, titleStart-prev, dest, options, edits);
+ if (prev < titleStart) {
+ appendUnchanged(src, prev, titleStart-prev, dest, options, edits);
+ }
}
if(titleStart<index) {
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.text.CaseMap;
import com.ibm.icu.text.DisplayContext;
import com.ibm.icu.text.DisplayContext.Type;
import com.ibm.icu.text.LocaleDisplayNames;
*/
private transient BreakIterator capitalizationBrkIter = null;
+ /** Titlecasing options: the whole string as one segment, without lowercasing the rest. */
+ private static final CaseMap.Title TO_TITLE_WHOLE_STRING_NO_LOWERCASE =
+         CaseMap.toTitle().wholeString().noLowercase();
+
+ /**
+  * Titlecases s as a whole string, using the options in
+  * TO_TITLE_WHOLE_STRING_NO_LOWERCASE.
+  *
+  * @param locale the locale; converted with toLocale() for the case mapping
+  * @param s the text to titlecase
+  * @return the titlecased text
+  */
+ private static String toTitleWholeStringNoLowercase(ULocale locale, String s) {
+     final StringBuilder titlecased = TO_TITLE_WHOLE_STRING_NO_LOWERCASE.apply(
+             locale.toLocale(), null, s, new StringBuilder(), null);
+     return titlecased.toString();
+ }
public static LocaleDisplayNames getInstance(ULocale locale, DialectHandling dialectHandling) {
synchronized (cache) {
ULocale minimized = ULocale.minimizeSubtags(modified, ULocale.Minimize.FAVOR_SCRIPT);
String tempName = modified.getDisplayName(locale);
boolean titlecase = capContext == DisplayContext.CAPITALIZATION_FOR_UI_LIST_OR_MENU;
- String nameInDisplayLocale = titlecase ? UCharacter.toTitleFirst(locale, tempName) : tempName;
+ String nameInDisplayLocale =
+ titlecase ? toTitleWholeStringNoLowercase(locale, tempName) : tempName;
tempName = modified.getDisplayName(modified);
- String nameInSelf = capContext == DisplayContext.CAPITALIZATION_FOR_UI_LIST_OR_MENU ? UCharacter.toTitleFirst(modified, tempName) : tempName;
+ String nameInSelf = capContext ==
+ DisplayContext.CAPITALIZATION_FOR_UI_LIST_OR_MENU ?
+ toTitleWholeStringNoLowercase(modified, tempName) : tempName;
return new UiListItem(minimized, modified, nameInDisplayLocale, nameInSelf);
}
/**
* Bit mask for getting just the options from a string compare options word
* that are relevant for case folding (of a single string or code point).
+ *
+ * Currently only bit 0 for FOLD_CASE_EXCLUDE_SPECIAL_I.
+ * It is conceivable that at some point we might use one more bit for using uppercase sharp s.
+ * It is conceivable that at some point we might want the option to use only simple case foldings
+ * when operating on strings.
+ *
* @internal
*/
- private static final int FOLD_CASE_OPTIONS_MASK = 0xff;
+ private static final int FOLD_CASE_OPTIONS_MASK = 7;
/* return the simple case folding mapping for c */
public final int fold(int c, int options) {
*/
public static String toTitleCase(ULocale locale, String str,
BreakIterator titleIter, int options) {
- if(titleIter == null) {
- if (locale == null) {
- locale = ULocale.getDefault();
- }
- titleIter = BreakIterator.getWordInstance(locale);
+ if (titleIter == null && locale == null) { // an iterator will be created below, so fall back to the default locale
+ locale = ULocale.getDefault();
}
+ titleIter = CaseMapImpl.getTitleBreakIterator(locale, options, titleIter); // keeps an explicit iterator, else creates one from the iterator option bits
titleIter.setText(str);
return toTitleCase(getCaseLocale(locale), options, titleIter, str);
}
-
- private static final int BREAK_MASK =
- (1<<UCharacterCategory.DECIMAL_DIGIT_NUMBER)
- | (1<<UCharacterCategory.OTHER_LETTER)
- | (1<<UCharacterCategory.MODIFIER_LETTER);
-
/**
* Return a string with just the first word titlecased, for menus and UI, etc. This does not affect most of the string,
* and sometimes has no effect at all; the original string is returned whenever casing
*/
@Deprecated
public static String toTitleFirst(ULocale locale, String str) {
- int c = 0;
- for (int i = 0; i < str.length(); i += UCharacter.charCount(c)) {
- c = UCharacter.codePointAt(str, i);
- int propertyMask = UCharacter.getIntPropertyValue(c, UProperty.GENERAL_CATEGORY_MASK);
- if ((propertyMask & BREAK_MASK) != 0) { // handle "49ers", initial CJK
- break;
- }
- if (UCaseProps.INSTANCE.getType(c) == UCaseProps.NONE) {
- continue;
- }
-
- // we now have the first cased character
- // What we really want is something like:
- // String titled = UCharacter.toTitleCase(locale, str, i, outputCharsTaken);
- // That is, just give us the titlecased string, for the locale, at i and following,
- // and tell us how many characters are replaced.
- // The following won't work completely: it needs some more substantial changes to UCaseProps
-
- String substring = str.substring(i, i+UCharacter.charCount(c));
- String titled = UCharacter.toTitleCase(locale, substring, BreakIterator.getSentenceInstance(locale), 0);
-
- // skip if no change
- if (titled.codePointAt(0) == c) {
- // Using 0 is safe, since any change in titling will not have first initial character
- break;
- }
- StringBuilder result = new StringBuilder(str.length()).append(str, 0, i);
- int startOfSuffix;
-
- // handle dutch, but check first for 'i', since that's faster. Should be built into UCaseProps.
-
- if (c == 'i' && locale.getLanguage().equals("nl") && i < str.length() && str.charAt(i+1) == 'j') {
- result.append("IJ");
- startOfSuffix = 2;
- } else {
- result.append(titled);
- startOfSuffix = i + UCharacter.charCount(c);
- }
-
- // add the remainder, and return
- return result.append(str, startOfSuffix, str.length()).toString();
- }
- return str; // no change
+ return toTitleCase(locale, str, null,
+ CaseMapImpl.TITLECASE_WHOLE_STRING|TITLECASE_NO_LOWERCASE); // null iterator + whole-string option: one titlecasing pass over the entire string, other characters not lowercased
+ // TODO: Remove this function.
+ // Move something like the following helper function into CLDR.
+ // private static final CaseMap.Title TO_TITLE_WHOLE_STRING_NO_LOWERCASE =
+ // CaseMap.toTitle().wholeString().noLowercase();
+ // return TO_TITLE_WHOLE_STRING_NO_LOWERCASE.apply(
+ // locale.toLocale(), null, str, new StringBuilder(), null).toString();
}
/**
public static String toTitleCase(Locale locale, String str,
BreakIterator titleIter,
int options) {
- if(titleIter == null) {
- titleIter = BreakIterator.getWordInstance(locale);
+ if (titleIter == null && locale == null) { // an iterator will be created below, so fall back to the default locale
+ locale = Locale.getDefault();
}
+ titleIter = CaseMapImpl.getTitleBreakIterator(locale, options, titleIter); // keeps an explicit iterator, else creates one from the iterator option bits
titleIter.setText(str);
return toTitleCase(getCaseLocale(locale), options, titleIter, str);
}
private static final Title OMIT_UNCHANGED = new Title(CaseMapImpl.OMIT_UNCHANGED_TEXT);
private Title(int opt) { super(opt); }
+ /**
+  * Creates a variant of this object that titlecases the string as one unit
+  * instead of word by word.
+  * (Only the character at index 0, possibly adjusted, is titlecased.)
+  *
+  * <p>Titlecasing iterator options are mutually exclusive: combining several of them,
+  * or combining one of them with an explicit BreakIterator, is an error.
+  *
+  * @return an options object with this option.
+  * @see #adjustToCased()
+  * @draft ICU 60
+  * @provisional This API might change or be removed in a future release.
+  */
+ public Title wholeString() {
+     final int withWholeString = CaseMapImpl.addTitleIteratorOption(
+             internalOptions, CaseMapImpl.TITLECASE_WHOLE_STRING);
+     return new Title(withWholeString);
+ }
+
+ /**
+  * Creates a variant of this object that titlecases sentence by sentence
+  * instead of word by word.
+  * (Only the first character of each sentence, possibly adjusted, is titlecased.)
+  *
+  * <p>Titlecasing iterator options are mutually exclusive: combining several of them,
+  * or combining one of them with an explicit BreakIterator, is an error.
+  *
+  * @return an options object with this option.
+  * @see #adjustToCased()
+  * @draft ICU 60
+  * @provisional This API might change or be removed in a future release.
+  */
+ public Title sentences() {
+     final int withSentences = CaseMapImpl.addTitleIteratorOption(
+             internalOptions, CaseMapImpl.TITLECASE_SENTENCES);
+     return new Title(withSentences);
+ }
+
/**
* {@inheritDoc}
* @draft ICU 59
* Returns an instance that behaves like this one but
* does not lowercase non-initial parts of words when titlecasing.
*
- * <p>By default, titlecasing will titlecase the first cased character
- * of a word and lowercase all other characters.
+ * <p>By default, titlecasing will titlecase the character at each
+ * (possibly adjusted) BreakIterator index and
+ * lowercase all other characters up to the next iterator index.
* With this option, the other characters will not be modified.
*
* @return an options object with this option.
* @see UCharacter#TITLECASE_NO_LOWERCASE
+ * @see #adjustToCased()
* @draft ICU 59
* @provisional This API might change or be removed in a future release.
*/
return new Title(internalOptions | UCharacter.TITLECASE_NO_LOWERCASE);
}
- // TODO: update references to the Unicode Standard for recent version
/**
* Returns an instance that behaves like this one but
- * does not adjust the titlecasing indexes from BreakIterator::next() indexes;
+ * does not adjust the titlecasing BreakIterator indexes;
* titlecases exactly the characters at breaks from the iterator.
*
* <p>By default, titlecasing will take each break iterator index,
- * adjust it by looking for the next cased character, and titlecase that one.
- * Other characters are lowercased.
- *
- * <p>This follows Unicode 4 & 5 section 3.13 Default Case Operations:
+ * adjust it to the next relevant character (see {@link #adjustToCased()}),
+ * and titlecase that one.
*
- * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
- * #29, "Text Boundaries." Between each pair of word boundaries, find the first
- * cased character F. If F exists, map F to default_title(F); then map each
- * subsequent character C to default_lower(C).
+ * <p>Other characters are lowercased.
*
* @return an options object with this option.
* @see UCharacter#TITLECASE_NO_BREAK_ADJUSTMENT
* @provisional This API might change or be removed in a future release.
*/
public Title noBreakAdjustment() {
- return new Title(internalOptions | UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT);
+ return new Title(CaseMapImpl.addTitleAdjustmentOption(
+ internalOptions, UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT)); // NOTE(review): presumably rejects a different adjustment option that is already set; confirm in CaseMapImpl
+ }
+
+ /**
+  * Creates a variant of this object that moves each titlecasing BreakIterator index
+  * forward to the next cased character.
+  * (See the Unicode Standard, chapter 3, Default Case Conversion, R3 toTitlecase(X).)
+  *
+  * <p>ICU used to adjust indexes this way by default.
+  * Since ICU 60, the default is to adjust to the next character that is
+  * a letter, number, symbol, or private use code point.
+  * (Uncased modifier letters are skipped.)
+  * For word titlecasing the two behaviors rarely differ,
+  * but the new default works much better for whole-string and sentence titlecasing:
+  * It yields "49ers" and "«丰(abc)»" instead of "49Ers" and "«丰(Abc)»".
+  *
+  * <p>It is an error to specify multiple titlecasing adjustment options together.
+  *
+  * @return an options object with this option.
+  * @see #noBreakAdjustment()
+  * @draft ICU 60
+  * @provisional This API might change or be removed in a future release.
+  */
+ public Title adjustToCased() {
+     final int withAdjustToCased = CaseMapImpl.addTitleAdjustmentOption(
+             internalOptions, CaseMapImpl.TITLECASE_ADJUST_TO_CASED);
+     return new Title(withAdjustToCased);
}
/**
*/
public <A extends Appendable> A apply(
Locale locale, BreakIterator iter, CharSequence src, A dest, Edits edits) {
- if (iter == null) {
- iter = BreakIterator.getWordInstance(locale);
+ if (iter == null && locale == null) { // an iterator will be created below, so fall back to the default locale
+ locale = Locale.getDefault();
}
+ iter = CaseMapImpl.getTitleBreakIterator(locale, internalOptions, iter); // keeps an explicit iterator, else creates one from the iterator option bits
iter.setText(src.toString());
return CaseMapImpl.toTitle(
getCaseLocale(locale), internalOptions, iter, src, dest, edits);
}
}
+ // Helper, not a @Test: titlecases input with the given options and locale
+ // and compares the result with the expected output.
+ // See ICU4C intltest strcase.cpp TestCasingImpl().
+ void TestCasingImpl(String input, String output, CaseMap.Title toTitle, Locale locale) {
+     StringBuilder titlecased = toTitle.apply(locale, null, input, new StringBuilder(), null);
+     String message = "toTitle(" + input + ')';
+     assertEquals(message, output, titlecased.toString());
+ }
+
+ @Test
+ public void TestTitleOptions() {
+ Locale root = Locale.ROOT;
+ // New options in ICU 60.
+ TestCasingImpl("ʻcAt! ʻeTc.", "ʻCat! ʻetc.",
+ CaseMap.toTitle().wholeString(), root); // whole string: titlecase once, lowercase everything after it
+ TestCasingImpl("a ʻCaT. A ʻdOg! ʻeTc.", "A ʻCaT. A ʻdOg! ʻETc.",
+ CaseMap.toTitle().sentences().noLowercase(), root); // once per sentence; noLowercase() leaves the other characters unmodified
+ TestCasingImpl("49eRs", "49ers",
+ CaseMap.toTitle().wholeString(), root); // default adjustment stops at the digit, so no letter is titlecased
+ TestCasingImpl("«丰(aBc)»", "«丰(abc)»",
+ CaseMap.toTitle().wholeString(), root); // default adjustment: "abc" is only lowercased
+ TestCasingImpl("49eRs", "49Ers",
+ CaseMap.toTitle().wholeString().adjustToCased(), root); // adjustToCased(): skips ahead to the first cased character
+ TestCasingImpl("«丰(aBc)»", "«丰(Abc)»",
+ CaseMap.toTitle().wholeString().adjustToCased(), root); // adjustToCased(): skips ahead to the first cased character
+ TestCasingImpl(" john. Smith", " John. Smith",
+ CaseMap.toTitle().wholeString().noLowercase(), root); // leading space is skipped; text after the titlecased letter is kept as is
+ TestCasingImpl(" john. Smith", " john. smith",
+ CaseMap.toTitle().wholeString().noBreakAdjustment(), root); // no adjustment: index 0 is the space, so every letter is lowercased
+ TestCasingImpl("«ijs»", "«IJs»",
+ CaseMap.toTitle().wholeString(), new Locale("nl", "BE")); // Dutch: initial "ij" becomes "IJ"
+ TestCasingImpl("«ijs»", "«İjs»",
+ CaseMap.toTitle().wholeString(), new Locale("tr", "DE")); // Turkish: "i" titlecases to dotted capital I
+
+ // Test conflicting settings.
+ // If & when we add more options, then the ORed combinations may become
+ // indistinguishable from valid values.
+ try {
+ CaseMap.toTitle().noBreakAdjustment().adjustToCased().
+ apply(root, null, "", new StringBuilder(), null);
+ fail("CaseMap.toTitle(multiple adjustment options) " +
+ "did not throw an IllegalArgumentException");
+ } catch(IllegalArgumentException expected) { // two adjustment options must be rejected
+ }
+ try {
+ CaseMap.toTitle().wholeString().sentences().
+ apply(root, null, "", new StringBuilder(), null);
+ fail("CaseMap.toTitle(multiple iterator options) " +
+ "did not throw an IllegalArgumentException");
+ } catch(IllegalArgumentException expected) { // two iterator options must be rejected
+ }
+ BreakIterator iter = BreakIterator.getCharacterInstance(root);
+ try {
+ CaseMap.toTitle().wholeString().apply(root, iter, "", new StringBuilder(), null);
+ fail("CaseMap.toTitle(iterator option + iterator) " +
+ "did not throw an IllegalArgumentException");
+ } catch(IllegalArgumentException expected) { // iterator option plus explicit iterator must be rejected
+ }
+ }
+
@Test
public void TestDutchTitle() {
ULocale LOC_DUTCH = new ULocale("nl");
Transliterator hanLatin = Transliterator.getInstance("Han-Latin");
assertTransform("Transform", "z\u00E0o Unicode", hanLatin, "\u9020Unicode");
assertTransform("Transform", "z\u00E0i chu\u00E0ng z\u00E0o Unicode zh\u012B qi\u00E1n", hanLatin, "\u5728\u5275\u9020Unicode\u4E4B\u524D");
- }
+ }
@Test
public void TestRegistry() {
Transliterator hex = Transliterator.getInstance("Any-Hex");
hex.setFilter(new UnicodeFilter() {
+ @Override
public boolean contains(int c) {
return c != 'c';
}
+ @Override
public String toPattern(boolean escapeUnprintable) {
return "";
}
+ @Override
public boolean matchesIndexValue(int v) {
return false;
}
+ @Override
public void addMatchSetTo(UnicodeSet toUnionTo) {}
});
String s = "abcde";
public NameableNullTrans(String id) {
super(id, null);
}
+ @Override
protected void handleTransliterate(Replaceable text,
Position offsets, boolean incremental) {
offsets.start = offsets.limit;
public TestFact(String theID) {
id = theID;
}
+ @Override
public Transliterator getInstance(String ignoredID) {
return new NameableNullTrans(id);
}
t.setFilter(new UnicodeSet("[:Ll:]"));
expect(t, "aAaA", "bAbA");
} finally {
- Transliterator.unregister("a_to_A");
- Transliterator.unregister("A_to_b");
+ Transliterator.unregister("a_to_A");
+ Transliterator.unregister("A_to_b");
}
}
//System.out.println("Registering: " + ID + ", " + t.toRules(true));
Transliterator.registerFactory(ID, singleton);
}
+ @Override
public Transliterator getInstance(String ID) {
return (Transliterator) m.get(ID);
}
String casefold = UCharacter.foldCase(s, true);
assertEquals("Casefold", casefold, toCasefold.transform(s));
- String title = UCharacter.toTitleCase(ULocale.ROOT, s, null);
- assertEquals("Title", title, toTitle.transform(s));
+ if (i != 0x0345) {
+ // ICU 60 changes the default titlecasing index adjustment.
+ // For word breaks it is mostly the same as before,
+ // but it is different for the iota subscript (the only cased combining mark).
+ // This should be ok because the iota subscript is not supposed to appear
+ // at the start of a word.
+ // The title Transliterator is far below feature parity with the
+ // UCharacter and CaseMap titlecasing functions.
+ String title = UCharacter.toTitleCase(ULocale.ROOT, s, null);
+ assertEquals("Title", title, toTitle.transform(s));
+ }
String upper = UCharacter.toUpperCase(ULocale.ROOT, s);
assertEquals("Upper", upper, toUpper.transform(s));
Transliterator.registerFactory(ID, singleton);
}
+ @Override
public Transliterator getInstance(String ID) {
return (Transliterator) m.get(new CaseInsensitiveString(ID));
}
*/
@Test
public void TestAny() {
- UnicodeSet alphabetic = (UnicodeSet) new UnicodeSet("[:alphabetic:]").freeze();
+ UnicodeSet alphabetic = new UnicodeSet("[:alphabetic:]").freeze();
StringBuffer testString = new StringBuffer();
for (int i = 0; i < UScript.CODE_LIMIT; ++i) {
UnicodeSet sample = new UnicodeSet().applyPropertyAlias("script", UScript.getShortName(i)).retainAll(alphabetic);
// add all the trail characters
if (!nonStarters.containsSome(trailString)) {
- continue;
+ continue;
}
UnicodeSet trailSet = leadToTrail.get(first);
if (trailSet == null) {
// disorderedMarks.add(s);
// disorderedMarks.add(nfc.normalize(s));
// addDerivedStrings(nfc, disorderedMarks, s);
- // }
+ // }
// s = nfd.getDecomposition(i);
// if (s != null) {
// disorderedMarks.add(s);
addSourceTarget(s, empiricalSource, t, empiricalTarget);
}
}
+ if (rule.contains("title")) {
+ // See the comment in TestCasing() about the iota subscript.
+ empiricalSource.remove(0x345);
+ }
assertEquals("getSource(" + ruleDisplay + ")", empiricalSource, actualSource, SetAssert.MISSING_OK);
assertEquals("getTarget(" + ruleDisplay + ")", empiricalTarget, actualTarget, SetAssert.MISSING_OK);
}
String direction = t == t0 ? "FORWARD\t" : "REVERSE\t";
targetIndex++;
UnicodeSet expectedTarget = testPair.length <= targetIndex ? expectedSource
- : testPair[targetIndex] == null ? expectedSource
- : testPair[targetIndex].length() == 0 ? expectedSource
+ : testPair[targetIndex] == null ? expectedSource
+ : testPair[targetIndex].length() == 0 ? expectedSource
: new UnicodeSet(testPair[targetIndex]);
ok = assertEquals(direction + "getSource\t\"" + test + '"', expectedSource, source);
if (!ok) { // for debugging
};
for (String[] row : startTests) {
int actual = findSharedStartLength(row[1], row[2]);
- assertEquals("findSharedStartLength(" + row[1] + "," + row[2] + ")",
+ assertEquals("findSharedStartLength(" + row[1] + "," + row[2] + ")",
Integer.parseInt(row[0]),
actual);
}
};
for (String[] row : endTests) {
int actual = findSharedEndLength(row[1], row[2]);
- assertEquals("findSharedEndLength(" + row[1] + "," + row[2] + ")",
- Integer.parseInt(row[0]),
+ assertEquals("findSharedEndLength(" + row[1] + "," + row[2] + ")",
+ Integer.parseInt(row[0]),
actual);
}
}
@Test
public void TestThai() {
Transliterator tr = Transliterator.getInstance("Any-Latin", Transliterator.FORWARD);
- String thaiText =
+ String thaiText =
"\u0e42\u0e14\u0e22\u0e1e\u0e37\u0e49\u0e19\u0e10\u0e32\u0e19\u0e41\u0e25\u0e49\u0e27, \u0e04\u0e2d" +
"\u0e21\u0e1e\u0e34\u0e27\u0e40\u0e15\u0e2d\u0e23\u0e4c\u0e08\u0e30\u0e40\u0e01\u0e35\u0e48\u0e22" +
"\u0e27\u0e02\u0e49\u0e2d\u0e07\u0e01\u0e31\u0e1a\u0e40\u0e23\u0e37\u0e48\u0e2d\u0e07\u0e02\u0e2d" +
"\u0e17\u0e04\u0e19\u0e34\u0e04\u0e17\u0e35\u0e48\u0e43\u0e0a\u0e49\u0e01\u0e31\u0e19\u0e2d\u0e22" +
"\u0e39\u0e48\u0e17\u0e31\u0e48\u0e27\u0e44\u0e1b.";
- String latinText =
+ String latinText =
"doy ph\u1ee5\u0304\u0302n \u1e6d\u0304h\u0101n l\u00e6\u0302w, khxmphiwtexr\u0312 ca ke\u012b\u0300" +
"ywk\u0304\u0125xng k\u1ea1b re\u1ee5\u0304\u0300xng k\u0304hxng t\u1ea1wlek\u0304h. khxmphiwtexr" +
"\u0312 c\u1ea1d k\u0115b t\u1ea1w x\u1ea1ks\u0304\u02b9r l\u00e6a x\u1ea1kk\u0304h ra x\u1ee5\u0304" +
this.expectedData = expectedData;
}
+ @Override
public void run() {
errorMsg = null;
StringBuffer inBuf = new StringBuffer(testData);