From: Markus Scherer Date: Thu, 15 Feb 2018 06:43:56 +0000 (+0000) Subject: ICU-12647 make string case mapping functions faster X-Git-Tag: release-61-rc~97 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=e8bb1bb9c2b5ae20ab0a4dd8b2a4454d2f093040;p=icu ICU-12647 make string case mapping functions faster X-SVN-Rev: 40921 --- diff --git a/icu4c/source/common/bytesinkutil.cpp b/icu4c/source/common/bytesinkutil.cpp index bf1a2d45f8a..6af7ddfd597 100644 --- a/icu4c/source/common/bytesinkutil.cpp +++ b/icu4c/source/common/bytesinkutil.cpp @@ -92,20 +92,16 @@ ByteSinkUtil::appendTwoBytes(UChar32 c, ByteSink &sink) { sink.Append(s8, 2); } -UBool -ByteSinkUtil::appendUnchanged(const uint8_t *s, int32_t length, - ByteSink &sink, uint32_t options, Edits *edits, - UErrorCode &errorCode) { - if (U_FAILURE(errorCode)) { return FALSE; } - if (length > 0) { - if (edits != nullptr) { - edits->addUnchanged(length); - } - if ((options & U_OMIT_UNCHANGED_TEXT) == 0) { - sink.Append(reinterpret_cast(s), length); - } +void +ByteSinkUtil::appendNonEmptyUnchanged(const uint8_t *s, int32_t length, + ByteSink &sink, uint32_t options, Edits *edits) { + U_ASSERT(length > 0); + if (edits != nullptr) { + edits->addUnchanged(length); + } + if ((options & U_OMIT_UNCHANGED_TEXT) == 0) { + sink.Append(reinterpret_cast(s), length); } - return TRUE; } UBool @@ -117,7 +113,11 @@ ByteSinkUtil::appendUnchanged(const uint8_t *s, const uint8_t *limit, errorCode = U_INDEX_OUTOFBOUNDS_ERROR; return FALSE; } - return appendUnchanged(s, (int32_t)(limit - s), sink, options, edits, errorCode); + int32_t length = (int32_t)(limit - s); + if (length > 0) { + appendNonEmptyUnchanged(s, length, sink, options, edits); + } + return TRUE; } U_NAMESPACE_END diff --git a/icu4c/source/common/bytesinkutil.h b/icu4c/source/common/bytesinkutil.h index 004b49c4ce6..8287ffea4ca 100644 --- a/icu4c/source/common/bytesinkutil.h +++ b/icu4c/source/common/bytesinkutil.h @@ -43,11 +43,19 @@ public: static UBool appendUnchanged(const uint8_t *s, int32_t length, ByteSink &sink, uint32_t options, Edits *edits, - UErrorCode &errorCode); + UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { return FALSE; } + if (length > 0) { appendNonEmptyUnchanged(s, length, sink, options, edits); } + return TRUE; + } static UBool appendUnchanged(const uint8_t *s, const uint8_t *limit, ByteSink &sink, uint32_t options, Edits *edits, UErrorCode &errorCode); + +private: + static void appendNonEmptyUnchanged(const uint8_t *s, int32_t length, + ByteSink &sink, uint32_t options, Edits *edits); }; U_NAMESPACE_END diff --git a/icu4c/source/common/ucase.cpp b/icu4c/source/common/ucase.cpp index 6b22f9e37bf..95b27acb754 100644 --- a/icu4c/source/common/ucase.cpp +++ b/icu4c/source/common/ucase.cpp @@ -77,9 +77,12 @@ ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) { /* data access primitives --------------------------------------------------- */ -#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT)) +U_CFUNC const UTrie2 * U_EXPORT2 +ucase_getTrie() { + return &ucase_props_singleton.trie; +} -#define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION) +#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT)) /* number of bits in an 8-bit integer value */ static const uint8_t flagsOffset[256]={ @@ -128,8 +131,8 @@ static const uint8_t flagsOffset[256]={ U_CAPI UChar32 U_EXPORT2 ucase_tolower(UChar32 c) { uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); - if(!PROPS_HAS_EXCEPTION(props)) { - if(UCASE_GET_TYPE(props)>=UCASE_UPPER) { + if(!UCASE_HAS_EXCEPTION(props)) { + if(UCASE_IS_UPPER_OR_TITLE(props)) { c+=UCASE_GET_DELTA(props); } } else { @@ -145,7 +148,7 @@ ucase_tolower(UChar32 c) { U_CAPI UChar32 U_EXPORT2 ucase_toupper(UChar32 c) { uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); - if(!PROPS_HAS_EXCEPTION(props)) { + if(!UCASE_HAS_EXCEPTION(props)) { if(UCASE_GET_TYPE(props)==UCASE_LOWER) { c+=UCASE_GET_DELTA(props); } @@ -162,7 +165,7 @@ ucase_toupper(UChar32 c) { U_CAPI UChar32 U_EXPORT2 ucase_totitle(UChar32 c) { uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); - if(!PROPS_HAS_EXCEPTION(props)) { + if(!UCASE_HAS_EXCEPTION(props)) { if(UCASE_GET_TYPE(props)==UCASE_LOWER) { c+=UCASE_GET_DELTA(props); } @@ -223,7 +226,7 @@ ucase_addCaseClosure(UChar32 c, const USetAdder *sa) { } props=UTRIE2_GET16(&ucase_props_singleton.trie, c); - if(!PROPS_HAS_EXCEPTION(props)) { + if(!UCASE_HAS_EXCEPTION(props)) { if(UCASE_GET_TYPE(props)!=UCASE_NONE) { /* add the one simple case mapping, no matter what type it is */ int32_t delta=UCASE_GET_DELTA(props); @@ -419,6 +422,138 @@ FullCaseFoldingIterator::next(UnicodeString &full) { return c; } +namespace LatinCase { + +const int8_t TO_LOWER_NORMAL[LIMIT] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, + + 0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC +}; + +const int8_t TO_LOWER_TR_LT[LIMIT] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, EXC, 0, 1, 0, 1, 0, EXC, 0, + EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, + + 0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC +}; + +const int8_t TO_UPPER_NORMAL[LIMIT] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, + -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC, + -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, + -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121, + + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, + 0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, + + -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1, + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, + 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC +}; + +const int8_t TO_UPPER_TR[LIMIT] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, -32, -32, -32, -32, -32, -32, -32, -32, EXC, -32, -32, -32, -32, -32, -32, + -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC, + -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, + -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121, + + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, + 0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, + + -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1, + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, + 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC +}; + +} // namespace LatinCase + U_NAMESPACE_END /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */ @@ -439,7 +574,7 @@ ucase_getTypeOrIgnorable(UChar32 c) { static inline int32_t getDotType(UChar32 c) { uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); - if(!PROPS_HAS_EXCEPTION(props)) { + if(!UCASE_HAS_EXCEPTION(props)) { return props&UCASE_DOT_MASK; } else { const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props); @@ -878,8 +1013,8 @@ ucase_toFullLower(UChar32 c, U_ASSERT(c >= 0); UChar32 result=c; uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); - if(!PROPS_HAS_EXCEPTION(props)) { - if(UCASE_GET_TYPE(props)>=UCASE_UPPER) { + if(!UCASE_HAS_EXCEPTION(props)) { + if(UCASE_IS_UPPER_OR_TITLE(props)) { result=c+UCASE_GET_DELTA(props); } } else { @@ -1024,7 +1159,7 @@ toUpperOrTitle(UChar32 c, U_ASSERT(c >= 0); UChar32 result=c; uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); - if(!PROPS_HAS_EXCEPTION(props)) { + if(!UCASE_HAS_EXCEPTION(props)) { if(UCASE_GET_TYPE(props)==UCASE_LOWER) { result=c+UCASE_GET_DELTA(props); } @@ -1169,8 +1304,8 @@ ucase_toFullTitle(UChar32 c, U_CAPI UChar32 U_EXPORT2 ucase_fold(UChar32 c, uint32_t options) { uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); - if(!PROPS_HAS_EXCEPTION(props)) { - if(UCASE_GET_TYPE(props)>=UCASE_UPPER) { + if(!UCASE_HAS_EXCEPTION(props)) { + if(UCASE_IS_UPPER_OR_TITLE(props)) { c+=UCASE_GET_DELTA(props); } } else { @@ -1234,8 +1369,8 @@ ucase_toFullFolding(UChar32 c, U_ASSERT(c >= 0); UChar32 result=c; uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); - if(!PROPS_HAS_EXCEPTION(props)) { - if(UCASE_GET_TYPE(props)>=UCASE_UPPER) { + if(!UCASE_HAS_EXCEPTION(props)) { + if(UCASE_IS_UPPER_OR_TITLE(props)) { result=c+UCASE_GET_DELTA(props); } } else { diff --git a/icu4c/source/common/ucase.h b/icu4c/source/common/ucase.h index 9d6365eadfc..a7a8c9f00d1 100644 --- a/icu4c/source/common/ucase.h +++ b/icu4c/source/common/ucase.h @@ -26,6 +26,7 @@ #include "putilimp.h" #include "uset_imp.h" #include "udataswp.h" +#include "utrie2.h" #ifdef __cplusplus U_NAMESPACE_BEGIN @@ -148,6 +149,33 @@ private: int32_t rowCpIndex; }; +/** + * Fast case mapping data for ASCII/Latin. + * Linear arrays of delta bytes: 0=no mapping; EXC=exception. + * Deltas must not cross the ASCII boundary, or else they cannot be easily used + * in simple UTF-8 code. + */ +namespace LatinCase { + +/** Case mapping/folding data for code points up to U+017F. */ +constexpr UChar LIMIT = 0x180; +/** U+017F case-folds and uppercases crossing the ASCII boundary. */ +constexpr UChar LONG_S = 0x17f; +/** Exception: Complex mapping, or too-large delta. */ +constexpr int8_t EXC = -0x80; + +/** Deltas for lowercasing for most locales, and default case folding. */ +extern const int8_t TO_LOWER_NORMAL[LIMIT]; +/** Deltas for lowercasing for tr/az/lt, and Turkic case folding. */ +extern const int8_t TO_LOWER_TR_LT[LIMIT]; + +/** Deltas for uppercasing for most locales. */ +extern const int8_t TO_UPPER_NORMAL[LIMIT]; +/** Deltas for uppercasing for tr/az. */ +extern const int8_t TO_UPPER_TR[LIMIT]; + +} // namespace LatinCase + U_NAMESPACE_END #endif @@ -308,6 +336,9 @@ enum { /* definitions for 16-bit case properties word ------------------------------ */ +U_CFUNC const UTrie2 * U_EXPORT2 +ucase_getTrie(); + /* 2-bit constants for types of cased characters */ #define UCASE_TYPE_MASK 3 enum { @@ -320,10 +351,14 @@ enum { #define UCASE_GET_TYPE(props) ((props)&UCASE_TYPE_MASK) #define UCASE_GET_TYPE_AND_IGNORABLE(props) ((props)&7) +#define UCASE_IS_UPPER_OR_TITLE(props) ((props)&2) + #define UCASE_IGNORABLE 4 #define UCASE_SENSITIVE 8 #define UCASE_EXCEPTION 0x10 +#define UCASE_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION) + #define UCASE_DOT_MASK 0x60 enum { UCASE_NO_DOT=0, /* normal characters with cc=0 */ diff --git a/icu4c/source/common/ucasemap.cpp b/icu4c/source/common/ucasemap.cpp index 8eec93c6e3e..99e30c9fc69 100644 --- a/icu4c/source/common/ucasemap.cpp +++ b/icu4c/source/common/ucasemap.cpp @@ -165,9 +165,7 @@ appendResult(int32_t cpLength, int32_t result, const UChar *s, inline uint8_t getTwoByteLead(UChar32 c) { return (uint8_t)((c >> 6) | 0xc0); } inline uint8_t getTwoByteTrail(UChar32 c) { return (uint8_t)((c & 0x3f) | 0x80); } -} // namespace - -static UChar32 U_CALLCONV +UChar32 U_CALLCONV utf8_caseContextIterator(void *context, int8_t dir) { UCaseContext *csc=(UCaseContext *)context; UChar32 c; @@ -199,36 +197,227 @@ utf8_caseContextIterator(void *context, int8_t dir) { return U_SENTINEL; } -/* - * Case-maps [srcStart..srcLimit[ but takes - * context [0..srcLength[ into account. +/** + * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account. + * caseLocale < 0: Case-folds [srcStart..srcLimit[. */ -static void -_caseMap(int32_t caseLocale, uint32_t options, UCaseMapFull *map, - const uint8_t *src, UCaseContext *csc, - int32_t srcStart, int32_t srcLimit, - icu::ByteSink &sink, icu::Edits *edits, - UErrorCode &errorCode) { - /* case mapping loop */ - int32_t srcIndex=srcStart; - while (U_SUCCESS(errorCode) && srcIndex= 0 ? + !(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) : + (options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) { + latinToLower = LatinCase::TO_LOWER_NORMAL; + } else { + latinToLower = LatinCase::TO_LOWER_TR_LT; + } + const UTrie2 *trie = ucase_getTrie(); + int32_t prev = srcStart; + int32_t srcIndex = srcStart; + for (;;) { + // fast path for simple cases int32_t cpStart; - csc->cpStart=cpStart=srcIndex; UChar32 c; - U8_NEXT(src, srcIndex, srcLimit, c); - csc->cpLimit=srcIndex; - if(c<0) { - // Malformed UTF-8. - ByteSinkUtil::appendUnchanged(src+cpStart, srcIndex-cpStart, + for (;;) { + if (U_FAILURE(errorCode) || srcIndex >= srcLimit) { + c = U_SENTINEL; + break; + } + uint8_t lead = src[srcIndex++]; + if (lead <= 0x7f) { + int8_t d = latinToLower[lead]; + if (d == LatinCase::EXC) { + cpStart = srcIndex - 1; + c = lead; + break; + } + if (d == 0) { continue; } + ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev, + sink, options, edits, errorCode); + char ascii = (char)(lead + d); + sink.Append(&ascii, 1); + if (edits != nullptr) { + edits->addReplace(1, 1); + } + prev = srcIndex; + continue; + } else if (lead < 0xe3) { + uint8_t t; + if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLimit && + (t = src[srcIndex] - 0x80) <= 0x3f) { + // U+0080..U+017F + ++srcIndex; + c = ((lead - 0xc0) << 6) | t; + int8_t d = latinToLower[c]; + if (d == LatinCase::EXC) { + cpStart = srcIndex - 2; + break; + } + if (d == 0) { continue; } + ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev, + sink, options, edits, errorCode); + ByteSinkUtil::appendTwoBytes(c + d, sink); + if (edits != nullptr) { + edits->addReplace(2, 2); + } + prev = srcIndex; + continue; + } + } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) && + (srcIndex + 2) <= srcLimit && + U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) { + // most of CJK: no case mappings + srcIndex += 2; + continue; + } + cpStart = --srcIndex; + U8_NEXT(src, srcIndex, srcLimit, c); + if (c < 0) { + // ill-formed UTF-8 + continue; + } + uint16_t props = UTRIE2_GET16(trie, c); + if (UCASE_HAS_EXCEPTION(props)) { break; } + int32_t delta; + if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) { + continue; + } + ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev, sink, options, edits, errorCode); + ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits); + prev = srcIndex; + } + if (c < 0) { + break; + } + // slow path + const UChar *s; + if (caseLocale >= 0) { + csc->cpStart = cpStart; + csc->cpLimit = srcIndex; + c = ucase_toFullLower(c, utf8_caseContextIterator, csc, &s, caseLocale); } else { - const UChar *s; - c=map(c, utf8_caseContextIterator, csc, &s, caseLocale); + c = ucase_toFullFolding(c, &s, options); + } + if (c >= 0) { + ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev, + sink, options, edits, errorCode); appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode); + prev = srcIndex; } } + ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev, + sink, options, edits, errorCode); } +void toUpper(int32_t caseLocale, uint32_t options, + const uint8_t *src, UCaseContext *csc, int32_t srcLength, + icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) { + const int8_t *latinToUpper; + if (caseLocale == UCASE_LOC_TURKISH) { + latinToUpper = LatinCase::TO_UPPER_TR; + } else { + latinToUpper = LatinCase::TO_UPPER_NORMAL; + } + const UTrie2 *trie = ucase_getTrie(); + int32_t prev = 0; + int32_t srcIndex = 0; + for (;;) { + // fast path for simple cases + int32_t cpStart; + UChar32 c; + for (;;) { + if (U_FAILURE(errorCode) || srcIndex >= srcLength) { + c = U_SENTINEL; + break; + } + uint8_t lead = src[srcIndex++]; + if (lead <= 0x7f) { + int8_t d = latinToUpper[lead]; + if (d == LatinCase::EXC) { + cpStart = srcIndex - 1; + c = lead; + break; + } + if (d == 0) { continue; } + ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev, + sink, options, edits, errorCode); + char ascii = (char)(lead + d); + sink.Append(&ascii, 1); + if (edits != nullptr) { + edits->addReplace(1, 1); + } + prev = srcIndex; + continue; + } else if (lead < 0xe3) { + uint8_t t; + if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLength && + (t = src[srcIndex] - 0x80) <= 0x3f) { + // U+0080..U+017F + ++srcIndex; + c = ((lead - 0xc0) << 6) | t; + int8_t d = latinToUpper[c]; + if (d == LatinCase::EXC) { + cpStart = srcIndex - 2; + break; + } + if (d == 0) { continue; } + ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev, + sink, options, edits, errorCode); + ByteSinkUtil::appendTwoBytes(c + d, sink); + if (edits != nullptr) { + edits->addReplace(2, 2); + } + prev = srcIndex; + continue; + } + } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) && + (srcIndex + 2) <= srcLength && + U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) { + // most of CJK: no case mappings + srcIndex += 2; + continue; + } + cpStart = --srcIndex; + U8_NEXT(src, srcIndex, srcLength, c); + if (c < 0) { + // ill-formed UTF-8 + continue; + } + uint16_t props = UTRIE2_GET16(trie, c); + if (UCASE_HAS_EXCEPTION(props)) { break; } + int32_t delta; + if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) { + continue; + } + ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev, + sink, options, edits, errorCode); + ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits); + prev = srcIndex; + } + if (c < 0) { + break; + } + // slow path + csc->cpStart = cpStart; + csc->cpLimit = srcIndex; + const UChar *s; + c = ucase_toFullUpper(c, utf8_caseContextIterator, csc, &s, caseLocale); + if (c >= 0) { + ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev, + sink, options, edits, errorCode); + appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode); + prev = srcIndex; + } + } + ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev, + sink, options, edits, errorCode); +} + +} // namespace + #if !UCONFIG_NO_BREAK_ITERATION U_CFUNC void U_CALLCONV @@ -335,10 +524,9 @@ ucasemap_internalUTF8ToTitle( if(titleLimitaddUnchanged(length); + } + if(options & U_OMIT_UNCHANGED_TEXT) { + return destIndex; + } + if(length>(INT32_MAX-destIndex)) { + return -1; // integer overflow + } + if((destIndex+length)<=destCapacity) { + u_memcpy(dest+destIndex, s, length); + } + return destIndex + length; +} + +inline int32_t appendUnchanged(UChar *dest, int32_t destIndex, int32_t destCapacity, const UChar *s, int32_t length, uint32_t options, icu::Edits *edits) { - if(length>0) { - if(edits!=NULL) { - edits->addUnchanged(length); - } - if(options & U_OMIT_UNCHANGED_TEXT) { - return destIndex; - } - if(length>(INT32_MAX-destIndex)) { - return -1; // integer overflow - } - if((destIndex+length)<=destCapacity) { - u_memcpy(dest+destIndex, s, length); - } - destIndex+=length; + if (length <= 0) { + return destIndex; } - return destIndex; + return appendNonEmptyUnchanged(dest, destIndex, destCapacity, s, length, options, edits); } -static UChar32 U_CALLCONV +UChar32 U_CALLCONV utf16_caseContextIterator(void *context, int8_t dir) { UCaseContext *csc=(UCaseContext *)context; UChar32 c; @@ -197,39 +195,205 @@ utf16_caseContextIterator(void *context, int8_t dir) { return U_SENTINEL; } -/* - * Case-maps [srcStart..srcLimit[ but takes - * context [0..srcLength[ into account. +/** + * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account. + * caseLocale < 0: Case-folds [srcStart..srcLimit[. */ -static int32_t -_caseMap(int32_t caseLocale, uint32_t options, UCaseMapFull *map, - UChar *dest, int32_t destCapacity, - const UChar *src, UCaseContext *csc, - int32_t srcStart, int32_t srcLimit, - icu::Edits *edits, - UErrorCode &errorCode) { - /* case mapping loop */ - int32_t srcIndex=srcStart; - int32_t destIndex=0; - while(srcIndexcpStart=cpStart=srcIndex; +int32_t toLower(int32_t caseLocale, uint32_t options, + UChar *dest, int32_t destCapacity, + const UChar *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit, + icu::Edits *edits, UErrorCode &errorCode) { + const int8_t *latinToLower; + if (caseLocale == UCASE_LOC_ROOT || + (caseLocale >= 0 ? + !(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) : + (options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) { + latinToLower = LatinCase::TO_LOWER_NORMAL; + } else { + latinToLower = LatinCase::TO_LOWER_TR_LT; + } + const UTrie2 *trie = ucase_getTrie(); + int32_t destIndex = 0; + int32_t prev = srcStart; + int32_t srcIndex = srcStart; + for (;;) { + // fast path for simple cases + UChar lead; + while (srcIndex < srcLimit) { + lead = src[srcIndex]; + int32_t delta; + if (lead < LatinCase::LONG_S) { + int8_t d = latinToLower[lead]; + if (d == LatinCase::EXC) { break; } + ++srcIndex; + if (d == 0) { continue; } + delta = d; + } else if (lead >= 0xd800) { + break; // surrogate or higher + } else { + uint16_t props = UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, lead); + if (UCASE_HAS_EXCEPTION(props)) { break; } + ++srcIndex; + if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) { + continue; + } + } + lead += delta; + destIndex = appendUnchanged(dest, destIndex, destCapacity, + src + prev, srcIndex - 1 - prev, options, edits); + if (destIndex >= 0) { + destIndex = appendUChar(dest, destIndex, destCapacity, lead); + if (edits != nullptr) { + edits->addReplace(1, 1); + } + } + if (destIndex < 0) { + errorCode = U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + prev = srcIndex; + } + if (srcIndex >= srcLimit) { + break; + } + // slow path + int32_t cpStart = srcIndex++; + UChar trail; UChar32 c; - U16_NEXT(src, srcIndex, srcLimit, c); - csc->cpLimit=srcIndex; + if (U16_IS_LEAD(lead) && srcIndex < srcLimit && U16_IS_TRAIL(trail = src[srcIndex])) { + c = U16_GET_SUPPLEMENTARY(lead, trail); + ++srcIndex; + } else { + c = lead; + } const UChar *s; - c=map(c, utf16_caseContextIterator, csc, &s, caseLocale); - destIndex = appendResult(dest, destIndex, destCapacity, c, s, - srcIndex - cpStart, options, edits); - if (destIndex < 0) { - errorCode = U_INDEX_OUTOFBOUNDS_ERROR; - return 0; + if (caseLocale >= 0) { + csc->cpStart = cpStart; + csc->cpLimit = srcIndex; + c = ucase_toFullLower(c, utf16_caseContextIterator, csc, &s, caseLocale); + } else { + c = ucase_toFullFolding(c, &s, options); } + if (c >= 0) { + destIndex = appendUnchanged(dest, destIndex, destCapacity, + src + prev, cpStart - prev, options, edits); + if (destIndex >= 0) { + destIndex = appendResult(dest, destIndex, destCapacity, c, s, + srcIndex - cpStart, options, edits); + } + if (destIndex < 0) { + errorCode = U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + prev = srcIndex; + } + } + destIndex = appendUnchanged(dest, destIndex, destCapacity, + src + prev, srcIndex - prev, options, edits); + if (destIndex < 0) { + errorCode = U_INDEX_OUTOFBOUNDS_ERROR; + return 0; } + return destIndex; +} +int32_t toUpper(int32_t caseLocale, uint32_t options, + UChar *dest, int32_t destCapacity, + const UChar *src, UCaseContext *csc, int32_t srcLength, + icu::Edits *edits, UErrorCode &errorCode) { + const int8_t *latinToUpper; + if (caseLocale == UCASE_LOC_TURKISH) { + latinToUpper = LatinCase::TO_UPPER_TR; + } else { + latinToUpper = LatinCase::TO_UPPER_NORMAL; + } + const UTrie2 *trie = ucase_getTrie(); + int32_t destIndex = 0; + int32_t prev = 0; + int32_t srcIndex = 0; + for (;;) { + // fast path for simple cases + UChar lead; + while (srcIndex < srcLength) { + lead = src[srcIndex]; + int32_t delta; + if (lead < LatinCase::LONG_S) { + int8_t d = latinToUpper[lead]; + if (d == LatinCase::EXC) { break; } + ++srcIndex; + if (d == 0) { continue; } + delta = d; + } else if (lead >= 0xd800) { + break; // surrogate or higher + } else { + uint16_t props = UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, lead); + if (UCASE_HAS_EXCEPTION(props)) { break; } + ++srcIndex; + if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) { + continue; + } + } + lead += delta; + destIndex = appendUnchanged(dest, destIndex, destCapacity, + src + prev, srcIndex - 1 - prev, options, edits); + if (destIndex >= 0) { + destIndex = appendUChar(dest, destIndex, destCapacity, lead); + if (edits != nullptr) { + edits->addReplace(1, 1); + } + } + if (destIndex < 0) { + errorCode = U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + prev = srcIndex; + } + if (srcIndex >= srcLength) { + break; + } + // slow path + int32_t cpStart; + csc->cpStart = cpStart = srcIndex++; + UChar trail; + UChar32 c; + if (U16_IS_LEAD(lead) && srcIndex < srcLength && U16_IS_TRAIL(trail = src[srcIndex])) { + c = U16_GET_SUPPLEMENTARY(lead, trail); + ++srcIndex; + } else { + c = lead; + } + csc->cpLimit = srcIndex; + const UChar *s; + c = ucase_toFullUpper(c, utf16_caseContextIterator, csc, &s, caseLocale); + if (c >= 0) { + destIndex = appendUnchanged(dest, destIndex, destCapacity, + src + prev, cpStart - prev, options, edits); + if (destIndex >= 0) { + destIndex = appendResult(dest, destIndex, destCapacity, c, s, + srcIndex - cpStart, options, edits); + } + if (destIndex < 0) { + errorCode = U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + prev = srcIndex; + } + } + destIndex = appendUnchanged(dest, destIndex, destCapacity, + src + prev, srcIndex - prev, options, edits); + if (destIndex < 0) { + errorCode = U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } return destIndex; } +} // namespace + +U_NAMESPACE_END + +U_NAMESPACE_USE + #if !UCONFIG_NO_BREAK_ITERATION U_CFUNC int32_t U_CALLCONV @@ -344,11 +508,10 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it if((options&U_TITLECASE_NO_LOWERCASE)==0) { /* Normal operation: Lowercase the rest of the word. */ destIndex+= - _caseMap( - caseLocale, options, ucase_toFullLower, + toLower( + caseLocale, options, dest+destIndex, destCapacity-destIndex, - src, &csc, - titleLimit, index, + src, &csc, titleLimit, index, edits, errorCode); if(errorCode==U_BUFFER_OVERFLOW_ERROR) { errorCode=U_ZERO_ERROR; @@ -1013,8 +1176,8 @@ ustrcase_internalToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_IT UCaseContext csc=UCASECONTEXT_INITIALIZER; csc.p=(void *)src; csc.limit=srcLength; - int32_t destIndex = _caseMap( - caseLocale, options, ucase_toFullLower, + int32_t destIndex = toLower( + caseLocale, options, dest, destCapacity, src, &csc, 0, srcLength, edits, errorCode); @@ -1035,10 +1198,10 @@ ustrcase_internalToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_IT UCaseContext csc=UCASECONTEXT_INITIALIZER; csc.p=(void *)src; csc.limit=srcLength; - destIndex = _caseMap( - caseLocale, options, ucase_toFullUpper, + destIndex = toUpper( + caseLocale, options, dest, destCapacity, - src, &csc, 0, srcLength, + src, &csc, srcLength, edits, errorCode); } return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode); @@ -1050,23 +1213,11 @@ ustrcase_internalFold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK const UChar *src, int32_t srcLength, icu::Edits *edits, UErrorCode &errorCode) { - /* case mapping loop */ - int32_t srcIndex = 0; - int32_t destIndex = 0; - while (srcIndex < srcLength) { - int32_t cpStart = srcIndex; - UChar32 c; - U16_NEXT(src, srcIndex, srcLength, c); - const UChar *s; - c = ucase_toFullFolding(c, &s, options); - destIndex = appendResult(dest, destIndex, destCapacity, c, s, - srcIndex - cpStart, options, edits); - if (destIndex < 0) { - errorCode = U_INDEX_OUTOFBOUNDS_ERROR; - return 0; - } - } - + int32_t destIndex = toLower( + -1, options, + dest, destCapacity, + src, nullptr, 0, srcLength, + edits, errorCode); return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode); } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMapImpl.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMapImpl.java index f6ab77dbb81..2b30c028601 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMapImpl.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMapImpl.java @@ -30,6 +30,21 @@ public final class CaseMapImpl { dir=0; } + /** + * Constructor. + * @param src String to iterate over. + * @param cpStart Start index of the current code point. + * @param cpLimit Limit index of the current code point. + */ + public StringContextIterator(CharSequence src, int cpStart, int cpLimit) { + s = src; + index = 0; + limit = src.length(); + this.cpStart = cpStart; + this.cpLimit = cpLimit; + dir = 0; + } + /** * Set the iteration limit for nextCaseMapCP() to an index within the string. * If the limit parameter is negative or past the string, then the @@ -77,6 +92,11 @@ public final class CaseMapImpl { } } + public void setCPStartAndLimit(int s, int l) { + cpStart = s; + cpLimit = l; + dir = 0; + } /** * Returns the start of the code point that was last returned * by nextCaseMapCP(). @@ -400,13 +420,162 @@ public final class CaseMapImpl { return result.toString(); } - private static void internalToLower(int caseLocale, int options, StringContextIterator iter, + private static final Trie2_16 CASE_TRIE = UCaseProps.getTrie(); + + /** + * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account. + * caseLocale < 0: Case-folds [srcStart..srcLimit[. + */ + private static void internalToLower(int caseLocale, int options, + CharSequence src, int srcStart, int srcLimit, StringContextIterator iter, Appendable dest, Edits edits) throws IOException { - int c; - while ((c = iter.nextCaseMapCP()) >= 0) { - c = UCaseProps.INSTANCE.toFullLower(c, iter, dest, caseLocale); - appendResult(c, dest, iter.getCPLength(), options, edits); + byte[] latinToLower; + if (caseLocale == UCaseProps.LOC_ROOT || + (caseLocale >= 0 ? + !(caseLocale == UCaseProps.LOC_TURKISH || caseLocale == UCaseProps.LOC_LITHUANIAN) : + (options & UCaseProps.FOLD_CASE_OPTIONS_MASK) == UCharacter.FOLD_CASE_DEFAULT)) { + latinToLower = UCaseProps.LatinCase.TO_LOWER_NORMAL; + } else { + latinToLower = UCaseProps.LatinCase.TO_LOWER_TR_LT; } + int prev = srcStart; + int srcIndex = srcStart; + outerLoop: + for (;;) { + // fast path for simple cases + char lead; + for (;;) { + if (srcIndex >= srcLimit) { + break outerLoop; + } + lead = src.charAt(srcIndex); + int delta; + if (lead < UCaseProps.LatinCase.LONG_S) { + byte d = latinToLower[lead]; + if (d == UCaseProps.LatinCase.EXC) { break; } + ++srcIndex; + if (d == 0) { continue; } + delta = d; + } else if (lead >= 0xd800) { + break; // surrogate or higher + } else { + int props = CASE_TRIE.getFromU16SingleLead(lead); + if (UCaseProps.propsHasException(props)) { break; } + ++srcIndex; + if (!UCaseProps.isUpperOrTitleFromProps(props) || + (delta = UCaseProps.getDelta(props)) == 0) { + continue; + } + } + lead += delta; + appendUnchanged(src, prev, srcIndex - 1 - prev, dest, options, edits); + dest.append(lead); + if (edits != null) { + edits.addReplace(1, 1); + } + prev = srcIndex; + } + // slow path + int cpStart = srcIndex++; + char trail; + int c; + if (Character.isHighSurrogate(lead) && srcIndex < srcLimit && + Character.isLowSurrogate(trail = src.charAt(srcIndex))) { + c = Character.toCodePoint(lead, trail); + ++srcIndex; + } else { + c = lead; + } + if (caseLocale >= 0) { + if (iter == null) { + iter = new StringContextIterator(src, cpStart, srcIndex); + } else { + iter.setCPStartAndLimit(cpStart, srcIndex); + } + c = UCaseProps.INSTANCE.toFullLower(c, iter, dest, caseLocale); + } else { + c = UCaseProps.INSTANCE.toFullFolding(c, dest, options); + } + if (c >= 0) { + appendUnchanged(src, prev, cpStart - prev, dest, options, edits); + appendResult(c, dest, srcIndex - cpStart, options, edits); + prev = srcIndex; + } + } + appendUnchanged(src, prev, srcIndex - prev, dest, options, edits); + } + + private static void internalToUpper(int caseLocale, int options, + CharSequence src, Appendable dest, Edits edits) throws IOException { + StringContextIterator iter = null; + byte[] latinToUpper; + if (caseLocale == UCaseProps.LOC_TURKISH) { + latinToUpper = UCaseProps.LatinCase.TO_UPPER_TR; + } else { + latinToUpper = UCaseProps.LatinCase.TO_UPPER_NORMAL; + } + int prev = 0; + int srcIndex = 0; + int srcLength = src.length(); + outerLoop: + for (;;) { + // fast path for simple cases + char lead; + for (;;) { + if (srcIndex >= srcLength) { + break outerLoop; + } + lead = src.charAt(srcIndex); + int delta; + if (lead < UCaseProps.LatinCase.LONG_S) { + byte d = latinToUpper[lead]; + if (d == UCaseProps.LatinCase.EXC) { break; } + ++srcIndex; + if (d == 0) { continue; } + delta = d; + } else if (lead >= 0xd800) { + break; // surrogate or higher + } else { + int props = CASE_TRIE.getFromU16SingleLead(lead); + if (UCaseProps.propsHasException(props)) { break; } + ++srcIndex; + if (UCaseProps.getTypeFromProps(props) != UCaseProps.LOWER || + (delta = UCaseProps.getDelta(props)) == 0) { + continue; + } + } + lead += delta; + appendUnchanged(src, prev, srcIndex - 1 - prev, dest, options, edits); + dest.append(lead); + if (edits != null) { + edits.addReplace(1, 1); + } + prev = srcIndex; + } + // slow path + int cpStart = srcIndex++; + char trail; + int c; + if (Character.isHighSurrogate(lead) && srcIndex < srcLength && + Character.isLowSurrogate(trail = src.charAt(srcIndex))) { + c = Character.toCodePoint(lead, trail); + ++srcIndex; + } else { + c = lead; + } + if (iter == null) { + iter = new StringContextIterator(src, cpStart, srcIndex); + } else { + iter.setCPStartAndLimit(cpStart, srcIndex); + } + c = UCaseProps.INSTANCE.toFullUpper(c, iter, dest, caseLocale); + if (c >= 0) { + appendUnchanged(src, prev, cpStart - prev, dest, options, edits); + appendResult(c, dest, srcIndex - cpStart, options, edits); + prev = srcIndex; + } + } + appendUnchanged(src, prev, srcIndex - prev, dest, options, edits); } public static String toLower(int caseLocale, int options, CharSequence src) { @@ -432,8 +601,7 @@ public final class CaseMapImpl { if (edits != null) { edits.reset(); } - StringContextIterator iter = new StringContextIterator(src); - internalToLower(caseLocale, options, iter, dest, edits); + internalToLower(caseLocale, options, src, 0, src.length(), null, dest, edits); return dest; } catch (IOException e) { throw new ICUUncheckedIOException(e); @@ -466,12 +634,7 @@ public final class CaseMapImpl { if (caseLocale == UCaseProps.LOC_GREEK) { return GreekUpper.toUpper(options, src, dest, edits); } - StringContextIterator iter = new StringContextIterator(src); - int c; - while ((c = iter.nextCaseMapCP()) >= 0) { - c = UCaseProps.INSTANCE.toFullUpper(c, iter, dest, caseLocale); - appendResult(c, dest, iter.getCPLength(), options, edits); - } + internalToUpper(caseLocale, options, src, dest, edits); return dest; } catch (IOException e) { throw new ICUUncheckedIOException(e); @@ -589,12 +752,13 @@ public final class CaseMapImpl { if(titleLimit>EXC_SHIFT; } - private static final boolean propsHasException(int props) { + static final boolean propsHasException(int props) { return (props&EXCEPTION)!=0; } @@ -187,7 +187,7 @@ public final class UCaseProps { public final int tolower(int c) { int props=trie.get(c); if(!propsHasException(props)) { - if(getTypeFromProps(props)>=UPPER) { + if(isUpperOrTitleFromProps(props)) { c+=getDelta(props); } } else { @@ -591,6 +591,153 @@ public final class UCaseProps { public int next(); } + /** + * Fast case mapping data for ASCII/Latin. + * Linear arrays of delta bytes: 0=no mapping; EXC=exception. + * Deltas must not cross the ASCII boundary, or else they cannot be easily used + * in simple UTF-8 code. + */ + static final class LatinCase { + /** Case mapping/folding data for code points up to U+017F. */ + static final char LIMIT = 0x180; + /** U+017F case-folds and uppercases crossing the ASCII boundary. */ + static final char LONG_S = 0x17f; + /** Exception: Complex mapping, or too-large delta. */ + static final byte EXC = -0x80; + + /** Deltas for lowercasing for most locales, and default case folding. */ + static final byte[] TO_LOWER_NORMAL = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, + + 0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC + }; + + /** Deltas for lowercasing for tr/az/lt, and Turkic case folding. */ + static final byte[] TO_LOWER_TR_LT = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, EXC, 0, 1, 0, 1, 0, EXC, 0, + EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, + + 0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC + }; + + /** Deltas for uppercasing for most locales. */ + static final byte[] TO_UPPER_NORMAL = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, + -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC, + -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, + -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121, + + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, + 0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, + + -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1, + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, + 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC + }; + + /** Deltas for uppercasing for tr/az. */ + static final byte[] TO_UPPER_TR = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, -32, -32, -32, -32, -32, -32, -32, -32, EXC, -32, -32, -32, -32, -32, -32, + -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC, + -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, + -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121, + + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, + 0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, + + -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1, + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, + 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC + }; + } + /** * For string case mappings, a single character (a code point) is mapped * either to itself (in which case in-place mapping functions do nothing), @@ -609,8 +756,8 @@ public final class UCaseProps { //ivate static final int LOC_UNKNOWN=0; public static final int LOC_ROOT=1; - private static final int LOC_TURKISH=2; - private static final int LOC_LITHUANIAN=3; + static final int LOC_TURKISH=2; + static final int LOC_LITHUANIAN=3; static final int LOC_GREEK=4; public static final int LOC_DUTCH=5; @@ -823,7 +970,7 @@ public final class UCaseProps { result=c; props=trie.get(c); if(!propsHasException(props)) { - if(getTypeFromProps(props)>=UPPER) { + if(isUpperOrTitleFromProps(props)) { result=c+getDelta(props); } } else { @@ -1132,13 +1279,13 @@ public final class UCaseProps { * * @internal */ - private static final int FOLD_CASE_OPTIONS_MASK = 7; + static final int FOLD_CASE_OPTIONS_MASK = 7; /* return the simple case folding mapping for c */ public final int fold(int c, int options) { int props=trie.get(c); if(!propsHasException(props)) { - if(getTypeFromProps(props)>=UPPER) { + if(isUpperOrTitleFromProps(props)) { c+=getDelta(props); } } else { @@ -1201,7 +1348,7 @@ public final class UCaseProps { result=c; props=trie.get(c); if(!propsHasException(props)) { - if(getTypeFromProps(props)>=UPPER) { + if(isUpperOrTitleFromProps(props)) { result=c+getDelta(props); } } else { @@ -1361,6 +1508,10 @@ public final class UCaseProps { // definitions for 16-bit case properties word ------------------------- *** + static Trie2_16 getTrie() { + return INSTANCE.trie; + } + /* 2-bit constants for types of cased characters */ public static final int TYPE_MASK=3; public static final int NONE=0; @@ -1369,7 +1520,7 @@ public final class UCaseProps { public static final int TITLE=3; /** @return NONE, LOWER, UPPER, TITLE */ - private static final int getTypeFromProps(int props) { + static final int getTypeFromProps(int props) { return props&TYPE_MASK; } @@ -1378,6 +1529,10 @@ public final class UCaseProps { return props&7; } + static final boolean isUpperOrTitleFromProps(int props) { + return (props & 2) != 0; + } + static final int IGNORABLE=4; private static final int SENSITIVE= 8; private static final int EXCEPTION= 0x10; @@ -1394,7 +1549,7 @@ public final class UCaseProps { //private static final int MAX_DELTA= 0xff; //private static final int MIN_DELTA= (-MAX_DELTA-1); - private static final int getDelta(int props) { + static final int getDelta(int props) { return (short)props>>DELTA_SHIFT; } diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java index f54fbec2cce..320b188f930 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java @@ -480,19 +480,17 @@ public final class UCharacterCaseTest extends TestFmwk } } else { - if (!SPECIAL_DATA_[j + 1].equals( - UCharacter.toLowerCase(str))) { + String lower = UCharacter.toLowerCase(str); + if (!SPECIAL_DATA_[j + 1].equals(lower)) { errln("error lowercasing special characters " + hex(str) + " expected " + SPECIAL_DATA_[j + 1] + - " but got " + - hex(UCharacter.toLowerCase(locale, str))); + " but got " + hex(lower)); } - if (!SPECIAL_DATA_[j + 2].equals( - UCharacter.toUpperCase(locale, str))) { + String upper = UCharacter.toUpperCase(str); + if (!SPECIAL_DATA_[j + 2].equals(upper)) { errln("error uppercasing special characters " + hex(str) + " expected " + SPECIAL_DATA_[j + 2] + - " but got " + - hex(UCharacter.toUpperCase(locale, str))); + " but got " + hex(upper)); } } }