sink.Append(s8, 2);
}
-UBool
-ByteSinkUtil::appendUnchanged(const uint8_t *s, int32_t length,
- ByteSink &sink, uint32_t options, Edits *edits,
- UErrorCode &errorCode) {
- if (U_FAILURE(errorCode)) { return FALSE; }
- if (length > 0) {
- if (edits != nullptr) {
- edits->addUnchanged(length);
- }
- if ((options & U_OMIT_UNCHANGED_TEXT) == 0) {
- sink.Append(reinterpret_cast<const char *>(s), length);
- }
+void  // Reports a non-empty run of unchanged bytes: records it in edits (if given) and copies it to sink unless omitted by options.
+ByteSinkUtil::appendNonEmptyUnchanged(const uint8_t *s, int32_t length,
+ ByteSink &sink, uint32_t options, Edits *edits) {
+ U_ASSERT(length > 0);  // precondition: the visible callers test length > 0 before calling
+ if (edits != nullptr) {  // edit tracking is optional
+ edits->addUnchanged(length);
+ }
+ if ((options & U_OMIT_UNCHANGED_TEXT) == 0) {  // copy the bytes only when unchanged text is not being omitted
+ sink.Append(reinterpret_cast<const char *>(s), length);
}
- return TRUE;
}
UBool
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
return FALSE;
}
- return appendUnchanged(s, (int32_t)(limit - s), sink, options, edits, errorCode);
+ int32_t length = (int32_t)(limit - s);
+ if (length > 0) {
+ appendNonEmptyUnchanged(s, length, sink, options, edits);
+ }
+ return TRUE;
}
U_NAMESPACE_END
static UBool appendUnchanged(const uint8_t *s, int32_t length,
ByteSink &sink, uint32_t options, Edits *edits,
- UErrorCode &errorCode);
+ UErrorCode &errorCode) {  // now defined inline in the class so the cheap checks can be inlined at call sites
+ if (U_FAILURE(errorCode)) { return FALSE; }  // do nothing once an error has been recorded
+ if (length > 0) { appendNonEmptyUnchanged(s, length, sink, options, edits); }  // empty runs are silently skipped
+ return TRUE;  // TRUE whenever errorCode was not a failure on entry, including for empty runs
+ }
static UBool appendUnchanged(const uint8_t *s, const uint8_t *limit,
ByteSink &sink, uint32_t options, Edits *edits,
UErrorCode &errorCode);
+
+private:
+ static void appendNonEmptyUnchanged(const uint8_t *s, int32_t length,
+ ByteSink &sink, uint32_t options, Edits *edits);
};
U_NAMESPACE_END
/* data access primitives --------------------------------------------------- */
-#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))
+U_CFUNC const UTrie2 * U_EXPORT2  // Accessor for the case-properties trie, so callers can do their own UTRIE2_GET16 lookups.
+ucase_getTrie() {
+ return &ucase_props_singleton.trie;  // address of the singleton's member; nothing is allocated or copied
+}
-#define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)
+#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))
/* number of bits in an 8-bit integer value */
static const uint8_t flagsOffset[256]={
U_CAPI UChar32 U_EXPORT2
ucase_tolower(UChar32 c) {
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
- if(!PROPS_HAS_EXCEPTION(props)) {
- if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
+ if(!UCASE_HAS_EXCEPTION(props)) {
+ if(UCASE_IS_UPPER_OR_TITLE(props)) {
c+=UCASE_GET_DELTA(props);
}
} else {
U_CAPI UChar32 U_EXPORT2
ucase_toupper(UChar32 c) {
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
- if(!PROPS_HAS_EXCEPTION(props)) {
+ if(!UCASE_HAS_EXCEPTION(props)) {
if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
c+=UCASE_GET_DELTA(props);
}
U_CAPI UChar32 U_EXPORT2
ucase_totitle(UChar32 c) {
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
- if(!PROPS_HAS_EXCEPTION(props)) {
+ if(!UCASE_HAS_EXCEPTION(props)) {
if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
c+=UCASE_GET_DELTA(props);
}
}
props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
- if(!PROPS_HAS_EXCEPTION(props)) {
+ if(!UCASE_HAS_EXCEPTION(props)) {
if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
/* add the one simple case mapping, no matter what type it is */
int32_t delta=UCASE_GET_DELTA(props);
return c;
}
+namespace LatinCase {  // definitions of the four delta tables; each row below holds 16 consecutive code points
+
+const int8_t TO_LOWER_NORMAL[LIMIT] = {  // entry i is the delta to add to code point i (0 = unchanged, EXC = not handled by the table)
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,  // U+0041..U+004F: +32
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,  // U+0050..U+005A: +32
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // U+00B5: EXC
+
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,  // U+00C0..U+00CF: +32
+ 32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,  // U+00D0..U+00DE: +32 except U+00D7 (0); U+00DF: EXC
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,  // from U+0100: +1 on even code points
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,  // U+0130: EXC; from U+0139 the +1 entries sit on odd code points
+
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,  // U+0149: EXC; from U+014A the +1 entries are back on even code points
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC  // U+0178: -121 (to U+00FF); U+0179/017B/017D: +1; U+017F: EXC
+};
+
+const int8_t TO_LOWER_TR_LT[LIMIT] = {  // same layout as TO_LOWER_NORMAL, with extra EXC entries on the rows marked below
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ 0, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32, 32, 32, 32,  // U+0049, U+004A: EXC (differs from TO_LOWER_NORMAL)
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32,  // U+00CC, U+00CD: EXC (differs from TO_LOWER_NORMAL)
+ 32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, EXC, 0, 1, 0, 1, 0, EXC, 0,  // U+0128, U+012E: EXC (differs from TO_LOWER_NORMAL)
+ EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
+
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
+};
+
+const int8_t TO_UPPER_NORMAL[LIMIT] = {  // uppercase deltas: mostly the negated lowercase deltas, shifted onto the lowercase code points
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,  // U+0061..U+006F: -32
+ -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,  // U+0070..U+007A: -32
+
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // U+00B5: EXC
+
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,  // U+00DF: EXC
+ -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,  // U+00E0..U+00EF: -32
+ -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,  // U+00F0..U+00FE: -32 except U+00F7 (0); U+00FF: +121 (to U+0178)
+
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,  // from U+0101: -1 on odd code points
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
+ 0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,  // U+0131: EXC; from U+013A the -1 entries sit on even code points
+
+ -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,  // U+0149: EXC; from U+014B the -1 entries are back on odd code points
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC  // U+017A/017C/017E: -1; U+017F: EXC
+};
+
+const int8_t TO_UPPER_TR[LIMIT] = {  // same as TO_UPPER_NORMAL except for the one row marked below
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, -32, -32, -32, -32, -32, -32, -32, -32, EXC, -32, -32, -32, -32, -32, -32,  // U+0069: EXC (differs from TO_UPPER_NORMAL)
+ -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
+
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
+ -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
+ -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
+
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
+ 0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
+
+ -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
+};
+
+} // namespace LatinCase
+
U_NAMESPACE_END
/** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
static inline int32_t
getDotType(UChar32 c) {
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
- if(!PROPS_HAS_EXCEPTION(props)) {
+ if(!UCASE_HAS_EXCEPTION(props)) {
return props&UCASE_DOT_MASK;
} else {
const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
U_ASSERT(c >= 0);
UChar32 result=c;
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
- if(!PROPS_HAS_EXCEPTION(props)) {
- if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
+ if(!UCASE_HAS_EXCEPTION(props)) {
+ if(UCASE_IS_UPPER_OR_TITLE(props)) {
result=c+UCASE_GET_DELTA(props);
}
} else {
U_ASSERT(c >= 0);
UChar32 result=c;
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
- if(!PROPS_HAS_EXCEPTION(props)) {
+ if(!UCASE_HAS_EXCEPTION(props)) {
if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
result=c+UCASE_GET_DELTA(props);
}
U_CAPI UChar32 U_EXPORT2
ucase_fold(UChar32 c, uint32_t options) {
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
- if(!PROPS_HAS_EXCEPTION(props)) {
- if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
+ if(!UCASE_HAS_EXCEPTION(props)) {
+ if(UCASE_IS_UPPER_OR_TITLE(props)) {
c+=UCASE_GET_DELTA(props);
}
} else {
U_ASSERT(c >= 0);
UChar32 result=c;
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
- if(!PROPS_HAS_EXCEPTION(props)) {
- if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
+ if(!UCASE_HAS_EXCEPTION(props)) {
+ if(UCASE_IS_UPPER_OR_TITLE(props)) {
result=c+UCASE_GET_DELTA(props);
}
} else {
#include "putilimp.h"
#include "uset_imp.h"
#include "udataswp.h"
+#include "utrie2.h"
#ifdef __cplusplus
U_NAMESPACE_BEGIN
int32_t rowCpIndex;
};
+/**
+ * Fast case mapping data for ASCII/Latin.
+ * Linear arrays of delta bytes: 0=no mapping; EXC=exception.
+ * Deltas must not cross the ASCII boundary, or else they cannot be easily used
+ * in simple UTF-8 code.
+ */
+namespace LatinCase {
+
+/** Case mapping/folding data for code points up to U+017F. */
+constexpr UChar LIMIT = 0x180;  // table length: one entry per code point U+0000..U+017F
+/** U+017F case-folds and uppercases crossing the ASCII boundary. */
+constexpr UChar LONG_S = 0x17f;  // last index of the tables (LIMIT - 1)
+/** Exception: Complex mapping, or too-large delta. */
+constexpr int8_t EXC = -0x80;  // -128, the smallest int8_t value; used as a sentinel, never as a real delta
+
+/** Deltas for lowercasing for most locales, and default case folding. */
+extern const int8_t TO_LOWER_NORMAL[LIMIT];
+/** Deltas for lowercasing for tr/az/lt, and Turkic case folding. */
+extern const int8_t TO_LOWER_TR_LT[LIMIT];
+
+/** Deltas for uppercasing for most locales. */
+extern const int8_t TO_UPPER_NORMAL[LIMIT];
+/** Deltas for uppercasing for tr/az. */
+extern const int8_t TO_UPPER_TR[LIMIT];
+
+} // namespace LatinCase
+
U_NAMESPACE_END
#endif
/* definitions for 16-bit case properties word ------------------------------ */
+U_CFUNC const UTrie2 * U_EXPORT2
+ucase_getTrie();
+
/* 2-bit constants for types of cased characters */
#define UCASE_TYPE_MASK 3
enum {
#define UCASE_GET_TYPE(props) ((props)&UCASE_TYPE_MASK)
#define UCASE_GET_TYPE_AND_IGNORABLE(props) ((props)&7)
+#define UCASE_IS_UPPER_OR_TITLE(props) ((props)&2)  /* nonzero (2, not 1) when bit 1 of the 2-bit type field is set; presumably UCASE_UPPER=2 and UCASE_TITLE=3 - confirm against the type enum */
+
#define UCASE_IGNORABLE 4
#define UCASE_SENSITIVE 8
#define UCASE_EXCEPTION 0x10
+#define UCASE_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)  /* nonzero when the exception bit (0x10) is set in the props word */
+
#define UCASE_DOT_MASK 0x60
enum {
UCASE_NO_DOT=0, /* normal characters with cc=0 */
inline uint8_t getTwoByteLead(UChar32 c) { return (uint8_t)((c >> 6) | 0xc0); }
inline uint8_t getTwoByteTrail(UChar32 c) { return (uint8_t)((c & 0x3f) | 0x80); }
-} // namespace
-
-static UChar32 U_CALLCONV
+UChar32 U_CALLCONV
utf8_caseContextIterator(void *context, int8_t dir) {
UCaseContext *csc=(UCaseContext *)context;
UChar32 c;
return U_SENTINEL;
}
-/*
- * Case-maps [srcStart..srcLimit[ but takes
- * context [0..srcLength[ into account.
+/**
+ * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
+ * caseLocale < 0: Case-folds [srcStart..srcLimit[.
*/
-static void
-_caseMap(int32_t caseLocale, uint32_t options, UCaseMapFull *map,
- const uint8_t *src, UCaseContext *csc,
- int32_t srcStart, int32_t srcLimit,
- icu::ByteSink &sink, icu::Edits *edits,
- UErrorCode &errorCode) {
- /* case mapping loop */
- int32_t srcIndex=srcStart;
- while (U_SUCCESS(errorCode) && srcIndex<srcLimit) {
+void toLower(int32_t caseLocale, uint32_t options,  // UTF-8 lowercasing (caseLocale >= 0) or case folding (caseLocale < 0) of src[srcStart..srcLimit) into sink
+ const uint8_t *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit,
+ icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
+ const int8_t *latinToLower;  // delta table used by the fast path for U+0000..U+017F
+ if (caseLocale == UCASE_LOC_ROOT ||
+ (caseLocale >= 0 ?
+ !(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) :
+ (options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) {  // for folding (caseLocale < 0) the table is chosen by the fold options instead of the locale
+ latinToLower = LatinCase::TO_LOWER_NORMAL;
+ } else {
+ latinToLower = LatinCase::TO_LOWER_TR_LT;
+ }
+ const UTrie2 *trie = ucase_getTrie();
+ int32_t prev = srcStart;  // start of the pending run of unchanged bytes that has not been written to sink yet
+ int32_t srcIndex = srcStart;
+ for (;;) {
+ // fast path for simple cases
int32_t cpStart;
- csc->cpStart=cpStart=srcIndex;
UChar32 c;
- U8_NEXT(src, srcIndex, srcLimit, c);
- csc->cpLimit=srcIndex;
- if(c<0) {
- // Malformed UTF-8.
- ByteSinkUtil::appendUnchanged(src+cpStart, srcIndex-cpStart,
+ for (;;) {  // inner loop: leaves with c >= 0 for the slow path, or c == U_SENTINEL when done
+ if (U_FAILURE(errorCode) || srcIndex >= srcLimit) {
+ c = U_SENTINEL;  // tells the outer loop to stop
+ break;
+ }
+ uint8_t lead = src[srcIndex++];
+ if (lead <= 0x7f) {  // ASCII: single byte, direct table lookup
+ int8_t d = latinToLower[lead];
+ if (d == LatinCase::EXC) {
+ cpStart = srcIndex - 1;
+ c = lead;
+ break;  // hand this code point to the slow path
+ }
+ if (d == 0) { continue; }  // unchanged: the byte stays in the pending run
+ ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
+ sink, options, edits, errorCode);  // flush the unchanged bytes before this one
+ char ascii = (char)(lead + d);
+ sink.Append(&ascii, 1);
+ if (edits != nullptr) {
+ edits->addReplace(1, 1);
+ }
+ prev = srcIndex;
+ continue;
+ } else if (lead < 0xe3) {
+ uint8_t t;
+ if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLimit &&
+ (t = src[srcIndex] - 0x80) <= 0x3f) {  // unsigned subtraction: true only for a trail byte 0x80..0xbf
+ // U+0080..U+017F
+ ++srcIndex;
+ c = ((lead - 0xc0) << 6) | t;  // decode the two-byte sequence
+ int8_t d = latinToLower[c];
+ if (d == LatinCase::EXC) {
+ cpStart = srcIndex - 2;
+ break;  // slow path
+ }
+ if (d == 0) { continue; }
+ ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
+ sink, options, edits, errorCode);
+ ByteSinkUtil::appendTwoBytes(c + d, sink);  // the mapped code point is written as two bytes again
+ if (edits != nullptr) {
+ edits->addReplace(2, 2);
+ }
+ prev = srcIndex;
+ continue;
+ }
+ } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
+ (srcIndex + 2) <= srcLimit &&
+ U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
+ // most of CJK: no case mappings
+ srcIndex += 2;  // skip both trail bytes without any property lookup
+ continue;
+ }
+ cpStart = --srcIndex;  // general case: back up to the lead byte and decode with U8_NEXT
+ U8_NEXT(src, srcIndex, srcLimit, c);
+ if (c < 0) {
+ // ill-formed UTF-8
+ continue;  // the bytes stay in the pending unchanged run
+ }
+ uint16_t props = UTRIE2_GET16(trie, c);
+ if (UCASE_HAS_EXCEPTION(props)) { break; }  // exception data is handled in the slow path
+ int32_t delta;
+ if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) {
+ continue;  // not upper/title, or zero delta: unchanged
+ }
+ ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
sink, options, edits, errorCode);
+ ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);  // first argument is the source length in bytes
+ prev = srcIndex;
+ }
+ if (c < 0) {
+ break;  // end of input or error
+ }
+ // slow path
+ const UChar *s;
+ if (caseLocale >= 0) {
+ csc->cpStart = cpStart;  // csc is dereferenced only when lowercasing; the case-folding caller passes nullptr
+ csc->cpLimit = srcIndex;
+ c = ucase_toFullLower(c, utf8_caseContextIterator, csc, &s, caseLocale);
} else {
- const UChar *s;
- c=map(c, utf8_caseContextIterator, csc, &s, caseLocale);
+ c = ucase_toFullFolding(c, &s, options);
+ }
+ if (c >= 0) {  // a negative result leaves the code point in the pending unchanged run
+ ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
+ sink, options, edits, errorCode);
appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
+ prev = srcIndex;
}
}
+ ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
+ sink, options, edits, errorCode);  // flush the trailing unchanged run
}
+void toUpper(int32_t caseLocale, uint32_t options,  // UTF-8 uppercasing of src[0..srcLength) into sink, with a table-driven fast path
+ const uint8_t *src, UCaseContext *csc, int32_t srcLength,
+ icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
+ const int8_t *latinToUpper;  // delta table used by the fast path for U+0000..U+017F
+ if (caseLocale == UCASE_LOC_TURKISH) {
+ latinToUpper = LatinCase::TO_UPPER_TR;
+ } else {
+ latinToUpper = LatinCase::TO_UPPER_NORMAL;
+ }
+ const UTrie2 *trie = ucase_getTrie();
+ int32_t prev = 0;  // start of the pending run of unchanged bytes that has not been written to sink yet
+ int32_t srcIndex = 0;
+ for (;;) {
+ // fast path for simple cases
+ int32_t cpStart;
+ UChar32 c;
+ for (;;) {  // inner loop: leaves with c >= 0 for the slow path, or c == U_SENTINEL when done
+ if (U_FAILURE(errorCode) || srcIndex >= srcLength) {
+ c = U_SENTINEL;  // tells the outer loop to stop
+ break;
+ }
+ uint8_t lead = src[srcIndex++];
+ if (lead <= 0x7f) {  // ASCII: single byte, direct table lookup
+ int8_t d = latinToUpper[lead];
+ if (d == LatinCase::EXC) {
+ cpStart = srcIndex - 1;
+ c = lead;
+ break;  // hand this code point to the slow path
+ }
+ if (d == 0) { continue; }  // unchanged: the byte stays in the pending run
+ ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
+ sink, options, edits, errorCode);  // flush the unchanged bytes before this one
+ char ascii = (char)(lead + d);
+ sink.Append(&ascii, 1);
+ if (edits != nullptr) {
+ edits->addReplace(1, 1);
+ }
+ prev = srcIndex;
+ continue;
+ } else if (lead < 0xe3) {
+ uint8_t t;
+ if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLength &&
+ (t = src[srcIndex] - 0x80) <= 0x3f) {  // unsigned subtraction: true only for a trail byte 0x80..0xbf
+ // U+0080..U+017F
+ ++srcIndex;
+ c = ((lead - 0xc0) << 6) | t;  // decode the two-byte sequence
+ int8_t d = latinToUpper[c];
+ if (d == LatinCase::EXC) {
+ cpStart = srcIndex - 2;
+ break;  // slow path
+ }
+ if (d == 0) { continue; }
+ ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
+ sink, options, edits, errorCode);
+ ByteSinkUtil::appendTwoBytes(c + d, sink);  // the mapped code point is written as two bytes again
+ if (edits != nullptr) {
+ edits->addReplace(2, 2);
+ }
+ prev = srcIndex;
+ continue;
+ }
+ } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
+ (srcIndex + 2) <= srcLength &&
+ U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
+ // most of CJK: no case mappings
+ srcIndex += 2;  // skip both trail bytes without any property lookup
+ continue;
+ }
+ cpStart = --srcIndex;  // general case: back up to the lead byte and decode with U8_NEXT
+ U8_NEXT(src, srcIndex, srcLength, c);
+ if (c < 0) {
+ // ill-formed UTF-8
+ continue;  // the bytes stay in the pending unchanged run
+ }
+ uint16_t props = UTRIE2_GET16(trie, c);
+ if (UCASE_HAS_EXCEPTION(props)) { break; }  // exception data is handled in the slow path
+ int32_t delta;
+ if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) {
+ continue;  // only lowercase code points with a nonzero delta change
+ }
+ ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
+ sink, options, edits, errorCode);
+ ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);  // first argument is the source length in bytes
+ prev = srcIndex;
+ }
+ if (c < 0) {
+ break;  // end of input or error
+ }
+ // slow path
+ csc->cpStart = cpStart;  // publish the current code point's bounds for the context iterator
+ csc->cpLimit = srcIndex;
+ const UChar *s;
+ c = ucase_toFullUpper(c, utf8_caseContextIterator, csc, &s, caseLocale);
+ if (c >= 0) {  // a negative result leaves the code point in the pending unchanged run
+ ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
+ sink, options, edits, errorCode);
+ appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
+ prev = srcIndex;
+ }
+ }
+ ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
+ sink, options, edits, errorCode);  // flush the trailing unchanged run
+}
+
+} // namespace
+
#if !UCONFIG_NO_BREAK_ITERATION
U_CFUNC void U_CALLCONV
if(titleLimit<index) {
if((options&U_TITLECASE_NO_LOWERCASE)==0) {
/* Normal operation: Lowercase the rest of the word. */
- _caseMap(caseLocale, options, ucase_toFullLower,
- src, &csc,
- titleLimit, index,
- sink, edits, errorCode);
+ toLower(caseLocale, options,
+ src, &csc, titleLimit, index,
+ sink, edits, errorCode);
if(U_FAILURE(errorCode)) {
return;
}
UCaseContext csc=UCASECONTEXT_INITIALIZER;
csc.p=(void *)src;
csc.limit=srcLength;
- _caseMap(
- caseLocale, options, ucase_toFullLower,
+ toLower(
+ caseLocale, options,
src, &csc, 0, srcLength,
sink, edits, errorCode);
}
UCaseContext csc=UCASECONTEXT_INITIALIZER;
csc.p=(void *)src;
csc.limit=srcLength;
- _caseMap(
- caseLocale, options, ucase_toFullUpper,
- src, &csc, 0, srcLength,
+ toUpper(
+ caseLocale, options,
+ src, &csc, srcLength,
sink, edits, errorCode);
}
}
const uint8_t *src, int32_t srcLength,
icu::ByteSink &sink, icu::Edits *edits,
UErrorCode &errorCode) {
- /* case mapping loop */
- int32_t srcIndex = 0;
- while (U_SUCCESS(errorCode) && srcIndex < srcLength) {
- int32_t cpStart = srcIndex;
- UChar32 c;
- U8_NEXT(src, srcIndex, srcLength, c);
- if(c<0) {
- // Malformed UTF-8.
- ByteSinkUtil::appendUnchanged(src+cpStart, srcIndex-cpStart,
- sink, options, edits, errorCode);
- } else {
- const UChar *s;
- c = ucase_toFullFolding(c, &s, options);
- appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
- }
- }
+ toLower(
+ -1, options,
+ src, nullptr, 0, srcLength,
+ sink, edits, errorCode);
}
void
return destIndex;
}
-} // namespace
-
-U_NAMESPACE_END
-
-U_NAMESPACE_USE
-
-/* string casing ------------------------------------------------------------ */
-
/* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */
-static inline int32_t
+inline int32_t
appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity,
int32_t result, const UChar *s,
int32_t cpLength, uint32_t options, icu::Edits *edits) {
return destIndex;
}
-static inline int32_t
+inline int32_t
appendUChar(UChar *dest, int32_t destIndex, int32_t destCapacity, UChar c) {
if(destIndex<destCapacity) {
dest[destIndex]=c;
return destIndex+1;
}
-static inline int32_t
+int32_t  // Appends a run of unchanged UChars; returns the new destIndex, or -1 on int32_t overflow.
+appendNonEmptyUnchanged(UChar *dest, int32_t destIndex, int32_t destCapacity,
+ const UChar *s, int32_t length, uint32_t options, icu::Edits *edits) {
+ if(edits!=NULL) {  // edit tracking is optional
+ edits->addUnchanged(length);
+ }
+ if(options & U_OMIT_UNCHANGED_TEXT) {
+ return destIndex;  // edits were recorded above, but nothing is written
+ }
+ if(length>(INT32_MAX-destIndex)) {
+ return -1; // integer overflow
+ }
+ if((destIndex+length)<=destCapacity) {  // copy only when the whole run fits
+ u_memcpy(dest+destIndex, s, length);
+ }
+ return destIndex + length;  // advances even when nothing was copied; presumably so callers can detect overflow and report the needed length
+}
+
+inline int32_t  // Inline wrapper: filters out empty runs, then delegates to appendNonEmptyUnchanged().
appendUnchanged(UChar *dest, int32_t destIndex, int32_t destCapacity,
const UChar *s, int32_t length, uint32_t options, icu::Edits *edits) {
- if(length>0) {
- if(edits!=NULL) {
- edits->addUnchanged(length);
- }
- if(options & U_OMIT_UNCHANGED_TEXT) {
- return destIndex;
- }
- if(length>(INT32_MAX-destIndex)) {
- return -1; // integer overflow
- }
- if((destIndex+length)<=destCapacity) {
- u_memcpy(dest+destIndex, s, length);
- }
- destIndex+=length;
+ if (length <= 0) {  // nothing to append: destIndex and edits are left untouched
+ return destIndex;
}
- return destIndex;
+ return appendNonEmptyUnchanged(dest, destIndex, destCapacity, s, length, options, edits);  // may return -1 on int32_t overflow
}
-static UChar32 U_CALLCONV
+UChar32 U_CALLCONV
utf16_caseContextIterator(void *context, int8_t dir) {
UCaseContext *csc=(UCaseContext *)context;
UChar32 c;
return U_SENTINEL;
}
-/*
- * Case-maps [srcStart..srcLimit[ but takes
- * context [0..srcLength[ into account.
+/**
+ * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
+ * caseLocale < 0: Case-folds [srcStart..srcLimit[.
*/
-static int32_t
-_caseMap(int32_t caseLocale, uint32_t options, UCaseMapFull *map,
- UChar *dest, int32_t destCapacity,
- const UChar *src, UCaseContext *csc,
- int32_t srcStart, int32_t srcLimit,
- icu::Edits *edits,
- UErrorCode &errorCode) {
- /* case mapping loop */
- int32_t srcIndex=srcStart;
- int32_t destIndex=0;
- while(srcIndex<srcLimit) {
- int32_t cpStart;
- csc->cpStart=cpStart=srcIndex;
+int32_t toLower(int32_t caseLocale, uint32_t options,  // UTF-16 lowercasing (caseLocale >= 0) or case folding (caseLocale < 0) of src[srcStart..srcLimit); returns the output length, or 0 with errorCode set
+ UChar *dest, int32_t destCapacity,
+ const UChar *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit,
+ icu::Edits *edits, UErrorCode &errorCode) {
+ const int8_t *latinToLower;  // delta table used by the fast path below U+017F
+ if (caseLocale == UCASE_LOC_ROOT ||
+ (caseLocale >= 0 ?
+ !(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) :
+ (options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) {  // for folding (caseLocale < 0) the table is chosen by the fold options instead of the locale
+ latinToLower = LatinCase::TO_LOWER_NORMAL;
+ } else {
+ latinToLower = LatinCase::TO_LOWER_TR_LT;
+ }
+ const UTrie2 *trie = ucase_getTrie();
+ int32_t destIndex = 0;
+ int32_t prev = srcStart;  // start of the pending run of unchanged code units not yet written to dest
+ int32_t srcIndex = srcStart;
+ for (;;) {
+ // fast path for simple cases
+ UChar lead = 0;  // always overwritten before the slow path reads it; initialized so no indeterminate value can be read and compilers do not warn
+ while (srcIndex < srcLimit) {
+ lead = src[srcIndex];
+ int32_t delta;
+ if (lead < LatinCase::LONG_S) {  // U+0000..U+017E: direct table lookup
+ int8_t d = latinToLower[lead];
+ if (d == LatinCase::EXC) { break; }  // slow path; srcIndex still points at this unit
+ ++srcIndex;
+ if (d == 0) { continue; }  // unchanged: the unit stays in the pending run
+ delta = d;
+ } else if (lead >= 0xd800) {
+ break; // surrogate or higher
+ } else {
+ uint16_t props = UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, lead);
+ if (UCASE_HAS_EXCEPTION(props)) { break; }  // exception data is handled in the slow path
+ ++srcIndex;
+ if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) {
+ continue;  // not upper/title, or zero delta: unchanged
+ }
+ }
+ lead += delta;  // simple one-unit-to-one-unit mapping
+ destIndex = appendUnchanged(dest, destIndex, destCapacity,
+ src + prev, srcIndex - 1 - prev, options, edits);  // flush the unchanged units before this one
+ if (destIndex >= 0) {
+ destIndex = appendUChar(dest, destIndex, destCapacity, lead);
+ if (edits != nullptr) {
+ edits->addReplace(1, 1);
+ }
+ }
+ if (destIndex < 0) {  // a negative index from the append helpers signals int32_t overflow
+ errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+ prev = srcIndex;
+ }
+ if (srcIndex >= srcLimit) {
+ break;  // input exhausted
+ }
+ // slow path
+ int32_t cpStart = srcIndex++;
+ UChar trail;
UChar32 c;
- U16_NEXT(src, srcIndex, srcLimit, c);
- csc->cpLimit=srcIndex;
+ if (U16_IS_LEAD(lead) && srcIndex < srcLimit && U16_IS_TRAIL(trail = src[srcIndex])) {
+ c = U16_GET_SUPPLEMENTARY(lead, trail);  // well-formed surrogate pair
+ ++srcIndex;
+ } else {
+ c = lead;  // BMP code point or unpaired surrogate
+ }
const UChar *s;
- c=map(c, utf16_caseContextIterator, csc, &s, caseLocale);
- destIndex = appendResult(dest, destIndex, destCapacity, c, s,
- srcIndex - cpStart, options, edits);
- if (destIndex < 0) {
- errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
- return 0;
+ if (caseLocale >= 0) {
+ csc->cpStart = cpStart;  // csc is dereferenced only when lowercasing; the case-folding caller passes nullptr
+ csc->cpLimit = srcIndex;
+ c = ucase_toFullLower(c, utf16_caseContextIterator, csc, &s, caseLocale);
+ } else {
+ c = ucase_toFullFolding(c, &s, options);
}
+ if (c >= 0) {  // a negative result leaves the code point in the pending unchanged run
+ destIndex = appendUnchanged(dest, destIndex, destCapacity,
+ src + prev, cpStart - prev, options, edits);
+ if (destIndex >= 0) {
+ destIndex = appendResult(dest, destIndex, destCapacity, c, s,
+ srcIndex - cpStart, options, edits);
+ }
+ if (destIndex < 0) {
+ errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+ prev = srcIndex;
+ }
+ }
+ destIndex = appendUnchanged(dest, destIndex, destCapacity,
+ src + prev, srcIndex - prev, options, edits);  // flush the trailing unchanged run
+ if (destIndex < 0) {
+ errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
}
+ return destIndex;
+}
+int32_t toUpper(int32_t caseLocale, uint32_t options,  // UTF-16 uppercasing of src[0..srcLength) into dest; returns the output length, or 0 with errorCode set
+ UChar *dest, int32_t destCapacity,
+ const UChar *src, UCaseContext *csc, int32_t srcLength,
+ icu::Edits *edits, UErrorCode &errorCode) {
+ const int8_t *latinToUpper;  // delta table used by the fast path below U+017F
+ if (caseLocale == UCASE_LOC_TURKISH) {
+ latinToUpper = LatinCase::TO_UPPER_TR;
+ } else {
+ latinToUpper = LatinCase::TO_UPPER_NORMAL;
+ }
+ const UTrie2 *trie = ucase_getTrie();
+ int32_t destIndex = 0;
+ int32_t prev = 0;  // start of the pending run of unchanged code units not yet written to dest
+ int32_t srcIndex = 0;
+ for (;;) {
+ // fast path for simple cases
+ UChar lead = 0;  // always overwritten before the slow path reads it; initialized so no indeterminate value can be read and compilers do not warn
+ while (srcIndex < srcLength) {
+ lead = src[srcIndex];
+ int32_t delta;
+ if (lead < LatinCase::LONG_S) {  // U+0000..U+017E: direct table lookup
+ int8_t d = latinToUpper[lead];
+ if (d == LatinCase::EXC) { break; }  // slow path; srcIndex still points at this unit
+ ++srcIndex;
+ if (d == 0) { continue; }  // unchanged: the unit stays in the pending run
+ delta = d;
+ } else if (lead >= 0xd800) {
+ break; // surrogate or higher
+ } else {
+ uint16_t props = UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, lead);
+ if (UCASE_HAS_EXCEPTION(props)) { break; }  // exception data is handled in the slow path
+ ++srcIndex;
+ if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) {
+ continue;  // only lowercase code points with a nonzero delta change
+ }
+ }
+ lead += delta;  // simple one-unit-to-one-unit mapping
+ destIndex = appendUnchanged(dest, destIndex, destCapacity,
+ src + prev, srcIndex - 1 - prev, options, edits);  // flush the unchanged units before this one
+ if (destIndex >= 0) {
+ destIndex = appendUChar(dest, destIndex, destCapacity, lead);
+ if (edits != nullptr) {
+ edits->addReplace(1, 1);
+ }
+ }
+ if (destIndex < 0) {  // a negative index from the append helpers signals int32_t overflow
+ errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+ prev = srcIndex;
+ }
+ if (srcIndex >= srcLength) {
+ break;  // input exhausted
+ }
+ // slow path
+ int32_t cpStart;
+ csc->cpStart = cpStart = srcIndex++;  // publish the code point's start for the context iterator
+ UChar trail;
+ UChar32 c;
+ if (U16_IS_LEAD(lead) && srcIndex < srcLength && U16_IS_TRAIL(trail = src[srcIndex])) {
+ c = U16_GET_SUPPLEMENTARY(lead, trail);  // well-formed surrogate pair
+ ++srcIndex;
+ } else {
+ c = lead;  // BMP code point or unpaired surrogate
+ }
+ csc->cpLimit = srcIndex;
+ const UChar *s;
+ c = ucase_toFullUpper(c, utf16_caseContextIterator, csc, &s, caseLocale);
+ if (c >= 0) {  // a negative result leaves the code point in the pending unchanged run
+ destIndex = appendUnchanged(dest, destIndex, destCapacity,
+ src + prev, cpStart - prev, options, edits);
+ if (destIndex >= 0) {
+ destIndex = appendResult(dest, destIndex, destCapacity, c, s,
+ srcIndex - cpStart, options, edits);
+ }
+ if (destIndex < 0) {
+ errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+ prev = srcIndex;
+ }
+ }
+ destIndex = appendUnchanged(dest, destIndex, destCapacity,
+ src + prev, srcIndex - prev, options, edits);  // flush the trailing unchanged run
+ if (destIndex < 0) {
+ errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
return destIndex;
}
+} // namespace
+
+U_NAMESPACE_END
+
+U_NAMESPACE_USE
+
#if !UCONFIG_NO_BREAK_ITERATION
U_CFUNC int32_t U_CALLCONV
if((options&U_TITLECASE_NO_LOWERCASE)==0) {
/* Normal operation: Lowercase the rest of the word. */
destIndex+=
- _caseMap(
- caseLocale, options, ucase_toFullLower,
+ toLower(
+ caseLocale, options,
dest+destIndex, destCapacity-destIndex,
- src, &csc,
- titleLimit, index,
+ src, &csc, titleLimit, index,
edits, errorCode);
if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
errorCode=U_ZERO_ERROR;
UCaseContext csc=UCASECONTEXT_INITIALIZER;
csc.p=(void *)src;
csc.limit=srcLength;
- int32_t destIndex = _caseMap(
- caseLocale, options, ucase_toFullLower,
+ int32_t destIndex = toLower(
+ caseLocale, options,
dest, destCapacity,
src, &csc, 0, srcLength,
edits, errorCode);
UCaseContext csc=UCASECONTEXT_INITIALIZER;
csc.p=(void *)src;
csc.limit=srcLength;
- destIndex = _caseMap(
- caseLocale, options, ucase_toFullUpper,
+ destIndex = toUpper(
+ caseLocale, options,
dest, destCapacity,
- src, &csc, 0, srcLength,
+ src, &csc, srcLength,
edits, errorCode);
}
return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
const UChar *src, int32_t srcLength,
icu::Edits *edits,
UErrorCode &errorCode) {
- /* case mapping loop */
- int32_t srcIndex = 0;
- int32_t destIndex = 0;
- while (srcIndex < srcLength) {
- int32_t cpStart = srcIndex;
- UChar32 c;
- U16_NEXT(src, srcIndex, srcLength, c);
- const UChar *s;
- c = ucase_toFullFolding(c, &s, options);
- destIndex = appendResult(dest, destIndex, destCapacity, c, s,
- srcIndex - cpStart, options, edits);
- if (destIndex < 0) {
- errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
- return 0;
- }
- }
-
+ int32_t destIndex = toLower(
+ -1, options,
+ dest, destCapacity,
+ src, nullptr, 0, srcLength,
+ edits, errorCode);
return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
}
dir=0;
}
+ /**
+ * Constructor that places the iterator on an already-known code point,
+ * instead of reaching it by iterating.
+ * The iteration index starts at 0 and the iteration limit is the full length of src.
+ * @param src String to iterate over.
+ * @param cpStart Start index of the current code point.
+ * @param cpLimit Limit index of the current code point.
+ */
+ public StringContextIterator(CharSequence src, int cpStart, int cpLimit) {
+ s = src;
+ index = 0;
+ limit = src.length();
+ this.cpStart = cpStart;
+ this.cpLimit = cpLimit;
+ // NOTE(review): 0 presumably means "no context direction selected yet" - confirm.
+ dir = 0;
+ }
+
/**
* Set the iteration limit for nextCaseMapCP() to an index within the string.
* If the limit parameter is negative or past the string, then the
}
}
+ /**
+ * Sets the boundaries of the current code point directly, without iterating to it,
+ * and clears the direction field (dir = 0) just like the three-argument constructor.
+ * Lets one iterator object be reused for successive code points of the same string.
+ * @param cpStart Start index of the current code point.
+ * @param cpLimit Limit index of the current code point.
+ */
+ public void setCPStartAndLimit(int cpStart, int cpLimit) {
+ // Explicit this. qualification: the parameters no longer shadow the CharSequence field s.
+ this.cpStart = cpStart;
+ this.cpLimit = cpLimit;
+ dir = 0;
+ }
/**
* Returns the start of the code point that was last returned
* by nextCaseMapCP().
return result.toString();
}
- private static void internalToLower(int caseLocale, int options, StringContextIterator iter,
+ /** Case properties trie, fetched once from UCaseProps; used by the fast paths below. */
+ private static final Trie2_16 CASE_TRIE = UCaseProps.getTrie();
+
+ /**
+ * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
+ * caseLocale < 0: Case-folds [srcStart..srcLimit[.
+ *
+ * <p>Text that does not change is not appended character by character: it accumulates
+ * as a pending run starting at prev and is flushed with appendUnchanged() right before
+ * each changed piece and once more at the end.
+ *
+ * <p>Fast path: chars below LatinCase.LONG_S use a per-char delta table; other chars below
+ * U+D800 use CASE_TRIE as long as their properties carry no exception bit.
+ * Everything else goes through the slow path (toFullLower / toFullFolding).
+ *
+ * @param caseLocale UCaseProps.LOC_* value, or negative to request case folding.
+ * @param options Passed to toFullFolding(), appendUnchanged() and appendResult();
+ *     when case folding, also selects the Latin delta table.
+ * @param src Source text.
+ * @param srcStart Start index of the range to map.
+ * @param srcLimit Limit (exclusive) index of the range to map.
+ * @param iter Context iterator for toFullLower(); may be null, in which case one is
+ *     created on the first slow-path code point (only when caseLocale >= 0).
+ * @param dest Receives the output.
+ * @param edits Receives a 1:1 replace record for each fast-path change if not null;
+ *     also passed on to appendUnchanged() and appendResult().
+ * @throws IOException presumably propagated from dest - confirm against Appendable usage.
+ */
+ private static void internalToLower(int caseLocale, int options,
+ CharSequence src, int srcStart, int srcLimit, StringContextIterator iter,
Appendable dest, Edits edits) throws IOException {
- int c;
- while ((c = iter.nextCaseMapCP()) >= 0) {
- c = UCaseProps.INSTANCE.toFullLower(c, iter, dest, caseLocale);
- appendResult(c, dest, iter.getCPLength(), options, edits);
+ // Pick the Latin delta table: the normal one for the root locale, for any other
+ // non-negative locale except Turkish/Lithuanian, and for case folding with the
+ // default fold options; the tr/lt table in all remaining cases.
+ byte[] latinToLower;
+ if (caseLocale == UCaseProps.LOC_ROOT ||
+ (caseLocale >= 0 ?
+ !(caseLocale == UCaseProps.LOC_TURKISH || caseLocale == UCaseProps.LOC_LITHUANIAN) :
+ (options & UCaseProps.FOLD_CASE_OPTIONS_MASK) == UCharacter.FOLD_CASE_DEFAULT)) {
+ latinToLower = UCaseProps.LatinCase.TO_LOWER_NORMAL;
+ } else {
+ latinToLower = UCaseProps.LatinCase.TO_LOWER_TR_LT;
}
+ // prev: start of the pending run of unchanged text. srcIndex: current read position.
+ int prev = srcStart;
+ int srcIndex = srcStart;
+ outerLoop:
+ for (;;) {
+ // fast path for simple cases
+ char lead;
+ for (;;) {
+ if (srcIndex >= srcLimit) {
+ break outerLoop;
+ }
+ lead = src.charAt(srcIndex);
+ int delta;
+ if (lead < UCaseProps.LatinCase.LONG_S) {
+ byte d = latinToLower[lead];
+ // EXC: needs the slow path; srcIndex still points at lead.
+ if (d == UCaseProps.LatinCase.EXC) { break; }
+ ++srcIndex;
+ // Zero delta: the char stays in the pending unchanged run.
+ if (d == 0) { continue; }
+ delta = d;
+ } else if (lead >= 0xd800) {
+ break; // surrogate or higher
+ } else {
+ int props = CASE_TRIE.getFromU16SingleLead(lead);
+ if (UCaseProps.propsHasException(props)) { break; }
+ ++srcIndex;
+ // Only upper/title chars with a nonzero delta change; delta is assigned
+ // by the second operand, which runs only when the first one is false.
+ if (!UCaseProps.isUpperOrTitleFromProps(props) ||
+ (delta = UCaseProps.getDelta(props)) == 0) {
+ continue;
+ }
+ }
+ // Compound assignment narrows the int sum back to char.
+ lead += delta;
+ // srcIndex has already moved past lead, hence the "- 1".
+ appendUnchanged(src, prev, srcIndex - 1 - prev, dest, options, edits);
+ dest.append(lead);
+ if (edits != null) {
+ edits.addReplace(1, 1);
+ }
+ prev = srcIndex;
+ }
+ // slow path
+ int cpStart = srcIndex++;
+ char trail;
+ int c;
+ // Combine a well-formed surrogate pair within the range; otherwise map lead alone.
+ if (Character.isHighSurrogate(lead) && srcIndex < srcLimit &&
+ Character.isLowSurrogate(trail = src.charAt(srcIndex))) {
+ c = Character.toCodePoint(lead, trail);
+ ++srcIndex;
+ } else {
+ c = lead;
+ }
+ if (caseLocale >= 0) {
+ // Create the context iterator lazily, then reuse it for later code points.
+ if (iter == null) {
+ iter = new StringContextIterator(src, cpStart, srcIndex);
+ } else {
+ iter.setCPStartAndLimit(cpStart, srcIndex);
+ }
+ c = UCaseProps.INSTANCE.toFullLower(c, iter, dest, caseLocale);
+ } else {
+ c = UCaseProps.INSTANCE.toFullFolding(c, dest, options);
+ }
+ // A negative result appends nothing here: the code point stays in the pending run.
+ if (c >= 0) {
+ appendUnchanged(src, prev, cpStart - prev, dest, options, edits);
+ appendResult(c, dest, srcIndex - cpStart, options, edits);
+ prev = srcIndex;
+ }
+ }
+ // Flush the trailing run of unchanged text.
+ appendUnchanged(src, prev, srcIndex - prev, dest, options, edits);
+ }
+
+ /**
+ * Uppercases all of src into dest.
+ *
+ * <p>Same structure as internalToLower(): unchanged text accumulates as a pending run
+ * starting at prev and is flushed with appendUnchanged() before each change and at the end.
+ * Fast path: chars below LatinCase.LONG_S use a delta table (the tr/az table for
+ * UCaseProps.LOC_TURKISH, the normal one otherwise); other chars below U+D800 use
+ * CASE_TRIE when their properties carry no exception bit. Everything else goes
+ * through UCaseProps.INSTANCE.toFullUpper() with a lazily created context iterator.
+ *
+ * @param caseLocale UCaseProps.LOC_* value; selects the Latin table and is passed to toFullUpper().
+ * @param options Passed to appendUnchanged() and appendResult().
+ * @param src Source text; mapped from index 0 to src.length().
+ * @param dest Receives the output.
+ * @param edits Receives a 1:1 replace record for each fast-path change if not null;
+ *     also passed on to appendUnchanged() and appendResult().
+ * @throws IOException presumably propagated from dest - confirm against Appendable usage.
+ */
+ private static void internalToUpper(int caseLocale, int options,
+ CharSequence src, Appendable dest, Edits edits) throws IOException {
+ // Created on the first slow-path code point, then reused.
+ StringContextIterator iter = null;
+ byte[] latinToUpper;
+ if (caseLocale == UCaseProps.LOC_TURKISH) {
+ latinToUpper = UCaseProps.LatinCase.TO_UPPER_TR;
+ } else {
+ latinToUpper = UCaseProps.LatinCase.TO_UPPER_NORMAL;
+ }
+ // prev: start of the pending run of unchanged text. srcIndex: current read position.
+ int prev = 0;
+ int srcIndex = 0;
+ int srcLength = src.length();
+ outerLoop:
+ for (;;) {
+ // fast path for simple cases
+ char lead;
+ for (;;) {
+ if (srcIndex >= srcLength) {
+ break outerLoop;
+ }
+ lead = src.charAt(srcIndex);
+ int delta;
+ if (lead < UCaseProps.LatinCase.LONG_S) {
+ byte d = latinToUpper[lead];
+ // EXC: needs the slow path; srcIndex still points at lead.
+ if (d == UCaseProps.LatinCase.EXC) { break; }
+ ++srcIndex;
+ // Zero delta: the char stays in the pending unchanged run.
+ if (d == 0) { continue; }
+ delta = d;
+ } else if (lead >= 0xd800) {
+ break; // surrogate or higher
+ } else {
+ int props = CASE_TRIE.getFromU16SingleLead(lead);
+ if (UCaseProps.propsHasException(props)) { break; }
+ ++srcIndex;
+ // Only lowercase chars with a nonzero delta change; delta is assigned
+ // by the second operand, which runs only when the first one is false.
+ if (UCaseProps.getTypeFromProps(props) != UCaseProps.LOWER ||
+ (delta = UCaseProps.getDelta(props)) == 0) {
+ continue;
+ }
+ }
+ // Compound assignment narrows the int sum back to char.
+ lead += delta;
+ // srcIndex has already moved past lead, hence the "- 1".
+ appendUnchanged(src, prev, srcIndex - 1 - prev, dest, options, edits);
+ dest.append(lead);
+ if (edits != null) {
+ edits.addReplace(1, 1);
+ }
+ prev = srcIndex;
+ }
+ // slow path
+ int cpStart = srcIndex++;
+ char trail;
+ int c;
+ // Combine a well-formed surrogate pair; otherwise map lead alone.
+ if (Character.isHighSurrogate(lead) && srcIndex < srcLength &&
+ Character.isLowSurrogate(trail = src.charAt(srcIndex))) {
+ c = Character.toCodePoint(lead, trail);
+ ++srcIndex;
+ } else {
+ c = lead;
+ }
+ if (iter == null) {
+ iter = new StringContextIterator(src, cpStart, srcIndex);
+ } else {
+ iter.setCPStartAndLimit(cpStart, srcIndex);
+ }
+ c = UCaseProps.INSTANCE.toFullUpper(c, iter, dest, caseLocale);
+ // A negative result appends nothing here: the code point stays in the pending run.
+ if (c >= 0) {
+ appendUnchanged(src, prev, cpStart - prev, dest, options, edits);
+ appendResult(c, dest, srcIndex - cpStart, options, edits);
+ prev = srcIndex;
+ }
+ }
+ // Flush the trailing run of unchanged text.
+ appendUnchanged(src, prev, srcIndex - prev, dest, options, edits);
}
public static String toLower(int caseLocale, int options, CharSequence src) {
if (edits != null) {
edits.reset();
}
- StringContextIterator iter = new StringContextIterator(src);
- internalToLower(caseLocale, options, iter, dest, edits);
+ internalToLower(caseLocale, options, src, 0, src.length(), null, dest, edits);
return dest;
} catch (IOException e) {
throw new ICUUncheckedIOException(e);
if (caseLocale == UCaseProps.LOC_GREEK) {
return GreekUpper.toUpper(options, src, dest, edits);
}
- StringContextIterator iter = new StringContextIterator(src);
- int c;
- while ((c = iter.nextCaseMapCP()) >= 0) {
- c = UCaseProps.INSTANCE.toFullUpper(c, iter, dest, caseLocale);
- appendResult(c, dest, iter.getCPLength(), options, edits);
- }
+ internalToUpper(caseLocale, options, src, dest, edits);
return dest;
} catch (IOException e) {
throw new ICUUncheckedIOException(e);
if(titleLimit<index) {
if((options&UCharacter.TITLECASE_NO_LOWERCASE)==0) {
// Normal operation: Lowercase the rest of the word.
- internalToLower(caseLocale, options, iter, dest, edits);
+ internalToLower(caseLocale, options,
+ src, titleLimit, index, iter, dest, edits);
} else {
// Optionally just copy the rest of the word unchanged.
appendUnchanged(src, titleLimit, index-titleLimit, dest, options, edits);
- iter.moveToLimit();
}
+ iter.moveToLimit();
}
}
}
if (edits != null) {
edits.reset();
}
- int length = src.length();
- for (int i = 0; i < length;) {
- int c = Character.codePointAt(src, i);
- int cpLength = Character.charCount(c);
- i += cpLength;
- c = UCaseProps.INSTANCE.toFullFolding(c, dest, options);
- appendResult(c, dest, cpLength, options, edits);
- }
+ internalToLower(-1, options, src, 0, src.length(), null, dest, edits);
return dest;
} catch (IOException e) {
throw new ICUUncheckedIOException(e);
return props>>EXC_SHIFT;
}
- private static final boolean propsHasException(int props) {
+ /**
+ * Returns true if the EXCEPTION bit is set in the case properties word.
+ * Not private: the string case-mapping fast paths call it as
+ * UCaseProps.propsHasException(props) and leave the fast path when it returns true.
+ */
+ static final boolean propsHasException(int props) {
return (props&EXCEPTION)!=0;
}
public final int tolower(int c) {
int props=trie.get(c);
if(!propsHasException(props)) {
- if(getTypeFromProps(props)>=UPPER) {
+ if(isUpperOrTitleFromProps(props)) {
c+=getDelta(props);
}
} else {
public int next();
}
+ /**
+ * Fast case mapping data for ASCII/Latin.
+ * Linear arrays of delta bytes: 0=no mapping; EXC=exception.
+ * Deltas must not cross the ASCII boundary, or else they cannot be easily used
+ * in simple UTF-8 code.
+ *
+ * <p>Each table has one entry per code point U+0000..U+017F, indexed by the code point:
+ * 24 rows of 16 entries, grouped below in blocks of 64 code points.
+ * A caller adds the entry to the code point unless it is 0 or EXC.
+ */
+ static final class LatinCase {
+ /** Case mapping/folding data for code points up to U+017F. */
+ static final char LIMIT = 0x180;
+ /** U+017F case-folds and uppercases crossing the ASCII boundary. */
+ static final char LONG_S = 0x17f;
+ /** Exception: Complex mapping, or too-large delta. */
+ static final byte EXC = -0x80;
+
+ /** Deltas for lowercasing for most locales, and default case folding. */
+ static final byte[] TO_LOWER_NORMAL = {
+ // U+0000..U+003F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ // U+0040..U+007F
+ 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ // U+0080..U+00BF
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ // U+00C0..U+00FF
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ // U+0100..U+013F
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
+
+ // U+0140..U+017F; the -121 entry maps U+0178 to U+00FF
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
+ };
+
+ /** Deltas for lowercasing for tr/az/lt, and Turkic case folding. */
+ static final byte[] TO_LOWER_TR_LT = {
+ // U+0000..U+003F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ // U+0040..U+007F; unlike the normal table, U+0049 and U+004A are EXC
+ 0, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ // U+0080..U+00BF
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ // U+00C0..U+00FF; unlike the normal table, U+00CC and U+00CD are EXC
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ // U+0100..U+013F; unlike the normal table, U+0128 and U+012E are EXC
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, EXC, 0, 1, 0, 1, 0, EXC, 0,
+ EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
+
+ // U+0140..U+017F; the -121 entry maps U+0178 to U+00FF
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
+ };
+
+ /** Deltas for uppercasing for most locales. */
+ static final byte[] TO_UPPER_NORMAL = {
+ // U+0000..U+003F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ // U+0040..U+007F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
+ -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
+
+ // U+0080..U+00BF
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ // U+00C0..U+00FF; the 121 entry maps U+00FF to U+0178
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
+ -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
+ -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
+
+ // U+0100..U+013F
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
+ 0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
+
+ // U+0140..U+017F
+ -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
+ };
+
+ /** Deltas for uppercasing for tr/az. */
+ static final byte[] TO_UPPER_TR = {
+ // U+0000..U+003F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ // U+0040..U+007F; unlike the normal table, U+0069 is EXC
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, -32, -32, -32, -32, -32, -32, -32, -32, EXC, -32, -32, -32, -32, -32, -32,
+ -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
+
+ // U+0080..U+00BF
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ // U+00C0..U+00FF; the 121 entry maps U+00FF to U+0178
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
+ -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
+ -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
+
+ // U+0100..U+013F
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
+ 0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
+
+ // U+0140..U+017F
+ -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
+ };
+ }
+
/**
* For string case mappings, a single character (a code point) is mapped
* either to itself (in which case in-place mapping functions do nothing),
//ivate static final int LOC_UNKNOWN=0;
public static final int LOC_ROOT=1;
- private static final int LOC_TURKISH=2;
- private static final int LOC_LITHUANIAN=3;
+ static final int LOC_TURKISH=2;
+ static final int LOC_LITHUANIAN=3;
static final int LOC_GREEK=4;
public static final int LOC_DUTCH=5;
result=c;
props=trie.get(c);
if(!propsHasException(props)) {
- if(getTypeFromProps(props)>=UPPER) {
+ if(isUpperOrTitleFromProps(props)) {
result=c+getDelta(props);
}
} else {
*
* @internal
*/
- private static final int FOLD_CASE_OPTIONS_MASK = 7;
+ static final int FOLD_CASE_OPTIONS_MASK = 7;
/* return the simple case folding mapping for c */
public final int fold(int c, int options) {
int props=trie.get(c);
if(!propsHasException(props)) {
- if(getTypeFromProps(props)>=UPPER) {
+ if(isUpperOrTitleFromProps(props)) {
c+=getDelta(props);
}
} else {
result=c;
props=trie.get(c);
if(!propsHasException(props)) {
- if(getTypeFromProps(props)>=UPPER) {
+ if(isUpperOrTitleFromProps(props)) {
result=c+getDelta(props);
}
} else {
// definitions for 16-bit case properties word ------------------------- ***
+ /**
+ * Returns the case properties trie of the INSTANCE singleton, so that
+ * other classes in this package can look up properties words directly.
+ */
+ static Trie2_16 getTrie() {
+ return INSTANCE.trie;
+ }
+
/* 2-bit constants for types of cased characters */
public static final int TYPE_MASK=3;
public static final int NONE=0;
public static final int TITLE=3;
/** @return NONE, LOWER, UPPER, TITLE */
- private static final int getTypeFromProps(int props) {
+ static final int getTypeFromProps(int props) {
+ // The type is the low two bits of the properties word (TYPE_MASK is 3).
+ // Not private: the uppercasing fast path compares the result with UCaseProps.LOWER.
return props&TYPE_MASK;
}
return props&7;
}
+ /**
+ * Returns true if the bit with value 2 is set in the properties word.
+ * Within the two-bit type field (TYPE_MASK is 3) that bit is set for the values
+ * 2 and 3; TITLE is 3, and 2 is presumably UPPER (its definition is not visible
+ * here - confirm). Used in place of getTypeFromProps(props)>=UPPER.
+ */
+ static final boolean isUpperOrTitleFromProps(int props) {
+ return (props & 2) != 0;
+ }
+
static final int IGNORABLE=4;
private static final int SENSITIVE= 8;
private static final int EXCEPTION= 0x10;
//private static final int MAX_DELTA= 0xff;
//private static final int MIN_DELTA= (-MAX_DELTA-1);
- private static final int getDelta(int props) {
+ /**
+ * Extracts the signed delta from a properties word: props is narrowed to short
+ * first, then shifted right arithmetically by DELTA_SHIFT, so the sign is kept.
+ * Callers add the result to the code point (or char) being mapped.
+ * Not private: the string case-mapping fast paths call it as UCaseProps.getDelta(props).
+ */
+ static final int getDelta(int props) {
return (short)props>>DELTA_SHIFT;
}
}
}
else {
- if (!SPECIAL_DATA_[j + 1].equals(
- UCharacter.toLowerCase(str))) {
+ String lower = UCharacter.toLowerCase(str);
+ if (!SPECIAL_DATA_[j + 1].equals(lower)) {
errln("error lowercasing special characters " +
hex(str) + " expected " + SPECIAL_DATA_[j + 1] +
- " but got " +
- hex(UCharacter.toLowerCase(locale, str)));
+ " but got " + hex(lower));
}
- if (!SPECIAL_DATA_[j + 2].equals(
- UCharacter.toUpperCase(locale, str))) {
+ String upper = UCharacter.toUpperCase(str);
+ if (!SPECIAL_DATA_[j + 2].equals(upper)) {
errln("error uppercasing special characters " +
hex(str) + " expected " + SPECIAL_DATA_[j + 2] +
- " but got " +
- hex(UCharacter.toUpperCase(locale, str)));
+ " but got " + hex(upper));
}
}
}