From 5da94f206a93fab635b2355588d3e1d80f8b1294 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Thu, 9 Feb 2017 21:15:34 +0000 Subject: [PATCH] ICU-12410 move new code into new files, split ucasemap_imp.h from ustr_imp.h X-SVN-Rev: 39655 --- icu4c/source/common/Makefile.in | 1 + icu4c/source/common/common.vcxproj | 30 ++ icu4c/source/common/common.vcxproj.filters | 12 + icu4c/source/common/edits.cpp | 342 +++++++++++++++ icu4c/source/common/loclikely.cpp | 1 + icu4c/source/common/ucasemap.cpp | 1 + icu4c/source/common/ucasemap_imp.h | 236 ++++++++++ .../common/ucasemap_titlecase_brkiter.cpp | 2 +- icu4c/source/common/unicode/casemap.h | 193 ++++++++ icu4c/source/common/unicode/edits.h | 244 +++++++++++ icu4c/source/common/unicode/ucasemap.h | 411 +----------------- icu4c/source/common/unistr_case.cpp | 4 +- icu4c/source/common/unistr_case_locale.cpp | 2 +- .../common/unistr_titlecase_brkiter.cpp | 2 +- icu4c/source/common/ustr_imp.h | 242 ----------- .../source/common/ustr_titlecase_brkiter.cpp | 3 +- icu4c/source/common/ustrcase.cpp | 331 +------------- icu4c/source/common/ustrcase_locale.cpp | 3 +- icu4c/source/common/ustring.cpp | 1 + icu4c/source/i18n/measfmt.cpp | 1 + icu4c/source/i18n/reldatefmt.cpp | 1 + icu4c/source/i18n/smpdtfmt.cpp | 2 + icu4c/source/test/cintltst/cstrcase.c | 1 + 23 files changed, 1088 insertions(+), 978 deletions(-) create mode 100644 icu4c/source/common/edits.cpp create mode 100644 icu4c/source/common/ucasemap_imp.h create mode 100644 icu4c/source/common/unicode/casemap.h create mode 100644 icu4c/source/common/unicode/edits.h diff --git a/icu4c/source/common/Makefile.in b/icu4c/source/common/Makefile.in index 59ffb7377d7..10fa8de38eb 100644 --- a/icu4c/source/common/Makefile.in +++ b/icu4c/source/common/Makefile.in @@ -94,6 +94,7 @@ stringtriebuilder.o bytestriebuilder.o \ bytestrie.o bytestrieiterator.o \ ucharstrie.o ucharstriebuilder.o ucharstrieiterator.o \ dictionarydata.o \ +edits.o \ appendable.o ustr_cnv.o unistr_cnv.o 
unistr.o unistr_case.o unistr_props.o \ utf_impl.o ustring.o ustrcase.o ucasemap.o ucasemap_titlecase_brkiter.o cstring.o ustrfmt.o ustrtrns.o ustr_wcs.o utext.o \ unistr_case_locale.o ustrcase_locale.o unistr_titlecase_brkiter.o ustr_titlecase_brkiter.o \ diff --git a/icu4c/source/common/common.vcxproj b/icu4c/source/common/common.vcxproj index 952abc17fd5..ec2aeab5f75 100644 --- a/icu4c/source/common/common.vcxproj +++ b/icu4c/source/common/common.vcxproj @@ -449,6 +449,7 @@ + @@ -1511,6 +1512,20 @@ ..\..\include\unicode\%(Filename)%(Extension);%(Outputs) copy "%(FullPath)" ..\..\include\unicode + + ..\..\include\unicode\%(Filename)%(Extension);%(Outputs) + + + copy "%(FullPath)" ..\..\include\unicode + + ..\..\include\unicode\%(Filename)%(Extension);%(Outputs) + copy "%(FullPath)" ..\..\include\unicode + + ..\..\include\unicode\%(Filename)%(Extension);%(Outputs) + copy "%(FullPath)" ..\..\include\unicode + + ..\..\include\unicode\%(Filename)%(Extension);%(Outputs) + copy "%(FullPath)" ..\..\include\unicode ..\..\include\unicode\%(Filename)%(Extension);%(Outputs) @@ -1532,6 +1547,20 @@ + + copy "%(FullPath)" ..\..\include\unicode + + ..\..\include\unicode\%(Filename)%(Extension);%(Outputs) + copy "%(FullPath)" ..\..\include\unicode + + ..\..\include\unicode\%(Filename)%(Extension);%(Outputs) + copy "%(FullPath)" ..\..\include\unicode + + ..\..\include\unicode\%(Filename)%(Extension);%(Outputs) + copy "%(FullPath)" ..\..\include\unicode + + ..\..\include\unicode\%(Filename)%(Extension);%(Outputs) + copy "%(FullPath)" ..\..\include\unicode @@ -1616,6 +1645,7 @@ ..\..\include\unicode\%(Filename)%(Extension);%(Outputs) + copy "%(FullPath)" ..\..\include\unicode diff --git a/icu4c/source/common/common.vcxproj.filters b/icu4c/source/common/common.vcxproj.filters index 8cf40fb1c0e..21387cd7508 100644 --- a/icu4c/source/common/common.vcxproj.filters +++ b/icu4c/source/common/common.vcxproj.filters @@ -478,6 +478,9 @@ strings + + strings + strings @@ -870,6 +873,9 @@ 
strings + + strings + strings @@ -1096,9 +1102,15 @@ strings + + strings + strings + + strings + strings diff --git a/icu4c/source/common/edits.cpp b/icu4c/source/common/edits.cpp new file mode 100644 index 00000000000..7d216b5acb8 --- /dev/null +++ b/icu4c/source/common/edits.cpp @@ -0,0 +1,342 @@ +// Copyright (C) 2017 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +// edits.cpp +// created: 2017feb08 Markus W. Scherer + +#include "unicode/utypes.h" +#include "unicode/edits.h" +#include "cmemory.h" +#include "uassert.h" + +U_NAMESPACE_BEGIN + +namespace { + +// 0000uuuuuuuuuuuu records u+1 unchanged text units. +const int32_t MAX_UNCHANGED_LENGTH = 0x1000; +const int32_t MAX_UNCHANGED = MAX_UNCHANGED_LENGTH - 1; + +// 0wwwcccccccccccc with w=1..6 records ccc+1 replacements of w:w text units. +// No length change. +const int32_t MAX_SHORT_WIDTH = 6; +const int32_t MAX_SHORT_CHANGE_LENGTH = 0xfff; +const int32_t MAX_SHORT_CHANGE = 0x6fff; + +// 0111mmmmmmnnnnnn records a replacement of m text units with n. +// m or n = 61: actual length follows in the next edits array unit. +// m or n = 62..63: actual length follows in the next two edits array units. +// Bit 30 of the actual length is in the head unit. +// Trailing units have bit 15 set. +const int32_t LENGTH_IN_1TRAIL = 61; +const int32_t LENGTH_IN_2TRAIL = 62; + +} // namespace + +Edits::~Edits() { + if(array != stackArray) { + uprv_free(array); + } +} + +void Edits::reset() { + length = delta = 0; errorCode = U_ZERO_ERROR; // a reused object must not keep the old length delta or a sticky error +} + +void Edits::addUnchanged(int32_t unchangedLength) { + if(U_FAILURE(errorCode) || unchangedLength == 0) { return; } + if(unchangedLength < 0) { + errorCode = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + // Merge into previous unchanged-text record, if any. 
+ int32_t last = lastUnit(); + if(last < MAX_UNCHANGED) { + int32_t remaining = MAX_UNCHANGED - last; + if (remaining >= unchangedLength) { + setLastUnit(last + unchangedLength); + return; + } + setLastUnit(MAX_UNCHANGED); + unchangedLength -= remaining; + } + // Split large lengths into multiple units. + while(unchangedLength >= MAX_UNCHANGED_LENGTH) { + append(MAX_UNCHANGED); + unchangedLength -= MAX_UNCHANGED_LENGTH; + } + // Write a small (remaining) length. + if(unchangedLength > 0) { + append(unchangedLength - 1); + } +} + +void Edits::addReplace(int32_t oldLength, int32_t newLength) { + if(U_FAILURE(errorCode)) { return; } + if(oldLength == newLength && 0 < oldLength && oldLength <= MAX_SHORT_WIDTH) { + // Replacement of short oldLength text units by same-length new text. + // Merge into previous short-replacement record, if any. + int32_t last = lastUnit(); + if(MAX_UNCHANGED < last && last < MAX_SHORT_CHANGE && + (last >> 12) == oldLength && (last & 0xfff) < MAX_SHORT_CHANGE_LENGTH) { + setLastUnit(last + 1); + return; + } + append(oldLength << 12); + return; + } + + if(oldLength < 0 || newLength < 0) { + errorCode = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + if (oldLength == 0 && newLength == 0) { + return; + } + int32_t newDelta = newLength - oldLength; + if (newDelta != 0) { + if (newDelta > 0 ? newDelta > (INT32_MAX - delta) : newDelta < (INT32_MIN - delta)) { + // Integer overflow or underflow. 
+ errorCode = U_INDEX_OUTOFBOUNDS_ERROR; + return; + } + delta += newDelta; + } + + int32_t head = 0x7000; + if (oldLength < LENGTH_IN_1TRAIL && newLength < LENGTH_IN_1TRAIL) { + head |= oldLength << 6; + head |= newLength; + append(head); + } else if ((capacity - length) >= 5 || growArray()) { + int32_t limit = length + 1; + if(oldLength < LENGTH_IN_1TRAIL) { + head |= oldLength << 6; + } else if(oldLength <= 0x7fff) { + head |= LENGTH_IN_1TRAIL << 6; + array[limit++] = (uint16_t)(0x8000 | oldLength); + } else { + head |= (LENGTH_IN_2TRAIL + (oldLength >> 30)) << 6; + array[limit++] = (uint16_t)(0x8000 | (oldLength >> 15)); + array[limit++] = (uint16_t)(0x8000 | oldLength); + } + if(newLength < LENGTH_IN_1TRAIL) { + head |= newLength; + } else if(newLength <= 0x7fff) { + head |= LENGTH_IN_1TRAIL; + array[limit++] = (uint16_t)(0x8000 | newLength); + } else { + head |= LENGTH_IN_2TRAIL + (newLength >> 30); + array[limit++] = (uint16_t)(0x8000 | (newLength >> 15)); + array[limit++] = (uint16_t)(0x8000 | newLength); + } + array[length] = (uint16_t)head; + length = limit; + } +} + +void Edits::append(int32_t r) { + if(length < capacity || growArray()) { + array[length++] = (uint16_t)r; + } +} + +UBool Edits::growArray() { + int32_t newCapacity; + if (array == stackArray) { + newCapacity = 2000; + } else if (capacity == INT32_MAX) { + errorCode = U_BUFFER_OVERFLOW_ERROR; + return FALSE; + } else if (capacity >= (INT32_MAX / 2)) { + newCapacity = INT32_MAX; + } else { + newCapacity = 2 * capacity; + } + // Grow by at least 5 units so that a maximal change record will fit. 
+ if ((newCapacity - capacity) < 5) { + errorCode = U_BUFFER_OVERFLOW_ERROR; + return FALSE; + } + uint16_t *newArray = (uint16_t *)uprv_malloc((size_t)newCapacity * 2); + if (newArray == NULL) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return FALSE; + } + uprv_memcpy(newArray, array, (size_t)length * 2); + if (array != stackArray) { + uprv_free(array); + } + array = newArray; + capacity = newCapacity; + return TRUE; +} + +UBool Edits::copyErrorTo(UErrorCode &outErrorCode) { + if (U_FAILURE(outErrorCode)) { return TRUE; } + if (U_SUCCESS(errorCode)) { return FALSE; } + outErrorCode = errorCode; + return TRUE; +} + +UBool Edits::hasChanges() const { + if (delta != 0) { + return TRUE; + } + for (int32_t i = 0; i < length; ++i) { + if (array[i] > MAX_UNCHANGED) { + return TRUE; + } + } + return FALSE; +} + +Edits::Iterator::Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs) : + array(a), index(0), length(len), remaining(0), + onlyChanges(oc), coarse(crs), + changed(FALSE), oldLength_(0), newLength_(0), + srcIndex(0), replIndex(0), destIndex(0) {} + +int32_t Edits::Iterator::readLength(int32_t head) { + if (head < LENGTH_IN_1TRAIL) { + return head; + } else if (head < LENGTH_IN_2TRAIL) { + U_ASSERT(index < length); + U_ASSERT(array[index] >= 0x8000); + return array[index++] & 0x7fff; // strip the trail-unit marker (bit 15) that addReplace() sets + } else { + U_ASSERT((index + 2) <= length); + U_ASSERT(array[index] >= 0x8000); + U_ASSERT(array[index + 1] >= 0x8000); + int32_t len = ((head & 1) << 30) | + ((int32_t)(array[index] & 0x7fff) << 15) | + (array[index + 1] & 0x7fff); + index += 2; + return len; + } +} + +void Edits::Iterator::updateIndexes() { + srcIndex += oldLength_; + if (changed) { + replIndex += newLength_; + } + destIndex += newLength_; +} + +UBool Edits::Iterator::noNext() { + // Empty span beyond the string. 
+ oldLength_ = newLength_ = 0; + return FALSE; +} + +UBool Edits::Iterator::next(UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { return FALSE; } + // We have an errorCode in case we need to start guarding against integer overflows. + // It is also convenient for caller loops if we bail out when an error was set elsewhere. + updateIndexes(); + if (remaining > 0) { + // Fine-grained iterator: Continue a sequence of equal-length changes. + --remaining; + return TRUE; + } + if (index >= length) { + return noNext(); + } + int32_t u = array[index++]; + if (u <= MAX_UNCHANGED) { + // Combine adjacent unchanged ranges. + changed = FALSE; + oldLength_ = u + 1; + while (index < length && (u = array[index]) <= MAX_UNCHANGED) { + ++index; + oldLength_ += u + 1; + } + newLength_ = oldLength_; + if (onlyChanges) { + updateIndexes(); + if (index >= length) { + return noNext(); + } + // already fetched u > MAX_UNCHANGED at index + ++index; + } else { + return TRUE; + } + } + changed = TRUE; + if (u <= MAX_SHORT_CHANGE) { + if (coarse) { + int32_t w = u >> 12; + int32_t len = (u & 0xfff) + 1; + oldLength_ = newLength_ = len * w; + } else { + // Split a sequence of equal-length changes that was compressed into one unit. + oldLength_ = newLength_ = u >> 12; + remaining = u & 0xfff; + return TRUE; + } + } else { + U_ASSERT(u <= 0x7fff); + oldLength_ = readLength((u >> 6) & 0x3f); + newLength_ = readLength(u & 0x3f); + if (!coarse) { + return TRUE; + } + } + // Combine adjacent changes. 
+ while (index < length && (u = array[index]) > MAX_UNCHANGED) { + ++index; + if (u <= MAX_SHORT_CHANGE) { + int32_t w = u >> 12; + int32_t len = (u & 0xfff) + 1; + len = len * w; + oldLength_ += len; + newLength_ += len; + } else { + U_ASSERT(u <= 0x7fff); + int32_t oldLen = readLength((u >> 6) & 0x3f); + int32_t newLen = readLength(u & 0x3f); + oldLength_ += oldLen; + newLength_ += newLen; + } + } + return TRUE; +} + +UBool Edits::Iterator::findSourceIndex(int32_t i, UErrorCode &errorCode) { + if (U_FAILURE(errorCode) || i < 0) { return FALSE; } + if (i < srcIndex) { + // Reset the iterator to the start, including the span lengths that next() adds to the indexes. + index = remaining = oldLength_ = newLength_ = srcIndex = replIndex = destIndex = 0; + } else if (i < (srcIndex + oldLength_)) { + // The index is in the current span. + return TRUE; + } + while (next(errorCode)) { + if (i < (srcIndex + oldLength_)) { + // The index is in the current span. + return TRUE; + } + if (remaining > 0) { + // Is the index in one of the remaining compressed edits? + // srcIndex is the start of the current span, before the remaining ones. + int32_t len = (remaining + 1) * oldLength_; + if (i < (srcIndex + len)) { + int32_t n = (i - srcIndex) / oldLength_; // 1 <= n <= remaining + len = n * oldLength_; + srcIndex += len; + replIndex += len; + destIndex += len; + remaining -= n; + return TRUE; + } + // Make next() skip all of these edits at once. 
+ oldLength_ = newLength_ = len; + remaining = 0; + } + } + return FALSE; +} + +U_NAMESPACE_END diff --git a/icu4c/source/common/loclikely.cpp b/icu4c/source/common/loclikely.cpp index c13b37e1a1f..543c22e0d3c 100644 --- a/icu4c/source/common/loclikely.cpp +++ b/icu4c/source/common/loclikely.cpp @@ -22,6 +22,7 @@ #include "unicode/utypes.h" #include "unicode/locid.h" #include "unicode/putil.h" +#include "unicode/uchar.h" #include "unicode/uloc.h" #include "unicode/ures.h" #include "unicode/uscript.h" diff --git a/icu4c/source/common/ucasemap.cpp b/icu4c/source/common/ucasemap.cpp index 6a4a511ae35..901c50f03a5 100644 --- a/icu4c/source/common/ucasemap.cpp +++ b/icu4c/source/common/ucasemap.cpp @@ -33,6 +33,7 @@ #include "cmemory.h" #include "cstring.h" #include "ucase.h" +#include "ucasemap_imp.h" #include "ustr_imp.h" U_NAMESPACE_USE diff --git a/icu4c/source/common/ucasemap_imp.h b/icu4c/source/common/ucasemap_imp.h new file mode 100644 index 00000000000..8e287f7a17d --- /dev/null +++ b/icu4c/source/common/ucasemap_imp.h @@ -0,0 +1,236 @@ +// Copyright (C) 2017 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +// ucasemap_imp.h +// created: 2017feb08 Markus W. Scherer + +#ifndef __UCASEMAP_IMP_H__ +#define __UCASEMAP_IMP_H__ + +#include "unicode/utypes.h" +#include "unicode/ucasemap.h" +#include "ucase.h" + +#ifndef U_COMPARE_IGNORE_CASE +/* see also unorm.h */ +/** + * Option bit for unorm_compare: + * Perform case-insensitive comparison. + */ +#define U_COMPARE_IGNORE_CASE 0x10000 +#endif + +/** + * Internal API, used by u_strcasecmp() etc. + * Compare strings case-insensitively, + * in code point order or code unit order. + */ +U_CFUNC int32_t +u_strcmpFold(const UChar *s1, int32_t length1, + const UChar *s2, int32_t length2, + uint32_t options, + UErrorCode *pErrorCode); + +/** + * Internal API, used for detecting length of + * shared prefix case-insensitively. 
+ * @param s1 input string 1 + * @param length1 length of string 1, or -1 (NULL terminated) + * @param s2 input string 2 + * @param length2 length of string 2, or -1 (NULL terminated) + * @param options compare options + * @param matchLen1 (output) length of partial prefix match in s1 + * @param matchLen2 (output) length of partial prefix match in s2 + * @param pErrorCode receives error status + */ +U_CAPI void +u_caseInsensitivePrefixMatch(const UChar *s1, int32_t length1, + const UChar *s2, int32_t length2, + uint32_t options, + int32_t *matchLen1, int32_t *matchLen2, + UErrorCode *pErrorCode); + +/** + * Are the Unicode properties loaded? + * This must be used before internal functions are called that do + * not perform this check. + * Generate a debug assertion failure if data is not loaded. + */ +U_CFUNC UBool +uprv_haveProperties(UErrorCode *pErrorCode); + +#ifdef __cplusplus + +#include "unicode/unistr.h" // for UStringCaseMapper + +/* + * Internal string casing functions implementing + * ustring.h/ustrcase.cpp and UnicodeString case mapping functions. + */ + +struct UCaseMap : public icu::UMemory { + /** Implements most of ucasemap_open(). */ + UCaseMap(const char *localeID, uint32_t opts, UErrorCode *pErrorCode); + ~UCaseMap(); + +#if !UCONFIG_NO_BREAK_ITERATION + icu::BreakIterator *iter; /* We adopt the iterator, so we own it. 
*/ +#endif + char locale[32]; + int32_t caseLocale; + uint32_t options; +}; + +#if UCONFIG_NO_BREAK_ITERATION +# define UCASEMAP_BREAK_ITERATOR_PARAM +# define UCASEMAP_BREAK_ITERATOR_UNUSED +# define UCASEMAP_BREAK_ITERATOR +# define UCASEMAP_BREAK_ITERATOR_NULL +#else +# define UCASEMAP_BREAK_ITERATOR_PARAM icu::BreakIterator *iter, +# define UCASEMAP_BREAK_ITERATOR_UNUSED icu::BreakIterator *, +# define UCASEMAP_BREAK_ITERATOR iter, +# define UCASEMAP_BREAK_ITERATOR_NULL NULL, +#endif + +U_CFUNC int32_t +ustrcase_getCaseLocale(const char *locale); + +// TODO: swap src / dest if approved for new public api +/** Implements UStringCaseMapper. */ +U_CFUNC int32_t U_CALLCONV +ustrcase_internalToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM + UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + icu::Edits *edits, + UErrorCode &errorCode); + +/** Implements UStringCaseMapper. */ +U_CFUNC int32_t U_CALLCONV +ustrcase_internalToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM + UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + icu::Edits *edits, + UErrorCode &errorCode); + +#if !UCONFIG_NO_BREAK_ITERATION + +/** Implements UStringCaseMapper. */ +U_CFUNC int32_t U_CALLCONV +ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, + icu::BreakIterator *iter, + UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + icu::Edits *edits, + UErrorCode &errorCode); + +#endif + +/** Implements UStringCaseMapper. */ +U_CFUNC int32_t U_CALLCONV +ustrcase_internalFold(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM + UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + icu::Edits *edits, + UErrorCode &errorCode); + +/** + * Common string case mapping implementation for ucasemap_toXyz() and UnicodeString::toXyz(). + * Implements argument checking. 
+ */ +U_CFUNC int32_t +ustrcase_map(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM + UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + UStringCaseMapper *stringCaseMapper, + icu::Edits *edits, + UErrorCode &errorCode); + +/** + * Common string case mapping implementation for old-fashioned u_strToXyz() functions + * that allow the source string to overlap the destination buffer. + * Implements argument checking and internally works with an intermediate buffer if necessary. + */ +U_CFUNC int32_t +ustrcase_mapWithOverlap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM + UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + UStringCaseMapper *stringCaseMapper, + UErrorCode &errorCode); + +/** + * UTF-8 string case mapping function type, used by ucasemap_mapUTF8(). + * UTF-8 version of UStringCaseMapper. + * All error checking must be done. + * The UCaseMap must be fully initialized, with locale and/or iter set as needed. + * src and dest must not overlap. + */ +typedef int32_t U_CALLCONV +UTF8CaseMapper(int32_t caseLocale, uint32_t options, +#if !UCONFIG_NO_BREAK_ITERATION + icu::BreakIterator *iter, +#endif + uint8_t *dest, int32_t destCapacity, + const uint8_t *src, int32_t srcLength, + UErrorCode *pErrorCode); + +#if !UCONFIG_NO_BREAK_ITERATION + +/** Implements UTF8CaseMapper. */ +U_CFUNC int32_t U_CALLCONV +ucasemap_internalUTF8ToTitle(int32_t caseLocale, uint32_t options, + icu::BreakIterator *iter, + uint8_t *dest, int32_t destCapacity, + const uint8_t *src, int32_t srcLength, + UErrorCode *pErrorCode); + +#endif + +/** + * Implements argument checking and buffer handling + * for UTF-8 string case mapping as a common function. 
+ */ +U_CFUNC int32_t +ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM + uint8_t *dest, int32_t destCapacity, + const uint8_t *src, int32_t srcLength, + UTF8CaseMapper *stringCaseMapper, + UErrorCode *pErrorCode); + +U_NAMESPACE_BEGIN +namespace GreekUpper { + +// Data bits. +static const uint32_t UPPER_MASK = 0x3ff; +static const uint32_t HAS_VOWEL = 0x1000; +static const uint32_t HAS_YPOGEGRAMMENI = 0x2000; +static const uint32_t HAS_ACCENT = 0x4000; +static const uint32_t HAS_DIALYTIKA = 0x8000; +// Further bits during data building and processing, not stored in the data map. +static const uint32_t HAS_COMBINING_DIALYTIKA = 0x10000; +static const uint32_t HAS_OTHER_GREEK_DIACRITIC = 0x20000; + +static const uint32_t HAS_VOWEL_AND_ACCENT = HAS_VOWEL | HAS_ACCENT; +static const uint32_t HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA = + HAS_VOWEL_AND_ACCENT | HAS_DIALYTIKA; +static const uint32_t HAS_EITHER_DIALYTIKA = HAS_DIALYTIKA | HAS_COMBINING_DIALYTIKA; + +// State bits. +static const uint32_t AFTER_CASED = 1; +static const uint32_t AFTER_VOWEL_WITH_ACCENT = 2; + +uint32_t getLetterData(UChar32 c); + +/** + * Returns a non-zero value for each of the Greek combining diacritics + * listed in The Unicode Standard, version 8, chapter 7.2 Greek, + * plus some perispomeni look-alikes. 
+ */ +uint32_t getDiacriticData(UChar32 c); + +} // namespace GreekUpper +U_NAMESPACE_END + +#endif // __cplusplus + +#endif // __UCASEMAP_IMP_H__ diff --git a/icu4c/source/common/ucasemap_titlecase_brkiter.cpp b/icu4c/source/common/ucasemap_titlecase_brkiter.cpp index ca0d5463ca9..f580dc76266 100644 --- a/icu4c/source/common/ucasemap_titlecase_brkiter.cpp +++ b/icu4c/source/common/ucasemap_titlecase_brkiter.cpp @@ -26,7 +26,7 @@ #include "unicode/ucasemap.h" #include "cmemory.h" #include "ucase.h" -#include "ustr_imp.h" +#include "ucasemap_imp.h" U_NAMESPACE_USE diff --git a/icu4c/source/common/unicode/casemap.h b/icu4c/source/common/unicode/casemap.h new file mode 100644 index 00000000000..a65d9eb2543 --- /dev/null +++ b/icu4c/source/common/unicode/casemap.h @@ -0,0 +1,193 @@ +// Copyright (C) 2017 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +// casemap.h +// created: 2017jan12 Markus W. Scherer + +#ifndef __CASEMAP_H__ +#define __CASEMAP_H__ + +#include "unicode/utypes.h" +#include "unicode/uobject.h" + +/** + * \file + * \brief C++ API: Low-level C++ case mapping functions. + */ + +U_NAMESPACE_BEGIN + +#ifndef U_HIDE_DRAFT_API + +class BreakIterator; +class Edits; + +/** + * Low-level C++ case mapping functions. + * + * @draft ICU 59 + */ +class U_COMMON_API CaseMap final : public UMemory { +public: + /** + * Lowercases a UTF-16 string and optionally records edits. + * Casing is locale-dependent and context-sensitive. + * The result may be longer or shorter than the original. + * The source string and the destination buffer must not overlap. + * + * @param locale The locale ID. ("" = root locale, NULL = default locale.) + * @param options Options bit set, usually 0. See UCASEMAP_OMIT_UNCHANGED_TEXT. + * @param src The original string. + * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. + * @param dest A buffer for the result string. 
The result will be NUL-terminated if + * the buffer is large enough. + * The contents is undefined in case of failure. + * @param destCapacity The size of the buffer (number of UChars). If it is 0, then + * dest may be NULL and the function will only return the length of the result + * without writing any of the result string. + * @param edits Records edits for index mapping, working with styled text, + * and getting only changes (if any). + * This function calls edits->reset() first. edits can be NULL. + * @param errorCode Reference to an in/out error code value + * which must not indicate a failure before the function call. + * @return The length of the result string, if successful - or in case of a buffer overflow, + * in which case it will be greater than destCapacity. + * + * @see u_strToLower + * @draft ICU 59 + */ + static int32_t toLower( + const char *locale, uint32_t options, + const UChar *src, int32_t srcLength, + UChar *dest, int32_t destCapacity, Edits *edits, + UErrorCode &errorCode); + + /** + * Uppercases a UTF-16 string and optionally records edits. + * Casing is locale-dependent and context-sensitive. + * The result may be longer or shorter than the original. + * The source string and the destination buffer must not overlap. + * + * @param locale The locale ID. ("" = root locale, NULL = default locale.) + * @param options Options bit set, usually 0. See UCASEMAP_OMIT_UNCHANGED_TEXT. + * @param src The original string. + * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. + * @param dest A buffer for the result string. The result will be NUL-terminated if + * the buffer is large enough. + * The contents is undefined in case of failure. + * @param destCapacity The size of the buffer (number of UChars). If it is 0, then + * dest may be NULL and the function will only return the length of the result + * without writing any of the result string. 
+ * @param edits Records edits for index mapping, working with styled text, + * and getting only changes (if any). + * This function calls edits->reset() first. edits can be NULL. + * @param errorCode Reference to an in/out error code value + * which must not indicate a failure before the function call. + * @return The length of the result string, if successful - or in case of a buffer overflow, + * in which case it will be greater than destCapacity. + * + * @see u_strToUpper + * @draft ICU 59 + */ + static int32_t toUpper( + const char *locale, uint32_t options, + const UChar *src, int32_t srcLength, + UChar *dest, int32_t destCapacity, Edits *edits, + UErrorCode &errorCode); + +#if !UCONFIG_NO_BREAK_ITERATION + + /** + * Titlecases a UTF-16 string and optionally records edits. + * Casing is locale-dependent and context-sensitive. + * The result may be longer or shorter than the original. + * The source string and the destination buffer must not overlap. + * + * Titlecasing uses a break iterator to find the first characters of words + * that are to be titlecased. It titlecases those characters and lowercases + * all others. (This can be modified with options bits.) + * + * @param locale The locale ID. ("" = root locale, NULL = default locale.) + * @param options Options bit set, usually 0. See UCASEMAP_OMIT_UNCHANGED_TEXT, + * U_TITLECASE_NO_LOWERCASE, U_TITLECASE_NO_BREAK_ADJUSTMENT. + * @param iter A break iterator to find the first characters of words that are to be titlecased. + * It is set to the source string (setText()) + * and used one or more times for iteration (first() and next()). + * If NULL, then a word break iterator for the locale is used + * (or something equivalent). + * @param src The original string. + * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. + * @param dest A buffer for the result string. The result will be NUL-terminated if + * the buffer is large enough. 
+ * The contents is undefined in case of failure. + * @param destCapacity The size of the buffer (number of UChars). If it is 0, then + * dest may be NULL and the function will only return the length of the result + * without writing any of the result string. + * @param edits Records edits for index mapping, working with styled text, + * and getting only changes (if any). + * This function calls edits->reset() first. edits can be NULL. + * @param errorCode Reference to an in/out error code value + * which must not indicate a failure before the function call. + * @return The length of the result string, if successful - or in case of a buffer overflow, + * in which case it will be greater than destCapacity. + * + * @see u_strToTitle + * @see ucasemap_toTitle + * @draft ICU 59 + */ + static int32_t toTitle( + const char *locale, uint32_t options, BreakIterator *iter, + const UChar *src, int32_t srcLength, + UChar *dest, int32_t destCapacity, Edits *edits, + UErrorCode &errorCode); + +#endif // UCONFIG_NO_BREAK_ITERATION + + /** + * Case-folds a UTF-16 string and optionally records edits. + * + * Case-folding is locale-independent and not context-sensitive, + * but there is an option for whether to include or exclude mappings for dotted I + * and dotless i that are marked with 'T' in CaseFolding.txt. + * + * The result may be longer or shorter than the original. + * The source string and the destination buffer must not overlap. + * + * @param options Options bit set, usually 0. See UCASEMAP_OMIT_UNCHANGED_TEXT, + * U_FOLD_CASE_DEFAULT, U_FOLD_CASE_EXCLUDE_SPECIAL_I. + * @param src The original string. + * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. + * @param dest A buffer for the result string. The result will be NUL-terminated if + * the buffer is large enough. + * The contents is undefined in case of failure. + * @param destCapacity The size of the buffer (number of UChars). 
If it is 0, then + * dest may be NULL and the function will only return the length of the result + * without writing any of the result string. + * @param edits Records edits for index mapping, working with styled text, + * and getting only changes (if any). + * This function calls edits->reset() first. edits can be NULL. + * @param errorCode Reference to an in/out error code value + * which must not indicate a failure before the function call. + * @return The length of the result string, if successful - or in case of a buffer overflow, + * in which case it will be greater than destCapacity. + * + * @see u_strFoldCase + * @draft ICU 59 + */ + static int32_t foldCase( + uint32_t options, + const UChar *src, int32_t srcLength, + UChar *dest, int32_t destCapacity, Edits *edits, + UErrorCode &errorCode); + +private: + CaseMap() = delete; + CaseMap(const CaseMap &other) = delete; + CaseMap &operator=(const CaseMap &other) = delete; +}; + +#endif // U_HIDE_DRAFT_API + +U_NAMESPACE_END + +#endif // __CASEMAP_H__ diff --git a/icu4c/source/common/unicode/edits.h b/icu4c/source/common/unicode/edits.h new file mode 100644 index 00000000000..3a7acb1c72a --- /dev/null +++ b/icu4c/source/common/unicode/edits.h @@ -0,0 +1,244 @@ +// Copyright (C) 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +// edits.h +// created: 2016dec30 Markus W. Scherer + +#ifndef __EDITS_H__ +#define __EDITS_H__ + +#include "unicode/utypes.h" +#include "unicode/uobject.h" + +/** + * \file + * \brief C++ API: C++ class Edits for low-level string transformations on styled text. + */ + +U_NAMESPACE_BEGIN + +#ifndef U_HIDE_DRAFT_API + +/** + * Records lengths of string edits but not replacement text. + * Supports replacements, insertions, deletions in linear progression. + * Does not support moving/reordering of text. 
+ * + * An Edits object tracks a separate UErrorCode, but ICU string transformation functions + * (e.g., case mapping functions) merge any such errors into their API's UErrorCode. + * + * @draft ICU 59 + */ +class U_COMMON_API Edits final : public UMemory { +public: + /** + * Constructs an empty object. + * @draft ICU 59 + */ + Edits() : + array(stackArray), capacity(STACK_CAPACITY), length(0), delta(0), + errorCode(U_ZERO_ERROR) {} + /** + * Destructor. + * @draft ICU 59 + */ + ~Edits(); + + /** + * Resets the data but may not release memory. + * @draft ICU 59 + */ + void reset(); + + /** + * Adds a record for an unchanged segment of text. + * Normally called from inside ICU string transformation functions, not user code. + * @draft ICU 59 + */ + void addUnchanged(int32_t unchangedLength); + /** + * Adds a record for a text replacement/insertion/deletion. + * Normally called from inside ICU string transformation functions, not user code. + * @draft ICU 59 + */ + void addReplace(int32_t oldLength, int32_t newLength); + /** + * Sets the UErrorCode if an error occurred while recording edits. + * Preserves older error codes in the outErrorCode. + * Normally called from inside ICU string transformation functions, not user code. + * @return TRUE if U_FAILURE(outErrorCode) + * @draft ICU 59 + */ + UBool copyErrorTo(UErrorCode &outErrorCode); + + /** + * How much longer is the new text compared with the old text? + * @return new length minus old length + * @draft ICU 59 + */ + int32_t lengthDelta() const { return delta; } + /** + * @return TRUE if there are any change edits + * @draft ICU 59 + */ + UBool hasChanges() const; + + /** + * Access to the list of edits. + * @see getCoarseIterator + * @see getFineIterator + * @draft ICU 59 + */ + struct Iterator final : public UMemory { + /** + * Copy constructor. + * @draft ICU 59 + */ + Iterator(const Iterator &other) = default; + /** + * Assignment operator. 
+ * @draft ICU 59 + */ + Iterator &operator=(const Iterator &other) = default; + + /** + * Advances to the next edit. + * @return TRUE if there is another edit + * @draft ICU 59 + */ + UBool next(UErrorCode &errorCode); + + /** + * Finds the edit that contains the source index. + * The source index may be found in a non-change + * even if normal iteration would skip non-changes. + * Normal iteration can continue from a found edit. + * + * The iterator state before this search logically does not matter. + * (It may affect the performance of the search.) + * + * The iterator state after this search is undefined + * if the source index is out of bounds for the source string. + * + * @param i source index + * @return TRUE if the edit for the source index was found + * @draft ICU 59 + */ + UBool findSourceIndex(int32_t i, UErrorCode &errorCode); + + /** + * @return TRUE if this edit replaces oldLength() units with newLength() different ones. + * FALSE if oldLength units remain unchanged. + * @draft ICU 59 + */ + UBool hasChange() const { return changed; } + /** + * @return the number of units in the original string which are replaced or remain unchanged. + * @draft ICU 59 + */ + int32_t oldLength() const { return oldLength_; } + /** + * @return the number of units in the modified string, if hasChange() is TRUE. + * Same as oldLength if hasChange() is FALSE. 
+ * @draft ICU 59 + */ + int32_t newLength() const { return newLength_; } + + /** + * @return the current index into the source string + * @draft ICU 59 + */ + int32_t sourceIndex() const { return srcIndex; } + /** + * @return the current index into the replacement-characters-only string, + * not counting unchanged spans + * @draft ICU 59 + */ + int32_t replacementIndex() const { return replIndex; } + /** + * @return the current index into the full destination string + * @draft ICU 59 + */ + int32_t destinationIndex() const { return destIndex; } + + private: + friend class Edits; + + Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs); + + int32_t readLength(int32_t head); + void updateIndexes(); + UBool noNext(); + + const uint16_t *array; + int32_t index, length; + int32_t remaining; + UBool onlyChanges, coarse; + + UBool changed; + int32_t oldLength_, newLength_; + int32_t srcIndex, replIndex, destIndex; + }; + + /** + * Returns an Iterator for coarse-grained changes for simple string updates. + * Skips non-changes. + * @return an Iterator that merges adjacent changes. + * @draft ICU 59 + */ + Iterator getCoarseChangesIterator() const { + return Iterator(array, length, TRUE, TRUE); + } + + /** + * Returns an Iterator for coarse-grained changes and non-changes for simple string updates. + * @return an Iterator that merges adjacent changes. + * @draft ICU 59 + */ + Iterator getCoarseIterator() const { + return Iterator(array, length, FALSE, TRUE); + } + + /** + * Returns an Iterator for fine-grained changes for modifying styled text. + * Skips non-changes. + * @return an Iterator that separates adjacent changes. + * @draft ICU 59 + */ + Iterator getFineChangesIterator() const { + return Iterator(array, length, TRUE, FALSE); + } + + /** + * Returns an Iterator for fine-grained changes and non-changes for modifying styled text. + * @return an Iterator that separates adjacent changes. 
+ * @draft ICU 59 + */ + Iterator getFineIterator() const { + return Iterator(array, length, FALSE, FALSE); + } + +private: + Edits(const Edits &) = delete; + Edits &operator=(const Edits &) = delete; + + void setLastUnit(int32_t last) { array[length - 1] = (uint16_t)last; } + int32_t lastUnit() const { return length > 0 ? array[length - 1] : 0xffff; } + + void append(int32_t r); + UBool growArray(); + + static const int32_t STACK_CAPACITY = 100; + uint16_t *array; + int32_t capacity; + int32_t length; + int32_t delta; + UErrorCode errorCode; + uint16_t stackArray[STACK_CAPACITY]; +}; + +#endif // U_HIDE_DRAFT_API + +U_NAMESPACE_END + +#endif // __EDITS_H__ diff --git a/icu4c/source/common/unicode/ucasemap.h b/icu4c/source/common/unicode/ucasemap.h index ea3fea52370..c79c252eb39 100644 --- a/icu4c/source/common/unicode/ucasemap.h +++ b/icu4c/source/common/unicode/ucasemap.h @@ -23,11 +23,6 @@ #include "unicode/utypes.h" #include "unicode/localpointer.h" - -#if U_SHOW_CPLUSPLUS_API -#include "unicode/uobject.h" -#endif // U_SHOW_CPLUSPLUS_API - #include "unicode/ustring.h" /** @@ -88,8 +83,6 @@ ucasemap_close(UCaseMap *csm); U_NAMESPACE_BEGIN -class BreakIterator; - /** * \class LocalUCaseMapPointer * "Smart pointer" class, closes a UCaseMap via ucasemap_close(). @@ -101,401 +94,6 @@ class BreakIterator; */ U_DEFINE_LOCAL_OPEN_POINTER(LocalUCaseMapPointer, UCaseMap, ucasemap_close); -// TODO: move to new C++ unicode/casemap.h - -#ifndef U_HIDE_DRAFT_API - -/** - * Records lengths of string edits but not replacement text. - * Supports replacements, insertions, deletions in linear progression. - * Does not support moving/reordering of text. - * - * An Edits object tracks a separate UErrorCode, but ICU string transformation functions - * (e.g., case mapping functions) merge any such errors into their API's UErrorCode. - * - * @draft ICU 59 - */ -class U_COMMON_API Edits final : public UMemory { -public: - /** - * Constructs an empty object. 
- * @draft ICU 59 - */ - Edits() : - array(stackArray), capacity(STACK_CAPACITY), length(0), delta(0), - errorCode(U_ZERO_ERROR) {} - /** - * Destructor. - * @draft ICU 59 - */ - ~Edits(); - - /** - * Resets the data but may not release memory. - * @draft ICU 59 - */ - void reset(); - - /** - * Adds a record for an unchanged segment of text. - * Normally called from inside ICU string transformation functions, not user code. - * @draft ICU 59 - */ - void addUnchanged(int32_t unchangedLength); - /** - * Adds a record for a text replacement/insertion/deletion. - * Normally called from inside ICU string transformation functions, not user code. - * @draft ICU 59 - */ - void addReplace(int32_t oldLength, int32_t newLength); - /** - * Sets the UErrorCode if an error occurred while recording edits. - * Preserves older error codes in the outErrorCode. - * Normally called from inside ICU string transformation functions, not user code. - * @return TRUE if U_FAILURE(outErrorCode) - * @draft ICU 59 - */ - UBool copyErrorTo(UErrorCode &outErrorCode); - - /** - * How much longer is the new text compared with the old text? - * @return new length minus old length - * @draft ICU 59 - */ - int32_t lengthDelta() const { return delta; } - /** - * @return TRUE if there are any change edits - * @draft ICU 59 - */ - UBool hasChanges() const; - - /** - * Access to the list of edits. - * @see getCoarseIterator - * @see getFineIterator - * @draft ICU 59 - */ - struct Iterator final : public UMemory { - /** - * Copy constructor. - * @draft ICU 59 - */ - Iterator(const Iterator &other) = default; - /** - * Assignment operator. - * @draft ICU 59 - */ - Iterator &operator=(const Iterator &other) = default; - - /** - * Advances to the next edit. - * @return TRUE if there is another edit - * @draft ICU 59 - */ - UBool next(UErrorCode &errorCode); - - /** - * Finds the edit that contains the source index. 
- * The source index may be found in a non-change - * even if normal iteration would skip non-changes. - * Normal iteration can continue from a found edit. - * - * The iterator state before this search logically does not matter. - * (It may affect the performance of the search.) - * - * The iterator state after this search is undefined - * if the source index is out of bounds for the source string. - * - * @param i source index - * @return TRUE if the edit for the source index was found - * @draft ICU 59 - */ - UBool findSourceIndex(int32_t i, UErrorCode &errorCode); - - /** - * @return TRUE if this edit replaces oldLength() units with newLength() different ones. - * FALSE if oldLength units remain unchanged. - * @draft ICU 59 - */ - UBool hasChange() const { return changed; } - /** - * @return the number of units in the original string which are replaced or remain unchanged. - * @draft ICU 59 - */ - int32_t oldLength() const { return oldLength_; } - /** - * @return the number of units in the modified string, if hasChange() is TRUE. - * Same as oldLength if hasChange() is FALSE. 
- * @draft ICU 59 - */ - int32_t newLength() const { return newLength_; } - - /** - * @return the current index into the source string - * @draft ICU 59 - */ - int32_t sourceIndex() const { return srcIndex; } - /** - * @return the current index into the replacement-characters-only string, - * not counting unchanged spans - * @draft ICU 59 - */ - int32_t replacementIndex() const { return replIndex; } - /** - * @return the current index into the full destination string - * @draft ICU 59 - */ - int32_t destinationIndex() const { return destIndex; } - - private: - friend class Edits; - - Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs); - - int32_t readLength(int32_t head); - void updateIndexes(); - UBool noNext(); - - const uint16_t *array; - int32_t index, length; - int32_t remaining; - UBool onlyChanges, coarse; - - UBool changed; - int32_t oldLength_, newLength_; - int32_t srcIndex, replIndex, destIndex; - }; - - /** - * Returns an Iterator for coarse-grained changes for simple string updates. - * Skips non-changes. - * @return an Iterator that merges adjacent changes. - * @draft ICU 59 - */ - Iterator getCoarseChangesIterator() const { - return Iterator(array, length, TRUE, TRUE); - } - - /** - * Returns an Iterator for coarse-grained changes and non-changes for simple string updates. - * @return an Iterator that merges adjacent changes. - * @draft ICU 59 - */ - Iterator getCoarseIterator() const { - return Iterator(array, length, FALSE, TRUE); - } - - /** - * Returns an Iterator for fine-grained changes for modifying styled text. - * Skips non-changes. - * @return an Iterator that separates adjacent changes. - * @draft ICU 59 - */ - Iterator getFineChangesIterator() const { - return Iterator(array, length, TRUE, FALSE); - } - - /** - * Returns an Iterator for fine-grained changes and non-changes for modifying styled text. - * @return an Iterator that separates adjacent changes. 
- * @draft ICU 59 - */ - Iterator getFineIterator() const { - return Iterator(array, length, FALSE, FALSE); - } - -private: - Edits(const Edits &) = delete; - Edits &operator=(const Edits &) = delete; - - void setLastUnit(int32_t last) { array[length - 1] = (uint16_t)last; } - int32_t lastUnit() const { return length > 0 ? array[length - 1] : 0xffff; } - - void append(int32_t r); - UBool growArray(); - - static const int32_t STACK_CAPACITY = 100; - uint16_t *array; - int32_t capacity; - int32_t length; - int32_t delta; - UErrorCode errorCode; - uint16_t stackArray[STACK_CAPACITY]; -}; - -/** - * Low-level C++ case mapping functions. - * - * @draft ICU 59 - */ -class U_COMMON_API CaseMap final : public UMemory { -public: - /** - * Lowercases a UTF-16 string and optionally records edits. - * Casing is locale-dependent and context-sensitive. - * The result may be longer or shorter than the original. - * The source string and the destination buffer must not overlap. - * - * @param locale The locale ID. ("" = root locale, NULL = default locale.) - * @param options Options bit set, usually 0. See UCASEMAP_OMIT_UNCHANGED_TEXT. - * @param src The original string. - * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. - * @param dest A buffer for the result string. The result will be NUL-terminated if - * the buffer is large enough. - * The contents is undefined in case of failure. - * @param destCapacity The size of the buffer (number of bytes). If it is 0, then - * dest may be NULL and the function will only return the length of the result - * without writing any of the result string. - * @param edits Records edits for index mapping, working with styled text, - * and getting only changes (if any). - * This function calls edits->reset() first. edits can be NULL. - * @param errorCode Reference to an in/out error code value - * which must not indicate a failure before the function call. 
- * @return The length of the result string, if successful - or in case of a buffer overflow, - * in which case it will be greater than destCapacity. - * - * @see u_strToLower - * @draft ICU 59 - */ - static int32_t toLower( - const char *locale, uint32_t options, - const UChar *src, int32_t srcLength, - UChar *dest, int32_t destCapacity, Edits *edits, - UErrorCode &errorCode); - - /** - * Uppercases a UTF-16 string and optionally records edits. - * Casing is locale-dependent and context-sensitive. - * The result may be longer or shorter than the original. - * The source string and the destination buffer must not overlap. - * - * @param locale The locale ID. ("" = root locale, NULL = default locale.) - * @param options Options bit set, usually 0. See UCASEMAP_OMIT_UNCHANGED_TEXT. - * @param src The original string. - * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. - * @param dest A buffer for the result string. The result will be NUL-terminated if - * the buffer is large enough. - * The contents is undefined in case of failure. - * @param destCapacity The size of the buffer (number of bytes). If it is 0, then - * dest may be NULL and the function will only return the length of the result - * without writing any of the result string. - * @param edits Records edits for index mapping, working with styled text, - * and getting only changes (if any). - * This function calls edits->reset() first. edits can be NULL. - * @param errorCode Reference to an in/out error code value - * which must not indicate a failure before the function call. - * @return The length of the result string, if successful - or in case of a buffer overflow, - * in which case it will be greater than destCapacity. 
- * - * @see u_strToUpper - * @draft ICU 59 - */ - static int32_t toUpper( - const char *locale, uint32_t options, - const UChar *src, int32_t srcLength, - UChar *dest, int32_t destCapacity, Edits *edits, - UErrorCode &errorCode); - -#if !UCONFIG_NO_BREAK_ITERATION - - /** - * Titlecases a UTF-16 string and optionally records edits. - * Casing is locale-dependent and context-sensitive. - * The result may be longer or shorter than the original. - * The source string and the destination buffer must not overlap. - * - * Titlecasing uses a break iterator to find the first characters of words - * that are to be titlecased. It titlecases those characters and lowercases - * all others. (This can be modified with options bits.) - * - * @param locale The locale ID. ("" = root locale, NULL = default locale.) - * @param options Options bit set, usually 0. See UCASEMAP_OMIT_UNCHANGED_TEXT, - * U_TITLECASE_NO_LOWERCASE, U_TITLECASE_NO_BREAK_ADJUSTMENT. - * @param iter A break iterator to find the first characters of words that are to be titlecased. - * It is set to the source string (setText()) - * and used one or more times for iteration (first() and next()). - * If NULL, then a word break iterator for the locale is used - * (or something equivalent). - * @param src The original string. - * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. - * @param dest A buffer for the result string. The result will be NUL-terminated if - * the buffer is large enough. - * The contents is undefined in case of failure. - * @param destCapacity The size of the buffer (number of bytes). If it is 0, then - * dest may be NULL and the function will only return the length of the result - * without writing any of the result string. - * @param edits Records edits for index mapping, working with styled text, - * and getting only changes (if any). - * This function calls edits->reset() first. edits can be NULL. 
- * @param errorCode Reference to an in/out error code value - * which must not indicate a failure before the function call. - * @return The length of the result string, if successful - or in case of a buffer overflow, - * in which case it will be greater than destCapacity. - * - * @see u_strToTitle - * @see ucasemap_toTitle - * @draft ICU 59 - */ - static int32_t toTitle( - const char *locale, uint32_t options, BreakIterator *iter, - const UChar *src, int32_t srcLength, - UChar *dest, int32_t destCapacity, Edits *edits, - UErrorCode &errorCode); - -#endif // UCONFIG_NO_BREAK_ITERATION - - /** - * Case-folds a UTF-16 string and optionally records edits. - * - * Case-folding is locale-independent and not context-sensitive, - * but there is an option for whether to include or exclude mappings for dotted I - * and dotless i that are marked with 'T' in CaseFolding.txt. - * - * The result may be longer or shorter than the original. - * The source string and the destination buffer must not overlap. - * - * @param options Options bit set, usually 0. See UCASEMAP_OMIT_UNCHANGED_TEXT, - * U_FOLD_CASE_DEFAULT, U_FOLD_CASE_EXCLUDE_SPECIAL_I. - * @param src The original string. - * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. - * @param dest A buffer for the result string. The result will be NUL-terminated if - * the buffer is large enough. - * The contents is undefined in case of failure. - * @param destCapacity The size of the buffer (number of bytes). If it is 0, then - * dest may be NULL and the function will only return the length of the result - * without writing any of the result string. - * @param edits Records edits for index mapping, working with styled text, - * and getting only changes (if any). - * This function calls edits->reset() first. edits can be NULL. - * @param errorCode Reference to an in/out error code value - * which must not indicate a failure before the function call. 
- * @return The length of the result string, if successful - or in case of a buffer overflow, - * in which case it will be greater than destCapacity. - * - * @see u_strFoldCase - * @draft ICU 59 - */ - static int32_t foldCase( - uint32_t options, - const UChar *src, int32_t srcLength, - UChar *dest, int32_t destCapacity, Edits *edits, - UErrorCode &errorCode); - -private: - CaseMap() = delete; - CaseMap(const CaseMap &other) = delete; - CaseMap &operator=(const CaseMap &other) = delete; -}; - -/** - * Omit unchanged text when case-mapping with Edits. - * - * @draft ICU 59 - */ -#define UCASEMAP_OMIT_UNCHANGED_TEXT 0x4000 - -#endif // U_HIDE_DRAFT_API - U_NAMESPACE_END #endif @@ -587,6 +185,15 @@ ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode); */ #define U_TITLECASE_NO_BREAK_ADJUSTMENT 0x200 +/** + * Omit unchanged text when case-mapping with Edits. + * + * @see CaseMap + * @see Edits + * @draft ICU 59 + */ +#define UCASEMAP_OMIT_UNCHANGED_TEXT 0x4000 + #if !UCONFIG_NO_BREAK_ITERATION /** diff --git a/icu4c/source/common/unistr_case.cpp b/icu4c/source/common/unistr_case.cpp index 3b8d150b730..205a5b8f3df 100644 --- a/icu4c/source/common/unistr_case.cpp +++ b/icu4c/source/common/unistr_case.cpp @@ -19,6 +19,8 @@ */ #include "unicode/utypes.h" +#include "unicode/casemap.h" +#include "unicode/edits.h" #include "unicode/putil.h" #include "cstring.h" #include "cmemory.h" @@ -26,8 +28,8 @@ #include "unicode/unistr.h" #include "unicode/uchar.h" #include "uassert.h" +#include "ucasemap_imp.h" #include "uelement.h" -#include "ustr_imp.h" U_NAMESPACE_BEGIN diff --git a/icu4c/source/common/unistr_case_locale.cpp b/icu4c/source/common/unistr_case_locale.cpp index 46ada884137..9fa5f100dcc 100644 --- a/icu4c/source/common/unistr_case_locale.cpp +++ b/icu4c/source/common/unistr_case_locale.cpp @@ -21,7 +21,7 @@ #include "unicode/locid.h" #include "unicode/ucasemap.h" #include "unicode/unistr.h" -#include "ustr_imp.h" +#include "ucasemap_imp.h" 
U_NAMESPACE_BEGIN diff --git a/icu4c/source/common/unistr_titlecase_brkiter.cpp b/icu4c/source/common/unistr_titlecase_brkiter.cpp index d04233c3497..635eab8cd52 100644 --- a/icu4c/source/common/unistr_titlecase_brkiter.cpp +++ b/icu4c/source/common/unistr_titlecase_brkiter.cpp @@ -25,7 +25,7 @@ #include "unicode/locid.h" #include "unicode/ucasemap.h" #include "unicode/unistr.h" -#include "ustr_imp.h" +#include "ucasemap_imp.h" U_NAMESPACE_BEGIN diff --git a/icu4c/source/common/ustr_imp.h b/icu4c/source/common/ustr_imp.h index e4a6ab70660..d397f1ead98 100644 --- a/icu4c/source/common/ustr_imp.h +++ b/icu4c/source/common/ustr_imp.h @@ -18,24 +18,6 @@ #define __USTR_IMP_H__ #include "unicode/utypes.h" -#include "unicode/ucasemap.h" -#include "unicode/uiter.h" -#include "ucase.h" - -/** Simple declaration to avoid including unicode/ubrk.h. */ -#ifndef UBRK_TYPEDEF_UBREAK_ITERATOR -# define UBRK_TYPEDEF_UBREAK_ITERATOR - typedef struct UBreakIterator UBreakIterator; -#endif - -#ifndef U_COMPARE_IGNORE_CASE -/* see also unorm.h */ -/** - * Option bit for unorm_compare: - * Perform case-insensitive comparison. - */ -#define U_COMPARE_IGNORE_CASE 0x10000 -#endif /** * Internal option for unorm_cmpEquivFold() for strncmp style. @@ -54,230 +36,6 @@ uprv_strCompare(const UChar *s1, int32_t length1, const UChar *s2, int32_t length2, UBool strncmpStyle, UBool codePointOrder); -/** - * Internal API, used by u_strcasecmp() etc. - * Compare strings case-insensitively, - * in code point order or code unit order. - */ -U_CFUNC int32_t -u_strcmpFold(const UChar *s1, int32_t length1, - const UChar *s2, int32_t length2, - uint32_t options, - UErrorCode *pErrorCode); - -/** - * Interanl API, used for detecting length of - * shared prefix case-insensitively. 
- * @param s1 input string 1 - * @param length1 length of string 1, or -1 (NULL terminated) - * @param s2 input string 2 - * @param length2 length of string 2, or -1 (NULL terminated) - * @param options compare options - * @param matchLen1 (output) length of partial prefix match in s1 - * @param matchLen2 (output) length of partial prefix match in s2 - * @param pErrorCode receives error status - */ -U_CAPI void -u_caseInsensitivePrefixMatch(const UChar *s1, int32_t length1, - const UChar *s2, int32_t length2, - uint32_t options, - int32_t *matchLen1, int32_t *matchLen2, - UErrorCode *pErrorCode); - -/** - * Are the Unicode properties loaded? - * This must be used before internal functions are called that do - * not perform this check. - * Generate a debug assertion failure if data is not loaded. - */ -U_CFUNC UBool -uprv_haveProperties(UErrorCode *pErrorCode); - -/** - * Load the Unicode property data. - * Intended primarily for use from u_init(). - * Has no effect if property data is already loaded. - * NOT thread safe. - */ -/*U_CFUNC int8_t -uprv_loadPropsData(UErrorCode *errorCode);*/ - -#ifdef __cplusplus -// TODO: Consider moving these case mapping definitions -// into a new internal header like ucasemap_imp.h. - -#include "unicode/unistr.h" // for UStringCaseMapper - -/* - * Internal string casing functions implementing - * ustring.h/ustrcase.c and UnicodeString case mapping functions. - */ - -struct UCaseMap : public icu::UMemory { - /** Implements most of ucasemap_open(). */ - UCaseMap(const char *localeID, uint32_t opts, UErrorCode *pErrorCode); - ~UCaseMap(); - -#if !UCONFIG_NO_BREAK_ITERATION - icu::BreakIterator *iter; /* We adopt the iterator, so we own it. 
*/ -#endif - char locale[32]; - int32_t caseLocale; - uint32_t options; -}; - -#if UCONFIG_NO_BREAK_ITERATION -# define UCASEMAP_BREAK_ITERATOR_PARAM -# define UCASEMAP_BREAK_ITERATOR_UNUSED -# define UCASEMAP_BREAK_ITERATOR -# define UCASEMAP_BREAK_ITERATOR_NULL -#else -# define UCASEMAP_BREAK_ITERATOR_PARAM icu::BreakIterator *iter, -# define UCASEMAP_BREAK_ITERATOR_UNUSED icu::BreakIterator *, -# define UCASEMAP_BREAK_ITERATOR iter, -# define UCASEMAP_BREAK_ITERATOR_NULL NULL, -#endif - -U_CFUNC int32_t -ustrcase_getCaseLocale(const char *locale); - -// TODO: swap src / dest if approved for new public api -/** Implements UStringCaseMapper. */ -U_CFUNC int32_t U_CALLCONV -ustrcase_internalToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM - UChar *dest, int32_t destCapacity, - const UChar *src, int32_t srcLength, - icu::Edits *edits, - UErrorCode &errorCode); - -/** Implements UStringCaseMapper. */ -U_CFUNC int32_t U_CALLCONV -ustrcase_internalToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM - UChar *dest, int32_t destCapacity, - const UChar *src, int32_t srcLength, - icu::Edits *edits, - UErrorCode &errorCode); - -#if !UCONFIG_NO_BREAK_ITERATION - -/** Implements UStringCaseMapper. */ -U_CFUNC int32_t U_CALLCONV -ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, - icu::BreakIterator *iter, - UChar *dest, int32_t destCapacity, - const UChar *src, int32_t srcLength, - icu::Edits *edits, - UErrorCode &errorCode); - -#endif - -/** Implements UStringCaseMapper. */ -U_CFUNC int32_t U_CALLCONV -ustrcase_internalFold(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM - UChar *dest, int32_t destCapacity, - const UChar *src, int32_t srcLength, - icu::Edits *edits, - UErrorCode &errorCode); - -/** - * Common string case mapping implementation for ucasemap_toXyz() and UnicodeString::toXyz(). - * Implements argument checking. 
- */ -U_CFUNC int32_t -ustrcase_map(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM - UChar *dest, int32_t destCapacity, - const UChar *src, int32_t srcLength, - UStringCaseMapper *stringCaseMapper, - icu::Edits *edits, - UErrorCode &errorCode); - -/** - * Common string case mapping implementation for old-fashioned u_strToXyz() functions - * that allow the source string to overlap the destination buffer. - * Implements argument checking and internally works with an intermediate buffer if necessary. - */ -U_CFUNC int32_t -ustrcase_mapWithOverlap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM - UChar *dest, int32_t destCapacity, - const UChar *src, int32_t srcLength, - UStringCaseMapper *stringCaseMapper, - UErrorCode &errorCode); - -/** - * UTF-8 string case mapping function type, used by ucasemap_mapUTF8(). - * UTF-8 version of UStringCaseMapper. - * All error checking must be done. - * The UCaseMap must be fully initialized, with locale and/or iter set as needed. - * src and dest must not overlap. - */ -typedef int32_t U_CALLCONV -UTF8CaseMapper(int32_t caseLocale, uint32_t options, -#if !UCONFIG_NO_BREAK_ITERATION - icu::BreakIterator *iter, -#endif - uint8_t *dest, int32_t destCapacity, - const uint8_t *src, int32_t srcLength, - UErrorCode *pErrorCode); - -#if !UCONFIG_NO_BREAK_ITERATION - -/** Implements UTF8CaseMapper. */ -U_CFUNC int32_t U_CALLCONV -ucasemap_internalUTF8ToTitle(int32_t caseLocale, uint32_t options, - icu::BreakIterator *iter, - uint8_t *dest, int32_t destCapacity, - const uint8_t *src, int32_t srcLength, - UErrorCode *pErrorCode); - -#endif - -/** - * Implements argument checking and buffer handling - * for UTF-8 string case mapping as a common function. 
- */ -U_CFUNC int32_t -ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM - uint8_t *dest, int32_t destCapacity, - const uint8_t *src, int32_t srcLength, - UTF8CaseMapper *stringCaseMapper, - UErrorCode *pErrorCode); - -U_NAMESPACE_BEGIN -namespace GreekUpper { - -// Data bits. -static const uint32_t UPPER_MASK = 0x3ff; -static const uint32_t HAS_VOWEL = 0x1000; -static const uint32_t HAS_YPOGEGRAMMENI = 0x2000; -static const uint32_t HAS_ACCENT = 0x4000; -static const uint32_t HAS_DIALYTIKA = 0x8000; -// Further bits during data building and processing, not stored in the data map. -static const uint32_t HAS_COMBINING_DIALYTIKA = 0x10000; -static const uint32_t HAS_OTHER_GREEK_DIACRITIC = 0x20000; - -static const uint32_t HAS_VOWEL_AND_ACCENT = HAS_VOWEL | HAS_ACCENT; -static const uint32_t HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA = - HAS_VOWEL_AND_ACCENT | HAS_DIALYTIKA; -static const uint32_t HAS_EITHER_DIALYTIKA = HAS_DIALYTIKA | HAS_COMBINING_DIALYTIKA; - -// State bits. -static const uint32_t AFTER_CASED = 1; -static const uint32_t AFTER_VOWEL_WITH_ACCENT = 2; - -uint32_t getLetterData(UChar32 c); - -/** - * Returns a non-zero value for each of the Greek combining diacritics - * listed in The Unicode Standard, version 8, chapter 7.2 Greek, - * plus some perispomeni look-alikes. 
- */ -uint32_t getDiacriticData(UChar32 c); - -} // namespace GreekUpper -U_NAMESPACE_END - -#endif // __cplusplus - U_CAPI int32_t U_EXPORT2 ustr_hashUCharsN(const UChar *str, int32_t length); diff --git a/icu4c/source/common/ustr_titlecase_brkiter.cpp b/icu4c/source/common/ustr_titlecase_brkiter.cpp index 1072e43ce41..9642a0ddbf1 100644 --- a/icu4c/source/common/ustr_titlecase_brkiter.cpp +++ b/icu4c/source/common/ustr_titlecase_brkiter.cpp @@ -22,12 +22,13 @@ #if !UCONFIG_NO_BREAK_ITERATION #include "unicode/brkiter.h" +#include "unicode/casemap.h" #include "unicode/localpointer.h" #include "unicode/ubrk.h" #include "unicode/ucasemap.h" #include "cmemory.h" #include "ucase.h" -#include "ustr_imp.h" +#include "ucasemap_imp.h" U_NAMESPACE_USE diff --git a/icu4c/source/common/ustrcase.cpp b/icu4c/source/common/ustrcase.cpp index f08a1fae53f..cb95c3da295 100644 --- a/icu4c/source/common/ustrcase.cpp +++ b/icu4c/source/common/ustrcase.cpp @@ -22,6 +22,8 @@ #include "unicode/utypes.h" #include "unicode/brkiter.h" +#include "unicode/casemap.h" +#include "unicode/edits.h" #include "unicode/ustring.h" #include "unicode/ucasemap.h" #include "unicode/ubrk.h" @@ -29,6 +31,7 @@ #include "unicode/utf16.h" #include "cmemory.h" #include "ucase.h" +#include "ucasemap_imp.h" #include "ustr_imp.h" #include "uassert.h" @@ -36,334 +39,6 @@ U_NAMESPACE_BEGIN namespace { -// 0000uuuuuuuuuuuu records u+1 unchanged text units. -const int32_t MAX_UNCHANGED_LENGTH = 0x1000; -const int32_t MAX_UNCHANGED = MAX_UNCHANGED_LENGTH - 1; - -// 0wwwcccccccccccc with w=1..6 records ccc+1 replacements of w:w text units. -// No length change. -const int32_t MAX_SHORT_WIDTH = 6; -const int32_t MAX_SHORT_CHANGE_LENGTH = 0xfff; -const int32_t MAX_SHORT_CHANGE = 0x6fff; - -// 0111mmmmmmnnnnnn records a replacement of m text units with n. -// m or n = 61: actual length follows in the next edits array unit. -// m or n = 62..63: actual length follows in the next two edits array units. 
-// Bit 30 of the actual length is in the head unit. -// Trailing units have bit 15 set. -const int32_t LENGTH_IN_1TRAIL = 61; -const int32_t LENGTH_IN_2TRAIL = 62; - -} // namespace - -Edits::~Edits() { - if(array != stackArray) { - uprv_free(array); - } -} - -void Edits::reset() { - length = 0; -} - -void Edits::addUnchanged(int32_t unchangedLength) { - if(U_FAILURE(errorCode) || unchangedLength == 0) { return; } - if(unchangedLength < 0) { - errorCode = U_ILLEGAL_ARGUMENT_ERROR; - return; - } - // Merge into previous unchanged-text record, if any. - int32_t last = lastUnit(); - if(last < MAX_UNCHANGED) { - int32_t remaining = MAX_UNCHANGED - last; - if (remaining >= unchangedLength) { - setLastUnit(last + unchangedLength); - return; - } - setLastUnit(MAX_UNCHANGED); - unchangedLength -= remaining; - } - // Split large lengths into multiple units. - while(unchangedLength >= MAX_UNCHANGED_LENGTH) { - append(MAX_UNCHANGED); - unchangedLength -= MAX_UNCHANGED_LENGTH; - } - // Write a small (remaining) length. - if(unchangedLength > 0) { - append(unchangedLength - 1); - } -} - -void Edits::addReplace(int32_t oldLength, int32_t newLength) { - if(U_FAILURE(errorCode)) { return; } - if(oldLength == newLength && 0 < oldLength && oldLength <= MAX_SHORT_WIDTH) { - // Replacement of short oldLength text units by same-length new text. - // Merge into previous short-replacement record, if any. - int32_t last = lastUnit(); - if(MAX_UNCHANGED < last && last < MAX_SHORT_CHANGE && - (last >> 12) == oldLength && (last & 0xfff) < MAX_SHORT_CHANGE_LENGTH) { - setLastUnit(last + 1); - return; - } - append(oldLength << 12); - return; - } - - if(oldLength < 0 || newLength < 0) { - errorCode = U_ILLEGAL_ARGUMENT_ERROR; - return; - } - if (oldLength == 0 && newLength == 0) { - return; - } - int32_t newDelta = newLength - oldLength; - if (newDelta != 0) { - if (newDelta > 0 ? newDelta > (INT32_MAX - delta) : newDelta < (INT32_MIN - delta)) { - // Integer overflow or underflow. 
- errorCode = U_INDEX_OUTOFBOUNDS_ERROR; - return; - } - delta += newDelta; - } - - int32_t head = 0x7000; - if (oldLength < LENGTH_IN_1TRAIL && newLength < LENGTH_IN_1TRAIL) { - head |= oldLength << 6; - head |= newLength; - append(head); - } else if ((capacity - length) >= 5 || growArray()) { - int32_t limit = length + 1; - if(oldLength < LENGTH_IN_1TRAIL) { - head |= oldLength << 6; - } else if(oldLength <= 0x7fff) { - head |= LENGTH_IN_1TRAIL << 6; - array[limit++] = (uint16_t)(0x8000 | oldLength); - } else { - head |= (LENGTH_IN_2TRAIL + (oldLength >> 30)) << 6; - array[limit++] = (uint16_t)(0x8000 | (oldLength >> 15)); - array[limit++] = (uint16_t)(0x8000 | oldLength); - } - if(newLength < LENGTH_IN_1TRAIL) { - head |= newLength; - } else if(newLength <= 0x7fff) { - head |= LENGTH_IN_1TRAIL; - array[limit++] = (uint16_t)(0x8000 | newLength); - } else { - head |= LENGTH_IN_2TRAIL + (newLength >> 30); - array[limit++] = (uint16_t)(0x8000 | (newLength >> 15)); - array[limit++] = (uint16_t)(0x8000 | newLength); - } - array[length] = (uint16_t)head; - length = limit; - } -} - -void Edits::append(int32_t r) { - if(length < capacity || growArray()) { - array[length++] = (uint16_t)r; - } -} - -UBool Edits::growArray() { - int32_t newCapacity; - if (array == stackArray) { - newCapacity = 2000; - } else if (capacity == INT32_MAX) { - errorCode = U_BUFFER_OVERFLOW_ERROR; - return FALSE; - } else if (capacity >= (INT32_MAX / 2)) { - newCapacity = INT32_MAX; - } else { - newCapacity = 2 * capacity; - } - // Grow by at least 5 units so that a maximal change record will fit. 
- if ((newCapacity - capacity) < 5) { - errorCode = U_BUFFER_OVERFLOW_ERROR; - return FALSE; - } - uint16_t *newArray = (uint16_t *)uprv_malloc((size_t)newCapacity * 2); - if (newArray == NULL) { - errorCode = U_MEMORY_ALLOCATION_ERROR; - return FALSE; - } - uprv_memcpy(newArray, array, (size_t)length * 2); - if (array != stackArray) { - uprv_free(array); - } - array = newArray; - capacity = newCapacity; - return TRUE; -} - -UBool Edits::copyErrorTo(UErrorCode &outErrorCode) { - if (U_FAILURE(outErrorCode)) { return TRUE; } - if (U_SUCCESS(errorCode)) { return FALSE; } - outErrorCode = errorCode; - return TRUE; -} - -UBool Edits::hasChanges() const { - if (delta != 0) { - return TRUE; - } - for (int32_t i = 0; i < length; ++i) { - if (array[i] > MAX_UNCHANGED) { - return TRUE; - } - } - return FALSE; -} - -Edits::Iterator::Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs) : - array(a), index(0), length(len), remaining(0), - onlyChanges(oc), coarse(crs), - changed(FALSE), oldLength_(0), newLength_(0), - srcIndex(0), replIndex(0), destIndex(0) {} - -int32_t Edits::Iterator::readLength(int32_t head) { - if (head < LENGTH_IN_1TRAIL) { - return head; - } else if (head < LENGTH_IN_2TRAIL) { - U_ASSERT(index < length); - U_ASSERT(array[index] >= 0x8000); - return array[index++]; - } else { - U_ASSERT((index + 2) <= length); - U_ASSERT(array[index] >= 0x8000); - U_ASSERT(array[index + 1] >= 0x8000); - int32_t len = ((head & 1) << 30) | - ((int32_t)(array[index] & 0x7fff) << 15) | - (array[index + 1] & 0x7fff); - index += 2; - return len; - } -} - -void Edits::Iterator::updateIndexes() { - srcIndex += oldLength_; - if (changed) { - replIndex += newLength_; - } - destIndex += newLength_; -} - -UBool Edits::Iterator::noNext() { - // Empty span beyond the string. 
- oldLength_ = newLength_ = 0; - return FALSE; -} - -UBool Edits::Iterator::next(UErrorCode &errorCode) { - if (U_FAILURE(errorCode)) { return FALSE; } - // We have an errorCode in case we need to start guarding against integer overflows. - // It is also convenient for caller loops if we bail out when an error was set elsewhere. - updateIndexes(); - if (remaining > 0) { - // Fine-grained iterator: Continue a sequence of equal-length changes. - --remaining; - return TRUE; - } - if (index >= length) { - return noNext(); - } - int32_t u = array[index++]; - if (u <= MAX_UNCHANGED) { - // Combine adjacent unchanged ranges. - changed = FALSE; - oldLength_ = u + 1; - while (index < length && (u = array[index]) <= MAX_UNCHANGED) { - ++index; - oldLength_ += u + 1; - } - newLength_ = oldLength_; - if (onlyChanges) { - updateIndexes(); - if (index >= length) { - return noNext(); - } - // already fetched u > MAX_UNCHANGED at index - ++index; - } else { - return TRUE; - } - } - changed = TRUE; - if (u <= MAX_SHORT_CHANGE) { - if (coarse) { - int32_t w = u >> 12; - int32_t len = (u & 0xfff) + 1; - oldLength_ = newLength_ = len * w; - } else { - // Split a sequence of equal-length changes that was compressed into one unit. - oldLength_ = newLength_ = u >> 12; - remaining = u & 0xfff; - return TRUE; - } - } else { - U_ASSERT(u <= 0x7fff); - oldLength_ = readLength((u >> 6) & 0x3f); - newLength_ = readLength(u & 0x3f); - if (!coarse) { - return TRUE; - } - } - // Combine adjacent changes. 
- while (index < length && (u = array[index]) > MAX_UNCHANGED) { - ++index; - if (u <= MAX_SHORT_CHANGE) { - int32_t w = u >> 12; - int32_t len = (u & 0xfff) + 1; - len = len * w; - oldLength_ += len; - newLength_ += len; - } else { - U_ASSERT(u <= 0x7fff); - int32_t oldLen = readLength((u >> 6) & 0x3f); - int32_t newLen = readLength(u & 0x3f); - oldLength_ += oldLen; - newLength_ += newLen; - } - } - return TRUE; -} - -UBool Edits::Iterator::findSourceIndex(int32_t i, UErrorCode &errorCode) { - if (U_FAILURE(errorCode) || i < 0) { return FALSE; } - if (i < srcIndex) { - // Reset the iterator to the start. - index = remaining = srcIndex = replIndex = destIndex = 0; - } else if (i < (srcIndex + oldLength_)) { - // The index is in the current span. - return TRUE; - } - while (next(errorCode)) { - if (i < (srcIndex + oldLength_)) { - // The index is in the current span. - return TRUE; - } - if (remaining > 0) { - // Is the index in one of the remaining compressed edits? - // srcIndex is the start of the current span, before the remaining ones. - int32_t len = (remaining + 1) * oldLength_; - if (i < (srcIndex + len)) { - int32_t n = (i - srcIndex) / oldLength_; // 1 <= n <= remaining - len = n * oldLength_; - srcIndex += len; - replIndex += len; - destIndex += len; - remaining -= n; - return TRUE; - } - // Make next() skip all of these edits at once. 
- oldLength_ = newLength_ = len; - remaining = 0; - } - } - return FALSE; -} - -namespace { - int32_t checkOverflowAndEditsError(int32_t destIndex, int32_t destCapacity, Edits *edits, UErrorCode &errorCode) { if (U_SUCCESS(errorCode)) { diff --git a/icu4c/source/common/ustrcase_locale.cpp b/icu4c/source/common/ustrcase_locale.cpp index 74b3ab17dda..77a7af29c56 100644 --- a/icu4c/source/common/ustrcase_locale.cpp +++ b/icu4c/source/common/ustrcase_locale.cpp @@ -20,11 +20,12 @@ #include "unicode/utypes.h" #include "uassert.h" #include "unicode/brkiter.h" +#include "unicode/casemap.h" #include "unicode/ucasemap.h" #include "unicode/uloc.h" #include "unicode/ustring.h" #include "ucase.h" -#include "ustr_imp.h" +#include "ucasemap_imp.h" U_CFUNC int32_t ustrcase_getCaseLocale(const char *locale) { diff --git a/icu4c/source/common/ustring.cpp b/icu4c/source/common/ustring.cpp index aae7f5c36d3..77d9c787317 100644 --- a/icu4c/source/common/ustring.cpp +++ b/icu4c/source/common/ustring.cpp @@ -19,6 +19,7 @@ #include "unicode/utypes.h" #include "unicode/putil.h" +#include "unicode/uchar.h" #include "unicode/ustring.h" #include "unicode/utf16.h" #include "cstring.h" diff --git a/icu4c/source/i18n/measfmt.cpp b/icu4c/source/i18n/measfmt.cpp index b5a9156ec2c..4ed387b9e81 100644 --- a/icu4c/source/i18n/measfmt.cpp +++ b/icu4c/source/i18n/measfmt.cpp @@ -26,6 +26,7 @@ #include "unicode/decimfmt.h" #include "uresimp.h" #include "unicode/ures.h" +#include "unicode/ustring.h" #include "ureslocs.h" #include "cstring.h" #include "mutex.h" diff --git a/icu4c/source/i18n/reldatefmt.cpp b/icu4c/source/i18n/reldatefmt.cpp index dd4894e95e2..6d6cae12935 100644 --- a/icu4c/source/i18n/reldatefmt.cpp +++ b/icu4c/source/i18n/reldatefmt.cpp @@ -15,6 +15,7 @@ #if !UCONFIG_NO_FORMATTING && !UCONFIG_NO_BREAK_ITERATION #include "unicode/dtfmtsym.h" +#include "unicode/ucasemap.h" #include "unicode/ureldatefmt.h" #include "unicode/udisplaycontext.h" #include "unicode/unum.h" diff --git 
a/icu4c/source/i18n/smpdtfmt.cpp b/icu4c/source/i18n/smpdtfmt.cpp index 85cc162a11f..c260c7aadb1 100644 --- a/icu4c/source/i18n/smpdtfmt.cpp +++ b/icu4c/source/i18n/smpdtfmt.cpp @@ -48,6 +48,7 @@ #include "unicode/simpletz.h" #include "unicode/rbtz.h" #include "unicode/tzfmt.h" +#include "unicode/ucasemap.h" #include "unicode/utf16.h" #include "unicode/vtzone.h" #include "unicode/udisplaycontext.h" @@ -64,6 +65,7 @@ #include #include "smpdtfst.h" #include "sharednumberformat.h" +#include "ucasemap_imp.h" #include "ustr_imp.h" #include "charstr.h" #include "uvector.h" diff --git a/icu4c/source/test/cintltst/cstrcase.c b/icu4c/source/test/cintltst/cstrcase.c index e5cb74d1831..27007909035 100644 --- a/icu4c/source/test/cintltst/cstrcase.c +++ b/icu4c/source/test/cintltst/cstrcase.c @@ -27,6 +27,7 @@ #include "unicode/ucasemap.h" #include "cmemory.h" #include "cintltst.h" +#include "ucasemap_imp.h" #include "ustr_imp.h" /* test string case mapping functions --------------------------------------- */ -- 2.40.0