bytestrie.o bytestrieiterator.o \
ucharstrie.o ucharstriebuilder.o ucharstrieiterator.o \
dictionarydata.o \
+edits.o \
appendable.o ustr_cnv.o unistr_cnv.o unistr.o unistr_case.o unistr_props.o \
utf_impl.o ustring.o ustrcase.o ucasemap.o ucasemap_titlecase_brkiter.o cstring.o ustrfmt.o ustrtrns.o ustr_wcs.o utext.o \
unistr_case_locale.o ustrcase_locale.o unistr_titlecase_brkiter.o ustr_titlecase_brkiter.o \
<ClCompile Include="cstring.cpp" />
<ClCompile Include="cstr.cpp" />
<ClCompile Include="cwchar.cpp" />
+ <ClCompile Include="edits.cpp" />
<ClCompile Include="messagepattern.cpp" />
<ClCompile Include="schriter.cpp" />
<ClCompile Include="stringpiece.cpp" />
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">copy "%(FullPath)" ..\..\include\unicode
+</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
+ </CustomBuild>
+ <CustomBuild Include="unicode\casemap.h">
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">copy "%(FullPath)" ..\..\include\unicode
+</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">copy "%(FullPath)" ..\..\include\unicode
+</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">copy "%(FullPath)" ..\..\include\unicode
+</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">copy "%(FullPath)" ..\..\include\unicode
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
</CustomBuild>
<ClInclude Include="cstring.h" />
<ClInclude Include="cstr.h" />
<ClInclude Include="cwchar.h" />
+ <CustomBuild Include="unicode\edits.h">
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">copy "%(FullPath)" ..\..\include\unicode
+</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">copy "%(FullPath)" ..\..\include\unicode
+</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">copy "%(FullPath)" ..\..\include\unicode
+</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">copy "%(FullPath)" ..\..\include\unicode
+</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
+ </CustomBuild>
<CustomBuild Include="unicode\messagepattern.h">
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">copy "%(FullPath)" ..\..\include\unicode
</Command>
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
</CustomBuild>
+ <ClInclude Include="ucasemap_imp.h" />
<CustomBuild Include="unicode\ucharstrie.h">
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">copy "%(FullPath)" ..\..\include\unicode
</Command>
<ClCompile Include="cwchar.cpp">
<Filter>strings</Filter>
</ClCompile>
+ <ClCompile Include="edits.cpp">
+ <Filter>strings</Filter>
+ </ClCompile>
<ClCompile Include="schriter.cpp">
<Filter>strings</Filter>
</ClCompile>
<ClInclude Include="cwchar.h">
<Filter>strings</Filter>
</ClInclude>
+ <ClInclude Include="ucasemap_imp.h">
+ <Filter>strings</Filter>
+ </ClInclude>
<ClInclude Include="uinvchar.h">
<Filter>strings</Filter>
</ClInclude>
<CustomBuild Include="unicode\bytestream.h">
<Filter>strings</Filter>
</CustomBuild>
+ <CustomBuild Include="unicode\casemap.h">
+ <Filter>strings</Filter>
+ </CustomBuild>
<CustomBuild Include="unicode\chariter.h">
<Filter>strings</Filter>
</CustomBuild>
+ <CustomBuild Include="unicode\edits.h">
+ <Filter>strings</Filter>
+ </CustomBuild>
<CustomBuild Include="unicode\rep.h">
<Filter>strings</Filter>
</CustomBuild>
--- /dev/null
+// Copyright (C) 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+// edits.cpp
+// created: 2017feb08 Markus W. Scherer
+
+#include "unicode/utypes.h"
+#include "unicode/edits.h"
+#include "cmemory.h"
+#include "uassert.h"
+
+U_NAMESPACE_BEGIN
+
+namespace {
+
+// 0000uuuuuuuuuuuu records u+1 unchanged text units. (One 16-bit unit with the top four bits clear.)
+const int32_t MAX_UNCHANGED_LENGTH = 0x1000;  // most text units one unchanged record can cover (u+1 with u=0xfff)
+const int32_t MAX_UNCHANGED = MAX_UNCHANGED_LENGTH - 1;  // largest unit value that is an unchanged record (0x0fff)
+
+// 0wwwcccccccccccc with w=1..6 records ccc+1 replacements of w:w text units.
+// No length change.
+const int32_t MAX_SHORT_WIDTH = 6;  // largest per-replacement width w
+const int32_t MAX_SHORT_CHANGE_LENGTH = 0xfff;  // largest value of the 12-bit count field ccc
+const int32_t MAX_SHORT_CHANGE = 0x6fff;  // largest unit value that is a short change record (w=6, ccc=0xfff)
+
+// 0111mmmmmmnnnnnn records a replacement of m text units with n.
+// m or n = 61: actual length follows in the next edits array unit.
+// m or n = 62..63: actual length follows in the next two edits array units.
+// Bit 30 of the actual length is in the head unit.
+// Trailing units have bit 15 set.
+const int32_t LENGTH_IN_1TRAIL = 61;  // 6-bit field value: the length is stored in one trailing unit
+const int32_t LENGTH_IN_2TRAIL = 62;  // 6-bit field value 62 or 63: the length is stored in two trailing units
+
+} // namespace
+
+// Frees the edits array, except when array still points at stackArray.
+Edits::~Edits() {
+    if(array == stackArray) { return; }
+    uprv_free(array);
+}
+
+// Discards all recorded edits so that the object can be reused.
+// The array itself is kept (no memory is released here).
+// The accumulated length delta is cleared as well: addReplace() adds to it
+// and hasChanges()/lengthDelta() read it, so a stale delta would make an
+// emptied object still report changes.
+// NOTE(review): errorCode is left untouched, so an earlier failure stays
+// sticky across reset() -- confirm whether that is intended.
+void Edits::reset() {
+    length = 0;
+    delta = 0;
+}
+
+// Records a run of unchangedLength unchanged text units.
+// Does nothing after an earlier error or for a zero length;
+// a negative length sets U_ILLEGAL_ARGUMENT_ERROR.
+void Edits::addUnchanged(int32_t unchangedLength) {
+    if(U_FAILURE(errorCode)) { return; }
+    if(unchangedLength == 0) { return; }
+    if(unchangedLength < 0) {
+        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+    // Top up a preceding unchanged-text record first, if there is one with room.
+    const int32_t prev = lastUnit();
+    if(prev < MAX_UNCHANGED) {
+        const int32_t room = MAX_UNCHANGED - prev;
+        if(unchangedLength <= room) {
+            setLastUnit(prev + unchangedLength);
+            return;
+        }
+        setLastUnit(MAX_UNCHANGED);
+        unchangedLength -= room;
+    }
+    // Emit as many full-size records as the rest of the length needs.
+    for(; unchangedLength >= MAX_UNCHANGED_LENGTH; unchangedLength -= MAX_UNCHANGED_LENGTH) {
+        append(MAX_UNCHANGED);
+    }
+    // Whatever is left fits into one final record, which stores length-1.
+    if(unchangedLength > 0) {
+        append(unchangedLength - 1);
+    }
+}
+
+void Edits::addReplace(int32_t oldLength, int32_t newLength) {  // Records that oldLength text units were replaced by newLength units.
+ if(U_FAILURE(errorCode)) { return; }  // an earlier error is sticky: ignore further edits
+ if(oldLength == newLength && 0 < oldLength && oldLength <= MAX_SHORT_WIDTH) {
+ // Replacement of short oldLength text units by same-length new text.
+ // Merge into previous short-replacement record, if any.
+ int32_t last = lastUnit();
+ if(MAX_UNCHANGED < last && last < MAX_SHORT_CHANGE &&
+ (last >> 12) == oldLength && (last & 0xfff) < MAX_SHORT_CHANGE_LENGTH) {
+ setLastUnit(last + 1);  // same width and the count field is not full: count one more replacement
+ return;
+ }
+ append(oldLength << 12);  // new short record: width in bits 14..12, count field 0 (= one replacement)
+ return;
+ }
+
+ if(oldLength < 0 || newLength < 0) {
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ return;
+ }
+ if (oldLength == 0 && newLength == 0) {
+ return;  // nothing to record
+ }
+ int32_t newDelta = newLength - oldLength;
+ if (newDelta != 0) {
+ if (newDelta > 0 ? newDelta > (INT32_MAX - delta) : newDelta < (INT32_MIN - delta)) {
+ // Integer overflow or underflow: delta + newDelta would not fit into an int32_t.
+ errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
+ return;
+ }
+ delta += newDelta;
+ }
+
+ int32_t head = 0x7000;  // 0111 marker bits of a long change record
+ if (oldLength < LENGTH_IN_1TRAIL && newLength < LENGTH_IN_1TRAIL) {
+ head |= oldLength << 6;  // both lengths fit into the 6-bit fields of the head unit
+ head |= newLength;
+ append(head);
+ } else if ((capacity - length) >= 5 || growArray()) {  // need room for the head plus up to 2+2 trailing units
+ int32_t limit = length + 1;  // write position for trailing units, right after the head
+ if(oldLength < LENGTH_IN_1TRAIL) {
+ head |= oldLength << 6;
+ } else if(oldLength <= 0x7fff) {
+ head |= LENGTH_IN_1TRAIL << 6;
+ array[limit++] = (uint16_t)(0x8000 | oldLength);  // 15-bit length with the trail marker bit 15
+ } else {
+ head |= (LENGTH_IN_2TRAIL + (oldLength >> 30)) << 6;  // 62 or 63: the low bit carries bit 30 of the length
+ array[limit++] = (uint16_t)(0x8000 | (oldLength >> 15));  // bits 29..15 (the cast drops anything above)
+ array[limit++] = (uint16_t)(0x8000 | oldLength);  // bits 14..0
+ }
+ if(newLength < LENGTH_IN_1TRAIL) {
+ head |= newLength;
+ } else if(newLength <= 0x7fff) {
+ head |= LENGTH_IN_1TRAIL;
+ array[limit++] = (uint16_t)(0x8000 | newLength);  // 15-bit length with the trail marker bit 15
+ } else {
+ head |= LENGTH_IN_2TRAIL + (newLength >> 30);  // 62 or 63: the low bit carries bit 30 of the length
+ array[limit++] = (uint16_t)(0x8000 | (newLength >> 15));  // bits 29..15
+ array[limit++] = (uint16_t)(0x8000 | newLength);  // bits 14..0
+ }
+ array[length] = (uint16_t)head;  // the head is stored last, once both length fields are known
+ length = limit;
+ }
+}
+
+// Stores r as the next 16-bit unit, growing the array first when it is full.
+// The unit is dropped if growArray() fails.
+void Edits::append(int32_t r) {
+    if(length >= capacity && !growArray()) { return; }
+    array[length++] = (uint16_t)r;
+}
+
+// Replaces the edits array with a larger heap array and copies the units over.
+// Returns TRUE on success. On failure sets errorCode
+// (U_BUFFER_OVERFLOW_ERROR or U_MEMORY_ALLOCATION_ERROR), returns FALSE,
+// and leaves the current array in place.
+UBool Edits::growArray() {
+    const UBool onStack = array == stackArray;
+    int32_t grown;
+    if (onStack) {
+        // First move off the stack array: start with a fixed heap size.
+        grown = 2000;
+    } else {
+        if (capacity == INT32_MAX) {
+            errorCode = U_BUFFER_OVERFLOW_ERROR;
+            return FALSE;
+        }
+        // Double the capacity, clamped so that the doubling cannot overflow.
+        grown = capacity >= (INT32_MAX / 2) ? INT32_MAX : 2 * capacity;
+    }
+    // Grow by at least 5 units so that a maximal change record will fit.
+    if ((grown - capacity) < 5) {
+        errorCode = U_BUFFER_OVERFLOW_ERROR;
+        return FALSE;
+    }
+    uint16_t *bigger = (uint16_t *)uprv_malloc((size_t)grown * 2);
+    if (bigger == NULL) {
+        errorCode = U_MEMORY_ALLOCATION_ERROR;
+        return FALSE;
+    }
+    uprv_memcpy(bigger, array, (size_t)length * 2);
+    if (!onStack) {
+        uprv_free(array);
+    }
+    array = bigger;
+    capacity = grown;
+    return TRUE;
+}
+
+UBool Edits::copyErrorTo(UErrorCode &outErrorCode) {
+ if (U_FAILURE(outErrorCode)) { return TRUE; }
+ if (U_SUCCESS(errorCode)) { return FALSE; }
+ outErrorCode = errorCode;
+ return TRUE;
+}
+
+// Returns TRUE if the length delta is non-zero or if any recorded unit is
+// not an unchanged-text record (unit value above MAX_UNCHANGED).
+UBool Edits::hasChanges() const {
+    UBool found = delta != 0;
+    for (int32_t i = 0; !found && i < length; ++i) {
+        found = array[i] > MAX_UNCHANGED;
+    }
+    return found;
+}
+
+Edits::Iterator::Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs) :  // Starts an iterator at the beginning of the len units at a.
+ array(a), index(0), length(len), remaining(0),  // remaining: equal-length changes still to be reported from a compressed unit
+ onlyChanges(oc), coarse(crs),  // oc: skip unchanged spans in next(); crs: merge adjacent changes in next()
+ changed(FALSE), oldLength_(0), newLength_(0),  // no current span yet
+ srcIndex(0), replIndex(0), destIndex(0) {}  // all positions start at 0
+
+// Decodes one 6-bit length field of a long change record, as written by
+// Edits::addReplace().
+//   head < 61:        the field value is the length itself.
+//   head == 61:       the length is in the low 15 bits of the next unit.
+//   head == 62 or 63: the low bit of head is bit 30 of the length; the next
+//                     two units carry bits 29..15 and bits 14..0.
+// Advances index past every trailing unit that was read.
+int32_t Edits::Iterator::readLength(int32_t head) {
+    if (head < LENGTH_IN_1TRAIL) {
+        return head;
+    } else if (head < LENGTH_IN_2TRAIL) {
+        U_ASSERT(index < length);
+        U_ASSERT(array[index] >= 0x8000);
+        // Strip the trail-unit marker (bit 15) that addReplace() ORs in;
+        // without the mask the length would come back 0x8000 too large.
+        return array[index++] & 0x7fff;
+    } else {
+        U_ASSERT((index + 2) <= length);
+        U_ASSERT(array[index] >= 0x8000);
+        U_ASSERT(array[index + 1] >= 0x8000);
+        int32_t len = ((head & 1) << 30) |
+                ((int32_t)(array[index] & 0x7fff) << 15) |
+                (array[index + 1] & 0x7fff);
+        index += 2;
+        return len;
+    }
+}
+
+// Moves the source, replacement and destination positions past the current span.
+// The replacement position only advances for spans that are changes.
+void Edits::Iterator::updateIndexes() {
+    srcIndex += oldLength_;
+    destIndex += newLength_;
+    if (changed) {
+        replIndex += newLength_;
+    }
+}
+
+// Marks the end of iteration: the current span becomes an empty span
+// beyond the string. Always returns FALSE.
+UBool Edits::Iterator::noNext() {
+    oldLength_ = 0;
+    newLength_ = 0;
+    return FALSE;
+}
+
+UBool Edits::Iterator::next(UErrorCode &errorCode) {  // Advances to the next span; returns FALSE at the end or if errorCode already indicates a failure.
+ if (U_FAILURE(errorCode)) { return FALSE; }
+ // We have an errorCode in case we need to start guarding against integer overflows.
+ // It is also convenient for caller loops if we bail out when an error was set elsewhere.
+ updateIndexes();  // step the positions past the span that was current before this call
+ if (remaining > 0) {
+ // Fine-grained iterator: Continue a sequence of equal-length changes.
+ --remaining;
+ return TRUE;
+ }
+ if (index >= length) {
+ return noNext();
+ }
+ int32_t u = array[index++];
+ if (u <= MAX_UNCHANGED) {
+ // Combine adjacent unchanged ranges.
+ changed = FALSE;
+ oldLength_ = u + 1;  // an unchanged record stores its length minus 1
+ while (index < length && (u = array[index]) <= MAX_UNCHANGED) {
+ ++index;
+ oldLength_ += u + 1;
+ }
+ newLength_ = oldLength_;
+ if (onlyChanges) {
+ updateIndexes();  // skip over the unchanged span without reporting it
+ if (index >= length) {
+ return noNext();
+ }
+ // already fetched u > MAX_UNCHANGED at index
+ ++index;
+ } else {
+ return TRUE;
+ }
+ }
+ changed = TRUE;
+ if (u <= MAX_SHORT_CHANGE) {
+ if (coarse) {
+ int32_t w = u >> 12;  // width of each replacement
+ int32_t len = (u & 0xfff) + 1;  // number of replacements in this unit
+ oldLength_ = newLength_ = len * w;
+ } else {
+ // Split a sequence of equal-length changes that was compressed into one unit.
+ oldLength_ = newLength_ = u >> 12;
+ remaining = u & 0xfff;  // the count field is (number of replacements - 1): that many are left after this one
+ return TRUE;
+ }
+ } else {
+ U_ASSERT(u <= 0x7fff);
+ oldLength_ = readLength((u >> 6) & 0x3f);  // old length field first: its trailing units are stored first
+ newLength_ = readLength(u & 0x3f);
+ if (!coarse) {
+ return TRUE;
+ }
+ }
+ // Combine adjacent changes. (Reached only by the coarse iterator.)
+ while (index < length && (u = array[index]) > MAX_UNCHANGED) {
+ ++index;
+ if (u <= MAX_SHORT_CHANGE) {
+ int32_t w = u >> 12;
+ int32_t len = (u & 0xfff) + 1;
+ len = len * w;
+ oldLength_ += len;
+ newLength_ += len;
+ } else {
+ U_ASSERT(u <= 0x7fff);
+ int32_t oldLen = readLength((u >> 6) & 0x3f);
+ int32_t newLen = readLength(u & 0x3f);
+ oldLength_ += oldLen;
+ newLength_ += newLen;
+ }
+ }
+ return TRUE;
+}
+
+// Moves the iterator to the edit span that contains source index i.
+// Searches forward from the current span, or restarts from the beginning
+// if i lies before the current span.
+// Returns TRUE if a span containing i was found; FALSE if errorCode already
+// indicates a failure, i is negative, or the edits end before reaching i.
+// NOTE(review): with an only-changes iterator, next() steps over unchanged
+// spans, so an index inside unchanged text is matched against the following
+// change span -- confirm that this is the intended behavior.
+UBool Edits::Iterator::findSourceIndex(int32_t i, UErrorCode &errorCode) {
+    if (U_FAILURE(errorCode) || i < 0) { return FALSE; }
+    if (i < srcIndex) {
+        // Reset the iterator to the start.
+        // The current span lengths must be cleared as well: next() begins with
+        // updateIndexes(), which would otherwise add the stale lengths of the
+        // old span to the freshly zeroed indexes.
+        index = remaining = oldLength_ = newLength_ = srcIndex = replIndex = destIndex = 0;
+    } else if (i < (srcIndex + oldLength_)) {
+        // The index is in the current span.
+        return TRUE;
+    }
+    while (next(errorCode)) {
+        if (i < (srcIndex + oldLength_)) {
+            // The index is in the current span.
+            return TRUE;
+        }
+        if (remaining > 0) {
+            // Is the index in one of the remaining compressed edits?
+            // srcIndex is the start of the current span, before the remaining ones.
+            int32_t len = (remaining + 1) * oldLength_;
+            if (i < (srcIndex + len)) {
+                int32_t n = (i - srcIndex) / oldLength_;  // 1 <= n <= remaining
+                len = n * oldLength_;
+                // Equal-length changes: all three positions advance by the same amount.
+                srcIndex += len;
+                replIndex += len;
+                destIndex += len;
+                remaining -= n;
+                return TRUE;
+            }
+            // Make next() skip all of these edits at once.
+            oldLength_ = newLength_ = len;
+            remaining = 0;
+        }
+    }
+    return FALSE;
+}
+
+U_NAMESPACE_END
#include "unicode/utypes.h"
#include "unicode/locid.h"
#include "unicode/putil.h"
+#include "unicode/uchar.h"
#include "unicode/uloc.h"
#include "unicode/ures.h"
#include "unicode/uscript.h"
#include "cmemory.h"
#include "cstring.h"
#include "ucase.h"
+#include "ucasemap_imp.h"
#include "ustr_imp.h"
U_NAMESPACE_USE
--- /dev/null
+// Copyright (C) 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+// ucasemap_imp.h
+// created: 2017feb08 Markus W. Scherer
+
+#ifndef __UCASEMAP_IMP_H__
+#define __UCASEMAP_IMP_H__
+
+#include "unicode/utypes.h"
+#include "unicode/ucasemap.h"
+#include "ucase.h"
+
+#ifndef U_COMPARE_IGNORE_CASE
+/* see also unorm.h */
+/**
+ * Option bit for unorm_compare:
+ * Perform case-insensitive comparison.
+ */
+#define U_COMPARE_IGNORE_CASE 0x10000
+#endif
+
+/**
+ * Internal API, used by u_strcasecmp() etc.
+ * Compare strings case-insensitively,
+ * in code point order or code unit order.
+ */
+U_CFUNC int32_t
+u_strcmpFold(const UChar *s1, int32_t length1,
+ const UChar *s2, int32_t length2,
+ uint32_t options,
+ UErrorCode *pErrorCode);
+
+/**
+ * Internal API, used for detecting length of
+ * shared prefix case-insensitively.
+ * @param s1 input string 1
+ * @param length1 length of string 1, or -1 (NULL terminated)
+ * @param s2 input string 2
+ * @param length2 length of string 2, or -1 (NULL terminated)
+ * @param options compare options
+ * @param matchLen1 (output) length of partial prefix match in s1
+ * @param matchLen2 (output) length of partial prefix match in s2
+ * @param pErrorCode receives error status
+ */
+U_CAPI void
+u_caseInsensitivePrefixMatch(const UChar *s1, int32_t length1,
+ const UChar *s2, int32_t length2,
+ uint32_t options,
+ int32_t *matchLen1, int32_t *matchLen2,
+ UErrorCode *pErrorCode);
+
+/**
+ * Are the Unicode properties loaded?
+ * This must be used before internal functions are called that do
+ * not perform this check.
+ * Generate a debug assertion failure if data is not loaded.
+ */
+U_CFUNC UBool
+uprv_haveProperties(UErrorCode *pErrorCode);
+
+#ifdef __cplusplus
+
+#include "unicode/unistr.h" // for UStringCaseMapper
+
+/*
+ * Internal string casing functions implementing
+ * ustring.h/ustrcase.cpp and UnicodeString case mapping functions.
+ */
+
+struct UCaseMap : public icu::UMemory {
+ /** Implements most of ucasemap_open(). */
+ UCaseMap(const char *localeID, uint32_t opts, UErrorCode *pErrorCode);
+ ~UCaseMap();
+
+#if !UCONFIG_NO_BREAK_ITERATION
+ icu::BreakIterator *iter; /* We adopt the iterator, so we own it. */
+#endif
+ char locale[32];  // fixed-size locale ID buffer; presumably filled from localeID by the constructor -- TODO confirm in the implementation
+ int32_t caseLocale;  // NOTE(review): presumably the value that ustrcase_getCaseLocale() yields for this locale -- confirm
+ uint32_t options;  // options bit set; presumably initialized from opts -- TODO confirm
+};
+
+#if UCONFIG_NO_BREAK_ITERATION
+# define UCASEMAP_BREAK_ITERATOR_PARAM
+# define UCASEMAP_BREAK_ITERATOR_UNUSED
+# define UCASEMAP_BREAK_ITERATOR
+# define UCASEMAP_BREAK_ITERATOR_NULL
+#else
+# define UCASEMAP_BREAK_ITERATOR_PARAM icu::BreakIterator *iter,
+# define UCASEMAP_BREAK_ITERATOR_UNUSED icu::BreakIterator *,
+# define UCASEMAP_BREAK_ITERATOR iter,
+# define UCASEMAP_BREAK_ITERATOR_NULL NULL,
+#endif
+
+U_CFUNC int32_t
+ustrcase_getCaseLocale(const char *locale);
+
+// TODO: swap src / dest if approved for new public api
+/** Implements UStringCaseMapper. */
+U_CFUNC int32_t U_CALLCONV
+ustrcase_internalToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
+ UChar *dest, int32_t destCapacity,
+ const UChar *src, int32_t srcLength,
+ icu::Edits *edits,
+ UErrorCode &errorCode);
+
+/** Implements UStringCaseMapper. */
+U_CFUNC int32_t U_CALLCONV
+ustrcase_internalToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
+ UChar *dest, int32_t destCapacity,
+ const UChar *src, int32_t srcLength,
+ icu::Edits *edits,
+ UErrorCode &errorCode);
+
+#if !UCONFIG_NO_BREAK_ITERATION
+
+/** Implements UStringCaseMapper. */
+U_CFUNC int32_t U_CALLCONV
+ustrcase_internalToTitle(int32_t caseLocale, uint32_t options,
+ icu::BreakIterator *iter,
+ UChar *dest, int32_t destCapacity,
+ const UChar *src, int32_t srcLength,
+ icu::Edits *edits,
+ UErrorCode &errorCode);
+
+#endif
+
+/** Implements UStringCaseMapper. */
+U_CFUNC int32_t U_CALLCONV
+ustrcase_internalFold(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
+ UChar *dest, int32_t destCapacity,
+ const UChar *src, int32_t srcLength,
+ icu::Edits *edits,
+ UErrorCode &errorCode);
+
+/**
+ * Common string case mapping implementation for ucasemap_toXyz() and UnicodeString::toXyz().
+ * Implements argument checking.
+ */
+U_CFUNC int32_t
+ustrcase_map(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
+ UChar *dest, int32_t destCapacity,
+ const UChar *src, int32_t srcLength,
+ UStringCaseMapper *stringCaseMapper,
+ icu::Edits *edits,
+ UErrorCode &errorCode);
+
+/**
+ * Common string case mapping implementation for old-fashioned u_strToXyz() functions
+ * that allow the source string to overlap the destination buffer.
+ * Implements argument checking and internally works with an intermediate buffer if necessary.
+ */
+U_CFUNC int32_t
+ustrcase_mapWithOverlap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
+ UChar *dest, int32_t destCapacity,
+ const UChar *src, int32_t srcLength,
+ UStringCaseMapper *stringCaseMapper,
+ UErrorCode &errorCode);
+
+/**
+ * UTF-8 string case mapping function type, used by ucasemap_mapUTF8().
+ * UTF-8 version of UStringCaseMapper.
+ * All error checking must be done.
+ * The UCaseMap must be fully initialized, with locale and/or iter set as needed.
+ * src and dest must not overlap.
+ */
+typedef int32_t U_CALLCONV
+UTF8CaseMapper(int32_t caseLocale, uint32_t options,
+#if !UCONFIG_NO_BREAK_ITERATION
+ icu::BreakIterator *iter,
+#endif
+ uint8_t *dest, int32_t destCapacity,
+ const uint8_t *src, int32_t srcLength,
+ UErrorCode *pErrorCode);
+
+#if !UCONFIG_NO_BREAK_ITERATION
+
+/** Implements UTF8CaseMapper. */
+U_CFUNC int32_t U_CALLCONV
+ucasemap_internalUTF8ToTitle(int32_t caseLocale, uint32_t options,
+ icu::BreakIterator *iter,
+ uint8_t *dest, int32_t destCapacity,
+ const uint8_t *src, int32_t srcLength,
+ UErrorCode *pErrorCode);
+
+#endif
+
+/**
+ * Implements argument checking and buffer handling
+ * for UTF-8 string case mapping as a common function.
+ */
+U_CFUNC int32_t
+ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
+ uint8_t *dest, int32_t destCapacity,
+ const uint8_t *src, int32_t srcLength,
+ UTF8CaseMapper *stringCaseMapper,
+ UErrorCode *pErrorCode);
+
+U_NAMESPACE_BEGIN
+namespace GreekUpper {
+
+// Data bits.
+static const uint32_t UPPER_MASK = 0x3ff;  // low 10 bits of a data value
+static const uint32_t HAS_VOWEL = 0x1000;
+static const uint32_t HAS_YPOGEGRAMMENI = 0x2000;
+static const uint32_t HAS_ACCENT = 0x4000;
+static const uint32_t HAS_DIALYTIKA = 0x8000;
+// Further bits during data building and processing, not stored in the data map.
+static const uint32_t HAS_COMBINING_DIALYTIKA = 0x10000;
+static const uint32_t HAS_OTHER_GREEK_DIACRITIC = 0x20000;
+
+static const uint32_t HAS_VOWEL_AND_ACCENT = HAS_VOWEL | HAS_ACCENT;  // combined mask of the two flags
+static const uint32_t HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA =
+ HAS_VOWEL_AND_ACCENT | HAS_DIALYTIKA;
+static const uint32_t HAS_EITHER_DIALYTIKA = HAS_DIALYTIKA | HAS_COMBINING_DIALYTIKA;  // mask covering both dialytika flags
+
+// State bits.
+static const uint32_t AFTER_CASED = 1;
+static const uint32_t AFTER_VOWEL_WITH_ACCENT = 2;
+
+uint32_t getLetterData(UChar32 c);  // NOTE(review): presumably returns the data bits above for c -- confirm in the implementation
+
+/**
+ * Returns a non-zero value for each of the Greek combining diacritics
+ * listed in The Unicode Standard, version 8, chapter 7.2 Greek,
+ * plus some perispomeni look-alikes.
+ */
+uint32_t getDiacriticData(UChar32 c);
+
+} // namespace GreekUpper
+U_NAMESPACE_END
+
+#endif // __cplusplus
+
+#endif // __UCASEMAP_IMP_H__
#include "unicode/ucasemap.h"
#include "cmemory.h"
#include "ucase.h"
-#include "ustr_imp.h"
+#include "ucasemap_imp.h"
U_NAMESPACE_USE
--- /dev/null
+// Copyright (C) 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+// casemap.h
+// created: 2017jan12 Markus W. Scherer
+
+#ifndef __CASEMAP_H__
+#define __CASEMAP_H__
+
+#include "unicode/utypes.h"
+#include "unicode/uobject.h"
+
+/**
+ * \file
+ * \brief C++ API: Low-level C++ case mapping functions.
+ */
+
+U_NAMESPACE_BEGIN
+
+#ifndef U_HIDE_DRAFT_API
+
+class BreakIterator;
+class Edits;
+
+/**
+ * Low-level C++ case mapping functions.
+ *
+ * @draft ICU 59
+ */
+class U_COMMON_API CaseMap final : public UMemory {
+public:
+ /**
+ * Lowercases a UTF-16 string and optionally records edits.
+ * Casing is locale-dependent and context-sensitive.
+ * The result may be longer or shorter than the original.
+ * The source string and the destination buffer must not overlap.
+ *
+ * @param locale The locale ID. ("" = root locale, NULL = default locale.)
+ * @param options Options bit set, usually 0. See UCASEMAP_OMIT_UNCHANGED_TEXT.
+ * @param src The original string.
+ * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
+ * @param dest A buffer for the result string. The result will be NUL-terminated if
+ * the buffer is large enough.
+ * The contents is undefined in case of failure.
+ * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
+ * dest may be NULL and the function will only return the length of the result
+ * without writing any of the result string.
+ * @param edits Records edits for index mapping, working with styled text,
+ * and getting only changes (if any).
+ * This function calls edits->reset() first. edits can be NULL.
+ * @param errorCode Reference to an in/out error code value
+ * which must not indicate a failure before the function call.
+ * @return The length of the result string, if successful - or in case of a buffer overflow,
+ * in which case it will be greater than destCapacity.
+ *
+ * @see u_strToLower
+ * @draft ICU 59
+ */
+ static int32_t toLower(
+ const char *locale, uint32_t options,
+ const UChar *src, int32_t srcLength,
+ UChar *dest, int32_t destCapacity, Edits *edits,
+ UErrorCode &errorCode);
+
+ /**
+ * Uppercases a UTF-16 string and optionally records edits.
+ * Casing is locale-dependent and context-sensitive.
+ * The result may be longer or shorter than the original.
+ * The source string and the destination buffer must not overlap.
+ *
+ * @param locale The locale ID. ("" = root locale, NULL = default locale.)
+ * @param options Options bit set, usually 0. See UCASEMAP_OMIT_UNCHANGED_TEXT.
+ * @param src The original string.
+ * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
+ * @param dest A buffer for the result string. The result will be NUL-terminated if
+ * the buffer is large enough.
+ * The contents is undefined in case of failure.
+ * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
+ * dest may be NULL and the function will only return the length of the result
+ * without writing any of the result string.
+ * @param edits Records edits for index mapping, working with styled text,
+ * and getting only changes (if any).
+ * This function calls edits->reset() first. edits can be NULL.
+ * @param errorCode Reference to an in/out error code value
+ * which must not indicate a failure before the function call.
+ * @return The length of the result string, if successful - or in case of a buffer overflow,
+ * in which case it will be greater than destCapacity.
+ *
+ * @see u_strToUpper
+ * @draft ICU 59
+ */
+ static int32_t toUpper(
+ const char *locale, uint32_t options,
+ const UChar *src, int32_t srcLength,
+ UChar *dest, int32_t destCapacity, Edits *edits,
+ UErrorCode &errorCode);
+
+#if !UCONFIG_NO_BREAK_ITERATION
+
+ /**
+ * Titlecases a UTF-16 string and optionally records edits.
+ * Casing is locale-dependent and context-sensitive.
+ * The result may be longer or shorter than the original.
+ * The source string and the destination buffer must not overlap.
+ *
+ * Titlecasing uses a break iterator to find the first characters of words
+ * that are to be titlecased. It titlecases those characters and lowercases
+ * all others. (This can be modified with options bits.)
+ *
+ * @param locale The locale ID. ("" = root locale, NULL = default locale.)
+ * @param options Options bit set, usually 0. See UCASEMAP_OMIT_UNCHANGED_TEXT,
+ * U_TITLECASE_NO_LOWERCASE, U_TITLECASE_NO_BREAK_ADJUSTMENT.
+ * @param iter A break iterator to find the first characters of words that are to be titlecased.
+ * It is set to the source string (setText())
+ * and used one or more times for iteration (first() and next()).
+ * If NULL, then a word break iterator for the locale is used
+ * (or something equivalent).
+ * @param src The original string.
+ * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
+ * @param dest A buffer for the result string. The result will be NUL-terminated if
+ * the buffer is large enough.
+ * The contents is undefined in case of failure.
+ * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
+ * dest may be NULL and the function will only return the length of the result
+ * without writing any of the result string.
+ * @param edits Records edits for index mapping, working with styled text,
+ * and getting only changes (if any).
+ * This function calls edits->reset() first. edits can be NULL.
+ * @param errorCode Reference to an in/out error code value
+ * which must not indicate a failure before the function call.
+ * @return The length of the result string, if successful - or in case of a buffer overflow,
+ * in which case it will be greater than destCapacity.
+ *
+ * @see u_strToTitle
+ * @see ucasemap_toTitle
+ * @draft ICU 59
+ */
+ static int32_t toTitle(
+ const char *locale, uint32_t options, BreakIterator *iter,
+ const UChar *src, int32_t srcLength,
+ UChar *dest, int32_t destCapacity, Edits *edits,
+ UErrorCode &errorCode);
+
+#endif // UCONFIG_NO_BREAK_ITERATION
+
+ /**
+ * Case-folds a UTF-16 string and optionally records edits.
+ *
+ * Case-folding is locale-independent and not context-sensitive,
+ * but there is an option for whether to include or exclude mappings for dotted I
+ * and dotless i that are marked with 'T' in CaseFolding.txt.
+ *
+ * The result may be longer or shorter than the original.
+ * The source string and the destination buffer must not overlap.
+ *
+ * @param options Options bit set, usually 0. See UCASEMAP_OMIT_UNCHANGED_TEXT,
+ * U_FOLD_CASE_DEFAULT, U_FOLD_CASE_EXCLUDE_SPECIAL_I.
+ * @param src The original string.
+ * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
+ * @param dest A buffer for the result string. The result will be NUL-terminated if
+ * the buffer is large enough.
+ * The contents is undefined in case of failure.
+ * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
+ * dest may be NULL and the function will only return the length of the result
+ * without writing any of the result string.
+ * @param edits Records edits for index mapping, working with styled text,
+ * and getting only changes (if any).
+ * This function calls edits->reset() first. edits can be NULL.
+ * @param errorCode Reference to an in/out error code value
+ * which must not indicate a failure before the function call.
+ * @return The length of the result string, if successful - or in case of a buffer overflow,
+ * in which case it will be greater than destCapacity.
+ *
+ * @see u_strFoldCase
+ * @draft ICU 59
+ */
+ static int32_t foldCase(
+ uint32_t options,
+ const UChar *src, int32_t srcLength,
+ UChar *dest, int32_t destCapacity, Edits *edits,
+ UErrorCode &errorCode);
+
+private:
+ CaseMap() = delete;
+ CaseMap(const CaseMap &other) = delete;
+ CaseMap &operator=(const CaseMap &other) = delete;
+};
+
+#endif // U_HIDE_DRAFT_API
+
+U_NAMESPACE_END
+
+#endif // __CASEMAP_H__
--- /dev/null
+// Copyright (C) 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+// edits.h
+// created: 2016dec30 Markus W. Scherer
+
+#ifndef __EDITS_H__
+#define __EDITS_H__
+
+#include "unicode/utypes.h"
+#include "unicode/uobject.h"
+
+/**
+ * \file
+ * \brief C++ API: C++ class Edits for low-level string transformations on styled text.
+ */
+
+U_NAMESPACE_BEGIN
+
+#ifndef U_HIDE_DRAFT_API
+
+/**
+ * Records lengths of string edits but not replacement text.
+ * Supports replacements, insertions, deletions in linear progression.
+ * Does not support moving/reordering of text.
+ *
+ * An Edits object tracks a separate UErrorCode, but ICU string transformation functions
+ * (e.g., case mapping functions) merge any such errors into their API's UErrorCode.
+ *
+ * Edits objects cannot be copied or assigned (the copy constructor and the
+ * assignment operator are deleted). The recorded edits are read back through
+ * one of the get...Iterator() functions.
+ *
+ * @draft ICU 59
+ */
+class U_COMMON_API Edits final : public UMemory {
+public:
+ /**
+ * Constructs an empty object.
+ * The new object starts out with no edits, a length delta of 0 and no error,
+ * and uses its built-in fixed-size array for storage.
+ * @draft ICU 59
+ */
+ Edits() :
+ array(stackArray), capacity(STACK_CAPACITY), length(0), delta(0),
+ errorCode(U_ZERO_ERROR) {}
+ /**
+ * Destructor.
+ * @draft ICU 59
+ */
+ ~Edits();
+
+ /**
+ * Resets the data but may not release memory.
+ *
+ * NOTE(review): the reset() body that this change removes from ustrcase.cpp
+ * only sets the record length to 0; it does not clear the length delta or the
+ * stored error code. Confirm whether lengthDelta() and a previously recorded
+ * error are meant to survive a reset.
+ * @draft ICU 59
+ */
+ void reset();
+
+ /**
+ * Adds a record for an unchanged segment of text.
+ * Normally called from inside ICU string transformation functions, not user code.
+ * Does nothing if this object already recorded an error.
+ * @param unchangedLength number of unchanged text units.
+ * 0 is a no-op; a negative value sets this object's error code
+ * to U_ILLEGAL_ARGUMENT_ERROR.
+ * @draft ICU 59
+ */
+ void addUnchanged(int32_t unchangedLength);
+ /**
+ * Adds a record for a text replacement/insertion/deletion.
+ * Normally called from inside ICU string transformation functions, not user code.
+ * @param oldLength number of text units in the original string that are replaced
+ * @param newLength number of text units that replace them in the modified string
+ * @draft ICU 59
+ */
+ void addReplace(int32_t oldLength, int32_t newLength);
+ /**
+ * Sets the UErrorCode if an error occurred while recording edits.
+ * Preserves older error codes in the outErrorCode.
+ * Normally called from inside ICU string transformation functions, not user code.
+ * @param outErrorCode in/out error code; an error already stored in it is kept
+ * @return TRUE if U_FAILURE(outErrorCode)
+ * @draft ICU 59
+ */
+ UBool copyErrorTo(UErrorCode &outErrorCode);
+
+ /**
+ * How much longer is the new text compared with the old text?
+ * @return new length minus old length
+ * @draft ICU 59
+ */
+ int32_t lengthDelta() const { return delta; }
+ /**
+ * @return TRUE if there are any change edits
+ * @draft ICU 59
+ */
+ UBool hasChanges() const;
+
+ /**
+ * Access to the list of edits.
+ * Instances are created only by the Edits get...Iterator() functions
+ * (the constructor is private); they can be copied and assigned.
+ * @see getCoarseChangesIterator
+ * @see getCoarseIterator
+ * @see getFineChangesIterator
+ * @see getFineIterator
+ * @draft ICU 59
+ */
+ struct Iterator final : public UMemory {
+ /**
+ * Copy constructor.
+ * @draft ICU 59
+ */
+ Iterator(const Iterator &other) = default;
+ /**
+ * Assignment operator.
+ * @draft ICU 59
+ */
+ Iterator &operator=(const Iterator &other) = default;
+
+ /**
+ * Advances to the next edit.
+ * @param errorCode in/out ICU error code.
+ * NOTE(review): the implementation is not visible here; presumably this
+ * follows the usual ICU convention (returns immediately if errorCode
+ * already indicates a failure) - confirm against edits.cpp.
+ * @return TRUE if there is another edit
+ * @draft ICU 59
+ */
+ UBool next(UErrorCode &errorCode);
+
+ /**
+ * Finds the edit that contains the source index.
+ * The source index may be found in a non-change
+ * even if normal iteration would skip non-changes.
+ * Normal iteration can continue from a found edit.
+ *
+ * The iterator state before this search logically does not matter.
+ * (It may affect the performance of the search.)
+ *
+ * The iterator state after this search is undefined
+ * if the source index is out of bounds for the source string.
+ *
+ * @param i source index
+ * @param errorCode in/out ICU error code.
+ * NOTE(review): the implementation is not visible here; presumably this
+ * follows the usual ICU convention (returns immediately if errorCode
+ * already indicates a failure) - confirm against edits.cpp.
+ * @return TRUE if the edit for the source index was found
+ * @draft ICU 59
+ */
+ UBool findSourceIndex(int32_t i, UErrorCode &errorCode);
+
+ /**
+ * @return TRUE if this edit replaces oldLength() units with newLength() different ones.
+ * FALSE if oldLength units remain unchanged.
+ * @draft ICU 59
+ */
+ UBool hasChange() const { return changed; }
+ /**
+ * @return the number of units in the original string which are replaced or remain unchanged.
+ * @draft ICU 59
+ */
+ int32_t oldLength() const { return oldLength_; }
+ /**
+ * @return the number of units in the modified string, if hasChange() is TRUE.
+ * Same as oldLength if hasChange() is FALSE.
+ * @draft ICU 59
+ */
+ int32_t newLength() const { return newLength_; }
+
+ /**
+ * @return the current index into the source string
+ * @draft ICU 59
+ */
+ int32_t sourceIndex() const { return srcIndex; }
+ /**
+ * @return the current index into the replacement-characters-only string,
+ * not counting unchanged spans
+ * @draft ICU 59
+ */
+ int32_t replacementIndex() const { return replIndex; }
+ /**
+ * @return the current index into the full destination string
+ * @draft ICU 59
+ */
+ int32_t destinationIndex() const { return destIndex; }
+
+ private:
+ friend class Edits;
+
+ Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs);  // a/len: the Edits array and its length; oc: skip non-changes; crs: merge adjacent changes (see the Edits get...Iterator() callers)
+
+ int32_t readLength(int32_t head);
+ void updateIndexes();
+ UBool noNext();
+
+ const uint16_t *array;
+ int32_t index, length;
+ int32_t remaining;
+ UBool onlyChanges, coarse;
+
+ UBool changed;  // returned by hasChange()
+ int32_t oldLength_, newLength_;  // returned by oldLength() / newLength()
+ int32_t srcIndex, replIndex, destIndex;  // returned by sourceIndex() / replacementIndex() / destinationIndex()
+ };
+
+ /**
+ * Returns an Iterator for coarse-grained changes for simple string updates.
+ * Skips non-changes.
+ * @return an Iterator that merges adjacent changes.
+ * @draft ICU 59
+ */
+ Iterator getCoarseChangesIterator() const {
+ return Iterator(array, length, TRUE, TRUE);
+ }
+
+ /**
+ * Returns an Iterator for coarse-grained changes and non-changes for simple string updates.
+ * @return an Iterator that merges adjacent changes.
+ * @draft ICU 59
+ */
+ Iterator getCoarseIterator() const {
+ return Iterator(array, length, FALSE, TRUE);
+ }
+
+ /**
+ * Returns an Iterator for fine-grained changes for modifying styled text.
+ * Skips non-changes.
+ * @return an Iterator that separates adjacent changes.
+ * @draft ICU 59
+ */
+ Iterator getFineChangesIterator() const {
+ return Iterator(array, length, TRUE, FALSE);
+ }
+
+ /**
+ * Returns an Iterator for fine-grained changes and non-changes for modifying styled text.
+ * @return an Iterator that separates adjacent changes.
+ * @draft ICU 59
+ */
+ Iterator getFineIterator() const {
+ return Iterator(array, length, FALSE, FALSE);
+ }
+
+private:
+ Edits(const Edits &) = delete;
+ Edits &operator=(const Edits &) = delete;
+
+ void setLastUnit(int32_t last) { array[length - 1] = (uint16_t)last; }  // overwrites the most recent unit; indexes array[length - 1], so length must be > 0
+ int32_t lastUnit() const { return length > 0 ? array[length - 1] : 0xffff; }  // 0xffff when empty, which fails the `last < MAX_UNCHANGED` merge test in addUnchanged()
+
+ void append(int32_t r);
+ UBool growArray();
+
+ static const int32_t STACK_CAPACITY = 100;
+ uint16_t *array;  // initially points at stackArray (see constructor); freed by the destructor only if it no longer does
+ int32_t capacity;  // initially STACK_CAPACITY
+ int32_t length;  // number of units currently used in array
+ int32_t delta;  // returned by lengthDelta()
+ UErrorCode errorCode;  // set when recording an edit fails, e.g. negative length in addUnchanged()
+ uint16_t stackArray[STACK_CAPACITY];
+};
+
+#endif // U_HIDE_DRAFT_API
+
+U_NAMESPACE_END
+
+#endif // __EDITS_H__
#include "unicode/utypes.h"
#include "unicode/localpointer.h"
-
-#if U_SHOW_CPLUSPLUS_API
-#include "unicode/uobject.h"
-#endif // U_SHOW_CPLUSPLUS_API
-
#include "unicode/ustring.h"
/**
U_NAMESPACE_BEGIN
-class BreakIterator;
-
/**
* \class LocalUCaseMapPointer
* "Smart pointer" class, closes a UCaseMap via ucasemap_close().
*/
U_DEFINE_LOCAL_OPEN_POINTER(LocalUCaseMapPointer, UCaseMap, ucasemap_close);
-// TODO: move to new C++ unicode/casemap.h
-
-#ifndef U_HIDE_DRAFT_API
-
-/**
- * Records lengths of string edits but not replacement text.
- * Supports replacements, insertions, deletions in linear progression.
- * Does not support moving/reordering of text.
- *
- * An Edits object tracks a separate UErrorCode, but ICU string transformation functions
- * (e.g., case mapping functions) merge any such errors into their API's UErrorCode.
- *
- * @draft ICU 59
- */
-class U_COMMON_API Edits final : public UMemory {
-public:
- /**
- * Constructs an empty object.
- * @draft ICU 59
- */
- Edits() :
- array(stackArray), capacity(STACK_CAPACITY), length(0), delta(0),
- errorCode(U_ZERO_ERROR) {}
- /**
- * Destructor.
- * @draft ICU 59
- */
- ~Edits();
-
- /**
- * Resets the data but may not release memory.
- * @draft ICU 59
- */
- void reset();
-
- /**
- * Adds a record for an unchanged segment of text.
- * Normally called from inside ICU string transformation functions, not user code.
- * @draft ICU 59
- */
- void addUnchanged(int32_t unchangedLength);
- /**
- * Adds a record for a text replacement/insertion/deletion.
- * Normally called from inside ICU string transformation functions, not user code.
- * @draft ICU 59
- */
- void addReplace(int32_t oldLength, int32_t newLength);
- /**
- * Sets the UErrorCode if an error occurred while recording edits.
- * Preserves older error codes in the outErrorCode.
- * Normally called from inside ICU string transformation functions, not user code.
- * @return TRUE if U_FAILURE(outErrorCode)
- * @draft ICU 59
- */
- UBool copyErrorTo(UErrorCode &outErrorCode);
-
- /**
- * How much longer is the new text compared with the old text?
- * @return new length minus old length
- * @draft ICU 59
- */
- int32_t lengthDelta() const { return delta; }
- /**
- * @return TRUE if there are any change edits
- * @draft ICU 59
- */
- UBool hasChanges() const;
-
- /**
- * Access to the list of edits.
- * @see getCoarseIterator
- * @see getFineIterator
- * @draft ICU 59
- */
- struct Iterator final : public UMemory {
- /**
- * Copy constructor.
- * @draft ICU 59
- */
- Iterator(const Iterator &other) = default;
- /**
- * Assignment operator.
- * @draft ICU 59
- */
- Iterator &operator=(const Iterator &other) = default;
-
- /**
- * Advances to the next edit.
- * @return TRUE if there is another edit
- * @draft ICU 59
- */
- UBool next(UErrorCode &errorCode);
-
- /**
- * Finds the edit that contains the source index.
- * The source index may be found in a non-change
- * even if normal iteration would skip non-changes.
- * Normal iteration can continue from a found edit.
- *
- * The iterator state before this search logically does not matter.
- * (It may affect the performance of the search.)
- *
- * The iterator state after this search is undefined
- * if the source index is out of bounds for the source string.
- *
- * @param i source index
- * @return TRUE if the edit for the source index was found
- * @draft ICU 59
- */
- UBool findSourceIndex(int32_t i, UErrorCode &errorCode);
-
- /**
- * @return TRUE if this edit replaces oldLength() units with newLength() different ones.
- * FALSE if oldLength units remain unchanged.
- * @draft ICU 59
- */
- UBool hasChange() const { return changed; }
- /**
- * @return the number of units in the original string which are replaced or remain unchanged.
- * @draft ICU 59
- */
- int32_t oldLength() const { return oldLength_; }
- /**
- * @return the number of units in the modified string, if hasChange() is TRUE.
- * Same as oldLength if hasChange() is FALSE.
- * @draft ICU 59
- */
- int32_t newLength() const { return newLength_; }
-
- /**
- * @return the current index into the source string
- * @draft ICU 59
- */
- int32_t sourceIndex() const { return srcIndex; }
- /**
- * @return the current index into the replacement-characters-only string,
- * not counting unchanged spans
- * @draft ICU 59
- */
- int32_t replacementIndex() const { return replIndex; }
- /**
- * @return the current index into the full destination string
- * @draft ICU 59
- */
- int32_t destinationIndex() const { return destIndex; }
-
- private:
- friend class Edits;
-
- Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs);
-
- int32_t readLength(int32_t head);
- void updateIndexes();
- UBool noNext();
-
- const uint16_t *array;
- int32_t index, length;
- int32_t remaining;
- UBool onlyChanges, coarse;
-
- UBool changed;
- int32_t oldLength_, newLength_;
- int32_t srcIndex, replIndex, destIndex;
- };
-
- /**
- * Returns an Iterator for coarse-grained changes for simple string updates.
- * Skips non-changes.
- * @return an Iterator that merges adjacent changes.
- * @draft ICU 59
- */
- Iterator getCoarseChangesIterator() const {
- return Iterator(array, length, TRUE, TRUE);
- }
-
- /**
- * Returns an Iterator for coarse-grained changes and non-changes for simple string updates.
- * @return an Iterator that merges adjacent changes.
- * @draft ICU 59
- */
- Iterator getCoarseIterator() const {
- return Iterator(array, length, FALSE, TRUE);
- }
-
- /**
- * Returns an Iterator for fine-grained changes for modifying styled text.
- * Skips non-changes.
- * @return an Iterator that separates adjacent changes.
- * @draft ICU 59
- */
- Iterator getFineChangesIterator() const {
- return Iterator(array, length, TRUE, FALSE);
- }
-
- /**
- * Returns an Iterator for fine-grained changes and non-changes for modifying styled text.
- * @return an Iterator that separates adjacent changes.
- * @draft ICU 59
- */
- Iterator getFineIterator() const {
- return Iterator(array, length, FALSE, FALSE);
- }
-
-private:
- Edits(const Edits &) = delete;
- Edits &operator=(const Edits &) = delete;
-
- void setLastUnit(int32_t last) { array[length - 1] = (uint16_t)last; }
- int32_t lastUnit() const { return length > 0 ? array[length - 1] : 0xffff; }
-
- void append(int32_t r);
- UBool growArray();
-
- static const int32_t STACK_CAPACITY = 100;
- uint16_t *array;
- int32_t capacity;
- int32_t length;
- int32_t delta;
- UErrorCode errorCode;
- uint16_t stackArray[STACK_CAPACITY];
-};
-
-/**
- * Low-level C++ case mapping functions.
- *
- * @draft ICU 59
- */
-class U_COMMON_API CaseMap final : public UMemory {
-public:
- /**
- * Lowercases a UTF-16 string and optionally records edits.
- * Casing is locale-dependent and context-sensitive.
- * The result may be longer or shorter than the original.
- * The source string and the destination buffer must not overlap.
- *
- * @param locale The locale ID. ("" = root locale, NULL = default locale.)
- * @param options Options bit set, usually 0. See UCASEMAP_OMIT_UNCHANGED_TEXT.
- * @param src The original string.
- * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
- * @param dest A buffer for the result string. The result will be NUL-terminated if
- * the buffer is large enough.
- * The contents is undefined in case of failure.
- * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
- * dest may be NULL and the function will only return the length of the result
- * without writing any of the result string.
- * @param edits Records edits for index mapping, working with styled text,
- * and getting only changes (if any).
- * This function calls edits->reset() first. edits can be NULL.
- * @param errorCode Reference to an in/out error code value
- * which must not indicate a failure before the function call.
- * @return The length of the result string, if successful - or in case of a buffer overflow,
- * in which case it will be greater than destCapacity.
- *
- * @see u_strToLower
- * @draft ICU 59
- */
- static int32_t toLower(
- const char *locale, uint32_t options,
- const UChar *src, int32_t srcLength,
- UChar *dest, int32_t destCapacity, Edits *edits,
- UErrorCode &errorCode);
-
- /**
- * Uppercases a UTF-16 string and optionally records edits.
- * Casing is locale-dependent and context-sensitive.
- * The result may be longer or shorter than the original.
- * The source string and the destination buffer must not overlap.
- *
- * @param locale The locale ID. ("" = root locale, NULL = default locale.)
- * @param options Options bit set, usually 0. See UCASEMAP_OMIT_UNCHANGED_TEXT.
- * @param src The original string.
- * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
- * @param dest A buffer for the result string. The result will be NUL-terminated if
- * the buffer is large enough.
- * The contents is undefined in case of failure.
- * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
- * dest may be NULL and the function will only return the length of the result
- * without writing any of the result string.
- * @param edits Records edits for index mapping, working with styled text,
- * and getting only changes (if any).
- * This function calls edits->reset() first. edits can be NULL.
- * @param errorCode Reference to an in/out error code value
- * which must not indicate a failure before the function call.
- * @return The length of the result string, if successful - or in case of a buffer overflow,
- * in which case it will be greater than destCapacity.
- *
- * @see u_strToUpper
- * @draft ICU 59
- */
- static int32_t toUpper(
- const char *locale, uint32_t options,
- const UChar *src, int32_t srcLength,
- UChar *dest, int32_t destCapacity, Edits *edits,
- UErrorCode &errorCode);
-
-#if !UCONFIG_NO_BREAK_ITERATION
-
- /**
- * Titlecases a UTF-16 string and optionally records edits.
- * Casing is locale-dependent and context-sensitive.
- * The result may be longer or shorter than the original.
- * The source string and the destination buffer must not overlap.
- *
- * Titlecasing uses a break iterator to find the first characters of words
- * that are to be titlecased. It titlecases those characters and lowercases
- * all others. (This can be modified with options bits.)
- *
- * @param locale The locale ID. ("" = root locale, NULL = default locale.)
- * @param options Options bit set, usually 0. See UCASEMAP_OMIT_UNCHANGED_TEXT,
- * U_TITLECASE_NO_LOWERCASE, U_TITLECASE_NO_BREAK_ADJUSTMENT.
- * @param iter A break iterator to find the first characters of words that are to be titlecased.
- * It is set to the source string (setText())
- * and used one or more times for iteration (first() and next()).
- * If NULL, then a word break iterator for the locale is used
- * (or something equivalent).
- * @param src The original string.
- * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
- * @param dest A buffer for the result string. The result will be NUL-terminated if
- * the buffer is large enough.
- * The contents is undefined in case of failure.
- * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
- * dest may be NULL and the function will only return the length of the result
- * without writing any of the result string.
- * @param edits Records edits for index mapping, working with styled text,
- * and getting only changes (if any).
- * This function calls edits->reset() first. edits can be NULL.
- * @param errorCode Reference to an in/out error code value
- * which must not indicate a failure before the function call.
- * @return The length of the result string, if successful - or in case of a buffer overflow,
- * in which case it will be greater than destCapacity.
- *
- * @see u_strToTitle
- * @see ucasemap_toTitle
- * @draft ICU 59
- */
- static int32_t toTitle(
- const char *locale, uint32_t options, BreakIterator *iter,
- const UChar *src, int32_t srcLength,
- UChar *dest, int32_t destCapacity, Edits *edits,
- UErrorCode &errorCode);
-
-#endif // UCONFIG_NO_BREAK_ITERATION
-
- /**
- * Case-folds a UTF-16 string and optionally records edits.
- *
- * Case-folding is locale-independent and not context-sensitive,
- * but there is an option for whether to include or exclude mappings for dotted I
- * and dotless i that are marked with 'T' in CaseFolding.txt.
- *
- * The result may be longer or shorter than the original.
- * The source string and the destination buffer must not overlap.
- *
- * @param options Options bit set, usually 0. See UCASEMAP_OMIT_UNCHANGED_TEXT,
- * U_FOLD_CASE_DEFAULT, U_FOLD_CASE_EXCLUDE_SPECIAL_I.
- * @param src The original string.
- * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
- * @param dest A buffer for the result string. The result will be NUL-terminated if
- * the buffer is large enough.
- * The contents is undefined in case of failure.
- * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
- * dest may be NULL and the function will only return the length of the result
- * without writing any of the result string.
- * @param edits Records edits for index mapping, working with styled text,
- * and getting only changes (if any).
- * This function calls edits->reset() first. edits can be NULL.
- * @param errorCode Reference to an in/out error code value
- * which must not indicate a failure before the function call.
- * @return The length of the result string, if successful - or in case of a buffer overflow,
- * in which case it will be greater than destCapacity.
- *
- * @see u_strFoldCase
- * @draft ICU 59
- */
- static int32_t foldCase(
- uint32_t options,
- const UChar *src, int32_t srcLength,
- UChar *dest, int32_t destCapacity, Edits *edits,
- UErrorCode &errorCode);
-
-private:
- CaseMap() = delete;
- CaseMap(const CaseMap &other) = delete;
- CaseMap &operator=(const CaseMap &other) = delete;
-};
-
-/**
- * Omit unchanged text when case-mapping with Edits.
- *
- * @draft ICU 59
- */
-#define UCASEMAP_OMIT_UNCHANGED_TEXT 0x4000
-
-#endif // U_HIDE_DRAFT_API
-
U_NAMESPACE_END
#endif
*/
#define U_TITLECASE_NO_BREAK_ADJUSTMENT 0x200
+/**
+ * Omit unchanged text when case-mapping with Edits.
+ *
+ * @see CaseMap
+ * @see Edits
+ * @draft ICU 59
+ */
+#define UCASEMAP_OMIT_UNCHANGED_TEXT 0x4000
+
#if !UCONFIG_NO_BREAK_ITERATION
/**
*/
#include "unicode/utypes.h"
+#include "unicode/casemap.h"
+#include "unicode/edits.h"
#include "unicode/putil.h"
#include "cstring.h"
#include "cmemory.h"
#include "unicode/unistr.h"
#include "unicode/uchar.h"
#include "uassert.h"
+#include "ucasemap_imp.h"
#include "uelement.h"
-#include "ustr_imp.h"
U_NAMESPACE_BEGIN
#include "unicode/locid.h"
#include "unicode/ucasemap.h"
#include "unicode/unistr.h"
-#include "ustr_imp.h"
+#include "ucasemap_imp.h"
U_NAMESPACE_BEGIN
#include "unicode/locid.h"
#include "unicode/ucasemap.h"
#include "unicode/unistr.h"
-#include "ustr_imp.h"
+#include "ucasemap_imp.h"
U_NAMESPACE_BEGIN
#define __USTR_IMP_H__
#include "unicode/utypes.h"
-#include "unicode/ucasemap.h"
-#include "unicode/uiter.h"
-#include "ucase.h"
-
-/** Simple declaration to avoid including unicode/ubrk.h. */
-#ifndef UBRK_TYPEDEF_UBREAK_ITERATOR
-# define UBRK_TYPEDEF_UBREAK_ITERATOR
- typedef struct UBreakIterator UBreakIterator;
-#endif
-
-#ifndef U_COMPARE_IGNORE_CASE
-/* see also unorm.h */
-/**
- * Option bit for unorm_compare:
- * Perform case-insensitive comparison.
- */
-#define U_COMPARE_IGNORE_CASE 0x10000
-#endif
/**
* Internal option for unorm_cmpEquivFold() for strncmp style.
const UChar *s2, int32_t length2,
UBool strncmpStyle, UBool codePointOrder);
-/**
- * Internal API, used by u_strcasecmp() etc.
- * Compare strings case-insensitively,
- * in code point order or code unit order.
- */
-U_CFUNC int32_t
-u_strcmpFold(const UChar *s1, int32_t length1,
- const UChar *s2, int32_t length2,
- uint32_t options,
- UErrorCode *pErrorCode);
-
-/**
- * Interanl API, used for detecting length of
- * shared prefix case-insensitively.
- * @param s1 input string 1
- * @param length1 length of string 1, or -1 (NULL terminated)
- * @param s2 input string 2
- * @param length2 length of string 2, or -1 (NULL terminated)
- * @param options compare options
- * @param matchLen1 (output) length of partial prefix match in s1
- * @param matchLen2 (output) length of partial prefix match in s2
- * @param pErrorCode receives error status
- */
-U_CAPI void
-u_caseInsensitivePrefixMatch(const UChar *s1, int32_t length1,
- const UChar *s2, int32_t length2,
- uint32_t options,
- int32_t *matchLen1, int32_t *matchLen2,
- UErrorCode *pErrorCode);
-
-/**
- * Are the Unicode properties loaded?
- * This must be used before internal functions are called that do
- * not perform this check.
- * Generate a debug assertion failure if data is not loaded.
- */
-U_CFUNC UBool
-uprv_haveProperties(UErrorCode *pErrorCode);
-
-/**
- * Load the Unicode property data.
- * Intended primarily for use from u_init().
- * Has no effect if property data is already loaded.
- * NOT thread safe.
- */
-/*U_CFUNC int8_t
-uprv_loadPropsData(UErrorCode *errorCode);*/
-
-#ifdef __cplusplus
-// TODO: Consider moving these case mapping definitions
-// into a new internal header like ucasemap_imp.h.
-
-#include "unicode/unistr.h" // for UStringCaseMapper
-
-/*
- * Internal string casing functions implementing
- * ustring.h/ustrcase.c and UnicodeString case mapping functions.
- */
-
-struct UCaseMap : public icu::UMemory {
- /** Implements most of ucasemap_open(). */
- UCaseMap(const char *localeID, uint32_t opts, UErrorCode *pErrorCode);
- ~UCaseMap();
-
-#if !UCONFIG_NO_BREAK_ITERATION
- icu::BreakIterator *iter; /* We adopt the iterator, so we own it. */
-#endif
- char locale[32];
- int32_t caseLocale;
- uint32_t options;
-};
-
-#if UCONFIG_NO_BREAK_ITERATION
-# define UCASEMAP_BREAK_ITERATOR_PARAM
-# define UCASEMAP_BREAK_ITERATOR_UNUSED
-# define UCASEMAP_BREAK_ITERATOR
-# define UCASEMAP_BREAK_ITERATOR_NULL
-#else
-# define UCASEMAP_BREAK_ITERATOR_PARAM icu::BreakIterator *iter,
-# define UCASEMAP_BREAK_ITERATOR_UNUSED icu::BreakIterator *,
-# define UCASEMAP_BREAK_ITERATOR iter,
-# define UCASEMAP_BREAK_ITERATOR_NULL NULL,
-#endif
-
-U_CFUNC int32_t
-ustrcase_getCaseLocale(const char *locale);
-
-// TODO: swap src / dest if approved for new public api
-/** Implements UStringCaseMapper. */
-U_CFUNC int32_t U_CALLCONV
-ustrcase_internalToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
- UChar *dest, int32_t destCapacity,
- const UChar *src, int32_t srcLength,
- icu::Edits *edits,
- UErrorCode &errorCode);
-
-/** Implements UStringCaseMapper. */
-U_CFUNC int32_t U_CALLCONV
-ustrcase_internalToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
- UChar *dest, int32_t destCapacity,
- const UChar *src, int32_t srcLength,
- icu::Edits *edits,
- UErrorCode &errorCode);
-
-#if !UCONFIG_NO_BREAK_ITERATION
-
-/** Implements UStringCaseMapper. */
-U_CFUNC int32_t U_CALLCONV
-ustrcase_internalToTitle(int32_t caseLocale, uint32_t options,
- icu::BreakIterator *iter,
- UChar *dest, int32_t destCapacity,
- const UChar *src, int32_t srcLength,
- icu::Edits *edits,
- UErrorCode &errorCode);
-
-#endif
-
-/** Implements UStringCaseMapper. */
-U_CFUNC int32_t U_CALLCONV
-ustrcase_internalFold(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
- UChar *dest, int32_t destCapacity,
- const UChar *src, int32_t srcLength,
- icu::Edits *edits,
- UErrorCode &errorCode);
-
-/**
- * Common string case mapping implementation for ucasemap_toXyz() and UnicodeString::toXyz().
- * Implements argument checking.
- */
-U_CFUNC int32_t
-ustrcase_map(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
- UChar *dest, int32_t destCapacity,
- const UChar *src, int32_t srcLength,
- UStringCaseMapper *stringCaseMapper,
- icu::Edits *edits,
- UErrorCode &errorCode);
-
-/**
- * Common string case mapping implementation for old-fashioned u_strToXyz() functions
- * that allow the source string to overlap the destination buffer.
- * Implements argument checking and internally works with an intermediate buffer if necessary.
- */
-U_CFUNC int32_t
-ustrcase_mapWithOverlap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
- UChar *dest, int32_t destCapacity,
- const UChar *src, int32_t srcLength,
- UStringCaseMapper *stringCaseMapper,
- UErrorCode &errorCode);
-
-/**
- * UTF-8 string case mapping function type, used by ucasemap_mapUTF8().
- * UTF-8 version of UStringCaseMapper.
- * All error checking must be done.
- * The UCaseMap must be fully initialized, with locale and/or iter set as needed.
- * src and dest must not overlap.
- */
-typedef int32_t U_CALLCONV
-UTF8CaseMapper(int32_t caseLocale, uint32_t options,
-#if !UCONFIG_NO_BREAK_ITERATION
- icu::BreakIterator *iter,
-#endif
- uint8_t *dest, int32_t destCapacity,
- const uint8_t *src, int32_t srcLength,
- UErrorCode *pErrorCode);
-
-#if !UCONFIG_NO_BREAK_ITERATION
-
-/** Implements UTF8CaseMapper. */
-U_CFUNC int32_t U_CALLCONV
-ucasemap_internalUTF8ToTitle(int32_t caseLocale, uint32_t options,
- icu::BreakIterator *iter,
- uint8_t *dest, int32_t destCapacity,
- const uint8_t *src, int32_t srcLength,
- UErrorCode *pErrorCode);
-
-#endif
-
-/**
- * Implements argument checking and buffer handling
- * for UTF-8 string case mapping as a common function.
- */
-U_CFUNC int32_t
-ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
- uint8_t *dest, int32_t destCapacity,
- const uint8_t *src, int32_t srcLength,
- UTF8CaseMapper *stringCaseMapper,
- UErrorCode *pErrorCode);
-
-U_NAMESPACE_BEGIN
-namespace GreekUpper {
-
-// Data bits.
-static const uint32_t UPPER_MASK = 0x3ff;
-static const uint32_t HAS_VOWEL = 0x1000;
-static const uint32_t HAS_YPOGEGRAMMENI = 0x2000;
-static const uint32_t HAS_ACCENT = 0x4000;
-static const uint32_t HAS_DIALYTIKA = 0x8000;
-// Further bits during data building and processing, not stored in the data map.
-static const uint32_t HAS_COMBINING_DIALYTIKA = 0x10000;
-static const uint32_t HAS_OTHER_GREEK_DIACRITIC = 0x20000;
-
-static const uint32_t HAS_VOWEL_AND_ACCENT = HAS_VOWEL | HAS_ACCENT;
-static const uint32_t HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA =
- HAS_VOWEL_AND_ACCENT | HAS_DIALYTIKA;
-static const uint32_t HAS_EITHER_DIALYTIKA = HAS_DIALYTIKA | HAS_COMBINING_DIALYTIKA;
-
-// State bits.
-static const uint32_t AFTER_CASED = 1;
-static const uint32_t AFTER_VOWEL_WITH_ACCENT = 2;
-
-uint32_t getLetterData(UChar32 c);
-
-/**
- * Returns a non-zero value for each of the Greek combining diacritics
- * listed in The Unicode Standard, version 8, chapter 7.2 Greek,
- * plus some perispomeni look-alikes.
- */
-uint32_t getDiacriticData(UChar32 c);
-
-} // namespace GreekUpper
-U_NAMESPACE_END
-
-#endif // __cplusplus
-
U_CAPI int32_t U_EXPORT2
ustr_hashUCharsN(const UChar *str, int32_t length);
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/brkiter.h"
+#include "unicode/casemap.h"
#include "unicode/localpointer.h"
#include "unicode/ubrk.h"
#include "unicode/ucasemap.h"
#include "cmemory.h"
#include "ucase.h"
-#include "ustr_imp.h"
+#include "ucasemap_imp.h"
U_NAMESPACE_USE
#include "unicode/utypes.h"
#include "unicode/brkiter.h"
+#include "unicode/casemap.h"
+#include "unicode/edits.h"
#include "unicode/ustring.h"
#include "unicode/ucasemap.h"
#include "unicode/ubrk.h"
#include "unicode/utf16.h"
#include "cmemory.h"
#include "ucase.h"
+#include "ucasemap_imp.h"
#include "ustr_imp.h"
#include "uassert.h"
namespace {
-// 0000uuuuuuuuuuuu records u+1 unchanged text units.
-const int32_t MAX_UNCHANGED_LENGTH = 0x1000;
-const int32_t MAX_UNCHANGED = MAX_UNCHANGED_LENGTH - 1;
-
-// 0wwwcccccccccccc with w=1..6 records ccc+1 replacements of w:w text units.
-// No length change.
-const int32_t MAX_SHORT_WIDTH = 6;
-const int32_t MAX_SHORT_CHANGE_LENGTH = 0xfff;
-const int32_t MAX_SHORT_CHANGE = 0x6fff;
-
-// 0111mmmmmmnnnnnn records a replacement of m text units with n.
-// m or n = 61: actual length follows in the next edits array unit.
-// m or n = 62..63: actual length follows in the next two edits array units.
-// Bit 30 of the actual length is in the head unit.
-// Trailing units have bit 15 set.
-const int32_t LENGTH_IN_1TRAIL = 61;
-const int32_t LENGTH_IN_2TRAIL = 62;
-
-} // namespace
-
-Edits::~Edits() {
- if(array != stackArray) {
- uprv_free(array);
- }
-}
-
-void Edits::reset() {
- length = 0;
-}
-
-void Edits::addUnchanged(int32_t unchangedLength) {
- if(U_FAILURE(errorCode) || unchangedLength == 0) { return; }
- if(unchangedLength < 0) {
- errorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return;
- }
- // Merge into previous unchanged-text record, if any.
- int32_t last = lastUnit();
- if(last < MAX_UNCHANGED) {
- int32_t remaining = MAX_UNCHANGED - last;
- if (remaining >= unchangedLength) {
- setLastUnit(last + unchangedLength);
- return;
- }
- setLastUnit(MAX_UNCHANGED);
- unchangedLength -= remaining;
- }
- // Split large lengths into multiple units.
- while(unchangedLength >= MAX_UNCHANGED_LENGTH) {
- append(MAX_UNCHANGED);
- unchangedLength -= MAX_UNCHANGED_LENGTH;
- }
- // Write a small (remaining) length.
- if(unchangedLength > 0) {
- append(unchangedLength - 1);
- }
-}
-
-void Edits::addReplace(int32_t oldLength, int32_t newLength) {
- if(U_FAILURE(errorCode)) { return; }
- if(oldLength == newLength && 0 < oldLength && oldLength <= MAX_SHORT_WIDTH) {
- // Replacement of short oldLength text units by same-length new text.
- // Merge into previous short-replacement record, if any.
- int32_t last = lastUnit();
- if(MAX_UNCHANGED < last && last < MAX_SHORT_CHANGE &&
- (last >> 12) == oldLength && (last & 0xfff) < MAX_SHORT_CHANGE_LENGTH) {
- setLastUnit(last + 1);
- return;
- }
- append(oldLength << 12);
- return;
- }
-
- if(oldLength < 0 || newLength < 0) {
- errorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return;
- }
- if (oldLength == 0 && newLength == 0) {
- return;
- }
- int32_t newDelta = newLength - oldLength;
- if (newDelta != 0) {
- if (newDelta > 0 ? newDelta > (INT32_MAX - delta) : newDelta < (INT32_MIN - delta)) {
- // Integer overflow or underflow.
- errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
- return;
- }
- delta += newDelta;
- }
-
- int32_t head = 0x7000;
- if (oldLength < LENGTH_IN_1TRAIL && newLength < LENGTH_IN_1TRAIL) {
- head |= oldLength << 6;
- head |= newLength;
- append(head);
- } else if ((capacity - length) >= 5 || growArray()) {
- int32_t limit = length + 1;
- if(oldLength < LENGTH_IN_1TRAIL) {
- head |= oldLength << 6;
- } else if(oldLength <= 0x7fff) {
- head |= LENGTH_IN_1TRAIL << 6;
- array[limit++] = (uint16_t)(0x8000 | oldLength);
- } else {
- head |= (LENGTH_IN_2TRAIL + (oldLength >> 30)) << 6;
- array[limit++] = (uint16_t)(0x8000 | (oldLength >> 15));
- array[limit++] = (uint16_t)(0x8000 | oldLength);
- }
- if(newLength < LENGTH_IN_1TRAIL) {
- head |= newLength;
- } else if(newLength <= 0x7fff) {
- head |= LENGTH_IN_1TRAIL;
- array[limit++] = (uint16_t)(0x8000 | newLength);
- } else {
- head |= LENGTH_IN_2TRAIL + (newLength >> 30);
- array[limit++] = (uint16_t)(0x8000 | (newLength >> 15));
- array[limit++] = (uint16_t)(0x8000 | newLength);
- }
- array[length] = (uint16_t)head;
- length = limit;
- }
-}
-
-void Edits::append(int32_t r) {
- if(length < capacity || growArray()) {
- array[length++] = (uint16_t)r;
- }
-}
-
-UBool Edits::growArray() {
- int32_t newCapacity;
- if (array == stackArray) {
- newCapacity = 2000;
- } else if (capacity == INT32_MAX) {
- errorCode = U_BUFFER_OVERFLOW_ERROR;
- return FALSE;
- } else if (capacity >= (INT32_MAX / 2)) {
- newCapacity = INT32_MAX;
- } else {
- newCapacity = 2 * capacity;
- }
- // Grow by at least 5 units so that a maximal change record will fit.
- if ((newCapacity - capacity) < 5) {
- errorCode = U_BUFFER_OVERFLOW_ERROR;
- return FALSE;
- }
- uint16_t *newArray = (uint16_t *)uprv_malloc((size_t)newCapacity * 2);
- if (newArray == NULL) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- return FALSE;
- }
- uprv_memcpy(newArray, array, (size_t)length * 2);
- if (array != stackArray) {
- uprv_free(array);
- }
- array = newArray;
- capacity = newCapacity;
- return TRUE;
-}
-
-UBool Edits::copyErrorTo(UErrorCode &outErrorCode) {
- if (U_FAILURE(outErrorCode)) { return TRUE; }
- if (U_SUCCESS(errorCode)) { return FALSE; }
- outErrorCode = errorCode;
- return TRUE;
-}
-
-UBool Edits::hasChanges() const {
- if (delta != 0) {
- return TRUE;
- }
- for (int32_t i = 0; i < length; ++i) {
- if (array[i] > MAX_UNCHANGED) {
- return TRUE;
- }
- }
- return FALSE;
-}
-
-Edits::Iterator::Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs) :
- array(a), index(0), length(len), remaining(0),
- onlyChanges(oc), coarse(crs),
- changed(FALSE), oldLength_(0), newLength_(0),
- srcIndex(0), replIndex(0), destIndex(0) {}
-
-int32_t Edits::Iterator::readLength(int32_t head) {
- if (head < LENGTH_IN_1TRAIL) {
- return head;
- } else if (head < LENGTH_IN_2TRAIL) {
- U_ASSERT(index < length);
- U_ASSERT(array[index] >= 0x8000);
- return array[index++];
- } else {
- U_ASSERT((index + 2) <= length);
- U_ASSERT(array[index] >= 0x8000);
- U_ASSERT(array[index + 1] >= 0x8000);
- int32_t len = ((head & 1) << 30) |
- ((int32_t)(array[index] & 0x7fff) << 15) |
- (array[index + 1] & 0x7fff);
- index += 2;
- return len;
- }
-}
-
-void Edits::Iterator::updateIndexes() {
- srcIndex += oldLength_;
- if (changed) {
- replIndex += newLength_;
- }
- destIndex += newLength_;
-}
-
-UBool Edits::Iterator::noNext() {
- // Empty span beyond the string.
- oldLength_ = newLength_ = 0;
- return FALSE;
-}
-
-UBool Edits::Iterator::next(UErrorCode &errorCode) {
- if (U_FAILURE(errorCode)) { return FALSE; }
- // We have an errorCode in case we need to start guarding against integer overflows.
- // It is also convenient for caller loops if we bail out when an error was set elsewhere.
- updateIndexes();
- if (remaining > 0) {
- // Fine-grained iterator: Continue a sequence of equal-length changes.
- --remaining;
- return TRUE;
- }
- if (index >= length) {
- return noNext();
- }
- int32_t u = array[index++];
- if (u <= MAX_UNCHANGED) {
- // Combine adjacent unchanged ranges.
- changed = FALSE;
- oldLength_ = u + 1;
- while (index < length && (u = array[index]) <= MAX_UNCHANGED) {
- ++index;
- oldLength_ += u + 1;
- }
- newLength_ = oldLength_;
- if (onlyChanges) {
- updateIndexes();
- if (index >= length) {
- return noNext();
- }
- // already fetched u > MAX_UNCHANGED at index
- ++index;
- } else {
- return TRUE;
- }
- }
- changed = TRUE;
- if (u <= MAX_SHORT_CHANGE) {
- if (coarse) {
- int32_t w = u >> 12;
- int32_t len = (u & 0xfff) + 1;
- oldLength_ = newLength_ = len * w;
- } else {
- // Split a sequence of equal-length changes that was compressed into one unit.
- oldLength_ = newLength_ = u >> 12;
- remaining = u & 0xfff;
- return TRUE;
- }
- } else {
- U_ASSERT(u <= 0x7fff);
- oldLength_ = readLength((u >> 6) & 0x3f);
- newLength_ = readLength(u & 0x3f);
- if (!coarse) {
- return TRUE;
- }
- }
- // Combine adjacent changes.
- while (index < length && (u = array[index]) > MAX_UNCHANGED) {
- ++index;
- if (u <= MAX_SHORT_CHANGE) {
- int32_t w = u >> 12;
- int32_t len = (u & 0xfff) + 1;
- len = len * w;
- oldLength_ += len;
- newLength_ += len;
- } else {
- U_ASSERT(u <= 0x7fff);
- int32_t oldLen = readLength((u >> 6) & 0x3f);
- int32_t newLen = readLength(u & 0x3f);
- oldLength_ += oldLen;
- newLength_ += newLen;
- }
- }
- return TRUE;
-}
-
-UBool Edits::Iterator::findSourceIndex(int32_t i, UErrorCode &errorCode) {
- if (U_FAILURE(errorCode) || i < 0) { return FALSE; }
- if (i < srcIndex) {
- // Reset the iterator to the start.
- index = remaining = srcIndex = replIndex = destIndex = 0;
- } else if (i < (srcIndex + oldLength_)) {
- // The index is in the current span.
- return TRUE;
- }
- while (next(errorCode)) {
- if (i < (srcIndex + oldLength_)) {
- // The index is in the current span.
- return TRUE;
- }
- if (remaining > 0) {
- // Is the index in one of the remaining compressed edits?
- // srcIndex is the start of the current span, before the remaining ones.
- int32_t len = (remaining + 1) * oldLength_;
- if (i < (srcIndex + len)) {
- int32_t n = (i - srcIndex) / oldLength_; // 1 <= n <= remaining
- len = n * oldLength_;
- srcIndex += len;
- replIndex += len;
- destIndex += len;
- remaining -= n;
- return TRUE;
- }
- // Make next() skip all of these edits at once.
- oldLength_ = newLength_ = len;
- remaining = 0;
- }
- }
- return FALSE;
-}
-
-namespace {
-
int32_t checkOverflowAndEditsError(int32_t destIndex, int32_t destCapacity,
Edits *edits, UErrorCode &errorCode) {
if (U_SUCCESS(errorCode)) {
#include "unicode/utypes.h"
#include "uassert.h"
#include "unicode/brkiter.h"
+#include "unicode/casemap.h"
#include "unicode/ucasemap.h"
#include "unicode/uloc.h"
#include "unicode/ustring.h"
#include "ucase.h"
-#include "ustr_imp.h"
+#include "ucasemap_imp.h"
U_CFUNC int32_t
ustrcase_getCaseLocale(const char *locale) {
#include "unicode/utypes.h"
#include "unicode/putil.h"
+#include "unicode/uchar.h"
#include "unicode/ustring.h"
#include "unicode/utf16.h"
#include "cstring.h"
#include "unicode/decimfmt.h"
#include "uresimp.h"
#include "unicode/ures.h"
+#include "unicode/ustring.h"
#include "ureslocs.h"
#include "cstring.h"
#include "mutex.h"
#if !UCONFIG_NO_FORMATTING && !UCONFIG_NO_BREAK_ITERATION
#include "unicode/dtfmtsym.h"
+#include "unicode/ucasemap.h"
#include "unicode/ureldatefmt.h"
#include "unicode/udisplaycontext.h"
#include "unicode/unum.h"
#include "unicode/simpletz.h"
#include "unicode/rbtz.h"
#include "unicode/tzfmt.h"
+#include "unicode/ucasemap.h"
#include "unicode/utf16.h"
#include "unicode/vtzone.h"
#include "unicode/udisplaycontext.h"
#include <float.h>
#include "smpdtfst.h"
#include "sharednumberformat.h"
+#include "ucasemap_imp.h"
#include "ustr_imp.h"
#include "charstr.h"
#include "uvector.h"
#include "unicode/ucasemap.h"
#include "cmemory.h"
#include "cintltst.h"
+#include "ucasemap_imp.h"
#include "ustr_imp.h"
/* test string case mapping functions --------------------------------------- */