*/
Edits() :
array(stackArray), capacity(STACK_CAPACITY), length(0), delta(0),
- errorCode(U_ZERO_ERROR) {}
+ omit(FALSE), errorCode(U_ZERO_ERROR) {}
~Edits();
+
/**
* Resets the data but may not release memory.
* @internal ICU 59 technology preview
*/
void reset();
+
+ /**
+ * Controls whether the case mapping function is to write or omit
+ * characters that do not change.
+ * The complete result can be computed by applying just the changes
+ * to the original string.
+ * A newly constructed Edits object writes unchanged characters
+ * (the constructor initializes omit to FALSE).
+ * @param write TRUE to write unchanged characters, FALSE to omit them;
+ * the value is stored inverted in the omit flag
+ * @return *this
+ * @see omitUnchanged
+ * @see writeUnchanged
+ * @internal ICU 59 technology preview
+ */
+ Edits &setWriteUnchanged(UBool write) {
+ omit = !write;
+ return *this;
+ }
+ /**
+ * Returns the omit flag as last set via setWriteUnchanged(); this is the
+ * logical negation of writeUnchanged().
+ * @return TRUE if the case mapping function is to omit characters that do not change.
+ * @see setWriteUnchanged
+ * @internal ICU 59 technology preview
+ */
+ UBool omitUnchanged() const { return omit; }
+ /**
+ * Returns the negated omit flag; this is the logical negation of omitUnchanged().
+ * @return TRUE if the case mapping function is to write characters that do not change.
+ * @see setWriteUnchanged
+ * @internal ICU 59 technology preview
+ */
+ UBool writeUnchanged() const { return !omit; }
+
/**
* Adds a record for an unchanged segment of text.
* @internal ICU 59 technology preview
* @internal ICU 59 technology preview
*/
int32_t lengthDelta() const { return delta; }
+ /**
+ * @return TRUE if there are any change edits
+ * @internal ICU 59 technology preview
+ */
+ UBool hasChanges() const;
+
+ /**
+ * Access to the list of edits.
+ * @see getCoarseIterator
+ * @see getFineIterator
+ * @internal ICU 59 technology preview
+ */
+ struct Iterator final : public UMemory {
+ /**
+ * Advances to the next edit and stores it in changed, oldLength and newLength.
+ * @param errorCode in/out error code; if it is already a failure,
+ * the iterator does nothing and returns FALSE
+ * @return TRUE if there is another edit
+ * @internal ICU 59 technology preview
+ */
+ UBool next(UErrorCode &errorCode);
+
+ /**
+ * TRUE if this edit replaces oldLength units with newLength different ones.
+ * FALSE if oldLength units remain unchanged.
+ * FALSE until next() has returned TRUE for the first time.
+ * @internal ICU 59 technology preview
+ */
+ UBool changed;
+ /**
+ * Number of units in the original string which are replaced or remain unchanged.
+ * 0 until next() has returned TRUE for the first time.
+ * @internal ICU 59 technology preview
+ */
+ int32_t oldLength;
+ /**
+ * Number of units in the modified string, if changed is TRUE.
+ * Same as oldLength if changed is FALSE.
+ * 0 until next() has returned TRUE for the first time.
+ * @internal ICU 59 technology preview
+ */
+ int32_t newLength;
+
+ private:
+ friend class Edits;
+
+ /**
+ * Only Edits creates iterators (see getCoarseIterator/getFineIterator).
+ * The public result fields are initialized as well, so that reading them
+ * before the first successful next() call yields defined values.
+ * The initializers follow the member declaration order.
+ * @param a the array of edit units to read; the pointer is stored, not copied
+ * @param len number of units in a
+ * @param crs TRUE for a coarse iterator that merges adjacent changes
+ */
+ Iterator(const uint16_t *a, int32_t len, UBool crs) :
+ changed(FALSE), oldLength(0), newLength(0),
+ array(a), index(0), length(len), width(0), remaining(0), coarse(crs) {}
+
+ /** Decodes one length of a long-form change; may read trail units and advance index. */
+ int32_t readLength(int32_t head);
+
+ /** The edit units being iterated over. */
+ const uint16_t *array;
+ /** index: position of the next unit to read; length: number of units in array. */
+ int32_t index, length;
+ /**
+ * Fine-grained iteration of a compressed sequence of equal-length changes:
+ * width is the length of each change, remaining is the number of
+ * changes of that sequence that next() still has to report.
+ */
+ int32_t width, remaining;
+ /** TRUE if adjacent changes are merged into one edit. */
+ UBool coarse;
+ };
+
+ /**
+ * Returns an Iterator for coarse-grained changes for simple string updates.
+ * The Iterator is constructed from this object's current array pointer and length.
+ * NOTE(review): presumably it must not be used after this Edits object
+ * is modified or destroyed -- confirm.
+ * @return an Iterator that merges adjacent changes.
+ * @internal ICU 59 technology preview
+ */
+ Iterator getCoarseIterator() const {
+ return Iterator(array, length, TRUE);
+ }
+
+ /**
+ * Returns an Iterator for fine-grained changes for modifying text with metadata.
+ * The Iterator is constructed from this object's current array pointer and length.
+ * NOTE(review): presumably it must not be used after this Edits object
+ * is modified or destroyed -- confirm.
+ * @return an Iterator that separates adjacent changes.
+ * @internal ICU 59 technology preview
+ */
+ Iterator getFineIterator() const {
+ return Iterator(array, length, FALSE);
+ }
private:
Edits(const Edits &) = delete;
int32_t capacity;
int32_t length;
int32_t delta;
+ UBool omit;
UErrorCode errorCode;
uint16_t stackArray[STACK_CAPACITY];
};
*
* @internal ICU 59 technology preview
*/
-#define UCASEMAP_OMIT_UNCHANGED 0x4000
+// TODO: does not work well as an option because we would need to set/reset it on UCaseMaps
+// that are often const, replaced for now by Edits.setWriteUnchanged(UBool)
+// #define UCASEMAP_OMIT_UNCHANGED 0x4000
#endif // U_HIDE_INTERNAL_API
const char *src, int32_t srcLength,
UErrorCode *pErrorCode);
+#if U_SHOW_CPLUSPLUS_API
+
// Not #ifndef U_HIDE_INTERNAL_API because UnicodeString needs the UStringCaseMapper.
/**
* Internal string case mapping function type.
icu::Edits *edits,
UErrorCode *pErrorCode);
+#endif // U_SHOW_CPLUSPLUS_API
#endif
#include "unicode/ustring.h"
#include "unicode/unistr.h"
#include "unicode/uchar.h"
+#include "uassert.h"
#include "uelement.h"
#include "ustr_imp.h"
return *this;
}
- // We need to allocate a new buffer for the internal string case mapping function.
- // This is very similar to how doReplace() keeps the old array pointer
- // and deletes the old array itself after it is done.
- // In addition, we are forcing cloneArrayIfNeeded() to always allocate a new array.
- UChar oldStackBuffer[US_STACKBUF_SIZE];
+ UChar oldBuffer[2 * US_STACKBUF_SIZE];
UChar *oldArray;
- int32_t oldLength;
-
- if(fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) {
- // copy the stack buffer contents because it will be overwritten
- oldArray = oldStackBuffer;
- oldLength = getShortLength();
- u_memcpy(oldStackBuffer, fUnion.fStackFields.fBuffer, oldLength);
+ int32_t oldLength = length();
+ int32_t newLength;
+ UBool writable = isBufferWritable();
+ UErrorCode errorCode = U_ZERO_ERROR;
+
+ // Try to avoid heap-allocating a new character array for this string.
+ if (writable ? oldLength <= UPRV_LENGTHOF(oldBuffer) : oldLength < US_STACKBUF_SIZE) {
+ // Short string: Copy the contents into a temporary buffer and
+ // case-map back into the current array, or into the stack buffer.
+ UChar *buffer = getArrayStart();
+ int32_t capacity;
+ oldArray = oldBuffer;
+ u_memcpy(oldBuffer, buffer, oldLength);
+ if (writable) {
+ capacity = getCapacity();
+ } else {
+ // Switch from the read-only alias or shared heap buffer to the stack buffer.
+ if (!cloneArrayIfNeeded(US_STACKBUF_SIZE, US_STACKBUF_SIZE, /* doCopyArray= */ FALSE)) {
+ return *this;
+ }
+ U_ASSERT(fUnion.fFields.fLengthAndFlags & kUsingStackBuffer);
+ buffer = fUnion.fStackFields.fBuffer;
+ capacity = US_STACKBUF_SIZE;
+ }
+ newLength = stringCaseMapper(csm, buffer, capacity, oldArray, oldLength, NULL, &errorCode);
+ if (U_SUCCESS(errorCode)) {
+ setLength(newLength);
+ return *this;
+ } else if (errorCode == U_BUFFER_OVERFLOW_ERROR) {
+ // common overflow handling below
+ } else {
+ setToBogus();
+ return *this;
+ }
} else {
+ // Longer string or read-only buffer:
+ // Collect only changes and then apply them to this string.
+ // Case mapping often changes only small parts of a string,
+ // and often does not change its length.
oldArray = getArrayStart();
- oldLength = length();
+ Edits edits;
+ edits.setWriteUnchanged(FALSE);
+ UChar replacementChars[200];
+ int32_t replacementLength = stringCaseMapper(
+ csm, replacementChars, UPRV_LENGTHOF(replacementChars),
+ oldArray, oldLength, &edits, &errorCode);
+ UErrorCode editsError = U_ZERO_ERROR;
+ if (edits.setErrorCode(editsError)) {
+ setToBogus();
+ return *this;
+ }
+ newLength = oldLength + edits.lengthDelta();
+ if (U_SUCCESS(errorCode)) {
+ if (!cloneArrayIfNeeded(newLength, newLength)) {
+ return *this;
+ }
+ int32_t index = 0; // index into this string
+ int32_t replIndex = 0; // index into replacementChars
+ for (Edits::Iterator iter = edits.getCoarseIterator(); iter.next(errorCode);) {
+ if (iter.changed) {
+ doReplace(index, iter.oldLength, replacementChars, replIndex, iter.newLength);
+ replIndex += iter.newLength;
+ }
+ index += iter.newLength;
+ }
+ if (U_FAILURE(errorCode)) {
+ setToBogus();
+ }
+ U_ASSERT(replIndex == replacementLength);
+ return *this;
+ } else if (errorCode == U_BUFFER_OVERFLOW_ERROR) {
+ // common overflow handling below
+ } else {
+ setToBogus();
+ return *this;
+ }
}
- int32_t capacity;
- if(oldLength <= US_STACKBUF_SIZE) {
- capacity = US_STACKBUF_SIZE;
- } else {
- capacity = oldLength + 20;
- }
+ // Handle buffer overflow, newLength is known.
+ // We need to allocate a new buffer for the internal string case mapping function.
+ // This is very similar to how doReplace() keeps the old array pointer
+ // and deletes the old array itself after it is done.
+ // In addition, we are forcing cloneArrayIfNeeded() to always allocate a new array.
int32_t *bufferToDelete = 0;
- if(!cloneArrayIfNeeded(capacity, capacity, FALSE, &bufferToDelete, TRUE)) {
+ if (!cloneArrayIfNeeded(newLength, newLength, FALSE, &bufferToDelete, TRUE)) {
return *this;
}
-
- // Case-map, and if the result is too long, then reallocate and repeat.
- UErrorCode errorCode;
- int32_t newLength;
- do {
- errorCode = U_ZERO_ERROR;
- newLength = stringCaseMapper(csm, getArrayStart(), getCapacity(),
- oldArray, oldLength, &errorCode);
- setLength(newLength);
- } while(errorCode==U_BUFFER_OVERFLOW_ERROR && cloneArrayIfNeeded(newLength, newLength, FALSE));
-
+ errorCode = U_ZERO_ERROR;
+ newLength = stringCaseMapper(csm, getArrayStart(), getCapacity(),
+ oldArray, oldLength, NULL, &errorCode);
if (bufferToDelete) {
uprv_free(bufferToDelete);
}
- if(U_FAILURE(errorCode)) {
+ if (U_SUCCESS(errorCode)) {
+ setLength(newLength);
+ } else {
setToBogus();
}
return *this;
unistr_case_internalToTitle(const UCaseMap *csm,
UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
+ icu::Edits *edits,
UErrorCode *pErrorCode) {
ubrk_setText(csm->iter, src, srcLength, pErrorCode);
- return ustrcase_internalToTitle(csm, dest, destCapacity, src, srcLength, pErrorCode);
+ return ustrcase_internalToTitle(csm, dest, destCapacity, src, srcLength, edits, pErrorCode);
}
/*
/*U_CFUNC int8_t
uprv_loadPropsData(UErrorCode *errorCode);*/
+#ifdef __cplusplus
+// TODO: Consider moving these case mapping definitions
+// into a new internal header like ucasemap_imp.h.
+
/*
* Internal string casing functions implementing
* ustring.h/ustrcase.c and UnicodeString case mapping functions.
uint32_t options;
};
-#ifndef __UCASEMAP_H__
-typedef struct UCaseMap UCaseMap;
-#endif
-
#if UCONFIG_NO_BREAK_ITERATION
# define UCASEMAP_INITIALIZER { NULL, { 0 }, 0, 0 }
#else
UTF8CaseMapper *stringCaseMapper,
UErrorCode *pErrorCode);
-#ifdef __cplusplus
-
U_NAMESPACE_BEGIN
namespace GreekUpper {
&csm,
dest, destCapacity,
src, srcLength,
- ustrcase_internalToTitle, pErrorCode);
+ ustrcase_internalToTitle, NULL, pErrorCode);
if(titleIter==NULL && csm.iter!=NULL) {
ubrk_close(csm.iter);
}
}
+// Propagates an error recorded in this Edits object to the caller's error code.
+// Returns TRUE if outErrorCode is a failure when the function returns, FALSE otherwise.
UBool Edits::setErrorCode(UErrorCode &outErrorCode) {
- if(U_FAILURE(outErrorCode)) { return TRUE; }
- if(U_SUCCESS(errorCode)) { return FALSE; }
+ if (U_FAILURE(outErrorCode)) { return TRUE; }  // the caller's code is already a failure: leave it untouched
+ if (U_SUCCESS(errorCode)) { return FALSE; }  // no failure recorded in this object
outErrorCode = errorCode;
return TRUE;
}
+UBool Edits::hasChanges() const {
+ // A nonzero net length delta is reported as a change without scanning the array.
+ UBool found = delta != 0;
+ // Otherwise look for the first unit whose value is above the unchanged-span range.
+ for (int32_t pos = 0; !found && pos < length; ++pos) {
+ found = array[pos] > MAX_UNCHANGED;
+ }
+ return found;
+}
+
+/**
+ * Advances this iterator and stores the next edit in changed, oldLength and newLength.
+ * Adjacent unchanged units are always merged into one edit.
+ * A coarse iterator also merges all adjacent change units into one edit;
+ * a fine iterator reports each change separately and uses width/remaining to split
+ * a sequence of equal-length changes that was compressed into one unit.
+ * @param errorCode in/out: if it is already a failure, nothing is done;
+ * set to U_INDEX_OUTOFBOUNDS_ERROR if a merged length would exceed INT32_MAX
+ * @return TRUE if an edit was stored; FALSE at the end of the array or on error
+ */
+UBool Edits::Iterator::next(UErrorCode &errorCode) {
+ if (U_FAILURE(errorCode)) { return FALSE; }
+ // Always set all relevant public fields: Do not rely on them not having been touched.
+ if (remaining > 0) {
+ // Fine-grained iterator: Continue a sequence of equal-length changes.
+ changed = TRUE;
+ oldLength = newLength = width;
+ --remaining;
+ return TRUE;
+ }
+ if (index >= length) {
+ return FALSE;
+ }
+ int32_t u = array[index++];  // head unit of the next edit
+ if (u <= MAX_UNCHANGED) {
+ // Combine adjacent unchanged ranges.
+ changed = FALSE;
+ oldLength = u + 1;  // a unit value u stands for u + 1 unchanged units
+ while (index < length && (u = array[index]) <= MAX_UNCHANGED) {
+ ++index;  // consume the unit that the loop condition only peeked at
+ if (u >= (INT32_MAX - oldLength)) {  // oldLength + (u + 1) would exceed INT32_MAX
+ errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
+ return FALSE;
+ }
+ oldLength += u + 1;
+ }
+ newLength = oldLength;
+ return TRUE;
+ }
+ changed = TRUE;
+ if (u <= MAX_SHORT_CHANGE) {
+ // Short form: u >> 12 is the length of each change (old == new),
+ // the low 12 bits are the number of such changes minus one.
+ if (coarse) {
+ int32_t w = u >> 12;
+ int32_t len = (u & 0xfff) + 1;
+ oldLength = newLength = w * len;
+ } else {
+ // Split a sequence of equal-length changes that was compressed into one unit.
+ oldLength = newLength = width = u >> 12;
+ remaining = u & 0xfff;
+ return TRUE;
+ }
+ } else {
+ U_ASSERT(u <= 0x7fff);
+ // Long form: bits 11..6 head the old length, bits 5..0 head the new length;
+ // readLength() may consume following trail units.
+ oldLength = readLength((u >> 6) & 0x3f);
+ newLength = readLength(u & 0x3f);
+ if (!coarse) {
+ return TRUE;
+ }
+ }
+ // Combine adjacent changes.
+ while (index < length && (u = array[index]) > MAX_UNCHANGED) {
+ ++index;  // consume the head unit that the loop condition only peeked at
+ if (u <= MAX_SHORT_CHANGE) {
+ int32_t w = u >> 12;
+ int32_t len = (u & 0xfff) + 1;
+ len = w * len;
+ if (len > (INT32_MAX - oldLength) || len > (INT32_MAX - newLength)) {
+ errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
+ return FALSE;
+ }
+ oldLength += len;
+ newLength += len;
+ } else {
+ U_ASSERT(u <= 0x7fff);
+ int32_t oldLen = readLength((u >> 6) & 0x3f);
+ int32_t newLen = readLength(u & 0x3f);
+ if (oldLen > (INT32_MAX - oldLength) || newLen > (INT32_MAX - newLength)) {
+ errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
+ return FALSE;
+ }
+ oldLength += oldLen;
+ newLength += newLen;
+ }
+ }
+ return TRUE;
+}
+
+/**
+ * Decodes one length (old or new) of a long-form change.
+ * Trail units carry 15 payload bits each and have the 0x8000 bit set as a marker,
+ * which must be removed when reading them.
+ * @param head the 6-bit length field from the change's head unit:
+ * below LENGTH_IN_1TRAIL it is the length itself;
+ * below LENGTH_IN_2TRAIL the length is stored in one trail unit;
+ * otherwise its lowest bit supplies bit 30 and two trail units
+ * supply bits 29..15 and 14..0
+ * @return the decoded length; index is advanced past any trail units that were read
+ */
+int32_t Edits::Iterator::readLength(int32_t head) {
+ if (head < LENGTH_IN_1TRAIL) {
+ return head;
+ } else if (head < LENGTH_IN_2TRAIL) {
+ U_ASSERT(index < length);
+ U_ASSERT(array[index] >= 0x8000);
+ // Strip the 0x8000 marker bit, as the two-trail branch below does;
+ // returning the raw unit would make the length 0x8000 too large.
+ return array[index++] & 0x7fff;
+ } else {
+ U_ASSERT((index + 2) <= length);
+ U_ASSERT(array[index] >= 0x8000);
+ U_ASSERT(array[index + 1] >= 0x8000);
+ int32_t len = ((head & 1) << 30) |
+ ((int32_t)(array[index] & 0x7fff) << 15) |
+ (array[index + 1] & 0x7fff);
+ index += 2;
+ return len;
+ }
+}
+
U_NAMESPACE_END
U_NAMESPACE_USE
static inline int32_t
appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity,
int32_t result, const UChar *s,
- uint32_t options, int32_t cpLength, icu::Edits *edits) {
+ int32_t cpLength, icu::Edits *edits) {
UChar32 c;
int32_t length;
/* (not) original code point */
if(edits!=NULL) {
edits->addUnchanged(cpLength);
- }
- if(options & UCASEMAP_OMIT_UNCHANGED) {
- return destIndex;
+ if(edits->omitUnchanged()) {
+ return destIndex;
+ }
}
c=~result;
if(destIndex<destCapacity && c<=0xffff) { // BMP slightly-fastpath
appendUnchanged(UChar *dest, int32_t destIndex, int32_t destCapacity,
const UChar *s, int32_t length, icu::Edits *edits) {
if(length>0) {
+ if(edits!=NULL) {
+ edits->addUnchanged(length);
+ if(edits->omitUnchanged()) {
+ return destIndex;
+ }
+ }
if(length>(INT32_MAX-destIndex)) {
return -1; // integer overflow
}
u_memcpy(dest+destIndex, s, length);
}
destIndex+=length;
- if(edits!=NULL) {
- edits->addUnchanged(length);
- }
}
return destIndex;
}
const UChar *s;
c=map(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &locCache);
destIndex = appendResult(dest, destIndex, destCapacity, c, s,
- csm->options, srcIndex - cpStart, edits);
+ srcIndex - cpStart, edits);
if (destIndex < 0) {
*pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
c=ucase_toFullTitle(csm->csp, c, utf16_caseContextIterator, &csc, &s,
csm->locale, &locCache);
destIndex=appendResult(dest, destIndex, destCapacity, c, s,
- csm->options, titleLimit-titleStart, edits);
+ titleLimit-titleStart, edits);
if(destIndex<0) {
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
}
UBool change;
- if ((csm->options & UCASEMAP_OMIT_UNCHANGED) == 0 && edits == NULL) {
+ if (edits == NULL) {
change = TRUE; // common, simple usage
} else {
// Find out first whether we are changing the text.
edits->addUnchanged(oldLength);
}
// Write unchanged text?
- change |= (csm->options & UCASEMAP_OMIT_UNCHANGED) == 0;
+ change = edits->writeUnchanged();
}
}
const UChar *s;
c=ucase_toFullUpper(csm->csp, c, NULL, NULL, &s, csm->locale, &locCache);
destIndex = appendResult(dest, destIndex, destCapacity, c, s,
- csm->options, nextIndex - i, edits);
+ nextIndex - i, edits);
if (destIndex < 0) {
*pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
const UChar *s;
c = ucase_toFullFolding(csm->csp, c, &s, csm->options);
destIndex = appendResult(dest, destIndex, destCapacity, c, s,
- csm->options, srcIndex - cpStart, edits);
+ srcIndex - cpStart, edits);
if (destIndex < 0) {
*pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
return 0;