UBool next(UErrorCode &errorCode);
/**
- * TRUE if this edit replaces oldLength units with newLength different ones.
- * FALSE if oldLength units remain unchanged.
+ * Finds the edit that contains the source index.
+ * The source index may be found in a non-change
+ * even if normal iteration would skip non-changes.
+ * Normal iteration can continue from a found edit.
+ *
+ * The iterator state before this search logically does not matter.
+ * (It may affect the performance of the search.)
+ *
+ * The iterator state after this search is undefined
+ * if the source index is out of bounds for the source string.
+ *
+ * @param i source index
+ * @return TRUE if the edit for the source index was found
* @internal ICU 59 technology preview
*/
- UBool changed;
+ UBool findSourceIndex(int32_t i, UErrorCode &errorCode);
+
+ /**
+ * @return TRUE if this edit replaces oldLength() units with newLength() different ones.
+ * FALSE if oldLength units remain unchanged.
+ * @internal ICU 59 technology preview
+ */
+ UBool hasChange() const { return changed; }
/**
- * Number of units in the original string which are replaced or remain unchanged.
+ * @return the number of units in the original string which are replaced or remain unchanged.
* @internal ICU 59 technology preview
*/
- int32_t oldLength;
+ int32_t oldLength() const { return oldLength_; }
/**
- * Number of units in the modified string, if changed is TRUE.
- * Same as oldLength if changed is FALSE.
+ * @return the number of units in the modified string, if hasChange() is TRUE.
+ * Same as oldLength if hasChange() is FALSE.
* @internal ICU 59 technology preview
*/
- int32_t newLength;
+ int32_t newLength() const { return newLength_; }
+
+ /**
+ * @return the current index into the source string
+ * @internal ICU 59 technology preview
+ */
+ int32_t sourceIndex() const { return srcIndex; }
+ /**
+ * @return the current index into the replacement-characters-only string,
+ * not counting unchanged spans
+ * @internal ICU 59 technology preview
+ */
+ int32_t replacementIndex() const { return replIndex; }
+ /**
+ * @return the current index into the full destination string
+ * @internal ICU 59 technology preview
+ */
+ int32_t destinationIndex() const { return destIndex; }
private:
friend class Edits;
- Iterator(const uint16_t *a, int32_t len, UBool crs) :
- array(a), index(0), length(len), width(0), remaining(0), coarse(crs) {}
+ Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs);
int32_t readLength(int32_t head);
+ void updateIndexes();
+ UBool noNext();
const uint16_t *array;
int32_t index, length;
- int32_t width, remaining;
- UBool coarse;
+ int32_t remaining;
+ UBool onlyChanges, coarse;
+
+ UBool changed;
+ int32_t oldLength_, newLength_;
+ int32_t srcIndex, replIndex, destIndex;
};
/**
* Returns an Iterator for coarse-grained changes for simple string updates.
+ * Skips non-changes.
+ * @return an Iterator that merges adjacent changes.
+ * @internal ICU 59 technology preview
+ */
+ Iterator getCoarseChangesIterator() const {
+ return Iterator(array, length, TRUE, TRUE);
+ }
+
+ /**
+ * Returns an Iterator for coarse-grained changes and non-changes for simple string updates.
* @return an Iterator that merges adjacent changes.
* @internal ICU 59 technology preview
*/
Iterator getCoarseIterator() const {
- return Iterator(array, length, TRUE);
+ return Iterator(array, length, FALSE, TRUE);
+ }
+
+ /**
+ * Returns an Iterator for fine-grained changes for modifying styled text.
+ * Skips non-changes.
+ * @return an Iterator that separates adjacent changes.
+ * @internal ICU 59 technology preview
+ */
+ Iterator getFineChangesIterator() const {
+ return Iterator(array, length, TRUE, FALSE);
}
/**
- * Returns an Iterator for fine-grained changes for modifying text with metadata.
+ * Returns an Iterator for fine-grained changes and non-changes for modifying styled text.
* @return an Iterator that separates adjacent changes.
* @internal ICU 59 technology preview
*/
Iterator getFineIterator() const {
- return Iterator(array, length, FALSE);
+ return Iterator(array, length, FALSE, FALSE);
}
private:
*/
#define U_TITLECASE_NO_BREAK_ADJUSTMENT 0x200
+#if U_SHOW_CPLUSPLUS_API
+#ifndef U_HIDE_INTERNAL_API
+
+/**
+ * Lowercases the characters in a UTF-16 string and optionally records edits.
+ * Casing is locale-dependent and context-sensitive.
+ * The result may be longer or shorter than the original.
+ * The source string and the destination buffer must not overlap.
+ *
+ * @param csm UCaseMap service object.
+ * @param dest A buffer for the result string. The result will be NUL-terminated if
+ * the buffer is large enough.
+ * The contents is undefined in case of failure.
+ * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
+ * dest may be NULL and the function will only return the length of the result
+ * without writing any of the result string.
+ * @param src The original string.
+ * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
+ * @param edits Records edits for index mapping, working with styled text,
+ * and getting only changes (if any). Can be NULL.
+ * @param pErrorCode Must be a valid pointer to an error code value,
+ * which must not indicate a failure before the function call.
+ * @return The length of the result string, if successful - or in case of a buffer overflow,
+ * in which case it will be greater than destCapacity.
+ *
+ * @see u_strToLower
+ * @internal ICU 59 technology preview
+ */
+U_CAPI int32_t U_EXPORT2
+ucasemap_toLowerWithEdits(const UCaseMap *csm,
+ UChar *dest, int32_t destCapacity,
+ const UChar *src, int32_t srcLength,
+ icu::Edits *edits,
+ UErrorCode *pErrorCode);
+
+/**
+ * Uppercases the characters in a UTF-16 string and optionally records edits.
+ * Casing is locale-dependent and context-sensitive.
+ * The result may be longer or shorter than the original.
+ * The source string and the destination buffer must not overlap.
+ *
+ * @param csm UCaseMap service object.
+ * @param dest A buffer for the result string. The result will be NUL-terminated if
+ * the buffer is large enough.
+ * The contents is undefined in case of failure.
+ * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
+ * dest may be NULL and the function will only return the length of the result
+ * without writing any of the result string.
+ * @param src The original string.
+ * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
+ * @param edits Records edits for index mapping, working with styled text,
+ * and getting only changes (if any). Can be NULL.
+ * @param pErrorCode Must be a valid pointer to an error code value,
+ * which must not indicate a failure before the function call.
+ * @return The length of the result string, if successful - or in case of a buffer overflow,
+ * in which case it will be greater than destCapacity.
+ *
+ * @see u_strToLower
+ * @internal ICU 59 technology preview
+ */
+U_CAPI int32_t U_EXPORT2
+ucasemap_toUpperWithEdits(const UCaseMap *csm,
+ UChar *dest, int32_t destCapacity,
+ const UChar *src, int32_t srcLength,
+ icu::Edits *edits,
+ UErrorCode *pErrorCode);
+
+#endif // U_HIDE_INTERNAL_API
+#endif // U_SHOW_CPLUSPLUS_API
+
#if !UCONFIG_NO_BREAK_ITERATION
/**
Edits edits;
edits.setWriteUnchanged(FALSE);
UChar replacementChars[200];
- int32_t replacementLength = stringCaseMapper(
- csm, replacementChars, UPRV_LENGTHOF(replacementChars),
- oldArray, oldLength, &edits, &errorCode);
+ stringCaseMapper(csm, replacementChars, UPRV_LENGTHOF(replacementChars),
+ oldArray, oldLength, &edits, &errorCode);
UErrorCode editsError = U_ZERO_ERROR;
if (edits.setErrorCode(editsError)) {
setToBogus();
}
newLength = oldLength + edits.lengthDelta();
if (U_SUCCESS(errorCode)) {
- if (!cloneArrayIfNeeded(newLength, newLength)) {
+ // Grow the buffer at most once, not for multiple doReplace() calls.
+ if (newLength > oldLength && !cloneArrayIfNeeded(newLength, newLength)) {
return *this;
}
- int32_t index = 0; // index into this string
- int32_t replIndex = 0; // index into replacementChars
- for (Edits::Iterator iter = edits.getCoarseIterator(); iter.next(errorCode);) {
- if (iter.changed) {
- doReplace(index, iter.oldLength, replacementChars, replIndex, iter.newLength);
- replIndex += iter.newLength;
- }
- index += iter.newLength;
+ for (Edits::Iterator iter = edits.getCoarseChangesIterator(); iter.next(errorCode);) {
+ doReplace(iter.destinationIndex(), iter.oldLength(),
+ replacementChars, iter.replacementIndex(), iter.newLength());
}
if (U_FAILURE(errorCode)) {
setToBogus();
}
- U_ASSERT(replIndex == replacementLength);
return *this;
} else if (errorCode == U_BUFFER_OVERFLOW_ERROR) {
// common overflow handling below
UErrorCode *pErrorCode);
/**
- * Implements argument checking and buffer handling
- * for string case mapping as a common function.
+ * Common string case mapping implementation for ucasemap_toXyz() and UnicodeString::toXyz().
+ * Implements argument checking.
*/
U_CFUNC int32_t
ustrcase_map(const UCaseMap *csm,
icu::Edits *edits,
UErrorCode *pErrorCode);
+/**
+ * Common string case mapping implementation for old-fashioned u_strToXyz() functions
+ * that allow the source string to overlap the destination buffer.
+ * Implements argument checking and internally works with an intermediate buffer if necessary.
+ */
+U_CFUNC int32_t
+ustrcase_mapWithOverlap(const UCaseMap *csm,
+ UChar *dest, int32_t destCapacity,
+ const UChar *src, int32_t srcLength,
+ UStringCaseMapper *stringCaseMapper,
+ UErrorCode *pErrorCode);
+
/**
* UTF-8 string case mapping function type, used by ucasemap_mapUTF8().
* UTF-8 version of UStringCaseMapper.
} else {
csm.iter=ubrk_open(UBRK_WORD, csm.locale, src, srcLength, pErrorCode);
}
- int32_t length=ustrcase_map(
+ int32_t length=ustrcase_mapWithOverlap(
&csm,
dest, destCapacity,
src, srcLength,
- ustrcase_internalToTitle, NULL, pErrorCode);
+ ustrcase_internalToTitle, pErrorCode);
if(titleIter==NULL && csm.iter!=NULL) {
ubrk_close(csm.iter);
}
return FALSE;
}
+Edits::Iterator::Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs) :
+ array(a), index(0), length(len), remaining(0),
+ onlyChanges(oc), coarse(crs),
+ changed(FALSE), oldLength_(0), newLength_(0),
+ srcIndex(0), replIndex(0), destIndex(0) {}
+
+int32_t Edits::Iterator::readLength(int32_t head) {
+ if (head < LENGTH_IN_1TRAIL) {
+ return head;
+ } else if (head < LENGTH_IN_2TRAIL) {
+ U_ASSERT(index < length);
+ U_ASSERT(array[index] >= 0x8000);
+ return array[index++];
+ } else {
+ U_ASSERT((index + 2) <= length);
+ U_ASSERT(array[index] >= 0x8000);
+ U_ASSERT(array[index + 1] >= 0x8000);
+ int32_t len = ((head & 1) << 30) |
+ ((int32_t)(array[index] & 0x7fff) << 15) |
+ (array[index + 1] & 0x7fff);
+ index += 2;
+ return len;
+ }
+}
+
+void Edits::Iterator::updateIndexes() {
+ srcIndex += oldLength_;
+ if (changed) {
+ replIndex += newLength_;
+ }
+ destIndex += newLength_;
+}
+
+UBool Edits::Iterator::noNext() {
+ // Empty span beyond the string.
+ oldLength_ = newLength_ = 0;
+ return FALSE;
+}
+
UBool Edits::Iterator::next(UErrorCode &errorCode) {
if (U_FAILURE(errorCode)) { return FALSE; }
- // Always set all relevant public fields: Do not rely on them not having been touched.
+ // We have an errorCode in case we need to start guarding against integer overflows.
+ // It is also convenient for caller loops if we bail out when an error was set elsewhere.
+ updateIndexes();
if (remaining > 0) {
// Fine-grained iterator: Continue a sequence of equal-length changes.
- changed = TRUE;
- oldLength = newLength = width;
--remaining;
return TRUE;
}
if (index >= length) {
- return FALSE;
+ return noNext();
}
int32_t u = array[index++];
if (u <= MAX_UNCHANGED) {
// Combine adjacent unchanged ranges.
changed = FALSE;
- oldLength = u + 1;
+ oldLength_ = u + 1;
while (index < length && (u = array[index]) <= MAX_UNCHANGED) {
++index;
- if (u >= (INT32_MAX - oldLength)) {
- errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
- return FALSE;
+ oldLength_ += u + 1;
+ }
+ newLength_ = oldLength_;
+ if (onlyChanges) {
+ updateIndexes();
+ if (index >= length) {
+ return noNext();
}
- oldLength += u + 1;
+ // already fetched u > MAX_UNCHANGED at index
+ ++index;
+ } else {
+ return TRUE;
}
- newLength = oldLength;
- return TRUE;
}
changed = TRUE;
if (u <= MAX_SHORT_CHANGE) {
if (coarse) {
int32_t w = u >> 12;
int32_t len = (u & 0xfff) + 1;
- oldLength = newLength = w * len;
+ oldLength_ = newLength_ = len * w;
} else {
// Split a sequence of equal-length changes that was compressed into one unit.
- oldLength = newLength = width = u >> 12;
+ oldLength_ = newLength_ = u >> 12;
remaining = u & 0xfff;
return TRUE;
}
} else {
U_ASSERT(u <= 0x7fff);
- oldLength = readLength((u >> 6) & 0x3f);
- newLength = readLength(u & 0x3f);
+ oldLength_ = readLength((u >> 6) & 0x3f);
+ newLength_ = readLength(u & 0x3f);
if (!coarse) {
return TRUE;
}
if (u <= MAX_SHORT_CHANGE) {
int32_t w = u >> 12;
int32_t len = (u & 0xfff) + 1;
- len = w * len;
- if (len > (INT32_MAX - oldLength) || len > (INT32_MAX - newLength)) {
- errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
- return FALSE;
- }
- oldLength += len;
- newLength += len;
+ len = len * w;
+ oldLength_ += len;
+ newLength_ += len;
} else {
U_ASSERT(u <= 0x7fff);
int32_t oldLen = readLength((u >> 6) & 0x3f);
int32_t newLen = readLength(u & 0x3f);
- if (oldLen > (INT32_MAX - oldLength) || newLen > (INT32_MAX - newLength)) {
- errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
- return FALSE;
- }
- oldLength += oldLen;
- newLength += newLen;
+ oldLength_ += oldLen;
+ newLength_ += newLen;
}
}
return TRUE;
}
-int32_t Edits::Iterator::readLength(int32_t head) {
- if (head < LENGTH_IN_1TRAIL) {
- return head;
- } else if (head < LENGTH_IN_2TRAIL) {
- U_ASSERT(index < length);
- U_ASSERT(array[index] >= 0x8000);
- return array[index++];
- } else {
- U_ASSERT((index + 2) <= length);
- U_ASSERT(array[index] >= 0x8000);
- U_ASSERT(array[index + 1] >= 0x8000);
- int32_t len = ((head & 1) << 30) |
- ((int32_t)(array[index] & 0x7fff) << 15) |
- (array[index + 1] & 0x7fff);
- index += 2;
- return len;
+UBool Edits::Iterator::findSourceIndex(int32_t i, UErrorCode &errorCode) {
+ if (U_FAILURE(errorCode) || i < 0) { return FALSE; }
+ if (i < srcIndex) {
+ // Reset the iterator to the start.
+ index = remaining = srcIndex = replIndex = destIndex = 0;
+ } else if (i < (srcIndex + oldLength_)) {
+ // The index is in the current span.
+ return TRUE;
}
+ while (next(errorCode)) {
+ if (i < (srcIndex + oldLength_)) {
+ // The index is in the current span.
+ return TRUE;
+ }
+ if (remaining > 0) {
+ // Is the index in one of the remaining compressed edits?
+ // srcIndex is the start of the current span, before the remaining ones.
+ int32_t len = (remaining + 1) * oldLength_;
+ if (i < (srcIndex + len)) {
+ int32_t n = (i - srcIndex) / oldLength_; // 1 <= n <= remaining
+ len = n * oldLength_;
+ srcIndex += len;
+ replIndex += len;
+ destIndex += len;
+ remaining -= n;
+ return TRUE;
+ }
+ // Make next() skip all of these edits at once.
+ oldLength_ = newLength_ = len;
+ remaining = 0;
+ }
+ }
+ return FALSE;
}
U_NAMESPACE_END
UStringCaseMapper *stringCaseMapper,
icu::Edits *edits,
UErrorCode *pErrorCode) {
+ int32_t destLength;
+
+ /* check argument values */
+ if(U_FAILURE(*pErrorCode)) {
+ return 0;
+ }
+ if( destCapacity<0 ||
+ (dest==NULL && destCapacity>0) ||
+ src==NULL ||
+ srcLength<-1
+ ) {
+ *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+ return 0;
+ }
+
+ /* get the string length */
+ if(srcLength==-1) {
+ srcLength=u_strlen(src);
+ }
+
+ /* check for overlapping source and destination */
+ if( dest!=NULL &&
+ ((src>=dest && src<(dest+destCapacity)) ||
+ (dest>=src && dest<(src+srcLength)))
+ ) {
+ *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+ return 0;
+ }
+
+ destLength=stringCaseMapper(csm, dest, destCapacity, src, srcLength, edits, pErrorCode);
+ return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
+}
+
+U_CFUNC int32_t
+ustrcase_mapWithOverlap(const UCaseMap *csm,
+ UChar *dest, int32_t destCapacity,
+ const UChar *src, int32_t srcLength,
+ UStringCaseMapper *stringCaseMapper,
+ UErrorCode *pErrorCode) {
UChar buffer[300];
UChar *temp;
temp=dest;
}
- destLength=stringCaseMapper(csm, temp, destCapacity, src, srcLength, edits, pErrorCode);
+ destLength=stringCaseMapper(csm, temp, destCapacity, src, srcLength, NULL, pErrorCode);
if(temp!=dest) {
/* copy the result string to the destination buffer */
if (U_SUCCESS(*pErrorCode) && 0 < destLength && destLength <= destCapacity) {
UCaseMap csm=UCASEMAP_INITIALIZER;
csm.csp=ucase_getSingleton();
csm.options=options;
- return ustrcase_map(
+ return ustrcase_mapWithOverlap(
&csm,
dest, destCapacity,
src, srcLength,
- ustrcase_internalFold, NULL, pErrorCode);
+ ustrcase_internalFold, pErrorCode);
+}
+
+U_CAPI int32_t U_EXPORT2
+ucasemap_toLowerWithEdits(const UCaseMap *csm,
+ UChar *dest, int32_t destCapacity,
+ const UChar *src, int32_t srcLength,
+ icu::Edits *edits,
+ UErrorCode *pErrorCode) {
+ return ustrcase_map(
+ csm,
+ dest, destCapacity,
+ src, srcLength,
+ ustrcase_internalToLower, edits, pErrorCode);
+}
+
+U_CAPI int32_t U_EXPORT2
+ucasemap_toUpperWithEdits(const UCaseMap *csm,
+ UChar *dest, int32_t destCapacity,
+ const UChar *src, int32_t srcLength,
+ icu::Edits *edits,
+ UErrorCode *pErrorCode) {
+ return ustrcase_map(
+ csm,
+ dest, destCapacity,
+ src, srcLength,
+ ustrcase_internalToUpper, edits, pErrorCode);
}
/* case-insensitive string comparisons -------------------------------------- */
UErrorCode *pErrorCode) {
UCaseMap csm=UCASEMAP_INITIALIZER;
setTempCaseMap(&csm, locale);
- return ustrcase_map(
+ return ustrcase_mapWithOverlap(
&csm,
dest, destCapacity,
src, srcLength,
- ustrcase_internalToLower, NULL, pErrorCode);
+ ustrcase_internalToLower, pErrorCode);
}
U_CAPI int32_t U_EXPORT2
UErrorCode *pErrorCode) {
UCaseMap csm=UCASEMAP_INITIALIZER;
setTempCaseMap(&csm, locale);
- return ustrcase_map(
+ return ustrcase_mapWithOverlap(
&csm,
dest, destCapacity,
src, srcLength,
- ustrcase_internalToUpper, NULL, pErrorCode);
+ ustrcase_internalToUpper, pErrorCode);
}