From 00f2e12b65ea6179569abbd73534f9aba0b7cb3d Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Fri, 6 Jan 2017 00:20:31 +0000 Subject: [PATCH] ICU-12410 Edits::Iterator getters not fields, tracks indexes, findSourceIndex(), skip-unchanged iterators, ucasemap_toLowerWithEdits() & ucasemap_toUpperWithEdits() X-SVN-Rev: 39548 --- icu4c/source/common/unicode/ucasemap.h | 160 +++++++++++-- icu4c/source/common/unistr_case.cpp | 20 +- icu4c/source/common/ustr_imp.h | 16 +- .../source/common/ustr_titlecase_brkiter.cpp | 4 +- icu4c/source/common/ustrcase.cpp | 211 ++++++++++++++---- icu4c/source/common/ustrcase_locale.cpp | 8 +- 6 files changed, 336 insertions(+), 83 deletions(-) diff --git a/icu4c/source/common/unicode/ucasemap.h b/icu4c/source/common/unicode/ucasemap.h index 25a1af25dd5..2bff9ee1404 100644 --- a/icu4c/source/common/unicode/ucasemap.h +++ b/icu4c/source/common/unicode/ucasemap.h @@ -197,53 +197,113 @@ public: UBool next(UErrorCode &errorCode); /** - * TRUE if this edit replaces oldLength units with newLength different ones. - * FALSE if oldLength units remain unchanged. + * Finds the edit that contains the source index. + * The source index may be found in a non-change + * even if normal iteration would skip non-changes. + * Normal iteration can continue from a found edit. + * + * The iterator state before this search logically does not matter. + * (It may affect the performance of the search.) + * + * The iterator state after this search is undefined + * if the source index is out of bounds for the source string. + * + * @param i source index + * @return TRUE if the edit for the source index was found * @internal ICU 59 technology preview */ - UBool changed; + UBool findSourceIndex(int32_t i, UErrorCode &errorCode); + + /** + * @return TRUE if this edit replaces oldLength() units with newLength() different ones. + * FALSE if oldLength units remain unchanged. + * @internal ICU 59 technology preview + */ + UBool hasChange() const { return changed; } /** - * Number of units in the original string which are replaced or remain unchanged. + * @return the number of units in the original string which are replaced or remain unchanged. * @internal ICU 59 technology preview */ - int32_t oldLength; + int32_t oldLength() const { return oldLength_; } /** - * Number of units in the modified string, if changed is TRUE. - * Same as oldLength if changed is FALSE. + * @return the number of units in the modified string, if hasChange() is TRUE. + * Same as oldLength if hasChange() is FALSE. * @internal ICU 59 technology preview */ - int32_t newLength; + int32_t newLength() const { return newLength_; } + + /** + * @return the current index into the source string + * @internal ICU 59 technology preview + */ + int32_t sourceIndex() const { return srcIndex; } + /** + * @return the current index into the replacement-characters-only string, + * not counting unchanged spans + * @internal ICU 59 technology preview + */ + int32_t replacementIndex() const { return replIndex; } + /** + * @return the current index into the full destination string + * @internal ICU 59 technology preview + */ + int32_t destinationIndex() const { return destIndex; } private: friend class Edits; - Iterator(const uint16_t *a, int32_t len, UBool crs) : - array(a), index(0), length(len), width(0), remaining(0), coarse(crs) {} + Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs); int32_t readLength(int32_t head); + void updateIndexes(); + UBool noNext(); const uint16_t *array; int32_t index, length; - int32_t width, remaining; - UBool coarse; + int32_t remaining; + UBool onlyChanges, coarse; + + UBool changed; + int32_t oldLength_, newLength_; + int32_t srcIndex, replIndex, destIndex; }; /** * Returns an Iterator for coarse-grained changes for simple string updates. + * Skips non-changes. + * @return an Iterator that merges adjacent changes. + * @internal ICU 59 technology preview + */ + Iterator getCoarseChangesIterator() const { + return Iterator(array, length, TRUE, TRUE); + } + + /** + * Returns an Iterator for coarse-grained changes and non-changes for simple string updates. * @return an Iterator that merges adjacent changes. * @internal ICU 59 technology preview */ Iterator getCoarseIterator() const { - return Iterator(array, length, TRUE); + return Iterator(array, length, FALSE, TRUE); + } + + /** + * Returns an Iterator for fine-grained changes for modifying styled text. + * Skips non-changes. + * @return an Iterator that separates adjacent changes. + * @internal ICU 59 technology preview + */ + Iterator getFineChangesIterator() const { + return Iterator(array, length, TRUE, FALSE); } /** - * Returns an Iterator for fine-grained changes for modifying text with metadata. + * Returns an Iterator for fine-grained changes and non-changes for modifying styled text. * @return an Iterator that separates adjacent changes. * @internal ICU 59 technology preview */ Iterator getFineIterator() const { - return Iterator(array, length, FALSE); + return Iterator(array, length, FALSE, FALSE); } private: @@ -383,6 +443,76 @@ ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode); */ #define U_TITLECASE_NO_BREAK_ADJUSTMENT 0x200 +#if U_SHOW_CPLUSPLUS_API +#ifndef U_HIDE_INTERNAL_API + +/** + * Lowercases the characters in a UTF-16 string and optionally records edits. + * Casing is locale-dependent and context-sensitive. + * The result may be longer or shorter than the original. + * The source string and the destination buffer must not overlap. + * + * @param csm UCaseMap service object. + * @param dest A buffer for the result string. The result will be NUL-terminated if + * the buffer is large enough. + * The contents is undefined in case of failure. + * @param destCapacity The size of the buffer (number of bytes). If it is 0, then + * dest may be NULL and the function will only return the length of the result + * without writing any of the result string. + * @param src The original string. + * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. + * @param edits Records edits for index mapping, working with styled text, + * and getting only changes (if any). Can be NULL. + * @param pErrorCode Must be a valid pointer to an error code value, + * which must not indicate a failure before the function call. + * @return The length of the result string, if successful - or in case of a buffer overflow, + * in which case it will be greater than destCapacity. + * + * @see u_strToLower + * @internal ICU 59 technology preview + */ +U_CAPI int32_t U_EXPORT2 +ucasemap_toLowerWithEdits(const UCaseMap *csm, + UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + icu::Edits *edits, + UErrorCode *pErrorCode); + +/** + * Uppercases the characters in a UTF-16 string and optionally records edits. + * Casing is locale-dependent and context-sensitive. + * The result may be longer or shorter than the original. + * The source string and the destination buffer must not overlap. + * + * @param csm UCaseMap service object. + * @param dest A buffer for the result string. The result will be NUL-terminated if + * the buffer is large enough. + * The contents is undefined in case of failure. + * @param destCapacity The size of the buffer (number of bytes). If it is 0, then + * dest may be NULL and the function will only return the length of the result + * without writing any of the result string. + * @param src The original string. + * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. + * @param edits Records edits for index mapping, working with styled text, + * and getting only changes (if any). Can be NULL. + * @param pErrorCode Must be a valid pointer to an error code value, + * which must not indicate a failure before the function call. + * @return The length of the result string, if successful - or in case of a buffer overflow, + * in which case it will be greater than destCapacity. + * + * @see u_strToLower + * @internal ICU 59 technology preview + */ +U_CAPI int32_t U_EXPORT2 +ucasemap_toUpperWithEdits(const UCaseMap *csm, + UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + icu::Edits *edits, + UErrorCode *pErrorCode); + +#endif // U_HIDE_INTERNAL_API +#endif // U_SHOW_CPLUSPLUS_API + #if !UCONFIG_NO_BREAK_ITERATION /** diff --git a/icu4c/source/common/unistr_case.cpp b/icu4c/source/common/unistr_case.cpp index 32fb20e87e6..e9771d1ded5 100644 --- a/icu4c/source/common/unistr_case.cpp +++ b/icu4c/source/common/unistr_case.cpp @@ -140,9 +140,8 @@ UnicodeString::caseMap(const UCaseMap *csm, Edits edits; edits.setWriteUnchanged(FALSE); UChar replacementChars[200]; - int32_t replacementLength = stringCaseMapper( - csm, replacementChars, UPRV_LENGTHOF(replacementChars), - oldArray, oldLength, &edits, &errorCode); + stringCaseMapper(csm, replacementChars, UPRV_LENGTHOF(replacementChars), + oldArray, oldLength, &edits, &errorCode); UErrorCode editsError = U_ZERO_ERROR; if (edits.setErrorCode(editsError)) { setToBogus(); @@ -150,22 +149,17 @@ UnicodeString::caseMap(const UCaseMap *csm, } newLength = oldLength + edits.lengthDelta(); if (U_SUCCESS(errorCode)) { - if (!cloneArrayIfNeeded(newLength, newLength)) { + // Grow the buffer at most once, not for multiple doReplace() calls. + if (newLength > oldLength && !cloneArrayIfNeeded(newLength, newLength)) { return *this; } - int32_t index = 0; // index into this string - int32_t replIndex = 0; // index into replacementChars - for (Edits::Iterator iter = edits.getCoarseIterator(); iter.next(errorCode);) { - if (iter.changed) { - doReplace(index, iter.oldLength, replacementChars, replIndex, iter.newLength); - replIndex += iter.newLength; - } - index += iter.newLength; + for (Edits::Iterator iter = edits.getCoarseChangesIterator(); iter.next(errorCode);) { + doReplace(iter.destinationIndex(), iter.oldLength(), + replacementChars, iter.replacementIndex(), iter.newLength()); } if (U_FAILURE(errorCode)) { setToBogus(); } - U_ASSERT(replIndex == replacementLength); return *this; } else if (errorCode == U_BUFFER_OVERFLOW_ERROR) { // common overflow handling below diff --git a/icu4c/source/common/ustr_imp.h b/icu4c/source/common/ustr_imp.h index 544d05c3768..28bdca1e001 100644 --- a/icu4c/source/common/ustr_imp.h +++ b/icu4c/source/common/ustr_imp.h @@ -167,8 +167,8 @@ ustrcase_internalFold(const UCaseMap *csm, UErrorCode *pErrorCode); /** - * Implements argument checking and buffer handling - * for string case mapping as a common function. + * Common string case mapping implementation for ucasemap_toXyz() and UnicodeString::toXyz(). + * Implements argument checking. */ U_CFUNC int32_t ustrcase_map(const UCaseMap *csm, @@ -178,6 +178,18 @@ ustrcase_map(const UCaseMap *csm, icu::Edits *edits, UErrorCode *pErrorCode); +/** + * Common string case mapping implementation for old-fashioned u_strToXyz() functions + * that allow the source string to overlap the destination buffer. + * Implements argument checking and internally works with an intermediate buffer if necessary. + */ +U_CFUNC int32_t +ustrcase_mapWithOverlap(const UCaseMap *csm, + UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + UStringCaseMapper *stringCaseMapper, + UErrorCode *pErrorCode); + /** * UTF-8 string case mapping function type, used by ucasemap_mapUTF8(). * UTF-8 version of UStringCaseMapper. diff --git a/icu4c/source/common/ustr_titlecase_brkiter.cpp b/icu4c/source/common/ustr_titlecase_brkiter.cpp index d5e5a2c2415..463ba2c546f 100644 --- a/icu4c/source/common/ustr_titlecase_brkiter.cpp +++ b/icu4c/source/common/ustr_titlecase_brkiter.cpp @@ -62,11 +62,11 @@ u_strToTitle(UChar *dest, int32_t destCapacity, } else { csm.iter=ubrk_open(UBRK_WORD, csm.locale, src, srcLength, pErrorCode); } - int32_t length=ustrcase_map( + int32_t length=ustrcase_mapWithOverlap( &csm, dest, destCapacity, src, srcLength, - ustrcase_internalToTitle, NULL, pErrorCode); + ustrcase_internalToTitle, pErrorCode); if(titleIter==NULL && csm.iter!=NULL) { ubrk_close(csm.iter); } diff --git a/icu4c/source/common/ustrcase.cpp b/icu4c/source/common/ustrcase.cpp index c833345788a..35b5d6370da 100644 --- a/icu4c/source/common/ustrcase.cpp +++ b/icu4c/source/common/ustrcase.cpp @@ -226,51 +226,95 @@ UBool Edits::hasChanges() const { return FALSE; } +Edits::Iterator::Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs) : + array(a), index(0), length(len), remaining(0), + onlyChanges(oc), coarse(crs), + changed(FALSE), oldLength_(0), newLength_(0), + srcIndex(0), replIndex(0), destIndex(0) {} + +int32_t Edits::Iterator::readLength(int32_t head) { + if (head < LENGTH_IN_1TRAIL) { + return head; + } else if (head < LENGTH_IN_2TRAIL) { + U_ASSERT(index < length); + U_ASSERT(array[index] >= 0x8000); + return array[index++]; + } else { + U_ASSERT((index + 2) <= length); + U_ASSERT(array[index] >= 0x8000); + U_ASSERT(array[index + 1] >= 0x8000); + int32_t len = ((head & 1) << 30) | + ((int32_t)(array[index] & 0x7fff) << 15) | + (array[index + 1] & 0x7fff); + index += 2; + return len; + } +} + +void Edits::Iterator::updateIndexes() { + srcIndex += oldLength_; + if (changed) { + replIndex += newLength_; + } + destIndex += newLength_; +} + +UBool Edits::Iterator::noNext() { + // Empty span beyond the string. + oldLength_ = newLength_ = 0; + return FALSE; +} + UBool Edits::Iterator::next(UErrorCode &errorCode) { if (U_FAILURE(errorCode)) { return FALSE; } - // Always set all relevant public fields: Do not rely on them not having been touched. + // We have an errorCode in case we need to start guarding against integer overflows. + // It is also convenient for caller loops if we bail out when an error was set elsewhere. + updateIndexes(); if (remaining > 0) { // Fine-grained iterator: Continue a sequence of equal-length changes. - changed = TRUE; - oldLength = newLength = width; --remaining; return TRUE; } if (index >= length) { - return FALSE; + return noNext(); } int32_t u = array[index++]; if (u <= MAX_UNCHANGED) { // Combine adjacent unchanged ranges. changed = FALSE; - oldLength = u + 1; + oldLength_ = u + 1; while (index < length && (u = array[index]) <= MAX_UNCHANGED) { ++index; - if (u >= (INT32_MAX - oldLength)) { - errorCode = U_INDEX_OUTOFBOUNDS_ERROR; - return FALSE; + oldLength_ += u + 1; + } + newLength_ = oldLength_; + if (onlyChanges) { + updateIndexes(); + if (index >= length) { + return noNext(); } - oldLength += u + 1; + // already fetched u > MAX_UNCHANGED at index + ++index; + } else { + return TRUE; } - newLength = oldLength; - return TRUE; } changed = TRUE; if (u <= MAX_SHORT_CHANGE) { if (coarse) { int32_t w = u >> 12; int32_t len = (u & 0xfff) + 1; - oldLength = newLength = w * len; + oldLength_ = newLength_ = len * w; } else { // Split a sequence of equal-length changes that was compressed into one unit. - oldLength = newLength = width = u >> 12; + oldLength_ = newLength_ = u >> 12; remaining = u & 0xfff; return TRUE; } } else { U_ASSERT(u <= 0x7fff); - oldLength = readLength((u >> 6) & 0x3f); - newLength = readLength(u & 0x3f); + oldLength_ = readLength((u >> 6) & 0x3f); + newLength_ = readLength(u & 0x3f); if (!coarse) { return TRUE; } @@ -281,45 +325,53 @@ UBool Edits::Iterator::next(UErrorCode &errorCode) { if (u <= MAX_SHORT_CHANGE) { int32_t w = u >> 12; int32_t len = (u & 0xfff) + 1; - len = w * len; - if (len > (INT32_MAX - oldLength) || len > (INT32_MAX - newLength)) { - errorCode = U_INDEX_OUTOFBOUNDS_ERROR; - return FALSE; - } - oldLength += len; - newLength += len; + len = len * w; + oldLength_ += len; + newLength_ += len; } else { U_ASSERT(u <= 0x7fff); int32_t oldLen = readLength((u >> 6) & 0x3f); int32_t newLen = readLength(u & 0x3f); - if (oldLen > (INT32_MAX - oldLength) || newLen > (INT32_MAX - newLength)) { - errorCode = U_INDEX_OUTOFBOUNDS_ERROR; - return FALSE; - } - oldLength += oldLen; - newLength += newLen; + oldLength_ += oldLen; + newLength_ += newLen; } } return TRUE; } -int32_t Edits::Iterator::readLength(int32_t head) { - if (head < LENGTH_IN_1TRAIL) { - return head; - } else if (head < LENGTH_IN_2TRAIL) { - U_ASSERT(index < length); - U_ASSERT(array[index] >= 0x8000); - return array[index++]; - } else { - U_ASSERT((index + 2) <= length); - U_ASSERT(array[index] >= 0x8000); - U_ASSERT(array[index + 1] >= 0x8000); - int32_t len = ((head & 1) << 30) | - ((int32_t)(array[index] & 0x7fff) << 15) | - (array[index + 1] & 0x7fff); - index += 2; - return len; +UBool Edits::Iterator::findSourceIndex(int32_t i, UErrorCode &errorCode) { + if (U_FAILURE(errorCode) || i < 0) { return FALSE; } + if (i < srcIndex) { + // Reset the iterator to the start. + index = remaining = srcIndex = replIndex = destIndex = 0; + } else if (i < (srcIndex + oldLength_)) { + // The index is in the current span. + return TRUE; } + while (next(errorCode)) { + if (i < (srcIndex + oldLength_)) { + // The index is in the current span. + return TRUE; + } + if (remaining > 0) { + // Is the index in one of the remaining compressed edits? + // srcIndex is the start of the current span, before the remaining ones. + int32_t len = (remaining + 1) * oldLength_; + if (i < (srcIndex + len)) { + int32_t n = (i - srcIndex) / oldLength_; // 1 <= n <= remaining + len = n * oldLength_; + srcIndex += len; + replIndex += len; + destIndex += len; + remaining -= n; + return TRUE; + } + // Make next() skip all of these edits at once. + oldLength_ = newLength_ = len; + remaining = 0; + } + } + return FALSE; } U_NAMESPACE_END @@ -1360,6 +1412,45 @@ ustrcase_map(const UCaseMap *csm, UStringCaseMapper *stringCaseMapper, icu::Edits *edits, UErrorCode *pErrorCode) { + int32_t destLength; + + /* check argument values */ + if(U_FAILURE(*pErrorCode)) { + return 0; + } + if( destCapacity<0 || + (dest==NULL && destCapacity>0) || + src==NULL || + srcLength<-1 + ) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + /* get the string length */ + if(srcLength==-1) { + srcLength=u_strlen(src); + } + + /* check for overlapping source and destination */ + if( dest!=NULL && + ((src>=dest && src<(dest+destCapacity)) || + (dest>=src && dest<(src+srcLength))) + ) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + destLength=stringCaseMapper(csm, dest, destCapacity, src, srcLength, edits, pErrorCode); + return u_terminateUChars(dest, destCapacity, destLength, pErrorCode); +} + +U_CFUNC int32_t +ustrcase_mapWithOverlap(const UCaseMap *csm, + UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + UStringCaseMapper *stringCaseMapper, + UErrorCode *pErrorCode) { UChar buffer[300]; UChar *temp; @@ -1404,7 +1495,7 @@ ustrcase_map(const UCaseMap *csm, temp=dest; } - destLength=stringCaseMapper(csm, temp, destCapacity, src, srcLength, edits, pErrorCode); + destLength=stringCaseMapper(csm, temp, destCapacity, src, srcLength, NULL, pErrorCode); if(temp!=dest) { /* copy the result string to the destination buffer */ if (U_SUCCESS(*pErrorCode) && 0 < destLength && destLength <= destCapacity) { @@ -1428,11 +1519,37 @@ u_strFoldCase(UChar *dest, int32_t destCapacity, UCaseMap csm=UCASEMAP_INITIALIZER; csm.csp=ucase_getSingleton(); csm.options=options; - return ustrcase_map( + return ustrcase_mapWithOverlap( &csm, dest, destCapacity, src, srcLength, - ustrcase_internalFold, NULL, pErrorCode); + ustrcase_internalFold, pErrorCode); +} + +U_CAPI int32_t U_EXPORT2 +ucasemap_toLowerWithEdits(const UCaseMap *csm, + UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + icu::Edits *edits, + UErrorCode *pErrorCode) { + return ustrcase_map( + csm, + dest, destCapacity, + src, srcLength, + ustrcase_internalToLower, edits, pErrorCode); +} + +U_CAPI int32_t U_EXPORT2 +ucasemap_toUpperWithEdits(const UCaseMap *csm, + UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + icu::Edits *edits, + UErrorCode *pErrorCode) { + return ustrcase_map( + csm, + dest, destCapacity, + src, srcLength, + ustrcase_internalToUpper, edits, pErrorCode); } /* case-insensitive string comparisons -------------------------------------- */ diff --git a/icu4c/source/common/ustrcase_locale.cpp b/icu4c/source/common/ustrcase_locale.cpp index 0550d108308..20749e1d66e 100644 --- a/icu4c/source/common/ustrcase_locale.cpp +++ b/icu4c/source/common/ustrcase_locale.cpp @@ -90,11 +90,11 @@ u_strToLower(UChar *dest, int32_t destCapacity, UErrorCode *pErrorCode) { UCaseMap csm=UCASEMAP_INITIALIZER; setTempCaseMap(&csm, locale); - return ustrcase_map( + return ustrcase_mapWithOverlap( &csm, dest, destCapacity, src, srcLength, - ustrcase_internalToLower, NULL, pErrorCode); + ustrcase_internalToLower, pErrorCode); } U_CAPI int32_t U_EXPORT2 @@ -104,9 +104,9 @@ u_strToUpper(UChar *dest, int32_t destCapacity, UErrorCode *pErrorCode) { UCaseMap csm=UCASEMAP_INITIALIZER; setTempCaseMap(&csm, locale); - return ustrcase_map( + return ustrcase_mapWithOverlap( &csm, dest, destCapacity, src, srcLength, - ustrcase_internalToUpper, NULL, pErrorCode); + ustrcase_internalToUpper, pErrorCode); } -- 2.40.0