From: Markus Scherer Date: Mon, 18 Sep 2017 21:45:11 +0000 (+0000) Subject: ICU-13203 CaseMap UTF-8 add StringPiece->ByteSink overloads; change implementation... X-Git-Tag: release-60-rc~136 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=68ef77118b8031091e9f0ef8b6ecef4e81f19055;p=icu ICU-13203 CaseMap UTF-8 add StringPiece->ByteSink overloads; change implementation to that and change array->array versions into wrappers X-SVN-Rev: 40425 --- diff --git a/icu4c/source/common/Makefile.in b/icu4c/source/common/Makefile.in index 10fa8de38eb..bc91704e5d7 100644 --- a/icu4c/source/common/Makefile.in +++ b/icu4c/source/common/Makefile.in @@ -89,7 +89,7 @@ ucnv_ext.o ucnvmbcs.o ucnv2022.o ucnvhz.o ucnv_lmb.o ucnvisci.o ucnvdisp.o ucnv_ resource.o uresbund.o ures_cnv.o uresdata.o resbund.o resbund_cnv.o \ ucurr.o \ messagepattern.o ucat.o locmap.o uloc.o locid.o locutil.o locavailable.o locdispnames.o locdspnm.o loclikely.o locresdata.o \ -bytestream.o stringpiece.o \ +bytestream.o stringpiece.o bytesinkutil.o \ stringtriebuilder.o bytestriebuilder.o \ bytestrie.o bytestrieiterator.o \ ucharstrie.o ucharstriebuilder.o ucharstrieiterator.o \ diff --git a/icu4c/source/common/bytesinkutil.cpp b/icu4c/source/common/bytesinkutil.cpp new file mode 100644 index 00000000000..bf1a2d45f8a --- /dev/null +++ b/icu4c/source/common/bytesinkutil.cpp @@ -0,0 +1,123 @@ +// © 2017 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +// bytesinkutil.cpp +// created: 2017sep14 Markus W. Scherer + +#include "unicode/utypes.h" +#include "unicode/bytestream.h" +#include "unicode/edits.h" +#include "unicode/stringoptions.h" +#include "unicode/utf8.h" +#include "unicode/utf16.h" +#include "bytesinkutil.h" +#include "cmemory.h" +#include "uassert.h" + +U_NAMESPACE_BEGIN + +UBool +ByteSinkUtil::appendChange(int32_t length, const char16_t *s16, int32_t s16Length, + ByteSink &sink, Edits *edits, UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { return FALSE; } + char scratch[200]; + int32_t s8Length = 0; + for (int32_t i = 0; i < s16Length;) { + int32_t capacity; + int32_t desiredCapacity = s16Length - i; + if (desiredCapacity < (INT32_MAX / 3)) { + desiredCapacity *= 3; // max 3 UTF-8 bytes per UTF-16 code unit + } else if (desiredCapacity < (INT32_MAX / 2)) { + desiredCapacity *= 2; + } else { + desiredCapacity = INT32_MAX; + } + char *buffer = sink.GetAppendBuffer(U8_MAX_LENGTH, desiredCapacity, + scratch, UPRV_LENGTHOF(scratch), &capacity); + capacity -= U8_MAX_LENGTH - 1; + int32_t j = 0; + for (; i < s16Length && j < capacity;) { + UChar32 c; + U16_NEXT_UNSAFE(s16, i, c); + U8_APPEND_UNSAFE(buffer, j, c); + } + if (j > (INT32_MAX - s8Length)) { + errorCode = U_INDEX_OUTOFBOUNDS_ERROR; + return FALSE; + } + sink.Append(buffer, j); + s8Length += j; + } + if (edits != nullptr) { + edits->addReplace(length, s8Length); + } + return TRUE; +} + +UBool +ByteSinkUtil::appendChange(const uint8_t *s, const uint8_t *limit, + const char16_t *s16, int32_t s16Length, + ByteSink &sink, Edits *edits, UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { return FALSE; } + if ((limit - s) > INT32_MAX) { + errorCode = U_INDEX_OUTOFBOUNDS_ERROR; + return FALSE; + } + return appendChange((int32_t)(limit - s), s16, s16Length, sink, edits, errorCode); +} + +void +ByteSinkUtil::appendCodePoint(int32_t length, UChar32 c, ByteSink &sink, Edits *edits) { + char s8[U8_MAX_LENGTH]; + int32_t s8Length = 0; + U8_APPEND_UNSAFE(s8, s8Length, c); + if (edits != nullptr) { + edits->addReplace(length, s8Length); + } + sink.Append(s8, s8Length); +} + +namespace { + +// See unicode/utf8.h U8_APPEND_UNSAFE(). +inline uint8_t getTwoByteLead(UChar32 c) { return (uint8_t)((c >> 6) | 0xc0); } +inline uint8_t getTwoByteTrail(UChar32 c) { return (uint8_t)((c & 0x3f) | 0x80); } + +} // namespace + +void +ByteSinkUtil::appendTwoBytes(UChar32 c, ByteSink &sink) { + U_ASSERT(0x80 <= c && c <= 0x7ff); // 2-byte UTF-8 + char s8[2] = { (char)getTwoByteLead(c), (char)getTwoByteTrail(c) }; + sink.Append(s8, 2); +} + +UBool +ByteSinkUtil::appendUnchanged(const uint8_t *s, int32_t length, + ByteSink &sink, uint32_t options, Edits *edits, + UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { return FALSE; } + if (length > 0) { + if (edits != nullptr) { + edits->addUnchanged(length); + } + if ((options & U_OMIT_UNCHANGED_TEXT) == 0) { + sink.Append(reinterpret_cast(s), length); + } + } + return TRUE; +} + +UBool +ByteSinkUtil::appendUnchanged(const uint8_t *s, const uint8_t *limit, + ByteSink &sink, uint32_t options, Edits *edits, + UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { return FALSE; } + if ((limit - s) > INT32_MAX) { + errorCode = U_INDEX_OUTOFBOUNDS_ERROR; + return FALSE; + } + return appendUnchanged(s, (int32_t)(limit - s), sink, options, edits, errorCode); +} + +U_NAMESPACE_END diff --git a/icu4c/source/common/bytesinkutil.h b/icu4c/source/common/bytesinkutil.h new file mode 100644 index 00000000000..004b49c4ce6 --- /dev/null +++ b/icu4c/source/common/bytesinkutil.h @@ -0,0 +1,53 @@ +// © 2017 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +// bytesinkutil.h +// created: 2017sep14 Markus W. Scherer + +#include "unicode/utypes.h" +#include "unicode/bytestream.h" +#include "unicode/edits.h" +#include "cmemory.h" +#include "uassert.h" + +U_NAMESPACE_BEGIN + +class ByteSink; +class Edits; + +class U_COMMON_API ByteSinkUtil { +public: + ByteSinkUtil() = delete; // all static + + /** (length) bytes were mapped to valid (s16, s16Length). */ + static UBool appendChange(int32_t length, + const char16_t *s16, int32_t s16Length, + ByteSink &sink, Edits *edits, UErrorCode &errorCode); + + /** The bytes at [s, limit[ were mapped to valid (s16, s16Length). */ + static UBool appendChange(const uint8_t *s, const uint8_t *limit, + const char16_t *s16, int32_t s16Length, + ByteSink &sink, Edits *edits, UErrorCode &errorCode); + + /** (length) bytes were mapped/changed to valid code point c. */ + static void appendCodePoint(int32_t length, UChar32 c, ByteSink &sink, Edits *edits = nullptr); + + /** The few bytes at [src, nextSrc[ were mapped/changed to valid code point c. */ + static inline void appendCodePoint(const uint8_t *src, const uint8_t *nextSrc, UChar32 c, + ByteSink &sink, Edits *edits = nullptr) { + appendCodePoint((int32_t)(nextSrc - src), c, sink, edits); + } + + /** Append the two-byte character (U+0080..U+07FF). */ + static void appendTwoBytes(UChar32 c, ByteSink &sink); + + static UBool appendUnchanged(const uint8_t *s, int32_t length, + ByteSink &sink, uint32_t options, Edits *edits, + UErrorCode &errorCode); + + static UBool appendUnchanged(const uint8_t *s, const uint8_t *limit, + ByteSink &sink, uint32_t options, Edits *edits, + UErrorCode &errorCode); +}; + +U_NAMESPACE_END diff --git a/icu4c/source/common/bytestream.cpp b/icu4c/source/common/bytestream.cpp index bfd7bded714..0d0e4dda39b 100644 --- a/icu4c/source/common/bytestream.cpp +++ b/icu4c/source/common/bytestream.cpp @@ -45,6 +45,12 @@ void CheckedArrayByteSink::Append(const char* bytes, int32_t n) { if (n <= 0) { return; } + if (n > (INT32_MAX - appended_)) { + // TODO: Report as integer overflow, not merely buffer overflow. + appended_ = INT32_MAX; + overflowed_ = TRUE; + return; + } appended_ += n; int32_t available = capacity_ - size_; if (n > available) { diff --git a/icu4c/source/common/common.vcxproj b/icu4c/source/common/common.vcxproj index 055e849cd43..3a36fcc563d 100644 --- a/icu4c/source/common/common.vcxproj +++ b/icu4c/source/common/common.vcxproj @@ -445,6 +445,7 @@ + @@ -1478,6 +1479,7 @@ ..\..\include\unicode\%(Filename)%(Extension);%(Outputs) + copy "%(FullPath)" ..\..\include\unicode diff --git a/icu4c/source/common/common.vcxproj.filters b/icu4c/source/common/common.vcxproj.filters index 0542a8b1384..1bdeced6798 100644 --- a/icu4c/source/common/common.vcxproj.filters +++ b/icu4c/source/common/common.vcxproj.filters @@ -460,6 +460,9 @@ sprep + + strings + strings @@ -861,6 +864,9 @@ sprep + + strings + strings diff --git a/icu4c/source/common/common_uwp.vcxproj b/icu4c/source/common/common_uwp.vcxproj index d3d9f8ae888..5cb402e1c2b 100644 --- a/icu4c/source/common/common_uwp.vcxproj +++ b/icu4c/source/common/common_uwp.vcxproj @@ -452,6 +452,7 @@ + @@ -894,6 +895,7 @@ ..\..\include\unicode\%(Filename)%(Extension);%(Outputs) + copy "%(FullPath)" ..\..\include\unicode diff --git a/icu4c/source/common/normalizer2impl.cpp b/icu4c/source/common/normalizer2impl.cpp index 1249aede85e..15b4a528934 100644 --- a/icu4c/source/common/normalizer2impl.cpp +++ b/icu4c/source/common/normalizer2impl.cpp @@ -28,6 +28,7 @@ #include "unicode/ustring.h" #include "unicode/utf16.h" #include "unicode/utf8.h" +#include "bytesinkutil.h" #include "cmemory.h" #include "mutex.h" #include "normalizer2impl.h" @@ -129,60 +130,6 @@ int32_t getJamoTMinusBase(const uint8_t *src, const uint8_t *limit) { return -1; } -/** The bytes at [src, nextSrc[ were mapped to valid (s16, s16Length). */ -UBool -appendChange(const uint8_t *src, const uint8_t *nextSrc, - const char16_t *s16, int32_t s16Length, - ByteSink &sink, Edits *edits, UErrorCode &errorCode) { - U_ASSERT(U_SUCCESS(errorCode)); - U_ASSERT((nextSrc - src) <= INT32_MAX); // ensured by caller - char scratch[200]; - int32_t s8Length = 0; - for (int32_t i = 0; i < s16Length;) { - int32_t capacity; - int32_t desiredCapacity = s16Length - i; - if (desiredCapacity < (INT32_MAX / 3)) { - desiredCapacity *= 3; // max 3 UTF-8 bytes per UTF-16 code unit - } else if (desiredCapacity < (INT32_MAX / 2)) { - desiredCapacity *= 2; - } else { - desiredCapacity = INT32_MAX; - } - char *buffer = sink.GetAppendBuffer(U8_MAX_LENGTH, desiredCapacity, - scratch, UPRV_LENGTHOF(scratch), &capacity); - capacity -= U8_MAX_LENGTH - 1; - int32_t j = 0; - for (; i < s16Length && j < capacity;) { - UChar32 c; - U16_NEXT_UNSAFE(s16, i, c); - U8_APPEND_UNSAFE(buffer, j, c); - } - if (j > (INT32_MAX - s8Length)) { - errorCode = U_INDEX_OUTOFBOUNDS_ERROR; - return FALSE; - } - sink.Append(buffer, j); - s8Length += j; - } - if (edits != nullptr) { - edits->addReplace((int32_t)(nextSrc - src), s8Length); - } - return TRUE; -} - -/** The few bytes at [src, nextSrc[ were mapped to valid code point c. */ -void -appendCodePoint(const uint8_t *src, const uint8_t *nextSrc, UChar32 c, - ByteSink &sink, Edits *edits) { - char buffer[U8_MAX_LENGTH]; - int32_t length = 0; - U8_APPEND_UNSAFE(buffer, length, c); - if (edits != nullptr) { - edits->addReplace((int32_t)(nextSrc - src), length); - } - sink.Append(buffer, length); -} - void appendCodePointDelta(const uint8_t *cpStart, const uint8_t *cpLimit, int32_t delta, ByteSink &sink, Edits *edits) { @@ -214,27 +161,6 @@ appendCodePointDelta(const uint8_t *cpStart, const uint8_t *cpLimit, int32_t del sink.Append(buffer, length); } -UBool -appendUnchanged(const uint8_t *s, const uint8_t *limit, - ByteSink &sink, uint32_t options, Edits *edits, - UErrorCode &errorCode) { - U_ASSERT(U_SUCCESS(errorCode)); - if ((limit - s) > INT32_MAX) { - errorCode = U_INDEX_OUTOFBOUNDS_ERROR; - return FALSE; - } - int32_t length = (int32_t)(limit - s); - if (length > 0) { - if (edits != nullptr) { - edits->addUnchanged(length); - } - if ((options & U_OMIT_UNCHANGED_TEXT) ==0) { - sink.Append(reinterpret_cast(s), length); - } - } - return TRUE; -} - } // namespace // ReorderingBuffer -------------------------------------------------------- *** @@ -1851,7 +1777,8 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous, for (;;) { if (src == limit) { if (prevBoundary != limit && sink != nullptr) { - appendUnchanged(prevBoundary, limit, *sink, options, edits, errorCode); + ByteSinkUtil::appendUnchanged(prevBoundary, limit, + *sink, options, edits, errorCode); } return TRUE; } @@ -1884,7 +1811,8 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous, if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || hasCompBoundaryBefore(src, limit)) { if (prevBoundary != prevSrc && - !appendUnchanged(prevBoundary, prevSrc, *sink, options, edits, errorCode)) { + !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc, + *sink, options, edits, errorCode)) { break; } appendCodePointDelta(prevSrc, src, getAlgorithmicDelta(norm16), *sink, edits); @@ -1896,13 +1824,14 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous, if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || hasCompBoundaryBefore(src, limit)) { if (prevBoundary != prevSrc && - !appendUnchanged(prevBoundary, prevSrc, *sink, options, edits, errorCode)) { + !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc, + *sink, options, edits, errorCode)) { break; } const uint16_t *mapping = getMapping(norm16); int32_t length = *mapping++ & MAPPING_LENGTH_MASK; - if (!appendChange(prevSrc, src, (const UChar *)mapping, length, - *sink, edits, errorCode)) { + if (!ByteSinkUtil::appendChange(prevSrc, src, (const UChar *)mapping, length, + *sink, edits, errorCode)) { break; } prevBoundary = src; @@ -1915,7 +1844,8 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous, if (hasCompBoundaryBefore(src, limit) || hasCompBoundaryAfter(prevBoundary, prevSrc, onlyContiguous)) { if (prevBoundary != prevSrc && - !appendUnchanged(prevBoundary, prevSrc, *sink, options, edits, errorCode)) { + !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc, + *sink, options, edits, errorCode)) { break; } if (edits != nullptr) { @@ -1955,10 +1885,11 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous, Hangul::JAMO_T_COUNT + t; prevSrc -= 3; // Replace the Jamo L as well. if (prevBoundary != prevSrc && - !appendUnchanged(prevBoundary, prevSrc, *sink, options, edits, errorCode)) { + !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc, + *sink, options, edits, errorCode)) { break; } - appendCodePoint(prevSrc, src, syllable, *sink, edits); + ByteSinkUtil::appendCodePoint(prevSrc, src, syllable, *sink, edits); prevBoundary = src; continue; } @@ -1979,10 +1910,11 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous, UChar32 syllable = prev + getJamoTMinusBase(prevSrc, src); prevSrc -= 3; // Replace the Hangul LV as well. if (prevBoundary != prevSrc && - !appendUnchanged(prevBoundary, prevSrc, *sink, options, edits, errorCode)) { + !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc, + *sink, options, edits, errorCode)) { break; } - appendCodePoint(prevSrc, src, syllable, *sink, edits); + ByteSinkUtil::appendCodePoint(prevSrc, src, syllable, *sink, edits); prevBoundary = src; continue; } @@ -2006,7 +1938,8 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous, for (;;) { if (src == limit) { if (sink != nullptr) { - appendUnchanged(prevBoundary, limit, *sink, options, edits, errorCode); + ByteSinkUtil::appendUnchanged(prevBoundary, limit, + *sink, options, edits, errorCode); } return TRUE; } @@ -2070,11 +2003,12 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous, return FALSE; } if (prevBoundary != prevSrc && - !appendUnchanged(prevBoundary, prevSrc, *sink, options, edits, errorCode)) { + !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc, + *sink, options, edits, errorCode)) { break; } - if (!appendChange(prevSrc, src, buffer.getStart(), buffer.length(), - *sink, edits, errorCode)) { + if (!ByteSinkUtil::appendChange(prevSrc, src, buffer.getStart(), buffer.length(), + *sink, edits, errorCode)) { break; } prevBoundary = src; diff --git a/icu4c/source/common/ucasemap.cpp b/icu4c/source/common/ucasemap.cpp index 7ad4c315f8e..8eec93c6e3e 100644 --- a/icu4c/source/common/ucasemap.cpp +++ b/icu4c/source/common/ucasemap.cpp @@ -20,9 +20,11 @@ #include "unicode/utypes.h" #include "unicode/brkiter.h" +#include "unicode/bytestream.h" #include "unicode/casemap.h" #include "unicode/edits.h" #include "unicode/stringoptions.h" +#include "unicode/stringpiece.h" #include "unicode/ubrk.h" #include "unicode/uloc.h" #include "unicode/ustring.h" @@ -33,6 +35,7 @@ #include "unicode/utf.h" #include "unicode/utf8.h" #include "unicode/utf16.h" +#include "bytesinkutil.h" #include "cmemory.h" #include "cstring.h" #include "uassert.h" @@ -40,27 +43,6 @@ #include "ucasemap_imp.h" #include "ustr_imp.h" -U_NAMESPACE_BEGIN - -namespace { - -// TODO: share with UTF-16? inline in ucasemap_imp.h? -int32_t checkOverflowAndEditsError(int32_t destIndex, int32_t destCapacity, - Edits *edits, UErrorCode &errorCode) { - if (U_SUCCESS(errorCode)) { - if (destIndex > destCapacity) { - errorCode = U_BUFFER_OVERFLOW_ERROR; - } else if (edits != NULL) { - edits->copyErrorTo(errorCode); - } - } - return destIndex; -} - -} // namespace - -U_NAMESPACE_END - U_NAMESPACE_USE /* UCaseMap service object -------------------------------------------------- */ @@ -151,14 +133,13 @@ ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode) { /* TODO(markus): Move to a new, separate utf8case.cpp file. */ +namespace { + /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */ -static inline int32_t -appendResult(uint8_t *dest, int32_t destIndex, int32_t destCapacity, - int32_t result, const UChar *s, - int32_t cpLength, uint32_t options, icu::Edits *edits) { - UChar32 c; - int32_t length; - UErrorCode errorCode; +inline UBool +appendResult(int32_t cpLength, int32_t result, const UChar *s, + ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) { + U_ASSERT(U_SUCCESS(errorCode)); /* decode the result */ if(result<0) { @@ -166,137 +147,25 @@ appendResult(uint8_t *dest, int32_t destIndex, int32_t destCapacity, if(edits!=NULL) { edits->addUnchanged(cpLength); } - if(options & U_OMIT_UNCHANGED_TEXT) { - return destIndex; - } - c=~result; - if(destIndex(INT32_MAX-destIndex)) { - return -1; // integer overflow - } - } - if(edits!=NULL) { - edits->addReplace(cpLength, length); - } - // We might have an overflow, but we know the actual length. - return destIndex+length; - } else if(destIndexaddReplace(cpLength, 1); - } - return destIndex; + return ByteSinkUtil::appendChange(cpLength, s, result, sink, edits, errorCode); } else { - c=result; - length=U8_LENGTH(c); - if(edits!=NULL) { - edits->addReplace(cpLength, length); - } - } - } - // c>=0 single code point - if(length>(INT32_MAX-destIndex)) { - return -1; // integer overflow - } - - if(destIndex> 6) | 0xc0); } -static inline uint8_t getTwoByteTrail(UChar32 c) { return (uint8_t)((c & 0x3f) | 0x80); } - -static inline int32_t -appendTwoBytes(uint8_t *dest, int32_t destIndex, int32_t destCapacity, UChar32 c) { - U_ASSERT(0x370 <= c && c <= 0x3ff); // 2-byte UTF-8, main Greek block - if(2>(INT32_MAX-destIndex)) { - return -1; // integer overflow - } - int32_t limit=destIndex+2; - if(limit<=destCapacity) { - dest+=destIndex; - dest[0]=getTwoByteLead(c); - dest[1]=getTwoByteTrail(c); - } - return limit; -} +inline uint8_t getTwoByteLead(UChar32 c) { return (uint8_t)((c >> 6) | 0xc0); } +inline uint8_t getTwoByteTrail(UChar32 c) { return (uint8_t)((c & 0x3f) | 0x80); } -static inline int32_t -appendTwoBytes(uint8_t *dest, int32_t destIndex, int32_t destCapacity, const char *s) { - if(2>(INT32_MAX-destIndex)) { - return -1; // integer overflow - } - int32_t limit=destIndex+2; - if(limit<=destCapacity) { - dest+=destIndex; - dest[0]=(uint8_t)s[0]; - dest[1]=(uint8_t)s[1]; - } - return limit; -} - -static inline int32_t -appendUnchanged(uint8_t *dest, int32_t destIndex, int32_t destCapacity, - const uint8_t *s, int32_t length, uint32_t options, icu::Edits *edits) { - if(length>0) { - if(edits!=NULL) { - edits->addUnchanged(length); - } - if(options & U_OMIT_UNCHANGED_TEXT) { - return destIndex; - } - if(length>(INT32_MAX-destIndex)) { - return -1; // integer overflow - } - if((destIndex+length)<=destCapacity) { - uprv_memcpy(dest+destIndex, s, length); - } - destIndex+=length; - } - return destIndex; -} +} // namespace static UChar32 U_CALLCONV utf8_caseContextIterator(void *context, int8_t dir) { @@ -334,17 +203,15 @@ utf8_caseContextIterator(void *context, int8_t dir) { * Case-maps [srcStart..srcLimit[ but takes * context [0..srcLength[ into account. */ -static int32_t +static void _caseMap(int32_t caseLocale, uint32_t options, UCaseMapFull *map, - uint8_t *dest, int32_t destCapacity, const uint8_t *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit, - icu::Edits *edits, + icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) { /* case mapping loop */ int32_t srcIndex=srcStart; - int32_t destIndex=0; - while(srcIndexcpStart=cpStart=srcIndex; UChar32 c; @@ -352,45 +219,32 @@ _caseMap(int32_t caseLocale, uint32_t options, UCaseMapFull *map, csc->cpLimit=srcIndex; if(c<0) { // Malformed UTF-8. - destIndex=appendUnchanged(dest, destIndex, destCapacity, - src+cpStart, srcIndex-cpStart, options, edits); - if(destIndex<0) { - errorCode=U_INDEX_OUTOFBOUNDS_ERROR; - return 0; - } - continue; - } - const UChar *s; - c=map(c, utf8_caseContextIterator, csc, &s, caseLocale); - destIndex = appendResult(dest, destIndex, destCapacity, c, s, - srcIndex - cpStart, options, edits); - if (destIndex < 0) { - errorCode = U_INDEX_OUTOFBOUNDS_ERROR; - return 0; + ByteSinkUtil::appendUnchanged(src+cpStart, srcIndex-cpStart, + sink, options, edits, errorCode); + } else { + const UChar *s; + c=map(c, utf8_caseContextIterator, csc, &s, caseLocale); + appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode); } } - - return destIndex; } #if !UCONFIG_NO_BREAK_ITERATION -U_CFUNC int32_t U_CALLCONV +U_CFUNC void U_CALLCONV ucasemap_internalUTF8ToTitle( int32_t caseLocale, uint32_t options, BreakIterator *iter, - uint8_t *dest, int32_t destCapacity, const uint8_t *src, int32_t srcLength, - icu::Edits *edits, + ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) { if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) { - return 0; + return; } /* set up local variables */ UCaseContext csc=UCASECONTEXT_INITIALIZER; csc.p=(void *)src; csc.limit=srcLength; - int32_t destIndex=0; int32_t prev=0; UBool isFirstIndex=TRUE; @@ -435,11 +289,9 @@ ucasemap_internalUTF8ToTitle( U8_NEXT(src, titleLimit, index, c); } if (prev < titleStart) { - destIndex=appendUnchanged(dest, destIndex, destCapacity, - src+prev, titleStart-prev, options, edits); - if(destIndex<0) { - errorCode=U_INDEX_OUTOFBOUNDS_ERROR; - return 0; + if (!ByteSinkUtil::appendUnchanged(src+prev, titleStart-prev, + sink, options, edits, errorCode)) { + return; } } } @@ -451,16 +303,15 @@ ucasemap_internalUTF8ToTitle( csc.cpLimit=titleLimit; const UChar *s; c=ucase_toFullTitle(c, utf8_caseContextIterator, &csc, &s, caseLocale); - destIndex=appendResult(dest, destIndex, destCapacity, c, s, - titleLimit-titleStart, options, edits); + if (!appendResult(titleLimit-titleStart, c, s, sink, options, edits, errorCode)) { + return; + } } else { // Malformed UTF-8. - destIndex=appendUnchanged(dest, destIndex, destCapacity, - src+titleStart, titleLimit-titleStart, options, edits); - } - if(destIndex<0) { - errorCode=U_INDEX_OUTOFBOUNDS_ERROR; - return 0; + if (!ByteSinkUtil::appendUnchanged(src+titleStart, titleLimit-titleStart, + sink, options, edits, errorCode)) { + return; + } } /* Special case Dutch IJ titlecasing */ @@ -468,22 +319,13 @@ ucasemap_internalUTF8ToTitle( caseLocale == UCASE_LOC_DUTCH && (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) { if (src[titleStart+1] == 0x006A) { - destIndex=appendASCII(dest, destIndex, destCapacity, 0x004A); - if(destIndex<0) { - errorCode=U_INDEX_OUTOFBOUNDS_ERROR; - return 0; - } - if(edits!=NULL) { - edits->addReplace(1, 1); - } + ByteSinkUtil::appendCodePoint(1, 0x004A, sink, edits); titleLimit++; } else if (src[titleStart+1] == 0x004A) { // Keep the capital J from getting lowercased. - destIndex=appendUnchanged(dest, destIndex, destCapacity, - src+titleStart+1, 1, options, edits); - if(destIndex<0) { - errorCode=U_INDEX_OUTOFBOUNDS_ERROR; - return 0; + if (!ByteSinkUtil::appendUnchanged(src+titleStart+1, 1, + sink, options, edits, errorCode)) { + return; } titleLimit++; } @@ -493,26 +335,18 @@ ucasemap_internalUTF8ToTitle( if(titleLimit= 0 && (data & HAS_EITHER_DIALYTIKA) != 0) { - destIndex=appendTwoBytes(dest, destIndex, destCapacity, u8"\u0308"); // restore or add a dialytika + ByteSinkUtil::appendTwoBytes(upper, sink); + if ((data & HAS_EITHER_DIALYTIKA) != 0) { + sink.Append(u8"\u0308", 2); // restore or add a dialytika } - if (destIndex >= 0 && addTonos) { - destIndex=appendTwoBytes(dest, destIndex, destCapacity, u8"\u0301"); + if (addTonos) { + sink.Append(u8"\u0301", 2); } - while (destIndex >= 0 && numYpogegrammeni > 0) { - destIndex=appendTwoBytes(dest, destIndex, destCapacity, u8"\u0399"); + while (numYpogegrammeni > 0) { + sink.Append(u8"\u0399", 2); --numYpogegrammeni; } - if(destIndex<0) { - errorCode=U_INDEX_OUTOFBOUNDS_ERROR; - return 0; - } } } else if(c>=0) { const UChar *s; c=ucase_toFullUpper(c, NULL, NULL, &s, UCASE_LOC_GREEK); - destIndex = appendResult(dest, destIndex, destCapacity, c, s, - nextIndex - i, options, edits); - if (destIndex < 0) { - errorCode = U_INDEX_OUTOFBOUNDS_ERROR; - return 0; + if (!appendResult(nextIndex - i, c, s, sink, options, edits, errorCode)) { + return; } } else { // Malformed UTF-8. - destIndex=appendUnchanged(dest, destIndex, destCapacity, - src+i, nextIndex-i, options, edits); - if(destIndex<0) { - errorCode=U_INDEX_OUTOFBOUNDS_ERROR; - return 0; + if (!ByteSinkUtil::appendUnchanged(src+i, nextIndex-i, + sink, options, edits, errorCode)) { + return; } } i = nextIndex; state = nextState; } - - return destIndex; } } // namespace GreekUpper U_NAMESPACE_END -static int32_t U_CALLCONV +static void U_CALLCONV ucasemap_internalUTF8ToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED - uint8_t *dest, int32_t destCapacity, const uint8_t *src, int32_t srcLength, - icu::Edits *edits, + icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) { UCaseContext csc=UCASECONTEXT_INITIALIZER; csc.p=(void *)src; csc.limit=srcLength; - int32_t destIndex = _caseMap( + _caseMap( caseLocale, options, ucase_toFullLower, - dest, destCapacity, src, &csc, 0, srcLength, - edits, errorCode); - return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode); + sink, edits, errorCode); } -static int32_t U_CALLCONV +static void U_CALLCONV ucasemap_internalUTF8ToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED - uint8_t *dest, int32_t destCapacity, const uint8_t *src, int32_t srcLength, - icu::Edits *edits, + icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) { - int32_t destIndex; if (caseLocale == UCASE_LOC_GREEK) { - destIndex = GreekUpper::toUpper(options, dest, destCapacity, - src, srcLength, edits, errorCode); + GreekUpper::toUpper(options, src, srcLength, sink, edits, errorCode); } else { UCaseContext csc=UCASECONTEXT_INITIALIZER; csc.p=(void *)src; csc.limit=srcLength; - destIndex = _caseMap( + _caseMap( caseLocale, options, ucase_toFullUpper, - dest, destCapacity, src, &csc, 0, srcLength, - edits, errorCode); + sink, edits, errorCode); } - return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode); } -static int32_t U_CALLCONV +static void U_CALLCONV ucasemap_internalUTF8Fold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED - uint8_t *dest, int32_t destCapacity, const uint8_t *src, int32_t srcLength, - icu::Edits *edits, + icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) { /* case mapping loop */ int32_t srcIndex = 0; - int32_t destIndex = 0; - while (srcIndex < srcLength) { + while (U_SUCCESS(errorCode) && srcIndex < srcLength) { int32_t cpStart = srcIndex; UChar32 c; U8_NEXT(src, srcIndex, srcLength, c); if(c<0) { // Malformed UTF-8. - destIndex=appendUnchanged(dest, destIndex, destCapacity, - src+cpStart, srcIndex-cpStart, options, edits); - if(destIndex<0) { - errorCode=U_INDEX_OUTOFBOUNDS_ERROR; - return 0; - } - continue; - } - const UChar *s; - c = ucase_toFullFolding(c, &s, options); - destIndex = appendResult(dest, destIndex, destCapacity, c, s, - srcIndex - cpStart, options, edits); - if (destIndex < 0) { - errorCode = U_INDEX_OUTOFBOUNDS_ERROR; - return 0; + ByteSinkUtil::appendUnchanged(src+cpStart, srcIndex-cpStart, + sink, options, edits, errorCode); + } else { + const UChar *s; + c = ucase_toFullFolding(c, &s, options); + appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode); } } +} + +void +ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM + const char *src, int32_t srcLength, + UTF8CaseMapper *stringCaseMapper, + icu::ByteSink &sink, icu::Edits *edits, + UErrorCode &errorCode) { + /* check argument values */ + if (U_FAILURE(errorCode)) { + return; + } + if ((src == nullptr && srcLength != 0) || srcLength < -1) { + errorCode = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + + // Get the string length. + if (srcLength == -1) { + srcLength = (int32_t)uprv_strlen((const char *)src); + } - return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode); + if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) { + edits->reset(); + } + stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR + (const uint8_t *)src, srcLength, sink, edits, errorCode); + sink.Flush(); + if (U_SUCCESS(errorCode)) { + if (edits != nullptr) { + edits->copyErrorTo(errorCode); + } + } } -U_CFUNC int32_t +int32_t ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM - uint8_t *dest, int32_t destCapacity, - const uint8_t *src, int32_t srcLength, + char *dest, int32_t destCapacity, + const char *src, int32_t srcLength, UTF8CaseMapper *stringCaseMapper, icu::Edits *edits, UErrorCode &errorCode) { - int32_t destLength; - /* check argument values */ if(U_FAILURE(errorCode)) { return 0; } if( destCapacity<0 || (dest==NULL && destCapacity>0) || - src==NULL || - srcLength<-1 + (src==NULL && srcLength!=0) || srcLength<-1 ) { errorCode=U_ILLEGAL_ARGUMENT_ERROR; return 0; @@ -823,12 +651,21 @@ ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_P return 0; } + CheckedArrayByteSink sink(dest, destCapacity); if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) { edits->reset(); } - destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR - dest, destCapacity, src, srcLength, edits, errorCode); - return u_terminateChars((char *)dest, destCapacity, destLength, &errorCode); + stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR + (const uint8_t *)src, srcLength, sink, edits, errorCode); + sink.Flush(); + if (U_SUCCESS(errorCode)) { + if (sink.Overflowed()) { + errorCode = U_BUFFER_OVERFLOW_ERROR; + } else if (edits != nullptr) { + edits->copyErrorTo(errorCode); + } + } + return u_terminateChars(dest, destCapacity, sink.NumberOfBytesAppended(), &errorCode); } /* public API functions */ @@ -840,8 +677,8 @@ ucasemap_utf8ToLower(const UCaseMap *csm, UErrorCode *pErrorCode) { return ucasemap_mapUTF8( csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL - (uint8_t *)dest, destCapacity, - (const uint8_t *)src, srcLength, + dest, destCapacity, + src, srcLength, ucasemap_internalUTF8ToLower, NULL, *pErrorCode); } @@ -852,8 +689,8 @@ ucasemap_utf8ToUpper(const UCaseMap *csm, UErrorCode *pErrorCode) { return ucasemap_mapUTF8( csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL - (uint8_t *)dest, destCapacity, - (const uint8_t *)src, srcLength, + dest, destCapacity, + src, srcLength, ucasemap_internalUTF8ToUpper, NULL, *pErrorCode); } @@ -864,13 +701,43 @@ ucasemap_utf8FoldCase(const UCaseMap *csm, UErrorCode *pErrorCode) { return ucasemap_mapUTF8( UCASE_LOC_ROOT, csm->options, UCASEMAP_BREAK_ITERATOR_NULL - (uint8_t *)dest, destCapacity, - (const uint8_t *)src, srcLength, + dest, destCapacity, + src, srcLength, ucasemap_internalUTF8Fold, NULL, *pErrorCode); } U_NAMESPACE_BEGIN +void CaseMap::utf8ToLower( + const char *locale, uint32_t options, + StringPiece src, ByteSink &sink, Edits *edits, + UErrorCode &errorCode) { + ucasemap_mapUTF8( + ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL + src.data(), src.length(), + ucasemap_internalUTF8ToLower, sink, edits, errorCode); +} + +void CaseMap::utf8ToUpper( + const char *locale, uint32_t options, + StringPiece src, ByteSink &sink, Edits *edits, + UErrorCode &errorCode) { + ucasemap_mapUTF8( + ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL + src.data(), src.length(), + ucasemap_internalUTF8ToUpper, sink, edits, errorCode); +} + +void CaseMap::utf8Fold( + uint32_t options, + StringPiece src, ByteSink &sink, Edits *edits, + UErrorCode &errorCode) { + ucasemap_mapUTF8( + UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL + src.data(), src.length(), + ucasemap_internalUTF8Fold, sink, edits, errorCode); +} + int32_t CaseMap::utf8ToLower( const char *locale, uint32_t options, const char *src, int32_t srcLength, @@ -878,8 +745,8 @@ int32_t CaseMap::utf8ToLower( UErrorCode &errorCode) { return ucasemap_mapUTF8( ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL - (uint8_t *)dest, destCapacity, - (const uint8_t *)src, srcLength, + dest, destCapacity, + src, srcLength, ucasemap_internalUTF8ToLower, edits, errorCode); } @@ -890,8 +757,8 @@ int32_t CaseMap::utf8ToUpper( UErrorCode &errorCode) { return ucasemap_mapUTF8( ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL - (uint8_t *)dest, destCapacity, - (const uint8_t *)src, srcLength, + dest, destCapacity, + src, srcLength, ucasemap_internalUTF8ToUpper, edits, errorCode); } @@ -902,8 +769,8 @@ int32_t CaseMap::utf8Fold( UErrorCode &errorCode) { return ucasemap_mapUTF8( UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL - (uint8_t *)dest, destCapacity, - (const uint8_t *)src, srcLength, + dest, destCapacity, + src, srcLength, ucasemap_internalUTF8Fold, edits, errorCode); } diff --git a/icu4c/source/common/ucasemap_imp.h b/icu4c/source/common/ucasemap_imp.h index 345a734658b..ad9d52ec320 100644 --- a/icu4c/source/common/ucasemap_imp.h +++ b/icu4c/source/common/ucasemap_imp.h @@ -73,6 +73,8 @@ uprv_haveProperties(UErrorCode *pErrorCode); U_NAMESPACE_BEGIN +class ByteSink; + /** Returns TRUE if the options are valid. Otherwise FALSE, and sets an error. */ inline UBool ustrcase_checkTitleAdjustmentOptions(uint32_t options, UErrorCode &errorCode) { if (U_FAILURE(errorCode)) { return FALSE; } @@ -207,39 +209,43 @@ ustrcase_mapWithOverlap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITE * UTF-8 version of UStringCaseMapper. * All error checking must be done. * The UCaseMap must be fully initialized, with locale and/or iter set as needed. - * src and dest must not overlap. */ -typedef int32_t U_CALLCONV +typedef void U_CALLCONV UTF8CaseMapper(int32_t caseLocale, uint32_t options, #if !UCONFIG_NO_BREAK_ITERATION icu::BreakIterator *iter, #endif - uint8_t *dest, int32_t destCapacity, const uint8_t *src, int32_t srcLength, - icu::Edits *edits, + icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode); #if !UCONFIG_NO_BREAK_ITERATION /** Implements UTF8CaseMapper. */ -U_CFUNC int32_t U_CALLCONV +U_CFUNC void U_CALLCONV ucasemap_internalUTF8ToTitle(int32_t caseLocale, uint32_t options, icu::BreakIterator *iter, - uint8_t *dest, int32_t destCapacity, const uint8_t *src, int32_t srcLength, - icu::Edits *edits, + icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode); #endif +void +ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM + const char *src, int32_t srcLength, + UTF8CaseMapper *stringCaseMapper, + icu::ByteSink &sink, icu::Edits *edits, + UErrorCode &errorCode); + /** * Implements argument checking and buffer handling * for UTF-8 string case mapping as a common function. */ -U_CFUNC int32_t +int32_t ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM - uint8_t *dest, int32_t destCapacity, - const uint8_t *src, int32_t srcLength, + char *dest, int32_t destCapacity, + const char *src, int32_t srcLength, UTF8CaseMapper *stringCaseMapper, icu::Edits *edits, UErrorCode &errorCode); diff --git a/icu4c/source/common/ucasemap_titlecase_brkiter.cpp b/icu4c/source/common/ucasemap_titlecase_brkiter.cpp index 2e09a5548a1..c21dfb7698a 100644 --- a/icu4c/source/common/ucasemap_titlecase_brkiter.cpp +++ b/icu4c/source/common/ucasemap_titlecase_brkiter.cpp @@ -31,6 +31,29 @@ U_NAMESPACE_BEGIN +void CaseMap::utf8ToTitle( + const char *locale, uint32_t options, BreakIterator *iter, + StringPiece src, ByteSink &sink, Edits *edits, + UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { + return; + } + UText utext = UTEXT_INITIALIZER; + utext_openUTF8(&utext, src.data(), src.length(), &errorCode); + LocalPointer ownedIter; + iter = ustrcase_getTitleBreakIterator(nullptr, locale, options, iter, ownedIter, errorCode); + if (iter == nullptr) { + utext_close(&utext); + return; + } + iter->setText(&utext, errorCode); + ucasemap_mapUTF8( + ustrcase_getCaseLocale(locale), options, iter, + src.data(), src.length(), + ucasemap_internalUTF8ToTitle, sink, edits, errorCode); + utext_close(&utext); +} + int32_t CaseMap::utf8ToTitle( const char *locale, uint32_t options, BreakIterator *iter, const char *src, int32_t srcLength, @@ -50,8 +73,8 @@ int32_t CaseMap::utf8ToTitle( iter->setText(&utext, errorCode); int32_t length=ucasemap_mapUTF8( ustrcase_getCaseLocale(locale), options, iter, - (uint8_t *)dest, destCapacity, - (const uint8_t *)src, srcLength, + dest, destCapacity, + src, srcLength, ucasemap_internalUTF8ToTitle, edits, errorCode); utext_close(&utext); return length; @@ -101,8 +124,8 @@ ucasemap_utf8ToTitle(UCaseMap *csm, csm->iter->setText(&utext, *pErrorCode); int32_t length=ucasemap_mapUTF8( csm->caseLocale, csm->options, csm->iter, - (uint8_t *)dest, destCapacity, - (const uint8_t *)src, srcLength, + dest, destCapacity, + src, srcLength, ucasemap_internalUTF8ToTitle, NULL, *pErrorCode); utext_close(&utext); return length; diff --git a/icu4c/source/common/unicode/casemap.h b/icu4c/source/common/unicode/casemap.h index 6a29a426605..4a4917bdcaf 100644 --- a/icu4c/source/common/unicode/casemap.h +++ b/icu4c/source/common/unicode/casemap.h @@ -8,6 +8,7 @@ #define __CASEMAP_H__ #include "unicode/utypes.h" +#include "unicode/stringpiece.h" #include "unicode/uobject.h" /** @@ -20,6 +21,7 @@ U_NAMESPACE_BEGIN #ifndef U_HIDE_DRAFT_API class BreakIterator; +class ByteSink; class Edits; /** @@ -194,6 +196,129 @@ public: char16_t *dest, int32_t destCapacity, Edits *edits, UErrorCode &errorCode); + /** + * Lowercases a UTF-8 string and optionally records edits. + * Casing is locale-dependent and context-sensitive. + * The result may be longer or shorter than the original. + * + * @param locale The locale ID. ("" = root locale, NULL = default locale.) + * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. + * @param src The original string. + * @param sink A ByteSink to which the result string is written. + * sink.Flush() is called at the end. + * @param edits Records edits for index mapping, working with styled text, + * and getting only changes (if any). + * The Edits contents is undefined if any error occurs. + * This function calls edits->reset() first unless + * options includes U_EDITS_NO_RESET. edits can be NULL. + * @param errorCode Reference to an in/out error code value + * which must not indicate a failure before the function call. + * + * @see ucasemap_utf8ToLower + * @draft ICU 60 + */ + static void utf8ToLower( + const char *locale, uint32_t options, + StringPiece src, ByteSink &sink, Edits *edits, + UErrorCode &errorCode); + + /** + * Uppercases a UTF-8 string and optionally records edits. + * Casing is locale-dependent and context-sensitive. + * The result may be longer or shorter than the original. + * + * @param locale The locale ID. ("" = root locale, NULL = default locale.) + * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. + * @param src The original string. + * @param sink A ByteSink to which the result string is written. + * sink.Flush() is called at the end. + * @param edits Records edits for index mapping, working with styled text, + * and getting only changes (if any). + * The Edits contents is undefined if any error occurs. + * This function calls edits->reset() first unless + * options includes U_EDITS_NO_RESET. edits can be NULL. + * @param errorCode Reference to an in/out error code value + * which must not indicate a failure before the function call. + * + * @see ucasemap_utf8ToUpper + * @draft ICU 60 + */ + static void utf8ToUpper( + const char *locale, uint32_t options, + StringPiece src, ByteSink &sink, Edits *edits, + UErrorCode &errorCode); + +#if !UCONFIG_NO_BREAK_ITERATION + + /** + * Titlecases a UTF-8 string and optionally records edits. + * Casing is locale-dependent and context-sensitive. + * The result may be longer or shorter than the original. + * + * Titlecasing uses a break iterator to find the first characters of words + * that are to be titlecased. It titlecases those characters and lowercases + * all others. (This can be modified with options bits.) + * + * @param locale The locale ID. ("" = root locale, NULL = default locale.) + * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET, + * U_TITLECASE_NO_LOWERCASE, + * U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED, + * U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES. + * @param iter A break iterator to find the first characters of words that are to be titlecased. + * It is set to the source string (setUText()) + * and used one or more times for iteration (first() and next()). + * If NULL, then a word break iterator for the locale is used + * (or something equivalent). + * @param src The original string. + * @param sink A ByteSink to which the result string is written. + * sink.Flush() is called at the end. + * @param edits Records edits for index mapping, working with styled text, + * and getting only changes (if any). + * The Edits contents is undefined if any error occurs. + * This function calls edits->reset() first unless + * options includes U_EDITS_NO_RESET. edits can be NULL. + * @param errorCode Reference to an in/out error code value + * which must not indicate a failure before the function call. + * + * @see ucasemap_utf8ToTitle + * @draft ICU 60 + */ + static void utf8ToTitle( + const char *locale, uint32_t options, BreakIterator *iter, + StringPiece src, ByteSink &sink, Edits *edits, + UErrorCode &errorCode); + +#endif // UCONFIG_NO_BREAK_ITERATION + + /** + * Case-folds a UTF-8 string and optionally records edits. + * + * Case folding is locale-independent and not context-sensitive, + * but there is an option for whether to include or exclude mappings for dotted I + * and dotless i that are marked with 'T' in CaseFolding.txt. + * + * The result may be longer or shorter than the original. + * + * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. + * @param src The original string. + * @param sink A ByteSink to which the result string is written. + * sink.Flush() is called at the end. + * @param edits Records edits for index mapping, working with styled text, + * and getting only changes (if any). + * The Edits contents is undefined if any error occurs. + * This function calls edits->reset() first unless + * options includes U_EDITS_NO_RESET. edits can be NULL. + * @param errorCode Reference to an in/out error code value + * which must not indicate a failure before the function call. + * + * @see ucasemap_utf8FoldCase + * @draft ICU 60 + */ + static void utf8Fold( + uint32_t options, + StringPiece src, ByteSink &sink, Edits *edits, + UErrorCode &errorCode); + /** * Lowercases a UTF-8 string and optionally records edits. * Casing is locale-dependent and context-sensitive. @@ -224,7 +349,7 @@ public: * @see ucasemap_utf8ToLower * @draft ICU 59 */ - static int32_t utf8ToLower( + static int32_t utf8ToLower( const char *locale, uint32_t options, const char *src, int32_t srcLength, char *dest, int32_t destCapacity, Edits *edits, diff --git a/icu4c/source/test/intltest/strcase.cpp b/icu4c/source/test/intltest/strcase.cpp index 8f7a57d2b41..f78acc30197 100644 --- a/icu4c/source/test/intltest/strcase.cpp +++ b/icu4c/source/test/intltest/strcase.cpp @@ -1438,55 +1438,50 @@ void StringCaseTest::TestCaseMapToString() { void StringCaseTest::TestCaseMapUTF8ToString() { IcuTestErrorCode errorCode(*this, "TestCaseMapUTF8ToString"); - // TODO: Change this to writing to string via ByteSink when that is available. - char dest[50]; + std::string dest; + StringByteSink sink(&dest); // Omit unchanged text. - int32_t length = CaseMap::utf8ToLower("tr", U_OMIT_UNCHANGED_TEXT, - u8"IstanBul", 8, dest, UPRV_LENGTHOF(dest), nullptr, errorCode); - assertEquals(u"toLower(IstanBul)", UnicodeString(u"ıb"), - UnicodeString::fromUTF8(StringPiece(dest, length))); - length = CaseMap::utf8ToUpper("el", U_OMIT_UNCHANGED_TEXT, - u8"Πατάτα", 6 * 2, dest, UPRV_LENGTHOF(dest), nullptr, errorCode); + CaseMap::utf8ToLower("tr", U_OMIT_UNCHANGED_TEXT, u8"IstanBul", sink, nullptr, errorCode); + assertEquals(u"toLower(IstanBul)", UnicodeString(u"ıb"), UnicodeString::fromUTF8(dest)); + dest.clear(); + CaseMap::utf8ToUpper("el", U_OMIT_UNCHANGED_TEXT, u8"Πατάτα", sink, nullptr, errorCode); assertEquals(u"toUpper(Πατάτα)", UnicodeString(u"ΑΤΑΤΑ"), - UnicodeString::fromUTF8(StringPiece(dest, length))); + UnicodeString::fromUTF8(dest)); #if !UCONFIG_NO_BREAK_ITERATION - length = CaseMap::utf8ToTitle("nl", - U_OMIT_UNCHANGED_TEXT | - U_TITLECASE_NO_BREAK_ADJUSTMENT | - U_TITLECASE_NO_LOWERCASE, - nullptr, u8"IjssEL IglOo", 12, - dest, UPRV_LENGTHOF(dest), nullptr, errorCode); + dest.clear(); + CaseMap::utf8ToTitle( + "nl", U_OMIT_UNCHANGED_TEXT | U_TITLECASE_NO_BREAK_ADJUSTMENT | U_TITLECASE_NO_LOWERCASE, + nullptr, u8"IjssEL IglOo", sink, nullptr, errorCode); assertEquals(u"toTitle(IjssEL IglOo)", UnicodeString(u"J"), - UnicodeString::fromUTF8(StringPiece(dest, length))); + UnicodeString::fromUTF8(dest)); #endif - length = CaseMap::utf8Fold(U_OMIT_UNCHANGED_TEXT | U_FOLD_CASE_EXCLUDE_SPECIAL_I, - u8"IßtanBul", 1 + 2 + 6, dest, UPRV_LENGTHOF(dest), nullptr, errorCode); + dest.clear(); + CaseMap::utf8Fold(U_OMIT_UNCHANGED_TEXT | U_FOLD_CASE_EXCLUDE_SPECIAL_I, + u8"IßtanBul", sink, nullptr, errorCode); assertEquals(u"foldCase(IßtanBul)", UnicodeString(u"ıssb"), - UnicodeString::fromUTF8(StringPiece(dest, length))); + UnicodeString::fromUTF8(dest)); // Return the whole result string. - length = CaseMap::utf8ToLower("tr", 0, - u8"IstanBul", 8, dest, UPRV_LENGTHOF(dest), nullptr, errorCode); + dest.clear(); + CaseMap::utf8ToLower("tr", 0, u8"IstanBul", sink, nullptr, errorCode); assertEquals(u"toLower(IstanBul)", UnicodeString(u"ıstanbul"), - UnicodeString::fromUTF8(StringPiece(dest, length))); - length = CaseMap::utf8ToUpper("el", 0, - u8"Πατάτα", 6 * 2, dest, UPRV_LENGTHOF(dest), nullptr, errorCode); + UnicodeString::fromUTF8(dest)); + dest.clear(); + CaseMap::utf8ToUpper("el", 0, u8"Πατάτα", sink, nullptr, errorCode); assertEquals(u"toUpper(Πατάτα)", UnicodeString(u"ΠΑΤΑΤΑ"), - UnicodeString::fromUTF8(StringPiece(dest, length))); + UnicodeString::fromUTF8(dest)); #if !UCONFIG_NO_BREAK_ITERATION - length = CaseMap::utf8ToTitle("nl", - U_TITLECASE_NO_BREAK_ADJUSTMENT | - U_TITLECASE_NO_LOWERCASE, - nullptr, u8"IjssEL IglOo", 12, - dest, UPRV_LENGTHOF(dest), nullptr, errorCode); + dest.clear(); + CaseMap::utf8ToTitle("nl", U_TITLECASE_NO_BREAK_ADJUSTMENT | U_TITLECASE_NO_LOWERCASE, + nullptr, u8"IjssEL IglOo", sink, nullptr, errorCode); assertEquals(u"toTitle(IjssEL IglOo)", UnicodeString(u"IJssEL IglOo"), - UnicodeString::fromUTF8(StringPiece(dest, length))); + UnicodeString::fromUTF8(dest)); #endif - length = CaseMap::utf8Fold(U_FOLD_CASE_EXCLUDE_SPECIAL_I, - u8"IßtanBul", 1 + 2 + 6, dest, UPRV_LENGTHOF(dest), nullptr, errorCode); + dest.clear(); + CaseMap::utf8Fold(U_FOLD_CASE_EXCLUDE_SPECIAL_I, u8"IßtanBul", sink, nullptr, errorCode); assertEquals(u"foldCase(IßtanBul)", UnicodeString(u"ısstanbul"), - UnicodeString::fromUTF8(StringPiece(dest, length))); + UnicodeString::fromUTF8(dest)); } void StringCaseTest::TestLongUnicodeString() {