From 524748c6bfee282fc150b3614523ec1a7191e608 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Thu, 12 Mar 2020 19:21:24 -0700 Subject: [PATCH] ICU-20984 StringPiece & ByteSink overloads for char8_t* --- icu4c/source/common/ucasemap.cpp | 6 +- icu4c/source/common/unicode/bytestream.h | 34 +++++++ icu4c/source/common/unicode/stringpiece.h | 88 +++++++++++++++++-- icu4c/source/test/intltest/collationtest.cpp | 17 ++-- .../intltest/compactdecimalformattest.cpp | 61 ++++++------- icu4c/source/test/intltest/regextst.cpp | 5 +- icu4c/source/test/intltest/strcase.cpp | 25 +++--- icu4c/source/test/intltest/strtest.cpp | 66 +++++++++++++- icu4c/source/test/intltest/strtest.h | 2 + icu4c/source/test/intltest/tstnorm.cpp | 29 +++--- icu4c/source/test/intltest/uts46test.cpp | 2 +- 11 files changed, 258 insertions(+), 77 deletions(-) diff --git a/icu4c/source/common/ucasemap.cpp b/icu4c/source/common/ucasemap.cpp index cc998c993d7..ed72bda828f 100644 --- a/icu4c/source/common/ucasemap.cpp +++ b/icu4c/source/common/ucasemap.cpp @@ -687,13 +687,13 @@ void toUpper(uint32_t options, if (change) { ByteSinkUtil::appendTwoBytes(upper, sink); if ((data & HAS_EITHER_DIALYTIKA) != 0) { - sink.Append(reinterpret_cast(u8"\u0308"), 2); // restore or add a dialytika + sink.AppendU8(u8"\u0308", 2); // restore or add a dialytika } if (addTonos) { - sink.Append(reinterpret_cast(u8"\u0301"), 2); + sink.AppendU8(u8"\u0301", 2); } while (numYpogegrammeni > 0) { - sink.Append(reinterpret_cast(u8"\u0399"), 2); + sink.AppendU8(u8"\u0399", 2); --numYpogegrammeni; } } diff --git a/icu4c/source/common/unicode/bytestream.h b/icu4c/source/common/unicode/bytestream.h index 0d60492fe23..7fe24062228 100644 --- a/icu4c/source/common/unicode/bytestream.h +++ b/icu4c/source/common/unicode/bytestream.h @@ -71,6 +71,40 @@ public: */ virtual void Append(const char* bytes, int32_t n) = 0; +#ifndef U_HIDE_DRAFT_API + /** + * Appends n bytes to this. Same as Append(). + * Call AppendU8() with u8"string literals" which are const char * in C++11 + * but const char8_t * in C++20. + * If the compiler does support char8_t as a distinct type, + * then an AppendU8() overload for that is defined and will be chosen. + * + * @param bytes the pointer to the bytes + * @param n the number of bytes; must be non-negative + * @draft ICU 67 + */ + inline void AppendU8(const char* bytes, int32_t n) { + Append(bytes, n); + } + +#if defined(__cpp_char8_t) || defined(U_IN_DOXYGEN) + /** + * Appends n bytes to this. Same as Append() but for a const char8_t * pointer. + * Call AppendU8() with u8"string literals" which are const char * in C++11 + * but const char8_t * in C++20. + * If the compiler does support char8_t as a distinct type, + * then this AppendU8() overload for that is defined and will be chosen. + * + * @param bytes the pointer to the bytes + * @param n the number of bytes; must be non-negative + * @draft ICU 67 + */ + inline void AppendU8(const char8_t* bytes, int32_t n) { + Append(reinterpret_cast(bytes), n); + } +#endif +#endif // U_HIDE_DRAFT_API + /** * Returns a writable buffer for appending and writes the buffer's capacity to * *result_capacity. Guarantees *result_capacity>=min_capacity. diff --git a/icu4c/source/common/unicode/stringpiece.h b/icu4c/source/common/unicode/stringpiece.h index ba2240e6ac0..52c1e9ebd24 100644 --- a/icu4c/source/common/unicode/stringpiece.h +++ b/icu4c/source/common/unicode/stringpiece.h @@ -67,19 +67,50 @@ class U_COMMON_API StringPiece : public UMemory { * Default constructor, creates an empty StringPiece. * @stable ICU 4.2 */ - StringPiece() : ptr_(NULL), length_(0) { } + StringPiece() : ptr_(nullptr), length_(0) { } + /** * Constructs from a NUL-terminated const char * pointer. * @param str a NUL-terminated const char * pointer * @stable ICU 4.2 */ StringPiece(const char* str); +#ifndef U_HIDE_DRAFT_API +#if defined(__cpp_char8_t) || defined(U_IN_DOXYGEN) + /** + * Constructs from a NUL-terminated const char8_t * pointer. + * @param str a NUL-terminated const char8_t * pointer + * @draft ICU 67 + */ + StringPiece(const char8_t* str) : StringPiece(reinterpret_cast(str)) {} +#endif + /** + * Constructs an empty StringPiece. + * Needed for type disambiguation from multiple other overloads. + * @param p nullptr + * @draft ICU 67 + */ + StringPiece(std::nullptr_t p) : ptr_(p), length_(0) {} +#endif // U_HIDE_DRAFT_API + /** * Constructs from a std::string. * @stable ICU 4.2 */ StringPiece(const std::string& str) : ptr_(str.data()), length_(static_cast(str.size())) { } +#ifndef U_HIDE_DRAFT_API +#if defined(__cpp_lib_char8_t) || defined(U_IN_DOXYGEN) + /** + * Constructs from a std::u8string. + * @draft ICU 67 + */ + StringPiece(const std::u8string& str) + : ptr_(reinterpret_cast(str.data())), + length_(static_cast(str.size())) { } +#endif +#endif // U_HIDE_DRAFT_API + #ifndef U_HIDE_DRAFT_API /** * Constructs from some other implementation of a string piece class, from any @@ -88,7 +119,7 @@ class U_COMMON_API StringPiece : public UMemory { * \code{.cpp} * * struct OtherStringPieceClass { - * const char* data(); + * const char* data(); // or const char8_t* * size_t size(); * }; * @@ -97,16 +128,25 @@ class U_COMMON_API StringPiece : public UMemory { * The other string piece class will typically be std::string_view from C++17 * or absl::string_view from Abseil. * + * Starting with C++20, data() may also return a const char8_t* pointer, + * as from std::u8string_view. + * * @param str the other string piece * @draft ICU 65 */ template ::value && + (std::is_same::value +#if defined(__cpp_char8_t) + || std::is_same::value +#endif + ) && std::is_same::value>::type> StringPiece(T str) - : ptr_(str.data()), length_(static_cast(str.size())) {} + : ptr_(reinterpret_cast(str.data())), + length_(static_cast(str.size())) {} #endif // U_HIDE_DRAFT_API + /** * Constructs from a const char * pointer and a specified length. * @param offset a const char * pointer (need not be terminated) @@ -114,6 +154,19 @@ class U_COMMON_API StringPiece : public UMemory { * @stable ICU 4.2 */ StringPiece(const char* offset, int32_t len) : ptr_(offset), length_(len) { } +#ifndef U_HIDE_DRAFT_API +#if defined(__cpp_char8_t) || defined(U_IN_DOXYGEN) + /** + * Constructs from a const char8_t * pointer and a specified length. + * @param str a const char8_t * pointer (need not be terminated) + * @param len the length of the string; must be non-negative + * @draft ICU 67 + */ + StringPiece(const char8_t* str, int32_t len) : + StringPiece(reinterpret_cast(str), len) {} +#endif +#endif // U_HIDE_DRAFT_API + /** * Substring of another StringPiece. * @param x the other StringPiece @@ -132,7 +185,7 @@ class U_COMMON_API StringPiece : public UMemory { StringPiece(const StringPiece& x, int32_t pos, int32_t len); /** - * Returns the string pointer. May be NULL if it is empty. + * Returns the string pointer. May be nullptr if it is empty. * * data() may return a pointer to a buffer with embedded NULs, and the * returned buffer may or may not be null terminated. Therefore it is @@ -165,7 +218,7 @@ class U_COMMON_API StringPiece : public UMemory { * Sets to an empty string. * @stable ICU 4.2 */ - void clear() { ptr_ = NULL; length_ = 0; } + void clear() { ptr_ = nullptr; length_ = 0; } /** * Reset the stringpiece to refer to new data. @@ -182,6 +235,29 @@ class U_COMMON_API StringPiece : public UMemory { */ void set(const char* str); +#ifndef U_HIDE_DRAFT_API +#if defined(__cpp_char8_t) || defined(U_IN_DOXYGEN) + /** + * Resets the stringpiece to refer to new data. + * @param xdata pointer the new string data. Need not be NUL-terminated. + * @param len the length of the new data + * @draft ICU 67 + */ + inline void set(const char8_t* xdata, int32_t len) { + set(reinterpret_cast(xdata), len); + } + + /** + * Resets the stringpiece to refer to new data. + * @param str a pointer to a NUL-terminated string. + * @draft ICU 67 + */ + inline void set(const char8_t* str) { + set(reinterpret_cast(str)); + } +#endif +#endif // U_HIDE_DRAFT_API + /** * Removes the first n string units. * @param n prefix length, must be non-negative and <=length() diff --git a/icu4c/source/test/intltest/collationtest.cpp b/icu4c/source/test/intltest/collationtest.cpp index 9562e4d4aeb..de51eece5c4 100644 --- a/icu4c/source/test/intltest/collationtest.cpp +++ b/icu4c/source/test/intltest/collationtest.cpp @@ -22,6 +22,7 @@ #include "unicode/sortkey.h" #include "unicode/std_string.h" #include "unicode/strenum.h" +#include "unicode/stringpiece.h" #include "unicode/tblcoll.h" #include "unicode/uiter.h" #include "unicode/uniset.h" @@ -293,15 +294,15 @@ void CollationTest::TestIllegalUTF8() { } coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode); - static const char *strings[] = { + static const StringPiece strings[] = { // string with U+FFFD == illegal byte sequence - reinterpret_cast(u8"a\uFFFDz"), reinterpret_cast("a\x80z"), // trail byte - reinterpret_cast(u8"a\uFFFD\uFFFDz"), reinterpret_cast("a\xc1\x81z"), // non-shortest form - reinterpret_cast(u8"a\uFFFD\uFFFD\uFFFDz"), reinterpret_cast("a\xe0\x82\x83z"), // non-shortest form - reinterpret_cast(u8"a\uFFFD\uFFFD\uFFFDz"), reinterpret_cast("a\xed\xa0\x80z"), // lead surrogate: would be U+D800 - reinterpret_cast(u8"a\uFFFD\uFFFD\uFFFDz"), reinterpret_cast("a\xed\xbf\xbfz"), // trail surrogate: would be U+DFFF - reinterpret_cast(u8"a\uFFFD\uFFFD\uFFFD\uFFFDz"), reinterpret_cast("a\xf0\x8f\xbf\xbfz"), // non-shortest form - reinterpret_cast(u8"a\uFFFD\uFFFD\uFFFD\uFFFDz"), reinterpret_cast("a\xf4\x90\x80\x80z") // out of range: would be U+110000 + u8"a\uFFFDz", "a\x80z", // trail byte + u8"a\uFFFD\uFFFDz", "a\xc1\x81z", // non-shortest form + u8"a\uFFFD\uFFFD\uFFFDz", "a\xe0\x82\x83z", // non-shortest form + u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xa0\x80z", // lead surrogate: would be U+D800 + u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xbf\xbfz", // trail surrogate: would be U+DFFF + u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf0\x8f\xbf\xbfz", // non-shortest form + u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf4\x90\x80\x80z" // out of range: would be U+110000 }; for(int32_t i = 0; i < UPRV_LENGTHOF(strings); i += 2) { diff --git a/icu4c/source/test/intltest/compactdecimalformattest.cpp b/icu4c/source/test/intltest/compactdecimalformattest.cpp index 90afecdb293..1f51f70d010 100644 --- a/icu4c/source/test/intltest/compactdecimalformattest.cpp +++ b/icu4c/source/test/intltest/compactdecimalformattest.cpp @@ -23,6 +23,7 @@ typedef struct ExpectedResult { double value; + // Invariant characters, will be converted to UTF-16 and then unescaped. const char *expected; } ExpectedResult; @@ -185,38 +186,38 @@ static ExpectedResult kChineseCurrencyTestData[] = { {123456789012345.0, "\\u00A5120\\u4E07\\u4EBF"}, }; static ExpectedResult kGermanCurrencyTestData[] = { - {1.0, reinterpret_cast(u8"1\\u00A0\\u20AC")}, - {12.0, reinterpret_cast(u8"12\\u00A0\\u20AC")}, - {123.0, reinterpret_cast(u8"120\\u00A0\\u20AC")}, - {1234.0, reinterpret_cast(u8"1200\\u00A0\\u20AC")}, - {12345.0, reinterpret_cast(u8"12.000\\u00A0\\u20AC")}, - {123456.0, reinterpret_cast(u8"120.000\\u00A0\\u20AC")}, - {1234567.0, reinterpret_cast(u8"1,2\\u00A0Mio.\\u00A0\\u20AC")}, - {12345678.0, reinterpret_cast(u8"12\\u00A0Mio.\\u00A0\\u20AC")}, - {123456789.0, reinterpret_cast(u8"120\\u00A0Mio.\\u00A0\\u20AC")}, - {1234567890.0, reinterpret_cast(u8"1,2\\u00A0Mrd.\\u00A0\\u20AC")}, - {12345678901.0, reinterpret_cast(u8"12\\u00A0Mrd.\\u00A0\\u20AC")}, - {123456789012.0, reinterpret_cast(u8"120\\u00A0Mrd.\\u00A0\\u20AC")}, - {1234567890123.0, reinterpret_cast(u8"1,2\\u00A0Bio.\\u00A0\\u20AC")}, - {12345678901234.0, reinterpret_cast(u8"12\\u00A0Bio.\\u00A0\\u20AC")}, - {123456789012345.0, reinterpret_cast(u8"120\\u00A0Bio.\\u00A0\\u20AC")}, + {1.0, "1\\u00A0\\u20AC"}, + {12.0, "12\\u00A0\\u20AC"}, + {123.0, "120\\u00A0\\u20AC"}, + {1234.0, "1200\\u00A0\\u20AC"}, + {12345.0, "12.000\\u00A0\\u20AC"}, + {123456.0, "120.000\\u00A0\\u20AC"}, + {1234567.0, "1,2\\u00A0Mio.\\u00A0\\u20AC"}, + {12345678.0, "12\\u00A0Mio.\\u00A0\\u20AC"}, + {123456789.0, "120\\u00A0Mio.\\u00A0\\u20AC"}, + {1234567890.0, "1,2\\u00A0Mrd.\\u00A0\\u20AC"}, + {12345678901.0, "12\\u00A0Mrd.\\u00A0\\u20AC"}, + {123456789012.0, "120\\u00A0Mrd.\\u00A0\\u20AC"}, + {1234567890123.0, "1,2\\u00A0Bio.\\u00A0\\u20AC"}, + {12345678901234.0, "12\\u00A0Bio.\\u00A0\\u20AC"}, + {123456789012345.0, "120\\u00A0Bio.\\u00A0\\u20AC"}, }; static ExpectedResult kEnglishCurrencyTestData[] = { - {1.0, reinterpret_cast(u8"$1")}, - {12.0, reinterpret_cast(u8"$12")}, - {123.0, reinterpret_cast(u8"$120")}, - {1234.0, reinterpret_cast(u8"$1.2K")}, - {12345.0, reinterpret_cast(u8"$12K")}, - {123456.0, reinterpret_cast(u8"$120K")}, - {1234567.0, reinterpret_cast(u8"$1.2M")}, - {12345678.0, reinterpret_cast(u8"$12M")}, - {123456789.0, reinterpret_cast(u8"$120M")}, - {1234567890.0, reinterpret_cast(u8"$1.2B")}, - {12345678901.0, reinterpret_cast(u8"$12B")}, - {123456789012.0, reinterpret_cast(u8"$120B")}, - {1234567890123.0, reinterpret_cast(u8"$1.2T")}, - {12345678901234.0, reinterpret_cast(u8"$12T")}, - {123456789012345.0, reinterpret_cast(u8"$120T")}, + {1.0, "$1"}, + {12.0, "$12"}, + {123.0, "$120"}, + {1234.0, "$1.2K"}, + {12345.0, "$12K"}, + {123456.0, "$120K"}, + {1234567.0, "$1.2M"}, + {12345678.0, "$12M"}, + {123456789.0, "$120M"}, + {1234567890.0, "$1.2B"}, + {12345678901.0, "$12B"}, + {123456789012.0, "$120B"}, + {1234567890123.0, "$1.2T"}, + {12345678901234.0, "$12T"}, + {123456789012345.0, "$120T"}, }; diff --git a/icu4c/source/test/intltest/regextst.cpp b/icu4c/source/test/intltest/regextst.cpp index 311c7bc94b9..5f7e36b3ae1 100644 --- a/icu4c/source/test/intltest/regextst.cpp +++ b/icu4c/source/test/intltest/regextst.cpp @@ -31,6 +31,7 @@ #include "unicode/localpointer.h" #include "unicode/regex.h" +#include "unicode/stringpiece.h" #include "unicode/uchar.h" #include "unicode/ucnv.h" #include "unicode/uniset.h" @@ -5838,11 +5839,11 @@ void RegexTest::TestBug12884() { REGEX_ASSERT(status == U_REGEX_TIME_OUT); // UText, wrapping non-UTF-16 text, also takes a different execution path. - const char *text8 = reinterpret_cast(u8"¿Qué es Unicode? Unicode proporciona un número único para cada" + StringPiece text8(u8"¿Qué es Unicode? Unicode proporciona un número único para cada" "carácter, sin importar la plataforma, sin importar el programa," "sin importar el idioma."); status = U_ZERO_ERROR; - LocalUTextPointer ut(utext_openUTF8(NULL, text8, -1, &status)); + LocalUTextPointer ut(utext_openUTF8(NULL, text8.data(), text8.length(), &status)); REGEX_CHECK_STATUS; m.reset(ut.getAlias()); m.find(status); diff --git a/icu4c/source/test/intltest/strcase.cpp b/icu4c/source/test/intltest/strcase.cpp index dc81fb45132..4093c519262 100644 --- a/icu4c/source/test/intltest/strcase.cpp +++ b/icu4c/source/test/intltest/strcase.cpp @@ -1314,7 +1314,8 @@ void StringCaseTest::TestCaseMapUTF8WithEdits() { Edits edits; int32_t length = CaseMap::utf8ToLower("tr", U_OMIT_UNCHANGED_TEXT, - reinterpret_cast(u8"IstanBul"), 8, dest, UPRV_LENGTHOF(dest), &edits, errorCode); + reinterpret_cast(u8"IstanBul"), 8, + dest, UPRV_LENGTHOF(dest), &edits, errorCode); assertEquals(u"toLower(IstanBul)", UnicodeString(u"ıb"), UnicodeString::fromUTF8(StringPiece(dest, length))); static const EditChange lowerExpectedChanges[] = { @@ -1330,7 +1331,8 @@ void StringCaseTest::TestCaseMapUTF8WithEdits() { edits.reset(); length = CaseMap::utf8ToUpper("el", U_OMIT_UNCHANGED_TEXT, - reinterpret_cast(u8"Πατάτα"), 6 * 2, dest, UPRV_LENGTHOF(dest), &edits, errorCode); + reinterpret_cast(u8"Πατάτα"), 6 * 2, + dest, UPRV_LENGTHOF(dest), &edits, errorCode); assertEquals(u"toUpper(Πατάτα)", UnicodeString(u"ΑΤΑΤΑ"), UnicodeString::fromUTF8(StringPiece(dest, length))); static const EditChange upperExpectedChanges[] = { @@ -1370,7 +1372,8 @@ void StringCaseTest::TestCaseMapUTF8WithEdits() { // No explicit nor automatic edits.reset(). Edits should be appended. length = CaseMap::utf8Fold(U_OMIT_UNCHANGED_TEXT | U_EDITS_NO_RESET | U_FOLD_CASE_EXCLUDE_SPECIAL_I, - reinterpret_cast(u8"IßtanBul"), 1 + 2 + 6, dest, UPRV_LENGTHOF(dest), &edits, errorCode); + reinterpret_cast(u8"IßtanBul"), 1 + 2 + 6, + dest, UPRV_LENGTHOF(dest), &edits, errorCode); assertEquals(u"foldCase(IßtanBul)", UnicodeString(u"ıssb"), UnicodeString::fromUTF8(StringPiece(dest, length))); static const EditChange foldExpectedChanges[] = { @@ -1454,44 +1457,44 @@ void StringCaseTest::TestCaseMapUTF8ToString() { StringByteSink sink(&dest); // Omit unchanged text. - CaseMap::utf8ToLower("tr", U_OMIT_UNCHANGED_TEXT, reinterpret_cast(u8"IstanBul"), sink, nullptr, errorCode); + CaseMap::utf8ToLower("tr", U_OMIT_UNCHANGED_TEXT, u8"IstanBul", sink, nullptr, errorCode); assertEquals(u"toLower(IstanBul)", UnicodeString(u"ıb"), UnicodeString::fromUTF8(dest)); dest.clear(); - CaseMap::utf8ToUpper("el", U_OMIT_UNCHANGED_TEXT, reinterpret_cast(u8"Πατάτα"), sink, nullptr, errorCode); + CaseMap::utf8ToUpper("el", U_OMIT_UNCHANGED_TEXT, u8"Πατάτα", sink, nullptr, errorCode); assertEquals(u"toUpper(Πατάτα)", UnicodeString(u"ΑΤΑΤΑ"), UnicodeString::fromUTF8(dest)); #if !UCONFIG_NO_BREAK_ITERATION dest.clear(); CaseMap::utf8ToTitle( "nl", U_OMIT_UNCHANGED_TEXT | U_TITLECASE_NO_BREAK_ADJUSTMENT | U_TITLECASE_NO_LOWERCASE, - nullptr, reinterpret_cast(u8"IjssEL IglOo"), sink, nullptr, errorCode); + nullptr, u8"IjssEL IglOo", sink, nullptr, errorCode); assertEquals(u"toTitle(IjssEL IglOo)", UnicodeString(u"J"), UnicodeString::fromUTF8(dest)); #endif dest.clear(); CaseMap::utf8Fold(U_OMIT_UNCHANGED_TEXT | U_FOLD_CASE_EXCLUDE_SPECIAL_I, - reinterpret_cast(u8"IßtanBul"), sink, nullptr, errorCode); + u8"IßtanBul", sink, nullptr, errorCode); assertEquals(u"foldCase(IßtanBul)", UnicodeString(u"ıssb"), UnicodeString::fromUTF8(dest)); // Return the whole result string. dest.clear(); - CaseMap::utf8ToLower("tr", 0, reinterpret_cast(u8"IstanBul"), sink, nullptr, errorCode); + CaseMap::utf8ToLower("tr", 0, u8"IstanBul", sink, nullptr, errorCode); assertEquals(u"toLower(IstanBul)", UnicodeString(u"ıstanbul"), UnicodeString::fromUTF8(dest)); dest.clear(); - CaseMap::utf8ToUpper("el", 0, reinterpret_cast(u8"Πατάτα"), sink, nullptr, errorCode); + CaseMap::utf8ToUpper("el", 0, u8"Πατάτα", sink, nullptr, errorCode); assertEquals(u"toUpper(Πατάτα)", UnicodeString(u"ΠΑΤΑΤΑ"), UnicodeString::fromUTF8(dest)); #if !UCONFIG_NO_BREAK_ITERATION dest.clear(); CaseMap::utf8ToTitle("nl", U_TITLECASE_NO_BREAK_ADJUSTMENT | U_TITLECASE_NO_LOWERCASE, - nullptr, reinterpret_cast(u8"IjssEL IglOo"), sink, nullptr, errorCode); + nullptr, u8"IjssEL IglOo", sink, nullptr, errorCode); assertEquals(u"toTitle(IjssEL IglOo)", UnicodeString(u"IJssEL IglOo"), UnicodeString::fromUTF8(dest)); #endif dest.clear(); - CaseMap::utf8Fold(U_FOLD_CASE_EXCLUDE_SPECIAL_I, reinterpret_cast(u8"IßtanBul"), sink, nullptr, errorCode); + CaseMap::utf8Fold(U_FOLD_CASE_EXCLUDE_SPECIAL_I, u8"IßtanBul", sink, nullptr, errorCode); assertEquals(u"foldCase(IßtanBul)", UnicodeString(u"ısstanbul"), UnicodeString::fromUTF8(dest)); } diff --git a/icu4c/source/test/intltest/strtest.cpp b/icu4c/source/test/intltest/strtest.cpp index 1665a03cdb8..cf00cd4241d 100644 --- a/icu4c/source/test/intltest/strtest.cpp +++ b/icu4c/source/test/intltest/strtest.cpp @@ -248,9 +248,11 @@ void StringTest::runIndexedTest(int32_t index, UBool exec, const char *&name, ch #ifdef U_HAVE_STRING_VIEW TESTCASE_AUTO(TestStringPieceStringView); #endif + TESTCASE_AUTO(TestStringPieceU8); TESTCASE_AUTO(TestByteSink); TESTCASE_AUTO(TestCheckedArrayByteSink); TESTCASE_AUTO(TestStringByteSink); + TESTCASE_AUTO(TestStringByteSinkAppendU8); TESTCASE_AUTO(TestCharString); TESTCASE_AUTO(TestCStr); TESTCASE_AUTO(Testctou); @@ -265,7 +267,7 @@ StringTest::TestStringPiece() { errln("StringPiece() failed"); } // Construct from NULL const char * pointer. - StringPiece null(NULL); + StringPiece null((const char *)nullptr); if(!null.empty() || null.data()!=NULL || null.length()!=0 || null.size()!=0) { errln("StringPiece(NULL) failed"); } @@ -395,7 +397,7 @@ StringTest::TestStringPiece() { void StringTest::TestStringPieceComparisons() { StringPiece empty; - StringPiece null(NULL); + StringPiece null(nullptr); StringPiece abc("abc"); StringPiece abcd("abcdefg", 4); StringPiece abx("abx"); @@ -521,6 +523,52 @@ StringTest::TestStringPieceStringView() { } #endif +void +StringTest::TestStringPieceU8() { + // ICU-20984 "mitigate some C++20 char8_t breakages" + // For the following APIs there are overloads for both + // const char * and const char8_t *. + // A u8"string literal" has one type or the other + // depending on C++ version and compiler settings. + StringPiece abc(u8"abc"); + assertEquals("abc.length", 3, abc.length()); + assertEquals("abc", "\x61\x62\x63", abc.data()); + + StringPiece abc3(u8"abcdef", 3); + assertEquals("abc3.length", 3, abc3.length()); + assertEquals("abc3[0]", 0x61, abc3.data()[0]); + assertEquals("abc3[1]", 0x62, abc3.data()[1]); + assertEquals("abc3[2]", 0x63, abc3.data()[2]); + + StringPiece uvw("q"); + uvw.set(u8"uvw"); + assertEquals("uvw.length", 3, uvw.length()); + assertEquals("uvw", "\x75\x76\x77", uvw.data()); + + StringPiece xyz("r"); + xyz.set(u8"xyzXYZ", 3); + assertEquals("xyz.length", 3, xyz.length()); + assertEquals("xyz[0]", 0x78, xyz.data()[0]); + assertEquals("xyz[1]", 0x79, xyz.data()[1]); + assertEquals("xyz[2]", 0x7a, xyz.data()[2]); + + StringPiece null(nullptr); + assertTrue("null is empty", null.empty()); + assertTrue("null is null", null.data() == nullptr); + +#ifdef __cpp_lib_char8_t + std::u8string_view u8sv(u8"sv"); // C++20 + StringPiece u8svsp(u8sv); + assertEquals("u8svsp.length", 2, u8svsp.length()); + assertEquals("u8svsp", "\x73\x76", u8svsp.data()); + + std::u8string u8str(u8"str"); // C++20 + StringPiece u8strsp(u8str); + assertEquals("u8strsp.length", 3, u8strsp.length()); + assertEquals("u8strsp", "\x73\x74\x72", u8strsp.data()); +#endif // __cpp_lib_char8_t +} + // Verify that ByteSink is subclassable and Flush() overridable. class SimpleByteSink : public ByteSink { public: @@ -653,6 +701,20 @@ StringTest::TestStringByteSink() { } } +void +StringTest::TestStringByteSinkAppendU8() { + // ICU-20984 "mitigate some C++20 char8_t breakages" + // For the following APIs there are overloads for both + // const char * and const char8_t *. + // A u8"string literal" has one type or the other + // depending on C++ version and compiler settings. + std::string result("abc"); + StringByteSink sink(&result); + sink.AppendU8("def", 3); + sink.AppendU8(u8"ghijkl", 4); + assertEquals("abcdefghij", "abcdef\x67\x68\x69\x6a", result.c_str()); +} + #if defined(_MSC_VER) #include #endif diff --git a/icu4c/source/test/intltest/strtest.h b/icu4c/source/test/intltest/strtest.h index 8359f84823a..2a1b98804f3 100644 --- a/icu4c/source/test/intltest/strtest.h +++ b/icu4c/source/test/intltest/strtest.h @@ -49,9 +49,11 @@ private: #ifdef U_HAVE_STRING_VIEW void TestStringPieceStringView(); #endif + void TestStringPieceU8(); void TestByteSink(); void TestCheckedArrayByteSink(); void TestStringByteSink(); + void TestStringByteSinkAppendU8(); void TestSTLCompatibility(); void TestCharString(); void TestCStr(); diff --git a/icu4c/source/test/intltest/tstnorm.cpp b/icu4c/source/test/intltest/tstnorm.cpp index 886df6f15ad..e478872d53e 100644 --- a/icu4c/source/test/intltest/tstnorm.cpp +++ b/icu4c/source/test/intltest/tstnorm.cpp @@ -14,6 +14,7 @@ #include "unicode/errorcode.h" #include "unicode/normlzr.h" #include "unicode/stringoptions.h" +#include "unicode/stringpiece.h" #include "unicode/uniset.h" #include "unicode/usetiter.h" #include "unicode/schriter.h" @@ -1573,15 +1574,15 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() { if(errorCode.errDataIfFailureAndReset("Normalizer2::getNFKCCasefoldInstance() call failed")) { return; } - static const char *const src = - reinterpret_cast(u8" AÄA\u0308A\u0308\u00ad\u0323Ä\u0323,\u00ad\u1100\u1161가\u11A8가\u3133 "); - std::string expected = reinterpret_cast(u8" aääạ\u0308ạ\u0308,가각갃 "); + static const StringPiece src = + u8" AÄA\u0308A\u0308\u00ad\u0323Ä\u0323,\u00ad\u1100\u1161가\u11A8가\u3133 "; + StringPiece expected = u8" aääạ\u0308ạ\u0308,가각갃 "; std::string result; StringByteSink sink(&result, static_cast(expected.length())); Edits edits; nfkc_cf->normalizeUTF8(0, src, sink, &edits, errorCode); assertSuccess("normalizeUTF8 with Edits", errorCode.get()); - assertEquals("normalizeUTF8 with Edits", expected.c_str(), result.c_str()); + assertEquals("normalizeUTF8 with Edits", expected.data(), result.c_str()); static const EditChange expectedChanges[] = { { FALSE, 2, 2 }, // 2 spaces { TRUE, 1, 1 }, // A→a @@ -1607,12 +1608,12 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() { assertTrue("isNormalizedUTF8(normalized)", nfkc_cf->isNormalizedUTF8(result, errorCode)); // Omit unchanged text. - expected = reinterpret_cast(u8"aääạ\u0308ạ\u0308가각갃"); + expected = u8"aääạ\u0308ạ\u0308가각갃"; result.clear(); edits.reset(); nfkc_cf->normalizeUTF8(U_OMIT_UNCHANGED_TEXT, src, sink, &edits, errorCode); assertSuccess("normalizeUTF8 omit unchanged", errorCode.get()); - assertEquals("normalizeUTF8 omit unchanged", expected.c_str(), result.c_str()); + assertEquals("normalizeUTF8 omit unchanged", expected.data(), result.c_str()); assertTrue("normalizeUTF8 omit unchanged hasChanges", edits.hasChanges()); assertEquals("normalizeUTF8 omit unchanged numberOfChanges", 9, edits.numberOfChanges()); TestUtility::checkEditsIter(*this, u"normalizeUTF8 omit unchanged", @@ -1623,12 +1624,12 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() { // With filter: The normalization code does not see the "A" substrings. UnicodeSet filter(u"[^A]", errorCode); FilteredNormalizer2 fn2(*nfkc_cf, filter); - expected = reinterpret_cast(u8" AäA\u0308A\u0323\u0308ạ\u0308,가각갃 "); + expected = u8" AäA\u0308A\u0323\u0308ạ\u0308,가각갃 "; result.clear(); edits.reset(); fn2.normalizeUTF8(0, src, sink, &edits, errorCode); assertSuccess("filtered normalizeUTF8", errorCode.get()); - assertEquals("filtered normalizeUTF8", expected.c_str(), result.c_str()); + assertEquals("filtered normalizeUTF8", expected.data(), result.c_str()); static const EditChange filteredChanges[] = { { FALSE, 3, 3 }, // 2 spaces + A { TRUE, 2, 2 }, // Ä→ä @@ -1655,12 +1656,12 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() { // Omit unchanged text. // Note that the result is not normalized because the inner normalizer // does not see text across filter spans. - expected = reinterpret_cast(u8"ä\u0323\u0308ạ\u0308가각갃"); + expected = u8"ä\u0323\u0308ạ\u0308가각갃"; result.clear(); edits.reset(); fn2.normalizeUTF8(U_OMIT_UNCHANGED_TEXT, src, sink, &edits, errorCode); assertSuccess("filtered normalizeUTF8 omit unchanged", errorCode.get()); - assertEquals("filtered normalizeUTF8 omit unchanged", expected.c_str(), result.c_str()); + assertEquals("filtered normalizeUTF8 omit unchanged", expected.data(), result.c_str()); assertTrue("filtered normalizeUTF8 omit unchanged hasChanges", edits.hasChanges()); assertEquals("filtered normalizeUTF8 omit unchanged numberOfChanges", 7, edits.numberOfChanges()); TestUtility::checkEditsIter(*this, u"filtered normalizeUTF8 omit unchanged", @@ -1777,13 +1778,13 @@ BasicNormalizerTest::TestComposeJamoTBase() { assertFalse("isNormalized(LV+11A7)", nfkc->isNormalized(s, errorCode)); assertTrue("isNormalized(normalized)", nfkc->isNormalized(result, errorCode)); - std::string s8(reinterpret_cast(u8"\u1100\u1161\u11A7\u1100\u314F\u11A7가\u11A7")); - std::string expected8(reinterpret_cast(u8"가\u11A7가\u11A7가\u11A7")); + StringPiece s8(u8"\u1100\u1161\u11A7\u1100\u314F\u11A7가\u11A7"); + StringPiece expected8(u8"가\u11A7가\u11A7가\u11A7"); std::string result8; - StringByteSink sink(&result8, static_cast(expected8.length())); + StringByteSink sink(&result8, expected8.length()); nfkc->normalizeUTF8(0, s8, sink, nullptr, errorCode); assertSuccess("normalizeUTF8(LV+11A7)", errorCode.get()); - assertEquals("normalizeUTF8(LV+11A7)", expected8.c_str(), result8.c_str()); + assertEquals("normalizeUTF8(LV+11A7)", expected8.data(), result8.c_str()); assertFalse("isNormalizedUTF8(LV+11A7)", nfkc->isNormalizedUTF8(s8, errorCode)); assertTrue("isNormalizedUTF8(normalized)", nfkc->isNormalizedUTF8(result8, errorCode)); } diff --git a/icu4c/source/test/intltest/uts46test.cpp b/icu4c/source/test/intltest/uts46test.cpp index e11fdf2bc7a..b399d2dd724 100644 --- a/icu4c/source/test/intltest/uts46test.cpp +++ b/icu4c/source/test/intltest/uts46test.cpp @@ -160,7 +160,7 @@ void UTS46Test::TestAPI() { char buffer[100]; TestCheckedArrayByteSink sink(buffer, UPRV_LENGTHOF(buffer)); errorCode=U_ZERO_ERROR; - nontrans->labelToUnicodeUTF8(StringPiece(NULL, 5), sink, info, errorCode); + nontrans->labelToUnicodeUTF8(StringPiece((const char *)NULL, 5), sink, info, errorCode); if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || sink.NumberOfBytesWritten()!=0) { errln("N.labelToUnicodeUTF8(StringPiece(NULL, 5)) did not set illegal-argument-error ", "or did output something - %s", -- 2.40.0