if (change) {
ByteSinkUtil::appendTwoBytes(upper, sink);
if ((data & HAS_EITHER_DIALYTIKA) != 0) {
- sink.Append(reinterpret_cast<const char*>(u8"\u0308"), 2); // restore or add a dialytika
+ sink.AppendU8(u8"\u0308", 2); // restore or add a dialytika
}
if (addTonos) {
- sink.Append(reinterpret_cast<const char*>(u8"\u0301"), 2);
+ sink.AppendU8(u8"\u0301", 2);
}
while (numYpogegrammeni > 0) {
- sink.Append(reinterpret_cast<const char*>(u8"\u0399"), 2);
+ sink.AppendU8(u8"\u0399", 2);
--numYpogegrammeni;
}
}
*/
virtual void Append(const char* bytes, int32_t n) = 0;
+#ifndef U_HIDE_DRAFT_API
+ /**
+ * Appends n bytes to this. Same as Append().
+ * Call AppendU8() with u8"string literals" which are const char * in C++11
+ * but const char8_t * in C++20.
+ * If the compiler does support char8_t as a distinct type,
+ * then an AppendU8() overload for that is defined and will be chosen.
+ *
+ * @param bytes the pointer to the bytes
+ * @param n the number of bytes; must be non-negative
+ * @draft ICU 67
+ */
+ inline void AppendU8(const char* bytes, int32_t n) {
+ Append(bytes, n);  // plain forward: bytes already has the pointer type Append() takes
+ }
+
+#if defined(__cpp_char8_t) || defined(U_IN_DOXYGEN)
+ /**
+ * Appends n bytes to this. Same as Append() but for a const char8_t * pointer.
+ * Call AppendU8() with u8"string literals" which are const char * in C++11
+ * but const char8_t * in C++20.
+ * If the compiler does support char8_t as a distinct type,
+ * then this AppendU8() overload for that is defined and will be chosen.
+ *
+ * @param bytes the pointer to the bytes
+ * @param n the number of bytes; must be non-negative
+ * @draft ICU 67
+ */
+ inline void AppendU8(const char8_t* bytes, int32_t n) {
+ Append(reinterpret_cast<const char*>(bytes), n);  // only the pointer type is changed; the same n bytes are forwarded
+ }
+#endif
+#endif // U_HIDE_DRAFT_API
+
/**
* Returns a writable buffer for appending and writes the buffer's capacity to
* *result_capacity. Guarantees *result_capacity>=min_capacity.
* Default constructor, creates an empty StringPiece.
* @stable ICU 4.2
*/
- StringPiece() : ptr_(NULL), length_(0) { }
+ StringPiece() : ptr_(nullptr), length_(0) { }  // empty piece: null data pointer, zero length
+
/**
* Constructs from a NUL-terminated const char * pointer.
* @param str a NUL-terminated const char * pointer
* @stable ICU 4.2
*/
StringPiece(const char* str);
+#ifndef U_HIDE_DRAFT_API
+#if defined(__cpp_char8_t) || defined(U_IN_DOXYGEN)
+ /**
+ * Constructs from a NUL-terminated const char8_t * pointer.
+ * @param str a NUL-terminated const char8_t * pointer
+ * @draft ICU 67
+ */
+ StringPiece(const char8_t* str) : StringPiece(reinterpret_cast<const char*>(str)) {}  // delegates to the const char * constructor
+#endif
+ /**
+ * Constructs an empty StringPiece.
+ * Needed for type disambiguation from multiple other overloads.
+ * @param p nullptr
+ * @draft ICU 67
+ */
+ StringPiece(std::nullptr_t p) : ptr_(p), length_(0) {}  // same state as the default constructor: null pointer, zero length
+#endif // U_HIDE_DRAFT_API
+
/**
* Constructs from a std::string.
* @stable ICU 4.2
*/
StringPiece(const std::string& str)
: ptr_(str.data()), length_(static_cast<int32_t>(str.size())) { }  // keeps a pointer into str (no copy here); size() is narrowed to int32_t
+#ifndef U_HIDE_DRAFT_API
+#if defined(__cpp_lib_char8_t) || defined(U_IN_DOXYGEN)
+ /**
+ * Constructs from a std::u8string.
+ * @draft ICU 67
+ */
+ StringPiece(const std::u8string& str)
+ : ptr_(reinterpret_cast<const char*>(str.data())),  // keeps a pointer into str (no copy here), viewed as const char *
+ length_(static_cast<int32_t>(str.size())) { }  // size() is narrowed to int32_t
+#endif
+#endif // U_HIDE_DRAFT_API
+
#ifndef U_HIDE_DRAFT_API
/**
* Constructs from some other implementation of a string piece class, from any
* \code{.cpp}
*
* struct OtherStringPieceClass {
- * const char* data();
+ * const char* data(); // or const char8_t*
* size_t size();
* };
*
* The other string piece class will typically be std::string_view from C++17
* or absl::string_view from Abseil.
*
+ * Starting with C++20, data() may also return a const char8_t* pointer,
+ * as from std::u8string_view.
+ *
* @param str the other string piece
* @draft ICU 65
*/
template <typename T,
typename = typename std::enable_if<
- std::is_same<decltype(T().data()), const char*>::value &&
+ (std::is_same<decltype(T().data()), const char*>::value  // accepted when data() yields const char * ...
+#if defined(__cpp_char8_t)
+ || std::is_same<decltype(T().data()), const char8_t*>::value  // ... or const char8_t *, where the compiler has that type
+#endif
+ ) &&
std::is_same<decltype(T().size()), size_t>::value>::type>
StringPiece(T str)
- : ptr_(str.data()), length_(static_cast<int32_t>(str.size())) {}
+ : ptr_(reinterpret_cast<const char*>(str.data())),  // the cast covers the const char8_t * case
+ length_(static_cast<int32_t>(str.size())) {}
#endif // U_HIDE_DRAFT_API
+
/**
* Constructs from a const char * pointer and a specified length.
* @param offset a const char * pointer (need not be terminated)
* @stable ICU 4.2
*/
StringPiece(const char* offset, int32_t len) : ptr_(offset), length_(len) { }  // stores both arguments as given; nothing is validated here
+#ifndef U_HIDE_DRAFT_API
+#if defined(__cpp_char8_t) || defined(U_IN_DOXYGEN)
+ /**
+ * Constructs from a const char8_t * pointer and a specified length.
+ * @param str a const char8_t * pointer (need not be terminated)
+ * @param len the length of the string; must be non-negative
+ * @draft ICU 67
+ */
+ StringPiece(const char8_t* str, int32_t len) :
+ StringPiece(reinterpret_cast<const char*>(str), len) {}  // delegates to the (const char *, int32_t) constructor
+#endif
+#endif // U_HIDE_DRAFT_API
+
/**
* Substring of another StringPiece.
* @param x the other StringPiece
StringPiece(const StringPiece& x, int32_t pos, int32_t len);
/**
- * Returns the string pointer. May be NULL if it is empty.
+ * Returns the string pointer. May be nullptr if it is empty.
*
* data() may return a pointer to a buffer with embedded NULs, and the
* returned buffer may or may not be null terminated. Therefore it is
* Sets to an empty string.
* @stable ICU 4.2
*/
- void clear() { ptr_ = NULL; length_ = 0; }
+ void clear() { ptr_ = nullptr; length_ = 0; }  // back to the empty state: null data pointer, zero length
/**
* Reset the stringpiece to refer to new data.
*/
void set(const char* str);
+#ifndef U_HIDE_DRAFT_API
+#if defined(__cpp_char8_t) || defined(U_IN_DOXYGEN)
+ /**
+ * Resets the stringpiece to refer to new data.
+ * @param xdata pointer to the new string data. Need not be NUL-terminated.
+ * @param len the length of the new data
+ * @draft ICU 67
+ */
+ inline void set(const char8_t* xdata, int32_t len) {
+ set(reinterpret_cast<const char*>(xdata), len);  // forwards to a set() overload taking const char * and a length
+ }
+
+ /**
+ * Resets the stringpiece to refer to new data.
+ * @param str a pointer to a NUL-terminated string.
+ * @draft ICU 67
+ */
+ inline void set(const char8_t* str) {
+ set(reinterpret_cast<const char*>(str));  // forwards to set(const char *)
+ }
+#endif
+#endif // U_HIDE_DRAFT_API
+
/**
* Removes the first n string units.
* @param n prefix length, must be non-negative and <=length()
#include "unicode/sortkey.h"
#include "unicode/std_string.h"
#include "unicode/strenum.h"
+#include "unicode/stringpiece.h"
#include "unicode/tblcoll.h"
#include "unicode/uiter.h"
#include "unicode/uniset.h"
}
coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
- static const char *strings[] = {
+ static const StringPiece strings[] = {
// string with U+FFFD == illegal byte sequence
- reinterpret_cast<const char*>(u8"a\uFFFDz"), reinterpret_cast<const char*>("a\x80z"), // trail byte
- reinterpret_cast<const char*>(u8"a\uFFFD\uFFFDz"), reinterpret_cast<const char*>("a\xc1\x81z"), // non-shortest form
- reinterpret_cast<const char*>(u8"a\uFFFD\uFFFD\uFFFDz"), reinterpret_cast<const char*>("a\xe0\x82\x83z"), // non-shortest form
- reinterpret_cast<const char*>(u8"a\uFFFD\uFFFD\uFFFDz"), reinterpret_cast<const char*>("a\xed\xa0\x80z"), // lead surrogate: would be U+D800
- reinterpret_cast<const char*>(u8"a\uFFFD\uFFFD\uFFFDz"), reinterpret_cast<const char*>("a\xed\xbf\xbfz"), // trail surrogate: would be U+DFFF
- reinterpret_cast<const char*>(u8"a\uFFFD\uFFFD\uFFFD\uFFFDz"), reinterpret_cast<const char*>("a\xf0\x8f\xbf\xbfz"), // non-shortest form
- reinterpret_cast<const char*>(u8"a\uFFFD\uFFFD\uFFFD\uFFFDz"), reinterpret_cast<const char*>("a\xf4\x90\x80\x80z") // out of range: would be U+110000
+ u8"a\uFFFDz", "a\x80z", // trail byte
+ u8"a\uFFFD\uFFFDz", "a\xc1\x81z", // non-shortest form
+ u8"a\uFFFD\uFFFD\uFFFDz", "a\xe0\x82\x83z", // non-shortest form
+ u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xa0\x80z", // lead surrogate: would be U+D800
+ u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xbf\xbfz", // trail surrogate: would be U+DFFF
+ u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf0\x8f\xbf\xbfz", // non-shortest form
+ u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf4\x90\x80\x80z" // out of range: would be U+110000
};
for(int32_t i = 0; i < UPRV_LENGTHOF(strings); i += 2) {
typedef struct ExpectedResult {
double value;  // numeric half of each test-data pair (presumably the number to be formatted; confirm against the test driver)
+ // Invariant characters, will be converted to UTF-16 and then unescaped.
const char *expected;
} ExpectedResult;
{123456789012345.0, "\\u00A5120\\u4E07\\u4EBF"},
};
static ExpectedResult kGermanCurrencyTestData[] = {  // value / expected-string pairs; non-ASCII characters are written as escape sequences, not raw
- {1.0, reinterpret_cast<const char*>(u8"1\\u00A0\\u20AC")},
- {12.0, reinterpret_cast<const char*>(u8"12\\u00A0\\u20AC")},
- {123.0, reinterpret_cast<const char*>(u8"120\\u00A0\\u20AC")},
- {1234.0, reinterpret_cast<const char*>(u8"1200\\u00A0\\u20AC")},
- {12345.0, reinterpret_cast<const char*>(u8"12.000\\u00A0\\u20AC")},
- {123456.0, reinterpret_cast<const char*>(u8"120.000\\u00A0\\u20AC")},
- {1234567.0, reinterpret_cast<const char*>(u8"1,2\\u00A0Mio.\\u00A0\\u20AC")},
- {12345678.0, reinterpret_cast<const char*>(u8"12\\u00A0Mio.\\u00A0\\u20AC")},
- {123456789.0, reinterpret_cast<const char*>(u8"120\\u00A0Mio.\\u00A0\\u20AC")},
- {1234567890.0, reinterpret_cast<const char*>(u8"1,2\\u00A0Mrd.\\u00A0\\u20AC")},
- {12345678901.0, reinterpret_cast<const char*>(u8"12\\u00A0Mrd.\\u00A0\\u20AC")},
- {123456789012.0, reinterpret_cast<const char*>(u8"120\\u00A0Mrd.\\u00A0\\u20AC")},
- {1234567890123.0, reinterpret_cast<const char*>(u8"1,2\\u00A0Bio.\\u00A0\\u20AC")},
- {12345678901234.0, reinterpret_cast<const char*>(u8"12\\u00A0Bio.\\u00A0\\u20AC")},
- {123456789012345.0, reinterpret_cast<const char*>(u8"120\\u00A0Bio.\\u00A0\\u20AC")},
+ {1.0, "1\\u00A0\\u20AC"},
+ {12.0, "12\\u00A0\\u20AC"},
+ {123.0, "120\\u00A0\\u20AC"},
+ {1234.0, "1200\\u00A0\\u20AC"},
+ {12345.0, "12.000\\u00A0\\u20AC"},
+ {123456.0, "120.000\\u00A0\\u20AC"},
+ {1234567.0, "1,2\\u00A0Mio.\\u00A0\\u20AC"},
+ {12345678.0, "12\\u00A0Mio.\\u00A0\\u20AC"},
+ {123456789.0, "120\\u00A0Mio.\\u00A0\\u20AC"},
+ {1234567890.0, "1,2\\u00A0Mrd.\\u00A0\\u20AC"},
+ {12345678901.0, "12\\u00A0Mrd.\\u00A0\\u20AC"},
+ {123456789012.0, "120\\u00A0Mrd.\\u00A0\\u20AC"},
+ {1234567890123.0, "1,2\\u00A0Bio.\\u00A0\\u20AC"},
+ {12345678901234.0, "12\\u00A0Bio.\\u00A0\\u20AC"},
+ {123456789012345.0, "120\\u00A0Bio.\\u00A0\\u20AC"},
};
static ExpectedResult kEnglishCurrencyTestData[] = {  // value / expected-string pairs; every expected string here is plain ASCII
- {1.0, reinterpret_cast<const char*>(u8"$1")},
- {12.0, reinterpret_cast<const char*>(u8"$12")},
- {123.0, reinterpret_cast<const char*>(u8"$120")},
- {1234.0, reinterpret_cast<const char*>(u8"$1.2K")},
- {12345.0, reinterpret_cast<const char*>(u8"$12K")},
- {123456.0, reinterpret_cast<const char*>(u8"$120K")},
- {1234567.0, reinterpret_cast<const char*>(u8"$1.2M")},
- {12345678.0, reinterpret_cast<const char*>(u8"$12M")},
- {123456789.0, reinterpret_cast<const char*>(u8"$120M")},
- {1234567890.0, reinterpret_cast<const char*>(u8"$1.2B")},
- {12345678901.0, reinterpret_cast<const char*>(u8"$12B")},
- {123456789012.0, reinterpret_cast<const char*>(u8"$120B")},
- {1234567890123.0, reinterpret_cast<const char*>(u8"$1.2T")},
- {12345678901234.0, reinterpret_cast<const char*>(u8"$12T")},
- {123456789012345.0, reinterpret_cast<const char*>(u8"$120T")},
+ {1.0, "$1"},
+ {12.0, "$12"},
+ {123.0, "$120"},
+ {1234.0, "$1.2K"},
+ {12345.0, "$12K"},
+ {123456.0, "$120K"},
+ {1234567.0, "$1.2M"},
+ {12345678.0, "$12M"},
+ {123456789.0, "$120M"},
+ {1234567890.0, "$1.2B"},
+ {12345678901.0, "$12B"},
+ {123456789012.0, "$120B"},
+ {1234567890123.0, "$1.2T"},
+ {12345678901234.0, "$12T"},
+ {123456789012345.0, "$120T"},
};
#include "unicode/localpointer.h"
#include "unicode/regex.h"
+#include "unicode/stringpiece.h"
#include "unicode/uchar.h"
#include "unicode/ucnv.h"
#include "unicode/uniset.h"
REGEX_ASSERT(status == U_REGEX_TIME_OUT);
// UText, wrapping non-UTF-16 text, also takes a different execution path.
- const char *text8 = reinterpret_cast<const char*>(u8"¿Qué es Unicode? Unicode proporciona un número único para cada"
+ StringPiece text8(u8"¿Qué es Unicode? Unicode proporciona un número único para cada"
"carácter, sin importar la plataforma, sin importar el programa,"
"sin importar el idioma.");
status = U_ZERO_ERROR;
- LocalUTextPointer ut(utext_openUTF8(NULL, text8, -1, &status));
+ LocalUTextPointer ut(utext_openUTF8(NULL, text8.data(), text8.length(), &status));
REGEX_CHECK_STATUS;
m.reset(ut.getAlias());
m.find(status);
Edits edits;
int32_t length = CaseMap::utf8ToLower("tr", U_OMIT_UNCHANGED_TEXT,
- reinterpret_cast<const char*>(u8"IstanBul"), 8, dest, UPRV_LENGTHOF(dest), &edits, errorCode);
+ reinterpret_cast<const char*>(u8"IstanBul"), 8,
+ dest, UPRV_LENGTHOF(dest), &edits, errorCode);
assertEquals(u"toLower(IstanBul)", UnicodeString(u"ıb"),
UnicodeString::fromUTF8(StringPiece(dest, length)));
static const EditChange lowerExpectedChanges[] = {
edits.reset();
length = CaseMap::utf8ToUpper("el", U_OMIT_UNCHANGED_TEXT,
- reinterpret_cast<const char*>(u8"Πατάτα"), 6 * 2, dest, UPRV_LENGTHOF(dest), &edits, errorCode);
+ reinterpret_cast<const char*>(u8"Πατάτα"), 6 * 2,
+ dest, UPRV_LENGTHOF(dest), &edits, errorCode);
assertEquals(u"toUpper(Πατάτα)", UnicodeString(u"ΑΤΑΤΑ"),
UnicodeString::fromUTF8(StringPiece(dest, length)));
static const EditChange upperExpectedChanges[] = {
// No explicit nor automatic edits.reset(). Edits should be appended.
length = CaseMap::utf8Fold(U_OMIT_UNCHANGED_TEXT | U_EDITS_NO_RESET |
U_FOLD_CASE_EXCLUDE_SPECIAL_I,
- reinterpret_cast<const char*>(u8"IßtanBul"), 1 + 2 + 6, dest, UPRV_LENGTHOF(dest), &edits, errorCode);
+ reinterpret_cast<const char*>(u8"IßtanBul"), 1 + 2 + 6,
+ dest, UPRV_LENGTHOF(dest), &edits, errorCode);
assertEquals(u"foldCase(IßtanBul)", UnicodeString(u"ıssb"),
UnicodeString::fromUTF8(StringPiece(dest, length)));
static const EditChange foldExpectedChanges[] = {
StringByteSink<std::string> sink(&dest);
// Omit unchanged text.
- CaseMap::utf8ToLower("tr", U_OMIT_UNCHANGED_TEXT, reinterpret_cast<const char*>(u8"IstanBul"), sink, nullptr, errorCode);
+ CaseMap::utf8ToLower("tr", U_OMIT_UNCHANGED_TEXT, u8"IstanBul", sink, nullptr, errorCode);
assertEquals(u"toLower(IstanBul)", UnicodeString(u"ıb"), UnicodeString::fromUTF8(dest));
dest.clear();
- CaseMap::utf8ToUpper("el", U_OMIT_UNCHANGED_TEXT, reinterpret_cast<const char*>(u8"Πατάτα"), sink, nullptr, errorCode);
+ CaseMap::utf8ToUpper("el", U_OMIT_UNCHANGED_TEXT, u8"Πατάτα", sink, nullptr, errorCode);
assertEquals(u"toUpper(Πατάτα)", UnicodeString(u"ΑΤΑΤΑ"),
UnicodeString::fromUTF8(dest));
#if !UCONFIG_NO_BREAK_ITERATION
dest.clear();
CaseMap::utf8ToTitle(
"nl", U_OMIT_UNCHANGED_TEXT | U_TITLECASE_NO_BREAK_ADJUSTMENT | U_TITLECASE_NO_LOWERCASE,
- nullptr, reinterpret_cast<const char*>(u8"IjssEL IglOo"), sink, nullptr, errorCode);
+ nullptr, u8"IjssEL IglOo", sink, nullptr, errorCode);
assertEquals(u"toTitle(IjssEL IglOo)", UnicodeString(u"J"),
UnicodeString::fromUTF8(dest));
#endif
dest.clear();
CaseMap::utf8Fold(U_OMIT_UNCHANGED_TEXT | U_FOLD_CASE_EXCLUDE_SPECIAL_I,
- reinterpret_cast<const char*>(u8"IßtanBul"), sink, nullptr, errorCode);
+ u8"IßtanBul", sink, nullptr, errorCode);
assertEquals(u"foldCase(IßtanBul)", UnicodeString(u"ıssb"),
UnicodeString::fromUTF8(dest));
// Return the whole result string.
dest.clear();
- CaseMap::utf8ToLower("tr", 0, reinterpret_cast<const char*>(u8"IstanBul"), sink, nullptr, errorCode);
+ CaseMap::utf8ToLower("tr", 0, u8"IstanBul", sink, nullptr, errorCode);
assertEquals(u"toLower(IstanBul)", UnicodeString(u"ıstanbul"),
UnicodeString::fromUTF8(dest));
dest.clear();
- CaseMap::utf8ToUpper("el", 0, reinterpret_cast<const char*>(u8"Πατάτα"), sink, nullptr, errorCode);
+ CaseMap::utf8ToUpper("el", 0, u8"Πατάτα", sink, nullptr, errorCode);
assertEquals(u"toUpper(Πατάτα)", UnicodeString(u"ΠΑΤΑΤΑ"),
UnicodeString::fromUTF8(dest));
#if !UCONFIG_NO_BREAK_ITERATION
dest.clear();
CaseMap::utf8ToTitle("nl", U_TITLECASE_NO_BREAK_ADJUSTMENT | U_TITLECASE_NO_LOWERCASE,
- nullptr, reinterpret_cast<const char*>(u8"IjssEL IglOo"), sink, nullptr, errorCode);
+ nullptr, u8"IjssEL IglOo", sink, nullptr, errorCode);
assertEquals(u"toTitle(IjssEL IglOo)", UnicodeString(u"IJssEL IglOo"),
UnicodeString::fromUTF8(dest));
#endif
dest.clear();
- CaseMap::utf8Fold(U_FOLD_CASE_EXCLUDE_SPECIAL_I, reinterpret_cast<const char*>(u8"IßtanBul"), sink, nullptr, errorCode);
+ CaseMap::utf8Fold(U_FOLD_CASE_EXCLUDE_SPECIAL_I, u8"IßtanBul", sink, nullptr, errorCode);
assertEquals(u"foldCase(IßtanBul)", UnicodeString(u"ısstanbul"),
UnicodeString::fromUTF8(dest));
}
#ifdef U_HAVE_STRING_VIEW
TESTCASE_AUTO(TestStringPieceStringView);
#endif
+ TESTCASE_AUTO(TestStringPieceU8);
TESTCASE_AUTO(TestByteSink);
TESTCASE_AUTO(TestCheckedArrayByteSink);
TESTCASE_AUTO(TestStringByteSink);
+ TESTCASE_AUTO(TestStringByteSinkAppendU8);
TESTCASE_AUTO(TestCharString);
TESTCASE_AUTO(TestCStr);
TESTCASE_AUTO(Testctou);
errln("StringPiece() failed");
}
// Construct from NULL const char * pointer.
- StringPiece null(NULL);
+ StringPiece null((const char *)nullptr);
if(!null.empty() || null.data()!=NULL || null.length()!=0 || null.size()!=0) {
errln("StringPiece(NULL) failed");
}
void
StringTest::TestStringPieceComparisons() {
StringPiece empty;
- StringPiece null(NULL);
+ StringPiece null(nullptr);
StringPiece abc("abc");
StringPiece abcd("abcdefg", 4);
StringPiece abx("abx");
}
#endif
+void
+StringTest::TestStringPieceU8() {
+ // ICU-20984 "mitigate some C++20 char8_t breakages"
+ // For the following APIs there are overloads for both
+ // const char * and const char8_t *.
+ // A u8"string literal" has one type or the other
+ // depending on C++ version and compiler settings.
+ StringPiece abc(u8"abc");  // constructor from a NUL-terminated u8 literal
+ assertEquals("abc.length", 3, abc.length());
+ assertEquals("abc", "\x61\x62\x63", abc.data());  // expected bytes as hex escapes (presumably to stay charset-independent; confirm)
+
+ StringPiece abc3(u8"abcdef", 3);  // constructor from pointer + length; only the first 3 bytes are viewed
+ assertEquals("abc3.length", 3, abc3.length());
+ assertEquals("abc3[0]", 0x61, abc3.data()[0]);  // checked byte by byte: the viewed range is not NUL-terminated
+ assertEquals("abc3[1]", 0x62, abc3.data()[1]);
+ assertEquals("abc3[2]", 0x63, abc3.data()[2]);
+
+ StringPiece uvw("q");  // starts with other content so the effect of set() is observable
+ uvw.set(u8"uvw");  // set() from a NUL-terminated u8 literal
+ assertEquals("uvw.length", 3, uvw.length());
+ assertEquals("uvw", "\x75\x76\x77", uvw.data());
+
+ StringPiece xyz("r");  // starts with other content so the effect of set() is observable
+ xyz.set(u8"xyzXYZ", 3);  // set() from pointer + length
+ assertEquals("xyz.length", 3, xyz.length());
+ assertEquals("xyz[0]", 0x78, xyz.data()[0]);
+ assertEquals("xyz[1]", 0x79, xyz.data()[1]);
+ assertEquals("xyz[2]", 0x7a, xyz.data()[2]);
+
+ StringPiece null(nullptr);  // construction from a bare nullptr must compile and give an empty piece
+ assertTrue("null is empty", null.empty());
+ assertTrue("null is null", null.data() == nullptr);
+
+#ifdef __cpp_lib_char8_t  // only where the standard library provides the char8_t string types
+ std::u8string_view u8sv(u8"sv"); // C++20
+ StringPiece u8svsp(u8sv);  // construction from a view whose data() returns const char8_t *
+ assertEquals("u8svsp.length", 2, u8svsp.length());
+ assertEquals("u8svsp", "\x73\x76", u8svsp.data());
+
+ std::u8string u8str(u8"str"); // C++20
+ StringPiece u8strsp(u8str);  // construction from std::u8string
+ assertEquals("u8strsp.length", 3, u8strsp.length());
+ assertEquals("u8strsp", "\x73\x74\x72", u8strsp.data());
+#endif // __cpp_lib_char8_t
+}
+
// Verify that ByteSink is subclassable and Flush() overridable.
class SimpleByteSink : public ByteSink {
public:
}
}
+void
+StringTest::TestStringByteSinkAppendU8() {
+ // ICU-20984 "mitigate some C++20 char8_t breakages"
+ // For the following APIs there are overloads for both
+ // const char * and const char8_t *.
+ // A u8"string literal" has one type or the other
+ // depending on C++ version and compiler settings.
+ std::string result("abc");  // pre-filled so the check below shows that the sink appends to existing content
+ StringByteSink<std::string> sink(&result);
+ sink.AppendU8("def", 3);  // ordinary literal: always the const char * overload
+ sink.AppendU8(u8"ghijkl", 4);  // u8 literal; only the first 4 bytes ("ghij") are appended
+ assertEquals("abcdefghij", "abcdef\x67\x68\x69\x6a", result.c_str());
+}
+
#if defined(_MSC_VER)
#include <vector>
#endif
#ifdef U_HAVE_STRING_VIEW
void TestStringPieceStringView();
#endif
+ void TestStringPieceU8();
void TestByteSink();
void TestCheckedArrayByteSink();
void TestStringByteSink();
+ void TestStringByteSinkAppendU8();
void TestSTLCompatibility();
void TestCharString();
void TestCStr();
#include "unicode/errorcode.h"
#include "unicode/normlzr.h"
#include "unicode/stringoptions.h"
+#include "unicode/stringpiece.h"
#include "unicode/uniset.h"
#include "unicode/usetiter.h"
#include "unicode/schriter.h"
if(errorCode.errDataIfFailureAndReset("Normalizer2::getNFKCCasefoldInstance() call failed")) {
return;
}
- static const char *const src =
- reinterpret_cast<const char*>(u8" AÄA\u0308A\u0308\u00ad\u0323Ä\u0323,\u00ad\u1100\u1161가\u11A8가\u3133 ");
- std::string expected = reinterpret_cast<const char*>(u8" aääạ\u0308ạ\u0308,가각갃 ");
+ static const StringPiece src =
+ u8" AÄA\u0308A\u0308\u00ad\u0323Ä\u0323,\u00ad\u1100\u1161가\u11A8가\u3133 ";
+ StringPiece expected = u8" aääạ\u0308ạ\u0308,가각갃 ";
std::string result;
StringByteSink<std::string> sink(&result, static_cast<int32_t>(expected.length()));
Edits edits;
nfkc_cf->normalizeUTF8(0, src, sink, &edits, errorCode);
assertSuccess("normalizeUTF8 with Edits", errorCode.get());
- assertEquals("normalizeUTF8 with Edits", expected.c_str(), result.c_str());
+ assertEquals("normalizeUTF8 with Edits", expected.data(), result.c_str());
static const EditChange expectedChanges[] = {
{ FALSE, 2, 2 }, // 2 spaces
{ TRUE, 1, 1 }, // A→a
assertTrue("isNormalizedUTF8(normalized)", nfkc_cf->isNormalizedUTF8(result, errorCode));
// Omit unchanged text.
- expected = reinterpret_cast<const char*>(u8"aääạ\u0308ạ\u0308가각갃");
+ expected = u8"aääạ\u0308ạ\u0308가각갃";
result.clear();
edits.reset();
nfkc_cf->normalizeUTF8(U_OMIT_UNCHANGED_TEXT, src, sink, &edits, errorCode);
assertSuccess("normalizeUTF8 omit unchanged", errorCode.get());
- assertEquals("normalizeUTF8 omit unchanged", expected.c_str(), result.c_str());
+ assertEquals("normalizeUTF8 omit unchanged", expected.data(), result.c_str());
assertTrue("normalizeUTF8 omit unchanged hasChanges", edits.hasChanges());
assertEquals("normalizeUTF8 omit unchanged numberOfChanges", 9, edits.numberOfChanges());
TestUtility::checkEditsIter(*this, u"normalizeUTF8 omit unchanged",
// With filter: The normalization code does not see the "A" substrings.
UnicodeSet filter(u"[^A]", errorCode);
FilteredNormalizer2 fn2(*nfkc_cf, filter);
- expected = reinterpret_cast<const char*>(u8" AäA\u0308A\u0323\u0308ạ\u0308,가각갃 ");
+ expected = u8" AäA\u0308A\u0323\u0308ạ\u0308,가각갃 ";
result.clear();
edits.reset();
fn2.normalizeUTF8(0, src, sink, &edits, errorCode);
assertSuccess("filtered normalizeUTF8", errorCode.get());
- assertEquals("filtered normalizeUTF8", expected.c_str(), result.c_str());
+ assertEquals("filtered normalizeUTF8", expected.data(), result.c_str());
static const EditChange filteredChanges[] = {
{ FALSE, 3, 3 }, // 2 spaces + A
{ TRUE, 2, 2 }, // Ä→ä
// Omit unchanged text.
// Note that the result is not normalized because the inner normalizer
// does not see text across filter spans.
- expected = reinterpret_cast<const char*>(u8"ä\u0323\u0308ạ\u0308가각갃");
+ expected = u8"ä\u0323\u0308ạ\u0308가각갃";
result.clear();
edits.reset();
fn2.normalizeUTF8(U_OMIT_UNCHANGED_TEXT, src, sink, &edits, errorCode);
assertSuccess("filtered normalizeUTF8 omit unchanged", errorCode.get());
- assertEquals("filtered normalizeUTF8 omit unchanged", expected.c_str(), result.c_str());
+ assertEquals("filtered normalizeUTF8 omit unchanged", expected.data(), result.c_str());
assertTrue("filtered normalizeUTF8 omit unchanged hasChanges", edits.hasChanges());
assertEquals("filtered normalizeUTF8 omit unchanged numberOfChanges", 7, edits.numberOfChanges());
TestUtility::checkEditsIter(*this, u"filtered normalizeUTF8 omit unchanged",
assertFalse("isNormalized(LV+11A7)", nfkc->isNormalized(s, errorCode));
assertTrue("isNormalized(normalized)", nfkc->isNormalized(result, errorCode));
- std::string s8(reinterpret_cast<const char*>(u8"\u1100\u1161\u11A7\u1100\u314F\u11A7가\u11A7"));
- std::string expected8(reinterpret_cast<const char*>(u8"가\u11A7가\u11A7가\u11A7"));
+ StringPiece s8(u8"\u1100\u1161\u11A7\u1100\u314F\u11A7가\u11A7");
+ StringPiece expected8(u8"가\u11A7가\u11A7가\u11A7");
std::string result8;
- StringByteSink<std::string> sink(&result8, static_cast<int32_t>(expected8.length()));
+ StringByteSink<std::string> sink(&result8, expected8.length());
nfkc->normalizeUTF8(0, s8, sink, nullptr, errorCode);
assertSuccess("normalizeUTF8(LV+11A7)", errorCode.get());
- assertEquals("normalizeUTF8(LV+11A7)", expected8.c_str(), result8.c_str());
+ assertEquals("normalizeUTF8(LV+11A7)", expected8.data(), result8.c_str());
assertFalse("isNormalizedUTF8(LV+11A7)", nfkc->isNormalizedUTF8(s8, errorCode));
assertTrue("isNormalizedUTF8(normalized)", nfkc->isNormalizedUTF8(result8, errorCode));
}
char buffer[100];
TestCheckedArrayByteSink sink(buffer, UPRV_LENGTHOF(buffer));
errorCode=U_ZERO_ERROR;
- nontrans->labelToUnicodeUTF8(StringPiece(NULL, 5), sink, info, errorCode);
+ nontrans->labelToUnicodeUTF8(StringPiece((const char *)NULL, 5), sink, info, errorCode);
if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || sink.NumberOfBytesWritten()!=0) {
errln("N.labelToUnicodeUTF8(StringPiece(NULL, 5)) did not set illegal-argument-error ",
"or did output something - %s",