ICU-20984 StringPiece & ByteSink overloads for char8_t*

author Markus Scherer <markus.icu@gmail.com>

Fri, 13 Mar 2020 02:21:24 +0000 (19:21 -0700)

committer Markus Scherer <markus.icu@gmail.com>

Mon, 16 Mar 2020 17:49:21 +0000 (10:49 -0700)
author Markus Scherer <markus.icu@gmail.com>
Fri, 13 Mar 2020 02:21:24 +0000 (19:21 -0700)
committer Markus Scherer <markus.icu@gmail.com>
Mon, 16 Mar 2020 17:49:21 +0000 (10:49 -0700)
diff --git a/icu4c/source/common/ucasemap.cpp b/icu4c/source/common/ucasemap.cpp

index cc998c993d73dc277f1b9b40cf4656b3e6eabefa..ed72bda828fc1c85fe114c56bb6b2761f08abd83 100644 (file)
--- a/icu4c/source/common/ucasemap.cpp
+++ b/icu4c/source/common/ucasemap.cpp
@@ -687,13 +687,13 @@ void toUpper(uint32_t options,
              if (change) {
                  ByteSinkUtil::appendTwoBytes(upper, sink);
                  if ((data & HAS_EITHER_DIALYTIKA) != 0) {
-                    sink.Append(reinterpret_cast<const char*>(u8"\u0308"), 2);  // restore or add a dialytika
+                    sink.AppendU8(u8"\u0308", 2);  // restore or add a dialytika
                  }
                  if (addTonos) {
-                    sink.Append(reinterpret_cast<const char*>(u8"\u0301"), 2);
+                    sink.AppendU8(u8"\u0301", 2);
                  }
                  while (numYpogegrammeni > 0) {
-                    sink.Append(reinterpret_cast<const char*>(u8"\u0399"), 2);
+                    sink.AppendU8(u8"\u0399", 2);
                      --numYpogegrammeni;
                  }
              }
diff --git a/icu4c/source/common/unicode/bytestream.h b/icu4c/source/common/unicode/bytestream.h

index 0d60492fe235563db2198bd55f0f6f02ad60b456..7fe24062228ce346fdd10c7584e3a586444eaaff 100644 (file)
--- a/icu4c/source/common/unicode/bytestream.h
+++ b/icu4c/source/common/unicode/bytestream.h
@@ -71,6 +71,40 @@ public:
     */
    virtual void Append(const char* bytes, int32_t n) = 0;
  
+#ifndef U_HIDE_DRAFT_API
+  /**
+   * Appends n bytes to this. Same as Append().
+   * Call AppendU8() with u8"string literals" which are const char * in C++11
+   * but const char8_t * in C++20.
+   * If the compiler does support char8_t as a distinct type,
+   * then an AppendU8() overload for that is defined and will be chosen.
+   *
+   * @param bytes the pointer to the bytes
+   * @param n the number of bytes; must be non-negative
+   * @draft ICU 67
+   */
+  inline void AppendU8(const char* bytes, int32_t n) {
+    Append(bytes, n);
+  }
+
+#if defined(__cpp_char8_t) || defined(U_IN_DOXYGEN)
+  /**
+   * Appends n bytes to this. Same as Append() but for a const char8_t * pointer.
+   * Call AppendU8() with u8"string literals" which are const char * in C++11
+   * but const char8_t * in C++20.
+   * If the compiler does support char8_t as a distinct type,
+   * then this AppendU8() overload for that is defined and will be chosen.
+   *
+   * @param bytes the pointer to the bytes
+   * @param n the number of bytes; must be non-negative
+   * @draft ICU 67
+   */
+  inline void AppendU8(const char8_t* bytes, int32_t n) {
+    Append(reinterpret_cast<const char*>(bytes), n);
+  }
+#endif
+#endif  // U_HIDE_DRAFT_API
+
    /**
     * Returns a writable buffer for appending and writes the buffer's capacity to
     * *result_capacity. Guarantees *result_capacity>=min_capacity.
diff --git a/icu4c/source/common/unicode/stringpiece.h b/icu4c/source/common/unicode/stringpiece.h

index ba2240e6ac023b2c2b316a6cf4b34a3907790bc3..52c1e9ebd241df73388109f4a79c6cc7e796777c 100644 (file)
--- a/icu4c/source/common/unicode/stringpiece.h
+++ b/icu4c/source/common/unicode/stringpiece.h
@@ -67,19 +67,50 @@ class U_COMMON_API StringPiece : public UMemory {
     * Default constructor, creates an empty StringPiece.
     * @stable ICU 4.2
     */
-  StringPiece() : ptr_(NULL), length_(0) { }
+  StringPiece() : ptr_(nullptr), length_(0) { }
+
    /**
     * Constructs from a NUL-terminated const char * pointer.
     * @param str a NUL-terminated const char * pointer
     * @stable ICU 4.2
     */
    StringPiece(const char* str);
+#ifndef U_HIDE_DRAFT_API
+#if defined(__cpp_char8_t) || defined(U_IN_DOXYGEN)
+  /**
+   * Constructs from a NUL-terminated const char8_t * pointer.
+   * @param str a NUL-terminated const char8_t * pointer
+   * @draft ICU 67
+   */
+  StringPiece(const char8_t* str) : StringPiece(reinterpret_cast<const char*>(str)) {}
+#endif
+  /**
+   * Constructs an empty StringPiece.
+   * Needed for type disambiguation from multiple other overloads.
+   * @param p nullptr
+   * @draft ICU 67
+   */
+  StringPiece(std::nullptr_t p) : ptr_(p), length_(0) {}
+#endif  // U_HIDE_DRAFT_API
+
    /**
     * Constructs from a std::string.
     * @stable ICU 4.2
     */
    StringPiece(const std::string& str)
      : ptr_(str.data()), length_(static_cast<int32_t>(str.size())) { }
+#ifndef U_HIDE_DRAFT_API
+#if defined(__cpp_lib_char8_t) || defined(U_IN_DOXYGEN)
+  /**
+   * Constructs from a std::u8string.
+   * @draft ICU 67
+   */
+  StringPiece(const std::u8string& str)
+    : ptr_(reinterpret_cast<const char*>(str.data())),
+      length_(static_cast<int32_t>(str.size())) { }
+#endif
+#endif  // U_HIDE_DRAFT_API
+
  #ifndef U_HIDE_DRAFT_API
    /**
     * Constructs from some other implementation of a string piece class, from any
@@ -88,7 +119,7 @@ class U_COMMON_API StringPiece : public UMemory {
     * \code{.cpp}
     *
     *   struct OtherStringPieceClass {
-   *     const char* data();
+   *     const char* data();  // or const char8_t*
     *     size_t size();
     *   };
     *
@@ -97,16 +128,25 @@ class U_COMMON_API StringPiece : public UMemory {
     * The other string piece class will typically be std::string_view from C++17
     * or absl::string_view from Abseil.
     *
+   * Starting with C++20, data() may also return a const char8_t* pointer,
+   * as from std::u8string_view.
+   *
     * @param str the other string piece
     * @draft ICU 65
     */
    template <typename T,
              typename = typename std::enable_if<
-                std::is_same<decltype(T().data()), const char*>::value &&
+                (std::is_same<decltype(T().data()), const char*>::value
+#if defined(__cpp_char8_t)
+                    || std::is_same<decltype(T().data()), const char8_t*>::value
+#endif
+                ) &&
                  std::is_same<decltype(T().size()), size_t>::value>::type>
    StringPiece(T str)
-      : ptr_(str.data()), length_(static_cast<int32_t>(str.size())) {}
+      : ptr_(reinterpret_cast<const char*>(str.data())),
+        length_(static_cast<int32_t>(str.size())) {}
  #endif  // U_HIDE_DRAFT_API
+
    /**
     * Constructs from a const char * pointer and a specified length.
     * @param offset a const char * pointer (need not be terminated)
@@ -114,6 +154,19 @@ class U_COMMON_API StringPiece : public UMemory {
     * @stable ICU 4.2
     */
    StringPiece(const char* offset, int32_t len) : ptr_(offset), length_(len) { }
+#ifndef U_HIDE_DRAFT_API
+#if defined(__cpp_char8_t) || defined(U_IN_DOXYGEN)
+  /**
+   * Constructs from a const char8_t * pointer and a specified length.
+   * @param str a const char8_t * pointer (need not be terminated)
+   * @param len the length of the string; must be non-negative
+   * @draft ICU 67
+   */
+  StringPiece(const char8_t* str, int32_t len) :
+      StringPiece(reinterpret_cast<const char*>(str), len) {}
+#endif
+#endif  // U_HIDE_DRAFT_API
+
    /**
     * Substring of another StringPiece.
     * @param x the other StringPiece
@@ -132,7 +185,7 @@ class U_COMMON_API StringPiece : public UMemory {
    StringPiece(const StringPiece& x, int32_t pos, int32_t len);
  
    /**
-   * Returns the string pointer. May be NULL if it is empty.
+   * Returns the string pointer. May be nullptr if it is empty.
     *
     * data() may return a pointer to a buffer with embedded NULs, and the
     * returned buffer may or may not be null terminated.  Therefore it is
@@ -165,7 +218,7 @@ class U_COMMON_API StringPiece : public UMemory {
     * Sets to an empty string.
     * @stable ICU 4.2
     */
-  void clear() { ptr_ = NULL; length_ = 0; }
+  void clear() { ptr_ = nullptr; length_ = 0; }
  
    /**
     * Reset the stringpiece to refer to new data.
@@ -182,6 +235,29 @@ class U_COMMON_API StringPiece : public UMemory {
     */
    void set(const char* str);
  
+#ifndef U_HIDE_DRAFT_API
+#if defined(__cpp_char8_t) || defined(U_IN_DOXYGEN)
+  /**
+   * Resets the stringpiece to refer to new data.
+   * @param xdata pointer to the new string data. Need not be NUL-terminated.
+   * @param len the length of the new data
+   * @draft ICU 67
+   */
+  inline void set(const char8_t* xdata, int32_t len) {
+      set(reinterpret_cast<const char*>(xdata), len);
+  }
+
+  /**
+   * Resets the stringpiece to refer to new data.
+   * @param str a pointer to a NUL-terminated string.
+   * @draft ICU 67
+   */
+  inline void set(const char8_t* str) {
+      set(reinterpret_cast<const char*>(str));
+  }
+#endif
+#endif  // U_HIDE_DRAFT_API
+
    /**
     * Removes the first n string units.
     * @param n prefix length, must be non-negative and <=length()
diff --git a/icu4c/source/test/intltest/collationtest.cpp b/icu4c/source/test/intltest/collationtest.cpp

index 9562e4d4aebfd65ee25d9714388d8ff8441e2057..de51eece5c425859b1009e5bb4d17dfc9841554d 100644 (file)
--- a/icu4c/source/test/intltest/collationtest.cpp
+++ b/icu4c/source/test/intltest/collationtest.cpp
@@ -22,6 +22,7 @@
  #include "unicode/sortkey.h"
  #include "unicode/std_string.h"
  #include "unicode/strenum.h"
+#include "unicode/stringpiece.h"
  #include "unicode/tblcoll.h"
  #include "unicode/uiter.h"
  #include "unicode/uniset.h"
@@ -293,15 +294,15 @@ void CollationTest::TestIllegalUTF8() {
      }
      coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
  
-    static const char *strings[] = {
+    static const StringPiece strings[] = {
          // string with U+FFFD == illegal byte sequence
-        reinterpret_cast<const char*>(u8"a\uFFFDz"),                    reinterpret_cast<const char*>("a\x80z"),  // trail byte
-        reinterpret_cast<const char*>(u8"a\uFFFD\uFFFDz"),              reinterpret_cast<const char*>("a\xc1\x81z"),  // non-shortest form
-        reinterpret_cast<const char*>(u8"a\uFFFD\uFFFD\uFFFDz"),        reinterpret_cast<const char*>("a\xe0\x82\x83z"),  // non-shortest form
-        reinterpret_cast<const char*>(u8"a\uFFFD\uFFFD\uFFFDz"),        reinterpret_cast<const char*>("a\xed\xa0\x80z"),  // lead surrogate: would be U+D800
-        reinterpret_cast<const char*>(u8"a\uFFFD\uFFFD\uFFFDz"),        reinterpret_cast<const char*>("a\xed\xbf\xbfz"),  // trail surrogate: would be U+DFFF
-        reinterpret_cast<const char*>(u8"a\uFFFD\uFFFD\uFFFD\uFFFDz"),  reinterpret_cast<const char*>("a\xf0\x8f\xbf\xbfz"),  // non-shortest form
-        reinterpret_cast<const char*>(u8"a\uFFFD\uFFFD\uFFFD\uFFFDz"),  reinterpret_cast<const char*>("a\xf4\x90\x80\x80z")  // out of range: would be U+110000
+        u8"a\uFFFDz",                   "a\x80z",  // trail byte
+        u8"a\uFFFD\uFFFDz",             "a\xc1\x81z",  // non-shortest form
+        u8"a\uFFFD\uFFFD\uFFFDz",       "a\xe0\x82\x83z",  // non-shortest form
+        u8"a\uFFFD\uFFFD\uFFFDz",       "a\xed\xa0\x80z",  // lead surrogate: would be U+D800
+        u8"a\uFFFD\uFFFD\uFFFDz",       "a\xed\xbf\xbfz",  // trail surrogate: would be U+DFFF
+        u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf0\x8f\xbf\xbfz",  // non-shortest form
+        u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf4\x90\x80\x80z"  // out of range: would be U+110000
      };
  
      for(int32_t i = 0; i < UPRV_LENGTHOF(strings); i += 2) {
diff --git a/icu4c/source/test/intltest/compactdecimalformattest.cpp b/icu4c/source/test/intltest/compactdecimalformattest.cpp

index 90afecdb293653128f111373679df83ffd5a1cd6..1f51f70d010f131d07693a973e14ade951bfac8e 100644 (file)
--- a/icu4c/source/test/intltest/compactdecimalformattest.cpp
+++ b/icu4c/source/test/intltest/compactdecimalformattest.cpp
@@ -23,6 +23,7 @@
  
  typedef struct ExpectedResult {
    double value;
+  // Invariant characters, will be converted to UTF-16 and then unescaped.
    const char *expected;
  } ExpectedResult;
  
@@ -185,38 +186,38 @@ static ExpectedResult kChineseCurrencyTestData[] = {
          {123456789012345.0, "\\u00A5120\\u4E07\\u4EBF"},
  };
  static ExpectedResult kGermanCurrencyTestData[] = {
-        {1.0, reinterpret_cast<const char*>(u8"1\\u00A0\\u20AC")},
-        {12.0, reinterpret_cast<const char*>(u8"12\\u00A0\\u20AC")},
-        {123.0, reinterpret_cast<const char*>(u8"120\\u00A0\\u20AC")},
-        {1234.0, reinterpret_cast<const char*>(u8"1200\\u00A0\\u20AC")},
-        {12345.0, reinterpret_cast<const char*>(u8"12.000\\u00A0\\u20AC")},
-        {123456.0, reinterpret_cast<const char*>(u8"120.000\\u00A0\\u20AC")},
-        {1234567.0, reinterpret_cast<const char*>(u8"1,2\\u00A0Mio.\\u00A0\\u20AC")},
-        {12345678.0, reinterpret_cast<const char*>(u8"12\\u00A0Mio.\\u00A0\\u20AC")},
-        {123456789.0, reinterpret_cast<const char*>(u8"120\\u00A0Mio.\\u00A0\\u20AC")},
-        {1234567890.0, reinterpret_cast<const char*>(u8"1,2\\u00A0Mrd.\\u00A0\\u20AC")},
-        {12345678901.0, reinterpret_cast<const char*>(u8"12\\u00A0Mrd.\\u00A0\\u20AC")},
-        {123456789012.0, reinterpret_cast<const char*>(u8"120\\u00A0Mrd.\\u00A0\\u20AC")},
-        {1234567890123.0, reinterpret_cast<const char*>(u8"1,2\\u00A0Bio.\\u00A0\\u20AC")},
-        {12345678901234.0, reinterpret_cast<const char*>(u8"12\\u00A0Bio.\\u00A0\\u20AC")},
-        {123456789012345.0, reinterpret_cast<const char*>(u8"120\\u00A0Bio.\\u00A0\\u20AC")},
+        {1.0, "1\\u00A0\\u20AC"},
+        {12.0, "12\\u00A0\\u20AC"},
+        {123.0, "120\\u00A0\\u20AC"},
+        {1234.0, "1200\\u00A0\\u20AC"},
+        {12345.0, "12.000\\u00A0\\u20AC"},
+        {123456.0, "120.000\\u00A0\\u20AC"},
+        {1234567.0, "1,2\\u00A0Mio.\\u00A0\\u20AC"},
+        {12345678.0, "12\\u00A0Mio.\\u00A0\\u20AC"},
+        {123456789.0, "120\\u00A0Mio.\\u00A0\\u20AC"},
+        {1234567890.0, "1,2\\u00A0Mrd.\\u00A0\\u20AC"},
+        {12345678901.0, "12\\u00A0Mrd.\\u00A0\\u20AC"},
+        {123456789012.0, "120\\u00A0Mrd.\\u00A0\\u20AC"},
+        {1234567890123.0, "1,2\\u00A0Bio.\\u00A0\\u20AC"},
+        {12345678901234.0, "12\\u00A0Bio.\\u00A0\\u20AC"},
+        {123456789012345.0, "120\\u00A0Bio.\\u00A0\\u20AC"},
  };
  static ExpectedResult kEnglishCurrencyTestData[] = {
-        {1.0, reinterpret_cast<const char*>(u8"$1")},
-        {12.0, reinterpret_cast<const char*>(u8"$12")},
-        {123.0, reinterpret_cast<const char*>(u8"$120")},
-        {1234.0, reinterpret_cast<const char*>(u8"$1.2K")},
-        {12345.0, reinterpret_cast<const char*>(u8"$12K")},
-        {123456.0, reinterpret_cast<const char*>(u8"$120K")},
-        {1234567.0, reinterpret_cast<const char*>(u8"$1.2M")},
-        {12345678.0, reinterpret_cast<const char*>(u8"$12M")},
-        {123456789.0, reinterpret_cast<const char*>(u8"$120M")},
-        {1234567890.0, reinterpret_cast<const char*>(u8"$1.2B")},
-        {12345678901.0, reinterpret_cast<const char*>(u8"$12B")},
-        {123456789012.0, reinterpret_cast<const char*>(u8"$120B")},
-        {1234567890123.0, reinterpret_cast<const char*>(u8"$1.2T")},
-        {12345678901234.0, reinterpret_cast<const char*>(u8"$12T")},
-        {123456789012345.0, reinterpret_cast<const char*>(u8"$120T")},
+        {1.0, "$1"},
+        {12.0, "$12"},
+        {123.0, "$120"},
+        {1234.0, "$1.2K"},
+        {12345.0, "$12K"},
+        {123456.0, "$120K"},
+        {1234567.0, "$1.2M"},
+        {12345678.0, "$12M"},
+        {123456789.0, "$120M"},
+        {1234567890.0, "$1.2B"},
+        {12345678901.0, "$12B"},
+        {123456789012.0, "$120B"},
+        {1234567890123.0, "$1.2T"},
+        {12345678901234.0, "$12T"},
+        {123456789012345.0, "$120T"},
  };
  
  
diff --git a/icu4c/source/test/intltest/regextst.cpp b/icu4c/source/test/intltest/regextst.cpp

index 311c7bc94b919ac73c7fc31321838f1180a89a38..5f7e36b3ae182469e579a76a794bec4bf23f14ac 100644 (file)
--- a/icu4c/source/test/intltest/regextst.cpp
+++ b/icu4c/source/test/intltest/regextst.cpp
@@ -31,6 +31,7 @@
  
  #include "unicode/localpointer.h"
  #include "unicode/regex.h"
+#include "unicode/stringpiece.h"
  #include "unicode/uchar.h"
  #include "unicode/ucnv.h"
  #include "unicode/uniset.h"
@@ -5838,11 +5839,11 @@ void RegexTest::TestBug12884() {
      REGEX_ASSERT(status == U_REGEX_TIME_OUT);
  
      // UText, wrapping non-UTF-16 text, also takes a different execution path.
-    const char *text8 = reinterpret_cast<const char*>(u8"¿Qué es Unicode?  Unicode proporciona un número único para cada"
+    StringPiece text8(u8"¿Qué es Unicode?  Unicode proporciona un número único para cada"
                            "carácter, sin importar la plataforma, sin importar el programa,"
                            "sin importar el idioma.");
      status = U_ZERO_ERROR;
-    LocalUTextPointer ut(utext_openUTF8(NULL, text8, -1, &status));
+    LocalUTextPointer ut(utext_openUTF8(NULL, text8.data(), text8.length(), &status));
      REGEX_CHECK_STATUS;
      m.reset(ut.getAlias());
      m.find(status);
diff --git a/icu4c/source/test/intltest/strcase.cpp b/icu4c/source/test/intltest/strcase.cpp

index dc81fb4513280cf33cbb56e40abc84ba1f5164c7..4093c51926273a60b81d4f80f1ebe4050f1d1a5b 100644 (file)
--- a/icu4c/source/test/intltest/strcase.cpp
+++ b/icu4c/source/test/intltest/strcase.cpp
@@ -1314,7 +1314,8 @@ void StringCaseTest::TestCaseMapUTF8WithEdits() {
      Edits edits;
  
      int32_t length = CaseMap::utf8ToLower("tr", U_OMIT_UNCHANGED_TEXT,
-                                          reinterpret_cast<const char*>(u8"IstanBul"), 8, dest, UPRV_LENGTHOF(dest), &edits, errorCode);
+                                          reinterpret_cast<const char*>(u8"IstanBul"), 8,
+                                          dest, UPRV_LENGTHOF(dest), &edits, errorCode);
      assertEquals(u"toLower(IstanBul)", UnicodeString(u"ıb"),
                   UnicodeString::fromUTF8(StringPiece(dest, length)));
      static const EditChange lowerExpectedChanges[] = {
@@ -1330,7 +1331,8 @@ void StringCaseTest::TestCaseMapUTF8WithEdits() {
  
      edits.reset();
      length = CaseMap::utf8ToUpper("el", U_OMIT_UNCHANGED_TEXT,
-                                  reinterpret_cast<const char*>(u8"Πατάτα"), 6 * 2, dest, UPRV_LENGTHOF(dest), &edits, errorCode);
+                                  reinterpret_cast<const char*>(u8"Πατάτα"), 6 * 2,
+                                  dest, UPRV_LENGTHOF(dest), &edits, errorCode);
      assertEquals(u"toUpper(Πατάτα)", UnicodeString(u"ΑΤΑΤΑ"),
                   UnicodeString::fromUTF8(StringPiece(dest, length)));
      static const EditChange upperExpectedChanges[] = {
@@ -1370,7 +1372,8 @@ void StringCaseTest::TestCaseMapUTF8WithEdits() {
      // No explicit nor automatic edits.reset(). Edits should be appended.
      length = CaseMap::utf8Fold(U_OMIT_UNCHANGED_TEXT | U_EDITS_NO_RESET |
                                     U_FOLD_CASE_EXCLUDE_SPECIAL_I,
-                               reinterpret_cast<const char*>(u8"IßtanBul"), 1 + 2 + 6, dest, UPRV_LENGTHOF(dest), &edits, errorCode);
+                               reinterpret_cast<const char*>(u8"IßtanBul"), 1 + 2 + 6,
+                               dest, UPRV_LENGTHOF(dest), &edits, errorCode);
      assertEquals(u"foldCase(IßtanBul)", UnicodeString(u"ıssb"),
                   UnicodeString::fromUTF8(StringPiece(dest, length)));
      static const EditChange foldExpectedChanges[] = {
@@ -1454,44 +1457,44 @@ void StringCaseTest::TestCaseMapUTF8ToString() {
      StringByteSink<std::string> sink(&dest);
  
      // Omit unchanged text.
-    CaseMap::utf8ToLower("tr", U_OMIT_UNCHANGED_TEXT, reinterpret_cast<const char*>(u8"IstanBul"), sink, nullptr, errorCode);
+    CaseMap::utf8ToLower("tr", U_OMIT_UNCHANGED_TEXT, u8"IstanBul", sink, nullptr, errorCode);
      assertEquals(u"toLower(IstanBul)", UnicodeString(u"ıb"), UnicodeString::fromUTF8(dest));
      dest.clear();
-    CaseMap::utf8ToUpper("el", U_OMIT_UNCHANGED_TEXT, reinterpret_cast<const char*>(u8"Πατάτα"), sink, nullptr, errorCode);
+    CaseMap::utf8ToUpper("el", U_OMIT_UNCHANGED_TEXT, u8"Πατάτα", sink, nullptr, errorCode);
      assertEquals(u"toUpper(Πατάτα)", UnicodeString(u"ΑΤΑΤΑ"),
                   UnicodeString::fromUTF8(dest));
  #if !UCONFIG_NO_BREAK_ITERATION
      dest.clear();
      CaseMap::utf8ToTitle(
          "nl", U_OMIT_UNCHANGED_TEXT | U_TITLECASE_NO_BREAK_ADJUSTMENT | U_TITLECASE_NO_LOWERCASE,
-        nullptr, reinterpret_cast<const char*>(u8"IjssEL IglOo"), sink, nullptr, errorCode);
+        nullptr, u8"IjssEL IglOo", sink, nullptr, errorCode);
      assertEquals(u"toTitle(IjssEL IglOo)", UnicodeString(u"J"),
                   UnicodeString::fromUTF8(dest));
  #endif
      dest.clear();
      CaseMap::utf8Fold(U_OMIT_UNCHANGED_TEXT | U_FOLD_CASE_EXCLUDE_SPECIAL_I,
-                      reinterpret_cast<const char*>(u8"IßtanBul"), sink, nullptr, errorCode);
+                      u8"IßtanBul", sink, nullptr, errorCode);
      assertEquals(u"foldCase(IßtanBul)", UnicodeString(u"ıssb"),
                   UnicodeString::fromUTF8(dest));
  
      // Return the whole result string.
      dest.clear();
-    CaseMap::utf8ToLower("tr", 0, reinterpret_cast<const char*>(u8"IstanBul"), sink, nullptr, errorCode);
+    CaseMap::utf8ToLower("tr", 0, u8"IstanBul", sink, nullptr, errorCode);
      assertEquals(u"toLower(IstanBul)", UnicodeString(u"ıstanbul"),
                   UnicodeString::fromUTF8(dest));
      dest.clear();
-    CaseMap::utf8ToUpper("el", 0, reinterpret_cast<const char*>(u8"Πατάτα"), sink, nullptr, errorCode);
+    CaseMap::utf8ToUpper("el", 0, u8"Πατάτα", sink, nullptr, errorCode);
      assertEquals(u"toUpper(Πατάτα)", UnicodeString(u"ΠΑΤΑΤΑ"),
                   UnicodeString::fromUTF8(dest));
  #if !UCONFIG_NO_BREAK_ITERATION
      dest.clear();
      CaseMap::utf8ToTitle("nl", U_TITLECASE_NO_BREAK_ADJUSTMENT | U_TITLECASE_NO_LOWERCASE,
-                         nullptr, reinterpret_cast<const char*>(u8"IjssEL IglOo"), sink, nullptr, errorCode);
+                         nullptr, u8"IjssEL IglOo", sink, nullptr, errorCode);
      assertEquals(u"toTitle(IjssEL IglOo)", UnicodeString(u"IJssEL IglOo"),
                   UnicodeString::fromUTF8(dest));
  #endif
      dest.clear();
-    CaseMap::utf8Fold(U_FOLD_CASE_EXCLUDE_SPECIAL_I, reinterpret_cast<const char*>(u8"IßtanBul"), sink, nullptr, errorCode);
+    CaseMap::utf8Fold(U_FOLD_CASE_EXCLUDE_SPECIAL_I, u8"IßtanBul", sink, nullptr, errorCode);
      assertEquals(u"foldCase(IßtanBul)", UnicodeString(u"ısstanbul"),
                   UnicodeString::fromUTF8(dest));
  }
diff --git a/icu4c/source/test/intltest/strtest.cpp b/icu4c/source/test/intltest/strtest.cpp

index 1665a03cdb87cd9ff7a06ac5b8dc9182ba111900..cf00cd4241dbf873a281b003d10fb5c1fc8fb30f 100644 (file)
--- a/icu4c/source/test/intltest/strtest.cpp
+++ b/icu4c/source/test/intltest/strtest.cpp
@@ -248,9 +248,11 @@ void StringTest::runIndexedTest(int32_t index, UBool exec, const char *&name, ch
  #ifdef U_HAVE_STRING_VIEW
      TESTCASE_AUTO(TestStringPieceStringView);
  #endif
+    TESTCASE_AUTO(TestStringPieceU8);
      TESTCASE_AUTO(TestByteSink);
      TESTCASE_AUTO(TestCheckedArrayByteSink);
      TESTCASE_AUTO(TestStringByteSink);
+    TESTCASE_AUTO(TestStringByteSinkAppendU8);
      TESTCASE_AUTO(TestCharString);
      TESTCASE_AUTO(TestCStr);
      TESTCASE_AUTO(Testctou);
@@ -265,7 +267,7 @@ StringTest::TestStringPiece() {
          errln("StringPiece() failed");
      }
      // Construct from NULL const char * pointer.
-    StringPiece null(NULL);
+    StringPiece null((const char *)nullptr);
      if(!null.empty() || null.data()!=NULL || null.length()!=0 || null.size()!=0) {
          errln("StringPiece(NULL) failed");
      }
@@ -395,7 +397,7 @@ StringTest::TestStringPiece() {
  void
  StringTest::TestStringPieceComparisons() {
      StringPiece empty;
-    StringPiece null(NULL);
+    StringPiece null(nullptr);
      StringPiece abc("abc");
      StringPiece abcd("abcdefg", 4);
      StringPiece abx("abx");
@@ -521,6 +523,52 @@ StringTest::TestStringPieceStringView() {
  }
  #endif
  
+void
+StringTest::TestStringPieceU8() {
+    // ICU-20984 "mitigate some C++20 char8_t breakages"
+    // For the following APIs there are overloads for both
+    // const char * and const char8_t *.
+    // A u8"string literal" has one type or the other
+    // depending on C++ version and compiler settings.
+    StringPiece abc(u8"abc");
+    assertEquals("abc.length", 3, abc.length());
+    assertEquals("abc", "\x61\x62\x63", abc.data());
+
+    StringPiece abc3(u8"abcdef", 3);
+    assertEquals("abc3.length", 3, abc3.length());
+    assertEquals("abc3[0]", 0x61, abc3.data()[0]);
+    assertEquals("abc3[1]", 0x62, abc3.data()[1]);
+    assertEquals("abc3[2]", 0x63, abc3.data()[2]);
+
+    StringPiece uvw("q");
+    uvw.set(u8"uvw");
+    assertEquals("uvw.length", 3, uvw.length());
+    assertEquals("uvw", "\x75\x76\x77", uvw.data());
+
+    StringPiece xyz("r");
+    xyz.set(u8"xyzXYZ", 3);
+    assertEquals("xyz.length", 3, xyz.length());
+    assertEquals("xyz[0]", 0x78, xyz.data()[0]);
+    assertEquals("xyz[1]", 0x79, xyz.data()[1]);
+    assertEquals("xyz[2]", 0x7a, xyz.data()[2]);
+
+    StringPiece null(nullptr);
+    assertTrue("null is empty", null.empty());
+    assertTrue("null is null", null.data() == nullptr);
+
+#ifdef __cpp_lib_char8_t
+    std::u8string_view u8sv(u8"sv");  // C++20
+    StringPiece u8svsp(u8sv);
+    assertEquals("u8svsp.length", 2, u8svsp.length());
+    assertEquals("u8svsp", "\x73\x76", u8svsp.data());
+
+    std::u8string u8str(u8"str");  // C++20
+    StringPiece u8strsp(u8str);
+    assertEquals("u8strsp.length", 3, u8strsp.length());
+    assertEquals("u8strsp", "\x73\x74\x72", u8strsp.data());
+#endif  // __cpp_lib_char8_t
+}
+
  // Verify that ByteSink is subclassable and Flush() overridable.
  class SimpleByteSink : public ByteSink {
  public:
@@ -653,6 +701,20 @@ StringTest::TestStringByteSink() {
      }
  }
  
+void
+StringTest::TestStringByteSinkAppendU8() {
+    // ICU-20984 "mitigate some C++20 char8_t breakages"
+    // For the following APIs there are overloads for both
+    // const char * and const char8_t *.
+    // A u8"string literal" has one type or the other
+    // depending on C++ version and compiler settings.
+    std::string result("abc");
+    StringByteSink<std::string> sink(&result);
+    sink.AppendU8("def", 3);
+    sink.AppendU8(u8"ghijkl", 4);
+    assertEquals("abcdefghij", "abcdef\x67\x68\x69\x6a", result.c_str());
+}
+
  #if defined(_MSC_VER)
  #include <vector>
  #endif
diff --git a/icu4c/source/test/intltest/strtest.h b/icu4c/source/test/intltest/strtest.h

index 8359f84823a83f050ca627ec855c8dd9376ae39c..2a1b98804f38e1c7852ae68d5aa385a0d7c43608 100644 (file)
--- a/icu4c/source/test/intltest/strtest.h
+++ b/icu4c/source/test/intltest/strtest.h
@@ -49,9 +49,11 @@ private:
  #ifdef U_HAVE_STRING_VIEW
      void TestStringPieceStringView();
  #endif
+    void TestStringPieceU8();
      void TestByteSink();
      void TestCheckedArrayByteSink();
      void TestStringByteSink();
+    void TestStringByteSinkAppendU8();
      void TestSTLCompatibility();
      void TestCharString();
      void TestCStr();
diff --git a/icu4c/source/test/intltest/tstnorm.cpp b/icu4c/source/test/intltest/tstnorm.cpp

index 886df6f15adf103cf31106372a4e78346614c311..e478872d53e7d775d04fae20f0e801ec99e4d1ea 100644 (file)
--- a/icu4c/source/test/intltest/tstnorm.cpp
+++ b/icu4c/source/test/intltest/tstnorm.cpp
@@ -14,6 +14,7 @@
  #include "unicode/errorcode.h"
  #include "unicode/normlzr.h"
  #include "unicode/stringoptions.h"
+#include "unicode/stringpiece.h"
  #include "unicode/uniset.h"
  #include "unicode/usetiter.h"
  #include "unicode/schriter.h"
@@ -1573,15 +1574,15 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
      if(errorCode.errDataIfFailureAndReset("Normalizer2::getNFKCCasefoldInstance() call failed")) {
          return;
      }
-    static const char *const src =
-        reinterpret_cast<const char*>(u8"  AÄA\u0308A\u0308\u00ad\u0323Ä\u0323,\u00ad\u1100\u1161가\u11A8가\u3133  ");
-    std::string expected = reinterpret_cast<const char*>(u8"  aääạ\u0308ạ\u0308,가각갃  ");
+    static const StringPiece src =
+        u8"  AÄA\u0308A\u0308\u00ad\u0323Ä\u0323,\u00ad\u1100\u1161가\u11A8가\u3133  ";
+    StringPiece expected = u8"  aääạ\u0308ạ\u0308,가각갃  ";
      std::string result;
      StringByteSink<std::string> sink(&result, static_cast<int32_t>(expected.length()));
      Edits edits;
      nfkc_cf->normalizeUTF8(0, src, sink, &edits, errorCode);
      assertSuccess("normalizeUTF8 with Edits", errorCode.get());
-    assertEquals("normalizeUTF8 with Edits", expected.c_str(), result.c_str());
+    assertEquals("normalizeUTF8 with Edits", expected.data(), result.c_str());
      static const EditChange expectedChanges[] = {
          { FALSE, 2, 2 },  // 2 spaces
          { TRUE, 1, 1 },  // A→a
@@ -1607,12 +1608,12 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
      assertTrue("isNormalizedUTF8(normalized)", nfkc_cf->isNormalizedUTF8(result, errorCode));
  
      // Omit unchanged text.
-    expected = reinterpret_cast<const char*>(u8"aääạ\u0308ạ\u0308가각갃");
+    expected = u8"aääạ\u0308ạ\u0308가각갃";
      result.clear();
      edits.reset();
      nfkc_cf->normalizeUTF8(U_OMIT_UNCHANGED_TEXT, src, sink, &edits, errorCode);
      assertSuccess("normalizeUTF8 omit unchanged", errorCode.get());
-    assertEquals("normalizeUTF8 omit unchanged", expected.c_str(), result.c_str());
+    assertEquals("normalizeUTF8 omit unchanged", expected.data(), result.c_str());
      assertTrue("normalizeUTF8 omit unchanged hasChanges", edits.hasChanges());
      assertEquals("normalizeUTF8 omit unchanged numberOfChanges", 9, edits.numberOfChanges());
      TestUtility::checkEditsIter(*this, u"normalizeUTF8 omit unchanged",
@@ -1623,12 +1624,12 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
      // With filter: The normalization code does not see the "A" substrings.
      UnicodeSet filter(u"[^A]", errorCode);
      FilteredNormalizer2 fn2(*nfkc_cf, filter);
-    expected = reinterpret_cast<const char*>(u8"  AäA\u0308A\u0323\u0308ạ\u0308,가각갃  ");
+    expected = u8"  AäA\u0308A\u0323\u0308ạ\u0308,가각갃  ";
      result.clear();
      edits.reset();
      fn2.normalizeUTF8(0, src, sink, &edits, errorCode);
      assertSuccess("filtered normalizeUTF8", errorCode.get());
-    assertEquals("filtered normalizeUTF8", expected.c_str(), result.c_str());
+    assertEquals("filtered normalizeUTF8", expected.data(), result.c_str());
      static const EditChange filteredChanges[] = {
          { FALSE, 3, 3 },  // 2 spaces + A
          { TRUE, 2, 2 },  // Ä→ä
@@ -1655,12 +1656,12 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
      // Omit unchanged text.
      // Note that the result is not normalized because the inner normalizer
      // does not see text across filter spans.
-    expected = reinterpret_cast<const char*>(u8"ä\u0323\u0308ạ\u0308가각갃");
+    expected = u8"ä\u0323\u0308ạ\u0308가각갃";
      result.clear();
      edits.reset();
      fn2.normalizeUTF8(U_OMIT_UNCHANGED_TEXT, src, sink, &edits, errorCode);
      assertSuccess("filtered normalizeUTF8 omit unchanged", errorCode.get());
-    assertEquals("filtered normalizeUTF8 omit unchanged", expected.c_str(), result.c_str());
+    assertEquals("filtered normalizeUTF8 omit unchanged", expected.data(), result.c_str());
      assertTrue("filtered normalizeUTF8 omit unchanged hasChanges", edits.hasChanges());
      assertEquals("filtered normalizeUTF8 omit unchanged numberOfChanges", 7, edits.numberOfChanges());
      TestUtility::checkEditsIter(*this, u"filtered normalizeUTF8 omit unchanged",
@@ -1777,13 +1778,13 @@ BasicNormalizerTest::TestComposeJamoTBase() {
      assertFalse("isNormalized(LV+11A7)", nfkc->isNormalized(s, errorCode));
      assertTrue("isNormalized(normalized)", nfkc->isNormalized(result, errorCode));
  
-    std::string s8(reinterpret_cast<const char*>(u8"\u1100\u1161\u11A7\u1100\u314F\u11A7가\u11A7"));
-    std::string expected8(reinterpret_cast<const char*>(u8"가\u11A7가\u11A7가\u11A7"));
+    StringPiece s8(u8"\u1100\u1161\u11A7\u1100\u314F\u11A7가\u11A7");
+    StringPiece expected8(u8"가\u11A7가\u11A7가\u11A7");
      std::string result8;
-    StringByteSink<std::string> sink(&result8, static_cast<int32_t>(expected8.length()));
+    StringByteSink<std::string> sink(&result8, expected8.length());
      nfkc->normalizeUTF8(0, s8, sink, nullptr, errorCode);
      assertSuccess("normalizeUTF8(LV+11A7)", errorCode.get());
-    assertEquals("normalizeUTF8(LV+11A7)", expected8.c_str(), result8.c_str());
+    assertEquals("normalizeUTF8(LV+11A7)", expected8.data(), result8.c_str());
      assertFalse("isNormalizedUTF8(LV+11A7)", nfkc->isNormalizedUTF8(s8, errorCode));
      assertTrue("isNormalizedUTF8(normalized)", nfkc->isNormalizedUTF8(result8, errorCode));
  }
diff --git a/icu4c/source/test/intltest/uts46test.cpp b/icu4c/source/test/intltest/uts46test.cpp

index e11fdf2bc7a2cfb366c5c2906a5e16f5651b41a6..b399d2dd72489d6c6f7a11d3b4c9e728221bf158 100644 (file)
--- a/icu4c/source/test/intltest/uts46test.cpp
+++ b/icu4c/source/test/intltest/uts46test.cpp
@@ -160,7 +160,7 @@ void UTS46Test::TestAPI() {
      char buffer[100];
      TestCheckedArrayByteSink sink(buffer, UPRV_LENGTHOF(buffer));
      errorCode=U_ZERO_ERROR;
-    nontrans->labelToUnicodeUTF8(StringPiece(NULL, 5), sink, info, errorCode);
+    nontrans->labelToUnicodeUTF8(StringPiece((const char *)NULL, 5), sink, info, errorCode);
      if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || sink.NumberOfBytesWritten()!=0) {
          errln("N.labelToUnicodeUTF8(StringPiece(NULL, 5)) did not set illegal-argument-error ",
                "or did output something - %s",
author	Markus Scherer <markus.icu@gmail.com>
	Fri, 13 Mar 2020 02:21:24 +0000 (19:21 -0700)
committer	Markus Scherer <markus.icu@gmail.com>
	Mon, 16 Mar 2020 17:49:21 +0000 (10:49 -0700)
icu4c/source/common/ucasemap.cpp		patch \| blob \| history
icu4c/source/common/unicode/bytestream.h		patch \| blob \| history
icu4c/source/common/unicode/stringpiece.h		patch \| blob \| history
icu4c/source/test/intltest/collationtest.cpp		patch \| blob \| history
icu4c/source/test/intltest/compactdecimalformattest.cpp		patch \| blob \| history
icu4c/source/test/intltest/regextst.cpp		patch \| blob \| history
icu4c/source/test/intltest/strcase.cpp		patch \| blob \| history
icu4c/source/test/intltest/strtest.cpp		patch \| blob \| history
icu4c/source/test/intltest/strtest.h		patch \| blob \| history
icu4c/source/test/intltest/tstnorm.cpp		patch \| blob \| history
icu4c/source/test/intltest/uts46test.cpp		patch \| blob \| history