ICU-13707 Make UnicodeString safe when appended or inserted into itself. (#147)

author Shane F. Carr <shane@unicode.org>

Fri, 21 Sep 2018 20:44:21 +0000 (13:44 -0700)

committer Shane Carr <shane@unicode.org>

Thu, 27 Sep 2018 21:27:41 +0000 (14:27 -0700)
author Shane F. Carr <shane@unicode.org>
Fri, 21 Sep 2018 20:44:21 +0000 (13:44 -0700)
committer Shane Carr <shane@unicode.org>
Thu, 27 Sep 2018 21:27:41 +0000 (14:27 -0700)
diff --git a/icu4c/source/common/unicode/unistr.h b/icu4c/source/common/unicode/unistr.h

index b84f40bd449cede203809005134905007b98f8bb..bf954b5f1d82321e0cdce7932b632413de54f865 100644 (file)
--- a/icu4c/source/common/unicode/unistr.h
+++ b/icu4c/source/common/unicode/unistr.h
@@ -243,6 +243,9 @@ class UnicodeStringAppendable;  // unicode/appendable.h
   * than other ICU APIs. In particular:
   * - If indexes are out of bounds for a UnicodeString object
   *   (<0 or >length()) then they are "pinned" to the nearest boundary.
+ * - If the buffer passed to an insert/append/replace operation is owned by the
+ *   target object, e.g., calling str.append(str), an extra copy may take place
+ *   to ensure safety.
   * - If primitive string pointer values (e.g., const char16_t * or char *)
   *   for input strings are NULL, then those input string parameters are treated
   *   as if they pointed to an empty string.
diff --git a/icu4c/source/common/unistr.cpp b/icu4c/source/common/unistr.cpp

index 5d7cab2e155ccc21aed8592bdfc61f687b5d0e69..c8b6c0a3a46319219b7e9d15a220dc5930786401 100644 (file)
--- a/icu4c/source/common/unistr.cpp
+++ b/icu4c/source/common/unistr.cpp
@@ -1447,10 +1447,15 @@ UnicodeString::doReplace(int32_t start,
    }
  
    if(srcChars == 0) {
-    srcStart = srcLength = 0;
-  } else if(srcLength < 0) {
-    // get the srcLength if necessary
-    srcLength = u_strlen(srcChars + srcStart);
+    srcLength = 0;
+  } else {
+    // Perform all remaining operations relative to srcChars + srcStart.
+    // From this point forward, do not use srcStart.
+    srcChars += srcStart;
+    if (srcLength < 0) {
+      // get the srcLength if necessary
+      srcLength = u_strlen(srcChars);
+    }
    }
  
    // pin the indices to legal values
@@ -1465,17 +1470,28 @@ UnicodeString::doReplace(int32_t start,
    }
    newLength += srcLength;
  
+  // Check for insertion into ourself
+  const UChar *oldArray = getArrayStart();
+  if (isBufferWritable() &&
+      oldArray < srcChars + srcLength &&
+      srcChars < oldArray + oldLength) {
+    // Copy into a new UnicodeString and start over
+    UnicodeString copy(srcChars, srcLength);
+    if (copy.isBogus()) {
+      setToBogus();
+      return *this;
+    }
+    return doReplace(start, length, copy.getArrayStart(), 0, srcLength);
+  }
+
    // cloneArrayIfNeeded(doCopyArray=FALSE) may change fArray but will not copy the current contents;
    // therefore we need to keep the current fArray
    UChar oldStackBuffer[US_STACKBUF_SIZE];
-  UChar *oldArray;
    if((fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) {
      // copy the stack buffer contents because it will be overwritten with
      // fUnion.fFields values
-    u_memcpy(oldStackBuffer, fUnion.fStackFields.fBuffer, oldLength);
+    u_memcpy(oldStackBuffer, oldArray, oldLength);
      oldArray = oldStackBuffer;
-  } else {
-    oldArray = getArrayStart();
    }
  
    // clone our array and allocate a bigger array if needed
@@ -1503,7 +1519,7 @@ UnicodeString::doReplace(int32_t start,
    }
  
    // now fill in the hole with the new string
-  us_arrayCopy(srcChars, srcStart, newArray, start, srcLength);
+  us_arrayCopy(srcChars, 0, newArray, start, srcLength);
  
    setLength(newLength);
  
@@ -1536,15 +1552,34 @@ UnicodeString::doAppend(const UChar *srcChars, int32_t srcStart, int32_t srcLeng
      return *this;
    }
  
+  // Perform all remaining operations relative to srcChars + srcStart.
+  // From this point forward, do not use srcStart.
+  srcChars += srcStart;
+
    if(srcLength < 0) {
      // get the srcLength if necessary
-    if((srcLength = u_strlen(srcChars + srcStart)) == 0) {
+    if((srcLength = u_strlen(srcChars)) == 0) {
        return *this;
      }
    }
  
    int32_t oldLength = length();
    int32_t newLength = oldLength + srcLength;
+
+  // Check for append onto ourself
+  const UChar* oldArray = getArrayStart();
+  if (isBufferWritable() &&
+      oldArray < srcChars + srcLength &&
+      srcChars < oldArray + oldLength) {
+    // Copy into a new UnicodeString and start over
+    UnicodeString copy(srcChars, srcLength);
+    if (copy.isBogus()) {
+      setToBogus();
+      return *this;
+    }
+    return doAppend(copy.getArrayStart(), 0, srcLength);
+  }
+
    // optimize append() onto a large-enough, owned string
    if((newLength <= getCapacity() && isBufferWritable()) ||
        cloneArrayIfNeeded(newLength, getGrowCapacity(newLength))) {
@@ -1556,8 +1591,8 @@ UnicodeString::doAppend(const UChar *srcChars, int32_t srcStart, int32_t srcLeng
      // or
      //   str.appendString(buffer, length)
      // or similar.
-    if(srcChars + srcStart != newArray + oldLength) {
-      us_arrayCopy(srcChars, srcStart, newArray, oldLength, srcLength);
+    if(srcChars != newArray + oldLength) {
+      us_arrayCopy(srcChars, 0, newArray, oldLength, srcLength);
      }
      setLength(newLength);
    }
diff --git a/icu4c/source/test/intltest/ustrtest.cpp b/icu4c/source/test/intltest/ustrtest.cpp

index 4b7cb7ae7c7f800c3bb3a3fe5111ebf583cc82b6..57d72fb3f547d8758285792037b4ee4426dedd66 100644 (file)
--- a/icu4c/source/test/intltest/ustrtest.cpp
+++ b/icu4c/source/test/intltest/ustrtest.cpp
@@ -64,6 +64,7 @@ void UnicodeStringTest::runIndexedTest( int32_t index, UBool exec, const char* &
      TESTCASE_AUTO(TestUInt16Pointers);
      TESTCASE_AUTO(TestWCharPointers);
      TESTCASE_AUTO(TestNullPointers);
+    TESTCASE_AUTO(TestUnicodeStringInsertAppendToSelf);
      TESTCASE_AUTO_END;
  }
  
@@ -1123,27 +1124,25 @@ UnicodeStringTest::TestMiscellaneous()
          errln("UnicodeString(u[-1]).getTerminatedBuffer() returns a bad buffer");
      }
  
-    test1=UNICODE_STRING("la", 2);
-    test1.append(UNICODE_STRING(" lila", 5).getTerminatedBuffer(), 0, -1);
-    if(test1!=UNICODE_STRING("la lila", 7)) {
-        errln("UnicodeString::append(const UChar *, start, length) failed");
-    }
+    // NOTE: Some compilers will optimize u"la" to point to the same static memory
+    // as u" lila", offset by 3 code units
+    test1=UnicodeString(TRUE, u"la", 2);
+    test1.append(UnicodeString(TRUE, u" lila", 5).getTerminatedBuffer(), 0, -1);
+    assertEquals("UnicodeString::append(const UChar *, start, length) failed",
+        u"la lila", test1);
  
-    test1.insert(3, UNICODE_STRING("dudum ", 6), 0, INT32_MAX);
-    if(test1!=UNICODE_STRING("la dudum lila", 13)) {
-        errln("UnicodeString::insert(start, const UniStr &, start, length) failed");
-    }
+    test1.insert(3, UnicodeString(TRUE, u"dudum ", 6), 0, INT32_MAX);
+    assertEquals("UnicodeString::insert(start, const UniStr &, start, length) failed",
+        u"la dudum lila", test1);
  
      static const UChar ucs[]={ 0x68, 0x6d, 0x20, 0 };
      test1.insert(9, ucs, -1);
-    if(test1!=UNICODE_STRING("la dudum hm lila", 16)) {
-        errln("UnicodeString::insert(start, const UChar *, length) failed");
-    }
+    assertEquals("UnicodeString::insert(start, const UChar *, length) failed",
+        u"la dudum hm lila", test1);
  
      test1.replace(9, 2, (UChar)0x2b);
-    if(test1!=UNICODE_STRING("la dudum + lila", 15)) {
-        errln("UnicodeString::replace(start, length, UChar) failed");
-    }
+    assertEquals("UnicodeString::replace(start, length, UChar) failed",
+        u"la dudum + lila", test1);
  
      if(test1.hasMetaData() || UnicodeString().hasMetaData()) {
          errln("UnicodeString::hasMetaData() returns TRUE");
@@ -2248,3 +2247,59 @@ UnicodeStringTest::TestNullPointers() {
      UnicodeString(u"def").extract(nullptr, 0, errorCode);
      assertEquals("buffer overflow extracting to nullptr", U_BUFFER_OVERFLOW_ERROR, errorCode);
  }
+
+void UnicodeStringTest::TestUnicodeStringInsertAppendToSelf() {
+    IcuTestErrorCode status(*this, "TestUnicodeStringAppendToSelf");
+
+    // Test append operation
+    UnicodeString str(u"foo ");
+    str.append(str);
+    str.append(str);
+    str.append(str);
+    assertEquals("", u"foo foo foo foo foo foo foo foo ", str);
+
+    // Test append operation with readonly alias to start
+    str = UnicodeString(TRUE, u"foo ", 4);
+    str.append(str);
+    str.append(str);
+    str.append(str);
+    assertEquals("", u"foo foo foo foo foo foo foo foo ", str);
+
+    // Test append operation with aliased substring
+    str = u"abcde";
+    UnicodeString sub = str.tempSubString(1, 2);
+    str.append(sub);
+    assertEquals("", u"abcdebc", str);
+
+    // Test append operation with double-aliased substring
+    str = UnicodeString(TRUE, u"abcde", 5);
+    sub = str.tempSubString(1, 2);
+    str.append(sub);
+    assertEquals("", u"abcdebc", str);
+
+    // Test insert operation
+    str = u"a-*b";
+    str.insert(2, str);
+    str.insert(4, str);
+    str.insert(8, str);
+    assertEquals("", u"a-a-a-a-a-a-a-a-*b*b*b*b*b*b*b*b", str);
+
+    // Test insert operation with readonly alias to start
+    str = UnicodeString(TRUE, u"a-*b", 4);
+    str.insert(2, str);
+    str.insert(4, str);
+    str.insert(8, str);
+    assertEquals("", u"a-a-a-a-a-a-a-a-*b*b*b*b*b*b*b*b", str);
+
+    // Test insert operation with aliased substring
+    str = u"abcde";
+    sub = str.tempSubString(1, 3);
+    str.insert(2, sub);
+    assertEquals("", u"abbcdcde", str);
+
+    // Test insert operation with double-aliased substring
+    str = UnicodeString(TRUE, u"abcde", 5);
+    sub = str.tempSubString(1, 3);
+    str.insert(2, sub);
+    assertEquals("", u"abbcdcde", str);
+}
diff --git a/icu4c/source/test/intltest/ustrtest.h b/icu4c/source/test/intltest/ustrtest.h

index 4ba348c431fd5ca7ef9fc7258b34e24ce514990a..218befdcc6850658fe63e7070ec460e6ee3cf78f 100644 (file)
--- a/icu4c/source/test/intltest/ustrtest.h
+++ b/icu4c/source/test/intltest/ustrtest.h
@@ -96,6 +96,7 @@ public:
      void TestUInt16Pointers();
      void TestWCharPointers();
      void TestNullPointers();
+    void TestUnicodeStringInsertAppendToSelf();
  };
  
  #endif
author	Shane F. Carr <shane@unicode.org>
	Fri, 21 Sep 2018 20:44:21 +0000 (13:44 -0700)
committer	Shane Carr <shane@unicode.org>
	Thu, 27 Sep 2018 21:27:41 +0000 (14:27 -0700)
icu4c/source/common/unicode/unistr.h		patch \| blob \| history
icu4c/source/common/unistr.cpp		patch \| blob \| history
icu4c/source/test/intltest/ustrtest.cpp		patch \| blob \| history
icu4c/source/test/intltest/ustrtest.h		patch \| blob \| history