ICU-21280 Correct source bytes counting in UTF8->UTF8 conversion

author Roman Savchenko <gmstima@gmail.com>

Fri, 11 Sep 2020 12:50:41 +0000 (15:50 +0300)

committer Markus Scherer <markus.icu@gmail.com>

Thu, 17 Sep 2020 00:50:21 +0000 (17:50 -0700)
author Roman Savchenko <gmstima@gmail.com>
Fri, 11 Sep 2020 12:50:41 +0000 (15:50 +0300)
committer Markus Scherer <markus.icu@gmail.com>
Thu, 17 Sep 2020 00:50:21 +0000 (17:50 -0700)
diff --git a/icu4c/source/common/ucnv_u8.cpp b/icu4c/source/common/ucnv_u8.cpp

index 9b518e08df6785da63cd8629810e7e28504136e3..1ef7fa2f02f0a54510f1c8222923ab3d8e85fde8 100644 (file)
--- a/icu4c/source/common/ucnv_u8.cpp
+++ b/icu4c/source/common/ucnv_u8.cpp
@@ -707,9 +707,9 @@ ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
  
          // Do not go back into the bytes that will be read for finishing a partial
          // sequence from the previous buffer.
-        int32_t length=count-toULimit;
+        int32_t length=count-toULength;
          U8_TRUNCATE_IF_INCOMPLETE(source, 0, length);
-        count=toULimit+length;
+        count=toULength+length;
      }
  
      if(c!=0) {
diff --git a/icu4c/source/test/intltest/convtest.cpp b/icu4c/source/test/intltest/convtest.cpp

index 949bd3b84860a6391e223349c97ee930a800ca6f..a25a04be9017353d582ea6cb8d01e39eb49b25e1 100644 (file)
--- a/icu4c/source/test/intltest/convtest.cpp
+++ b/icu4c/source/test/intltest/convtest.cpp
@@ -77,6 +77,7 @@ ConversionTest::runIndexedTest(int32_t index, UBool exec, const char *&name, cha
      TESTCASE_AUTO(TestGetUnicodeSet2);
      TESTCASE_AUTO(TestDefaultIgnorableCallback);
      TESTCASE_AUTO(TestUTF8ToUTF8Overflow);
+    TESTCASE_AUTO(TestUTF8ToUTF8Streaming);
      TESTCASE_AUTO_END;
  }
  
@@ -830,6 +831,65 @@ ConversionTest::TestUTF8ToUTF8Overflow() {
      }
  }
  
+// ICU-21280 regression test: streams UTF-8 -> UTF-8 through ucnv_convertEx() in two
+// chunks split inside a 2-byte sequence and checks the bytes written by each call.
+void
+ConversionTest::TestUTF8ToUTF8Streaming() {
+    IcuTestErrorCode errorCode(*this, "TestUTF8ToUTF8Streaming");
+    LocalUConverterPointer cnv1(ucnv_open("UTF-8", errorCode));
+    LocalUConverterPointer cnv2(ucnv_open("UTF-8", errorCode));
+
+    // UTF8 encoded cyrillic part of 'Lorem ipsum'
+    static const char* text =
+        "\xd0\xb5\xd1\x82\x20\xd1\x81\xd1\x86\xd0\xb0\xd0\xb5\xd0\xb2\xd0"
+        "\xbe\xd0\xbb\xd0\xb0\x20\xd1\x81\xd0\xb0\xd0\xb4\xd0\xb8\xd0\xbf"
+        "\xd1\x81\xd1\x86\xd0\xb8\xd0\xbd\xd0\xb3\x20\xd0\xb0\xd1\x86\xd1"
+        "\x86\xd0\xbe\xd0\xbc\xd0\xbc\xd0\xbe\xd0\xb4\xd0\xb0\xd1\x80\xd0"
+        "\xb5\x20\xd1\x85\xd0\xb0\xd1\x81";
+
+    int32_t chunk1 = 25; // partial lead at the end: 0xd0
+    int32_t chunk2 = 47; // partial tail at the beginning: 0xb0
+
+    char result[128];
+    int32_t sourceLen = (int32_t)strlen(text);
+    const char* source = text;
+    const char* sourceLimit = text + chunk1;
+
+    int32_t targetLen = sizeof(result);
+    char* target = result;
+    const char* targetLimit = result + targetLen;
+
+    UChar buffer16[20];
+    UChar* pivotSource = buffer16;
+    UChar* pivotTarget = buffer16;
+    const UChar* pivotLimit = buffer16 + UPRV_LENGTHOF(buffer16);
+
+    // First chunk, reset=FALSE and flush=FALSE: the trailing lead byte is expected to be held back.
+    ucnv_convertEx(cnv2.getAlias(), cnv1.getAlias(),
+        &target, targetLimit, &source, sourceLimit,
+        buffer16, &pivotSource, &pivotTarget, pivotLimit,
+        FALSE, FALSE, errorCode);
+    int32_t length = (int32_t)(target - result);
+    targetLen -= length; // targetLen tracks the unused capacity of result
+    assertEquals("First chunk -1 doesn't match converted length", chunk1 - 1, length);
+
+    source = text + chunk1;
+    sourceLimit = source + chunk2;
+
+    // Convert the rest and flush.
+    ucnv_convertEx(cnv2.getAlias(), cnv1.getAlias(),
+        &target, targetLimit, &source, sourceLimit,
+        buffer16, &pivotSource, &pivotTarget, pivotLimit,
+        FALSE, TRUE, errorCode);
+    // Bytes written by the second call only; expected to include the lead byte held back above.
+    length = (int32_t)(target - result - length);
+    targetLen -= length;
+    assertEquals("Second chunk + 1 doesn't match converted length", chunk2 + 1, length);
+
+    assertEquals("Full text length match", sourceLen, (int32_t)sizeof(result) - targetLen);
+    assertSuccess("UTF-8->UTF-8", errorCode);
+}
+
  // open testdata or ICU data converter ------------------------------------- ***
  
  UConverter *
diff --git a/icu4c/source/test/intltest/convtest.h b/icu4c/source/test/intltest/convtest.h

index 84a3a89a503393b6f06c13a8e4c40421a966587c..dda10cb5d27b758ca01b7a0a88a78fb54a3fcfce 100644 (file)
--- a/icu4c/source/test/intltest/convtest.h
+++ b/icu4c/source/test/intltest/convtest.h
@@ -77,6 +77,7 @@ public:
      void TestGetUnicodeSet2();
      void TestDefaultIgnorableCallback();
      void TestUTF8ToUTF8Overflow();
+    void TestUTF8ToUTF8Streaming();
  
  private:
      UBool
author	Roman Savchenko <gmstima@gmail.com>
	Fri, 11 Sep 2020 12:50:41 +0000 (15:50 +0300)
committer	Markus Scherer <markus.icu@gmail.com>
	Thu, 17 Sep 2020 00:50:21 +0000 (17:50 -0700)
icu4c/source/common/ucnv_u8.cpp		patch \| blob \| history
icu4c/source/test/intltest/convtest.cpp		patch \| blob \| history
icu4c/source/test/intltest/convtest.h		patch \| blob \| history