ICU-13560 make some toUnicode converters check toULength, not toUnicodeStatus, for continuation

author Markus Scherer <markus.icu@gmail.com>

Tue, 23 Jan 2018 21:32:36 +0000 (21:32 +0000)

committer Markus Scherer <markus.icu@gmail.com>

Tue, 23 Jan 2018 21:32:36 +0000 (21:32 +0000)
author Markus Scherer <markus.icu@gmail.com>
Tue, 23 Jan 2018 21:32:36 +0000 (21:32 +0000)
committer Markus Scherer <markus.icu@gmail.com>
Tue, 23 Jan 2018 21:32:36 +0000 (21:32 +0000)
diff --git a/icu4c/source/common/ucnv_u32.cpp b/icu4c/source/common/ucnv_u32.cpp

index e1b755ab7f2ac432949f6e022b68c202bd96b4ff..5777117a924d8adee5a1a44937cb9ed46e15bb5f 100644 (file)
--- a/icu4c/source/common/ucnv_u32.cpp
+++ b/icu4c/source/common/ucnv_u32.cpp
@@ -55,7 +55,7 @@ T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,
      uint32_t ch, i;
  
      /* Restore state of current sequence */
-    if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
+    if (args->converter->toULength > 0 && myTarget < targetLimit) {
          i = args->converter->toULength;       /* restore # of bytes consumed */
          args->converter->toULength = 0;
  
@@ -136,7 +136,7 @@ T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
      int32_t offsetNum = 0;
  
      /* Restore state of current sequence */
-    if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
+    if (args->converter->toULength > 0 && myTarget < targetLimit) {
          i = args->converter->toULength;       /* restore # of bytes consumed */
          args->converter->toULength = 0;
  
@@ -517,7 +517,7 @@ T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,
      uint32_t ch, i;
  
      /* Restore state of current sequence */
-    if (args->converter->toUnicodeStatus && myTarget < targetLimit)
+    if (args->converter->toULength > 0 && myTarget < targetLimit)
      {
          i = args->converter->toULength;       /* restore # of bytes consumed */
          args->converter->toULength = 0;
@@ -604,7 +604,7 @@ T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
      int32_t offsetNum = 0;
  
      /* Restore state of current sequence */
-    if (args->converter->toUnicodeStatus && myTarget < targetLimit)
+    if (args->converter->toULength > 0 && myTarget < targetLimit)
      {
          i = args->converter->toULength;       /* restore # of bytes consumed */
          args->converter->toULength = 0;
diff --git a/icu4c/source/common/ucnv_u8.cpp b/icu4c/source/common/ucnv_u8.cpp

index 5d72f8ef377b083b04afaaeabc08278782d983a5..094e2dfb6f43279885dde600400da5cc2af10dfe 100644 (file)
--- a/icu4c/source/common/ucnv_u8.cpp
+++ b/icu4c/source/common/ucnv_u8.cpp
@@ -76,7 +76,7 @@ static void  U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
      int32_t i, inBytes;
  
      /* Restore size of current sequence */
-    if (cnv->toUnicodeStatus && myTarget < targetLimit)
+    if (cnv->toULength > 0 && myTarget < targetLimit)
      {
          inBytes = cnv->mode;            /* restore # of bytes to consume */
          i = cnv->toULength;             /* restore # of bytes consumed */
@@ -194,7 +194,7 @@ static void  U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeAr
      int32_t i, inBytes;
  
      /* Restore size of current sequence */
-    if (cnv->toUnicodeStatus && myTarget < targetLimit)
+    if (cnv->toULength > 0 && myTarget < targetLimit)
      {
          inBytes = cnv->mode;            /* restore # of bytes to consume */
          i = cnv->toULength;             /* restore # of bytes consumed */
@@ -670,12 +670,13 @@ ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
      targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
  
      /* get the converter state from the UTF-8 UConverter */
-    c=(UChar32)utf8->toUnicodeStatus;
-    if(c!=0) {
+    if(utf8->toULength > 0) {
          toULength=oldToULength=utf8->toULength;
          toULimit=(int8_t)utf8->mode;
+        c=(UChar32)utf8->toUnicodeStatus;
      } else {
          toULength=oldToULength=toULimit=0;
+        c = 0;
      }
  
      count=(int32_t)(sourceLimit-source)+oldToULength;
diff --git a/icu4c/source/common/ucnvlat1.cpp b/icu4c/source/common/ucnvlat1.cpp

index 23e918afe7a9d063d3f3e17c556d32ebfe43bdf8..358bc0caa25a19e76936a57861d9488f98b6ee8c 100644 (file)
--- a/icu4c/source/common/ucnvlat1.cpp
+++ b/icu4c/source/common/ucnvlat1.cpp
@@ -340,7 +340,11 @@ ucnv_Latin1FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
      targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
  
      /* get the converter state from the UTF-8 UConverter */
-    c=(UChar32)utf8->toUnicodeStatus;
+    if (utf8->toULength > 0) {
+        c=(UChar32)utf8->toUnicodeStatus;
+    } else {
+        c = 0;
+    }
      if(c!=0 && source<sourceLimit) {
          if(targetCapacity==0) {
              *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
@@ -620,7 +624,7 @@ ucnv_ASCIIFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
  
      uint8_t c;
  
-    if(pToUArgs->converter->toUnicodeStatus!=0) {
+    if(pToUArgs->converter->toULength > 0) {
          /* no handling of partial UTF-8 characters here, fall back to pivoting */
          *pErrorCode=U_USING_DEFAULT_WARNING;
          return;
diff --git a/icu4c/source/common/ucnvmbcs.cpp b/icu4c/source/common/ucnvmbcs.cpp

index 4b36cc605b16c778f2c63406b1062e4f23a7a112..2d0c857758d9bc93898eca2338b3a57e4e0b20d5 100644 (file)
--- a/icu4c/source/common/ucnvmbcs.cpp
+++ b/icu4c/source/common/ucnvmbcs.cpp
@@ -5064,12 +5064,13 @@ ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
      hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
  
      /* get the converter state from the UTF-8 UConverter */
-    c=(UChar32)utf8->toUnicodeStatus;
-    if(c!=0) {
+    if(utf8->toULength > 0) {
          toULength=oldToULength=utf8->toULength;
          toULimit=(int8_t)utf8->mode;
+        c=(UChar32)utf8->toUnicodeStatus;
      } else {
          toULength=oldToULength=toULimit=0;
+        c = 0;
      }
  
      // The conversion loop checks source<sourceLimit only once per 1/2/3-byte character.
@@ -5359,12 +5360,13 @@ ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
      hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
  
      /* get the converter state from the UTF-8 UConverter */
-    c=(UChar32)utf8->toUnicodeStatus;
-    if(c!=0) {
+    if(utf8->toULength > 0) {
          toULength=oldToULength=utf8->toULength;
          toULimit=(int8_t)utf8->mode;
+        c=(UChar32)utf8->toUnicodeStatus;
      } else {
          toULength=oldToULength=toULimit=0;
+        c = 0;
      }
  
      // The conversion loop checks source<sourceLimit only once per 1/2/3-byte character.
diff --git a/icu4c/source/test/intltest/convtest.cpp b/icu4c/source/test/intltest/convtest.cpp

index db0aa86912bc800ed4ab9ca8507c8a9a8470614e..3a0e5414d6f0de1d1a5258ca46971cb742911baa 100644 (file)
--- a/icu4c/source/test/intltest/convtest.cpp
+++ b/icu4c/source/test/intltest/convtest.cpp
@@ -733,6 +733,7 @@ ConversionTest::TestUTF8ToUTF8Overflow() {
      UChar *pivotSource = buffer16;
      UChar *pivotTarget = buffer16;
      const UChar *pivotLimit = buffer16 + UPRV_LENGTHOF(buffer16);
+    int32_t length;
  
      // Convert with insufficient target capacity.
      result[2] = 5;
@@ -741,7 +742,7 @@ ConversionTest::TestUTF8ToUTF8Overflow() {
                     buffer16, &pivotSource, &pivotTarget, pivotLimit,
                     FALSE, FALSE, errorCode);
      assertEquals("overflow", U_BUFFER_OVERFLOW_ERROR, errorCode.reset());
-    int32_t length = (int32_t)(target - result);
+    length = (int32_t)(target - result);
      assertEquals("number of bytes written", 2, length);
      assertEquals("next byte not clobbered", 5, result[2]);
  
@@ -790,6 +791,52 @@ ConversionTest::TestUTF8ToUTF8Overflow() {
      if (length == 5) {
          assertTrue("text2 result same as input", memcmp(text2, result, length) == 0);
      }
+
+    ucnv_reset(cnv1.getAlias());
+    ucnv_reset(cnv2.getAlias());
+    memset(result, 0, sizeof(result));
+    static const char *illFormed = "\xf1\x91\x93\x96\x91\x94";  // U+514D6 + two more trail bytes
+    source = illFormed;
+    sourceLimit = illFormed + strlen(illFormed);
+    target = result;
+    pivotSource = pivotTarget = buffer16;
+
+    ucnv_setToUCallBack(cnv1.getAlias(), UCNV_TO_U_CALLBACK_STOP, nullptr, nullptr, nullptr, errorCode);
+
+    // Convert only two bytes and flush (but expect failure).
+    char errorBytes[10];
+    int8_t errorLength;
+    result[0] = 5;
+    ucnv_convertEx(cnv2.getAlias(), cnv1.getAlias(),
+                   &target, targetLimit, &source, source + 2,
+                   buffer16, &pivotSource, &pivotTarget, pivotLimit,
+                   FALSE, TRUE, errorCode);
+    assertEquals("illFormed truncated", U_TRUNCATED_CHAR_FOUND, errorCode.reset());
+    length = (int32_t)(target - result);
+    assertEquals("illFormed number of bytes written", 0, length);
+    errorLength = UPRV_LENGTHOF(errorBytes);
+    ucnv_getInvalidChars(cnv1.getAlias(), errorBytes, &errorLength, errorCode);
+    assertEquals("illFormed truncated errorLength", 2, (int32_t)errorLength);
+    if (errorLength == 2) {
+        assertEquals("illFormed truncated errorBytes", 0xf191, 
+                     ((int32_t)(uint8_t)errorBytes[0] << 8) | (uint8_t)errorBytes[1]);
+    }
+
+    // Continue conversion starting with a trail byte.
+    ucnv_convertEx(cnv2.getAlias(), cnv1.getAlias(),
+                   &target, targetLimit, &source, sourceLimit,
+                   buffer16, &pivotSource, &pivotTarget, pivotLimit,
+                   FALSE, TRUE, errorCode);
+
+    assertEquals("illFormed trail byte", U_ILLEGAL_CHAR_FOUND, errorCode.reset());
+    length = (int32_t)(target - result);
+    assertEquals("illFormed trail byte number of bytes written", 0, length);
+    errorLength = UPRV_LENGTHOF(errorBytes);
+    ucnv_getInvalidChars(cnv1.getAlias(), errorBytes, &errorLength, errorCode);
+    assertEquals("illFormed trail byte errorLength", 1, (int32_t)errorLength);
+    if (errorLength == 1) {
+        assertEquals("illFormed trail byte errorBytes", 0x93, (int32_t)(uint8_t)errorBytes[0]);
+    }
  }
  
  // open testdata or ICU data converter ------------------------------------- ***
author	Markus Scherer <markus.icu@gmail.com>
	Tue, 23 Jan 2018 21:32:36 +0000 (21:32 +0000)
committer	Markus Scherer <markus.icu@gmail.com>
	Tue, 23 Jan 2018 21:32:36 +0000 (21:32 +0000)
icu4c/source/common/ucnv_u32.cpp		patch | blob | history
icu4c/source/common/ucnv_u8.cpp		patch | blob | history
icu4c/source/common/ucnvlat1.cpp		patch | blob | history
icu4c/source/common/ucnvmbcs.cpp		patch | blob | history
icu4c/source/test/intltest/convtest.cpp		patch | blob | history