ICU-13311 change illegal-UTF-8 handling in converter code

author Markus Scherer <markus.icu@gmail.com>
Sat, 23 Sep 2017 06:34:53 +0000 (06:34 +0000)
committer Markus Scherer <markus.icu@gmail.com>
Sat, 23 Sep 2017 06:34:53 +0000 (06:34 +0000)
diff --git a/icu4c/source/common/ucnv_u8.cpp b/icu4c/source/common/ucnv_u8.cpp

index 4419381fd6c8f18fed279b69cfc32055a9930b9d..951988ed9ca3d1665146aee054b43fd493cfb283 100644 (file)
--- a/icu4c/source/common/ucnv_u8.cpp
+++ b/icu4c/source/common/ucnv_u8.cpp
@@ -31,6 +31,7 @@
  #include "ucnv_bld.h"
  #include "ucnv_cnv.h"
  #include "cmemory.h"
+#include "ustr_imp.h"
  
  /* Prototypes --------------------------------------------------------------- */
  
@@ -44,51 +45,13 @@ U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args
  
  /* UTF-8 -------------------------------------------------------------------- */
  
-/* UTF-8 Conversion DATA
- *   for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
- */
-/*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
  #define MAXIMUM_UCS2            0x0000FFFF
-#define MAXIMUM_UTF             0x0010FFFF
-#define MAXIMUM_UCS4            0x7FFFFFFF
-#define HALF_SHIFT              10
-#define HALF_BASE               0x0010000
-#define HALF_MASK               0x3FF
-#define SURROGATE_HIGH_START    0xD800
-#define SURROGATE_HIGH_END      0xDBFF
-#define SURROGATE_LOW_START     0xDC00
-#define SURROGATE_LOW_END       0xDFFF
-
-/* -SURROGATE_LOW_START + HALF_BASE */
-#define SURROGATE_LOW_BASE      9216
-
-static const uint32_t offsetsFromUTF8[7] = {0,
-  (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,
-  (uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080
-};
  
-/* END OF UTF-8 Conversion DATA */
-
-static const int8_t bytesFromUTF8[256] = {
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
+static const uint32_t offsetsFromUTF8[5] = {0,
+  (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,
+  (uint32_t) 0x03C82080
  };
  
-/*
- * Starting with Unicode 3.0.1:
- * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
- * byte sequences with more than 4 bytes are illegal in UTF-8,
- * which is tested with impossible values for them
- */
-static const uint32_t
-utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };
-
  static UBool hasCESU8Data(const UConverter *cnv)
  {
  #if UCONFIG_ONLY_HTML_CONVERSION
@@ -127,7 +90,7 @@ static void  U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
      while (mySource < sourceLimit && myTarget < targetLimit)
      {
          ch = *(mySource++);
-        if (ch < 0x80)        /* Simple case */
+        if (U8_IS_SINGLE(ch))        /* Simple case */
          {
              *(myTarget++) = (UChar) ch;
          }
@@ -135,7 +98,7 @@ static void  U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
          {
              /* store the first char */
              toUBytes[0] = (char)ch;
-            inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */
+            inBytes = U8_COUNT_BYTES_NON_ASCII(ch); /* lookup current sequence length */
              i = 1;
  
  morebytes:
@@ -144,7 +107,8 @@ morebytes:
                  if (mySource < sourceLimit)
                  {
                      toUBytes[i] = (char) (ch2 = *mySource);
-                    if (!U8_IS_TRAIL(ch2))
+                    if (!icu::UTF8::isValidTrail(ch, ch2, i, inBytes) &&
+                            !(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2)))
                      {
                          break; /* i < inBytes */
                      }
@@ -162,24 +126,12 @@ morebytes:
                  }
              }
  
-            /* Remove the accumulated high bits */
-            ch -= offsetsFromUTF8[inBytes];
-
-            /*
-             * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
-             * - use only trail bytes after a lead byte (checked above)
-             * - use the right number of trail bytes for a given lead byte
-             * - encode a code point <= U+10ffff
-             * - use the fewest possible number of bytes for their code points
-             * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
-             *
-             * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
-             * There are no irregular sequences any more.
-             * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
-             */
-            if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
-                (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
+            // In CESU-8, only surrogates, not supplementary code points, are encoded directly.
+            if (i == inBytes && (!isCESU8 || i <= 3))
              {
+                /* Remove the accumulated high bits */
+                ch -= offsetsFromUTF8[inBytes];
+
                  /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
                  if (ch <= MAXIMUM_UCS2) 
                  {
@@ -189,9 +141,8 @@ morebytes:
                  else
                  {
                      /* write out the surrogates */
-                    ch -= HALF_BASE;
-                    *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
-                    ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
+                    *(myTarget++) = U16_LEAD(ch);
+                    ch = U16_TRAIL(ch);
                      if (myTarget < targetLimit)
                      {
                          *(myTarget++) = (UChar)ch;
@@ -256,7 +207,7 @@ static void  U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeAr
      while (mySource < sourceLimit && myTarget < targetLimit)
      {
          ch = *(mySource++);
-        if (ch < 0x80)        /* Simple case */
+        if (U8_IS_SINGLE(ch))        /* Simple case */
          {
              *(myTarget++) = (UChar) ch;
              *(myOffsets++) = offsetNum++;
@@ -264,7 +215,7 @@ static void  U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeAr
          else
          {
              toUBytes[0] = (char)ch;
-            inBytes = bytesFromUTF8[ch];
+            inBytes = U8_COUNT_BYTES_NON_ASCII(ch);
              i = 1;
  
  morebytes:
@@ -273,7 +224,8 @@ morebytes:
                  if (mySource < sourceLimit)
                  {
                      toUBytes[i] = (char) (ch2 = *mySource);
-                    if (!U8_IS_TRAIL(ch2))
+                    if (!icu::UTF8::isValidTrail(ch, ch2, i, inBytes) &&
+                            !(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2)))
                      {
                          break; /* i < inBytes */
                      }
@@ -290,24 +242,12 @@ morebytes:
                  }
              }
  
-            /* Remove the accumulated high bits */
-            ch -= offsetsFromUTF8[inBytes];
-
-            /*
-             * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
-             * - use only trail bytes after a lead byte (checked above)
-             * - use the right number of trail bytes for a given lead byte
-             * - encode a code point <= U+10ffff
-             * - use the fewest possible number of bytes for their code points
-             * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
-             *
-             * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
-             * There are no irregular sequences any more.
-             * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
-             */
-            if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
-                (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
+            // In CESU-8, only surrogates, not supplementary code points, are encoded directly.
+            if (i == inBytes && (!isCESU8 || i <= 3))
              {
+                /* Remove the accumulated high bits */
+                ch -= offsetsFromUTF8[inBytes];
+
                  /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
                  if (ch <= MAXIMUM_UCS2) 
                  {
@@ -318,10 +258,9 @@ morebytes:
                  else
                  {
                      /* write out the surrogates */
-                    ch -= HALF_BASE;
-                    *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
+                    *(myTarget++) = U16_LEAD(ch);
                      *(myOffsets++) = offsetNum;
-                    ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
+                    ch = U16_TRAIL(ch);
                      if (myTarget < targetLimit)
                      {
                          *(myTarget++) = (UChar)ch;
@@ -616,10 +555,9 @@ static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
      UConverter *cnv;
      const uint8_t *sourceInitial;
      const uint8_t *source;
-    uint16_t extraBytesToWrite;
      uint8_t myByte;
      UChar32 ch;
-    int8_t i, isLegalSequence;
+    int8_t i;
  
      /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
  
@@ -633,14 +571,14 @@ static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
      }
  
      myByte = (uint8_t)*(source++);
-    if (myByte < 0x80)
+    if (U8_IS_SINGLE(myByte))
      {
          args->source = (const char *)source;
          return (UChar32)myByte;
      }
  
-    extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
-    if (extraBytesToWrite == 0) {
+    uint16_t countTrailBytes = U8_COUNT_TRAIL_BYTES(myByte);
+    if (countTrailBytes == 0) {
          cnv->toUBytes[0] = myByte;
          cnv->toULength = 1;
          *err = U_ILLEGAL_CHAR_FOUND;
@@ -649,15 +587,17 @@ static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
      }
  
      /*The byte sequence is longer than the buffer area passed*/
-    if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
+    if (((const char *)source + countTrailBytes) > args->sourceLimit)
      {
          /* check if all of the remaining bytes are trail bytes */
+        uint16_t extraBytesToWrite = countTrailBytes + 1;
          cnv->toUBytes[0] = myByte;
          i = 1;
          *err = U_TRUNCATED_CHAR_FOUND;
          while(source < (const uint8_t *)args->sourceLimit) {
-            if(U8_IS_TRAIL(myByte = *source)) {
-                cnv->toUBytes[i++] = myByte;
+            uint8_t b = *source;
+            if(icu::UTF8::isValidTrail(myByte, b, i, extraBytesToWrite)) {
+                cnv->toUBytes[i++] = b;
                  ++source;
              } else {
                  /* error even before we run out of input */
@@ -670,81 +610,28 @@ static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
          return 0xffff;
      }
  
-    isLegalSequence = 1;
      ch = myByte << 6;
-    switch(extraBytesToWrite)
-    {     
-      /* note: code falls through cases! (sic)*/ 
-    case 6:
-        ch += (myByte = *source);
-        ch <<= 6;
-        if (!U8_IS_TRAIL(myByte))
-        {
-            isLegalSequence = 0;
-            break;
-        }
-        ++source;
-        U_FALLTHROUGH;
-    case 5:
-        ch += (myByte = *source);
-        ch <<= 6;
-        if (!U8_IS_TRAIL(myByte))
-        {
-            isLegalSequence = 0;
-            break;
+    if(countTrailBytes == 2) {
+        uint8_t t1 = *source, t2;
+        if(U8_IS_VALID_LEAD3_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source)) {
+            args->source = (const char *)(source + 1);
+            return (((ch + t1) << 6) + t2) - offsetsFromUTF8[3];
          }
-        ++source;
-        U_FALLTHROUGH;
-    case 4:
-        ch += (myByte = *source);
-        ch <<= 6;
-        if (!U8_IS_TRAIL(myByte))
-        {
-            isLegalSequence = 0;
-            break;
+    } else if(countTrailBytes == 1) {
+        uint8_t t1 = *source;
+        if(U8_IS_TRAIL(t1)) {
+            args->source = (const char *)(source + 1);
+            return (ch + t1) - offsetsFromUTF8[2];
          }
-        ++source;
-        U_FALLTHROUGH;
-    case 3:
-        ch += (myByte = *source);
-        ch <<= 6;
-        if (!U8_IS_TRAIL(myByte))
-        {
-            isLegalSequence = 0;
-            break;
+    } else {  // countTrailBytes == 3
+        uint8_t t1 = *source, t2, t3;
+        if(U8_IS_VALID_LEAD4_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source) &&
+                U8_IS_TRAIL(t3 = *++source)) {
+            args->source = (const char *)(source + 1);
+            return (((((ch + t1) << 6) + t2) << 6) + t3) - offsetsFromUTF8[4];
          }
-        ++source;
-        U_FALLTHROUGH;
-    case 2:
-        ch += (myByte = *source);
-        if (!U8_IS_TRAIL(myByte))
-        {
-            isLegalSequence = 0;
-            break;
-        }
-        ++source;
-    };
-    ch -= offsetsFromUTF8[extraBytesToWrite];
-    args->source = (const char *)source;
-
-    /*
-     * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
-     * - use only trail bytes after a lead byte (checked above)
-     * - use the right number of trail bytes for a given lead byte
-     * - encode a code point <= U+10ffff
-     * - use the fewest possible number of bytes for their code points
-     * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
-     *
-     * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
-     * There are no irregular sequences any more.
-     */
-    if (isLegalSequence &&
-        (uint32_t)ch <= MAXIMUM_UTF &&
-        (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
-        !U_IS_SURROGATE(ch)
-    ) {
-        return ch; /* return the code point */
      }
+    args->source = (const char *)source;
  
      for(i = 0; sourceInitial < source; ++i) {
          cnv->toUBytes[i] = *sourceInitial++;
@@ -757,14 +644,6 @@ U_CDECL_END
  
  /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
  
-/* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
-static const UChar32
-utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
-
-/* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
-static const UChar32
-utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
-
  U_CDECL_BEGIN
  /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
  static void U_CALLCONV
@@ -812,39 +691,35 @@ ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
          *pErrorCode=U_USING_DEFAULT_WARNING;
          return;
      } else {
-        /*
-         * Use a single counter for source and target, counting the minimum of
-         * the source length and the target capacity.
-         * As a result, the source length is checked only once per multi-byte
-         * character instead of twice.
-         *
-         * Make sure that the last byte sequence is complete, or else
-         * stop just before it.
-         * (The longest legal byte sequence has 3 trail bytes.)
-         * Count oldToULength (number of source bytes from a previous buffer)
-         * into the source length but reduce the source index by toULimit
-         * while going back over trail bytes in order to not go back into
-         * the bytes that will be read for finishing a partial
-         * sequence from the previous buffer.
-         * Let the standard converter handle edge cases.
-         */
-        int32_t i;
-
+        // Use a single counter for source and target, counting the minimum of
+        // the source length and the target capacity.
+        // Let the standard converter handle edge cases.
          if(count>targetCapacity) {
              count=targetCapacity;
          }
  
-        i=0;
-        while(i<3 && i<(count-toULimit)) {
-            b=source[count-oldToULength-i-1];
-            if(U8_IS_TRAIL(b)) {
-                ++i;
-            } else {
-                if(i<U8_COUNT_TRAIL_BYTES(b)) {
-                    /* stop converting before the lead byte if there are not enough trail bytes for it */
-                    count-=i+1;
+        // The conversion loop checks count>0 only once per 1/2/3-byte character.
+        // If the buffer ends with a truncated 2- or 3-byte sequence,
+        // then we reduce the count to stop before that,
+        // and collect the remaining bytes after the conversion loop.
+        {
+            // Do not go back into the bytes that will be read for finishing a partial
+            // sequence from the previous buffer.
+            int32_t length=count-toULimit;
+            if(length>0) {
+                uint8_t b1=*(sourceLimit-1);
+                if(U8_IS_SINGLE(b1)) {
+                    // common ASCII character
+                } else if(U8_IS_TRAIL(b1) && length>=2) {
+                    uint8_t b2=*(sourceLimit-2);
+                    if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
+                        // truncated 3-byte sequence
+                        count-=2;
+                    }
+                } else if(0xc2<=b1 && b1<0xf0) {
+                    // truncated 2- or 3-byte sequence
+                    --count;
                  }
-                break;
              }
          }
      }
@@ -859,17 +734,17 @@ ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
      /* conversion loop */
      while(count>0) {
          b=*source++;
-        if((int8_t)b>=0) {
+        if(U8_IS_SINGLE(b)) {
              /* convert ASCII */
              *target++=b;
              --count;
              continue;
          } else {
-            if(b>0xe0) {
-                if( /* handle U+1000..U+D7FF inline */
-                    (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) ||
-                                               (b==0xed && (t1 <= 0x9f))) &&
-                    (t2=source[1]) >= 0x80 && t2 <= 0xbf
+            if(b>=0xe0) {
+                if( /* handle U+0800..U+FFFF inline */
+                    b<0xf0 &&
+                    U8_IS_VALID_LEAD3_AND_T1(b, t1=source[0]) &&
+                    U8_IS_TRAIL(t2=source[1])
                  ) {
                      source+=2;
                      *target++=b;
@@ -878,10 +753,10 @@ ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
                      count-=3;
                      continue;
                  }
-            } else if(b<0xe0) {
+            } else {
                  if( /* handle U+0080..U+07FF inline */
                      b>=0xc2 &&
-                    (t1=*source) >= 0x80 && t1 <= 0xbf
+                    U8_IS_TRAIL(t1=*source)
                  ) {
                      ++source;
                      *target++=b;
@@ -889,30 +764,18 @@ ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
                      count-=2;
                      continue;
                  }
-            } else if(b==0xe0) {
-                if( /* handle U+0800..U+0FFF inline */
-                    (t1=source[0]) >= 0xa0 && t1 <= 0xbf &&
-                    (t2=source[1]) >= 0x80 && t2 <= 0xbf
-                ) {
-                    source+=2;
-                    *target++=b;
-                    *target++=t1;
-                    *target++=t2;
-                    count-=3;
-                    continue;
-                }
              }
  
              /* handle "complicated" and error cases, and continuing partial characters */
              oldToULength=0;
              toULength=1;
-            toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
+            toULimit=U8_COUNT_BYTES_NON_ASCII(b);
              c=b;
  moreBytes:
              while(toULength<toULimit) {
                  if(source<sourceLimit) {
                      b=*source;
-                    if(U8_IS_TRAIL(b)) {
+                    if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
                          ++source;
                          ++toULength;
                          c=(c<<6)+b;
@@ -934,18 +797,7 @@ moreBytes:
                  }
              }
  
-            if( toULength==toULimit &&      /* consumed all trail bytes */
-                (toULength==3 || toULength==2) &&             /* BMP */
-                (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
-                (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
-            ) {
-                /* legal byte sequence for BMP code point */
-            } else if(
-                toULength==toULimit && toULength==4 &&
-                (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
-            ) {
-                /* legal byte sequence for supplementary code point */
-            } else {
+            if(toULength!=toULimit) {
                  /* error handling: illegal UTF-8 byte sequence */
                  source-=(toULength-oldToULength);
                  while(oldToULength<toULength) {
@@ -979,7 +831,7 @@ moreBytes:
              *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
          } else {
              b=*source;
-            toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
+            toULimit=U8_COUNT_BYTES(b);
              if(toULimit>(sourceLimit-source)) {
                  /* collect a truncated byte sequence */
                  toULength=0;
diff --git a/icu4c/source/common/ucnvlat1.cpp b/icu4c/source/common/ucnvlat1.cpp

index 8aa5456b8cf442932c9ec000237529b867cc8ea2..23e918afe7a9d063d3f3e17c556d32ebfe43bdf8 100644 (file)
--- a/icu4c/source/common/ucnvlat1.cpp
+++ b/icu4c/source/common/ucnvlat1.cpp
@@ -23,6 +23,7 @@
  #include "unicode/utf8.h"
  #include "ucnv_bld.h"
  #include "ucnv_cnv.h"
+#include "ustr_imp.h"
  
  /* control optimizations according to the platform */
  #define LATIN1_UNROLL_FROM_UNICODE 1
@@ -374,7 +375,7 @@ ucnv_Latin1FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
      while(source<sourceLimit) {
          if(targetCapacity>0) {
              b=*source++;
-            if((int8_t)b>=0) {
+            if(U8_IS_SINGLE(b)) {
                  /* convert ASCII */
                  *target++=(uint8_t)b;
                  --targetCapacity;
@@ -409,7 +410,7 @@ ucnv_Latin1FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
      if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
          utf8->toUnicodeStatus=utf8->toUBytes[0]=b=*source++;
          utf8->toULength=1;
-        utf8->mode=U8_COUNT_TRAIL_BYTES(b)+1;
+        utf8->mode=U8_COUNT_BYTES(b);
      }
  
      /* write back the updated pointers */
diff --git a/icu4c/source/common/ucnvmbcs.cpp b/icu4c/source/common/ucnvmbcs.cpp

index 21a651f89684db6e0d6f2a46f51851362409375c..4b36cc605b16c778f2c63406b1062e4f23a7a112 100644 (file)
--- a/icu4c/source/common/ucnvmbcs.cpp
+++ b/icu4c/source/common/ucnvmbcs.cpp
@@ -59,6 +59,7 @@
  #include "cmemory.h"
  #include "cstring.h"
  #include "umutex.h"
+#include "ustr_imp.h"
  
  /* control optimizations according to the platform */
  #define MBCS_UNROLL_SINGLE_TO_BMP 1
@@ -5011,13 +5012,9 @@ ucnv_MBCSSingleFromUChar32(UConverterSharedData *sharedData,
  
  /* MBCS-from-UTF-8 conversion functions ------------------------------------- */
  
-/* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
-static const UChar32
-utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
-
  /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
  static const UChar32
-utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
+utf8_offsets[5]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
  
  static void U_CALLCONV
  ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
@@ -5075,28 +5072,27 @@ ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
          toULength=oldToULength=toULimit=0;
      }
  
-    /*
-     * Make sure that the last byte sequence before sourceLimit is complete
-     * or runs into a lead byte.
-     * Do not go back into the bytes that will be read for finishing a partial
-     * sequence from the previous buffer.
-     * In the conversion loop compare source with sourceLimit only once
-     * per multi-byte character.
-     */
+    // The conversion loop checks source<sourceLimit only once per 1/2/3-byte character.
+    // If the buffer ends with a truncated 2- or 3-byte sequence,
+    // then we reduce the sourceLimit to before that,
+    // and collect the remaining bytes after the conversion loop.
      {
-        int32_t i, length;
-
-        length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
-        for(i=0; i<3 && i<length;) {
-            b=*(sourceLimit-i-1);
-            if(U8_IS_TRAIL(b)) {
-                ++i;
-            } else {
-                if(i<U8_COUNT_TRAIL_BYTES(b)) {
-                    /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
-                    sourceLimit-=i+1;
+        // Do not go back into the bytes that will be read for finishing a partial
+        // sequence from the previous buffer.
+        int32_t length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
+        if(length>0) {
+            uint8_t b1=*(sourceLimit-1);
+            if(U8_IS_SINGLE(b1)) {
+                // common ASCII character
+            } else if(U8_IS_TRAIL(b1) && length>=2) {
+                uint8_t b2=*(sourceLimit-2);
+                if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
+                    // truncated 3-byte sequence
+                    sourceLimit-=2;
                  }
-                break;
+            } else if(0xc2<=b1 && b1<0xf0) {
+                // truncated 2- or 3-byte sequence
+                --sourceLimit;
              }
          }
      }
@@ -5130,7 +5126,7 @@ ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
      while(source<sourceLimit) {
          if(targetCapacity>0) {
              b=*source++;
-            if((int8_t)b>=0) {
+            if(U8_IS_SINGLE(b)) {
                  /* convert ASCII */
                  if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
                      *target++=(uint8_t)b;
@@ -5185,7 +5181,7 @@ ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
                      /* handle "complicated" and error cases, and continuing partial characters */
                      oldToULength=0;
                      toULength=1;
-                    toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
+                    toULimit=U8_COUNT_BYTES_NON_ASCII(b);
                      c=b;
  moreBytes:
                      while(toULength<toULimit) {
@@ -5198,7 +5194,7 @@ moreBytes:
                           */
                          if(source<(uint8_t *)pToUArgs->sourceLimit) {
                              b=*source;
-                            if(U8_IS_TRAIL(b)) {
+                            if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
                                  ++source;
                                  ++toULength;
                                  c=(c<<6)+b;
@@ -5220,22 +5216,18 @@ moreBytes:
                          }
                      }
  
-                    if( toULength==toULimit &&      /* consumed all trail bytes */
-                        (toULength==3 || toULength==2) &&             /* BMP */
-                        (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
-                        (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
-                    ) {
-                        value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
-                    } else if(
-                        toULength==toULimit && toULength==4 &&
-                        (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
-                    ) {
-                        /* supplementary code point */
-                        if(!hasSupplementary) {
-                            /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
-                            value=0;
-                        } else {
+                    if(toULength==toULimit) {
+                        c-=utf8_offsets[toULength];
+                        if(toULength<=3) {  /* BMP */
                              value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
+                        } else {
+                            /* supplementary code point */
+                            if(!hasSupplementary) {
+                                /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
+                                value=0;
+                            } else {
+                                value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
+                            }
                          }
                      } else {
                          /* error handling: illegal UTF-8 byte sequence */
@@ -5310,7 +5302,7 @@ moreBytes:
              source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
          c=utf8->toUBytes[0]=b=*source++;
          toULength=1;
-        toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
+        toULimit=U8_COUNT_BYTES(b);
          while(source<sourceLimit) {
              utf8->toUBytes[toULength++]=b=*source++;
              c=(c<<6)+b;
@@ -5375,28 +5367,27 @@ ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
          toULength=oldToULength=toULimit=0;
      }
  
-    /*
-     * Make sure that the last byte sequence before sourceLimit is complete
-     * or runs into a lead byte.
-     * Do not go back into the bytes that will be read for finishing a partial
-     * sequence from the previous buffer.
-     * In the conversion loop compare source with sourceLimit only once
-     * per multi-byte character.
-     */
+    // The conversion loop checks source<sourceLimit only once per 1/2/3-byte character.
+    // If the buffer ends with a truncated 2- or 3-byte sequence,
+    // then we reduce the sourceLimit to before that,
+    // and collect the remaining bytes after the conversion loop.
      {
-        int32_t i, length;
-
-        length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
-        for(i=0; i<3 && i<length;) {
-            b=*(sourceLimit-i-1);
-            if(U8_IS_TRAIL(b)) {
-                ++i;
-            } else {
-                if(i<U8_COUNT_TRAIL_BYTES(b)) {
-                    /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
-                    sourceLimit-=i+1;
+        // Do not go back into the bytes that will be read for finishing a partial
+        // sequence from the previous buffer.
+        int32_t length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
+        if(length>0) {
+            uint8_t b1=*(sourceLimit-1);
+            if(U8_IS_SINGLE(b1)) {
+                // common ASCII character
+            } else if(U8_IS_TRAIL(b1) && length>=2) {
+                uint8_t b2=*(sourceLimit-2);
+                if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
+                    // truncated 3-byte sequence
+                    sourceLimit-=2;
                  }
-                break;
+            } else if(0xc2<=b1 && b1<0xf0) {
+                // truncated 2- or 3-byte sequence
+                --sourceLimit;
              }
          }
      }
@@ -5412,7 +5403,7 @@ ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
      while(source<sourceLimit) {
          if(targetCapacity>0) {
              b=*source++;
-            if((int8_t)b>=0) {
+            if(U8_IS_SINGLE(b)) {
                  /* convert ASCII */
                  if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
                      *target++=b;
@@ -5426,13 +5417,13 @@ ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
                      }
                  }
              } else {
-                if(b>0xe0) {
-                    if( /* handle U+1000..U+D7FF inline */
-                        (((t1=(uint8_t)(source[0]-0x80), b<0xed) && (t1 <= 0x3f)) ||
-                                                        (b==0xed && (t1 <= 0x1f))) &&
+                if(b>=0xe0) {
+                    if( /* handle U+0800..U+D7FF inline */
+                        b<=0xed &&  // do not assume maxFastUChar>0xd7ff
+                        U8_IS_VALID_LEAD3_AND_T1(b, t1=source[0]) &&
                          (t2=(uint8_t)(source[1]-0x80)) <= 0x3f
                      ) {
-                        c=((b&0xf)<<6)|t1;
+                        c=((b&0xf)<<6)|(t1&0x3f);
                          source+=2;
                          value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t2);
                          if(value==0) {
@@ -5442,7 +5433,7 @@ ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
                      } else {
                          c=-1;
                      }
-                } else if(b<0xe0) {
+                } else {
                      if( /* handle U+0080..U+07FF inline */
                          b>=0xc2 &&
                          (t1=(uint8_t)(*source-0x80)) <= 0x3f
@@ -5457,15 +5448,13 @@ ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
                      } else {
                          c=-1;
                      }
-                } else {
-                    c=-1;
                  }
  
                  if(c<0) {
                      /* handle "complicated" and error cases, and continuing partial characters */
                      oldToULength=0;
                      toULength=1;
-                    toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
+                    toULimit=U8_COUNT_BYTES_NON_ASCII(b);
                      c=b;
  moreBytes:
                      while(toULength<toULimit) {
@@ -5478,7 +5467,7 @@ moreBytes:
                           */
                          if(source<(uint8_t *)pToUArgs->sourceLimit) {
                              b=*source;
-                            if(U8_IS_TRAIL(b)) {
+                            if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
                                  ++source;
                                  ++toULength;
                                  c=(c<<6)+b;
@@ -5500,22 +5489,18 @@ moreBytes:
                          }
                      }
  
-                    if( toULength==toULimit &&      /* consumed all trail bytes */
-                        (toULength==3 || toULength==2) &&             /* BMP */
-                        (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
-                        (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
-                    ) {
-                        stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
-                    } else if(
-                        toULength==toULimit && toULength==4 &&
-                        (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
-                    ) {
-                        /* supplementary code point */
-                        if(!hasSupplementary) {
-                            /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
-                            stage2Entry=0;
-                        } else {
+                    if(toULength==toULimit) {
+                        c-=utf8_offsets[toULength];
+                        if(toULength<=3) {  /* BMP */
                              stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
+                        } else {
+                            /* supplementary code point */
+                            if(!hasSupplementary) {
+                                /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
+                                stage2Entry=0;
+                            } else {
+                                stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
+                            }
                          }
                      } else {
                          /* error handling: illegal UTF-8 byte sequence */
@@ -5620,7 +5605,7 @@ unassigned:
              source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
          c=utf8->toUBytes[0]=b=*source++;
          toULength=1;
-        toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
+        toULimit=U8_COUNT_BYTES(b);
          while(source<sourceLimit) {
              utf8->toUBytes[toULength++]=b=*source++;
              c=(c<<6)+b;
diff --git a/icu4c/source/common/unicode/utf8.h b/icu4c/source/common/unicode/utf8.h

index 55024fdbfe5883f98cc839fe526b72bb8323d3c2..df08d341ab868afbb6782f93b7d650df2b72e0cb 100644 (file)
--- a/icu4c/source/common/unicode/utf8.h
+++ b/icu4c/source/common/unicode/utf8.h
@@ -53,8 +53,8 @@
   * @internal
   */
  #define U8_COUNT_TRAIL_BYTES(leadByte) \
-    ((uint8_t)(leadByte)<=0xf4 ? \
-        ((uint8_t)(leadByte)>=0xc2)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0) : 0)
+    (U8_IS_LEAD(leadByte) ? \
+        ((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0)+1 : 0)
  
  /**
   * Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence.
@@ -80,29 +80,35 @@
  #define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1)
  
  /**
- * Internal bit vector for 3-byte UTF-8 validity check.
- * Lead byte E0..EF bits 3..0 as byte index,
- * first trail byte bits 7..5 as bit index into that byte.
+ * Internal bit vector for 3-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD3_AND_T1.
+ * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence.
+ * Lead byte E0..EF bits 3..0 are used as byte index,
+ * first trail byte bits 7..5 are used as bit index into that byte.
+ * @see U8_IS_VALID_LEAD3_AND_T1
   * @internal
   */
  #define U8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"
  
  /**
   * Internal 3-byte UTF-8 validity check.
+ * Non-zero if lead byte E0..EF and first trail byte 00..FF start a valid sequence.
   * @internal
   */
  #define U8_IS_VALID_LEAD3_AND_T1(lead, t1) (U8_LEAD3_T1_BITS[(lead)&0xf]&(1<<((uint8_t)(t1)>>5)))
  
  /**
- * Internal bit vector for 4-byte UTF-8 validity check.
- * First trail byte bits 7..4 as byte index,
- * lead byte F0..F4 bits 2..0 as bit index into that byte.
+ * Internal bit vector for 4-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD4_AND_T1.
+ * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence.
+ * First trail byte bits 7..4 are used as byte index,
+ * lead byte F0..F4 bits 2..0 are used as bit index into that byte.
+ * @see U8_IS_VALID_LEAD4_AND_T1
   * @internal
   */
  #define U8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"
  
  /**
   * Internal 4-byte UTF-8 validity check.
+ * Non-zero if lead byte F0..F4 and first trail byte 00..FF start a valid sequence.
   * @internal
   */
  #define U8_IS_VALID_LEAD4_AND_T1(lead, t1) (U8_LEAD4_T1_BITS[(uint8_t)(t1)>>4]&(1<<((lead)&7)))
@@ -166,7 +172,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
  #define U8_IS_SINGLE(c) (((c)&0x80)==0)
  
  /**
- * Is this code unit (byte) a UTF-8 lead byte?
+ * Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4)
   * @param c 8-bit code unit (byte)
   * @return TRUE or FALSE
   * @stable ICU 2.4
@@ -175,7 +181,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
  // 0x32=0xf4-0xc2
  
  /**
- * Is this code unit (byte) a UTF-8 trail byte?
+ * Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF)
   * @param c 8-bit code unit (byte)
   * @return TRUE or FALSE
   * @stable ICU 2.4
diff --git a/icu4c/source/common/ustr_imp.h b/icu4c/source/common/ustr_imp.h

index 9815915ff5294589f6845dd8e0f3134c317b6cf1..c555ee37ea8096866e7584007aac6b0ab81c50bc 100644 (file)
--- a/icu4c/source/common/ustr_imp.h
+++ b/icu4c/source/common/ustr_imp.h
@@ -18,6 +18,7 @@
  #define __USTR_IMP_H__
  
  #include "unicode/utypes.h"
+#include "unicode/utf8.h"
  
  /**
   * Internal option for unorm_cmpEquivFold() for strncmp style.
@@ -81,4 +82,62 @@ u_terminateUChar32s(UChar32 *dest, int32_t destCapacity, int32_t length, UErrorC
  U_CAPI int32_t U_EXPORT2
  u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode);
  
+/**
+ * Counts the bytes (lead byte included) of any whole valid sequence for a UTF-8 lead byte.
+ * Returns 1 for ASCII 0..0x7f; every other byte is handed to U8_COUNT_BYTES_NON_ASCII.
+ * Returns 0 for 0x80..0xc1 as well as for 0xf5..0xff.
+ * leadByte might be evaluated multiple times, so it must not have side effects.
+ *
+ * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
+ * @return 0..4
+ */
+#define U8_COUNT_BYTES(leadByte) \
+    (U8_IS_SINGLE(leadByte) ? 1 : U8_COUNT_BYTES_NON_ASCII(leadByte))
+
+/**
+ * Counts the bytes (lead byte included) of any whole valid multi-byte sequence for a UTF-8 lead byte.
+ * Returns 0 for 0x00..0xc1 as well as for 0xf5..0xff, that is, for every byte that fails U8_IS_LEAD.
+ * leadByte might be evaluated multiple times, so it must not have side effects.
+ *
+ * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
+ * @return 0 or 2..4 (2 for 0xc2..0xdf, 3 for 0xe0..0xef, 4 for 0xf0..0xf4)
+ */
+#define U8_COUNT_BYTES_NON_ASCII(leadByte) \
+    (U8_IS_LEAD(leadByte) ? ((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0)+2 : 0)
+
+#ifdef __cplusplus
+
+U_NAMESPACE_BEGIN
+
+class UTF8 {  // holder for internal UTF-8 helper functions
+public:
+    UTF8() = delete;  // all members are static; the class cannot be instantiated
+
+    /**
+     * Is t a valid UTF-8 trail byte at position i of a sequence of the given length?
+     *
+     * @param prev Must be the preceding lead byte if i==1 and length>=3;
+     *             otherwise ignored.
+     * @param t The i-th byte following the lead byte.
+     * @param i The index (1..3) of byte t in the byte sequence. 0<i<length
+     * @param length The length (2..4) of the byte sequence according to the lead byte.
+     * @return TRUE if t is a valid trail byte in this context.
+     */
+    static inline UBool isValidTrail(int32_t prev, uint8_t t, int32_t i, int32_t length) {
+        // The first trail byte after a 3- or 4-byte lead byte
+        // needs to be validated together with its lead byte.
+        if (length <= 2 || i > 1) {  // 2-byte sequence, or a later trail byte: plain 0x80..0xBF range check
+            return U8_IS_TRAIL(t);
+        } else if (length == 3) {  // pair lookup in U8_LEAD3_T1_BITS (e.g. E0 needs A0..BF, ED needs 80..9F)
+            return U8_IS_VALID_LEAD3_AND_T1(prev, t);
+        } else {  // length == 4: pair lookup in U8_LEAD4_T1_BITS (e.g. F0 needs 90..BF, F4 needs 80..8F)
+            return U8_IS_VALID_LEAD4_AND_T1(prev, t);
+        }
+    }
+};
+
+U_NAMESPACE_END
+
+#endif  // __cplusplus
+
  #endif
diff --git a/icu4c/source/common/utf_impl.cpp b/icu4c/source/common/utf_impl.cpp

index b560a4f7aa40146ab4e3edea9e0ae8cfd470caea..f78c566e0988843b715965ee81232cb7dffd6651 100644 (file)
--- a/icu4c/source/common/utf_impl.cpp
+++ b/icu4c/source/common/utf_impl.cpp
@@ -281,13 +281,13 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U
                          }
                      }
                  }
-            } else if((0xf0<=b2 && b2<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
+            } else if(0xf0<=b2 && b2<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
                  // Truncated 4-byte sequence.
                  *pi=i;
                  return errorValue(2, strict);
              }
-        } else if(((0xe0<=b1 && b1<0xf0) && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
-                ((0xf0<=b1 && b1<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
+        } else if((0xe0<=b1 && b1<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
+                (0xf0<=b1 && b1<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
              // Truncated 3- or 4-byte sequence.
              *pi=i;
              return errorValue(1, strict);
@@ -318,12 +318,12 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) {
                          return i;
                      }
                  }
-            } else if((0xf0<=b2 && b2<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
+            } else if(0xf0<=b2 && b2<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
                  // Truncated 4-byte sequence.
                  return i;
              }
-        } else if(((0xe0<=b1 && b1<0xf0) && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
-                ((0xf0<=b1 && b1<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
+        } else if((0xe0<=b1 && b1<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
+                (0xf0<=b1 && b1<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
              // Truncated 3- or 4-byte sequence.
              return i;
          }
diff --git a/icu4c/source/test/cintltst/ccapitst.c b/icu4c/source/test/cintltst/ccapitst.c

index 4ae23ceed8a1cf3711dbff876f6cadd9a4884301..b5eb5f32e68edc00e22a5852bdf817e2facba716 100644 (file)
--- a/icu4c/source/test/cintltst/ccapitst.c
+++ b/icu4c/source/test/cintltst/ccapitst.c
@@ -2495,6 +2495,26 @@ static UBool getTestChar(UConverter *cnv, const char *converterName,
      return TRUE;
  }
  
+static UBool isOneTruncatedUTF8(const char *s, int32_t length) {  // is s[0..length) exactly one incomplete, so-far-valid UTF-8 sequence?
+    if(length==0) {
+        return FALSE;  // empty input holds no sequence at all
+    } else if(length==1) {
+        return U8_IS_LEAD(s[0]);  // a lone lead byte (0xC2..0xF4) always lacks its trail bytes
+    } else {
+        int32_t count=U8_COUNT_TRAIL_BYTES(s[0]);  // trail bytes called for by s[0]; 0 if s[0] is not a lead byte
+        if(length<=count) {
+            // 2 or more bytes, but fewer than the lead byte indicates.
+            int32_t oneLength=0;
+            U8_FWD_1(s, oneLength, length);  // advance oneLength past the first (possibly partial) sequence
+            // Truncated if we reach the end of the string.
+            // Not true if the lead byte and first trail byte do not start a valid sequence,
+            // e.g., E0 80 -> oneLength=1.
+            return oneLength==length;
+        }
+        return FALSE;  // enough bytes for a whole sequence (or s[0] is no lead byte): not a truncation case
+    }
+}
+
  static void testFromTruncatedUTF8(UConverter *utf8Cnv, UConverter *cnv, const char *converterName,
                                    char charUTF8[4], int32_t charUTF8Length,
                                    char char0[8], int32_t char0Length,
@@ -2526,7 +2546,7 @@ static void testFromTruncatedUTF8(UConverter *utf8Cnv, UConverter *cnv, const ch
      for(i=0; i<UPRV_LENGTHOF(badUTF8); ++i) {
          /* truncated sequence? */
          int32_t length=strlen(badUTF8[i]);
-        if(length>=(1+U8_COUNT_TRAIL_BYTES(badUTF8[i][0]))) {
+        if(!isOneTruncatedUTF8(badUTF8[i], length)) {
              continue;
          }
  
diff --git a/icu4c/source/test/cintltst/nccbtst.c b/icu4c/source/test/cintltst/nccbtst.c

index e48563072aa2e8b9f90d24a9deae2dff743dcd38..55780d5662223859113d33ea10fd0185e58c7c5c 100644 (file)
--- a/icu4c/source/test/cintltst/nccbtst.c
+++ b/icu4c/source/test/cintltst/nccbtst.c
@@ -1482,7 +1482,7 @@ static void TestSub(int32_t inputsize, int32_t outputsize)
          if(!testConvertFromUnicode(testinput, UPRV_LENGTHOF(testinput),
                  expectedUTF8, UPRV_LENGTHOF(expectedUTF8), "utf8",
                  UCNV_FROM_U_CALLBACK_SUBSTITUTE, offsets, NULL, 0 )) {
-            log_err("u-> utf8 with stop did not match.\n");
+            log_err("u-> utf8 with substitute did not match.\n");
          }
      }
  
@@ -1614,8 +1614,8 @@ static void TestSub(int32_t inputsize, int32_t outputsize)
      {
          const uint8_t sampleText1[] = { 0x31, 0xe4, 0xba, 0x8c, 
              0xe0, 0x80,  0x61,};
-        UChar    expected1[] = {  0x0031, 0x4e8c, 0xfffd, 0x0061};
-        int32_t offsets1[] = {   0x0000, 0x0001, 0x0004, 0x0006};
+        UChar    expected1[] = {  0x0031, 0x4e8c, 0xfffd, 0xfffd, 0x0061};
+        int32_t offsets1[] = {   0x0000, 0x0001, 0x0004, 0x0005, 0x0006};
  
          if(!testConvertToUnicode(sampleText1, UPRV_LENGTHOF(sampleText1),
                   expected1, UPRV_LENGTHOF(expected1),"utf8",
diff --git a/icu4c/source/test/cintltst/ncnvtst.c b/icu4c/source/test/cintltst/ncnvtst.c

index c1e5b4fdc6e210e66fcfb54e33e8561c801eeb66..255020a2e9c9f2e82454f660bdc6425c64082ab7 100644 (file)
--- a/icu4c/source/test/cintltst/ncnvtst.c
+++ b/icu4c/source/test/cintltst/ncnvtst.c
@@ -963,8 +963,8 @@ static void TestWithBufferSize(int32_t insize, int32_t outsize){
      {
          const uint8_t sampleText1[] = { 0x31, 0xe4, 0xba, 0x8c, 
              0xe0, 0x80,  0x61};
-        UChar    expected1[] = {  0x0031, 0x4e8c, 0xfffd, 0x0061};
-        int32_t offsets1[] = {   0x0000, 0x0001, 0x0004, 0x0006};
+        UChar    expected1[] = {  0x0031, 0x4e8c, 0xfffd, 0xfffd, 0x0061};
+        int32_t offsets1[] = {   0x0000, 0x0001, 0x0004, 0x0005, 0x0006};
  
          if(!testConvertToU(sampleText1, sizeof(sampleText1),
                   expected1, UPRV_LENGTHOF(expected1),"utf8", UCNV_TO_U_CALLBACK_SUBSTITUTE, offsets1,FALSE))
diff --git a/icu4c/source/test/cintltst/nucnvtst.c b/icu4c/source/test/cintltst/nucnvtst.c

index 3366b66999911178700d5609479ba9d5790ff7f1..7aa7a1beaf7e1fd7a54386c10aa2cef56648ef7e 100644 (file)
--- a/icu4c/source/test/cintltst/nucnvtst.c
+++ b/icu4c/source/test/cintltst/nucnvtst.c
@@ -1113,26 +1113,36 @@ static void TestNewConvertWithBufferSizes(int32_t outsize, int32_t insize )
              0xf4, 0x8f, 0xbf, 0xbf,         /* 10FFFF */
              0xdf, 0xbf,                     /* 7ff */
              0xbf,                           /* truncated tail */
-            0xf4, 0x90, 0x80, 0x80,         /* 11FFFF */
+            0xf4, 0x90, 0x80, 0x80,         /* 110000 */
              0x02
          };
  
          static const uint16_t utf8Expected[]={
              0x0061,
-            0xfffd,
+            0xfffd, 0xfffd, 0xfffd, 0xfffd,
              0x0000,
              0x0062,
-            0xfffd,
-            0xfffd,
+            0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
+            0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
              0xdbff, 0xdfff,
              0x07ff,
              0xfffd,
-            0xfffd,
+            0xfffd, 0xfffd, 0xfffd, 0xfffd,
              0x0002
          };
  
          static const int32_t utf8Offsets[]={
-            0, 1, 5, 6, 7, 12, 17, 17, 21, 23, 24, 28
+            0,
+            1, 2, 3, 4,
+            5,
+            6,
+            7, 8, 9, 10, 11,
+            12, 13, 14, 15, 16,
+            17, 17,
+            21,
+            23,
+            24, 25, 26, 27,
+            28
          };
          testConvertToU(utf8, sizeof(utf8),
                         utf8Expected, UPRV_LENGTHOF(utf8Expected), "utf-8", utf8Offsets ,FALSE);
diff --git a/icu4c/source/test/testdata/conversion.txt b/icu4c/source/test/testdata/conversion.txt

index 7b0272ab304df609685b4651e445749f12ab3286..bc8402fe4a2641c4365ab3a2ff22248be6b3ac82 100644 (file)
--- a/icu4c/source/test/testdata/conversion.txt
+++ b/icu4c/source/test/testdata/conversion.txt
@@ -763,9 +763,9 @@ conversion:table(nofallback) {
          // surrogates in CESU-8
          { "CESU-8", :bin{ eda080eda081edb081 }, "\ud800\U00010401", :intvector{ 0, 3, 6 }, :int{1}, :int{0}, "", "", :bin{""} }
          // e080 is a partial sequence
-        { "UTF-8", :bin{ 31ffe4ba8ce08061 }, "1\ufffd\u4e8c\ufffda", :intvector{ 0, 1, 2, 5, 7 }, :int{0}, :int{0}, "", "", :bin{ e080 } }
+        { "UTF-8", :bin{ 31ffe4ba8ce08061 }, "1\ufffd\u4e8c\ufffd\ufffda", :intvector{ 0, 1, 2, 5, 6, 7 }, :int{0}, :int{0}, "", "", :bin{ 80 } }
          // fbbfbfbfbf exceedes U+10ffff
-        { "UTF-8", :bin{ 31fbbfbfbfbf61 }, "1\ufffda", :intvector{ 0, 1, 6 }, :int{0}, :int{0}, "", "", :bin{ fbbfbfbfbf } }
+        { "UTF-8", :bin{ 31fbbfbfbfbf61 }, "1\ufffd\ufffd\ufffd\ufffd\ufffda", :intvector{ 0, 1, 2, 3, 4, 5, 6 }, :int{0}, :int{0}, "", "", :bin{ bf } }
  
          // lead byte a2 without trail byte
          { "ibm-1363", :bin{ a2aea2 }, "\u00a1", :intvector{ 0 }, :int{1}, :int{0}, "truncated", ".", :bin{ a2 } }
diff --git a/icu4j/main/classes/charset/src/com/ibm/icu/charset/CharsetUTF8.java b/icu4j/main/classes/charset/src/com/ibm/icu/charset/CharsetUTF8.java

index b1d0472536a6e29992ee836c9bf3e4cc1c3a450e..bd52379e7f3b0d50284ff82c236d0d3cee191238 100644 (file)
--- a/icu4j/main/classes/charset/src/com/ibm/icu/charset/CharsetUTF8.java
+++ b/icu4j/main/classes/charset/src/com/ibm/icu/charset/CharsetUTF8.java
@@ -36,26 +36,7 @@ class CharsetUTF8 extends CharsetICU {
          maxCharsPerByte = 1;
      }
  
-    private static final int BITMASK_FROM_UTF8[] = { -1, 0x7f, 0x1f, 0xf, 0x7, 0x3, 0x1 };
-
-    private static final byte BYTES_FROM_UTF8[] = {
-        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
-    };
-
-    /*
-     * Starting with Unicode 3.0.1: UTF-8 byte sequences of length N _must_ encode code points of or
-     * above utf8_minChar32[N]; byte sequences with more than 4 bytes are illegal in UTF-8, which is
-     * tested with impossible values for them
-     */
-    private static final int UTF8_MIN_CHAR32[] = { 0, 0, 0x80, 0x800, 0x10000,
-            Integer.MAX_VALUE, Integer.MAX_VALUE };
+    private static final int BITMASK_FROM_UTF8[] = { -1, 0x7f, 0x1f, 0xf, 0x7 };
  
      private final boolean isCESU8 = this instanceof CharsetCESU8;
  
@@ -92,9 +73,9 @@ class CharsetUTF8 extends CharsetICU {
  
                  if (mode == 0) {
                      /* nothing is stored in toUnicodeStatus, read a byte as input */
-                    char32 = (toUBytesArray[0] = sourceArray[sourceIndex++]) & 0xff;
-                    bytesExpected = BYTES_FROM_UTF8[char32];
-                    char32 &= BITMASK_FROM_UTF8[bytesExpected];
+                    toUBytesArray[0] = ch = sourceArray[sourceIndex++];
+                    bytesExpected = UTF8.countBytes(ch);
+                    char32 = ch & BITMASK_FROM_UTF8[bytesExpected];
                      bytesSoFar = 1;
                  } else {
                      /* a partially or fully built code point is stored in toUnicodeStatus */
@@ -118,8 +99,9 @@ class CharsetUTF8 extends CharsetICU {
                              cr = CoderResult.UNDERFLOW;
                              break;
                          }
-                        if (((ch = toUBytesArray[bytesSoFar] = sourceArray[sourceIndex++]) & 0xc0) != 0x80) {
-                            /* not a trail byte (is not of the form 10xxxxxx) */
+                        toUBytesArray[bytesSoFar] = ch = sourceArray[sourceIndex++];
+                        if (!UTF8.isValidTrail(char32, ch, bytesSoFar, bytesExpected)
+                                && !(isCESU8 && bytesSoFar == 1 && char32 == 0xd && UTF8.isTrail(ch))) {
                              sourceIndex--;
                              toULength = bytesSoFar;
                              cr = CoderResult.malformedForLength(bytesSoFar);
@@ -127,8 +109,7 @@ class CharsetUTF8 extends CharsetICU {
                          }
                          char32 = (char32 << 6) | (ch & 0x3f);
                          bytesSoFar++;
-                    } else if (bytesSoFar == bytesExpected && UTF8_MIN_CHAR32[bytesExpected] <= char32 && char32 <= 0x10ffff
-                            && (isCESU8 ? bytesExpected <= 3 : !UTF16.isSurrogate((char) char32))) {
+                    } else if (bytesSoFar == bytesExpected && (!isCESU8 || bytesSoFar <= 3)) {
                          /*
                           * char32 is a valid code point and is composed of the correct number of
                           * bytes ... we now need to output it in UTF-16
@@ -168,8 +149,8 @@ class CharsetUTF8 extends CharsetICU {
                          }
  
                          /* keep reading the next input (and writing it) while bytes == 1 */
-                        while ((bytesExpected = BYTES_FROM_UTF8[char32 = (toUBytesArray[0] = sourceArray[sourceIndex++]) & 0xff]) == 1) {
-                            targetArray[targetIndex++] = (char) char32;
+                        while (UTF8.isSingle(ch = sourceArray[sourceIndex++])) {
+                            targetArray[targetIndex++] = (char) ch;
                              if (sourceIndex >= sourceLimit) {
                                  cr = CoderResult.UNDERFLOW;
                                  break outer;
@@ -179,9 +160,11 @@ class CharsetUTF8 extends CharsetICU {
                                  break outer;
                              }
                          }
+                        toUBytesArray[0] = ch;
  
                          /* remove the bits that indicate the number of bytes */
-                        char32 &= BITMASK_FROM_UTF8[bytesExpected];
+                        bytesExpected = UTF8.countBytes(ch);
+                        char32 = ch & BITMASK_FROM_UTF8[bytesExpected];
                          bytesSoFar = 1;
                      } else {
                          /*
@@ -212,9 +195,9 @@ class CharsetUTF8 extends CharsetICU {
  
                  if (mode == 0) {
                      /* nothing is stored in toUnicodeStatus, read a byte as input */
-                    char32 = (toUBytesArray[0] = source.get(sourceIndex++)) & 0xff;
-                    bytesExpected = BYTES_FROM_UTF8[char32];
-                    char32 &= BITMASK_FROM_UTF8[bytesExpected];
+                    toUBytesArray[0] = ch = source.get(sourceIndex++);
+                    bytesExpected = UTF8.countBytes(ch);
+                    char32 = ch & BITMASK_FROM_UTF8[bytesExpected];
                      bytesSoFar = 1;
                  } else {
                      /* a partially or fully built code point is stored in toUnicodeStatus */
@@ -238,8 +221,9 @@ class CharsetUTF8 extends CharsetICU {
                              cr = CoderResult.UNDERFLOW;
                              break;
                          }
-                        if (((ch = toUBytesArray[bytesSoFar] = source.get(sourceIndex++)) & 0xc0) != 0x80) {
-                            /* not a trail byte (is not of the form 10xxxxxx) */
+                        toUBytesArray[bytesSoFar] = ch = source.get(sourceIndex++);
+                        if (!UTF8.isValidTrail(char32, ch, bytesSoFar, bytesExpected)
+                                && !(isCESU8 && bytesSoFar == 1 && char32 == 0xd && UTF8.isTrail(ch))) {
                              sourceIndex--;
                              toULength = bytesSoFar;
                              cr = CoderResult.malformedForLength(bytesSoFar);
@@ -247,21 +231,7 @@ class CharsetUTF8 extends CharsetICU {
                          }
                          char32 = (char32 << 6) | (ch & 0x3f);
                          bytesSoFar++;
-                    }
-                    /*
-                     * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
-                     * - use only trail bytes after a lead byte (checked above)
-                     * - use the right number of trail bytes for a given lead byte
-                     * - encode a code point <= U+10ffff
-                     * - use the fewest possible number of bytes for their code points
-                     * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
-                     *
-                     * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
-                     * There are no irregular sequences any more.
-                     * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
-                     */
-                    else if (bytesSoFar == bytesExpected && UTF8_MIN_CHAR32[bytesExpected] <= char32 && char32 <= 0x10ffff
-                            && (isCESU8 ? bytesExpected <= 3 : !UTF16.isSurrogate((char) char32))) {
+                    } else if (bytesSoFar == bytesExpected && (!isCESU8 || bytesSoFar <= 3)) {
                          /*
                           * char32 is a valid code point and is composed of the correct number of
                           * bytes ... we now need to output it in UTF-16
@@ -305,8 +275,8 @@ class CharsetUTF8 extends CharsetICU {
                          }
  
                          /* keep reading the next input (and writing it) while bytes == 1 */
-                        while ((bytesExpected = BYTES_FROM_UTF8[char32 = (toUBytesArray[0] = source.get(sourceIndex++)) & 0xff]) == 1) {
-                            target.put(targetIndex++, (char) char32);
+                        while (UTF8.isSingle(ch = source.get(sourceIndex++))) {
+                            target.put(targetIndex++, (char) ch);
                              if (sourceIndex >= sourceLimit) {
                                  cr = CoderResult.UNDERFLOW;
                                  break outer;
@@ -316,9 +286,11 @@ class CharsetUTF8 extends CharsetICU {
                                  break outer;
                              }
                          }
+                        toUBytesArray[0] = ch;
  
                          /* remove the bits that indicate the number of bytes */
-                        char32 &= BITMASK_FROM_UTF8[bytesExpected];
+                        bytesExpected = UTF8.countBytes(ch);
+                        char32 = ch & BITMASK_FROM_UTF8[bytesExpected];
                          bytesSoFar = 1;
                      } else {
                          /*
@@ -658,32 +630,6 @@ class CharsetUTF8 extends CharsetICU {
          return (byte) (0x80 | (char32 & 0x3f));
      }
  
-    /* single-code point definitions -------------------------------------------- */
-
-    /*
-     * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
-     * @param c 8-bit code unit (byte)
-     * @return TRUE or FALSE
-     */
-    // static final boolean isSingle(byte c) {return (((c)&0x80)==0);}
-    /*
-     * Is this code unit (byte) a UTF-8 lead byte?
-     * @param c 8-bit code unit (byte)
-     * @return TRUE or FALSE
-     */
-    // static final boolean isLead(byte c) {return ((((c)-0xc0) &
-    // UConverterConstants.UNSIGNED_BYTE_MASK)<0x3e);}
-    /*
-     * Is this code unit (byte) a UTF-8 trail byte?
-     *
-     * @param c
-     *            8-bit code unit (byte)
-     * @return TRUE or FALSE
-     */
-    /*private static final boolean isTrail(byte c) {
-        return (((c) & 0xc0) == 0x80);
-    }*/
-
      @Override
      public CharsetDecoder newDecoder() {
          return new CharsetDecoderUTF8(this);
diff --git a/icu4j/main/classes/charset/src/com/ibm/icu/charset/UTF8.java b/icu4j/main/classes/charset/src/com/ibm/icu/charset/UTF8.java

new file mode 100644 (file)

index 0000000..6dc39f4
--- /dev/null
+++ b/icu4j/main/classes/charset/src/com/ibm/icu/charset/UTF8.java
@@ -0,0 +1,172 @@
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package com.ibm.icu.charset;
+
+/**
+ * Partial Java port of ICU4C unicode/utf8.h and ustr_imp.h.
+ */
+class UTF8 {
+    /**
+     * Counts the trail bytes for a UTF-8 lead byte.
+     * Returns 0 for 0..0xc1 as well as for 0xf5..0xff.
+     *
+     * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
+     * @return 0..3
+     */
+    static int countTrailBytes(byte leadByte) {
+        if (leadByte < (byte)0xe0) {
+            return leadByte < (byte)0xc2 ? 0 : 1;
+        } else if (leadByte < (byte)0xf0) {
+            return 2;
+        } else {
+            return leadByte <= (byte)0xf4 ? 3 : 0;
+        }
+    }
+
+    /**
+     * Counts the bytes of any whole valid sequence for a UTF-8 lead byte.
+     * Returns 1 for ASCII 0..0x7f.
+     * Returns 0 for 0x80..0xc1 as well as for 0xf5..0xff.
+     *
+     * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
+     * @return 0..4
+     */
+    static int countBytes(byte leadByte) {
+        if (leadByte >= 0) {
+            return 1;
+        } else if (leadByte < (byte)0xe0) {
+            return leadByte < (byte)0xc2 ? 0 : 2;
+        } else if (leadByte < (byte)0xf0) {
+            return 3;
+        } else {
+            return leadByte <= (byte)0xf4 ? 4 : 0;
+        }
+    }
+
+    /**
+     * Internal bit vector for 3-byte UTF-8 validity check, for use in {@link #isValidLead3AndT1}.
+     * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence.
+     * Lead byte E0..EF bits 3..0 are used as data int index,
+     * first trail byte bits 7..5 are used as bit index into that int.
+     *
+     * @see #isValidLead3AndT1
+     */
+    private static final int[] U8_LEAD3_T1_BITS = {
+        0x20, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x10, 0x30, 0x30
+    };
+
+    /**
+     * Internal 3-byte UTF-8 validity check.
+     *
+     * @param lead E0..EF
+     * @param t1 00..FF
+     * @return true if lead byte E0..EF and first trail byte 00..FF start a valid sequence.
+     */
+    static boolean isValidLead3AndT1(int lead, byte t1) {
+        return (U8_LEAD3_T1_BITS[lead & 0xf] & (1 << ((t1 & 0xff) >> 5))) != 0;
+    }
+
+    /**
+     * Internal bit vector for 4-byte UTF-8 validity check, for use in {@link #isValidLead4AndT1}.
+     * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence.
+     * Lead byte F0..F4 bits 2..0 are used as data int index,
+     * first trail byte bits 7..4 are used as bit index into that int.
+     *
+     * @see #isValidLead4AndT1
+     */
+    private static final int[] U8_LEAD4_T1_BITS = {
+        0x0e00, 0x0f00, 0x0f00, 0x0f00, 0x0100
+    };
+
+    /**
+     * Internal 4-byte UTF-8 validity check.
+     *
+     * @param lead F0..F4
+     * @param t1 00..FF
+     * @return true if lead byte F0..F4 and first trail byte 00..FF start a valid sequence.
+     */
+    static boolean isValidLead4AndT1(int lead, byte t1) {
+        return (U8_LEAD4_T1_BITS[lead & 7] & (1 << ((t1 & 0xff) >> 4))) != 0;
+    }
+
+    /**
+     * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
+     *
+     * @param c 8-bit code unit (byte)
+     * @return true if c is an ASCII byte
+     */
+    static boolean isSingle(byte c) {
+        return c >= 0;
+    }
+
+    /**
+     * Is this code unit (byte) a UTF-8 lead byte?
+     *
+     * @param c 8-bit code unit (byte)
+     * @return true if c is a lead byte
+     */
+    static boolean isLead(byte c) {
+        return ((c - 0xc2) & 0xff) <= 0x32;  // 0x32=0xf4-0xc2
+    }
+
+    /**
+     * Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF)
+     *
+     * @param c 8-bit code unit (byte)
+     * @return true if c is a trail byte
+     */
+    static boolean isTrail(byte c) {
+        return c < (byte)0xc0;
+    }
+
+    /**
+     * How many code units (bytes) are used for the UTF-8 encoding
+     * of this Unicode code point?
+     *
+     * @param c 32-bit code point
+     * @return 1..4, or 0 if c is a surrogate or not a Unicode code point
+     */
+    static int length(int c) {
+        if (c >= 0) {
+            if (c <= 0x7f) {
+                return 1;
+            } else if (c <= 0x7ff) {
+                return 2;
+            } else if (c <= 0xd7ff) {
+                return 3;
+            } else if (c <= 0xffff) {
+                return c >= 0xe000 ? 3 : 0;
+            } else if (c <= 0x10ffff) {
+                return 4;
+            }
+        }
+        return 0;
+    }
+
+    /**
+     * 4: The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff).
+     */
+    static int MAX_LENGTH = 4;
+
+    /**
+     * Is t a valid UTF-8 trail byte?
+     *
+     * @param prev Must be the preceding lead byte if i==1 and length>=3;
+     *             otherwise ignored.
+     * @param t The i-th byte following the lead byte.
+     * @param i The index (1..3) of byte t in the byte sequence. 0<i<length
+     * @param length The length (2..4) of the byte sequence according to the lead byte.
+     * @return true if t is a valid trail byte in this context.
+     */
+    static boolean isValidTrail(int prev, byte t, int i, int length) {
+        // The first trail byte after a 3- or 4-byte lead byte
+        // needs to be validated together with its lead byte.
+        if (length <= 2 || i > 1) {
+            return isTrail(t);
+        } else if (length == 3) {
+            return isValidLead3AndT1(prev, t);
+        } else {  // length == 4
+            return isValidLead4AndT1(prev, t);
+        }
+    }
+}
diff --git a/icu4j/main/shared/data/testdata.jar b/icu4j/main/shared/data/testdata.jar

index b328d872ff9bd5e844ef7f02ea3648afc2274f49..c6c9435e54f7b86a50f0ac80da182903f9f2ac21 100755 (executable)
--- a/icu4j/main/shared/data/testdata.jar
+++ b/icu4j/main/shared/data/testdata.jar
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
-oid sha256:fd856769e94b963fb8a0b63148c63349198ef0c0ec3729173170ccbfd94c4999
-size 812769
+oid sha256:a99e848a9249a672092d5fc14d8fe02dc5728ad1f3548c287a9d1c5b12088013
+size 812760
diff --git a/icu4j/main/tests/charset/src/com/ibm/icu/dev/test/charset/TestCharset.java b/icu4j/main/tests/charset/src/com/ibm/icu/dev/test/charset/TestCharset.java

index 9a5dc189d3e2ab75f85752a345202dc87425f09a..5ffdd41a258f98b198aaaa52cb88597d3ccf127a 100644 (file)
--- a/icu4j/main/tests/charset/src/com/ibm/icu/dev/test/charset/TestCharset.java
+++ b/icu4j/main/tests/charset/src/com/ibm/icu/dev/test/charset/TestCharset.java
@@ -562,7 +562,7 @@ public class TestCharset extends TestFmwk {
                  cr = decoder.decode(bs, us, true);
                  bs.rewind();
                  us.rewind();
-                if (!cr.isMalformed() || cr.length() != 3) {
+                if (!cr.isMalformed() || cr.length() != 1) {
                      errln("Incorrect result in " + converter + " decoder for 0x"
                              + Integer.toHexString(i) + " received " + cr);
                      break;
@@ -584,7 +584,7 @@ public class TestCharset extends TestFmwk {
                  cr = decoder.decode(bs, us, true);
                  bs.rewind();
                  us.rewind();
-                if (!cr.isMalformed() || cr.length() != 3) {
+                if (!cr.isMalformed() || cr.length() != 1) {
                      errln("Incorrect result in " + converter + " decoder for 0x"
                              + Integer.toHexString(i) + " received " + cr);
                      break;
@@ -4653,7 +4653,7 @@ public class TestCharset extends TestFmwk {
          //decoding code coverage
          //test malform error
          decoder.reset();
-        bs.put((byte)0xC0); bs.put((byte)0xC0);
+        bs.put((byte)0xC2); bs.put((byte)0xC2);
          us.put((char)0x0000);
          bs2 = bs.asReadOnlyBuffer();
  
diff --git a/icu4j/main/tests/charset/src/com/ibm/icu/dev/test/charset/TestConversion.java b/icu4j/main/tests/charset/src/com/ibm/icu/dev/test/charset/TestConversion.java

index 1a1f2f305bb624e8ce61120daf45187f11afc024..67c1cf641bbcfc1709ee5ad4e9082003a05fd4e1 100644 (file)
--- a/icu4j/main/tests/charset/src/com/ibm/icu/dev/test/charset/TestConversion.java
+++ b/icu4j/main/tests/charset/src/com/ibm/icu/dev/test/charset/TestConversion.java
@@ -39,18 +39,18 @@ import junitparams.JUnitParamsRunner;
  import junitparams.Parameters;
  
  /**
- * This maps to convtest.c which tests the test file for data-driven conversion tests. 
- * 
+ * This maps to convtest.c which tests the test file for data-driven conversion tests.
+ *
   */
  @RunWith(JUnitParamsRunner.class)
  public class TestConversion extends TestFmwk {
      /**
       * This maps to the C struct of conversion case in convtest.h that stores the
       * data for a conversion test
-     * 
+     *
       */
      private class ConversionCase {
-        int caseNr;                                             // testcase index   
+        int caseNr;                                             // testcase index
          String option = null;                                   // callback options
          CodingErrorAction cbErrorAction = null;                 // callback action type
          CharBuffer toUnicodeResult = null;
@@ -64,7 +64,7 @@ public class TestConversion extends TestFmwk {
          boolean finalFlush;                                     // flush
          boolean fallbacks;                                      // fallback
          String outErrorCode;                                    // errorCode
-        String cbopt;                                           // callback 
+        String cbopt;                                           // callback
  
          // TestGetUnicodeSet variables
          String map;
@@ -91,7 +91,7 @@ public class TestConversion extends TestFmwk {
      }
  
      @SuppressWarnings("unused")
-    private List<TestDataPair> getTestData() throws Exception { 
+    private List<TestDataPair> getTestData() throws Exception {
          return ModuleTest.getTestData("com/ibm/icu/dev/data/testdata/", "conversion");
      }
  
@@ -132,7 +132,7 @@ public class TestConversion extends TestFmwk {
      // private methods -------------------------------------------------------
  
  
-    // fromUnicode test worker functions --------------------------------------- 
+    // fromUnicode test worker functions ---------------------------------------
      private void TestFromUnicode(DataMap testcase, int caseNr) {
  
          ConversionCase cc = new ConversionCase();
@@ -154,7 +154,7 @@ public class TestConversion extends TestFmwk {
              errln("error parsing conversion/toUnicode test case " + cc.caseNr);
              return;
          }
-        
+
          /*
           * Skip the following data driven converter tests.
           * These tests were added to the data driven conversion test in ICU
@@ -215,7 +215,7 @@ public class TestConversion extends TestFmwk {
                  break;
              }
  
-            // check for any options for the callback value -- 
+            // check for any options for the callback value --
              cc.option = cc.cbErrorAction == null ? cc.cbopt : cc.cbopt
                      .substring(1);
              if (cc.option == null) {
@@ -225,7 +225,7 @@ public class TestConversion extends TestFmwk {
          FromUnicodeCase(cc);
      }
  
-    
+
      private void FromUnicodeCase(ConversionCase cc) {
          // create charset encoder for conversion test
          CharsetProviderICU provider = new CharsetProviderICU();
@@ -238,7 +238,7 @@ public class TestConversion extends TestFmwk {
                          "com/ibm/icu/dev/data/testdata", this.getClass().getClassLoader())
                      : (Charset) provider.charsetForName(cc.charset);
              if (charset != null) {
-                encoder = (CharsetEncoder) charset.newEncoder();
+                encoder = charset.newEncoder();
                  encoder.onMalformedInput(CodingErrorAction.REPLACE);
                  encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
                  if (encoder instanceof CharsetEncoderICU) {
@@ -260,7 +260,7 @@ public class TestConversion extends TestFmwk {
              return;
          }
  
-        // set the callback for the encoder 
+        // set the callback for the encoder
          if (cc.cbErrorAction != null) {
              if (cc.cbEncoder != null) {
                  ((CharsetEncoderICU)encoder).setFromUCallback(CoderResult.malformedForLength(1), cc.cbEncoder, cc.option);
@@ -364,7 +364,7 @@ public class TestConversion extends TestFmwk {
              break;
          }
      }
-    
+
      private int stepFromUnicode(ConversionCase cc, CharsetEncoder encoder, int step) {
          if (step < 0) {
              errln("Negative step size, test internal error.");
@@ -387,7 +387,7 @@ public class TestConversion extends TestFmwk {
              currentSourceLimit = sourceLen;
              currentTargetLimit = targetLen;
          }
-        
+
          CoderResult cr = null;
  
          for (;;) {
@@ -529,7 +529,7 @@ public class TestConversion extends TestFmwk {
                          "com/ibm/icu/dev/data/testdata", this.getClass().getClassLoader())
                      : (Charset) provider.charsetForName(cc.charset);
              if (charset != null) {
-                decoder = (CharsetDecoder) charset.newDecoder();
+                decoder = charset.newDecoder();
                  decoder.onMalformedInput(CodingErrorAction.REPLACE);
                  decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
              }
@@ -588,13 +588,13 @@ public class TestConversion extends TestFmwk {
              }
          }
  
-        //      Check the step to unicode    
+        //      Check the step to unicode
          boolean ok;
          int resultLength;
  
          String steps[][] = { { "0", "bulk" }, // must be first for offsets to be checked
                  { "1", "step=1" }, { "3", "step=3" }, { "7", "step=7" } };
-        /* TODO: currently not supported test steps, getNext API is not supported for now  
+        /* TODO: currently not supported test steps, getNext API is not supported for now
           { "-1", "getNext" },
           { "-2", "toU(bulk)+getNext" },
           { "-3", "getNext+toU(bulk)" },
@@ -702,14 +702,14 @@ public class TestConversion extends TestFmwk {
                      target.limit(target.capacity());
                      flush = cc.finalFlush;
                  }
-                // convert 
+                // convert
                  CoderResult cr = null;
                  if (source.hasRemaining()) {
  
                      cr = decoder.decode(source, target, flush);
                      // check pointers and errors
                      if (cr.isOverflow()) {
-                        // the partial target is filled, set a new limit, 
+                        // the partial target is filled, set a new limit,
                          oStep = (target.position() + step);
                          target.limit((oStep < target.capacity()) ? oStep
                                  : target.capacity());
@@ -733,7 +733,7 @@ public class TestConversion extends TestFmwk {
  
                          cr = decoder.decode(source, target, true);
  
-                        //due to limitation of the API we need to check for target limit for expected 
+                        //due to limitation of the API we need to check for target limit for expected
                          if (target.position() != cc.unicode.length()) {
                              if (target.limit() != cc.unicode.length()) {
                                  target.limit(cc.unicode.length());
@@ -781,7 +781,7 @@ public class TestConversion extends TestFmwk {
                          if (cr.isOverflow()) {
  
                              if (target.limit() >= target.capacity()) {
-                                // target has reached its limit, an error occurred 
+                                // target has reached its limit, an error occurred
                                  logln("UnExpected error: Target Buffer is larger than capacity");
                                  break;
                              } else {
@@ -841,7 +841,7 @@ public class TestConversion extends TestFmwk {
                      }
                      CoderResult cr = decoder.decode(source, target, source
                              .limit() == sourceLen);
-                    // check pointers and errors 
+                    // check pointers and errors
                      if (cr.isOverflow()) {
                          // one character has been consumed
                          if (target.limit() >= target.capacity()) {
@@ -915,12 +915,12 @@ public class TestConversion extends TestFmwk {
                              "com/ibm/icu/dev/data/testdata", this.getClass().getClassLoader())
                              : (CharsetICU) provider.charsetForName(cc.charset);
  
-                    //checking for converter that are not supported at this point        
+                    //checking for converter that are not supported at this point
                      try{
                          if(charset==null ||
                                  charset.name()=="BOCU-1" ||charset.name()== "SCSU"|| charset.name()=="lmbcs1" || charset.name()== "lmbcs2" ||
                                  charset.name()== "lmbcs3" || charset.name()== "lmbcs4" || charset.name()=="lmbcs5" || charset.name()=="lmbcs6" ||
-                                charset.name()== "lmbcs8" || charset.name()=="lmbcs11" || charset.name()=="lmbcs16" || charset.name()=="lmbcs17" || 
+                                charset.name()== "lmbcs8" || charset.name()=="lmbcs11" || charset.name()=="lmbcs16" || charset.name()=="lmbcs17" ||
                                  charset.name()=="lmbcs18"|| charset.name()=="lmbcs19"){
                              logln("Converter not supported at this point :" + cc.charset);
                              return;
@@ -944,7 +944,7 @@ public class TestConversion extends TestFmwk {
                      charset.getUnicodeSet(unicodeset, cc.which);
                      UnicodeSet diffset = new UnicodeSet();
  
-                    //are there items that must be in unicodeset but are not?           
+                    //are there items that must be in unicodeset but are not?
                      (diffset = mapset).removeAll(unicodeset);
                      if(!diffset.isEmpty()){
                          StringBuffer s = new StringBuffer(diffset.toPattern(true));
@@ -975,11 +975,11 @@ public class TestConversion extends TestFmwk {
       * This follows ucnv.c method ucnv_detectUnicodeSignature() to detect the
       * start of the stream for example U+FEFF (the Unicode BOM/signature
       * character) that can be ignored.
-     * 
+     *
       * Detects Unicode signature byte sequences at the start of the byte stream
       * and returns number of bytes of the BOM of the indicated Unicode charset.
       * 0 is returned when no Unicode signature is recognized.
-     * 
+     *
       */
  
      private String detectUnicodeSignature(ByteBuffer source) {
author	Markus Scherer <markus.icu@gmail.com>
	Sat, 23 Sep 2017 06:34:53 +0000 (06:34 +0000)
committer	Markus Scherer <markus.icu@gmail.com>
	Sat, 23 Sep 2017 06:34:53 +0000 (06:34 +0000)
icu4c/source/common/ucnv_u8.cpp		patch \| blob \| history
icu4c/source/common/ucnvlat1.cpp		patch \| blob \| history
icu4c/source/common/ucnvmbcs.cpp		patch \| blob \| history
icu4c/source/common/unicode/utf8.h		patch \| blob \| history
icu4c/source/common/ustr_imp.h		patch \| blob \| history
icu4c/source/common/utf_impl.cpp		patch \| blob \| history
icu4c/source/test/cintltst/ccapitst.c		patch \| blob \| history
icu4c/source/test/cintltst/nccbtst.c		patch \| blob \| history
icu4c/source/test/cintltst/ncnvtst.c		patch \| blob \| history
icu4c/source/test/cintltst/nucnvtst.c		patch \| blob \| history
icu4c/source/test/testdata/conversion.txt		patch \| blob \| history
icu4j/main/classes/charset/src/com/ibm/icu/charset/CharsetUTF8.java		patch \| blob \| history
icu4j/main/classes/charset/src/com/ibm/icu/charset/UTF8.java	[new file with mode: 0644]	patch \| blob
icu4j/main/shared/data/testdata.jar		patch \| blob \| history
icu4j/main/tests/charset/src/com/ibm/icu/dev/test/charset/TestCharset.java		patch \| blob \| history
icu4j/main/tests/charset/src/com/ibm/icu/dev/test/charset/TestConversion.java		patch \| blob \| history