From: Markus Scherer Date: Sat, 23 Sep 2017 06:34:53 +0000 (+0000) Subject: ICU-13311 change illegal-UTF-8 handling in converter code X-Git-Tag: release-60-rc~115 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=fa2ddc86c7a410aed0144de34b7ff55668338edd;p=icu ICU-13311 change illegal-UTF-8 handling in converter code X-SVN-Rev: 40455 --- diff --git a/icu4c/source/common/ucnv_u8.cpp b/icu4c/source/common/ucnv_u8.cpp index 4419381fd6c..951988ed9ca 100644 --- a/icu4c/source/common/ucnv_u8.cpp +++ b/icu4c/source/common/ucnv_u8.cpp @@ -31,6 +31,7 @@ #include "ucnv_bld.h" #include "ucnv_cnv.h" #include "cmemory.h" +#include "ustr_imp.h" /* Prototypes --------------------------------------------------------------- */ @@ -44,51 +45,13 @@ U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args /* UTF-8 -------------------------------------------------------------------- */ -/* UTF-8 Conversion DATA - * for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9 - */ -/*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/ #define MAXIMUM_UCS2 0x0000FFFF -#define MAXIMUM_UTF 0x0010FFFF -#define MAXIMUM_UCS4 0x7FFFFFFF -#define HALF_SHIFT 10 -#define HALF_BASE 0x0010000 -#define HALF_MASK 0x3FF -#define SURROGATE_HIGH_START 0xD800 -#define SURROGATE_HIGH_END 0xDBFF -#define SURROGATE_LOW_START 0xDC00 -#define SURROGATE_LOW_END 0xDFFF - -/* -SURROGATE_LOW_START + HALF_BASE */ -#define SURROGATE_LOW_BASE 9216 - -static const uint32_t offsetsFromUTF8[7] = {0, - (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080, - (uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080 -}; -/* END OF UTF-8 Conversion DATA */ - -static const int8_t bytesFromUTF8[256] = { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 +static const uint32_t offsetsFromUTF8[5] = {0, + (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080, + (uint32_t) 0x03C82080 }; -/* - * Starting with Unicode 3.0.1: - * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N]; - * byte sequences with more than 4 bytes are illegal in UTF-8, - * which is tested with impossible values for them - */ -static const uint32_t -utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff }; - static UBool hasCESU8Data(const UConverter *cnv) { #if UCONFIG_ONLY_HTML_CONVERSION @@ -127,7 +90,7 @@ static void U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args, while (mySource < sourceLimit && myTarget < targetLimit) { ch = *(mySource++); - if (ch < 0x80) /* Simple case */ + if (U8_IS_SINGLE(ch)) /* Simple case */ { *(myTarget++) = (UChar) ch; } @@ -135,7 +98,7 @@ static void U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args, { /* store the first char */ toUBytes[0] = (char)ch; - inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */ + inBytes = U8_COUNT_BYTES_NON_ASCII(ch); /* lookup current sequence length */ i = 1; morebytes: @@ -144,7 +107,8 @@ morebytes: if (mySource < sourceLimit) { toUBytes[i] = (char) (ch2 = *mySource); - if (!U8_IS_TRAIL(ch2)) + if (!icu::UTF8::isValidTrail(ch, ch2, i, inBytes) && + !(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2))) { break; /* i < inBytes */ } @@ -162,24 +126,12 @@ morebytes: } } - /* Remove the accumulated high bits */ - ch -= offsetsFromUTF8[inBytes]; - - /* - * Legal UTF-8 byte sequences in Unicode 3.0.1 and up: - * - use only trail bytes after a lead byte (checked above) - * - use the right number of trail bytes for a given lead byte - * - encode a code point <= U+10ffff - * - use the fewest possible number of bytes for their code points - * - use at most 4 bytes (for i>=5 it is 0x10ffff= utf8_minChar32[i] && - (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch))) + // In CESU-8, only surrogates, not supplementary code points, are encoded directly. + if (i == inBytes && (!isCESU8 || i <= 3)) { + /* Remove the accumulated high bits */ + ch -= offsetsFromUTF8[inBytes]; + /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ if (ch <= MAXIMUM_UCS2) { @@ -189,9 +141,8 @@ morebytes: else { /* write out the surrogates */ - ch -= HALF_BASE; - *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START); - ch = (ch & HALF_MASK) + SURROGATE_LOW_START; + *(myTarget++) = U16_LEAD(ch); + ch = U16_TRAIL(ch); if (myTarget < targetLimit) { *(myTarget++) = (UChar)ch; @@ -256,7 +207,7 @@ static void U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeAr while (mySource < sourceLimit && myTarget < targetLimit) { ch = *(mySource++); - if (ch < 0x80) /* Simple case */ + if (U8_IS_SINGLE(ch)) /* Simple case */ { *(myTarget++) = (UChar) ch; *(myOffsets++) = offsetNum++; @@ -264,7 +215,7 @@ static void U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeAr else { toUBytes[0] = (char)ch; - inBytes = bytesFromUTF8[ch]; + inBytes = U8_COUNT_BYTES_NON_ASCII(ch); i = 1; morebytes: @@ -273,7 +224,8 @@ morebytes: if (mySource < sourceLimit) { toUBytes[i] = (char) (ch2 = *mySource); - if (!U8_IS_TRAIL(ch2)) + if (!icu::UTF8::isValidTrail(ch, ch2, i, inBytes) && + !(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2))) { break; /* i < inBytes */ } @@ -290,24 +242,12 @@ morebytes: } } - /* Remove the accumulated high bits */ - ch -= offsetsFromUTF8[inBytes]; - - /* - * Legal UTF-8 byte sequences in Unicode 3.0.1 and up: - * - use only trail bytes after a lead byte (checked above) - * - use the right number of trail bytes for a given lead byte - * - encode a code point <= U+10ffff - * - use the fewest possible number of bytes for their code points - * - use at most 4 bytes (for i>=5 it is 0x10ffff= utf8_minChar32[i] && - (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch))) + // In CESU-8, only surrogates, not supplementary code points, are encoded directly. + if (i == inBytes && (!isCESU8 || i <= 3)) { + /* Remove the accumulated high bits */ + ch -= offsetsFromUTF8[inBytes]; + /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ if (ch <= MAXIMUM_UCS2) { @@ -318,10 +258,9 @@ morebytes: else { /* write out the surrogates */ - ch -= HALF_BASE; - *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START); + *(myTarget++) = U16_LEAD(ch); *(myOffsets++) = offsetNum; - ch = (ch & HALF_MASK) + SURROGATE_LOW_START; + ch = U16_TRAIL(ch); if (myTarget < targetLimit) { *(myTarget++) = (UChar)ch; @@ -616,10 +555,9 @@ static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args, UConverter *cnv; const uint8_t *sourceInitial; const uint8_t *source; - uint16_t extraBytesToWrite; uint8_t myByte; UChar32 ch; - int8_t i, isLegalSequence; + int8_t i; /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */ @@ -633,14 +571,14 @@ static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args, } myByte = (uint8_t)*(source++); - if (myByte < 0x80) + if (U8_IS_SINGLE(myByte)) { args->source = (const char *)source; return (UChar32)myByte; } - extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte]; - if (extraBytesToWrite == 0) { + uint16_t countTrailBytes = U8_COUNT_TRAIL_BYTES(myByte); + if (countTrailBytes == 0) { cnv->toUBytes[0] = myByte; cnv->toULength = 1; *err = U_ILLEGAL_CHAR_FOUND; @@ -649,15 +587,17 @@ static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args, } /*The byte sequence is longer than the buffer area passed*/ - if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit) + if (((const char *)source + countTrailBytes) > args->sourceLimit) { /* check if all of the remaining bytes are trail bytes */ + uint16_t extraBytesToWrite = countTrailBytes + 1; cnv->toUBytes[0] = myByte; i = 1; *err = U_TRUNCATED_CHAR_FOUND; while(source < (const uint8_t *)args->sourceLimit) { - if(U8_IS_TRAIL(myByte = *source)) { - cnv->toUBytes[i++] = myByte; + uint8_t b = *source; + if(icu::UTF8::isValidTrail(myByte, b, i, extraBytesToWrite)) { + cnv->toUBytes[i++] = b; ++source; } else { /* error even before we run out of input */ @@ -670,81 +610,28 @@ static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args, return 0xffff; } - isLegalSequence = 1; ch = myByte << 6; - switch(extraBytesToWrite) - { - /* note: code falls through cases! (sic)*/ - case 6: - ch += (myByte = *source); - ch <<= 6; - if (!U8_IS_TRAIL(myByte)) - { - isLegalSequence = 0; - break; - } - ++source; - U_FALLTHROUGH; - case 5: - ch += (myByte = *source); - ch <<= 6; - if (!U8_IS_TRAIL(myByte)) - { - isLegalSequence = 0; - break; + if(countTrailBytes == 2) { + uint8_t t1 = *source, t2; + if(U8_IS_VALID_LEAD3_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source)) { + args->source = (const char *)(source + 1); + return (((ch + t1) << 6) + t2) - offsetsFromUTF8[3]; } - ++source; - U_FALLTHROUGH; - case 4: - ch += (myByte = *source); - ch <<= 6; - if (!U8_IS_TRAIL(myByte)) - { - isLegalSequence = 0; - break; + } else if(countTrailBytes == 1) { + uint8_t t1 = *source; + if(U8_IS_TRAIL(t1)) { + args->source = (const char *)(source + 1); + return (ch + t1) - offsetsFromUTF8[2]; } - ++source; - U_FALLTHROUGH; - case 3: - ch += (myByte = *source); - ch <<= 6; - if (!U8_IS_TRAIL(myByte)) - { - isLegalSequence = 0; - break; + } else { // countTrailBytes == 3 + uint8_t t1 = *source, t2, t3; + if(U8_IS_VALID_LEAD4_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source) && + U8_IS_TRAIL(t3 = *++source)) { + args->source = (const char *)(source + 1); + return (((((ch + t1) << 6) + t2) << 6) + t3) - offsetsFromUTF8[4]; } - ++source; - U_FALLTHROUGH; - case 2: - ch += (myByte = *source); - if (!U8_IS_TRAIL(myByte)) - { - isLegalSequence = 0; - break; - } - ++source; - }; - ch -= offsetsFromUTF8[extraBytesToWrite]; - args->source = (const char *)source; - - /* - * Legal UTF-8 byte sequences in Unicode 3.0.1 and up: - * - use only trail bytes after a lead byte (checked above) - * - use the right number of trail bytes for a given lead byte - * - encode a code point <= U+10ffff - * - use the fewest possible number of bytes for their code points - * - use at most 4 bytes (for i>=5 it is 0x10ffff= utf8_minChar32[extraBytesToWrite] && - !U_IS_SURROGATE(ch) - ) { - return ch; /* return the code point */ } + args->source = (const char *)source; for(i = 0; sourceInitial < source; ++i) { cnv->toUBytes[i] = *sourceInitial++; @@ -757,14 +644,6 @@ U_CDECL_END /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */ -/* minimum code point values for n-byte UTF-8 sequences, n=0..4 */ -static const UChar32 -utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 }; - -/* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */ -static const UChar32 -utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 }; - U_CDECL_BEGIN /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */ static void U_CALLCONV @@ -812,39 +691,35 @@ ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs, *pErrorCode=U_USING_DEFAULT_WARNING; return; } else { - /* - * Use a single counter for source and target, counting the minimum of - * the source length and the target capacity. - * As a result, the source length is checked only once per multi-byte - * character instead of twice. - * - * Make sure that the last byte sequence is complete, or else - * stop just before it. - * (The longest legal byte sequence has 3 trail bytes.) - * Count oldToULength (number of source bytes from a previous buffer) - * into the source length but reduce the source index by toULimit - * while going back over trail bytes in order to not go back into - * the bytes that will be read for finishing a partial - * sequence from the previous buffer. - * Let the standard converter handle edge cases. - */ - int32_t i; - + // Use a single counter for source and target, counting the minimum of + // the source length and the target capacity. + // Let the standard converter handle edge cases. if(count>targetCapacity) { count=targetCapacity; } - i=0; - while(i<3 && i<(count-toULimit)) { - b=source[count-oldToULength-i-1]; - if(U8_IS_TRAIL(b)) { - ++i; - } else { - if(i0 only once per 1/2/3-byte character. + // If the buffer ends with a truncated 2- or 3-byte sequence, + // then we reduce the count to stop before that, + // and collect the remaining bytes after the conversion loop. + { + // Do not go back into the bytes that will be read for finishing a partial + // sequence from the previous buffer. + int32_t length=count-toULimit; + if(length>0) { + uint8_t b1=*(sourceLimit-1); + if(U8_IS_SINGLE(b1)) { + // common ASCII character + } else if(U8_IS_TRAIL(b1) && length>=2) { + uint8_t b2=*(sourceLimit-2); + if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) { + // truncated 3-byte sequence + count-=2; + } + } else if(0xc2<=b1 && b1<0xf0) { + // truncated 2- or 3-byte sequence + --count; } - break; } } } @@ -859,17 +734,17 @@ ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs, /* conversion loop */ while(count>0) { b=*source++; - if((int8_t)b>=0) { + if(U8_IS_SINGLE(b)) { /* convert ASCII */ *target++=b; --count; continue; } else { - if(b>0xe0) { - if( /* handle U+1000..U+D7FF inline */ - (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) || - (b==0xed && (t1 <= 0x9f))) && - (t2=source[1]) >= 0x80 && t2 <= 0xbf + if(b>=0xe0) { + if( /* handle U+0800..U+FFFF inline */ + b<0xf0 && + U8_IS_VALID_LEAD3_AND_T1(b, t1=source[0]) && + U8_IS_TRAIL(t2=source[1]) ) { source+=2; *target++=b; @@ -878,10 +753,10 @@ ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs, count-=3; continue; } - } else if(b<0xe0) { + } else { if( /* handle U+0080..U+07FF inline */ b>=0xc2 && - (t1=*source) >= 0x80 && t1 <= 0xbf + U8_IS_TRAIL(t1=*source) ) { ++source; *target++=b; @@ -889,30 +764,18 @@ ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs, count-=2; continue; } - } else if(b==0xe0) { - if( /* handle U+0800..U+0FFF inline */ - (t1=source[0]) >= 0xa0 && t1 <= 0xbf && - (t2=source[1]) >= 0x80 && t2 <= 0xbf - ) { - source+=2; - *target++=b; - *target++=t1; - *target++=t2; - count-=3; - continue; - } } /* handle "complicated" and error cases, and continuing partial characters */ oldToULength=0; toULength=1; - toULimit=U8_COUNT_TRAIL_BYTES(b)+1; + toULimit=U8_COUNT_BYTES_NON_ASCII(b); c=b; moreBytes: while(toULength=utf8_minLegal[toULength] && - (c<=0xd7ff || 0xe000<=c) /* not a surrogate */ - ) { - /* legal byte sequence for BMP code point */ - } else if( - toULength==toULimit && toULength==4 && - (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff) - ) { - /* legal byte sequence for supplementary code point */ - } else { + if(toULength!=toULimit) { /* error handling: illegal UTF-8 byte sequence */ source-=(toULength-oldToULength); while(oldToULength(sourceLimit-source)) { /* collect a truncated byte sequence */ toULength=0; diff --git a/icu4c/source/common/ucnvlat1.cpp b/icu4c/source/common/ucnvlat1.cpp index 8aa5456b8cf..23e918afe7a 100644 --- a/icu4c/source/common/ucnvlat1.cpp +++ b/icu4c/source/common/ucnvlat1.cpp @@ -23,6 +23,7 @@ #include "unicode/utf8.h" #include "ucnv_bld.h" #include "ucnv_cnv.h" +#include "ustr_imp.h" /* control optimizations according to the platform */ #define LATIN1_UNROLL_FROM_UNICODE 1 @@ -374,7 +375,7 @@ ucnv_Latin1FromUTF8(UConverterFromUnicodeArgs *pFromUArgs, while(source0) { b=*source++; - if((int8_t)b>=0) { + if(U8_IS_SINGLE(b)) { /* convert ASCII */ *target++=(uint8_t)b; --targetCapacity; @@ -409,7 +410,7 @@ ucnv_Latin1FromUTF8(UConverterFromUnicodeArgs *pFromUArgs, if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) { utf8->toUnicodeStatus=utf8->toUBytes[0]=b=*source++; utf8->toULength=1; - utf8->mode=U8_COUNT_TRAIL_BYTES(b)+1; + utf8->mode=U8_COUNT_BYTES(b); } /* write back the updated pointers */ diff --git a/icu4c/source/common/ucnvmbcs.cpp b/icu4c/source/common/ucnvmbcs.cpp index 21a651f8968..4b36cc605b1 100644 --- a/icu4c/source/common/ucnvmbcs.cpp +++ b/icu4c/source/common/ucnvmbcs.cpp @@ -59,6 +59,7 @@ #include "cmemory.h" #include "cstring.h" #include "umutex.h" +#include "ustr_imp.h" /* control optimizations according to the platform */ #define MBCS_UNROLL_SINGLE_TO_BMP 1 @@ -5011,13 +5012,9 @@ ucnv_MBCSSingleFromUChar32(UConverterSharedData *sharedData, /* MBCS-from-UTF-8 conversion functions ------------------------------------- */ -/* minimum code point values for n-byte UTF-8 sequences, n=0..4 */ -static const UChar32 -utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 }; - /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */ static const UChar32 -utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 }; +utf8_offsets[5]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 }; static void U_CALLCONV ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, @@ -5075,28 +5072,27 @@ ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, toULength=oldToULength=toULimit=0; } - /* - * Make sure that the last byte sequence before sourceLimit is complete - * or runs into a lead byte. - * Do not go back into the bytes that will be read for finishing a partial - * sequence from the previous buffer. - * In the conversion loop compare source with sourceLimit only once - * per multi-byte character. - */ + // The conversion loop checks source0) { + uint8_t b1=*(sourceLimit-1); + if(U8_IS_SINGLE(b1)) { + // common ASCII character + } else if(U8_IS_TRAIL(b1) && length>=2) { + uint8_t b2=*(sourceLimit-2); + if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) { + // truncated 3-byte sequence + sourceLimit-=2; } - break; + } else if(0xc2<=b1 && b1<0xf0) { + // truncated 2- or 3-byte sequence + --sourceLimit; } } } @@ -5130,7 +5126,7 @@ ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, while(source0) { b=*source++; - if((int8_t)b>=0) { + if(U8_IS_SINGLE(b)) { /* convert ASCII */ if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) { *target++=(uint8_t)b; @@ -5185,7 +5181,7 @@ ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, /* handle "complicated" and error cases, and continuing partial characters */ oldToULength=0; toULength=1; - toULimit=U8_COUNT_TRAIL_BYTES(b)+1; + toULimit=U8_COUNT_BYTES_NON_ASCII(b); c=b; moreBytes: while(toULengthsourceLimit) { b=*source; - if(U8_IS_TRAIL(b)) { + if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) { ++source; ++toULength; c=(c<<6)+b; @@ -5220,22 +5216,18 @@ moreBytes: } } - if( toULength==toULimit && /* consumed all trail bytes */ - (toULength==3 || toULength==2) && /* BMP */ - (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] && - (c<=0xd7ff || 0xe000<=c) /* not a surrogate */ - ) { - value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); - } else if( - toULength==toULimit && toULength==4 && - (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff) - ) { - /* supplementary code point */ - if(!hasSupplementary) { - /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ - value=0; - } else { + if(toULength==toULimit) { + c-=utf8_offsets[toULength]; + if(toULength<=3) { /* BMP */ value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); + } else { + /* supplementary code point */ + if(!hasSupplementary) { + /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ + value=0; + } else { + value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); + } } } else { /* error handling: illegal UTF-8 byte sequence */ @@ -5310,7 +5302,7 @@ moreBytes: source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) { c=utf8->toUBytes[0]=b=*source++; toULength=1; - toULimit=U8_COUNT_TRAIL_BYTES(b)+1; + toULimit=U8_COUNT_BYTES(b); while(sourcetoUBytes[toULength++]=b=*source++; c=(c<<6)+b; @@ -5375,28 +5367,27 @@ ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, toULength=oldToULength=toULimit=0; } - /* - * Make sure that the last byte sequence before sourceLimit is complete - * or runs into a lead byte. - * Do not go back into the bytes that will be read for finishing a partial - * sequence from the previous buffer. - * In the conversion loop compare source with sourceLimit only once - * per multi-byte character. - */ + // The conversion loop checks source0) { + uint8_t b1=*(sourceLimit-1); + if(U8_IS_SINGLE(b1)) { + // common ASCII character + } else if(U8_IS_TRAIL(b1) && length>=2) { + uint8_t b2=*(sourceLimit-2); + if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) { + // truncated 3-byte sequence + sourceLimit-=2; } - break; + } else if(0xc2<=b1 && b1<0xf0) { + // truncated 2- or 3-byte sequence + --sourceLimit; } } } @@ -5412,7 +5403,7 @@ ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, while(source0) { b=*source++; - if((int8_t)b>=0) { + if(U8_IS_SINGLE(b)) { /* convert ASCII */ if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) { *target++=b; @@ -5426,13 +5417,13 @@ ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, } } } else { - if(b>0xe0) { - if( /* handle U+1000..U+D7FF inline */ - (((t1=(uint8_t)(source[0]-0x80), b<0xed) && (t1 <= 0x3f)) || - (b==0xed && (t1 <= 0x1f))) && + if(b>=0xe0) { + if( /* handle U+0800..U+D7FF inline */ + b<=0xed && // do not assume maxFastUChar>0xd7ff + U8_IS_VALID_LEAD3_AND_T1(b, t1=source[0]) && (t2=(uint8_t)(source[1]-0x80)) <= 0x3f ) { - c=((b&0xf)<<6)|t1; + c=((b&0xf)<<6)|(t1&0x3f); source+=2; value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t2); if(value==0) { @@ -5442,7 +5433,7 @@ ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, } else { c=-1; } - } else if(b<0xe0) { + } else { if( /* handle U+0080..U+07FF inline */ b>=0xc2 && (t1=(uint8_t)(*source-0x80)) <= 0x3f @@ -5457,15 +5448,13 @@ ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, } else { c=-1; } - } else { - c=-1; } if(c<0) { /* handle "complicated" and error cases, and continuing partial characters */ oldToULength=0; toULength=1; - toULimit=U8_COUNT_TRAIL_BYTES(b)+1; + toULimit=U8_COUNT_BYTES_NON_ASCII(b); c=b; moreBytes: while(toULengthsourceLimit) { b=*source; - if(U8_IS_TRAIL(b)) { + if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) { ++source; ++toULength; c=(c<<6)+b; @@ -5500,22 +5489,18 @@ moreBytes: } } - if( toULength==toULimit && /* consumed all trail bytes */ - (toULength==3 || toULength==2) && /* BMP */ - (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] && - (c<=0xd7ff || 0xe000<=c) /* not a surrogate */ - ) { - stage2Entry=MBCS_STAGE_2_FROM_U(table, c); - } else if( - toULength==toULimit && toULength==4 && - (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff) - ) { - /* supplementary code point */ - if(!hasSupplementary) { - /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ - stage2Entry=0; - } else { + if(toULength==toULimit) { + c-=utf8_offsets[toULength]; + if(toULength<=3) { /* BMP */ stage2Entry=MBCS_STAGE_2_FROM_U(table, c); + } else { + /* supplementary code point */ + if(!hasSupplementary) { + /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ + stage2Entry=0; + } else { + stage2Entry=MBCS_STAGE_2_FROM_U(table, c); + } } } else { /* error handling: illegal UTF-8 byte sequence */ @@ -5620,7 +5605,7 @@ unassigned: source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) { c=utf8->toUBytes[0]=b=*source++; toULength=1; - toULimit=U8_COUNT_TRAIL_BYTES(b)+1; + toULimit=U8_COUNT_BYTES(b); while(sourcetoUBytes[toULength++]=b=*source++; c=(c<<6)+b; diff --git a/icu4c/source/common/unicode/utf8.h b/icu4c/source/common/unicode/utf8.h index 55024fdbfe5..df08d341ab8 100644 --- a/icu4c/source/common/unicode/utf8.h +++ b/icu4c/source/common/unicode/utf8.h @@ -53,8 +53,8 @@ * @internal */ #define U8_COUNT_TRAIL_BYTES(leadByte) \ - ((uint8_t)(leadByte)<=0xf4 ? \ - ((uint8_t)(leadByte)>=0xc2)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0) : 0) + (U8_IS_LEAD(leadByte) ? \ + ((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0)+1 : 0) /** * Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence. @@ -80,29 +80,35 @@ #define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1) /** - * Internal bit vector for 3-byte UTF-8 validity check. - * Lead byte E0..EF bits 3..0 as byte index, - * first trail byte bits 7..5 as bit index into that byte. + * Internal bit vector for 3-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD3_AND_T1. + * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence. + * Lead byte E0..EF bits 3..0 are used as byte index, + * first trail byte bits 7..5 are used as bit index into that byte. + * @see U8_IS_VALID_LEAD3_AND_T1 * @internal */ #define U8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30" /** * Internal 3-byte UTF-8 validity check. + * Non-zero if lead byte E0..EF and first trail byte 00..FF start a valid sequence. * @internal */ #define U8_IS_VALID_LEAD3_AND_T1(lead, t1) (U8_LEAD3_T1_BITS[(lead)&0xf]&(1<<((uint8_t)(t1)>>5))) /** - * Internal bit vector for 4-byte UTF-8 validity check. - * First trail byte bits 7..4 as byte index, - * lead byte F0..F4 bits 2..0 as bit index into that byte. + * Internal bit vector for 4-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD4_AND_T1. + * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence. + * First trail byte bits 7..4 are used as byte index, + * lead byte F0..F4 bits 2..0 are used as bit index into that byte. + * @see U8_IS_VALID_LEAD4_AND_T1 * @internal */ #define U8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00" /** * Internal 4-byte UTF-8 validity check. + * Non-zero if lead byte F0..F4 and first trail byte 00..FF start a valid sequence. * @internal */ #define U8_IS_VALID_LEAD4_AND_T1(lead, t1) (U8_LEAD4_T1_BITS[(uint8_t)(t1)>>4]&(1<<((lead)&7))) @@ -166,7 +172,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); #define U8_IS_SINGLE(c) (((c)&0x80)==0) /** - * Is this code unit (byte) a UTF-8 lead byte? + * Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4) * @param c 8-bit code unit (byte) * @return TRUE or FALSE * @stable ICU 2.4 @@ -175,7 +181,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); // 0x32=0xf4-0xc2 /** - * Is this code unit (byte) a UTF-8 trail byte? + * Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF) * @param c 8-bit code unit (byte) * @return TRUE or FALSE * @stable ICU 2.4 diff --git a/icu4c/source/common/ustr_imp.h b/icu4c/source/common/ustr_imp.h index 9815915ff52..c555ee37ea8 100644 --- a/icu4c/source/common/ustr_imp.h +++ b/icu4c/source/common/ustr_imp.h @@ -18,6 +18,7 @@ #define __USTR_IMP_H__ #include "unicode/utypes.h" +#include "unicode/utf8.h" /** * Internal option for unorm_cmpEquivFold() for strncmp style. @@ -81,4 +82,62 @@ u_terminateUChar32s(UChar32 *dest, int32_t destCapacity, int32_t length, UErrorC U_CAPI int32_t U_EXPORT2 u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode); +/** + * Counts the bytes of any whole valid sequence for a UTF-8 lead byte. + * Returns 1 for ASCII 0..0x7f. + * Returns 0 for 0x80..0xc1 as well as for 0xf5..0xff. + * leadByte might be evaluated multiple times. + * + * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. + * @return 0..4 + */ +#define U8_COUNT_BYTES(leadByte) \ + (U8_IS_SINGLE(leadByte) ? 1 : U8_COUNT_BYTES_NON_ASCII(leadByte)) + +/** + * Counts the bytes of any whole valid sequence for a UTF-8 lead byte. + * Returns 0 for 0x00..0xc1 as well as for 0xf5..0xff. + * leadByte might be evaluated multiple times. + * + * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. + * @return 0 or 2..4 + */ +#define U8_COUNT_BYTES_NON_ASCII(leadByte) \ + (U8_IS_LEAD(leadByte) ? ((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0)+2 : 0) + +#ifdef __cplusplus + +U_NAMESPACE_BEGIN + +class UTF8 { +public: + UTF8() = delete; // all static + + /** + * Is t a valid UTF-8 trail byte? + * + * @param prev Must be the preceding lead byte if i==1 and length>=3; + * otherwise ignored. + * @param t The i-th byte following the lead byte. + * @param i The index (1..3) of byte t in the byte sequence. 0 1) { + return U8_IS_TRAIL(t); + } else if (length == 3) { + return U8_IS_VALID_LEAD3_AND_T1(prev, t); + } else { // length == 4 + return U8_IS_VALID_LEAD4_AND_T1(prev, t); + } + } +}; + +U_NAMESPACE_END + +#endif // __cplusplus + #endif diff --git a/icu4c/source/common/utf_impl.cpp b/icu4c/source/common/utf_impl.cpp index b560a4f7aa4..f78c566e098 100644 --- a/icu4c/source/common/utf_impl.cpp +++ b/icu4c/source/common/utf_impl.cpp @@ -281,13 +281,13 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U } } } - } else if((0xf0<=b2 && b2<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) { + } else if(0xf0<=b2 && b2<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) { // Truncated 4-byte sequence. *pi=i; return errorValue(2, strict); } - } else if(((0xe0<=b1 && b1<0xf0) && U8_IS_VALID_LEAD3_AND_T1(b1, c)) || - ((0xf0<=b1 && b1<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b1, c))) { + } else if((0xe0<=b1 && b1<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b1, c)) || + (0xf0<=b1 && b1<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b1, c))) { // Truncated 3- or 4-byte sequence. *pi=i; return errorValue(1, strict); @@ -318,12 +318,12 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) { return i; } } - } else if((0xf0<=b2 && b2<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) { + } else if(0xf0<=b2 && b2<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) { // Truncated 4-byte sequence. return i; } - } else if(((0xe0<=b1 && b1<0xf0) && U8_IS_VALID_LEAD3_AND_T1(b1, c)) || - ((0xf0<=b1 && b1<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b1, c))) { + } else if((0xe0<=b1 && b1<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b1, c)) || + (0xf0<=b1 && b1<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b1, c))) { // Truncated 3- or 4-byte sequence. return i; } diff --git a/icu4c/source/test/cintltst/ccapitst.c b/icu4c/source/test/cintltst/ccapitst.c index 4ae23ceed8a..b5eb5f32e68 100644 --- a/icu4c/source/test/cintltst/ccapitst.c +++ b/icu4c/source/test/cintltst/ccapitst.c @@ -2495,6 +2495,26 @@ static UBool getTestChar(UConverter *cnv, const char *converterName, return TRUE; } +static UBool isOneTruncatedUTF8(const char *s, int32_t length) { + if(length==0) { + return FALSE; + } else if(length==1) { + return U8_IS_LEAD(s[0]); + } else { + int32_t count=U8_COUNT_TRAIL_BYTES(s[0]); + if(length<=count) { + // 2 or more bytes, but fewer than the lead byte indicates. + int32_t oneLength=0; + U8_FWD_1(s, oneLength, length); + // Truncated if we reach the end of the string. + // Not true if the lead byte and first trail byte do not start a valid sequence, + // e.g., E0 80 -> oneLength=1. + return oneLength==length; + } + return FALSE; + } +} + static void testFromTruncatedUTF8(UConverter *utf8Cnv, UConverter *cnv, const char *converterName, char charUTF8[4], int32_t charUTF8Length, char char0[8], int32_t char0Length, @@ -2526,7 +2546,7 @@ static void testFromTruncatedUTF8(UConverter *utf8Cnv, UConverter *cnv, const ch for(i=0; i=(1+U8_COUNT_TRAIL_BYTES(badUTF8[i][0]))) { + if(!isOneTruncatedUTF8(badUTF8[i], length)) { continue; } diff --git a/icu4c/source/test/cintltst/nccbtst.c b/icu4c/source/test/cintltst/nccbtst.c index e48563072aa..55780d56622 100644 --- a/icu4c/source/test/cintltst/nccbtst.c +++ b/icu4c/source/test/cintltst/nccbtst.c @@ -1482,7 +1482,7 @@ static void TestSub(int32_t inputsize, int32_t outputsize) if(!testConvertFromUnicode(testinput, UPRV_LENGTHOF(testinput), expectedUTF8, UPRV_LENGTHOF(expectedUTF8), "utf8", UCNV_FROM_U_CALLBACK_SUBSTITUTE, offsets, NULL, 0 )) { - log_err("u-> utf8 with stop did not match.\n"); + log_err("u-> utf8 with substitute did not match.\n"); } } @@ -1614,8 +1614,8 @@ static void TestSub(int32_t inputsize, int32_t outputsize) { const uint8_t sampleText1[] = { 0x31, 0xe4, 0xba, 0x8c, 0xe0, 0x80, 0x61,}; - UChar expected1[] = { 0x0031, 0x4e8c, 0xfffd, 0x0061}; - int32_t offsets1[] = { 0x0000, 0x0001, 0x0004, 0x0006}; + UChar expected1[] = { 0x0031, 0x4e8c, 0xfffd, 0xfffd, 0x0061}; + int32_t offsets1[] = { 0x0000, 0x0001, 0x0004, 0x0005, 0x0006}; if(!testConvertToUnicode(sampleText1, UPRV_LENGTHOF(sampleText1), expected1, UPRV_LENGTHOF(expected1),"utf8", diff --git a/icu4c/source/test/cintltst/ncnvtst.c b/icu4c/source/test/cintltst/ncnvtst.c index c1e5b4fdc6e..255020a2e9c 100644 --- a/icu4c/source/test/cintltst/ncnvtst.c +++ b/icu4c/source/test/cintltst/ncnvtst.c @@ -963,8 +963,8 @@ static void TestWithBufferSize(int32_t insize, int32_t outsize){ { const uint8_t sampleText1[] = { 0x31, 0xe4, 0xba, 0x8c, 0xe0, 0x80, 0x61}; - UChar expected1[] = { 0x0031, 0x4e8c, 0xfffd, 0x0061}; - int32_t offsets1[] = { 0x0000, 0x0001, 0x0004, 0x0006}; + UChar expected1[] = { 0x0031, 0x4e8c, 0xfffd, 0xfffd, 0x0061}; + int32_t offsets1[] = { 0x0000, 0x0001, 0x0004, 0x0005, 0x0006}; if(!testConvertToU(sampleText1, sizeof(sampleText1), expected1, UPRV_LENGTHOF(expected1),"utf8", UCNV_TO_U_CALLBACK_SUBSTITUTE, offsets1,FALSE)) diff --git a/icu4c/source/test/cintltst/nucnvtst.c b/icu4c/source/test/cintltst/nucnvtst.c index 3366b669999..7aa7a1beaf7 100644 --- a/icu4c/source/test/cintltst/nucnvtst.c +++ b/icu4c/source/test/cintltst/nucnvtst.c @@ -1113,26 +1113,36 @@ static void TestNewConvertWithBufferSizes(int32_t outsize, int32_t insize ) 0xf4, 0x8f, 0xbf, 0xbf, /* 10FFFF */ 0xdf, 0xbf, /* 7ff */ 0xbf, /* truncated tail */ - 0xf4, 0x90, 0x80, 0x80, /* 11FFFF */ + 0xf4, 0x90, 0x80, 0x80, /* 110000 */ 0x02 }; static const uint16_t utf8Expected[]={ 0x0061, - 0xfffd, + 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0x0000, 0x0062, - 0xfffd, - 0xfffd, + 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, + 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xdbff, 0xdfff, 0x07ff, 0xfffd, - 0xfffd, + 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0x0002 }; static const int32_t utf8Offsets[]={ - 0, 1, 5, 6, 7, 12, 17, 17, 21, 23, 24, 28 + 0, + 1, 2, 3, 4, + 5, + 6, + 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, + 17, 17, + 21, + 23, + 24, 25, 26, 27, + 28 }; testConvertToU(utf8, sizeof(utf8), utf8Expected, UPRV_LENGTHOF(utf8Expected), "utf-8", utf8Offsets ,FALSE); diff --git a/icu4c/source/test/testdata/conversion.txt b/icu4c/source/test/testdata/conversion.txt index 7b0272ab304..bc8402fe4a2 100644 --- a/icu4c/source/test/testdata/conversion.txt +++ b/icu4c/source/test/testdata/conversion.txt @@ -763,9 +763,9 @@ conversion:table(nofallback) { // surrogates in CESU-8 { "CESU-8", :bin{ eda080eda081edb081 }, "\ud800\U00010401", :intvector{ 0, 3, 6 }, :int{1}, :int{0}, "", "", :bin{""} } // e080 is a partial sequence - { "UTF-8", :bin{ 31ffe4ba8ce08061 }, "1\ufffd\u4e8c\ufffda", :intvector{ 0, 1, 2, 5, 7 }, :int{0}, :int{0}, "", "", :bin{ e080 } } + { "UTF-8", :bin{ 31ffe4ba8ce08061 }, "1\ufffd\u4e8c\ufffd\ufffda", :intvector{ 0, 1, 2, 5, 6, 7 }, :int{0}, :int{0}, "", "", :bin{ 80 } } // fbbfbfbfbf exceedes U+10ffff - { "UTF-8", :bin{ 31fbbfbfbfbf61 }, "1\ufffda", :intvector{ 0, 1, 6 }, :int{0}, :int{0}, "", "", :bin{ fbbfbfbfbf } } + { "UTF-8", :bin{ 31fbbfbfbfbf61 }, "1\ufffd\ufffd\ufffd\ufffd\ufffda", :intvector{ 0, 1, 2, 3, 4, 5, 6 }, :int{0}, :int{0}, "", "", :bin{ bf } } // lead byte a2 without trail byte { "ibm-1363", :bin{ a2aea2 }, "\u00a1", :intvector{ 0 }, :int{1}, :int{0}, "truncated", ".", :bin{ a2 } } diff --git a/icu4j/main/classes/charset/src/com/ibm/icu/charset/CharsetUTF8.java b/icu4j/main/classes/charset/src/com/ibm/icu/charset/CharsetUTF8.java index b1d0472536a..bd52379e7f3 100644 --- a/icu4j/main/classes/charset/src/com/ibm/icu/charset/CharsetUTF8.java +++ b/icu4j/main/classes/charset/src/com/ibm/icu/charset/CharsetUTF8.java @@ -36,26 +36,7 @@ class CharsetUTF8 extends CharsetICU { maxCharsPerByte = 1; } - private static final int BITMASK_FROM_UTF8[] = { -1, 0x7f, 0x1f, 0xf, 0x7, 0x3, 0x1 }; - - private static final byte BYTES_FROM_UTF8[] = { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 - }; - - /* - * Starting with Unicode 3.0.1: UTF-8 byte sequences of length N _must_ encode code points of or - * above utf8_minChar32[N]; byte sequences with more than 4 bytes are illegal in UTF-8, which is - * tested with impossible values for them - */ - private static final int UTF8_MIN_CHAR32[] = { 0, 0, 0x80, 0x800, 0x10000, - Integer.MAX_VALUE, Integer.MAX_VALUE }; + private static final int BITMASK_FROM_UTF8[] = { -1, 0x7f, 0x1f, 0xf, 0x7 }; private final boolean isCESU8 = this instanceof CharsetCESU8; @@ -92,9 +73,9 @@ class CharsetUTF8 extends CharsetICU { if (mode == 0) { /* nothing is stored in toUnicodeStatus, read a byte as input */ - char32 = (toUBytesArray[0] = sourceArray[sourceIndex++]) & 0xff; - bytesExpected = BYTES_FROM_UTF8[char32]; - char32 &= BITMASK_FROM_UTF8[bytesExpected]; + toUBytesArray[0] = ch = sourceArray[sourceIndex++]; + bytesExpected = UTF8.countBytes(ch); + char32 = ch & BITMASK_FROM_UTF8[bytesExpected]; bytesSoFar = 1; } else { /* a partially or fully built code point is stored in toUnicodeStatus */ @@ -118,8 +99,9 @@ class CharsetUTF8 extends CharsetICU { cr = CoderResult.UNDERFLOW; break; } - if (((ch = toUBytesArray[bytesSoFar] = sourceArray[sourceIndex++]) & 0xc0) != 0x80) { - /* not a trail byte (is not of the form 10xxxxxx) */ + toUBytesArray[bytesSoFar] = ch = sourceArray[sourceIndex++]; + if (!UTF8.isValidTrail(char32, ch, bytesSoFar, bytesExpected) + && !(isCESU8 && bytesSoFar == 1 && char32 == 0xd && UTF8.isTrail(ch))) { sourceIndex--; toULength = bytesSoFar; cr = CoderResult.malformedForLength(bytesSoFar); @@ -127,8 +109,7 @@ class CharsetUTF8 extends CharsetICU { } char32 = (char32 << 6) | (ch & 0x3f); bytesSoFar++; - } else if (bytesSoFar == bytesExpected && UTF8_MIN_CHAR32[bytesExpected] <= char32 && char32 <= 0x10ffff - && (isCESU8 ? bytesExpected <= 3 : !UTF16.isSurrogate((char) char32))) { + } else if (bytesSoFar == bytesExpected && (!isCESU8 || bytesSoFar <= 3)) { /* * char32 is a valid code point and is composed of the correct number of * bytes ... we now need to output it in UTF-16 @@ -168,8 +149,8 @@ class CharsetUTF8 extends CharsetICU { } /* keep reading the next input (and writing it) while bytes == 1 */ - while ((bytesExpected = BYTES_FROM_UTF8[char32 = (toUBytesArray[0] = sourceArray[sourceIndex++]) & 0xff]) == 1) { - targetArray[targetIndex++] = (char) char32; + while (UTF8.isSingle(ch = sourceArray[sourceIndex++])) { + targetArray[targetIndex++] = (char) ch; if (sourceIndex >= sourceLimit) { cr = CoderResult.UNDERFLOW; break outer; @@ -179,9 +160,11 @@ class CharsetUTF8 extends CharsetICU { break outer; } } + toUBytesArray[0] = ch; /* remove the bits that indicate the number of bytes */ - char32 &= BITMASK_FROM_UTF8[bytesExpected]; + bytesExpected = UTF8.countBytes(ch); + char32 = ch & BITMASK_FROM_UTF8[bytesExpected]; bytesSoFar = 1; } else { /* @@ -212,9 +195,9 @@ class CharsetUTF8 extends CharsetICU { if (mode == 0) { /* nothing is stored in toUnicodeStatus, read a byte as input */ - char32 = (toUBytesArray[0] = source.get(sourceIndex++)) & 0xff; - bytesExpected = BYTES_FROM_UTF8[char32]; - char32 &= BITMASK_FROM_UTF8[bytesExpected]; + toUBytesArray[0] = ch = source.get(sourceIndex++); + bytesExpected = UTF8.countBytes(ch); + char32 = ch & BITMASK_FROM_UTF8[bytesExpected]; bytesSoFar = 1; } else { /* a partially or fully built code point is stored in toUnicodeStatus */ @@ -238,8 +221,9 @@ class CharsetUTF8 extends CharsetICU { cr = CoderResult.UNDERFLOW; break; } - if (((ch = toUBytesArray[bytesSoFar] = source.get(sourceIndex++)) & 0xc0) != 0x80) { - /* not a trail byte (is not of the form 10xxxxxx) */ + toUBytesArray[bytesSoFar] = ch = source.get(sourceIndex++); + if (!UTF8.isValidTrail(char32, ch, bytesSoFar, bytesExpected) + && !(isCESU8 && bytesSoFar == 1 && char32 == 0xd && UTF8.isTrail(ch))) { sourceIndex--; toULength = bytesSoFar; cr = CoderResult.malformedForLength(bytesSoFar); @@ -247,21 +231,7 @@ class CharsetUTF8 extends CharsetICU { } char32 = (char32 << 6) | (ch & 0x3f); bytesSoFar++; - } - /* - * Legal UTF-8 byte sequences in Unicode 3.0.1 and up: - * - use only trail bytes after a lead byte (checked above) - * - use the right number of trail bytes for a given lead byte - * - encode a code point <= U+10ffff - * - use the fewest possible number of bytes for their code points - * - use at most 4 bytes (for i>=5 it is 0x10ffff= sourceLimit) { cr = CoderResult.UNDERFLOW; break outer; @@ -316,9 +286,11 @@ class CharsetUTF8 extends CharsetICU { break outer; } } + toUBytesArray[0] = ch; /* remove the bits that indicate the number of bytes */ - char32 &= BITMASK_FROM_UTF8[bytesExpected]; + bytesExpected = UTF8.countBytes(ch); + char32 = ch & BITMASK_FROM_UTF8[bytesExpected]; bytesSoFar = 1; } else { /* @@ -658,32 +630,6 @@ class CharsetUTF8 extends CharsetICU { return (byte) (0x80 | (char32 & 0x3f)); } - /* single-code point definitions -------------------------------------------- */ - - /* - * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)? - * @param c 8-bit code unit (byte) - * @return TRUE or FALSE - */ - // static final boolean isSingle(byte c) {return (((c)&0x80)==0);} - /* - * Is this code unit (byte) a UTF-8 lead byte? - * @param c 8-bit code unit (byte) - * @return TRUE or FALSE - */ - // static final boolean isLead(byte c) {return ((((c)-0xc0) & - // UConverterConstants.UNSIGNED_BYTE_MASK)<0x3e);} - /* - * Is this code unit (byte) a UTF-8 trail byte? - * - * @param c - * 8-bit code unit (byte) - * @return TRUE or FALSE - */ - /*private static final boolean isTrail(byte c) { - return (((c) & 0xc0) == 0x80); - }*/ - @Override public CharsetDecoder newDecoder() { return new CharsetDecoderUTF8(this); diff --git a/icu4j/main/classes/charset/src/com/ibm/icu/charset/UTF8.java b/icu4j/main/classes/charset/src/com/ibm/icu/charset/UTF8.java new file mode 100644 index 00000000000..6dc39f486e2 --- /dev/null +++ b/icu4j/main/classes/charset/src/com/ibm/icu/charset/UTF8.java @@ -0,0 +1,172 @@ +// © 2017 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html#License +package com.ibm.icu.charset; + +/** + * Partial Java port of ICU4C unicode/utf8.h and ustr_imp.h. + */ +class UTF8 { + /** + * Counts the trail bytes for a UTF-8 lead byte. + * Returns 0 for 0..0xc1 as well as for 0xf5..0xff. + * + * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. + * @return 0..3 + */ + static int countTrailBytes(byte leadByte) { + if (leadByte < (byte)0xe0) { + return leadByte < (byte)0xc2 ? 0 : 1; + } else if (leadByte < (byte)0xf0) { + return 2; + } else { + return leadByte <= (byte)0xf4 ? 3 : 0; + } + } + + /** + * Counts the bytes of any whole valid sequence for a UTF-8 lead byte. + * Returns 1 for ASCII 0..0x7f. + * Returns 0 for 0x80..0xc1 as well as for 0xf5..0xff. + * + * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. + * @return 0..4 + */ + static int countBytes(byte leadByte) { + if (leadByte >= 0) { + return 1; + } else if (leadByte < (byte)0xe0) { + return leadByte < (byte)0xc2 ? 0 : 2; + } else if (leadByte < (byte)0xf0) { + return 3; + } else { + return leadByte <= (byte)0xf4 ? 4 : 0; + } + } + + /** + * Internal bit vector for 3-byte UTF-8 validity check, for use in {@link #isValidLead3AndT1}. + * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence. + * Lead byte E0..EF bits 3..0 are used as data int index, + * first trail byte bits 7..5 are used as bit index into that int. + * + * @see #isValidLead3AndT1 + */ + private static final int[] U8_LEAD3_T1_BITS = { + 0x20, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x10, 0x30, 0x30 + }; + + /** + * Internal 3-byte UTF-8 validity check. + * + * @param lead E0..EF + * @param t1 00..FF + * @return true if lead byte E0..EF and first trail byte 00..FF start a valid sequence. + */ + static boolean isValidLead3AndT1(int lead, byte t1) { + return (U8_LEAD3_T1_BITS[lead & 0xf] & (1 << ((t1 & 0xff) >> 5))) != 0; + } + + /** + * Internal bit vector for 4-byte UTF-8 validity check, for use in {@link #isValidLead4AndT1}. + * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence. + * Lead byte F0..F4 bits 2..0 are used as data int index, + * first trail byte bits 7..4 are used as bit index into that int. + * + * @see #isValidLead4AndT1 + */ + private static final int[] U8_LEAD4_T1_BITS = { + 0x0e00, 0x0f00, 0x0f00, 0x0f00, 0x0100 + }; + + /** + * Internal 4-byte UTF-8 validity check. + * + * @param lead F0..F4 + * @param t1 00..FF + * @return true if lead byte F0..F4 and first trail byte 00..FF start a valid sequence. + */ + static boolean isValidLead4AndT1(int lead, byte t1) { + return (U8_LEAD4_T1_BITS[lead & 7] & (1 << ((t1 & 0xff) >> 4))) != 0; + } + + /** + * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)? + * + * @param c 8-bit code unit (byte) + * @return true if c is an ASCII byte + */ + static boolean isSingle(byte c) { + return c >= 0; + } + + /** + * Is this code unit (byte) a UTF-8 lead byte? + * + * @param c 8-bit code unit (byte) + * @return true if c is a lead byte + */ + static boolean isLead(byte c) { + return ((c - 0xc2) & 0xff) <= 0x32; // 0x32=0xf4-0xc2 + } + + /** + * Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF) + * + * @param c 8-bit code unit (byte) + * @return true if c is a trail byte + */ + static boolean isTrail(byte c) { + return c < (byte)0xc0; + } + + /** + * How many code units (bytes) are used for the UTF-8 encoding + * of this Unicode code point? + * + * @param c 32-bit code point + * @return 1..4, or 0 if c is a surrogate or not a Unicode code point + */ + static int length(int c) { + if (c >= 0) { + if (c <= 0x7f) { + return 1; + } else if (c <= 0x7ff) { + return 2; + } else if (c <= 0xd7ff) { + return 3; + } else if (c <= 0xffff) { + return c >= 0xe000 ? 3 : 0; + } else if (c <= 0x10ffff) { + return 4; + } + } + return 0; + } + + /** + * 4: The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff). + */ + static int MAX_LENGTH = 4; + + /** + * Is t a valid UTF-8 trail byte? + * + * @param prev Must be the preceding lead byte if i==1 and length>=3; + * otherwise ignored. + * @param t The i-th byte following the lead byte. + * @param i The index (1..3) of byte t in the byte sequence. 0 1) { + return isTrail(t); + } else if (length == 3) { + return isValidLead3AndT1(prev, t); + } else { // length == 4 + return isValidLead4AndT1(prev, t); + } + } +} diff --git a/icu4j/main/shared/data/testdata.jar b/icu4j/main/shared/data/testdata.jar index b328d872ff9..c6c9435e54f 100755 --- a/icu4j/main/shared/data/testdata.jar +++ b/icu4j/main/shared/data/testdata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd856769e94b963fb8a0b63148c63349198ef0c0ec3729173170ccbfd94c4999 -size 812769 +oid sha256:a99e848a9249a672092d5fc14d8fe02dc5728ad1f3548c287a9d1c5b12088013 +size 812760 diff --git a/icu4j/main/tests/charset/src/com/ibm/icu/dev/test/charset/TestCharset.java b/icu4j/main/tests/charset/src/com/ibm/icu/dev/test/charset/TestCharset.java index 9a5dc189d3e..5ffdd41a258 100644 --- a/icu4j/main/tests/charset/src/com/ibm/icu/dev/test/charset/TestCharset.java +++ b/icu4j/main/tests/charset/src/com/ibm/icu/dev/test/charset/TestCharset.java @@ -562,7 +562,7 @@ public class TestCharset extends TestFmwk { cr = decoder.decode(bs, us, true); bs.rewind(); us.rewind(); - if (!cr.isMalformed() || cr.length() != 3) { + if (!cr.isMalformed() || cr.length() != 1) { errln("Incorrect result in " + converter + " decoder for 0x" + Integer.toHexString(i) + " received " + cr); break; @@ -584,7 +584,7 @@ public class TestCharset extends TestFmwk { cr = decoder.decode(bs, us, true); bs.rewind(); us.rewind(); - if (!cr.isMalformed() || cr.length() != 3) { + if (!cr.isMalformed() || cr.length() != 1) { errln("Incorrect result in " + converter + " decoder for 0x" + Integer.toHexString(i) + " received " + cr); break; @@ -4653,7 +4653,7 @@ public class TestCharset extends TestFmwk { //decoding code coverage //test malform error decoder.reset(); - bs.put((byte)0xC0); bs.put((byte)0xC0); + bs.put((byte)0xC2); bs.put((byte)0xC2); us.put((char)0x0000); bs2 = bs.asReadOnlyBuffer(); diff --git a/icu4j/main/tests/charset/src/com/ibm/icu/dev/test/charset/TestConversion.java b/icu4j/main/tests/charset/src/com/ibm/icu/dev/test/charset/TestConversion.java index 1a1f2f305bb..67c1cf641bb 100644 --- a/icu4j/main/tests/charset/src/com/ibm/icu/dev/test/charset/TestConversion.java +++ b/icu4j/main/tests/charset/src/com/ibm/icu/dev/test/charset/TestConversion.java @@ -39,18 +39,18 @@ import junitparams.JUnitParamsRunner; import junitparams.Parameters; /** - * This maps to convtest.c which tests the test file for data-driven conversion tests. - * + * This maps to convtest.c which tests the test file for data-driven conversion tests. + * */ @RunWith(JUnitParamsRunner.class) public class TestConversion extends TestFmwk { /** * This maps to the C struct of conversion case in convtest.h that stores the * data for a conversion test - * + * */ private class ConversionCase { - int caseNr; // testcase index + int caseNr; // testcase index String option = null; // callback options CodingErrorAction cbErrorAction = null; // callback action type CharBuffer toUnicodeResult = null; @@ -64,7 +64,7 @@ public class TestConversion extends TestFmwk { boolean finalFlush; // flush boolean fallbacks; // fallback String outErrorCode; // errorCode - String cbopt; // callback + String cbopt; // callback // TestGetUnicodeSet variables String map; @@ -91,7 +91,7 @@ public class TestConversion extends TestFmwk { } @SuppressWarnings("unused") - private List getTestData() throws Exception { + private List getTestData() throws Exception { return ModuleTest.getTestData("com/ibm/icu/dev/data/testdata/", "conversion"); } @@ -132,7 +132,7 @@ public class TestConversion extends TestFmwk { // private methods ------------------------------------------------------- - // fromUnicode test worker functions --------------------------------------- + // fromUnicode test worker functions --------------------------------------- private void TestFromUnicode(DataMap testcase, int caseNr) { ConversionCase cc = new ConversionCase(); @@ -154,7 +154,7 @@ public class TestConversion extends TestFmwk { errln("error parsing conversion/toUnicode test case " + cc.caseNr); return; } - + /* * Skip the following data driven converter tests. * These tests were added to the data driven conversion test in ICU @@ -215,7 +215,7 @@ public class TestConversion extends TestFmwk { break; } - // check for any options for the callback value -- + // check for any options for the callback value -- cc.option = cc.cbErrorAction == null ? cc.cbopt : cc.cbopt .substring(1); if (cc.option == null) { @@ -225,7 +225,7 @@ public class TestConversion extends TestFmwk { FromUnicodeCase(cc); } - + private void FromUnicodeCase(ConversionCase cc) { // create charset encoder for conversion test CharsetProviderICU provider = new CharsetProviderICU(); @@ -238,7 +238,7 @@ public class TestConversion extends TestFmwk { "com/ibm/icu/dev/data/testdata", this.getClass().getClassLoader()) : (Charset) provider.charsetForName(cc.charset); if (charset != null) { - encoder = (CharsetEncoder) charset.newEncoder(); + encoder = charset.newEncoder(); encoder.onMalformedInput(CodingErrorAction.REPLACE); encoder.onUnmappableCharacter(CodingErrorAction.REPLACE); if (encoder instanceof CharsetEncoderICU) { @@ -260,7 +260,7 @@ public class TestConversion extends TestFmwk { return; } - // set the callback for the encoder + // set the callback for the encoder if (cc.cbErrorAction != null) { if (cc.cbEncoder != null) { ((CharsetEncoderICU)encoder).setFromUCallback(CoderResult.malformedForLength(1), cc.cbEncoder, cc.option); @@ -364,7 +364,7 @@ public class TestConversion extends TestFmwk { break; } } - + private int stepFromUnicode(ConversionCase cc, CharsetEncoder encoder, int step) { if (step < 0) { errln("Negative step size, test internal error."); @@ -387,7 +387,7 @@ public class TestConversion extends TestFmwk { currentSourceLimit = sourceLen; currentTargetLimit = targetLen; } - + CoderResult cr = null; for (;;) { @@ -529,7 +529,7 @@ public class TestConversion extends TestFmwk { "com/ibm/icu/dev/data/testdata", this.getClass().getClassLoader()) : (Charset) provider.charsetForName(cc.charset); if (charset != null) { - decoder = (CharsetDecoder) charset.newDecoder(); + decoder = charset.newDecoder(); decoder.onMalformedInput(CodingErrorAction.REPLACE); decoder.onUnmappableCharacter(CodingErrorAction.REPLACE); } @@ -588,13 +588,13 @@ public class TestConversion extends TestFmwk { } } - // Check the step to unicode + // Check the step to unicode boolean ok; int resultLength; String steps[][] = { { "0", "bulk" }, // must be first for offsets to be checked { "1", "step=1" }, { "3", "step=3" }, { "7", "step=7" } }; - /* TODO: currently not supported test steps, getNext API is not supported for now + /* TODO: currently not supported test steps, getNext API is not supported for now { "-1", "getNext" }, { "-2", "toU(bulk)+getNext" }, { "-3", "getNext+toU(bulk)" }, @@ -702,14 +702,14 @@ public class TestConversion extends TestFmwk { target.limit(target.capacity()); flush = cc.finalFlush; } - // convert + // convert CoderResult cr = null; if (source.hasRemaining()) { cr = decoder.decode(source, target, flush); // check pointers and errors if (cr.isOverflow()) { - // the partial target is filled, set a new limit, + // the partial target is filled, set a new limit, oStep = (target.position() + step); target.limit((oStep < target.capacity()) ? oStep : target.capacity()); @@ -733,7 +733,7 @@ public class TestConversion extends TestFmwk { cr = decoder.decode(source, target, true); - //due to limitation of the API we need to check for target limit for expected + //due to limitation of the API we need to check for target limit for expected if (target.position() != cc.unicode.length()) { if (target.limit() != cc.unicode.length()) { target.limit(cc.unicode.length()); @@ -781,7 +781,7 @@ public class TestConversion extends TestFmwk { if (cr.isOverflow()) { if (target.limit() >= target.capacity()) { - // target has reached its limit, an error occurred + // target has reached its limit, an error occurred logln("UnExpected error: Target Buffer is larger than capacity"); break; } else { @@ -841,7 +841,7 @@ public class TestConversion extends TestFmwk { } CoderResult cr = decoder.decode(source, target, source .limit() == sourceLen); - // check pointers and errors + // check pointers and errors if (cr.isOverflow()) { // one character has been consumed if (target.limit() >= target.capacity()) { @@ -915,12 +915,12 @@ public class TestConversion extends TestFmwk { "com/ibm/icu/dev/data/testdata", this.getClass().getClassLoader()) : (CharsetICU) provider.charsetForName(cc.charset); - //checking for converter that are not supported at this point + //checking for converter that are not supported at this point try{ if(charset==null || charset.name()=="BOCU-1" ||charset.name()== "SCSU"|| charset.name()=="lmbcs1" || charset.name()== "lmbcs2" || charset.name()== "lmbcs3" || charset.name()== "lmbcs4" || charset.name()=="lmbcs5" || charset.name()=="lmbcs6" || - charset.name()== "lmbcs8" || charset.name()=="lmbcs11" || charset.name()=="lmbcs16" || charset.name()=="lmbcs17" || + charset.name()== "lmbcs8" || charset.name()=="lmbcs11" || charset.name()=="lmbcs16" || charset.name()=="lmbcs17" || charset.name()=="lmbcs18"|| charset.name()=="lmbcs19"){ logln("Converter not supported at this point :" + cc.charset); return; @@ -944,7 +944,7 @@ public class TestConversion extends TestFmwk { charset.getUnicodeSet(unicodeset, cc.which); UnicodeSet diffset = new UnicodeSet(); - //are there items that must be in unicodeset but are not? + //are there items that must be in unicodeset but are not? (diffset = mapset).removeAll(unicodeset); if(!diffset.isEmpty()){ StringBuffer s = new StringBuffer(diffset.toPattern(true)); @@ -975,11 +975,11 @@ public class TestConversion extends TestFmwk { * This follows ucnv.c method ucnv_detectUnicodeSignature() to detect the * start of the stream for example U+FEFF (the Unicode BOM/signature * character) that can be ignored. - * + * * Detects Unicode signature byte sequences at the start of the byte stream * and returns number of bytes of the BOM of the indicated Unicode charset. * 0 is returned when no Unicode signature is recognized. - * + * */ private String detectUnicodeSignature(ByteBuffer source) {