From 27c08578acc7faed8deb42e77e00e73f44215985 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Thu, 21 Sep 2017 23:45:08 +0000 Subject: [PATCH] ICU-13311 change illegal-UTF-8 handling in non-converter code X-SVN-Rev: 40445 --- icu4c/source/common/bmpset.cpp | 108 +-- icu4c/source/common/bmpset.h | 15 +- icu4c/source/common/unicode/utf.h | 34 +- icu4c/source/common/unicode/utf8.h | 138 ++-- icu4c/source/common/unisetspan.cpp | 21 +- icu4c/source/common/ustrtrns.cpp | 669 ++++++------------ icu4c/source/common/utext.cpp | 16 +- icu4c/source/common/utf_impl.cpp | 277 ++++---- icu4c/source/common/utrie2.h | 31 +- icu4c/source/i18n/utf8collationiterator.cpp | 68 +- icu4c/source/test/cintltst/custrtrn.c | 19 +- icu4c/source/test/cintltst/trie2test.c | 57 +- icu4c/source/test/cintltst/utf8tst.c | 383 ++++++---- icu4c/source/test/intltest/collationtest.cpp | 26 +- icu4c/source/test/intltest/strtest.cpp | 4 +- icu4c/source/test/intltest/ustrtest.cpp | 6 +- icu4c/source/test/intltest/utxttest.cpp | 69 +- icu4c/source/test/intltest/utxttest.h | 1 - .../core/src/com/ibm/icu/impl/BMPSet.java | 24 +- 19 files changed, 911 insertions(+), 1055 deletions(-) diff --git a/icu4c/source/common/bmpset.cpp b/icu4c/source/common/bmpset.cpp index 08f9bed0664..f84bfd7f5bf 100644 --- a/icu4c/source/common/bmpset.cpp +++ b/icu4c/source/common/bmpset.cpp @@ -28,7 +28,7 @@ U_NAMESPACE_BEGIN BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) : list(parentList), listLength(parentListLength) { - uprv_memset(asciiBytes, 0, sizeof(asciiBytes)); + uprv_memset(latin1Contains, 0, sizeof(latin1Contains)); uprv_memset(table7FF, 0, sizeof(table7FF)); uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits)); @@ -45,14 +45,16 @@ BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) : list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1); } list4kStarts[0x11]=listLength-1; + containsFFFD=containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10]); initBits(); 
overrideIllegal(); } BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) : + containsFFFD(otherBMPSet.containsFFFD), list(newParentList), listLength(newParentListLength) { - uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes)); + uprv_memcpy(latin1Contains, otherBMPSet.latin1Contains, sizeof(latin1Contains)); uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF)); uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits)); uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts)); @@ -120,7 +122,7 @@ void BMPSet::initBits() { UChar32 start, limit; int32_t listIndex=0; - // Set asciiBytes[]. + // Set latin1Contains[]. do { start=list[listIndex++]; if(listIndex=0x80) { + if(start>=0x100) { break; } do { - asciiBytes[start++]=1; - } while(start0x80) { + if(start<0x80) { + start=0x80; + } + break; + } + } // Set table7FF[]. while(start<0x800) { @@ -204,19 +223,14 @@ void BMPSet::initBits() { * for faster validity checking at runtime. * No need to set 0 values where they were reset to 0 in the constructor * and not modified by initBits(). - * (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF) + * (table7FF[] 0..7F, bmpBlockBits[] 0..7FF) * Need to set 0 values for surrogates D800..DFFF. */ void BMPSet::overrideIllegal() { uint32_t bits, mask; int32_t i; - if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) { - // contains(FFFD)==TRUE - for(i=0x80; i<0xc0; ++i) { - asciiBytes[i]=1; - } - + if(containsFFFD) { bits=3; // Lead bytes 0xC0 and 0xC1. for(i=0; i<64; ++i) { table7FF[i]|=bits; @@ -233,7 +247,6 @@ void BMPSet::overrideIllegal() { bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits; } } else { - // contains(FFFD)==FALSE mask=~(0x10001<<0xd); // Lead byte 0xED. for(i=32; i<64; ++i) { // Second half of 4k block. 
bmpBlockBits[i]&=mask; @@ -277,8 +290,8 @@ int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const { UBool BMPSet::contains(UChar32 c) const { - if((uint32_t)c<=0x7f) { - return (UBool)asciiBytes[c]; + if((uint32_t)c<=0xff) { + return (UBool)latin1Contains[c]; } else if((uint32_t)c<=0x7ff) { return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0); } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) { @@ -314,8 +327,8 @@ BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition // span do { c=*s; - if(c<=0x7f) { - if(!asciiBytes[c]) { + if(c<=0xff) { + if(!latin1Contains[c]) { break; } } else if(c<=0x7ff) { @@ -354,8 +367,8 @@ BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition // span not do { c=*s; - if(c<=0x7f) { - if(asciiBytes[c]) { + if(c<=0xff) { + if(latin1Contains[c]) { break; } } else if(c<=0x7ff) { @@ -403,8 +416,8 @@ BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondi // span for(;;) { c=*(--limit); - if(c<=0x7f) { - if(!asciiBytes[c]) { + if(c<=0xff) { + if(!latin1Contains[c]) { break; } } else if(c<=0x7ff) { @@ -446,8 +459,8 @@ BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondi // span not for(;;) { c=*(--limit); - if(c<=0x7f) { - if(asciiBytes[c]) { + if(c<=0xff) { + if(latin1Contains[c]) { break; } } else if(c<=0x7ff) { @@ -497,22 +510,22 @@ const uint8_t * BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { const uint8_t *limit=s+length; uint8_t b=*s; - if((int8_t)b>=0) { + if(U8_IS_SINGLE(b)) { // Initial all-ASCII span. 
if(spanCondition) { do { - if(!asciiBytes[b] || ++s==limit) { + if(!latin1Contains[b] || ++s==limit) { return s; } b=*s; - } while((int8_t)b>=0); + } while(U8_IS_SINGLE(b)); } else { do { - if(asciiBytes[b] || ++s==limit) { + if(latin1Contains[b] || ++s==limit) { return s; } b=*s; - } while((int8_t)b>=0); + } while(U8_IS_SINGLE(b)); } length=(int32_t)(limit-s); } @@ -540,20 +553,20 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi // single trail byte, check for preceding 3- or 4-byte lead byte if(length>=2 && (b=*(limit-2))>=0xe0) { limit-=2; - if(asciiBytes[0x80]!=spanCondition) { + if(containsFFFD!=spanCondition) { limit0=limit; } } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) { // 4-byte lead byte with only two trail bytes limit-=3; - if(asciiBytes[0x80]!=spanCondition) { + if(containsFFFD!=spanCondition) { limit0=limit; } } } else { // lead byte with no trail bytes --limit; - if(asciiBytes[0x80]!=spanCondition) { + if(containsFFFD!=spanCondition) { limit0=limit; } } @@ -563,26 +576,26 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi while(s=0xc0 && (t1=(uint8_t)(*s-0x80)) <= 0x3f ) { if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) { @@ -642,7 +656,7 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi // Give an illegal sequence the same value as the result of contains(FFFD). // Handle each byte of an illegal sequence separately to simplify the code; // no need to optimize error handling. 
- if(asciiBytes[0x80]!=spanCondition) { + if(containsFFFD!=spanCondition) { return s-1; } } @@ -667,26 +681,26 @@ BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCon do { b=s[--length]; - if((int8_t)b>=0) { + if(U8_IS_SINGLE(b)) { // ASCII sub-span if(spanCondition) { do { - if(!asciiBytes[b]) { + if(!latin1Contains[b]) { return length+1; } else if(length==0) { return 0; } b=s[--length]; - } while((int8_t)b>=0); + } while(U8_IS_SINGLE(b)); } else { do { - if(asciiBytes[b]) { + if(latin1Contains[b]) { return length+1; } else if(length==0) { return 0; } b=s[--length]; - } while((int8_t)b>=0); + } while(U8_IS_SINGLE(b)); } } diff --git a/icu4c/source/common/bmpset.h b/icu4c/source/common/bmpset.h index 87375d2cace..018aeb7f95b 100644 --- a/icu4c/source/common/bmpset.h +++ b/icu4c/source/common/bmpset.h @@ -28,11 +28,12 @@ U_NAMESPACE_BEGIN * Helper class for frozen UnicodeSets, implements contains() and span() * optimized for BMP code points. Structured to be UTF-8-friendly. * - * ASCII: Look up bytes. + * Latin-1: Look up bytes. * 2-byte characters: Bits organized vertically. * 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF, * with mixed for illegal ranges. - * Supplementary characters: Call contains() on the parent set. + * Supplementary characters: Binary search over + * the supplementary part of the parent set's inversion list. */ class BMPSet : public UMemory { public: @@ -96,12 +97,12 @@ private: inline UBool containsSlow(UChar32 c, int32_t lo, int32_t hi) const; /* - * One byte per ASCII character, or trail byte in lead position. - * 0 or 1 for ASCII characters. - * The value for trail bytes is the result of contains(FFFD) - * for faster validity checking at runtime. + * One byte 0 or 1 per Latin-1 character. */ - UBool asciiBytes[0xc0]; + UBool latin1Contains[0x100]; + + /* TRUE if contains(U+FFFD). */ + UBool containsFFFD; /* * One bit per code point from U+0000..U+07FF. 
diff --git a/icu4c/source/common/unicode/utf.h b/icu4c/source/common/unicode/utf.h index dff4286815d..ef512997f05 100644 --- a/icu4c/source/common/unicode/utf.h +++ b/icu4c/source/common/unicode/utf.h @@ -23,9 +23,6 @@ * This file defines macros for checking whether a code point is * a surrogate or a non-character etc. * - * The UChar and UChar32 data types for Unicode code units and code points - * are defined in umachine.h because they can be machine-dependent. - * * If U_NO_DEFAULT_INCLUDE_UTF_HEADERS is 0 then utf.h is included by utypes.h * and itself includes utf8.h and utf16.h after some * common definitions. @@ -50,11 +47,11 @@ * but are optimized for the much more frequently occurring BMP code points. * * umachine.h defines UChar to be an unsigned 16-bit integer. - * Where available, UChar is defined to be a char16_t - * or a wchar_t (if that is an unsigned 16-bit type), otherwise uint16_t. + * Since ICU 59, ICU uses char16_t in C++, UChar only in C, + * and defines UChar=char16_t by default. See the UChar API docs for details. * * UChar32 is defined to be a signed 32-bit integer (int32_t), large enough for a 21-bit - * Unicode code point (Unicode scalar value, 0..0x10ffff). + * Unicode code point (Unicode scalar value, 0..0x10ffff) and U_SENTINEL (-1). * Before ICU 2.4, the definition of UChar32 was similarly platform-dependent as * the definition of UChar. For details see the documentation for UChar32 itself. * @@ -63,11 +60,20 @@ * For actual Unicode character properties see uchar.h. * * By default, string operations must be done with error checking in case - * a string is not well-formed UTF-16. - * The macros will detect if a surrogate code unit is unpaired + * a string is not well-formed UTF-16 or UTF-8. + * + * The U16_ macros detect if a surrogate code unit is unpaired * (lead unit without trail unit or vice versa) and just return the unit itself * as the code point. * + * The U8_ macros detect illegal byte sequences and return a negative value. 
+ * Starting with ICU 60, the observable length of a single illegal byte sequence + * skipped by one of these macros follows the Unicode 6+ recommendation + * which is consistent with the W3C Encoding Standard. + * + * There are ..._OR_FFFD versions of both U16_ and U8_ macros + * that return U+FFFD for illegal code unit sequences. + * * The regular "safe" macros require that the initial, passed-in string index * is within bounds. They only check the index when they read more than one * code unit. This is usually done with code similar to the following loop: @@ -91,10 +97,7 @@ * The performance differences are much larger here because UTF-8 provides so * many opportunities for malformed sequences. * The unsafe UTF-8 macros are entirely implemented inside the macro definitions - * and are fast, while the safe UTF-8 macros call functions for all but the - * trivial (ASCII) cases. - * (ICU 3.6 optimizes U8_NEXT() and U8_APPEND() to handle most other common - * characters inline as well.) + * and are fast, while the safe UTF-8 macros call functions for some complicated cases. * * Unlike with UTF-16, malformed sequences cannot be expressed with distinct * code point values (0..U+10ffff). They are indicated with negative values instead. @@ -126,8 +129,7 @@ */ #define U_IS_UNICODE_NONCHAR(c) \ ((c)>=0xfdd0 && \ - ((uint32_t)(c)<=0xfdef || ((c)&0xfffe)==0xfffe) && \ - (uint32_t)(c)<=0x10ffff) + ((c)<=0xfdef || ((c)&0xfffe)==0xfffe) && (c)<=0x10ffff) /** * Is c a Unicode code point value (0..U+10ffff) @@ -148,9 +150,7 @@ */ #define U_IS_UNICODE_CHAR(c) \ ((uint32_t)(c)<0xd800 || \ - ((uint32_t)(c)>0xdfff && \ - (uint32_t)(c)<=0x10ffff && \ - !U_IS_UNICODE_NONCHAR(c))) + (0xdfff<(c) && (c)<=0x10ffff && !U_IS_UNICODE_NONCHAR(c))) /** * Is this code point a BMP code point (U+0000..U+ffff)? 
diff --git a/icu4c/source/common/unicode/utf8.h b/icu4c/source/common/unicode/utf8.h index 18e7e9455c9..55024fdbfe5 100644 --- a/icu4c/source/common/unicode/utf8.h +++ b/icu4c/source/common/unicode/utf8.h @@ -41,34 +41,24 @@ /* internal definitions ----------------------------------------------------- */ - - /** * Counts the trail bytes for a UTF-8 lead byte. - * Returns 0 for 0..0xbf as well as for 0xfe and 0xff. + * Returns 0 for 0..0xc1 as well as for 0xf5..0xff. + * leadByte might be evaluated multiple times. * * This is internal since it is not meant to be called directly by external clients; * however it is called by public macros in this file and thus must remain stable. * - * Note: Beginning with ICU 50, the implementation uses a multi-condition expression - * which was shown in 2012 (on x86-64) to compile to fast, branch-free code. - * leadByte is evaluated multiple times. - * - * The pre-ICU 50 implementation used the exported array utf8_countTrailBytes: - * #define U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[leadByte]) - * leadByte was evaluated exactly once. - * * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. * @internal */ #define U8_COUNT_TRAIL_BYTES(leadByte) \ - ((uint8_t)(leadByte)<0xf0 ? \ - ((uint8_t)(leadByte)>=0xc0)+((uint8_t)(leadByte)>=0xe0) : \ - (uint8_t)(leadByte)<0xfe ? 3+((uint8_t)(leadByte)>=0xf8)+((uint8_t)(leadByte)>=0xfc) : 0) + ((uint8_t)(leadByte)<=0xf4 ? \ + ((uint8_t)(leadByte)>=0xc2)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0) : 0) /** * Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence. - * The maximum supported lead byte is 0xf4 corresponding to U+10FFFF. + * Returns 0 for 0..0xc1. Undefined for 0xf5..0xff. * leadByte might be evaluated multiple times. 
* * This is internal since it is not meant to be called directly by external clients; @@ -78,7 +68,7 @@ * @internal */ #define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \ - (((leadByte)>=0xc0)+((leadByte)>=0xe0)+((leadByte)>=0xf0)) + (((uint8_t)(leadByte)>=0xc2)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0)) /** * Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value. @@ -89,6 +79,34 @@ */ #define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1) +/** + * Internal bit vector for 3-byte UTF-8 validity check. + * Lead byte E0..EF bits 3..0 as byte index, + * first trail byte bits 7..5 as bit index into that byte. + * @internal + */ +#define U8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30" + +/** + * Internal 3-byte UTF-8 validity check. + * @internal + */ +#define U8_IS_VALID_LEAD3_AND_T1(lead, t1) (U8_LEAD3_T1_BITS[(lead)&0xf]&(1<<((uint8_t)(t1)>>5))) + +/** + * Internal bit vector for 4-byte UTF-8 validity check. + * First trail byte bits 7..4 as byte index, + * lead byte F0..F4 bits 2..0 as bit index into that byte. + * @internal + */ +#define U8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00" + +/** + * Internal 4-byte UTF-8 validity check. + * @internal + */ +#define U8_IS_VALID_LEAD4_AND_T1(lead, t1) (U8_LEAD4_T1_BITS[(uint8_t)(t1)>>4]&(1<<((lead)&7))) + /** * Function for handling "next code point" with error-checking. * @@ -153,7 +171,8 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); * @return TRUE or FALSE * @stable ICU 2.4 */ -#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc0)<0x3e) +#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc2)<=0x32) +// 0x32=0xf4-0xc2 /** * Is this code unit (byte) a UTF-8 trail byte? 
@@ -161,7 +180,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); * @return TRUE or FALSE * @stable ICU 2.4 */ -#define U8_IS_TRAIL(c) (((c)&0xc0)==0x80) +#define U8_IS_TRAIL(c) ((int8_t)(c)<-0x40) /** * How many code units (bytes) are used for the UTF-8 encoding @@ -289,7 +308,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); */ #define U8_NEXT_UNSAFE(s, i, c) { \ (c)=(uint8_t)(s)[(i)++]; \ - if((c)>=0x80) { \ + if(!U8_IS_SINGLE(c)) { \ if((c)<0xe0) { \ (c)=(((c)&0x1f)<<6)|((s)[(i)++]&0x3f); \ } else if((c)<0xf0) { \ @@ -325,22 +344,19 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); */ #define U8_NEXT(s, i, length, c) { \ (c)=(uint8_t)(s)[(i)++]; \ - if((c)>=0x80) { \ + if(!U8_IS_SINGLE(c)) { \ uint8_t __t1, __t2; \ - if( /* handle U+1000..U+CFFF inline */ \ - (0xe0<(c) && (c)<=0xec) && \ - (((i)+1)<(length) || (length)<0) && \ - (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \ - (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \ - ) { \ - /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \ - (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \ + if( /* handle U+0800..U+FFFF inline */ \ + (0xe0<=(c) && (c)<0xf0) && \ + (((i)+1)<(length) || (length)<0) && \ + U8_IS_VALID_LEAD3_AND_T1((c), __t1=(s)[i]) && \ + (__t2=(s)[(i)+1]-0x80)<=0x3f) { \ + (c)=(((c)&0xf)<<12)|((__t1&0x3f)<<6)|__t2; \ (i)+=2; \ } else if( /* handle U+0080..U+07FF inline */ \ - ((c)<0xe0 && (c)>=0xc2) && \ - ((i)!=(length)) && \ - (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \ - ) { \ + ((c)<0xe0 && (c)>=0xc2) && \ + ((i)!=(length)) && \ + (__t1=(s)[i]-0x80)<=0x3f) { \ (c)=(((c)&0x1f)<<6)|__t1; \ ++(i); \ } else { \ @@ -376,22 +392,19 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); */ #define U8_NEXT_OR_FFFD(s, i, length, c) { \ (c)=(uint8_t)(s)[(i)++]; \ - if((c)>=0x80) { \ + if(!U8_IS_SINGLE(c)) { \ uint8_t __t1, __t2; \ - if( /* handle U+1000..U+CFFF inline */ \ - (0xe0<(c) && (c)<=0xec) && \ - 
(((i)+1)<(length) || (length)<0) && \ - (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \ - (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \ - ) { \ - /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \ - (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \ + if( /* handle U+0800..U+FFFF inline */ \ + (0xe0<=(c) && (c)<0xf0) && \ + (((i)+1)<(length) || (length)<0) && \ + U8_IS_VALID_LEAD3_AND_T1((c), __t1=(s)[i]) && \ + (__t2=(s)[(i)+1]-0x80)<=0x3f) { \ + (c)=(((c)&0xf)<<12)|((__t1&0x3f)<<6)|__t2; \ (i)+=2; \ } else if( /* handle U+0080..U+07FF inline */ \ - ((c)<0xe0 && (c)>=0xc2) && \ - ((i)!=(length)) && \ - (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \ - ) { \ + ((c)<0xe0 && (c)>=0xc2) && \ + ((i)!=(length)) && \ + (__t1=(s)[i]-0x80)<=0x3f) { \ (c)=(((c)&0x1f)<<6)|__t1; \ ++(i); \ } else { \ @@ -476,7 +489,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); * @stable ICU 2.4 */ #define U8_FWD_1_UNSAFE(s, i) { \ - (i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((uint8_t)(s)[i]); \ + (i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((s)[i]); \ } /** @@ -493,15 +506,24 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); * @stable ICU 2.4 */ #define U8_FWD_1(s, i, length) { \ - uint8_t __b=(uint8_t)(s)[(i)++]; \ - if(U8_IS_LEAD(__b)) { \ - uint8_t __count=U8_COUNT_TRAIL_BYTES(__b); \ - if((i)+__count>(length) && (length)>=0) { \ - __count=(uint8_t)((length)-(i)); \ - } \ - while(__count>0 && U8_IS_TRAIL((s)[i])) { \ - ++(i); \ - --__count; \ + uint8_t __b=(s)[(i)++]; \ + if(U8_IS_LEAD(__b) && (i)!=(length)) { \ + uint8_t __t1=(s)[i]; \ + if((0xe0<=__b && __b<0xf0)) { \ + if(U8_IS_VALID_LEAD3_AND_T1(__b, __t1) && \ + ++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \ + ++(i); \ + } \ + } else if(__b<0xe0) { \ + if(U8_IS_TRAIL(__t1)) { \ + ++(i); \ + } \ + } else /* c>=0xf0 */ { \ + if(U8_IS_VALID_LEAD4_AND_T1(__b, __t1) && \ + ++(i)!=(length) && U8_IS_TRAIL((s)[i]) && \ + ++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \ + ++(i); \ + } \ } \ } \ } @@ -615,7 
+637,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); /* c is a trail byte */ \ (c)&=0x3f; \ for(;;) { \ - __b=(uint8_t)(s)[--(i)]; \ + __b=(s)[--(i)]; \ if(__b>=0xc0) { \ U8_MASK_LEAD_BYTE(__b, __count); \ (c)|=(UChar32)__b<<__shift; \ @@ -651,7 +673,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); */ #define U8_PREV(s, start, i, c) { \ (c)=(uint8_t)(s)[--(i)]; \ - if((c)>=0x80) { \ + if(!U8_IS_SINGLE(c)) { \ (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \ } \ } @@ -682,7 +704,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); */ #define U8_PREV_OR_FFFD(s, start, i, c) { \ (c)=(uint8_t)(s)[--(i)]; \ - if((c)>=0x80) { \ + if(!U8_IS_SINGLE(c)) { \ (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -3); \ } \ } diff --git a/icu4c/source/common/unisetspan.cpp b/icu4c/source/common/unisetspan.cpp index 83bc7945faa..0a8893472f9 100644 --- a/icu4c/source/common/unisetspan.cpp +++ b/icu4c/source/common/unisetspan.cpp @@ -502,7 +502,7 @@ spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) { static inline int32_t spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { UChar32 c=*s; - if((int8_t)c>=0) { + if(U8_IS_SINGLE(c)) { return set.contains(c) ? 1 : -1; } // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD(). @@ -514,7 +514,7 @@ spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { static inline int32_t spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { UChar32 c=s[length-1]; - if((int8_t)c>=0) { + if(U8_IS_SINGLE(c)) { return set.contains(c) ? 1 : -1; } int32_t i=length-1; @@ -1006,11 +1006,9 @@ int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpa // Try to match if the increment is not listed already. // Match at code point boundaries. (The UTF-8 strings were converted // from UTF-16 and are guaranteed to be well-formed.) 
- if( !U8_IS_TRAIL(s[pos-overlap]) && - !offsets.containsOffset(inc) && - matches8(s+pos-overlap, s8, length8) - - ) { + if(!U8_IS_TRAIL(s[pos-overlap]) && + !offsets.containsOffset(inc) && + matches8(s+pos-overlap, s8, length8)) { if(inc==rest) { return length; // Reached the end of the string. } @@ -1052,11 +1050,10 @@ int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpa // Try to match if the string is longer or starts earlier. // Match at code point boundaries. (The UTF-8 strings were converted // from UTF-16 and are guaranteed to be well-formed.) - if( !U8_IS_TRAIL(s[pos-overlap]) && - (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) && - matches8(s+pos-overlap, s8, length8) - - ) { + if(!U8_IS_TRAIL(s[pos-overlap]) && + (overlap>maxOverlap || + /* redundant overlap==maxOverlap && */ inc>maxInc) && + matches8(s+pos-overlap, s8, length8)) { maxInc=inc; // Longest match from earliest start. maxOverlap=overlap; break; diff --git a/icu4c/source/common/ustrtrns.cpp b/icu4c/source/common/ustrtrns.cpp index 09eca22fda3..5dc032c02fb 100644 --- a/icu4c/source/common/ustrtrns.cpp +++ b/icu4c/source/common/ustrtrns.cpp @@ -256,152 +256,6 @@ u_strToUTF32(UChar32 *dest, pErrorCode); } -/* for utf8_nextCharSafeBodyTerminated() */ -static const UChar32 -utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 }; - -/* - * Version of utf8_nextCharSafeBody() with the following differences: - * - checks for NUL termination instead of length - * - works with pointers instead of indexes - * - always strict (strict==-1) - * - * *ps points to after the lead byte and will be moved to after the last trail byte. - * c is the lead byte. 
- * @return the code point, or U_SENTINEL - */ -static UChar32 -utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) { - const uint8_t *s=*ps; - uint8_t trail, illegal=0; - uint8_t count=U8_COUNT_TRAIL_BYTES(c); - U_ASSERT(count<6); - U8_MASK_LEAD_BYTE((c), count); - /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */ - switch(count) { - /* each branch falls through to the next one */ - case 5: - case 4: - /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ - illegal=1; - break; - case 3: - trail=(uint8_t)(*s++ - 0x80); - c=(c<<6)|trail; - if(trail>0x3f || c>=0x110) { - /* not a trail byte, or code point>0x10ffff (outside Unicode) */ - illegal=1; - break; - } - U_FALLTHROUGH; - case 2: - trail=(uint8_t)(*s++ - 0x80); - if(trail>0x3f) { - /* not a trail byte */ - illegal=1; - break; - } - c=(c<<6)|trail; - U_FALLTHROUGH; - case 1: - trail=(uint8_t)(*s++ - 0x80); - if(trail>0x3f) { - /* not a trail byte */ - illegal=1; - } - c=(c<<6)|trail; - break; - case 0: - return U_SENTINEL; - /* no default branch to optimize switch() - all values are covered */ - } - - /* correct sequence - all trail bytes have (b7..b6)==(10)? */ - /* illegal is also set if count>=4 */ - if(illegal || c0 && U8_IS_TRAIL(*s)) { - ++s; - --count; - } - c=U_SENTINEL; - } - *ps=s; - return c; -} - -/* - * Version of utf8_nextCharSafeBody() with the following differences: - * - works with pointers instead of indexes - * - always strict (strict==-1) - * - * *ps points to after the lead byte and will be moved to after the last trail byte. - * c is the lead byte. 
- * @return the code point, or U_SENTINEL - */ -static UChar32 -utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) { - const uint8_t *s=*ps; - uint8_t trail, illegal=0; - uint8_t count=U8_COUNT_TRAIL_BYTES(c); - if((limit-s)>=count) { - U8_MASK_LEAD_BYTE((c), count); - /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */ - switch(count) { - /* each branch falls through to the next one */ - case 5: - case 4: - /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ - illegal=1; - break; - case 3: - trail=*s++; - c=(c<<6)|(trail&0x3f); - if(c<0x110) { - illegal|=(trail&0xc0)^0x80; - } else { - /* code point>0x10ffff, outside Unicode */ - illegal=1; - break; - } - U_FALLTHROUGH; - case 2: - trail=*s++; - c=(c<<6)|(trail&0x3f); - illegal|=(trail&0xc0)^0x80; - U_FALLTHROUGH; - case 1: - trail=*s++; - c=(c<<6)|(trail&0x3f); - illegal|=(trail&0xc0)^0x80; - break; - case 0: - return U_SENTINEL; - /* no default branch to optimize switch() - all values are covered */ - } - } else { - illegal=1; /* too few bytes left */ - } - - /* correct sequence - all trail bytes have (b7..b6)==(10)? */ - /* illegal is also set if count>=4 */ - U_ASSERT(illegal || count0 && s 0) || subchar > 0x10ffff || U_IS_SURROGATE(subchar) @@ -434,7 +279,10 @@ u_strFromUTF8WithSub(UChar *dest, if(pNumSubstitutions!=NULL) { *pNumSubstitutions=0; } - numSubstitutions=0; + UChar *pDest = dest; + UChar *pDestLimit = dest+destCapacity; + int32_t reqLength = 0; + int32_t numSubstitutions=0; /* * Inline processing of UTF-8 byte sequences: @@ -455,95 +303,81 @@ u_strFromUTF8WithSub(UChar *dest, * The code explicitly checks for NULs only in the lead byte position. * A NUL byte in the trail byte position fails the trail byte range check anyway. 
*/ - while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) { - if(ch <= 0x7f){ - *pDest++=(UChar)ch; - ++pSrc; + int32_t i; + UChar32 c; + for(i = 0; (c = (uint8_t)src[i]) != 0 && (pDest < pDestLimit);) { + // modified copy of U8_NEXT() + ++i; + if(U8_IS_SINGLE(c)) { + *pDest++=(UChar)c; } else { - if(ch > 0xe0) { - if( /* handle U+1000..U+CFFF inline */ - ch <= 0xec && - (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && - (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f - ) { - /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ - *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); - pSrc += 3; - continue; - } - } else if(ch < 0xe0) { - if( /* handle U+0080..U+07FF inline */ - ch >= 0xc2 && - (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f - ) { - *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); - pSrc += 2; - continue; - } - } - - /* function call for "complicated" and error cases */ - ++pSrc; /* continue after the lead byte */ - ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch); - if(ch<0 && (++numSubstitutions, ch = subchar) < 0) { - *pErrorCode = U_INVALID_CHAR_FOUND; - return NULL; - } else if(ch<=0xFFFF) { - *(pDest++)=(UChar)ch; + uint8_t __t1, __t2; + if( /* handle U+0800..U+FFFF inline */ + (0xe0<=(c) && (c)<0xf0) && + U8_IS_VALID_LEAD3_AND_T1((c), src[i]) && + (__t2=src[(i)+1]-0x80)<=0x3f) { + *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2; + i+=2; + } else if( /* handle U+0080..U+07FF inline */ + ((c)<0xe0 && (c)>=0xc2) && + (__t1=src[i]-0x80)<=0x3f) { + *pDest++ = (((c)&0x1f)<<6)|__t1; + ++(i); } else { - *(pDest++)=U16_LEAD(ch); - if(pDest 0xe0) { - if( /* handle U+1000..U+CFFF inline */ - ch <= 0xec && - (uint8_t)(pSrc[1] - 0x80) <= 0x3f && - (uint8_t)(pSrc[2] - 0x80) <= 0x3f - ) { - ++reqLength; - pSrc += 3; - continue; - } - } else if(ch < 0xe0) { - if( /* handle U+0080..U+07FF inline */ - ch >= 0xc2 && - (uint8_t)(pSrc[1] - 0x80) <= 0x3f - ) { - ++reqLength; - pSrc += 2; - continue; + uint8_t __t1, __t2; + if( /* handle 
U+0800..U+FFFF inline */ + (0xe0<=(c) && (c)<0xf0) && + U8_IS_VALID_LEAD3_AND_T1((c), src[i]) && + (__t2=src[(i)+1]-0x80)<=0x3f) { + ++reqLength; + i+=2; + } else if( /* handle U+0080..U+07FF inline */ + ((c)<0xe0 && (c)>=0xc2) && + (__t1=src[i]-0x80)<=0x3f) { + ++reqLength; + ++(i); + } else { + /* function call for "complicated" and error cases */ + (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1); + if(c<0 && (++numSubstitutions, c = subchar) < 0) { + *pErrorCode = U_INVALID_CHAR_FOUND; + return NULL; } + reqLength += U16_LENGTH(c); } - - /* function call for "complicated" and error cases */ - ++pSrc; /* continue after the lead byte */ - ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch); - if(ch<0 && (++numSubstitutions, ch = subchar) < 0) { - *pErrorCode = U_INVALID_CHAR_FOUND; - return NULL; - } - reqLength += U16_LENGTH(ch); } } } else /* srcLength >= 0 */ { - const uint8_t *pSrcLimit = pSrc + srcLength; - int32_t count; - - /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ + /* Faster loop without ongoing checking for srcLength and pDestLimit. */ + int32_t i = 0; + UChar32 c; for(;;) { /* * Each iteration of the inner loop progresses by at most 3 UTF-8 @@ -551,10 +385,10 @@ u_strFromUTF8WithSub(UChar *dest, * For supplementary code points (4 & 2), which are rare, * there is an additional adjustment. 
*/ - count = (int32_t)(pDestLimit - pDest); - srcLength = (int32_t)((pSrcLimit - pSrc) / 3); - if(count > srcLength) { - count = srcLength; /* min(remaining dest, remaining src/3) */ + int32_t count = (int32_t)(pDestLimit - pDest); + int32_t count2 = (srcLength - i) / 3; + if(count > count2) { + count = count2; /* min(remaining dest, remaining src/3) */ } if(count < 3) { /* @@ -565,147 +399,123 @@ u_strFromUTF8WithSub(UChar *dest, } do { - ch = *pSrc; - if(ch <= 0x7f){ - *pDest++=(UChar)ch; - ++pSrc; + // modified copy of U8_NEXT() + c = (uint8_t)src[i++]; + if(U8_IS_SINGLE(c)) { + *pDest++=(UChar)c; } else { - if(ch > 0xe0) { - if( /* handle U+1000..U+CFFF inline */ - ch <= 0xec && - (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && - (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f - ) { - /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ - *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); - pSrc += 3; - continue; - } - } else if(ch < 0xe0) { - if( /* handle U+0080..U+07FF inline */ - ch >= 0xc2 && - (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f - ) { - *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); - pSrc += 2; - continue; + uint8_t __t1, __t2; + if( /* handle U+0800..U+FFFF inline */ + (0xe0<=(c) && (c)<0xf0) && + ((i)+1)=0xc2) && + ((i)!=srcLength) && + (__t1=src[i]-0x80)<=0x3f) { + *pDest++ = (((c)&0x1f)<<6)|__t1; + ++(i); + } else { + if(c >= 0xf0 || subchar > 0xffff) { + // We may read up to four bytes and write up to two UChars, + // which we didn't account for with computing count, + // so we adjust it here. + if(--count == 0) { + --i; // back out byte c + break; + } } - } - if(ch >= 0xf0 || subchar > 0xffff) { - /* - * We may read up to six bytes and write up to two UChars, - * which we didn't account for with computing count, - * so we adjust it here. 
- */ - if(--count == 0) { - break; + /* function call for "complicated" and error cases */ + (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1); + if(c<0 && (++numSubstitutions, c = subchar) < 0) { + *pErrorCode = U_INVALID_CHAR_FOUND; + return NULL; + } else if(c<=0xFFFF) { + *(pDest++)=(UChar)c; + } else { + *(pDest++)=U16_LEAD(c); + *(pDest++)=U16_TRAIL(c); } } - - /* function call for "complicated" and error cases */ - ++pSrc; /* continue after the lead byte */ - ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); - if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ - *pErrorCode = U_INVALID_CHAR_FOUND; - return NULL; - }else if(ch<=0xFFFF){ - *(pDest++)=(UChar)ch; - }else{ - *(pDest++)=U16_LEAD(ch); - *(pDest++)=U16_TRAIL(ch); - } } } while(--count > 0); } - while((pSrc 0xe0) { - if( /* handle U+1000..U+CFFF inline */ - ch <= 0xec && - ((pSrcLimit - pSrc) >= 3) && - (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && - (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f - ) { - /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ - *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); - pSrc += 3; - continue; - } - } else if(ch < 0xe0) { - if( /* handle U+0080..U+07FF inline */ - ch >= 0xc2 && - ((pSrcLimit - pSrc) >= 2) && - (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f - ) { - *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); - pSrc += 2; - continue; - } - } - - /* function call for "complicated" and error cases */ - ++pSrc; /* continue after the lead byte */ - ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); - if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ - *pErrorCode = U_INVALID_CHAR_FOUND; - return NULL; - }else if(ch<=0xFFFF){ - *(pDest++)=(UChar)ch; - }else{ - *(pDest++)=U16_LEAD(ch); - if(pDest=0xc2) && + ((i)!=srcLength) && + (__t1=src[i]-0x80)<=0x3f) { + *pDest++ = (((c)&0x1f)<<6)|__t1; + ++(i); + } else { + /* function call for "complicated" and error cases */ + (c)=utf8_nextCharSafeBody((const 
uint8_t *)src, &(i), srcLength, c, -1); + if(c<0 && (++numSubstitutions, c = subchar) < 0) { + *pErrorCode = U_INVALID_CHAR_FOUND; + return NULL; + } else if(c<=0xFFFF) { + *(pDest++)=(UChar)c; + } else { + *(pDest++)=U16_LEAD(c); + if(pDest 0xe0) { - if( /* handle U+1000..U+CFFF inline */ - ch <= 0xec && - ((pSrcLimit - pSrc) >= 3) && - (uint8_t)(pSrc[1] - 0x80) <= 0x3f && - (uint8_t)(pSrc[2] - 0x80) <= 0x3f - ) { - reqLength++; - pSrc += 3; - continue; - } - } else if(ch < 0xe0) { - if( /* handle U+0080..U+07FF inline */ - ch >= 0xc2 && - ((pSrcLimit - pSrc) >= 2) && - (uint8_t)(pSrc[1] - 0x80) <= 0x3f - ) { - reqLength++; - pSrc += 2; - continue; + uint8_t __t1, __t2; + if( /* handle U+0800..U+FFFF inline */ + (0xe0<=(c) && (c)<0xf0) && + ((i)+1)=0xc2) && + ((i)!=srcLength) && + (__t1=src[i]-0x80)<=0x3f) { + ++reqLength; + ++(i); + } else { + /* function call for "complicated" and error cases */ + (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1); + if(c<0 && (++numSubstitutions, c = subchar) < 0) { + *pErrorCode = U_INVALID_CHAR_FOUND; + return NULL; } + reqLength += U16_LENGTH(c); } - - /* function call for "complicated" and error cases */ - ++pSrc; /* continue after the lead byte */ - ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); - if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ - *pErrorCode = U_INVALID_CHAR_FOUND; - return NULL; - } - reqLength+=U16_LENGTH(ch); } } } @@ -753,7 +563,7 @@ u_strFromUTF8Lenient(UChar *dest, uint8_t* pSrc = (uint8_t*) src; /* args check */ - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ + if(U_FAILURE(*pErrorCode)){ return NULL; } @@ -994,7 +804,7 @@ u_strToUTF8WithSub(char *dest, int32_t numSubstitutions; /* args check */ - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ + if(U_FAILURE(*pErrorCode)){ return NULL; } @@ -1266,18 +1076,8 @@ u_strFromJavaModifiedUTF8WithSub( int32_t srcLength, UChar32 subchar, int32_t *pNumSubstitutions, UErrorCode *pErrorCode) { - UChar *pDest = dest; - UChar 
*pDestLimit = dest+destCapacity; - UChar32 ch; - int32_t reqLength = 0; - const uint8_t* pSrc = (const uint8_t*) src; - const uint8_t *pSrcLimit; - int32_t count; - uint8_t t1, t2; /* trail bytes */ - int32_t numSubstitutions; - /* args check */ - if(U_FAILURE(*pErrorCode)){ + if(U_FAILURE(*pErrorCode)) { return NULL; } if( (src==NULL && srcLength!=0) || srcLength < -1 || @@ -1291,18 +1091,22 @@ u_strFromJavaModifiedUTF8WithSub( if(pNumSubstitutions!=NULL) { *pNumSubstitutions=0; } - numSubstitutions=0; + UChar *pDest = dest; + UChar *pDestLimit = dest+destCapacity; + int32_t reqLength = 0; + int32_t numSubstitutions=0; if(srcLength < 0) { /* * Transform a NUL-terminated ASCII string. * Handle non-ASCII strings with slower code. */ - while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) { - *pDest++=(UChar)ch; - ++pSrc; + UChar32 c; + while(((c = (uint8_t)*src) != 0) && c <= 0x7f && (pDest < pDestLimit)) { + *pDest++=(UChar)c; + ++src; } - if(ch == 0) { + if(c == 0) { reqLength=(int32_t)(pDest - dest); if(pDestLength) { *pDestLength = reqLength; @@ -1312,33 +1116,38 @@ u_strFromJavaModifiedUTF8WithSub( u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); return dest; } - srcLength = static_cast(uprv_strlen((const char *)pSrc)); + srcLength = static_cast(uprv_strlen(src)); } - /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ - pSrcLimit = (pSrc == NULL) ? NULL : pSrc + srcLength; + /* Faster loop without ongoing checking for srcLength and pDestLimit. 
*/ + UChar32 ch; + uint8_t t1, t2; + int32_t i = 0; for(;;) { - count = (int32_t)(pDestLimit - pDest); - srcLength = (int32_t)(pSrcLimit - pSrc); - if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) { + int32_t count = (int32_t)(pDestLimit - pDest); + int32_t count2 = srcLength - i; + if(count >= count2 && srcLength > 0 && U8_IS_SINGLE(*src)) { /* fast ASCII loop */ - const uint8_t *prevSrc = pSrc; - int32_t delta; - while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) { - *pDest++=(UChar)ch; - ++pSrc; + int32_t start = i; + uint8_t b; + while(i < srcLength && U8_IS_SINGLE(b = src[i])) { + *pDest++=b; + ++i; } - delta = (int32_t)(pSrc - prevSrc); + int32_t delta = i - start; count -= delta; - srcLength -= delta; + count2 -= delta; } /* * Each iteration of the inner loop progresses by at most 3 UTF-8 * bytes and one UChar. */ - srcLength /= 3; - if(count > srcLength) { - count = srcLength; /* min(remaining dest, remaining src/3) */ + if(subchar > 0xFFFF) { + break; + } + count2 /= 3; + if(count > count2) { + count = count2; /* min(remaining dest, remaining src/3) */ } if(count < 3) { /* @@ -1348,29 +1157,28 @@ u_strFromJavaModifiedUTF8WithSub( break; } do { - ch = *pSrc; - if(ch <= 0x7f){ + ch = (uint8_t)src[i++]; + if(U8_IS_SINGLE(ch)) { *pDest++=(UChar)ch; - ++pSrc; } else { if(ch >= 0xe0) { if( /* handle U+0000..U+FFFF inline */ ch <= 0xef && - (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && - (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f + (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f && + (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f ) { /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); - pSrc += 3; + i += 2; continue; } } else { if( /* handle U+0000..U+07FF inline */ ch >= 0xc0 && - (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f + (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f ) { *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); - pSrc += 2; + ++i; continue; } } @@ -1383,49 +1191,43 @@ 
u_strFromJavaModifiedUTF8WithSub( * We need to write two UChars, adjusted count for that, * and ran out of space. */ + --i; // back out byte ch break; } else { /* function call for error cases */ - ++pSrc; /* continue after the lead byte */ - utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); + utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1); ++numSubstitutions; - if(subchar<=0xFFFF) { - *(pDest++)=(UChar)subchar; - } else { - *(pDest++)=U16_LEAD(subchar); - *(pDest++)=U16_TRAIL(subchar); - } + *(pDest++)=(UChar)subchar; } } } while(--count > 0); } - while((pSrc= 0xe0) { if( /* handle U+0000..U+FFFF inline */ ch <= 0xef && - ((pSrcLimit - pSrc) >= 3) && - (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && - (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f + (i+1) < srcLength && + (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f && + (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f ) { /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); - pSrc += 3; + i += 2; continue; } } else { if( /* handle U+0000..U+07FF inline */ ch >= 0xc0 && - ((pSrcLimit - pSrc) >= 2) && - (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f + i < srcLength && + (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f ) { *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); - pSrc += 2; + ++i; continue; } } @@ -1435,8 +1237,7 @@ u_strFromJavaModifiedUTF8WithSub( return NULL; } else { /* function call for error cases */ - ++pSrc; /* continue after the lead byte */ - utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); + utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1); ++numSubstitutions; if(subchar<=0xFFFF) { *(pDest++)=(UChar)subchar; @@ -1453,32 +1254,31 @@ u_strFromJavaModifiedUTF8WithSub( } } - /* do not fill the dest buffer just count the UChars needed */ - while(pSrc < pSrcLimit){ - ch = *pSrc; - if(ch <= 0x7f) { + /* Pre-flight the rest of the string. 
*/ + while(i < srcLength) { + ch = (uint8_t)src[i++]; + if(U8_IS_SINGLE(ch)) { reqLength++; - ++pSrc; } else { if(ch >= 0xe0) { if( /* handle U+0000..U+FFFF inline */ ch <= 0xef && - ((pSrcLimit - pSrc) >= 3) && - (uint8_t)(pSrc[1] - 0x80) <= 0x3f && - (uint8_t)(pSrc[2] - 0x80) <= 0x3f + (i+1) < srcLength && + (uint8_t)(src[i] - 0x80) <= 0x3f && + (uint8_t)(src[i+1] - 0x80) <= 0x3f ) { reqLength++; - pSrc += 3; + i += 2; continue; } } else { if( /* handle U+0000..U+07FF inline */ ch >= 0xc0 && - ((pSrcLimit - pSrc) >= 2) && - (uint8_t)(pSrc[1] - 0x80) <= 0x3f + i < srcLength && + (uint8_t)(src[i] - 0x80) <= 0x3f ) { reqLength++; - pSrc += 2; + ++i; continue; } } @@ -1488,8 +1288,7 @@ u_strFromJavaModifiedUTF8WithSub( return NULL; } else { /* function call for error cases */ - ++pSrc; /* continue after the lead byte */ - utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); + utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1); ++numSubstitutions; reqLength+=U16_LENGTH(ch); } diff --git a/icu4c/source/common/utext.cpp b/icu4c/source/common/utext.cpp index 52ae7ff9787..a2c9008abf6 100644 --- a/icu4c/source/common/utext.cpp +++ b/icu4c/source/common/utext.cpp @@ -847,15 +847,11 @@ U_CDECL_END //------------------------------------------------------------------------------ // Chunk size. -// Must be less than 42 (256/6), because of byte mapping from UChar indexes to native indexes. -// Worst case there are six UTF-8 bytes per UChar. -// obsolete 6 byte form fd + 5 trails maps to fffd -// obsolete 5 byte form fc + 4 trails maps to fffd -// non-shortest 4 byte forms maps to fffd -// normal supplementaries map to a pair of utf-16, two utf8 bytes per utf-16 unit -// mapToUChars array size must allow for the worst case, 6. -// This could be brought down to 4, by treating fd and fc as pure illegal, -// rather than obsolete lead bytes. But that is not compatible with the utf-8 access macros. 
+// Must be less than 85 (256/3), because of byte mapping from UChar indexes to native indexes. +// Worst case is three native bytes to one UChar. (Supplemenaries are 4 native bytes +// to two UChars.) +// The longest illegal byte sequence treated as a single error (and converted to U+FFFD) +// is a three-byte sequence (truncated four-byte sequence). // enum { UTF8_TEXT_CHUNK_SIZE=32 }; @@ -895,7 +891,7 @@ struct UTF8Buf { // Requires two extra slots, // one for a supplementary starting in the last normal position, // and one for an entry for the buffer limit position. - uint8_t mapToUChars[UTF8_TEXT_CHUNK_SIZE*6+6]; // Map native offset from bufNativeStart to + uint8_t mapToUChars[UTF8_TEXT_CHUNK_SIZE*3+6]; // Map native offset from bufNativeStart to // correspoding offset in filled part of buf. int32_t align; }; diff --git a/icu4c/source/common/utf_impl.cpp b/icu4c/source/common/utf_impl.cpp index 856072cb767..b560a4f7aa4 100644 --- a/icu4c/source/common/utf_impl.cpp +++ b/icu4c/source/common/utf_impl.cpp @@ -7,7 +7,7 @@ * Corporation and others. All Rights Reserved. * ****************************************************************************** -* file name: utf_impl.c +* file name: utf_impl.cpp * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 @@ -54,10 +54,6 @@ * - SUB AX, BX (result) * -finish: * (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB) - * - * In Unicode, all UTF-8 byte sequences with more than 4 bytes are illegal; - * lead bytes above 0xf4 are illegal. - * We keep them in this table for skipping long ISO 10646-UTF-8 sequences. 
*/ extern "C" U_EXPORT const uint8_t utf8_countTrailBytes[256]={ @@ -76,27 +72,24 @@ utf8_countTrailBytes[256]={ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + // illegal C0 & C1 + // 2-byte lead bytes C2..DF + 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + // 3-byte lead bytes E0..EF 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 3, 3, 3, 3, 3, - 3, 3, 3, /* illegal in Unicode */ - 4, 4, 4, 4, /* illegal in Unicode */ - 5, 5, /* illegal in Unicode */ - 0, 0 /* illegal bytes 0xfe and 0xff */ + // 4-byte lead bytes F0..F4 + // illegal F5..FF + 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; -static const UChar32 -utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 }; - static const UChar32 utf8_errorValue[6]={ // Same values as UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE, // but without relying on the obsolete unicode/utf_old.h. 0x15, 0x9f, 0xffff, - 0x10ffff, - 0x3ffffff, 0x7fffffff + 0x10ffff }; static UChar32 @@ -136,61 +129,59 @@ errorValue(int32_t count, int8_t strict) { */ U_CAPI UChar32 U_EXPORT2 utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) { + // *pi is one after byte c. 
int32_t i=*pi; - uint8_t count=U8_COUNT_TRAIL_BYTES(c); - U_ASSERT(count <= 5); /* U8_COUNT_TRAIL_BYTES returns value 0...5 */ - if(i+count<=length || length<0) { - uint8_t trail; - - U8_MASK_LEAD_BYTE(c, count); - /* support NUL-terminated strings: do not read beyond the first non-trail byte */ - switch(count) { - /* each branch falls through to the next one */ - case 0: - /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */ - case 5: - case 4: - /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ - break; - case 3: - trail=s[i++]-0x80; - c=(c<<6)|trail; - /* c>=0x110 would result in code point>0x10ffff, outside Unicode */ - if(c>=0x110 || trail>0x3f) { break; } - U_FALLTHROUGH; - case 2: - trail=s[i++]-0x80; - c=(c<<6)|trail; - /* - * test for a surrogate d800..dfff unless we are lenient: - * before the last (c<<6), a surrogate is c=360..37f - */ - if(((c&0xffe0)==0x360 && strict!=-2) || trail>0x3f) { break; } - U_FALLTHROUGH; - case 1: - trail=s[i++]-0x80; - c=(c<<6)|trail; - if(trail>0x3f) { break; } - /* correct sequence - all trail bytes have (b7..b6)==(10) */ - if(c>=utf8_minLegal[count] && - /* strict: forbid non-characters like U+fffe */ - (strict<=0 || !U_IS_UNICODE_NONCHAR(c))) { + // length can be negative for NUL-terminated strings: Read and validate one byte at a time. + if(i==length || c>0xf4) { + // end of string, or not a lead byte + } else if(c>=0xf0) { + // Test for 4-byte sequences first because + // U8_NEXT() handles shorter valid sequences inline. 
+ uint8_t t1=s[i], t2, t3; + c&=7; + if(U8_IS_VALID_LEAD4_AND_T1(c, t1) && + ++i!=length && (t2=s[i]-0x80)<=0x3f && + ++i!=length && (t3=s[i]-0x80)<=0x3f) { + ++i; + c=(c<<18)|((t1&0x3f)<<12)|(t2<<6)|t3; + // strict: forbid non-characters like U+fffe + if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) { *pi=i; return c; } - /* no default branch to optimize switch() - all values are covered */ } - } else { - /* too few bytes left */ - count=length-i; - } + } else if(c>=0xe0) { + c&=0xf; + if(strict!=-2) { + uint8_t t1=s[i], t2; + if(U8_IS_VALID_LEAD3_AND_T1(c, t1) && + ++i!=length && (t2=s[i]-0x80)<=0x3f) { + ++i; + c=(c<<12)|((t1&0x3f)<<6)|t2; + // strict: forbid non-characters like U+fffe + if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) { + *pi=i; + return c; + } + } + } else { + // strict=-2 -> lenient: allow surrogates + uint8_t t1=s[i]-0x80, t2; + if(t1<=0x3f && (c>0 || t1>=0x20) && + ++i!=length && (t2=s[i]-0x80)<=0x3f) { + *pi=i+1; + return (c<<12)|(t1<<6)|t2; + } + } + } else if(c>=0xc2) { + uint8_t t1=s[i]-0x80; + if(t1<=0x3f) { + *pi=i+1; + return ((c-0xc0)<<6)|t1; + } + } // else 0x80<=c<0xc2 is not a lead byte /* error handling */ - i=*pi; - while(count>0 && U8_IS_TRAIL(s[i])) { - ++i; - --count; - } c=errorValue(i-*pi, strict); *pi=i; return c; @@ -243,99 +234,99 @@ utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool U_CAPI UChar32 U_EXPORT2 utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict) { + // *pi is the index of byte c. 
int32_t i=*pi; - uint8_t b, count=1, shift=6; - - if(!U8_IS_TRAIL(c)) { return errorValue(0, strict); } - - /* extract value bits from the last trail byte */ - c&=0x3f; - - for(;;) { - if(i<=start) { - /* no lead byte at all */ - return errorValue(0, strict); - } - - /* read another previous byte */ - b=s[--i]; - if((uint8_t)(b-0x80)<0x7e) { /* 0x80<=b<0xfe */ - if(b&0x40) { - /* lead byte, this will always end the loop */ - uint8_t shouldCount=U8_COUNT_TRAIL_BYTES(b); - - if(count==shouldCount) { - /* set the new position */ - *pi=i; - U8_MASK_LEAD_BYTE(b, count); - c|=(UChar32)b<=4 || c>0x10ffff || c0 && U_IS_UNICODE_NONCHAR(c))) { - /* illegal sequence or (strict and non-character) */ - if(count>=4) { - count=3; + if(U8_IS_TRAIL(c) && i>start) { + uint8_t b1=s[--i]; + if(0xc2<=b1 && b1<0xe0) { + *pi=i; + return ((b1-0xc0)<<6)|(c&0x3f); + } else if(U8_IS_TRAIL(b1) && i>start) { + // Extract the value bits from the last trail byte. + c&=0x3f; + uint8_t b2=s[--i]; + if(0xe0<=b2 && b2<0xf0) { + b2&=0xf; + if(strict!=-2) { + if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) { + *pi=i; + c=(b2<<12)|((b1&0x3f)<<6)|c; + if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) { + return c; + } else { + // strict: forbid non-characters like U+fffe + return errorValue(2, strict); } - c=errorValue(count, strict); - } else { - /* exit with correct c */ } } else { - /* the lead byte does not match the number of trail bytes */ - /* only set the position to the lead byte if it would - include the trail byte that we started with */ - if(count lenient: allow surrogates + b1-=0x80; + if((b2>0 || b1>=0x20)) { + *pi=i; + return (b2<<12)|(b1<<6)|c; + } + } + } else if(U8_IS_TRAIL(b2) && i>start) { + uint8_t b3=s[--i]; + if(0xf0<=b3 && b3<=0xf4) { + b3&=7; + if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) { *pi=i; - c=errorValue(count, strict); - } else { - c=errorValue(0, strict); + c=(b3<<18)|((b2&0x3f)<<12)|((b1&0x3f)<<6)|c; + if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) { + return c; + } else { + // strict: forbid 
non-characters like U+fffe + return errorValue(3, strict); + } } } - break; - } else if(count<5) { - /* trail byte */ - c|=(UChar32)(b&0x3f)<start) { - Z=I-5; - } else { - Z=start; - } - - /* return I if the sequence starting there is long enough to include i */ - do { - b=s[I]; - if((uint8_t)(b-0x80)>=0x7e) { /* not 0x80<=b<0xfe */ - break; - } else if(b>=0xc0) { - if(U8_COUNT_TRAIL_BYTES(b)>=(i-I)) { - return I; - } else { - break; + // Same as utf8_prevCharSafeBody(..., strict=-1) minus assembling code points. + int32_t orig_i=i; + uint8_t c=s[i]; + if(U8_IS_TRAIL(c) && i>start) { + uint8_t b1=s[--i]; + if(0xc2<=b1 && b1<0xe0) { + return i; + } else if(U8_IS_TRAIL(b1) && i>start) { + uint8_t b2=s[--i]; + if(0xe0<=b2 && b2<0xf0) { + if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) { + return i; + } + } else if(U8_IS_TRAIL(b2) && i>start) { + uint8_t b3=s[--i]; + if(0xf0<=b3 && b3<=0xf4) { + if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) { + return i; + } + } + } else if((0xf0<=b2 && b2<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) { + // Truncated 4-byte sequence. + return i; } + } else if(((0xe0<=b1 && b1<0xf0) && U8_IS_VALID_LEAD3_AND_T1(b1, c)) || + ((0xf0<=b1 && b1<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b1, c))) { + // Truncated 3- or 4-byte sequence. + return i; } - } while(Z<=--I); - - /* return i itself to be consistent with the FWD_1 macro */ - return i; + } + return orig_i; } diff --git a/icu4c/source/common/utrie2.h b/icu4c/source/common/utrie2.h index b33e3305f93..8e1caa5e90b 100644 --- a/icu4c/source/common/utrie2.h +++ b/icu4c/source/common/utrie2.h @@ -20,6 +20,7 @@ #define __UTRIE2_H__ #include "unicode/utypes.h" +#include "unicode/utf8.h" #include "putilimp.h" #include "udataswp.h" @@ -54,6 +55,8 @@ typedef struct UTrie UTrie; * is truncated, omitting both the BMP portion and the high range. * - There is a special small index for 2-byte UTF-8, and the initial data * entries are designed for fast 1/2-byte UTF-8 lookup. 
+ * Starting with ICU 60, C0 and C1 are not recognized as UTF-8 lead bytes any more at all, + * and the associated 2-byte indexes are unused. */ /** @@ -933,29 +936,29 @@ utrie2_internalU8PrevIndex(const UTrie2 *trie, UChar32 c, /** Internal UTF-8 next-post-increment: get the next code point's data. */ #define _UTRIE2_U8_NEXT(trie, ascii, data, src, limit, result) { \ uint8_t __lead=(uint8_t)*(src)++; \ - if(__lead<0xc0) { \ + if(U8_IS_SINGLE(__lead)) { \ (result)=(trie)->ascii[__lead]; \ } else { \ uint8_t __t1, __t2; \ - if( /* handle U+0000..U+07FF inline */ \ - __lead<0xe0 && (src)<(limit) && \ - (__t1=(uint8_t)(*(src)-0x80))<=0x3f \ - ) { \ - ++(src); \ - (result)=(trie)->data[ \ - (trie)->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET-0xc0)+__lead]+ \ - __t1]; \ - } else if( /* handle U+0000..U+CFFF inline */ \ - __lead<0xed && ((src)+1)<(limit) && \ - (__t1=(uint8_t)(*(src)-0x80))<=0x3f && (__lead>0xe0 || __t1>=0x20) && \ + if( /* handle U+0800..U+FFFF inline */ \ + 0xe0<=__lead && __lead<0xf0 && ((src)+1)<(limit) && \ + U8_IS_VALID_LEAD3_AND_T1(__lead, __t1=(uint8_t)*(src)) && \ (__t2=(uint8_t)(*((src)+1)-0x80))<= 0x3f \ ) { \ (src)+=2; \ (result)=(trie)->data[ \ ((int32_t)((trie)->index[((__lead-0xe0)<<(12-UTRIE2_SHIFT_2))+ \ - (__t1<<(6-UTRIE2_SHIFT_2))+(__t2>>UTRIE2_SHIFT_2)]) \ + ((__t1&0x3f)<<(6-UTRIE2_SHIFT_2))+(__t2>>UTRIE2_SHIFT_2)]) \ <=0xc2 && (src)<(limit) && \ + (__t1=(uint8_t)(*(src)-0x80))<=0x3f \ + ) { \ + ++(src); \ + (result)=(trie)->data[ \ + (trie)->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET-0xc0)+__lead]+ \ + __t1]; \ } else { \ int32_t __index=utrie2_internalU8NextIndex((trie), __lead, (const uint8_t *)(src), \ (const uint8_t *)(limit)); \ @@ -968,7 +971,7 @@ utrie2_internalU8PrevIndex(const UTrie2 *trie, UChar32 c, /** Internal UTF-8 pre-decrement-previous: get the previous code point's data. 
*/ #define _UTRIE2_U8_PREV(trie, ascii, data, start, src, result) { \ uint8_t __b=(uint8_t)*--(src); \ - if(__b<0x80) { \ + if(U8_IS_SINGLE(__b)) { \ (result)=(trie)->ascii[__b]; \ } else { \ int32_t __index=utrie2_internalU8PrevIndex((trie), __b, (const uint8_t *)(start), \ diff --git a/icu4c/source/i18n/utf8collationiterator.cpp b/icu4c/source/i18n/utf8collationiterator.cpp index 85d4b76b08e..345b1994ef0 100644 --- a/icu4c/source/i18n/utf8collationiterator.cpp +++ b/icu4c/source/i18n/utf8collationiterator.cpp @@ -49,26 +49,25 @@ UTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) { } // Optimized combination of U8_NEXT_OR_FFFD() and UTRIE2_U8_NEXT32(). c = u8[pos++]; - if(c < 0xc0) { - // ASCII 00..7F; trail bytes 80..BF map to error values. + if(U8_IS_SINGLE(c)) { + // ASCII 00..7F return trie->data32[c]; } uint8_t t1, t2; - if(c < 0xe0 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) { - // U+0080..U+07FF; 00..7F map to error values. + if(0xe0 <= c && c < 0xf0 && + ((pos + 1) < length || length < 0) && + U8_IS_VALID_LEAD3_AND_T1(c, t1 = u8[pos]) && + (t2 = (u8[pos + 1] - 0x80)) <= 0x3f) { + // U+0800..U+FFFF except surrogates + c = (((c & 0xf) << 12) | ((t1 & 0x3f) << 6) | t2); + pos += 2; + return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c); + } else if(c < 0xe0 && c >= 0xc2 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) { + // U+0080..U+07FF uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1]; c = ((c & 0x1f) << 6) | t1; ++pos; return ce32; - } else if(c <= 0xef && - ((pos + 1) < length || length < 0) && - (t1 = (u8[pos] - 0x80)) <= 0x3f && (c != 0xe0 || t1 >= 0x20) && - (t2 = (u8[pos + 1] - 0x80)) <= 0x3f - ) { - // U+0800..U+FFFF; caller maps surrogates to error values. - c = (UChar)((c << 12) | (t1 << 6) | t2); - pos += 2; - return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c); } else { // Function call for supplementary code points and error cases. 
// Illegal byte sequences yield U+FFFD. @@ -158,28 +157,17 @@ FCDUTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) { return Collation::FALLBACK_CE32; } c = u8[pos++]; - if(c < 0xc0) { - // ASCII 00..7F; trail bytes 80..BF map to error values. + if(U8_IS_SINGLE(c)) { + // ASCII 00..7F return trie->data32[c]; } uint8_t t1, t2; - if(c < 0xe0 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) { - // U+0080..U+07FF; 00..7F map to error values. - uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1]; - c = ((c & 0x1f) << 6) | t1; - ++pos; - if(CollationFCD::hasTccc(c) && pos != length && nextHasLccc()) { - pos -= 2; - } else { - return ce32; - } - } else if(c <= 0xef && - ((pos + 1) < length || length < 0) && - (t1 = (u8[pos] - 0x80)) <= 0x3f && (c != 0xe0 || t1 >= 0x20) && - (t2 = (u8[pos + 1] - 0x80)) <= 0x3f - ) { - // U+0800..U+FFFF; caller maps surrogates to error values. - c = (UChar)((c << 12) | (t1 << 6) | t2); + if(0xe0 <= c && c < 0xf0 && + ((pos + 1) < length || length < 0) && + U8_IS_VALID_LEAD3_AND_T1(c, t1 = u8[pos]) && + (t2 = (u8[pos + 1] - 0x80)) <= 0x3f) { + // U+0800..U+FFFF except surrogates + c = (((c & 0xf) << 12) | ((t1 & 0x3f) << 6) | t2); pos += 2; if(CollationFCD::hasTccc(c) && (CollationFCD::maybeTibetanCompositeVowel(c) || @@ -188,6 +176,16 @@ FCDUTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) { } else { break; // return CE32(BMP) } + } else if(c < 0xe0 && c >= 0xc2 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) { + // U+0080..U+07FF + uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1]; + c = ((c & 0x1f) << 6) | t1; + ++pos; + if(CollationFCD::hasTccc(c) && pos != length && nextHasLccc()) { + pos -= 2; + } else { + return ce32; + } } else { // Function call for supplementary code points and error cases. // Illegal byte sequences yield U+FFFD. 
@@ -237,7 +235,7 @@ UBool FCDUTF8CollationIterator::previousHasTccc() const { U_ASSERT(state == CHECK_BWD && pos != 0); UChar32 c = u8[pos - 1]; - if(c < 0x80) { return FALSE; } + if(U8_IS_SINGLE(c)) { return FALSE; } int32_t i = pos; U8_PREV_OR_FFFD(u8, 0, i, c); if(c > 0xffff) { c = U16_LEAD(c); } @@ -271,7 +269,7 @@ FCDUTF8CollationIterator::nextCodePoint(UErrorCode &errorCode) { if(pos == length || ((c = u8[pos]) == 0 && length < 0)) { return U_SENTINEL; } - if(c < 0x80) { + if(U8_IS_SINGLE(c)) { ++pos; return c; } @@ -309,7 +307,7 @@ FCDUTF8CollationIterator::previousCodePoint(UErrorCode &errorCode) { if(pos == 0) { return U_SENTINEL; } - if((c = u8[pos - 1]) < 0x80) { + if(U8_IS_SINGLE(c = u8[pos - 1])) { --pos; return c; } diff --git a/icu4c/source/test/cintltst/custrtrn.c b/icu4c/source/test/cintltst/custrtrn.c index bf1068e3482..087da834ce8 100644 --- a/icu4c/source/test/cintltst/custrtrn.c +++ b/icu4c/source/test/cintltst/custrtrn.c @@ -670,12 +670,13 @@ static void Test_UChar_UTF8_API(void){ } /* test UTF-8 with single surrogates - illegal in Unicode 3.2 */ + // Since ICU 60, each surrogate byte sequence is treated as 3 single-byte errors. 
{ static const UChar withLead16[]={ 0x1800, 0xd89a, 0x0061 }, withTrail16[]={ 0x1800, 0xdcba, 0x0061, 0 }, - withTrail16SubFFFD[]={ 0x1800, 0xfffd, 0x0061, 0 }, /* sub==U+FFFD */ - withTrail16Sub50005[]={ 0x1800, 0xd900, 0xdc05, 0x0061, 0 }; /* sub==U+50005 */ + withTrail16SubFFFD[]={ 0x1800, 0xfffd, 0xfffd, 0xfffd, 0x0061, 0 }, /* sub==U+FFFD */ + withTrail16Sub50005[]={ 0x1800, 0xd900, 0xdc05, 0xd900, 0xdc05, 0xd900, 0xdc05, 0x0061, 0 }; /* sub==U+50005 */ static const uint8_t withLead8[]={ 0xe1, 0xa0, 0x80, 0xed, 0xa2, 0x9a, 0x61 }, withTrail8[]={ 0xe1, 0xa0, 0x80, 0xed, 0xb2, 0xba, 0x61, 0 }, @@ -706,7 +707,7 @@ static void Test_UChar_UTF8_API(void){ &err); if(U_FAILURE(err) || uDestLen!=u_strlen(withTrail16Sub50005) || 0!=u_memcmp(withTrail16Sub50005, out16, uDestLen+1) || - numSubstitutions!=1) { + numSubstitutions!=3) { log_err("error: u_strFromUTF8WithSub(length) failed\n"); } @@ -721,7 +722,7 @@ static void Test_UChar_UTF8_API(void){ &err); if(U_FAILURE(err) || uDestLen!=u_strlen(withTrail16SubFFFD) || 0!=u_memcmp(withTrail16SubFFFD, out16, uDestLen+1) || - numSubstitutions!=1) { + numSubstitutions!=3) { log_err("error: u_strFromUTF8WithSub(NUL termination) failed\n"); } @@ -734,7 +735,7 @@ static void Test_UChar_UTF8_API(void){ (const char *)withTrail8, -1, 0x50005, &numSubstitutions, &err); - if(err!=U_BUFFER_OVERFLOW_ERROR || uDestLen!=u_strlen(withTrail16Sub50005) || numSubstitutions!=1) { + if(err!=U_BUFFER_OVERFLOW_ERROR || uDestLen!=u_strlen(withTrail16Sub50005) || numSubstitutions!=3) { log_err("error: u_strFromUTF8WithSub(preflight/NUL termination) failed\n"); } @@ -1015,14 +1016,6 @@ Test_FromUTF8Lenient(void) { log_err("u_strFromUTF8Lenient(U_MEMORY_ALLOCATION_ERROR) failed\n"); } - dest[0]=0x1234; - destLength=-1; - errorCode=U_MEMORY_ALLOCATION_ERROR; - pDest=u_strFromUTF8Lenient(dest, 1, &destLength, (const char *)bytes, -1, NULL); - if(dest[0]!=0x1234) { - log_err("u_strFromUTF8Lenient(pErrorCode=NULL) failed\n"); - } - /* test normal 
behavior */ number=0; /* string number for log_err() */ diff --git a/icu4c/source/test/cintltst/trie2test.c b/icu4c/source/test/cintltst/trie2test.c index 5d11733e850..9444159beda 100644 --- a/icu4c/source/test/cintltst/trie2test.c +++ b/icu4c/source/test/cintltst/trie2test.c @@ -350,6 +350,11 @@ static void testTrieUTF8(const char *testName, const UTrie2 *trie, UTrie2ValueBits valueBits, const CheckRange checkRanges[], int32_t countCheckRanges) { + // Note: The byte sequence comments refer to the original UTF-8 definition. + // Starting with ICU 60, any sequence that is not a prefix of a valid one + // is treated as multiple single-byte errors. + // For testing, we only rely on U8_... and UTrie2 UTF-8 macros + // iterating consistently. static const uint8_t illegal[]={ 0xc0, 0x80, /* non-shortest U+0000 */ 0xc1, 0xbf, /* non-shortest U+007f */ @@ -394,15 +399,36 @@ testTrieUTF8(const char *testName, value=checkRanges[i].value; /* write three legal (or surrogate) code points */ U8_APPEND_UNSAFE(s, length, prevCP); /* start of the range */ - values[countValues++]=U_IS_SURROGATE(prevCP) ? errorValue : value; + if(U_IS_SURROGATE(prevCP)) { + // A surrogate byte sequence counts as 3 single-byte errors. + values[countValues++]=errorValue; + values[countValues++]=errorValue; + values[countValues++]=errorValue; + } else { + values[countValues++]=value; + } c=checkRanges[i].limit; prevCP=(prevCP+c)/2; /* middle of the range */ U8_APPEND_UNSAFE(s, length, prevCP); - values[countValues++]=U_IS_SURROGATE(prevCP) ? errorValue : value; + if(U_IS_SURROGATE(prevCP)) { + // A surrogate byte sequence counts as 3 single-byte errors. + values[countValues++]=errorValue; + values[countValues++]=errorValue; + values[countValues++]=errorValue; + } else { + values[countValues++]=value; + } prevCP=c; --c; /* end of the range */ U8_APPEND_UNSAFE(s, length, c); - values[countValues++]=U_IS_SURROGATE(c) ? 
errorValue : value; + if(U_IS_SURROGATE(c)) { + // A surrogate byte sequence counts as 3 single-byte errors. + values[countValues++]=errorValue; + values[countValues++]=errorValue; + values[countValues++]=errorValue; + } else { + values[countValues++]=value; + } /* write an illegal byte sequence */ if(i8U+%04lx): 0x%lx instead of 0x%lx\n", - testName, (unsigned long)bytes, (long)c, (long)value, (long)values[i]); + log_err("error: wrong value from UTRIE2_U8_NEXT(%s)(from %d %lx->U+%04lx) (read %d bytes): " + "0x%lx instead of 0x%lx\n", + testName, (int)prev8, (unsigned long)bytes, (long)c, (int)((p-s)-prev8), + (long)value, (long)values[i]); } if(i8!=(p-s)) { - log_err("error: wrong end index from UTRIE2_U8_NEXT(%s)(%lx->U+%04lx): %ld != %ld\n", - testName, (unsigned long)bytes, (long)c, (long)(p-s), (long)i8); + log_err("error: wrong end index from UTRIE2_U8_NEXT(%s)(from %d %lx->U+%04lx): %ld != %ld\n", + testName, (int)prev8, (unsigned long)bytes, (long)c, (long)(p-s), (long)i8); continue; } ++i; @@ -471,12 +500,14 @@ testTrieUTF8(const char *testName, } } if(value!=values[i]) { - log_err("error: wrong value from UTRIE2_U8_PREV(%s)(%lx->U+%04lx): 0x%lx instead of 0x%lx\n", - testName, (unsigned long)bytes, (long)c, (long)value, (long)values[i]); + log_err("error: wrong value from UTRIE2_U8_PREV(%s)(from %d %lx->U+%04lx) (read %d bytes): " + ": 0x%lx instead of 0x%lx\n", + testName, (int)prev8, (unsigned long)bytes, (long)c, (int)(prev8-(p-s)), + (long)value, (long)values[i]); } if(i8!=(p-s)) { - log_err("error: wrong end index from UTRIE2_U8_PREV(%s)(%lx->U+%04lx): %ld != %ld\n", - testName, (unsigned long)bytes, (long)c, (long)(p-s), (long)i8); + log_err("error: wrong end index from UTRIE2_U8_PREV(%s)(from %d %lx->U+%04lx): %ld != %ld\n", + testName, (int)prev8, (unsigned long)bytes, (long)c, (long)(p-s), (long)i8); continue; } } diff --git a/icu4c/source/test/cintltst/utf8tst.c b/icu4c/source/test/cintltst/utf8tst.c index bbc67f4bb73..0bbb5e5413d 100644 
--- a/icu4c/source/test/cintltst/utf8tst.c +++ b/icu4c/source/test/cintltst/utf8tst.c @@ -121,7 +121,7 @@ addUTF8Test(TestNode** root) static void TestCodeUnitValues() { - static const uint8_t codeunit[]={0x00, 0x65, 0x7e, 0x7f, 0xc0, 0xc4, 0xf0, 0xfd, 0x80, 0x81, 0xbc, 0xbe,}; + static const uint8_t codeunit[]={0x00, 0x65, 0x7e, 0x7f, 0xc2, 0xc4, 0xf0, 0xf4, 0x80, 0x81, 0xbc, 0xbe,}; int16_t i; for(i=0; i= 0 && offset < sizeof(input) - 1) { #if !U_HIDE_OBSOLETE_UTF_OLD_H UTF8_GET_CHAR_UNSAFE(input, offset, c); - if(c != result[i]){ - log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c); + if(c != expected) { + log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", + offset, expected, c); } #endif U8_GET_UNSAFE(input, offset, c); - if(c != result[i]){ - log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c); + if(c != expected) { + log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. 
Expected:%lx Got:%lx\n", + offset, expected, c); } } @@ -285,146 +288,160 @@ static void TestGetChar() } static void TestNextPrevChar() { - static const uint8_t input[]={0x61, 0xf0, 0x90, 0x90, 0x81, 0xc0, 0x80, 0xfd, 0xbe, 0xc2, 0x61, 0x81, 0x90, 0x90, 0xf0, 0x00}; + static const uint8_t input[]={ + 0x61, + 0xf0, 0x90, 0x90, 0x81, + 0xc0, 0x80, // non-shortest form + 0xf3, 0xbe, // truncated + 0xc2, // truncated + 0x61, + 0x81, 0x90, 0x90, 0xf0, // "backwards" sequence + 0x00 + }; static const UChar32 result[]={ - /* next_unsafe next_safe_ns next_safe_s prev_unsafe prev_safe_ns prev_safe_s */ - 0x0061, 0x0061, 0x0061, 0x0000, 0x0000, 0x0000, - 0x10401, 0x10401, 0x10401, 0xf0, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, - 0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x2841410, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, - 0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xa1050, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, - 0x81, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x2841, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, - 0x00, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, 0x61, 0x61, 0x61, - 0x80, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xc2, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, - 0xfd, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, 0x77e, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, - 0xbe, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xfd, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, - 0xa1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x00, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, - 0x61, 0x61, 0x61, 0xc0, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, - 0x81, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x10401, 0x10401, 0x10401, - 0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x410, UTF_ERROR_VALUE, UTF_ERROR_VALUE, - 0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x410, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, - 0x0840, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xf0, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, - 0x0000, 0x0000, 0x0000, 0x0061, 0x0061, 0x0061 + /* next_safe_ns next_safe_s prev_safe_ns prev_safe_s */ + 0x0061, 
0x0061, 0x0000, 0x0000, + 0x10401, 0x10401, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, + UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, + UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, + UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, + UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x61, 0x61, + UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, + UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, + UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, + UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, + 0x61, 0x61, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, + UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x10401, 0x10401, + UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF_ERROR_VALUE, UTF_ERROR_VALUE, + UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, + UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, + 0x0000, 0x0000, 0x0061, 0x0061 }; static const int32_t movedOffset[]={ - /* next_unsafe next_safe_ns next_safe_s prev_unsafe prev_safe_ns prev_safe_s */ - 1, 1, 1, 15, 15, 15, - 5, 5, 5, 14, 14 , 14, - 3, 3, 3, 9, 13, 13, - 4, 4, 4, 9, 12, 12, - 5, 5, 5, 9, 11, 11, - 7, 7, 7, 10, 10, 10, - 7, 7, 7, 9, 9, 9, - 8, 9, 9, 7, 7, 7, - 9, 9, 9, 7, 7, 7, - 11, 10, 10, 5, 5, 5, - 11, 11, 11, 5, 5, 5, - 12, 12, 12, 1, 1, 1, - 13, 13, 13, 1, 1, 1, - 14, 14, 14, 1, 1, 1, - 14, 15, 15, 1, 1, 1, - 14, 16, 16, 0, 0, 0, + /* next_safe prev_safe_s */ + 1, 15, + 5, 14, + 3, 13, + 4, 12, + 5, 11, + 6, 10, + 7, 9, + 9, 7, + 9, 7, + 10, 6, + 11, 5, + 12, 1, + 13, 1, + 14, 1, + 15, 1, + 16, 0, }; - /* TODO: remove unused columns for next_unsafe & prev_unsafe, and adjust the test code */ UChar32 c, expected; - uint32_t i=0; + uint32_t i=0, j=0; uint32_t offset=0; int32_t setOffset=0; for(offset=0; offset 0; --offset){ - 
expected=result[i+4]; + expected=result[i+2]; // prev_safe_ns #if !U_HIDE_OBSOLETE_UTF_OLD_H - setOffset=offset; - UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE); - if(setOffset != movedOffset[i+4]){ - log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", - offset, movedOffset[i+4], setOffset); - } - if(c != expected){ - log_err("ERROR: UTF8_PREV_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c); + setOffset=offset; + UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE); + if(setOffset != movedOffset[j+1]) { + log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", + offset, movedOffset[j+1], setOffset); + } + if(c != expected) { + log_err("ERROR: UTF8_PREV_CHAR_SAFE failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c); } #endif - setOffset=offset; - U8_PREV(input, 0, setOffset, c); - if(setOffset != movedOffset[i+4]){ - log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", - offset, movedOffset[i+4], setOffset); - } + setOffset=offset; + U8_PREV(input, 0, setOffset, c); + if(setOffset != movedOffset[j+1]) { + log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", + offset, movedOffset[j+1], setOffset); + } if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; } - if(c != expected){ - log_err("ERROR: U8_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c); + if(c != expected) { + log_err("ERROR: U8_PREV failed at offset=%ld. 
Expected:%lx Got:%lx\n", offset, expected, c); } setOffset=offset; U8_PREV_OR_FFFD(input, 0, setOffset, c); - if(setOffset != movedOffset[i+4]){ + if(setOffset != movedOffset[j+1]) { log_err("ERROR: U8_PREV_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", - offset, movedOffset[i+4], setOffset); + offset, movedOffset[j+1], setOffset); } if(expected<0) { expected=0xfffd; } - if(c != expected){ - log_err("ERROR: U8_PREV_OR_FFFD failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c); + if(c != expected) { + log_err("ERROR: U8_PREV_OR_FFFD failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c); } #if !U_HIDE_OBSOLETE_UTF_OLD_H - setOffset=offset; - UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, TRUE); - if(setOffset != movedOffset[i+5]){ - log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", - offset, movedOffset[i+5], setOffset); - } - if(c != result[i+5]){ - log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+5], c); - } + setOffset=offset; + UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, TRUE); + if(setOffset != movedOffset[j+1]) { + log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", + offset, movedOffset[j+1], setOffset); + } + expected=result[i+3]; // prev_safe_s + if(c != expected) { + log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed at offset=%ld. 
Expected:%lx Got:%lx\n", + offset, expected, c); + } #endif - i=i+6; + i=i+4; + j=j+2; } } @@ -433,11 +450,13 @@ static void TestNulTerminated() { static const uint8_t input[]={ /* 0 */ 0x61, /* 1 */ 0xf0, 0x90, 0x90, 0x81, - /* 5 */ 0xc0, 0x80, + /* 5 */ 0xc0, + /* 6 */ 0x80, /* 7 */ 0xdf, 0x80, /* 9 */ 0xc2, /* 10 */ 0x62, - /* 11 */ 0xfd, 0xbe, + /* 11 */ 0xfd, + /* 12 */ 0xbe, /* 13 */ 0xe0, 0xa0, 0x80, /* 16 */ 0xe2, 0x82, 0xac, /* 19 */ 0xf0, 0x90, 0x90, @@ -447,14 +466,16 @@ static void TestNulTerminated() { static const UChar32 result[]={ 0x61, 0x10401, - U_SENTINEL, + U_SENTINEL, // C0 not a lead byte + U_SENTINEL, // 80 0x7c0, - U_SENTINEL, + U_SENTINEL, // C2 0x62, - U_SENTINEL, + U_SENTINEL, // FD not a lead byte + U_SENTINEL, // BE 0x800, 0x20ac, - U_SENTINEL, + U_SENTINEL, // truncated F0 90 90 0 }; @@ -544,6 +565,22 @@ static void TestNextPrevNonCharacters() { log_err("U8_PREV(at %d) failed to read a non-character\n", idx); } } +#if !U_HIDE_OBSOLETE_UTF_OLD_H + for(idx=0; idx<(int32_t)sizeof(nonChars);) { + UChar32 expected= nonChars[idx]<0xf0 ? 0xffff : 0x10ffff; + UTF8_NEXT_CHAR_SAFE(nonChars, idx, sizeof(nonChars), ch, TRUE); + if(ch!=expected) { + log_err("UTF8_NEXT_CHAR_SAFE(strict, before %d) failed to read a non-character\n", idx); + } + } + for(idx=(int32_t)sizeof(nonChars); idx>0;) { + UTF8_PREV_CHAR_SAFE(nonChars, 0, idx, ch, TRUE); + UChar32 expected= nonChars[idx]<0xf0 ? 0xffff : 0x10ffff; + if(ch!=expected) { + log_err("UTF8_PREV_CHAR_SAFE(strict, at %d) failed to read a non-character\n", idx); + } + } +#endif } static void TestNextPrevCharUnsafe() { @@ -563,58 +600,83 @@ static void TestNextPrevCharUnsafe() { static const UChar32 codePoints[]={ 0x61, 0x10401, - 0, + -1, 0x20ac, 0xa1, 0x10ffff, 0 }; - UChar32 c; + UChar32 c, expected; int32_t i; uint32_t offset; #if !U_HIDE_OBSOLETE_UTF_OLD_H for(i=0, offset=0; offset= 0 && c != expected) { log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%ld. 
Expected:%lx Got:%lx\n", - offset, codePoints[i], c); + offset, expected, c); + } + if(offset==6) { + // The obsolete UTF8_NEXT_CHAR_UNSAFE() skips 1+UTF8_COUNT_TRAIL_BYTES(lead) bytes + // while the new one skips C0 80 together. + ++offset; } } #endif for(i=0, offset=0; offset= 0 && c != expected) { log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", - offset, codePoints[i], c); + offset, expected, c); } } #if !U_HIDE_OBSOLETE_UTF_OLD_H for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){ - UTF8_PREV_CHAR_UNSAFE(input, offset, c); - if(c != codePoints[i]){ - log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", - offset, codePoints[i], c); - } + UTF8_PREV_CHAR_UNSAFE(input, offset, c); + expected = codePoints[i]; + if(expected >= 0 && c != expected) { + log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", + offset, expected, c); + } } #endif for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){ - U8_PREV_UNSAFE(input, offset, c); - if(c != codePoints[i]){ - log_err("ERROR: U8_PREV_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", - offset, codePoints[i], c); - } + U8_PREV_UNSAFE(input, offset, c); + expected = codePoints[i]; + if(expected >= 0 && c != expected) { + log_err("ERROR: U8_PREV_UNSAFE failed for offset=%ld. 
Expected:%lx Got:%lx\n", + offset, expected, c); + } } } static void TestFwdBack() { - static const uint8_t input[]={0x61, 0xF0, 0x90, 0x90, 0x81, 0xff, 0x62, 0xc0, 0x80, 0x7f, 0x8f, 0xc0, 0x63, 0x81, 0x90, 0x90, 0xF0, 0x00}; - static const uint16_t fwd_safe[] ={1, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}; - static const uint16_t back_safe[] ={17, 16, 15, 14, 13, 12, 11, 10, 9, 7, 6, 5, 1, 0}; + static const uint8_t input[]={ + 0x61, + 0xF0, 0x90, 0x90, 0x81, + 0xff, + 0x62, + 0xc0, + 0x80, + 0x7f, + 0x8f, + 0xc0, + 0x63, + 0x81, + 0x90, + 0x90, + 0xF0, + 0x00 + }; + static const uint16_t fwd_safe[] ={1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}; + static const uint16_t back_safe[] ={17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 1, 0}; - static const uint16_t Nvalue[]= {0, 1, 2, 3, 1, 2, 1, 5}; + static const uint16_t Nvalue[]= {0, 1, 2, 4, 1, 2, 1, 5}; static const uint16_t fwd_N_safe[] ={0, 1, 6, 10, 11, 13, 14, 18}; /*safe macro keeps it at the end of the string */ - static const uint16_t back_N_safe[] ={18, 17, 15, 12, 11, 9, 7, 0}; + static const uint16_t back_N_safe[] ={18, 17, 15, 11, 10, 8, 7, 0}; uint32_t offsafe=0; @@ -707,7 +769,10 @@ static void TestFwdBackUnsafe() { 0xf4, 0x8f, 0xbf, 0xbf, 0x00 }; - static const int8_t boundaries[]={ 0, 1, 5, 7, 10, 12, 16, 17 }; + // forward unsafe skips only C0 + static const int8_t boundaries[]={ 0, 1, 5, 6, 7, 10, 12, 16, 17 }; + // backward unsafe skips C0 80 together + static const int8_t backBoundaries[]={ 0, 1, 5, 7, 10, 12, 16, 17 }; int32_t offset; int32_t i; @@ -726,17 +791,17 @@ static void TestFwdBackUnsafe() { } } #if !U_HIDE_OBSOLETE_UTF_OLD_H - for(i=UPRV_LENGTHOF(boundaries)-2, offset=UPRV_LENGTHOF(input); offset>0; --i) { + for(i=UPRV_LENGTHOF(backBoundaries)-2, offset=UPRV_LENGTHOF(input); offset>0; --i) { UTF8_BACK_1_UNSAFE(input, offset); - if(offset != boundaries[i]){ - log_err("ERROR: UTF8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[i], offset); + if(offset != 
backBoundaries[i]){ + log_err("ERROR: UTF8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[i], offset); } } #endif - for(i=UPRV_LENGTHOF(boundaries)-2, offset=UPRV_LENGTHOF(input); offset>0; --i) { + for(i=UPRV_LENGTHOF(backBoundaries)-2, offset=UPRV_LENGTHOF(input); offset>0; --i) { U8_BACK_1_UNSAFE(input, offset); - if(offset != boundaries[i]){ - log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[i], offset); + if(offset != backBoundaries[i]){ + log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[i], offset); } } #if !U_HIDE_OBSOLETE_UTF_OLD_H @@ -756,21 +821,21 @@ static void TestFwdBackUnsafe() { } } #if !U_HIDE_OBSOLETE_UTF_OLD_H - for(i=0; isetAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode); static const char *strings[] = { - // U+FFFD - "a\xef\xbf\xbdz", - // illegal byte sequences - "a\x80z", // trail byte - "a\xc1\x81z", // non-shortest form - "a\xe0\x82\x83z", // non-shortest form - "a\xed\xa0\x80z", // lead surrogate: would be U+D800 - "a\xed\xbf\xbfz", // trail surrogate: would be U+DFFF - "a\xf0\x8f\xbf\xbfz", // non-shortest form - "a\xf4\x90\x80\x80z" // out of range: would be U+110000 + // string with U+FFFD == illegal byte sequence + u8"a\uFFFDz", "a\x80z", // trail byte + u8"a\uFFFD\uFFFDz", "a\xc1\x81z", // non-shortest form + u8"a\uFFFD\uFFFD\uFFFDz", "a\xe0\x82\x83z", // non-shortest form + u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xa0\x80z", // lead surrogate: would be U+D800 + u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xbf\xbfz", // trail surrogate: would be U+DFFF + u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf0\x8f\xbf\xbfz", // non-shortest form + u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf4\x90\x80\x80z" // out of range: would be U+110000 }; - StringPiece fffd(strings[0]); - for(int32_t i = 1; i < UPRV_LENGTHOF(strings); ++i) { - StringPiece illegal(strings[i]); + for(int32_t i = 0; i < UPRV_LENGTHOF(strings); i += 2) { + StringPiece fffd(strings[i]); + StringPiece illegal(strings[i + 1]); 
UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode); if(order != UCOL_EQUAL) { - errln("compareUTF8(U+FFFD, string %d with illegal UTF-8)=%d != UCOL_EQUAL", + errln("compareUTF8(pair %d: U+FFFD, illegal UTF-8)=%d != UCOL_EQUAL", (int)i, order); } } diff --git a/icu4c/source/test/intltest/strtest.cpp b/icu4c/source/test/intltest/strtest.cpp index 9f542e811af..d8fd7a0042a 100644 --- a/icu4c/source/test/intltest/strtest.cpp +++ b/icu4c/source/test/intltest/strtest.cpp @@ -146,7 +146,7 @@ void StringTest::Test_UTF8_COUNT_TRAIL_BYTES() { #if !U_HIDE_OBSOLETE_UTF_OLD_H if(UTF8_COUNT_TRAIL_BYTES(0x7F) != 0 - || UTF8_COUNT_TRAIL_BYTES(0xC0) != 1 + || UTF8_COUNT_TRAIL_BYTES(0xC2) != 1 || UTF8_COUNT_TRAIL_BYTES(0xE0) != 2 || UTF8_COUNT_TRAIL_BYTES(0xF0) != 3) { errln("UTF8_COUNT_TRAIL_BYTES does not work right! See utf_old.h."); @@ -155,7 +155,7 @@ StringTest::Test_UTF8_COUNT_TRAIL_BYTES() { // Note: U8_COUNT_TRAIL_BYTES (current) and UTF8_COUNT_TRAIL_BYTES (deprecated) // have completely different implementations. if (U8_COUNT_TRAIL_BYTES(0x7F) != 0 - || U8_COUNT_TRAIL_BYTES(0xC0) != 1 + || U8_COUNT_TRAIL_BYTES(0xC2) != 1 || U8_COUNT_TRAIL_BYTES(0xE0) != 2 || U8_COUNT_TRAIL_BYTES(0xF0) != 3) { errln("U8_COUNT_TRAIL_BYTES does not work right! 
See utf8.h."); diff --git a/icu4c/source/test/intltest/ustrtest.cpp b/icu4c/source/test/intltest/ustrtest.cpp index a222e2a2905..4b7cb7ae7c7 100644 --- a/icu4c/source/test/intltest/ustrtest.cpp +++ b/icu4c/source/test/intltest/ustrtest.cpp @@ -1881,9 +1881,9 @@ UnicodeStringTest::TestUTF8() { 0xf3, 0xa0, 0x80, 0x80, 0xf4, 0x8f, 0xbf, 0xbf }; static const UChar expected_utf16[] = { - 0x41, 0xfffd, - 0x61, 0xfffd, - 0xfffd, 0x5a, + 0x41, 0xfffd, 0xfffd, 0xfffd, + 0x61, 0xfffd, 0xfffd, 0xfffd, + 0xfffd, 0xfffd, 0xfffd, 0xfffd,0x5a, 0xd900, 0xdc00, 0x7a, 0xd800, 0xdc00, 0xd840, 0xdc00, 0xdb40, 0xdc00, 0xdbff, 0xdfff diff --git a/icu4c/source/test/intltest/utxttest.cpp b/icu4c/source/test/intltest/utxttest.cpp index f2298eaa2e9..2a779b21213 100644 --- a/icu4c/source/test/intltest/utxttest.cpp +++ b/icu4c/source/test/intltest/utxttest.cpp @@ -60,7 +60,6 @@ UTextTest::runIndexedTest(int32_t index, UBool exec, TESTCASE_AUTO(Ticket10562); TESTCASE_AUTO(Ticket10983); TESTCASE_AUTO(Ticket12130); - TESTCASE_AUTO(Ticket12888); TESTCASE_AUTO(Ticket13344); TESTCASE_AUTO_END; } @@ -951,10 +950,14 @@ void UTextTest::ErrorTest() UChar buf[10]; int n = utext_extract(ut, 0, 9, buf, 10, &status); TEST_SUCCESS(status); - TEST_ASSERT(n==5); + TEST_ASSERT(n==7); + TEST_ASSERT(buf[0] == 0x41); TEST_ASSERT(buf[1] == 0xfffd); - TEST_ASSERT(buf[3] == 0xfffd); TEST_ASSERT(buf[2] == 0x42); + TEST_ASSERT(buf[3] == 0xfffd); + TEST_ASSERT(buf[4] == 0xfffd); + TEST_ASSERT(buf[5] == 0xfffd); + TEST_ASSERT(buf[6] == 0x43); utext_close(ut); } @@ -1578,66 +1581,6 @@ void UTextTest::Ticket12130() { utext_close(&ut); } -// Ticket 12888: bad handling of illegal utf-8 containing many instances of the archaic, now illegal, -// six byte utf-8 forms. Original implementation had an assumption that -// there would be at most three utf-8 bytes per UTF-16 code unit. -// The five and six byte sequences map to a single replacement character. 
- -void UTextTest::Ticket12888() { - const char *badString = - "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80" - "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80" - "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80" - "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80" - "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80" - "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80" - "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80" - "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80" - "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80" - "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80" - "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80" - "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80" - "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80" - "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80" - "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80" - "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80" - "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80" - "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80" - "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80" - 
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"; - - UErrorCode status = U_ZERO_ERROR; - LocalUTextPointer ut(utext_openUTF8(NULL, badString, -1, &status)); - TEST_SUCCESS(status); - for (;;) { - UChar32 c = utext_next32(ut.getAlias()); - if (c == U_SENTINEL) { - break; - } - } - int32_t endIdx = utext_getNativeIndex(ut.getAlias()); - if (endIdx != (int32_t)strlen(badString)) { - errln("%s:%d expected=%d, actual=%d", __FILE__, __LINE__, strlen(badString), endIdx); - return; - } - - for (int32_t prevIndex = endIdx; prevIndex>0;) { - UChar32 c = utext_previous32(ut.getAlias()); - int32_t currentIndex = utext_getNativeIndex(ut.getAlias()); - if (c != 0xfffd) { - errln("%s:%d (expected, actual, index) = (%d, %d, %d)\n", - __FILE__, __LINE__, 0xfffd, c, currentIndex); - break; - } - if (currentIndex != prevIndex - 6) { - errln("%s:%d: wrong index. Expected, actual = %d, %d", - __FILE__, __LINE__, prevIndex - 6, currentIndex); - break; - } - prevIndex = currentIndex; - } -} - // Ticket 13344 The macro form of UTEXT_SETNATIVEINDEX failed when target was a trail surrogate // of a supplementary character. diff --git a/icu4c/source/test/intltest/utxttest.h b/icu4c/source/test/intltest/utxttest.h index c0b3145351c..4ed39861160 100644 --- a/icu4c/source/test/intltest/utxttest.h +++ b/icu4c/source/test/intltest/utxttest.h @@ -38,7 +38,6 @@ public: void Ticket10562(); void Ticket10983(); void Ticket12130(); - void Ticket12888(); void Ticket13344(); private: diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/BMPSet.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/BMPSet.java index 879d230ce2d..038477da5e8 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/BMPSet.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/BMPSet.java @@ -16,11 +16,12 @@ import com.ibm.icu.util.OutputInt; /** * Helper class for frozen UnicodeSets, implements contains() and span() optimized for BMP code points. 
- * + * * Latin-1: Look up bytes. * 2-byte characters: Bits organized vertically. * 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF, with mixed for illegal ranges. - * Supplementary characters: Call contains() on the parent set. + * Supplementary characters: Binary search over + * the supplementary part of the parent set's inversion list. */ public final class BMPSet { public static int U16_SURROGATE_OFFSET = ((0xd800 << 10) + 0xdc00 - 0x10000); @@ -34,9 +35,8 @@ public final class BMPSet { * One bit per code point from U+0000..U+07FF. The bits are organized vertically; consecutive code points * correspond to the same bit positions in consecutive table words. With code point parts lead=c{10..6} * trail=c{5..0} it is set.contains(c)==(table7FF[trail] bit lead) - * - * Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD) for faster validity checking at - * runtime. + * + * Bits for 0..FF are unused (0). */ private int[] table7FF; @@ -46,9 +46,8 @@ public final class BMPSet { * t1=c{11..6} test bits (lead+16) and lead in bmpBlockBits[t1]. If the upper bit is 0, then the lower bit * indicates if contains(c) for all code points in the 64-block. If the upper bit is 1, then the block is mixed * and set.contains(c) must be called. - * - * Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to the result of contains(FFFD) for faster - * validity checking at runtime. + * + * Bits for 0..7FF are unused (0). */ private int[] bmpBlockBits; @@ -127,7 +126,7 @@ public final class BMPSet { /** * Span the initial substring for which each character c has spanCondition==contains(c). It must be * spanCondition==0 or 1. - * + * * @param start The start index * @param outCount If not null: Receives the number of code points in the span. * @return the limit (exclusive end) of the span @@ -232,7 +231,7 @@ public final class BMPSet { * Symmetrical with span(). 
* Span the trailing substring for which each character c has spanCondition==contains(c). It must be s.length >= * limit and spanCondition==0 or 1. - * + * * @return The string index which starts the span (i.e. inclusive). */ public final int spanBack(CharSequence s, int limit, SpanCondition spanCondition) { @@ -462,10 +461,10 @@ public final class BMPSet { /** * Same as UnicodeSet.findCodePoint(int c) except that the binary search is restricted for finding code * points in a certain range. - * + * * For restricting the search for finding in the range start..end, pass in lo=findCodePoint(start) and * hi=findCodePoint(end) with 0<=lo<=hi