BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
list(parentList), listLength(parentListLength) {
- uprv_memset(asciiBytes, 0, sizeof(asciiBytes));
+ uprv_memset(latin1Contains, 0, sizeof(latin1Contains));
uprv_memset(table7FF, 0, sizeof(table7FF));
uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
}
list4kStarts[0x11]=listLength-1;
+ containsFFFD=containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10]);
initBits();
overrideIllegal();
}
BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
+ containsFFFD(otherBMPSet.containsFFFD),
list(newParentList), listLength(newParentListLength) {
- uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes));
+ uprv_memcpy(latin1Contains, otherBMPSet.latin1Contains, sizeof(latin1Contains));
uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
UChar32 start, limit;
int32_t listIndex=0;
- // Set asciiBytes[].
+ // Set latin1Contains[].
do {
start=list[listIndex++];
if(listIndex<listLength) {
} else {
limit=0x110000;
}
- if(start>=0x80) {
+ if(start>=0x100) {
break;
}
do {
- asciiBytes[start++]=1;
- } while(start<limit && start<0x80);
- } while(limit<=0x80);
+ latin1Contains[start++]=1;
+ } while(start<limit && start<0x100);
+ } while(limit<=0x100);
+
+ // Find the first range overlapping with (or after) 80..FF again,
+ // to include them in table7FF as well.
+ for(listIndex=0;;) {
+ start=list[listIndex++];
+ if(listIndex<listLength) {
+ limit=list[listIndex++];
+ } else {
+ limit=0x110000;
+ }
+ if(limit>0x80) {
+ if(start<0x80) {
+ start=0x80;
+ }
+ break;
+ }
+ }
// Set table7FF[].
while(start<0x800) {
* for faster validity checking at runtime.
* No need to set 0 values where they were reset to 0 in the constructor
* and not modified by initBits().
- * (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
+ * (table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
* Need to set 0 values for surrogates D800..DFFF.
*/
void BMPSet::overrideIllegal() {
uint32_t bits, mask;
int32_t i;
- if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) {
- // contains(FFFD)==TRUE
- for(i=0x80; i<0xc0; ++i) {
- asciiBytes[i]=1;
- }
-
+ if(containsFFFD) {
bits=3; // Lead bytes 0xC0 and 0xC1.
for(i=0; i<64; ++i) {
table7FF[i]|=bits;
bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
}
} else {
- // contains(FFFD)==FALSE
mask=~(0x10001<<0xd); // Lead byte 0xED.
for(i=32; i<64; ++i) { // Second half of 4k block.
bmpBlockBits[i]&=mask;
UBool
BMPSet::contains(UChar32 c) const {
- if((uint32_t)c<=0x7f) {
- return (UBool)asciiBytes[c];
+ if((uint32_t)c<=0xff) {
+ return (UBool)latin1Contains[c];
} else if((uint32_t)c<=0x7ff) {
return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
} else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) {
// span
do {
c=*s;
- if(c<=0x7f) {
- if(!asciiBytes[c]) {
+ if(c<=0xff) {
+ if(!latin1Contains[c]) {
break;
}
} else if(c<=0x7ff) {
// span not
do {
c=*s;
- if(c<=0x7f) {
- if(asciiBytes[c]) {
+ if(c<=0xff) {
+ if(latin1Contains[c]) {
break;
}
} else if(c<=0x7ff) {
// span
for(;;) {
c=*(--limit);
- if(c<=0x7f) {
- if(!asciiBytes[c]) {
+ if(c<=0xff) {
+ if(!latin1Contains[c]) {
break;
}
} else if(c<=0x7ff) {
// span not
for(;;) {
c=*(--limit);
- if(c<=0x7f) {
- if(asciiBytes[c]) {
+ if(c<=0xff) {
+ if(latin1Contains[c]) {
break;
}
} else if(c<=0x7ff) {
BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
const uint8_t *limit=s+length;
uint8_t b=*s;
- if((int8_t)b>=0) {
+ if(U8_IS_SINGLE(b)) {
// Initial all-ASCII span.
if(spanCondition) {
do {
- if(!asciiBytes[b] || ++s==limit) {
+ if(!latin1Contains[b] || ++s==limit) {
return s;
}
b=*s;
- } while((int8_t)b>=0);
+ } while(U8_IS_SINGLE(b));
} else {
do {
- if(asciiBytes[b] || ++s==limit) {
+ if(latin1Contains[b] || ++s==limit) {
return s;
}
b=*s;
- } while((int8_t)b>=0);
+ } while(U8_IS_SINGLE(b));
}
length=(int32_t)(limit-s);
}
// single trail byte, check for preceding 3- or 4-byte lead byte
if(length>=2 && (b=*(limit-2))>=0xe0) {
limit-=2;
- if(asciiBytes[0x80]!=spanCondition) {
+ if(containsFFFD!=spanCondition) {
limit0=limit;
}
} else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
// 4-byte lead byte with only two trail bytes
limit-=3;
- if(asciiBytes[0x80]!=spanCondition) {
+ if(containsFFFD!=spanCondition) {
limit0=limit;
}
}
} else {
// lead byte with no trail bytes
--limit;
- if(asciiBytes[0x80]!=spanCondition) {
+ if(containsFFFD!=spanCondition) {
limit0=limit;
}
}
while(s<limit) {
b=*s;
- if(b<0xc0) {
- // ASCII; or trail bytes with the result of contains(FFFD).
+ if(U8_IS_SINGLE(b)) {
+ // ASCII
if(spanCondition) {
do {
- if(!asciiBytes[b]) {
+ if(!latin1Contains[b]) {
return s;
} else if(++s==limit) {
return limit0;
}
b=*s;
- } while(b<0xc0);
+ } while(U8_IS_SINGLE(b));
} else {
do {
- if(asciiBytes[b]) {
+ if(latin1Contains[b]) {
return s;
} else if(++s==limit) {
return limit0;
}
b=*s;
- } while(b<0xc0);
+ } while(U8_IS_SINGLE(b));
}
}
++s; // Advance past the lead byte.
UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
if( ( (0x10000<=c && c<=0x10ffff) ?
containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
- asciiBytes[0x80]
+ containsFFFD
) != spanCondition
) {
return s-1;
s+=3;
continue;
}
- } else /* 0xc0<=b<0xe0 */ {
+ } else {
if( /* handle U+0000..U+07FF inline */
+ b>=0xc0 &&
(t1=(uint8_t)(*s-0x80)) <= 0x3f
) {
if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
// Give an illegal sequence the same value as the result of contains(FFFD).
// Handle each byte of an illegal sequence separately to simplify the code;
// no need to optimize error handling.
- if(asciiBytes[0x80]!=spanCondition) {
+ if(containsFFFD!=spanCondition) {
return s-1;
}
}
do {
b=s[--length];
- if((int8_t)b>=0) {
+ if(U8_IS_SINGLE(b)) {
// ASCII sub-span
if(spanCondition) {
do {
- if(!asciiBytes[b]) {
+ if(!latin1Contains[b]) {
return length+1;
} else if(length==0) {
return 0;
}
b=s[--length];
- } while((int8_t)b>=0);
+ } while(U8_IS_SINGLE(b));
} else {
do {
- if(asciiBytes[b]) {
+ if(latin1Contains[b]) {
return length+1;
} else if(length==0) {
return 0;
}
b=s[--length];
- } while((int8_t)b>=0);
+ } while(U8_IS_SINGLE(b));
}
}
* Helper class for frozen UnicodeSets, implements contains() and span()
* optimized for BMP code points. Structured to be UTF-8-friendly.
*
- * ASCII: Look up bytes.
+ * Latin-1: Look up bytes.
* 2-byte characters: Bits organized vertically.
* 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF,
* with mixed for illegal ranges.
- * Supplementary characters: Call contains() on the parent set.
+ * Supplementary characters: Binary search over
+ * the supplementary part of the parent set's inversion list.
*/
class BMPSet : public UMemory {
public:
inline UBool containsSlow(UChar32 c, int32_t lo, int32_t hi) const;
/*
- * One byte per ASCII character, or trail byte in lead position.
- * 0 or 1 for ASCII characters.
- * The value for trail bytes is the result of contains(FFFD)
- * for faster validity checking at runtime.
+ * One byte 0 or 1 per Latin-1 character.
*/
- UBool asciiBytes[0xc0];
+ UBool latin1Contains[0x100];
+
+ /* TRUE if contains(U+FFFD). */
+ UBool containsFFFD;
/*
* One bit per code point from U+0000..U+07FF.
* This file defines macros for checking whether a code point is
* a surrogate or a non-character etc.
*
- * The UChar and UChar32 data types for Unicode code units and code points
- * are defined in umachine.h because they can be machine-dependent.
- *
* If U_NO_DEFAULT_INCLUDE_UTF_HEADERS is 0 then utf.h is included by utypes.h
* and itself includes utf8.h and utf16.h after some
* common definitions.
* but are optimized for the much more frequently occurring BMP code points.
*
* umachine.h defines UChar to be an unsigned 16-bit integer.
- * Where available, UChar is defined to be a char16_t
- * or a wchar_t (if that is an unsigned 16-bit type), otherwise uint16_t.
+ * Since ICU 59, ICU uses char16_t in C++, UChar only in C,
+ * and defines UChar=char16_t by default. See the UChar API docs for details.
*
* UChar32 is defined to be a signed 32-bit integer (int32_t), large enough for a 21-bit
- * Unicode code point (Unicode scalar value, 0..0x10ffff).
+ * Unicode code point (Unicode scalar value, 0..0x10ffff) and U_SENTINEL (-1).
* Before ICU 2.4, the definition of UChar32 was similarly platform-dependent as
* the definition of UChar. For details see the documentation for UChar32 itself.
*
* For actual Unicode character properties see uchar.h.
*
* By default, string operations must be done with error checking in case
- * a string is not well-formed UTF-16.
- * The macros will detect if a surrogate code unit is unpaired
+ * a string is not well-formed UTF-16 or UTF-8.
+ *
+ * The U16_ macros detect if a surrogate code unit is unpaired
* (lead unit without trail unit or vice versa) and just return the unit itself
* as the code point.
*
+ * The U8_ macros detect illegal byte sequences and return a negative value.
+ * Starting with ICU 60, the observable length of a single illegal byte sequence
+ * skipped by one of these macros follows the Unicode 6+ recommendation
+ * which is consistent with the W3C Encoding Standard.
+ *
+ * There are ..._OR_FFFD versions of both U16_ and U8_ macros
+ * that return U+FFFD for illegal code unit sequences.
+ *
* The regular "safe" macros require that the initial, passed-in string index
* is within bounds. They only check the index when they read more than one
* code unit. This is usually done with code similar to the following loop:
* The performance differences are much larger here because UTF-8 provides so
* many opportunities for malformed sequences.
* The unsafe UTF-8 macros are entirely implemented inside the macro definitions
- * and are fast, while the safe UTF-8 macros call functions for all but the
- * trivial (ASCII) cases.
- * (ICU 3.6 optimizes U8_NEXT() and U8_APPEND() to handle most other common
- * characters inline as well.)
+ * and are fast, while the safe UTF-8 macros call functions for some complicated cases.
*
* Unlike with UTF-16, malformed sequences cannot be expressed with distinct
* code point values (0..U+10ffff). They are indicated with negative values instead.
*/
#define U_IS_UNICODE_NONCHAR(c) \
((c)>=0xfdd0 && /* lower bound of the first noncharacter range; false for every smaller value */ \
- ((uint32_t)(c)<=0xfdef || ((c)&0xfffe)==0xfffe) && \
- (uint32_t)(c)<=0x10ffff)
+ ((c)<=0xfdef || ((c)&0xfffe)==0xfffe) && (c)<=0x10ffff) /* FDD0..FDEF, or low 16 bits equal to FFFE/FFFF, and not above 10FFFF */
/**
* Is c a Unicode code point value (0..U+10ffff)
*/
#define U_IS_UNICODE_CHAR(c) \
((uint32_t)(c)<0xd800 || /* unsigned compare: accepts 0..D7FF only, a negative c becomes a large value */ \
- ((uint32_t)(c)>0xdfff && \
- (uint32_t)(c)<=0x10ffff && \
- !U_IS_UNICODE_NONCHAR(c)))
+ (0xdfff<(c) && (c)<=0x10ffff && !U_IS_UNICODE_NONCHAR(c))) /* or: above the surrogate range D800..DFFF, at most 10FFFF, and not a noncharacter */
/**
* Is this code point a BMP code point (U+0000..U+ffff)?
/* internal definitions ----------------------------------------------------- */
-
-
/**
* Counts the trail bytes for a UTF-8 lead byte.
- * Returns 0 for 0..0xbf as well as for 0xfe and 0xff.
+ * Returns 0 for 0..0xc1 as well as for 0xf5..0xff.
+ * leadByte might be evaluated multiple times.
*
* This is internal since it is not meant to be called directly by external clients;
* however it is called by public macros in this file and thus must remain stable.
*
- * Note: Beginning with ICU 50, the implementation uses a multi-condition expression
- * which was shown in 2012 (on x86-64) to compile to fast, branch-free code.
- * leadByte is evaluated multiple times.
- *
- * The pre-ICU 50 implementation used the exported array utf8_countTrailBytes:
- * #define U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[leadByte])
- * leadByte was evaluated exactly once.
- *
* @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
* @internal
*/
#define U8_COUNT_TRAIL_BYTES(leadByte) \
- ((uint8_t)(leadByte)<0xf0 ? \
- ((uint8_t)(leadByte)>=0xc0)+((uint8_t)(leadByte)>=0xe0) : \
- (uint8_t)(leadByte)<0xfe ? 3+((uint8_t)(leadByte)>=0xf8)+((uint8_t)(leadByte)>=0xfc) : 0)
+ ((uint8_t)(leadByte)<=0xf4 ? /* bytes F5..FF yield 0 via the else branch */ \
+ ((uint8_t)(leadByte)>=0xc2)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0) : 0) /* sum of three 0/1 comparisons: 00..C1 -> 0, C2..DF -> 1, E0..EF -> 2, F0..F4 -> 3 */
/**
* Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence.
- * The maximum supported lead byte is 0xf4 corresponding to U+10FFFF.
+ * Returns 0 for 0..0xc1. Undefined for 0xf5..0xff.
* leadByte might be evaluated multiple times.
*
* This is internal since it is not meant to be called directly by external clients;
* @internal
*/
#define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \
- (((leadByte)>=0xc0)+((leadByte)>=0xe0)+((leadByte)>=0xf0))
+ (((uint8_t)(leadByte)>=0xc2)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0)) /* no upper-bound test: F5..FF also give 3; each use casts to uint8_t, so a signed char argument compares as 0..FF */
/**
* Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
*/
#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1) /* modifies leadByte in place (so it must be an lvalue): keeps only its low (6-countTrailBytes) bits */
+/**
+ * Internal bit vector for 3-byte UTF-8 validity check.
+ * Lead byte E0..EF bits 3..0 as byte index,
+ * first trail byte bits 7..5 as bit index into that byte.
+ * A set bit marks a valid lead/first-trail combination:
+ * bit 4 stands for trail bytes 80..9F, bit 5 for trail bytes A0..BF.
+ * E0 (0x20) accepts only A0..BF, which excludes overlong forms below U+0800;
+ * ED (0x10) accepts only 80..9F, which excludes the surrogates U+D800..U+DFFF;
+ * every other lead byte (0x30) accepts 80..BF.
+ * @internal
+ */
+#define U8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"
+
+/**
+ * Internal 3-byte UTF-8 validity check.
+ * Looks up U8_LEAD3_T1_BITS and yields nonzero if t1 is an acceptable
+ * first trail byte after lead; the result is the tested bit, not normalized to 0/1.
+ * Only bits 3..0 of lead are used, so the macro itself does not check
+ * that lead is in E0..EF.
+ * @internal
+ */
+#define U8_IS_VALID_LEAD3_AND_T1(lead, t1) (U8_LEAD3_T1_BITS[(lead)&0xf]&(1<<((uint8_t)(t1)>>5)))
+
+/**
+ * Internal bit vector for 4-byte UTF-8 validity check.
+ * First trail byte bits 7..4 as byte index,
+ * lead byte F0..F4 bits 2..0 as bit index into that byte.
+ * A set bit marks a valid lead/first-trail combination:
+ * trail bytes 80..8F (0x1E) are valid after F1..F4 but not after F0,
+ * which would be an overlong form below U+10000;
+ * trail bytes 90..BF (0x0F) are valid after F0..F3 but not after F4,
+ * which would encode a value above U+10FFFF.
+ * All other entries are 0 because those byte values are not trail bytes.
+ * @internal
+ */
+#define U8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"
+
+/**
+ * Internal 4-byte UTF-8 validity check.
+ * Looks up U8_LEAD4_T1_BITS and yields nonzero if t1 is an acceptable
+ * first trail byte after lead; the result is the tested bit, not normalized to 0/1.
+ * Only bits 2..0 of lead are used, so the macro itself does not check
+ * that lead is a 4-byte lead byte.
+ * @internal
+ */
+#define U8_IS_VALID_LEAD4_AND_T1(lead, t1) (U8_LEAD4_T1_BITS[(uint8_t)(t1)>>4]&(1<<((lead)&7)))
+
/**
* Function for handling "next code point" with error-checking.
*
* @return TRUE or FALSE
* @stable ICU 2.4
*/
-#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc0)<0x3e)
+#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc2)<=0x32)
+// 0x32=0xf4-0xc2, so this single unsigned comparison is true exactly for bytes 0xc2..0xf4.
/**
* Is this code unit (byte) a UTF-8 trail byte?
* @return TRUE or FALSE
* @stable ICU 2.4
*/
-#define U8_IS_TRAIL(c) (((c)&0xc0)==0x80)
+#define U8_IS_TRAIL(c) ((int8_t)(c)<-0x40) /* as int8_t, bytes 0x80..0xbf are -128..-65, i.e. the only values below -0x40 */
/**
* How many code units (bytes) are used for the UTF-8 encoding
*/
#define U8_NEXT_UNSAFE(s, i, c) { \
(c)=(uint8_t)(s)[(i)++]; \
- if((c)>=0x80) { \
+ if(!U8_IS_SINGLE(c)) { \
if((c)<0xe0) { \
(c)=(((c)&0x1f)<<6)|((s)[(i)++]&0x3f); \
} else if((c)<0xf0) { \
*/
#define U8_NEXT(s, i, length, c) { \
(c)=(uint8_t)(s)[(i)++]; \
- if((c)>=0x80) { \
+ if(!U8_IS_SINGLE(c)) { \
uint8_t __t1, __t2; \
- if( /* handle U+1000..U+CFFF inline */ \
- (0xe0<(c) && (c)<=0xec) && \
- (((i)+1)<(length) || (length)<0) && \
- (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \
- (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \
- ) { \
- /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
- (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \
+ if( /* handle U+0800..U+FFFF inline */ \
+ (0xe0<=(c) && (c)<0xf0) && \
+ (((i)+1)<(length) || (length)<0) && \
+ U8_IS_VALID_LEAD3_AND_T1((c), __t1=(s)[i]) && \
+ (__t2=(s)[(i)+1]-0x80)<=0x3f) { \
+ (c)=(((c)&0xf)<<12)|((__t1&0x3f)<<6)|__t2; \
(i)+=2; \
} else if( /* handle U+0080..U+07FF inline */ \
- ((c)<0xe0 && (c)>=0xc2) && \
- ((i)!=(length)) && \
- (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \
- ) { \
+ ((c)<0xe0 && (c)>=0xc2) && \
+ ((i)!=(length)) && \
+ (__t1=(s)[i]-0x80)<=0x3f) { \
(c)=(((c)&0x1f)<<6)|__t1; \
++(i); \
} else { \
*/
#define U8_NEXT_OR_FFFD(s, i, length, c) { \
(c)=(uint8_t)(s)[(i)++]; \
- if((c)>=0x80) { \
+ if(!U8_IS_SINGLE(c)) { \
uint8_t __t1, __t2; \
- if( /* handle U+1000..U+CFFF inline */ \
- (0xe0<(c) && (c)<=0xec) && \
- (((i)+1)<(length) || (length)<0) && \
- (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \
- (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \
- ) { \
- /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
- (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \
+ if( /* handle U+0800..U+FFFF inline */ \
+ (0xe0<=(c) && (c)<0xf0) && \
+ (((i)+1)<(length) || (length)<0) && \
+ U8_IS_VALID_LEAD3_AND_T1((c), __t1=(s)[i]) && \
+ (__t2=(s)[(i)+1]-0x80)<=0x3f) { \
+ (c)=(((c)&0xf)<<12)|((__t1&0x3f)<<6)|__t2; \
(i)+=2; \
} else if( /* handle U+0080..U+07FF inline */ \
- ((c)<0xe0 && (c)>=0xc2) && \
- ((i)!=(length)) && \
- (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \
- ) { \
+ ((c)<0xe0 && (c)>=0xc2) && \
+ ((i)!=(length)) && \
+ (__t1=(s)[i]-0x80)<=0x3f) { \
(c)=(((c)&0x1f)<<6)|__t1; \
++(i); \
} else { \
* @stable ICU 2.4
*/
#define U8_FWD_1_UNSAFE(s, i) { \
- (i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((uint8_t)(s)[i]); \
+ (i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((s)[i]); /* step over the byte at i plus the trail-byte count derived from it; no bounds or validity checks */ \
}
/**
* @stable ICU 2.4
*/
#define U8_FWD_1(s, i, length) { \
- uint8_t __b=(uint8_t)(s)[(i)++]; \
- if(U8_IS_LEAD(__b)) { \
- uint8_t __count=U8_COUNT_TRAIL_BYTES(__b); \
- if((i)+__count>(length) && (length)>=0) { \
- __count=(uint8_t)((length)-(i)); \
- } \
- while(__count>0 && U8_IS_TRAIL((s)[i])) { \
- ++(i); \
- --__count; \
+ uint8_t __b=(s)[(i)++]; /* the first byte is always consumed */ \
+ if(U8_IS_LEAD(__b) && (i)!=(length)) { /* otherwise the step is exactly one byte */ \
+ uint8_t __t1=(s)[i]; \
+ if((0xe0<=__b && __b<0xf0)) { /* 3-byte lead: validate lead+first trail together, then one more trail byte */ \
+ if(U8_IS_VALID_LEAD3_AND_T1(__b, __t1) && \
+ ++(i)!=(length) && U8_IS_TRAIL((s)[i])) { /* each ++(i) runs only after the preceding byte was accepted */ \
+ ++(i); \
+ } \
+ } else if(__b<0xe0) { /* 2-byte lead: one trail byte */ \
+ if(U8_IS_TRAIL(__t1)) { \
+ ++(i); \
+ } \
+ } else /* __b>=0xf0: 4-byte lead, validate lead+first trail together, then two more trail bytes */ { \
+ if(U8_IS_VALID_LEAD4_AND_T1(__b, __t1) && \
+ ++(i)!=(length) && U8_IS_TRAIL((s)[i]) && \
+ ++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
+ ++(i); \
+ } \
} \
} \
}
/* c is a trail byte */ \
(c)&=0x3f; \
for(;;) { \
- __b=(uint8_t)(s)[--(i)]; \
+ __b=(s)[--(i)]; \
if(__b>=0xc0) { \
U8_MASK_LEAD_BYTE(__b, __count); \
(c)|=(UChar32)__b<<__shift; \
*/
#define U8_PREV(s, start, i, c) { \
(c)=(uint8_t)(s)[--(i)]; /* step i back one byte and read that byte */ \
- if((c)>=0x80) { \
+ if(!U8_IS_SINGLE(c)) { /* not a single-byte code point */ \
(c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); /* helper receives &(i) and the byte just read; its result becomes c */ \
} \
}
*/
#define U8_PREV_OR_FFFD(s, start, i, c) { \
(c)=(uint8_t)(s)[--(i)]; /* step i back one byte and read that byte */ \
- if((c)>=0x80) { \
+ if(!U8_IS_SINGLE(c)) { /* not a single-byte code point */ \
(c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -3); /* same call as U8_PREV except the last argument is -3; NOTE(review): presumably selects U+FFFD for illegal sequences - confirm against utf8_prevCharSafeBody */ \
} \
}
static inline int32_t
spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
UChar32 c=*s;
- if((int8_t)c>=0) {
+ if(U8_IS_SINGLE(c)) {
return set.contains(c) ? 1 : -1;
}
// Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
static inline int32_t
spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
UChar32 c=s[length-1];
- if((int8_t)c>=0) {
+ if(U8_IS_SINGLE(c)) {
return set.contains(c) ? 1 : -1;
}
int32_t i=length-1;
// Try to match if the increment is not listed already.
// Match at code point boundaries. (The UTF-8 strings were converted
// from UTF-16 and are guaranteed to be well-formed.)
- if( !U8_IS_TRAIL(s[pos-overlap]) &&
- !offsets.containsOffset(inc) &&
- matches8(s+pos-overlap, s8, length8)
-
- ) {
+ if(!U8_IS_TRAIL(s[pos-overlap]) &&
+ !offsets.containsOffset(inc) &&
+ matches8(s+pos-overlap, s8, length8)) {
if(inc==rest) {
return length; // Reached the end of the string.
}
// Try to match if the string is longer or starts earlier.
// Match at code point boundaries. (The UTF-8 strings were converted
// from UTF-16 and are guaranteed to be well-formed.)
- if( !U8_IS_TRAIL(s[pos-overlap]) &&
- (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
- matches8(s+pos-overlap, s8, length8)
-
- ) {
+ if(!U8_IS_TRAIL(s[pos-overlap]) &&
+ (overlap>maxOverlap ||
+ /* redundant overlap==maxOverlap && */ inc>maxInc) &&
+ matches8(s+pos-overlap, s8, length8)) {
maxInc=inc; // Longest match from earliest start.
maxOverlap=overlap;
break;
pErrorCode);
}
-/* for utf8_nextCharSafeBodyTerminated() */
-static const UChar32
-utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
-
-/*
- * Version of utf8_nextCharSafeBody() with the following differences:
- * - checks for NUL termination instead of length
- * - works with pointers instead of indexes
- * - always strict (strict==-1)
- *
- * *ps points to after the lead byte and will be moved to after the last trail byte.
- * c is the lead byte.
- * @return the code point, or U_SENTINEL
- */
-static UChar32
-utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
- const uint8_t *s=*ps;
- uint8_t trail, illegal=0;
- uint8_t count=U8_COUNT_TRAIL_BYTES(c);
- U_ASSERT(count<6);
- U8_MASK_LEAD_BYTE((c), count);
- /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
- switch(count) {
- /* each branch falls through to the next one */
- case 5:
- case 4:
- /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
- illegal=1;
- break;
- case 3:
- trail=(uint8_t)(*s++ - 0x80);
- c=(c<<6)|trail;
- if(trail>0x3f || c>=0x110) {
- /* not a trail byte, or code point>0x10ffff (outside Unicode) */
- illegal=1;
- break;
- }
- U_FALLTHROUGH;
- case 2:
- trail=(uint8_t)(*s++ - 0x80);
- if(trail>0x3f) {
- /* not a trail byte */
- illegal=1;
- break;
- }
- c=(c<<6)|trail;
- U_FALLTHROUGH;
- case 1:
- trail=(uint8_t)(*s++ - 0x80);
- if(trail>0x3f) {
- /* not a trail byte */
- illegal=1;
- }
- c=(c<<6)|trail;
- break;
- case 0:
- return U_SENTINEL;
- /* no default branch to optimize switch() - all values are covered */
- }
-
- /* correct sequence - all trail bytes have (b7..b6)==(10)? */
- /* illegal is also set if count>=4 */
- if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
- /* error handling */
- /* don't go beyond this sequence */
- s=*ps;
- while(count>0 && U8_IS_TRAIL(*s)) {
- ++s;
- --count;
- }
- c=U_SENTINEL;
- }
- *ps=s;
- return c;
-}
-
-/*
- * Version of utf8_nextCharSafeBody() with the following differences:
- * - works with pointers instead of indexes
- * - always strict (strict==-1)
- *
- * *ps points to after the lead byte and will be moved to after the last trail byte.
- * c is the lead byte.
- * @return the code point, or U_SENTINEL
- */
-static UChar32
-utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
- const uint8_t *s=*ps;
- uint8_t trail, illegal=0;
- uint8_t count=U8_COUNT_TRAIL_BYTES(c);
- if((limit-s)>=count) {
- U8_MASK_LEAD_BYTE((c), count);
- /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
- switch(count) {
- /* each branch falls through to the next one */
- case 5:
- case 4:
- /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
- illegal=1;
- break;
- case 3:
- trail=*s++;
- c=(c<<6)|(trail&0x3f);
- if(c<0x110) {
- illegal|=(trail&0xc0)^0x80;
- } else {
- /* code point>0x10ffff, outside Unicode */
- illegal=1;
- break;
- }
- U_FALLTHROUGH;
- case 2:
- trail=*s++;
- c=(c<<6)|(trail&0x3f);
- illegal|=(trail&0xc0)^0x80;
- U_FALLTHROUGH;
- case 1:
- trail=*s++;
- c=(c<<6)|(trail&0x3f);
- illegal|=(trail&0xc0)^0x80;
- break;
- case 0:
- return U_SENTINEL;
- /* no default branch to optimize switch() - all values are covered */
- }
- } else {
- illegal=1; /* too few bytes left */
- }
-
- /* correct sequence - all trail bytes have (b7..b6)==(10)? */
- /* illegal is also set if count>=4 */
- U_ASSERT(illegal || count<UPRV_LENGTHOF(utf8_minLegal));
- if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
- /* error handling */
- /* don't go beyond this sequence */
- s=*ps;
- while(count>0 && s<limit && U8_IS_TRAIL(*s)) {
- ++s;
- --count;
- }
- c=U_SENTINEL;
- }
- *ps=s;
- return c;
-}
-
U_CAPI UChar* U_EXPORT2
u_strFromUTF8WithSub(UChar *dest,
int32_t destCapacity,
int32_t srcLength,
UChar32 subchar, int32_t *pNumSubstitutions,
UErrorCode *pErrorCode){
- UChar *pDest = dest;
- UChar *pDestLimit = dest+destCapacity;
- UChar32 ch;
- int32_t reqLength = 0;
- const uint8_t* pSrc = (const uint8_t*) src;
- uint8_t t1, t2; /* trail bytes */
- int32_t numSubstitutions;
-
/* args check */
- if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
+ if(U_FAILURE(*pErrorCode)) {
return NULL;
}
-
if( (src==NULL && srcLength!=0) || srcLength < -1 ||
(destCapacity<0) || (dest == NULL && destCapacity > 0) ||
subchar > 0x10ffff || U_IS_SURROGATE(subchar)
if(pNumSubstitutions!=NULL) {
*pNumSubstitutions=0;
}
- numSubstitutions=0;
+ UChar *pDest = dest;
+ UChar *pDestLimit = dest+destCapacity;
+ int32_t reqLength = 0;
+ int32_t numSubstitutions=0;
/*
* Inline processing of UTF-8 byte sequences:
* The code explicitly checks for NULs only in the lead byte position.
* A NUL byte in the trail byte position fails the trail byte range check anyway.
*/
- while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
- if(ch <= 0x7f){
- *pDest++=(UChar)ch;
- ++pSrc;
+ int32_t i;
+ UChar32 c;
+ for(i = 0; (c = (uint8_t)src[i]) != 0 && (pDest < pDestLimit);) {
+ // modified copy of U8_NEXT()
+ ++i;
+ if(U8_IS_SINGLE(c)) {
+ *pDest++=(UChar)c;
} else {
- if(ch > 0xe0) {
- if( /* handle U+1000..U+CFFF inline */
- ch <= 0xec &&
- (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
- (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
- ) {
- /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
- *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
- pSrc += 3;
- continue;
- }
- } else if(ch < 0xe0) {
- if( /* handle U+0080..U+07FF inline */
- ch >= 0xc2 &&
- (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
- ) {
- *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
- pSrc += 2;
- continue;
- }
- }
-
- /* function call for "complicated" and error cases */
- ++pSrc; /* continue after the lead byte */
- ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
- if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
- *pErrorCode = U_INVALID_CHAR_FOUND;
- return NULL;
- } else if(ch<=0xFFFF) {
- *(pDest++)=(UChar)ch;
+ uint8_t __t1, __t2;
+ if( /* handle U+0800..U+FFFF inline */
+ (0xe0<=(c) && (c)<0xf0) &&
+ U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
+ (__t2=src[(i)+1]-0x80)<=0x3f) {
+ *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
+ i+=2;
+ } else if( /* handle U+0080..U+07FF inline */
+ ((c)<0xe0 && (c)>=0xc2) &&
+ (__t1=src[i]-0x80)<=0x3f) {
+ *pDest++ = (((c)&0x1f)<<6)|__t1;
+ ++(i);
} else {
- *(pDest++)=U16_LEAD(ch);
- if(pDest<pDestLimit) {
- *(pDest++)=U16_TRAIL(ch);
+ /* function call for "complicated" and error cases */
+ (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1);
+ if(c<0 && (++numSubstitutions, c = subchar) < 0) {
+ *pErrorCode = U_INVALID_CHAR_FOUND;
+ return NULL;
+ } else if(c<=0xFFFF) {
+ *(pDest++)=(UChar)c;
} else {
- reqLength++;
- break;
+ *(pDest++)=U16_LEAD(c);
+ if(pDest<pDestLimit) {
+ *(pDest++)=U16_TRAIL(c);
+ } else {
+ reqLength++;
+ break;
+ }
}
}
}
}
/* Pre-flight the rest of the string. */
- while((ch = *pSrc) != 0) {
- if(ch <= 0x7f){
+ while((c = (uint8_t)src[i]) != 0) {
+ // modified copy of U8_NEXT()
+ ++i;
+ if(U8_IS_SINGLE(c)) {
++reqLength;
- ++pSrc;
} else {
- if(ch > 0xe0) {
- if( /* handle U+1000..U+CFFF inline */
- ch <= 0xec &&
- (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
- (uint8_t)(pSrc[2] - 0x80) <= 0x3f
- ) {
- ++reqLength;
- pSrc += 3;
- continue;
- }
- } else if(ch < 0xe0) {
- if( /* handle U+0080..U+07FF inline */
- ch >= 0xc2 &&
- (uint8_t)(pSrc[1] - 0x80) <= 0x3f
- ) {
- ++reqLength;
- pSrc += 2;
- continue;
+ uint8_t __t1, __t2;
+ if( /* handle U+0800..U+FFFF inline */
+ (0xe0<=(c) && (c)<0xf0) &&
+ U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
+ (__t2=src[(i)+1]-0x80)<=0x3f) {
+ ++reqLength;
+ i+=2;
+ } else if( /* handle U+0080..U+07FF inline */
+ ((c)<0xe0 && (c)>=0xc2) &&
+ (__t1=src[i]-0x80)<=0x3f) {
+ ++reqLength;
+ ++(i);
+ } else {
+ /* function call for "complicated" and error cases */
+ (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1);
+ if(c<0 && (++numSubstitutions, c = subchar) < 0) {
+ *pErrorCode = U_INVALID_CHAR_FOUND;
+ return NULL;
}
+ reqLength += U16_LENGTH(c);
}
-
- /* function call for "complicated" and error cases */
- ++pSrc; /* continue after the lead byte */
- ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
- if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
- *pErrorCode = U_INVALID_CHAR_FOUND;
- return NULL;
- }
- reqLength += U16_LENGTH(ch);
}
}
} else /* srcLength >= 0 */ {
- const uint8_t *pSrcLimit = pSrc + srcLength;
- int32_t count;
-
- /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
+ /* Faster loop without ongoing checking for srcLength and pDestLimit. */
+ int32_t i = 0;
+ UChar32 c;
for(;;) {
/*
* Each iteration of the inner loop progresses by at most 3 UTF-8
* For supplementary code points (4 & 2), which are rare,
* there is an additional adjustment.
*/
- count = (int32_t)(pDestLimit - pDest);
- srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
- if(count > srcLength) {
- count = srcLength; /* min(remaining dest, remaining src/3) */
+ int32_t count = (int32_t)(pDestLimit - pDest);
+ int32_t count2 = (srcLength - i) / 3;
+ if(count > count2) {
+ count = count2; /* min(remaining dest, remaining src/3) */
}
if(count < 3) {
/*
}
do {
- ch = *pSrc;
- if(ch <= 0x7f){
- *pDest++=(UChar)ch;
- ++pSrc;
+ // modified copy of U8_NEXT()
+ c = (uint8_t)src[i++];
+ if(U8_IS_SINGLE(c)) {
+ *pDest++=(UChar)c;
} else {
- if(ch > 0xe0) {
- if( /* handle U+1000..U+CFFF inline */
- ch <= 0xec &&
- (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
- (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
- ) {
- /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
- *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
- pSrc += 3;
- continue;
- }
- } else if(ch < 0xe0) {
- if( /* handle U+0080..U+07FF inline */
- ch >= 0xc2 &&
- (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
- ) {
- *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
- pSrc += 2;
- continue;
+ uint8_t __t1, __t2;
+ if( /* handle U+0800..U+FFFF inline */
+ (0xe0<=(c) && (c)<0xf0) &&
+ ((i)+1)<srcLength &&
+ U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
+ (__t2=src[(i)+1]-0x80)<=0x3f) {
+ *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
+ i+=2;
+ } else if( /* handle U+0080..U+07FF inline */
+ ((c)<0xe0 && (c)>=0xc2) &&
+ ((i)!=srcLength) &&
+ (__t1=src[i]-0x80)<=0x3f) {
+ *pDest++ = (((c)&0x1f)<<6)|__t1;
+ ++(i);
+ } else {
+ if(c >= 0xf0 || subchar > 0xffff) {
+ // We may read up to four bytes and write up to two UChars,
+ // which we didn't account for when computing count,
+ // so we adjust it here.
+ if(--count == 0) {
+ --i; // back out byte c
+ break;
+ }
}
- }
- if(ch >= 0xf0 || subchar > 0xffff) {
- /*
- * We may read up to six bytes and write up to two UChars,
- * which we didn't account for with computing count,
- * so we adjust it here.
- */
- if(--count == 0) {
- break;
+ /* function call for "complicated" and error cases */
+ (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
+ if(c<0 && (++numSubstitutions, c = subchar) < 0) {
+ *pErrorCode = U_INVALID_CHAR_FOUND;
+ return NULL;
+ } else if(c<=0xFFFF) {
+ *(pDest++)=(UChar)c;
+ } else {
+ *(pDest++)=U16_LEAD(c);
+ *(pDest++)=U16_TRAIL(c);
}
}
-
- /* function call for "complicated" and error cases */
- ++pSrc; /* continue after the lead byte */
- ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
- if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
- *pErrorCode = U_INVALID_CHAR_FOUND;
- return NULL;
- }else if(ch<=0xFFFF){
- *(pDest++)=(UChar)ch;
- }else{
- *(pDest++)=U16_LEAD(ch);
- *(pDest++)=U16_TRAIL(ch);
- }
}
} while(--count > 0);
}
- while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
- ch = *pSrc;
- if(ch <= 0x7f){
- *pDest++=(UChar)ch;
- ++pSrc;
+ while(i < srcLength && (pDest < pDestLimit)) {
+ // modified copy of U8_NEXT()
+ c = (uint8_t)src[i++];
+ if(U8_IS_SINGLE(c)) {
+ *pDest++=(UChar)c;
} else {
- if(ch > 0xe0) {
- if( /* handle U+1000..U+CFFF inline */
- ch <= 0xec &&
- ((pSrcLimit - pSrc) >= 3) &&
- (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
- (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
- ) {
- /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
- *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
- pSrc += 3;
- continue;
- }
- } else if(ch < 0xe0) {
- if( /* handle U+0080..U+07FF inline */
- ch >= 0xc2 &&
- ((pSrcLimit - pSrc) >= 2) &&
- (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
- ) {
- *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
- pSrc += 2;
- continue;
- }
- }
-
- /* function call for "complicated" and error cases */
- ++pSrc; /* continue after the lead byte */
- ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
- if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
- *pErrorCode = U_INVALID_CHAR_FOUND;
- return NULL;
- }else if(ch<=0xFFFF){
- *(pDest++)=(UChar)ch;
- }else{
- *(pDest++)=U16_LEAD(ch);
- if(pDest<pDestLimit){
- *(pDest++)=U16_TRAIL(ch);
- }else{
- reqLength++;
- break;
+ uint8_t __t1, __t2;
+ if( /* handle U+0800..U+FFFF inline */
+ (0xe0<=(c) && (c)<0xf0) &&
+ ((i)+1)<srcLength &&
+ U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
+ (__t2=src[(i)+1]-0x80)<=0x3f) {
+ *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
+ i+=2;
+ } else if( /* handle U+0080..U+07FF inline */
+ ((c)<0xe0 && (c)>=0xc2) &&
+ ((i)!=srcLength) &&
+ (__t1=src[i]-0x80)<=0x3f) {
+ *pDest++ = (((c)&0x1f)<<6)|__t1;
+ ++(i);
+ } else {
+ /* function call for "complicated" and error cases */
+ (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
+ if(c<0 && (++numSubstitutions, c = subchar) < 0) {
+ *pErrorCode = U_INVALID_CHAR_FOUND;
+ return NULL;
+ } else if(c<=0xFFFF) {
+ *(pDest++)=(UChar)c;
+ } else {
+ *(pDest++)=U16_LEAD(c);
+ if(pDest<pDestLimit) {
+ *(pDest++)=U16_TRAIL(c);
+ } else {
+ reqLength++;
+ break;
+ }
}
}
}
}
- /* do not fill the dest buffer just count the UChars needed */
- while(pSrc < pSrcLimit){
- ch = *pSrc;
- if(ch <= 0x7f){
- reqLength++;
- ++pSrc;
+
+ /* Pre-flight the rest of the string. */
+ while(i < srcLength) {
+ // modified copy of U8_NEXT()
+ c = (uint8_t)src[i++];
+ if(U8_IS_SINGLE(c)) {
+ ++reqLength;
} else {
- if(ch > 0xe0) {
- if( /* handle U+1000..U+CFFF inline */
- ch <= 0xec &&
- ((pSrcLimit - pSrc) >= 3) &&
- (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
- (uint8_t)(pSrc[2] - 0x80) <= 0x3f
- ) {
- reqLength++;
- pSrc += 3;
- continue;
- }
- } else if(ch < 0xe0) {
- if( /* handle U+0080..U+07FF inline */
- ch >= 0xc2 &&
- ((pSrcLimit - pSrc) >= 2) &&
- (uint8_t)(pSrc[1] - 0x80) <= 0x3f
- ) {
- reqLength++;
- pSrc += 2;
- continue;
+ uint8_t __t1, __t2;
+ if( /* handle U+0800..U+FFFF inline */
+ (0xe0<=(c) && (c)<0xf0) &&
+ ((i)+1)<srcLength &&
+ U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
+ (__t2=src[(i)+1]-0x80)<=0x3f) {
+ ++reqLength;
+ i+=2;
+ } else if( /* handle U+0080..U+07FF inline */
+ ((c)<0xe0 && (c)>=0xc2) &&
+ ((i)!=srcLength) &&
+ (__t1=src[i]-0x80)<=0x3f) {
+ ++reqLength;
+ ++(i);
+ } else {
+ /* function call for "complicated" and error cases */
+ (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
+ if(c<0 && (++numSubstitutions, c = subchar) < 0) {
+ *pErrorCode = U_INVALID_CHAR_FOUND;
+ return NULL;
}
+ reqLength += U16_LENGTH(c);
}
-
- /* function call for "complicated" and error cases */
- ++pSrc; /* continue after the lead byte */
- ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
- if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
- *pErrorCode = U_INVALID_CHAR_FOUND;
- return NULL;
- }
- reqLength+=U16_LENGTH(ch);
}
}
}
uint8_t* pSrc = (uint8_t*) src;
/* args check */
- if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
+ if(U_FAILURE(*pErrorCode)){
return NULL;
}
int32_t numSubstitutions;
/* args check */
- if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
+ if(U_FAILURE(*pErrorCode)){
return NULL;
}
int32_t srcLength,
UChar32 subchar, int32_t *pNumSubstitutions,
UErrorCode *pErrorCode) {
- UChar *pDest = dest;
- UChar *pDestLimit = dest+destCapacity;
- UChar32 ch;
- int32_t reqLength = 0;
- const uint8_t* pSrc = (const uint8_t*) src;
- const uint8_t *pSrcLimit;
- int32_t count;
- uint8_t t1, t2; /* trail bytes */
- int32_t numSubstitutions;
-
/* args check */
- if(U_FAILURE(*pErrorCode)){
+ if(U_FAILURE(*pErrorCode)) {
return NULL;
}
if( (src==NULL && srcLength!=0) || srcLength < -1 ||
if(pNumSubstitutions!=NULL) {
*pNumSubstitutions=0;
}
- numSubstitutions=0;
+ UChar *pDest = dest;
+ UChar *pDestLimit = dest+destCapacity;
+ int32_t reqLength = 0;
+ int32_t numSubstitutions=0;
if(srcLength < 0) {
/*
* Transform a NUL-terminated ASCII string.
* Handle non-ASCII strings with slower code.
*/
- while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
- *pDest++=(UChar)ch;
- ++pSrc;
+ UChar32 c;
+ while(((c = (uint8_t)*src) != 0) && c <= 0x7f && (pDest < pDestLimit)) {
+ *pDest++=(UChar)c;
+ ++src;
}
- if(ch == 0) {
+ if(c == 0) {
reqLength=(int32_t)(pDest - dest);
if(pDestLength) {
*pDestLength = reqLength;
u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
return dest;
}
- srcLength = static_cast<int32_t>(uprv_strlen((const char *)pSrc));
+ srcLength = static_cast<int32_t>(uprv_strlen(src));
}
- /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
- pSrcLimit = (pSrc == NULL) ? NULL : pSrc + srcLength;
+ /* Faster loop without ongoing checking for srcLength and pDestLimit. */
+ UChar32 ch;
+ uint8_t t1, t2;
+ int32_t i = 0;
for(;;) {
- count = (int32_t)(pDestLimit - pDest);
- srcLength = (int32_t)(pSrcLimit - pSrc);
- if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
+ // count = UChars still free in dest; count2 = source bytes not yet consumed.
+ int32_t count = (int32_t)(pDestLimit - pDest);
+ int32_t count2 = srcLength - i;
+ // Decide on the fast path from the *remaining* input (count2, src[i]),
+ // exactly as the pointer-based code being replaced did with its recomputed
+ // srcLength and *pSrc. Testing srcLength and *src would look at the whole
+ // string and its first byte on every pass of the outer loop.
+ // count >= count2 guarantees the unchecked writes below cannot overrun dest.
+ if(count >= count2 && count2 > 0 && U8_IS_SINGLE(src[i])) {
/* fast ASCII loop */
- const uint8_t *prevSrc = pSrc;
- int32_t delta;
- while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
- *pDest++=(UChar)ch;
- ++pSrc;
+ int32_t start = i;
+ uint8_t b;
+ while(i < srcLength && U8_IS_SINGLE(b = src[i])) {
+ *pDest++=b;
+ ++i;
}
- delta = (int32_t)(pSrc - prevSrc);
+ int32_t delta = i - start; // number of ASCII bytes copied by the fast loop
count -= delta;
- srcLength -= delta;
+ count2 -= delta;
}
/*
* Each iteration of the inner loop progresses by at most 3 UTF-8
* bytes and one UChar.
*/
- srcLength /= 3;
- if(count > srcLength) {
- count = srcLength; /* min(remaining dest, remaining src/3) */
+ if(subchar > 0xFFFF) {
+ break;
+ }
+ count2 /= 3;
+ if(count > count2) {
+ count = count2; /* min(remaining dest, remaining src/3) */
}
if(count < 3) {
/*
break;
}
do {
- ch = *pSrc;
- if(ch <= 0x7f){
+ ch = (uint8_t)src[i++];
+ if(U8_IS_SINGLE(ch)) {
*pDest++=(UChar)ch;
- ++pSrc;
} else {
if(ch >= 0xe0) {
if( /* handle U+0000..U+FFFF inline */
ch <= 0xef &&
- (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
- (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
+ (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
+ (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
) {
/* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
*pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
- pSrc += 3;
+ i += 2;
continue;
}
} else {
if( /* handle U+0000..U+07FF inline */
ch >= 0xc0 &&
- (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
+ (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
) {
*pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
- pSrc += 2;
+ ++i;
continue;
}
}
* We need to write two UChars, adjusted count for that,
* and ran out of space.
*/
+ --i; // back out byte ch
break;
} else {
/* function call for error cases */
- ++pSrc; /* continue after the lead byte */
- utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
+ utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
++numSubstitutions;
- if(subchar<=0xFFFF) {
- *(pDest++)=(UChar)subchar;
- } else {
- *(pDest++)=U16_LEAD(subchar);
- *(pDest++)=U16_TRAIL(subchar);
- }
+ *(pDest++)=(UChar)subchar;
}
}
} while(--count > 0);
}
- while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
- ch = *pSrc;
- if(ch <= 0x7f){
+ while(i < srcLength && (pDest < pDestLimit)) {
+ ch = (uint8_t)src[i++];
+ if(U8_IS_SINGLE(ch)){
*pDest++=(UChar)ch;
- ++pSrc;
} else {
if(ch >= 0xe0) {
if( /* handle U+0000..U+FFFF inline */
ch <= 0xef &&
- ((pSrcLimit - pSrc) >= 3) &&
- (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
- (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
+ (i+1) < srcLength &&
+ (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
+ (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
) {
/* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
*pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
- pSrc += 3;
+ i += 2;
continue;
}
} else {
if( /* handle U+0000..U+07FF inline */
ch >= 0xc0 &&
- ((pSrcLimit - pSrc) >= 2) &&
- (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
+ i < srcLength &&
+ (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
) {
*pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
- pSrc += 2;
+ ++i;
continue;
}
}
return NULL;
} else {
/* function call for error cases */
- ++pSrc; /* continue after the lead byte */
- utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
+ utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
++numSubstitutions;
if(subchar<=0xFFFF) {
*(pDest++)=(UChar)subchar;
}
}
- /* do not fill the dest buffer just count the UChars needed */
- while(pSrc < pSrcLimit){
- ch = *pSrc;
- if(ch <= 0x7f) {
+ /* Pre-flight the rest of the string. */
+ while(i < srcLength) {
+ ch = (uint8_t)src[i++];
+ if(U8_IS_SINGLE(ch)) {
reqLength++;
- ++pSrc;
} else {
if(ch >= 0xe0) {
if( /* handle U+0000..U+FFFF inline */
ch <= 0xef &&
- ((pSrcLimit - pSrc) >= 3) &&
- (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
- (uint8_t)(pSrc[2] - 0x80) <= 0x3f
+ (i+1) < srcLength &&
+ (uint8_t)(src[i] - 0x80) <= 0x3f &&
+ (uint8_t)(src[i+1] - 0x80) <= 0x3f
) {
reqLength++;
- pSrc += 3;
+ i += 2;
continue;
}
} else {
if( /* handle U+0000..U+07FF inline */
ch >= 0xc0 &&
- ((pSrcLimit - pSrc) >= 2) &&
- (uint8_t)(pSrc[1] - 0x80) <= 0x3f
+ i < srcLength &&
+ (uint8_t)(src[i] - 0x80) <= 0x3f
) {
reqLength++;
- pSrc += 2;
+ ++i;
continue;
}
}
return NULL;
} else {
/* function call for error cases */
- ++pSrc; /* continue after the lead byte */
- utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
+ utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
++numSubstitutions;
reqLength+=U16_LENGTH(ch);
}
//------------------------------------------------------------------------------
// Chunk size.
-// Must be less than 42 (256/6), because of byte mapping from UChar indexes to native indexes.
-// Worst case there are six UTF-8 bytes per UChar.
-// obsolete 6 byte form fd + 5 trails maps to fffd
-// obsolete 5 byte form fc + 4 trails maps to fffd
-// non-shortest 4 byte forms maps to fffd
-// normal supplementaries map to a pair of utf-16, two utf8 bytes per utf-16 unit
-// mapToUChars array size must allow for the worst case, 6.
-// This could be brought down to 4, by treating fd and fc as pure illegal,
-// rather than obsolete lead bytes. But that is not compatible with the utf-8 access macros.
+// Must be less than 85 (256/3), because of byte mapping from UChar indexes to native indexes.
+// Worst case is three native bytes to one UChar. (Supplementaries are 4 native bytes
+// to two UChars.)
+// The longest illegal byte sequence treated as a single error (and converted to U+FFFD)
+// is a three-byte sequence (truncated four-byte sequence).
//
enum { UTF8_TEXT_CHUNK_SIZE=32 };
// Requires two extra slots,
// one for a supplementary starting in the last normal position,
// and one for an entry for the buffer limit position.
- uint8_t mapToUChars[UTF8_TEXT_CHUNK_SIZE*6+6]; // Map native offset from bufNativeStart to
+ uint8_t mapToUChars[UTF8_TEXT_CHUNK_SIZE*3+6]; // Map native offset from bufNativeStart to
// correspoding offset in filled part of buf.
int32_t align;
};
* Corporation and others. All Rights Reserved.
*
******************************************************************************
-* file name: utf_impl.c
+* file name: utf_impl.cpp
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
* - SUB AX, BX (result)
* -finish:
* (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB)
- *
- * In Unicode, all UTF-8 byte sequences with more than 4 bytes are illegal;
- * lead bytes above 0xf4 are illegal.
- * We keep them in this table for skipping long ISO 10646-UTF-8 sequences.
*/
extern "C" U_EXPORT const uint8_t
utf8_countTrailBytes[256]={
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ // illegal C0 & C1
+ // 2-byte lead bytes C2..DF
+ 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ // 3-byte lead bytes E0..EF
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 3, 3, 3, 3, 3,
- 3, 3, 3, /* illegal in Unicode */
- 4, 4, 4, 4, /* illegal in Unicode */
- 5, 5, /* illegal in Unicode */
- 0, 0 /* illegal bytes 0xfe and 0xff */
+ // 4-byte lead bytes F0..F4
+ // illegal F5..FF
+ 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
-static const UChar32
-utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
-
static const UChar32
utf8_errorValue[6]={
// Same values as UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE,
// but without relying on the obsolete unicode/utf_old.h.
0x15, 0x9f, 0xffff,
- 0x10ffff,
- 0x3ffffff, 0x7fffffff
+ 0x10ffff
};
static UChar32
*/
U_CAPI UChar32 U_EXPORT2
utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) {
+ // *pi is one after byte c.
int32_t i=*pi;
- uint8_t count=U8_COUNT_TRAIL_BYTES(c);
- U_ASSERT(count <= 5); /* U8_COUNT_TRAIL_BYTES returns value 0...5 */
- if(i+count<=length || length<0) {
- uint8_t trail;
-
- U8_MASK_LEAD_BYTE(c, count);
- /* support NUL-terminated strings: do not read beyond the first non-trail byte */
- switch(count) {
- /* each branch falls through to the next one */
- case 0:
- /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
- case 5:
- case 4:
- /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
- break;
- case 3:
- trail=s[i++]-0x80;
- c=(c<<6)|trail;
- /* c>=0x110 would result in code point>0x10ffff, outside Unicode */
- if(c>=0x110 || trail>0x3f) { break; }
- U_FALLTHROUGH;
- case 2:
- trail=s[i++]-0x80;
- c=(c<<6)|trail;
- /*
- * test for a surrogate d800..dfff unless we are lenient:
- * before the last (c<<6), a surrogate is c=360..37f
- */
- if(((c&0xffe0)==0x360 && strict!=-2) || trail>0x3f) { break; }
- U_FALLTHROUGH;
- case 1:
- trail=s[i++]-0x80;
- c=(c<<6)|trail;
- if(trail>0x3f) { break; }
- /* correct sequence - all trail bytes have (b7..b6)==(10) */
- if(c>=utf8_minLegal[count] &&
- /* strict: forbid non-characters like U+fffe */
- (strict<=0 || !U_IS_UNICODE_NONCHAR(c))) {
+ // length can be negative for NUL-terminated strings: Read and validate one byte at a time.
+ if(i==length || c>0xf4) {
+ // end of string, or not a lead byte
+ } else if(c>=0xf0) {
+ // Test for 4-byte sequences first because
+ // U8_NEXT() handles shorter valid sequences inline.
+ uint8_t t1=s[i], t2, t3;
+ c&=7;
+ if(U8_IS_VALID_LEAD4_AND_T1(c, t1) &&
+ ++i!=length && (t2=s[i]-0x80)<=0x3f &&
+ ++i!=length && (t3=s[i]-0x80)<=0x3f) {
+ ++i;
+ c=(c<<18)|((t1&0x3f)<<12)|(t2<<6)|t3;
+ // strict: forbid non-characters like U+fffe
+ if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
*pi=i;
return c;
}
- /* no default branch to optimize switch() - all values are covered */
}
- } else {
- /* too few bytes left */
- count=length-i;
- }
+ } else if(c>=0xe0) {
+ c&=0xf;
+ if(strict!=-2) {
+ uint8_t t1=s[i], t2;
+ if(U8_IS_VALID_LEAD3_AND_T1(c, t1) &&
+ ++i!=length && (t2=s[i]-0x80)<=0x3f) {
+ ++i;
+ c=(c<<12)|((t1&0x3f)<<6)|t2;
+ // strict: forbid non-characters like U+fffe
+ if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
+ *pi=i;
+ return c;
+ }
+ }
+ } else {
+ // strict=-2 -> lenient: allow surrogates
+ uint8_t t1=s[i]-0x80, t2;
+ if(t1<=0x3f && (c>0 || t1>=0x20) &&
+ ++i!=length && (t2=s[i]-0x80)<=0x3f) {
+ *pi=i+1;
+ return (c<<12)|(t1<<6)|t2;
+ }
+ }
+ } else if(c>=0xc2) {
+ uint8_t t1=s[i]-0x80;
+ if(t1<=0x3f) {
+ *pi=i+1;
+ return ((c-0xc0)<<6)|t1;
+ }
+ } // else 0x80<=c<0xc2 is not a lead byte
/* error handling */
- i=*pi;
- while(count>0 && U8_IS_TRAIL(s[i])) {
- ++i;
- --count;
- }
c=errorValue(i-*pi, strict);
*pi=i;
return c;
U_CAPI UChar32 U_EXPORT2
utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict) {
+ // *pi is the index of byte c. Walks backwards (never below start) looking for the lead byte of a sequence ending in c; *pi is moved back only when a well-formed or truncated sequence is recognized, otherwise it is left unchanged.
int32_t i=*pi;
- uint8_t b, count=1, shift=6;
-
- if(!U8_IS_TRAIL(c)) { return errorValue(0, strict); }
-
- /* extract value bits from the last trail byte */
- c&=0x3f;
-
- for(;;) {
- if(i<=start) {
- /* no lead byte at all */
- return errorValue(0, strict);
- }
-
- /* read another previous byte */
- b=s[--i];
- if((uint8_t)(b-0x80)<0x7e) { /* 0x80<=b<0xfe */
- if(b&0x40) {
- /* lead byte, this will always end the loop */
- uint8_t shouldCount=U8_COUNT_TRAIL_BYTES(b);
-
- if(count==shouldCount) {
- /* set the new position */
- *pi=i;
- U8_MASK_LEAD_BYTE(b, count);
- c|=(UChar32)b<<shift;
- if(count>=4 || c>0x10ffff || c<utf8_minLegal[count] || (U_IS_SURROGATE(c) && strict!=-2) || (strict>0 && U_IS_UNICODE_NONCHAR(c))) {
- /* illegal sequence or (strict and non-character) */
- if(count>=4) {
- count=3;
+ if(U8_IS_TRAIL(c) && i>start) { // only a trail byte with at least one byte before it can end a multi-byte sequence
+ uint8_t b1=s[--i];
+ if(0xc2<=b1 && b1<0xe0) { // b1 is a 2-byte lead (C2..DF): sequence is b1 c
+ *pi=i;
+ return ((b1-0xc0)<<6)|(c&0x3f);
+ } else if(U8_IS_TRAIL(b1) && i>start) {
+ // Extract the value bits from the last trail byte.
+ c&=0x3f;
+ uint8_t b2=s[--i];
+ if(0xe0<=b2 && b2<0xf0) { // b2 is a 3-byte lead (E0..EF): candidate sequence is b2 b1 c
+ b2&=0xf; // keep only the low 4 bits of the lead byte
+ if(strict!=-2) {
+ if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
+ *pi=i;
+ c=(b2<<12)|((b1&0x3f)<<6)|c;
+ if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
+ return c;
+ } else {
+ // strict: forbid non-characters like U+fffe
+ return errorValue(2, strict);
}
- c=errorValue(count, strict);
- } else {
- /* exit with correct c */
}
} else {
- /* the lead byte does not match the number of trail bytes */
- /* only set the position to the lead byte if it would
- include the trail byte that we started with */
- if(count<shouldCount) {
+ // strict=-2 -> lenient: allow surrogates
+ b1-=0x80;
+ if((b2>0 || b1>=0x20)) {
+ *pi=i;
+ return (b2<<12)|(b1<<6)|c;
+ }
+ }
+ } else if(U8_IS_TRAIL(b2) && i>start) {
+ uint8_t b3=s[--i];
+ if(0xf0<=b3 && b3<=0xf4) { // b3 is a 4-byte lead (F0..F4): candidate sequence is b3 b2 b1 c
+ b3&=7; // keep only the low 3 bits of the lead byte
+ if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
*pi=i;
- c=errorValue(count, strict);
- } else {
- c=errorValue(0, strict);
+ c=(b3<<18)|((b2&0x3f)<<12)|((b1&0x3f)<<6)|c;
+ if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
+ return c;
+ } else {
+ // strict: forbid non-characters like U+fffe
+ return errorValue(3, strict);
+ }
}
}
- break;
- } else if(count<5) {
- /* trail byte */
- c|=(UChar32)(b&0x3f)<<shift;
- ++count;
- shift+=6;
- } else {
- /* more than 5 trail bytes is illegal */
- c=errorValue(0, strict);
- break;
+ } else if((0xf0<=b2 && b2<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
+ // Truncated 4-byte sequence: lead b2 followed by only the two trail bytes b1 and c.
+ *pi=i;
+ return errorValue(2, strict);
}
- } else {
- /* single-byte character precedes trailing bytes */
- c=errorValue(0, strict);
- break;
+ } else if(((0xe0<=b1 && b1<0xf0) && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
+ ((0xf0<=b1 && b1<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
+ // Truncated 3- or 4-byte sequence: lead b1 followed by only the one trail byte c.
+ *pi=i;
+ return errorValue(1, strict);
}
}
- return c;
+ return errorValue(0, strict); // no sequence ending in c was recognized; *pi has not been modified
}
U_CAPI int32_t U_EXPORT2
utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) {
- /* i had been decremented once before the function call */
- int32_t I=i, Z;
- uint8_t b;
-
- /* read at most the 6 bytes s[Z] to s[i], inclusively */
- if(I-5>start) {
- Z=I-5;
- } else {
- Z=start;
- }
-
- /* return I if the sequence starting there is long enough to include i */
- do {
- b=s[I];
- if((uint8_t)(b-0x80)>=0x7e) { /* not 0x80<=b<0xfe */
- break;
- } else if(b>=0xc0) {
- if(U8_COUNT_TRAIL_BYTES(b)>=(i-I)) {
- return I;
- } else {
- break;
+ // Same as utf8_prevCharSafeBody(..., strict=-1) minus assembling code points: returns the index of the lead byte of the well-formed or truncated sequence that includes s[i], or i itself if there is none. Never reads below start.
+ int32_t orig_i=i; // remembered so that i can be returned unchanged on failure
+ uint8_t c=s[i];
+ if(U8_IS_TRAIL(c) && i>start) { // only a trail byte with at least one byte before it can belong to a longer sequence
+ uint8_t b1=s[--i];
+ if(0xc2<=b1 && b1<0xe0) { // b1 is a 2-byte lead (C2..DF): sequence is b1 c
+ return i;
+ } else if(U8_IS_TRAIL(b1) && i>start) {
+ uint8_t b2=s[--i];
+ if(0xe0<=b2 && b2<0xf0) { // b2 is a 3-byte lead (E0..EF): candidate sequence is b2 b1 c
+ if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
+ return i;
+ }
+ } else if(U8_IS_TRAIL(b2) && i>start) {
+ uint8_t b3=s[--i];
+ if(0xf0<=b3 && b3<=0xf4) { // b3 is a 4-byte lead (F0..F4): candidate sequence is b3 b2 b1 c
+ if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
+ return i;
+ }
+ }
+ } else if((0xf0<=b2 && b2<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
+ // Truncated 4-byte sequence: lead b2 followed by only the two trail bytes b1 and c.
+ return i;
}
+ } else if(((0xe0<=b1 && b1<0xf0) && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
+ ((0xf0<=b1 && b1<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
+ // Truncated 3- or 4-byte sequence: lead b1 followed by only the one trail byte c.
+ return i;
}
- } while(Z<=--I);
-
- /* return i itself to be consistent with the FWD_1 macro */
- return i;
+ }
+ return orig_i; // no enclosing sequence recognized: stay on the byte we started from
}
#define __UTRIE2_H__
#include "unicode/utypes.h"
+#include "unicode/utf8.h"
#include "putilimp.h"
#include "udataswp.h"
* is truncated, omitting both the BMP portion and the high range.
* - There is a special small index for 2-byte UTF-8, and the initial data
* entries are designed for fast 1/2-byte UTF-8 lookup.
+ * Starting with ICU 60, C0 and C1 are not recognized as UTF-8 lead bytes any more at all,
+ * and the associated 2-byte indexes are unused.
*/
/**
/** Internal UTF-8 next-post-increment: get the next code point's data. */
#define _UTRIE2_U8_NEXT(trie, ascii, data, src, limit, result) { \
uint8_t __lead=(uint8_t)*(src)++; \
- if(__lead<0xc0) { \
+ if(U8_IS_SINGLE(__lead)) { \
(result)=(trie)->ascii[__lead]; \
} else { \
uint8_t __t1, __t2; \
- if( /* handle U+0000..U+07FF inline */ \
- __lead<0xe0 && (src)<(limit) && \
- (__t1=(uint8_t)(*(src)-0x80))<=0x3f \
- ) { \
- ++(src); \
- (result)=(trie)->data[ \
- (trie)->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET-0xc0)+__lead]+ \
- __t1]; \
- } else if( /* handle U+0000..U+CFFF inline */ \
- __lead<0xed && ((src)+1)<(limit) && \
- (__t1=(uint8_t)(*(src)-0x80))<=0x3f && (__lead>0xe0 || __t1>=0x20) && \
+ if( /* handle U+0800..U+FFFF inline */ \
+ 0xe0<=__lead && __lead<0xf0 && ((src)+1)<(limit) && \
+ U8_IS_VALID_LEAD3_AND_T1(__lead, __t1=(uint8_t)*(src)) && \
(__t2=(uint8_t)(*((src)+1)-0x80))<= 0x3f \
) { \
(src)+=2; \
(result)=(trie)->data[ \
((int32_t)((trie)->index[((__lead-0xe0)<<(12-UTRIE2_SHIFT_2))+ \
- (__t1<<(6-UTRIE2_SHIFT_2))+(__t2>>UTRIE2_SHIFT_2)]) \
+ ((__t1&0x3f)<<(6-UTRIE2_SHIFT_2))+(__t2>>UTRIE2_SHIFT_2)]) \
<<UTRIE2_INDEX_SHIFT)+ \
(__t2&UTRIE2_DATA_MASK)]; \
+ } else if( /* handle U+0080..U+07FF inline */ \
+ __lead<0xe0 && __lead>=0xc2 && (src)<(limit) && \
+ (__t1=(uint8_t)(*(src)-0x80))<=0x3f \
+ ) { \
+ ++(src); \
+ (result)=(trie)->data[ \
+ (trie)->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET-0xc0)+__lead]+ \
+ __t1]; \
} else { \
int32_t __index=utrie2_internalU8NextIndex((trie), __lead, (const uint8_t *)(src), \
(const uint8_t *)(limit)); \
/** Internal UTF-8 pre-decrement-previous: get the previous code point's data. */
#define _UTRIE2_U8_PREV(trie, ascii, data, start, src, result) { \
uint8_t __b=(uint8_t)*--(src); \
- if(__b<0x80) { \
+ if(U8_IS_SINGLE(__b)) { \
(result)=(trie)->ascii[__b]; \
} else { \
int32_t __index=utrie2_internalU8PrevIndex((trie), __b, (const uint8_t *)(start), \
}
// Optimized combination of U8_NEXT_OR_FFFD() and UTRIE2_U8_NEXT32().
c = u8[pos++];
- if(c < 0xc0) {
- // ASCII 00..7F; trail bytes 80..BF map to error values.
+ if(U8_IS_SINGLE(c)) {
+ // ASCII 00..7F
return trie->data32[c];
}
uint8_t t1, t2;
- if(c < 0xe0 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
- // U+0080..U+07FF; 00..7F map to error values.
+ if(0xe0 <= c && c < 0xf0 &&
+ ((pos + 1) < length || length < 0) &&
+ U8_IS_VALID_LEAD3_AND_T1(c, t1 = u8[pos]) &&
+ (t2 = (u8[pos + 1] - 0x80)) <= 0x3f) {
+ // U+0800..U+FFFF except surrogates
+ c = (((c & 0xf) << 12) | ((t1 & 0x3f) << 6) | t2);
+ pos += 2;
+ return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
+ } else if(c < 0xe0 && c >= 0xc2 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
+ // U+0080..U+07FF
uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];
c = ((c & 0x1f) << 6) | t1;
++pos;
return ce32;
- } else if(c <= 0xef &&
- ((pos + 1) < length || length < 0) &&
- (t1 = (u8[pos] - 0x80)) <= 0x3f && (c != 0xe0 || t1 >= 0x20) &&
- (t2 = (u8[pos + 1] - 0x80)) <= 0x3f
- ) {
- // U+0800..U+FFFF; caller maps surrogates to error values.
- c = (UChar)((c << 12) | (t1 << 6) | t2);
- pos += 2;
- return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
} else {
// Function call for supplementary code points and error cases.
// Illegal byte sequences yield U+FFFD.
return Collation::FALLBACK_CE32;
}
c = u8[pos++];
- if(c < 0xc0) {
- // ASCII 00..7F; trail bytes 80..BF map to error values.
+ if(U8_IS_SINGLE(c)) {
+ // ASCII 00..7F
return trie->data32[c];
}
uint8_t t1, t2;
- if(c < 0xe0 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
- // U+0080..U+07FF; 00..7F map to error values.
- uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];
- c = ((c & 0x1f) << 6) | t1;
- ++pos;
- if(CollationFCD::hasTccc(c) && pos != length && nextHasLccc()) {
- pos -= 2;
- } else {
- return ce32;
- }
- } else if(c <= 0xef &&
- ((pos + 1) < length || length < 0) &&
- (t1 = (u8[pos] - 0x80)) <= 0x3f && (c != 0xe0 || t1 >= 0x20) &&
- (t2 = (u8[pos + 1] - 0x80)) <= 0x3f
- ) {
- // U+0800..U+FFFF; caller maps surrogates to error values.
- c = (UChar)((c << 12) | (t1 << 6) | t2);
+ if(0xe0 <= c && c < 0xf0 &&
+ ((pos + 1) < length || length < 0) &&
+ U8_IS_VALID_LEAD3_AND_T1(c, t1 = u8[pos]) &&
+ (t2 = (u8[pos + 1] - 0x80)) <= 0x3f) {
+ // U+0800..U+FFFF except surrogates
+ c = (((c & 0xf) << 12) | ((t1 & 0x3f) << 6) | t2);
pos += 2;
if(CollationFCD::hasTccc(c) &&
(CollationFCD::maybeTibetanCompositeVowel(c) ||
} else {
break; // return CE32(BMP)
}
+ } else if(c < 0xe0 && c >= 0xc2 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
+ // U+0080..U+07FF
+ uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];
+ c = ((c & 0x1f) << 6) | t1;
+ ++pos;
+ if(CollationFCD::hasTccc(c) && pos != length && nextHasLccc()) {
+ pos -= 2;
+ } else {
+ return ce32;
+ }
} else {
// Function call for supplementary code points and error cases.
// Illegal byte sequences yield U+FFFD.
FCDUTF8CollationIterator::previousHasTccc() const {
U_ASSERT(state == CHECK_BWD && pos != 0);
UChar32 c = u8[pos - 1];
- if(c < 0x80) { return FALSE; }
+ if(U8_IS_SINGLE(c)) { return FALSE; }
int32_t i = pos;
U8_PREV_OR_FFFD(u8, 0, i, c);
if(c > 0xffff) { c = U16_LEAD(c); }
if(pos == length || ((c = u8[pos]) == 0 && length < 0)) {
return U_SENTINEL;
}
- if(c < 0x80) {
+ if(U8_IS_SINGLE(c)) {
++pos;
return c;
}
if(pos == 0) {
return U_SENTINEL;
}
- if((c = u8[pos - 1]) < 0x80) {
+ if(U8_IS_SINGLE(c = u8[pos - 1])) {
--pos;
return c;
}
}
/* test UTF-8 with single surrogates - illegal in Unicode 3.2 */
+ // Since ICU 60, each surrogate byte sequence is treated as 3 single-byte errors.
{
static const UChar
withLead16[]={ 0x1800, 0xd89a, 0x0061 },
withTrail16[]={ 0x1800, 0xdcba, 0x0061, 0 },
- withTrail16SubFFFD[]={ 0x1800, 0xfffd, 0x0061, 0 }, /* sub==U+FFFD */
- withTrail16Sub50005[]={ 0x1800, 0xd900, 0xdc05, 0x0061, 0 }; /* sub==U+50005 */
+ withTrail16SubFFFD[]={ 0x1800, 0xfffd, 0xfffd, 0xfffd, 0x0061, 0 }, /* sub==U+FFFD */
+ withTrail16Sub50005[]={ 0x1800, 0xd900, 0xdc05, 0xd900, 0xdc05, 0xd900, 0xdc05, 0x0061, 0 }; /* sub==U+50005 */
static const uint8_t
withLead8[]={ 0xe1, 0xa0, 0x80, 0xed, 0xa2, 0x9a, 0x61 },
withTrail8[]={ 0xe1, 0xa0, 0x80, 0xed, 0xb2, 0xba, 0x61, 0 },
&err);
if(U_FAILURE(err) || uDestLen!=u_strlen(withTrail16Sub50005) ||
0!=u_memcmp(withTrail16Sub50005, out16, uDestLen+1) ||
- numSubstitutions!=1) {
+ numSubstitutions!=3) {
log_err("error: u_strFromUTF8WithSub(length) failed\n");
}
&err);
if(U_FAILURE(err) || uDestLen!=u_strlen(withTrail16SubFFFD) ||
0!=u_memcmp(withTrail16SubFFFD, out16, uDestLen+1) ||
- numSubstitutions!=1) {
+ numSubstitutions!=3) {
log_err("error: u_strFromUTF8WithSub(NUL termination) failed\n");
}
(const char *)withTrail8, -1,
0x50005, &numSubstitutions,
&err);
- if(err!=U_BUFFER_OVERFLOW_ERROR || uDestLen!=u_strlen(withTrail16Sub50005) || numSubstitutions!=1) {
+ if(err!=U_BUFFER_OVERFLOW_ERROR || uDestLen!=u_strlen(withTrail16Sub50005) || numSubstitutions!=3) {
log_err("error: u_strFromUTF8WithSub(preflight/NUL termination) failed\n");
}
log_err("u_strFromUTF8Lenient(U_MEMORY_ALLOCATION_ERROR) failed\n");
}
- dest[0]=0x1234;
- destLength=-1;
- errorCode=U_MEMORY_ALLOCATION_ERROR;
- pDest=u_strFromUTF8Lenient(dest, 1, &destLength, (const char *)bytes, -1, NULL);
- if(dest[0]!=0x1234) {
- log_err("u_strFromUTF8Lenient(pErrorCode=NULL) failed\n");
- }
-
/* test normal behavior */
number=0; /* string number for log_err() */
testTrieUTF8(const char *testName,
const UTrie2 *trie, UTrie2ValueBits valueBits,
const CheckRange checkRanges[], int32_t countCheckRanges) {
+ // Note: The byte sequence comments refer to the original UTF-8 definition.
+ // Starting with ICU 60, any sequence that is not a prefix of a valid one
+ // is treated as multiple single-byte errors.
+ // For testing, we only rely on U8_... and UTrie2 UTF-8 macros
+ // iterating consistently.
static const uint8_t illegal[]={
0xc0, 0x80, /* non-shortest U+0000 */
0xc1, 0xbf, /* non-shortest U+007f */
value=checkRanges[i].value;
/* write three legal (or surrogate) code points */
U8_APPEND_UNSAFE(s, length, prevCP); /* start of the range */
- values[countValues++]=U_IS_SURROGATE(prevCP) ? errorValue : value;
+ if(U_IS_SURROGATE(prevCP)) {
+ // A surrogate byte sequence counts as 3 single-byte errors.
+ values[countValues++]=errorValue;
+ values[countValues++]=errorValue;
+ values[countValues++]=errorValue;
+ } else {
+ values[countValues++]=value;
+ }
c=checkRanges[i].limit;
prevCP=(prevCP+c)/2; /* middle of the range */
U8_APPEND_UNSAFE(s, length, prevCP);
- values[countValues++]=U_IS_SURROGATE(prevCP) ? errorValue : value;
+ if(U_IS_SURROGATE(prevCP)) {
+ // A surrogate byte sequence counts as 3 single-byte errors.
+ values[countValues++]=errorValue;
+ values[countValues++]=errorValue;
+ values[countValues++]=errorValue;
+ } else {
+ values[countValues++]=value;
+ }
prevCP=c;
--c; /* end of the range */
U8_APPEND_UNSAFE(s, length, c);
- values[countValues++]=U_IS_SURROGATE(c) ? errorValue : value;
+ // Test c, the code point that was just appended: prevCP now holds the range
+ // limit (c+1), which differs in surrogate-ness at the D800 and E000 boundaries.
+ if(U_IS_SURROGATE(c)) {
+ // A surrogate byte sequence counts as 3 single-byte errors.
+ values[countValues++]=errorValue;
+ values[countValues++]=errorValue;
+ values[countValues++]=errorValue;
+ } else {
+ values[countValues++]=value;
+ }
/* write an illegal byte sequence */
if(i8<sizeof(illegal)) {
U8_FWD_1(illegal, i8, sizeof(illegal));
}
bytes=0;
if(value!=values[i] || i8!=(p-s)) {
- while(prev8<i8) {
- bytes=(bytes<<8)|s[prev8++];
+ int32_t k=prev8;
+ while(k<i8) {
+ bytes=(bytes<<8)|s[k++];
}
}
if(value!=values[i]) {
- log_err("error: wrong value from UTRIE2_U8_NEXT(%s)(%lx->U+%04lx): 0x%lx instead of 0x%lx\n",
- testName, (unsigned long)bytes, (long)c, (long)value, (long)values[i]);
+ log_err("error: wrong value from UTRIE2_U8_NEXT(%s)(from %d %lx->U+%04lx) (read %d bytes): "
+ "0x%lx instead of 0x%lx\n",
+ testName, (int)prev8, (unsigned long)bytes, (long)c, (int)((p-s)-prev8),
+ (long)value, (long)values[i]);
}
if(i8!=(p-s)) {
- log_err("error: wrong end index from UTRIE2_U8_NEXT(%s)(%lx->U+%04lx): %ld != %ld\n",
- testName, (unsigned long)bytes, (long)c, (long)(p-s), (long)i8);
+ log_err("error: wrong end index from UTRIE2_U8_NEXT(%s)(from %d %lx->U+%04lx): %ld != %ld\n",
+ testName, (int)prev8, (unsigned long)bytes, (long)c, (long)(p-s), (long)i8);
continue;
}
++i;
}
}
if(value!=values[i]) {
- log_err("error: wrong value from UTRIE2_U8_PREV(%s)(%lx->U+%04lx): 0x%lx instead of 0x%lx\n",
- testName, (unsigned long)bytes, (long)c, (long)value, (long)values[i]);
+ log_err("error: wrong value from UTRIE2_U8_PREV(%s)(from %d %lx->U+%04lx) (read %d bytes): "
+ ": 0x%lx instead of 0x%lx\n",
+ testName, (int)prev8, (unsigned long)bytes, (long)c, (int)(prev8-(p-s)),
+ (long)value, (long)values[i]);
}
if(i8!=(p-s)) {
- log_err("error: wrong end index from UTRIE2_U8_PREV(%s)(%lx->U+%04lx): %ld != %ld\n",
- testName, (unsigned long)bytes, (long)c, (long)(p-s), (long)i8);
+ log_err("error: wrong end index from UTRIE2_U8_PREV(%s)(from %d %lx->U+%04lx): %ld != %ld\n",
+ testName, (int)prev8, (unsigned long)bytes, (long)c, (long)(p-s), (long)i8);
continue;
}
}
static void TestCodeUnitValues()
{
- static const uint8_t codeunit[]={0x00, 0x65, 0x7e, 0x7f, 0xc0, 0xc4, 0xf0, 0xfd, 0x80, 0x81, 0xbc, 0xbe,};
+ static const uint8_t codeunit[]={0x00, 0x65, 0x7e, 0x7f, 0xc2, 0xc4, 0xf0, 0xf4, 0x80, 0x81, 0xbc, 0xbe,};
int16_t i;
for(i=0; i<UPRV_LENGTHOF(codeunit); i++){
0x10401, 0x10401, 0x10401 ,
0x10401, 0x10401, 0x10401 ,
0x10401, 0x10401, 0x10401,
- 0x25, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
+ -1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
0x65, 0x65, 0x65,
0x31, 0x31, 0x31,
- 0x31, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
- 0x240, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1
+ -1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
+ -1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1
};
uint16_t i=0;
UChar32 c, expected;
uint32_t offset=0;
for(offset=0; offset<sizeof(input); offset++) {
- if (offset < sizeof(input) - 1) {
+ expected = result[i];
+ if (expected >= 0 && offset < sizeof(input) - 1) {
#if !U_HIDE_OBSOLETE_UTF_OLD_H
UTF8_GET_CHAR_UNSAFE(input, offset, c);
- if(c != result[i]){
- log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
+ if(c != expected) {
+ log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
+ offset, expected, c);
}
#endif
U8_GET_UNSAFE(input, offset, c);
- if(c != result[i]){
- log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
+ if(c != expected) {
+ log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
+ offset, expected, c);
}
}
}
static void TestNextPrevChar() {
- static const uint8_t input[]={0x61, 0xf0, 0x90, 0x90, 0x81, 0xc0, 0x80, 0xfd, 0xbe, 0xc2, 0x61, 0x81, 0x90, 0x90, 0xf0, 0x00};
+ static const uint8_t input[]={
+ 0x61,
+ 0xf0, 0x90, 0x90, 0x81,
+ 0xc0, 0x80, // non-shortest form
+ 0xf3, 0xbe, // truncated
+ 0xc2, // truncated
+ 0x61,
+ 0x81, 0x90, 0x90, 0xf0, // "backwards" sequence
+ 0x00
+ };
static const UChar32 result[]={
- /* next_unsafe next_safe_ns next_safe_s prev_unsafe prev_safe_ns prev_safe_s */
- 0x0061, 0x0061, 0x0061, 0x0000, 0x0000, 0x0000,
- 0x10401, 0x10401, 0x10401, 0xf0, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
- 0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x2841410, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
- 0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xa1050, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
- 0x81, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x2841, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
- 0x00, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, 0x61, 0x61, 0x61,
- 0x80, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xc2, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
- 0xfd, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, 0x77e, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
- 0xbe, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xfd, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
- 0xa1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x00, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
- 0x61, 0x61, 0x61, 0xc0, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
- 0x81, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x10401, 0x10401, 0x10401,
- 0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x410, UTF_ERROR_VALUE, UTF_ERROR_VALUE,
- 0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x410, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
- 0x0840, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xf0, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
- 0x0000, 0x0000, 0x0000, 0x0061, 0x0061, 0x0061
+ /* next_safe_ns next_safe_s prev_safe_ns prev_safe_s */
+ 0x0061, 0x0061, 0x0000, 0x0000,
+ 0x10401, 0x10401, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
+ UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
+ UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
+ UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
+ UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x61, 0x61,
+ UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
+ UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
+ UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
+ UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
+ 0x61, 0x61, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
+ UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x10401, 0x10401,
+ UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF_ERROR_VALUE, UTF_ERROR_VALUE,
+ UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
+ UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
+ 0x0000, 0x0000, 0x0061, 0x0061
};
static const int32_t movedOffset[]={
- /* next_unsafe next_safe_ns next_safe_s prev_unsafe prev_safe_ns prev_safe_s */
- 1, 1, 1, 15, 15, 15,
- 5, 5, 5, 14, 14 , 14,
- 3, 3, 3, 9, 13, 13,
- 4, 4, 4, 9, 12, 12,
- 5, 5, 5, 9, 11, 11,
- 7, 7, 7, 10, 10, 10,
- 7, 7, 7, 9, 9, 9,
- 8, 9, 9, 7, 7, 7,
- 9, 9, 9, 7, 7, 7,
- 11, 10, 10, 5, 5, 5,
- 11, 11, 11, 5, 5, 5,
- 12, 12, 12, 1, 1, 1,
- 13, 13, 13, 1, 1, 1,
- 14, 14, 14, 1, 1, 1,
- 14, 15, 15, 1, 1, 1,
- 14, 16, 16, 0, 0, 0,
+ /* next_safe prev_safe */
+ 1, 15,
+ 5, 14,
+ 3, 13,
+ 4, 12,
+ 5, 11,
+ 6, 10,
+ 7, 9,
+ 9, 7,
+ 9, 7,
+ 10, 6,
+ 11, 5,
+ 12, 1,
+ 13, 1,
+ 14, 1,
+ 15, 1,
+ 16, 0,
};
- /* TODO: remove unused columns for next_unsafe & prev_unsafe, and adjust the test code */
UChar32 c, expected;
- uint32_t i=0;
+ uint32_t i=0, j=0;
uint32_t offset=0;
int32_t setOffset=0;
for(offset=0; offset<sizeof(input); offset++){
- expected=result[i+1];
+ expected=result[i]; // next_safe_ns
#if !U_HIDE_OBSOLETE_UTF_OLD_H
- setOffset=offset;
- UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, FALSE);
- if(setOffset != movedOffset[i+1]){
- log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
- offset, movedOffset[i+1], setOffset);
- }
- if(c != expected){
- log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
+ setOffset=offset;
+ UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, FALSE);
+ if(setOffset != movedOffset[j]) {
+ log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
+ offset, movedOffset[j], setOffset);
+ }
+ if(c != expected) {
+ log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
}
#endif
- setOffset=offset;
- U8_NEXT(input, setOffset, sizeof(input), c);
- if(setOffset != movedOffset[i+1]){
- log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
- offset, movedOffset[i+1], setOffset);
- }
+ setOffset=offset;
+ U8_NEXT(input, setOffset, sizeof(input), c);
+ if(setOffset != movedOffset[j]) {
+ log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
+ offset, movedOffset[j], setOffset);
+ }
if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
- if(c != expected){
- log_err("ERROR: U8_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
+ if(c != expected) {
+ log_err("ERROR: U8_NEXT failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
}
setOffset=offset;
U8_NEXT_OR_FFFD(input, setOffset, sizeof(input), c);
- if(setOffset != movedOffset[i+1]){
+ if(setOffset != movedOffset[j]) {
log_err("ERROR: U8_NEXT_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
- offset, movedOffset[i+1], setOffset);
+ offset, movedOffset[j], setOffset);
}
if(expected<0) { expected=0xfffd; }
- if(c != expected){
- log_err("ERROR: U8_NEXT_OR_FFFD failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
+ if(c != expected) {
+ log_err("ERROR: U8_NEXT_OR_FFFD failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
}
#if !U_HIDE_OBSOLETE_UTF_OLD_H
- setOffset=offset;
- UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE);
- if(setOffset != movedOffset[i+1]){
- log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
- offset, movedOffset[i+2], setOffset);
- }
- if(c != result[i+2]){
- log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
- }
+ setOffset=offset;
+ UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE);
+ if(setOffset != movedOffset[j]) {
+ log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
+ offset, movedOffset[j], setOffset);
+ }
+ expected=result[i+1]; // next_safe_s
+ if(c != expected) {
+ log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed at offset=%ld. Expected:%lx Got:%lx\n",
+ offset, expected, c);
+ }
#endif
- i=i+6;
+ i=i+4;
+ j=j+2;
}
- i=0;
+ i=j=0;
for(offset=sizeof(input); offset > 0; --offset){
- expected=result[i+4];
+ expected=result[i+2]; // prev_safe_ns
#if !U_HIDE_OBSOLETE_UTF_OLD_H
- setOffset=offset;
- UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE);
- if(setOffset != movedOffset[i+4]){
- log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
- offset, movedOffset[i+4], setOffset);
- }
- if(c != expected){
- log_err("ERROR: UTF8_PREV_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
+ setOffset=offset;
+ UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE);
+ if(setOffset != movedOffset[j+1]) {
+ log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
+ offset, movedOffset[j+1], setOffset);
+ }
+ if(c != expected) {
+ log_err("ERROR: UTF8_PREV_CHAR_SAFE failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
}
#endif
- setOffset=offset;
- U8_PREV(input, 0, setOffset, c);
- if(setOffset != movedOffset[i+4]){
- log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
- offset, movedOffset[i+4], setOffset);
- }
+ setOffset=offset;
+ U8_PREV(input, 0, setOffset, c);
+ if(setOffset != movedOffset[j+1]) {
+ log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
+ offset, movedOffset[j+1], setOffset);
+ }
if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
- if(c != expected){
- log_err("ERROR: U8_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
+ if(c != expected) {
+ log_err("ERROR: U8_PREV failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
}
setOffset=offset;
U8_PREV_OR_FFFD(input, 0, setOffset, c);
- if(setOffset != movedOffset[i+4]){
+ if(setOffset != movedOffset[j+1]) {
log_err("ERROR: U8_PREV_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
- offset, movedOffset[i+4], setOffset);
+ offset, movedOffset[j+1], setOffset);
}
if(expected<0) { expected=0xfffd; }
- if(c != expected){
- log_err("ERROR: U8_PREV_OR_FFFD failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
+ if(c != expected) {
+ log_err("ERROR: U8_PREV_OR_FFFD failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
}
#if !U_HIDE_OBSOLETE_UTF_OLD_H
- setOffset=offset;
- UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, TRUE);
- if(setOffset != movedOffset[i+5]){
- log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
- offset, movedOffset[i+5], setOffset);
- }
- if(c != result[i+5]){
- log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+5], c);
- }
+ setOffset=offset;
+ UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, TRUE);
+ if(setOffset != movedOffset[j+1]) {
+ log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
+ offset, movedOffset[j+1], setOffset);
+ }
+ expected=result[i+3]; // prev_safe_s
+ if(c != expected) {
+ log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed at offset=%ld. Expected:%lx Got:%lx\n",
+ offset, expected, c);
+ }
#endif
- i=i+6;
+ i=i+4;
+ j=j+2;
}
}
static const uint8_t input[]={
/* 0 */ 0x61,
/* 1 */ 0xf0, 0x90, 0x90, 0x81,
- /* 5 */ 0xc0, 0x80,
+ /* 5 */ 0xc0,
+ /* 6 */ 0x80,
/* 7 */ 0xdf, 0x80,
/* 9 */ 0xc2,
/* 10 */ 0x62,
- /* 11 */ 0xfd, 0xbe,
+ /* 11 */ 0xfd,
+ /* 12 */ 0xbe,
/* 13 */ 0xe0, 0xa0, 0x80,
/* 16 */ 0xe2, 0x82, 0xac,
/* 19 */ 0xf0, 0x90, 0x90,
static const UChar32 result[]={
0x61,
0x10401,
- U_SENTINEL,
+ U_SENTINEL, // C0 not a lead byte
+ U_SENTINEL, // 80
0x7c0,
- U_SENTINEL,
+ U_SENTINEL, // C2
0x62,
- U_SENTINEL,
+ U_SENTINEL, // FD not a lead byte
+ U_SENTINEL, // BE
0x800,
0x20ac,
- U_SENTINEL,
+ U_SENTINEL, // truncated F0 90 90
0
};
log_err("U8_PREV(at %d) failed to read a non-character\n", idx);
}
}
+#if !U_HIDE_OBSOLETE_UTF_OLD_H
+ for(idx=0; idx<(int32_t)sizeof(nonChars);) {
+ UChar32 expected= nonChars[idx]<0xf0 ? 0xffff : 0x10ffff;
+ UTF8_NEXT_CHAR_SAFE(nonChars, idx, sizeof(nonChars), ch, TRUE);
+ if(ch!=expected) {
+ log_err("UTF8_NEXT_CHAR_SAFE(strict, before %d) failed to read a non-character\n", idx);
+ }
+ }
+ for(idx=(int32_t)sizeof(nonChars); idx>0;) {
+ UTF8_PREV_CHAR_SAFE(nonChars, 0, idx, ch, TRUE);
+ UChar32 expected= nonChars[idx]<0xf0 ? 0xffff : 0x10ffff;
+ if(ch!=expected) {
+ log_err("UTF8_PREV_CHAR_SAFE(strict, at %d) failed to read a non-character\n", idx);
+ }
+ }
+#endif
}
static void TestNextPrevCharUnsafe() {
static const UChar32 codePoints[]={
0x61,
0x10401,
- 0,
+ -1,
0x20ac,
0xa1,
0x10ffff,
0
};
- UChar32 c;
+ UChar32 c, expected;
int32_t i;
uint32_t offset;
#if !U_HIDE_OBSOLETE_UTF_OLD_H
for(i=0, offset=0; offset<sizeof(input); ++i) {
UTF8_NEXT_CHAR_UNSAFE(input, offset, c);
- if(c != codePoints[i]){
+ expected = codePoints[i];
+ if(expected >= 0 && c != expected) {
log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
- offset, codePoints[i], c);
+ offset, expected, c);
+ }
+ if(offset==6) {
+ // The obsolete UTF8_NEXT_CHAR_UNSAFE() skips 1+UTF8_COUNT_TRAIL_BYTES(lead) bytes
+ // while the new one skips C0 80 together.
+ ++offset;
}
}
#endif
for(i=0, offset=0; offset<sizeof(input); ++i) {
U8_NEXT_UNSAFE(input, offset, c);
- if(c != codePoints[i]){
+ expected = codePoints[i];
+ if(expected >= 0 && c != expected) {
log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
- offset, codePoints[i], c);
+ offset, expected, c);
}
}
#if !U_HIDE_OBSOLETE_UTF_OLD_H
for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){
- UTF8_PREV_CHAR_UNSAFE(input, offset, c);
- if(c != codePoints[i]){
- log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
- offset, codePoints[i], c);
- }
+ UTF8_PREV_CHAR_UNSAFE(input, offset, c);
+ expected = codePoints[i];
+ if(expected >= 0 && c != expected) {
+ log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
+ offset, expected, c);
+ }
}
#endif
for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){
- U8_PREV_UNSAFE(input, offset, c);
- if(c != codePoints[i]){
- log_err("ERROR: U8_PREV_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
- offset, codePoints[i], c);
- }
+ U8_PREV_UNSAFE(input, offset, c);
+ expected = codePoints[i];
+ if(expected >= 0 && c != expected) {
+ log_err("ERROR: U8_PREV_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
+ offset, expected, c);
+ }
}
}
static void TestFwdBack() {
- static const uint8_t input[]={0x61, 0xF0, 0x90, 0x90, 0x81, 0xff, 0x62, 0xc0, 0x80, 0x7f, 0x8f, 0xc0, 0x63, 0x81, 0x90, 0x90, 0xF0, 0x00};
- static const uint16_t fwd_safe[] ={1, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
- static const uint16_t back_safe[] ={17, 16, 15, 14, 13, 12, 11, 10, 9, 7, 6, 5, 1, 0};
+ static const uint8_t input[]={
+ 0x61,
+ 0xF0, 0x90, 0x90, 0x81,
+ 0xff,
+ 0x62,
+ 0xc0,
+ 0x80,
+ 0x7f,
+ 0x8f,
+ 0xc0,
+ 0x63,
+ 0x81,
+ 0x90,
+ 0x90,
+ 0xF0,
+ 0x00
+ };
+ static const uint16_t fwd_safe[] ={1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
+ static const uint16_t back_safe[] ={17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 1, 0};
- static const uint16_t Nvalue[]= {0, 1, 2, 3, 1, 2, 1, 5};
+ static const uint16_t Nvalue[]= {0, 1, 2, 4, 1, 2, 1, 5};
static const uint16_t fwd_N_safe[] ={0, 1, 6, 10, 11, 13, 14, 18}; /*safe macro keeps it at the end of the string */
- static const uint16_t back_N_safe[] ={18, 17, 15, 12, 11, 9, 7, 0};
+ static const uint16_t back_N_safe[] ={18, 17, 15, 11, 10, 8, 7, 0};
uint32_t offsafe=0;
0xf4, 0x8f, 0xbf, 0xbf,
0x00
};
- static const int8_t boundaries[]={ 0, 1, 5, 7, 10, 12, 16, 17 };
+ // forward unsafe skips only C0
+ static const int8_t boundaries[]={ 0, 1, 5, 6, 7, 10, 12, 16, 17 };
+ // backward unsafe skips C0 80 together
+ static const int8_t backBoundaries[]={ 0, 1, 5, 7, 10, 12, 16, 17 };
int32_t offset;
int32_t i;
}
}
#if !U_HIDE_OBSOLETE_UTF_OLD_H
- for(i=UPRV_LENGTHOF(boundaries)-2, offset=UPRV_LENGTHOF(input); offset>0; --i) {
+ for(i=UPRV_LENGTHOF(backBoundaries)-2, offset=UPRV_LENGTHOF(input); offset>0; --i) {
UTF8_BACK_1_UNSAFE(input, offset);
- if(offset != boundaries[i]){
- log_err("ERROR: UTF8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[i], offset);
+ if(offset != backBoundaries[i]){
+ log_err("ERROR: UTF8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[i], offset);
}
}
#endif
- for(i=UPRV_LENGTHOF(boundaries)-2, offset=UPRV_LENGTHOF(input); offset>0; --i) {
+ for(i=UPRV_LENGTHOF(backBoundaries)-2, offset=UPRV_LENGTHOF(input); offset>0; --i) {
U8_BACK_1_UNSAFE(input, offset);
- if(offset != boundaries[i]){
- log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[i], offset);
+ if(offset != backBoundaries[i]){
+ log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[i], offset);
}
}
#if !U_HIDE_OBSOLETE_UTF_OLD_H
}
}
#if !U_HIDE_OBSOLETE_UTF_OLD_H
- for(i=0; i<UPRV_LENGTHOF(boundaries); ++i) {
- int32_t j=UPRV_LENGTHOF(boundaries)-1-i;
+ for(i=0; i<UPRV_LENGTHOF(backBoundaries); ++i) {
+ int32_t j=UPRV_LENGTHOF(backBoundaries)-1-i;
offset=UPRV_LENGTHOF(input);
UTF8_BACK_N_UNSAFE(input, offset, i);
- if(offset != boundaries[j]) {
- log_err("ERROR: UTF8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[j], offset);
+ if(offset != backBoundaries[j]) {
+ log_err("ERROR: UTF8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[j], offset);
}
}
#endif
- for(i=0; i<UPRV_LENGTHOF(boundaries); ++i) {
- int32_t j=UPRV_LENGTHOF(boundaries)-1-i;
+ for(i=0; i<UPRV_LENGTHOF(backBoundaries); ++i) {
+ int32_t j=UPRV_LENGTHOF(backBoundaries)-1-i;
offset=UPRV_LENGTHOF(input);
U8_BACK_N_UNSAFE(input, offset, i);
- if(offset != boundaries[j]) {
- log_err("ERROR: U8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[j], offset);
+ if(offset != backBoundaries[j]) {
+ log_err("ERROR: U8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[j], offset);
}
}
}
log_err("L8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
}
- if(is!=iu || il!=iu) {
- log_err("U8_NEXT(b[%ld]) or L8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
+ // U8_NEXT() skips only the first byte of a surrogate byte sequence.
+ if(U_IS_SURROGATE(cu) ? is!=(i+1) : is!=iu) {
+ // Each message has a single %ld, so the start index is passed exactly once.
+ log_err("U8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i);
+ }
+ // L8_NEXT() must end at the same index as iu.
+ if(il!=iu) {
+ log_err("L8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i);
}
++k; /* next code point */
log_err("L8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
}
- if(is!=iu || il !=iu) {
- log_err("U8_PREV(b[%ld]) or L8_PREV(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
+ // U8_PREV() skips only the last byte of a surrogate byte sequence.
+ if(U_IS_SURROGATE(cu) ? is!=(i-1) : is!=iu) {
+ // Each message has a single %ld, so the start index is passed exactly once.
+ log_err("U8_PREV(b[%ld]) did not advance the index correctly\n", (long)i);
+ }
+ // L8_PREV() must end at the same index as iu.
+ if(il !=iu) {
+ log_err("L8_PREV(b[%ld]) did not advance the index correctly\n", (long)i);
}
i=iu; /* go back by one UTF-8 sequence */
coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
static const char *strings[] = {
- // U+FFFD
- "a\xef\xbf\xbdz",
- // illegal byte sequences
- "a\x80z", // trail byte
- "a\xc1\x81z", // non-shortest form
- "a\xe0\x82\x83z", // non-shortest form
- "a\xed\xa0\x80z", // lead surrogate: would be U+D800
- "a\xed\xbf\xbfz", // trail surrogate: would be U+DFFF
- "a\xf0\x8f\xbf\xbfz", // non-shortest form
- "a\xf4\x90\x80\x80z" // out of range: would be U+110000
+ // string with U+FFFD == illegal byte sequence
+ u8"a\uFFFDz", "a\x80z", // trail byte
+ u8"a\uFFFD\uFFFDz", "a\xc1\x81z", // non-shortest form
+ u8"a\uFFFD\uFFFD\uFFFDz", "a\xe0\x82\x83z", // non-shortest form
+ u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xa0\x80z", // lead surrogate: would be U+D800
+ u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xbf\xbfz", // trail surrogate: would be U+DFFF
+ u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf0\x8f\xbf\xbfz", // non-shortest form
+ u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf4\x90\x80\x80z" // out of range: would be U+110000
};
- StringPiece fffd(strings[0]);
- for(int32_t i = 1; i < UPRV_LENGTHOF(strings); ++i) {
- StringPiece illegal(strings[i]);
+ for(int32_t i = 0; i < UPRV_LENGTHOF(strings); i += 2) {
+ StringPiece fffd(strings[i]);
+ StringPiece illegal(strings[i + 1]);
UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode);
if(order != UCOL_EQUAL) {
- errln("compareUTF8(U+FFFD, string %d with illegal UTF-8)=%d != UCOL_EQUAL",
+ errln("compareUTF8(pair %d: U+FFFD, illegal UTF-8)=%d != UCOL_EQUAL",
(int)i, order);
}
}
StringTest::Test_UTF8_COUNT_TRAIL_BYTES() {
#if !U_HIDE_OBSOLETE_UTF_OLD_H
if(UTF8_COUNT_TRAIL_BYTES(0x7F) != 0
- || UTF8_COUNT_TRAIL_BYTES(0xC0) != 1
+ || UTF8_COUNT_TRAIL_BYTES(0xC2) != 1
|| UTF8_COUNT_TRAIL_BYTES(0xE0) != 2
|| UTF8_COUNT_TRAIL_BYTES(0xF0) != 3) {
errln("UTF8_COUNT_TRAIL_BYTES does not work right! See utf_old.h.");
// Note: U8_COUNT_TRAIL_BYTES (current) and UTF8_COUNT_TRAIL_BYTES (deprecated)
// have completely different implementations.
if (U8_COUNT_TRAIL_BYTES(0x7F) != 0
- || U8_COUNT_TRAIL_BYTES(0xC0) != 1
+ || U8_COUNT_TRAIL_BYTES(0xC2) != 1
|| U8_COUNT_TRAIL_BYTES(0xE0) != 2
|| U8_COUNT_TRAIL_BYTES(0xF0) != 3) {
errln("U8_COUNT_TRAIL_BYTES does not work right! See utf8.h.");
0xf3, 0xa0, 0x80, 0x80, 0xf4, 0x8f, 0xbf, 0xbf
};
static const UChar expected_utf16[] = {
- 0x41, 0xfffd,
- 0x61, 0xfffd,
- 0xfffd, 0x5a,
+ 0x41, 0xfffd, 0xfffd, 0xfffd,
+ 0x61, 0xfffd, 0xfffd, 0xfffd,
+ 0xfffd, 0xfffd, 0xfffd, 0xfffd,0x5a,
0xd900, 0xdc00, 0x7a,
0xd800, 0xdc00, 0xd840, 0xdc00,
0xdb40, 0xdc00, 0xdbff, 0xdfff
TESTCASE_AUTO(Ticket10562);
TESTCASE_AUTO(Ticket10983);
TESTCASE_AUTO(Ticket12130);
- TESTCASE_AUTO(Ticket12888);
TESTCASE_AUTO(Ticket13344);
TESTCASE_AUTO_END;
}
UChar buf[10];
int n = utext_extract(ut, 0, 9, buf, 10, &status);
TEST_SUCCESS(status);
- TEST_ASSERT(n==5);
+ TEST_ASSERT(n==7);
+ TEST_ASSERT(buf[0] == 0x41);
TEST_ASSERT(buf[1] == 0xfffd);
- TEST_ASSERT(buf[3] == 0xfffd);
TEST_ASSERT(buf[2] == 0x42);
+ TEST_ASSERT(buf[3] == 0xfffd);
+ TEST_ASSERT(buf[4] == 0xfffd);
+ TEST_ASSERT(buf[5] == 0xfffd);
+ TEST_ASSERT(buf[6] == 0x43);
utext_close(ut);
}
utext_close(&ut);
}
-// Ticket 12888: bad handling of illegal utf-8 containing many instances of the archaic, now illegal,
-// six byte utf-8 forms. Original implementation had an assumption that
-// there would be at most three utf-8 bytes per UTF-16 code unit.
-// The five and six byte sequences map to a single replacement character.
-
-void UTextTest::Ticket12888() {
- const char *badString =
- "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
- "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
- "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
- "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
- "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
- "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
- "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
- "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
- "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
- "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
- "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
- "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
- "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
- "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
- "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
- "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
- "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
- "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
- "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
- "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80";
-
- UErrorCode status = U_ZERO_ERROR;
- LocalUTextPointer ut(utext_openUTF8(NULL, badString, -1, &status));
- TEST_SUCCESS(status);
- for (;;) {
- UChar32 c = utext_next32(ut.getAlias());
- if (c == U_SENTINEL) {
- break;
- }
- }
- int32_t endIdx = utext_getNativeIndex(ut.getAlias());
- if (endIdx != (int32_t)strlen(badString)) {
- errln("%s:%d expected=%d, actual=%d", __FILE__, __LINE__, strlen(badString), endIdx);
- return;
- }
-
- for (int32_t prevIndex = endIdx; prevIndex>0;) {
- UChar32 c = utext_previous32(ut.getAlias());
- int32_t currentIndex = utext_getNativeIndex(ut.getAlias());
- if (c != 0xfffd) {
- errln("%s:%d (expected, actual, index) = (%d, %d, %d)\n",
- __FILE__, __LINE__, 0xfffd, c, currentIndex);
- break;
- }
- if (currentIndex != prevIndex - 6) {
- errln("%s:%d: wrong index. Expected, actual = %d, %d",
- __FILE__, __LINE__, prevIndex - 6, currentIndex);
- break;
- }
- prevIndex = currentIndex;
- }
-}
-
// Ticket 13344 The macro form of UTEXT_SETNATIVEINDEX failed when target was a trail surrogate
// of a supplementary character.
void Ticket10562();
void Ticket10983();
void Ticket12130();
- void Ticket12888();
void Ticket13344();
private:
/**
* Helper class for frozen UnicodeSets, implements contains() and span() optimized for BMP code points.
- *
+ *
* Latin-1: Look up bytes.
* 2-byte characters: Bits organized vertically.
* 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF, with mixed for illegal ranges.
- * Supplementary characters: Call contains() on the parent set.
+ * Supplementary characters: Binary search over
+ * the supplementary part of the parent set's inversion list.
*/
public final class BMPSet {
public static int U16_SURROGATE_OFFSET = ((0xd800 << 10) + 0xdc00 - 0x10000);
* One bit per code point from U+0000..U+07FF. The bits are organized vertically; consecutive code points
* correspond to the same bit positions in consecutive table words. With code point parts lead=c{10..6}
* trail=c{5..0} it is set.contains(c)==(table7FF[trail] bit lead)
- *
- * Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD) for faster validity checking at
- * runtime.
+ *
+ * Bits for 0..FF are unused (0).
*/
private int[] table7FF;
* t1=c{11..6} test bits (lead+16) and lead in bmpBlockBits[t1]. If the upper bit is 0, then the lower bit
* indicates if contains(c) for all code points in the 64-block. If the upper bit is 1, then the block is mixed
* and set.contains(c) must be called.
- *
- * Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to the result of contains(FFFD) for faster
- * validity checking at runtime.
+ *
+ * Bits for 0..7FF are unused (0).
*/
private int[] bmpBlockBits;
/**
* Span the initial substring for which each character c has spanCondition==contains(c). It must be
* spanCondition==0 or 1.
- *
+ *
* @param start The start index
* @param outCount If not null: Receives the number of code points in the span.
* @return the limit (exclusive end) of the span
* Symmetrical with span().
* Span the trailing substring for which each character c has spanCondition==contains(c). It must be s.length >=
* limit and spanCondition==0 or 1.
- *
+ *
* @return The string index which starts the span (i.e. inclusive).
*/
public final int spanBack(CharSequence s, int limit, SpanCondition spanCondition) {
/**
* Same as UnicodeSet.findCodePoint(int c) except that the binary search is restricted for finding code
* points in a certain range.
- *
+ *
* For restricting the search for finding in the range start..end, pass in lo=findCodePoint(start) and
* hi=findCodePoint(end) with 0<=lo<=hi<len. findCodePoint(c) defaults to lo=0 and hi=len-1.
- *
+ *
* @param c
* a character in a subrange of MIN_VALUE..MAX_VALUE
* @param lo
return (0 != (findCodePoint(c, lo, hi) & 1));
}
}
-