// Use a single counter for source and target, counting the minimum of
// the source length and the target capacity.
// Let the standard converter handle edge cases.
- const uint8_t *limit=sourceLimit;
if(count>targetCapacity) {
- limit-=(count-targetCapacity);
count=targetCapacity;
}
- // The conversion loop checks count>0 only once per 1/2/3-byte character.
- // If the buffer ends with a truncated 2- or 3-byte sequence,
+ // The conversion loop checks count>0 only once per character.
+ // If the buffer ends with a truncated sequence,
// then we reduce the count to stop before that,
// and collect the remaining bytes after the conversion loop.
- {
- // Do not go back into the bytes that will be read for finishing a partial
- // sequence from the previous buffer.
- int32_t length=count-toULimit;
- if(length>0) {
- uint8_t b1=*(limit-1);
- if(U8_IS_SINGLE(b1)) {
- // common ASCII character
- } else if(U8_IS_TRAIL(b1) && length>=2) {
- uint8_t b2=*(limit-2);
- if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
- // truncated 3-byte sequence
- count-=2;
- }
- } else if(0xc2<=b1 && b1<0xf0) {
- // truncated 2- or 3-byte sequence
- --count;
- }
- }
- }
+
+ // Do not go back into the bytes that will be read for finishing a partial
+ // sequence from the previous buffer.
+ int32_t length=count-toULimit;
+ U8_TRUNCATE_IF_INCOMPLETE(source, 0, length);
+ count=toULimit+length;
}
if(c!=0) {
}
/* copy the legal byte sequence to the target */
- if(count>=toULength) {
+ {
int8_t i;
for(i=0; i<oldToULength; ++i) {
*target++=*source++;
}
count-=toULength;
- } else {
- // A supplementary character that does not fit into the target.
- // Let the standard converter handle this.
- source-=(toULength-oldToULength);
- pToUArgs->source=(char *)source;
- pFromUArgs->target=(char *)target;
- *pErrorCode=U_USING_DEFAULT_WARNING;
- return;
}
}
}
utf8->toULength=toULength;
utf8->mode=toULimit;
break;
- } else if(!U8_IS_TRAIL(b=*source)) {
- /* lead byte in trail byte position */
+ } else if(!icu::UTF8::isValidTrail(c, b=*source, toULength, toULimit)) {
utf8->toULength=toULength;
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
break;
* If the offset points to a UTF-8 trail byte,
* then the offset is moved backward to the corresponding lead byte.
* Otherwise, it is not modified.
+ *
* "Safe" macro, checks for illegal sequences and for string boundaries.
+ * Unlike U8_TRUNCATE_IF_INCOMPLETE(), this macro always reads s[i].
*
* @param s const uint8_t * string
* @param start int32_t starting string offset (usually 0)
* @param i int32_t string offset, must be start<=i
* @see U8_SET_CP_START_UNSAFE
+ * @see U8_TRUNCATE_IF_INCOMPLETE
* @stable ICU 2.4
*/
#define U8_SET_CP_START(s, start, i) { \
} \
}
+/**
+ * If the string ends with a UTF-8 byte sequence that is valid so far
+ * but incomplete, then reduce the length of the string to end before
+ * the lead byte of that incomplete sequence.
+ * For example, if the string ends with E1 80, the length is reduced by 2.
+ *
+ * Useful for processing text split across multiple buffers
+ * (save the incomplete sequence for later)
+ * and for optimizing iteration
+ * (check for string length only once per character).
+ *
+ * "Safe" macro, checks for illegal sequences and for string boundaries.
+ * Unlike U8_SET_CP_START(), this macro never reads s[length].
+ *
+ * (In UTF-16, simply check for U16_IS_LEAD(last code unit).)
+ *
+ * The macro expands to one do { ... } while(0) statement, so it must be
+ * followed by a semicolon and can be used safely as the unbraced body of
+ * an if/else. The arguments are evaluated more than once: do not pass
+ * expressions with side effects, and length must be a modifiable lvalue.
+ *
+ * @param s const uint8_t * string
+ * @param start int32_t starting string offset (usually 0)
+ * @param length int32_t string length, must be start<=length;
+ *               reduced by 1, 2 or 3 if the string ends with an incomplete sequence
+ * @see U8_SET_CP_START
+ * @draft ICU 61
+ */
+#define U8_TRUNCATE_IF_INCOMPLETE(s, start, length) \
+    do { \
+        if((length)>(start)) { \
+            uint8_t __b1=(s)[(length)-1]; \
+            if(U8_IS_SINGLE(__b1)) { \
+                /* common ASCII character */ \
+            } else if(U8_IS_LEAD(__b1)) { \
+                /* a lead byte alone always starts an incomplete sequence */ \
+                --(length); \
+            } else if(U8_IS_TRAIL(__b1) && ((length)-2)>=(start)) { \
+                uint8_t __b2=(s)[(length)-2]; \
+                if(0xe0<=__b2 && __b2<=0xf4) { \
+                    /* 3- or 4-byte lead followed by one trail byte */ \
+                    if(__b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(__b2, __b1) : \
+                            U8_IS_VALID_LEAD4_AND_T1(__b2, __b1)) { \
+                        (length)-=2; \
+                    } \
+                } else if(U8_IS_TRAIL(__b2) && ((length)-3)>=(start)) { \
+                    /* two trail bytes: only a 4-byte lead can still be incomplete */ \
+                    uint8_t __b3=(s)[(length)-3]; \
+                    if(0xf0<=__b3 && __b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(__b3, __b2)) { \
+                        (length)-=3; \
+                    } \
+                } \
+            } \
+        } \
+    } while(0)
+
/* definitions with backward iteration -------------------------------------- */
/**
int32_t i=*pi;
if(U8_IS_TRAIL(c) && i>start) {
uint8_t b1=s[--i];
- if(0xc2<=b1 && b1<0xe0) {
- *pi=i;
- return ((b1-0xc0)<<6)|(c&0x3f);
+ if(U8_IS_LEAD(b1)) {
+ if(b1<0xe0) {
+ *pi=i;
+ return ((b1-0xc0)<<6)|(c&0x3f);
+ } else if(b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c)) {
+ // Truncated 3- or 4-byte sequence.
+ *pi=i;
+ return errorValue(1, strict);
+ }
} else if(U8_IS_TRAIL(b1) && i>start) {
// Extract the value bits from the last trail byte.
c&=0x3f;
uint8_t b2=s[--i];
- if(0xe0<=b2 && b2<0xf0) {
- b2&=0xf;
- if(strict!=-2) {
- if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
- *pi=i;
- c=(b2<<12)|((b1&0x3f)<<6)|c;
- if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
- return c;
- } else {
- // strict: forbid non-characters like U+fffe
- return errorValue(2, strict);
+ if(0xe0<=b2 && b2<=0xf4) {
+ if(b2<0xf0) {
+ b2&=0xf;
+ if(strict!=-2) {
+ if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
+ *pi=i;
+ c=(b2<<12)|((b1&0x3f)<<6)|c;
+ if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
+ return c;
+ } else {
+ // strict: forbid non-characters like U+fffe
+ return errorValue(2, strict);
+ }
+ }
+ } else {
+ // strict=-2 -> lenient: allow surrogates
+ b1-=0x80;
+ if((b2>0 || b1>=0x20)) {
+ *pi=i;
+ return (b2<<12)|(b1<<6)|c;
}
}
- } else {
- // strict=-2 -> lenient: allow surrogates
- b1-=0x80;
- if((b2>0 || b1>=0x20)) {
- *pi=i;
- return (b2<<12)|(b1<<6)|c;
- }
+ } else if(U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
+ // Truncated 4-byte sequence.
+ *pi=i;
+ return errorValue(2, strict);
}
} else if(U8_IS_TRAIL(b2) && i>start) {
uint8_t b3=s[--i];
}
}
}
- } else if(0xf0<=b2 && b2<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
- // Truncated 4-byte sequence.
- *pi=i;
- return errorValue(2, strict);
}
- } else if((0xe0<=b1 && b1<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
- (0xf0<=b1 && b1<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
- // Truncated 3- or 4-byte sequence.
- *pi=i;
- return errorValue(1, strict);
}
}
return errorValue(0, strict);
uint8_t c=s[i];
if(U8_IS_TRAIL(c) && i>start) {
uint8_t b1=s[--i];
- if(0xc2<=b1 && b1<0xe0) {
- return i;
+ if(U8_IS_LEAD(b1)) {
+ if(b1<0xe0 ||
+ (b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
+ return i;
+ }
} else if(U8_IS_TRAIL(b1) && i>start) {
uint8_t b2=s[--i];
- if(0xe0<=b2 && b2<0xf0) {
- if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
+ if(0xe0<=b2 && b2<=0xf4) {
+ if(b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b2, b1) : U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
return i;
}
} else if(U8_IS_TRAIL(b2) && i>start) {
uint8_t b3=s[--i];
- if(0xf0<=b3 && b3<=0xf4) {
- if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
- return i;
- }
+ if(0xf0<=b3 && b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
+ return i;
}
- } else if(0xf0<=b2 && b2<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
- // Truncated 4-byte sequence.
- return i;
}
- } else if((0xe0<=b1 && b1<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
- (0xf0<=b1 && b1<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
- // Truncated 3- or 4-byte sequence.
- return i;
}
}
return orig_i;
static void TestFwdBackUnsafe(void);
static void TestSetChar(void);
static void TestSetCharUnsafe(void);
+static void TestTruncateIfIncomplete(void);
static void TestAppendChar(void);
static void TestAppend(void);
static void TestSurrogates(void);
addTest(root, &TestFwdBackUnsafe, "utf8tst/TestFwdBackUnsafe");
addTest(root, &TestSetChar, "utf8tst/TestSetChar");
addTest(root, &TestSetCharUnsafe, "utf8tst/TestSetCharUnsafe");
+ addTest(root, &TestTruncateIfIncomplete, "utf8tst/TestTruncateIfIncomplete");
addTest(root, &TestAppendChar, "utf8tst/TestAppendChar");
addTest(root, &TestAppend, "utf8tst/TestAppend");
addTest(root, &TestSurrogates, "utf8tst/TestSurrogates");
}
}
+static void TestTruncateIfIncomplete() {
+    // How this differs from U8_SET_CP_START():
+    // U8_TRUNCATE_IF_INCOMPLETE() never inspects s[length].
+    // So when the final byte is a lead byte, the macro truncates
+    // regardless of whether the byte at the input index could continue
+    // a valid sequence (that byte need not even be a trail byte).
+    // When the final byte is a trail byte, both macros give the same result.
+    static const struct {
+        const char *bytes;
+        int32_t wantLength;
+    } testCases[] = {
+        // empty input
+        { "", 0 },
+        // one byte
+        { "a", 1 },
+        { "\x80", 1 },
+        { "\xC1", 1 },
+        { "\xC2", 0 },
+        { "\xE0", 0 },
+        { "\xF4", 0 },
+        { "\xF5", 1 },
+        // two bytes
+        { "\x80\x80", 2 },
+        { "\xC2\xA0", 2 },
+        { "\xE0\x9F", 2 },
+        { "\xE0\xA0", 0 },
+        { "\xED\x9F", 0 },
+        { "\xED\xA0", 2 },
+        { "\xF0\x8F", 2 },
+        { "\xF0\x90", 0 },
+        { "\xF4\x8F", 0 },
+        { "\xF4\x90", 2 },
+        { "\xF5\x80", 2 },
+        // three bytes
+        { "\x80\x80\x80", 3 },
+        { "\xC2\xA0\x80", 3 },
+        { "\xE0\xA0\x80", 3 },
+        { "\xF0\x8F\x80", 3 },
+        { "\xF0\x90\x80", 0 },
+        { "\xF4\x8F\x80", 0 },
+        { "\xF4\x90\x80", 3 },
+        { "\xF5\x80\x80", 3 },
+        // four bytes
+        { "\x80\x80\x80\x80", 4 },
+        { "\xC2\xA0\x80\x80", 4 },
+        { "\xE0\xA0\x80\x80", 4 },
+        { "\xF0\x90\x80\x80", 4 },
+        { "\xF5\x80\x80\x80", 4 }
+    };
+    int32_t caseIndex;
+    for (caseIndex = 0; caseIndex < UPRV_LENGTHOF(testCases); ++caseIndex) {
+        const char *bytes = testCases[caseIndex].bytes;
+        int32_t fullLength = (int32_t)strlen(bytes);
+        // The macro updates its length argument in place, so work on a copy.
+        int32_t truncated = fullLength;
+        U8_TRUNCATE_IF_INCOMPLETE(bytes, 0, truncated);
+        if (truncated == testCases[caseIndex].wantLength) {
+            continue;
+        }
+        log_err("ERROR: U8_TRUNCATE_IF_INCOMPLETE failed for i=%d, length=%d. Expected:%d Got:%d\n",
+                (int)caseIndex, (int)fullLength, (int)testCases[caseIndex].wantLength, (int)truncated);
+    }
+}
+
static void TestAppendChar(){
#if !U_HIDE_OBSOLETE_UTF_OLD_H
static const uint8_t s[11]={0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00};