From: Markus Scherer Date: Fri, 9 Feb 2018 21:01:56 +0000 (+0000) Subject: ICU-13515 UTF-8 macro: reduce length of string if it ends with an incomplete sequence X-Git-Tag: release-61-rc~110 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=19b494f953df2c5a4ffc7743d4e1e2bafdbde62e;p=icu ICU-13515 UTF-8 macro: reduce length of string if it ends with an incomplete sequence X-SVN-Rev: 40883 --- diff --git a/icu4c/source/common/ucnv_u8.cpp b/icu4c/source/common/ucnv_u8.cpp index 094e2dfb6f4..7089d9400c6 100644 --- a/icu4c/source/common/ucnv_u8.cpp +++ b/icu4c/source/common/ucnv_u8.cpp @@ -696,36 +696,20 @@ ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs, // Use a single counter for source and target, counting the minimum of // the source length and the target capacity. // Let the standard converter handle edge cases. - const uint8_t *limit=sourceLimit; if(count>targetCapacity) { - limit-=(count-targetCapacity); count=targetCapacity; } - // The conversion loop checks count>0 only once per 1/2/3-byte character. - // If the buffer ends with a truncated 2- or 3-byte sequence, + // The conversion loop checks count>0 only once per character. + // If the buffer ends with a truncated sequence, // then we reduce the count to stop before that, // and collect the remaining bytes after the conversion loop. - { - // Do not go back into the bytes that will be read for finishing a partial - // sequence from the previous buffer. - int32_t length=count-toULimit; - if(length>0) { - uint8_t b1=*(limit-1); - if(U8_IS_SINGLE(b1)) { - // common ASCII character - } else if(U8_IS_TRAIL(b1) && length>=2) { - uint8_t b2=*(limit-2); - if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) { - // truncated 3-byte sequence - count-=2; - } - } else if(0xc2<=b1 && b1<0xf0) { - // truncated 2- or 3-byte sequence - --count; - } - } - } + + // Do not go back into the bytes that will be read for finishing a partial + // sequence from the previous buffer. + int32_t length=count-toULimit; + U8_TRUNCATE_IF_INCOMPLETE(source, 0, length); + count=toULimit+length; } if(c!=0) { @@ -815,7 +799,7 @@ moreBytes: } /* copy the legal byte sequence to the target */ - if(count>=toULength) { + { int8_t i; for(i=0; isource=(char *)source; - pFromUArgs->target=(char *)target; - *pErrorCode=U_USING_DEFAULT_WARNING; - return; } } } @@ -857,8 +833,7 @@ moreBytes: utf8->toULength=toULength; utf8->mode=toULimit; break; - } else if(!U8_IS_TRAIL(b=*source)) { - /* lead byte in trail byte position */ + } else if(!icu::UTF8::isValidTrail(c, b=*source, toULength, toULimit)) { utf8->toULength=toULength; *pErrorCode=U_ILLEGAL_CHAR_FOUND; break; diff --git a/icu4c/source/common/unicode/utf8.h b/icu4c/source/common/unicode/utf8.h index 021fdcf1f24..555e8bd4e28 100644 --- a/icu4c/source/common/unicode/utf8.h +++ b/icu4c/source/common/unicode/utf8.h @@ -592,12 +592,15 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); * If the offset points to a UTF-8 trail byte, * then the offset is moved backward to the corresponding lead byte. * Otherwise, it is not modified. + * * "Safe" macro, checks for illegal sequences and for string boundaries. + * Unlike U8_TRUNCATE_IF_INCOMPLETE(), this macro always reads s[i]. * * @param s const uint8_t * string * @param start int32_t starting string offset (usually 0) * @param i int32_t string offset, must be start<=i * @see U8_SET_CP_START_UNSAFE + * @see U8_TRUNCATE_IF_INCOMPLETE * @stable ICU 2.4 */ #define U8_SET_CP_START(s, start, i) { \ @@ -606,6 +609,51 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); } \ } +/** + * If the string ends with a UTF-8 byte sequence that is valid so far + * but incomplete, then reduce the length of the string to end before + * the lead byte of that incomplete sequence. + * For example, if the string ends with E1 80, the length is reduced by 2. + * + * Useful for processing text split across multiple buffers + * (save the incomplete sequence for later) + * and for optimizing iteration + * (check for string length only once per character). + * + * "Safe" macro, checks for illegal sequences and for string boundaries. + * Unlike U8_SET_CP_START(), this macro never reads s[length]. + * + * (In UTF-16, simply check for U16_IS_LEAD(last code unit).) + * + * @param s const uint8_t * string + * @param start int32_t starting string offset (usually 0) + * @param length int32_t string length, must be start<=length + * @see U8_SET_CP_START + * @draft ICU 61 + */ +#define U8_TRUNCATE_IF_INCOMPLETE(s, start, length) \ + if((length)>(start)) { \ + uint8_t __b1=s[(length)-1]; \ + if(U8_IS_SINGLE(__b1)) { \ + /* common ASCII character */ \ + } else if(U8_IS_LEAD(__b1)) { \ + --(length); \ + } else if(U8_IS_TRAIL(__b1) && ((length)-2)>=(start)) { \ + uint8_t __b2=s[(length)-2]; \ + if(0xe0<=__b2 && __b2<=0xf4) { \ + if(__b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(__b2, __b1) : \ + U8_IS_VALID_LEAD4_AND_T1(__b2, __b1)) { \ + (length)-=2; \ + } \ + } else if(U8_IS_TRAIL(__b2) && ((length)-3)>=(start)) { \ + uint8_t __b3=s[(length)-3]; \ + if(0xf0<=__b3 && __b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(__b3, __b2)) { \ + (length)-=3; \ + } \ + } \ + } \ + } + /* definitions with backward iteration -------------------------------------- */ /** diff --git a/icu4c/source/common/utf_impl.cpp b/icu4c/source/common/utf_impl.cpp index f78c566e098..9dd241a12bf 100644 --- a/icu4c/source/common/utf_impl.cpp +++ b/icu4c/source/common/utf_impl.cpp @@ -238,33 +238,45 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U int32_t i=*pi; if(U8_IS_TRAIL(c) && i>start) { uint8_t b1=s[--i]; - if(0xc2<=b1 && b1<0xe0) { - *pi=i; - return ((b1-0xc0)<<6)|(c&0x3f); + if(U8_IS_LEAD(b1)) { + if(b1<0xe0) { + *pi=i; + return ((b1-0xc0)<<6)|(c&0x3f); + } else if(b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c)) { + // Truncated 3- or 4-byte sequence. + *pi=i; + return errorValue(1, strict); + } } else if(U8_IS_TRAIL(b1) && i>start) { // Extract the value bits from the last trail byte. c&=0x3f; uint8_t b2=s[--i]; - if(0xe0<=b2 && b2<0xf0) { - b2&=0xf; - if(strict!=-2) { - if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) { - *pi=i; - c=(b2<<12)|((b1&0x3f)<<6)|c; - if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) { - return c; - } else { - // strict: forbid non-characters like U+fffe - return errorValue(2, strict); + if(0xe0<=b2 && b2<=0xf4) { + if(b2<0xf0) { + b2&=0xf; + if(strict!=-2) { + if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) { + *pi=i; + c=(b2<<12)|((b1&0x3f)<<6)|c; + if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) { + return c; + } else { + // strict: forbid non-characters like U+fffe + return errorValue(2, strict); + } + } + } else { + // strict=-2 -> lenient: allow surrogates + b1-=0x80; + if((b2>0 || b1>=0x20)) { + *pi=i; + return (b2<<12)|(b1<<6)|c; } } - } else { - // strict=-2 -> lenient: allow surrogates - b1-=0x80; - if((b2>0 || b1>=0x20)) { - *pi=i; - return (b2<<12)|(b1<<6)|c; - } + } else if(U8_IS_VALID_LEAD4_AND_T1(b2, b1)) { + // Truncated 4-byte sequence. + *pi=i; + return errorValue(2, strict); } } else if(U8_IS_TRAIL(b2) && i>start) { uint8_t b3=s[--i]; @@ -281,16 +293,7 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U } } } - } else if(0xf0<=b2 && b2<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) { - // Truncated 4-byte sequence. - *pi=i; - return errorValue(2, strict); } - } else if((0xe0<=b1 && b1<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b1, c)) || - (0xf0<=b1 && b1<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b1, c))) { - // Truncated 3- or 4-byte sequence. - *pi=i; - return errorValue(1, strict); } } return errorValue(0, strict); @@ -303,29 +306,23 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) { uint8_t c=s[i]; if(U8_IS_TRAIL(c) && i>start) { uint8_t b1=s[--i]; - if(0xc2<=b1 && b1<0xe0) { - return i; + if(U8_IS_LEAD(b1)) { + if(b1<0xe0 || + (b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c))) { + return i; + } } else if(U8_IS_TRAIL(b1) && i>start) { uint8_t b2=s[--i]; - if(0xe0<=b2 && b2<0xf0) { - if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) { + if(0xe0<=b2 && b2<=0xf4) { + if(b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b2, b1) : U8_IS_VALID_LEAD4_AND_T1(b2, b1)) { return i; } } else if(U8_IS_TRAIL(b2) && i>start) { uint8_t b3=s[--i]; - if(0xf0<=b3 && b3<=0xf4) { - if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) { - return i; - } + if(0xf0<=b3 && b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b3, b2)) { + return i; } - } else if(0xf0<=b2 && b2<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) { - // Truncated 4-byte sequence. - return i; } - } else if((0xe0<=b1 && b1<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b1, c)) || - (0xf0<=b1 && b1<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b1, c))) { - // Truncated 3- or 4-byte sequence. - return i; } } return orig_i; diff --git a/icu4c/source/test/cintltst/utf8tst.c b/icu4c/source/test/cintltst/utf8tst.c index 0bbb5e5413d..b7062e3b82f 100644 --- a/icu4c/source/test/cintltst/utf8tst.c +++ b/icu4c/source/test/cintltst/utf8tst.c @@ -94,6 +94,7 @@ static void TestFwdBack(void); static void TestFwdBackUnsafe(void); static void TestSetChar(void); static void TestSetCharUnsafe(void); +static void TestTruncateIfIncomplete(void); static void TestAppendChar(void); static void TestAppend(void); static void TestSurrogates(void); @@ -114,6 +115,7 @@ addUTF8Test(TestNode** root) addTest(root, &TestFwdBackUnsafe, "utf8tst/TestFwdBackUnsafe"); addTest(root, &TestSetChar, "utf8tst/TestSetChar"); addTest(root, &TestSetCharUnsafe, "utf8tst/TestSetCharUnsafe"); + addTest(root, &TestTruncateIfIncomplete, "utf8tst/TestTruncateIfIncomplete"); addTest(root, &TestAppendChar, "utf8tst/TestAppendChar"); addTest(root, &TestAppend, "utf8tst/TestAppend"); addTest(root, &TestSurrogates, "utf8tst/TestSurrogates"); @@ -927,6 +929,64 @@ static void TestSetCharUnsafe() { } } +static void TestTruncateIfIncomplete() { + // Difference from U8_SET_CP_START(): + // U8_TRUNCATE_IF_INCOMPLETE() does not look at s[length]. + // Therefore, if the last byte is a lead byte, then this macro truncates + // even if the byte at the input index cannot continue a valid sequence + // (including when that is not a trail byte). + // On the other hand, if the last byte is a trail byte, then the two macros behave the same. + static const struct { + const char *s; + int32_t expected; + } cases[] = { + { "", 0 }, + { "a", 1 }, + { "\x80", 1 }, + { "\xC1", 1 }, + { "\xC2", 0 }, + { "\xE0", 0 }, + { "\xF4", 0 }, + { "\xF5", 1 }, + { "\x80\x80", 2 }, + { "\xC2\xA0", 2 }, + { "\xE0\x9F", 2 }, + { "\xE0\xA0", 0 }, + { "\xED\x9F", 0 }, + { "\xED\xA0", 2 }, + { "\xF0\x8F", 2 }, + { "\xF0\x90", 0 }, + { "\xF4\x8F", 0 }, + { "\xF4\x90", 2 }, + { "\xF5\x80", 2 }, + { "\x80\x80\x80", 3 }, + { "\xC2\xA0\x80", 3 }, + { "\xE0\xA0\x80", 3 }, + { "\xF0\x8F\x80", 3 }, + { "\xF0\x90\x80", 0 }, + { "\xF4\x8F\x80", 0 }, + { "\xF4\x90\x80", 3 }, + { "\xF5\x80\x80", 3 }, + { "\x80\x80\x80\x80", 4 }, + { "\xC2\xA0\x80\x80", 4 }, + { "\xE0\xA0\x80\x80", 4 }, + { "\xF0\x90\x80\x80", 4 }, + { "\xF5\x80\x80\x80", 4 } + }; + int32_t i; + for (i = 0; i < UPRV_LENGTHOF(cases); ++i) { + const char *s = cases[i].s; + int32_t expected = cases[i].expected; + int32_t length = (int32_t)strlen(s); + int32_t adjusted = length; + U8_TRUNCATE_IF_INCOMPLETE(s, 0, adjusted); + if (adjusted != expected) { + log_err("ERROR: U8_TRUNCATE_IF_INCOMPLETE failed for i=%d, length=%d. Expected:%d Got:%d\n", + (int)i, (int)length, (int)expected, (int)adjusted); + } + } +} + static void TestAppendChar(){ #if !U_HIDE_OBSOLETE_UTF_OLD_H static const uint8_t s[11]={0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00};