From: Markus Scherer Date: Tue, 9 Oct 2012 05:17:43 +0000 (+0000) Subject: ICU-9398 avoid use of utf8_countTrailBytes[], rewrite/optimize U8_COUNT_TRAIL_BYTES... X-Git-Tag: milestone-59-0-1~3447 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=996422e74c2ec534abe76f7e15eb289e9d9fd28c;p=icu ICU-9398 avoid use of utf8_countTrailBytes[], rewrite/optimize U8_COUNT_TRAIL_BYTES() & U8_NEXT_UNSAFE(), test _UNSAFE macros only with (mostly) well-formed UTF-8 text X-SVN-Rev: 32574 --- diff --git a/icu4c/source/common/ucnv_u8.c b/icu4c/source/common/ucnv_u8.c index 1b6249274bb..8ee9fe54764 100644 --- a/icu4c/source/common/ucnv_u8.c +++ b/icu4c/source/common/ucnv_u8.c @@ -821,7 +821,7 @@ ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs, if(U8_IS_TRAIL(b)) { ++i; } else { - if(i(sourceLimit-source)) { /* collect a truncated byte sequence */ toULength=0; diff --git a/icu4c/source/common/ucnvlat1.c b/icu4c/source/common/ucnvlat1.c index 54140ec92bd..202f8aabeda 100644 --- a/icu4c/source/common/ucnvlat1.c +++ b/icu4c/source/common/ucnvlat1.c @@ -1,6 +1,6 @@ /* ********************************************************************** -* Copyright (C) 2000-2011, International Business Machines +* Copyright (C) 2000-2012, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * file name: ucnvlat1.cpp @@ -406,7 +406,7 @@ ucnv_Latin1FromUTF8(UConverterFromUnicodeArgs *pFromUArgs, if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) { utf8->toUnicodeStatus=utf8->toUBytes[0]=b=*source++; utf8->toULength=1; - utf8->mode=utf8_countTrailBytes[b]+1; + utf8->mode=U8_COUNT_TRAIL_BYTES(b)+1; } /* write back the updated pointers */ diff --git a/icu4c/source/common/ucnvmbcs.c b/icu4c/source/common/ucnvmbcs.c index f88adf1d8d5..becb49237e8 100644 --- a/icu4c/source/common/ucnvmbcs.c +++ b/icu4c/source/common/ucnvmbcs.c @@ -4930,7 +4930,7 @@ ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, if(U8_IS_TRAIL(b)) { ++i; } else { - if(isourceLimit)) { c=utf8->toUBytes[0]=b=*source++; toULength=1; - toULimit=utf8_countTrailBytes[b]+1; + toULimit=U8_COUNT_TRAIL_BYTES(b)+1; while(sourcetoUBytes[toULength++]=b=*source++; c=(c<<6)+b; @@ -5230,7 +5230,7 @@ ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, if(U8_IS_TRAIL(b)) { ++i; } else { - if(isourceLimit)) { c=utf8->toUBytes[0]=b=*source++; toULength=1; - toULimit=utf8_countTrailBytes[b]+1; + toULimit=U8_COUNT_TRAIL_BYTES(b)+1; while(sourcetoUBytes[toULength++]=b=*source++; c=(c<<6)+b; diff --git a/icu4c/source/common/unicode/utf8.h b/icu4c/source/common/unicode/utf8.h index 8318c7bb0d2..2e2f02ac4ee 100644 --- a/icu4c/source/common/unicode/utf8.h +++ b/icu4c/source/common/unicode/utf8.h @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 1999-2011, International Business Machines +* Copyright (C) 1999-2012, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -60,13 +60,41 @@ U_CFUNC U_IMPORT const uint8_t /* U_IMPORT2? */ /*U_IMPORT*/ utf8_countTrailBytes[256]; /** - * Count the trail bytes for a UTF-8 lead byte. + * Counts the trail bytes for a UTF-8 lead byte. + * Returns 0 for 0..0xbf as well as for 0xfe and 0xff. * * This is internal since it is not meant to be called directly by external clients; * however it is called by public macros in this file and thus must remain stable. + * + * Note: Beginning with ICU 50, the implementation uses a multi-condition expression + * which was shown in 2012 (on x86-64) to compile to fast, branch-free code. + * leadByte is evaluated multiple times. + * + * The pre-ICU 50 implementation used the exported array utf8_countTrailBytes: + * #define U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[leadByte]) + * leadByte was evaluated exactly once. + * + * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. * @internal */ -#define U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[(uint8_t)leadByte]) +#define U8_COUNT_TRAIL_BYTES(leadByte) \ + ((leadByte)<0xf0 ? \ + ((leadByte)>=0xc0)+((leadByte)>=0xe0) : \ + (leadByte)<0xfe ? 3+((leadByte)>=0xf8)+((leadByte)>=0xfc) : 0) + +/** + * Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence. + * The maximum supported lead byte is 0xf4 corresponding to U+10FFFF. + * leadByte might be evaluated multiple times. + * + * This is internal since it is not meant to be called directly by external clients; + * however it is called by public macros in this file and thus must remain stable. + * + * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. + * @internal + */ +#define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \ + (((leadByte)>=0xc0)+((leadByte)>=0xe0)+((leadByte)>=0xf0)) /** * Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value. @@ -243,19 +271,16 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); */ #define U8_NEXT_UNSAFE(s, i, c) { \ (c)=(uint8_t)(s)[(i)++]; \ - if((uint8_t)((c)-0xc0)<0x35) { \ - uint8_t __count=U8_COUNT_TRAIL_BYTES(c); \ - U8_MASK_LEAD_BYTE(c, __count); \ - switch(__count) { \ - /* each following branch falls through to the next one */ \ - case 3: \ - (c)=((c)<<6)|((s)[(i)++]&0x3f); \ - case 2: \ - (c)=((c)<<6)|((s)[(i)++]&0x3f); \ - case 1: \ - (c)=((c)<<6)|((s)[(i)++]&0x3f); \ - /* no other branches to optimize switch() */ \ - break; \ + if((c)>=0x80) { \ + if((c)<0xe0) { \ + (c)=(((c)&0x1f)<<6)|((s)[(i)++]&0x3f); \ + } else if((c)<0xf0) { \ + /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \ + (c)=(UChar)(((c)<<12)|(((s)[i]&0x3f)<<6)|((s)[(i)+1]&0x3f)); \ + (i)+=2; \ + } else { \ + (c)=(((c)&7)<<18)|(((s)[i]&0x3f)<<12)|(((s)[(i)+1]&0x3f)<<6)|((s)[(i)+2]&0x3f); \ + (i)+=3; \ } \ } \ } @@ -382,7 +407,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); * @stable ICU 2.4 */ #define U8_FWD_1_UNSAFE(s, i) { \ - (i)+=1+U8_COUNT_TRAIL_BYTES((s)[i]); \ + (i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((uint8_t)(s)[i]); \ } /** diff --git a/icu4c/source/test/cintltst/utf8tst.c b/icu4c/source/test/cintltst/utf8tst.c index eaa8fb28c54..0635aa64930 100644 --- a/icu4c/source/test/cintltst/utf8tst.c +++ b/icu4c/source/test/cintltst/utf8tst.c @@ -1,6 +1,6 @@ /******************************************************************** - * COPYRIGHT: - * Copyright (c) 1998-2006, International Business Machines Corporation and + * COPYRIGHT: + * Copyright (c) 1998-2012, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ /* @@ -9,7 +9,7 @@ * Modification History: * * Date Name Description -* 07/24/2000 Madhu Creation +* 07/24/2000 Madhu Creation ******************************************************************************* */ @@ -18,7 +18,7 @@ #include "cmemory.h" #include "cintltst.h" -#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0])) +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) /* lenient UTF-8 ------------------------------------------------------------ */ @@ -64,8 +64,12 @@ static void TestCodeUnitValues(void); static void TestCharLength(void); static void TestGetChar(void); static void TestNextPrevChar(void); +static void TestNextPrevNonCharacters(void); +static void TestNextPrevCharUnsafe(void); static void TestFwdBack(void); +static void TestFwdBackUnsafe(void); static void TestSetChar(void); +static void TestSetCharUnsafe(void); static void TestAppendChar(void); static void TestAppend(void); static void TestSurrogates(void); @@ -75,23 +79,27 @@ void addUTF8Test(TestNode** root); void addUTF8Test(TestNode** root) { - addTest(root, &TestCodeUnitValues, "utf8tst/TestCodeUnitValues"); - addTest(root, &TestCharLength, "utf8tst/TestCharLength" ); - addTest(root, &TestGetChar, "utf8tst/TestGetChar" ); - addTest(root, &TestNextPrevChar, "utf8tst/TestNextPrevChar" ); - addTest(root, &TestFwdBack, "utf8tst/TestFwdBack" ); - addTest(root, &TestSetChar, "utf8tst/TestSetChar" ); - addTest(root, &TestAppendChar, "utf8tst/TestAppendChar" ); - addTest(root, &TestAppend, "utf8tst/TestAppend" ); - addTest(root, &TestSurrogates, "utf8tst/TestSurrogates" ); + addTest(root, &TestCodeUnitValues, "utf8tst/TestCodeUnitValues"); + addTest(root, &TestCharLength, "utf8tst/TestCharLength"); + addTest(root, &TestGetChar, "utf8tst/TestGetChar"); + addTest(root, &TestNextPrevChar, "utf8tst/TestNextPrevChar"); + addTest(root, &TestNextPrevNonCharacters, "utf8tst/TestNextPrevNonCharacters"); + addTest(root, &TestNextPrevCharUnsafe, "utf8tst/TestNextPrevCharUnsafe"); + addTest(root, &TestFwdBack, "utf8tst/TestFwdBack"); + addTest(root, &TestFwdBackUnsafe, "utf8tst/TestFwdBackUnsafe"); + addTest(root, &TestSetChar, "utf8tst/TestSetChar"); + addTest(root, &TestSetCharUnsafe, "utf8tst/TestSetCharUnsafe"); + addTest(root, &TestAppendChar, "utf8tst/TestAppendChar"); + addTest(root, &TestAppend, "utf8tst/TestAppend"); + addTest(root, &TestSurrogates, "utf8tst/TestSurrogates"); } static void TestCodeUnitValues() { static const uint8_t codeunit[]={0x00, 0x65, 0x7e, 0x7f, 0xc0, 0xc4, 0xf0, 0xfd, 0x80, 0x81, 0xbc, 0xbe,}; - + int16_t i; - for(i=0; i 0; --offset){ - setOffset=offset; - UTF8_PREV_CHAR_UNSAFE(input, setOffset, c); - if(setOffset != movedOffset[i+3]){ - log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", - offset, movedOffset[i+3], setOffset); - } - if(c != result[i+3]){ - log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+3], c); - } - setOffset=offset; UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE); if(setOffset != movedOffset[i+4]){ @@ -363,77 +337,112 @@ static void TestNextPrevChar(){ if(setOffset != movedOffset[i+5]){ log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+5], setOffset); - } + } if(c != result[i+5]){ log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+5], c); } i=i+6; } +} - { - /* test non-characters */ - static const uint8_t nonChars[]={ - 0xef, 0xb7, 0x90, /* U+fdd0 */ - 0xef, 0xbf, 0xbf, /* U+feff */ - 0xf0, 0x9f, 0xbf, 0xbe, /* U+1fffe */ - 0xf0, 0xbf, 0xbf, 0xbf, /* U+3ffff */ - 0xf4, 0x8f, 0xbf, 0xbe /* U+10fffe */ - }; +static void TestNextPrevNonCharacters() { + /* test non-characters */ + static const uint8_t nonChars[]={ + 0xef, 0xb7, 0x90, /* U+fdd0 */ + 0xef, 0xbf, 0xbf, /* U+feff */ + 0xf0, 0x9f, 0xbf, 0xbe, /* U+1fffe */ + 0xf0, 0xbf, 0xbf, 0xbf, /* U+3ffff */ + 0xf4, 0x8f, 0xbf, 0xbe /* U+10fffe */ + }; - UChar32 ch; - int32_t idx; + UChar32 ch; + int32_t idx; - for(idx=0; idx<(int32_t)sizeof(nonChars);) { - U8_NEXT(nonChars, idx, sizeof(nonChars), ch); - if(!U_IS_UNICODE_NONCHAR(ch)) { - log_err("U8_NEXT(before %d) failed to read a non-character\n", idx); - } + for(idx=0; idx<(int32_t)sizeof(nonChars);) { + U8_NEXT(nonChars, idx, sizeof(nonChars), ch); + if(!U_IS_UNICODE_NONCHAR(ch)) { + log_err("U8_NEXT(before %d) failed to read a non-character\n", idx); } - for(idx=(int32_t)sizeof(nonChars); idx>0;) { - U8_PREV(nonChars, 0, idx, ch); - if(!U_IS_UNICODE_NONCHAR(ch)) { - log_err("U8_PREV(at %d) failed to read a non-character\n", idx); - } + } + for(idx=(int32_t)sizeof(nonChars); idx>0;) { + U8_PREV(nonChars, 0, idx, ch); + if(!U_IS_UNICODE_NONCHAR(ch)) { + log_err("U8_PREV(at %d) failed to read a non-character\n", idx); } } } -static void TestFwdBack(){ +static void TestNextPrevCharUnsafe() { + /* + * Use a (mostly) well-formed UTF-8 string and test at code point boundaries. + * The behavior of _UNSAFE macros for ill-formed strings is undefined. + */ + static const uint8_t input[]={ + 0x61, + 0xf0, 0x90, 0x90, 0x81, + 0xc0, 0x80, /* non-shortest form */ + 0xe2, 0x82, 0xac, + 0xc2, 0xa1, + 0xf4, 0x8f, 0xbf, 0xbf, + 0x00 + }; + static const UChar32 codePoints[]={ + 0x61, + 0x10401, + 0, + 0x20ac, + 0xa1, + 0x10ffff, + 0 + }; + + UChar32 c; + int32_t i; + uint32_t offset; + for(i=0, offset=0; offset 0; --i){ + UTF8_PREV_CHAR_UNSAFE(input, offset, c); + if(c != codePoints[i]){ + log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", + offset, codePoints[i], c); + } + } + for(i=LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){ + U8_PREV_UNSAFE(input, offset, c); + if(c != codePoints[i]){ + log_err("ERROR: U8_PREV_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", + offset, codePoints[i], c); + } + } +} + +static void TestFwdBack() { static const uint8_t input[]={0x61, 0xF0, 0x90, 0x90, 0x81, 0xff, 0x62, 0xc0, 0x80, 0x7f, 0x8f, 0xc0, 0x63, 0x81, 0x90, 0x90, 0xF0, 0x00}; - static const uint16_t fwd_unsafe[] ={1, 5, 6, 7, 9, 10, 11, 13, 14, 15, 16, 20, }; static const uint16_t fwd_safe[] ={1, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}; - static const uint16_t back_unsafe[]={17, 16, 12, 11, 9, 7, 6, 5, 1, 0}; static const uint16_t back_safe[] ={17, 16, 15, 14, 13, 12, 11, 10, 9, 7, 6, 5, 1, 0}; static const uint16_t Nvalue[]= {0, 1, 2, 3, 1, 2, 1, 5}; - static const uint16_t fwd_N_unsafe[] ={0, 1, 6, 10, 11, 14, 15}; static const uint16_t fwd_N_safe[] ={0, 1, 6, 10, 11, 13, 14, 18}; /*safe macro keeps it at the end of the string */ - static const uint16_t back_N_unsafe[]={18, 17, 12, 7, 6, 1, 0}; - static const uint16_t back_N_safe[] ={18, 17, 15, 12, 11, 9, 7, 0}; - + static const uint16_t back_N_safe[] ={18, 17, 15, 12, 11, 9, 7, 0}; - uint32_t offunsafe=0, offsafe=0; + uint32_t offsafe=0; uint32_t i=0; - while(offunsafe < sizeof(input)){ - UTF8_FWD_1_UNSAFE(input, offunsafe); - if(offunsafe != fwd_unsafe[i]){ - log_err("ERROR: Forward_unsafe offset expected:%d, Got:%d\n", fwd_unsafe[i], offunsafe); - } - i++; - } - - i=0; - while(offunsafe < sizeof(input)){ - U8_FWD_1_UNSAFE(input, offunsafe); - if(offunsafe != fwd_unsafe[i]){ - log_err("ERROR: U8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", fwd_unsafe[i], offunsafe); - } - i++; - } - - i=0; while(offsafe < sizeof(input)){ UTF8_FWD_1_SAFE(input, offsafe, sizeof(input)); if(offsafe != fwd_safe[i]){ @@ -451,32 +460,12 @@ static void TestFwdBack(){ i++; } - offunsafe=sizeof(input); - i=0; - while(offunsafe > 0){ - UTF8_BACK_1_UNSAFE(input, offunsafe); - if(offunsafe != back_unsafe[i]){ - log_err("ERROR: Backward_unsafe offset expected:%d, Got:%d\n", back_unsafe[i], offunsafe); - } - i++; - } - - offunsafe=sizeof(input); - i=0; - while(offunsafe > 0){ - U8_BACK_1_UNSAFE(input, offunsafe); - if(offunsafe != back_unsafe[i]){ - log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", back_unsafe[i], offunsafe); - } - i++; - } - i=0; offsafe=sizeof(input); while(offsafe > 0){ UTF8_BACK_1_SAFE(input, 0, offsafe); if(offsafe != back_safe[i]){ - log_err("ERROR: Backward_safe offset expected:%d, Got:%d\n", back_unsafe[i], offsafe); + log_err("ERROR: Backward_safe offset expected:%d, Got:%d\n", back_safe[i], offsafe); } i++; } @@ -486,63 +475,31 @@ static void TestFwdBack(){ while(offsafe > 0){ U8_BACK_1(input, 0, offsafe); if(offsafe != back_safe[i]){ - log_err("ERROR: U8_BACK_1 offset expected:%d, Got:%d\n", back_unsafe[i], offsafe); + log_err("ERROR: U8_BACK_1 offset expected:%d, Got:%d\n", back_safe[i], offsafe); } i++; } - offunsafe=0; - for(i=0; i0; --i) { + UTF8_BACK_1_UNSAFE(input, offset); + if(offset != boundaries[i]){ + log_err("ERROR: UTF8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[i], offset); + } + } + for(i=LENGTHOF(boundaries)-2, offset=LENGTHOF(input); offset>0; --i) { + U8_BACK_1_UNSAFE(input, offset); + if(offset != boundaries[i]){ + log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[i], offset); + } + } + + for(i=0; i= 0x10000) { diff --git a/icu4c/source/test/perf/ubrkperf/ubrkperfold.cpp b/icu4c/source/test/perf/ubrkperf/ubrkperfold.cpp index 5700c26a8b4..bfc2e5579aa 100644 --- a/icu4c/source/test/perf/ubrkperf/ubrkperfold.cpp +++ b/icu4c/source/test/perf/ubrkperf/ubrkperfold.cpp @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (C) 2001-2011 IBM, Inc. All Rights Reserved. + * Copyright (C) 2001-2012 IBM, Inc. All Rights Reserved. * ********************************************************************/ /******************************************************************************** @@ -611,7 +611,7 @@ UChar UCharFile::get() { // Convert the bytes from the temp array to a Unicode char. i = 0; uint32_t cp; - UTF8_NEXT_CHAR_UNSAFE(bytes, i, cp); + U8_NEXT_UNSAFE(bytes, i, cp); c = (UChar)cp; if (cp >= 0x10000) {