From 94fe10c710588cea2c7504d908c1c447fc7a3b95 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Thu, 10 Sep 2015 01:39:06 +0000 Subject: [PATCH] ICU-11886 Speed up ucol_open by 21% * implement a deserialize constructor for UnicodeSet * add test cases for same * add a generator (`gencolusb`) which can build `source/i18n/collunsafe.h` * Note that for bootstrapping `collunsafe.h` can be truncated (zero length). X-SVN-Rev: 37929 --- .gitattributes | 1 + .gitignore | 3 + icu4c/source/common/unicode/uniset.h | 25 ++- icu4c/source/common/uniset.cpp | 91 +++++++++- icu4c/source/i18n/collationdatareader.cpp | 30 ++++ icu4c/source/i18n/collunsafe.h | 122 +++++++++++++ icu4c/source/test/intltest/usettest.cpp | 78 ++++++++- icu4c/source/test/intltest/usettest.h | 9 +- icu4c/source/tools/gencolusb/Makefile | 43 +++++ icu4c/source/tools/gencolusb/README.md | 9 + .../gencolusb/extract_unsafe_backwards.cpp | 163 ++++++++++++++++++ icu4c/source/tools/gencolusb/verify_uset.cpp | 69 ++++++++ 12 files changed, 636 insertions(+), 7 deletions(-) create mode 100644 icu4c/source/i18n/collunsafe.h create mode 100644 icu4c/source/tools/gencolusb/Makefile create mode 100644 icu4c/source/tools/gencolusb/README.md create mode 100644 icu4c/source/tools/gencolusb/extract_unsafe_backwards.cpp create mode 100644 icu4c/source/tools/gencolusb/verify_uset.cpp diff --git a/.gitattributes b/.gitattributes index e0edbdacc2d..b0242197200 100644 --- a/.gitattributes +++ b/.gitattributes @@ -150,6 +150,7 @@ icu4c/source/tools/genccode/genccode.vcxproj -text icu4c/source/tools/gencfu/gencfu.vcxproj -text icu4c/source/tools/gencmn/gencmn.vcxproj -text icu4c/source/tools/gencnval/gencnval.vcxproj -text +icu4c/source/tools/gencolusb/README.md -text icu4c/source/tools/gendict/gendict.vcxproj -text icu4c/source/tools/gennorm2/gennorm2.vcxproj -text icu4c/source/tools/genrb/derb.vcxproj -text diff --git a/.gitignore b/.gitignore index f68da64cbb4..9cd6f469fc1 100644 --- a/.gitignore +++ b/.gitignore @@ -715,6 +715,9 @@ icu4c/source/tools/gencnval/gencnval.vcproj.*.*.user icu4c/source/tools/gencnval/release icu4c/source/tools/gencnval/x64 icu4c/source/tools/gencnval/x86 +icu4c/source/tools/gencolusb/Makefile.local +icu4c/source/tools/gencolusb/extract_unsafe_backwards +icu4c/source/tools/gencolusb/verify_uset icu4c/source/tools/gendict/*.1 icu4c/source/tools/gendict/*.d icu4c/source/tools/gendict/*.o diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h index 237ebf766c3..e12fe294457 100644 --- a/icu4c/source/common/unicode/uniset.h +++ b/icu4c/source/common/unicode/uniset.h @@ -1,6 +1,6 @@ /* *************************************************************************** -* Copyright (C) 1999-2014, International Business Machines Corporation +* Copyright (C) 1999-2015, International Business Machines Corporation * and others. All Rights Reserved. *************************************************************************** * Date Name Description @@ -369,6 +369,29 @@ public: */ UnicodeSet(UChar32 start, UChar32 end); +#ifndef U_HIDE_INTERNAL_API + /** + * @internal + */ + enum ESerialization { + kSerialized /* result of serialize() */ + }; + + /** + * Constructs a set from the output of serialize(). + * The resulting set will be frozen. + * + * @param buffer the 16 bit array + * @param bufferLen the original length returned from serialize() + * @param serialization the value 'kSerialized' + * @param status error code + * + * @internal + */ + UnicodeSet(const uint16_t buffer[], int32_t bufferLen, + ESerialization serialization, UErrorCode &status); +#endif + /** * Constructs a set from the given pattern. See the class * description for the syntax of the pattern language. diff --git a/icu4c/source/common/uniset.cpp b/icu4c/source/common/uniset.cpp index 676c855d1f0..260c17168b9 100644 --- a/icu4c/source/common/uniset.cpp +++ b/icu4c/source/common/uniset.cpp @@ -1,6 +1,6 @@ /* ********************************************************************** -* Copyright (C) 1999-2012, International Business Machines +* Copyright (C) 1999-2015, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Date Name Description @@ -1468,6 +1468,82 @@ UnicodeSet& UnicodeSet::compact() { return *this; } +#ifdef DEBUG_SERIALIZE +#include +#endif + +/** + * Deserialize constructor. + * Result will be frozen. + */ +UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization, UErrorCode &ec) + : len(1), capacity(1+START_EXTRA), list(0), bmpSet(0), buffer(0), + bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), + fFlags(0) { + + if(U_FAILURE(ec)) { + setToBogus(); + return; + } + + if( (serialization != kSerialized) + || (data==NULL) + || (dataLen < 1)) { + ec = U_ILLEGAL_ARGUMENT_ERROR; + setToBogus(); + return; + } + + allocateStrings(ec); + if (U_FAILURE(ec)) { + setToBogus(); + return; + } + + // bmp? + int32_t headerSize = ((data[0]&0x8000)) ?2:1; + int32_t bmpLength = (headerSize==1)?data[0]:data[1]; + + len = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength; +#ifdef DEBUG_SERIALIZE + printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,len, data[0],data[1],data[2],data[3]); +#endif + capacity = len+1; + list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); + if(!list || U_FAILURE(ec)) { + setToBogus(); + return; + } + // copy bmp + int32_t i; + for(i = 0; i< bmpLength;i++) { + list[i] = data[i+headerSize]; +#ifdef DEBUG_SERIALIZE + printf("<<16@%d[%d] %X\n", i+headerSize, i, list[i]); +#endif + } + /* + if(bmpLength>0) { + bmpSet= new BMPSet(list, bmpLength); + if(bmpSet == NULL) { + ec = U_MEMORY_ALLOCATION_ERROR; + setToBogus(); + return; + } + }*/ + // copy smp + for(i=bmpLength;ilist[bmpLength]<=0xffff; ++bmpLength) {} length=bmpLength+2*(length-bmpLength); } - +#ifdef DEBUG_SERIALIZE + printf(">> bmpLength%d length%d len%d\n", bmpLength, length, len); +#endif /* length: number of 16-bit array units */ if (length>0x7fff) { /* there are only 15 bits for the length in the first serialized word */ @@ -1525,6 +1603,9 @@ int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& const UChar32 *p; int32_t i; +#ifdef DEBUG_SERIALIZE + printf("writeHdr\n"); +#endif *dest=(uint16_t)length; if (length>bmpLength) { *dest|=0x8000; @@ -1535,11 +1616,17 @@ int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& /* write the BMP part of the array */ p=this->list; for (i=0; i>16); *dest++=(uint16_t)*p++; } diff --git a/icu4c/source/i18n/collationdatareader.cpp b/icu4c/source/i18n/collationdatareader.cpp index 21a565378d9..63be9c91064 100644 --- a/icu4c/source/i18n/collationdatareader.cpp +++ b/icu4c/source/i18n/collationdatareader.cpp @@ -30,6 +30,13 @@ #include "ucmndata.h" #include "utrie2.h" +// #if U_HAVE_COLLUNSAFE +#if 1 +#include "collunsafe.h" + +#endif +// #end + U_NAMESPACE_BEGIN namespace { @@ -262,6 +269,28 @@ CollationDataReader::read(const CollationTailoring *base, const uint8_t *inBytes return; } if(baseData == NULL) { +#if defined( COLLUNSAFE_COLL_VERSION ) +#if defined(COLLUNSAFE_RANGES) /* slower but still an improvement*/ + tailoring.unsafeBackwardSet = new UnicodeSet(); + + for(int32_t i=0;iadd(unsafe_ranges[i+0],unsafe_ranges[i+1]); + } + tailoring.unsafeBackwardSet->freeze(); +#elif defined (COLLUNSAFE_SERIALIZE) + /* faster */ + tailoring.unsafeBackwardSet = new UnicodeSet(unsafe_serializedData, unsafe_serializedCount, UnicodeSet::kSerialized, errorCode); + if(tailoring.unsafeBackwardSet == NULL) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } else if (U_FAILURE(errorCode)) { + return; + } +#else +#error no unsafe-backwards strategy chosen +#endif + +#else // Create the unsafe-backward set for the root collator. // Include all non-zero combining marks and trail surrogates. // We do this at load time, rather than at build time, @@ -279,6 +308,7 @@ CollationDataReader::read(const CollationTailoring *base, const uint8_t *inBytes return; } data->nfcImpl.addLcccChars(*tailoring.unsafeBackwardSet); +#endif } else { // Clone the root collator's set contents. tailoring.unsafeBackwardSet = static_cast( diff --git a/icu4c/source/i18n/collunsafe.h b/icu4c/source/i18n/collunsafe.h new file mode 100644 index 00000000000..81e888fdc24 --- /dev/null +++ b/icu4c/source/i18n/collunsafe.h @@ -0,0 +1,122 @@ +// collunsafe.h +// Copyright (C) 2015, International Business Machines Corporation and others. All Rights Reserved. + +// To be included by collationdatareader.cpp, and generated by gencolusb. +// Machine generated, do not edit. + +#ifndef COLLUNSAFE_H +#define COLLUNSAFE_H + +#define COLLUNSAFE_ICU_VERSION "56.0.1" +#define COLLUNSAFE_COLL_VERSION "9.64" +#define COLLUNSAFE_SERIALIZE 1 +static const int32_t unsafe_serializedCount = 850; +static const uint16_t unsafe_serializedData[850] = { +0x8350, 0x01B8, 0x0034, 0x0035, 0x004C, 0x004D, 0x00A0, 0x00A1, // 8 +0x0300, 0x034F, 0x0350, 0x0370, 0x03A9, 0x03AA, 0x03E2, 0x03E3, // 16 +0x042F, 0x0430, 0x0483, 0x0488, 0x0531, 0x0532, 0x0591, 0x05BE, // 24 +0x05BF, 0x05C0, 0x05C1, 0x05C3, 0x05C4, 0x05C6, 0x05C7, 0x05C8, // 32 +0x05D0, 0x05D1, 0x0610, 0x061B, 0x0628, 0x0629, 0x064B, 0x0660, // 40 +0x0670, 0x0671, 0x06D6, 0x06DD, 0x06DF, 0x06E5, 0x06E7, 0x06E9, // 48 +0x06EA, 0x06EE, 0x0710, 0x0712, 0x0730, 0x074B, 0x078C, 0x078D, // 56 +0x07D8, 0x07D9, 0x07EB, 0x07F4, 0x0800, 0x0801, 0x0816, 0x081A, // 64 +0x081B, 0x0824, 0x0825, 0x0828, 0x0829, 0x082E, 0x0840, 0x0841, // 72 +0x0859, 0x085C, 0x08E3, 0x0900, 0x0905, 0x0906, 0x093C, 0x093D, // 80 +0x094D, 0x094E, 0x0951, 0x0955, 0x0995, 0x0996, 0x09BC, 0x09BD, // 88 +0x09BE, 0x09BF, 0x09CD, 0x09CE, 0x09D7, 0x09D8, 0x0A15, 0x0A16, // 96 +0x0A3C, 0x0A3D, 0x0A4D, 0x0A4E, 0x0A95, 0x0A96, 0x0ABC, 0x0ABD, // 104 +0x0ACD, 0x0ACE, 0x0B15, 0x0B16, 0x0B3C, 0x0B3D, 0x0B3E, 0x0B3F, // 112 +0x0B4D, 0x0B4E, 0x0B56, 0x0B58, 0x0B95, 0x0B96, 0x0BBE, 0x0BBF, // 120 +0x0BCD, 0x0BCE, 0x0BD7, 0x0BD8, 0x0C15, 0x0C16, 0x0C4D, 0x0C4E, // 128 +0x0C55, 0x0C57, 0x0C95, 0x0C96, 0x0CBC, 0x0CBD, 0x0CC2, 0x0CC3, // 136 +0x0CCD, 0x0CCE, 0x0CD5, 0x0CD7, 0x0D15, 0x0D16, 0x0D3E, 0x0D3F, // 144 +0x0D4D, 0x0D4E, 0x0D57, 0x0D58, 0x0D85, 0x0D86, 0x0DCA, 0x0DCB, // 152 +0x0DCF, 0x0DD0, 0x0DDF, 0x0DE0, 0x0E01, 0x0E2F, 0x0E32, 0x0E33, // 160 +0x0E38, 0x0E3B, 0x0E48, 0x0E4C, 0x0E81, 0x0E83, 0x0E84, 0x0E85, // 168 +0x0E87, 0x0E89, 0x0E8A, 0x0E8B, 0x0E8D, 0x0E8E, 0x0E94, 0x0E98, // 176 +0x0E99, 0x0EA0, 0x0EA1, 0x0EA4, 0x0EA5, 0x0EA6, 0x0EA7, 0x0EA8, // 184 +0x0EAA, 0x0EAC, 0x0EAD, 0x0EAF, 0x0EB2, 0x0EB3, 0x0EB8, 0x0EBA, // 192 +0x0EC8, 0x0ECC, 0x0EDC, 0x0EE0, 0x0F18, 0x0F1A, 0x0F35, 0x0F36, // 200 +0x0F37, 0x0F38, 0x0F39, 0x0F3A, 0x0F40, 0x0F41, 0x0F71, 0x0F76, // 208 +0x0F7A, 0x0F7E, 0x0F80, 0x0F85, 0x0F86, 0x0F88, 0x0FC6, 0x0FC7, // 216 +0x1000, 0x1001, 0x102E, 0x102F, 0x1037, 0x1038, 0x1039, 0x103B, // 224 +0x108D, 0x108E, 0x10D3, 0x10D4, 0x12A0, 0x12A1, 0x135D, 0x1360, // 232 +0x13C4, 0x13C5, 0x14C0, 0x14C1, 0x168F, 0x1690, 0x16A0, 0x16A1, // 240 +0x1703, 0x1704, 0x1714, 0x1715, 0x1723, 0x1724, 0x1734, 0x1735, // 248 +0x1743, 0x1744, 0x1763, 0x1764, 0x1780, 0x1781, 0x17D2, 0x17D3, // 256 +0x17DD, 0x17DE, 0x1826, 0x1827, 0x18A9, 0x18AA, 0x1900, 0x1901, // 264 +0x1939, 0x193C, 0x1950, 0x1951, 0x1980, 0x19AC, 0x1A00, 0x1A01, // 272 +0x1A17, 0x1A19, 0x1A20, 0x1A21, 0x1A60, 0x1A61, 0x1A75, 0x1A7D, // 280 +0x1A7F, 0x1A80, 0x1AB0, 0x1ABE, 0x1B05, 0x1B06, 0x1B34, 0x1B36, // 288 +0x1B44, 0x1B45, 0x1B6B, 0x1B74, 0x1B83, 0x1B84, 0x1BAA, 0x1BAC, // 296 +0x1BC0, 0x1BC1, 0x1BE6, 0x1BE7, 0x1BF2, 0x1BF4, 0x1C00, 0x1C01, // 304 +0x1C37, 0x1C38, 0x1C5A, 0x1C5B, 0x1CD0, 0x1CD3, 0x1CD4, 0x1CE1, // 312 +0x1CE2, 0x1CE9, 0x1CED, 0x1CEE, 0x1CF4, 0x1CF5, 0x1CF8, 0x1CFA, // 320 +0x1DC0, 0x1DF6, 0x1DFC, 0x1E00, 0x201C, 0x201D, 0x20AC, 0x20AD, // 328 +0x20D0, 0x20DD, 0x20E1, 0x20E2, 0x20E5, 0x20F1, 0x263A, 0x263B, // 336 +0x2C00, 0x2C01, 0x2CEF, 0x2CF2, 0x2D5E, 0x2D5F, 0x2D7F, 0x2D80, // 344 +0x2DE0, 0x2E00, 0x302A, 0x3030, 0x304B, 0x304C, 0x3099, 0x309B, // 352 +0x30AB, 0x30AC, 0x3105, 0x3106, 0x5B57, 0x5B58, 0xA288, 0xA289, // 360 +0xA4E8, 0xA4E9, 0xA549, 0xA54A, 0xA66F, 0xA670, 0xA674, 0xA67E, // 368 +0xA69E, 0xA6A1, 0xA6F0, 0xA6F2, 0xA800, 0xA801, 0xA806, 0xA807, // 376 +0xA840, 0xA841, 0xA882, 0xA883, 0xA8C4, 0xA8C5, 0xA8E0, 0xA8F2, // 384 +0xA90A, 0xA90B, 0xA92B, 0xA92E, 0xA930, 0xA931, 0xA953, 0xA954, // 392 +0xA984, 0xA985, 0xA9B3, 0xA9B4, 0xA9C0, 0xA9C1, 0xAA00, 0xAA01, // 400 +0xAA80, 0xAAB1, 0xAAB2, 0xAAB5, 0xAAB7, 0xAAB9, 0xAABE, 0xAAC0, // 408 +0xAAC1, 0xAAC2, 0xAAF6, 0xAAF7, 0xABC0, 0xABC1, 0xABED, 0xABEE, // 416 +0xAC00, 0xAC01, 0xD800, 0xD807, 0xD808, 0xD809, 0xD80C, 0xD80D, // 424 +0xD811, 0xD812, 0xD81A, 0xD81C, 0xD82F, 0xD830, 0xD834, 0xD835, // 432 +0xD83A, 0xD83B, 0xDC00, 0xE000, 0xFB1E, 0xFB1F, 0xFDD0, 0xFDD1, // 440 +0xFE20, 0xFE30, 0x0001, 0x0000, 0x0001, 0x0001, 0x0001, 0x01FD, // 448 +0x0001, 0x01FE, 0x0001, 0x0280, 0x0001, 0x0281, 0x0001, 0x02B7, // 456 +0x0001, 0x02B8, 0x0001, 0x02E0, 0x0001, 0x02E1, 0x0001, 0x0308, // 464 +0x0001, 0x0309, 0x0001, 0x0330, 0x0001, 0x0331, 0x0001, 0x036B, // 472 +0x0001, 0x036C, 0x0001, 0x0376, 0x0001, 0x037B, 0x0001, 0x0380, // 480 +0x0001, 0x0381, 0x0001, 0x03A0, 0x0001, 0x03A1, 0x0001, 0x0414, // 488 +0x0001, 0x0415, 0x0001, 0x0450, 0x0001, 0x0451, 0x0001, 0x0480, // 496 +0x0001, 0x0481, 0x0001, 0x0500, 0x0001, 0x0501, 0x0001, 0x0537, // 504 +0x0001, 0x0538, 0x0001, 0x0647, 0x0001, 0x0648, 0x0001, 0x0800, // 512 +0x0001, 0x0801, 0x0001, 0x0840, 0x0001, 0x0841, 0x0001, 0x0873, // 520 +0x0001, 0x0874, 0x0001, 0x0896, 0x0001, 0x0897, 0x0001, 0x08F4, // 528 +0x0001, 0x08F5, 0x0001, 0x0900, 0x0001, 0x0901, 0x0001, 0x0920, // 536 +0x0001, 0x0921, 0x0001, 0x0980, 0x0001, 0x0981, 0x0001, 0x09A0, // 544 +0x0001, 0x09A1, 0x0001, 0x0A00, 0x0001, 0x0A01, 0x0001, 0x0A0D, // 552 +0x0001, 0x0A0E, 0x0001, 0x0A0F, 0x0001, 0x0A10, 0x0001, 0x0A38, // 560 +0x0001, 0x0A3B, 0x0001, 0x0A3F, 0x0001, 0x0A40, 0x0001, 0x0A60, // 568 +0x0001, 0x0A61, 0x0001, 0x0A95, 0x0001, 0x0A96, 0x0001, 0x0AC1, // 576 +0x0001, 0x0AC2, 0x0001, 0x0AE5, 0x0001, 0x0AE7, 0x0001, 0x0B00, // 584 +0x0001, 0x0B01, 0x0001, 0x0B40, 0x0001, 0x0B41, 0x0001, 0x0B60, // 592 +0x0001, 0x0B61, 0x0001, 0x0B8F, 0x0001, 0x0B90, 0x0001, 0x0C00, // 600 +0x0001, 0x0C01, 0x0001, 0x0CA1, 0x0001, 0x0CA2, 0x0001, 0x1005, // 608 +0x0001, 0x1006, 0x0001, 0x1046, 0x0001, 0x1047, 0x0001, 0x107F, // 616 +0x0001, 0x1080, 0x0001, 0x1083, 0x0001, 0x1084, 0x0001, 0x10B9, // 624 +0x0001, 0x10BB, 0x0001, 0x10D0, 0x0001, 0x10D1, 0x0001, 0x1100, // 632 +0x0001, 0x1104, 0x0001, 0x1127, 0x0001, 0x1128, 0x0001, 0x1133, // 640 +0x0001, 0x1135, 0x0001, 0x1152, 0x0001, 0x1153, 0x0001, 0x1173, // 648 +0x0001, 0x1174, 0x0001, 0x1183, 0x0001, 0x1184, 0x0001, 0x11C0, // 656 +0x0001, 0x11C1, 0x0001, 0x11CA, 0x0001, 0x11CB, 0x0001, 0x1208, // 664 +0x0001, 0x1209, 0x0001, 0x1235, 0x0001, 0x1237, 0x0001, 0x128F, // 672 +0x0001, 0x1290, 0x0001, 0x12BE, 0x0001, 0x12BF, 0x0001, 0x12E9, // 680 +0x0001, 0x12EB, 0x0001, 0x1315, 0x0001, 0x1316, 0x0001, 0x133C, // 688 +0x0001, 0x133D, 0x0001, 0x133E, 0x0001, 0x133F, 0x0001, 0x134D, // 696 +0x0001, 0x134E, 0x0001, 0x1357, 0x0001, 0x1358, 0x0001, 0x1366, // 704 +0x0001, 0x136D, 0x0001, 0x1370, 0x0001, 0x1375, 0x0001, 0x1484, // 712 +0x0001, 0x1485, 0x0001, 0x14B0, 0x0001, 0x14B1, 0x0001, 0x14BA, // 720 +0x0001, 0x14BB, 0x0001, 0x14BD, 0x0001, 0x14BE, 0x0001, 0x14C2, // 728 +0x0001, 0x14C4, 0x0001, 0x158E, 0x0001, 0x158F, 0x0001, 0x15AF, // 736 +0x0001, 0x15B0, 0x0001, 0x15BF, 0x0001, 0x15C1, 0x0001, 0x160E, // 744 +0x0001, 0x160F, 0x0001, 0x163F, 0x0001, 0x1640, 0x0001, 0x1680, // 752 +0x0001, 0x1681, 0x0001, 0x16B6, 0x0001, 0x16B8, 0x0001, 0x1717, // 760 +0x0001, 0x1718, 0x0001, 0x172B, 0x0001, 0x172C, 0x0001, 0x18B4, // 768 +0x0001, 0x18B5, 0x0001, 0x1AC0, 0x0001, 0x1AC1, 0x0001, 0x2000, // 776 +0x0001, 0x2001, 0x0001, 0x3153, 0x0001, 0x3154, 0x0001, 0x4400, // 784 +0x0001, 0x4401, 0x0001, 0x6A4F, 0x0001, 0x6A50, 0x0001, 0x6AE6, // 792 +0x0001, 0x6AE7, 0x0001, 0x6AF0, 0x0001, 0x6AF5, 0x0001, 0x6B1C, // 800 +0x0001, 0x6B1D, 0x0001, 0x6B30, 0x0001, 0x6B37, 0x0001, 0x6F00, // 808 +0x0001, 0x6F01, 0x0001, 0xBC20, 0x0001, 0xBC21, 0x0001, 0xBC9E, // 816 +0x0001, 0xBC9F, 0x0001, 0xD165, 0x0001, 0xD16A, 0x0001, 0xD16D, // 824 +0x0001, 0xD173, 0x0001, 0xD17B, 0x0001, 0xD183, 0x0001, 0xD185, // 832 +0x0001, 0xD18C, 0x0001, 0xD1AA, 0x0001, 0xD1AE, 0x0001, 0xD242, // 840 +0x0001, 0xD245, 0x0001, 0xE802, 0x0001, 0xE803, 0x0001, 0xE8D0, // 848 +0x0001, 0xE8D7}; +#endif diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index 3f1713d357b..bcd1e273558 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -1,6 +1,6 @@ /* ******************************************************************************** -* Copyright (C) 1999-2014 International Business Machines Corporation and +* Copyright (C) 1999-2015 International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************************** * Date Name Description @@ -90,6 +90,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec, CASE(21,TestFreezable); CASE(22,TestSpan); CASE(23,TestStringSpan); + CASE(24,TestUCAUnsafeBackwards); default: name = ""; break; } } @@ -1714,6 +1715,12 @@ void UnicodeSetTest::TestSurrogate() { errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " + set.size() + ", expected 4"); } + + { + UErrorCode subErr = U_ZERO_ERROR; + checkRoundTrip(set); + checkSerializeRoundTrip(set, subErr); + } } } @@ -1730,8 +1737,12 @@ void UnicodeSetTest::TestExhaustive() { logln((UnicodeString)"Testing " + i + ", " + x); _testComplement(i, x, y); + UnicodeSet &toTest = bitsToSet(i, aa); + // AS LONG AS WE ARE HERE, check roundtrip - checkRoundTrip(bitsToSet(i, aa)); + checkRoundTrip(toTest); + UErrorCode ec = U_ZERO_ERROR; + checkSerializeRoundTrip(toTest, ec); for (int32_t j = 0; j < limit; ++j) { _testAdd(i,j, x,y,z); @@ -1922,7 +1933,40 @@ void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) { checkEqual(s, t, "toPattern(true)"); } } - + +void UnicodeSetTest::checkSerializeRoundTrip(const UnicodeSet& t, UErrorCode &status) { + if(U_FAILURE(status)) return; + int32_t len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status); + if(status == U_BUFFER_OVERFLOW_ERROR) { + status = U_ZERO_ERROR; + serializeBuffer.resize(len); + len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status); + // let 2nd error stand + } + if(U_FAILURE(status)) { + errln("checkSerializeRoundTrip: error %s serializing buffer\n", u_errorName(status)); + return; + } + +#if 0 + UnicodeString pat; t.toPattern(pat, FALSE); + infoln(pat); + printf(" %d: ", len); + for(int i=0;itailoring->version; + const UnicodeSet *unsafeBackwardSet = rootEntry->tailoring->unsafeBackwardSet; + + checkSerializeRoundTrip(*unsafeBackwardSet, errorCode); + + if(!logKnownIssue("11891","UnicodeSet fails to round trip on CollationRoot...unsafeBackwards set")) { + checkRoundTrip(*unsafeBackwardSet); + } +#endif +} diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h index 94083406764..06a3633f52c 100644 --- a/icu4c/source/test/intltest/usettest.h +++ b/icu4c/source/test/intltest/usettest.h @@ -1,7 +1,7 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 1997-2007, International Business Machines Corporation and + * Copyright (c) 1997-2015, International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************** ********************************************************************** @@ -18,6 +18,7 @@ #include "unicode/uniset.h" #include "unicode/ucnv_err.h" #include "intltest.h" +#include "cmemory.h" class UnicodeSetWithStrings; @@ -87,6 +88,8 @@ private: void TestStringSpan(); + void TestUCAUnsafeBackwards(); + private: UBool toPatternAux(UChar32 start, UChar32 end); @@ -136,6 +139,8 @@ private: * get the same thing back */ void checkRoundTrip(const UnicodeSet& s); + + void checkSerializeRoundTrip(const UnicodeSet& s, UErrorCode &ec); void copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange); @@ -183,6 +188,8 @@ private: UConverter *utf8Cnv; + MaybeStackArray serializeBuffer; + public: static UnicodeString escape(const UnicodeString& s); }; diff --git a/icu4c/source/tools/gencolusb/Makefile b/icu4c/source/tools/gencolusb/Makefile new file mode 100644 index 00000000000..9fc79563dda --- /dev/null +++ b/icu4c/source/tools/gencolusb/Makefile @@ -0,0 +1,43 @@ +## Makefile for rebuilding 'unsafe backward' data +## Copyright (c) 2015, International Business Machines Corporation and +## others. All Rights Reserved. + +## +## CONFIGURATION: +## 1. create Makefile.local containing overrides if necessary: +## BUILD_ROOT=/home/user/icu-build (location of 'config.status' etc.) +## PATH_VAR=DYLD_LIBRARY_PATH (if on OSX etc) +## +## UPDATING +## 1. make 'reset-icu' will reset ICU to 'bootstrap' state, zeroing out source/i18n/collunsafe.h +## 2. make 'gen-file' will generate and test source/i18n/collunsafe.h + +subdir=tools/gencolusb +srcdir=$(shell pwd) +SOURCE_ROOT=$(shell cd ../.. ; pwd) +BUILD_ROOT=$(SOURCE_ROOT) +BUILD_HERE=$(BUILD_ROOT)/$(subdir) +TOOL=extract_unsafe_backwards +TEST=verify_uset +PATH_VAR=LD_LIBRARY_PATH + +-include Makefile.local + +GEN_FILE=$(SOURCE_ROOT)/i18n/collunsafe.h +BUILD_OPTS=-I$(SOURCE_ROOT)/common -I$(SOURCE_ROOT)/i18n -L$(BUILD_ROOT)/lib -licuuc -licui18n -licudata +RUN_OPTS=env $(PATH_VAR)=$(BUILD_ROOT)/lib + +reset-icu: + >$(GEN_FILE) + $(MAKE) -C $(BUILD_ROOT)/i18n + +gen-file: reset-icu + mkdir -p $(BUILD_HERE) + $(CXX) -o $(BUILD_HERE)/$(TOOL) $(srcdir)/$(TOOL).cpp $(BUILD_OPTS) + $(RUN_OPTS) $(BUILD_HERE)/$(TOOL) > $(GEN_FILE) || exit 1 + $(CXX) -o $(BUILD_HERE)/$(TEST) $(srcdir)/$(TEST).cpp $(BUILD_OPTS) + $(RUN_OPTS) $(BUILD_HERE)/$(TEST) || exit 1 + $(MAKE) -C $(BUILD_ROOT)/i18n + $(RUN_OPTS) $(BUILD_HERE)/$(TEST) || exit 1 + +.PHONY: reset-icu gen-file diff --git a/icu4c/source/tools/gencolusb/README.md b/icu4c/source/tools/gencolusb/README.md new file mode 100644 index 00000000000..bf72a22e356 --- /dev/null +++ b/icu4c/source/tools/gencolusb/README.md @@ -0,0 +1,9 @@ +Unsafe-Backward Collator Data +=== + +This directory contains tools to build the `source/i18n/collunsafe.h` +precomputed data. + +See [Makefile](./Makefile) for more details. + +* Copyright (c) 2015, International Business Machines Corporation and others. All Rights Reserved. diff --git a/icu4c/source/tools/gencolusb/extract_unsafe_backwards.cpp b/icu4c/source/tools/gencolusb/extract_unsafe_backwards.cpp new file mode 100644 index 00000000000..b680f79e6e0 --- /dev/null +++ b/icu4c/source/tools/gencolusb/extract_unsafe_backwards.cpp @@ -0,0 +1,163 @@ +/** + * Copyright (c) 1999-2015, International Business Machines Corporation and + * others. All Rights Reserved. + * + * Generator for source/i18n/collunsafe.h + * see Makefile + */ + +#include +#include "unicode/uversion.h" +#include "unicode/uniset.h" +#include "collationroot.h" +#include "collationtailoring.h" + +/** + * Define the type of generator to use. Choose one. + */ +#define SERIALIZE 1 //< Default: use UnicodeSet.serialize() (best, fast, requires new UnicodeSet c'tor) +#define RANGES 0 //< Enumerate ranges (works, not as fast) +#define PATTERN 0 //< Generate a UnicodeSet pattern (broken AND probably slow) + +int main(int argc, const char *argv[]) { + UErrorCode errorCode = U_ZERO_ERROR; + + // Get the unsafeBackwardsSet + const CollationCacheEntry *rootEntry = CollationRoot::getRootCacheEntry(errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "Err: %s getting root cache entry\n", u_errorName(errorCode)); + return 1; + } + const UVersionInfo &version = rootEntry->tailoring->version; + const UnicodeSet *unsafeBackwardSet = rootEntry->tailoring->unsafeBackwardSet; + char verString[20]; + u_versionToString(version, verString); + fprintf(stderr, "Generating data for ICU %s, Collation %s\n", U_ICU_VERSION, verString); + int32_t rangeCount = unsafeBackwardSet->getRangeCount(); + +#if SERIALIZE + fprintf(stderr, ".. serializing\n"); + // UnicodeSet serialization + + UErrorCode preflightCode = U_ZERO_ERROR; + // preflight + int32_t serializedCount = unsafeBackwardSet->serialize(NULL,0,preflightCode); + if(U_FAILURE(preflightCode) && preflightCode != U_BUFFER_OVERFLOW_ERROR) { + fprintf(stderr, "Err: %s preflighting unicode set\n", u_errorName(preflightCode)); + return 1; + } + uint16_t *serializedData = new uint16_t[serializedCount]; + // serialize + unsafeBackwardSet->serialize(serializedData, serializedCount, errorCode); + if(U_FAILURE(errorCode)) { + delete [] serializedData; + fprintf(stderr, "Err: %s serializing unicodeset\n", u_errorName(errorCode)); + return 1; + } +#endif + +#if PATTERN + fprintf(stderr,".. pattern\n"); + // attempt to use pattern + + UnicodeString pattern; + UnicodeSet set(*unsafeBackwardSet); + set.compact(); + set.toPattern(pattern, FALSE); + + if(U_SUCCESS(errorCode)) { + // This fails (bug# ?) - which is why this method was abandoned. + + // UnicodeSet usA(pattern, errorCode); + // fprintf(stderr, "\n%s:%d: err creating set A %s\n", __FILE__, __LINE__, u_errorName(errorCode)); + // return 1; + } + + + const UChar *buf = pattern.getBuffer(); + int32_t needed = pattern.length(); + + // print + { + char buf2[2048]; + int32_t len2 = pattern.extract(0, pattern.length(), buf2, "utf-8"); + buf2[len2]=0; + fprintf(stderr,"===\n%s\n===\n", buf2); + } + + const UnicodeString unsafeBackwardPattern(FALSE, buf, needed); + if(U_SUCCESS(errorCode)) { + //UnicodeSet us(unsafeBackwardPattern, errorCode); + // fprintf(stderr, "\n%s:%d: err creating set %s\n", __FILE__, __LINE__, u_errorName(errorCode)); + } else { + fprintf(stderr, "Uset OK - \n"); + } +#endif + + + // Generate the output file. + + printf("// collunsafe.h\n"); + printf("// %s\n", U_COPYRIGHT_STRING); + printf("\n"); + printf("// To be included by collationdatareader.cpp, and generated by gencolusb.\n"); + printf("// Machine generated, do not edit.\n"); + printf("\n"); + printf("#ifndef COLLUNSAFE_H\n" + "#define COLLUNSAFE_H\n" + "\n" + "#define COLLUNSAFE_ICU_VERSION \"" U_ICU_VERSION "\"\n"); + printf("#define COLLUNSAFE_COLL_VERSION \"%s\"\n", verString); + + + +#if PATTERN + printf("#define COLLUNSAFE_PATTERN 1\n"); + printf("static const int32_t collunsafe_len = %d;\n", needed); + printf("static const UChar collunsafe_pattern[collunsafe_len] = {\n"); + for(int i=0;i0) && (i%8 == 0) ) { + printf(" // %d\n", i); + } + printf("0x%04X", buf[i]); // TODO check + if(i != (needed-1)) { + printf(", "); + } + } + printf(" //%d\n};\n", (needed-1)); +#endif + +#if RANGE + printf("#define COLLUNSAFE_RANGE 1\n"); + printf("static const int32_t unsafe_rangeCount = %d;\n", rangeCount); + printf("static const UChar32 unsafe_ranges[%d] = { \n", rangeCount*2); + for(int32_t i=0;igetRangeStart(i), + unsafeBackwardSet->getRangeEnd(i), + i); + } + printf("};\n"); +#endif + +#if SERIALIZE + printf("#define COLLUNSAFE_SERIALIZE 1\n"); + printf("static const int32_t unsafe_serializedCount = %d;\n", serializedCount); + printf("static const uint16_t unsafe_serializedData[%d] = { \n", serializedCount); + for(int32_t i=0;i0) && (i%8 == 0) ) { + printf(" // %d\n", i); + } + printf("0x%04X", serializedData[i]); // TODO check + if(i != (serializedCount-1)) { + printf(", "); + } + } + printf("};\n"); +#endif + + printf("#endif\n"); + fflush(stderr); + fflush(stdout); + return(U_SUCCESS(errorCode)?0:1); +} diff --git a/icu4c/source/tools/gencolusb/verify_uset.cpp b/icu4c/source/tools/gencolusb/verify_uset.cpp new file mode 100644 index 00000000000..7a44040f4b6 --- /dev/null +++ b/icu4c/source/tools/gencolusb/verify_uset.cpp @@ -0,0 +1,69 @@ +/** + * Copyright (c) 1999-2012, International Business Machines Corporation and + * others. All Rights Reserved. + * + * Test for source/i18n/collunsafe.h + */ + +#include +#include "unicode/ucol.h" +#include "unicode/uniset.h" +#include "unicode/coll.h" +#include "collation.h" + +#include "collunsafe.h" + + +int main(int argc, const char *argv[]) { + puts("verify"); + UErrorCode errorCode = U_ZERO_ERROR; +#if defined (COLLUNSAFE_PATTERN) + puts("verify pattern"); + const UnicodeString unsafeBackwardPattern(FALSE, collunsafe_pattern, collunsafe_len); + fprintf(stderr, "\n -- pat '%c%c%c%c%c'\n", + collunsafe_pattern[0], + collunsafe_pattern[1], + collunsafe_pattern[2], + collunsafe_pattern[3], + collunsafe_pattern[4]); + if(U_SUCCESS(errorCode)) { + UnicodeSet us(unsafeBackwardPattern, errorCode); + fprintf(stderr, "\n%s:%d: err creating set %s\n", __FILE__, __LINE__, u_errorName(errorCode)); + } +#endif + +#if defined (COLLUNSAFE_RANGE) + { + puts("verify range"); + UnicodeSet u; + for(int32_t i=0;i