From: Markus Scherer Date: Mon, 28 Nov 2011 22:59:49 +0000 (+0000) Subject: ICU-8942 use smaller/simpler FCD data rather than building an FCD trie X-Git-Tag: milestone-59-0-1~4327 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=524fd241c542ee2d7f97540c3760f3ad702e9d8e;p=icu ICU-8942 use smaller/simpler FCD data rather than building an FCD trie X-SVN-Rev: 30985 --- diff --git a/icu4c/source/common/normalizer2.cpp b/icu4c/source/common/normalizer2.cpp index fc92f4220c1..60047fef73e 100644 --- a/icu4c/source/common/normalizer2.cpp +++ b/icu4c/source/common/normalizer2.cpp @@ -523,12 +523,7 @@ const Normalizer2 *Normalizer2Factory::getNFDInstance(UErrorCode &errorCode) { const Normalizer2 *Normalizer2Factory::getFCDInstance(UErrorCode &errorCode) { Norm2AllModes *allModes=Norm2AllModesSingleton(nfcSingleton, "nfc").getInstance(errorCode); - if(allModes!=NULL) { - allModes->impl.getFCDTrie(errorCode); - return &allModes->fcd; - } else { - return NULL; - } + return allModes!=NULL ? &allModes->fcd : NULL; } const Normalizer2 *Normalizer2Factory::getFCCInstance(UErrorCode &errorCode) { @@ -605,17 +600,6 @@ Normalizer2Factory::getImpl(const Normalizer2 *norm2) { return &((Normalizer2WithImpl *)norm2)->impl; } -const UTrie2 * -Normalizer2Factory::getFCDTrie(UErrorCode &errorCode) { - Norm2AllModes *allModes= - Norm2AllModesSingleton(nfcSingleton, "nfc").getInstance(errorCode); - if(allModes!=NULL) { - return allModes->impl.getFCDTrie(errorCode); - } else { - return NULL; - } -} - const Normalizer2 * Normalizer2::getInstance(const char *packageName, const char *name, @@ -682,7 +666,6 @@ Normalizer2::getInstance(const char *packageName, case UNORM2_DECOMPOSE: return &allModes->decomp; case UNORM2_FCD: - allModes->impl.getFCDTrie(errorCode); return &allModes->fcd; case UNORM2_COMPOSE_CONTIGUOUS: return &allModes->fcc; @@ -960,25 +943,14 @@ unorm_getQuickCheck(UChar32 c, UNormalizationMode mode) { } U_CFUNC uint16_t -unorm_getFCD16Simple(UChar32 c) { +unorm_getFCD16(UChar32 c) { UErrorCode errorCode=U_ZERO_ERROR; - const UTrie2 *trie=Normalizer2Factory::getFCDTrie(errorCode); + const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode); if(U_SUCCESS(errorCode)) { - return UTRIE2_GET16(trie, c); + return impl->getFCD16(c); } else { return 0; } } -U_CAPI const uint16_t * U_EXPORT2 -unorm_getFCDTrieIndex(UChar32 &fcdHighStart, UErrorCode *pErrorCode) { - const UTrie2 *trie=Normalizer2Factory::getFCDTrie(*pErrorCode); - if(U_SUCCESS(*pErrorCode)) { - fcdHighStart=trie->highStart; - return trie->index; - } else { - return NULL; - } -} - #endif // !UCONFIG_NO_NORMALIZATION diff --git a/icu4c/source/common/normalizer2impl.cpp b/icu4c/source/common/normalizer2impl.cpp index 81fdddb1c33..1b405fd4a5f 100644 --- a/icu4c/source/common/normalizer2impl.cpp +++ b/icu4c/source/common/normalizer2impl.cpp @@ -254,7 +254,6 @@ struct CanonIterData : public UMemory { Normalizer2Impl::~Normalizer2Impl() { udata_close(memory); utrie2_close(normTrie); - UTrie2Singleton(fcdTrieSingleton).deleteInstance(); delete (CanonIterData *)canonIterDataSingleton.fInstance; } @@ -1507,121 +1506,13 @@ const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar * return iter.codePointStart; } -class FCDTrieSingleton : public UTrie2Singleton { -public: - FCDTrieSingleton(SimpleSingleton &s, Normalizer2Impl &ni, UErrorCode &ec) : - UTrie2Singleton(s), impl(ni), errorCode(ec) {} - UTrie2 *getInstance(UErrorCode &errorCode) { - return UTrie2Singleton::getInstance(createInstance, this, errorCode); - } - static void *createInstance(const void *context, UErrorCode &errorCode); - UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { - if(value!=0) { - impl.setFCD16FromNorm16(start, end, (uint16_t)value, newFCDTrie, errorCode); - } - return U_SUCCESS(errorCode); - } - - Normalizer2Impl &impl; - UTrie2 *newFCDTrie; - UErrorCode &errorCode; -}; - -U_CDECL_BEGIN - -// Set the FCD value for a range of same-norm16 characters. -static UBool U_CALLCONV -enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) { - return ((FCDTrieSingleton *)context)->rangeHandler(start, end, value); -} - -// Collect (OR together) the FCD values for a range of supplementary characters, -// for their lead surrogate code unit. -static UBool U_CALLCONV -enumRangeOrValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) { - *((uint32_t *)context)|=value; - return TRUE; -} - -U_CDECL_END - -void *FCDTrieSingleton::createInstance(const void *context, UErrorCode &errorCode) { - FCDTrieSingleton *me=(FCDTrieSingleton *)context; - me->newFCDTrie=utrie2_open(0, 0, &errorCode); - if(U_SUCCESS(errorCode)) { - utrie2_enum(me->impl.getNormTrie(), NULL, enumRangeHandler, me); - for(UChar lead=0xd800; lead<0xdc00; ++lead) { - uint32_t oredValue=utrie2_get32(me->newFCDTrie, lead); - utrie2_enumForLeadSurrogate(me->newFCDTrie, lead, NULL, enumRangeOrValue, &oredValue); - if(oredValue!=0) { - // Set a "bad" value for makeFCD() to break the quick check loop - // and look up the value for the supplementary code point. - // If there is any lccc, then set the worst-case lccc of 1. - // The ORed-together value's tccc is already the worst case. - if(oredValue>0xff) { - oredValue=0x100|(oredValue&0xff); - } - utrie2_set32ForLeadSurrogateCodeUnit(me->newFCDTrie, lead, oredValue, &errorCode); - } - } - utrie2_freeze(me->newFCDTrie, UTRIE2_16_VALUE_BITS, &errorCode); - if(U_SUCCESS(errorCode)) { - return me->newFCDTrie; - } - } - utrie2_close(me->newFCDTrie); - return NULL; -} - -void Normalizer2Impl::setFCD16FromNorm16(UChar32 start, UChar32 end, uint16_t norm16, - UTrie2 *newFCDTrie, UErrorCode &errorCode) const { - // Only loops for 1:1 algorithmic mappings. - for(;;) { - if(norm16>=MIN_NORMAL_MAYBE_YES) { - norm16&=0xff; - norm16|=norm16<<8; - } else if(norm16<=minYesNo || minMaybeYes<=norm16) { - // no decomposition or Hangul syllable, all zeros - break; - } else if(limitNoNo<=norm16) { - int32_t delta=norm16-(minMaybeYes-MAX_DELTA-1); - if(start==end) { - start+=delta; - norm16=getNorm16(start); - } else { - // the same delta leads from different original characters to different mappings - do { - UChar32 c=start+delta; - setFCD16FromNorm16(c, c, getNorm16(c), newFCDTrie, errorCode); - } while(++start<=end); - break; - } - } else { - // c decomposes, get everything from the variable-length extra data - const uint16_t *mapping=getMapping(norm16); - uint16_t firstUnit=*mapping; - if((firstUnit&MAPPING_LENGTH_MASK)==0) { - // A character that is deleted (maps to an empty string) must - // get the worst-case lccc and tccc values because arbitrary - // characters on both sides will become adjacent. - norm16=0x1ff; - } else { - norm16=firstUnit>>8; // tccc - if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) { - norm16|=*(mapping-1)&0xff00; // lccc - } - } - } - utrie2_setRange32(newFCDTrie, start, end, norm16, TRUE, &errorCode); - break; - } -} - -const UTrie2 *Normalizer2Impl::getFCDTrie(UErrorCode &errorCode) const { - // Logically const: Synchronized instantiation. - Normalizer2Impl *me=const_cast(this); - return FCDTrieSingleton(me->fcdTrieSingleton, *me, errorCode).getInstance(errorCode); -} +// Note: normalizer2impl.cpp r30982 (2011-nov-27) +// still had getFCDTrie() which built and cached an FCD trie. +// That provided faster access to FCD data than getFCD16FromNormData() +// but required synchronization and consumed some 10kB of heap memory +// in any process that uses FCD (e.g., via collation). +// tccc180[] and smallFCD[] are intended to help with any loss of performance, +// at least for Latin & CJK. // Gets the FCD value from the regular normalization data. uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const { @@ -1679,7 +1570,7 @@ Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit, prevBoundary=src; // We know that the previous character's lccc==0. // Fetching the fcd16 value was deferred for this below-U+0300 code point. - prevFCD16=getFCD16FromSingleLead(*(src-1)); + prevFCD16=getFCD16(*(src-1)); if(prevFCD16>1) { --prevBoundary; } @@ -1693,8 +1584,6 @@ Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit, // The exception is the call to decomposeShort() which uses the buffer // in the normal way. - const UTrie2 *trie=fcdTrie(); - const UChar *prevSrc; UChar32 c=0; uint16_t fcd16=0; @@ -1705,24 +1594,24 @@ Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit, if((c=*src)1) { --prevBoundary; } @@ -1752,7 +1642,7 @@ Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit, --p; // Need to fetch the previous character's FCD value because // prevFCD16 was just for the trail surrogate code point. - prevFCD16=getFCD16FromSurrogatePair(p[0], p[1]); + prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1])); // Still known to have lccc==0 because its lead surrogate unit had lccc==0. } if(prevFCD16>1) { @@ -1840,21 +1730,18 @@ void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit, } const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const { - BackwardUTrie2StringIterator iter(fcdTrie(), start, p); - uint16_t fcd16; - do { - fcd16=iter.previous16(); - } while(fcd16>0xff); - return iter.codePointStart; + while(start

0xff) {} + return p; } const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const { - ForwardUTrie2StringIterator iter(fcdTrie(), p, limit); - uint16_t fcd16; - do { - fcd16=iter.next16(); - } while(fcd16>0xff); - return iter.codePointStart; + while(p=MIN_NORMAL_MAYBE_YES ? (uint8_t)norm16 : 0; } - uint16_t getFCD16(UChar32 c) const { return UTRIE2_GET16(fcdTrie(), c); } - uint16_t getFCD16FromSingleLead(UChar c) const { - return UTRIE2_GET16_FROM_U16_SINGLE_LEAD(fcdTrie(), c); + /** + * Returns the FCD data for code point c. + * @param c A Unicode code point. + * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. + */ + uint16_t getFCD16(UChar32 c) const { + if(c<0) { + return 0; + } else if(c<0x180) { + return tccc180[c]; + } else if(c<=0xffff) { + if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; } + } + return getFCD16FromNormData(c); } - uint16_t getFCD16FromSupplementary(UChar32 c) const { - return UTRIE2_GET16_FROM_SUPP(fcdTrie(), c); + /** + * Returns the FCD data for the next code point (post-increment). + * Might skip only a lead surrogate rather than the whole surrogate pair if none of + * the supplementary code points associated with the lead surrogate have non-zero FCD data. + * @param s A valid pointer into a string. Requires s!=limit. + * @param limit The end of the string, or NULL. + * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. + */ + uint16_t nextFCD16(const UChar *&s, const UChar *limit) const { + UChar32 c=*s++; + if(c<0x180) { + return tccc180[c]; + } else if(!singleLeadMightHaveNonZeroFCD16(c)) { + return 0; + } + UChar c2; + if(U16_IS_LEAD(c) && s!=limit && U16_IS_TRAIL(c2=*s)) { + c=U16_GET_SUPPLEMENTARY(c, c2); + ++s; + } + return getFCD16FromNormData(c); } - uint16_t getFCD16FromSurrogatePair(UChar c, UChar c2) const { - return getFCD16FromSupplementary(U16_GET_SUPPLEMENTARY(c, c2)); + /** + * Returns the FCD data for the previous code point (pre-decrement). + * @param start The start of the string. + * @param s A valid pointer into a string. Requires start>8]; + if(bits==0) { return false; } + return (UBool)((bits>>((lead>>5)&7))&1); + } + /** Returns the FCD value from the regular normalization data. */ uint16_t getFCD16FromNormData(UChar32 c) const; - void setFCD16FromNorm16(UChar32 start, UChar32 end, uint16_t norm16, - UTrie2 *newFCDTrie, UErrorCode &errorCode) const; - void makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16, CanonIterData &newData, UErrorCode &errorCode) const; @@ -504,8 +560,6 @@ private: const UChar *findPreviousCompBoundary(const UChar *start, const UChar *p) const; const UChar *findNextCompBoundary(const UChar *p, const UChar *limit) const; - const UTrie2 *fcdTrie() const { return (const UTrie2 *)fcdTrieSingleton.fInstance; } - const UChar *findPreviousFCDBoundary(const UChar *start, const UChar *p) const; const UChar *findNextFCDBoundary(const UChar *p, const UChar *limit) const; @@ -532,7 +586,6 @@ private: const uint8_t *smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0 uint8_t tccc180[0x180]; // tccc values for U+0000..U+017F - SimpleSingleton fcdTrieSingleton; SimpleSingleton canonIterDataSingleton; }; @@ -565,8 +618,6 @@ public: // Get the Impl instance of the Normalizer2. // Must be used only when it is known that norm2 is a Normalizer2WithImpl instance. static const Normalizer2Impl *getImpl(const Normalizer2 *norm2); - - static const UTrie2 *getFCDTrie(UErrorCode &errorCode); private: Normalizer2Factory(); // No instantiation. }; @@ -586,101 +637,11 @@ U_CFUNC UNormalizationCheckResult unorm_getQuickCheck(UChar32 c, UNormalizationMode mode); /** - * Get the 16-bit FCD value (lead & trail CCs) for a code point, for u_getIntPropertyValue(). + * Gets the 16-bit FCD value (lead & trail CCs) for a code point, for u_getIntPropertyValue(). * @internal */ U_CFUNC uint16_t -unorm_getFCD16Simple(UChar32 c); - -/** - * Internal API, used by collation code. - * Get access to the internal FCD trie table to be able to perform - * incremental, per-code unit, FCD checks in collation. - * One pointer is sufficient because the trie index values are offset - * by the index size, so that the same pointer is used to access the trie data. - * Code points at fcdHighStart and above have a zero FCD value. - * @internal - */ -U_CAPI const uint16_t * U_EXPORT2 -unorm_getFCDTrieIndex(UChar32 &fcdHighStart, UErrorCode *pErrorCode); - -/** - * Internal API, used by collation code. - * Get the FCD value for a code unit, with - * bits 15..8 lead combining class - * bits 7..0 trail combining class - * - * If c is a lead surrogate and the value is not 0, - * then some of c's associated supplementary code points have a non-zero FCD value. - * - * @internal - */ -static inline uint16_t -unorm_getFCD16(const uint16_t *fcdTrieIndex, UChar c) { - return fcdTrieIndex[_UTRIE2_INDEX_FROM_U16_SINGLE_LEAD(fcdTrieIndex, c)]; -} - -/** - * Internal API, used by collation code. - * Get the FCD value of the next code point (post-increment), with - * bits 15..8 lead combining class - * bits 7..0 trail combining class - * - * @internal - */ -static inline uint16_t -unorm_nextFCD16(const uint16_t *fcdTrieIndex, UChar32 fcdHighStart, - const UChar *&s, const UChar *limit) { - UChar32 c=*s++; - uint16_t fcd=fcdTrieIndex[_UTRIE2_INDEX_FROM_U16_SINGLE_LEAD(fcdTrieIndex, c)]; - if(fcd!=0 && U16_IS_LEAD(c)) { - UChar c2; - if(s!=limit && U16_IS_TRAIL(c2=*s)) { - ++s; - c=U16_GET_SUPPLEMENTARY(c, c2); - if(c>8; + return unorm_getFCD16(c)>>8; } #endif @@ -411,7 +411,7 @@ static int32_t getTrailCombiningClass(const IntProperty &, UChar32, UProperty) { } #else static int32_t getTrailCombiningClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { - return unorm_getFCD16Simple(c)&0xff; + return unorm_getFCD16(c)&0xff; } #endif diff --git a/icu4c/source/i18n/ucol.cpp b/icu4c/source/i18n/ucol.cpp index a4a6cb55919..ad4b4e6ef6c 100644 --- a/icu4c/source/i18n/ucol.cpp +++ b/icu4c/source/i18n/ucol.cpp @@ -52,13 +52,11 @@ U_NAMESPACE_USE #define ZERO_CC_LIMIT_ 0xC0 -// this is static pointer to the normalizer fcdTrieIndex +// This is static pointer to the NFC implementation instance. // it is always the same between calls to u_cleanup // and therefore writing to it is not synchronized. // It is cleaned in ucol_cleanup -static const uint16_t *fcdTrieIndex=NULL; -// Code points at fcdHighStart and above have a zero FCD value. -static UChar32 fcdHighStart = 0; +static const Normalizer2Impl *g_nfcImpl = NULL; // These are values from UCA required for // implicit generation and supressing sort key compression @@ -72,7 +70,7 @@ U_CDECL_BEGIN static UBool U_CALLCONV ucol_cleanup(void) { - fcdTrieIndex = NULL; + g_nfcImpl = NULL; return TRUE; } @@ -86,11 +84,13 @@ U_CDECL_END // init FCD data static inline UBool initializeFCD(UErrorCode *status) { - if (fcdTrieIndex != NULL) { + if (g_nfcImpl != NULL) { return TRUE; } else { // The result is constant, until the library is reloaded. - fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status); + g_nfcImpl = Normalizer2Factory::getNFCImpl(*status); + // Note: Alternatively, we could also store this pointer in each collIterate struct, + // same as Normalizer2Factory::getImpl(collIterate->nfd). ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup); return U_SUCCESS(*status); } @@ -1433,10 +1433,8 @@ inline UBool collIterFCD(collIterate *collationSource) { endP = NULL; } - // Get the trailing combining class of the current character. If it's zero, - // we are OK. - /* trie access */ - fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP); + // Get the trailing combining class of the current character. If it's zero, we are OK. + fcd = g_nfcImpl->nextFCD16(srcP, endP); if (fcd != 0) { prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); @@ -1447,8 +1445,7 @@ inline UBool collIterFCD(collIterate *collationSource) { { const UChar *savedSrcP = srcP; - /* trie access */ - fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP); + fcd = g_nfcImpl->nextFCD16(srcP, endP); leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); if (leadingCC == 0) { srcP = savedSrcP; // Hit char that is not part of combining sequence. @@ -1809,7 +1806,7 @@ inline UBool collPrevIterFCD(collIterate *data) src = data->pos + 1; /* Get the trailing combining class of the current character. */ - fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src); + fcd = g_nfcImpl->previousFCD16(start, src); leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); @@ -1825,7 +1822,7 @@ inline UBool collPrevIterFCD(collIterate *data) return result; } - fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src); + fcd = g_nfcImpl->previousFCD16(start, src); trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); diff --git a/icu4c/source/i18n/ucol_bld.cpp b/icu4c/source/i18n/ucol_bld.cpp index 0d506ed2a2f..490847976f2 100644 --- a/icu4c/source/i18n/ucol_bld.cpp +++ b/icu4c/source/i18n/ucol_bld.cpp @@ -845,8 +845,7 @@ U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL UColToken *tok = lh->first; UColToken *expt = NULL; uint32_t i = 0, j = 0; - UChar32 fcdHighStart; - const uint16_t *fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status); + const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status); while(tok != NULL && U_SUCCESS(*status)) { /* first, check if there are any expansions */ @@ -942,7 +941,7 @@ U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL if (!src->buildCCTabFlag && el.cSize > 0) { // Check the trailing canonical combining class (tccc) of the last character. const UChar *s = el.cPoints + el.cSize; - uint16_t fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, el.cPoints, s); + uint16_t fcd = nfcImpl->previousFCD16(el.cPoints, s); if ((fcd & 0xff) != 0) { src->buildCCTabFlag = TRUE; } diff --git a/icu4c/source/i18n/ucol_elm.cpp b/icu4c/source/i18n/ucol_elm.cpp index e33a59e47c3..e9e83b6f0e4 100644 --- a/icu4c/source/i18n/ucol_elm.cpp +++ b/icu4c/source/i18n/ucol_elm.cpp @@ -743,15 +743,12 @@ uprv_uca_copyCMTable(tempUCATable *t, UChar *cm, uint16_t *index) { static void uprv_uca_unsafeCPAddCCNZ(tempUCATable *t, UErrorCode *status) { UChar c; - uint16_t fcd; // Hi byte is lead combining class. - // lo byte is trailing combing class. - const uint16_t *fcdTrieIndex; - UChar32 fcdHighStart; + uint16_t fcd; // Hi byte is lead combining class. lo byte is trailing combing class. UBool buildCMTable = (t->cmLookup==NULL); // flag for building combining class table UChar *cm=NULL; uint16_t index[256]; int32_t count=0; - fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status); + const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status); if (U_FAILURE(*status)) { return; } @@ -767,7 +764,18 @@ static void uprv_uca_unsafeCPAddCCNZ(tempUCATable *t, UErrorCode *status) { uprv_memset(index, 0, sizeof(index)); } for (c=0; c<0xffff; c++) { - fcd = unorm_getFCD16(fcdTrieIndex, c); + if (U16_IS_LEAD(c)) { + fcd = 0; + if (nfcImpl->singleLeadMightHaveNonZeroFCD16(c)) { + UChar32 supp = U16_GET_SUPPLEMENTARY(c, 0xdc00); + UChar32 suppLimit = supp + 0x400; + while (supp < suppLimit) { + fcd |= nfcImpl->getFCD16FromNormData(supp++); + } + } + } else { + fcd = nfcImpl->getFCD16(c); + } if (fcd >= 0x100 || // if the leading combining class(c) > 0 || (U16_IS_LEAD(c) && fcd != 0)) {// c is a leading surrogate with some FCD data if (buildCMTable) { @@ -1785,12 +1793,11 @@ uprv_uca_addMultiCMContractions(tempUCATable *t, CombinClassTable *cmLookup = t->cmLookup; UChar newDecomp[256]; int32_t maxComp, newDecLen; - UChar32 fcdHighStart; - const uint16_t *fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status); + const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status); if (U_FAILURE(*status)) { return; } - int16_t curClass = (unorm_getFCD16(fcdTrieIndex, c->tailoringCM) & 0xff); + int16_t curClass = nfcImpl->getFCD16(c->tailoringCM) & 0xff; CompData *precomp = c->precomp; int32_t compLen = c->compLen; UChar *comp = c->comp; @@ -1855,12 +1862,11 @@ uprv_uca_addTailCanonicalClosures(tempUCATable *t, UCAElements *el, UErrorCode *status) { CombinClassTable *cmLookup = t->cmLookup; - UChar32 fcdHighStart; - const uint16_t *fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status); + const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status); if (U_FAILURE(*status)) { return; } - int16_t maxIndex = (unorm_getFCD16(fcdTrieIndex, cMark) & 0xff ); + int16_t maxIndex = nfcImpl->getFCD16(cMark) & 0xff; UCAElements element; uint16_t *index; UChar decomp[256]; @@ -1874,8 +1880,8 @@ uprv_uca_addTailCanonicalClosures(tempUCATable *t, return; } index = cmLookup->index; - int32_t cClass=(unorm_getFCD16(fcdTrieIndex, cMark) & 0xff); - maxIndex = (int32_t)index[(unorm_getFCD16(fcdTrieIndex, cMark) & 0xff)-1]; + int32_t cClass=nfcImpl->getFCD16(cMark) & 0xff; + maxIndex = (int32_t)index[(nfcImpl->getFCD16(cMark) & 0xff)-1]; c.comp = comp; c.decomp = decomp; c.precomp = precomp; @@ -1898,7 +1904,7 @@ uprv_uca_addTailCanonicalClosures(tempUCATable *t, // other combining mark combinations. precomp[precompLen].cp=comp[0]; curClass = precomp[precompLen].cClass = - index[unorm_getFCD16(fcdTrieIndex, decomp[1]) & 0xff]; + index[nfcImpl->getFCD16(decomp[1]) & 0xff]; precompLen++; replacedPos=0; for (decompLen=0; decompLen< (int32_t)el->cSize; decompLen++) { @@ -1938,7 +1944,7 @@ uprv_uca_addTailCanonicalClosures(tempUCATable *t, // This is a fix for tailoring contractions with accented // character at the end of contraction string. if ((len>2) && - (unorm_getFCD16(fcdTrieIndex, comp[len-2]) & 0xff00)==0) { + (nfcImpl->getFCD16(comp[len-2]) & 0xff00)==0) { uprv_uca_addFCD4AccentedContractions(t, colEl, comp, len, &element, status); } @@ -1967,8 +1973,6 @@ uprv_uca_canonicalClosure(tempUCATable *t, UColToken *tok; uint32_t i = 0, j = 0; UChar baseChar, firstCM; - UChar32 fcdHighStart; - const uint16_t *fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status); context.nfcImpl=Normalizer2Factory::getNFCImpl(*status); if(U_FAILURE(*status)) { return 0; @@ -2039,7 +2043,7 @@ uprv_uca_canonicalClosure(tempUCATable *t, } if(src->UCA != NULL) { for(j = 0; jgetFCD16(el.cPoints[j]); if ( (fcd & 0xff) == 0 ) { baseChar = el.cPoints[j]; // last base character firstCM=0; // reset combining mark value diff --git a/icu4c/source/i18n/usearch.cpp b/icu4c/source/i18n/usearch.cpp index ee53f08531f..ba463e606c2 100644 --- a/icu4c/source/i18n/usearch.cpp +++ b/icu4c/source/i18n/usearch.cpp @@ -37,8 +37,7 @@ U_NAMESPACE_USE #define SECOND_LAST_BYTE_SHIFT_ 8 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000 -static const uint16_t *fcdTrieIndex = NULL; -static UChar32 fcdHighStart = 0; +static const Normalizer2Impl *g_nfcImpl = NULL; // internal methods ------------------------------------------------- @@ -103,7 +102,7 @@ inline int hash(uint32_t ce) U_CDECL_BEGIN static UBool U_CALLCONV usearch_cleanup(void) { - fcdTrieIndex = NULL; + g_nfcImpl = NULL; return TRUE; } U_CDECL_END @@ -117,8 +116,8 @@ U_CDECL_END static inline void initializeFCD(UErrorCode *status) { - if (fcdTrieIndex == NULL) { - fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status); + if (g_nfcImpl == NULL) { + g_nfcImpl = Normalizer2Factory::getNFCImpl(*status); ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup); } } @@ -138,7 +137,7 @@ uint16_t getFCD(const UChar *str, int32_t *offset, int32_t strlength) { const UChar *temp = str + *offset; - uint16_t result = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, temp, str + strlength); + uint16_t result = g_nfcImpl->nextFCD16(temp, str + strlength); *offset = (int32_t)(temp - str); return result; }