From 9075f9cb5a6b43f20776162b8752352aca162322 Mon Sep 17 00:00:00 2001 From: Peter Edberg Date: Wed, 20 Sep 2017 00:39:40 +0000 Subject: [PATCH] ICU-12380 Size reductions for transliterator registry (and some speedup) X-SVN-Rev: 40434 --- icu4c/source/common/hash.h | 48 ++++++-- icu4c/source/common/uhash.cpp | 44 +++++--- icu4c/source/common/uhash.h | 107 ++++++++++-------- icu4c/source/i18n/anytrans.cpp | 14 ++- icu4c/source/i18n/transreg.cpp | 153 +++++++++++++++++--------- icu4c/source/i18n/transreg.h | 10 +- icu4c/source/test/cintltst/utransts.c | 61 ++++++++++ 7 files changed, 308 insertions(+), 129 deletions(-) diff --git a/icu4c/source/common/hash.h b/icu4c/source/common/hash.h index b411a4305bf..c6be4672015 100644 --- a/icu4c/source/common/hash.h +++ b/icu4c/source/common/hash.h @@ -33,6 +33,8 @@ class U_COMMON_API Hashtable : public UMemory { inline void init(UHashFunction *keyHash, UKeyComparator *keyComp, UValueComparator *valueComp, UErrorCode& status); + inline void initSize(UHashFunction *keyHash, UKeyComparator *keyComp, UValueComparator *valueComp, int32_t size, UErrorCode& status); + public: /** * Construct a hashtable @@ -41,6 +43,14 @@ public: */ Hashtable(UBool ignoreKeyCase, UErrorCode& status); + /** + * Construct a hashtable + * @param ignoreKeyCase If true, keys are case insensitive. + * @param size initial size allocation + * @param status Error code + */ + Hashtable(UBool ignoreKeyCase, int32_t size, UErrorCode& status); + /** * Construct a hashtable * @param keyComp Comparator for comparing the keys @@ -76,9 +86,9 @@ public: int32_t puti(const UnicodeString& key, int32_t value, UErrorCode& status); void* get(const UnicodeString& key) const; - + int32_t geti(const UnicodeString& key) const; - + void* remove(const UnicodeString& key); int32_t removei(const UnicodeString& key); @@ -92,9 +102,9 @@ public: * @see uhash_nextElement */ const UHashElement* nextElement(int32_t& pos) const; - + UKeyComparator* setKeyComparator(UKeyComparator*keyComp); - + UValueComparator* setValueComparator(UValueComparator* valueComp); UBool equals(const Hashtable& that) const; @@ -107,7 +117,7 @@ private: * Implementation ********************************************************************/ -inline void Hashtable::init(UHashFunction *keyHash, UKeyComparator *keyComp, +inline void Hashtable::init(UHashFunction *keyHash, UKeyComparator *keyComp, UValueComparator *valueComp, UErrorCode& status) { if (U_FAILURE(status)) { return; @@ -119,10 +129,23 @@ inline void Hashtable::init(UHashFunction *keyHash, UKeyComparator *keyComp, } } -inline Hashtable::Hashtable(UKeyComparator *keyComp, UValueComparator *valueComp, +inline void Hashtable::initSize(UHashFunction *keyHash, UKeyComparator *keyComp, + UValueComparator *valueComp, int32_t size, UErrorCode& status) { + if (U_FAILURE(status)) { + return; + } + uhash_initSize(&hashObj, keyHash, keyComp, valueComp, size, &status); + if (U_SUCCESS(status)) { + hash = &hashObj; + uhash_setKeyDeleter(hash, uprv_deleteUObject); + } +} + +inline Hashtable::Hashtable(UKeyComparator *keyComp, UValueComparator *valueComp, UErrorCode& status) : hash(0) { init( uhash_hashUnicodeString, keyComp, valueComp, status); } + inline Hashtable::Hashtable(UBool ignoreKeyCase, UErrorCode& status) : hash(0) { @@ -134,6 +157,17 @@ inline Hashtable::Hashtable(UBool ignoreKeyCase, UErrorCode& status) status); } +inline Hashtable::Hashtable(UBool ignoreKeyCase, int32_t size, UErrorCode& status) + : hash(0) +{ + initSize(ignoreKeyCase ? uhash_hashCaselessUnicodeString + : uhash_hashUnicodeString, + ignoreKeyCase ? uhash_compareCaselessUnicodeString + : uhash_compareUnicodeString, + NULL, size, + status); +} + inline Hashtable::Hashtable(UErrorCode& status) : hash(0) { @@ -200,7 +234,7 @@ inline void Hashtable::removeAll(void) { inline UKeyComparator* Hashtable::setKeyComparator(UKeyComparator*keyComp){ return uhash_setKeyComparator(hash, keyComp); } - + inline UValueComparator* Hashtable::setValueComparator(UValueComparator* valueComp){ return uhash_setValueComparator(hash, valueComp); } diff --git a/icu4c/source/common/uhash.cpp b/icu4c/source/common/uhash.cpp index b7326f238cc..a80e7b8ff27 100644 --- a/icu4c/source/common/uhash.cpp +++ b/icu4c/source/common/uhash.cpp @@ -79,14 +79,14 @@ * prime number while being less than a power of two. */ static const int32_t PRIMES[] = { - 13, 31, 61, 127, 251, 509, 1021, 2039, 4093, 8191, 16381, 32749, + 7, 13, 31, 61, 127, 251, 509, 1021, 2039, 4093, 8191, 16381, 32749, 65521, 131071, 262139, 524287, 1048573, 2097143, 4194301, 8388593, 16777213, 33554393, 67108859, 134217689, 268435399, 536870909, 1073741789, 2147483647 /*, 4294967291 */ }; #define PRIMES_LENGTH UPRV_LENGTHOF(PRIMES) -#define DEFAULT_PRIME_INDEX 3 +#define DEFAULT_PRIME_INDEX 4 /* These ratios are tuned to the PRIMES array such that a resize * places the table back into the zone of non-resizing. That is, @@ -231,7 +231,7 @@ _uhash_allocate(UHashtable *hash, emptytok.pointer = NULL; /* Only one of these two is needed */ emptytok.integer = 0; /* but we don't know which one. */ - + limit = p + hash->length; while (p < limit) { p->key = emptytok; @@ -247,7 +247,7 @@ _uhash_allocate(UHashtable *hash, static UHashtable* _uhash_init(UHashtable *result, - UHashFunction *keyHash, + UHashFunction *keyHash, UKeyComparator *keyComp, UValueComparator *valueComp, int32_t primeIndex, @@ -275,7 +275,7 @@ _uhash_init(UHashtable *result, } static UHashtable* -_uhash_create(UHashFunction *keyHash, +_uhash_create(UHashFunction *keyHash, UKeyComparator *keyComp, UValueComparator *valueComp, int32_t primeIndex, @@ -415,7 +415,7 @@ _uhash_rehash(UHashtable *hash, UErrorCode *status) { if (U_FAILURE(*status)) { hash->elements = old; - hash->length = oldLength; + hash->length = oldLength; return; } @@ -536,7 +536,7 @@ _uhash_put(UHashtable *hash, ********************************************************************/ U_CAPI UHashtable* U_EXPORT2 -uhash_open(UHashFunction *keyHash, +uhash_open(UHashFunction *keyHash, UKeyComparator *keyComp, UValueComparator *valueComp, UErrorCode *status) { @@ -545,7 +545,7 @@ uhash_open(UHashFunction *keyHash, } U_CAPI UHashtable* U_EXPORT2 -uhash_openSize(UHashFunction *keyHash, +uhash_openSize(UHashFunction *keyHash, UKeyComparator *keyComp, UValueComparator *valueComp, int32_t size, @@ -562,7 +562,7 @@ uhash_openSize(UHashFunction *keyHash, U_CAPI UHashtable* U_EXPORT2 uhash_init(UHashtable *fillinResult, - UHashFunction *keyHash, + UHashFunction *keyHash, UKeyComparator *keyComp, UValueComparator *valueComp, UErrorCode *status) { @@ -570,6 +570,22 @@ uhash_init(UHashtable *fillinResult, return _uhash_init(fillinResult, keyHash, keyComp, valueComp, DEFAULT_PRIME_INDEX, status); } +U_CAPI UHashtable* U_EXPORT2 +uhash_initSize(UHashtable *fillinResult, + UHashFunction *keyHash, + UKeyComparator *keyComp, + UValueComparator *valueComp, + int32_t size, + UErrorCode *status) { + + // Find the smallest index i for which PRIMES[i] >= size. + int32_t i = 0; + while (i<(PRIMES_LENGTH-1) && PRIMES[i]keyComparator = fn; return result; } -U_CAPI UValueComparator *U_EXPORT2 +U_CAPI UValueComparator *U_EXPORT2 uhash_setValueComparator(UHashtable *hash, UValueComparator *fn){ UValueComparator *result = hash->valueComparator; hash->valueComparator = fn; @@ -630,7 +646,7 @@ uhash_setResizePolicy(UHashtable *hash, enum UHashResizePolicy policy) { UErrorCode status = U_ZERO_ERROR; _uhash_internalSetResizePolicy(hash, policy); hash->lowWaterMark = (int32_t)(hash->length * hash->lowWaterRatio); - hash->highWaterMark = (int32_t)(hash->length * hash->highWaterRatio); + hash->highWaterMark = (int32_t)(hash->length * hash->highWaterRatio); _uhash_rehash(hash, &status); } @@ -853,7 +869,7 @@ uhash_hashIChars(const UHashTok key) { return s == NULL ? 0 : ustr_hashICharsN(s, uprv_strlen(s)); } -U_CAPI UBool U_EXPORT2 +U_CAPI UBool U_EXPORT2 uhash_equals(const UHashtable* hash1, const UHashtable* hash2){ int32_t count1, count2, pos, i; @@ -886,14 +902,14 @@ uhash_equals(const UHashtable* hash1, const UHashtable* hash2){ if(count1!=count2){ return FALSE; } - + pos=UHASH_FIRST; for(i=0; ikey; const UHashTok val1 = elem1->value; /* here the keys are not compared, instead the key form hash1 is used to fetch - * value from hash2. If the hashes are equal then then both hashes should + * value from hash2. If the hashes are equal then then both hashes should * contain equal values for the same key! */ const UHashElement* elem2 = _uhash_find(hash2, key1, hash2->keyHasher(key1)); diff --git a/icu4c/source/common/uhash.h b/icu4c/source/common/uhash.h index 6369f8e34af..b59d2711bb2 100644 --- a/icu4c/source/common/uhash.h +++ b/icu4c/source/common/uhash.h @@ -154,7 +154,7 @@ struct UHashtable { * If NULL won't do anything */ /* Size parameters */ - + int32_t count; /* The number of key-value pairs in this table. * 0 <= count <= length. In practice we * never let count == length (see code). */ @@ -162,12 +162,12 @@ struct UHashtable { * and values. Must be prime. */ /* Rehashing thresholds */ - + int32_t highWaterMark; /* If count > highWaterMark, rehash */ int32_t lowWaterMark; /* If count < lowWaterMark, rehash */ float highWaterRatio; /* 0..1; high water as a fraction of length */ float lowWaterRatio; /* 0..1; low water as a fraction of length */ - + int8_t primeIndex; /* Index into our prime table for length. * length == PRIMES[primeIndex] */ UBool allocated; /* Was this UHashtable allocated? */ @@ -190,7 +190,7 @@ U_CDECL_END * @return A pointer to a UHashtable, or 0 if an error occurred. * @see uhash_openSize */ -U_CAPI UHashtable* U_EXPORT2 +U_CAPI UHashtable* U_EXPORT2 uhash_open(UHashFunction *keyHash, UKeyComparator *keyComp, UValueComparator *valueComp, @@ -207,7 +207,7 @@ uhash_open(UHashFunction *keyHash, * @return A pointer to a UHashtable, or 0 if an error occurred. * @see uhash_open */ -U_CAPI UHashtable* U_EXPORT2 +U_CAPI UHashtable* U_EXPORT2 uhash_openSize(UHashFunction *keyHash, UKeyComparator *keyComp, UValueComparator *valueComp, @@ -224,18 +224,37 @@ uhash_openSize(UHashFunction *keyHash, * @return A pointer to a UHashtable, or 0 if an error occurred. * @see uhash_openSize */ -U_CAPI UHashtable* U_EXPORT2 +U_CAPI UHashtable* U_EXPORT2 uhash_init(UHashtable *hash, UHashFunction *keyHash, UKeyComparator *keyComp, UValueComparator *valueComp, UErrorCode *status); +/** + * Initialize an existing UHashtable. + * @param keyHash A pointer to the key hashing function. Must not be + * NULL. + * @param keyComp A pointer to the function that compares keys. Must + * not be NULL. + * @param size The initial capacity of this hash table. + * @param status A pointer to an UErrorCode to receive any errors. + * @return A pointer to a UHashtable, or 0 if an error occurred. + * @see uhash_openSize + */ +U_CAPI UHashtable* U_EXPORT2 +uhash_initSize(UHashtable *hash, + UHashFunction *keyHash, + UKeyComparator *keyComp, + UValueComparator *valueComp, + int32_t size, + UErrorCode *status); + /** * Close a UHashtable, releasing the memory used. * @param hash The UHashtable to close. If hash is NULL no operation is performed. */ -U_CAPI void U_EXPORT2 +U_CAPI void U_EXPORT2 uhash_close(UHashtable *hash); @@ -246,7 +265,7 @@ uhash_close(UHashtable *hash); * @param fn the function to be used hash keys; must not be NULL * @return the previous key hasher; non-NULL */ -U_CAPI UHashFunction *U_EXPORT2 +U_CAPI UHashFunction *U_EXPORT2 uhash_setKeyHasher(UHashtable *hash, UHashFunction *fn); /** @@ -256,7 +275,7 @@ uhash_setKeyHasher(UHashtable *hash, UHashFunction *fn); * @param fn the function to be used compare keys; must not be NULL * @return the previous key comparator; non-NULL */ -U_CAPI UKeyComparator *U_EXPORT2 +U_CAPI UKeyComparator *U_EXPORT2 uhash_setKeyComparator(UHashtable *hash, UKeyComparator *fn); /** @@ -266,7 +285,7 @@ uhash_setKeyComparator(UHashtable *hash, UKeyComparator *fn); * @param fn the function to be used compare keys; must not be NULL * @return the previous key comparator; non-NULL */ -U_CAPI UValueComparator *U_EXPORT2 +U_CAPI UValueComparator *U_EXPORT2 uhash_setValueComparator(UHashtable *hash, UValueComparator *fn); /** @@ -279,7 +298,7 @@ uhash_setValueComparator(UHashtable *hash, UValueComparator *fn); * @param fn the function to be used delete keys, or NULL * @return the previous key deleter; may be NULL */ -U_CAPI UObjectDeleter *U_EXPORT2 +U_CAPI UObjectDeleter *U_EXPORT2 uhash_setKeyDeleter(UHashtable *hash, UObjectDeleter *fn); /** @@ -292,7 +311,7 @@ uhash_setKeyDeleter(UHashtable *hash, UObjectDeleter *fn); * @param fn the function to be used delete values, or NULL * @return the previous value deleter; may be NULL */ -U_CAPI UObjectDeleter *U_EXPORT2 +U_CAPI UObjectDeleter *U_EXPORT2 uhash_setValueDeleter(UHashtable *hash, UObjectDeleter *fn); /** @@ -302,7 +321,7 @@ uhash_setValueDeleter(UHashtable *hash, UObjectDeleter *fn); * @param hash The UHashtable to set * @param policy The way the hashtable resizes itself, {U_GROW, U_GROW_AND_SHRINK, U_FIXED} */ -U_CAPI void U_EXPORT2 +U_CAPI void U_EXPORT2 uhash_setResizePolicy(UHashtable *hash, enum UHashResizePolicy policy); /** @@ -310,7 +329,7 @@ uhash_setResizePolicy(UHashtable *hash, enum UHashResizePolicy policy); * @param hash The UHashtable to query. * @return The number of key-value pairs stored in hash. */ -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 uhash_count(const UHashtable *hash); /** @@ -326,7 +345,7 @@ uhash_count(const UHashtable *hash); * @return The previous value, or NULL if none. * @see uhash_get */ -U_CAPI void* U_EXPORT2 +U_CAPI void* U_EXPORT2 uhash_put(UHashtable *hash, void *key, void *value, @@ -344,7 +363,7 @@ uhash_put(UHashtable *hash, * @return The previous value, or NULL if none. * @see uhash_get */ -U_CAPI void* U_EXPORT2 +U_CAPI void* U_EXPORT2 uhash_iput(UHashtable *hash, int32_t key, void* value, @@ -362,7 +381,7 @@ uhash_iput(UHashtable *hash, * @return The previous value, or 0 if none. * @see uhash_get */ -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 uhash_puti(UHashtable *hash, void* key, int32_t value, @@ -380,7 +399,7 @@ uhash_puti(UHashtable *hash, * @return The previous value, or 0 if none. * @see uhash_get */ -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 uhash_iputi(UHashtable *hash, int32_t key, int32_t value, @@ -393,8 +412,8 @@ uhash_iputi(UHashtable *hash, * @param key A pointer key stored in a hashtable * @return The requested item, or NULL if not found. */ -U_CAPI void* U_EXPORT2 -uhash_get(const UHashtable *hash, +U_CAPI void* U_EXPORT2 +uhash_get(const UHashtable *hash, const void *key); /** @@ -404,7 +423,7 @@ uhash_get(const UHashtable *hash, * @param key An integer key stored in a hashtable * @return The requested item, or NULL if not found. */ -U_CAPI void* U_EXPORT2 +U_CAPI void* U_EXPORT2 uhash_iget(const UHashtable *hash, int32_t key); @@ -415,7 +434,7 @@ uhash_iget(const UHashtable *hash, * @param key A pointer key stored in a hashtable * @return The requested item, or 0 if not found. */ -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 uhash_geti(const UHashtable *hash, const void* key); /** @@ -425,7 +444,7 @@ uhash_geti(const UHashtable *hash, * @param key An integer key stored in a hashtable * @return The requested item, or 0 if not found. */ -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 uhash_igeti(const UHashtable *hash, int32_t key); @@ -435,7 +454,7 @@ uhash_igeti(const UHashtable *hash, * @param key A key stored in a hashtable * @return The item removed, or NULL if not found. */ -U_CAPI void* U_EXPORT2 +U_CAPI void* U_EXPORT2 uhash_remove(UHashtable *hash, const void *key); @@ -445,7 +464,7 @@ uhash_remove(UHashtable *hash, * @param key An integer key stored in a hashtable * @return The item removed, or NULL if not found. */ -U_CAPI void* U_EXPORT2 +U_CAPI void* U_EXPORT2 uhash_iremove(UHashtable *hash, int32_t key); @@ -455,7 +474,7 @@ uhash_iremove(UHashtable *hash, * @param key An key stored in a hashtable * @return The item removed, or 0 if not found. */ -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 uhash_removei(UHashtable *hash, const void* key); @@ -465,7 +484,7 @@ uhash_removei(UHashtable *hash, * @param key An integer key stored in a hashtable * @return The item removed, or 0 if not found. */ -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 uhash_iremovei(UHashtable *hash, int32_t key); @@ -473,7 +492,7 @@ uhash_iremovei(UHashtable *hash, * Remove all items from a UHashtable. * @param hash The target UHashtable. */ -U_CAPI void U_EXPORT2 +U_CAPI void U_EXPORT2 uhash_removeAll(UHashtable *hash); /** @@ -487,7 +506,7 @@ uhash_removeAll(UHashtable *hash); * @param key A key stored in a hashtable * @return a hash element, or NULL if the key is not found. */ -U_CAPI const UHashElement* U_EXPORT2 +U_CAPI const UHashElement* U_EXPORT2 uhash_find(const UHashtable *hash, const void* key); /** @@ -510,7 +529,7 @@ uhash_find(const UHashtable *hash, const void* key); * @return a hash element, or NULL if no further key-value pairs * exist in the table. */ -U_CAPI const UHashElement* U_EXPORT2 +U_CAPI const UHashElement* U_EXPORT2 uhash_nextElement(const UHashtable *hash, int32_t *pos); @@ -525,7 +544,7 @@ uhash_nextElement(const UHashtable *hash, * modified. * @return the value that was removed. */ -U_CAPI void* U_EXPORT2 +U_CAPI void* U_EXPORT2 uhash_removeElement(UHashtable *hash, const UHashElement* e); /******************************************************************** @@ -537,7 +556,7 @@ uhash_removeElement(UHashtable *hash, const UHashElement* e); * @param i The given integer * @return a UHashTok for an integer. */ -/*U_CAPI UHashTok U_EXPORT2 +/*U_CAPI UHashTok U_EXPORT2 uhash_toki(int32_t i);*/ /** @@ -545,7 +564,7 @@ uhash_toki(int32_t i);*/ * @param p The given pointer * @return a UHashTok for a pointer. */ -/*U_CAPI UHashTok U_EXPORT2 +/*U_CAPI UHashTok U_EXPORT2 uhash_tokp(void* p);*/ /******************************************************************** @@ -559,7 +578,7 @@ uhash_tokp(void* p);*/ * @param key The string (const UChar*) to hash. * @return A hash code for the key. */ -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 uhash_hashUChars(const UHashTok key); /** @@ -569,7 +588,7 @@ uhash_hashUChars(const UHashTok key); * @param key The string (const char*) to hash. * @return A hash code for the key. */ -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 uhash_hashChars(const UHashTok key); /** @@ -589,7 +608,7 @@ uhash_hashIChars(const UHashTok key); * @param key2 The string for comparison * @return true if key1 and key2 are equal, return false otherwise. */ -U_CAPI UBool U_EXPORT2 +U_CAPI UBool U_EXPORT2 uhash_compareUChars(const UHashTok key1, const UHashTok key2); /** @@ -599,7 +618,7 @@ uhash_compareUChars(const UHashTok key1, const UHashTok key2); * @param key2 The string for comparison * @return true if key1 and key2 are equal, return false otherwise. */ -U_CAPI UBool U_EXPORT2 +U_CAPI UBool U_EXPORT2 uhash_compareChars(const UHashTok key1, const UHashTok key2); /** @@ -609,7 +628,7 @@ uhash_compareChars(const UHashTok key1, const UHashTok key2); * @param key2 The string for comparison * @return true if key1 and key2 are equal, return false otherwise. */ -U_CAPI UBool U_EXPORT2 +U_CAPI UBool U_EXPORT2 uhash_compareIChars(const UHashTok key1, const UHashTok key2); /******************************************************************** @@ -621,7 +640,7 @@ uhash_compareIChars(const UHashTok key1, const UHashTok key2); * @param key The string (const char*) to hash. * @return A hash code for the key. */ -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 uhash_hashUnicodeString(const UElement key); /** @@ -630,7 +649,7 @@ uhash_hashUnicodeString(const UElement key); * @param key The string (const char*) to hash. * @return A hash code for the key. */ -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 uhash_hashCaselessUnicodeString(const UElement key); /******************************************************************** @@ -642,7 +661,7 @@ uhash_hashCaselessUnicodeString(const UElement key); * @param key The string (const char*) to hash. * @return A hash code for the key. */ -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 uhash_hashLong(const UHashTok key); /** @@ -651,7 +670,7 @@ uhash_hashLong(const UHashTok key); * @param Key2 The integer for comparison * @return true if key1 and key2 are equal, return false otherwise */ -U_CAPI UBool U_EXPORT2 +U_CAPI UBool U_EXPORT2 uhash_compareLong(const UHashTok key1, const UHashTok key2); /******************************************************************** @@ -662,7 +681,7 @@ uhash_compareLong(const UHashTok key1, const UHashTok key2); * Deleter for Hashtable objects. * @param obj The object to be deleted */ -U_CAPI void U_EXPORT2 +U_CAPI void U_EXPORT2 uhash_deleteHashtable(void *obj); /* Use uprv_free() itself as a deleter for any key or value allocated using uprv_malloc. */ @@ -673,7 +692,7 @@ uhash_deleteHashtable(void *obj); * @param hash2 * @return true if the hashtables are equal and false if not. */ -U_CAPI UBool U_EXPORT2 +U_CAPI UBool U_EXPORT2 uhash_equals(const UHashtable* hash1, const UHashtable* hash2); diff --git a/icu4c/source/i18n/anytrans.cpp b/icu4c/source/i18n/anytrans.cpp index e7d5375d693..d06469e2ae2 100644 --- a/icu4c/source/i18n/anytrans.cpp +++ b/icu4c/source/i18n/anytrans.cpp @@ -31,9 +31,13 @@ static const UChar TARGET_SEP = 45; // '-' static const UChar VARIANT_SEP = 47; // '/' -static const UChar ANY[] = {65,110,121,0}; // "Any" +static const UChar ANY[] = {0x41,0x6E,0x79,0}; // "Any" static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null" -static const UChar LATIN_PIVOT[] = {45,76,97,116,105,110,59,76,97,116,105,110,45,0}; // "-Latin;Latin-" +static const UChar LATIN_PIVOT[] = {0x2D,0x4C,0x61,0x74,0x6E,0x3B,0x4C,0x61,0x74,0x6E,0x2D,0}; // "-Latn;Latn-" + +// initial size for an Any-XXXX transform's cache of script-XXXX transforms +// (will grow as necessary, but we don't expect to have source text with more than 7 scripts) +#define ANY_TRANS_CACHE_INIT_SIZE 7 //------------------------------------------------------------ @@ -186,7 +190,7 @@ AnyTransliterator::AnyTransliterator(const UnicodeString& id, Transliterator(id, NULL), targetScript(theTargetScript) { - cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec); + cache = uhash_openSize(uhash_hashLong, uhash_compareLong, NULL, ANY_TRANS_CACHE_INIT_SIZE, &ec); if (U_FAILURE(ec)) { return; } @@ -212,7 +216,7 @@ AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) : { // Don't copy the cache contents UErrorCode ec = U_ZERO_ERROR; - cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec); + cache = uhash_openSize(uhash_hashLong, uhash_compareLong, NULL, ANY_TRANS_CACHE_INIT_SIZE, &ec); if (U_FAILURE(ec)) { return; } @@ -286,7 +290,7 @@ Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const { } if (t == NULL) { UErrorCode ec = U_ZERO_ERROR; - UnicodeString sourceName(uscript_getName(source), -1, US_INV); + UnicodeString sourceName(uscript_getShortName(source), -1, US_INV); UnicodeString id(sourceName); id.append(TARGET_SEP).append(target); diff --git a/icu4c/source/i18n/transreg.cpp b/icu4c/source/i18n/transreg.cpp index 3dadf792b24..d57f1315343 100644 --- a/icu4c/source/i18n/transreg.cpp +++ b/icu4c/source/i18n/transreg.cpp @@ -46,11 +46,29 @@ static const UChar LOCALE_SEP = 95; // '_' //static const UChar VARIANT_SEP = 0x002F; // '/' // String constants -static const UChar ANY[] = { 65, 110, 121, 0 }; // Any +static const UChar ANY[] = { 0x41, 0x6E, 0x79, 0 }; // Any +static const UChar LAT[] = { 0x4C, 0x61, 0x74, 0 }; // Lat // empty string #define NO_VARIANT UnicodeString() +// initial estimate for specDAG size +// ICU 60 Transliterator::countAvailableSources() +#define SPECDAG_INIT_SIZE 149 + +// initial estimate for number of variant names +#define VARIANT_LIST_INIT_SIZE 11 +#define VARIANT_LIST_MAX_SIZE 31 + +// initial estimate for availableIDs count (default estimate is 8 => multiple reallocs) +// ICU 60 Transliterator::countAvailableIDs() +#define AVAILABLE_IDS_INIT_SIZE 641 + +// initial estimate for number of targets for source "Any", "Lat" +// ICU 60 Transliterator::countAvailableTargets("Any")/("Latn") +#define ANY_TARGETS_INIT_SIZE 125 +#define LAT_TARGETS_INIT_SIZE 23 + /** * Resource bundle key for the RuleBasedTransliterator rule. */ @@ -517,10 +535,17 @@ U_CDECL_END TransliteratorRegistry::TransliteratorRegistry(UErrorCode& status) : registry(TRUE, status), - specDAG(TRUE, status), - availableIDs(status) + specDAG(TRUE, SPECDAG_INIT_SIZE, status), + variantList(VARIANT_LIST_INIT_SIZE, status), + availableIDs(AVAILABLE_IDS_INIT_SIZE, status) { registry.setValueDeleter(deleteEntry); + variantList.setDeleter(uprv_deleteUObject); + variantList.setComparer(uhash_compareCaselessUnicodeString); + UnicodeString *emptyString = new UnicodeString(); + if (emptyString != NULL) { + variantList.addElement(emptyString, status); + } availableIDs.setDeleter(uprv_deleteUObject); availableIDs.setComparer(uhash_compareCaselessUnicodeString); specDAG.setValueDeleter(uhash_deleteHashtable); @@ -781,9 +806,15 @@ int32_t TransliteratorRegistry::countAvailableVariants(const UnicodeString& sour if (targets == 0) { return 0; } - UVector *variants = (UVector*) targets->get(target); - // variants may be 0 if the source/target are invalid - return (variants == 0) ? 0 : variants->size(); + int32_t varMask = targets->geti(target); + int32_t varCount = 0; + while (varMask > 0) { + if (varMask & 1) { + varCount++; + } + varMask >>= 1; + } + return varCount; } UnicodeString& TransliteratorRegistry::getAvailableVariant(int32_t index, @@ -795,17 +826,25 @@ UnicodeString& TransliteratorRegistry::getAvailableVariant(int32_t index, result.truncate(0); // invalid source return result; } - UVector *variants = (UVector*) targets->get(target); - if (variants == 0) { - result.truncate(0); // invalid target - return result; - } - UnicodeString *v = (UnicodeString*) variants->elementAt(index); - if (v == 0) { - result.truncate(0); // invalid index - } else { - result = *v; + int32_t varMask = targets->geti(target); + int32_t varCount = 0; + int32_t varListIndex = 0; + while (varMask > 0) { + if (varMask & 1) { + if (varCount == index) { + UnicodeString *v = (UnicodeString*) variantList.elementAt(varListIndex); + if (v != NULL) { + result = *v; + return result; + } + break; + } + varCount++; + } + varMask >>= 1; + varListIndex++; } + result.truncate(0); // invalid target or index return result; } @@ -911,9 +950,9 @@ void TransliteratorRegistry::registerEntry(const UnicodeString& ID, UnicodeString *newID = (UnicodeString *)ID.clone(); // Check to make sure newID was created. if (newID != NULL) { - // NUL-terminate the ID string - newID->getTerminatedBuffer(); - availableIDs.addElement(newID, status); + // NUL-terminate the ID string + newID->getTerminatedBuffer(); + availableIDs.addElement(newID, status); } } } else { @@ -924,9 +963,7 @@ void TransliteratorRegistry::registerEntry(const UnicodeString& ID, /** * Register a source-target/variant in the specDAG. Variant may be - * empty, but source and target must not be. If variant is empty then - * the special variant NO_VARIANT is stored in slot zero of the - * UVector of variants. + * empty, but source and target must not be. */ void TransliteratorRegistry::registerSTV(const UnicodeString& source, const UnicodeString& target, @@ -936,39 +973,38 @@ void TransliteratorRegistry::registerSTV(const UnicodeString& source, UErrorCode status = U_ZERO_ERROR; Hashtable *targets = (Hashtable*) specDAG.get(source); if (targets == 0) { - targets = new Hashtable(TRUE, status); - if (U_FAILURE(status) || targets == 0) { + int32_t size = 3; + if (source.compare(ANY,3) == 0) { + size = ANY_TARGETS_INIT_SIZE; + } else if (source.compare(LAT,3) == 0) { + size = LAT_TARGETS_INIT_SIZE; + } + targets = new Hashtable(TRUE, size, status); + if (U_FAILURE(status) || targets == NULL) { return; } - targets->setValueDeleter(uprv_deleteUObject); specDAG.put(source, targets, status); } - UVector *variants = (UVector*) targets->get(target); - if (variants == 0) { - variants = new UVector(uprv_deleteUObject, - uhash_compareCaselessUnicodeString, status); - if (variants == 0) { + int32_t variantListIndex = variantList.indexOf((void*) &variant, 0); + if (variantListIndex < 0) { + if (variantList.size() >= VARIANT_LIST_MAX_SIZE) { + // can't handle any more variants return; } - targets->put(target, variants, status); - } - // assert(NO_VARIANT == ""); - // We add the variant string. If it is the special "no variant" - // string, that is, the empty string, we add it at position zero. - if (!variants->contains((void*) &variant)) { - UnicodeString *tempus; // Used for null pointer check. - if (variant.length() > 0) { - tempus = new UnicodeString(variant); - if (tempus != NULL) { - variants->addElement(tempus, status); - } - } else { - tempus = new UnicodeString(); // = NO_VARIANT - if (tempus != NULL) { - variants->insertElementAt(tempus, 0, status); - } + UnicodeString *variantEntry = new UnicodeString(variant); + if (variantEntry != NULL) { + variantList.addElement(variantEntry, status); + if (U_SUCCESS(status)) { + variantListIndex = variantList.size() - 1; + } + } + if (variantListIndex < 0) { + return; } } + int32_t addMask = 1 << variantListIndex; + int32_t varMask = targets->geti(target); + targets->puti(target, varMask | addMask, status); } /** @@ -979,17 +1015,24 @@ void TransliteratorRegistry::removeSTV(const UnicodeString& source, const UnicodeString& variant) { // assert(source.length() > 0); // assert(target.length() > 0); -// UErrorCode status = U_ZERO_ERROR; + UErrorCode status = U_ZERO_ERROR; Hashtable *targets = (Hashtable*) specDAG.get(source); - if (targets == 0) { + if (targets == NULL) { return; // should never happen for valid s-t/v } - UVector *variants = (UVector*) targets->get(target); - if (variants == 0) { + int32_t varMask = targets->geti(target); + if (varMask == 0) { return; // should never happen for valid s-t/v } - variants->removeElement((void*) &variant); - if (variants->size() == 0) { + int32_t variantListIndex = variantList.indexOf((void*) &variant, 0); + if (variantListIndex < 0) { + return; // should never happen for valid s-t/v + } + int32_t remMask = 1 << variantListIndex; + varMask &= (~remMask); + if (varMask != 0) { + targets->puti(target, varMask, status); + } else { targets->remove(target); // should delete variants if (targets->count() == 0) { specDAG.remove(source); // should delete targets @@ -1281,8 +1324,8 @@ Transliterator* TransliteratorRegistry::instantiateEntry(const UnicodeString& ID UVector* rbts = new UVector(entry->u.dataVector->size(), status); // Check for null pointer if (rbts == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - return NULL; + status = U_MEMORY_ALLOCATION_ERROR; + return NULL; } int32_t passNumber = 1; for (int32_t i = 0; U_SUCCESS(status) && i < entry->u.dataVector->size(); i++) { diff --git a/icu4c/source/i18n/transreg.h b/icu4c/source/i18n/transreg.h index 6fc35c8247b..041244e1b02 100644 --- a/icu4c/source/i18n/transreg.h +++ b/icu4c/source/i18n/transreg.h @@ -440,13 +440,15 @@ class TransliteratorRegistry : public UMemory { /** * DAG of visible IDs by spec. Hashtable: source => (Hashtable: - * target => (UVector: variant)) The UVector of variants is never - * empty. For a source-target with no variant, the special - * variant NO_VARIANT (the empty string) is stored in slot zero of - * the UVector. + * target => variant bitmask) */ Hashtable specDAG; + /** + * Vector of all variant names + */ + UVector variantList; + /** * Vector of public full IDs. */ diff --git a/icu4c/source/test/cintltst/utransts.c b/icu4c/source/test/cintltst/utransts.c index 40bd2d6ebe0..66d6580d993 100644 --- a/icu4c/source/test/cintltst/utransts.c +++ b/icu4c/source/test/cintltst/utransts.c @@ -33,6 +33,7 @@ static void TestRegisterUnregister(void); static void TestExtractBetween(void); static void TestUnicodeIDs(void); static void TestGetRulesAndSourceSet(void); +static void TestDataVariantsCompounds(void); static void _expectRules(const char*, const char*, const char*); static void _expect(const UTransliterator* trans, const char* cfrom, const char* cto); @@ -51,6 +52,7 @@ addUTransTest(TestNode** root) { TEST(TestExtractBetween); TEST(TestUnicodeIDs); TEST(TestGetRulesAndSourceSet); + TEST(TestDataVariantsCompounds); } /*------------------------------------------------------------------ @@ -638,6 +640,65 @@ static void TestGetRulesAndSourceSet() { } } +typedef struct { + const char * transID; + const char * sourceText; + const char * targetText; +} TransIDSourceTarg; + +static const TransIDSourceTarg dataVarCompItems[] = { + { "Simplified-Traditional", + "\\u4E0B\\u9762\\u662F\\u4E00\\u4E9B\\u4ECE\\u7B80\\u4F53\\u8F6C\\u6362\\u4E3A\\u7E41\\u4F53\\u5B57\\u793A\\u4F8B\\u6587\\u672C\\u3002", + "\\u4E0B\\u9762\\u662F\\u4E00\\u4E9B\\u5F9E\\u7C21\\u9AD4\\u8F49\\u63DB\\u70BA\\u7E41\\u9AD4\\u5B57\\u793A\\u4F8B\\u6587\\u672C\\u3002" }, + { "Halfwidth-Fullwidth", + "Sample text, \\uFF7B\\uFF9D\\uFF8C\\uFF9F\\uFF99\\uFF83\\uFF77\\uFF7D\\uFF84.", + "\\uFF33\\uFF41\\uFF4D\\uFF50\\uFF4C\\uFF45\\u3000\\uFF54\\uFF45\\uFF58\\uFF54\\uFF0C\\u3000\\u30B5\\u30F3\\u30D7\\u30EB\\u30C6\\u30AD\\u30B9\\u30C8\\uFF0E" }, + { "Han-Latin/Names; Latin-Bopomofo", + "\\u4E07\\u4FDF\\u919C\\u5974\\u3001\\u533A\\u695A\\u826F\\u3001\\u4EFB\\u70E8\\u3001\\u5CB3\\u98DB", + "\\u3107\\u311B\\u02CB \\u3111\\u3127\\u02CA \\u3114\\u3121\\u02C7 \\u310B\\u3128\\u02CA\\u3001 \\u3121 \\u3114\\u3128\\u02C7 \\u310C\\u3127\\u3124\\u02CA\\u3001 \\u3116\\u3123\\u02CA \\u3127\\u311D\\u02CB\\u3001 \\u3129\\u311D\\u02CB \\u3108\\u311F" }, + { "Greek-Latin", + "\\u1F08 \\u1FBC \\u1F89 \\u1FEC", + "A \\u0100I H\\u0100I RH" }, + { "Greek-Latin/BGN", + "\\u1F08 \\u1FBC \\u1F89 \\u1FEC", + "A\\u0313 A\\u0345 A\\u0314\\u0345 \\u1FEC" }, + { "Greek-Latin/UNGEGN", + "\\u1F08 \\u1FBC \\u1F89 \\u1FEC", + "A A A R" }, + { NULL, NULL, NULL } +}; + +enum { kBBufMax = 384 }; +static void TestDataVariantsCompounds() { + const TransIDSourceTarg* itemsPtr; + for (itemsPtr = dataVarCompItems; itemsPtr->transID != NULL; itemsPtr++) { + UErrorCode status = U_ZERO_ERROR; + UChar utrid[kUBufMax]; + int32_t utridlen = u_unescape(itemsPtr->transID, utrid, kUBufMax); + UTransliterator* utrans = utrans_openU(utrid, utridlen, UTRANS_FORWARD, NULL, 0, NULL, &status); + if (U_FAILURE(status)) { + log_data_err("FAIL: utrans_openRules(%s) failed, error=%s (Are you missing data?)\n", itemsPtr->transID, u_errorName(status)); + continue; + } + UChar text[kUBufMax]; + int32_t textLen = u_unescape(itemsPtr->sourceText, text, kUBufMax); + int32_t textLim = textLen; + utrans_transUChars(utrans, text, &textLen, kUBufMax, 0, &textLim, &status); + if (U_FAILURE(status)) { + log_err("FAIL: utrans_transUChars(%s) failed, error=%s\n", itemsPtr->transID, u_errorName(status)); + } else { + UChar expect[kUBufMax]; + int32_t expectLen = u_unescape(itemsPtr->targetText, expect, kUBufMax); + if (textLen != expectLen || u_strncmp(text, expect, textLen) != 0) { + char btext[kBBufMax], bexpect[kBBufMax]; + u_austrncpy(btext, text, textLen); + u_austrncpy(bexpect, expect, expectLen); + log_err("FAIL: utrans_transUChars(%s),\n expect %s\n get %s\n", itemsPtr->transID, bexpect, btext); + } + } + utrans_close(utrans); + } +} static void _expectRules(const char* crules, const char* cfrom, -- 2.40.0