From: Markus Scherer Date: Sat, 21 Dec 2019 14:48:17 +0000 (-0800) Subject: ICU-20917 LocaleMatcher: prefer a more-default locale X-Git-Tag: release-67-rc~125 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=60b567d6abc707d256e4049bb80a4e9d2a93a021;p=icu ICU-20917 LocaleMatcher: prefer a more-default locale --- diff --git a/icu4c/source/common/locdistance.cpp b/icu4c/source/common/locdistance.cpp index 50633cc8289..4304fab298c 100644 --- a/icu4c/source/common/locdistance.cpp +++ b/icu4c/source/common/locdistance.cpp @@ -69,7 +69,7 @@ void U_CALLCONV LocaleDistance::initLocaleDistance(UErrorCode &errorCode) { errorCode = U_MISSING_RESOURCE_ERROR; return; } - gLocaleDistance = new LocaleDistance(data); + gLocaleDistance = new LocaleDistance(data, likely); if (gLocaleDistance == nullptr) { errorCode = U_MEMORY_ALLOCATION_ERROR; return; @@ -83,7 +83,8 @@ const LocaleDistance *LocaleDistance::getSingleton(UErrorCode &errorCode) { return gLocaleDistance; } -LocaleDistance::LocaleDistance(const LocaleDistanceData &data) : +LocaleDistance::LocaleDistance(const LocaleDistanceData &data, const XLikelySubtags &likely) : + likelySubtags(likely), trie(data.distanceTrieBytes), regionToPartitionsIndex(data.regionToPartitions), partitionArrays(data.partitions), paradigmLSRs(data.paradigms), paradigmLSRsLength(data.paradigmsLength), @@ -122,6 +123,8 @@ int32_t LocaleDistance::getBestIndexAndDistance( uint64_t desLangState = desLangDistance >= 0 && supportedLSRsLength > 1 ? iter.getState64() : 0; // Index of the supported LSR with the lowest distance. int32_t bestIndex = -1; + // Cached lookup info from XLikelySubtags.compareLikely(). 
+ int32_t bestLikelyInfo = -1; for (int32_t slIndex = 0; slIndex < supportedLSRsLength; ++slIndex) { const LSR &supported = *supportedLSRs[slIndex]; bool star = false; @@ -207,13 +210,29 @@ int32_t LocaleDistance::getBestIndexAndDistance( // Distinguish between equivalent but originally unequal locales via an // additional micro distance. shiftedDistance |= (desired.flags ^ supported.flags); - } - if (shiftedDistance < shiftedThreshold) { - if (shiftedDistance == 0) { - return slIndex << INDEX_SHIFT; + if (shiftedDistance < shiftedThreshold) { + if (shiftedDistance == 0) { + return slIndex << INDEX_SHIFT; + } + bestIndex = slIndex; + shiftedThreshold = shiftedDistance; + bestLikelyInfo = -1; + } + } else { + if (shiftedDistance < shiftedThreshold) { + bestIndex = slIndex; + shiftedThreshold = shiftedDistance; + bestLikelyInfo = -1; + } else if (shiftedDistance == shiftedThreshold && bestIndex >= 0) { + bestLikelyInfo = likelySubtags.compareLikely( + supported, *supportedLSRs[bestIndex], bestLikelyInfo); + if ((bestLikelyInfo & 1) != 0) { + // This supported locale matches as well as the previous best match, + // and neither matches perfectly, + // but this one is "more likely" (has more-default subtags). + bestIndex = slIndex; + } } - bestIndex = slIndex; - shiftedThreshold = shiftedDistance; } } return bestIndex >= 0 ? 
diff --git a/icu4c/source/common/locdistance.h b/icu4c/source/common/locdistance.h index 0ee3d0e63e9..88fd73f689c 100644 --- a/icu4c/source/common/locdistance.h +++ b/icu4c/source/common/locdistance.h @@ -82,7 +82,7 @@ private: return (indexAndDistance & DISTANCE_MASK) >> DISTANCE_SHIFT; } - LocaleDistance(const LocaleDistanceData &data); + LocaleDistance(const LocaleDistanceData &data, const XLikelySubtags &likely); LocaleDistance(const LocaleDistance &other) = delete; LocaleDistance &operator=(const LocaleDistance &other) = delete; @@ -110,6 +110,8 @@ private: return defaultRegionDistance; } + const XLikelySubtags &likelySubtags; + // The trie maps each dlang+slang+dscript+sscript+dregion+sregion // (encoded in ASCII with bit 7 set on the last character of each subtag) to a distance. // There is also a trie value for each subsequence of whole subtags. diff --git a/icu4c/source/common/loclikelysubtags.cpp b/icu4c/source/common/loclikelysubtags.cpp index 27f10b3fb92..1fbf1a14632 100644 --- a/icu4c/source/common/loclikelysubtags.cpp +++ b/icu4c/source/common/loclikelysubtags.cpp @@ -557,6 +557,106 @@ LSR XLikelySubtags::maximize(const char *language, const char *script, const cha return LSR(language, script, region, retainOldMask); } +int32_t XLikelySubtags::compareLikely(const LSR &lsr, const LSR &other, int32_t likelyInfo) const { + // If likelyInfo >= 0: + // likelyInfo bit 1 is set if the previous comparison with lsr + // was for equal language and script. + // Otherwise the scripts differed. 
+ if (uprv_strcmp(lsr.language, other.language) != 0) { + return 0xfffffffc; // negative, lsr not better than other + } + if (uprv_strcmp(lsr.script, other.script) != 0) { + int32_t index; + if (likelyInfo >= 0 && (likelyInfo & 2) == 0) { + index = likelyInfo >> 2; + } else { + index = getLikelyIndex(lsr.language, ""); + likelyInfo = index << 2; + } + const LSR &likely = lsrs[index]; + if (uprv_strcmp(lsr.script, likely.script) == 0) { + return likelyInfo | 1; + } else { + return likelyInfo & ~1; + } + } + if (uprv_strcmp(lsr.region, other.region) != 0) { + int32_t index; + if (likelyInfo >= 0 && (likelyInfo & 2) != 0) { + index = likelyInfo >> 2; + } else { + index = getLikelyIndex(lsr.language, lsr.region); + likelyInfo = (index << 2) | 2; + } + const LSR &likely = lsrs[index]; + if (uprv_strcmp(lsr.region, likely.region) == 0) { + return likelyInfo | 1; + } else { + return likelyInfo & ~1; + } + } + return likelyInfo & ~1; // lsr not better than other +} + +// Subset of maximize(). +int32_t XLikelySubtags::getLikelyIndex(const char *language, const char *script) const { + if (uprv_strcmp(language, "und") == 0) { + language = ""; + } + if (uprv_strcmp(script, "Zzzz") == 0) { + script = ""; + } + + BytesTrie iter(trie); + uint64_t state; + int32_t value; + // Small optimization: Array lookup for first language letter. + int32_t c0; + if (0 <= (c0 = uprv_lowerOrdinal(language[0])) && c0 <= 25 && + language[1] != 0 && // language.length() >= 2 + (state = trieFirstLetterStates[c0]) != 0) { + value = trieNext(iter.resetToState64(state), language, 1); + } else { + value = trieNext(iter, language, 0); + } + if (value >= 0) { + state = iter.getState64(); + } else { + iter.resetToState64(trieUndState); // "und" ("*") + state = 0; + } + + if (value > 0) { + // Intermediate or final value from just language. 
+ if (value == SKIP_SCRIPT) { + value = 0; + } + } else { + value = trieNext(iter, script, 0); + if (value >= 0) { + state = iter.getState64(); + } else { + if (state == 0) { + iter.resetToState64(trieUndZzzzState); // "und-Zzzz" ("**") + } else { + iter.resetToState64(state); + value = trieNext(iter, "", 0); + U_ASSERT(value >= 0); + state = iter.getState64(); + } + } + } + + if (value > 0) { + // Final value from just language or language+script. + } else { + value = trieNext(iter, "", 0); + U_ASSERT(value > 0); + } + U_ASSERT(value < lsrsLength); + return value; +} + int32_t XLikelySubtags::trieNext(BytesTrie &iter, const char *s, int32_t i) { UStringTrieResult result; uint8_t c; diff --git a/icu4c/source/common/loclikelysubtags.h b/icu4c/source/common/loclikelysubtags.h index 8c8a08ac5e3..90ddfffaca6 100644 --- a/icu4c/source/common/loclikelysubtags.h +++ b/icu4c/source/common/loclikelysubtags.h @@ -85,6 +85,18 @@ public: // VisibleForTesting LSR makeMaximizedLsrFrom(const Locale &locale, UErrorCode &errorCode) const; + /** + * Tests whether lsr is "more likely" than other. + * For example, fr-Latn-FR is more likely than fr-Latn-CH because + * FR is the default region for fr-Latn. + * + * The likelyInfo caches lookup information between calls. + * The return value is an updated likelyInfo value, + * with bit 0 set if lsr is "more likely". + * The initial value of likelyInfo must be negative. + */ + int32_t compareLikely(const LSR &lsr, const LSR &other, int32_t likelyInfo) const; + // TODO(ICU-20777): Switch Locale/uloc_ likely-subtags API from the old code // in loclikely.cpp to this new code, including activating this // minimizeSubtags() function. The LocaleMatcher does not minimize. 
@@ -111,6 +123,8 @@ private: */ LSR maximize(const char *language, const char *script, const char *region) const; + int32_t getLikelyIndex(const char *language, const char *script) const; + static int32_t trieNext(BytesTrie &iter, const char *s, int32_t i); UResourceBundle *langInfoBundle; diff --git a/icu4c/source/test/testdata/localeMatcherTest.txt b/icu4c/source/test/testdata/localeMatcherTest.txt index 649c95baea5..6e5dbdd1ef6 100644 --- a/icu4c/source/test/testdata/localeMatcherTest.txt +++ b/icu4c/source/test/testdata/localeMatcherTest.txt @@ -733,7 +733,7 @@ ja >> fr @favor=script en-GB >> en-GB en-US >> en -fr >> en-GB +fr >> en ja >> fr ** test: testEmptyWithDefault @@ -761,8 +761,8 @@ en-GB >> en-GB en-US >> en fr-FR >> fr ja-JP >> fr +zu >> en # For a language that doesn't match anything, return the default. -zu >> en-GB zxx >> fr @favor=script @@ -770,7 +770,7 @@ en-GB >> en-GB en-US >> en fr-FR >> fr ja-JP >> fr -zu >> en-GB +zu >> en zxx >> en ** test: TestExactMatch @@ -1322,7 +1322,7 @@ en >> en-US @favor=script und >> und ja >> und -fr-CA >> en-GB +fr-CA >> en-US en-AU >> en-GB en-BZ >> en-GB en-CA >> en-GB @@ -1359,8 +1359,8 @@ fr >> und @supported=en-GB, en-US, en, en-AU und >> und ja >> und -fr-CA >> en-GB -fr >> en-GB +fr-CA >> en-US +fr >> en-US @supported=en-AU, ja, ca fr >> en-AU @supported=pl, ja, ca @@ -1901,7 +1901,7 @@ fr-FR >> fr # Parent match is chosen. fr-FR >> fr-CA # Sibling match is chosen. @supported=fr-CA, fr-FR fr >> fr-FR # Inferred region match is chosen. -fr-SN >> fr-CA +fr-SN >> fr-FR @supported=en, fr-FR fr >> fr-FR # Child match is chosen. 
@supported=de, en, it @@ -1931,7 +1931,7 @@ fr-FR >> fr fr-FR >> fr-CA @supported=fr-CA, fr-FR fr >> fr-FR -fr-SN >> fr-CA +fr-SN >> fr-FR @supported=en, fr-FR fr >> fr-FR @supported=de, en, it @@ -1951,3 +1951,10 @@ ru >> uk zh-CN >> zh-TW @supported=ja ru >> und + +** test: favor a more-default locale among equally imperfect matches +@supported=fr-CA, fr-CH, fr-FR, fr-GB +fr-SN >> fr-FR +@supported=sr-Latn, sr-Cyrl, sr-Grek +@threshold=60 +sr-Thai >> sr-Cyrl diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LocaleDistance.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LocaleDistance.java index b5bd4dfe096..fce5a9c1c71 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LocaleDistance.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LocaleDistance.java @@ -255,6 +255,8 @@ public class LocaleDistance { long desLangState = desLangDistance >= 0 && supportedLSRs.length > 1 ? iter.getState64() : 0; // Index of the supported LSR with the lowest distance. int bestIndex = -1; + // Cached lookup info from XLikelySubtags.compareLikely(). + int bestLikelyInfo = -1; for (int slIndex = 0; slIndex < supportedLSRs.length; ++slIndex) { LSR supported = supportedLSRs[slIndex]; boolean star = false; @@ -340,13 +342,29 @@ public class LocaleDistance { // Distinguish between equivalent but originally unequal locales via an // additional micro distance. 
shiftedDistance |= (desired.flags ^ supported.flags); - } - if (shiftedDistance < shiftedThreshold) { - if (shiftedDistance == 0) { - return slIndex << INDEX_SHIFT; + if (shiftedDistance < shiftedThreshold) { + if (shiftedDistance == 0) { + return slIndex << INDEX_SHIFT; + } + bestIndex = slIndex; + shiftedThreshold = shiftedDistance; + bestLikelyInfo = -1; + } + } else { + if (shiftedDistance < shiftedThreshold) { + bestIndex = slIndex; + shiftedThreshold = shiftedDistance; + bestLikelyInfo = -1; + } else if (shiftedDistance == shiftedThreshold && bestIndex >= 0) { + bestLikelyInfo = XLikelySubtags.INSTANCE.compareLikely( + supported, supportedLSRs[bestIndex], bestLikelyInfo); + if ((bestLikelyInfo & 1) != 0) { + // This supported locale matches as well as the previous best match, + // and neither matches perfectly, + // but this one is "more likely" (has more-default subtags). + bestIndex = slIndex; + } } - bestIndex = slIndex; - shiftedThreshold = shiftedDistance; } } return bestIndex >= 0 ? diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLikelySubtags.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLikelySubtags.java index 543aadef159..332f0351286 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLikelySubtags.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLikelySubtags.java @@ -367,6 +367,114 @@ public final class XLikelySubtags { return new LSR(language, script, region, retainOldMask); } + /** + * Tests whether lsr is "more likely" than other. + * For example, fr-Latn-FR is more likely than fr-Latn-CH because + * FR is the default region for fr-Latn. + * + *

<p>The likelyInfo caches lookup information between calls. + * The return value is an updated likelyInfo value, + * with bit 0 set if lsr is "more likely". + * The initial value of likelyInfo must be negative. + */ + int compareLikely(LSR lsr, LSR other, int likelyInfo) { + // If likelyInfo >= 0: + // likelyInfo bit 1 is set if the previous comparison with lsr + // was for equal language and script. + // Otherwise the scripts differed. + if (!lsr.language.equals(other.language)) { + return 0xfffffffc; // negative, lsr not better than other + } + if (!lsr.script.equals(other.script)) { + int index; + if (likelyInfo >= 0 && (likelyInfo & 2) == 0) { + index = likelyInfo >> 2; + } else { + index = getLikelyIndex(lsr.language, ""); + likelyInfo = index << 2; + } + LSR likely = lsrs[index]; + if (lsr.script.equals(likely.script)) { + return likelyInfo | 1; + } else { + return likelyInfo & ~1; + } + } + if (!lsr.region.equals(other.region)) { + int index; + if (likelyInfo >= 0 && (likelyInfo & 2) != 0) { + index = likelyInfo >> 2; + } else { + index = getLikelyIndex(lsr.language, lsr.region); + likelyInfo = (index << 2) | 2; + } + LSR likely = lsrs[index]; + if (lsr.region.equals(likely.region)) { + return likelyInfo | 1; + } else { + return likelyInfo & ~1; + } + } + return likelyInfo & ~1; // lsr not better than other + } + + // Subset of maximize(). + private int getLikelyIndex(String language, String script) { + if (language.equals("und")) { + language = ""; + } + if (script.equals("Zzzz")) { + script = ""; + } + + BytesTrie iter = new BytesTrie(trie); + long state; + int value; + // Small optimization: Array lookup for first language letter. 
+ int c0; + if (language.length() >= 2 && 0 <= (c0 = language.charAt(0) - 'a') && c0 <= 25 && + (state = trieFirstLetterStates[c0]) != 0) { + value = trieNext(iter.resetToState64(state), language, 1); + } else { + value = trieNext(iter, language, 0); + } + if (value >= 0) { + state = iter.getState64(); + } else { + iter.resetToState64(trieUndState); // "und" ("*") + state = 0; + } + + if (value > 0) { + // Intermediate or final value from just language. + if (value == SKIP_SCRIPT) { + value = 0; + } + } else { + value = trieNext(iter, script, 0); + if (value >= 0) { + state = iter.getState64(); + } else { + if (state == 0) { + iter.resetToState64(trieUndZzzzState); // "und-Zzzz" ("**") + } else { + iter.resetToState64(state); + value = trieNext(iter, "", 0); + assert value >= 0; + state = iter.getState64(); + } + } + } + + if (value > 0) { + // Final value from just language or language+script. + } else { + value = trieNext(iter, "", 0); + assert value > 0; + } + return value; + } + private static final int trieNext(BytesTrie iter, String s, int i) { BytesTrie.Result result; if (s.isEmpty()) { diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeMatcherTest.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeMatcherTest.txt index 649c95baea5..6e5dbdd1ef6 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeMatcherTest.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeMatcherTest.txt @@ -733,7 +733,7 @@ ja >> fr @favor=script en-GB >> en-GB en-US >> en -fr >> en-GB +fr >> en ja >> fr ** test: testEmptyWithDefault @@ -761,8 +761,8 @@ en-GB >> en-GB en-US >> en fr-FR >> fr ja-JP >> fr +zu >> en # For a language that doesn't match anything, return the default. 
-zu >> en-GB zxx >> fr @favor=script @@ -770,7 +770,7 @@ en-GB >> en-GB en-US >> en fr-FR >> fr ja-JP >> fr -zu >> en-GB +zu >> en zxx >> en ** test: TestExactMatch @@ -1322,7 +1322,7 @@ en >> en-US @favor=script und >> und ja >> und -fr-CA >> en-GB +fr-CA >> en-US en-AU >> en-GB en-BZ >> en-GB en-CA >> en-GB @@ -1359,8 +1359,8 @@ fr >> und @supported=en-GB, en-US, en, en-AU und >> und ja >> und -fr-CA >> en-GB -fr >> en-GB +fr-CA >> en-US +fr >> en-US @supported=en-AU, ja, ca fr >> en-AU @supported=pl, ja, ca @@ -1901,7 +1901,7 @@ fr-FR >> fr # Parent match is chosen. fr-FR >> fr-CA # Sibling match is chosen. @supported=fr-CA, fr-FR fr >> fr-FR # Inferred region match is chosen. -fr-SN >> fr-CA +fr-SN >> fr-FR @supported=en, fr-FR fr >> fr-FR # Child match is chosen. @supported=de, en, it @@ -1931,7 +1931,7 @@ fr-FR >> fr fr-FR >> fr-CA @supported=fr-CA, fr-FR fr >> fr-FR -fr-SN >> fr-CA +fr-SN >> fr-FR @supported=en, fr-FR fr >> fr-FR @supported=de, en, it @@ -1951,3 +1951,10 @@ ru >> uk zh-CN >> zh-TW @supported=ja ru >> und + +** test: favor a more-default locale among equally imperfect matches +@supported=fr-CA, fr-CH, fr-FR, fr-GB +fr-SN >> fr-FR +@supported=sr-Latn, sr-Cyrl, sr-Grek +@threshold=60 +sr-Thai >> sr-Cyrl