errorCode = U_MISSING_RESOURCE_ERROR;
return;
}
- gLocaleDistance = new LocaleDistance(data);
+ gLocaleDistance = new LocaleDistance(data, likely);
if (gLocaleDistance == nullptr) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return;
return gLocaleDistance;
}
-LocaleDistance::LocaleDistance(const LocaleDistanceData &data) :
+LocaleDistance::LocaleDistance(const LocaleDistanceData &data, const XLikelySubtags &likely) :
+ likelySubtags(likely),
trie(data.distanceTrieBytes),
regionToPartitionsIndex(data.regionToPartitions), partitionArrays(data.partitions),
paradigmLSRs(data.paradigms), paradigmLSRsLength(data.paradigmsLength),
uint64_t desLangState = desLangDistance >= 0 && supportedLSRsLength > 1 ? iter.getState64() : 0;
// Index of the supported LSR with the lowest distance.
int32_t bestIndex = -1;
+ // Cached lookup info from XLikelySubtags.compareLikely().
+ int32_t bestLikelyInfo = -1;
for (int32_t slIndex = 0; slIndex < supportedLSRsLength; ++slIndex) {
const LSR &supported = *supportedLSRs[slIndex];
bool star = false;
// Distinguish between equivalent but originally unequal locales via an
// additional micro distance.
shiftedDistance |= (desired.flags ^ supported.flags);
- }
- if (shiftedDistance < shiftedThreshold) {
- if (shiftedDistance == 0) {
- return slIndex << INDEX_SHIFT;
+ if (shiftedDistance < shiftedThreshold) {
+ if (shiftedDistance == 0) {
+ return slIndex << INDEX_SHIFT;
+ }
+ bestIndex = slIndex;
+ shiftedThreshold = shiftedDistance;
+ bestLikelyInfo = -1;
+ }
+ } else {
+ if (shiftedDistance < shiftedThreshold) {
+ bestIndex = slIndex;
+ shiftedThreshold = shiftedDistance;
+ bestLikelyInfo = -1;
+ } else if (shiftedDistance == shiftedThreshold && bestIndex >= 0) {
+ bestLikelyInfo = likelySubtags.compareLikely(
+ supported, *supportedLSRs[bestIndex], bestLikelyInfo);
+ if ((bestLikelyInfo & 1) != 0) {
+ // This supported locale matches as well as the previous best match,
+ // and neither matches perfectly,
+ // but this one is "more likely" (has more-default subtags).
+ bestIndex = slIndex;
+ }
}
- bestIndex = slIndex;
- shiftedThreshold = shiftedDistance;
}
}
return bestIndex >= 0 ?
return (indexAndDistance & DISTANCE_MASK) >> DISTANCE_SHIFT;
}
- LocaleDistance(const LocaleDistanceData &data);
+ LocaleDistance(const LocaleDistanceData &data, const XLikelySubtags &likely);
LocaleDistance(const LocaleDistance &other) = delete;
LocaleDistance &operator=(const LocaleDistance &other) = delete;
return defaultRegionDistance;
}
+ const XLikelySubtags &likelySubtags;
+
// The trie maps each dlang+slang+dscript+sscript+dregion+sregion
// (encoded in ASCII with bit 7 set on the last character of each subtag) to a distance.
// There is also a trie value for each subsequence of whole subtags.
return LSR(language, script, region, retainOldMask);
}
+int32_t XLikelySubtags::compareLikely(const LSR &lsr, const LSR &other, int32_t likelyInfo) const {
+ // Returns likelyInfo updated for this comparison; bit 0 of the result is set if lsr is "more likely" than other.
+ // When likelyInfo >= 0 it carries a cached lookup: bits 2 and up hold an index into lsrs from getLikelyIndex(),
+ // and bit 1 is set if the previous comparison with lsr was for equal language and script (the regions differed).
+ // Otherwise (bit 1 clear) the scripts differed.
+ if (uprv_strcmp(lsr.language, other.language) != 0) {  // Different languages: no lookup is done.
+ return 0xfffffffc; // negative (so no lookup is treated as cached), bit 0 clear: lsr not better than other
+ }
+ if (uprv_strcmp(lsr.script, other.script) != 0) {  // Same language, different scripts.
+ int32_t index;
+ if (likelyInfo >= 0 && (likelyInfo & 2) == 0) {  // Reuse the cached index only if it came from a script comparison.
+ index = likelyInfo >> 2;
+ } else {
+ index = getLikelyIndex(lsr.language, "");  // Look up by language with an empty script.
+ likelyInfo = index << 2;  // Cache the index; bit 1 clear marks a script lookup.
+ }
+ const LSR &likely = lsrs[index];
+ if (uprv_strcmp(lsr.script, likely.script) == 0) {  // lsr has the script of the looked-up LSR.
+ return likelyInfo | 1;
+ } else {
+ return likelyInfo & ~1;
+ }
+ }
+ if (uprv_strcmp(lsr.region, other.region) != 0) {  // Same language and script, different regions.
+ int32_t index;
+ if (likelyInfo >= 0 && (likelyInfo & 2) != 0) {  // Reuse the cached index only if it came from a region comparison.
+ index = likelyInfo >> 2;
+ } else {
+ index = getLikelyIndex(lsr.language, lsr.region);  // NOTE(review): the second parameter of getLikelyIndex() is named script, but the region is passed here; confirm whether lsr.script was intended.
+ likelyInfo = (index << 2) | 2;  // Cache the index; bit 1 set marks a region lookup.
+ }
+ const LSR &likely = lsrs[index];
+ if (uprv_strcmp(lsr.region, likely.region) == 0) {  // lsr has the region of the looked-up LSR.
+ return likelyInfo | 1;
+ } else {
+ return likelyInfo & ~1;
+ }
+ }
+ return likelyInfo & ~1; // same language, script and region: lsr not better than other
+}
+
+// Subset of maximize(): walks the trie for language and script only and returns the resulting value, which compareLikely() uses as an index into lsrs.
+int32_t XLikelySubtags::getLikelyIndex(const char *language, const char *script) const {
+ if (uprv_strcmp(language, "und") == 0) {  // "und" is looked up as the empty string.
+ language = "";
+ }
+ if (uprv_strcmp(script, "Zzzz") == 0) {  // "Zzzz" is looked up as the empty string.
+ script = "";
+ }
+
+ BytesTrie iter(trie);
+ uint64_t state;
+ int32_t value;
+ // Small optimization: Array lookup for first language letter.
+ int32_t c0;
+ if (0 <= (c0 = uprv_lowerOrdinal(language[0])) && c0 <= 25 &&
+ language[1] != 0 && // language.length() >= 2
+ (state = trieFirstLetterStates[c0]) != 0) {
+ value = trieNext(iter.resetToState64(state), language, 1);  // Continue from the stored state, starting at index 1 (presumably past the first letter; trieNext() is not shown here).
+ } else {
+ value = trieNext(iter, language, 0);
+ }
+ if (value >= 0) {  // Non-negative result: remember the trie state reached after the language.
+ state = iter.getState64();
+ } else {
+ iter.resetToState64(trieUndState); // "und" ("*")
+ state = 0;  // 0 records that the language step fell back to "und"; checked below.
+ }
+
+ if (value > 0) {
+ // Intermediate or final value from just language.
+ if (value == SKIP_SCRIPT) {
+ value = 0;  // The script step is skipped, but the final empty-string step below still runs.
+ }
+ } else {  // No positive value from the language alone: continue with the script.
+ value = trieNext(iter, script, 0);
+ if (value >= 0) {
+ state = iter.getState64();
+ } else {  // Negative result for the script: fall back.
+ if (state == 0) {
+ iter.resetToState64(trieUndZzzzState); // "und-Zzzz" ("**")
+ } else {
+ iter.resetToState64(state);  // Return to the state after the language and use the empty script instead.
+ value = trieNext(iter, "", 0);
+ U_ASSERT(value >= 0);
+ state = iter.getState64();
+ }
+ }
+ }
+
+ if (value > 0) {
+ // Final value from just language or language+script.
+ } else {
+ value = trieNext(iter, "", 0);  // One more step with the empty string is expected to yield a positive value.
+ U_ASSERT(value > 0);
+ }
+ U_ASSERT(value < lsrsLength);  // The result must be usable as an index into lsrs.
+ return value;
+}
+
int32_t XLikelySubtags::trieNext(BytesTrie &iter, const char *s, int32_t i) {
UStringTrieResult result;
uint8_t c;
// VisibleForTesting
LSR makeMaximizedLsrFrom(const Locale &locale, UErrorCode &errorCode) const;
+ /**
+ * Tests whether lsr is "more likely" than other.
+ * For example, fr-Latn-FR is more likely than fr-Latn-CH because
+ * FR is the default region for fr-Latn.
+ *
+ * The likelyInfo caches lookup information between calls.
+ * The return value is an updated likelyInfo value,
+ * with bit 0 set if lsr is "more likely".
+ * The initial value of likelyInfo must be negative.
+ */
+ int32_t compareLikely(const LSR &lsr, const LSR &other, int32_t likelyInfo) const;
+
// TODO(ICU-20777): Switch Locale/uloc_ likely-subtags API from the old code
// in loclikely.cpp to this new code, including activating this
// minimizeSubtags() function. The LocaleMatcher does not minimize.
*/
LSR maximize(const char *language, const char *script, const char *region) const;
+ int32_t getLikelyIndex(const char *language, const char *script) const;
+
static int32_t trieNext(BytesTrie &iter, const char *s, int32_t i);
UResourceBundle *langInfoBundle;
@favor=script
en-GB >> en-GB
en-US >> en
-fr >> en-GB
+fr >> en
ja >> fr
** test: testEmptyWithDefault
en-US >> en
fr-FR >> fr
ja-JP >> fr
+zu >> en
# For a language that doesn't match anything, return the default.
-zu >> en-GB
zxx >> fr
@favor=script
en-US >> en
fr-FR >> fr
ja-JP >> fr
-zu >> en-GB
+zu >> en
zxx >> en
** test: TestExactMatch
@favor=script
und >> und
ja >> und
-fr-CA >> en-GB
+fr-CA >> en-US
en-AU >> en-GB
en-BZ >> en-GB
en-CA >> en-GB
@supported=en-GB, en-US, en, en-AU
und >> und
ja >> und
-fr-CA >> en-GB
-fr >> en-GB
+fr-CA >> en-US
+fr >> en-US
@supported=en-AU, ja, ca
fr >> en-AU
@supported=pl, ja, ca
fr-FR >> fr-CA # Sibling match is chosen.
@supported=fr-CA, fr-FR
fr >> fr-FR # Inferred region match is chosen.
-fr-SN >> fr-CA
+fr-SN >> fr-FR
@supported=en, fr-FR
fr >> fr-FR # Child match is chosen.
@supported=de, en, it
fr-FR >> fr-CA
@supported=fr-CA, fr-FR
fr >> fr-FR
-fr-SN >> fr-CA
+fr-SN >> fr-FR
@supported=en, fr-FR
fr >> fr-FR
@supported=de, en, it
zh-CN >> zh-TW
@supported=ja
ru >> und
+
+** test: favor a more-default locale among equally imperfect matches
+@supported=fr-CA, fr-CH, fr-FR, fr-GB
+fr-SN >> fr-FR
+@supported=sr-Latn, sr-Cyrl, sr-Grek
+@threshold=60
+sr-Thai >> sr-Cyrl
long desLangState = desLangDistance >= 0 && supportedLSRs.length > 1 ? iter.getState64() : 0;
// Index of the supported LSR with the lowest distance.
int bestIndex = -1;
+ // Cached lookup info from XLikelySubtags.compareLikely().
+ int bestLikelyInfo = -1;
for (int slIndex = 0; slIndex < supportedLSRs.length; ++slIndex) {
LSR supported = supportedLSRs[slIndex];
boolean star = false;
// Distinguish between equivalent but originally unequal locales via an
// additional micro distance.
shiftedDistance |= (desired.flags ^ supported.flags);
- }
- if (shiftedDistance < shiftedThreshold) {
- if (shiftedDistance == 0) {
- return slIndex << INDEX_SHIFT;
+ if (shiftedDistance < shiftedThreshold) {
+ if (shiftedDistance == 0) {
+ return slIndex << INDEX_SHIFT;
+ }
+ bestIndex = slIndex;
+ shiftedThreshold = shiftedDistance;
+ bestLikelyInfo = -1;
+ }
+ } else {
+ if (shiftedDistance < shiftedThreshold) {
+ bestIndex = slIndex;
+ shiftedThreshold = shiftedDistance;
+ bestLikelyInfo = -1;
+ } else if (shiftedDistance == shiftedThreshold && bestIndex >= 0) {
+ bestLikelyInfo = XLikelySubtags.INSTANCE.compareLikely(
+ supported, supportedLSRs[bestIndex], bestLikelyInfo);
+ if ((bestLikelyInfo & 1) != 0) {
+ // This supported locale matches as well as the previous best match,
+ // and neither matches perfectly,
+ // but this one is "more likely" (has more-default subtags).
+ bestIndex = slIndex;
+ }
}
- bestIndex = slIndex;
- shiftedThreshold = shiftedDistance;
}
}
return bestIndex >= 0 ?
return new LSR(language, script, region, retainOldMask);
}
+ /**
+ * Tests whether lsr is "more likely" than other.
+ * For example, fr-Latn-FR is more likely than fr-Latn-CH because
+ * FR is the default region for fr-Latn.
+ *
+ * <p>The likelyInfo caches lookup information between calls.
+ * The return value is an updated likelyInfo value,
+ * with bit 0 set if lsr is "more likely".
+ * The initial value of likelyInfo must be negative.
+ */
+ int compareLikely(LSR lsr, LSR other, int likelyInfo) {
+ // When likelyInfo >= 0 it carries a cached lookup:
+ // bits 2 and up hold an index into lsrs from getLikelyIndex(), and
+ // bit 1 is set if the previous comparison with lsr was for equal language and script (the regions differed).
+ // Otherwise (bit 1 clear) the scripts differed.
+ if (!lsr.language.equals(other.language)) {  // Different languages: no lookup is done.
+ return 0xfffffffc; // negative (so no lookup is treated as cached), bit 0 clear: lsr not better than other
+ }
+ if (!lsr.script.equals(other.script)) {  // Same language, different scripts.
+ int index;
+ if (likelyInfo >= 0 && (likelyInfo & 2) == 0) {  // Reuse the cached index only if it came from a script comparison.
+ index = likelyInfo >> 2;
+ } else {
+ index = getLikelyIndex(lsr.language, "");  // Look up by language with an empty script.
+ likelyInfo = index << 2;  // Cache the index; bit 1 clear marks a script lookup.
+ }
+ LSR likely = lsrs[index];
+ if (lsr.script.equals(likely.script)) {  // lsr has the script of the looked-up LSR.
+ return likelyInfo | 1;
+ } else {
+ return likelyInfo & ~1;
+ }
+ }
+ if (!lsr.region.equals(other.region)) {  // Same language and script, different regions.
+ int index;
+ if (likelyInfo >= 0 && (likelyInfo & 2) != 0) {  // Reuse the cached index only if it came from a region comparison.
+ index = likelyInfo >> 2;
+ } else {
+ index = getLikelyIndex(lsr.language, lsr.region);  // NOTE(review): the second parameter of getLikelyIndex() is named script, but the region is passed here; confirm whether lsr.script was intended.
+ likelyInfo = (index << 2) | 2;  // Cache the index; bit 1 set marks a region lookup.
+ }
+ LSR likely = lsrs[index];
+ if (lsr.region.equals(likely.region)) {  // lsr has the region of the looked-up LSR.
+ return likelyInfo | 1;
+ } else {
+ return likelyInfo & ~1;
+ }
+ }
+ return likelyInfo & ~1; // same language, script and region: lsr not better than other
+ }
+
+ // Subset of maximize(): walks the trie for language and script only and returns the resulting value, which compareLikely() uses as an index into lsrs.
+ private int getLikelyIndex(String language, String script) {
+ if (language.equals("und")) {  // "und" is looked up as the empty string.
+ language = "";
+ }
+ if (script.equals("Zzzz")) {  // "Zzzz" is looked up as the empty string.
+ script = "";
+ }
+
+ BytesTrie iter = new BytesTrie(trie);
+ long state;
+ int value;
+ // Small optimization: Array lookup for first language letter.
+ int c0;
+ if (language.length() >= 2 && 0 <= (c0 = language.charAt(0) - 'a') && c0 <= 25 &&
+ (state = trieFirstLetterStates[c0]) != 0) {
+ value = trieNext(iter.resetToState64(state), language, 1);  // Continue from the stored state, starting at index 1 (presumably past the first letter; trieNext() is not fully shown here).
+ } else {
+ value = trieNext(iter, language, 0);
+ }
+ if (value >= 0) {  // Non-negative result: remember the trie state reached after the language.
+ state = iter.getState64();
+ } else {
+ iter.resetToState64(trieUndState); // "und" ("*")
+ state = 0;  // 0 records that the language step fell back to "und"; checked below.
+ }
+
+ if (value > 0) {
+ // Intermediate or final value from just language.
+ if (value == SKIP_SCRIPT) {
+ value = 0;  // The script step is skipped, but the final empty-string step below still runs.
+ }
+ } else {  // No positive value from the language alone: continue with the script.
+ value = trieNext(iter, script, 0);
+ if (value >= 0) {
+ state = iter.getState64();
+ } else {  // Negative result for the script: fall back.
+ if (state == 0) {
+ iter.resetToState64(trieUndZzzzState); // "und-Zzzz" ("**")
+ } else {
+ iter.resetToState64(state);  // Return to the state after the language and use the empty script instead.
+ value = trieNext(iter, "", 0);
+ assert value >= 0;
+ state = iter.getState64();
+ }
+ }
+ }
+
+ if (value > 0) {
+ // Final value from just language or language+script.
+ } else {
+ value = trieNext(iter, "", 0);  // One more step with the empty string is expected to yield a positive value.
+ assert value > 0;
+ }
+ return value;
+ }
+
private static final int trieNext(BytesTrie iter, String s, int i) {
BytesTrie.Result result;
if (s.isEmpty()) {
@favor=script
en-GB >> en-GB
en-US >> en
-fr >> en-GB
+fr >> en
ja >> fr
** test: testEmptyWithDefault
en-US >> en
fr-FR >> fr
ja-JP >> fr
+zu >> en
# For a language that doesn't match anything, return the default.
-zu >> en-GB
zxx >> fr
@favor=script
en-US >> en
fr-FR >> fr
ja-JP >> fr
-zu >> en-GB
+zu >> en
zxx >> en
** test: TestExactMatch
@favor=script
und >> und
ja >> und
-fr-CA >> en-GB
+fr-CA >> en-US
en-AU >> en-GB
en-BZ >> en-GB
en-CA >> en-GB
@supported=en-GB, en-US, en, en-AU
und >> und
ja >> und
-fr-CA >> en-GB
-fr >> en-GB
+fr-CA >> en-US
+fr >> en-US
@supported=en-AU, ja, ca
fr >> en-AU
@supported=pl, ja, ca
fr-FR >> fr-CA # Sibling match is chosen.
@supported=fr-CA, fr-FR
fr >> fr-FR # Inferred region match is chosen.
-fr-SN >> fr-CA
+fr-SN >> fr-FR
@supported=en, fr-FR
fr >> fr-FR # Child match is chosen.
@supported=de, en, it
fr-FR >> fr-CA
@supported=fr-CA, fr-FR
fr >> fr-FR
-fr-SN >> fr-CA
+fr-SN >> fr-FR
@supported=en, fr-FR
fr >> fr-FR
@supported=de, en, it
zh-CN >> zh-TW
@supported=ja
ru >> und
+
+** test: favor a more-default locale among equally imperfect matches
+@supported=fr-CA, fr-CH, fr-FR, fr-GB
+fr-SN >> fr-FR
+@supported=sr-Latn, sr-Cyrl, sr-Grek
+@threshold=60
+sr-Thai >> sr-Cyrl