From: Mark Davis Date: Fri, 21 Feb 2014 14:39:12 +0000 (+0000) Subject: ICU-10705 More substantive changes were needed. The code didn't get the CLDR data... X-Git-Tag: milestone-59-0-1~2163 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=2ccc9fb2bd0c003b4af683243954d7e6d2c2934b;p=icu ICU-10705 More substantive changes were needed. The code didn't get the CLDR data, and there were some problems with the algorithm. Added many more tests, and added a hack to get around the fact that the generated CLDR data is reordered (it needs to maintain the file order!) X-SVN-Rev: 35193 --- diff --git a/icu4j/main/classes/core/src/com/ibm/icu/util/LocaleMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/util/LocaleMatcher.java index 26135ea212f..c9a6d239e86 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/util/LocaleMatcher.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/util/LocaleMatcher.java @@ -1,6 +1,6 @@ /* **************************************************************************************** - * Copyright (C) 2009-2013, Google, Inc.; International Business Machines Corporation * + * Copyright (C) 2009-2014, Google, Inc.; International Business Machines Corporation * * and others. All Rights Reserved. 
* **************************************************************************************** */ @@ -11,9 +11,11 @@ import java.util.Iterator; import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.Map; +import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; +import com.ibm.icu.impl.ICUResourceBundle; import com.ibm.icu.impl.Row; import com.ibm.icu.impl.Row.R2; import com.ibm.icu.impl.Row.R3; @@ -43,7 +45,10 @@ import com.ibm.icu.impl.Row.R3; * @stable ICU 4.4 */ public class LocaleMatcher { - private static final boolean DEBUG = false; + + private static boolean DEBUG = false; + + private static final ULocale UNKNOWN_LOCALE = new ULocale("und"); /** * Threshold for falling back to the default (first) language. May make this @@ -56,6 +61,11 @@ public class LocaleMatcher { */ private final ULocale defaultLanguage; + /** + * The minimum weighted match score; when the best match scores below it, the default language is returned. + */ + private final double threshold; + /** * Create a new language matcher. The highest-weighted language is the * default. That means that if no other language is matches closer than a given @@ -89,12 +99,24 @@ public class LocaleMatcher { * @deprecated This API is ICU internal only. */ public LocaleMatcher(LocalePriorityList languagePriorityList, LanguageMatcherData matcherData) { + this(languagePriorityList, matcherData, DEFAULT_THRESHOLD); + } + + /** + * Internal testing function; may expose API later. + * @param languagePriorityList LocalePriorityList to match + * @param matcherData Internal matching data + * @internal + * @deprecated This API is ICU internal only. + */ + public LocaleMatcher(LocalePriorityList languagePriorityList, LanguageMatcherData matcherData, double threshold) { this.matcherData = matcherData; for (final ULocale language : languagePriorityList) { add(language, languagePriorityList.getWeight(language)); } Iterator it = languagePriorityList.iterator(); defaultLanguage = it.hasNext() ?
it.next() : null; + this.threshold = threshold; } @@ -136,7 +158,7 @@ public class LocaleMatcher { lang2 == null ? lang : lang2, script2 == null ? script : script2, region2 == null ? region : region2 - ); + ); } return ulocale; } @@ -159,7 +181,7 @@ public class LocaleMatcher { bestTableMatch = matchRow.get0(); } } - if (bestWeight < DEFAULT_THRESHOLD) { + if (bestWeight < threshold) { bestTableMatch = defaultLanguage; } return bestTableMatch; @@ -187,6 +209,14 @@ public class LocaleMatcher { return getBestMatchInternal(ulocale).get0(); } + /** + * @internal + * @deprecated This API is ICU internal only. + */ + public ULocale getBestMatch(ULocale... ulocales) { + return getBestMatch(LocalePriorityList.add(ulocales).build()); + } + /** * {@inheritDoc} * @stable ICU 4.4 @@ -194,7 +224,7 @@ public class LocaleMatcher { @Override public String toString() { return "{" + defaultLanguage + ", " - + maximizedLanguageToWeight + "}"; + + maximizedLanguageToWeight + "}"; } // ================= Privates ===================== @@ -217,7 +247,7 @@ public class LocaleMatcher { R2 row = maximizedLanguageToWeight.get(tableKey); final double match = match(languageCode, maximized, tableKey, row.get0()); if (DEBUG) { - System.out.println("\t" + tableKey + ";\t" + row.toString() + ";\t" + match); + System.out.println("\t" + tableKey + ";\t" + row.toString() + ";\t" + match + "\n"); } final double weight = match * row.get1(); if (weight > bestWeight) { @@ -225,7 +255,7 @@ public class LocaleMatcher { bestTableMatch = tableKey; } } - if (bestWeight < DEFAULT_THRESHOLD) { + if (bestWeight < threshold) { bestTableMatch = defaultLanguage; } return Row.R2.of(bestTableMatch, bestWeight); @@ -252,6 +282,16 @@ public class LocaleMatcher { */ // TODO(markdavis): update the above when CLDR 1.6 is final. private ULocale addLikelySubtags(ULocale languageCode) { + // max("und") = "en_Latn_US", and since matching is based on maximized tags, the undefined + // language would normally match English. 
But that would produce the counterintuitive results + // that getBestMatch("und", LocaleMatcher("it,en")) would be "en", and + // getBestMatch("en", LocaleMatcher("it,und")) would be "und". + // + // To avoid that, we change the matcher's definitions of max (AddLikelySubtagsWithDefaults) + // so that max("und")="und", which is why "und" is returned unchanged here. + if (languageCode.equals(UNKNOWN_LOCALE)) { + return UNKNOWN_LOCALE; + } final ULocale result = ULocale.addLikelySubtags(languageCode); // should have method on getLikelySubtags for this if (result == null || result.equals(languageCode)) { @@ -275,9 +315,9 @@ public class LocaleMatcher { private String region; private Level level; static Pattern pattern = Pattern.compile( - "([a-zA-Z]{1,8}|\\*)" + - "(?:-([a-zA-Z]{4}|\\*))?" + - "(?:-([a-zA-Z]{2}|[0-9]{3}|\\*))?"); + "([a-z]{1,8}|\\*)" + + "(?:[_-]([A-Z][a-z]{3}|\\*))?" + + "(?:[_-]([A-Z]{2}|[0-9]{3}|\\*))?"); public LocalePatternMatcher(String toMatch) { Matcher matcher = pattern.matcher(toMatch); @@ -341,16 +381,32 @@ public class LocaleMatcher { } } - enum Level {language, script, region} + enum Level { + language(0.99), + script(0.2), + region(0.04); + + final double worst; + + Level(double d) { + worst = d; + } + } private static class ScoreData implements Freezable { + /** + * + */ + private static final double maxUnequal_changeD_sameS = 0.5; + /** + * + */ + private static final double maxUnequal_changeEqual = 0.75; LinkedHashSet> scores = new LinkedHashSet>(); - final double worst; final Level level; public ScoreData(Level level) { this.level = level; - this.worst = (1-(level == Level.language ? 90 : level == Level.script ?
20 : 4))/100.0; } void addDataToScores(String desired, String supported, R3 data) { @@ -385,10 +441,13 @@ public class LocaleMatcher { * else * rd = 0.25*StdRDiff // lines 2,5 */ + + // example: input en-GB, supported en en-GB + // we want to have a closer match with boolean desiredChange = desiredRaw.equals(desiredMax); boolean supportedChange = supportedRaw.equals(supportedMax); - double distance; + double distance = 0; if (!desiredMax.equals(supportedMax)) { // Map>> lang_result = scores.get(desiredMax); // if (lang_result == null) { @@ -401,42 +460,63 @@ public class LocaleMatcher { // } else { distance = getRawScore(dMax, sMax); // } - if (desiredChange == supportedChange) { - distance *= 0.75; - } else if (desiredChange) { - distance *= 0.5; - } - } else if (desiredChange == supportedChange) { // maxes are equal, changes are equal - distance = 0; +// if (desiredChange == supportedChange) { +// distance *= maxUnequal_changeEqual; +// if (DEBUG) { +// System.out.println("\t\t\t" + level + " Distance (maxD≠maxS, changeD=changeS)\t" + distance); +// } +// } else if (desiredChange) { +// distance *= maxUnequal_changeD_sameS; +// if (DEBUG) { +// System.out.println("\t\t\t" + level + " Distance (maxD≠maxS, changeD, !changeS)\t" + distance); +// } +// } else { +// if (DEBUG) { +// System.out.println("\t\t\t" + level + " Distance (maxD≠maxS, !changeD, changeS)\t" + distance); +// } +// } + } else if (!desiredRaw.equals(supportedRaw)) { // maxes are equal, changes are equal + distance += 0.001; +// if (DEBUG) { +// System.out.println("\t\t\t" + level + " Distance (maxD=maxS, changeD=changeS)\t" + distance); +// } } else { // maxes are equal, changes are different - distance = 0.25*worst; +// distance = 0.25*level.worst; +// if (DEBUG) { +// System.out.println("\t\t\t" + level + " Distance (maxD=maxS, changeD≠changeS)\t" + distance); +// } } return distance; } private double getRawScore(ULocale desiredLocale, ULocale supportedLocale) { if (DEBUG) { - 
System.out.println("\t\t\tRaw Score:\t" + desiredLocale + ";\t" + supportedLocale); + System.out.println("\t\t\t" + level + " Raw Score:\t" + desiredLocale + ";\t" + supportedLocale); } for (R3 datum : scores) { // : result if (datum.get0().matches(desiredLocale) && datum.get1().matches(supportedLocale)) { if (DEBUG) { - System.out.println("\t\t\tFOUND\t" + datum); + System.out.println("\t\t\t\tFOUND\t" + datum); } return datum.get2(); } } if (DEBUG) { - System.out.println("\t\t\tNOTFOUND\t" + worst); + System.out.println("\t\t\t\tNOTFOUND\t" + level.worst); } - return worst; + return level.worst; } public String toString() { - return level + ", " + scores; + StringBuilder result = new StringBuilder().append(level); + for (R3 score : scores) { + result.append("\n\t\t").append(score); + } + return result.toString(); } + @SuppressWarnings("unchecked") public ScoreData cloneAsThawed() { try { @@ -478,6 +558,14 @@ public class LocaleMatcher { public LanguageMatcherData() { } + /** + * @internal + * @deprecated This API is ICU internal only. + */ + public String toString() { + return languageScores + "\n\t" + scriptScores + "\n\t" + regionScores; + } + /** * @internal * @deprecated This API is ICU internal only. 
@@ -489,13 +577,16 @@ public class LocaleMatcher { diff += regionScores.getScore(a, aMax, a.getCountry(), aMax.getCountry(), b, bMax, b.getCountry(), bMax.getCountry()); if (!a.getVariant().equals(b.getVariant())) { - diff += 1; + diff += 0.01; } if (diff < 0.0d) { diff = 0.0d; } else if (diff > 1.0d) { diff = 1.0d; } + if (DEBUG) { + System.out.println("\t\t\tTotal Distance\t" + diff); + } return 1.0 - diff; } @@ -551,7 +642,7 @@ public class LocaleMatcher { LocalePatternMatcher supportedMatcher = new LocalePatternMatcher(supported); Level supportedLen = supportedMatcher.getLevel(); if (desiredLen != supportedLen) { - throw new IllegalArgumentException(); + throw new IllegalArgumentException("Lengths unequal: " + desired + ", " + supported); } R3 data = Row.of(desiredMatcher, supportedMatcher, score); R3 data2 = oneway ? null : Row.of(supportedMatcher, desiredMatcher, score); @@ -626,39 +717,150 @@ public class LocaleMatcher { LanguageMatcherData matcherData; - private static LanguageMatcherData defaultWritten = new LanguageMatcherData() - // TODO get data from CLDR - .addDistance("no", "nb", 100, "The language no is normally taken as nb in content; we might alias this for lookup.") - .addDistance("nn", "nb", 96) - .addDistance("nn", "no", 96) - .addDistance("da", "no", 90, "Danish and norwegian are reasonably close.") - .addDistance("da", "nb", 90) - .addDistance("hr", "br", 96, "Serbo-croatian variants are all very close.") - .addDistance("sh", "br", 96) - .addDistance("sr", "br", 96) - .addDistance("sh", "hr", 96) - .addDistance("sr", "hr", 96) - .addDistance("sh", "sr", 96) - .addDistance("sr-Latn", "sr-Cyrl", 90, "Most serbs can read either script.") - .addDistance("*-Hans", "*-Hant", 85, true, "Readers of simplified can read traditional much better than reverse.") - .addDistance("*-Hant", "*-Hans", 75, true) - .addDistance("en-*-US", "en-*-CA", 98, "US is different than others, and Canadian is inbetween.") - .addDistance("en-*-US", "en-*-*", 97) - 
.addDistance("en-*-CA", "en-*-*", 98) - .addDistance("en-*-*", "en-*-*", 99) - .addDistance("es-*-ES", "es-*-ES", 100, "Latin American Spanishes are closer to each other. Approximate by having es-ES be further from everything else.") - .addDistance("es-*-ES", "es-*-*", 93) - .addDistance("*", "*", 1, "[Default value -- must be at end!] Normally there is no comprehension of different languages.") - .addDistance("*-*", "*-*", 20, "[Default value -- must be at end!] Normally there is little comprehension of different scripts.") - .addDistance("*-*-*", "*-*-*", 96, "[Default value -- must be at end!] Normally there are small differences across regions.") - .freeze(); + private static final LanguageMatcherData defaultWritten; +// = new LanguageMatcherData() +// // TODO get data from CLDR +// .addDistance("no", "nb", 100, "The language no is normally taken as nb in content; we might alias this for lookup.") +// .addDistance("nn", "nb", 96) +// .addDistance("nn", "no", 96) +// .addDistance("da", "no", 90, "Danish and norwegian are reasonably close.") +// .addDistance("da", "nb", 90) +// .addDistance("hr", "br", 96, "Serbo-croatian variants are all very close.") +// .addDistance("sh", "br", 96) +// .addDistance("sr", "br", 96) +// .addDistance("sh", "hr", 96) +// .addDistance("sr", "hr", 96) +// .addDistance("sh", "sr", 96) +// .addDistance("sr-Latn", "sr-Cyrl", 90, "Most serbs can read either script.") +// .addDistance("*-Hans", "*-Hant", 85, true, "Readers of simplified can read traditional much better than reverse.") +// .addDistance("*-Hant", "*-Hans", 75, true) +// .addDistance("en-*-US", "en-*-*", 97, "Non-US English variants are closer to each other (written). Make en-US be further from everything else.") +// .addDistance("en-*-*", "en-*-*", 99) +// .addDistance("es-*-ES", "es-*-*", 97, "Latin American Spanishes are closer to each other. 
Make es-ES be further from everything else.") +// .addDistance("es-*-419", "es-*-*", 99, "Have es-MX, es-AR, etc be closer to es-419 than to each other") +// .addDistance("es-*-*", "es-*-*", 97) +// .addDistance("*", "*", 1, "[Default value -- must be at end!] Normally there is no comprehension of different languages.") +// .addDistance("*-*", "*-*", 20, "[Default value -- must be at end!] Normally there is little comprehension of different scripts.") +// .addDistance("*-*-*", "*-*-*", 96, "[Default value -- must be at end!] Normally there are small differences across regions.") +// .freeze(); private static HashMap canonicalMap = new HashMap(); + static class DataHack implements Comparable{ + final String source; + final String target; + int percent; + public DataHack(String source, String target, int percent) { + this.source = source; + this.target = target.equals("de_CH") ? "de" : target; // hack to fix bad data + this.percent = percent; + } + static final Pattern STAR_KEEP = Pattern.compile("([^_]+)(?:_[^_]+(?:_[^_]+)?)?"); + public int compareTo(DataHack other) { + // this is just a one-time hack so we don't need to optimize + int diff = getUnderbars(source) - getUnderbars(other.source); + if (0 != diff) { + return diff; + } + String thisSource = source.replace('*', 'þ'); // just something after Z + String otherSource = other.source.replace('*', 'þ'); // just something after Z + diff = thisSource.compareTo(otherSource); + if (0 != diff) { + return diff; + } + String thisTarget = target.replace('*', 'þ'); // just something after Z + String otherTarget = other.target.replace('*', 'þ'); // just something after Z + diff = thisTarget.compareTo(otherTarget); + +// Matcher matcher = STAR_KEEP.matcher(source); +// matcher.matches(); +// String first = matcher.group(0); +// String second = matcher.group(1); +// String third = matcher.group(2); +// Matcher matcherB = STAR_KEEP.matcher(source); +// String firstB = matcher.group(0); +// String secondB = matcher.group(1); 
+// String thirdB = matcher.group(2); +// +// int diff = onlyStars.length() - onlyStarsOther.length(); + + if (0 != diff) { + return diff; + } + diff = source.compareTo(other.source); + if (0 != diff) { + return diff; + } + return target.compareTo(other.target); + } + /** + * @param source2 + */ + private int getUnderbars(String source2) { + int pos = source2.indexOf('_'); + if (pos < 0) { + return 0; + } + pos = source2.indexOf('_',pos+1); + return pos < 0 ? 1 : 2; + } + public String toString() { + return source + ", " + target + " => " + percent; + } + } + static { // TODO get data from CLDR canonicalMap.put("iw", "he"); canonicalMap.put("mo", "ro"); canonicalMap.put("tl", "fil"); + + ICUResourceBundle suppData = getICUSupplementalData(); + ICUResourceBundle languageMatching = suppData.findTopLevel("languageMatching"); + ICUResourceBundle written = (ICUResourceBundle) languageMatching.get("written"); + defaultWritten = new LanguageMatcherData(); + // HACK + // The data coming from ICU may be old, and badly ordered. + TreeSet hack = new TreeSet(); + defaultWritten.addDistance("en_*_US", "en_*_*", 97); + defaultWritten.addDistance("en_*_GB", "en_*_*", 98); + defaultWritten.addDistance("es_*_ES", "es_*_*", 97); + defaultWritten.addDistance("es_*_419", "es_*_*", 99); + defaultWritten.addDistance("es_*_*", "es_*_*", 98); + + for(UResourceBundleIterator iter = written.getIterator(); iter.hasNext();) { + ICUResourceBundle item = (ICUResourceBundle) iter.next(); + /* + "*_*_*", + "*_*_*", + "96", + */ + hack.add(new DataHack(item.getString(0), item.getString(1), Integer.parseInt(item.getString(2)))); + } + for (DataHack dataHack : hack) { + defaultWritten.addDistance(dataHack.source, dataHack.target, dataHack.percent); + } + defaultWritten.freeze(); + } + + /** + * @internal + * @deprecated This API is ICU internal only. 
+ */ + public static ICUResourceBundle getICUSupplementalData() { + ICUResourceBundle suppData = (ICUResourceBundle) UResourceBundle.getBundleInstance( + ICUResourceBundle.ICU_BASE_NAME, + "supplementalData", + ICUResourceBundle.ICU_DATA_CLASS_LOADER); + return suppData; + } + + /** + * @internal + * @deprecated This API is ICU internal only. + */ + public static double match(ULocale a, ULocale b) { + final LocaleMatcher matcher = new LocaleMatcher(""); + return matcher.match(a, matcher.addLikelySubtags(a), b, matcher.addLikelySubtags(b)); } } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/util/LocalePriorityList.java b/icu4j/main/classes/core/src/com/ibm/icu/util/LocalePriorityList.java index e70378d7559..acb687939c4 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/util/LocalePriorityList.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/util/LocalePriorityList.java @@ -1,6 +1,6 @@ /* ******************************************************************************* - * Copyright (C) 2010-2011, Google, Inc.; International Business Machines * + * Copyright (C) 2010-2014, Google, Inc.; International Business Machines * * Corporation and others. All Rights Reserved. * ******************************************************************************* */ @@ -81,7 +81,7 @@ public class LocalePriorityList implements Iterable { * @return internal builder, for chaining * @stable ICU 4.4 */ - public static Builder add(ULocale languageCode) { + public static Builder add(ULocale... 
languageCode) { return new Builder().add(languageCode); } diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/LocaleMatcherTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/LocaleMatcherTest.java index 515e99221fc..11d7d6eefe1 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/LocaleMatcherTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/LocaleMatcherTest.java @@ -1,12 +1,15 @@ /* ****************************************************************************************** - * Copyright (C) 2009-2010, Google, Inc.; International Business Machines Corporation and * + * Copyright (C) 2009-2014, Google, Inc.; International Business Machines Corporation and * * others. All Rights Reserved. * ****************************************************************************************** */ package com.ibm.icu.dev.test.util; +import java.util.Set; +import java.util.TreeSet; + import com.ibm.icu.dev.test.TestFmwk; import com.ibm.icu.util.LocaleMatcher; import com.ibm.icu.util.LocaleMatcher.LanguageMatcherData; @@ -24,6 +27,41 @@ public class LocaleMatcherTest extends TestFmwk { new LocaleMatcherTest().run(args); } + public void testenGB() { + final LocaleMatcher matcher = new LocaleMatcher("fr, en, en_GB, es_MX, es_419, es"); + assertEquals("en_GB", matcher.getBestMatch("en_NZ").toString()); + assertEquals("es", matcher.getBestMatch("es_ES").toString()); + assertEquals("es_419", matcher.getBestMatch("es_AR").toString()); + assertEquals("es_MX", matcher.getBestMatch("es_MX").toString()); + } + + public void testFallbacks() { + final LocaleMatcher matcher = new LocaleMatcher("en, hi"); + if (!logKnownIssue("10705", "Need new data from CLDR for languageMatching")) { + assertEquals("hi", matcher.getBestMatch("sa").toString()); + } + } + + public void testOverrideData() { + double threshold = 0.05; + LanguageMatcherData localeMatcherData = new LanguageMatcherData() + .addDistance("br", "fr", 10, true) + .addDistance("es", 
"cy", 10, true) + ; + logln(localeMatcherData.toString()); + + final LocaleMatcher matcher = new LocaleMatcher( + LocalePriorityList + .add(ULocale.ENGLISH) + .add(ULocale.FRENCH) + .add(ULocale.UK) + .build(), localeMatcherData , threshold); + logln(matcher.toString()); + + assertEquals(ULocale.FRENCH, matcher.getBestMatch(new ULocale("br"))); + assertEquals(ULocale.ENGLISH, matcher.getBestMatch(new ULocale("es"))); // one way + } + public void testBasics() { final LocaleMatcher matcher = new LocaleMatcher(LocalePriorityList.add(ULocale.FRENCH).add(ULocale.UK) .add(ULocale.ENGLISH).build()); @@ -84,5 +122,224 @@ public class LocaleMatcherTest extends TestFmwk { private void assertEquals(Object expected, Object string) { assertEquals("", expected, string); } + private void assertNull(Object bestMatch) { + assertNull("", bestMatch); + } + + public void testEmpty() { + final LocaleMatcher matcher = new LocaleMatcher(""); + assertNull(matcher.getBestMatch(ULocale.FRENCH)); + } + + static final ULocale ENGLISH_CANADA = new ULocale("en_CA"); + + public void testMatch_exact() { + assertEquals(1.0, + LocaleMatcher.match(ENGLISH_CANADA, ENGLISH_CANADA)); + } + + public void testMatch_none() { + double match = LocaleMatcher.match( + new ULocale("ar_MK"), + ENGLISH_CANADA); + assertTrue("Actual < 0: " + match, 0 <= match); + assertTrue("Actual > 0.15 (~ language + script distance): " + match, 0.2 > match); + } + + public void testMatch_matchOnMazimized() { + ULocale undTw = new ULocale("und_TW"); + ULocale zhHant = new ULocale("zh_Hant"); + double matchZh = LocaleMatcher.match(undTw, new ULocale("zh")); + double matchZhHant = LocaleMatcher.match(undTw, zhHant); + assertTrue("und_TW should be closer to zh_Hant (" + matchZhHant + + ") than to zh (" + matchZh + ")", + matchZh < matchZhHant); + double matchEnHantTw = LocaleMatcher.match(new ULocale("en_Hant_TW"), + zhHant); + assertTrue("zh_Hant should be closer to und_TW (" + matchZhHant + + ") than to en_Hant_TW (" + 
matchEnHantTw + ")", + matchEnHantTw < matchZhHant); + assertTrue("zh should be closer to und_TW (" + matchZh + + ") than to en_Hant_TW (" + matchEnHantTw + ")", + matchEnHantTw < matchZh); + } + + public void testMatchGrandfatheredCode() { + final LocaleMatcher matcher = new LocaleMatcher("fr, i_klingon, en_Latn_US"); + assertEquals("en_Latn_US", matcher.getBestMatch("en_GB_oed").toString()); + //assertEquals("tlh", matcher.getBestMatch("i_klingon").toString()); + } + + public void testGetBestMatchForList_exactMatch() { + final LocaleMatcher matcher = new LocaleMatcher("fr, en_GB, ja, es_ES, es_MX"); + assertEquals("ja", matcher.getBestMatch("ja, de").toString()); + } + + public void testGetBestMatchForList_simpleVariantMatch() { + final LocaleMatcher matcher = new LocaleMatcher("fr, en_GB, ja, es_ES, es_MX"); + // Intentionally avoiding a perfect_match or two candidates for variant matches. + assertEquals("en_GB", matcher.getBestMatch("de, en_US").toString()); + // Fall back. + assertEquals("fr", matcher.getBestMatch("de, zh").toString()); + } + + public void testGetBestMatchForList_matchOnMaximized() { + final LocaleMatcher matcher = new LocaleMatcher("en, ja"); + //final LocaleMatcher matcher = new LocaleMatcher("fr, en, ja, es_ES, es_MX"); + // Check that if the preference is maximized already, it works as well. + assertEquals("Match for ja_Jpan_JP (maximized already)", + "ja", matcher.getBestMatch("ja_Jpan_JP, en-AU").toString()); + if (true) return; + // ja_JP matches ja on likely subtags, and it's listed first, + // thus it wins over the second preference en_US. + assertEquals("Match for ja_JP, with likely region subtag", + "ja", matcher.getBestMatch("ja_JP, en_US").toString()); + // Check that if the preference is maximized already, it works as well.
+ assertEquals("Match for ja_Jpan_JP (maximized already)", + "ja", matcher.getBestMatch("ja_Jpan_JP, en_US").toString()); + } + + public void testGetBestMatchForList_noMatchOnMaximized() { + // Regression test for http://b/5714572 . + final LocaleMatcher matcher = new LocaleMatcher("en, de, fr, ja"); + // de maximizes to de_DE. Pick the exact match for the secondary language instead. + assertEquals("fr", matcher.getBestMatch("de_CH, fr").toString()); + } + + public void testBestMatchForTraditionalChinese() { + // Scenario: An application that only supports Simplified Chinese (and some other languages), + // but does not support Traditional Chinese. zh_Hans_CN could be replaced with zh_CN, zh, or + // zh_Hans, it wouldn't make much of a difference. + final LocaleMatcher matcher = new LocaleMatcher("fr, zh_Hans_CN, en_US"); + + // The script distance (simplified vs. traditional Han) is considered small enough + // to be an acceptable match. The regional difference is considered almost insignificant. + assertEquals("zh_Hans_CN", matcher.getBestMatch("zh_TW").toString()); + assertEquals("zh_Hans_CN", matcher.getBestMatch("zh_Hant").toString()); + + // For geo_political reasons, you might want to avoid a zh_Hant -> zh_Hans match. + // In this case, if zh_TW, zh_HK or a tag starting with zh_Hant is requested, you can + // change your call to getBestMatch to include a 2nd language preference. + // "en" is a better match since its distance to "en_US" is closer than the distance + // from "zh_TW" to "zh_CN" (script distance). + assertEquals("en_US", matcher.getBestMatch("zh_TW, en").toString()); + assertEquals("en_US", matcher.getBestMatch("zh_Hant_CN, en").toString()); + assertEquals("zh_Hans_CN", matcher.getBestMatch("zh_Hans, en").toString()); + } + + public void testUndefined() { + // When the undefined language doesn't match anything in the list, getBestMatch returns + // the default, as usual. 
+ LocaleMatcher matcher = new LocaleMatcher("it,fr"); + assertEquals("it", matcher.getBestMatch("und").toString()); + + // When it *does* occur in the list, BestMatch returns it, as expected. + matcher = new LocaleMatcher("it,und"); + assertEquals("und", matcher.getBestMatch("und").toString()); + + // The unusual part: + // max("und") = "en_Latn_US", and since matching is based on maximized tags, the undefined + // language would normally match English. But that would produce the counterintuitive results + // that getBestMatch("und", LocaleMatcher("it,en")) would be "en", and + // getBestMatch("en", LocaleMatcher("it,und")) would be "und". + // + // To avoid that, we change the matcher's definitions of max (AddLikelySubtagsWithDefaults) + // so that max("und")="und". That produces the following, more desirable results: + matcher = new LocaleMatcher("it,en"); + assertEquals("it", matcher.getBestMatch("und").toString()); + matcher = new LocaleMatcher("it,und"); + assertEquals("it", matcher.getBestMatch("en").toString()); + } + + // public void testGetBestMatch_emptyList() { + // final LocaleMatcher matcher = new LocaleMatcher( + // new LocalePriorityList(new HashMap())); + // assertNull(matcher.getBestMatch(ULocale.ENGLISH)); + // } + + public void testGetBestMatch_googlePseudoLocales() { + // Google pseudo locales are primarily based on variant subtags. + // See http://sites/intl_eng/pseudo_locales. + // (See below for the region code based fall back options.) 
+ final LocaleMatcher matcher = new LocaleMatcher( + "fr, pt"); + assertEquals("fr", matcher.getBestMatch("de").toString()); + assertEquals("fr", matcher.getBestMatch("en_US").toString()); + assertEquals("fr", matcher.getBestMatch("en").toString()); + assertEquals("pt", matcher.getBestMatch("pt_BR").toString()); + } + + public void testGetBestMatch_regionDistance() { + LocaleMatcher matcher = new LocaleMatcher("es_AR, es"); + assertEquals("es_AR", matcher.getBestMatch("es_MX").toString()); + + matcher = new LocaleMatcher("fr, en, en_CA"); + assertEquals("en_CA", matcher.getBestMatch("en_GB").toString()); + + matcher = new LocaleMatcher("de_AT, de_DE, de_CH"); + assertEquals("de_DE", matcher.getBestMatch("de").toString()); + } + + /** + * If all the base languages are the same, then each sublocale matches itself most closely + */ + public void testExactMatches() { + String lastBase = ""; + TreeSet sorted = new TreeSet(); + for (ULocale loc : ULocale.getAvailableLocales()) { + String language = loc.getLanguage(); + if (!lastBase.equals(language)) { + check(sorted); + sorted.clear(); + lastBase = language; + } + sorted.add(loc); + } + check(sorted); + } + + private void check(Set sorted) { + if (sorted.isEmpty()) { + return; + } + check2(sorted); + ULocale first = sorted.iterator().next(); + ULocale max = ULocale.addLikelySubtags(first); + sorted.add(max); + check2(sorted); + } + /** + * @param sorted locales to build a matcher from; each one is expected to be its own best match + */ + private void check2(Set sorted) { + // Build one matcher over the whole set, then look up every member in it. + logln("Checking: " + sorted); + LocaleMatcher matcher = new LocaleMatcher( + LocalePriorityList.add( + sorted.toArray(new ULocale[sorted.size()])) + .build()); + for (ULocale loc : sorted) { + String stringLoc = loc.toString(); + assertEquals(stringLoc, matcher.getBestMatch(stringLoc).toString()); + } + } + + // public void testComputeDistance_monkeyTest() { + // RegionCode[] codes = RegionCode.values(); + // Random random = new Random(); + // for (int i = 0; i < 1000; ++i) { + // RegionCode
x = codes[random.nextInt(codes.length)]; + // RegionCode y = codes[random.nextInt(codes.length)]; + // double d = LocaleMatcher.getRegionDistance(x, y, null, null); + // if (x == RegionCode.ZZ || y == RegionCode.ZZ) { + // assertEquals(LocaleMatcher.REGION_DISTANCE, d); + // } else if (x == y) { + // assertEquals(0.0, d); + // } else { + // assertTrue(d > 0); + // assertTrue(d <= LocaleMatcher.REGION_DISTANCE); + // } + // } + // } }